Python: fastest way of checking if there are more than x files in a folder
Question:
I am looking for a very rapid way to check whether a folder contains more than 2 files.
I worry that len(os.listdir('/path/')) > 2
may become very slow if there are a lot of files in /path/, especially since this function will be called frequently by multiple processes at a time.
Answers:
There is indeed another function introduced by PEP471 : os.scandir(path)
As it returns a generator, no list will be created and the worst-case scenario (a huge directory) will still be lightweight.
Its higher level interface os.walk(path)
will allow you to go through a directory without having to list all of it.
Here is a code example for your specific case :
import os

# Stop scanning the directory as soon as MINIMUM_SIZE regular files have
# been seen; enough_files records whether we got that far.
# NOTE(review): this tests for *at least* MINIMUM_SIZE files, while the
# question asks for "more than 2" — confirm which bound is intended.
MINIMUM_SIZE = 2
file_count = 0
for entry in os.scandir('.'):
    if not entry.is_file():
        continue
    file_count += 1
    if file_count == MINIMUM_SIZE:
        break
enough_files = file_count == MINIMUM_SIZE
If you want something more explicit using pathlib, you can try:
from pathlib import Path

# Walk the directory lazily and stop as soon as two regular files have
# been seen; enough_files is the final verdict that gets printed.
directory_path = Path('/path/').resolve()
files_only = (p for p in directory_path.glob("*") if p.is_file())
nb_files = 0
enough_files = False
for _ in files_only:
    nb_files += 1
    if nb_files >= 2:
        enough_files = True
        break
print(enough_files)
To get the fastest it’s probably something hacky.
My guess was:
def iterdir_approach(path):
    """Return True when *path* holds at least three regular files.

    Pulls three entries from a lazy generator; running out of entries
    raises StopIteration, which means "not enough files".
    """
    # Path objects expose is_file(); the original's x.isfile() raised
    # AttributeError, and the bare `except:` silently turned that into
    # False for every non-empty directory.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        next(iter_of_files)
        next(iter_of_files)
        next(iter_of_files)
        return True
    except StopIteration:
        return False
We create a generator and pull three entries from it, catching the StopIteration that is raised when the directory holds fewer files than that.
To profile the approaches we create a bunch of directories with a bunch of files in them :
import itertools
import os
import random
import shutil
import tempfile
import timeit
from pathlib import Path

import matplotlib.pyplot as plt
def create_temp_directory(num_directories):
    """Build a scratch tree with *num_directories* sub-folders, each holding
    a random number of small text files, and return the root path."""
    root = tempfile.mkdtemp()
    for idx in range(num_directories):
        subdir = os.path.join(root, f"subdir_{idx}")
        os.makedirs(subdir)
        # Up to `idx` files per sub-folder, so file counts vary by folder.
        for file_idx in range(random.randint(0, idx)):
            target = os.path.join(subdir, f"file_{file_idx}.txt")
            with open(target, 'w') as handle:
                handle.write("Sample content")
    return root
We define the various approaches (the other two were copied from the answers to the question):
def iterdir_approach(path):
    #@swozny
    """Return True when *path* holds at least three regular files."""
    # Fixed: Path has is_file(), not isfile(); the original's bare
    # `except:` hid the resulting AttributeError and always returned
    # False.  Only StopIteration (too few files) is expected here.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        next(iter_of_files)
        next(iter_of_files)
        next(iter_of_files)
        return True
    except StopIteration:
        return False
def len_os_dir_approach(path):
    #@bluppfisk
    """True when *path* holds more than two entries (files or sub-dirs)."""
    entries = os.listdir(path)
    return len(entries) > 2
def check_files_os_scandir_approach(path):
    #@PoneyUHC
    """Return True once *path* is known to hold MINIMUM_SIZE (3) regular files."""
    MINIMUM_SIZE = 3
    file_count = 0
    # `with` closes the scandir iterator on every exit path; the original
    # leaked the OS directory handle whenever it returned early.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file():
                file_count += 1
                if file_count == MINIMUM_SIZE:
                    return True
    return False
def path_resolve_approach(path):
    #@matleg
    """Return True once more than two regular files are seen in *path*."""
    resolved = Path(path).resolve()
    seen = 0
    for candidate in resolved.glob("*"):
        if not candidate.is_file():
            continue
        seen += 1
        if seen > 2:
            return True
    return False
def dilettant_approach(path):
    #@dilettant
    """Return True when *path* holds at least 3 entries.

    Counts every directory entry, not just files — the answer assumes the
    folder contains only files.
    """
    enough = 3  # At least 2 files
    # `with` closes the scandir handle; the original leaked it.
    with os.scandir(path) as gen:
        # Take at most `enough` entries; if we got that many, the
        # directory holds at least `enough` entries.
        has_enough = len(list(itertools.islice(gen, enough))) >= enough
    return has_enough
def adrian_ang_approach(path):
    #@adrian_ang
    """Return True as soon as more than two regular files are found in *path*."""
    seen = 0
    with os.scandir(path) as items:
        for item in items:
            if not item.is_file():
                continue
            seen += 1
            if seen > 2:
                return True
    return False
Then we profile the code using timeit.timeit
and plot the execution times for various amounts of directories:
num_directories_list = [10, 50, 100, 200, 500, 1000]
approach1_times = []
approach2_times = []
approach3_times = []
approach4_times = []
approach5_times = []
approach6_times = []
for num_directories in num_directories_list:
    # Build ONE scratch tree and benchmark every approach against it.
    # The original called create_temp_directory() twice per iteration and
    # only removed the first tree, leaking the one actually measured.
    temp_dir = create_temp_directory(num_directories)
    subdir_paths = [str(p) for p in Path(temp_dir).iterdir()]
    approach1_time = timeit.timeit(lambda: [iterdir_approach(path) for path in subdir_paths], number=5)
    approach2_time = timeit.timeit(lambda: [check_files_os_scandir_approach(path) for path in subdir_paths], number=5)
    approach3_time = timeit.timeit(lambda: [path_resolve_approach(path) for path in subdir_paths], number=5)
    approach4_time = timeit.timeit(lambda: [len_os_dir_approach(path) for path in subdir_paths], number=5)
    approach5_time = timeit.timeit(lambda: [dilettant_approach(path) for path in subdir_paths], number=5)
    approach6_time = timeit.timeit(lambda: [adrian_ang_approach(path) for path in subdir_paths], number=5)
    approach1_times.append(approach1_time)
    approach2_times.append(approach2_time)
    approach3_times.append(approach3_time)
    approach4_times.append(approach4_time)
    approach5_times.append(approach5_time)
    approach6_times.append(approach6_time)
    shutil.rmtree(temp_dir)
Visualization of the results
# One line per approach so their scaling behaviour can be compared.
for times, label in (
    (approach1_times, 'iterdir_approach'),
    (approach2_times, 'check_files_os_scandir_approach'),
    (approach3_times, 'path_resolve_approach'),
    (approach4_times, 'os_dir_approach'),
    (approach5_times, 'dilettant_approach'),
    (approach6_times, 'adrian_ang_approach'),
):
    plt.plot(num_directories_list, times, label=label)
plt.xlabel('Number of Directories')
plt.ylabel('Execution Time (seconds)')
plt.title('Performance Comparison')
plt.legend()
plt.show()
As the OP knows there are only files within /path/ one optimization is to not test on the file attributes.
This version should be profiting from the prior knowledge / constraints:
import itertools
import os

# Take at most `enough` entries from the scandir iterator; if that many
# came back, the folder holds at least `enough` files.  Because islice
# stops early, huge directories cost no more than small ones.
enough = 2  # At least 2 files
gen = os.scandir('/path/')  # OP states only files in folder /path/
first_few = list(itertools.islice(gen, enough))
has_enough = len(first_few) >= enough
print(has_enough)
Placing this in a shell script and use hyperfine to measure some random performance (folder with 500+ files):
❯ hyperfine ./ssssss.sh
Benchmark #1: ./ssssss.sh
Time (mean ± σ): 77.6 ms ± 0.6 ms [User: 29.9 ms, System: 31.8 ms]
Range (min … max): 76.3 ms … 79.4 ms 36 runs
… and as it should not really matter same system on a folder with more than 100k files:
❯ ls -l |wc -l
100204
~
❯ hyperfine ./ssssss.sh
Benchmark #1: ./ssssss.sh
Time (mean ± σ): 79.6 ms ± 1.1 ms [User: 31.9 ms, System: 33.5 ms]
Range (min … max): 76.8 ms … 82.1 ms 35 runs
for anyone wanting to try the C approach, here’s a module you can import from Python (only does files, not subdirs)
#define PY_SSIZE_T_CLEAN
#include <stdio.h>
#include <dirent.h>
#include <stdlib.h>
#include <Python.h>
static PyObject *
method_dircnt(PyObject *self, PyObject *args)
{
    /* dircnt(dir, min_count) -> bool
     *
     * Counts directory entries whose name does not start with '.'.
     * Returns False as soon as `min_count` entries have been seen,
     * True when the directory holds fewer.
     * NOTE(review): the docstring says "more than min_count", but the
     * trigger is count >= min_count (i.e. at least min_count) — confirm
     * the intended bound.
     */
    DIR *dir;
    const char *dirname;
    long min_count, count = 0;
    struct dirent *ent;
    if (!PyArg_ParseTuple(args, "sl", &dirname, &min_count))
    {
        return NULL;
    }
    dir = opendir(dirname);
    if (dir == NULL)
    {
        /* The original passed NULL straight to readdir() and crashed on
         * a missing/unreadable directory; raise OSError from errno. */
        return PyErr_SetFromErrnoWithFilename(PyExc_OSError, dirname);
    }
    while ((ent = readdir(dir)))
        if (ent->d_name[0] != '.') {
            ++count;
            if (count >= min_count) {
                closedir(dir);
                Py_RETURN_FALSE;
            }
        }
    closedir(dir);
    Py_RETURN_TRUE;
}
/* Docstring: fixes the original's "countains" typo and restores the
 * newline escape that had been mangled into a literal trailing "n". */
static char dircnt_docs[] = "dircnt(dir, min_count): Returns False if dir contains more than min_count files.\n";
static PyMethodDef dircnt_methods[] = {
    {"dircnt", (PyCFunction)method_dircnt, METH_VARARGS, dircnt_docs},
    {NULL, NULL, 0, NULL}  /* sentinel terminating the method table */
};
static struct PyModuleDef dircnt_module_def =
{
    PyModuleDef_HEAD_INIT,
    "dircnt",                                       /* module name */
    "Check if there are more than N files in dir",  /* module docstring */
    -1,                                             /* no per-module state */
    dircnt_methods
};
PyMODINIT_FUNC PyInit_dircnt(void){
    /* No Py_Initialize() needed: the interpreter is already running when
     * an extension module's init function is invoked, so the original's
     * commented-out call has been dropped. */
    return PyModule_Create(&dircnt_module_def);
}
build:
gcc -I /usr/include/python3.11 dircnt.c -v -shared -fPIC -o dircnt.so
(or wherever your headers from the python-dev package are)
usage:
from dircnt import dircnt
dircnt(path, min_count)
It is a fair bit faster especially for higher min_count
values:
You can use the os.scandir
function instead. For example, to check whether a folder contains more than 2 files, it stops iterating over directory entries as soon as a third file is seen and returns positively:
import os
def has_more_than_two_files(path):
    """Return True as soon as *path* is known to hold more than two regular files."""
    seen = 0
    with os.scandir(path) as entries:
        for entry in entries:
            if not entry.is_file():
                continue
            seen += 1
            if seen > 2:
                return True
    return False
I am looking for a very rapid way to check whether a folder contains more than 2 files.
I worry that len(os.listdir('/path/')) > 2
may become very slow if there are a lot of files in /path/, especially since this function will be called frequently by multiple processes at a time.
There is indeed another function introduced by PEP471 : os.scandir(path)
As it returns a generator, no list will be created and the worst-case scenario (a huge directory) will still be lightweight.
Its higher level interface os.walk(path)
will allow you to go through a directory without having to list all of it.
Here is a code example for your specific case :
import os

# Stop scanning the directory as soon as MINIMUM_SIZE regular files have
# been seen; enough_files records whether we got that far.
# NOTE(review): this tests for *at least* MINIMUM_SIZE files, while the
# question asks for "more than 2" — confirm which bound is intended.
MINIMUM_SIZE = 2
file_count = 0
for entry in os.scandir('.'):
    if not entry.is_file():
        continue
    file_count += 1
    if file_count == MINIMUM_SIZE:
        break
enough_files = file_count == MINIMUM_SIZE
If you want something more explicit using pathlib, you can try:
from pathlib import Path

# Walk the directory lazily and stop as soon as two regular files have
# been seen; enough_files is the final verdict that gets printed.
directory_path = Path('/path/').resolve()
files_only = (p for p in directory_path.glob("*") if p.is_file())
nb_files = 0
enough_files = False
for _ in files_only:
    nb_files += 1
    if nb_files >= 2:
        enough_files = True
        break
print(enough_files)
To get the fastest it’s probably something hacky.
My guess was:
def iterdir_approach(path):
    """Return True when *path* holds at least three regular files.

    Pulls three entries from a lazy generator; running out of entries
    raises StopIteration, which means "not enough files".
    """
    # Path objects expose is_file(); the original's x.isfile() raised
    # AttributeError, and the bare `except:` silently turned that into
    # False for every non-empty directory.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        next(iter_of_files)
        next(iter_of_files)
        next(iter_of_files)
        return True
    except StopIteration:
        return False
We create a generator and pull three entries from it, catching the StopIteration that is raised when the directory holds fewer files than that.
To profile the approaches we create a bunch of directories with a bunch of files in them :
import shutil
import tempfile
import timeit
import matplotlib.pyplot as plt
from pathlib import Path
def create_temp_directory(num_directories):
    """Build a scratch tree with *num_directories* sub-folders, each holding
    a random number of small text files, and return the root path."""
    root = tempfile.mkdtemp()
    for idx in range(num_directories):
        subdir = os.path.join(root, f"subdir_{idx}")
        os.makedirs(subdir)
        # Up to `idx` files per sub-folder, so file counts vary by folder.
        for file_idx in range(random.randint(0, idx)):
            target = os.path.join(subdir, f"file_{file_idx}.txt")
            with open(target, 'w') as handle:
                handle.write("Sample content")
    return root
We define the various approaches (the other two were copied from the answers to the question):
def iterdir_approach(path):
    #@swozny
    """Return True when *path* holds at least three regular files."""
    # Fixed: Path has is_file(), not isfile(); the original's bare
    # `except:` hid the resulting AttributeError and always returned
    # False.  Only StopIteration (too few files) is expected here.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        next(iter_of_files)
        next(iter_of_files)
        next(iter_of_files)
        return True
    except StopIteration:
        return False
def len_os_dir_approach(path):
    #@bluppfisk
    """True when *path* holds more than two entries (files or sub-dirs)."""
    entries = os.listdir(path)
    return len(entries) > 2
def check_files_os_scandir_approach(path):
    #@PoneyUHC
    """Return True once *path* is known to hold MINIMUM_SIZE (3) regular files."""
    MINIMUM_SIZE = 3
    file_count = 0
    # `with` closes the scandir iterator on every exit path; the original
    # leaked the OS directory handle whenever it returned early.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file():
                file_count += 1
                if file_count == MINIMUM_SIZE:
                    return True
    return False
def path_resolve_approach(path):
    #@matleg
    """Return True once more than two regular files are seen in *path*."""
    resolved = Path(path).resolve()
    seen = 0
    for candidate in resolved.glob("*"):
        if not candidate.is_file():
            continue
        seen += 1
        if seen > 2:
            return True
    return False
def dilettant_approach(path):
    #@dilettant
    """Return True when *path* holds at least 3 entries.

    Counts every directory entry, not just files — the answer assumes the
    folder contains only files.
    """
    enough = 3  # At least 2 files
    # `with` closes the scandir handle; the original leaked it.
    with os.scandir(path) as gen:
        # Take at most `enough` entries; if we got that many, the
        # directory holds at least `enough` entries.
        has_enough = len(list(itertools.islice(gen, enough))) >= enough
    return has_enough
def adrian_ang_approach(path):
    #@adrian_ang
    """Return True as soon as more than two regular files are found in *path*."""
    seen = 0
    with os.scandir(path) as items:
        for item in items:
            if not item.is_file():
                continue
            seen += 1
            if seen > 2:
                return True
    return False
Then we profile the code using timeit.timeit
and plot the execution times for various amounts of directories:
num_directories_list = [10, 50, 100, 200, 500, 1000]
approach1_times = []
approach2_times = []
approach3_times = []
approach4_times = []
approach5_times = []
approach6_times = []
for num_directories in num_directories_list:
    # Build ONE scratch tree and benchmark every approach against it.
    # The original called create_temp_directory() twice per iteration and
    # only removed the first tree, leaking the one actually measured.
    temp_dir = create_temp_directory(num_directories)
    subdir_paths = [str(p) for p in Path(temp_dir).iterdir()]
    approach1_time = timeit.timeit(lambda: [iterdir_approach(path) for path in subdir_paths], number=5)
    approach2_time = timeit.timeit(lambda: [check_files_os_scandir_approach(path) for path in subdir_paths], number=5)
    approach3_time = timeit.timeit(lambda: [path_resolve_approach(path) for path in subdir_paths], number=5)
    approach4_time = timeit.timeit(lambda: [len_os_dir_approach(path) for path in subdir_paths], number=5)
    approach5_time = timeit.timeit(lambda: [dilettant_approach(path) for path in subdir_paths], number=5)
    approach6_time = timeit.timeit(lambda: [adrian_ang_approach(path) for path in subdir_paths], number=5)
    approach1_times.append(approach1_time)
    approach2_times.append(approach2_time)
    approach3_times.append(approach3_time)
    approach4_times.append(approach4_time)
    approach5_times.append(approach5_time)
    approach6_times.append(approach6_time)
    shutil.rmtree(temp_dir)
Visualization of the results
# One line per approach so their scaling behaviour can be compared.
for times, label in (
    (approach1_times, 'iterdir_approach'),
    (approach2_times, 'check_files_os_scandir_approach'),
    (approach3_times, 'path_resolve_approach'),
    (approach4_times, 'os_dir_approach'),
    (approach5_times, 'dilettant_approach'),
    (approach6_times, 'adrian_ang_approach'),
):
    plt.plot(num_directories_list, times, label=label)
plt.xlabel('Number of Directories')
plt.ylabel('Execution Time (seconds)')
plt.title('Performance Comparison')
plt.legend()
plt.show()
As the OP knows there are only files within /path/ one optimization is to not test on the file attributes.
This version should be profiting from the prior knowledge / constraints:
import itertools
import os

# Take at most `enough` entries from the scandir iterator; if that many
# came back, the folder holds at least `enough` files.  Because islice
# stops early, huge directories cost no more than small ones.
enough = 2  # At least 2 files
gen = os.scandir('/path/')  # OP states only files in folder /path/
first_few = list(itertools.islice(gen, enough))
has_enough = len(first_few) >= enough
print(has_enough)
Placing this in a shell script and use hyperfine to measure some random performance (folder with 500+ files):
❯ hyperfine ./ssssss.sh
Benchmark #1: ./ssssss.sh
Time (mean ± σ): 77.6 ms ± 0.6 ms [User: 29.9 ms, System: 31.8 ms]
Range (min … max): 76.3 ms … 79.4 ms 36 runs
… and as it should not really matter same system on a folder with more than 100k files:
❯ ls -l |wc -l
100204
~
❯ hyperfine ./ssssss.sh
Benchmark #1: ./ssssss.sh
Time (mean ± σ): 79.6 ms ± 1.1 ms [User: 31.9 ms, System: 33.5 ms]
Range (min … max): 76.8 ms … 82.1 ms 35 runs
for anyone wanting to try the C approach, here’s a module you can import from Python (only does files, not subdirs)
#define PY_SSIZE_T_CLEAN
#include <stdio.h>
#include <dirent.h>
#include <stdlib.h>
#include <Python.h>
static PyObject *
method_dircnt(PyObject *self, PyObject *args)
{
    /* dircnt(dir, min_count) -> bool
     *
     * Counts directory entries whose name does not start with '.'.
     * Returns False as soon as `min_count` entries have been seen,
     * True when the directory holds fewer.
     * NOTE(review): the docstring says "more than min_count", but the
     * trigger is count >= min_count (i.e. at least min_count) — confirm
     * the intended bound.
     */
    DIR *dir;
    const char *dirname;
    long min_count, count = 0;
    struct dirent *ent;
    if (!PyArg_ParseTuple(args, "sl", &dirname, &min_count))
    {
        return NULL;
    }
    dir = opendir(dirname);
    if (dir == NULL)
    {
        /* The original passed NULL straight to readdir() and crashed on
         * a missing/unreadable directory; raise OSError from errno. */
        return PyErr_SetFromErrnoWithFilename(PyExc_OSError, dirname);
    }
    while ((ent = readdir(dir)))
        if (ent->d_name[0] != '.') {
            ++count;
            if (count >= min_count) {
                closedir(dir);
                Py_RETURN_FALSE;
            }
        }
    closedir(dir);
    Py_RETURN_TRUE;
}
/* Docstring: fixes the original's "countains" typo and restores the
 * newline escape that had been mangled into a literal trailing "n". */
static char dircnt_docs[] = "dircnt(dir, min_count): Returns False if dir contains more than min_count files.\n";
static PyMethodDef dircnt_methods[] = {
    {"dircnt", (PyCFunction)method_dircnt, METH_VARARGS, dircnt_docs},
    {NULL, NULL, 0, NULL}  /* sentinel terminating the method table */
};
static struct PyModuleDef dircnt_module_def =
{
    PyModuleDef_HEAD_INIT,
    "dircnt",                                       /* module name */
    "Check if there are more than N files in dir",  /* module docstring */
    -1,                                             /* no per-module state */
    dircnt_methods
};
PyMODINIT_FUNC PyInit_dircnt(void){
    /* No Py_Initialize() needed: the interpreter is already running when
     * an extension module's init function is invoked, so the original's
     * commented-out call has been dropped. */
    return PyModule_Create(&dircnt_module_def);
}
build:
gcc -I /usr/include/python3.11 dircnt.c -v -shared -fPIC -o dircnt.so
(or wherever your headers from the python-dev package are)
usage:
from dircnt import dircnt
dircnt(path, min_count)
It is a fair bit faster especially for higher min_count
values:
You can use the os.scandir
function instead. For example, to check whether a folder contains more than 2 files, it stops iterating over directory entries as soon as a third file is seen and returns positively:
import os
def has_more_than_two_files(path):
    """Return True as soon as *path* is known to hold more than two regular files."""
    seen = 0
    with os.scandir(path) as entries:
        for entry in entries:
            if not entry.is_file():
                continue
            seen += 1
            if seen > 2:
                return True
    return False