Python: fastest way of checking if there are more than x files in a folder
Question:
I am looking for a very rapid way to check whether a folder contains more than 2 files.
I worry that len(os.listdir('/path/')) > 2
may become very slow if there are a lot of files in /path/, especially since this function will be called frequently by multiple processes at a time.
Answers:
There is indeed another function introduced by PEP471 : os.scandir(path)
As it returns a generator, no list will be created and the worst-case scenario (a huge directory) will still be lightweight.
Its higher level interface os.walk(path)
will allow you to go through a directory without having to list all of it.
Here is a code example for your specific case :
import os

# Stop scanning the directory as soon as MINIMUM_SIZE regular files have
# been seen; enough_files records whether we got that far.
# NOTE(review): this tests for *at least* MINIMUM_SIZE files, while the
# question asks for "more than 2" — confirm which bound is intended.
MINIMUM_SIZE = 2
file_count = 0
for entry in os.scandir('.'):
    if not entry.is_file():
        continue
    file_count += 1
    if file_count == MINIMUM_SIZE:
        break
enough_files = file_count == MINIMUM_SIZE
If you want something more explicit using pathlib, you can try:
from pathlib import Path

# Walk the directory lazily and stop as soon as two regular files have
# been seen; enough_files is the final verdict that gets printed.
directory_path = Path('/path/').resolve()
files_only = (p for p in directory_path.glob("*") if p.is_file())
nb_files = 0
enough_files = False
for _ in files_only:
    nb_files += 1
    if nb_files >= 2:
        enough_files = True
        break
print(enough_files)
To get the fastest it’s probably something hacky.
My guess was:
def iterdir_approach(path):
    """Return True when *path* holds at least three regular files.

    Pulls three entries from a lazy generator; running out of entries
    raises StopIteration, which means "not enough files".
    """
    # Path objects expose is_file(); the original's x.isfile() raised
    # AttributeError, and the bare `except:` silently turned that into
    # False for every non-empty directory.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        next(iter_of_files)
        next(iter_of_files)
        next(iter_of_files)
        return True
    except StopIteration:
        return False
We create a generator and pull three entries from it, catching the StopIteration that is raised when the directory holds fewer files than that.
To profile the approaches we create a bunch of directories with a bunch of files in them :
import itertools
import os
import random
import shutil
import tempfile
import timeit
from pathlib import Path

import matplotlib.pyplot as plt
def create_temp_directory(num_directories):
    """Build a scratch tree with *num_directories* sub-folders, each holding
    a random number of small text files, and return the root path."""
    root = tempfile.mkdtemp()
    for idx in range(num_directories):
        subdir = os.path.join(root, f"subdir_{idx}")
        os.makedirs(subdir)
        # Up to `idx` files per sub-folder, so file counts vary by folder.
        for file_idx in range(random.randint(0, idx)):
            target = os.path.join(subdir, f"file_{file_idx}.txt")
            with open(target, 'w') as handle:
                handle.write("Sample content")
    return root
We define the various approaches (the other two were copied from the answers to the question):
def iterdir_approach(path):
    #@swozny
    """Return True when *path* holds at least three regular files."""
    # Fixed: Path has is_file(), not isfile(); the original's bare
    # `except:` hid the resulting AttributeError and always returned
    # False.  Only StopIteration (too few files) is expected here.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        next(iter_of_files)
        next(iter_of_files)
        next(iter_of_files)
        return True
    except StopIteration:
        return False
def len_os_dir_approach(path):
    #@bluppfisk
    """True when *path* holds more than two entries (files or sub-dirs)."""
    entries = os.listdir(path)
    return len(entries) > 2
def check_files_os_scandir_approach(path):
    #@PoneyUHC
    """Return True once *path* is known to hold MINIMUM_SIZE (3) regular files."""
    MINIMUM_SIZE = 3
    file_count = 0
    # `with` closes the scandir iterator on every exit path; the original
    # leaked the OS directory handle whenever it returned early.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file():
                file_count += 1
                if file_count == MINIMUM_SIZE:
                    return True
    return False
def path_resolve_approach(path):
    #@matleg
    """Return True once more than two regular files are seen in *path*."""
    resolved = Path(path).resolve()
    seen = 0
    for candidate in resolved.glob("*"):
        if not candidate.is_file():
            continue
        seen += 1
        if seen > 2:
            return True
    return False
def dilettant_approach(path):
    #@dilettant
    """Return True when *path* holds at least 3 entries.

    Counts every directory entry, not just files — the answer assumes the
    folder contains only files.
    """
    enough = 3  # At least 2 files
    # `with` closes the scandir handle; the original leaked it.
    with os.scandir(path) as gen:
        # Take at most `enough` entries; if we got that many, the
        # directory holds at least `enough` entries.
        has_enough = len(list(itertools.islice(gen, enough))) >= enough
    return has_enough
def adrian_ang_approach(path):
    #@adrian_ang
    """Return True as soon as more than two regular files are found in *path*."""
    seen = 0
    with os.scandir(path) as items:
        for item in items:
            if not item.is_file():
                continue
            seen += 1
            if seen > 2:
                return True
    return False
Then we profile the code using timeit.timeit
and plot the execution times for various amounts of directories:
num_directories_list = [10, 50, 100, 200, 500, 1000]
approach1_times = []
approach2_times = []
approach3_times = []
approach4_times = []
approach5_times = []
approach6_times = []
for num_directories in num_directories_list:
    # Build ONE scratch tree and benchmark every approach against it.
    # The original called create_temp_directory() twice per iteration and
    # only removed the first tree, leaking the one actually measured.
    temp_dir = create_temp_directory(num_directories)
    subdir_paths = [str(p) for p in Path(temp_dir).iterdir()]
    approach1_time = timeit.timeit(lambda: [iterdir_approach(path) for path in subdir_paths], number=5)
    approach2_time = timeit.timeit(lambda: [check_files_os_scandir_approach(path) for path in subdir_paths], number=5)
    approach3_time = timeit.timeit(lambda: [path_resolve_approach(path) for path in subdir_paths], number=5)
    approach4_time = timeit.timeit(lambda: [len_os_dir_approach(path) for path in subdir_paths], number=5)
    approach5_time = timeit.timeit(lambda: [dilettant_approach(path) for path in subdir_paths], number=5)
    approach6_time = timeit.timeit(lambda: [adrian_ang_approach(path) for path in subdir_paths], number=5)
    approach1_times.append(approach1_time)
    approach2_times.append(approach2_time)
    approach3_times.append(approach3_time)
    approach4_times.append(approach4_time)
    approach5_times.append(approach5_time)
    approach6_times.append(approach6_time)
    shutil.rmtree(temp_dir)
Visualization of the results
# One line per approach so their scaling behaviour can be compared.
for times, label in (
    (approach1_times, 'iterdir_approach'),
    (approach2_times, 'check_files_os_scandir_approach'),
    (approach3_times, 'path_resolve_approach'),
    (approach4_times, 'os_dir_approach'),
    (approach5_times, 'dilettant_approach'),
    (approach6_times, 'adrian_ang_approach'),
):
    plt.plot(num_directories_list, times, label=label)
plt.xlabel('Number of Directories')
plt.ylabel('Execution Time (seconds)')
plt.title('Performance Comparison')
plt.legend()
plt.show()
As the OP knows there are only files within /path/ one optimization is to not test on the file attributes.
This version should be profiting from the prior knowledge / constraints:
import itertools
import os

# Take at most `enough` entries from the scandir iterator; if that many
# came back, the folder holds at least `enough` files.  Because islice
# stops early, huge directories cost no more than small ones.
enough = 2  # At least 2 files
gen = os.scandir('/path/')  # OP states only files in folder /path/
first_few = list(itertools.islice(gen, enough))
has_enough = len(first_few) >= enough
print(has_enough)
Placing this in a shell script and use hyperfine to measure some random performance (folder with 500+ files):
❯ hyperfine ./ssssss.sh
Benchmark #1: ./ssssss.sh
Time (mean ± σ): 77.6 ms ± 0.6 ms [User: 29.9 ms, System: 31.8 ms]
Range (min … max): 76.3 ms … 79.4 ms 36 runs
… and as it should not really matter same system on a folder with more than 100k files:
❯ ls -l |wc -l
100204
~
❯ hyperfine ./ssssss.sh
Benchmark #1: ./ssssss.sh
Time (mean ± σ): 79.6 ms ± 1.1 ms [User: 31.9 ms, System: 33.5 ms]
Range (min … max): 76.8 ms … 82.1 ms 35 runs
for anyone wanting to try the C approach, here’s a module you can import from Python (only does files, not subdirs)
#define PY_SSIZE_T_CLEAN
#include <stdio.h>
#include <dirent.h>
#include <stdlib.h>
#include <Python.h>
static PyObject *
method_dircnt(PyObject *self, PyObject *args)
{
    /* dircnt(dir, min_count) -> bool
     *
     * Counts directory entries whose name does not start with '.'.
     * Returns False as soon as `min_count` entries have been seen,
     * True when the directory holds fewer.
     * NOTE(review): the docstring says "more than min_count", but the
     * trigger is count >= min_count (i.e. at least min_count) — confirm
     * the intended bound.
     */
    DIR *dir;
    const char *dirname;
    long min_count, count = 0;
    struct dirent *ent;
    if (!PyArg_ParseTuple(args, "sl", &dirname, &min_count))
    {
        return NULL;
    }
    dir = opendir(dirname);
    if (dir == NULL)
    {
        /* The original passed NULL straight to readdir() and crashed on
         * a missing/unreadable directory; raise OSError from errno. */
        return PyErr_SetFromErrnoWithFilename(PyExc_OSError, dirname);
    }
    while ((ent = readdir(dir)))
        if (ent->d_name[0] != '.') {
            ++count;
            if (count >= min_count) {
                closedir(dir);
                Py_RETURN_FALSE;
            }
        }
    closedir(dir);
    Py_RETURN_TRUE;
}
/* Docstring: fixes the original's "countains" typo and restores the
 * newline escape that had been mangled into a literal trailing "n". */
static char dircnt_docs[] = "dircnt(dir, min_count): Returns False if dir contains more than min_count files.\n";
static PyMethodDef dircnt_methods[] = {
    {"dircnt", (PyCFunction)method_dircnt, METH_VARARGS, dircnt_docs},
    {NULL, NULL, 0, NULL}  /* sentinel terminating the method table */
};
static struct PyModuleDef dircnt_module_def =
{
    PyModuleDef_HEAD_INIT,
    "dircnt",                                       /* module name */
    "Check if there are more than N files in dir",  /* module docstring */
    -1,                                             /* no per-module state */
    dircnt_methods
};
PyMODINIT_FUNC PyInit_dircnt(void){
    /* No Py_Initialize() needed: the interpreter is already running when
     * an extension module's init function is invoked, so the original's
     * commented-out call has been dropped. */
    return PyModule_Create(&dircnt_module_def);
}
build:
gcc -I /usr/include/python3.11 dircnt.c -v -shared -fPIC -o dircnt.so
(or wherever your headers from the python-dev package are)
usage:
from dircnt import dircnt
dircnt(path, min_count)
It is a fair bit faster especially for higher min_count
values:
You can use the os.scandir
function instead. For example, to check whether a folder contains more than 2 files, it stops iterating over directory entries as soon as a third file is seen and returns positively:
import os
def has_more_than_two_files(path):
    """Return True as soon as *path* is known to hold more than two regular files."""
    seen = 0
    with os.scandir(path) as entries:
        for entry in entries:
            if not entry.is_file():
                continue
            seen += 1
            if seen > 2:
                return True
    return False
I am looking for a very rapid way to check whether a folder contains more than 2 files.
I worry that len(os.listdir('/path/')) > 2
may become very slow if there are a lot of files in /path/, especially since this function will be called frequently by multiple processes at a time.
There is indeed another function introduced by PEP471 : os.scandir(path)
As it returns a generator, no list will be created and the worst-case scenario (a huge directory) will still be lightweight.
Its higher level interface os.walk(path)
will allow you to go through a directory without having to list all of it.
Here is a code example for your specific case :
import os

# Stop scanning the directory as soon as MINIMUM_SIZE regular files have
# been seen; enough_files records whether we got that far.
# NOTE(review): this tests for *at least* MINIMUM_SIZE files, while the
# question asks for "more than 2" — confirm which bound is intended.
MINIMUM_SIZE = 2
file_count = 0
for entry in os.scandir('.'):
    if not entry.is_file():
        continue
    file_count += 1
    if file_count == MINIMUM_SIZE:
        break
enough_files = file_count == MINIMUM_SIZE
If you want something more explicit using pathlib, you can try:
from pathlib import Path

# Walk the directory lazily and stop as soon as two regular files have
# been seen; enough_files is the final verdict that gets printed.
directory_path = Path('/path/').resolve()
files_only = (p for p in directory_path.glob("*") if p.is_file())
nb_files = 0
enough_files = False
for _ in files_only:
    nb_files += 1
    if nb_files >= 2:
        enough_files = True
        break
print(enough_files)
To get the fastest it’s probably something hacky.
My guess was:
def iterdir_approach(path):
    """Return True when *path* holds at least three regular files.

    Pulls three entries from a lazy generator; running out of entries
    raises StopIteration, which means "not enough files".
    """
    # Path objects expose is_file(); the original's x.isfile() raised
    # AttributeError, and the bare `except:` silently turned that into
    # False for every non-empty directory.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        next(iter_of_files)
        next(iter_of_files)
        next(iter_of_files)
        return True
    except StopIteration:
        return False
We create a generator and pull three entries from it, catching the StopIteration that is raised when the directory holds fewer files than that.
To profile the approaches we create a bunch of directories with a bunch of files in them :
import shutil
import tempfile
import timeit
import matplotlib.pyplot as plt
from pathlib import Path
def create_temp_directory(num_directories):
    """Build a scratch tree with *num_directories* sub-folders, each holding
    a random number of small text files, and return the root path."""
    root = tempfile.mkdtemp()
    for idx in range(num_directories):
        subdir = os.path.join(root, f"subdir_{idx}")
        os.makedirs(subdir)
        # Up to `idx` files per sub-folder, so file counts vary by folder.
        for file_idx in range(random.randint(0, idx)):
            target = os.path.join(subdir, f"file_{file_idx}.txt")
            with open(target, 'w') as handle:
                handle.write("Sample content")
    return root
We define the various approaches (the other two were copied from the answers to the question):
def iterdir_approach(path):
    #@swozny
    """Return True when *path* holds at least three regular files."""
    # Fixed: Path has is_file(), not isfile(); the original's bare
    # `except:` hid the resulting AttributeError and always returned
    # False.  Only StopIteration (too few files) is expected here.
    iter_of_files = (x for x in Path(path).iterdir() if x.is_file())
    try:
        next(iter_of_files)
        next(iter_of_files)
        next(iter_of_files)
        return True
    except StopIteration:
        return False
def len_os_dir_approach(path):
    #@bluppfisk
    """True when *path* holds more than two entries (files or sub-dirs)."""
    entries = os.listdir(path)
    return len(entries) > 2
def check_files_os_scandir_approach(path):
    #@PoneyUHC
    """Return True once *path* is known to hold MINIMUM_SIZE (3) regular files."""
    MINIMUM_SIZE = 3
    file_count = 0
    # `with` closes the scandir iterator on every exit path; the original
    # leaked the OS directory handle whenever it returned early.
    with os.scandir(path) as entries:
        for entry in entries:
            if entry.is_file():
                file_count += 1
                if file_count == MINIMUM_SIZE:
                    return True
    return False
def path_resolve_approach(path):
    #@matleg
    """Return True once more than two regular files are seen in *path*."""
    resolved = Path(path).resolve()
    seen = 0
    for candidate in resolved.glob("*"):
        if not candidate.is_file():
            continue
        seen += 1
        if seen > 2:
            return True
    return False
def dilettant_approach(path):
    #@dilettant
    """Return True when *path* holds at least 3 entries.

    Counts every directory entry, not just files — the answer assumes the
    folder contains only files.
    """
    enough = 3  # At least 2 files
    # `with` closes the scandir handle; the original leaked it.
    with os.scandir(path) as gen:
        # Take at most `enough` entries; if we got that many, the
        # directory holds at least `enough` entries.
        has_enough = len(list(itertools.islice(gen, enough))) >= enough
    return has_enough
def adrian_ang_approach(path):
    #@adrian_ang
    """Return True as soon as more than two regular files are found in *path*."""
    seen = 0
    with os.scandir(path) as items:
        for item in items:
            if not item.is_file():
                continue
            seen += 1
            if seen > 2:
                return True
    return False
Then we profile the code using timeit.timeit
and plot the execution times for various amounts of directories:
num_directories_list = [10, 50, 100, 200, 500, 1000]
approach1_times = []
approach2_times = []
approach3_times = []
approach4_times = []
approach5_times = []
approach6_times = []
for num_directories in num_directories_list:
    # Build ONE scratch tree and benchmark every approach against it.
    # The original called create_temp_directory() twice per iteration and
    # only removed the first tree, leaking the one actually measured.
    temp_dir = create_temp_directory(num_directories)
    subdir_paths = [str(p) for p in Path(temp_dir).iterdir()]
    approach1_time = timeit.timeit(lambda: [iterdir_approach(path) for path in subdir_paths], number=5)
    approach2_time = timeit.timeit(lambda: [check_files_os_scandir_approach(path) for path in subdir_paths], number=5)
    approach3_time = timeit.timeit(lambda: [path_resolve_approach(path) for path in subdir_paths], number=5)
    approach4_time = timeit.timeit(lambda: [len_os_dir_approach(path) for path in subdir_paths], number=5)
    approach5_time = timeit.timeit(lambda: [dilettant_approach(path) for path in subdir_paths], number=5)
    approach6_time = timeit.timeit(lambda: [adrian_ang_approach(path) for path in subdir_paths], number=5)
    approach1_times.append(approach1_time)
    approach2_times.append(approach2_time)
    approach3_times.append(approach3_time)
    approach4_times.append(approach4_time)
    approach5_times.append(approach5_time)
    approach6_times.append(approach6_time)
    shutil.rmtree(temp_dir)
Visualization of the results
# One line per approach so their scaling behaviour can be compared.
for times, label in (
    (approach1_times, 'iterdir_approach'),
    (approach2_times, 'check_files_os_scandir_approach'),
    (approach3_times, 'path_resolve_approach'),
    (approach4_times, 'os_dir_approach'),
    (approach5_times, 'dilettant_approach'),
    (approach6_times, 'adrian_ang_approach'),
):
    plt.plot(num_directories_list, times, label=label)
plt.xlabel('Number of Directories')
plt.ylabel('Execution Time (seconds)')
plt.title('Performance Comparison')
plt.legend()
plt.show()
As the OP knows there are only files within /path/ one optimization is to not test on the file attributes.
This version should be profiting from the prior knowledge / constraints:
import itertools
import os

# Take at most `enough` entries from the scandir iterator; if that many
# came back, the folder holds at least `enough` files.  Because islice
# stops early, huge directories cost no more than small ones.
enough = 2  # At least 2 files
gen = os.scandir('/path/')  # OP states only files in folder /path/
first_few = list(itertools.islice(gen, enough))
has_enough = len(first_few) >= enough
print(has_enough)
Placing this in a shell script and use hyperfine to measure some random performance (folder with 500+ files):
❯ hyperfine ./ssssss.sh
Benchmark #1: ./ssssss.sh
Time (mean ± σ): 77.6 ms ± 0.6 ms [User: 29.9 ms, System: 31.8 ms]
Range (min … max): 76.3 ms … 79.4 ms 36 runs
… and as it should not really matter same system on a folder with more than 100k files:
❯ ls -l |wc -l
100204
~
❯ hyperfine ./ssssss.sh
Benchmark #1: ./ssssss.sh
Time (mean ± σ): 79.6 ms ± 1.1 ms [User: 31.9 ms, System: 33.5 ms]
Range (min … max): 76.8 ms … 82.1 ms 35 runs
for anyone wanting to try the C approach, here’s a module you can import from Python (only does files, not subdirs)
#define PY_SSIZE_T_CLEAN
#include <stdio.h>
#include <dirent.h>
#include <stdlib.h>
#include <Python.h>
static PyObject *
method_dircnt(PyObject *self, PyObject *args)
{
    /* dircnt(dir, min_count) -> bool
     *
     * Counts directory entries whose name does not start with '.'.
     * Returns False as soon as `min_count` entries have been seen,
     * True when the directory holds fewer.
     * NOTE(review): the docstring says "more than min_count", but the
     * trigger is count >= min_count (i.e. at least min_count) — confirm
     * the intended bound.
     */
    DIR *dir;
    const char *dirname;
    long min_count, count = 0;
    struct dirent *ent;
    if (!PyArg_ParseTuple(args, "sl", &dirname, &min_count))
    {
        return NULL;
    }
    dir = opendir(dirname);
    if (dir == NULL)
    {
        /* The original passed NULL straight to readdir() and crashed on
         * a missing/unreadable directory; raise OSError from errno. */
        return PyErr_SetFromErrnoWithFilename(PyExc_OSError, dirname);
    }
    while ((ent = readdir(dir)))
        if (ent->d_name[0] != '.') {
            ++count;
            if (count >= min_count) {
                closedir(dir);
                Py_RETURN_FALSE;
            }
        }
    closedir(dir);
    Py_RETURN_TRUE;
}
/* Docstring: fixes the original's "countains" typo and restores the
 * newline escape that had been mangled into a literal trailing "n". */
static char dircnt_docs[] = "dircnt(dir, min_count): Returns False if dir contains more than min_count files.\n";
static PyMethodDef dircnt_methods[] = {
    {"dircnt", (PyCFunction)method_dircnt, METH_VARARGS, dircnt_docs},
    {NULL, NULL, 0, NULL}  /* sentinel terminating the method table */
};
static struct PyModuleDef dircnt_module_def =
{
    PyModuleDef_HEAD_INIT,
    "dircnt",                                       /* module name */
    "Check if there are more than N files in dir",  /* module docstring */
    -1,                                             /* no per-module state */
    dircnt_methods
};
PyMODINIT_FUNC PyInit_dircnt(void){
    /* No Py_Initialize() needed: the interpreter is already running when
     * an extension module's init function is invoked, so the original's
     * commented-out call has been dropped. */
    return PyModule_Create(&dircnt_module_def);
}
build:
gcc -I /usr/include/python3.11 dircnt.c -v -shared -fPIC -o dircnt.so
(or wherever your headers from the python-dev package are)
usage:
from dircnt import dircnt
dircnt(path, min_count)
It is a fair bit faster especially for higher min_count
values:
You can use the os.scandir
function instead. For example, to check whether a folder contains more than 2 files, it stops iterating over directory entries as soon as a third file is seen and returns positively:
import os
def has_more_than_two_files(path):
    """Return True as soon as *path* is known to hold more than two regular files."""
    seen = 0
    with os.scandir(path) as entries:
        for entry in entries:
            if not entry.is_file():
                continue
            seen += 1
            if seen > 2:
                return True
    return False