Find all files matching given path format with datetime
Question:
I’m looking to identify all files whose path matches a given format (for example %Y/%j/data_%d.txt
) and retrieve the associated date.
A simple solution is to use strptime, but this doesn’t work if a ‘format code’ appears twice in the format (e.g. %Y/%j/data_%Y%m%d.txt
). This is a strptime limitation linked to the use of re, which has long been documented (https://github.com/python/cpython/issues/48680).
Do you have any idea how I can deal with this?
I’ve long used this (somewhat complicated) piece of code to handle cases where a ‘format code’ appears twice but in two different levels of the tree (folder, file …):
def get_path(path_format: List[str], path: str = "") -> List[str]:
    """Return every existing path matching a datetime-formatted pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is taken literally and followed only if it exists; a component
    containing ``%`` is matched against every entry of the directory reached
    so far with ``datetime.datetime.strptime``.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param path: directory to explore ("" means the current directory)
    :return: List of all matching path, in ``os.listdir`` order
    """
    if not path_format:
        return [path]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        # Literal component: descend only if it is really there.
        path = os.path.join(path, current_format)
        return get_path(remaining, path) if os.path.exists(path) else []

    # os.listdir("") raises FileNotFoundError, so an empty prefix is listed as
    # the current directory; results are still joined onto the original
    # (possibly empty) prefix so they keep their relative form.
    directory = path or os.curdir
    if not os.path.isdir(directory):
        # A plain file whose name matched the previous component cannot be
        # descended into; treat it as "no match" instead of crashing.
        return []

    out = []
    for content in os.listdir(directory):
        try:
            datetime.datetime.strptime(content, current_format)
        except ValueError:
            # Entry name does not fit the format: not a candidate.
            continue
        out += get_path(remaining, os.path.join(path, content))
    return out
$ tree data/2017/
data/2017/
├── 001
│ └── data_20170101.txt
└── 100
└── data_20170407.txt
>>> p = 'data/%Y/%j/data_%Y%m%d.txt'
>>> p = p.split('/')
>>> get_path(p)
['data/2017/100/data_20170407.txt', 'data/2017/001/data_20170101.txt']
However, I need to manage new data where a ‘format code’ appears twice in the same folder or file name (for example: data/%Y/%j/data_%Y%m%dT0000_%Y%m%dT9000.txt).
Edit: I found a workaround during the weekend, the solution is not ideal but will do the job until I find a better one. I just split the file name in multiple parts using a separator character (in my case _
). Here is the code:
def merge_two_dates(d1, d2):
    """Return a copy of ``d1`` holding the field-wise maximum of both dates.

    Each of year, month, day, hour, minute, second and microsecond is taken
    as the larger of the two inputs' values; everything else (e.g. tzinfo)
    comes from ``d1`` because the result is built with ``d1.replace``.

    :param d1: datetime used as the base of the result
    :param d2: datetime whose fields are compared against ``d1``
    :return: the merged datetime
    """
    field_names = ("year", "month", "day", "hour", "minute", "second", "microsecond")
    largest = {name: max(getattr(d1, name), getattr(d2, name)) for name in field_names}
    return d1.replace(**largest)
def merge_dates(dates):
    """Fold ``dates`` into one datetime using ``merge_two_dates``.

    The fold starts from ``datetime.datetime.min``, which is therefore the
    result for an empty input.

    :param dates: iterable of datetimes to merge
    :return: the merged datetime
    """
    merged = datetime.datetime.min
    for current in dates:
        # The new element is passed first, the running result second.
        merged = merge_two_dates(current, merged)
    return merged
def dates_from_path_rec(path_format: List[str], split_c: str = "", path: str = "") -> List[datetime.datetime]:
    """Return the date of every path matching a datetime-formatted pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is followed literally. A component containing ``%`` is matched
    against each entry of the directory reached so far; when ``split_c`` is
    given, both the format and the entry name are split on it and every piece
    is parsed separately, which allows a format code to be repeated inside
    one name.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param split_c: If two dates are in the same directory / file name, split_c will be used to split this name.
    :param path: directory to explore ("" means the current directory)
    :return: List of date of all matching path
    """
    if not path_format:
        # Neutral element for merge_two_dates: every real field is >= min.
        return [datetime.datetime.min]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        path = os.path.join(path, current_format)
        if os.path.exists(path):
            return dates_from_path_rec(remaining, split_c, path)
        return []

    # os.listdir("") raises FileNotFoundError, and listing a plain file raises
    # NotADirectoryError; neither situation can yield a match.
    directory = path or os.curdir
    if not os.path.isdir(directory):
        return []

    split_format = current_format.split(split_c) if split_c else [current_format]
    out = []
    for content in os.listdir(directory):
        split_content = content.split(split_c) if split_c else [content]
        if len(split_format) != len(split_content):
            continue
        try:
            dates = [datetime.datetime.strptime(sc, sf)
                     for sc, sf in zip(split_content, split_format) if '%' in sf]
            # Merging is field-wise, so it can build an impossible date
            # (e.g. month 02 with day 31) and raise ValueError as well.
            date = merge_dates(dates)
        except ValueError:
            continue
        # Round-trip check: rejects names whose pieces disagree with each other.
        if date.strftime(current_format) != content:
            continue
        for d in dates_from_path_rec(remaining, split_c, os.path.join(path, content)):
            try:
                out.append(merge_two_dates(date, d))
            except ValueError:
                # This level and a deeper one combine into an invalid date.
                continue
    return out
Answers:
I’ve finally found a solution to my problem.
The regex package extends the capabilities of the re library, and in particular handles cases of group name
redefinition. By copying and modifying the _strptime.py file from python, I can obtain a function that accepts formats with group name redefinition.
My custom strptime.py file (a modified copy of _strptime.py):
from regex import compile as regex_compile
...
class TimeRE(dict):
...
def compile(self, format):
    """Return a compiled pattern object for the format string.

    The pattern text comes from ``self.pattern(format)`` and is compiled
    with ``regex_compile`` using the ``IGNORECASE`` flag.
    """
    pattern_source = self.pattern(format)
    return regex_compile(pattern_source, IGNORECASE)
...
def str_to_time(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a time struct based on the input string and the format string.

    :param data_string: text to parse
    :param format: strptime-style format describing ``data_string``
    :return: ``time.struct_time`` built from the first element returned by ``_strptime``
    """
    parsed = _strptime(data_string, format)
    fields = parsed[0]
    # Keep only as many leading items as struct_time accepts.
    item_count = time._STRUCT_TM_ITEMS
    return time.struct_time(fields[:item_count])
def str_to_cls(cls, data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a class cls instance based on the input string and the format string.

    ``cls`` is called with the first six parsed fields followed by the
    fraction, plus a timezone object when ``_strptime`` reported a UTC offset.

    :param cls: callable used to build the result
    :param data_string: text to parse
    :param format: strptime-style format describing ``data_string``
    :return: whatever ``cls`` returns for the parsed fields
    """
    tt, fraction, gmtoff_fraction = _strptime(data_string, format)
    tzname, gmtoff = tt[-2:]
    args = tt[:6] + (fraction,)

    if gmtoff is None:
        # No offset parsed: build the instance without timezone information.
        return cls(*args)

    tzdelta = datetime_timedelta(seconds=gmtoff, microseconds=gmtoff_fraction)
    # Only pass a name to the timezone when one was parsed.
    tz = datetime_timezone(tzdelta, tzname) if tzname else datetime_timezone(tzdelta)
    return cls(*(args + (tz,)))
I can then use these functions to rewrite my function dates_from_path():
import strptime
def matching_paths(path_format: List[str], path: str = "") -> List[str]:
    """Return every existing path matching a datetime-formatted pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is taken literally and followed only if it exists; a component
    containing ``%`` is matched against every entry of the directory reached
    so far with ``strptime.str_to_time``.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param path: directory to explore ("" means the current directory)
    :return: List of all matching path, in ``os.listdir`` order
    """
    if not path_format:
        return [path]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        # Literal component: descend only if it is really there.
        path = os.path.join(path, current_format)
        return matching_paths(remaining, path) if os.path.exists(path) else []

    # os.listdir("") raises FileNotFoundError, so an empty prefix is listed as
    # the current directory; results are still joined onto the original
    # (possibly empty) prefix so they keep their relative form.
    directory = path or os.curdir
    if not os.path.isdir(directory):
        # A plain file whose name matched the previous component cannot be
        # descended into; treat it as "no match" instead of crashing.
        return []

    out = []
    for content in os.listdir(directory):
        try:
            strptime.str_to_time(content, current_format)
        except ValueError:
            # Entry name does not fit the format: not a candidate.
            continue
        out += matching_paths(remaining, os.path.join(path, content))
    return out
def dates_from_path(fmt: str) -> Set[datetime.datetime]:
    """Return all dates of the files whose path matches ``fmt``.

    :param fmt: input path to explore to find input_object, can contain %Y, %m, %d ...
        May be relative or absolute.
    :return: set of the dates parsed from every matching path
    """
    fmt = os.path.normpath(fmt)

    # Split off the drive and leading separators. Splitting an absolute path
    # on os.sep yields a leading '' component that matching_paths would
    # resolve to the non-existent path "" and so return nothing.
    drive, tail = os.path.splitdrive(fmt)
    relative = tail.lstrip(os.sep)
    root = drive + tail[:len(tail) - len(relative)]

    paths = matching_paths(relative.split(os.sep), root)
    # set of dates (remove redundancy)
    dates = {strptime.str_to_cls(datetime.datetime, p, fmt) for p in paths}
    # Keep only dates that format back to an existing path. The per-component
    # matching cannot see that repeated format codes at different levels agree.
    return {date for date in dates if os.path.exists(date.strftime(fmt))}
I’m looking to identify all files whose path matches a given format (for example %Y/%j/data_%d.txt
) and retrieve the associated date.
A simple solution is to use strptime, but this doesn’t work if a ‘format code’ appears twice in the format (e.g. %Y/%j/data_%Y%m%d.txt
). This is a strptime limitation linked to the use of re, which has long been documented (https://github.com/python/cpython/issues/48680).
Do you have any idea how I can deal with this?
I’ve long used this (somewhat complicated) piece of code to handle cases where a ‘format code’ appears twice but in two different levels of the tree (folder, file …):
def get_path(path_format: List[str], path: str = "") -> List[str]:
    """Return every existing path matching a datetime-formatted pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is taken literally and followed only if it exists; a component
    containing ``%`` is matched against every entry of the directory reached
    so far with ``datetime.datetime.strptime``.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param path: directory to explore ("" means the current directory)
    :return: List of all matching path, in ``os.listdir`` order
    """
    if not path_format:
        return [path]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        # Literal component: descend only if it is really there.
        path = os.path.join(path, current_format)
        return get_path(remaining, path) if os.path.exists(path) else []

    # os.listdir("") raises FileNotFoundError, so an empty prefix is listed as
    # the current directory; results are still joined onto the original
    # (possibly empty) prefix so they keep their relative form.
    directory = path or os.curdir
    if not os.path.isdir(directory):
        # A plain file whose name matched the previous component cannot be
        # descended into; treat it as "no match" instead of crashing.
        return []

    out = []
    for content in os.listdir(directory):
        try:
            datetime.datetime.strptime(content, current_format)
        except ValueError:
            # Entry name does not fit the format: not a candidate.
            continue
        out += get_path(remaining, os.path.join(path, content))
    return out
$ tree data/2017/
data/2017/
├── 001
│ └── data_20170101.txt
└── 100
└── data_20170407.txt
>>> p = 'data/%Y/%j/data_%Y%m%d.txt'
>>> p = p.split('/')
>>> get_path(p)
['data/2017/100/data_20170407.txt', 'data/2017/001/data_20170101.txt']
However, I need to manage new data where a ‘format code’ appears twice in the same folder or file name (for example: data/%Y/%j/data_%Y%m%dT0000_%Y%m%dT9000.txt).
Edit: I found a workaround during the weekend, the solution is not ideal but will do the job until I find a better one. I just split the file name in multiple parts using a separator character (in my case _
). Here is the code:
def merge_two_dates(d1, d2):
    """Return a copy of ``d1`` holding the field-wise maximum of both dates.

    Each of year, month, day, hour, minute, second and microsecond is taken
    as the larger of the two inputs' values; everything else (e.g. tzinfo)
    comes from ``d1`` because the result is built with ``d1.replace``.

    :param d1: datetime used as the base of the result
    :param d2: datetime whose fields are compared against ``d1``
    :return: the merged datetime
    """
    field_names = ("year", "month", "day", "hour", "minute", "second", "microsecond")
    largest = {name: max(getattr(d1, name), getattr(d2, name)) for name in field_names}
    return d1.replace(**largest)
def merge_dates(dates):
    """Fold ``dates`` into one datetime using ``merge_two_dates``.

    The fold starts from ``datetime.datetime.min``, which is therefore the
    result for an empty input.

    :param dates: iterable of datetimes to merge
    :return: the merged datetime
    """
    merged = datetime.datetime.min
    for current in dates:
        # The new element is passed first, the running result second.
        merged = merge_two_dates(current, merged)
    return merged
def dates_from_path_rec(path_format: List[str], split_c: str = "", path: str = "") -> List[datetime.datetime]:
    """Return the date of every path matching a datetime-formatted pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is followed literally. A component containing ``%`` is matched
    against each entry of the directory reached so far; when ``split_c`` is
    given, both the format and the entry name are split on it and every piece
    is parsed separately, which allows a format code to be repeated inside
    one name.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param split_c: If two dates are in the same directory / file name, split_c will be used to split this name.
    :param path: directory to explore ("" means the current directory)
    :return: List of date of all matching path
    """
    if not path_format:
        # Neutral element for merge_two_dates: every real field is >= min.
        return [datetime.datetime.min]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        path = os.path.join(path, current_format)
        if os.path.exists(path):
            return dates_from_path_rec(remaining, split_c, path)
        return []

    # os.listdir("") raises FileNotFoundError, and listing a plain file raises
    # NotADirectoryError; neither situation can yield a match.
    directory = path or os.curdir
    if not os.path.isdir(directory):
        return []

    split_format = current_format.split(split_c) if split_c else [current_format]
    out = []
    for content in os.listdir(directory):
        split_content = content.split(split_c) if split_c else [content]
        if len(split_format) != len(split_content):
            continue
        try:
            dates = [datetime.datetime.strptime(sc, sf)
                     for sc, sf in zip(split_content, split_format) if '%' in sf]
            # Merging is field-wise, so it can build an impossible date
            # (e.g. month 02 with day 31) and raise ValueError as well.
            date = merge_dates(dates)
        except ValueError:
            continue
        # Round-trip check: rejects names whose pieces disagree with each other.
        if date.strftime(current_format) != content:
            continue
        for d in dates_from_path_rec(remaining, split_c, os.path.join(path, content)):
            try:
                out.append(merge_two_dates(date, d))
            except ValueError:
                # This level and a deeper one combine into an invalid date.
                continue
    return out
I’ve finally found a solution to my problem.
The regex package extends the capabilities of the re library, and in particular handles cases of group name
redefinition. By copying and modifying the _strptime.py file from python, I can obtain a function that accepts formats with group name redefinition.
My custom strptime.py file (a modified copy of _strptime.py):
from regex import compile as regex_compile
...
class TimeRE(dict):
...
def compile(self, format):
    """Return a compiled pattern object for the format string.

    The pattern text comes from ``self.pattern(format)`` and is compiled
    with ``regex_compile`` using the ``IGNORECASE`` flag.
    """
    pattern_source = self.pattern(format)
    return regex_compile(pattern_source, IGNORECASE)
...
def str_to_time(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a time struct based on the input string and the format string.

    :param data_string: text to parse
    :param format: strptime-style format describing ``data_string``
    :return: ``time.struct_time`` built from the first element returned by ``_strptime``
    """
    parsed = _strptime(data_string, format)
    fields = parsed[0]
    # Keep only as many leading items as struct_time accepts.
    item_count = time._STRUCT_TM_ITEMS
    return time.struct_time(fields[:item_count])
def str_to_cls(cls, data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a class cls instance based on the input string and the format string.

    ``cls`` is called with the first six parsed fields followed by the
    fraction, plus a timezone object when ``_strptime`` reported a UTC offset.

    :param cls: callable used to build the result
    :param data_string: text to parse
    :param format: strptime-style format describing ``data_string``
    :return: whatever ``cls`` returns for the parsed fields
    """
    tt, fraction, gmtoff_fraction = _strptime(data_string, format)
    tzname, gmtoff = tt[-2:]
    args = tt[:6] + (fraction,)

    if gmtoff is None:
        # No offset parsed: build the instance without timezone information.
        return cls(*args)

    tzdelta = datetime_timedelta(seconds=gmtoff, microseconds=gmtoff_fraction)
    # Only pass a name to the timezone when one was parsed.
    tz = datetime_timezone(tzdelta, tzname) if tzname else datetime_timezone(tzdelta)
    return cls(*(args + (tz,)))
I can then use these functions to rewrite my function dates_from_path():
import strptime
def matching_paths(path_format: List[str], path: str = "") -> List[str]:
    """Return every existing path matching a datetime-formatted pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is taken literally and followed only if it exists; a component
    containing ``%`` is matched against every entry of the directory reached
    so far with ``strptime.str_to_time``.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param path: directory to explore ("" means the current directory)
    :return: List of all matching path, in ``os.listdir`` order
    """
    if not path_format:
        return [path]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        # Literal component: descend only if it is really there.
        path = os.path.join(path, current_format)
        return matching_paths(remaining, path) if os.path.exists(path) else []

    # os.listdir("") raises FileNotFoundError, so an empty prefix is listed as
    # the current directory; results are still joined onto the original
    # (possibly empty) prefix so they keep their relative form.
    directory = path or os.curdir
    if not os.path.isdir(directory):
        # A plain file whose name matched the previous component cannot be
        # descended into; treat it as "no match" instead of crashing.
        return []

    out = []
    for content in os.listdir(directory):
        try:
            strptime.str_to_time(content, current_format)
        except ValueError:
            # Entry name does not fit the format: not a candidate.
            continue
        out += matching_paths(remaining, os.path.join(path, content))
    return out
def dates_from_path(fmt: str) -> Set[datetime.datetime]:
    """Return all dates of the files whose path matches ``fmt``.

    :param fmt: input path to explore to find input_object, can contain %Y, %m, %d ...
        May be relative or absolute.
    :return: set of the dates parsed from every matching path
    """
    fmt = os.path.normpath(fmt)

    # Split off the drive and leading separators. Splitting an absolute path
    # on os.sep yields a leading '' component that matching_paths would
    # resolve to the non-existent path "" and so return nothing.
    drive, tail = os.path.splitdrive(fmt)
    relative = tail.lstrip(os.sep)
    root = drive + tail[:len(tail) - len(relative)]

    paths = matching_paths(relative.split(os.sep), root)
    # set of dates (remove redundancy)
    dates = {strptime.str_to_cls(datetime.datetime, p, fmt) for p in paths}
    # Keep only dates that format back to an existing path. The per-component
    # matching cannot see that repeated format codes at different levels agree.
    return {date for date in dates if os.path.exists(date.strftime(fmt))}