Find all files matching given path format with datetime


I’m looking to identify all files whose path matches a given format (for example %Y/%j/data_%d.txt) and retrieve the associated date.

A simple solution is to use strptime, but this doesn’t work if a ‘format code’ appears twice in the format (e.g. %Y/%j/data_%Y%m%d.txt). This is a long-documented strptime limitation: strptime builds its parser with re, which rejects a pattern that defines the same named group twice.

Do you have any idea how I can deal with this?

I’ve long used this (somewhat complicated) piece of code to handle cases where a ‘format code’ appears twice but in two different levels of the tree (folder, file …):

def get_path(path_format: List[str], path: str = "") -> List[str]:
    """Return every existing path that matches a datetime-aware path pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is taken literally and must exist below ``path``. A component that
    contains ``%`` is used as a ``strptime`` format: each entry of the current
    directory is kept only if its name parses with that format.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param path: directory to explore ("" means the current directory)
    :return: List of all matching path
    """
    if not path_format:
        # Every component has been matched: ``path`` itself is a result.
        return [path]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        # Literal component: follow it only if it exists on disk.
        path = os.path.join(path, current_format)
        if os.path.exists(path):
            return get_path(remaining, path)
        return []

    try:
        # os.listdir("") raises, so map the empty default to the current directory.
        entries = os.listdir(path or os.curdir)
    except NotADirectoryError:
        # A plain file matched the previous component; nothing to explore below it.
        return []

    out = []
    for content in entries:
        try:
            datetime.datetime.strptime(content, current_format)
        except ValueError:
            # The entry name does not fit the format: not a candidate.
            continue
        out += get_path(remaining, os.path.join(path, content))
    return out
$ tree data/2017/
├── 001
│   └── data_20170101.txt
└── 100
    └── data_20170407.txt
>>> p = 'data/%Y/%j/data_%Y%m%d.txt'
>>> p = p.split('/')
>>> get_path(p)
['data/2017/100/data_20170407.txt', 'data/2017/001/data_20170101.txt']

However, I need to manage new data where a ‘format code’ appears twice in the same folder or file name (for example: data/%Y/%j/data_%Y%m%dT0000_%Y%m%dT9000.txt).

Edit: I found a workaround over the weekend. The solution is not ideal, but it will do the job until I find a better one: I split the file name into multiple parts using a separator character (in my case _). Here is the code:

def merge_two_dates(d1, d2):
    """Combine two datetimes field by field, keeping the larger value of each field.

    This is meant for datetimes that each carry only part of the information
    (for instance one parsed from ``%Y%m%d`` and one from ``%H%M``): the fields
    a datetime did not parse hold their minimal default value, so taking the
    maximum keeps the field that was actually parsed.

    :param d1: first datetime; attributes that are not merged (e.g. ``tzinfo``) are kept from it
    :param d2: second datetime
    :return: datetime whose year, month, day, hour, minute, second and
        microsecond are each the maximum of the two inputs
    :raises ValueError: if the merged fields do not form a valid date
        (e.g. day 31 combined with month 2)
    """
    return d1.replace(year=max(d1.year, d2.year),
                      month=max(d1.month, d2.month),
                      # The day has to be merged as well, otherwise the day
                      # carried by d2 is silently dropped.
                      day=max(d1.day, d2.day),
                      hour=max(d1.hour, d2.hour),
                      minute=max(d1.minute, d2.minute),
                      second=max(d1.second, d2.second),
                      microsecond=max(d1.microsecond, d2.microsecond))

def merge_dates(dates):
    """Fold an iterable of datetimes into one with ``merge_two_dates``.

    :param dates: iterable of datetimes to combine
    :return: the combined datetime, or ``datetime.datetime.min`` if ``dates`` is empty
    """
    merged = datetime.datetime.min
    for current in dates:
        # The new element goes first, the running result second.
        merged = merge_two_dates(current, merged)
    return merged

def dates_from_path_rec(path_format: List[str], split_c: str = "", path: str = "") -> List[datetime.datetime]:
    """Return the dates of all paths matching a datetime-aware path pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is taken literally and must exist below ``path``. A component that
    contains ``%`` is matched against every entry of the current directory;
    when ``split_c`` is given, both the component and the entry name are split
    on it and each piece is parsed on its own, which allows the same format
    code to appear twice in one name.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param split_c: If two dates are in the same directory / file name, split_c will be used to split this name.
    :param path: directory to explore ("" means the current directory)
    :return: List of date of all matching path
    """
    if not path_format:
        # Neutral element for merge_two_dates: contributes no field.
        return [datetime.datetime.min]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        # Literal component: follow it only if it exists on disk.
        path = os.path.join(path, current_format)
        if os.path.exists(path):
            return dates_from_path_rec(remaining, split_c, path)
        return []

    split_format = current_format.split(split_c) if split_c else [current_format]

    try:
        # os.listdir("") raises, so map the empty default to the current directory.
        entries = os.listdir(path or os.curdir)
    except NotADirectoryError:
        # A plain file matched the previous component; nothing to explore below it.
        return []

    out = []
    for content in entries:
        split_content = content.split(split_c) if split_c else [content]
        if len(split_format) != len(split_content):
            # Different number of pieces: the name cannot fit the format.
            continue

        try:
            dates = [datetime.datetime.strptime(sc, sf)
                     for sc, sf in zip(split_content, split_format) if '%' in sf]
        except ValueError:
            continue

        date = merge_dates(dates)
        # Round-trip check: rejects names whose literal pieces differ from the
        # format, or whose pieces disagree on a repeated format code.
        if date.strftime(current_format) != content:
            continue

        out += [merge_two_dates(date, d)
                for d in dates_from_path_rec(remaining, split_c, os.path.join(path, content))]
    return out
Asked By: Alexandre Novius



I’ve finally found a solution to my problem.
The regex package extends the capabilities of the re library, and in particular handles cases of group name redefinition. By copying and modifying the file from python, I can obtain a function that accepts formats with group name redefinition.

My custom file, strptime.py (a modified copy of Python’s _strptime.py; only the modified parts are shown):

from regex import compile as regex_compile


class TimeRE(dict):
    # Only the method that differs from the copied original is shown here;
    # ``pattern`` and ``IGNORECASE`` are expected to come from the rest of the file.

    def compile(self, format):
        """Build the case-insensitive compiled pattern for ``format``.

        The pattern text comes from ``self.pattern(format)`` and is compiled
        with ``regex_compile`` (the ``regex`` package's ``compile``).
        """
        pattern_text = self.pattern(format)
        return regex_compile(pattern_text, IGNORECASE)


def str_to_time(data_string, format="%a %b %d %H:%M:%S %Y"):
    """Parse ``data_string`` according to ``format`` and return a
    ``time.struct_time``.

    Only the first element of the ``_strptime`` result is used; it is
    truncated to ``time._STRUCT_TM_ITEMS`` items before being converted.
    """
    parsed = _strptime(data_string, format)
    fields = parsed[0]
    return time.struct_time(fields[:time._STRUCT_TM_ITEMS])

def str_to_cls(cls, data_string, format="%a %b %d %H:%M:%S %Y"):
    """Return a class cls instance based on the input string and the
    format string.

    ``cls`` is called with the first six parsed fields plus the fraction
    (year, month, day, hour, minute, second, microsecond for a datetime) and,
    when the string carried a UTC offset, a timezone object as the last
    argument.
    """
    tt, fraction, gmtoff_fraction = _strptime(data_string, format)
    # The last two items of the parsed tuple are the zone name and UTC offset.
    tzname, gmtoff = tt[-2:]
    args = tt[:6] + (fraction,)
    if gmtoff is not None:
        tzdelta = datetime_timedelta(seconds=gmtoff, microseconds=gmtoff_fraction)
        if tzname:
            tz = datetime_timezone(tzdelta, tzname)
        else:
            # No zone name was parsed: build an anonymous fixed-offset zone.
            tz = datetime_timezone(tzdelta)
        args += (tz,)

    return cls(*args)

I can then use these functions to rewrite my function dates_from_path():

import strptime

def matching_paths(path_format: List[str], path: str = "") -> List[str]:
    """Return every existing path that matches a datetime-aware path pattern.

    The pattern is consumed one component at a time. A component without
    ``%`` is taken literally and must exist below ``path``. A component that
    contains ``%`` is checked with ``strptime.str_to_time``: each entry of the
    current directory is kept only if its name parses with that format.

    :param path_format: List of directories to explore, ex: ['test', '%Y', '%Y%m', 'data_%Y%m%d.txt']
    :param path: directory to explore ("" means the current directory)
    :return: List of all matching path
    """
    if not path_format:
        # Every component has been matched: ``path`` itself is a result.
        return [path]
    current_format, remaining = path_format[0], path_format[1:]

    if '%' not in current_format:
        # Literal component: follow it only if it exists on disk.
        path = os.path.join(path, current_format)
        if os.path.exists(path):
            return matching_paths(remaining, path)
        return []

    try:
        # os.listdir("") raises, so map the empty default to the current directory.
        entries = os.listdir(path or os.curdir)
    except NotADirectoryError:
        # A plain file matched the previous component; nothing to explore below it.
        return []

    out = []
    for content in entries:
        try:
            strptime.str_to_time(content, current_format)
        except ValueError:
            # The entry name does not fit the format: not a candidate.
            continue
        out += matching_paths(remaining, os.path.join(path, content))
    return out

def dates_from_path(fmt: str) -> Set[datetime.datetime]:
    """ Return all date from file matching fmt

    :param fmt: input path to explore to find input_object, can contain %Y, %m, %d ...
    :return: Set of the datetimes parsed from every existing path matching fmt
    """
    fmt = os.path.normpath(fmt)

    # Separate the drive / leading separators from the components. Splitting
    # an absolute pattern directly on os.sep yields a leading '' component,
    # which can never be matched, so the walk starts from the root instead.
    drive, tail = os.path.splitdrive(fmt)
    relative = tail.lstrip(os.sep)
    root = drive + tail[:len(tail) - len(relative)]

    paths = matching_paths(relative.split(os.sep), root)

    # set of dates (remove redundancy)
    dates = {strptime.str_to_cls(datetime.datetime, p, fmt) for p in paths}

    # assert each date is working, avoid potential problem that could have occurred during recurrence
    return {date for date in dates if os.path.exists(date.strftime(fmt))}
Answered By: Alexandre Novius
Categories: questions Tags: , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.