Ignore case in glob() on Linux
Question:
I’m writing a script which will have to work on directories which are modified by hand by Windows and Linux users alike. The Windows users tend to not care at all about case in assigning filenames.
Is there a way to handle this on the Linux side in Python, i.e. can I get a case-insensitive, glob-like behaviour?
Answers:
Use case-insensitive regexes instead of glob patterns. fnmatch.translate
generates a regex from a glob pattern, so
re.compile(fnmatch.translate(pattern), re.IGNORECASE)
gives you a case-insensitive version of a glob pattern as a compiled RE.
Keep in mind that, if the filesystem is hosted by a Linux box on a Unix-like filesystem, users will be able to create files foo, Foo and FOO in the same directory.
You can replace each alphabetic character c with [cC], via
import glob
def insensitive_glob(pattern):
    """Return the paths matching ``pattern`` with letter case ignored.

    Every cased character outside a bracket expression is rewritten as a
    two-character class (``a`` -> ``[aA]``) before the pattern is passed to
    ``glob.glob``.  Bracket expressions already present in ``pattern``
    (``[abc]``, ``[!a-z]``, the ``[*]`` escapes from ``glob.escape``) are
    kept whole and only extended with the opposite-case characters, rather
    than being split into nested brackets.
    """
    def either(c):
        lower, upper = c.lower(), c.upper()
        # Caseless characters need no class; multi-character case mappings
        # (e.g. 'ß' -> 'SS') cannot be expressed by a one-character class.
        if lower != upper and len(lower) == len(upper) == 1:
            return '[%s%s]' % (lower, upper)
        return c

    pieces = []
    pos = 0
    while pos < len(pattern):
        if pattern[pos] == '[':
            # A ']' directly after '[' or '[!' is a literal, not the end.
            start = pos + 1
            if pattern[start:start + 1] == '!':
                start += 1
            if pattern[start:start + 1] == ']':
                start += 1
            close = pattern.find(']', start)
            if close != -1:
                body = pattern[start:close]
                # Keep a trailing literal '-' last so it cannot become a
                # range operator once characters are appended.
                tail = '-' if len(body) > 1 and body.endswith('-') else ''
                core = body[:len(body) - len(tail)]
                other = core.swapcase().lstrip('-')
                if other == core:
                    other = ''
                pieces.append(pattern[pos:start] + core + other + tail + ']')
                pos = close + 1
                continue
        # An unmatched '[' falls through here and stays a literal.
        pieces.append(either(pattern[pos]))
        pos += 1
    return glob.glob(''.join(pieces))
Non recursively
In order to retrieve the files (and files only) of a directory "path", with "globexpression":
# Keep only the entries of `path` that are regular files (sub-directories are dropped).
list_path = [i for i in os.listdir(path) if os.path.isfile(os.path.join(path, i))]
# fnmatch.translate() converts the glob into a regex; re.IGNORECASE makes the
# comparison case-insensitive. Each matching name is returned joined to `path`.
result = [os.path.join(path, j) for j in list_path if re.match(fnmatch.translate(globexpression), j, re.IGNORECASE)]
Recursively
with walk:
result = []
# os.walk yields (directory, sub-directory names, file names) for `path` and
# everything below it; only the file names are tested against the glob.
for root, dirs, files in os.walk(path, topdown=True):
    result += [os.path.join(root, j) for j in files
               if re.match(fnmatch.translate(globexpression), j, re.IGNORECASE)]
Better also compile the regular expression, so instead of
re.match(fnmatch.translate(globexpression)
do (before the loop):
reg_expr = re.compile(fnmatch.translate(globexpression), re.IGNORECASE)
and then replace in the loop:
result += [os.path.join(root, j) for j in files if re.match(reg_expr, j)]
Depending on your case, you might use .lower() on both the file pattern and the results of the folder listing, and only then compare the pattern with the filename.
Here is my non-recursive file search for Python with glob like behavior for Python 3.5+
# Eg: find_files('~/Downloads', '*.Xls', ignore_case=True)
def find_files(path: str, glob_pat: str, ignore_case: bool = False):
    """Return the entry names in directory ``path`` that match ``glob_pat``.

    The search is not recursive.  A leading ``~`` in ``path`` is expanded to
    the user's home directory.  The result holds bare names exactly as
    ``os.listdir`` reports them (they are not joined to ``path``).

    :param path: directory to list.
    :param glob_pat: fnmatch-style pattern applied to each entry name.
    :param ignore_case: match without regard to letter case when true.
    """
    # Select the flag first: a conditional expression split across two lines
    # without a continuation is a syntax error.
    flags = re.IGNORECASE if ignore_case else 0
    rule = re.compile(fnmatch.translate(glob_pat), flags)
    return [n for n in os.listdir(os.path.expanduser(path)) if rule.match(n)]
Note: This version handles home directory expansion
Riffing off of @Timothy C. Quinn’s answer, this modification allows the use of wildcards anywhere in the path. This is admittedly only case insensitive for the glob_pat argument.
import re
import os
import fnmatch
import glob
def find_files(path: str, glob_pat: str, ignore_case: bool = False):
    """Return the paths directly under ``path`` whose name matches ``glob_pat``.

    ``path`` may itself contain glob wildcards (matched case-sensitively by
    ``glob.glob``) and a leading ``~`` is expanded.  Only ``glob_pat`` honours
    ``ignore_case``.

    :param path: directory, or glob pattern selecting directories, to search.
    :param glob_pat: fnmatch-style pattern applied to each file *name*.
    :param ignore_case: match ``glob_pat`` without regard to letter case.
    """
    flags = re.IGNORECASE if ignore_case else 0
    rule = re.compile(fnmatch.translate(glob_pat), flags)
    candidates = glob.glob(os.path.join(os.path.expanduser(path), '*'))
    # Test the final component only: the regex is anchored at the start, so
    # matching the whole path would reject any pattern not starting with '*'.
    return [n for n in candidates if rule.match(os.path.basename(n))]
Here is a working example with fnmatch.translate():
from glob import glob
from pathlib import Path
import fnmatch, re
# Quoted masks separated by "|"; the date-style placeholders listed in
# masks_list each stand for "any text" and are replaced by "*".
mask_str = '"*_*_yyww.TXT" | "*_yyww.TXT" | "*_*_yyww_*.TXT" | "*_yyww_*.TXT"'
masks_list = ["yyyy", "yy", "mmmmm", "mmm", "mm", "#d", "#w", "#m", "ww"]
for mask_item in masks_list:
    mask_str = mask_str.replace(mask_item, "*")
clean_quotes_and_spaces = mask_str.replace(" ", "").replace('"', '')
# Collapse every run of stars; a single replace("**", "*") would leave "**"
# behind for runs of three or more.
remove_double_star = re.sub(r"\*{2,}", "*", clean_quotes_and_spaces)
masks = remove_double_star.split("|")

cwd = Path.cwd()
files = list(cwd.glob('*'))
print(files)
files_found = set()
for mask in masks:
    mask_re = re.compile(fnmatch.translate(mask), re.IGNORECASE)
    print(mask_re)
    for file in files:
        # Match the bare file name: against the full path, an underscore in
        # a parent directory name would satisfy the "_" in the mask.
        if mask_re.match(file.name):
            files_found.add(file)
print(files_found)
I just wanted a variant of this where I only went case insensitive if I was specifying a file extension — eg, I wanted ".jpg" and ".JPG" to be crawled the same. This is my variant:
import re
import glob
import os
from fnmatch import translate as regexGlob
from platform import system as getOS
def linuxGlob(globPattern:str) -> frozenset:
    """
    Glob with a case-insensitive file extension

    The plain ``glob.glob`` result is always included.  When the pattern's
    last component has an extension and the platform is not Windows, the
    tree below the pattern's fixed directory prefix is crawled as well and
    every path matching the pattern case-insensitively is added.  The
    case-insensitive comparison covers the whole path, not only the
    extension.

    :param globPattern: glob pattern; ``**`` is honoured recursively.
    :return: frozenset of matching paths.
    """
    base = set(glob.glob(globPattern, recursive=True))
    maybe_ext = os.path.splitext(os.path.basename(globPattern))[1][1:]
    # Only try the extended insensitivity if we've got a file extension;
    # the extra pass is skipped on Windows.
    if not maybe_ext or getOS() == "Windows":
        return frozenset(base)

    rule = re.compile(regexGlob(globPattern), re.IGNORECASE)
    # Crawl from the directory that precedes the first "*" in the pattern.
    first_star = globPattern.find("*")
    if first_star == -1:
        first_star = len(globPattern)
    crawl = os.path.join(os.path.dirname(globPattern[:first_star]), "**", "*")
    candidates = set(glob.glob(crawl, recursive=True)) - base

    # In the translated regex "*" also matches "/", so without a "**"
    # component a hit must sit at the same directory depth as the pattern;
    # otherwise files from sub-directories would leak into the result.
    spans_dirs = "**" in globPattern.split(os.sep)
    depth = globPattern.count(os.sep)
    case_change = {
        x for x in candidates
        if rule.match(x) and (spans_dirs or x.count(os.sep) == depth)
    }
    return frozenset(base | case_change)
I didn’t actually restrict the insensitivity to just the extension because I was lazy, but that confusion space is pretty small (e.g., you’d want to capture FOO.jpg and FOO.JPG but not foo.JPG or foo.jpg; if your path is that pathological, you’ve got other problems).
def insensitive_glob(pattern):
    """Return the paths matching ``pattern`` with letter case ignored.

    Every cased character outside a bracket expression is rewritten as a
    two-character class (``a`` -> ``[aA]``) before the pattern is passed to
    ``glob.glob``.  Bracket expressions already present in ``pattern``
    (``[abc]``, ``[!a-z]``, the ``[*]`` escapes from ``glob.escape``) are
    kept whole and only extended with the opposite-case characters, rather
    than being split into nested brackets.
    """
    def either(c):
        lower, upper = c.lower(), c.upper()
        # Caseless characters need no class; multi-character case mappings
        # (e.g. 'ß' -> 'SS') cannot be expressed by a one-character class.
        if lower != upper and len(lower) == len(upper) == 1:
            return '[%s%s]' % (lower, upper)
        return c

    pieces = []
    pos = 0
    while pos < len(pattern):
        if pattern[pos] == '[':
            # A ']' directly after '[' or '[!' is a literal, not the end.
            start = pos + 1
            if pattern[start:start + 1] == '!':
                start += 1
            if pattern[start:start + 1] == ']':
                start += 1
            close = pattern.find(']', start)
            if close != -1:
                body = pattern[start:close]
                # Keep a trailing literal '-' last so it cannot become a
                # range operator once characters are appended.
                tail = '-' if len(body) > 1 and body.endswith('-') else ''
                core = body[:len(body) - len(tail)]
                other = core.swapcase().lstrip('-')
                if other == core:
                    other = ''
                pieces.append(pattern[pos:start] + core + other + tail + ']')
                pos = close + 1
                continue
        # An unmatched '[' falls through here and stays a literal.
        pieces.append(either(pattern[pos]))
        pos += 1
    return glob.glob(''.join(pieces))
It can also be written as:
def insensitive_glob(pattern):
    """Return the paths matching ``pattern`` with letter case ignored.

    Cased characters outside bracket expressions become two-character
    classes (``a`` -> ``[aA]``).  Bracket expressions already present in
    ``pattern`` (``[abc]``, ``[!a-z]``, ``[*]``) are kept whole and extended
    with the opposite-case characters, so they are not split into nested
    brackets.
    """
    pieces = []
    pos = 0
    while pos < len(pattern):
        c = pattern[pos]
        if c == '[':
            # A ']' directly after '[' or '[!' is a literal, not the end.
            start = pos + 1
            if pattern[start:start + 1] == '!':
                start += 1
            if pattern[start:start + 1] == ']':
                start += 1
            close = pattern.find(']', start)
            if close != -1:
                body = pattern[start:close]
                # Keep a trailing literal '-' last so it cannot become a
                # range operator once characters are appended.
                tail = '-' if len(body) > 1 and body.endswith('-') else ''
                core = body[:len(body) - len(tail)]
                other = core.swapcase().lstrip('-')
                if other == core:
                    other = ''
                pieces.append(pattern[pos:start] + core + other + tail + ']')
                pos = close + 1
                continue
        lower, upper = c.lower(), c.upper()
        # Only characters with distinct single-character cases need a class.
        if lower != upper and len(lower) == len(upper) == 1:
            pieces.append('[' + lower + upper + ']')
        else:
            pieces.append(c)
        pos += 1
    return glob.glob(''.join(pieces))
A variation of your answer that searches recursively by file name:
def insensitive_for_glob(string_file):
    """Return ``string_file`` rewritten as a case-insensitive glob pattern.

    Cased characters outside bracket expressions become two-character
    classes (``a`` -> ``[aA]``).  Bracket expressions already present
    (``[abc]``, ``[!a-z]``, ``[*]``) are kept whole and extended with the
    opposite-case characters.  Nothing is globbed here; the caller passes the
    returned string to ``glob.glob``.
    """
    pieces = []
    pos = 0
    while pos < len(string_file):
        c = string_file[pos]
        if c == '[':
            # A ']' directly after '[' or '[!' is a literal, not the end.
            start = pos + 1
            if string_file[start:start + 1] == '!':
                start += 1
            if string_file[start:start + 1] == ']':
                start += 1
            close = string_file.find(']', start)
            if close != -1:
                body = string_file[start:close]
                # Keep a trailing literal '-' last so it cannot become a
                # range operator once characters are appended.
                tail = '-' if len(body) > 1 and body.endswith('-') else ''
                core = body[:len(body) - len(tail)]
                other = core.swapcase().lstrip('-')
                if other == core:
                    other = ''
                pieces.append(string_file[pos:start] + core + other + tail + ']')
                pos = close + 1
                continue
        lower, upper = c.lower(), c.upper()
        # Only characters with distinct single-character cases need a class.
        if lower != upper and len(lower) == len(upper) == 1:
            pieces.append('[' + lower + upper + ']')
        else:
            pieces.append(c)
        pos += 1
    return ''.join(pieces)
Elsewhere in the code:
namefile = self.insensitive_for_glob(namefile)
lst_found_file = glob.glob(f'{file_path}/**/*{namefile}', recursive=True)
I’m writing a script which will have to work on directories which are modified by hand by Windows and Linux users alike. The Windows users tend to not care at all about case in assigning filenames.
Is there a way to handle this on the Linux side in Python, i.e. can I get a case-insensitive, glob-like behaviour?
Use case-insensitive regexes instead of glob patterns. fnmatch.translate
generates a regex from a glob pattern, so
re.compile(fnmatch.translate(pattern), re.IGNORECASE)
gives you a case-insensitive version of a glob pattern as a compiled RE.
Keep in mind that, if the filesystem is hosted by a Linux box on a Unix-like filesystem, users will be able to create files foo, Foo and FOO in the same directory.
You can replace each alphabetic character c with [cC], via
import glob
def insensitive_glob(pattern):
    """Return the paths matching ``pattern`` with letter case ignored.

    Every cased character outside a bracket expression is rewritten as a
    two-character class (``a`` -> ``[aA]``) before the pattern is passed to
    ``glob.glob``.  Bracket expressions already present in ``pattern``
    (``[abc]``, ``[!a-z]``, the ``[*]`` escapes from ``glob.escape``) are
    kept whole and only extended with the opposite-case characters, rather
    than being split into nested brackets.
    """
    def either(c):
        lower, upper = c.lower(), c.upper()
        # Caseless characters need no class; multi-character case mappings
        # (e.g. 'ß' -> 'SS') cannot be expressed by a one-character class.
        if lower != upper and len(lower) == len(upper) == 1:
            return '[%s%s]' % (lower, upper)
        return c

    pieces = []
    pos = 0
    while pos < len(pattern):
        if pattern[pos] == '[':
            # A ']' directly after '[' or '[!' is a literal, not the end.
            start = pos + 1
            if pattern[start:start + 1] == '!':
                start += 1
            if pattern[start:start + 1] == ']':
                start += 1
            close = pattern.find(']', start)
            if close != -1:
                body = pattern[start:close]
                # Keep a trailing literal '-' last so it cannot become a
                # range operator once characters are appended.
                tail = '-' if len(body) > 1 and body.endswith('-') else ''
                core = body[:len(body) - len(tail)]
                other = core.swapcase().lstrip('-')
                if other == core:
                    other = ''
                pieces.append(pattern[pos:start] + core + other + tail + ']')
                pos = close + 1
                continue
        # An unmatched '[' falls through here and stays a literal.
        pieces.append(either(pattern[pos]))
        pos += 1
    return glob.glob(''.join(pieces))
Non recursively
In order to retrieve the files (and files only) of a directory "path", with "globexpression":
# Keep only the entries of `path` that are regular files (sub-directories are dropped).
list_path = [i for i in os.listdir(path) if os.path.isfile(os.path.join(path, i))]
# fnmatch.translate() converts the glob into a regex; re.IGNORECASE makes the
# comparison case-insensitive. Each matching name is returned joined to `path`.
result = [os.path.join(path, j) for j in list_path if re.match(fnmatch.translate(globexpression), j, re.IGNORECASE)]
Recursively
with walk:
result = []
# os.walk yields (directory, sub-directory names, file names) for `path` and
# everything below it; only the file names are tested against the glob.
for root, dirs, files in os.walk(path, topdown=True):
    result += [os.path.join(root, j) for j in files
               if re.match(fnmatch.translate(globexpression), j, re.IGNORECASE)]
Better also compile the regular expression, so instead of
re.match(fnmatch.translate(globexpression)
do (before the loop):
reg_expr = re.compile(fnmatch.translate(globexpression), re.IGNORECASE)
and then replace in the loop:
result += [os.path.join(root, j) for j in files if re.match(reg_expr, j)]
Depending on your case, you might use .lower() on both the file pattern and the results of the folder listing, and only then compare the pattern with the filename.
Here is my non-recursive file search for Python with glob like behavior for Python 3.5+
# Eg: find_files('~/Downloads', '*.Xls', ignore_case=True)
def find_files(path: str, glob_pat: str, ignore_case: bool = False):
    """Return the entry names in directory ``path`` that match ``glob_pat``.

    The search is not recursive.  A leading ``~`` in ``path`` is expanded to
    the user's home directory.  The result holds bare names exactly as
    ``os.listdir`` reports them (they are not joined to ``path``).

    :param path: directory to list.
    :param glob_pat: fnmatch-style pattern applied to each entry name.
    :param ignore_case: match without regard to letter case when true.
    """
    # Select the flag first: a conditional expression split across two lines
    # without a continuation is a syntax error.
    flags = re.IGNORECASE if ignore_case else 0
    rule = re.compile(fnmatch.translate(glob_pat), flags)
    return [n for n in os.listdir(os.path.expanduser(path)) if rule.match(n)]
Note: This version handles home directory expansion
Riffing off of @Timothy C. Quinn’s answer, this modification allows the use of wildcards anywhere in the path. This is admittedly only case insensitive for the glob_pat argument.
import re
import os
import fnmatch
import glob
def find_files(path: str, glob_pat: str, ignore_case: bool = False):
    """Return the paths directly under ``path`` whose name matches ``glob_pat``.

    ``path`` may itself contain glob wildcards (matched case-sensitively by
    ``glob.glob``) and a leading ``~`` is expanded.  Only ``glob_pat`` honours
    ``ignore_case``.

    :param path: directory, or glob pattern selecting directories, to search.
    :param glob_pat: fnmatch-style pattern applied to each file *name*.
    :param ignore_case: match ``glob_pat`` without regard to letter case.
    """
    flags = re.IGNORECASE if ignore_case else 0
    rule = re.compile(fnmatch.translate(glob_pat), flags)
    candidates = glob.glob(os.path.join(os.path.expanduser(path), '*'))
    # Test the final component only: the regex is anchored at the start, so
    # matching the whole path would reject any pattern not starting with '*'.
    return [n for n in candidates if rule.match(os.path.basename(n))]
Here is a working example with fnmatch.translate():
from glob import glob
from pathlib import Path
import fnmatch, re
# Quoted masks separated by "|"; the date-style placeholders listed in
# masks_list each stand for "any text" and are replaced by "*".
mask_str = '"*_*_yyww.TXT" | "*_yyww.TXT" | "*_*_yyww_*.TXT" | "*_yyww_*.TXT"'
masks_list = ["yyyy", "yy", "mmmmm", "mmm", "mm", "#d", "#w", "#m", "ww"]
for mask_item in masks_list:
    mask_str = mask_str.replace(mask_item, "*")
clean_quotes_and_spaces = mask_str.replace(" ", "").replace('"', '')
# Collapse every run of stars; a single replace("**", "*") would leave "**"
# behind for runs of three or more.
remove_double_star = re.sub(r"\*{2,}", "*", clean_quotes_and_spaces)
masks = remove_double_star.split("|")

cwd = Path.cwd()
files = list(cwd.glob('*'))
print(files)
files_found = set()
for mask in masks:
    mask_re = re.compile(fnmatch.translate(mask), re.IGNORECASE)
    print(mask_re)
    for file in files:
        # Match the bare file name: against the full path, an underscore in
        # a parent directory name would satisfy the "_" in the mask.
        if mask_re.match(file.name):
            files_found.add(file)
print(files_found)
I just wanted a variant of this where I only went case insensitive if I was specifying a file extension — eg, I wanted ".jpg" and ".JPG" to be crawled the same. This is my variant:
import re
import glob
import os
from fnmatch import translate as regexGlob
from platform import system as getOS
def linuxGlob(globPattern:str) -> frozenset:
    """
    Glob with a case-insensitive file extension

    The plain ``glob.glob`` result is always included.  When the pattern's
    last component has an extension and the platform is not Windows, the
    tree below the pattern's fixed directory prefix is crawled as well and
    every path matching the pattern case-insensitively is added.  The
    case-insensitive comparison covers the whole path, not only the
    extension.

    :param globPattern: glob pattern; ``**`` is honoured recursively.
    :return: frozenset of matching paths.
    """
    base = set(glob.glob(globPattern, recursive=True))
    maybe_ext = os.path.splitext(os.path.basename(globPattern))[1][1:]
    # Only try the extended insensitivity if we've got a file extension;
    # the extra pass is skipped on Windows.
    if not maybe_ext or getOS() == "Windows":
        return frozenset(base)

    rule = re.compile(regexGlob(globPattern), re.IGNORECASE)
    # Crawl from the directory that precedes the first "*" in the pattern.
    first_star = globPattern.find("*")
    if first_star == -1:
        first_star = len(globPattern)
    crawl = os.path.join(os.path.dirname(globPattern[:first_star]), "**", "*")
    candidates = set(glob.glob(crawl, recursive=True)) - base

    # In the translated regex "*" also matches "/", so without a "**"
    # component a hit must sit at the same directory depth as the pattern;
    # otherwise files from sub-directories would leak into the result.
    spans_dirs = "**" in globPattern.split(os.sep)
    depth = globPattern.count(os.sep)
    case_change = {
        x for x in candidates
        if rule.match(x) and (spans_dirs or x.count(os.sep) == depth)
    }
    return frozenset(base | case_change)
I didn’t actually restrict the insensitivity to just the extension because I was lazy, but that confusion space is pretty small (e.g., you’d want to capture FOO.jpg and FOO.JPG but not foo.JPG or foo.jpg; if your path is that pathological, you’ve got other problems).
def insensitive_glob(pattern):
    """Return the paths matching ``pattern`` with letter case ignored.

    Every cased character outside a bracket expression is rewritten as a
    two-character class (``a`` -> ``[aA]``) before the pattern is passed to
    ``glob.glob``.  Bracket expressions already present in ``pattern``
    (``[abc]``, ``[!a-z]``, the ``[*]`` escapes from ``glob.escape``) are
    kept whole and only extended with the opposite-case characters, rather
    than being split into nested brackets.
    """
    def either(c):
        lower, upper = c.lower(), c.upper()
        # Caseless characters need no class; multi-character case mappings
        # (e.g. 'ß' -> 'SS') cannot be expressed by a one-character class.
        if lower != upper and len(lower) == len(upper) == 1:
            return '[%s%s]' % (lower, upper)
        return c

    pieces = []
    pos = 0
    while pos < len(pattern):
        if pattern[pos] == '[':
            # A ']' directly after '[' or '[!' is a literal, not the end.
            start = pos + 1
            if pattern[start:start + 1] == '!':
                start += 1
            if pattern[start:start + 1] == ']':
                start += 1
            close = pattern.find(']', start)
            if close != -1:
                body = pattern[start:close]
                # Keep a trailing literal '-' last so it cannot become a
                # range operator once characters are appended.
                tail = '-' if len(body) > 1 and body.endswith('-') else ''
                core = body[:len(body) - len(tail)]
                other = core.swapcase().lstrip('-')
                if other == core:
                    other = ''
                pieces.append(pattern[pos:start] + core + other + tail + ']')
                pos = close + 1
                continue
        # An unmatched '[' falls through here and stays a literal.
        pieces.append(either(pattern[pos]))
        pos += 1
    return glob.glob(''.join(pieces))
It can also be written as:
def insensitive_glob(pattern):
    """Return the paths matching ``pattern`` with letter case ignored.

    Cased characters outside bracket expressions become two-character
    classes (``a`` -> ``[aA]``).  Bracket expressions already present in
    ``pattern`` (``[abc]``, ``[!a-z]``, ``[*]``) are kept whole and extended
    with the opposite-case characters, so they are not split into nested
    brackets.
    """
    pieces = []
    pos = 0
    while pos < len(pattern):
        c = pattern[pos]
        if c == '[':
            # A ']' directly after '[' or '[!' is a literal, not the end.
            start = pos + 1
            if pattern[start:start + 1] == '!':
                start += 1
            if pattern[start:start + 1] == ']':
                start += 1
            close = pattern.find(']', start)
            if close != -1:
                body = pattern[start:close]
                # Keep a trailing literal '-' last so it cannot become a
                # range operator once characters are appended.
                tail = '-' if len(body) > 1 and body.endswith('-') else ''
                core = body[:len(body) - len(tail)]
                other = core.swapcase().lstrip('-')
                if other == core:
                    other = ''
                pieces.append(pattern[pos:start] + core + other + tail + ']')
                pos = close + 1
                continue
        lower, upper = c.lower(), c.upper()
        # Only characters with distinct single-character cases need a class.
        if lower != upper and len(lower) == len(upper) == 1:
            pieces.append('[' + lower + upper + ']')
        else:
            pieces.append(c)
        pos += 1
    return glob.glob(''.join(pieces))
A variation of your answer that searches recursively by file name:
def insensitive_for_glob(string_file):
    """Return ``string_file`` rewritten as a case-insensitive glob pattern.

    Cased characters outside bracket expressions become two-character
    classes (``a`` -> ``[aA]``).  Bracket expressions already present
    (``[abc]``, ``[!a-z]``, ``[*]``) are kept whole and extended with the
    opposite-case characters.  Nothing is globbed here; the caller passes the
    returned string to ``glob.glob``.
    """
    pieces = []
    pos = 0
    while pos < len(string_file):
        c = string_file[pos]
        if c == '[':
            # A ']' directly after '[' or '[!' is a literal, not the end.
            start = pos + 1
            if string_file[start:start + 1] == '!':
                start += 1
            if string_file[start:start + 1] == ']':
                start += 1
            close = string_file.find(']', start)
            if close != -1:
                body = string_file[start:close]
                # Keep a trailing literal '-' last so it cannot become a
                # range operator once characters are appended.
                tail = '-' if len(body) > 1 and body.endswith('-') else ''
                core = body[:len(body) - len(tail)]
                other = core.swapcase().lstrip('-')
                if other == core:
                    other = ''
                pieces.append(string_file[pos:start] + core + other + tail + ']')
                pos = close + 1
                continue
        lower, upper = c.lower(), c.upper()
        # Only characters with distinct single-character cases need a class.
        if lower != upper and len(lower) == len(upper) == 1:
            pieces.append('[' + lower + upper + ']')
        else:
            pieces.append(c)
        pos += 1
    return ''.join(pieces)
Elsewhere in the code:
namefile = self.insensitive_for_glob(namefile)
lst_found_file = glob.glob(f'{file_path}/**/*{namefile}', recursive=True)