Easiest way to find filenames that differ by one word?
Question:
I’m attempting to write a python script that compares a list of filenames to itself, and pulls out any filenames that are exact matches, or differ by only one word…
Something along the lines of
def FindCloseMatches(list_in_question):
match_list = []
list_one = [{x: x.split()} for x in list_in_question]
list_two = [{x: x.split()} for x in list_in_question]
# pseudo-ish
for x, y in zip(list_one, list_two):
if x.values in list_one match all but one of y.values in list_two:
match_list.append(x, y)
How would I go about comparing a list of filenames against itself and finding any pairs that are identical or differ by at most one word?
For example, if I have a file named WaterServiceLines.pdf
and CustomerWaterServiceLines.pdf
(they are not all formatted the same way in terms of spaces and underscores etc.) then that would be a match. But WaterServiceLines.pdf
and SewerMainLines.pdf
would not be a match.
Answers:
Something like this?
This assumes that every word in a filename starts with a capital letter (CamelCase), so the capitals mark the word boundaries.
import Levenshtein
import re
def _within_one_word(words_a, words_b):
    """Return True when two word lists are equal or differ by a single word.

    A single-word difference is one word added, removed or replaced, i.e. a
    word-level edit distance of at most 1.
    """
    if abs(len(words_a) - len(words_b)) > 1:
        return False
    # Skip the words both lists share at the front ...
    start = 0
    limit = min(len(words_a), len(words_b))
    while start < limit and words_a[start] == words_b[start]:
        start += 1
    # ... and at the back, never going past the words already matched.
    end_a, end_b = len(words_a), len(words_b)
    while (end_a > start and end_b > start
           and words_a[end_a - 1] == words_b[end_b - 1]):
        end_a -= 1
        end_b -= 1
    # What is left in the middle is the difference: at most one word per side.
    return end_a - start <= 1 and end_b - start <= 1


def FindCloseMatches(filenames):
    """Return pairs of filenames that are identical or differ by one word.

    Words are taken to start at each capital letter (CamelCase); any text
    before the first capital counts as a word of its own. The file extension
    is ignored for the comparison and left out of the result.

    Args:
        filenames: list of filename strings such as 'WaterServiceLines.pdf'.

    Returns:
        A list of (name_a, name_b) tuples of extension-less names, one tuple
        per matching pair, where name_a comes before name_b in ``filenames``.
    """
    # Drop only the final extension: splitting at the first dot would cut
    # names such as 'Water.Service.Lines.pdf' down to 'Water'.
    stems = [name.rsplit('.', 1)[0] for name in filenames]
    # Split each stem into words; the capturing group keeps the words
    # themselves and the filter removes the empty strings between them.
    words = [
        [piece for piece in re.split(r"([A-Z][^A-Z]*)", stem) if piece != '']
        for stem in stems
    ]
    # Compare every name with each name after it, so a pair is reported once.
    matches = []
    for i, words_a in enumerate(words):
        for j in range(i + 1, len(words)):
            if _within_one_word(words_a, words[j]):
                matches.append((stems[i], stems[j]))
    return matches


l = ['WaterServiceLines.pdf', 'CustomerWaterServiceLines.pdf', 'SewerMainLines.pdf', 'WaterServiceLines.pdf']
print(FindCloseMatches(l))
Output:
[('WaterServiceLines', 'CustomerWaterServiceLines'), ('WaterServiceLines', 'WaterServiceLines'), ('CustomerWaterServiceLines', 'WaterServiceLines')]
The `Levenshtein` module imported above can be installed with `pip install levenshtein`.
If you want the file types in your output:
import Levenshtein
import re
def _within_one_word(words_a, words_b):
    """Return True when two word lists are equal or differ by a single word.

    A single-word difference is one word added, removed or replaced, i.e. a
    word-level edit distance of at most 1.
    """
    if abs(len(words_a) - len(words_b)) > 1:
        return False
    # Skip the words both lists share at the front ...
    start = 0
    limit = min(len(words_a), len(words_b))
    while start < limit and words_a[start] == words_b[start]:
        start += 1
    # ... and at the back, never going past the words already matched.
    end_a, end_b = len(words_a), len(words_b)
    while (end_a > start and end_b > start
           and words_a[end_a - 1] == words_b[end_b - 1]):
        end_a -= 1
        end_b -= 1
    # What is left in the middle is the difference: at most one word per side.
    return end_a - start <= 1 and end_b - start <= 1


def FindCloseMatches(filenames):
    """Return pairs of filenames that are identical or differ by one word.

    Words are taken to start at each capital letter (CamelCase); any text
    before the first capital counts as a word of its own. The file extension
    is ignored for the comparison, but the result contains the filenames
    exactly as they were passed in, extension included.

    Args:
        filenames: list of filename strings such as 'WaterServiceLines.pdf'.

    Returns:
        A list of (name_a, name_b) tuples of original filenames, one tuple
        per matching pair, where name_a comes before name_b in ``filenames``.
    """
    # Keep the names as given so each result carries its own extension; a
    # stem -> extension lookup would mix up 'Report.pdf' and 'Report.docx'
    # and would invent an extension for names that have none.
    originals = list(filenames)
    # Drop only the final extension: splitting at the first dot would cut
    # names such as 'Water.Service.Lines.pdf' down to 'Water'.
    stems = [name.rsplit('.', 1)[0] for name in originals]
    # Split each stem into words; the capturing group keeps the words
    # themselves and the filter removes the empty strings between them.
    words = [
        [piece for piece in re.split(r"([A-Z][^A-Z]*)", stem) if piece != '']
        for stem in stems
    ]
    # Compare every name with each name after it, so a pair is reported once.
    matches = []
    for i, words_a in enumerate(words):
        for j in range(i + 1, len(words)):
            if _within_one_word(words_a, words[j]):
                matches.append((originals[i], originals[j]))
    return matches


l = ['WaterServiceLines.pdf', 'CustomerWaterServiceLines.pdf', 'SewerMainLines.pdf', 'WaterServiceLines.pdf']
print(FindCloseMatches(l))
Output:
[('WaterServiceLines.pdf', 'CustomerWaterServiceLines.pdf'), ('WaterServiceLines.pdf', 'WaterServiceLines.pdf'), ('CustomerWaterServiceLines.pdf', 'WaterServiceLines.pdf')]
I’m attempting to write a python script that compares a list of filenames to itself, and pulls out any filenames that are exact matches, or differ by only one word…
Something along the lines of
def FindCloseMatches(list_in_question):
match_list = []
list_one = [{x: x.split()} for x in list_in_question]
list_two = [{x: x.split()} for x in list_in_question]
# pseudo-ish
for x, y in zip(list_one, list_two):
if x.values in list_one match all but one of y.values in list_two:
match_list.append(x, y)
How would I go about comparing a list of filenames against itself and finding any pairs that are identical or differ by at most one word?
For example, if I have a file named WaterServiceLines.pdf
and CustomerWaterServiceLines.pdf
(they are not all formatted the same way in terms of spaces and underscores etc.) then that would be a match. But WaterServiceLines.pdf
and SewerMainLines.pdf
would not be a match.
Something like this?
This assumes that every word in a filename starts with a capital letter (CamelCase), so the capitals mark the word boundaries.
import Levenshtein
import re
def _within_one_word(words_a, words_b):
    """Return True when two word lists are equal or differ by a single word.

    A single-word difference is one word added, removed or replaced, i.e. a
    word-level edit distance of at most 1.
    """
    if abs(len(words_a) - len(words_b)) > 1:
        return False
    # Skip the words both lists share at the front ...
    start = 0
    limit = min(len(words_a), len(words_b))
    while start < limit and words_a[start] == words_b[start]:
        start += 1
    # ... and at the back, never going past the words already matched.
    end_a, end_b = len(words_a), len(words_b)
    while (end_a > start and end_b > start
           and words_a[end_a - 1] == words_b[end_b - 1]):
        end_a -= 1
        end_b -= 1
    # What is left in the middle is the difference: at most one word per side.
    return end_a - start <= 1 and end_b - start <= 1


def FindCloseMatches(filenames):
    """Return pairs of filenames that are identical or differ by one word.

    Words are taken to start at each capital letter (CamelCase); any text
    before the first capital counts as a word of its own. The file extension
    is ignored for the comparison and left out of the result.

    Args:
        filenames: list of filename strings such as 'WaterServiceLines.pdf'.

    Returns:
        A list of (name_a, name_b) tuples of extension-less names, one tuple
        per matching pair, where name_a comes before name_b in ``filenames``.
    """
    # Drop only the final extension: splitting at the first dot would cut
    # names such as 'Water.Service.Lines.pdf' down to 'Water'.
    stems = [name.rsplit('.', 1)[0] for name in filenames]
    # Split each stem into words; the capturing group keeps the words
    # themselves and the filter removes the empty strings between them.
    words = [
        [piece for piece in re.split(r"([A-Z][^A-Z]*)", stem) if piece != '']
        for stem in stems
    ]
    # Compare every name with each name after it, so a pair is reported once.
    matches = []
    for i, words_a in enumerate(words):
        for j in range(i + 1, len(words)):
            if _within_one_word(words_a, words[j]):
                matches.append((stems[i], stems[j]))
    return matches


l = ['WaterServiceLines.pdf', 'CustomerWaterServiceLines.pdf', 'SewerMainLines.pdf', 'WaterServiceLines.pdf']
print(FindCloseMatches(l))
Output:
[('WaterServiceLines', 'CustomerWaterServiceLines'), ('WaterServiceLines', 'WaterServiceLines'), ('CustomerWaterServiceLines', 'WaterServiceLines')]
The `Levenshtein` module imported above can be installed with `pip install levenshtein`.
If you want the file types in your output:
import Levenshtein
import re
def _within_one_word(words_a, words_b):
    """Return True when two word lists are equal or differ by a single word.

    A single-word difference is one word added, removed or replaced, i.e. a
    word-level edit distance of at most 1.
    """
    if abs(len(words_a) - len(words_b)) > 1:
        return False
    # Skip the words both lists share at the front ...
    start = 0
    limit = min(len(words_a), len(words_b))
    while start < limit and words_a[start] == words_b[start]:
        start += 1
    # ... and at the back, never going past the words already matched.
    end_a, end_b = len(words_a), len(words_b)
    while (end_a > start and end_b > start
           and words_a[end_a - 1] == words_b[end_b - 1]):
        end_a -= 1
        end_b -= 1
    # What is left in the middle is the difference: at most one word per side.
    return end_a - start <= 1 and end_b - start <= 1


def FindCloseMatches(filenames):
    """Return pairs of filenames that are identical or differ by one word.

    Words are taken to start at each capital letter (CamelCase); any text
    before the first capital counts as a word of its own. The file extension
    is ignored for the comparison, but the result contains the filenames
    exactly as they were passed in, extension included.

    Args:
        filenames: list of filename strings such as 'WaterServiceLines.pdf'.

    Returns:
        A list of (name_a, name_b) tuples of original filenames, one tuple
        per matching pair, where name_a comes before name_b in ``filenames``.
    """
    # Keep the names as given so each result carries its own extension; a
    # stem -> extension lookup would mix up 'Report.pdf' and 'Report.docx'
    # and would invent an extension for names that have none.
    originals = list(filenames)
    # Drop only the final extension: splitting at the first dot would cut
    # names such as 'Water.Service.Lines.pdf' down to 'Water'.
    stems = [name.rsplit('.', 1)[0] for name in originals]
    # Split each stem into words; the capturing group keeps the words
    # themselves and the filter removes the empty strings between them.
    words = [
        [piece for piece in re.split(r"([A-Z][^A-Z]*)", stem) if piece != '']
        for stem in stems
    ]
    # Compare every name with each name after it, so a pair is reported once.
    matches = []
    for i, words_a in enumerate(words):
        for j in range(i + 1, len(words)):
            if _within_one_word(words_a, words[j]):
                matches.append((originals[i], originals[j]))
    return matches


l = ['WaterServiceLines.pdf', 'CustomerWaterServiceLines.pdf', 'SewerMainLines.pdf', 'WaterServiceLines.pdf']
print(FindCloseMatches(l))
Output:
[('WaterServiceLines.pdf', 'CustomerWaterServiceLines.pdf'), ('WaterServiceLines.pdf', 'WaterServiceLines.pdf'), ('CustomerWaterServiceLines.pdf', 'WaterServiceLines.pdf')]