Speeding up string matching between two large lists
Question:
I have two large lists, each with thousands of elements, built as shown below.
I want to extract pairs of elements by matching strings derived from the entries of the two lists.
However, my approach is very slow. How can I speed it up?
import os, glob
list1 = glob.glob("/data0/*.txt")
list2 = glob.glob("/data1/*.txt")

# Brute-force matching: every path in list1 is compared against every path
# in list2, so the work grows with len(list1) * len(list2).
with open("result.txt", "w") as fout:
    for i1 in list1:
        # Key = first and fourth dot-separated fields of the file name.
        tobematched1 = os.path.basename(i1).split(".")[0] + "_" + os.path.basename(i1).split(".")[3]
        for i2 in list2:
            tobematched2 = os.path.basename(i2).split(".")[0] + "_" + os.path.basename(i2).split(".")[3]
            if tobematched1 == tobematched2:
                # One "path1;path2" pair per output line.
                fout.write(i1 + ";" + i2 + "\n")
Note: this problem is not a plain common-elements comparison as in "Common elements comparison between 2 lists".
My question is about matching strings derived from the elements
of the two lists.
Answers:
To do this fast with set intersection, you’ll need to apply the transformation (and keep track of the original value), then look that up:
import os
import glob
# Maps a pathname to the part we want to compare
def process_name(item: str) -> str:
    """Return the comparison key for a pathname.

    The key joins the first and fourth dot-separated fields of the file's
    base name with an underscore, e.g. ``/data0/a.b.c.d.txt`` -> ``a_d``.

    Raises:
        IndexError: if the base name has fewer than four dot-separated fields.
    """
    basename_bits = os.path.basename(item).split(".")
    return f"{basename_bits[0]}_{basename_bits[3]}"
# Read the filenames and map them using the transformation above.
# NOTE: a dict keeps one path per key, so if several files in the same
# directory produce the same key, only the last one globbed is retained.
map1 = {process_name(item): item for item in glob.glob("/data0/*.txt")}
map2 = {process_name(item): item for item in glob.glob("/data1/*.txt")}

# Find the common keys and print the original values.
# Dict key views support set intersection directly, so no set() copies are needed.
for common_key in map1.keys() & map2.keys():
    print(map1[common_key], map2[common_key])
I have two large lists, each with thousands of elements, built as shown below.
I want to extract pairs of elements by matching strings derived from the entries of the two lists.
However, my approach is very slow. How can I speed it up?
import os, glob
list1 = glob.glob("/data0/*.txt")
list2 = glob.glob("/data1/*.txt")

# Brute-force matching: every path in list1 is compared against every path
# in list2, so the work grows with len(list1) * len(list2).
with open("result.txt", "w") as fout:
    for i1 in list1:
        # Key = first and fourth dot-separated fields of the file name.
        tobematched1 = os.path.basename(i1).split(".")[0] + "_" + os.path.basename(i1).split(".")[3]
        for i2 in list2:
            tobematched2 = os.path.basename(i2).split(".")[0] + "_" + os.path.basename(i2).split(".")[3]
            if tobematched1 == tobematched2:
                # One "path1;path2" pair per output line.
                fout.write(i1 + ";" + i2 + "\n")
Note: this problem is not a plain common-elements comparison as in "Common elements comparison between 2 lists".
My question is about matching strings derived from the elements
of the two lists.
To do this fast with set intersection, you’ll need to apply the transformation (and keep track of the original value), then look that up:
import os
import glob
# Maps a pathname to the part we want to compare
def process_name(item: str) -> str:
    """Return the comparison key for a pathname.

    The key joins the first and fourth dot-separated fields of the file's
    base name with an underscore, e.g. ``/data0/a.b.c.d.txt`` -> ``a_d``.

    Raises:
        IndexError: if the base name has fewer than four dot-separated fields.
    """
    basename_bits = os.path.basename(item).split(".")
    return f"{basename_bits[0]}_{basename_bits[3]}"
# Read the filenames and map them using the transformation above.
# NOTE: a dict keeps one path per key, so if several files in the same
# directory produce the same key, only the last one globbed is retained.
map1 = {process_name(item): item for item in glob.glob("/data0/*.txt")}
map2 = {process_name(item): item for item in glob.glob("/data1/*.txt")}

# Find the common keys and print the original values.
# Dict key views support set intersection directly, so no set() copies are needed.
for common_key in map1.keys() & map2.keys():
    print(map1[common_key], map2[common_key])