Python – Search a list of a group strings in text file
Question:
I want to search a list of group of strings inside a text file (.txt or .log).
- it must include group A or B (or CDE..).
- Within a group (A or B), every word must appear in the same line, but the words do not have to be adjacent (e.g. ["123456", "Login"] or ["123457", "Login"]). If all words of a group are in the same line, save that line to a new txt file.
Some of example output line:
20221110,1668057560.965,AE111,123457,0,"Action=Account Login,XXX,XXX",XXX,XXX
20221110,1668057560.965,AE112,123458,0,"Action=Account Login,XXX,XXX",XXX,XXX
20221111,1668057560.965,AE113,123458,0,"Action=Order,XXX,XXX",XXX,XXX
below is my code:
import os, re
# Directory holding the .log files. The trailing backslash has to be escaped,
# otherwise it escapes the closing quote and the literal never terminates.
path = "Log\\"
file_list = [path + f for f in os.listdir(path) if f.endswith('.log')]
keep_phrases1 = ["123456", "Login"]
keep_phrases2 = ["123457", "Login"]
# Each pattern matches the words of one group as whole words (\b), in the
# given order, with anything in between (.*?) on the same line.
pat = r"\b.*?\b".join([re.escape(word) for word in keep_phrases1])
pat = re.compile(r"\b" + pat + r"\b")
pat2 = r"\b.*?\b".join([re.escape(word) for word in keep_phrases2])
pat2 = re.compile(r"\b" + pat2 + r"\b")
print(pat2,pat)
# NOTE(review): the snippet never defined `outfile`; this file name is a
# placeholder - replace it with the intended output path.
outfile = "matched_lines.txt"
if len(file_list) != 0:
    for infile in sorted(file_list):
        with open(infile, encoding="latin-1") as f:
            f = f.readlines()
            for line in f:
                found1 = pat.search(line)
                found2 = pat2.search(line)
                if found1 or found2:
                    # Append, so matches from every input file accumulate.
                    with open(outfile, "a") as wf:
                        wf.write(line)
It works for me, but it is not easy to add more groups of words, and I think the code is hard to understand.
My question is: how can I simplify the code?
How can I make it easier to add other groups to search for, e.g. ["123458", "Login"], ["123456", "order"], ["123457", "order"]?
Answers:
import os, re
# Directory holding the .log files. The trailing backslash has to be escaped,
# otherwise it escapes the closing quote and the literal never terminates.
path = "Log\\"
# Full (relative) path of every .log file directly inside `path`.
file_list = [path + f for f in os.listdir(path) if f.endswith('.log')]
Put all keep_phrases groups in one container. I chose a dictionary, but since the groups are only identified by their order, a list would work as well:
# Every entry is one group of words that must all occur on the same line.
keep_phrases = {
    'keep_phrases1': ["123456", "Login"],
    'keep_phrases2': ["123457", "Login"],
}
# Alternative: a plain list of groups works just as well.
# keep_phrases = [["123456", "Login"], ["123457", "Login"]]
Now let’s generate a list with the compiled patterns:
def compile_pattern(keep_phrase):
    """Compile a regex that finds all words of *keep_phrase* in one line.

    The words must occur as whole words (delimited by ``\\b`` word
    boundaries) and in the order given; any text may sit between them.

    Args:
        keep_phrase: iterable of literal words; each is passed through
            ``re.escape`` so regex metacharacters are matched literally.

    Returns:
        The compiled ``re.Pattern``; use ``.search(line)`` to test a line.
    """
    # Non-greedy gap between consecutive words, each side on a word boundary.
    pat = r"\b.*?\b".join([re.escape(word) for word in keep_phrase])
    pat = re.compile(r"\b" + pat + r"\b")
    return pat
# One compiled pattern per group; the dictionary keys themselves are not needed.
patterns = [compile_pattern(group) for group in keep_phrases.values()]
# Had keep_phrases been a list, this would be:
# patterns = [compile_pattern(keep_phrase) for keep_phrase in keep_phrases]
Finally, we look for matches for every pattern and if we get any finding, we write to file.
# NOTE(review): the snippet never defined `outfile`; this file name is a
# placeholder - replace it with the intended output path.
outfile = "matched_lines.txt"
if file_list:
    for infile in sorted(file_list):
        with open(infile, encoding="latin-1") as f:
            # Stream the file line by line instead of loading it whole.
            for line in f:
                # Generator form stops at the first pattern that matches.
                if any(pat.search(line) for pat in patterns):
                    # Append, so matches from every input file accumulate.
                    with open(outfile, "a") as wf:
                        wf.write(line)
Try this. I read the whole file into a single string to keep the code fast and readable; findall returns a list with all matching lines of the file.
If memory is a problem, the same pattern also works on individual lines:
import re
file_list = ["sit.txt"]
keep_phrases = [["123456", "Login"], ["123457", "Login"]]
# One alternative per group: lazily skip to the first word, require a word
# boundary after it, then the second word, then the rest of the line up to
# and including its newline (or the end of the text). "." never crosses a
# newline, so each match stays within a single line.
pat = [
    r"(?:.*?(?:" + re.escape(p1) + r"\b.*?" + re.escape(p2) + r".*?(?:\n|$)))"
    for p1, p2 in keep_phrases
]
pat = r"|".join(pat)
for infile in sorted(file_list):
    with open(infile, encoding="latin-1") as f:
        text = f.read()
    # Only non-capturing groups are used, so findall returns whole matches.
    print(re.findall(pat, text))
Without regex
def match_words(line, words):
    """Return True when every word of *words* occurs somewhere in *line*.

    Args:
        line: either the raw line (``str``) or the list of its fields, e.g.
            the result of ``line.split(",")``.
        words: the group of words that must all be present.

    Returns:
        ``True`` if each word is a substring of the line (or of at least one
        field), ``False`` otherwise. An empty *words* yields ``True``.

    Note:
        This is a plain substring test, so ``"123457"`` also matches inside
        ``"1234570"``; use a regex with word boundaries if that matters.
    """
    # A bare string would otherwise be iterated character by character.
    fields = [line] if isinstance(line, str) else line
    # Substring test per field: "Login" must be found inside a field such as
    # '"Action=Account Login', which an exact membership test would miss.
    return all(any(word in field for field in fields) for word in words)
# NOTE(review): `infile`, `outfile`, `keep_phrases1` and `keep_phrases2` are
# expected to be defined by the surrounding script; they are not set here.
with open(infile, encoding="latin-1") as f:
    # Stream the file line by line instead of loading it whole.
    for line in f:
        split_line = line.split(",")
        # Keep the line as soon as one group is fully present.
        if any(match_words(split_line, group) for group in [keep_phrases1, keep_phrases2]):
            with open(outfile, "a") as wf:
                wf.write(line)
I want to search a list of group of strings inside a text file (.txt or .log).
- it must include group A or B (or CDE..).
- Within a group (A or B), every word must appear in the same line, but the words do not have to be adjacent (e.g. ["123456", "Login"] or ["123457", "Login"]). If all words of a group are in the same line, save that line to a new txt file.
Some of example output line:
20221110,1668057560.965,AE111,123457,0,"Action=Account Login,XXX,XXX",XXX,XXX
20221110,1668057560.965,AE112,123458,0,"Action=Account Login,XXX,XXX",XXX,XXX
20221111,1668057560.965,AE113,123458,0,"Action=Order,XXX,XXX",XXX,XXX
below is my code:
import os, re
# Directory holding the .log files. The trailing backslash has to be escaped,
# otherwise it escapes the closing quote and the literal never terminates.
path = "Log\\"
file_list = [path + f for f in os.listdir(path) if f.endswith('.log')]
keep_phrases1 = ["123456", "Login"]
keep_phrases2 = ["123457", "Login"]
# Each pattern matches the words of one group as whole words (\b), in the
# given order, with anything in between (.*?) on the same line.
pat = r"\b.*?\b".join([re.escape(word) for word in keep_phrases1])
pat = re.compile(r"\b" + pat + r"\b")
pat2 = r"\b.*?\b".join([re.escape(word) for word in keep_phrases2])
pat2 = re.compile(r"\b" + pat2 + r"\b")
print(pat2,pat)
# NOTE(review): the snippet never defined `outfile`; this file name is a
# placeholder - replace it with the intended output path.
outfile = "matched_lines.txt"
if len(file_list) != 0:
    for infile in sorted(file_list):
        with open(infile, encoding="latin-1") as f:
            f = f.readlines()
            for line in f:
                found1 = pat.search(line)
                found2 = pat2.search(line)
                if found1 or found2:
                    # Append, so matches from every input file accumulate.
                    with open(outfile, "a") as wf:
                        wf.write(line)
It works for me, but it is not easy to add more groups of words, and I think the code is hard to understand.
My question is: how can I simplify the code?
How can I make it easier to add other groups to search for, e.g. ["123458", "Login"], ["123456", "order"], ["123457", "order"]?
import os, re
# Directory holding the .log files. The trailing backslash has to be escaped,
# otherwise it escapes the closing quote and the literal never terminates.
path = "Log\\"
# Full (relative) path of every .log file directly inside `path`.
file_list = [path + f for f in os.listdir(path) if f.endswith('.log')]
Put all keep_phrases groups in one container. I chose a dictionary, but since the groups are only identified by their order, a list would work as well:
# Every entry is one group of words that must all occur on the same line.
keep_phrases = {
    'keep_phrases1': ["123456", "Login"],
    'keep_phrases2': ["123457", "Login"],
}
# Alternative: a plain list of groups works just as well.
# keep_phrases = [["123456", "Login"], ["123457", "Login"]]
Now let’s generate a list with the compiled patterns:
def compile_pattern(keep_phrase):
    """Compile a regex that finds all words of *keep_phrase* in one line.

    The words must occur as whole words (delimited by ``\\b`` word
    boundaries) and in the order given; any text may sit between them.

    Args:
        keep_phrase: iterable of literal words; each is passed through
            ``re.escape`` so regex metacharacters are matched literally.

    Returns:
        The compiled ``re.Pattern``; use ``.search(line)`` to test a line.
    """
    # Non-greedy gap between consecutive words, each side on a word boundary.
    pat = r"\b.*?\b".join([re.escape(word) for word in keep_phrase])
    pat = re.compile(r"\b" + pat + r"\b")
    return pat
# One compiled pattern per group; the dictionary keys themselves are not needed.
patterns = [compile_pattern(group) for group in keep_phrases.values()]
# Had keep_phrases been a list, this would be:
# patterns = [compile_pattern(keep_phrase) for keep_phrase in keep_phrases]
Finally, we look for matches for every pattern and if we get any finding, we write to file.
# NOTE(review): the snippet never defined `outfile`; this file name is a
# placeholder - replace it with the intended output path.
outfile = "matched_lines.txt"
if file_list:
    for infile in sorted(file_list):
        with open(infile, encoding="latin-1") as f:
            # Stream the file line by line instead of loading it whole.
            for line in f:
                # Generator form stops at the first pattern that matches.
                if any(pat.search(line) for pat in patterns):
                    # Append, so matches from every input file accumulate.
                    with open(outfile, "a") as wf:
                        wf.write(line)
Try this. I read the whole file into a single string to keep the code fast and readable; findall returns a list with all matching lines of the file.
If memory is a problem, the same pattern also works on individual lines:
import re
file_list = ["sit.txt"]
keep_phrases = [["123456", "Login"], ["123457", "Login"]]
# One alternative per group: lazily skip to the first word, require a word
# boundary after it, then the second word, then the rest of the line up to
# and including its newline (or the end of the text). "." never crosses a
# newline, so each match stays within a single line.
pat = [
    r"(?:.*?(?:" + re.escape(p1) + r"\b.*?" + re.escape(p2) + r".*?(?:\n|$)))"
    for p1, p2 in keep_phrases
]
pat = r"|".join(pat)
for infile in sorted(file_list):
    with open(infile, encoding="latin-1") as f:
        text = f.read()
    # Only non-capturing groups are used, so findall returns whole matches.
    print(re.findall(pat, text))
Without regex
def match_words(line, words):
    """Return True when every word of *words* occurs somewhere in *line*.

    Args:
        line: either the raw line (``str``) or the list of its fields, e.g.
            the result of ``line.split(",")``.
        words: the group of words that must all be present.

    Returns:
        ``True`` if each word is a substring of the line (or of at least one
        field), ``False`` otherwise. An empty *words* yields ``True``.

    Note:
        This is a plain substring test, so ``"123457"`` also matches inside
        ``"1234570"``; use a regex with word boundaries if that matters.
    """
    # A bare string would otherwise be iterated character by character.
    fields = [line] if isinstance(line, str) else line
    # Substring test per field: "Login" must be found inside a field such as
    # '"Action=Account Login', which an exact membership test would miss.
    return all(any(word in field for field in fields) for word in words)
# NOTE(review): `infile`, `outfile`, `keep_phrases1` and `keep_phrases2` are
# expected to be defined by the surrounding script; they are not set here.
with open(infile, encoding="latin-1") as f:
    # Stream the file line by line instead of loading it whole.
    for line in f:
        split_line = line.split(",")
        # Keep the line as soon as one group is fully present.
        if any(match_words(split_line, group) for group in [keep_phrases1, keep_phrases2]):
            with open(outfile, "a") as wf:
                wf.write(line)