Regex function to extract selected rows
Question:
I have a text file like this
Some text and random stuff that I don't need
2 8
2 9 T
4 9
1 10
2 10 F
7 11 T
More random stuff
How should I construct a regex function to extract both the rows with just numbers and the rows with numbers and T or F?
So far my idea for the code is this
# Asker's draft, with the block indentation restored so the structure is
# readable. `file` (the path) and `data` (the result list) are expected to be
# defined elsewhere; the regex check itself is still a placeholder.
with open(file, 'r') as log_file:
    # opening file
    file = log_file
    while True:
        line = file.readlines()
        # if line in regex function:
        data.append(line)
        # closing file
        break
How can I solve this?
Answers:
With this approach, the re pattern will match only numbers or numbers that end with the letter T or F. You could also use a for loop instead of a while loop.
import re

# Two integers separated by a single space, optionally followed by " T" or
# " F". The backslashes matter: a bare "d+" would match the letter "d".
row_pattern = re.compile(r'^\d+ \d+( [TF])?$')

matched_data = []
# `file` is the path to the log file, as in the question.
with open(file, 'r') as log_file:
    # Iterate the file object directly so lines are read lazily instead of
    # loading the whole file with readlines().
    for line in log_file:
        line = line.strip()  # drop the trailing newline so "$" can match
        if row_pattern.match(line):
            matched_data.append(line)
print(matched_data)
If some of the lines start with a letter, e.g.
T 7 11
and you want to match those as well, you should substitute the above pattern with r'^[TF]|\d+ \d+( [TF])?$'
Test Code:
import re

# Sample input: the six valid rows from the question plus four rows that
# must be rejected (letters in the wrong position).
data = """
2 8
2 9 T
4 9
1 10
2 10 F
7 11 T
5 B 37
Y 9 G
T 7 11
MG 99 Z
"""
data = data.splitlines()

# Two integers separated by a single space, optionally followed by " T" or
# " F". The backslashes matter: a bare "d+" would match the letter "d".
row_pattern = re.compile(r'^\d+ \d+( [TF])?$')

matched_data = []
for line in data:
    line = line.strip()
    if row_pattern.match(line):
        matched_data.append(line)
print(matched_data)
# ['2 8', '2 9 T', '4 9', '1 10', '2 10 F', '7 11 T']
We can use re.findall() to get all the occurrences in the entire file.
import re

# A line that starts with a digit, continues with digits and spaces, and may
# end with a single T or F flag. The class is [TF], not [T|F]: inside
# brackets "|" is a literal character, not an alternation.
regexp = r"^\d[\d ]*[TF]?"
with open("file.txt", "r") as fp:
    # Not suggested if the file is large.
    data = fp.read()
# re.M makes "^" anchor at the start of every line, not just the first.
print(re.findall(regexp, data, re.M))
output:
['2 8 ', '2 9 T', '4 9', '1 10 ', '2 10 F', '7 11 T']
For a large file, it's better to iterate line by line.
data = []
# Same pattern as the findall() version, compiled once outside the loop.
# [TF] rather than [T|F]: inside brackets "|" is a literal character.
regexp = re.compile(r"^\d[\d ]*[TF]?")
# `file` is the path to the log file, as in the question.
with open(file, 'r') as fp:
    for line in fp:  # reads one line at a time, so memory use stays flat
        _match = regexp.match(line)
        if _match:
            # Keep only the matched prefix, not the trailing newline.
            data.append(_match.group())
If you are interested in learning more about regular expressions, visit regexone
You might as well parse the lines you match into tuples of (int, int, bool | None):
import re

# Groups: (first int, second int, optional T/F flag). Runs of spaces are
# tolerated between fields and at the end of the line; re.M makes "^" and "$"
# anchor on every line of the file.
with open("file.txt", "r") as file:
    result = [
        # findall() yields "" for an unmatched optional group, so a missing
        # flag becomes None while "T"/"F" become True/False.
        (int(a), int(b), (flag == "T") if flag else None)
        for a, b, flag in re.findall(r"^(\d+)[ ]+(\d+)(?:[ ]+([TF]))?[ ]*$",
                                     file.read(), re.M)
    ]
print(result)
Output for the example file:
[(2, 8, None), (2, 9, True), (4, 9, None), (1, 10, None), (2, 10, False), (7, 11, True)]
I have a text file like this
Some text and random stuff that I don't need
2 8
2 9 T
4 9
1 10
2 10 F
7 11 T
More random stuff
How should I construct a regex function to extract both the rows with just numbers and the rows with numbers and T or F?
So far my idea for the code is this
# Asker's draft, with the block indentation restored so the structure is
# readable. `file` (the path) and `data` (the result list) are expected to be
# defined elsewhere; the regex check itself is still a placeholder.
with open(file, 'r') as log_file:
    # opening file
    file = log_file
    while True:
        line = file.readlines()
        # if line in regex function:
        data.append(line)
        # closing file
        break
How can I solve this?
With this approach, the re pattern will match only numbers or numbers that end with the letter T or F. You could also use a for loop instead of a while loop.
import re

# Two integers separated by a single space, optionally followed by " T" or
# " F". The backslashes matter: a bare "d+" would match the letter "d".
row_pattern = re.compile(r'^\d+ \d+( [TF])?$')

matched_data = []
# `file` is the path to the log file, as in the question.
with open(file, 'r') as log_file:
    # Iterate the file object directly so lines are read lazily instead of
    # loading the whole file with readlines().
    for line in log_file:
        line = line.strip()  # drop the trailing newline so "$" can match
        if row_pattern.match(line):
            matched_data.append(line)
print(matched_data)
If some of the lines start with a letter, e.g.
T 7 11
and you want to match those as well, you should substitute the above pattern with r'^[TF]|\d+ \d+( [TF])?$'
Test Code:
import re

# Sample input: the six valid rows from the question plus four rows that
# must be rejected (letters in the wrong position).
data = """
2 8
2 9 T
4 9
1 10
2 10 F
7 11 T
5 B 37
Y 9 G
T 7 11
MG 99 Z
"""
data = data.splitlines()

# Two integers separated by a single space, optionally followed by " T" or
# " F". The backslashes matter: a bare "d+" would match the letter "d".
row_pattern = re.compile(r'^\d+ \d+( [TF])?$')

matched_data = []
for line in data:
    line = line.strip()
    if row_pattern.match(line):
        matched_data.append(line)
print(matched_data)
# ['2 8', '2 9 T', '4 9', '1 10', '2 10 F', '7 11 T']
We can use re.findall() to get all the occurrences in the entire file.
import re

# A line that starts with a digit, continues with digits and spaces, and may
# end with a single T or F flag. The class is [TF], not [T|F]: inside
# brackets "|" is a literal character, not an alternation.
regexp = r"^\d[\d ]*[TF]?"
with open("file.txt", "r") as fp:
    # Not suggested if the file is large.
    data = fp.read()
# re.M makes "^" anchor at the start of every line, not just the first.
print(re.findall(regexp, data, re.M))
output:
['2 8 ', '2 9 T', '4 9', '1 10 ', '2 10 F', '7 11 T']
For a large file, it's better to iterate line by line.
data = []
# Same pattern as the findall() version, compiled once outside the loop.
# [TF] rather than [T|F]: inside brackets "|" is a literal character.
regexp = re.compile(r"^\d[\d ]*[TF]?")
# `file` is the path to the log file, as in the question.
with open(file, 'r') as fp:
    for line in fp:  # reads one line at a time, so memory use stays flat
        _match = regexp.match(line)
        if _match:
            # Keep only the matched prefix, not the trailing newline.
            data.append(_match.group())
If you are interested in learning more about regular expressions, visit regexone
You might as well parse the lines you match into tuples of (int, int, bool | None):
import re

# Groups: (first int, second int, optional T/F flag). Runs of spaces are
# tolerated between fields and at the end of the line; re.M makes "^" and "$"
# anchor on every line of the file.
with open("file.txt", "r") as file:
    result = [
        # findall() yields "" for an unmatched optional group, so a missing
        # flag becomes None while "T"/"F" become True/False.
        (int(a), int(b), (flag == "T") if flag else None)
        for a, b, flag in re.findall(r"^(\d+)[ ]+(\d+)(?:[ ]+([TF]))?[ ]*$",
                                     file.read(), re.M)
    ]
print(result)
Output for the example file:
[(2, 8, None), (2, 9, True), (4, 9, None), (1, 10, None), (2, 10, False), (7, 11, True)]