regex.findall with overlapped=True does not return a match when one alternative is a prefix of another
Question:
import regex

# Text to search; lower-cased before matching.
product_detail = "yyy target1 target2 xxx".lower()

# Each pattern is an alternation of keywords wrapped in \b word boundaries.
# In p1 the first alternative ("target1") is a prefix of the second one;
# in p2 neither alternative is a prefix of the other.
p1 = r"\btarget1\b|\btarget1 target2\b"
p2 = r"\btarget2\b|\btarget1 target2\b"

for pattern in [p1, p2]:
    # overlapped=True asks the regex module to report overlapping matches too.
    matches = regex.findall(pattern, product_detail, overlapped=True)
    print(matches)
Why do the matches from p1 only give ['target1']
as output, without 'target1 target2',
while the matches from p2 successfully give ['target1 target2', 'target2']
as output?
Also, if you can provide a fix, how do I generalise it?
I have a list of 10,000 target words, and it's not going to be feasible to hardcode them.
Answers:
Here is an example of what I had in mind with my comment on building a list of patterns separating common prefixes:
from itertools import accumulate, groupby, zip_longest

try:
    import regex
except ImportError:
    # Only findall() without extra flags is used below, which the standard
    # library's re module provides as well.
    import re as regex

# Text to search; lower-cased before matching.
product_detail = "yyy target1 target2 xxx".lower()
keywords = ["target1", "target2", "target1 target2", "target3"]

# Sorting places every keyword directly after the keywords it extends.
keywords.sort()

# For each keyword, yield the head of its prefix chain: the previous head is
# kept while the keyword starts with it, otherwise the keyword becomes the
# new head.
groups = accumulate(keywords, lambda g, k: g if k.startswith(g) else k)

# Bucket the keywords by their head, so each bucket holds a keyword followed
# by the keywords that start with it.
patterns = (g for _, (*g,) in groupby(keywords, lambda _: next(groups)))

# Transpose the buckets: the i-th pattern takes the i-th member of every
# bucket, so keywords from one bucket never share a pattern. filter(None, ...)
# drops the padding zip_longest adds for shorter buckets.
patterns = (filter(None, g) for g in zip_longest(*patterns))

# Wrap each keyword in \b word boundaries and join them into an alternation.
# NOTE: keywords are inserted verbatim, so any regex metacharacters in them
# would be interpreted as pattern syntax.
patterns = [r"\b" + r"\b|\b".join(g) + r"\b" for g in patterns]
# [r'\btarget1\b|\btarget2\b|\btarget3\b', r'\btarget1 target2\b']

for pattern in patterns:
    matches = regex.findall(pattern, product_detail)
    print(matches)
output:
['target1', 'target2']
['target1 target2']
import regex

# Text to search; lower-cased before matching.
product_detail = "yyy target1 target2 xxx".lower()

# Each pattern is an alternation of keywords wrapped in \b word boundaries.
# In p1 the first alternative ("target1") is a prefix of the second one;
# in p2 neither alternative is a prefix of the other.
p1 = r"\btarget1\b|\btarget1 target2\b"
p2 = r"\btarget2\b|\btarget1 target2\b"

for pattern in [p1, p2]:
    # overlapped=True asks the regex module to report overlapping matches too.
    matches = regex.findall(pattern, product_detail, overlapped=True)
    print(matches)
Why do the matches from p1 only give ['target1']
as output, without 'target1 target2',
while the matches from p2 successfully give ['target1 target2', 'target2']
as output?
Also, if you can provide a fix, how do I generalise it?
I have a list of 10,000 target words, and it's not going to be feasible to hardcode them.
Here is an example of what I had in mind with my comment on building a list of patterns separating common prefixes:
from itertools import accumulate, groupby, zip_longest

try:
    import regex
except ImportError:
    # Only findall() without extra flags is used below, which the standard
    # library's re module provides as well.
    import re as regex

# Text to search; lower-cased before matching.
product_detail = "yyy target1 target2 xxx".lower()
keywords = ["target1", "target2", "target1 target2", "target3"]

# Sorting places every keyword directly after the keywords it extends.
keywords.sort()

# For each keyword, yield the head of its prefix chain: the previous head is
# kept while the keyword starts with it, otherwise the keyword becomes the
# new head.
groups = accumulate(keywords, lambda g, k: g if k.startswith(g) else k)

# Bucket the keywords by their head, so each bucket holds a keyword followed
# by the keywords that start with it.
patterns = (g for _, (*g,) in groupby(keywords, lambda _: next(groups)))

# Transpose the buckets: the i-th pattern takes the i-th member of every
# bucket, so keywords from one bucket never share a pattern. filter(None, ...)
# drops the padding zip_longest adds for shorter buckets.
patterns = (filter(None, g) for g in zip_longest(*patterns))

# Wrap each keyword in \b word boundaries and join them into an alternation.
# NOTE: keywords are inserted verbatim, so any regex metacharacters in them
# would be interpreted as pattern syntax.
patterns = [r"\b" + r"\b|\b".join(g) + r"\b" for g in patterns]
# [r'\btarget1\b|\btarget2\b|\btarget3\b', r'\btarget1 target2\b']

for pattern in patterns:
    matches = regex.findall(pattern, product_detail)
    print(matches)
output:
['target1', 'target2']
['target1 target2']