regex whole string match between numbers
Question:
I want to extract a whole word from a sentence.
Thanks to this answer,
import re
def findWholeWord(w):
    """Build a case-insensitive whole-word searcher for ``w``.

    ``w`` is inserted into the pattern verbatim, so regex metacharacters in
    it are interpreted as regex syntax; pass ``re.escape(word)`` if the word
    must be matched literally.

    Args:
        w: The word (regex fragment) to look for.

    Returns:
        The bound ``search`` method of the compiled pattern. Calling it with
        a string returns a match object (group 1 holds the word as found in
        the text) or ``None``.
    """
    # \b is a word boundary. Digits count as word characters, so a digit
    # directly adjacent to the word prevents a match with this pattern.
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
I can get whole words in cases like:
# Whitespace or punctuation around the word: each call is expected to
# return a match object.
for sentence in (
    'this is Thomas again',
    'this is,Thomas again',
    'this is,Thomas, again',
    'this is.Thomas, again',
    'this is ?Thomas again',
):
    findWholeWord('thomas')(sentence)  # -> <match object>
where symbols next to the word don’t get in the way.
However if there’s a number it doesn’t find the word.
How should I modify the expression to match cases where there’s a number next to the word? Like:
# Cases with a digit touching the word, which should also be matched.
for sentence in (
    'this is 9Thomas, again',
    'this is9Thomas again',
    'this is Thomas36 again',
):
    findWholeWord('thomas')(sentence)
Answers:
You may use this code:
import re
def findWholeWord(w):
    """Build a case-insensitive searcher for ``w`` as a whole word or next to digits.

    The pattern accepts three alternatives, tried in this order:
    one or more digits followed by the word, the word followed by one or
    more digits, or the word delimited by word boundaries on both sides.
    Adjacent digits are included in the matched text.

    ``w`` is inserted into the pattern verbatim, so regex metacharacters in
    it are interpreted as regex syntax; pass ``re.escape(word)`` if the word
    must be matched literally.

    Args:
        w: The word (regex fragment) to look for.

    Returns:
        The bound ``search`` method of the compiled pattern. Calling it with
        a string returns a match object or ``None``.
    """
    return re.compile(
        r'(?:\d+{0}|{0}\d+|\b{0}\b)'.format(w), flags=re.I
    ).search
# Print the search result (a match object or None) for each sample sentence,
# in the same order as the output listed below.
samples = [
    'this is 9Thomas, again',
    'this is9Thomas again',
    'this is Thomas36 again',
    'this is Thomas again',
    'this is,Thomas again',
    'this is,Thomas, again',
    'this is.Thomas, again',
    'this is ?Thomas again',
    'this is aThomas again',
]
for sample in samples:
    print(findWholeWord('thomas')(sample))
Output:
<re.Match object; span=(8, 15), match='9Thomas'>
<re.Match object; span=(7, 14), match='9Thomas'>
<re.Match object; span=(8, 16), match='Thomas36'>
<re.Match object; span=(8, 14), match='Thomas'>
<re.Match object; span=(8, 14), match='Thomas'>
<re.Match object; span=(8, 14), match='Thomas'>
<re.Match object; span=(8, 14), match='Thomas'>
<re.Match object; span=(9, 15), match='Thomas'>
None
(?:\d+{0}|{0}\d+|\b{0}\b)
will match the given word with one or more digits on either side, or the word on its own as a complete word.
You can use the regexp (?:\d|\b){0}(?:\d|\b)
to match the target word with either a word boundary or a digit on each side of it.
import re
def findWholeWord(w):
    """Build a case-insensitive searcher for ``w`` bounded by a digit or word boundary.

    On each side of the word the pattern requires either a single digit
    (which becomes part of the matched text) or a word boundary.

    ``w`` is inserted into the pattern verbatim, so regex metacharacters in
    it are interpreted as regex syntax; pass ``re.escape(word)`` if the word
    must be matched literally.

    Args:
        w: The word (regex fragment) to look for.

    Returns:
        The bound ``search`` method of the compiled pattern. Calling it with
        a string returns a match object or ``None``.
    """
    return re.compile(r'(?:\d|\b){0}(?:\d|\b)'.format(w), flags=re.I).search
# Report, for each sample sentence, whether the target word is found.
for s in [
        'this is Thomas again',
        'this is,Thomas again',
        'this is,Thomas, again',
        'this is.Thomas, again',
        'this is ?Thomas again',
        'this is 9Thomas, again',
        'this is9Thomas again',
        'this is Thomas36 again',
        'this is -Thomas- again',
        'athomas is no match',
        'thomason no match']:
    # The search result is truthy for a match object and falsy for None.
    print("match >" if findWholeWord('thomas')(s) else "*no match* >", s)
Output:
match > this is Thomas again
match > this is,Thomas again
match > this is,Thomas, again
match > this is.Thomas, again
match > this is ?Thomas again
match > this is 9Thomas, again
match > this is9Thomas again
match > this is Thomas36 again
match > this is -Thomas- again
*no match* > athomas is no match
*no match* > thomason no match
If you want to reuse the same target word against multiple inputs or in a loop, you can assign the result of the findWholeWord() call to a variable and then call that variable.
# Build the search function once, then reuse it for every input.
matcher = findWholeWord('thomas')
for sentence in ('this is Thomas again', 'this is,Thomas again'):
    print(matcher(sentence))
I want to extract a whole word from a sentence.
Thanks to this answer,
import re
def findWholeWord(w):
    """Build a case-insensitive whole-word searcher for ``w``.

    ``w`` is inserted into the pattern verbatim, so regex metacharacters in
    it are interpreted as regex syntax; pass ``re.escape(word)`` if the word
    must be matched literally.

    Args:
        w: The word (regex fragment) to look for.

    Returns:
        The bound ``search`` method of the compiled pattern. Calling it with
        a string returns a match object (group 1 holds the word as found in
        the text) or ``None``.
    """
    # \b is a word boundary. Digits count as word characters, so a digit
    # directly adjacent to the word prevents a match with this pattern.
    return re.compile(r'\b({0})\b'.format(w), flags=re.IGNORECASE).search
I can get whole words in cases like:
# Whitespace or punctuation around the word: each call is expected to
# return a match object.
for sentence in (
    'this is Thomas again',
    'this is,Thomas again',
    'this is,Thomas, again',
    'this is.Thomas, again',
    'this is ?Thomas again',
):
    findWholeWord('thomas')(sentence)  # -> <match object>
where symbols next to the word don’t get in the way.
However if there’s a number it doesn’t find the word.
How should I modify the expression to match cases where there’s a number next to the word? Like:
# Cases with a digit touching the word, which should also be matched.
for sentence in (
    'this is 9Thomas, again',
    'this is9Thomas again',
    'this is Thomas36 again',
):
    findWholeWord('thomas')(sentence)
You may use this code:
import re
def findWholeWord(w):
    """Build a case-insensitive searcher for ``w`` as a whole word or next to digits.

    The pattern accepts three alternatives, tried in this order:
    one or more digits followed by the word, the word followed by one or
    more digits, or the word delimited by word boundaries on both sides.
    Adjacent digits are included in the matched text.

    ``w`` is inserted into the pattern verbatim, so regex metacharacters in
    it are interpreted as regex syntax; pass ``re.escape(word)`` if the word
    must be matched literally.

    Args:
        w: The word (regex fragment) to look for.

    Returns:
        The bound ``search`` method of the compiled pattern. Calling it with
        a string returns a match object or ``None``.
    """
    return re.compile(
        r'(?:\d+{0}|{0}\d+|\b{0}\b)'.format(w), flags=re.I
    ).search
# Print the search result (a match object or None) for each sample sentence,
# in the same order as the output listed below.
samples = [
    'this is 9Thomas, again',
    'this is9Thomas again',
    'this is Thomas36 again',
    'this is Thomas again',
    'this is,Thomas again',
    'this is,Thomas, again',
    'this is.Thomas, again',
    'this is ?Thomas again',
    'this is aThomas again',
]
for sample in samples:
    print(findWholeWord('thomas')(sample))
Output:
<re.Match object; span=(8, 15), match='9Thomas'>
<re.Match object; span=(7, 14), match='9Thomas'>
<re.Match object; span=(8, 16), match='Thomas36'>
<re.Match object; span=(8, 14), match='Thomas'>
<re.Match object; span=(8, 14), match='Thomas'>
<re.Match object; span=(8, 14), match='Thomas'>
<re.Match object; span=(8, 14), match='Thomas'>
<re.Match object; span=(9, 15), match='Thomas'>
None
(?:\d+{0}|{0}\d+|\b{0}\b)
will match the given word with one or more digits on either side, or the word on its own as a complete word.
You can use the regexp (?:\d|\b){0}(?:\d|\b)
to match the target word with either a word boundary or a digit on each side of it.
import re
def findWholeWord(w):
    """Build a case-insensitive searcher for ``w`` bounded by a digit or word boundary.

    On each side of the word the pattern requires either a single digit
    (which becomes part of the matched text) or a word boundary.

    ``w`` is inserted into the pattern verbatim, so regex metacharacters in
    it are interpreted as regex syntax; pass ``re.escape(word)`` if the word
    must be matched literally.

    Args:
        w: The word (regex fragment) to look for.

    Returns:
        The bound ``search`` method of the compiled pattern. Calling it with
        a string returns a match object or ``None``.
    """
    return re.compile(r'(?:\d|\b){0}(?:\d|\b)'.format(w), flags=re.I).search
# Report, for each sample sentence, whether the target word is found.
for s in [
        'this is Thomas again',
        'this is,Thomas again',
        'this is,Thomas, again',
        'this is.Thomas, again',
        'this is ?Thomas again',
        'this is 9Thomas, again',
        'this is9Thomas again',
        'this is Thomas36 again',
        'this is -Thomas- again',
        'athomas is no match',
        'thomason no match']:
    # The search result is truthy for a match object and falsy for None.
    print("match >" if findWholeWord('thomas')(s) else "*no match* >", s)
Output:
match > this is Thomas again
match > this is,Thomas again
match > this is,Thomas, again
match > this is.Thomas, again
match > this is ?Thomas again
match > this is 9Thomas, again
match > this is9Thomas again
match > this is Thomas36 again
match > this is -Thomas- again
*no match* > athomas is no match
*no match* > thomason no match
If you want to reuse the same target word against multiple inputs or in a loop, you can assign the result of the findWholeWord() call to a variable and then call that variable.
# Build the search function once, then reuse it for every input.
matcher = findWholeWord('thomas')
for sentence in ('this is Thomas again', 'this is,Thomas again'):
    print(matcher(sentence))