Can these pairs of regexes be simplified into one?
Question:
I’m trying to fetch twitter usernames from strings. My current solution looks like this
def get_username(string):
    """Extract a Twitter username from free-form text.

    A ``twitter.com/<name>`` URL anywhere in ``string`` takes priority.
    Otherwise the word "twitter" followed by any mix of whitespace, ``:``
    or ``@`` and then a name is used. Matching is case-insensitive.

    Returns the username, or None when neither form is present.
    """
    # The escapes matter: ``\.`` is a literal dot and ``\s`` is whitespace.
    # Without the backslash, ``[s:@]`` only accepts the letter "s", so
    # inputs such as "Twitter foo123" can never match.
    url_pattern = re.compile(r'twitter\.com/([a-z0-9_.-]+)', re.IGNORECASE)
    label_pattern = re.compile(r'twitter[\s:@]+([a-z0-9_.-]+)', re.IGNORECASE)
    # The URL form is tried first, so it wins even when a "Twitter:" label
    # appears earlier in the string.
    match = url_pattern.search(string) or label_pattern.search(string)
    return match.group(1) if match else None
Examples
# Sample inputs from the question. get_username() returns the captured
# name or None; these bare calls discard the result.
get_username("Twitter: https://twitter.com/foo123")
get_username("Twitter: twitter.com/foo123")
get_username("https://twitter.com/foo123")
get_username("https://twitter.com/foo123?blah")
get_username("Twitter foo123")
get_username("Twitter @foo123")
get_username("Twitter: foo123")
get_username("Twitter: foo123 | youtube: ...")
I’m wondering if my two regexes can be simplified into one. My best attempt was
# Combined single-regex attempt. The regex engine reports the leftmost match,
# so on "Twitter: https://twitter.com/foo123" the label alternative captures
# "https" before the URL form further right is ever tried.
pattern = re.compile(r'twitter(?:(?:\.com/)|(?:[\s:@]+))([a-z0-9_.-]+)', re.IGNORECASE)
but this fails on the first example because Twitter: https
matches before twitter.com/foo123
.
Answers:
If it always ends with the username, just use (\w+)$
def get_username(string):
    """Return the run of word characters at the end of ``string``.

    Only useful when the username is always the last thing in the text.
    Returns None when the string does not end in a word character
    (for example when it ends in punctuation, or is empty).
    """
    # ``\w`` (word character) needs its backslash; a bare ``w`` would only
    # match the literal letter "w".
    if match1 := re.search(r'(\w+)$', string):
        return match1.group(1)
    return None
I’d try a negative lookahead of (?!https?://)
to exclude all usernames which appear to start with http://
or https://
.
twitter(?:(?:\.com/)|(?:[\s:@]+))(?!https?://)([a-z0-9_.-]+)
Add greedy quantifier .*
to the following regex pattern '.*twitter(?:(?:\.com/)|(?::?\s+@?))([a-z0-9_.-]+)'
to skip previous (optional) twitter
keywords and catch the last one:
def get_username(string):
    """Return the username attached to the last usable "twitter" keyword.

    The leading greedy ``.*`` makes the engine prefer the right-most
    "twitter" that is followed either by ``.com/`` or by an optional ``:``,
    whitespace and an optional ``@``. Matching is case-insensitive.

    Prints the username when one is found, and returns it; returns None
    (printing nothing) otherwise.
    """
    # ``\.`` is a literal dot and ``\s`` is whitespace; without the
    # backslashes the label form "Twitter foo123" cannot match.
    pat = re.compile(r'.*twitter(?:(?:\.com/)|(?::?\s+@?))([a-z0-9_.-]+)', re.IGNORECASE)
    if (match := pat.search(string)):
        print(match.group(1))
        return match.group(1)
    return None


get_username("Twitter: https://twitter.com/foo123")
get_username("Twitter: twitter.com/foo123")
get_username("https://twitter.com/foo123")
get_username("https://twitter.com/foo123?blah")
get_username("Twitter foo123")
get_username("Twitter @foo123")
get_username("Twitter: foo123")
get_username("Twitter: foo123 | youtube: ...")
# Misspelled keyword: no match, so nothing is printed for this call.
get_username("Twitt11er: foo123 | youtube: ...")
foo123
foo123
foo123
foo123
foo123
foo123
foo123
foo123
If there can be multiple matches, you can use a negative lookahead to rule out twitter or http://
or https://
to the right, and get the capture group 1 value.
\btwitter(?:\.com/|(?!:?\s*(?:https?://|twitter\b)):?\s+@?)([\w.-]+)
Explanation
\btwitter
Match the word twitter
(?:
Non capture group for the alternatives
\.com/
Match .com/
|
Or
(?!:?\s*(?:https?://|twitter\b))
Negative lookahead, assert not http:// or https:// or the word twitter preceded by an optional :
and whitespace chars directly to the right of the current position
:?\s+@?)
Match an optional :
1+ whitespace chars and optional @
([\w.-]+)
Capture group 1, match 1+ of the listed characters
import re
# Single-pattern solution. After the word "twitter" it accepts either
#   \.com/            the URL form, or
#   :?\s+@?           an optional colon, whitespace and an optional "@",
# and the negative lookahead rejects the second form when what follows is
# a URL scheme or another "twitter" word, so the search moves on to the
# later occurrence instead of capturing "https" or "twitter".
pattern = re.compile(
    r"\btwitter(?:\.com/|:?(?!\s*(?:https?://|twitter\b))\s+@?)([\w.-]+)",
    re.IGNORECASE,
)


def get_username(string):
    """Return the Twitter username found in ``string``, or None.

    Uses the module-level ``pattern``; the username is capture group 1 of
    the first match.
    """
    m = pattern.search(string)
    if m:
        return m.group(1)
    return None


print(get_username("Twitter: https://twitter.com/foo123"))
print(get_username("Twitter: twitter.com/foo123"))
print(get_username("https://twitter.com/foo123"))
print(get_username("https://twitter.com/foo123?blah"))
print(get_username("Twitter foo123"))
print(get_username("Twitter @foo123"))
print(get_username("Twitter: foo123"))
print(get_username("Twitter: foo123 | youtube: ..."))
Output
foo123
foo123
foo123
foo123
foo123
foo123
foo123
foo123
I’m trying to fetch twitter usernames from strings. My current solution looks like this
def get_username(string):
    """Extract a Twitter username from free-form text.

    A ``twitter.com/<name>`` URL anywhere in ``string`` takes priority.
    Otherwise the word "twitter" followed by any mix of whitespace, ``:``
    or ``@`` and then a name is used. Matching is case-insensitive.

    Returns the username, or None when neither form is present.
    """
    # The escapes matter: ``\.`` is a literal dot and ``\s`` is whitespace.
    # Without the backslash, ``[s:@]`` only accepts the letter "s", so
    # inputs such as "Twitter foo123" can never match.
    url_pattern = re.compile(r'twitter\.com/([a-z0-9_.-]+)', re.IGNORECASE)
    label_pattern = re.compile(r'twitter[\s:@]+([a-z0-9_.-]+)', re.IGNORECASE)
    # The URL form is tried first, so it wins even when a "Twitter:" label
    # appears earlier in the string.
    match = url_pattern.search(string) or label_pattern.search(string)
    return match.group(1) if match else None
Examples
# Sample inputs from the question. get_username() returns the captured
# name or None; these bare calls discard the result.
get_username("Twitter: https://twitter.com/foo123")
get_username("Twitter: twitter.com/foo123")
get_username("https://twitter.com/foo123")
get_username("https://twitter.com/foo123?blah")
get_username("Twitter foo123")
get_username("Twitter @foo123")
get_username("Twitter: foo123")
get_username("Twitter: foo123 | youtube: ...")
I’m wondering if my two regexes can be simplified into one. My best attempt was
# Combined single-regex attempt. The regex engine reports the leftmost match,
# so on "Twitter: https://twitter.com/foo123" the label alternative captures
# "https" before the URL form further right is ever tried.
pattern = re.compile(r'twitter(?:(?:\.com/)|(?:[\s:@]+))([a-z0-9_.-]+)', re.IGNORECASE)
but this fails on the first example because Twitter: https
matches before twitter.com/foo123
.
If it always ends with the username, just use (\w+)$
def get_username(string):
    """Return the run of word characters at the end of ``string``.

    Only useful when the username is always the last thing in the text.
    Returns None when the string does not end in a word character
    (for example when it ends in punctuation, or is empty).
    """
    # ``\w`` (word character) needs its backslash; a bare ``w`` would only
    # match the literal letter "w".
    if match1 := re.search(r'(\w+)$', string):
        return match1.group(1)
    return None
I’d try a negative lookahead of (?!https?://)
to exclude all usernames which appear to start with http://
or https://
.
twitter(?:(?:\.com/)|(?:[\s:@]+))(?!https?://)([a-z0-9_.-]+)
Add greedy quantifier .*
to the following regex pattern '.*twitter(?:(?:\.com/)|(?::?\s+@?))([a-z0-9_.-]+)'
to skip previous (optional) twitter
keywords and catch the last one:
def get_username(string):
    """Return the username attached to the last usable "twitter" keyword.

    The leading greedy ``.*`` makes the engine prefer the right-most
    "twitter" that is followed either by ``.com/`` or by an optional ``:``,
    whitespace and an optional ``@``. Matching is case-insensitive.

    Prints the username when one is found, and returns it; returns None
    (printing nothing) otherwise.
    """
    # ``\.`` is a literal dot and ``\s`` is whitespace; without the
    # backslashes the label form "Twitter foo123" cannot match.
    pat = re.compile(r'.*twitter(?:(?:\.com/)|(?::?\s+@?))([a-z0-9_.-]+)', re.IGNORECASE)
    if (match := pat.search(string)):
        print(match.group(1))
        return match.group(1)
    return None


get_username("Twitter: https://twitter.com/foo123")
get_username("Twitter: twitter.com/foo123")
get_username("https://twitter.com/foo123")
get_username("https://twitter.com/foo123?blah")
get_username("Twitter foo123")
get_username("Twitter @foo123")
get_username("Twitter: foo123")
get_username("Twitter: foo123 | youtube: ...")
# Misspelled keyword: no match, so nothing is printed for this call.
get_username("Twitt11er: foo123 | youtube: ...")
foo123
foo123
foo123
foo123
foo123
foo123
foo123
foo123
If there can be multiple matches, you can use a negative lookahead to rule out twitter or http://
or https://
to the right, and get the capture group 1 value.
\btwitter(?:\.com/|(?!:?\s*(?:https?://|twitter\b)):?\s+@?)([\w.-]+)
Explanation
\btwitter
Match the word twitter
(?:
Non capture group for the alternatives
\.com/
Match .com/
|
Or
(?!:?\s*(?:https?://|twitter\b))
Negative lookahead, assert not http:// or https:// or the word twitter preceded by an optional :
and whitespace chars directly to the right of the current position
:?\s+@?)
Match an optional :
1+ whitespace chars and optional @
([\w.-]+)
Capture group 1, match 1+ of the listed characters
import re
# Single-pattern solution. After the word "twitter" it accepts either
#   \.com/            the URL form, or
#   :?\s+@?           an optional colon, whitespace and an optional "@",
# and the negative lookahead rejects the second form when what follows is
# a URL scheme or another "twitter" word, so the search moves on to the
# later occurrence instead of capturing "https" or "twitter".
pattern = re.compile(
    r"\btwitter(?:\.com/|:?(?!\s*(?:https?://|twitter\b))\s+@?)([\w.-]+)",
    re.IGNORECASE,
)


def get_username(string):
    """Return the Twitter username found in ``string``, or None.

    Uses the module-level ``pattern``; the username is capture group 1 of
    the first match.
    """
    m = pattern.search(string)
    if m:
        return m.group(1)
    return None


print(get_username("Twitter: https://twitter.com/foo123"))
print(get_username("Twitter: twitter.com/foo123"))
print(get_username("https://twitter.com/foo123"))
print(get_username("https://twitter.com/foo123?blah"))
print(get_username("Twitter foo123"))
print(get_username("Twitter @foo123"))
print(get_username("Twitter: foo123"))
print(get_username("Twitter: foo123 | youtube: ..."))
Output
foo123
foo123
foo123
foo123
foo123
foo123
foo123
foo123