How to retrieve month with optional separator?
Question:
I tried to write a Python script to retrieve the year and month from a string.
The requirements are:
- the year is fixed at 4 digits
- the month is either exactly two digits directly after the year, or one or two digits when preceded by a non-numeric character
"""
This is the "search year and month" module.
>>> search_year_month('202301')
True
>>> search_year_month('2023-1')
True
>>> search_year_month('2023-01')
True
>>> search_year_month('20231')
False
"""
import re

# Group 2 captures an optional single non-digit separator.  The conditional
# (?(2)...|...) allows a one- or two-digit month only when that separator was
# matched; without it the month must be exactly two digits.  A lookbehind
# cannot express this, because the character before an unseparated month is
# always the last digit of the year.
_re = re.compile(
    r"(?P<year>\d{4})"
    r"(\D)?"
    r"(?P<month>(?(2)\d{1,2}|\d{2}))"
)


def search_year_month(v):
    """Return True if *v* contains a 4-digit year followed by month digits.

    The month is accepted as exactly two digits directly after the year, or
    as one or two digits after a single non-digit separator.  The month value
    itself is not range-checked.
    """
    match = _re.search(v)
    return match is not None


if __name__ == "__main__":
    import doctest

    doctest.testmod()
But the cases 2023-1 and 2023-01 fail.
Is there a better way to build the regular expression?
When I tried only the month part, I got the expected result.
"""
This is the "single lookbehind sample" module.
>>> search_lookbehind('01')
True
>>> search_lookbehind('-1')
True
>>> search_lookbehind('-01')
True
>>> search_lookbehind('1')
False
"""
import re

# Either two consecutive digits, or one or two digits that are directly
# preceded by a non-digit character.
_re = re.compile(
    r"(?P<month>\d{2}|(?<=\D)\d{1,2})"
)


def search_lookbehind(v):
    """Return True if *v* contains month digits in one of the accepted forms.

    Accepted forms are two consecutive digits, or one or two digits that
    immediately follow a non-digit character.
    """
    match = _re.search(v)
    return match is not None


if __name__ == "__main__":
    import doctest

    doctest.testmod()
Answers:
option 1: search and replace strategy
import re

test_string = ['202301', '2023-1', '2023-01', '20231']
year_pattern = r"^\d{4}"  # four digits at the start of the string
sep = r"\W+"              # any run of non-word separator characters
for e in test_string:
    # Read the year from the current string rather than a fixed sample.
    year_match = re.match(year_pattern, e)
    if year_match is None:
        # No leading four-digit year: nothing to report for this input.
        continue
    year = year_match.group()
    # Drop only the leading year, then strip separators; the rest is the month.
    month = re.sub(sep, "", e[year_match.end():])
    print(f"year:{year},month:{month}")
option 2: regex (all in one)
# Group 1: four-digit year at the start; then an optional run of non-word
# separators; group 2: the month digits.
re_all = r"(^\d{4})(?:\W+)?(\d+)"
for e in test_string:
    search_results = re.search(re_all, e)
    if search_results is None:
        # re.search returns None when the input does not fit the pattern.
        continue
    print(f"year:{search_results.group(1)},month:{search_results.group(2)}")
same outcome:
year:2023,month:01
year:2023,month:1
year:2023,month:01
year:2023,month:1
You could use a conditional that checks group 2, which holds an optional `-` character.
If group 2 has a value, match 1 or 2 digits for the month; otherwise match exactly 2 digits.
\b(?P<year>\d{4})(-)?(?P<month>(?(2)\d\d?|\d\d))\b
Explanation
- `\b` A word boundary to prevent a partial word match
- `(?P<year>\d{4})` Group `year` matching 4 digits
- `(-)?` Optional capture group 2 matching a `-`
- `(?P<month>` Group `month`
  - `(?` Start a conditional
  - `(2)\d\d?` If we have group 2, match 1 or 2 digits
  - `|` Or
  - `\d\d` Match 2 digits
  - `)` Close the conditional
- `)` Close group `month`
- `\b` A word boundary
See a regex101 demo and a Python demo.
For example
import re

# (-)? is capture group 2.  The conditional (?(2)...|...) permits a
# one-digit month only when that separator was matched; otherwise the month
# must be exactly two digits.  \b keeps the match from starting or ending
# inside a longer run of word characters.
pattern = r"\b(?P<year>\d{4})(-)?(?P<month>(?(2)\d\d?|\d\d))\b"
s = ("202301\n"
     "2023-1\n"
     "2023-01\n"
     "20231")
for m in re.finditer(pattern, s):
    print(m.groupdict())
Output
{'year': '2023', 'month': '01'}
{'year': '2023', 'month': '1'}
{'year': '2023', 'month': '01'}
I tried to write a Python script to retrieve the year and month from a string.
The requirements are:
- the year is fixed at 4 digits
- the month is either exactly two digits directly after the year, or one or two digits when preceded by a non-numeric character
"""
This is the "search year and month" module.
>>> search_year_month('202301')
True
>>> search_year_month('2023-1')
True
>>> search_year_month('2023-01')
True
>>> search_year_month('20231')
False
"""
import re

# Group 2 captures an optional single non-digit separator.  The conditional
# (?(2)...|...) allows a one- or two-digit month only when that separator was
# matched; without it the month must be exactly two digits.  A lookbehind
# cannot express this, because the character before an unseparated month is
# always the last digit of the year.
_re = re.compile(
    r"(?P<year>\d{4})"
    r"(\D)?"
    r"(?P<month>(?(2)\d{1,2}|\d{2}))"
)


def search_year_month(v):
    """Return True if *v* contains a 4-digit year followed by month digits.

    The month is accepted as exactly two digits directly after the year, or
    as one or two digits after a single non-digit separator.  The month value
    itself is not range-checked.
    """
    match = _re.search(v)
    return match is not None


if __name__ == "__main__":
    import doctest

    doctest.testmod()
But the cases 2023-1 and 2023-01 fail.
Is there a better way to build the regular expression?
When I tried only the month part, I got the expected result.
"""
This is the "single lookbehind sample" module.
>>> search_lookbehind('01')
True
>>> search_lookbehind('-1')
True
>>> search_lookbehind('-01')
True
>>> search_lookbehind('1')
False
"""
import re

# Either two consecutive digits, or one or two digits that are directly
# preceded by a non-digit character.
_re = re.compile(
    r"(?P<month>\d{2}|(?<=\D)\d{1,2})"
)


def search_lookbehind(v):
    """Return True if *v* contains month digits in one of the accepted forms.

    Accepted forms are two consecutive digits, or one or two digits that
    immediately follow a non-digit character.
    """
    match = _re.search(v)
    return match is not None


if __name__ == "__main__":
    import doctest

    doctest.testmod()
option 1: search and replace strategy
import re

test_string = ['202301', '2023-1', '2023-01', '20231']
year_pattern = r"^\d{4}"  # four digits at the start of the string
sep = r"\W+"              # any run of non-word separator characters
for e in test_string:
    # Read the year from the current string rather than a fixed sample.
    year_match = re.match(year_pattern, e)
    if year_match is None:
        # No leading four-digit year: nothing to report for this input.
        continue
    year = year_match.group()
    # Drop only the leading year, then strip separators; the rest is the month.
    month = re.sub(sep, "", e[year_match.end():])
    print(f"year:{year},month:{month}")
option 2: regex (all in one)
# Group 1: four-digit year at the start; then an optional run of non-word
# separators; group 2: the month digits.
re_all = r"(^\d{4})(?:\W+)?(\d+)"
for e in test_string:
    search_results = re.search(re_all, e)
    if search_results is None:
        # re.search returns None when the input does not fit the pattern.
        continue
    print(f"year:{search_results.group(1)},month:{search_results.group(2)}")
same outcome:
year:2023,month:01
year:2023,month:1
year:2023,month:01
year:2023,month:1
You could use a conditional that checks group 2, which holds an optional `-` character.
If group 2 has a value, match 1 or 2 digits for the month; otherwise match exactly 2 digits.
\b(?P<year>\d{4})(-)?(?P<month>(?(2)\d\d?|\d\d))\b
Explanation
- `\b` A word boundary to prevent a partial word match
- `(?P<year>\d{4})` Group `year` matching 4 digits
- `(-)?` Optional capture group 2 matching a `-`
- `(?P<month>` Group `month`
  - `(?` Start a conditional
  - `(2)\d\d?` If we have group 2, match 1 or 2 digits
  - `|` Or
  - `\d\d` Match 2 digits
  - `)` Close the conditional
- `)` Close group `month`
- `\b` A word boundary
See a regex101 demo and a Python demo.
For example
import re

# (-)? is capture group 2.  The conditional (?(2)...|...) permits a
# one-digit month only when that separator was matched; otherwise the month
# must be exactly two digits.  \b keeps the match from starting or ending
# inside a longer run of word characters.
pattern = r"\b(?P<year>\d{4})(-)?(?P<month>(?(2)\d\d?|\d\d))\b"
s = ("202301\n"
     "2023-1\n"
     "2023-01\n"
     "20231")
for m in re.finditer(pattern, s):
    print(m.groupdict())
Output
{'year': '2023', 'month': '01'}
{'year': '2023', 'month': '1'}
{'year': '2023', 'month': '01'}