How to create spaces between characters only for acronyms in Python
Question:
I am trying to add spaces between characters only for acronyms (all consecutive all-caps words) in Python.
INPUT:
"The PNUD, UN, UCALP and USA and U N."
DESIRED OUTPUT:
"The P N U D, U N, U C A L P and U S A and U N."
I have this solution so far, but I am looking for something more efficient/elegant:
import re
data = "The PNUD, UN, UCALP and USA and U N."
# Step 1: insert a space at every position (other than the start of the
# string) whose next character is not a lowercase letter or "[", plus any
# position followed by whitespace or a non-word character. This spaces out
# the capitals but also pads the existing spaces and punctuation.
result = re.sub(r'(?=(?!^)[^[a-z]|\s+|\W]*)', ' ', data)
# Step 2: drop any whitespace run that directly precedes a non-word character
# (which may itself be a space), keeping only that character. This removes
# the surplus padding introduced by step 1.
result = re.sub(r'\s+(\W)', r'\g<1>', result)
print(result)
Answers:
I think the following regex is a much simpler solution to this problem:
# Raw strings are required: in a normal string literal '\1' is the control
# character \x01, not a backreference to group 1.
re.sub(r'([A-Z])(?=[A-Z])', r'\1 ', s)
I’m just using a positive lookahead and a backreference.
Another solution: re.sub
with a lambda function:
import re
data = "The PNUD, UN, UCALP and USA and U N."
# \b...\b restricts the match to whole words made of two or more capitals;
# the lambda rejoins the matched letters with single spaces.
result = re.sub(r"\b[A-Z]{2,}\b", lambda g: " ".join(g.group(0)), data)
print(result)
Prints:
The P N U D, U N, U C A L P and U S A and U N.
EDIT: Small benchmark
import re
import regex
from timeit import timeit
# Candidate patterns, compiled once so the benchmark measures substitution
# only. \b anchors pat1/pat4 to whole words of two or more capitals.
pat1 = re.compile(r"\b[A-Z]{2,}\b")
pat2 = re.compile(r"([A-Z])(?=[A-Z])")
pat3 = re.compile(r"[A-Z](?=[A-Z])")  # the same without capturing group
# using regex module instead of re
pat4 = regex.compile(r"\b[[:upper:]]{2,}\b")
data = "The PNUD, UN, UCALP and USA and U N."


def fn1():
    """Join the letters of each all-caps word with spaces (re + lambda)."""
    return pat1.sub(lambda g: " ".join(g.group(0)), data)


def fn2():
    """Append a space to each capital followed by a capital (group 1)."""
    return pat2.sub(r"\g<1> ", data)


def fn3():
    """Same as fn2, but referencing the whole match instead of a group."""
    return pat3.sub(r"\g<0> ", data)


def fn4():
    """Same as fn1, using the third-party regex module's [[:upper:]] class."""
    return pat4.sub(lambda g: " ".join(g.group(0)), data)


# Time 10,000 calls of each variant.
t1 = timeit(fn1, number=10_000)
t2 = timeit(fn2, number=10_000)
t3 = timeit(fn3, number=10_000)
t4 = timeit(fn4, number=10_000)
print(t1)
print(t2)
print(t3)
print(t4)
Prints:
0.03805400081910193
0.10581987909972668
0.10386284696869552
0.044628452975302935
You can use a single call to re.sub and match a single uppercase char and assert another one to the right.
In the replacement, use the whole match followed by a space, written as \g<0>
[A-Z](?=[A-Z])
Example
# \g<0> refers to the whole match; without the backslash the replacement
# would be the literal text "g<0> ".
result = re.sub(r'[A-Z](?=[A-Z])', r'\g<0> ', data)
Using a lambda with the regex module instead of re is slightly slower, but it also lets you match uppercase Unicode characters from non-English languages, which is, ceteris paribus, a more general answer.
import re
import regex
from timeit import timeit
# Patterns compiled once up front. \b anchors pat1/pat3 to whole words made
# only of capitals.
pat1 = re.compile(r"\b[A-Z]+\b")
pat2 = re.compile(r"([A-Z])(?=[A-Z])")
pat3 = regex.compile(r"\b[[:upper:]]+\b")
data = "The PNUD, UN, UCALP and USA and U N."


def fn1():
    """Join the letters of each all-caps word with spaces (re + lambda)."""
    return pat1.sub(lambda g: " ".join(g.group(0)), data)


def fn2():
    """Append a space to each capital that is followed by another capital."""
    return pat2.sub(r"\g<1> ", data)


def fn3():
    """Same as fn1, using the third-party regex module's [[:upper:]] class."""
    return pat3.sub(lambda g: " ".join(g.group(0)), data)


# Time 10,000 calls of each variant, then show the outputs and the timings.
t1 = timeit(fn1, number=10_000)
t2 = timeit(fn2, number=10_000)
t3 = timeit(fn3, number=10_000)
print(fn1())
print(fn2())
print(fn3())
print(t1)
print(t2)
print(t3)
I am trying to add spaces between characters only for acronyms (all consecutive all-caps words) in Python.
INPUT:
"The PNUD, UN, UCALP and USA and U N."
DESIRED OUTPUT:
"The P N U D, U N, U C A L P and U S A and U N."
I have this solution so far, but I am looking for something more efficient/elegant:
import re
data = "The PNUD, UN, UCALP and USA and U N."
# Step 1: insert a space at every position (other than the start of the
# string) whose next character is not a lowercase letter or "[", plus any
# position followed by whitespace or a non-word character. This spaces out
# the capitals but also pads the existing spaces and punctuation.
result = re.sub(r'(?=(?!^)[^[a-z]|\s+|\W]*)', ' ', data)
# Step 2: drop any whitespace run that directly precedes a non-word character
# (which may itself be a space), keeping only that character. This removes
# the surplus padding introduced by step 1.
result = re.sub(r'\s+(\W)', r'\g<1>', result)
print(result)
I think the following regex is a much simpler solution to this problem:
# Raw strings are required: in a normal string literal '\1' is the control
# character \x01, not a backreference to group 1.
re.sub(r'([A-Z])(?=[A-Z])', r'\1 ', s)
I’m just using a positive lookahead and a backreference.
Another solution: re.sub
with a lambda function:
import re
data = "The PNUD, UN, UCALP and USA and U N."
# \b...\b restricts the match to whole words made of two or more capitals;
# the lambda rejoins the matched letters with single spaces.
result = re.sub(r"\b[A-Z]{2,}\b", lambda g: " ".join(g.group(0)), data)
print(result)
Prints:
The P N U D, U N, U C A L P and U S A and U N.
EDIT: Small benchmark
import re
import regex
from timeit import timeit
# Candidate patterns, compiled once so the benchmark measures substitution
# only. \b anchors pat1/pat4 to whole words of two or more capitals.
pat1 = re.compile(r"\b[A-Z]{2,}\b")
pat2 = re.compile(r"([A-Z])(?=[A-Z])")
pat3 = re.compile(r"[A-Z](?=[A-Z])")  # the same without capturing group
# using regex module instead of re
pat4 = regex.compile(r"\b[[:upper:]]{2,}\b")
data = "The PNUD, UN, UCALP and USA and U N."


def fn1():
    """Join the letters of each all-caps word with spaces (re + lambda)."""
    return pat1.sub(lambda g: " ".join(g.group(0)), data)


def fn2():
    """Append a space to each capital followed by a capital (group 1)."""
    return pat2.sub(r"\g<1> ", data)


def fn3():
    """Same as fn2, but referencing the whole match instead of a group."""
    return pat3.sub(r"\g<0> ", data)


def fn4():
    """Same as fn1, using the third-party regex module's [[:upper:]] class."""
    return pat4.sub(lambda g: " ".join(g.group(0)), data)


# Time 10,000 calls of each variant.
t1 = timeit(fn1, number=10_000)
t2 = timeit(fn2, number=10_000)
t3 = timeit(fn3, number=10_000)
t4 = timeit(fn4, number=10_000)
print(t1)
print(t2)
print(t3)
print(t4)
Prints:
0.03805400081910193
0.10581987909972668
0.10386284696869552
0.044628452975302935
You can use a single call to re.sub and match a single uppercase char and assert another one to the right.
In the replacement, use the whole match followed by a space, written as \g<0>
[A-Z](?=[A-Z])
Example
# \g<0> refers to the whole match; without the backslash the replacement
# would be the literal text "g<0> ".
result = re.sub(r'[A-Z](?=[A-Z])', r'\g<0> ', data)
Using a lambda with the regex module instead of re is slightly slower, but it also lets you match uppercase Unicode characters from non-English languages, which is, ceteris paribus, a more general answer.
import re
import regex
from timeit import timeit
# Patterns compiled once up front. \b anchors pat1/pat3 to whole words made
# only of capitals.
pat1 = re.compile(r"\b[A-Z]+\b")
pat2 = re.compile(r"([A-Z])(?=[A-Z])")
pat3 = regex.compile(r"\b[[:upper:]]+\b")
data = "The PNUD, UN, UCALP and USA and U N."


def fn1():
    """Join the letters of each all-caps word with spaces (re + lambda)."""
    return pat1.sub(lambda g: " ".join(g.group(0)), data)


def fn2():
    """Append a space to each capital that is followed by another capital."""
    return pat2.sub(r"\g<1> ", data)


def fn3():
    """Same as fn1, using the third-party regex module's [[:upper:]] class."""
    return pat3.sub(lambda g: " ".join(g.group(0)), data)


# Time 10,000 calls of each variant, then show the outputs and the timings.
t1 = timeit(fn1, number=10_000)
t2 = timeit(fn2, number=10_000)
t3 = timeit(fn3, number=10_000)
print(fn1())
print(fn2())
print(fn3())
print(t1)
print(t2)
print(t3)