Join split words and punctuation with punctuation in the right place
Question:
So I tried using join()
after splitting a string into words and punctuation but it joins the string with a space in between the word and punctuation.
b = ['Hello', ',', 'who', 'are', 'you', '?']
c = " ".join(b)
But that returns:
c = 'Hello , who are you ?'
and I want:
c = 'Hello, who are you?'
Answers:
How about:
c = " ".join(b).replace(" ,", ",")
You could join on the punctuation first:
def join_punctuation(seq, characters='.,;?!'):
    """Yield the tokens of *seq* with following punctuation glued on.

    A token that is a member of *characters* is appended to the token
    before it instead of being yielded on its own, so joining the result
    with spaces leaves no space in front of punctuation.

    Args:
        seq: Iterable of string tokens (words and punctuation marks).
        characters: Iterable of punctuation tokens to attach to the
            preceding token; a string is treated as a set of single
            characters.

    Yields:
        The merged tokens, in their original order. Nothing is yielded
        when *seq* is empty.
    """
    characters = set(characters)
    seq = iter(seq)
    # A bare next() on an empty iterator raises StopIteration, which a
    # generator body converts into RuntimeError (PEP 479); end cleanly.
    try:
        current = next(seq)
    except StopIteration:
        return
    for nxt in seq:
        if nxt in characters:
            # Punctuation: attach to the token being accumulated.
            current += nxt
        else:
            yield current
            current = nxt
    # Flush the last accumulated token.
    yield current
c = ' '.join(join_punctuation(b))
The join_punctuation
generator yields strings with any following punctuation already joined on:
>>> b = ['Hello', ',', 'who', 'are', 'you', '?']
>>> list(join_punctuation(b))
['Hello,', 'who', 'are', 'you?']
>>> ' '.join(join_punctuation(b))
'Hello, who are you?'
Apply this to the joined result; it is not a complete solution, but it works:
c = re.sub(r' ([^A-Za-z0-9])', r'\1', c)
Output:
c = 'Hello , who are you ?'
>>> c = re.sub(r' ([^A-Za-z0-9])', r'\1', c)
>>> c
'Hello, who are you?'
>>>
Maybe something like:
>>> from string import punctuation
>>> punc = set(punctuation) # or whatever special chars you want
>>> b = ['Hello', ',', 'who', 'are', 'you', '?']
>>> ''.join(w if set(w) <= punc else ' '+w for w in b).lstrip()
'Hello, who are you?'
This adds a space before words in b
which aren’t made up entirely of punctuation.
Based on the answer of Martijn Pieters♦, I made a little generalization for languages with punctuation signs that can also be at the start of a word.
from string import punctuation
def join_punctuation(
    seq,
    characters_after=punctuation,
    characters_before="¡¿",
):
    """Yield the tokens of *seq* with surrounding punctuation glued on.

    Tokens that are members of *characters_after* are appended to the
    token before them; tokens that are members of *characters_before*
    (opening marks such as ``¿``) are prepended to the token after them.
    Joining the result with spaces then puts the punctuation in place.

    Args:
        seq: Iterable of string tokens (words and punctuation marks).
        characters_after: Iterable of tokens that attach to the preceding
            token; a string is treated as a set of single characters.
        characters_before: Iterable of tokens that attach to the following
            token; a string is treated as a set of single characters.

    Yields:
        The merged tokens, in their original order. Nothing is yielded
        when *seq* is empty.
    """
    characters_after = set(characters_after)
    characters_before = set(characters_before)
    seq = iter(seq)
    # A bare next() on an empty iterator raises StopIteration, which a
    # generator body converts into RuntimeError (PEP 479); end cleanly.
    try:
        current = next(seq)
    except StopIteration:
        return
    # True while `current` is made up only of opening marks, so a run
    # such as "¡", "¿" keeps absorbing tokens until a word arrives.
    opening = current in characters_before
    for nxt in seq:
        if opening:
            current += nxt
            opening = nxt in characters_before
        elif nxt in characters_after:
            current += nxt
        else:
            yield current
            current = nxt
            opening = nxt in characters_before
    # Flush the last accumulated token.
    yield current
It would work the same way:
>>> b = ["Hola", ",", "¿", "Qué", "tal", "?"]
>>> list(join_punctuation(b))
['Hola,', '¿Qué', 'tal?']
>>> " ".join(join_punctuation(b))
'Hola, ¿Qué tal?'
So I tried using join()
after splitting a string into words and punctuation but it joins the string with a space in between the word and punctuation.
b = ['Hello', ',', 'who', 'are', 'you', '?']
c = " ".join(b)
But that returns:
c = 'Hello , who are you ?'
and I want:
c = 'Hello, who are you?'
How about:
c = " ".join(b).replace(" ,", ",")
You could join on the punctuation first:
def join_punctuation(seq, characters='.,;?!'):
    """Yield the tokens of *seq* with following punctuation glued on.

    A token that is a member of *characters* is appended to the token
    before it instead of being yielded on its own, so joining the result
    with spaces leaves no space in front of punctuation.

    Args:
        seq: Iterable of string tokens (words and punctuation marks).
        characters: Iterable of punctuation tokens to attach to the
            preceding token; a string is treated as a set of single
            characters.

    Yields:
        The merged tokens, in their original order. Nothing is yielded
        when *seq* is empty.
    """
    characters = set(characters)
    seq = iter(seq)
    # A bare next() on an empty iterator raises StopIteration, which a
    # generator body converts into RuntimeError (PEP 479); end cleanly.
    try:
        current = next(seq)
    except StopIteration:
        return
    for nxt in seq:
        if nxt in characters:
            # Punctuation: attach to the token being accumulated.
            current += nxt
        else:
            yield current
            current = nxt
    # Flush the last accumulated token.
    yield current
c = ' '.join(join_punctuation(b))
The join_punctuation
generator yields strings with any following punctuation already joined on:
>>> b = ['Hello', ',', 'who', 'are', 'you', '?']
>>> list(join_punctuation(b))
['Hello,', 'who', 'are', 'you?']
>>> ' '.join(join_punctuation(b))
'Hello, who are you?'
Apply this to the joined result; it is not a complete solution, but it works:
c = re.sub(r' ([^A-Za-z0-9])', r'\1', c)
Output:
c = 'Hello , who are you ?'
>>> c = re.sub(r' ([^A-Za-z0-9])', r'\1', c)
>>> c
'Hello, who are you?'
>>>
Maybe something like:
>>> from string import punctuation
>>> punc = set(punctuation) # or whatever special chars you want
>>> b = ['Hello', ',', 'who', 'are', 'you', '?']
>>> ''.join(w if set(w) <= punc else ' '+w for w in b).lstrip()
'Hello, who are you?'
This adds a space before words in b
which aren’t made up entirely of punctuation.
Based on the answer of Martijn Pieters♦, I made a little generalization for languages with punctuation signs that can also be at the start of a word.
from string import punctuation
def join_punctuation(
    seq,
    characters_after=punctuation,
    characters_before="¡¿",
):
    """Yield the tokens of *seq* with surrounding punctuation glued on.

    Tokens that are members of *characters_after* are appended to the
    token before them; tokens that are members of *characters_before*
    (opening marks such as ``¿``) are prepended to the token after them.
    Joining the result with spaces then puts the punctuation in place.

    Args:
        seq: Iterable of string tokens (words and punctuation marks).
        characters_after: Iterable of tokens that attach to the preceding
            token; a string is treated as a set of single characters.
        characters_before: Iterable of tokens that attach to the following
            token; a string is treated as a set of single characters.

    Yields:
        The merged tokens, in their original order. Nothing is yielded
        when *seq* is empty.
    """
    characters_after = set(characters_after)
    characters_before = set(characters_before)
    seq = iter(seq)
    # A bare next() on an empty iterator raises StopIteration, which a
    # generator body converts into RuntimeError (PEP 479); end cleanly.
    try:
        current = next(seq)
    except StopIteration:
        return
    # True while `current` is made up only of opening marks, so a run
    # such as "¡", "¿" keeps absorbing tokens until a word arrives.
    opening = current in characters_before
    for nxt in seq:
        if opening:
            current += nxt
            opening = nxt in characters_before
        elif nxt in characters_after:
            current += nxt
        else:
            yield current
            current = nxt
            opening = nxt in characters_before
    # Flush the last accumulated token.
    yield current
It would work the same way:
>>> b = ["Hola", ",", "¿", "Qué", "tal", "?"]
>>> list(join_punctuation(b))
['Hola,', '¿Qué', 'tal?']
>>> " ".join(join_punctuation(b))
'Hola, ¿Qué tal?'