Truncating string to byte length in Python
Question:
I have a function here to truncate a given string to a given byte length:
# Scanned in order by codepoint_length(). Each entry is
# (mask tested against the first byte, total length of the sequence in bytes).
LENGTH_BY_PREFIX = [
    (0xC0, 2),
    (0xE0, 3),
    (0xF0, 4),
    (0xF8, 5),
    (0xFC, 6),
]
# Return the total number of bytes in the UTF-8 sequence whose first byte has
# the integer value first_byte: 1 for values below 128, otherwise the length
# paired with the first LENGTH_BY_PREFIX mask whose bits are all set in the
# byte. Fails an assertion when no mask matches.
def codepoint_length(first_byte):
    if first_byte < 0x80:
        return 1  # ASCII
    for prefix_mask, sequence_length in LENGTH_BY_PREFIX:
        if (first_byte & prefix_mask) == prefix_mask:
            return sequence_length
    assert False, 'Invalid byte %r' % first_byte
# Encode unicode_text as UTF-8 and return the longest prefix of those bytes
# that is at most byte_limit bytes long and ends on a sequence boundary as
# reported by codepoint_length().
def cut_string_to_bytes_length(unicode_text, byte_limit):
    encoded = unicode_text.encode('UTF-8')
    offset = 0
    while offset < len(encoded):
        width = codepoint_length(ord(encoded[offset]))
        if offset + width > byte_limit:
            # the next sequence would cross the limit, so cut in front of it
            return encoded[:offset]
        else:
            offset += width
    # the limit is at least as long as the byte string, so nothing is cut
    return encoded
This seemed to work fine until the question of Emoji was introduced:
string = u"\ud83d\ude14"
trunc = cut_string_to_bytes_length(string, 100)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "<console>", line 5, in cut_string_to_bytes_length
File "<console>", line 7, in codepoint_length
AssertionError: Invalid byte 152
Can anyone explain exactly what is going on here, and what a possible solution is?
Edit: I have another code snippet here that doesn’t throw an exception, but has weird behavior sometimes:
import encodings
_incr_encoder = encodings.search_function('utf8').incrementalencoder()
def utf8_byte_truncate(text, max_bytes):
    """Truncate text so that its UTF-8 encoding is no more than max_bytes long.

    Each character is fed to the module-level incremental encoder and the
    sizes of the encoded pieces are summed; the text is cut just before the
    first character that pushes the running total past max_bytes.
    """
    _incr_encoder.reset()
    used = 0
    for position, char in enumerate(text):
        used += len(_incr_encoder.encode(char))
        if used > max_bytes:
            # This character does not fit: keep everything in front of it.
            return text[:position]
    # Every character fitted within the limit.
    return text
>>> string = u"\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14"
>>> print string
(prints a set of 5 Apple Emoji...)
>>> len(string)
10
>>> trunc = utf8_byte_truncate(string, 4)
>>> print trunc
???
>>> len(trunc)
1
So with this second example, I have a string of 10 bytes, truncate it to 4, but something weird happens, and the result is a string of size 1 byte.
Answers:
If a number f is such that f & 0xF0 == 0xF0, then it is also the case that f & 0xC0 == 0xC0, because 0xF0 has all the bits that 0xC0 has, and then some. That is, among other problems, your codepoint_length() function will return a step of 2 when it should be 4 (the first matching mask in the list wins). If you reverse your LENGTH_BY_PREFIX list, the function works OK with the first example.
# Most specific mask first, so the longest matching prefix is found first.
# Each entry is (first byte mask, total codepoint length).
LENGTH_BY_PREFIX = [
    (0xFC, 6),
    (0xF8, 5),
    (0xF0, 4),
    (0xE0, 3),
    (0xC0, 2),
]
The algorithm is wrong as @jwpat7 indicated. A simpler algorithm is the following, but note that some perceived single characters (called graphemes) are made up of more than one Unicode code point, such as a base letter followed by a combining accent (for example "e" + U+0301). This doesn’t attempt to maintain graphemes.
# NOTE: This is Python 2 to match OP's code
# s = u'\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14'
# Same as above
s = u'\U0001f614' * 5 # Unicode character U+1F614
def utf8_lead_byte(b):
    '''Return True unless b is a UTF-8 continuation byte (bits 10xxxxxx).

    b may be a length-1 byte string (what indexing a Python 2 str yields)
    or an int (what indexing a Python 3 bytes object yields).
    '''
    if not isinstance(b, int):
        b = ord(b)
    return (b & 0xC0) != 0x80


def utf8_byte_truncate(text, max_bytes):
    '''Encode text as UTF-8 and return at most max_bytes bytes of it.

    If the byte at index max_bytes is not a lead byte, the cut would fall
    inside a multi-byte sequence, so back up until a lead byte is found and
    truncate before that character. The result is a byte string; a
    max_bytes of zero or less yields an empty one.
    '''
    utf8 = text.encode('utf8')
    if len(utf8) <= max_bytes:
        return utf8
    # Clamp at zero: a negative index would make the final slice count from
    # the end and return almost the whole string.
    i = max(max_bytes, 0)
    while i > 0 and not utf8_lead_byte(utf8[i]):
        i -= 1
    return utf8[:i]
# test for various max_bytes:
for m in range(len(s.encode('utf8'))+1):
b = utf8_byte_truncate(s,m)
print m,len(b),b.decode('utf8')
###Output
0 0
1 0
2 0
3 0
4 4
5 4
6 4
7 4
8 8
9 8
10 8
11 8
12 12
13 12
14 12
15 12
16 16
17 16
18 16
19 16
20 20
Version of Mark’s code for Python 3:
# s = u'\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14'
# NOTE: in Python 3 the surrogate-pair spelling above is NOT the same string:
# it creates lone surrogates that cannot be encoded as UTF-8, so the code
# point is written directly instead.
s = u'\U0001f614' * 5 # Unicode character U+1F614
def utf8_lead_byte(b: int) -> bool:
    '''Return True unless b is a UTF-8 continuation byte (bits 10xxxxxx).

    b is an integer byte value, as produced by indexing a bytes object.
    '''
    return (b & 0xC0) != 0x80


def utf8_byte_truncate(text: str, max_bytes: int) -> bytes:
    '''Encode text as UTF-8 and return at most max_bytes bytes of it.

    If the byte at index max_bytes is not a lead byte, the cut would fall
    inside a multi-byte sequence, so back up until a lead byte is found and
    truncate before that character. A max_bytes of zero or less yields
    empty bytes.
    '''
    utf8 = text.encode('utf8')
    if len(utf8) <= max_bytes:
        return utf8
    # Clamp at zero: a negative index would make the final slice count from
    # the end and return almost the whole string.
    i = max(max_bytes, 0)
    while i > 0 and not utf8_lead_byte(utf8[i]):
        i -= 1
    return utf8[:i]
# Try every byte limit from 0 through the full encoded length of s and print
# the limit, the truncated length and the decoded result.
for m in range(len(s.encode('utf8')) + 1):
    b = utf8_byte_truncate(s, m)
    print(m, len(b), b.decode('utf8'))
EDIT: this is the original code by Mark Tolonen adapted for python3. The previous code was wrong. Thanks for the comments!
(python3) and a lot simpler than previous answers:
# Keep at most max_length_byte bytes of the UTF-8 encoding, then decode while
# ignoring a trailing sequence that the cut left incomplete; the number of
# characters that survive is the index at which to cut the original text.
cut_index = len(
    unicode_text
    .encode('utf8', errors="replace")[:max_length_byte]
    .decode('utf8', errors="ignore")
)
unicode_text[:cut_index]
The idea is to cut the encoded string at the byte length, then decode with ignoring errors (possibly removing a character that was broken by cutting).
I have a function here to truncate a given string to a given byte length:
# Scanned in order by codepoint_length(). Each entry is
# (mask tested against the first byte, total length of the sequence in bytes).
LENGTH_BY_PREFIX = [
    (0xC0, 2),
    (0xE0, 3),
    (0xF0, 4),
    (0xF8, 5),
    (0xFC, 6),
]
# Return the total number of bytes in the UTF-8 sequence whose first byte has
# the integer value first_byte: 1 for values below 128, otherwise the length
# paired with the first LENGTH_BY_PREFIX mask whose bits are all set in the
# byte. Fails an assertion when no mask matches.
def codepoint_length(first_byte):
    if first_byte < 0x80:
        return 1  # ASCII
    for prefix_mask, sequence_length in LENGTH_BY_PREFIX:
        if (first_byte & prefix_mask) == prefix_mask:
            return sequence_length
    assert False, 'Invalid byte %r' % first_byte
# Encode unicode_text as UTF-8 and return the longest prefix of those bytes
# that is at most byte_limit bytes long and ends on a sequence boundary as
# reported by codepoint_length().
def cut_string_to_bytes_length(unicode_text, byte_limit):
    encoded = unicode_text.encode('UTF-8')
    offset = 0
    while offset < len(encoded):
        width = codepoint_length(ord(encoded[offset]))
        if offset + width > byte_limit:
            # the next sequence would cross the limit, so cut in front of it
            return encoded[:offset]
        else:
            offset += width
    # the limit is at least as long as the byte string, so nothing is cut
    return encoded
This seemed to work fine until the question of Emoji was introduced:
string = u"\ud83d\ude14"
trunc = cut_string_to_bytes_length(string, 100)
Traceback (most recent call last):
File "<console>", line 1, in <module>
File "<console>", line 5, in cut_string_to_bytes_length
File "<console>", line 7, in codepoint_length
AssertionError: Invalid byte 152
Can anyone explain exactly what is going on here, and what a possible solution is?
Edit: I have another code snippet here that doesn’t throw an exception, but has weird behavior sometimes:
import encodings
_incr_encoder = encodings.search_function('utf8').incrementalencoder()
def utf8_byte_truncate(text, max_bytes):
    """Truncate text so that its UTF-8 encoding is no more than max_bytes long.

    Each character is fed to the module-level incremental encoder and the
    sizes of the encoded pieces are summed; the text is cut just before the
    first character that pushes the running total past max_bytes.
    """
    _incr_encoder.reset()
    used = 0
    for position, char in enumerate(text):
        used += len(_incr_encoder.encode(char))
        if used > max_bytes:
            # This character does not fit: keep everything in front of it.
            return text[:position]
    # Every character fitted within the limit.
    return text
>>> string = u"\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14"
>>> print string
(prints a set of 5 Apple Emoji...)
>>> len(string)
10
>>> trunc = utf8_byte_truncate(string, 4)
>>> print trunc
???
>>> len(trunc)
1
So with this second example, I have a string of 10 bytes, truncate it to 4, but something weird happens, and the result is a string of size 1 byte.
If a number f is such that f & 0xF0 == 0xF0, then it is also the case that f & 0xC0 == 0xC0, because 0xF0 has all the bits that 0xC0 has, and then some. That is, among other problems, your codepoint_length() function will return a step of 2 when it should be 4 (the first matching mask in the list wins). If you reverse your LENGTH_BY_PREFIX list, the function works OK with the first example.
# Most specific mask first, so the longest matching prefix is found first.
# Each entry is (first byte mask, total codepoint length).
LENGTH_BY_PREFIX = [
    (0xFC, 6),
    (0xF8, 5),
    (0xF0, 4),
    (0xE0, 3),
    (0xC0, 2),
]
The algorithm is wrong as @jwpat7 indicated. A simpler algorithm is the following, but note that some perceived single characters (called graphemes) are made up of more than one Unicode code point, such as a base letter followed by a combining accent (for example "e" + U+0301). This doesn’t attempt to maintain graphemes.
# NOTE: This is Python 2 to match OP's code
# s = u'\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14'
# Same as above
s = u'\U0001f614' * 5 # Unicode character U+1F614
def utf8_lead_byte(b):
    '''Return True unless b is a UTF-8 continuation byte (bits 10xxxxxx).

    b may be a length-1 byte string (what indexing a Python 2 str yields)
    or an int (what indexing a Python 3 bytes object yields).
    '''
    if not isinstance(b, int):
        b = ord(b)
    return (b & 0xC0) != 0x80


def utf8_byte_truncate(text, max_bytes):
    '''Encode text as UTF-8 and return at most max_bytes bytes of it.

    If the byte at index max_bytes is not a lead byte, the cut would fall
    inside a multi-byte sequence, so back up until a lead byte is found and
    truncate before that character. The result is a byte string; a
    max_bytes of zero or less yields an empty one.
    '''
    utf8 = text.encode('utf8')
    if len(utf8) <= max_bytes:
        return utf8
    # Clamp at zero: a negative index would make the final slice count from
    # the end and return almost the whole string.
    i = max(max_bytes, 0)
    while i > 0 and not utf8_lead_byte(utf8[i]):
        i -= 1
    return utf8[:i]
# test for various max_bytes:
for m in range(len(s.encode('utf8'))+1):
b = utf8_byte_truncate(s,m)
print m,len(b),b.decode('utf8')
###Output
0 0
1 0
2 0
3 0
4 4
5 4
6 4
7 4
8 8
9 8
10 8
11 8
12 12
13 12
14 12
15 12
16 16
17 16
18 16
19 16
20 20
Version of Mark’s code for Python 3:
# s = u'\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14\ud83d\ude14'
# NOTE: in Python 3 the surrogate-pair spelling above is NOT the same string:
# it creates lone surrogates that cannot be encoded as UTF-8, so the code
# point is written directly instead.
s = u'\U0001f614' * 5 # Unicode character U+1F614
def utf8_lead_byte(b: int) -> bool:
    '''Return True unless b is a UTF-8 continuation byte (bits 10xxxxxx).

    b is an integer byte value, as produced by indexing a bytes object.
    '''
    return (b & 0xC0) != 0x80


def utf8_byte_truncate(text: str, max_bytes: int) -> bytes:
    '''Encode text as UTF-8 and return at most max_bytes bytes of it.

    If the byte at index max_bytes is not a lead byte, the cut would fall
    inside a multi-byte sequence, so back up until a lead byte is found and
    truncate before that character. A max_bytes of zero or less yields
    empty bytes.
    '''
    utf8 = text.encode('utf8')
    if len(utf8) <= max_bytes:
        return utf8
    # Clamp at zero: a negative index would make the final slice count from
    # the end and return almost the whole string.
    i = max(max_bytes, 0)
    while i > 0 and not utf8_lead_byte(utf8[i]):
        i -= 1
    return utf8[:i]
# Try every byte limit from 0 through the full encoded length of s and print
# the limit, the truncated length and the decoded result.
for m in range(len(s.encode('utf8')) + 1):
    b = utf8_byte_truncate(s, m)
    print(m, len(b), b.decode('utf8'))
EDIT: this is the original code by Mark Tolonen adapted for python3. The previous code was wrong. Thanks for the comments!
(python3) and a lot simpler than previous answers:
# Keep at most max_length_byte bytes of the UTF-8 encoding, then decode while
# ignoring a trailing sequence that the cut left incomplete; the number of
# characters that survive is the index at which to cut the original text.
cut_index = len(
    unicode_text
    .encode('utf8', errors="replace")[:max_length_byte]
    .decode('utf8', errors="ignore")
)
unicode_text[:cut_index]
The idea is to cut the encoded string at the byte length, then decode with ignoring errors (possibly removing a character that was broken by cutting).