Convert Chinese numeric characters to numbers in Python
Question:
I am writing a Python function to convert numbers represented in a mix of Chinese and Arabic numbers to a numerical value.
import re
units = ['', '十', '百', '千', '萬', '十萬', '百萬', '千萬', '億', '十億', '百億', '千億']
# The list index is the power of ten: '' -> 10**0, '十' -> 10**1,
# '百' -> 10**2, '千' -> 10**3, '萬' -> 10**4, and so on.


def chinese_numeral_replace(input_text):
    """Return the replacement text for one unit matched by ``re.sub``.

    Args:
        input_text: The ``re.Match`` object handed to the callback by
            ``re.sub``.

    Returns:
        ``'*<10**idx>'`` when the matched text is an entry of ``units``
        (``idx`` being its position in the list); otherwise the matched
        text unchanged.
    """
    matched = input_text.group(0)
    if matched in units:
        idx = units.index(matched)
        return '*' + str(pow(10, idx))
    # re.sub requires the callback to return a string, not the Match object.
    return matched


test2 = '5萬'  # means 50 thousands or 50000
# NOTE: alternatives are tried left to right, so a two-character unit such as
# '千萬' is matched as '千' followed by '萬' with this pattern.
result = re.sub('|'.join(units).lstrip('|'), chinese_numeral_replace, test2)
# NOTE(review): eval() executes arbitrary code -- never feed it untrusted input.
print(eval(result))
which prints 50000. The intermediate value of result is the string '5*10000', which eval() then evaluates to 50000.
The above function and code work when the input is simple. However, if the input string is more complex, like ‘5千萬5千’ (which translates to 50,000,000 + 5,000, that is 50,005,000), the above function produces incorrect values.
I know there are 2 bugs:
-
The re.sub() call returns incorrect results. For 千萬 in the complex case, re.sub() produces two separate matches (‘千’ and ‘萬’) instead of a single match for ‘千萬’, which is also in the units list.
-
The chinese_numeral_replace function hard-codes .group(0). How can I make the replace function perform these replacements:
- 千萬 -> 10000000 (which means 10 million in Chinese, which 千 = 1000, 萬 = 10000)
- 千 -> 1000 (which means 1000 in Chinese)
Thanks in advance.
Answers:
Instead of replacing and using eval
, here’s a safer approach: re.sub
takes a function argument, where this function receives the Match
object as its argument, and returns the replacement text. In this function, you can find the index of the "units" character(s) in the units
list, and infer the power of 10 from that index.
import re
units = ['十', '百', '千', '萬', '十萬', '百萬', '千萬', '億', '十億', '百億', '千億']
# units[i] stands for 10**(i + 1): '十' = 10, '百' = 100, '千' = 1000, '萬' = 10000, etc.
# Reversing the list puts every two-character unit ahead of the one-character
# unit it starts with, so e.g. '千萬' is tried before '千'.
number_regex = re.compile(rf"(\d+)({'|'.join(reversed(units))})")
# '(\d+)(千億|百億|十億|億|千萬|百萬|十萬|萬|千|百|十)'


def chinese_numeral_repl(match):
    """Return the decimal string for one ``<digits><unit>`` match.

    Args:
        match: A ``re.Match`` produced by ``number_regex``; group 1 holds
            the Arabic digits and group 2 the Chinese unit.

    Returns:
        The digits multiplied by the unit's power of ten, as a string.
    """
    num, multiplier = match.groups()
    power_of_10 = units.index(multiplier) + 1
    repl_value = int(num) * 10**power_of_10
    # str() keeps full integer precision; a float format spec such as
    # ':.0f' would round values above 2**53.
    return str(repl_value)


print(number_regex.sub(chinese_numeral_repl, '5千萬'))  # 50000000
print(number_regex.sub(chinese_numeral_repl, '5萬'))  # 50000
print(number_regex.sub(chinese_numeral_repl, '5千'))  # 5000
Note the re.compile
call includes '|'.join(reversed(units))
.
Reversing the units
list before joining it prioritizes the larger multipliers, so it looks for '十億'
before looking for '十'
. Keeping this in mind, maybe sorting by len
would be a better strategy.
# Sorting longest-first means a multi-character unit is always tried before
# any shorter unit it begins with. '\d+' matches the leading Arabic digits.
number_regex = re.compile(rf"(\d+)({'|'.join(sorted(units, key=len, reverse=True))})")
# '(\d+)(十萬|百萬|千萬|十億|百億|千億|十|百|千|萬|億)'
For the "complex" version, you could extend this regex to match multiple instances of a number followed by an element of units
.
# One <digits><unit> pair; longer units come first so '千萬' beats '千'.
single_number_regex = rf"(\d+)({'|'.join(sorted(units, key=len, reverse=True))})"
# One or more consecutive pairs, e.g. '5千萬5千'.
full_number_regex = f"(?:{single_number_regex})+"


def single_numeral_eval(match):
    """Return the integer value of one ``<digits><unit>`` match.

    Args:
        match: A ``re.Match`` for ``single_number_regex``.

    Returns:
        The digits multiplied by the unit's power of ten, as an ``int``.
    """
    num, multiplier = match.groups()
    power_of_10 = units.index(multiplier) + 1
    return int(num) * 10**power_of_10


def chinese_numeral_repl(match):
    """Return the decimal string for a run of ``<digits><unit>`` pairs.

    Args:
        match: A ``re.Match`` for ``full_number_regex``.

    Returns:
        The sum of all pairs inside the matched text, as a string.
    """
    # Re-scan only the text this match covers to visit each pair in turn.
    full_number = sum(
        single_numeral_eval(m)
        for m in re.finditer(single_number_regex, match.group(0))
    )
    # str() keeps full integer precision; ':.0f' would go through float.
    return str(full_number)


print(re.sub(full_number_regex, chinese_numeral_repl, '5千萬5千'))  # 50005000
The difference between this approach and the other answer is that this approach will replace strings in-place, so you could do e.g.:
# Every numeral run inside the sentence is replaced where it stands.
s = "5千萬 plus 5千 equals 5千萬5千"
print(re.sub(pattern=full_number_regex, repl=chinese_numeral_repl, string=s))
# 50000000 plus 5000 equals 50005000
Please correct values in dictionary.
# Map each unit directly to its numeric value; '' covers digits with no unit.
units = {
    '': 1,
    '十': 10,
    '百': 100,
    '千': 1000,
    '萬': 10000,
    '十萬': 100000,
    '百萬': 1000000,
    '千萬': 10000000,
    '億': 100000000,
    '十億': 1000000000,
    '百億': 10000000000,
    '千億': 100000000000,
}
test2 = '6百億5萬'
# Pair every run of digits with the (possibly empty) run of non-digits that
# follows it, so multi-digit coefficients such as '51萬' are handled too.
total = sum(
    int(digits) * units[unit]
    for digits, unit in re.findall(r'(\d+)(\D*)', test2)
)
print(total)
As already mentioned, I’d always try and avoid eval unless absolutely necessary. I think I covered all the edge cases, but with no error handling.
import re
units = ['十', '百', '千', '萬', '十萬', '百萬', '千萬', '億', '十億', '百億', '千億']
# units[i] is worth 10**(i + 1).
unit_factors = {unit: 10**factor for factor, unit in enumerate(units, start=1)}
# Optional digits followed by a unit; the reversed list tries two-character
# units before the one-character unit they start with.
digit_unit_regex = re.compile(rf"(\d*)({'|'.join(reversed(units))})")


def chinese_numeral_replace(input_text: str) -> int:
    """Convert a mixed Arabic/Chinese numeral string to an integer.

    Args:
        input_text: Text such as ``'5千萬5萬'``, ``'萬'`` or ``'5'``.

    Returns:
        The sum of every ``<digits><unit>`` pair found in the text. When no
        unit is present the whole text is parsed with ``int()``.

    Raises:
        ValueError: If no unit is found and the text is not a plain integer.

    Note:
        Only text matched by ``digit_unit_regex`` contributes to the sum, so
        digits that are not followed by a unit are ignored once at least one
        pair has been found.
    """
    digit_unit_pairs = digit_unit_regex.findall(input_text)
    if not digit_unit_pairs:
        return int(input_text)
    return sum(replace_match(*m) for m in digit_unit_pairs)


def replace_match(digit_str: str, unit_str: str) -> int:
    """Return the value of one pair; missing digits count as 1.

    Args:
        digit_str: The Arabic digits, possibly an empty string.
        unit_str: The Chinese unit; unknown units count as a factor of 1.
    """
    digits = int(digit_str or 1)
    unit = unit_factors.get(unit_str, 1)
    return digits * unit


for test_string, expected in (
    ('5', 5),
    ('萬', 10_000),
    ('5萬', 50_000),
    ('51萬', 510_000),
    ('5千萬', 50_000_000),
    ('5千萬萬', 50_010_000),
    ('5千萬5萬', 50_050_000),
):
    result = chinese_numeral_replace(test_string)
    print(test_string, expected, result)
    assert result == expected
I am writing a Python function to convert numbers represented in a mix of Chinese and Arabic numbers to a numerical value.
import re
units = ['', '十', '百', '千', '萬', '十萬', '百萬', '千萬', '億', '十億', '百億', '千億']
# The list index is the power of ten: '' -> 10**0, '十' -> 10**1,
# '百' -> 10**2, '千' -> 10**3, '萬' -> 10**4, and so on.


def chinese_numeral_replace(input_text):
    """Return the replacement text for one unit matched by ``re.sub``.

    Args:
        input_text: The ``re.Match`` object handed to the callback by
            ``re.sub``.

    Returns:
        ``'*<10**idx>'`` when the matched text is an entry of ``units``
        (``idx`` being its position in the list); otherwise the matched
        text unchanged.
    """
    matched = input_text.group(0)
    if matched in units:
        idx = units.index(matched)
        return '*' + str(pow(10, idx))
    # re.sub requires the callback to return a string, not the Match object.
    return matched


test2 = '5萬'  # means 50 thousands or 50000
# NOTE: alternatives are tried left to right, so a two-character unit such as
# '千萬' is matched as '千' followed by '萬' with this pattern.
result = re.sub('|'.join(units).lstrip('|'), chinese_numeral_replace, test2)
# NOTE(review): eval() executes arbitrary code -- never feed it untrusted input.
print(eval(result))
which prints 50000. The intermediate value of result is the string '5*10000', which eval() then evaluates to 50000.
The above function and code work when the input is simple. However, if the input string is more complex, like ‘5千萬5千’ (which translates to 50,000,000 + 5,000, that is 50,005,000), the above function produces incorrect values.
I know there are 2 bugs:
-
The re.sub() call returns incorrect results. For 千萬 in the complex case, re.sub() produces two separate matches (‘千’ and ‘萬’) instead of a single match for ‘千萬’, which is also in the units list.
-
The chinese_numeral_replace function hard-codes .group(0). How can I make the replace function perform these replacements:
- 千萬 -> 10000000 (which means 10 million in Chinese, which 千 = 1000, 萬 = 10000)
- 千 -> 1000 (which means 1000 in Chinese)
Thanks in advance.
Instead of replacing and using eval
, here’s a safer approach: re.sub
takes a function argument, where this function receives the Match
object as its argument, and returns the replacement text. In this function, you can find the index of the "units" character(s) in the units
list, and infer the power of 10 from that index.
import re
units = ['十', '百', '千', '萬', '十萬', '百萬', '千萬', '億', '十億', '百億', '千億']
# units[i] stands for 10**(i + 1): '十' = 10, '百' = 100, '千' = 1000, '萬' = 10000, etc.
# Reversing the list puts every two-character unit ahead of the one-character
# unit it starts with, so e.g. '千萬' is tried before '千'.
number_regex = re.compile(rf"(\d+)({'|'.join(reversed(units))})")
# '(\d+)(千億|百億|十億|億|千萬|百萬|十萬|萬|千|百|十)'


def chinese_numeral_repl(match):
    """Return the decimal string for one ``<digits><unit>`` match.

    Args:
        match: A ``re.Match`` produced by ``number_regex``; group 1 holds
            the Arabic digits and group 2 the Chinese unit.

    Returns:
        The digits multiplied by the unit's power of ten, as a string.
    """
    num, multiplier = match.groups()
    power_of_10 = units.index(multiplier) + 1
    repl_value = int(num) * 10**power_of_10
    # str() keeps full integer precision; a float format spec such as
    # ':.0f' would round values above 2**53.
    return str(repl_value)


print(number_regex.sub(chinese_numeral_repl, '5千萬'))  # 50000000
print(number_regex.sub(chinese_numeral_repl, '5萬'))  # 50000
print(number_regex.sub(chinese_numeral_repl, '5千'))  # 5000
Note the re.compile
call includes '|'.join(reversed(units))
.
Reversing the units
list before joining it prioritizes the larger multipliers, so it looks for '十億'
before looking for '十'
. Keeping this in mind, maybe sorting by len
would be a better strategy.
# Sorting longest-first means a multi-character unit is always tried before
# any shorter unit it begins with. '\d+' matches the leading Arabic digits.
number_regex = re.compile(rf"(\d+)({'|'.join(sorted(units, key=len, reverse=True))})")
# '(\d+)(十萬|百萬|千萬|十億|百億|千億|十|百|千|萬|億)'
For the "complex" version, you could extend this regex to match multiple instances of a number followed by an element of units
.
# One <digits><unit> pair; longer units come first so '千萬' beats '千'.
single_number_regex = rf"(\d+)({'|'.join(sorted(units, key=len, reverse=True))})"
# One or more consecutive pairs, e.g. '5千萬5千'.
full_number_regex = f"(?:{single_number_regex})+"


def single_numeral_eval(match):
    """Return the integer value of one ``<digits><unit>`` match.

    Args:
        match: A ``re.Match`` for ``single_number_regex``.

    Returns:
        The digits multiplied by the unit's power of ten, as an ``int``.
    """
    num, multiplier = match.groups()
    power_of_10 = units.index(multiplier) + 1
    return int(num) * 10**power_of_10


def chinese_numeral_repl(match):
    """Return the decimal string for a run of ``<digits><unit>`` pairs.

    Args:
        match: A ``re.Match`` for ``full_number_regex``.

    Returns:
        The sum of all pairs inside the matched text, as a string.
    """
    # Re-scan only the text this match covers to visit each pair in turn.
    full_number = sum(
        single_numeral_eval(m)
        for m in re.finditer(single_number_regex, match.group(0))
    )
    # str() keeps full integer precision; ':.0f' would go through float.
    return str(full_number)


print(re.sub(full_number_regex, chinese_numeral_repl, '5千萬5千'))  # 50005000
The difference between this approach and the other answer is that this approach will replace strings in-place, so you could do e.g.:
# Every numeral run inside the sentence is replaced where it stands.
s = "5千萬 plus 5千 equals 5千萬5千"
print(re.sub(pattern=full_number_regex, repl=chinese_numeral_repl, string=s))
# 50000000 plus 5000 equals 50005000
Please correct values in dictionary.
# Map each unit directly to its numeric value; '' covers digits with no unit.
units = {
    '': 1,
    '十': 10,
    '百': 100,
    '千': 1000,
    '萬': 10000,
    '十萬': 100000,
    '百萬': 1000000,
    '千萬': 10000000,
    '億': 100000000,
    '十億': 1000000000,
    '百億': 10000000000,
    '千億': 100000000000,
}
test2 = '6百億5萬'
# Pair every run of digits with the (possibly empty) run of non-digits that
# follows it, so multi-digit coefficients such as '51萬' are handled too.
total = sum(
    int(digits) * units[unit]
    for digits, unit in re.findall(r'(\d+)(\D*)', test2)
)
print(total)
As already mentioned, I’d always try and avoid eval unless absolutely necessary. I think I covered all the edge cases, but with no error handling.
import re
units = ['十', '百', '千', '萬', '十萬', '百萬', '千萬', '億', '十億', '百億', '千億']
# units[i] is worth 10**(i + 1).
unit_factors = {unit: 10**factor for factor, unit in enumerate(units, start=1)}
# Optional digits followed by a unit; the reversed list tries two-character
# units before the one-character unit they start with.
digit_unit_regex = re.compile(rf"(\d*)({'|'.join(reversed(units))})")


def chinese_numeral_replace(input_text: str) -> int:
    """Convert a mixed Arabic/Chinese numeral string to an integer.

    Args:
        input_text: Text such as ``'5千萬5萬'``, ``'萬'`` or ``'5'``.

    Returns:
        The sum of every ``<digits><unit>`` pair found in the text. When no
        unit is present the whole text is parsed with ``int()``.

    Raises:
        ValueError: If no unit is found and the text is not a plain integer.

    Note:
        Only text matched by ``digit_unit_regex`` contributes to the sum, so
        digits that are not followed by a unit are ignored once at least one
        pair has been found.
    """
    digit_unit_pairs = digit_unit_regex.findall(input_text)
    if not digit_unit_pairs:
        return int(input_text)
    return sum(replace_match(*m) for m in digit_unit_pairs)


def replace_match(digit_str: str, unit_str: str) -> int:
    """Return the value of one pair; missing digits count as 1.

    Args:
        digit_str: The Arabic digits, possibly an empty string.
        unit_str: The Chinese unit; unknown units count as a factor of 1.
    """
    digits = int(digit_str or 1)
    unit = unit_factors.get(unit_str, 1)
    return digits * unit


for test_string, expected in (
    ('5', 5),
    ('萬', 10_000),
    ('5萬', 50_000),
    ('51萬', 510_000),
    ('5千萬', 50_000_000),
    ('5千萬萬', 50_010_000),
    ('5千萬5萬', 50_050_000),
):
    result = chinese_numeral_replace(test_string)
    print(test_string, expected, result)
    assert result == expected