133 lines
4.6 KiB
Python
133 lines
4.6 KiB
Python
"""
|
|
Chinese number parser for the time expression parser.
|
|
"""
|
|
|
|
import re
|
|
from typing import Tuple
|
|
|
|
|
|
class ChineseNumberParser:
    """Parser for Chinese numerals built from 零-九, 两, 十, 百, 千, 万 and 亿.

    ``digest`` peels a numeral off the front of a string; ``parse`` converts a
    numeral string to an ``int``.
    """

    # Plain digit characters accepted by the scanner.
    _DIGIT_CHARS = "零一二三四五六七八九"
    # Multiplier characters; "两" is only treated as a digit in front of these.
    _UNIT_CHARS = "十百千万亿"
    # Time-unit prefixes that may directly follow a standalone "两".
    # "分钟" is listed for readability even though "分" already covers it.
    _TIME_UNIT_PREFIXES = ("小时", "分钟", "时", "分", "秒")

    def __init__(self):
        # Value of every single-digit character.
        self.digits = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
                       "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
        # Value of every multiplier character.
        self.units = {"十": 10, "百": 100, "千": 1000, "万": 10000, "亿": 100000000}

    def digest(self, text: str) -> Tuple[str, int]:
        """Parse a Chinese number from the start of ``text``.

        Args:
            text: Text that may start with a Chinese number.

        Returns:
            Tuple of ``(remaining_text, parsed_number)``. When ``text`` does
            not start with a number it is returned unchanged together with 0.
            For "两" followed (optionally after whitespace) by a time unit,
            the remaining text starts at the time unit itself so the caller
            can consume the unit in its next step.
        """
        if not text:
            return text, 0

        if text[0] == "两":
            if len(text) == 1:
                # A bare "两" is the number two.
                return "", 2
            # "两小时", "两 分钟", "两秒", ...: skip any whitespace and hand
            # back the text beginning at the time unit.
            after = text[1:].lstrip()
            if after.startswith(self._TIME_UNIT_PREFIXES):
                return after, 2
            # Otherwise fall through: "两" is only part of a numeral when a
            # multiplier follows it (e.g. "两百"), which the scan below checks.

        numeral_chars = self._DIGIT_CHARS + self._UNIT_CHARS
        length = len(text)
        end = 0
        while end < length:
            char = text[end]
            if char in numeral_chars:
                end += 1
            elif (char == "两" and end + 1 < length
                  and text[end + 1] in self._UNIT_CHARS):
                # "两" acts as the digit 2 in front of a multiplier
                # ("两百", "三千两百", "两万").
                end += 1
            else:
                break

        if end == 0:
            return text, 0
        return text[end:], self.parse(text[:end])

    def parse(self, text: str) -> int:
        """Parse a Chinese number string and return its integer value.

        Args:
            text: Chinese number string. Characters that are neither digits
                nor multipliers are ignored.

        Returns:
            Integer value of the Chinese number; 0 for an empty string.
        """
        if not text or text == "零":
            return 0
        if text == "两":
            return 2
        if text == "十":
            return 10

        # A run of plain digits with no multipliers is read positionally,
        # e.g. "二零二三" -> 2023.
        if len(text) > 1 and all(char in self.digits for char in text):
            value = 0
            for char in text:
                value = value * 10 + self.digits[char]
            return value

        # Split on the large multipliers first (亿, then 万) and recurse on the
        # part before and after the first occurrence.
        for big_unit in ("亿", "万"):
            if big_unit in text:
                head, tail = text.split(big_unit, 1)
                return self.parse(head) * self.units[big_unit] + self.parse(tail)

        total = 0
        pending = 0  # Digit waiting for its multiplier.
        for char in text:
            if char == "零":
                # Placeholder zero: contributes nothing.
                continue
            if char == "两":
                pending = 2
            elif char in self.digits:
                pending = self.digits[char]
            elif char in self.units:
                multiplier = self.units[char]
                if multiplier == 10 and pending == 0:
                    # A "十" with no digit in front means 1 * 10 ("十三").
                    pending = 1
                total += pending * multiplier
                pending = 0

        return total + pending