Files
konabot/konabot/common/ptimeparse/chinese_number.py
passthem 3e5c1941c8
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
重构 ptimeparse 模块
2025-11-21 06:03:28 +08:00

133 lines
4.6 KiB
Python

"""
Chinese number parser for the time expression parser.
"""
import re
from typing import Tuple
class ChineseNumberParser:
    """Parser for Chinese numerals such as "三十", "两千三百" or "一亿零五十".

    NOTE(review): in the reviewed copy of this file most single-character
    string literals were blank. They are restored here from the surrounding
    comments and from the digit/unit strings that were still intact; confirm
    against the repository version.
    """

    # "两" is the colloquial form of 2 used before units and measure words.
    _LIANG = "两"
    # Characters that may appear in a plain digit/unit run.
    _DIGIT_CHARS = "零一二三四五六七八九"
    _UNIT_CHARS = "十百千万亿"
    # Time units that make a leading "两" unambiguous ("两小时", "两分", ...).
    # Single-character units are listed as well so one startswith() call
    # covers both the long and the short forms.
    _TIME_UNITS = ("小时", "分钟", "时", "分", "秒")
    # Large units that split a number into independent sections, biggest first.
    _SECTION_UNITS = (("亿", 100000000), ("万", 10000))

    def __init__(self):
        """Build the digit and unit lookup tables used by :meth:`parse`."""
        self.digits = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
                       "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
        self.units = {"十": 10, "百": 100, "千": 1000, "万": 10000,
                      "亿": 100000000}

    def digest(self, text: str) -> Tuple[str, int]:
        """Consume a Chinese number from the start of ``text``.

        Args:
            text: Text that may start with a Chinese number.

        Returns:
            Tuple of ``(remaining_text, parsed_number)``. When ``text`` does
            not start with a number, it is returned unchanged together
            with ``0``.
        """
        if not text:
            return text, 0

        if text.startswith(self._LIANG):
            if len(text) == 1:
                # Just "两" by itself.
                return "", 2
            after = text[1:]
            # "两小时", "两分钟", "两秒", ...: hand the unit back to the caller.
            if after.startswith(self._TIME_UNITS):
                return after, 2
            # "两 小时": skip the whitespace and return the text starting at
            # the time unit, for long and short unit forms alike.
            if after[0].isspace():
                stripped = after.lstrip()
                if stripped.startswith(self._TIME_UNITS):
                    return stripped, 2
            # Anything else ("两百...", "两个...") is decided by the scan below.

        number_chars = self._DIGIT_CHARS + self._UNIT_CHARS
        length = len(text)
        end = 0
        while end < length:
            char = text[end]
            if char in number_chars:
                end += 1
            elif (
                char == self._LIANG
                and end + 1 < length
                and text[end + 1] in self._UNIT_CHARS
            ):
                # "两" counts as a digit only directly before a number unit
                # ("两百", "三万两千"); "两个" is still not treated as a number.
                end += 1
            else:
                break

        if end == 0:
            return text, 0
        return text[end:], self.parse(text[:end])

    def parse(self, text: str) -> int:
        """Convert a Chinese number string to its integer value.

        Args:
            text: Chinese number string. Characters that are neither digits
                nor units are ignored.

        Returns:
            Integer value of the number; ``0`` for an empty string.
        """
        if not text:
            return 0

        # Split on the largest section unit present and parse each side
        # independently: "三万两千" -> parse("三") * 10000 + parse("两千").
        for unit_char, unit_value in self._SECTION_UNITS:
            if unit_char in text:
                high, low = text.split(unit_char, 1)
                return self.parse(high) * unit_value + self.parse(low)

        if not any(char in self.units for char in text):
            # No units at all: read the digits positionally so that
            # "二零二五" is 2025 instead of keeping only the last digit.
            value = 0
            for char in text:
                if char == self._LIANG:
                    digit = 2
                elif char in self.digits:
                    digit = self.digits[char]
                else:
                    continue
                value = value * 10 + digit
            return value

        total = 0
        pending = 0  # digit waiting for the unit that multiplies it
        for char in text:
            if char == "零":
                # Placeholder zero inside a number ("一百零五"); no value.
                continue
            if char == self._LIANG:
                pending = 2
            elif char in self.digits:
                pending = self.digits[char]
            elif char in self.units:
                unit = self.units[char]
                if unit == 10 and pending == 0:
                    # A bare "十" means 1 * 10, as in "十三".
                    pending = 1
                total += pending * unit
                pending = 0
        return total + pending