重构 ptimeparse 模块
This commit is contained in:
133
konabot/common/ptimeparse/chinese_number.py
Normal file
133
konabot/common/ptimeparse/chinese_number.py
Normal file
@ -0,0 +1,133 @@
|
||||
"""
|
||||
Chinese number parser for the time expression parser.
|
||||
"""
|
||||
|
||||
import re
|
||||
from typing import Tuple
|
||||
|
||||
|
||||
class ChineseNumberParser:
    """Parser for Chinese numerals used by the time expression parser.

    Supports the digits 零-九, the colloquial "两" (two) and the units
    十 / 百 / 千 / 万 / 亿.
    """

    # Time-unit prefixes that may follow a standalone "两" (e.g. "两小时").
    _TIME_UNIT_PREFIXES = ("小时", "分钟", "时", "分", "秒")

    def __init__(self):
        self.digits = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
                       "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
        self.units = {"十": 10, "百": 100, "千": 1000, "万": 10000, "亿": 100000000}

    def digest(self, text: str) -> Tuple[str, int]:
        """
        Parse a Chinese number from the beginning of text.

        Args:
            text: Text that may start with a Chinese number

        Returns:
            Tuple of (remaining_text, parsed_number). When text does not
            start with a recognised number, text is returned unchanged
            together with 0.
        """
        if not text:
            return text, 0

        # A standalone "两" directly in front of a time unit means 2.
        if text.startswith("两"):
            if len(text) == 1:
                return "", 2
            after = text[1:]
            if after.startswith(self._TIME_UNIT_PREFIXES):
                # Keep the time unit in the remainder; the caller consumes it.
                return after, 2
            if after[0].isspace():
                stripped = after.lstrip()
                if stripped.startswith(self._TIME_UNIT_PREFIXES):
                    # Drop all whitespace between "两" and the unit so the
                    # remainder always starts at the unit itself.
                    return stripped, 2
            # Otherwise fall through: "两" followed by a numeral unit
            # (e.g. "两百") is consumed by the scan below; any other
            # leading "两" is left untouched and yields (text, 0).

        end = 0
        length = len(text)
        while end < length:
            char = text[end]
            if char in self.digits or char in self.units:
                end += 1
            elif char == "两" and end + 1 < length and text[end + 1] in self.units:
                # "两" counts as a digit only when it is the coefficient
                # of a unit, as in "两百" or "一万两千".
                end += 1
            else:
                break

        if end == 0:
            return text, 0

        return text[end:], self.parse(text[:end])

    def parse(self, text: str) -> int:
        """
        Parse a Chinese number string and return its integer value.

        A string made only of digits (no units), such as "二零二五", is read
        positionally (2025). Characters that are neither digits, "两" nor
        units are ignored.

        Args:
            text: Chinese number string

        Returns:
            Integer value of the Chinese number; 0 for an empty string
        """
        if not text:
            return 0

        # Pure digit sequences are positional, e.g. years: "二零二五" -> 2025.
        if len(text) > 1 and all(char in self.digits for char in text):
            value = 0
            for char in text:
                value = value * 10 + self.digits[char]
            return value

        # Split on the large units first; each side is a smaller number.
        for big_unit in ("亿", "万"):
            if big_unit in text:
                high, low = text.split(big_unit, 1)
                return self.parse(high) * self.units[big_unit] + self.parse(low)

        total = 0
        pending = 0  # digit waiting for its unit
        for char in text:
            if char == "零":
                # Placeholder zero ("一百零五") carries no value.
                continue
            if char == "两":
                pending = 2
            elif char in self.digits:
                pending = self.digits[char]
            elif char in self.units:
                unit = self.units[char]
                if unit == 10 and pending == 0:
                    # A bare "十" means "一十", as in "十三".
                    pending = 1
                total += pending * unit
                pending = 0

        return total + pending
|
||||
Reference in New Issue
Block a user