重构 ptimeparse 模块

2025-11-21 06:03:28 +08:00
parent f6e7dfcd93
commit 3e5c1941c8
11 changed files with 2058 additions and 645 deletions
--- a/konabot/common/ptimeparse/chinese_number.py
+++ b/konabot/common/ptimeparse/chinese_number.py
@@ -0,0 +1,133 @@
+"""
+Chinese number parser for the time expression parser.
+"""
+
+import re
+from typing import Tuple
+
+
+class ChineseNumberParser:
+    """
+    Parser for Chinese numbers.
+
+    Understands the digits "零" to "九", the colloquial "两" (2) and the
+    magnitude characters "十", "百", "千", "万" and "亿".
+
+    Attributes:
+        digits: Maps each digit character to its value.
+        units: Maps each magnitude character to its multiplier.
+    """
+
+    def __init__(self):
+        self.digits = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
+                       "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
+        self.units = {"十": 10, "百": 100, "千": 1000, "万": 10000, "亿": 100000000}
+
+    def digest(self, text: str) -> Tuple[str, int]:
+        """
+        Parse a Chinese number from the beginning of text.
+
+        A leading "两" counts as 2 when it is the whole text, when a time unit
+        ("小时", "分钟", "时", "分", "秒") follows it directly, or when such a unit
+        follows after whitespace (the whitespace is then dropped from the
+        remaining text). "两" is also read as a digit wherever a magnitude
+        character follows it, as in "两百" or "三千两百".
+
+        Args:
+            text: Text that may start with a Chinese number
+
+        Returns:
+            Tuple of (remaining_text, parsed_number). When text does not start
+            with a number the result is (text, 0).
+
+        Examples:
+            digest("三十分钟后") returns ("分钟后", 30)
+            digest("两百米") returns ("米", 200)
+            digest("两 小时") returns ("小时", 2)
+        """
+        if not text:
+            return text, 0
+
+        if text.startswith("两"):
+            if len(text) == 1:
+                return "", 2
+            time_units = ("小时", "分钟", "时", "分", "秒")
+            after = text[1:]
+            if after.startswith(time_units):
+                # Leave the unit in place for the caller to consume.
+                return after, 2
+            if after[0].isspace():
+                # Skip all the whitespace so the remainder starts at the unit,
+                # whichever unit spelling is used.
+                stripped = after.lstrip()
+                if stripped.startswith(time_units):
+                    return stripped, 2
+
+        # Take the longest prefix made of numeral characters.
+        end = 0
+        while end < len(text):
+            char = text[end]
+            if char == "两":
+                # "两" is numeric only when it multiplies a magnitude character;
+                # otherwise it ends the number (or there is no number at all).
+                if text[end + 1:end + 2] not in self.units:
+                    break
+            elif char not in self.digits and char not in self.units:
+                break
+            end += 1
+        if end == 0:
+            return text, 0
+        return text[end:], self.parse(text[:end])
+
+    def parse(self, text: str) -> int:
+        """
+        Parse a Chinese number string and return its integer value.
+
+        A string made only of digit characters is read positionally, so
+        "二零二五" is 2025. Otherwise the magnitude characters "十", "百", "千",
+        "万" and "亿" scale what precedes them, and "两" counts as 2.
+        Characters that are not Chinese numerals are skipped.
+
+        Args:
+            text: Chinese number string
+
+        Returns:
+            Integer value of the Chinese number; 0 for an empty string
+        """
+        if not text:
+            return 0
+
+        # Without magnitude characters there is nothing to multiply: read the
+        # digits as a decimal string instead of keeping only the last one.
+        if all(char in self.digits for char in text):
+            value = 0
+            for char in text:
+                value = value * 10 + self.digits[char]
+            return value
+
+        # Split at the largest magnitude first; both sides are ordinary numbers.
+        for big in ("亿", "万"):
+            if big in text:
+                head, tail = text.split(big, 1)
+                return self.parse(head) * self.units[big] + self.parse(tail)
+
+        # Below 10000: each magnitude character multiplies the pending digit.
+        result = 0
+        temp = 0
+        for char in text:
+            if char == "零":
+                # A placeholder zero, as in "一百零五", adds nothing.
+                continue
+            if char == "两":
+                temp = 2
+            elif char in self.digits:
+                temp = self.digits[char]
+            elif char in self.units:
+                unit = self.units[char]
+                if unit == 10 and temp == 0:
+                    # A bare "十" means ten, as in "十三".
+                    temp = 1
+                result += temp * unit
+                temp = 0
+
+        return result + temp