konabot/konabot/common/ptimeparse/chinese_number.py

"""
Chinese number parser for the time expression parser.
"""

import re
from typing import Tuple


class ChineseNumberParser:
    """Parser for Chinese numerals such as "三十", "两百", "一万两千" or "二零二四"."""

    # Characters that may appear inside a numeral. Kept as class constants so
    # ``digest`` does not rebuild them on every call.
    _DIGIT_CHARS = "零一二三四五六七八九"
    _UNIT_CHARS = "十百千万亿"
    # Time units that may directly follow a standalone "两" (meaning 2).
    # A tuple so it can be passed straight to ``str.startswith``.
    _TIME_UNITS = ("小时", "分钟", "秒", "时", "分")

    def __init__(self):
        self.digits = {"零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
                      "五": 5, "六": 6, "七": 7, "八": 8, "九": 9}
        self.units = {"十": 10, "百": 100, "千": 1000, "万": 10000, "亿": 100000000}

    def digest(self, text: str) -> Tuple[str, int]:
        """
        Parse a Chinese number from the beginning of text and return the rest and the parsed number.

        A leading "两" is accepted as the number 2 when it is the whole text,
        when it is followed (optionally after whitespace) by one of the time
        units in ``_TIME_UNITS``, or when it starts a larger numeral such as
        "两百". In any other position (e.g. "两天") it is not consumed.

        Args:
            text: Text that may start with a Chinese number

        Returns:
            Tuple of (remaining_text, parsed_number). When no number is found
            the text is returned unchanged together with 0.
        """
        if not text:
            return text, 0

        if text.startswith("两"):
            after = text[1:]
            if not after:
                # Just "两" by itself.
                return "", 2
            if after[0] not in self._UNIT_CHARS:
                # "两" used as a standalone quantity: skip any whitespace and
                # hand the text back starting AT the time unit so the caller
                # can consume the unit itself in its next step.
                stripped = after.lstrip()
                if stripped.startswith(self._TIME_UNITS):
                    return stripped, 2
            # Otherwise fall through: "两百", "两千", ... are handled by the
            # general scan below; anything else yields "no number".

        # Scan the longest prefix that looks like a numeral. "两" is only
        # accepted when it is immediately followed by a unit character
        # ("两百", "一万两千"), so that unrelated uses of "两" are not swallowed.
        length = len(text)
        i = 0
        while i < length:
            char = text[i]
            if char in self._DIGIT_CHARS or char in self._UNIT_CHARS:
                i += 1
            elif char == "两" and i + 1 < length and text[i + 1] in self._UNIT_CHARS:
                i += 1
            else:
                break

        if i == 0:
            return text, 0
        return text[i:], self.parse(text[:i])

    def parse(self, text: str) -> int:
        """
        Parse a Chinese number string and return its integer value.

        Supports unit-based numerals ("三千五百", "一亿二千万", "十三"), the
        colloquial "两" for 2, and purely positional digit strings such as
        "二零二四" (2024). Characters that are neither digits nor units are
        ignored.

        Args:
            text: Chinese number string

        Returns:
            Integer value of the Chinese number (0 for an empty string)
        """
        if not text:
            return 0
        if text == "两":
            return 2

        # A string made only of digit characters is read positionally
        # ("二零二四" -> 2024, "零" -> 0, "五" -> 5).
        if all(char in self.digits for char in text):
            value = 0
            for char in text:
                value = value * 10 + self.digits[char]
            return value

        # Split on the large units first ("亿" before "万") and recurse on
        # the part before and after the unit.
        for big_char, big_value in (("亿", 100000000), ("万", 10000)):
            if big_char in text:
                high, low = text.split(big_char, 1)
                return self.parse(high) * big_value + self.parse(low)

        # Remaining numbers (below 10000): accumulate digit * unit pairs.
        result = 0
        temp = 0

        for char in text:
            if char == "零":
                continue
            elif char == "两":
                temp = 2
            elif char in self.digits:
                temp = self.digits[char]
            elif char in self.units:
                unit = self.units[char]
                if unit == 10 and temp == 0:
                    # A bare "十" means 10, as in "十" or "十三".
                    temp = 1
                result += temp * unit
                temp = 0

        # A trailing digit without a unit is the ones place.
        result += temp
        return result