Files
konabot/konabot/common/ptimeparse/lexer.py
passthem 3e5c1941c8
All checks were successful
continuous-integration/drone/push Build is passing
continuous-integration/drone/tag Build is passing
重构 ptimeparse 模块
2025-11-21 06:03:28 +08:00

225 lines
9.0 KiB
Python

"""
Lexical analyzer for time expressions.
"""
import re
from typing import Iterator, Optional
import datetime
from .ptime_token import Token, TokenType
from .chinese_number import ChineseNumberParser
class Lexer:
    """Lexical analyzer that turns a Chinese time expression into tokens.

    Matching strategy: anchored regex patterns are tried in list order
    against the remaining input and the first match wins, so more specific
    patterns must be listed before any pattern matching a prefix of them
    (e.g. "后天" before the bare direction "后").
    """

    def __init__(self, text: str, now: Optional[datetime.datetime] = None):
        """Initialize the lexer.

        Args:
            text: The time expression to tokenize.
            now: Reference time for relative expressions; defaults to
                ``datetime.datetime.now()`` when omitted.
        """
        self.text = text
        self.pos = 0
        self.current_char = self.text[self.pos] if self.text else None
        self.now = now or datetime.datetime.now()
        self.chinese_parser = ChineseNumberParser()
        # Ordered (regex, token type) pairs. ORDER MATTERS: first match wins.
        self.token_patterns = [
            # Whitespace (normally consumed by skip_whitespace() before
            # matching; kept as a safety net).
            (r'^\s+', TokenType.WHITESPACE),
            # Time separators
            (r'^:', TokenType.TIME_SEPARATOR),
            (r'^点', TokenType.TIME_SEPARATOR),
            (r'^时', TokenType.TIME_SEPARATOR),
            (r'^分', TokenType.TIME_SEPARATOR),
            (r'^秒', TokenType.TIME_SEPARATOR),
            # Special time markers
            (r'^半', TokenType.HALF),
            (r'^一刻', TokenType.QUARTER),
            (r'^整', TokenType.ZHENG),
            (r'^钟', TokenType.ZHONG),
            # Period indicators (must precede relative patterns to avoid
            # conflicts, e.g. "上午" vs bare "上").
            (r'^(上午|早晨|早上|清晨|早(?!\d))', TokenType.PERIOD_AM),
            (r'^(中午|下午|晚上|晚(?!\d)|凌晨|午夜)', TokenType.PERIOD_PM),
            # Relative "today" variants (longer patterns first).
            (r'^今晚上', TokenType.RELATIVE_TODAY),
            (r'^今晚', TokenType.RELATIVE_TODAY),
            (r'^今早上', TokenType.RELATIVE_TODAY),
            (r'^今早', TokenType.RELATIVE_TODAY),
            (r'^今天早上', TokenType.RELATIVE_TODAY),
            (r'^今天早晨', TokenType.RELATIVE_TODAY),
            (r'^今天上午', TokenType.RELATIVE_TODAY),
            (r'^今天下午', TokenType.RELATIVE_TODAY),
            (r'^今天晚上', TokenType.RELATIVE_TODAY),
            (r'^今天', TokenType.RELATIVE_TODAY),
            # Relative days. These MUST precede the bare direction patterns
            # ("后"/"前") below, otherwise "后天" would lex as
            # DIRECTION("后") + DAY("天") and "大后天"/"大前天" would never
            # be recognized at all.
            (r'^明天', TokenType.RELATIVE_TOMORROW),
            (r'^大后天', TokenType.RELATIVE_THREE_DAYS_AFTER_TOMORROW),
            (r'^后天', TokenType.RELATIVE_DAY_AFTER_TOMORROW),
            (r'^昨天', TokenType.RELATIVE_YESTERDAY),
            (r'^大前天', TokenType.RELATIVE_THREE_DAYS_BEFORE_YESTERDAY),
            (r'^前天', TokenType.RELATIVE_DAY_BEFORE_YESTERDAY),
            # Week scope (more specific patterns first)
            (r'^本周', TokenType.WEEK_SCOPE_CURRENT),
            (r'^上周', TokenType.WEEK_SCOPE_LAST),
            (r'^下周', TokenType.WEEK_SCOPE_NEXT),
            # Relative directions
            (r'^(后|以后|之后)', TokenType.RELATIVE_DIRECTION_FORWARD),
            (r'^(前|以前|之前)', TokenType.RELATIVE_DIRECTION_BACKWARD),
            # Extended relative time
            (r'^明年', TokenType.RELATIVE_NEXT),
            (r'^去年', TokenType.RELATIVE_LAST),
            (r'^今年', TokenType.RELATIVE_THIS),
            (r'^下(?![午年月周])', TokenType.RELATIVE_NEXT),
            (r'^(上|去)(?![午年月周])', TokenType.RELATIVE_LAST),
            (r'^这', TokenType.RELATIVE_THIS),
            # Match "本" but not "本周", "本月", "本年"
            (r'^本(?![周月年])', TokenType.RELATIVE_THIS),
            # Week scope fallbacks: only inputs the lookaheads above reject
            # (e.g. "本月", "上月") reach these.
            (r'^本', TokenType.WEEK_SCOPE_CURRENT),
            (r'^上', TokenType.WEEK_SCOPE_LAST),
            # Week days. Single-character forms ("一".."日") are deliberately
            # NOT lexed here: they would collide with Chinese numerals, which
            # are handled after the regex pass.
            (r'^周一', TokenType.WEEKDAY_MONDAY),
            (r'^周二', TokenType.WEEKDAY_TUESDAY),
            (r'^周三', TokenType.WEEKDAY_WEDNESDAY),
            (r'^周四', TokenType.WEEKDAY_THURSDAY),
            (r'^周五', TokenType.WEEKDAY_FRIDAY),
            (r'^周六', TokenType.WEEKDAY_SATURDAY),
            (r'^周日', TokenType.WEEKDAY_SUNDAY),
            # Student-friendly shorthand: "早8" / "晚10"
            (r'^早(?=\d)', TokenType.EARLY_MORNING),
            (r'^晚(?=\d)', TokenType.LATE_NIGHT),
            # Digits
            (r'^\d+', TokenType.INTEGER),
            # Time units (lookaheads keep "年/月/日" followed by further date
            # parts for the DATE_SEPARATOR fallbacks below).
            (r'^年(?![月日号])', TokenType.YEAR),
            (r'^月(?![日号])', TokenType.MONTH),
            (r'^[日号](?![月年])', TokenType.DAY),
            (r'^天', TokenType.DAY),
            (r'^周', TokenType.WEEK),
            (r'^小时', TokenType.HOUR),
            # NOTE(review): "分" and "秒" always match TIME_SEPARATOR above,
            # so these two units are unreachable; presumably the parser
            # reconstructs "分钟" from TIME_SEPARATOR("分") + ZHONG("钟").
            # Left in place rather than reordered — confirm against parser.
            (r'^分钟', TokenType.MINUTE),
            (r'^秒', TokenType.SECOND),
            # Date separators (fallback patterns)
            (r'^年', TokenType.DATE_SEPARATOR),
            (r'^月', TokenType.DATE_SEPARATOR),
            (r'^[日号]', TokenType.DATE_SEPARATOR),
            (r'^[-/]', TokenType.DATE_SEPARATOR),
        ]

    def advance(self) -> None:
        """Move to the next character; current_char becomes None at end of input."""
        self.pos += 1
        if self.pos >= len(self.text):
            self.current_char = None
        else:
            self.current_char = self.text[self.pos]

    def skip_whitespace(self) -> None:
        """Consume consecutive whitespace characters."""
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def integer(self) -> int:
        """Consume a run of digits at the current position and return their value.

        Raises ValueError (via int('')) if the current character is not a digit.
        """
        digits = []
        while self.current_char is not None and self.current_char.isdigit():
            digits.append(self.current_char)
            self.advance()
        return int(''.join(digits))

    def chinese_number(self) -> int:
        """Parse a Chinese numeral starting at the current position.

        Advances past the consumed characters and returns the parsed value.
        Returns 0 without advancing when no Chinese numeral starts here.
        """
        # Try the longest candidate first so e.g. "二十三" wins over "二".
        for length in range(len(self.text) - self.pos, 0, -1):
            candidate = self.text[self.pos:self.pos + length]
            try:
                # digest() returns the unconsumed remainder and the value.
                remaining, value = self.chinese_parser.digest(candidate)
            except ValueError:
                continue
            consumed = len(candidate) - len(remaining)
            if consumed > 0:
                for _ in range(consumed):
                    self.advance()
                return value
        return 0

    def get_next_token(self) -> Token:
        """Return the next token from the input (TokenType.EOF at the end).

        Unrecognized characters are silently skipped.
        """
        while self.current_char is not None:
            if self.current_char.isspace():
                self.skip_whitespace()
                continue
            remaining = self.text[self.pos:]
            for pattern, token_type in self.token_patterns:
                match = re.match(pattern, remaining)
                if not match:
                    continue
                value = match.group(0)
                position = self.pos
                # Consume the matched lexeme.
                for _ in range(len(value)):
                    self.advance()
                if token_type == TokenType.INTEGER:
                    value = int(value)
                elif token_type == TokenType.RELATIVE_TODAY and value in (
                    "今早上", "今天早上", "今天早晨", "今天上午"
                ):
                    # "this morning" phrases double as an AM period marker.
                    token_type = TokenType.PERIOD_AM
                elif token_type == TokenType.RELATIVE_TODAY and value in (
                    "今晚上", "今天下午", "今天晚上"
                ):
                    token_type = TokenType.PERIOD_PM
                return Token(token_type, value, position)
            # No regex matched; try a Chinese numeral.
            start = self.pos
            chinese_value = self.chinese_number()
            if self.pos > start:
                # At least one character was consumed: emit a token even for a
                # zero value (e.g. "零"), so the position advance is never left
                # unreported and the following character is not dropped.
                return Token(TokenType.CHINESE_NUMBER, chinese_value, start)
            # Unrecognized character: skip it and keep scanning.
            self.advance()
        # End of input.
        return Token(TokenType.EOF, None, self.pos)

    def tokenize(self) -> Iterator[Token]:
        """Yield every token from the input, ending with the EOF token."""
        while True:
            token = self.get_next_token()
            yield token
            if token.type == TokenType.EOF:
                break