"""
Lexical analyzer for time expressions.
"""
import re
from typing import Iterator, Optional
import datetime
from .ptime_token import Token, TokenType
from .chinese_number import ChineseNumberParser
class Lexer:
    """Lexical analyzer for Chinese time expressions.

    Scans ``text`` left to right and produces :class:`Token` objects.
    At each position the regexes in ``token_patterns`` are tried in order
    and the first match wins; if none match, a Chinese numeral (e.g. "十五")
    is attempted via :class:`ChineseNumberParser`; any character that still
    matches nothing is skipped.
    """

    def __init__(self, text: str, now: Optional[datetime.datetime] = None):
        """Create a lexer over *text*.

        Args:
            text: The time expression to tokenize.
            now: Reference "current" time kept for later parsing stages;
                defaults to ``datetime.datetime.now()``.
        """
        self.text = text
        self.pos = 0  # index of the character the lexer will read next
        self.current_char = self.text[self.pos] if self.text else None
        self.now = now or datetime.datetime.now()
        self.chinese_parser = ChineseNumberParser()

        # Token patterns, tried strictly in order: the FIRST regex that
        # matches wins, so longer / more specific patterns must precede
        # shorter ones that share a prefix.
        self.token_patterns = [
            # Whitespace
            (r'^\s+', TokenType.WHITESPACE),

            # Time separators
            (r'^:', TokenType.TIME_SEPARATOR),
            (r'^点', TokenType.TIME_SEPARATOR),
            (r'^时', TokenType.TIME_SEPARATOR),
            (r'^分', TokenType.TIME_SEPARATOR),
            (r'^秒', TokenType.TIME_SEPARATOR),
            # NOTE(review): because '分' and '秒' are consumed here, the
            # former '分钟' (MINUTE) and '秒' (SECOND) unit entries further
            # down could never match and were removed as dead code; '分钟'
            # tokenizes as 分 + 钟 (ZHONG).  Confirm the parser expects that
            # split before reordering these entries.

            # Special time markers
            (r'^半', TokenType.HALF),
            (r'^一刻', TokenType.QUARTER),
            (r'^整', TokenType.ZHENG),
            (r'^钟', TokenType.ZHONG),

            # Period indicators (must come before relative time patterns
            # to avoid conflicts)
            (r'^(上午|早晨|早上|清晨|早(?!\d))', TokenType.PERIOD_AM),
            (r'^(中午|下午|晚上|晚(?!\d)|凌晨|午夜)', TokenType.PERIOD_PM),

            # Week scope (more specific patterns first)
            (r'^本周', TokenType.WEEK_SCOPE_CURRENT),
            (r'^上周', TokenType.WEEK_SCOPE_LAST),
            (r'^下周', TokenType.WEEK_SCOPE_NEXT),

            # Relative directions.  The (?!天) lookaheads keep a bare
            # '后' / '前' from swallowing the first character of '后天' /
            # '前天': previously those dedicated relative-day patterns
            # below were unreachable and "后天" tokenized as 后 + 天.
            (r'^(后(?!天)|以后|之后)', TokenType.RELATIVE_DIRECTION_FORWARD),
            (r'^(前(?!天)|以前|之前)', TokenType.RELATIVE_DIRECTION_BACKWARD),

            # Extended relative time
            (r'^明年', TokenType.RELATIVE_NEXT),
            (r'^去年', TokenType.RELATIVE_LAST),
            (r'^今年', TokenType.RELATIVE_THIS),
            (r'^下(?![午年月周])', TokenType.RELATIVE_NEXT),
            (r'^(上|去)(?![午年月周])', TokenType.RELATIVE_LAST),
            (r'^这', TokenType.RELATIVE_THIS),
            # Match "本" but not "本周", "本月", "本年"
            (r'^本(?![周月年])', TokenType.RELATIVE_THIS),

            # Week scope fallbacks: only reachable for e.g. 本月/本年 and
            # 上月/上年, which the lookaheads above deliberately skip.
            # (A duplicate '^下(?![午年月周])' entry was removed here: the
            # identical RELATIVE_NEXT pattern above always matched first.)
            (r'^本', TokenType.WEEK_SCOPE_CURRENT),
            (r'^上', TokenType.WEEK_SCOPE_LAST),

            # Week days (order matters - longer patterns first).  Single
            # characters 一..日 are deliberately not matched as weekdays:
            # they must remain available as Chinese numerals.
            (r'^周一', TokenType.WEEKDAY_MONDAY),
            (r'^周二', TokenType.WEEKDAY_TUESDAY),
            (r'^周三', TokenType.WEEKDAY_WEDNESDAY),
            (r'^周四', TokenType.WEEKDAY_THURSDAY),
            (r'^周五', TokenType.WEEKDAY_FRIDAY),
            (r'^周六', TokenType.WEEKDAY_SATURDAY),
            (r'^周日', TokenType.WEEKDAY_SUNDAY),

            # Student-friendly time expressions such as "早8" / "晚10"
            (r'^早(?=\d)', TokenType.EARLY_MORNING),
            (r'^晚(?=\d)', TokenType.LATE_NIGHT),

            # Relative today variants (longest first).  '今早上' was missing
            # even though the PERIOD_AM remap in get_next_token listed it
            # (the PM analogue '今晚上' already existed).
            (r'^今早上', TokenType.RELATIVE_TODAY),
            (r'^今晚上', TokenType.RELATIVE_TODAY),
            (r'^今晚', TokenType.RELATIVE_TODAY),
            (r'^今早', TokenType.RELATIVE_TODAY),
            (r'^今天早上', TokenType.RELATIVE_TODAY),
            (r'^今天早晨', TokenType.RELATIVE_TODAY),
            (r'^今天上午', TokenType.RELATIVE_TODAY),
            (r'^今天下午', TokenType.RELATIVE_TODAY),
            (r'^今天晚上', TokenType.RELATIVE_TODAY),
            (r'^今天', TokenType.RELATIVE_TODAY),

            # Relative days (longest first)
            (r'^大后天', TokenType.RELATIVE_THREE_DAYS_AFTER_TOMORROW),
            (r'^明天', TokenType.RELATIVE_TOMORROW),
            (r'^后天', TokenType.RELATIVE_DAY_AFTER_TOMORROW),
            (r'^大前天', TokenType.RELATIVE_THREE_DAYS_BEFORE_YESTERDAY),
            (r'^昨天', TokenType.RELATIVE_YESTERDAY),
            (r'^前天', TokenType.RELATIVE_DAY_BEFORE_YESTERDAY),

            # Digits
            (r'^\d+', TokenType.INTEGER),

            # Time units.  The lookaheads make '年'/'月'/'日'/'号' that are
            # immediately followed by another date component fall through
            # to the DATE_SEPARATOR fallbacks below (e.g. "2024年3月").
            (r'^年(?![月日号])', TokenType.YEAR),
            (r'^月(?![日号])', TokenType.MONTH),
            (r'^[日号](?![月年])', TokenType.DAY),
            (r'^天', TokenType.DAY),
            (r'^周', TokenType.WEEK),
            (r'^小时', TokenType.HOUR),

            # Date separators (fallback patterns)
            (r'^年', TokenType.DATE_SEPARATOR),
            (r'^月', TokenType.DATE_SEPARATOR),
            (r'^[日号]', TokenType.DATE_SEPARATOR),
            (r'^[-/]', TokenType.DATE_SEPARATOR),
        ]

    def advance(self) -> None:
        """Advance the position pointer and update ``current_char``.

        ``current_char`` becomes ``None`` once the end of input is passed.
        """
        self.pos += 1
        if self.pos >= len(self.text):
            self.current_char = None
        else:
            self.current_char = self.text[self.pos]

    def skip_whitespace(self) -> None:
        """Skip over consecutive whitespace characters."""
        while self.current_char is not None and self.current_char.isspace():
            self.advance()

    def integer(self) -> int:
        """Consume a run of digit characters and return them as an int.

        Assumes the current character is a digit; calling it on a
        non-digit raises ValueError (from ``int('')``).
        """
        result = ''
        while self.current_char is not None and self.current_char.isdigit():
            result += self.current_char
            self.advance()
        return int(result)

    def chinese_number(self) -> int:
        """Parse a Chinese numeral starting at the current position.

        Tries the longest remaining prefix first and advances past exactly
        what ``ChineseNumberParser.digest`` consumed.

        Returns:
            The parsed value; 0 (without advancing) when no prefix parses.
        """
        for i in range(len(self.text) - self.pos, 0, -1):
            prefix = self.text[self.pos:self.pos + i]
            try:
                # digest returns the unconsumed remainder and the parsed value.
                remaining, value = self.chinese_parser.digest(prefix)
                consumed_length = len(prefix) - len(remaining)
                if consumed_length > 0:
                    # Advance position by the length of the consumed text.
                    for _ in range(consumed_length):
                        self.advance()
                    return value
            except ValueError:
                continue
        # No Chinese numeral found at this position.
        return 0

    def get_next_token(self) -> Token:
        """Return the next token from the input.

        Tries the regex patterns in order, then Chinese numerals; any
        character that matches nothing is skipped.  Returns an EOF token
        when the input is exhausted.
        """
        while self.current_char is not None:
            # Skip whitespace between tokens.
            if self.current_char.isspace():
                self.skip_whitespace()
                continue

            # Try each pattern against the remaining text; first match wins.
            text_remaining = self.text[self.pos:]
            for pattern, token_type in self.token_patterns:
                match = re.match(pattern, text_remaining)
                if match:
                    value = match.group(0)
                    position = self.pos

                    # Advance past the matched text.
                    for _ in range(len(value)):
                        self.advance()

                    # Special handling for some tokens.
                    if token_type == TokenType.INTEGER:
                        value = int(value)
                    elif token_type == TokenType.RELATIVE_TODAY and value in [
                        "今早上", "今天早上", "今天早晨", "今天上午"
                    ]:
                        # "this morning" variants carry AM period info.
                        # NOTE(review): bare "今早"/"今晚" stay RELATIVE_TODAY;
                        # confirm that asymmetry is intended.
                        token_type = TokenType.PERIOD_AM
                    elif token_type == TokenType.RELATIVE_TODAY and value in [
                        "今晚上", "今天下午", "今天晚上"
                    ]:
                        token_type = TokenType.PERIOD_PM

                    return Token(token_type, value, position)

            # No pattern matched; try to parse a Chinese numeral.
            chinese_start_pos = self.pos
            try:
                chinese_value = self.chinese_number()
                if self.pos != chinese_start_pos:
                    # Characters were consumed: emit a number token even when
                    # the parsed value is 0 (e.g. "零").  Checking position
                    # instead of `chinese_value > 0` avoids silently dropping
                    # the consumed text and then skipping one extra character
                    # in the advance() below.
                    return Token(TokenType.CHINESE_NUMBER, chinese_value, chinese_start_pos)
            except ValueError:
                pass

            # Unknown character: skip it and keep scanning.
            self.advance()

        # End of input.
        return Token(TokenType.EOF, None, self.pos)

    def tokenize(self) -> Iterator[Token]:
        """Yield every token in the input, ending with the EOF token."""
        while True:
            token = self.get_next_token()
            yield token
            if token.type == TokenType.EOF:
                break