# Python source: 636 lines, 23 KiB
import datetime
|
||
import re
|
||
from dataclasses import dataclass
|
||
from typing import Literal
|
||
|
||
from ptimeparse.err import (MultipleSpecificationException,
|
||
TokenUnhandledException)
|
||
|
||
|
||
@dataclass
|
||
class Parser:
|
||
now: datetime.datetime
|
||
|
||
timedelta: datetime.timedelta
|
||
|
||
hour_delta_triggered: bool = False
|
||
minute_delta_triggered: bool = False
|
||
second_delta_triggered: bool = False
|
||
|
||
ampm_specification: Literal["AM", "PM", None, "ABSOLUTE"] = None
|
||
ampm_ismid: bool = False
|
||
hour_specification: int | None = None
|
||
minute_specification: int | None = None
|
||
time_spec_day_delta: int = 0
|
||
|
||
@property
|
||
def time_delta_triggered(self):
|
||
return self.hour_delta_triggered or self.minute_delta_triggered or self.second_delta_triggered
|
||
|
||
def __init__(self, now: datetime.datetime | None = None):
|
||
self.now = datetime.datetime.now() if now is None else now
|
||
|
||
self.CN_NUM = {
|
||
"零": 0,
|
||
"一": 1,
|
||
"二": 2,
|
||
"两": 2,
|
||
"三": 3,
|
||
"四": 4,
|
||
"五": 5,
|
||
"六": 6,
|
||
"七": 7,
|
||
"八": 8,
|
||
"九": 9,
|
||
"十": 10,
|
||
"百": 100,
|
||
"千": 1000,
|
||
}
|
||
self.CN_UNIT = {"万": 1_0000, "亿": 1_0000_0000, "兆": 1_0000_0000_0000}
|
||
|
||
def clear_state(self):
|
||
self.timedelta = datetime.timedelta()
|
||
self.hour_delta_triggered = False
|
||
self.minute_delta_triggered = False
|
||
self.second_delta_triggered = False
|
||
|
||
self.ampm_specification = None
|
||
self.ampm_ismid = False
|
||
self.hour_specification = None
|
||
self.minute_specification = None
|
||
self.time_spec_day_delta = 0
|
||
|
||
def clean(self, content: str) -> str:
|
||
return re.sub(r"[ \t的]", "", content)
|
||
|
||
def parse(self, content: str) -> datetime.datetime:
|
||
self.clear_state()
|
||
content = self.clean(content)
|
||
|
||
content = self.digest_relative_date(content)
|
||
content = self.digest_weekday_relative(content)
|
||
content = self.digest_delta(content)
|
||
|
||
content = self.digest_date(content)
|
||
|
||
content = self.digest_early_late_hour(content)
|
||
content = self.digest_ampm_specific(content)
|
||
content = self.digest_time(content)
|
||
content = self.digest_ke(content)
|
||
if len(content) != 0:
|
||
raise TokenUnhandledException(content)
|
||
return self.build()
|
||
|
||
def digest_relative_date(self, content: str) -> str:
|
||
"""
|
||
处理明天、昨天、今天、后天、大后天、前天、大前天 等相对日期。
|
||
返回剩余未处理字符串。
|
||
"""
|
||
# 注意:这些词必须完整匹配开头,避免误匹配(如“明天”不能匹配“明天早上”中的“明”)
|
||
relative_days = {
|
||
"今天": 0,
|
||
"明天": 1,
|
||
"后天": 2,
|
||
"大后天": 3,
|
||
"昨日": -1,
|
||
"昨天": -1,
|
||
"前天": -2,
|
||
"大前天": -3,
|
||
}
|
||
|
||
for word, delta_days in relative_days.items():
|
||
if content.startswith(word):
|
||
# 如果已经设置了时间偏移(如 3 小时后),则冲突
|
||
if self.time_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
# 如果已经通过其他方式设置了 day delta(如 digest_date 中),这里也应检查
|
||
# 为简化,我们直接设置
|
||
self.time_spec_day_delta = delta_days
|
||
return content[len(word):]
|
||
|
||
return content
|
||
|
||
def digest_timedelta(self, content: str) -> str:
|
||
"""
|
||
解析形如 "3天", "2小时", "1星期", "5个月" 等时间增量。
|
||
支持中文数字和阿拉伯数字。
|
||
返回未处理的剩余字符串。
|
||
"""
|
||
|
||
if content.startswith("半"):
|
||
# "半"通常指"半小时"
|
||
remaining = content[1:]
|
||
# 检查是否有"小时"、"时"等
|
||
if remaining.startswith(("小时", "时")):
|
||
if self.hour_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.hour_delta_triggered = True
|
||
self.timedelta = datetime.timedelta(minutes=30)
|
||
return remaining[len("小时") if remaining.startswith("小时") else len("时"):]
|
||
elif remaining.startswith(("分钟", "分")):
|
||
if self.minute_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.minute_delta_triggered = True
|
||
self.timedelta = datetime.timedelta(minutes=30)
|
||
return remaining[len("分钟") if remaining.startswith("分钟") else len("分"):]
|
||
else:
|
||
# 默认为半小时
|
||
if self.hour_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.hour_delta_triggered = True
|
||
self.timedelta = datetime.timedelta(minutes=30)
|
||
return remaining
|
||
|
||
# 定义时间单位映射(注意:月需特殊处理)
|
||
unit_patterns = [
|
||
(r"(秒钟|秒)", "second"),
|
||
(r"(分钟|分)", "minute"),
|
||
(r"(时|小时|点)", "hour"),
|
||
(r"半(时|小时|点)", "hour+30"),
|
||
(r"(天|日)", "day"),
|
||
(r"(星期|周)", "week"),
|
||
(r"(月)", "month"), # 特殊:按30天处理
|
||
]
|
||
|
||
remaining = content
|
||
delta_kwargs = {
|
||
"days": 0,
|
||
"seconds": 0,
|
||
"minutes": 0,
|
||
"hours": 0,
|
||
"weeks": 0,
|
||
}
|
||
month_count = 0 # 单独记录月,最后转为天
|
||
|
||
while True:
|
||
matched = False
|
||
for pattern, unit_type in unit_patterns:
|
||
m = re.match(rf"^([零一二两三四五六七八九十百千万亿兆]*|\d+)?个?({pattern})", remaining)
|
||
if m:
|
||
num_str = m.group(1)
|
||
if num_str is None or num_str == "":
|
||
num = 1 # 默认为1,如“明天”实际是“1天后”
|
||
else:
|
||
# 尝试解析数字(中文或阿拉伯)
|
||
_, num = self.digest_chinese_number(num_str)
|
||
if num is None:
|
||
try:
|
||
num = int(num_str)
|
||
except ValueError:
|
||
continue # 无效数字,跳过
|
||
|
||
# 设置标志位,防止后续时间规格冲突
|
||
if unit_type == "hour":
|
||
if self.hour_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.hour_delta_triggered = True
|
||
elif unit_type == "hour+30":
|
||
if self.hour_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.hour_delta_triggered = True
|
||
if self.minute_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.minute_delta_triggered = True
|
||
elif unit_type == "minute":
|
||
if self.minute_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.minute_delta_triggered = True
|
||
elif unit_type == "second":
|
||
if self.second_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.second_delta_triggered = True
|
||
|
||
# 累加到对应单位
|
||
if unit_type == "second":
|
||
delta_kwargs["seconds"] += num
|
||
elif unit_type == "minute":
|
||
delta_kwargs["minutes"] += num
|
||
elif unit_type == "hour":
|
||
delta_kwargs["hours"] += num
|
||
elif unit_type == "day":
|
||
delta_kwargs["days"] += num
|
||
elif unit_type == "week":
|
||
delta_kwargs["weeks"] += num
|
||
elif unit_type == "month":
|
||
month_count += num
|
||
elif unit_type == "hour+30":
|
||
delta_kwargs["hours"] += num
|
||
delta_kwargs["minutes"] += 30
|
||
|
||
# 更新剩余字符串
|
||
remaining = remaining[len(m.group(0)):]
|
||
matched = True
|
||
break
|
||
|
||
if not matched:
|
||
break
|
||
|
||
# 处理“月” → 按30天/月估算(简单处理)
|
||
if month_count > 0:
|
||
delta_kwargs["days"] += month_count * 30
|
||
|
||
# 构建 timedelta
|
||
self.timedelta = datetime.timedelta(
|
||
days=delta_kwargs["days"],
|
||
seconds=delta_kwargs["seconds"],
|
||
minutes=delta_kwargs["minutes"],
|
||
hours=delta_kwargs["hours"],
|
||
weeks=delta_kwargs["weeks"]
|
||
)
|
||
|
||
return remaining
|
||
|
||
def digest_delta(self, content: str) -> str:
|
||
if "后" in content:
|
||
c1, _ = content.split("后", 1)
|
||
c1 = self.digest_timedelta(c1)
|
||
if c1 != "":
|
||
raise TokenUnhandledException(c1)
|
||
return c1
|
||
if "前" in content:
|
||
c1, _ = content.split("前", 1)
|
||
c1 = self.digest_timedelta(c1)
|
||
self.timedelta = -self.timedelta
|
||
if c1 != "":
|
||
raise TokenUnhandledException(c1)
|
||
return c1
|
||
return content
|
||
|
||
def digest_date(self, content: str) -> str:
|
||
# 1. 尝试 ISO 格式: 2025-10-09T15:30 或 2025-10-09
|
||
iso_match = re.match(r"^(\d{4})-(\d{1,2})-(\d{1,2})(?:T(\d{1,2}):(\d{1,2}))?", content)
|
||
if iso_match:
|
||
year, month, day = int(iso_match.group(1)), int(iso_match.group(2)), int(iso_match.group(3))
|
||
try:
|
||
target_date = datetime.date(year, month, day)
|
||
self.time_spec_day_delta = (target_date - self.now.date()).days
|
||
remaining = content[len(iso_match.group(0)):]
|
||
if iso_match.group(4): # 有时间
|
||
hour = int(iso_match.group(4))
|
||
minute = int(iso_match.group(5)) if iso_match.group(5) else 0
|
||
self.hour_specification = hour
|
||
self.minute_specification = minute
|
||
return remaining
|
||
except ValueError:
|
||
pass
|
||
|
||
# 2. 尝试 YYYY年MM月DD日
|
||
full_date_match = re.match(r"^(\d{4})年(\d{1,2})月(\d{1,2})日", content)
|
||
if full_date_match:
|
||
year, month, day = map(int, full_date_match.groups())
|
||
try:
|
||
target_date = datetime.date(year, month, day)
|
||
self.time_spec_day_delta = (target_date - self.now.date()).days
|
||
return content[len(full_date_match.group(0)):]
|
||
except ValueError:
|
||
pass
|
||
|
||
# 3. 尝试 MM月DD日(默认今年)
|
||
md_match = re.match(r"^(\d{1,2})月(\d{1,2})日", content)
|
||
if md_match:
|
||
month, day = map(int, md_match.groups())
|
||
year = self.now.year
|
||
try:
|
||
target_date = datetime.date(year, month, day)
|
||
self.time_spec_day_delta = (target_date - self.now.date()).days
|
||
return content[len(md_match.group(0)):]
|
||
except ValueError:
|
||
pass
|
||
|
||
# 4. 尝试 YYYY/MM/DD
|
||
slash_full = re.match(r"^(\d{4})/(\d{1,2})/(\d{1,2})", content)
|
||
if slash_full:
|
||
year, month, day = map(int, slash_full.groups())
|
||
try:
|
||
target_date = datetime.date(year, month, day)
|
||
self.time_spec_day_delta = (target_date - self.now.date()).days
|
||
return content[len(slash_full.group(0)):]
|
||
except ValueError:
|
||
pass
|
||
|
||
# 5. 尝试 MM/DD
|
||
slash_md = re.match(r"^(\d{1,2})/(\d{1,2})", content)
|
||
if slash_md:
|
||
month, day = map(int, slash_md.groups())
|
||
year = self.now.year
|
||
try:
|
||
target_date = datetime.date(year, month, day)
|
||
self.time_spec_day_delta = (target_date - self.now.date()).days
|
||
return content[len(slash_md.group(0)):]
|
||
except ValueError:
|
||
pass
|
||
|
||
# 6. 中文月日:十月九日
|
||
cn_md_match = re.match(r"^([一二三四五六七八九十]+)月([一二三四五六七八九十]+)日", content)
|
||
if cn_md_match:
|
||
month_str, day_str = cn_md_match.groups()
|
||
_, month_num = self.digest_chinese_number(month_str + "日")
|
||
_, day_num = self.digest_chinese_number(day_str + "日")
|
||
if month_num is not None and day_num is not None:
|
||
year = self.now.year
|
||
try:
|
||
target_date = datetime.date(year, month_num, day_num)
|
||
self.time_spec_day_delta = (target_date - self.now.date()).days
|
||
return content[len(cn_md_match.group(0)):]
|
||
except ValueError:
|
||
pass
|
||
|
||
return content
|
||
|
||
def digest_time(self, content: str) -> str:
|
||
content = self.digest_single_hour(content)
|
||
return content
|
||
|
||
def _chinese_to_int_final(self, cn_str: str) -> int:
|
||
result = 0
|
||
num_section = 0
|
||
|
||
for char in cn_str:
|
||
if char in self.CN_NUM:
|
||
val = self.CN_NUM[char]
|
||
if val <= 9:
|
||
num_section = val
|
||
elif val >= 10:
|
||
if num_section == 0 and val == 10:
|
||
num_section = 1
|
||
|
||
result += num_section * val
|
||
num_section = 0
|
||
|
||
result += num_section
|
||
|
||
return result
|
||
|
||
def digest_chinese_number(self, content: str) -> tuple[str, int | None]:
|
||
"""
|
||
识别字符串开头的中文数字并将其转换为整数。
|
||
处理范围:零到九千九百九十九万九千九百九十九亿九千九百九十九万九千九百九十九(约10^16)
|
||
"""
|
||
CN_CHARS = "".join(self.CN_NUM.keys()) + "".join(self.CN_UNIT.keys())
|
||
m = re.match(f"^([{CN_CHARS}]+)", content)
|
||
|
||
if m is None:
|
||
return content, None
|
||
|
||
cn_num_str = m.group(1)
|
||
if not cn_num_str:
|
||
return content, None
|
||
|
||
remaining_content = content[len(cn_num_str) :]
|
||
|
||
if cn_num_str == "零":
|
||
return remaining_content, 0
|
||
if cn_num_str == "一":
|
||
return remaining_content, 1
|
||
if cn_num_str in self.CN_NUM and self.CN_NUM[cn_num_str] <= 9:
|
||
return remaining_content, self.CN_NUM[cn_num_str]
|
||
|
||
pattern = re.compile(r"([^万亿兆]*)([万亿兆]?)")
|
||
|
||
parts = pattern.findall(cn_num_str)
|
||
parts.reverse()
|
||
|
||
current_unit = 1
|
||
total_num = 0
|
||
|
||
for num_str, unit_char in parts:
|
||
if not num_str and not unit_char:
|
||
continue
|
||
|
||
if unit_char in self.CN_UNIT:
|
||
current_unit = self.CN_UNIT[unit_char]
|
||
|
||
if num_str:
|
||
section_num = self._chinese_to_int_final(num_str)
|
||
total_num += section_num * current_unit
|
||
|
||
if unit_char in self.CN_UNIT:
|
||
pass
|
||
elif not unit_char:
|
||
current_unit = 1
|
||
|
||
return remaining_content, total_num
|
||
|
||
def digest_number(self, content: str) -> tuple[str, int | None]:
|
||
c1, num = self.digest_chinese_number(content)
|
||
if num is not None:
|
||
return c1, num
|
||
m = re.match(r"^(\d+)(.+)$", content)
|
||
if m is not None:
|
||
return m.group(2), int(m.group(1))
|
||
return content, None
|
||
|
||
def digest_ampm_specific(self, content: str) -> str:
|
||
am_patterns = ["凌晨", "早上", "上午", "早晨", "早"]
|
||
pm_patterns = ["中午", "下午", "晚上", "傍晚", "晚"]
|
||
|
||
for pat in am_patterns:
|
||
if content.startswith(pat):
|
||
self.ampm_specification = "AM"
|
||
return content[len(pat):]
|
||
|
||
for pat in pm_patterns:
|
||
if content.startswith(pat):
|
||
self.ampm_specification = "PM"
|
||
if pat == '中午':
|
||
self.ampm_ismid = True
|
||
return content[len(pat):]
|
||
|
||
return content
|
||
|
||
def digest_single_hour(self, content: str) -> str:
|
||
c1, num = self.digest_number(content)
|
||
if num is None:
|
||
return content
|
||
if self.time_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
self.hour_specification = num
|
||
if c1.startswith("点"):
|
||
c1 = c1[1:]
|
||
elif c1.startswith("时"):
|
||
c1 = c1[1:]
|
||
if self.ampm_specification is None:
|
||
self.ampm_specification = "ABSOLUTE"
|
||
else:
|
||
return content
|
||
if c1.startswith('钟'):
|
||
c1 = c1[1:]
|
||
if c1.startswith('整'):
|
||
c1 = c1[1:]
|
||
self.minute_specification = 0
|
||
elif c1.startswith('半'):
|
||
c1 = c1[1:]
|
||
self.minute_specification = 30
|
||
if c1.startswith('钟'):
|
||
c1 = c1[1:]
|
||
return c1
|
||
|
||
def digest_ke(self, content: str) -> str:
|
||
for pat in ("一刻", "过一刻"):
|
||
if content.startswith(pat):
|
||
if self.minute_specification is not None:
|
||
raise MultipleSpecificationException()
|
||
self.minute_specification = 15
|
||
return content[len(pat):]
|
||
for pat in ("两刻", "过两刻"):
|
||
if content.startswith(pat):
|
||
if self.minute_specification is not None:
|
||
raise MultipleSpecificationException()
|
||
self.minute_specification = 30
|
||
return content[len(pat):]
|
||
for pat in ("三刻", "过三刻"):
|
||
if content.startswith(pat):
|
||
if self.minute_specification is not None:
|
||
raise MultipleSpecificationException()
|
||
self.minute_specification = 45
|
||
return content[len(pat):]
|
||
return content
|
||
|
||
def digest_early_late_hour(self, content: str) -> str:
|
||
if not (content.startswith("早") or content.startswith("晚")):
|
||
return content
|
||
|
||
if self.time_delta_triggered:
|
||
raise MultipleSpecificationException()
|
||
|
||
if self.hour_specification is not None:
|
||
raise MultipleSpecificationException()
|
||
if self.ampm_specification not in (None, "ABSOLUTE"):
|
||
raise MultipleSpecificationException()
|
||
|
||
prefix = "早" if content.startswith("早") else "晚"
|
||
rest = content[1:]
|
||
|
||
remaining, num = self.digest_number(rest)
|
||
if num is None:
|
||
return content
|
||
|
||
if not (0 <= num <= 12):
|
||
return content
|
||
|
||
if prefix == "早":
|
||
self.ampm_specification = "AM"
|
||
hour = num
|
||
if hour == 12:
|
||
hour = 0
|
||
else:
|
||
self.ampm_specification = "PM"
|
||
if num == 12:
|
||
hour = 0
|
||
else:
|
||
hour = num
|
||
|
||
self.hour_specification = hour
|
||
self.minute_specification = 0
|
||
|
||
return remaining
|
||
|
||
def _find_weekday(self, week_offset: int, target_weekday: int) -> datetime.datetime:
|
||
"""
|
||
计算相对周的目标星期几。
|
||
:param week_offset: 0=本周, 1=下周, -1=上周
|
||
:param target_weekday: Monday=0, Sunday=6 (与 datetime.weekday() 一致)
|
||
:return: 对应的 datetime(时间部分设为 00:00:00)
|
||
"""
|
||
# 本周一的日期(假设周一为每周开始)
|
||
today = self.now.date()
|
||
days_since_monday = today.weekday() # Monday is 0
|
||
this_monday = today - datetime.timedelta(days=days_since_monday)
|
||
|
||
# 目标周一
|
||
target_monday = this_monday + datetime.timedelta(weeks=week_offset)
|
||
|
||
# 目标星期几
|
||
target_date = target_monday + datetime.timedelta(days=target_weekday)
|
||
|
||
# 返回 datetime,时间归零(与“明天”行为一致)
|
||
return datetime.datetime.combine(target_date, datetime.time.min)
|
||
|
||
def digest_weekday_relative(self, content: str) -> str:
|
||
"""
|
||
支持:本周五、下周三、上周一、这周五、下周一 等
|
||
返回剩余字符串。
|
||
"""
|
||
# 星期映射(支持:星期一、周1、周五 等)
|
||
weekday_map = {
|
||
"一": 0, "1": 0,
|
||
"二": 1, "2": 1,
|
||
"三": 2, "3": 2,
|
||
"四": 3, "4": 3,
|
||
"五": 4, "5": 4,
|
||
"六": 5, "6": 5,
|
||
"日": 6, "天": 6, "7": 6,
|
||
}
|
||
|
||
# 周偏移映射
|
||
week_offset_map = {
|
||
"本周": 0,
|
||
"这周": 0,
|
||
"下周": 1,
|
||
"下下周": 2, # 可选扩展
|
||
"上周": -1,
|
||
"上上周": -2, # 可选扩展
|
||
}
|
||
|
||
# 尝试匹配 [周标识][星期限定]
|
||
for week_key, week_offset in week_offset_map.items():
|
||
if content.startswith(week_key):
|
||
rest = content[len(week_key):]
|
||
|
||
if rest.startswith("星期"):
|
||
rest = rest[2:]
|
||
elif rest.startswith("周"):
|
||
rest = rest[1:]
|
||
|
||
if rest and (c := rest[0]) in weekday_map:
|
||
target_wd = weekday_map[c]
|
||
rest = rest[1:]
|
||
else:
|
||
continue
|
||
|
||
if self.time_delta_triggered or self.time_spec_day_delta != 0:
|
||
raise MultipleSpecificationException()
|
||
|
||
target_dt = self._find_weekday(week_offset, target_wd)
|
||
|
||
# 设置 day delta 相对于 now 的 00:00
|
||
base_date = self.now.replace(hour=0, minute=0, second=0, microsecond=0).date()
|
||
delta_days = (target_dt.date() - base_date).days
|
||
self.time_spec_day_delta = delta_days
|
||
|
||
return rest
|
||
|
||
return content
|
||
|
||
def build(self) -> datetime.datetime:
|
||
t = self.now
|
||
if not self.time_delta_triggered:
|
||
t = t.replace(hour=0, minute=0, second=0, microsecond=0)
|
||
if self.hour_specification is not None:
|
||
hour = self.hour_specification
|
||
if self.ampm_specification == "AM":
|
||
if hour == 12:
|
||
hour = 0
|
||
elif self.ampm_specification == "PM":
|
||
if hour != 12:
|
||
hour += 12
|
||
elif self.ampm_ismid:
|
||
hour = 12
|
||
else:
|
||
hour = 0
|
||
self.time_spec_day_delta += 1
|
||
elif self.ampm_specification is None:
|
||
if hour < self.now.hour and hour < 13:
|
||
hour += 12
|
||
t = t.replace(hour=hour)
|
||
if self.minute_specification is not None:
|
||
t = t.replace(minute=self.minute_specification)
|
||
|
||
t += datetime.timedelta(days=self.time_spec_day_delta)
|
||
t += self.timedelta
|
||
return t
|
||
|
||
|
||
def parse(content: str) -> datetime.datetime:
    """Parse ``content`` with a new ``Parser`` anchored at the current time."""
    parser = Parser()
    return parser.parse(content)
|