654 lines
22 KiB
Python
654 lines
22 KiB
Python
import re
|
|
import datetime
|
|
from typing import Tuple, Optional, Dict, Any
|
|
|
|
from .err import MultipleSpecificationException, TokenUnhandledException
|
|
|
|
|
|
class Parser:
    """Parse Chinese natural-language date/time phrases into ``datetime`` values.

    The input is consumed left to right by a fixed list of token parsers
    (absolute date, relative date, relative time offset, period word, clock
    time). Each parser records what it found in a shared context dict; once
    the whole input is consumed the context is resolved against ``self.now``.

    Args:
        now: Reference instant used for relative expressions and for
            guessing AM/PM. Defaults to ``datetime.datetime.now()``.
    """

    # "两" is the colloquial form of 2 used before measure words and units.
    _CN_DIGITS = {
        "零": 0, "一": 1, "二": 2, "三": 3, "四": 4,
        "五": 5, "六": 6, "七": 7, "八": 8, "九": 9, "两": 2,
    }
    _CN_SMALL_UNITS = {"十": 10, "百": 100, "千": 1000}
    _CN_BIG_UNITS = (("亿", 100000000), ("万", 10000))
    _CN_NUMBER_CHARS = frozenset("零一二三四五六七八九两十百千万亿")

    _NUM = r"([0-9零一二三四五六七八九十两]+)"
    _DATE_NUM = r"(\d{1,2}|[零一二三四五六七八九十两]{1,3})"
    _DIRECTION = r"(以后|之后|以前|之前|后|前)"
    _FUTURE_WORDS = ("后", "以后", "之后")

    _ISO_DATETIME_RE = re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})T(\d{1,2}):(\d{2})")
    # Tried in order; two-group patterns take the year from ``self.now``.
    _DATE_PATTERNS = (
        re.compile(r"(\d{4})-(\d{1,2})-(\d{1,2})"),
        re.compile(r"(\d{4})/(\d{1,2})/(\d{1,2})"),
        re.compile(r"(\d{4})年" + _DATE_NUM + r"月" + _DATE_NUM + r"[日号]"),
        re.compile(_DATE_NUM + r"月" + _DATE_NUM + r"[日号]"),
    )

    # (phrase, AM/PM tag, normalised period word) - longest phrases first.
    _TODAY_VARIANTS = (
        ("今晚上", "PM", "晚上"),
        ("今晚", "PM", "晚"),
        ("今早", "AM", "早"),
        ("今天早上", "AM", "早上"),
        ("今天早晨", "AM", "早晨"),
        ("今天上午", "AM", "上午"),
        ("今天下午", "PM", "下午"),
        ("今天晚上", "PM", "晚上"),
        ("今天", None, None),
    )
    _DAY_OFFSETS = (
        ("明天", 1),
        ("后天", 2),
        ("大后天", 3),
        ("昨天", -1),
        ("前天", -2),
        ("大前天", -3),
    )
    _DAYS_RE = re.compile(r"(\d+|[零一二三四五六七八九十两]+)天" + _DIRECTION)
    _WEEKDAY_RE = re.compile(r"(本|上|下)周([一二三四五六日])")
    _WEEKDAYS = {"一": 0, "二": 1, "三": 2, "四": 3, "五": 4, "六": 5, "日": 6}

    # Longer words precede their one-character prefixes ("晚上" before "晚").
    _PERIODS = (
        ("上午", "AM"),
        ("早晨", "AM"),
        ("早上", "AM"),
        ("早", "AM"),
        ("中午", "PM"),
        ("下午", "PM"),
        ("晚上", "PM"),
        ("晚", "PM"),
        ("凌晨", "AM"),
    )

    _CLOCK_RE = re.compile(r"(\d{1,2}):(\d{2})")
    _DIGIT_HOUR_RE = re.compile(r"(\d{1,2})\s*([点时])")
    _BARE_DIGITS_RE = re.compile(r"(\d{1,2})")
    _MINUTE_RE = re.compile(r"(\d+|[零一二三四五六七八九十]+)分")
    _QUARTER_WORDS = (("一刻", 15), ("过一刻", 15), ("三刻", 45))

    _HALF_HOUR_RE = re.compile(r"半(?:个)?小时?" + _DIRECTION)
    # (pattern, value of one counted unit, fixed extra added to the total).
    _OFFSET_RULES = (
        (
            re.compile(_NUM + r"个半(?:小时?)?" + _DIRECTION),
            datetime.timedelta(hours=1),
            datetime.timedelta(minutes=30),
        ),
        (
            re.compile(_NUM + r"(?:个)?小时?" + _DIRECTION),
            datetime.timedelta(hours=1),
            datetime.timedelta(0),
        ),
        (
            re.compile(_NUM + r"分钟?" + _DIRECTION),
            datetime.timedelta(minutes=1),
            datetime.timedelta(0),
        ),
        (
            re.compile(_NUM + r"秒钟?" + _DIRECTION),
            datetime.timedelta(seconds=1),
            datetime.timedelta(0),
        ),
    )

    def __init__(self, now: Optional[datetime.datetime] = None):
        self.now = now or datetime.datetime.now()

    # ------------------------------------------------------------------
    # Number helpers
    # ------------------------------------------------------------------

    def digest_chinese_number(self, text: str) -> Tuple[str, int]:
        """Consume a leading Chinese numeral from ``text``.

        Returns:
            ``(rest, value)`` where ``rest`` is the text after the numeral.
            When ``text`` does not start with a numeral it is returned
            unchanged together with ``0``; compare ``rest`` with ``text`` to
            tell "no number" apart from a literal zero.
        """
        if not text:
            return text, 0
        end = 0
        while end < len(text) and text[end] in self._CN_NUMBER_CHARS:
            end += 1
        if end == 0:
            return text, 0
        return text[end:], self._cn_to_int(text[:end])

    @classmethod
    def _cn_to_int(cls, numeral: str) -> int:
        """Convert a string made only of Chinese numeral characters to an int."""
        if not numeral:
            return 0
        for unit, scale in cls._CN_BIG_UNITS:
            if unit in numeral:
                high, low = numeral.split(unit, 1)
                return cls._cn_to_int(high) * scale + cls._cn_to_int(low)
        total = 0
        pending = 0
        for ch in numeral:
            if ch == "零":
                # Placeholder zero ("一百零五"); carries no value of its own.
                continue
            if ch in cls._CN_DIGITS:
                pending = cls._CN_DIGITS[ch]
            else:
                # A bare unit such as "十" means one of that unit.
                total += (pending or 1) * cls._CN_SMALL_UNITS[ch]
                pending = 0
        return total + pending

    def _scan_chinese_number(self, text: str) -> Optional[Tuple[str, int]]:
        """Like ``digest_chinese_number`` but return ``None`` if nothing was consumed."""
        rest, value = self.digest_chinese_number(text)
        if len(rest) == len(text):
            return None
        return rest, value

    def _token_to_int(self, token: str) -> Optional[int]:
        """Convert a regex-captured number token (digits or Chinese) to an int.

        Returns ``None`` for tokens mixing both notations (e.g. ``"3十"``).
        """
        if token.isdigit():
            return int(token)
        rest, value = self.digest_chinese_number(token)
        if rest:
            return None
        return value

    @staticmethod
    def _drop_prefix(text: str, prefix: str) -> str:
        """Remove ``prefix`` (and following whitespace) from ``text`` if present."""
        if text.startswith(prefix):
            return text[len(prefix):].lstrip()
        return text

    # ------------------------------------------------------------------
    # Driver
    # ------------------------------------------------------------------

    def parse(self, text: str) -> datetime.datetime:
        """Parse ``text`` and return the ``datetime`` it denotes.

        Raises:
            TokenUnhandledException: if the input is empty or part of it
                cannot be recognised.
            MultipleSpecificationException: if a period or the minutes are
                specified more than once.
            ValueError: if a recognised date or ISO time has out-of-range
                fields (raised by ``datetime``).
        """
        text = text.strip()
        if not text:
            raise TokenUnhandledException("Empty input")

        ctx = {
            "date": None,
            "time": None,
            "relative_delta": None,
            "am_pm": None,
            "period_word": None,
            "has_time": False,
            "has_date": False,
            "ambiguous_hour": False,
            "is_24hour": False,
            "has_relative_date": False,
        }

        rest = self._parse_all(text, ctx)
        if rest.strip():
            raise TokenUnhandledException(f"Unparsed tokens: {rest.strip()}")

        return self._apply_context(ctx)

    def _parse_all(self, text: str, ctx: Dict[str, Any]) -> str:
        """Run the token parsers repeatedly until none makes progress.

        Returns the text that no parser could consume.
        """
        steps = (
            self._parse_absolute_date,
            self._parse_relative_date,
            self._parse_relative_time,
            self._parse_period,
            self._parse_time,
        )
        rest = text.lstrip()
        progressed = True
        while progressed:
            progressed = False
            for step in steps:
                remaining = step(rest, ctx)
                if remaining != rest:
                    # Restart from the highest-priority parser.
                    rest = remaining.lstrip()
                    progressed = True
                    break
        return rest

    def _add_delta(self, ctx, delta):
        """Accumulate ``delta`` into ``ctx["relative_delta"]``."""
        if ctx["relative_delta"] is None:
            ctx["relative_delta"] = delta
        else:
            ctx["relative_delta"] += delta

    def _signed(self, delta: datetime.timedelta, direction: str) -> datetime.timedelta:
        """Return ``delta`` for future direction words and ``-delta`` otherwise."""
        return delta if direction in self._FUTURE_WORDS else -delta

    # ------------------------------------------------------------------
    # Token parsers: each returns the unconsumed text, or ``text`` unchanged
    # ------------------------------------------------------------------

    def _parse_absolute_date(self, text: str, ctx: Dict[str, Any]) -> str:
        """Consume an ISO date-time or a Y-M-D / Y/M/D / 年月日 / 月日 date."""
        text = text.lstrip()

        m = self._ISO_DATETIME_RE.match(text)
        if m:
            y, mth, d, h, minute = map(int, m.groups())
            ctx["date"] = datetime.date(y, mth, d)
            ctx["time"] = datetime.time(h, minute)
            ctx["has_date"] = True
            ctx["has_time"] = True
            ctx["is_24hour"] = True
            return text[m.end():]

        for pattern in self._DATE_PATTERNS:
            m = pattern.match(text)
            if m is None:
                continue
            fields = [self._token_to_int(token) for token in m.groups()]
            if None in fields:
                return text
            if len(fields) == 2:
                # Month/day only: assume the reference year.
                fields.insert(0, self.now.year)
            ctx["date"] = datetime.date(*fields)
            ctx["has_date"] = True
            return text[m.end():]

        return text

    def _parse_relative_date(self, text: str, ctx: Dict[str, Any]) -> str:
        """Consume a day-relative phrase (今天/明天/..., N天后, 本/上/下周X)."""
        text = text.lstrip()

        for phrase, tag, period_word in self._TODAY_VARIANTS:
            if text.startswith(phrase):
                self._add_delta(ctx, datetime.timedelta(days=0))
                ctx["has_relative_date"] = True
                if tag is not None and ctx["am_pm"] is None:
                    ctx["am_pm"] = tag
                    # Store the bare period word so the 晚上-12点 midnight
                    # rule in _apply_context also applies to 今晚 / 今天晚上.
                    ctx["period_word"] = period_word
                return text[len(phrase):]

        for word, days in self._DAY_OFFSETS:
            if text.startswith(word):
                self._add_delta(ctx, datetime.timedelta(days=days))
                ctx["has_relative_date"] = True
                return text[len(word):]

        m = self._DAYS_RE.match(text)
        if m:
            count = self._token_to_int(m.group(1))
            if count is None:
                return text
            self._add_delta(
                ctx, self._signed(datetime.timedelta(days=count), m.group(2))
            )
            ctx["has_relative_date"] = True
            return text[m.end():]

        m = self._WEEKDAY_RE.match(text)
        if m:
            scope, day = m.groups()
            # Offset from today's weekday to the target in the current week,
            # then shifted a whole week for 上 / 下.
            offset = self._WEEKDAYS[day] - self.now.weekday()
            if scope == "上":
                offset -= 7
            elif scope == "下":
                offset += 7
            self._add_delta(ctx, datetime.timedelta(days=offset))
            ctx["has_relative_date"] = True
            return text[m.end():]

        return text

    def _parse_period(self, text: str, ctx: Dict[str, Any]) -> str:
        """Consume a period-of-day word and record it as AM/PM.

        Raises:
            MultipleSpecificationException: if a period was already recorded.
        """
        text = text.lstrip()
        for word, tag in self._PERIODS:
            if text.startswith(word):
                if ctx["am_pm"] is not None:
                    raise MultipleSpecificationException("Multiple periods")
                ctx["am_pm"] = tag
                ctx["period_word"] = word
                return text[len(word):]
        return text

    def _parse_time(self, text: str, ctx: Dict[str, Any]) -> str:
        """Consume a clock time (``H:MM``, ``N点...``, ``N时...``).

        A bare number is accepted as an hour only after a period word.
        The context is modified only when a time is actually consumed.

        Raises:
            MultipleSpecificationException: if minutes are given twice.
        """
        if ctx["has_time"]:
            return text
        text = text.lstrip()

        clock = self._CLOCK_RE.match(text)
        if clock:
            h, minute = int(clock.group(1)), int(clock.group(2))
            if 0 <= h <= 23 and 0 <= minute <= 59:
                ctx["time"] = datetime.time(h, minute)
                ctx["has_time"] = True
                ctx["ambiguous_hour"] = 1 <= h <= 12
                ctx["is_24hour"] = h > 12 or h == 0
                return text[clock.end():]

        hour_part = self._read_hour(text, ctx)
        if hour_part is None:
            return text
        hour, is_24hour_format, rest = hour_part
        if not 0 <= hour <= 23:
            return text

        minute_part = self._read_minute(rest)
        if minute_part is None:
            return text
        minute, rest = minute_part

        # Hours 0 and 13-23 are unambiguous even when written with "点".
        if hour >= 13 or hour == 0:
            is_24hour_format = True

        ctx["time"] = datetime.time(hour, minute)
        ctx["has_time"] = True
        ctx["ambiguous_hour"] = 1 <= hour <= 12 and not is_24hour_format
        ctx["is_24hour"] = is_24hour_format
        return rest

    def _read_hour(
        self, text: str, ctx: Dict[str, Any]
    ) -> Optional[Tuple[int, bool, str]]:
        """Read the hour part; return ``(hour, is_24hour, rest)`` or ``None``."""
        scanned = self._scan_chinese_number(text)
        if scanned is not None:
            after, value = scanned
            after = after.lstrip()
            if after.startswith("点"):
                return value, False, after[1:]
            if after.startswith("时"):
                return value, True, after[1:]

        m = self._DIGIT_HOUR_RE.match(text)
        if m:
            return int(m.group(1)), m.group(2) == "时", text[m.end():]

        if ctx.get("am_pm") is not None:
            # After a period word a bare number is an hour ("下午三", "晚上8").
            if scanned is not None and 0 <= scanned[1] <= 23:
                return scanned[1], False, scanned[0].lstrip()
            m = self._BARE_DIGITS_RE.match(text)
            if m and 0 <= int(m.group(1)) <= 23:
                return int(m.group(1)), False, text[m.end():].lstrip()

        return None

    def _read_minute(self, text: str) -> Optional[Tuple[int, str]]:
        """Read the optional minute part following an hour.

        Returns ``(minute, rest)``, or ``None`` when the minute value is out
        of range. Raises ``MultipleSpecificationException`` when more than
        one minute form is present.
        """
        rest = self._drop_prefix(text.lstrip(), "钟")
        minute = 0
        spec_count = 0

        on_the_hour = rest.startswith("整")
        if on_the_hour:
            rest = rest[1:].lstrip()

        if rest.startswith("半"):
            minute = 30
            spec_count += 1
            rest = self._drop_prefix(rest[1:].lstrip(), "钟")
            rest = self._drop_prefix(rest, "整")

        for word, value in self._QUARTER_WORDS:
            if rest.startswith(word):
                minute = value
                spec_count += 1
                rest = self._drop_prefix(rest[len(word):].lstrip(), "钟")

        m = self._MINUTE_RE.match(rest)
        if m:
            spec_count += 1
            token = m.group(1)
            if token.isdigit():
                minute = int(token)
            else:
                _, minute = self.digest_chinese_number(token)
            rest = rest[m.end():].lstrip()

        if spec_count == 0:
            # Bare trailing number: "三点十五", "3点30".
            scanned = self._scan_chinese_number(rest)
            if scanned is not None and 0 < scanned[1] <= 59:
                minute = scanned[1]
                spec_count += 1
                rest = scanned[0].lstrip()
            else:
                m = self._BARE_DIGITS_RE.match(rest)
                if m and 0 <= int(m.group(1)) <= 59:
                    minute = int(m.group(1))
                    spec_count += 1
                    rest = rest[m.end():].lstrip()

        if on_the_hour and spec_count == 0:
            spec_count = 1

        if spec_count > 1:
            raise MultipleSpecificationException("Multiple minute specifications")

        if not 0 <= minute <= 59:
            return None
        return minute, rest

    def _parse_relative_time(self, text: str, ctx: Dict[str, Any]) -> str:
        """Consume an offset such as 半小时后, 两个半小时前, 3分钟后 or 十秒后.

        Counts must be positive; a zero or malformed count leaves the text
        unconsumed.
        """
        text = text.lstrip()

        m = self._HALF_HOUR_RE.match(text)
        if m:
            self._add_delta(
                ctx, self._signed(datetime.timedelta(minutes=30), m.group(1))
            )
            return text[m.end():]

        for pattern, unit, extra in self._OFFSET_RULES:
            m = pattern.match(text)
            if m is None:
                continue
            count = self._token_to_int(m.group(1))
            if count is None or count <= 0:
                return text
            self._add_delta(ctx, self._signed(unit * count + extra, m.group(2)))
            return text[m.end():]

        return text

    # ------------------------------------------------------------------
    # Resolution
    # ------------------------------------------------------------------

    def _apply_context(self, ctx: Dict[str, Any]) -> datetime.datetime:
        """Resolve the collected context against ``self.now``.

        Order of application: relative offset, then absolute date, then
        clock time. Without a clock time, a date (absolute or relative)
        resolves to midnight and a pure offset keeps the current time of day.
        """
        result = self.now
        if ctx["relative_delta"] is not None:
            result = result + ctx["relative_delta"]

        if ctx["has_date"]:
            date = ctx["date"]
            result = result.replace(year=date.year, month=date.month, day=date.day)

        if not ctx["has_time"]:
            if ctx["has_date"] or ctx["has_relative_date"]:
                result = result.replace(hour=0, minute=0, second=0, microsecond=0)
            return result

        h = ctx["time"].hour
        minute = ctx["time"].minute
        period = ctx["am_pm"]

        # An explicit period wins over the 24-hour flag so "下午3时" is 15:00;
        # hours 0 and 13-23 pass through both branches unchanged.
        if period == "AM":
            if h == 12:
                h = 0
        elif period == "PM":
            if h == 12:
                if ctx.get("period_word") in ("晚上", "晚"):
                    # "晚上12点" is midnight at the start of the next day.
                    h = 0
                    result += datetime.timedelta(days=1)
            elif 1 <= h <= 11:
                h += 12
        elif ctx["is_24hour"]:
            pass
        elif ctx["has_relative_date"]:
            # "明天五点" is read as a morning hour; 12 maps to 00:00.
            if h == 12:
                h = 0
        elif h != 12:
            # No hint at all: use the morning reading unless it is already
            # in the past relative to now, in which case use the afternoon.
            morning = result.replace(hour=h, minute=minute, second=0, microsecond=0)
            if morning < self.now:
                h += 12

        return result.replace(hour=h, minute=minute, second=0, microsecond=0)
|
|
|
|
|
|
def parse(text: str) -> datetime.datetime:
    """Parse ``text`` with a fresh ``Parser`` anchored at the current time.

    Convenience wrapper around ``Parser().parse``; any exception raised by
    that method propagates unchanged.
    """
    parser = Parser()
    return parser.parse(text)
|