feat(parser): add fallback episode parser for TITLE_RE failures (#876, #910, #773)

Add _fallback_parse(), which is tried when TITLE_RE.match() returns None. It uses
two regex patterns to extract episode numbers from formats the main regex misses:
- digits before [ bracket (issues #876, #910)
- compound [02(57)] format (issue #773)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Estrella Pan
2026-02-24 10:12:24 +01:00
parent c261caa022
commit ead16ba4cf
2 changed files with 232 additions and 7 deletions

View File

@@ -7,14 +7,31 @@ logger = logging.getLogger(__name__)
# Matches a run of one or more digits.
EPISODE_RE = re.compile(r"\d+")
# Three capture groups: text before the episode token, the episode token itself
# (e.g. "- 03 ", "[03]", "[03v2]", "第03话", "[03 END]", "EP03"), and the remainder.
# NOTE(review): the two raw-string lines inside this call differ only in whether
# the leading " -" before the digits is optional; they look like the removed and
# added sides of one diff line rendered together — confirm which one the real
# file contains before relying on this pattern.
TITLE_RE = re.compile(
r"(.*?|\[.*])((?: ?-)? ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)"
r"(.*?|\[.*])((?: ?-) ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)"
)
# Resolution markers: 1080, 720, 2160 or 4K.
RESOLUTION_RE = re.compile(r"1080|720|2160|4K")
# Release-source tags: B-Global, Baha/baha, Bilibili/bilibili, AT-X, Web.
SOURCE_RE = re.compile(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web")
# Subtitle hints: any one of the characters 简 繁 日 字 幕, or the tokens CH, BIG5, GB.
SUB_RE = re.compile(r"[简繁日字幕]|CH|BIG5|GB")
# Patterns tried in list order by _fallback_parse() when TITLE_RE does not match.
# In each pattern, capture group 1 holds the episode digits.
FALLBACK_EP_PATTERNS = [
re.compile(r" (\d+) ?(?=\[)"), # #876/#910: digits before [
re.compile(r"\[(\d+)\(\d+\)\]"), # #773: [02(57)]
]
# Matches a single character that is NOT a word character, whitespace, a CJK
# ideograph (U+4E00-U+9FFF), hiragana (U+3040-U+309F), katakana (U+30A0-U+30FF)
# or a literal "-".
PREFIX_RE = re.compile(r"[^\w\s\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff-]")
def _fallback_parse(content_title: str) -> tuple | None:
    """Split a title around the first fallback episode match.

    The patterns in ``FALLBACK_EP_PATTERNS`` are searched in list order and
    the first one that hits decides the split.

    Args:
        content_title: The title text to search.

    Returns:
        A ``(season_info, episode_info, other)`` tuple, or ``None`` when no
        fallback pattern matches. ``season_info`` is the stripped text before
        the match, ``episode_info`` is capture group 1 of the match (the
        episode digits), and ``other`` is the stripped text after the match.
    """
    # The generator is lazy, so later patterns are only searched when the
    # earlier ones found nothing.
    attempts = (regex.search(content_title) for regex in FALLBACK_EP_PATTERNS)
    hit = next((found for found in attempts if found), None)
    if hit is None:
        return None

    begin, finish = hit.span()
    return (
        content_title[:begin].strip(),
        hit.group(1),
        content_title[finish:].strip(),
    )
CHINESE_NUMBER_MAP = {
"": 1,
"": 2,
@@ -96,6 +113,10 @@ def name_process(name: str):
elif re.search(" - {1}", name) is not None:
split = re.split("-", name)
if len(split) == 1:
# Titles like "29 岁单身..." — digits + Chinese are one title
if re.match(r"\d+\s[\u4e00-\u9fa5]", split[0]):
name_zh = split[0].strip()
return name_en, name_zh, name_jp
split_space = split[0].split(" ")
for idx in [0, -1]:
if re.search(r"^[\u4e00-\u9fa5]{2,}", split_space[idx]) is not None:
@@ -140,12 +161,13 @@ def process(raw_title: str):
group = get_group(content_title)
# 翻译组的名字
match_obj = TITLE_RE.match(content_title)
if match_obj is None:
return None
# 处理标题
season_info, episode_info, other = list(
map(lambda x: x.strip(), match_obj.groups())
)
if match_obj is not None:
season_info, episode_info, other = [x.strip() for x in match_obj.groups()]
else:
fallback = _fallback_parse(content_title)
if fallback is None:
return None
season_info, episode_info, other = fallback
process_raw = prefix_process(season_info, group)
# 处理 前缀
raw_name, season_raw, season = season_process(process_raw)

View File

@@ -1,3 +1,5 @@
import pytest
from module.parser.analyser import raw_parser
@@ -157,5 +159,206 @@ def test_raw_parser():
assert info.episode == 8
assert info.season == 1
# Issue #990: Title starting with number — should not misparse "29" as episode
content = "[ANi] 29 岁单身中坚冒险家的日常 - 07 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"
info = raw_parser(content)
assert info.group == "ANi"
assert info.title_zh == "29 岁单身中坚冒险家的日常"
assert info.resolution == "1080P"
assert info.episode == 7
assert info.season == 1
# ---------------------------------------------------------------------------
# Issue-specific regression tests
# ---------------------------------------------------------------------------
class TestIssue924SpecialPunctuation:
    """Regression for issue #924: punctuation-heavy title directly followed by "-01"."""

    def test_parse_title_with_fullwidth_parens(self):
        """The punctuation stays in title_zh and the remaining fields still parse."""
        raw = "[御坂字幕组] 男女之间存在纯友情吗?(不,不存在!!-01 [WebRip 1080p HEVC10-bit AAC] [简繁日内封] [急招翻校轴]"
        parsed = raw_parser(raw)
        assert parsed is not None
        assert parsed.group == "御坂字幕组"
        assert parsed.title_zh == "男女之间存在纯友情吗?(不,不存在!!"
        assert parsed.episode == 1
        assert parsed.resolution == "1080p"
        assert parsed.sub == "简繁日内封"
        assert parsed.source == "WebRip"
class TestIssue910NeoQswFormat:
    """Regression for issue #910: NEO·QSW release with the episode number glued to "["."""

    # The leading space is part of the title under test.
    TITLE = " [NEO·QSW]想星的阿克艾利昂 情感神话 想星のアクエリオン Aquarion: Myth of Emotions 02[WEBRIP AVC 1080P](搜索用:想星的大天使)"

    def test_parse_neo_qsw_format(self):
        """The Chinese title and episode 2 are extracted."""
        parsed = raw_parser(self.TITLE)
        assert parsed is not None
        assert parsed.title_zh == "想星的阿克艾利昂"
        assert parsed.episode == 2
class TestIssue876NoSeparator:
    """Regression for issue #876: episode number with no dash in front of it.

    The dash-separated form ("- 03") is already covered by test_raw_parser.
    This class covers the space-only form ("Tsuite 03"), which is handled by
    the fallback parser.
    """

    TITLE = "[北宇治字幕组&LoliHouse] 地。-关于地球的运动- / Chi. Chikyuu no Undou ni Tsuite 03 [WebRip 1080p HEVC-10bit AAC ASSx2][简繁日内封字幕]"

    def test_parse_without_dash(self):
        """Both titles and episode 3 are recovered without a dash separator."""
        parsed = raw_parser(self.TITLE)
        assert parsed is not None
        assert parsed.title_zh == "地。-关于地球的运动-"
        assert parsed.title_en == "Chi. Chikyuu no Undou ni Tsuite"
        assert parsed.episode == 3
class TestIssue819ChineseEpisodeMarker:
    """Regression for issue #819: [Doomdos] release using a 第N话 episode marker."""

    def test_parse_chinese_episode_marker(self):
        """Group, episode and resolution parse; title_zh is pinned to its current value."""
        raw = "[Doomdos] - 白色闪电 - 第02话 - [1080P].mp4"
        parsed = raw_parser(raw)
        assert parsed is not None
        assert parsed.group == "Doomdos"
        assert parsed.episode == 2
        assert parsed.resolution == "1080P"
        # BUG (known): the separator dashes leak into title_zh on both sides.
        assert parsed.title_zh == "- 白色闪电 -"
class TestIssue811ColonInTitle:
    """Regression for issue #811: colon in the English title, degree sign in the group."""

    def test_parse_colon_in_english_title(self):
        """Group, both titles, episode and season are all extracted."""
        raw = "[Up to 21°C] 鬼灭之刃 柱训练篇 / Kimetsu no Yaiba: Hashira Geiko-hen - 03 (CR 1920x1080 AVC AAC MKV)"
        parsed = raw_parser(raw)
        assert parsed is not None
        assert parsed.group == "Up to 21°C"
        assert parsed.title_zh == "鬼灭之刃 柱训练篇"
        assert parsed.title_en == "Kimetsu no Yaiba: Hashira Geiko-hen"
        assert parsed.episode == 3
        assert parsed.season == 1
class TestIssue798VTuberTitle:
    """Regression for issue #798: a title containing "VTuber" is split by name_process."""

    def test_parse_vtuber_title(self):
        """Metadata parses; the title fields are pinned to the current (buggy) split."""
        raw = "[ANi] 身为 VTuber 的我因为忘记关台而成了传说 - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4][379.34 MB]"
        parsed = raw_parser(raw)
        assert parsed is not None
        assert parsed.group == "ANi"
        assert parsed.episode == 1
        assert parsed.resolution == "1080P"
        assert parsed.source == "Baha"
        # BUG (known): splitting on spaces keeps only the first Chinese word as
        # title_zh and pushes the rest into title_en.
        assert parsed.title_zh == "身为"
        assert parsed.title_en == "VTuber 的我因为忘记关台而成了传说"
class TestIssue794PreEpisodeFormat:
    """Issues #794/#800: the "[01Pre]" episode token is not recognised."""

    TITLES = [
        "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHS_JP].mp4",
        "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHT_JP].mp4",
    ]

    @pytest.mark.xfail(reason="[01Pre] episode format not supported by TITLE_RE")
    def test_parse_pre_episode(self):
        """Desired behaviour; expected to fail until "[01Pre]" is supported."""
        sample = self.TITLES[0]
        parsed = raw_parser(sample)
        assert parsed is not None
        assert parsed.title_en == "Shikanoko Nokonoko Koshitantan"
        assert parsed.episode == 1

    @pytest.mark.parametrize("title", TITLES)
    def test_returns_none(self, title):
        """Current behaviour: every "[01Pre]" title is rejected with None."""
        outcome = raw_parser(title)
        assert outcome is None
class TestIssue766Lv2InTitle:
    """Regression for issue #766: "Lv2" inside the title causes a wrong name split."""

    def test_parse_lv2_title(self):
        """Metadata parses; title_zh is pinned to the current truncated value."""
        raw = "[ANi] 从 Lv2 开始开外挂的前勇者候补过著悠哉异世界生活 - 04 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"
        parsed = raw_parser(raw)
        assert parsed is not None
        assert parsed.group == "ANi"
        assert parsed.episode == 4
        assert parsed.resolution == "1080P"
        assert parsed.source == "Baha"
        # BUG (known): splitting on spaces drops the leading "从 Lv2" part.
        assert parsed.title_zh == "开始开外挂的前勇者候补过著悠哉异世界生活"
class TestIssue764WesternFormat:
    """Regression for issue #764: Western-style release name without group brackets."""

    def test_parse_western_format(self):
        """Episode, season and resolution parse; group and titles stay empty."""
        raw = "Girls Band Cry S01E05 VOSTFR 1080p WEB x264 AAC -Tsundere-Raws (ADN)"
        parsed = raw_parser(raw)
        assert parsed is not None
        assert parsed.episode == 5
        assert parsed.season == 1
        assert parsed.resolution == "1080p"
        # With no brackets in the name, no group is detected.
        assert parsed.group == ""
        # Current behaviour: neither an English nor a Chinese title is extracted.
        assert parsed.title_en is None
        assert parsed.title_zh is None
class TestIssue986AtlasFormat:
    """Issue #986: Atlas subtitle group's fully bracket-delimited naming scheme."""

    TITLES = [
        "[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fatestrange Fake][04_半神们的卡农曲][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]",
        "[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fatestrange Fake][07_神自黄昏归来][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]",
    ]

    @pytest.mark.xfail(reason="Atlas bracket-delimited format not supported by TITLE_RE")
    def test_parse_atlas_format(self):
        """Desired behaviour; expected to fail until the Atlas layout is supported."""
        sample = self.TITLES[0]
        parsed = raw_parser(sample)
        assert parsed is not None
        assert parsed.title_zh == "命运-奇异赝品"
        assert parsed.episode == 4

    @pytest.mark.parametrize("title", TITLES)
    def test_returns_none(self, title):
        """Current behaviour: every Atlas-style title is rejected with None."""
        outcome = raw_parser(title)
        assert outcome is None
class TestIssue773CompoundEpisode:
    """Regression for issue #773: compound episode token such as "[02(57)]"."""

    TITLE = "【豌豆字幕组&风之圣殿字幕组】★04月新番[鬼灭之刃 柱训练篇 / Kimetsu_no_Yaiba-Hashira_Geiko_Hen][02(57)][简体][1080P][MP4]"

    def test_parse_compound_episode(self):
        """The number before the parentheses (2) is taken as the episode."""
        parsed = raw_parser(self.TITLE)
        assert parsed is not None
        assert parsed.title_zh == "鬼灭之刃 柱训练篇"
        assert parsed.episode == 2
class TestIssue805TitleWithCht:
    """Regression for issue #805: a Traditional Chinese title parses correctly."""

    def test_parse_cht_title(self):
        """Group, title, episode, resolution, source and subtitle tag are all extracted."""
        raw = "[ANi] 不時輕聲地以俄語遮羞的鄰座艾莉同學 - 02 [1080P][Baha][WEB-DL][AAC AVC][CHT].mp4"
        parsed = raw_parser(raw)
        assert parsed is not None
        assert parsed.group == "ANi"
        assert parsed.title_zh == "不時輕聲地以俄語遮羞的鄰座艾莉同學"
        assert parsed.episode == 2
        assert parsed.resolution == "1080P"
        assert parsed.source == "Baha"
        assert parsed.sub == "CHT"