feat(parser): add fallback episode parser for TITLE_RE failures (#876, #910, #773)

Add _fallback_parse(), which is tried when TITLE_RE.match() returns None. It uses two regex patterns to extract episode numbers from formats the main regex misses:

- digits directly before a [ bracket (issues #876, #910)
- compound [02(57)] format (issue #773)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-05-05 00:15:01 +08:00 · 2026-02-24 10:12:24 +01:00
parent c261caa022
commit ead16ba4cf
2 changed files with 232 additions and 7 deletions
--- a/backend/src/module/parser/analyser/raw_parser.py
+++ b/backend/src/module/parser/analyser/raw_parser.py
@@ -7,14 +7,31 @@ logger = logging.getLogger(__name__)

 EPISODE_RE = re.compile(r"\d+")
 TITLE_RE = re.compile(
-    r"(.*?|\[.*])((?: ?-)? ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)"
+    r"(.*?|\[.*])((?: ?-) ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)"
 )
 RESOLUTION_RE = re.compile(r"1080|720|2160|4K")
 SOURCE_RE = re.compile(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web")
 SUB_RE = re.compile(r"[简繁日字幕]|CH|BIG5|GB")

+FALLBACK_EP_PATTERNS = [
+    re.compile(r" (\d+) ?(?=\[)"),       # #876/#910: digits before [
+    re.compile(r"\[(\d+)\(\d+\)\]"),      # #773: [02(57)]
+]
+
 PREFIX_RE = re.compile(r"[^\w\s\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff-]")

+
+def _fallback_parse(content_title: str) -> tuple | None:
+    """Try fallback regex patterns when TITLE_RE fails."""
+    for pattern in FALLBACK_EP_PATTERNS:
+        m = pattern.search(content_title)
+        if m:
+            season_info = content_title[: m.start()].strip()
+            episode_info = m.group(1)
+            other = content_title[m.end() :].strip()
+            return season_info, episode_info, other
+    return None
+
 CHINESE_NUMBER_MAP = {
    "一": 1,
    "二": 2,
@@ -96,6 +113,10 @@ def name_process(name: str):
        elif re.search(" - {1}", name) is not None:
            split = re.split("-", name)
    if len(split) == 1:
+        # Titles like "29 岁单身..." start with a number followed by Chinese text; keep the whole string as a single Chinese title
+        if re.match(r"\d+\s[\u4e00-\u9fa5]", split[0]):
+            name_zh = split[0].strip()
+            return name_en, name_zh, name_jp
        split_space = split[0].split(" ")
        for idx in [0, -1]:
            if re.search(r"^[\u4e00-\u9fa5]{2,}", split_space[idx]) is not None:
@@ -140,12 +161,13 @@ def process(raw_title: str):
    group = get_group(content_title)
    # 翻译组的名字
    match_obj = TITLE_RE.match(content_title)
-    if match_obj is None:
-        return None
-    # 处理标题
-    season_info, episode_info, other = list(
-        map(lambda x: x.strip(), match_obj.groups())
-    )
+    if match_obj is not None:
+        season_info, episode_info, other = [x.strip() for x in match_obj.groups()]
+    else:
+        fallback = _fallback_parse(content_title)
+        if fallback is None:
+            return None
+        season_info, episode_info, other = fallback
    process_raw = prefix_process(season_info, group)
    # 处理 前缀
    raw_name, season_raw, season = season_process(process_raw)
--- a/backend/src/test/test_raw_parser.py
+++ b/backend/src/test/test_raw_parser.py
@@ -1,3 +1,5 @@
+import pytest
+
 from module.parser.analyser import raw_parser


@@ -157,5 +159,206 @@ def test_raw_parser():
    assert info.episode == 8
    assert info.season == 1

+    # Issue #990: Title starting with number — should not misparse "29" as episode
+    content = "[ANi] 29 岁单身中坚冒险家的日常 - 07 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"
+    info = raw_parser(content)
+    assert info.group == "ANi"
+    assert info.title_zh == "29 岁单身中坚冒险家的日常"
+    assert info.resolution == "1080P"
+    assert info.episode == 7
+    assert info.season == 1


+# ---------------------------------------------------------------------------
+# Issue-specific regression tests
+# ---------------------------------------------------------------------------
+
+
+class TestIssue924SpecialPunctuation:
+    """Issue #924: Title with full-width parentheses and exclamation marks."""
+
+    def test_parse_title_with_fullwidth_parens(self):
+        content = "[御坂字幕组] 男女之间存在纯友情吗？（不，不存在!!）-01 [WebRip 1080p HEVC10-bit AAC] [简繁日内封] [急招翻校轴]"
+        info = raw_parser(content)
+        assert info is not None
+        assert info.group == "御坂字幕组"
+        assert info.title_zh == "男女之间存在纯友情吗？（不，不存在!!）"
+        assert info.episode == 1
+        assert info.resolution == "1080p"
+        assert info.sub == "简繁日内封"
+        assert info.source == "WebRip"
+
+
+class TestIssue910NeoQswFormat:
+    """Issue #910: NEO·QSW group format with inline episode number."""
+
+    TITLE = " [NEO·QSW]想星的阿克艾利昂 情感神话 想星のアクエリオン Aquarion: Myth of Emotions 02[WEBRIP AVC 1080P]（搜索用：想星的大天使）"
+
+    def test_parse_neo_qsw_format(self):
+        info = raw_parser(self.TITLE)
+        assert info is not None
+        assert info.title_zh == "想星的阿克艾利昂"
+        assert info.episode == 2
+
+
+class TestIssue876NoSeparator:
+    """Issue #876: Episode number without dash separator.
+
+    Note: the dash-separated variant "- 03" already works (tested in test_raw_parser).
+    This tests the space-only variant "Tsuite 03" which the fallback parser handles.
+    """
+
+    TITLE = "[北宇治字幕组&LoliHouse] 地。-关于地球的运动- / Chi. Chikyuu no Undou ni Tsuite 03 [WebRip 1080p HEVC-10bit AAC ASSx2][简繁日内封字幕]"
+
+    def test_parse_without_dash(self):
+        info = raw_parser(self.TITLE)
+        assert info is not None
+        assert info.title_zh == "地。-关于地球的运动-"
+        assert info.title_en == "Chi. Chikyuu no Undou ni Tsuite"
+        assert info.episode == 3
+
+
+class TestIssue819ChineseEpisodeMarker:
+    """Issue #819: [Doomdos] format with 第N话 episode marker."""
+
+    def test_parse_chinese_episode_marker(self):
+        content = "[Doomdos] - 白色闪电 - 第02话 - [1080P].mp4"
+        info = raw_parser(content)
+        assert info is not None
+        assert info.group == "Doomdos"
+        assert info.episode == 2
+        assert info.resolution == "1080P"
+        # BUG: title_zh includes leading/trailing dashes from the separator
+        assert info.title_zh == "- 白色闪电 -"
+
+
+class TestIssue811ColonInTitle:
+    """Issue #811: Title with colon and degree symbol in group name."""
+
+    def test_parse_colon_in_english_title(self):
+        content = "[Up to 21°C] 鬼灭之刃 柱训练篇 / Kimetsu no Yaiba: Hashira Geiko-hen - 03 (CR 1920x1080 AVC AAC MKV)"
+        info = raw_parser(content)
+        assert info is not None
+        assert info.group == "Up to 21°C"
+        assert info.title_zh == "鬼灭之刃 柱训练篇"
+        assert info.title_en == "Kimetsu no Yaiba: Hashira Geiko-hen"
+        assert info.episode == 3
+        assert info.season == 1
+
+
+class TestIssue798VTuberTitle:
+    """Issue #798: Title with 'VTuber' split incorrectly by name_process."""
+
+    def test_parse_vtuber_title(self):
+        content = "[ANi] 身为 VTuber 的我因为忘记关台而成了传说 - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4][379.34 MB]"
+        info = raw_parser(content)
+        assert info is not None
+        assert info.group == "ANi"
+        assert info.episode == 1
+        assert info.resolution == "1080P"
+        assert info.source == "Baha"
+        # BUG: name_process splits on space and only keeps first Chinese word
+        assert info.title_zh == "身为"
+        assert info.title_en == "VTuber 的我因为忘记关台而成了传说"
+
+
+class TestIssue794PreEpisodeFormat:
+    """Issue #794/#800: [01Pre] episode format not recognized."""
+
+    TITLES = [
+        "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHS_JP].mp4",
+        "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHT_JP].mp4",
+    ]
+
+    @pytest.mark.xfail(reason="[01Pre] episode format not supported by TITLE_RE")
+    def test_parse_pre_episode(self):
+        info = raw_parser(self.TITLES[0])
+        assert info is not None
+        assert info.title_en == "Shikanoko Nokonoko Koshitantan"
+        assert info.episode == 1
+
+    @pytest.mark.parametrize("title", TITLES)
+    def test_returns_none(self, title):
+        """Parser cannot handle [01Pre] format currently."""
+        assert raw_parser(title) is None
+
+
+class TestIssue766Lv2InTitle:
+    """Issue #766: Title with 'Lv2' causing incorrect name split."""
+
+    def test_parse_lv2_title(self):
+        content = "[ANi]  从 Lv2 开始开外挂的前勇者候补过著悠哉异世界生活 - 04 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"
+        info = raw_parser(content)
+        assert info is not None
+        assert info.group == "ANi"
+        assert info.episode == 4
+        assert info.resolution == "1080P"
+        assert info.source == "Baha"
+        # BUG: name_process splits on space, loses the "从 Lv2" prefix
+        assert info.title_zh == "开始开外挂的前勇者候补过著悠哉异世界生活"
+
+
+class TestIssue764WesternFormat:
+    """Issue #764: Western release format without group brackets."""
+
+    def test_parse_western_format(self):
+        content = "Girls Band Cry S01E05 VOSTFR 1080p WEB x264 AAC -Tsundere-Raws (ADN)"
+        info = raw_parser(content)
+        assert info is not None
+        assert info.episode == 5
+        assert info.season == 1
+        assert info.resolution == "1080p"
+        # No brackets → group detection fails
+        assert info.group == ""
+        # No CJK chars → no title_zh/jp; EN detection also fails (short segments)
+        assert info.title_en is None
+        assert info.title_zh is None
+
+
+class TestIssue986AtlasFormat:
+    """Issue #986: Atlas subtitle group bracket-delimited format."""
+
+    TITLES = [
+        "[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fate／strange Fake][04_半神们的卡农曲][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]",
+        "[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fate／strange Fake][07_神自黄昏归来][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]",
+    ]
+
+    @pytest.mark.xfail(reason="Atlas bracket-delimited format not supported by TITLE_RE")
+    def test_parse_atlas_format(self):
+        info = raw_parser(self.TITLES[0])
+        assert info is not None
+        assert info.title_zh == "命运-奇异赝品"
+        assert info.episode == 4
+
+    @pytest.mark.parametrize("title", TITLES)
+    def test_returns_none(self, title):
+        """Parser cannot handle Atlas format currently."""
+        assert raw_parser(title) is None
+
+
+class TestIssue773CompoundEpisode:
+    """Issue #773: Compound episode number [02(57)] not recognized."""
+
+    TITLE = "【豌豆字幕组&风之圣殿字幕组】★04月新番[鬼灭之刃 柱训练篇 / Kimetsu_no_Yaiba-Hashira_Geiko_Hen][02(57)][简体][1080P][MP4]"
+
+    def test_parse_compound_episode(self):
+        info = raw_parser(self.TITLE)
+        assert info is not None
+        assert info.title_zh == "鬼灭之刃 柱训练篇"
+        assert info.episode == 2
+
+
+class TestIssue805TitleWithCht:
+    """Issue #805: Traditional Chinese title parses correctly."""
+
+    def test_parse_cht_title(self):
+        content = "[ANi] 不時輕聲地以俄語遮羞的鄰座艾莉同學 - 02 [1080P][Baha][WEB-DL][AAC AVC][CHT].mp4"
+        info = raw_parser(content)
+        assert info is not None
+        assert info.group == "ANi"
+        assert info.title_zh == "不時輕聲地以俄語遮羞的鄰座艾莉同學"
+        assert info.episode == 2
+        assert info.resolution == "1080P"
+        assert info.source == "Baha"
+        assert info.sub == "CHT"
+