From ead16ba4cf6669ab2830bb739defa19c3b705be0 Mon Sep 17 00:00:00 2001 From: Estrella Pan Date: Tue, 24 Feb 2026 10:12:24 +0100 Subject: [PATCH] feat(parser): add fallback episode parser for TITLE_RE failures (#876, #910, #773) Add _fallback_parse(), which is tried when TITLE_RE.match() returns None and uses two regex patterns to extract episode numbers from formats the main regex misses: - digits before a [ bracket (issues #876, #910) - compound [02(57)] format (issue #773) Also make the dash in TITLE_RE's "- NN " episode alternative mandatory, and have name_process keep a title that starts with digits followed by Chinese text (e.g. "29 岁单身...", issue #990) as a single Chinese title instead of splitting it. Co-Authored-By: Claude Opus 4.6 --- .../src/module/parser/analyser/raw_parser.py | 36 +++- backend/src/test/test_raw_parser.py | 203 ++++++++++++++++++ 2 files changed, 232 insertions(+), 7 deletions(-) diff --git a/backend/src/module/parser/analyser/raw_parser.py b/backend/src/module/parser/analyser/raw_parser.py index c2ab641c..bfe0ab6a 100644 --- a/backend/src/module/parser/analyser/raw_parser.py +++ b/backend/src/module/parser/analyser/raw_parser.py @@ -7,14 +7,31 @@ logger = logging.getLogger(__name__) EPISODE_RE = re.compile(r"\d+") TITLE_RE = re.compile( - r"(.*?|\[.*])((?: ?-)? 
?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)" + r"(.*?|\[.*])((?: ?-) ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)" ) RESOLUTION_RE = re.compile(r"1080|720|2160|4K") SOURCE_RE = re.compile(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web") SUB_RE = re.compile(r"[简繁日字幕]|CH|BIG5|GB") +FALLBACK_EP_PATTERNS = [ + re.compile(r" (\d+) ?(?=\[)"), # #876/#910: digits before [ + re.compile(r"\[(\d+)\(\d+\)\]"), # #773: [02(57)] +] + PREFIX_RE = re.compile(r"[^\w\s\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff-]") + +def _fallback_parse(content_title: str) -> tuple | None: + """Try fallback regex patterns when TITLE_RE fails.""" + for pattern in FALLBACK_EP_PATTERNS: + m = pattern.search(content_title) + if m: + season_info = content_title[: m.start()].strip() + episode_info = m.group(1) + other = content_title[m.end() :].strip() + return season_info, episode_info, other + return None + CHINESE_NUMBER_MAP = { "一": 1, "二": 2, @@ -96,6 +113,10 @@ def name_process(name: str): elif re.search(" - {1}", name) is not None: split = re.split("-", name) if len(split) == 1: + # Titles like "29 岁单身..." 
— digits + Chinese are one title + if re.match(r"\d+\s[\u4e00-\u9fa5]", split[0]): + name_zh = split[0].strip() + return name_en, name_zh, name_jp split_space = split[0].split(" ") for idx in [0, -1]: if re.search(r"^[\u4e00-\u9fa5]{2,}", split_space[idx]) is not None: @@ -140,12 +161,13 @@ def process(raw_title: str): group = get_group(content_title) # 翻译组的名字 match_obj = TITLE_RE.match(content_title) - if match_obj is None: - return None - # 处理标题 - season_info, episode_info, other = list( - map(lambda x: x.strip(), match_obj.groups()) - ) + if match_obj is not None: + season_info, episode_info, other = [x.strip() for x in match_obj.groups()] + else: + fallback = _fallback_parse(content_title) + if fallback is None: + return None + season_info, episode_info, other = fallback process_raw = prefix_process(season_info, group) # 处理 前缀 raw_name, season_raw, season = season_process(process_raw) diff --git a/backend/src/test/test_raw_parser.py b/backend/src/test/test_raw_parser.py index 82f8be8e..398da338 100644 --- a/backend/src/test/test_raw_parser.py +++ b/backend/src/test/test_raw_parser.py @@ -1,3 +1,5 @@ +import pytest + from module.parser.analyser import raw_parser @@ -157,5 +159,206 @@ def test_raw_parser(): assert info.episode == 8 assert info.season == 1 + # Issue #990: Title starting with number — should not misparse "29" as episode + content = "[ANi] 29 岁单身中坚冒险家的日常 - 07 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]" + info = raw_parser(content) + assert info.group == "ANi" + assert info.title_zh == "29 岁单身中坚冒险家的日常" + assert info.resolution == "1080P" + assert info.episode == 7 + assert info.season == 1 +# --------------------------------------------------------------------------- +# Issue-specific regression tests +# --------------------------------------------------------------------------- + + +class TestIssue924SpecialPunctuation: + """Issue #924: Title with full-width parentheses and exclamation marks.""" + + def test_parse_title_with_fullwidth_parens(self): + 
content = "[御坂字幕组] 男女之间存在纯友情吗?(不,不存在!!)-01 [WebRip 1080p HEVC10-bit AAC] [简繁日内封] [急招翻校轴]" + info = raw_parser(content) + assert info is not None + assert info.group == "御坂字幕组" + assert info.title_zh == "男女之间存在纯友情吗?(不,不存在!!)" + assert info.episode == 1 + assert info.resolution == "1080p" + assert info.sub == "简繁日内封" + assert info.source == "WebRip" + + +class TestIssue910NeoQswFormat: + """Issue #910: NEO·QSW group format with inline episode number.""" + + TITLE = " [NEO·QSW]想星的阿克艾利昂 情感神话 想星のアクエリオン Aquarion: Myth of Emotions 02[WEBRIP AVC 1080P](搜索用:想星的大天使)" + + def test_parse_neo_qsw_format(self): + info = raw_parser(self.TITLE) + assert info is not None + assert info.title_zh == "想星的阿克艾利昂" + assert info.episode == 2 + + +class TestIssue876NoSeparator: + """Issue #876: Episode number without dash separator. + + Note: the dash-separated variant "- 03" already works (tested in test_raw_parser). + This tests the space-only variant "Tsuite 03" which the fallback parser handles. + """ + + TITLE = "[北宇治字幕组&LoliHouse] 地。-关于地球的运动- / Chi. Chikyuu no Undou ni Tsuite 03 [WebRip 1080p HEVC-10bit AAC ASSx2][简繁日内封字幕]" + + def test_parse_without_dash(self): + info = raw_parser(self.TITLE) + assert info is not None + assert info.title_zh == "地。-关于地球的运动-" + assert info.title_en == "Chi. 
Chikyuu no Undou ni Tsuite" + assert info.episode == 3 + + +class TestIssue819ChineseEpisodeMarker: + """Issue #819: [Doomdos] format with 第N话 episode marker.""" + + def test_parse_chinese_episode_marker(self): + content = "[Doomdos] - 白色闪电 - 第02话 - [1080P].mp4" + info = raw_parser(content) + assert info is not None + assert info.group == "Doomdos" + assert info.episode == 2 + assert info.resolution == "1080P" + # BUG: title_zh includes leading/trailing dashes from the separator + assert info.title_zh == "- 白色闪电 -" + + +class TestIssue811ColonInTitle: + """Issue #811: Title with colon and degree symbol in group name.""" + + def test_parse_colon_in_english_title(self): + content = "[Up to 21°C] 鬼灭之刃 柱训练篇 / Kimetsu no Yaiba: Hashira Geiko-hen - 03 (CR 1920x1080 AVC AAC MKV)" + info = raw_parser(content) + assert info is not None + assert info.group == "Up to 21°C" + assert info.title_zh == "鬼灭之刃 柱训练篇" + assert info.title_en == "Kimetsu no Yaiba: Hashira Geiko-hen" + assert info.episode == 3 + assert info.season == 1 + + +class TestIssue798VTuberTitle: + """Issue #798: Title with 'VTuber' split incorrectly by name_process.""" + + def test_parse_vtuber_title(self): + content = "[ANi] 身为 VTuber 的我因为忘记关台而成了传说 - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4][379.34 MB]" + info = raw_parser(content) + assert info is not None + assert info.group == "ANi" + assert info.episode == 1 + assert info.resolution == "1080P" + assert info.source == "Baha" + # BUG: name_process splits on space and only keeps first Chinese word + assert info.title_zh == "身为" + assert info.title_en == "VTuber 的我因为忘记关台而成了传说" + + +class TestIssue794PreEpisodeFormat: + """Issue #794/#800: [01Pre] episode format not recognized.""" + + TITLES = [ + "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHS_JP].mp4", + "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHT_JP].mp4", + ] + + @pytest.mark.xfail(reason="[01Pre] episode format not supported by TITLE_RE") + def 
test_parse_pre_episode(self): + info = raw_parser(self.TITLES[0]) + assert info is not None + assert info.title_en == "Shikanoko Nokonoko Koshitantan" + assert info.episode == 1 + + @pytest.mark.parametrize("title", TITLES) + def test_returns_none(self, title): + """Parser cannot handle [01Pre] format currently.""" + assert raw_parser(title) is None + + +class TestIssue766Lv2InTitle: + """Issue #766: Title with 'Lv2' causing incorrect name split.""" + + def test_parse_lv2_title(self): + content = "[ANi] 从 Lv2 开始开外挂的前勇者候补过著悠哉异世界生活 - 04 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]" + info = raw_parser(content) + assert info is not None + assert info.group == "ANi" + assert info.episode == 4 + assert info.resolution == "1080P" + assert info.source == "Baha" + # BUG: name_process splits on space, loses the "从 Lv2" prefix + assert info.title_zh == "开始开外挂的前勇者候补过著悠哉异世界生活" + + +class TestIssue764WesternFormat: + """Issue #764: Western release format without group brackets.""" + + def test_parse_western_format(self): + content = "Girls Band Cry S01E05 VOSTFR 1080p WEB x264 AAC -Tsundere-Raws (ADN)" + info = raw_parser(content) + assert info is not None + assert info.episode == 5 + assert info.season == 1 + assert info.resolution == "1080p" + # No brackets → group detection fails + assert info.group == "" + # No CJK chars → no title_zh/jp; EN detection also fails (short segments) + assert info.title_en is None + assert info.title_zh is None + + +class TestIssue986AtlasFormat: + """Issue #986: Atlas subtitle group bracket-delimited format.""" + + TITLES = [ + "[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fate/strange Fake][04_半神们的卡农曲][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]", + "[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fate/strange Fake][07_神自黄昏归来][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]", + ] + + @pytest.mark.xfail(reason="Atlas bracket-delimited format not supported by TITLE_RE") + def test_parse_atlas_format(self): + info = raw_parser(self.TITLES[0]) + assert info is not 
None + assert info.title_zh == "命运-奇异赝品" + assert info.episode == 4 + + @pytest.mark.parametrize("title", TITLES) + def test_returns_none(self, title): + """Parser cannot handle Atlas format currently.""" + assert raw_parser(title) is None + + +class TestIssue773CompoundEpisode: + """Issue #773: Compound episode number [02(57)] not recognized.""" + + TITLE = "【豌豆字幕组&风之圣殿字幕组】★04月新番[鬼灭之刃 柱训练篇 / Kimetsu_no_Yaiba-Hashira_Geiko_Hen][02(57)][简体][1080P][MP4]" + + def test_parse_compound_episode(self): + info = raw_parser(self.TITLE) + assert info is not None + assert info.title_zh == "鬼灭之刃 柱训练篇" + assert info.episode == 2 + + +class TestIssue805TitleWithCht: + """Issue #805: Traditional Chinese title parses correctly.""" + + def test_parse_cht_title(self): + content = "[ANi] 不時輕聲地以俄語遮羞的鄰座艾莉同學 - 02 [1080P][Baha][WEB-DL][AAC AVC][CHT].mp4" + info = raw_parser(content) + assert info is not None + assert info.group == "ANi" + assert info.title_zh == "不時輕聲地以俄語遮羞的鄰座艾莉同學" + assert info.episode == 2 + assert info.resolution == "1080P" + assert info.source == "Baha" + assert info.sub == "CHT" +