mirror of
https://github.com/EstrellaXD/Auto_Bangumi.git
synced 2026-03-19 19:37:14 +08:00
Add _fallback_parse(), which is tried when TITLE_RE.match() returns None. It uses two regex patterns to extract episode numbers from formats the main regex misses: (1) digits directly before a "[" bracket (issues #876, #910); (2) the compound [02(57)] format (issue #773). Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -7,14 +7,31 @@ logger = logging.getLogger(__name__)
|
||||
|
||||
EPISODE_RE = re.compile(r"\d+")
|
||||
TITLE_RE = re.compile(
|
||||
r"(.*?|\[.*])((?: ?-)? ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)"
|
||||
r"(.*?|\[.*])((?: ?-) ?\d+ |\[\d+]|\[\d+.?[vV]\d]|第\d+[话話集]|\[第?\d+[话話集]]|\[\d+.?END]|[Ee][Pp]?\d+)(.*)"
|
||||
)
|
||||
RESOLUTION_RE = re.compile(r"1080|720|2160|4K")
|
||||
SOURCE_RE = re.compile(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web")
|
||||
SUB_RE = re.compile(r"[简繁日字幕]|CH|BIG5|GB")
|
||||
|
||||
# Secondary episode-number patterns. _fallback_parse() searches them in list
# order and reads the episode digits from capture group 1 of the first hit.
FALLBACK_EP_PATTERNS = [
    re.compile(r" (\d+) ?(?=\[)"),  # #876/#910: digits before [
    re.compile(r"\[(\d+)\(\d+\)\]"),  # #773: [02(57)]
]

# Matches one character that is NOT a word character, whitespace, a CJK
# ideograph (U+4E00-U+9FFF), hiragana (U+3040-U+309F), katakana
# (U+30A0-U+30FF) or a hyphen.
# NOTE(review): presumably used to strip punctuation from the title prefix;
# confirm at the call site, which is not visible in this hunk.
PREFIX_RE = re.compile(r"[^\w\s\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff-]")
|
||||
|
||||
|
||||
def _fallback_parse(content_title: str) -> tuple | None:
|
||||
"""Try fallback regex patterns when TITLE_RE fails."""
|
||||
for pattern in FALLBACK_EP_PATTERNS:
|
||||
m = pattern.search(content_title)
|
||||
if m:
|
||||
season_info = content_title[: m.start()].strip()
|
||||
episode_info = m.group(1)
|
||||
other = content_title[m.end() :].strip()
|
||||
return season_info, episode_info, other
|
||||
return None
|
||||
|
||||
CHINESE_NUMBER_MAP = {
|
||||
"一": 1,
|
||||
"二": 2,
|
||||
@@ -96,6 +113,10 @@ def name_process(name: str):
|
||||
elif re.search(" - {1}", name) is not None:
|
||||
split = re.split("-", name)
|
||||
if len(split) == 1:
|
||||
# Titles like "29 岁单身..." — digits + Chinese are one title
|
||||
if re.match(r"\d+\s[\u4e00-\u9fa5]", split[0]):
|
||||
name_zh = split[0].strip()
|
||||
return name_en, name_zh, name_jp
|
||||
split_space = split[0].split(" ")
|
||||
for idx in [0, -1]:
|
||||
if re.search(r"^[\u4e00-\u9fa5]{2,}", split_space[idx]) is not None:
|
||||
@@ -140,12 +161,13 @@ def process(raw_title: str):
|
||||
group = get_group(content_title)
|
||||
# 翻译组的名字
|
||||
match_obj = TITLE_RE.match(content_title)
|
||||
if match_obj is None:
|
||||
return None
|
||||
# 处理标题
|
||||
season_info, episode_info, other = list(
|
||||
map(lambda x: x.strip(), match_obj.groups())
|
||||
)
|
||||
if match_obj is not None:
|
||||
season_info, episode_info, other = [x.strip() for x in match_obj.groups()]
|
||||
else:
|
||||
fallback = _fallback_parse(content_title)
|
||||
if fallback is None:
|
||||
return None
|
||||
season_info, episode_info, other = fallback
|
||||
process_raw = prefix_process(season_info, group)
|
||||
# 处理 前缀
|
||||
raw_name, season_raw, season = season_process(process_raw)
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import pytest
|
||||
|
||||
from module.parser.analyser import raw_parser
|
||||
|
||||
|
||||
@@ -157,5 +159,206 @@ def test_raw_parser():
|
||||
assert info.episode == 8
|
||||
assert info.season == 1
|
||||
|
||||
# Issue #990: Title starting with number — should not misparse "29" as episode
|
||||
content = "[ANi] 29 岁单身中坚冒险家的日常 - 07 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"
|
||||
info = raw_parser(content)
|
||||
assert info.group == "ANi"
|
||||
assert info.title_zh == "29 岁单身中坚冒险家的日常"
|
||||
assert info.resolution == "1080P"
|
||||
assert info.episode == 7
|
||||
assert info.season == 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Issue-specific regression tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestIssue924SpecialPunctuation:
    """Regression for issue #924: parentheses and exclamation marks in the title."""

    def test_parse_title_with_fullwidth_parens(self):
        raw_title = "[御坂字幕组] 男女之间存在纯友情吗?(不,不存在!!)-01 [WebRip 1080p HEVC10-bit AAC] [简繁日内封] [急招翻校轴]"
        parsed = raw_parser(raw_title)

        # The title must be parseable at all before any field is checked.
        assert parsed is not None
        assert parsed.group == "御坂字幕组"
        assert parsed.title_zh == "男女之间存在纯友情吗?(不,不存在!!)"
        assert parsed.episode == 1
        assert parsed.resolution == "1080p"
        assert parsed.sub == "简繁日内封"
        assert parsed.source == "WebRip"
|
||||
|
||||
|
||||
class TestIssue910NeoQswFormat:
    """Regression for issue #910: NEO·QSW releases put the episode number inline."""

    TITLE = " [NEO·QSW]想星的阿克艾利昂 情感神话 想星のアクエリオン Aquarion: Myth of Emotions 02[WEBRIP AVC 1080P](搜索用:想星的大天使)"

    def test_parse_neo_qsw_format(self):
        parsed = raw_parser(self.TITLE)

        assert parsed is not None
        assert parsed.title_zh == "想星的阿克艾利昂"
        assert parsed.episode == 2
|
||||
|
||||
|
||||
class TestIssue876NoSeparator:
    """Regression for issue #876: episode number with no dash in front of it.

    The dash-separated form ("- 03") is already covered by test_raw_parser.
    This class covers the space-only form ("Tsuite 03"), which is handled by
    the fallback parser.
    """

    TITLE = "[北宇治字幕组&LoliHouse] 地。-关于地球的运动- / Chi. Chikyuu no Undou ni Tsuite 03 [WebRip 1080p HEVC-10bit AAC ASSx2][简繁日内封字幕]"

    def test_parse_without_dash(self):
        parsed = raw_parser(self.TITLE)

        assert parsed is not None
        assert parsed.title_zh == "地。-关于地球的运动-"
        assert parsed.title_en == "Chi. Chikyuu no Undou ni Tsuite"
        assert parsed.episode == 3
|
||||
|
||||
|
||||
class TestIssue819ChineseEpisodeMarker:
    """Regression for issue #819: [Doomdos] naming with a 第N话 episode marker."""

    def test_parse_chinese_episode_marker(self):
        raw_title = "[Doomdos] - 白色闪电 - 第02话 - [1080P].mp4"
        parsed = raw_parser(raw_title)

        assert parsed is not None
        assert parsed.group == "Doomdos"
        assert parsed.episode == 2
        assert parsed.resolution == "1080P"
        # Known bug, pinned as-is: the separator dashes around the title
        # leak into title_zh.
        assert parsed.title_zh == "- 白色闪电 -"
|
||||
|
||||
|
||||
class TestIssue811ColonInTitle:
    """Regression for issue #811: colon in the title, degree sign in the group name."""

    def test_parse_colon_in_english_title(self):
        raw_title = "[Up to 21°C] 鬼灭之刃 柱训练篇 / Kimetsu no Yaiba: Hashira Geiko-hen - 03 (CR 1920x1080 AVC AAC MKV)"
        parsed = raw_parser(raw_title)

        assert parsed is not None
        assert parsed.group == "Up to 21°C"
        assert parsed.title_zh == "鬼灭之刃 柱训练篇"
        assert parsed.title_en == "Kimetsu no Yaiba: Hashira Geiko-hen"
        assert parsed.episode == 3
        assert parsed.season == 1
|
||||
|
||||
|
||||
class TestIssue798VTuberTitle:
    """Regression for issue #798: a title containing 'VTuber' is split wrongly by name_process."""

    def test_parse_vtuber_title(self):
        raw_title = "[ANi] 身为 VTuber 的我因为忘记关台而成了传说 - 01 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4][379.34 MB]"
        parsed = raw_parser(raw_title)

        assert parsed is not None
        assert parsed.group == "ANi"
        assert parsed.episode == 1
        assert parsed.resolution == "1080P"
        assert parsed.source == "Baha"
        # Known bug, pinned as-is: name_process splits on spaces and keeps
        # only the first Chinese word as title_zh.
        assert parsed.title_zh == "身为"
        assert parsed.title_en == "VTuber 的我因为忘记关台而成了传说"
|
||||
|
||||
|
||||
class TestIssue794PreEpisodeFormat:
    """Regression for issues #794/#800: the [01Pre] episode tag is not recognised."""

    TITLES = [
        "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHS_JP].mp4",
        "[KitaujiSub] Shikanoko Nokonoko Koshitantan [01Pre][WebRip][HEVC_AAC][CHT_JP].mp4",
    ]

    @pytest.mark.xfail(reason="[01Pre] episode format not supported by TITLE_RE")
    def test_parse_pre_episode(self):
        # Desired behaviour once the format is supported; expected to fail today.
        parsed = raw_parser(self.TITLES[0])

        assert parsed is not None
        assert parsed.title_en == "Shikanoko Nokonoko Koshitantan"
        assert parsed.episode == 1

    @pytest.mark.parametrize("title", TITLES)
    def test_returns_none(self, title):
        """Pin current behaviour: the parser gives up on [01Pre] titles."""
        result = raw_parser(title)
        assert result is None
|
||||
|
||||
|
||||
class TestIssue766Lv2InTitle:
    """Regression for issue #766: 'Lv2' inside the title breaks the name split."""

    def test_parse_lv2_title(self):
        raw_title = "[ANi] 从 Lv2 开始开外挂的前勇者候补过著悠哉异世界生活 - 04 [1080P][Baha][WEB-DL][AAC AVC][CHT][MP4]"
        parsed = raw_parser(raw_title)

        assert parsed is not None
        assert parsed.group == "ANi"
        assert parsed.episode == 4
        assert parsed.resolution == "1080P"
        assert parsed.source == "Baha"
        # Known bug, pinned as-is: name_process splits on spaces and drops
        # the leading "从 Lv2" part of the title.
        assert parsed.title_zh == "开始开外挂的前勇者候补过著悠哉异世界生活"
|
||||
|
||||
|
||||
class TestIssue764WesternFormat:
    """Regression for issue #764: western-style release name with no group brackets."""

    def test_parse_western_format(self):
        raw_title = "Girls Band Cry S01E05 VOSTFR 1080p WEB x264 AAC -Tsundere-Raws (ADN)"
        parsed = raw_parser(raw_title)

        assert parsed is not None
        assert parsed.episode == 5
        assert parsed.season == 1
        assert parsed.resolution == "1080p"
        # Without brackets the group cannot be detected.
        assert parsed.group == ""
        # No CJK characters, so there is no title_zh/jp; English detection
        # also fails here (short segments).
        assert parsed.title_en is None
        assert parsed.title_zh is None
|
||||
|
||||
|
||||
class TestIssue986AtlasFormat:
    """Regression for issue #986: Atlas subtitle group's all-bracket naming scheme."""

    TITLES = [
        "[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fate/strange Fake][04_半神们的卡农曲][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]",
        "[阿特拉斯字幕组·雪原市出差所][命运-奇异赝品_Fate/strange Fake][07_神自黄昏归来][简繁日内封PGS][日语配音版_Japanese Dub][Web-DL Remux][1080p AVC AAC]",
    ]

    @pytest.mark.xfail(reason="Atlas bracket-delimited format not supported by TITLE_RE")
    def test_parse_atlas_format(self):
        # Desired behaviour once the format is supported; expected to fail today.
        parsed = raw_parser(self.TITLES[0])

        assert parsed is not None
        assert parsed.title_zh == "命运-奇异赝品"
        assert parsed.episode == 4

    @pytest.mark.parametrize("title", TITLES)
    def test_returns_none(self, title):
        """Pin current behaviour: the parser gives up on Atlas-style titles."""
        result = raw_parser(title)
        assert result is None
|
||||
|
||||
|
||||
class TestIssue773CompoundEpisode:
    """Regression for issue #773: compound episode tag such as [02(57)]."""

    TITLE = "【豌豆字幕组&风之圣殿字幕组】★04月新番[鬼灭之刃 柱训练篇 / Kimetsu_no_Yaiba-Hashira_Geiko_Hen][02(57)][简体][1080P][MP4]"

    def test_parse_compound_episode(self):
        parsed = raw_parser(self.TITLE)

        assert parsed is not None
        assert parsed.title_zh == "鬼灭之刃 柱训练篇"
        # The number before the parentheses is the one reported as the episode.
        assert parsed.episode == 2
|
||||
|
||||
|
||||
class TestIssue805TitleWithCht:
    """Regression for issue #805: a Traditional Chinese title parses correctly."""

    def test_parse_cht_title(self):
        raw_title = "[ANi] 不時輕聲地以俄語遮羞的鄰座艾莉同學 - 02 [1080P][Baha][WEB-DL][AAC AVC][CHT].mp4"
        parsed = raw_parser(raw_title)

        assert parsed is not None
        assert parsed.group == "ANi"
        assert parsed.title_zh == "不時輕聲地以俄語遮羞的鄰座艾莉同學"
        assert parsed.episode == 2
        assert parsed.resolution == "1080P"
        assert parsed.source == "Baha"
        assert parsed.sub == "CHT"
|
||||
|
||||
|
||||
Reference in New Issue
Block a user