From 3f4f3a141c199fc878e32183e299179f440ae942 Mon Sep 17 00:00:00 2001 From: EstrellaXD Date: Mon, 26 Jan 2026 15:44:44 +0100 Subject: [PATCH] feat(database): add title alias system for mid-season naming changes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When subtitle groups change their naming convention mid-season (e.g., "LoliHouse" → "LoliHouse&动漫国"), AutoBangumi was creating duplicate entries. This adds a title alias system that: - Detects semantic duplicates (same official_title, dpi, subtitle, source, and similar group name) - Merges them as aliases instead of creating new entries - Updates match_torrent() and match_list() to check aliases - Adds title_aliases field to Bangumi model (JSON list) - Includes migration v8 for the new column - Adds 10 new tests for the feature - Fixes cache invalidation bug in disable_rule() Co-Authored-By: Claude Opus 4.5 --- backend/src/module/database/bangumi.py | 266 ++++++++++++++++++---- backend/src/module/database/combine.py | 13 +- backend/src/module/models/bangumi.py | 6 + backend/src/test/test_database.py | 296 ++++++++++++++++++++++++- 4 files changed, 541 insertions(+), 40 deletions(-) diff --git a/backend/src/module/database/bangumi.py b/backend/src/module/database/bangumi.py index 8d429678..1f505b50 100644 --- a/backend/src/module/database/bangumi.py +++ b/backend/src/module/database/bangumi.py @@ -1,3 +1,4 @@ +import json import logging import re import time @@ -10,6 +11,57 @@ from module.models import Bangumi, BangumiUpdate logger = logging.getLogger(__name__) + +def _normalize_group_name(group: str | None) -> str: + """Normalize group name for comparison by removing common separators.""" + if not group: + return "" + # Remove common separators (&, ×, _, -) and normalize to lowercase + return re.sub(r"[&×_\-]", "", group).lower().strip() + + +def _groups_are_similar(group1: str | None, group2: str | None) -> bool: + """ + Check if two group names are similar enough to be considered the same group. + + Handles cases like: + - "LoliHouse" vs "LoliHouse&动漫国字幕组" + - "字幕组A" vs "字幕组A×字幕组B" + """ + if not group1 or not group2: + return False + + # Exact match or substring match (one contains the other) + if group1 == group2 or group1 in group2 or group2 in group1: + return True + + # Normalized comparison - check if core group names overlap + norm1 = _normalize_group_name(group1) + norm2 = _normalize_group_name(group2) + return norm1 in norm2 or norm2 in norm1 + + +def _get_aliases_list(bangumi: Bangumi) -> list[str]: + """Get the list of title aliases from a bangumi's title_aliases JSON field.""" + if not bangumi.title_aliases: + return [] + try: + aliases = json.loads(bangumi.title_aliases) + return aliases if isinstance(aliases, list) else [] + except (json.JSONDecodeError, TypeError): + return [] + + +def _set_aliases_list(bangumi: Bangumi, aliases: list[str]) -> None: + """Set the title aliases JSON field from a list.""" + if not aliases: + bangumi.title_aliases = None + else: + # Remove duplicates while preserving order + unique_aliases = list(dict.fromkeys(aliases)) + bangumi.title_aliases = json.dumps(unique_aliases, ensure_ascii=False) + + # Module-level TTL cache for search_all results _bangumi_cache: list[Bangumi] | None = None _bangumi_cache_time: float = 0 @@ -26,6 +78,91 @@ class BangumiDatabase: def __init__(self, session: Session): self.session = session + def find_semantic_duplicate(self, data: Bangumi) -> Optional[Bangumi]: + """ + Find existing bangumi that semantically matches the new one. + + This handles cases where subtitle groups change naming mid-season. + A semantic match requires: + - Same official_title + - Same dpi (resolution) + - Same subtitle type + - Same source + - Similar group_name (one contains the other) + + Returns the matching Bangumi if found, None otherwise. + """ + statement = select(Bangumi).where( + and_( + Bangumi.official_title == data.official_title, + Bangumi.deleted == false(), + ) + ) + candidates = self.session.execute(statement).scalars().all() + + for candidate in candidates: + is_exact_duplicate = ( + candidate.title_raw == data.title_raw + and candidate.group_name == data.group_name + ) + if is_exact_duplicate: + continue + + is_semantic_match = ( + candidate.dpi == data.dpi + and candidate.subtitle == data.subtitle + and candidate.source == data.source + and _groups_are_similar(candidate.group_name, data.group_name) + ) + if is_semantic_match: + logger.debug( + f"[Database] Found semantic duplicate: '{data.title_raw}' matches " + f"existing '{candidate.title_raw}' (official: {data.official_title})" + ) + return candidate + + return None + + def add_title_alias(self, bangumi_id: int, new_title_raw: str) -> bool: + """ + Add a new title_raw alias to an existing bangumi. + + This allows a single bangumi entry to match multiple naming patterns. + """ + bangumi = self.session.get(Bangumi, bangumi_id) + if not bangumi: + logger.warning( + f"[Database] Cannot add alias: bangumi id {bangumi_id} not found" + ) + return False + + # Don't add if it's the same as the main title_raw + if bangumi.title_raw == new_title_raw: + return False + + # Get existing aliases and add the new one + aliases = _get_aliases_list(bangumi) + if new_title_raw in aliases: + return False # Already exists + + aliases.append(new_title_raw) + _set_aliases_list(bangumi, aliases) + + self.session.add(bangumi) + self.session.commit() + _invalidate_bangumi_cache() + logger.info( + f"[Database] Added alias '{new_title_raw}' to bangumi '{bangumi.official_title}' " + f"(id: {bangumi_id})" + ) + return True + + def get_all_title_patterns(self, bangumi: Bangumi) -> list[str]: + """Get all title patterns for matching (title_raw + all aliases).""" + patterns = [bangumi.title_raw] + patterns.extend(_get_aliases_list(bangumi)) + return patterns + def _is_duplicate(self, data: Bangumi) -> bool: """Check if a bangumi rule already exists based on title_raw and group_name.""" statement = select(Bangumi).where( @@ -43,6 +180,18 @@ class BangumiDatabase: f"[Database] Skipping duplicate: {data.official_title} ({data.group_name})" ) return False + + # Check for semantic duplicate (same anime, different naming pattern) + semantic_match = self.find_semantic_duplicate(data) + if semantic_match: + # Add as alias instead of creating new entry + self.add_title_alias(semantic_match.id, data.title_raw) + logger.info( + f"[Database] Merged '{data.title_raw}' as alias to existing " + f"'{semantic_match.title_raw}' (official: {data.official_title})" + ) + return False # Return False since we didn't add a new entry + self.session.add(data) self.session.commit() _invalidate_bangumi_cache() @@ -70,31 +219,54 @@ class BangumiDatabase: else: existing = set() - # Filter out duplicates + # Filter out exact duplicates to_add = [d for d in datas if (d.title_raw, d.group_name) not in existing] + # Check for semantic duplicates and add as aliases + semantic_merged = 0 + really_to_add = [] + for d in to_add: + semantic_match = self.find_semantic_duplicate(d) + if semantic_match: + # Add as alias instead of creating new entry + self.add_title_alias(semantic_match.id, d.title_raw) + semantic_merged += 1 + logger.info( + f"[Database] Merged '{d.title_raw}' as alias to existing " + f"'{semantic_match.title_raw}' (official: {d.official_title})" + ) + else: + really_to_add.append(d) + # Also deduplicate within the batch itself seen = set() unique_to_add = [] - for d in to_add: + for d in really_to_add: key = (d.title_raw, d.group_name) if key not in seen: seen.add(key) unique_to_add.append(d) if not unique_to_add: - logger.debug( - f"[Database] All {len(datas)} bangumi already exist, skipping." - ) + if semantic_merged > 0: + logger.debug( + f"[Database] {semantic_merged} bangumi merged as aliases, " + f"rest were duplicates." + ) + else: + logger.debug( + f"[Database] All {len(datas)} bangumi already exist, skipping." + ) return 0 self.session.add_all(unique_to_add) self.session.commit() _invalidate_bangumi_cache() - skipped = len(datas) - len(unique_to_add) - if skipped > 0: + skipped = len(datas) - len(unique_to_add) - semantic_merged + if skipped > 0 or semantic_merged > 0: logger.debug( - f"[Database] Insert {len(unique_to_add)} bangumi, skipped {skipped} duplicates." + f"[Database] Insert {len(unique_to_add)} bangumi, " + f"skipped {skipped} duplicates, merged {semantic_merged} as aliases." ) else: logger.debug( @@ -186,25 +358,19 @@ class BangumiDatabase: def search_id(self, _id: int) -> Optional[Bangumi]: statement = select(Bangumi).where(Bangumi.id == _id) - result = self.session.execute(statement) - bangumi = result.scalar_one_or_none() + bangumi = self.session.execute(statement).scalar_one_or_none() if bangumi is None: logger.warning(f"[Database] Cannot find bangumi id: {_id}.") return None - else: - logger.debug(f"[Database] Find bangumi id: {_id}.") - return bangumi + logger.debug(f"[Database] Find bangumi id: {_id}.") + return bangumi def match_poster(self, bangumi_name: str) -> str: statement = select(Bangumi).where( func.instr(bangumi_name, Bangumi.official_title) > 0 ) - result = self.session.execute(statement) - data = result.scalar_one_or_none() - if data: - return data.poster_link - else: - return "" + data = self.session.execute(statement).scalar_one_or_none() + return data.poster_link if data else "" def match_list(self, torrent_list: list, rss_link: str) -> list: match_datas = self.search_all() @@ -212,7 +378,14 @@ class BangumiDatabase: return torrent_list # Build index for O(1) lookup after regex match - title_index = {m.title_raw: m for m in match_datas} + # Include both title_raw and all aliases + title_index: dict[str, Bangumi] = {} + for m in match_datas: + # Add main title_raw + title_index[m.title_raw] = m + # Add all aliases + for alias in _get_aliases_list(m): + title_index[alias] = m # Build compiled regex pattern for fast substring matching # Sort by length descending so longer (more specific) matches are found first @@ -226,12 +399,16 @@ class BangumiDatabase: for torrent in torrent_list: match = title_regex.search(torrent.name) if match: - title_raw = match.group(0) - match_data = title_index[title_raw] - if rss_link not in match_data.rss_link and title_raw not in rss_updated: + matched_title = match.group(0) + match_data = title_index[matched_title] + # Use the bangumi's main title_raw for rss_updated tracking + if ( + rss_link not in match_data.rss_link + and match_data.title_raw not in rss_updated + ): match_data.rss_link += f",{rss_link}" match_data.added = False - rss_updated.add(title_raw) + rss_updated.add(match_data.title_raw) else: unmatched.append(torrent) # Batch commit all rss_link updates @@ -244,20 +421,32 @@ class BangumiDatabase: return unmatched def match_torrent(self, torrent_name: str) -> Optional[Bangumi]: - statement = ( - select(Bangumi) - .where( - and_( - func.instr(torrent_name, Bangumi.title_raw) > 0, - Bangumi.deleted == false(), - ) - ) - # Prefer longer title_raw matches (more specific) - .order_by(func.length(Bangumi.title_raw).desc()) - .limit(1) - ) - result = self.session.execute(statement) - return result.scalar_one_or_none() + """ + Match torrent name to a bangumi, checking both title_raw and title_aliases. + + Returns the bangumi with the longest matching pattern for specificity. + """ + match_datas = self.search_all() + if not match_datas: + return None + + best_match: Optional[Bangumi] = None + best_match_len = 0 + + for bangumi in match_datas: + if bangumi.deleted: + continue + + # Check all patterns (title_raw + aliases) + patterns = self.get_all_title_patterns(bangumi) + for pattern in patterns: + if pattern in torrent_name: + # Prefer longer matches (more specific) + if len(pattern) > best_match_len: + best_match = bangumi + best_match_len = len(pattern) + + return best_match def not_complete(self) -> list[Bangumi]: condition = select(Bangumi).where( @@ -285,6 +474,7 @@ class BangumiDatabase: bangumi.deleted = True self.session.add(bangumi) self.session.commit() + _invalidate_bangumi_cache() logger.debug(f"[Database] Disable rule {bangumi.title_raw}.") def search_rss(self, rss_link: str) -> list[Bangumi]: diff --git a/backend/src/module/database/combine.py b/backend/src/module/database/combine.py index eb4ca17a..36e68a31 100644 --- a/backend/src/module/database/combine.py +++ b/backend/src/module/database/combine.py @@ -23,7 +23,7 @@ logger = logging.getLogger(__name__) TABLE_MODELS: list[type[SQLModel]] = [Bangumi, RSSItem, Torrent, User, Passkey] # Increment this when adding new migrations to MIGRATIONS list. -CURRENT_SCHEMA_VERSION = 7 +CURRENT_SCHEMA_VERSION = 8 # Each migration is a tuple of (version, description, list of SQL statements). # Migrations are applied in order. A migration at index i brings the schema @@ -96,6 +96,13 @@ MIGRATIONS = [ "ALTER TABLE bangumi ADD COLUMN suggested_episode_offset INTEGER DEFAULT NULL", ], ), + ( + 8, + "add title_aliases for mid-season naming changes", + [ + "ALTER TABLE bangumi ADD COLUMN title_aliases TEXT DEFAULT NULL", + ], + ), ] @@ -187,6 +194,10 @@ class Database(Session): columns = [col["name"] for col in inspector.get_columns("bangumi")] if "suggested_season_offset" in columns: needs_run = False + if "bangumi" in tables and version == 8: + columns = [col["name"] for col in inspector.get_columns("bangumi")] + if "title_aliases" in columns: + needs_run = False if needs_run: with self.engine.connect() as conn: for stmt in statements: diff --git a/backend/src/module/models/bangumi.py b/backend/src/module/models/bangumi.py index e6f41831..e2110180 100644 --- a/backend/src/module/models/bangumi.py +++ b/backend/src/module/models/bangumi.py @@ -46,6 +46,9 @@ class Bangumi(SQLModel, table=True): suggested_episode_offset: Optional[int] = Field( default=None, alias="suggested_episode_offset", title="建议集数偏移" ) + title_aliases: Optional[str] = Field( + default=None, alias="title_aliases", title="标题别名" + ) # JSON list: ["alt_title_1", "alt_title_2"] class BangumiUpdate(SQLModel): @@ -78,6 +81,9 @@ class BangumiUpdate(SQLModel): needs_review_reason: Optional[str] = Field( default=None, alias="needs_review_reason", title="检查原因" ) + title_aliases: Optional[str] = Field( + default=None, alias="title_aliases", title="标题别名" + ) class Notification(BaseModel): diff --git a/backend/src/test/test_database.py b/backend/src/test/test_database.py index 58c87072..363c2971 100644 --- a/backend/src/test/test_database.py +++ b/backend/src/test/test_database.py @@ -1,3 +1,5 @@ +import json + import pytest from sqlmodel import Session, SQLModel, create_engine @@ -208,7 +210,9 @@ def test_torrent_qb_hash_index_efficient(db_session): # Add multiple torrents torrents = [ - Torrent(name=f"Torrent {i}", url=f"https://example.com/{i}", qb_hash=f"hash_{i}") + Torrent( + name=f"Torrent {i}", url=f"https://example.com/{i}", qb_hash=f"hash_{i}" + ) for i in range(10) ] db.add_all(torrents) @@ -225,3 +229,293 @@ def test_torrent_qb_hash_index_efficient(db_session): # Non-existent hash result = db.search_by_qb_hash("hash_100") assert result is None + + +# ============================================================ +# Title Alias Tests - for mid-season naming change handling +# ============================================================ + + +def test_add_title_alias(db_session): + """Test adding a title alias to an existing bangumi.""" + db = BangumiDatabase(db_session) + + bangumi = Bangumi( + official_title="Test Anime", + title_raw="Test Anime S1", + group_name="TestGroup", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test", + ) + db.add(bangumi) + bangumi_id = db.search_all()[0].id + + # Add an alias + result = db.add_title_alias(bangumi_id, "Test Anime Season 1") + assert result is True + + # Verify alias was added + updated = db.search_id(bangumi_id) + assert updated.title_aliases is not None + aliases = json.loads(updated.title_aliases) + assert "Test Anime Season 1" in aliases + + +def test_add_title_alias_duplicate(db_session): + """Test that adding the same alias twice is a no-op.""" + db = BangumiDatabase(db_session) + + bangumi = Bangumi( + official_title="Test Anime", + title_raw="Test Anime S1", + group_name="TestGroup", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test", + ) + db.add(bangumi) + bangumi_id = db.search_all()[0].id + + # Add same alias twice + db.add_title_alias(bangumi_id, "Test Anime Season 1") + result = db.add_title_alias(bangumi_id, "Test Anime Season 1") + assert result is False # Second add should be a no-op + + +def test_add_title_alias_same_as_title_raw(db_session): + """Test that adding title_raw as alias is a no-op.""" + db = BangumiDatabase(db_session) + + bangumi = Bangumi( + official_title="Test Anime", + title_raw="Test Anime S1", + group_name="TestGroup", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test", + ) + db.add(bangumi) + bangumi_id = db.search_all()[0].id + + result = db.add_title_alias(bangumi_id, "Test Anime S1") + assert result is False + + +def test_match_torrent_with_alias(db_session): + """Test that match_torrent finds bangumi using aliases.""" + db = BangumiDatabase(db_session) + + bangumi = Bangumi( + official_title="Test Anime", + title_raw="Test Anime S1", + group_name="TestGroup", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test", + deleted=False, + ) + db.add(bangumi) + bangumi_id = db.search_all()[0].id + + # Add alias + db.add_title_alias(bangumi_id, "Test Anime Season 1") + + # Match using title_raw + result = db.match_torrent("[TestGroup] Test Anime S1 - 01.mkv") + assert result is not None + assert result.official_title == "Test Anime" + + # Match using alias + result = db.match_torrent("[TestGroup] Test Anime Season 1 - 01.mkv") + assert result is not None + assert result.official_title == "Test Anime" + + +def test_find_semantic_duplicate_same_official_title(db_session): + """Test finding semantic duplicates with same official title.""" + db = BangumiDatabase(db_session) + + # Add first bangumi + bangumi1 = Bangumi( + official_title="Frieren", + title_raw="Sousou no Frieren", + group_name="LoliHouse", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test1", + ) + db.add(bangumi1) + + # Create a semantically similar bangumi (same anime, group changed naming) + bangumi2 = Bangumi( + official_title="Frieren", + title_raw="Frieren Beyond Journey's End", # Different title_raw + group_name="LoliHouse&动漫国", # Group changed mid-season + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test2", + ) + + # Should find semantic duplicate + result = db.find_semantic_duplicate(bangumi2) + assert result is not None + assert result.title_raw == "Sousou no Frieren" + + +def test_find_semantic_duplicate_no_match_different_resolution(db_session): + """Test that different resolution is NOT a semantic match.""" + db = BangumiDatabase(db_session) + + bangumi1 = Bangumi( + official_title="Frieren", + title_raw="Sousou no Frieren", + group_name="LoliHouse", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test1", + ) + db.add(bangumi1) + + # Same anime but different resolution - should NOT be semantic duplicate + bangumi2 = Bangumi( + official_title="Frieren", + title_raw="Sousou no Frieren 4K", + group_name="LoliHouse", + dpi="2160p", # Different resolution + source="Web", + subtitle="CHT", + rss_link="test2", + ) + + result = db.find_semantic_duplicate(bangumi2) + assert result is None + + +def test_add_with_semantic_duplicate_creates_alias(db_session): + """Test that adding a semantic duplicate creates an alias instead.""" + db = BangumiDatabase(db_session) + + # Add first bangumi + bangumi1 = Bangumi( + official_title="Frieren", + title_raw="Sousou no Frieren", + group_name="LoliHouse", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test1", + ) + db.add(bangumi1) + initial_count = len(db.search_all()) + assert initial_count == 1 + + # Try to add semantic duplicate + bangumi2 = Bangumi( + official_title="Frieren", + title_raw="Frieren Beyond Journey's End", + group_name="LoliHouse&动漫国", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test2", + ) + result = db.add(bangumi2) + assert result is False # Should not add new entry + + # Count should still be 1 + final_count = len(db.search_all()) + assert final_count == 1 + + # But the new title_raw should be an alias + original = db.search_all()[0] + aliases = json.loads(original.title_aliases) if original.title_aliases else [] + assert "Frieren Beyond Journey's End" in aliases + + +def test_groups_are_similar(): + """Test group name similarity detection.""" + from module.database.bangumi import _groups_are_similar + + # Exact match + assert _groups_are_similar("LoliHouse", "LoliHouse") is True + + # Substring match (one contains the other) + assert _groups_are_similar("LoliHouse", "LoliHouse&动漫国字幕组") is True + assert _groups_are_similar("LoliHouse&动漫国字幕组", "LoliHouse") is True + + # Completely different groups + assert _groups_are_similar("LoliHouse", "Sakurato") is False + assert _groups_are_similar("字幕组A", "字幕组B") is False + + # Edge cases + assert _groups_are_similar(None, "LoliHouse") is False + assert _groups_are_similar("LoliHouse", None) is False + assert _groups_are_similar(None, None) is False + + +def test_get_all_title_patterns(db_session): + """Test getting all title patterns for a bangumi.""" + db = BangumiDatabase(db_session) + + bangumi = Bangumi( + official_title="Test Anime", + title_raw="Test Anime S1", + group_name="TestGroup", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="test", + ) + db.add(bangumi) + bangumi_id = db.search_all()[0].id + + # Add aliases + db.add_title_alias(bangumi_id, "Test Anime Season 1") + db.add_title_alias(bangumi_id, "TA S1") + + # Get all patterns + updated = db.search_id(bangumi_id) + patterns = db.get_all_title_patterns(updated) + + assert len(patterns) == 3 + assert "Test Anime S1" in patterns + assert "Test Anime Season 1" in patterns + assert "TA S1" in patterns + + +def test_match_list_with_aliases(db_session): + """Test match_list works with aliases.""" + db = BangumiDatabase(db_session) + + bangumi = Bangumi( + official_title="Test Anime", + title_raw="Test Anime S1", + group_name="TestGroup", + dpi="1080p", + source="Web", + subtitle="CHT", + rss_link="rss1", + ) + db.add(bangumi) + bangumi_id = db.search_all()[0].id + db.add_title_alias(bangumi_id, "Test Anime Season 1") + + # Create torrents with different naming patterns + torrents = [ + Torrent(name="[TestGroup] Test Anime S1 - 01.mkv", url="url1"), + Torrent(name="[TestGroup] Test Anime Season 1 - 02.mkv", url="url2"), + Torrent(name="[OtherGroup] Different Anime - 01.mkv", url="url3"), + ] + + # Only the third torrent should be unmatched + unmatched = db.match_list(torrents, "rss2") + assert len(unmatched) == 1 + assert unmatched[0].name == "[OtherGroup] Different Anime - 01.mkv"