feat(database): add title alias system for mid-season naming changes

When subtitle groups change their naming convention mid-season (e.g.,
"LoliHouse" → "LoliHouse&动漫国"), AutoBangumi was creating duplicate
entries. This adds a title alias system that:

- Detects semantic duplicates (same official_title, dpi, subtitle,
  source, and similar group name)
- Merges them as aliases instead of creating new entries
- Updates match_torrent() and match_list() to check aliases
- Adds title_aliases field to Bangumi model (JSON list)
- Includes migration v8 for the new column
- Adds 10 new tests for the feature
- Fixes cache invalidation bug in disable_rule()

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
EstrellaXD
2026-01-26 15:44:44 +01:00
parent 0ba508cd0b
commit 3f4f3a141c
4 changed files with 541 additions and 40 deletions
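For context, a minimal sketch of the intended merge flow (illustrative: the
titles, group names, and RSS links are made up; the calls mirror the diff and
tests below, given an open database session):

    db = BangumiDatabase(session)

    # First entry for the season.
    db.add(Bangumi(
        official_title="Frieren",
        title_raw="Sousou no Frieren",
        group_name="LoliHouse",
        dpi="1080p", source="Web", subtitle="CHT", rss_link="rss1",
    ))

    # Same anime, dpi, subtitle, and source, with a similar group name:
    # add() detects the semantic duplicate, stores the new title_raw as
    # an alias on the existing row, and returns False.
    added = db.add(Bangumi(
        official_title="Frieren",
        title_raw="Frieren Beyond Journey's End",
        group_name="LoliHouse&动漫国",
        dpi="1080p", source="Web", subtitle="CHT", rss_link="rss2",
    ))
    assert added is False

    # Both naming patterns now resolve to the same entry.
    db.match_torrent("[LoliHouse&动漫国] Frieren Beyond Journey's End - 13.mkv")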

View File

@@ -1,3 +1,4 @@
import json
import logging
import re
import time
@@ -10,6 +11,57 @@ from module.models import Bangumi, BangumiUpdate
logger = logging.getLogger(__name__)
def _normalize_group_name(group: str | None) -> str:
"""Normalize group name for comparison by removing common separators."""
if not group:
return ""
# Remove common separators (&, ×, _, -) and normalize to lowercase
return re.sub(r"[&×_\-]", "", group).lower().strip()
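# e.g. (illustrative):
#   _normalize_group_name("LoliHouse&动漫国") -> "lolihouse动漫国"
#   _normalize_group_name("Sub_Group-A")      -> "subgroupa"
#   _normalize_group_name(None)               -> ""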
def _groups_are_similar(group1: str | None, group2: str | None) -> bool:
"""
Check if two group names are similar enough to be considered the same group.
Handles cases like:
- "LoliHouse" vs "LoliHouse&动漫国字幕组"
- "字幕组A" vs "字幕组A×字幕组B"
"""
if not group1 or not group2:
return False
# Exact match or substring match (one contains the other)
if group1 == group2 or group1 in group2 or group2 in group1:
return True
# Normalized comparison - check if core group names overlap
norm1 = _normalize_group_name(group1)
norm2 = _normalize_group_name(group2)
return norm1 in norm2 or norm2 in norm1
def _get_aliases_list(bangumi: Bangumi) -> list[str]:
"""Get the list of title aliases from a bangumi's title_aliases JSON field."""
if not bangumi.title_aliases:
return []
try:
aliases = json.loads(bangumi.title_aliases)
return aliases if isinstance(aliases, list) else []
except (json.JSONDecodeError, TypeError):
return []
def _set_aliases_list(bangumi: Bangumi, aliases: list[str]) -> None:
"""Set the title aliases JSON field from a list."""
if not aliases:
bangumi.title_aliases = None
else:
# Remove duplicates while preserving order
unique_aliases = list(dict.fromkeys(aliases))
bangumi.title_aliases = json.dumps(unique_aliases, ensure_ascii=False)
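# Round-trip sketch (illustrative); duplicates dropped, order preserved:
#   _set_aliases_list(b, ["Alias A", "Alias B", "Alias A"])
#   b.title_aliases == '["Alias A", "Alias B"]'
#   _get_aliases_list(b)  -> ["Alias A", "Alias B"]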
# Module-level TTL cache for search_all results
_bangumi_cache: list[Bangumi] | None = None
_bangumi_cache_time: float = 0
@@ -26,6 +78,91 @@ class BangumiDatabase:
def __init__(self, session: Session):
self.session = session
def find_semantic_duplicate(self, data: Bangumi) -> Optional[Bangumi]:
"""
Find existing bangumi that semantically matches the new one.
This handles cases where subtitle groups change naming mid-season.
A semantic match requires:
- Same official_title
- Same dpi (resolution)
- Same subtitle type
- Same source
- Similar group_name (one contains the other)
Returns the matching Bangumi if found, None otherwise.
"""
statement = select(Bangumi).where(
and_(
Bangumi.official_title == data.official_title,
Bangumi.deleted == false(),
)
)
candidates = self.session.execute(statement).scalars().all()
for candidate in candidates:
is_exact_duplicate = (
candidate.title_raw == data.title_raw
and candidate.group_name == data.group_name
)
if is_exact_duplicate:
continue
is_semantic_match = (
candidate.dpi == data.dpi
and candidate.subtitle == data.subtitle
and candidate.source == data.source
and _groups_are_similar(candidate.group_name, data.group_name)
)
if is_semantic_match:
logger.debug(
f"[Database] Found semantic duplicate: '{data.title_raw}' matches "
f"existing '{candidate.title_raw}' (official: {data.official_title})"
)
return candidate
return None
def add_title_alias(self, bangumi_id: int, new_title_raw: str) -> bool:
"""
Add a new title_raw alias to an existing bangumi.
This allows a single bangumi entry to match multiple naming patterns.
"""
bangumi = self.session.get(Bangumi, bangumi_id)
if not bangumi:
logger.warning(
f"[Database] Cannot add alias: bangumi id {bangumi_id} not found"
)
return False
# Don't add if it's the same as the main title_raw
if bangumi.title_raw == new_title_raw:
return False
# Get existing aliases and add the new one
aliases = _get_aliases_list(bangumi)
if new_title_raw in aliases:
return False # Already exists
aliases.append(new_title_raw)
_set_aliases_list(bangumi, aliases)
self.session.add(bangumi)
self.session.commit()
_invalidate_bangumi_cache()
logger.info(
f"[Database] Added alias '{new_title_raw}' to bangumi '{bangumi.official_title}' "
f"(id: {bangumi_id})"
)
return True
def get_all_title_patterns(self, bangumi: Bangumi) -> list[str]:
"""Get all title patterns for matching (title_raw + all aliases)."""
patterns = [bangumi.title_raw]
patterns.extend(_get_aliases_list(bangumi))
return patterns
def _is_duplicate(self, data: Bangumi) -> bool:
"""Check if a bangumi rule already exists based on title_raw and group_name."""
statement = select(Bangumi).where(
@@ -43,6 +180,18 @@ class BangumiDatabase:
f"[Database] Skipping duplicate: {data.official_title} ({data.group_name})"
)
return False
# Check for semantic duplicate (same anime, different naming pattern)
semantic_match = self.find_semantic_duplicate(data)
if semantic_match:
# Add as alias instead of creating new entry
self.add_title_alias(semantic_match.id, data.title_raw)
logger.info(
f"[Database] Merged '{data.title_raw}' as alias to existing "
f"'{semantic_match.title_raw}' (official: {data.official_title})"
)
return False # Return False since we didn't add a new entry
self.session.add(data)
self.session.commit()
_invalidate_bangumi_cache()
@@ -70,31 +219,54 @@ class BangumiDatabase:
else:
existing = set()
# Filter out exact duplicates
to_add = [d for d in datas if (d.title_raw, d.group_name) not in existing]
# Check for semantic duplicates and add as aliases
semantic_merged = 0
really_to_add = []
for d in to_add:
semantic_match = self.find_semantic_duplicate(d)
if semantic_match:
# Add as alias instead of creating new entry
self.add_title_alias(semantic_match.id, d.title_raw)
semantic_merged += 1
logger.info(
f"[Database] Merged '{d.title_raw}' as alias to existing "
f"'{semantic_match.title_raw}' (official: {d.official_title})"
)
else:
really_to_add.append(d)
# Also deduplicate within the batch itself
seen = set()
unique_to_add = []
for d in really_to_add:
key = (d.title_raw, d.group_name)
if key not in seen:
seen.add(key)
unique_to_add.append(d)
if not unique_to_add:
if semantic_merged > 0:
logger.debug(
f"[Database] {semantic_merged} bangumi merged as aliases, "
f"rest were duplicates."
)
else:
logger.debug(
f"[Database] All {len(datas)} bangumi already exist, skipping."
)
return 0
self.session.add_all(unique_to_add)
self.session.commit()
_invalidate_bangumi_cache()
skipped = len(datas) - len(unique_to_add) - semantic_merged
if skipped > 0 or semantic_merged > 0:
logger.debug(
f"[Database] Insert {len(unique_to_add)} bangumi, skipped {skipped} duplicates."
f"[Database] Insert {len(unique_to_add)} bangumi, "
f"skipped {skipped} duplicates, merged {semantic_merged} as aliases."
)
else:
logger.debug(
@@ -186,25 +358,19 @@ class BangumiDatabase:
def search_id(self, _id: int) -> Optional[Bangumi]:
statement = select(Bangumi).where(Bangumi.id == _id)
bangumi = self.session.execute(statement).scalar_one_or_none()
if bangumi is None:
logger.warning(f"[Database] Cannot find bangumi id: {_id}.")
return None
logger.debug(f"[Database] Find bangumi id: {_id}.")
return bangumi
def match_poster(self, bangumi_name: str) -> str:
statement = select(Bangumi).where(
func.instr(bangumi_name, Bangumi.official_title) > 0
)
data = self.session.execute(statement).scalar_one_or_none()
return data.poster_link if data else ""
def match_list(self, torrent_list: list, rss_link: str) -> list:
match_datas = self.search_all()
@@ -212,7 +378,14 @@ class BangumiDatabase:
return torrent_list
# Build index for O(1) lookup after regex match
# Include both title_raw and all aliases
title_index: dict[str, Bangumi] = {}
for m in match_datas:
# Add main title_raw
title_index[m.title_raw] = m
# Add all aliases
for alias in _get_aliases_list(m):
title_index[alias] = m
# Build compiled regex pattern for fast substring matching
# Sort by length descending so longer (more specific) matches are found first
@@ -226,12 +399,16 @@ class BangumiDatabase:
for torrent in torrent_list:
match = title_regex.search(torrent.name)
if match:
matched_title = match.group(0)
match_data = title_index[matched_title]
# Use the bangumi's main title_raw for rss_updated tracking
if (
rss_link not in match_data.rss_link
and match_data.title_raw not in rss_updated
):
match_data.rss_link += f",{rss_link}"
match_data.added = False
rss_updated.add(match_data.title_raw)
else:
unmatched.append(torrent)
# Batch commit all rss_link updates
@@ -244,20 +421,32 @@ class BangumiDatabase:
return unmatched
def match_torrent(self, torrent_name: str) -> Optional[Bangumi]:
"""
Match torrent name to a bangumi, checking both title_raw and title_aliases.
Returns the bangumi with the longest matching pattern for specificity.
"""
match_datas = self.search_all()
if not match_datas:
return None
best_match: Optional[Bangumi] = None
best_match_len = 0
for bangumi in match_datas:
if bangumi.deleted:
continue
# Check all patterns (title_raw + aliases)
patterns = self.get_all_title_patterns(bangumi)
for pattern in patterns:
if pattern in torrent_name:
# Prefer longer matches (more specific)
if len(pattern) > best_match_len:
best_match = bangumi
best_match_len = len(pattern)
return best_match
def not_complete(self) -> list[Bangumi]:
condition = select(Bangumi).where(
@@ -285,6 +474,7 @@ class BangumiDatabase:
bangumi.deleted = True
self.session.add(bangumi)
self.session.commit()
_invalidate_bangumi_cache()
logger.debug(f"[Database] Disable rule {bangumi.title_raw}.")
def search_rss(self, rss_link: str) -> list[Bangumi]:

View File

@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
TABLE_MODELS: list[type[SQLModel]] = [Bangumi, RSSItem, Torrent, User, Passkey]
# Increment this when adding new migrations to MIGRATIONS list.
CURRENT_SCHEMA_VERSION = 8
# Each migration is a tuple of (version, description, list of SQL statements).
# Migrations are applied in order. A migration at index i brings the schema
@@ -96,6 +96,13 @@ MIGRATIONS = [
"ALTER TABLE bangumi ADD COLUMN suggested_episode_offset INTEGER DEFAULT NULL",
],
),
(
8,
"add title_aliases for mid-season naming changes",
[
"ALTER TABLE bangumi ADD COLUMN title_aliases TEXT DEFAULT NULL",
],
),
]
@@ -187,6 +194,10 @@ class Database(Session):
columns = [col["name"] for col in inspector.get_columns("bangumi")]
if "suggested_season_offset" in columns:
needs_run = False
if "bangumi" in tables and version == 8:
columns = [col["name"] for col in inspector.get_columns("bangumi")]
if "title_aliases" in columns:
needs_run = False
if needs_run:
with self.engine.connect() as conn:
for stmt in statements:

View File

@@ -46,6 +46,9 @@ class Bangumi(SQLModel, table=True):
suggested_episode_offset: Optional[int] = Field(
default=None, alias="suggested_episode_offset", title="建议集数偏移"
)
title_aliases: Optional[str] = Field(
default=None, alias="title_aliases", title="标题别名"
) # JSON list: ["alt_title_1", "alt_title_2"]
class BangumiUpdate(SQLModel):
@@ -78,6 +81,9 @@ class BangumiUpdate(SQLModel):
needs_review_reason: Optional[str] = Field(
default=None, alias="needs_review_reason", title="检查原因"
)
title_aliases: Optional[str] = Field(
default=None, alias="title_aliases", title="标题别名"
)
class Notification(BaseModel):

View File

@@ -1,3 +1,5 @@
import json
import pytest
from sqlmodel import Session, SQLModel, create_engine
@@ -208,7 +210,9 @@ def test_torrent_qb_hash_index_efficient(db_session):
# Add multiple torrents
torrents = [
Torrent(name=f"Torrent {i}", url=f"https://example.com/{i}", qb_hash=f"hash_{i}")
Torrent(
name=f"Torrent {i}", url=f"https://example.com/{i}", qb_hash=f"hash_{i}"
)
for i in range(10)
]
db.add_all(torrents)
@@ -225,3 +229,293 @@ def test_torrent_qb_hash_index_efficient(db_session):
# Non-existent hash
result = db.search_by_qb_hash("hash_100")
assert result is None
# ============================================================
# Title Alias Tests - for mid-season naming change handling
# ============================================================
def test_add_title_alias(db_session):
"""Test adding a title alias to an existing bangumi."""
db = BangumiDatabase(db_session)
bangumi = Bangumi(
official_title="Test Anime",
title_raw="Test Anime S1",
group_name="TestGroup",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test",
)
db.add(bangumi)
bangumi_id = db.search_all()[0].id
# Add an alias
result = db.add_title_alias(bangumi_id, "Test Anime Season 1")
assert result is True
# Verify alias was added
updated = db.search_id(bangumi_id)
assert updated.title_aliases is not None
aliases = json.loads(updated.title_aliases)
assert "Test Anime Season 1" in aliases
def test_add_title_alias_duplicate(db_session):
"""Test that adding the same alias twice is a no-op."""
db = BangumiDatabase(db_session)
bangumi = Bangumi(
official_title="Test Anime",
title_raw="Test Anime S1",
group_name="TestGroup",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test",
)
db.add(bangumi)
bangumi_id = db.search_all()[0].id
# Add same alias twice
db.add_title_alias(bangumi_id, "Test Anime Season 1")
result = db.add_title_alias(bangumi_id, "Test Anime Season 1")
assert result is False # Second add should be a no-op
def test_add_title_alias_same_as_title_raw(db_session):
"""Test that adding title_raw as alias is a no-op."""
db = BangumiDatabase(db_session)
bangumi = Bangumi(
official_title="Test Anime",
title_raw="Test Anime S1",
group_name="TestGroup",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test",
)
db.add(bangumi)
bangumi_id = db.search_all()[0].id
result = db.add_title_alias(bangumi_id, "Test Anime S1")
assert result is False
def test_match_torrent_with_alias(db_session):
"""Test that match_torrent finds bangumi using aliases."""
db = BangumiDatabase(db_session)
bangumi = Bangumi(
official_title="Test Anime",
title_raw="Test Anime S1",
group_name="TestGroup",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test",
deleted=False,
)
db.add(bangumi)
bangumi_id = db.search_all()[0].id
# Add alias
db.add_title_alias(bangumi_id, "Test Anime Season 1")
# Match using title_raw
result = db.match_torrent("[TestGroup] Test Anime S1 - 01.mkv")
assert result is not None
assert result.official_title == "Test Anime"
# Match using alias
result = db.match_torrent("[TestGroup] Test Anime Season 1 - 01.mkv")
assert result is not None
assert result.official_title == "Test Anime"
def test_find_semantic_duplicate_same_official_title(db_session):
"""Test finding semantic duplicates with same official title."""
db = BangumiDatabase(db_session)
# Add first bangumi
bangumi1 = Bangumi(
official_title="Frieren",
title_raw="Sousou no Frieren",
group_name="LoliHouse",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test1",
)
db.add(bangumi1)
# Create a semantically similar bangumi (same anime, group changed naming)
bangumi2 = Bangumi(
official_title="Frieren",
title_raw="Frieren Beyond Journey's End", # Different title_raw
group_name="LoliHouse&动漫国", # Group changed mid-season
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test2",
)
# Should find semantic duplicate
result = db.find_semantic_duplicate(bangumi2)
assert result is not None
assert result.title_raw == "Sousou no Frieren"
def test_find_semantic_duplicate_no_match_different_resolution(db_session):
"""Test that different resolution is NOT a semantic match."""
db = BangumiDatabase(db_session)
bangumi1 = Bangumi(
official_title="Frieren",
title_raw="Sousou no Frieren",
group_name="LoliHouse",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test1",
)
db.add(bangumi1)
# Same anime but different resolution - should NOT be semantic duplicate
bangumi2 = Bangumi(
official_title="Frieren",
title_raw="Sousou no Frieren 4K",
group_name="LoliHouse",
dpi="2160p", # Different resolution
source="Web",
subtitle="CHT",
rss_link="test2",
)
result = db.find_semantic_duplicate(bangumi2)
assert result is None
def test_add_with_semantic_duplicate_creates_alias(db_session):
"""Test that adding a semantic duplicate creates an alias instead."""
db = BangumiDatabase(db_session)
# Add first bangumi
bangumi1 = Bangumi(
official_title="Frieren",
title_raw="Sousou no Frieren",
group_name="LoliHouse",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test1",
)
db.add(bangumi1)
initial_count = len(db.search_all())
assert initial_count == 1
# Try to add semantic duplicate
bangumi2 = Bangumi(
official_title="Frieren",
title_raw="Frieren Beyond Journey's End",
group_name="LoliHouse&动漫国",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test2",
)
result = db.add(bangumi2)
assert result is False # Should not add new entry
# Count should still be 1
final_count = len(db.search_all())
assert final_count == 1
# But the new title_raw should be an alias
original = db.search_all()[0]
aliases = json.loads(original.title_aliases) if original.title_aliases else []
assert "Frieren Beyond Journey's End" in aliases
def test_groups_are_similar():
"""Test group name similarity detection."""
from module.database.bangumi import _groups_are_similar
# Exact match
assert _groups_are_similar("LoliHouse", "LoliHouse") is True
# Substring match (one contains the other)
assert _groups_are_similar("LoliHouse", "LoliHouse&动漫国字幕组") is True
assert _groups_are_similar("LoliHouse&动漫国字幕组", "LoliHouse") is True
# Completely different groups
assert _groups_are_similar("LoliHouse", "Sakurato") is False
assert _groups_are_similar("字幕组A", "字幕组B") is False
# Edge cases
assert _groups_are_similar(None, "LoliHouse") is False
assert _groups_are_similar("LoliHouse", None) is False
assert _groups_are_similar(None, None) is False
def test_get_all_title_patterns(db_session):
"""Test getting all title patterns for a bangumi."""
db = BangumiDatabase(db_session)
bangumi = Bangumi(
official_title="Test Anime",
title_raw="Test Anime S1",
group_name="TestGroup",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="test",
)
db.add(bangumi)
bangumi_id = db.search_all()[0].id
# Add aliases
db.add_title_alias(bangumi_id, "Test Anime Season 1")
db.add_title_alias(bangumi_id, "TA S1")
# Get all patterns
updated = db.search_id(bangumi_id)
patterns = db.get_all_title_patterns(updated)
assert len(patterns) == 3
assert "Test Anime S1" in patterns
assert "Test Anime Season 1" in patterns
assert "TA S1" in patterns
def test_match_list_with_aliases(db_session):
"""Test match_list works with aliases."""
db = BangumiDatabase(db_session)
bangumi = Bangumi(
official_title="Test Anime",
title_raw="Test Anime S1",
group_name="TestGroup",
dpi="1080p",
source="Web",
subtitle="CHT",
rss_link="rss1",
)
db.add(bangumi)
bangumi_id = db.search_all()[0].id
db.add_title_alias(bangumi_id, "Test Anime Season 1")
# Create torrents with different naming patterns
torrents = [
Torrent(name="[TestGroup] Test Anime S1 - 01.mkv", url="url1"),
Torrent(name="[TestGroup] Test Anime Season 1 - 02.mkv", url="url2"),
Torrent(name="[OtherGroup] Different Anime - 01.mkv", url="url3"),
]
# Only the third torrent should be unmatched
unmatched = db.match_list(torrents, "rss2")
assert len(unmatched) == 1
assert unmatched[0].name == "[OtherGroup] Different Anime - 01.mkv"