mirror of
https://github.com/EstrellaXD/Auto_Bangumi.git
synced 2026-04-26 03:21:21 +08:00
feat(database): add title alias system for mid-season naming changes
When subtitle groups change their naming convention mid-season (e.g., "LoliHouse" → "LoliHouse&动漫国"), AutoBangumi was creating duplicate entries. This adds a title alias system that: - Detects semantic duplicates (same official_title, dpi, subtitle, source, and similar group name) - Merges them as aliases instead of creating new entries - Updates match_torrent() and match_list() to check aliases - Adds title_aliases field to Bangumi model (JSON list) - Includes migration v8 for the new column - Adds 10 new tests for the feature - Fixes cache invalidation bug in disable_rule() Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,4 @@
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
@@ -10,6 +11,57 @@ from module.models import Bangumi, BangumiUpdate
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _normalize_group_name(group: str | None) -> str:
|
||||
"""Normalize group name for comparison by removing common separators."""
|
||||
if not group:
|
||||
return ""
|
||||
# Remove common separators (&, ×, _, -) and normalize to lowercase
|
||||
return re.sub(r"[&×_\-]", "", group).lower().strip()
|
||||
|
||||
|
||||
def _groups_are_similar(group1: str | None, group2: str | None) -> bool:
    """
    Check if two group names are similar enough to be considered the same group.

    Handles cases like:
    - "LoliHouse" vs "LoliHouse&动漫国字幕组"
    - "字幕组A" vs "字幕组A×字幕组B"

    Returns False when either name is missing or carries no comparable content.
    """
    if not group1 or not group2:
        return False

    # Exact match or substring match (one contains the other)
    if group1 == group2 or group1 in group2 or group2 in group1:
        return True

    # Normalized comparison - check if core group names overlap
    norm1 = _normalize_group_name(group1)
    norm2 = _normalize_group_name(group2)
    # Bug fix: a name made only of separator characters normalizes to "",
    # and "" is a substring of every string, so it would spuriously match
    # any other group. Require both normalized names to be non-empty.
    if not norm1 or not norm2:
        return False
    return norm1 in norm2 or norm2 in norm1
|
||||
|
||||
|
||||
def _get_aliases_list(bangumi: Bangumi) -> list[str]:
|
||||
"""Get the list of title aliases from a bangumi's title_aliases JSON field."""
|
||||
if not bangumi.title_aliases:
|
||||
return []
|
||||
try:
|
||||
aliases = json.loads(bangumi.title_aliases)
|
||||
return aliases if isinstance(aliases, list) else []
|
||||
except (json.JSONDecodeError, TypeError):
|
||||
return []
|
||||
|
||||
|
||||
def _set_aliases_list(bangumi: Bangumi, aliases: list[str]) -> None:
|
||||
"""Set the title aliases JSON field from a list."""
|
||||
if not aliases:
|
||||
bangumi.title_aliases = None
|
||||
else:
|
||||
# Remove duplicates while preserving order
|
||||
unique_aliases = list(dict.fromkeys(aliases))
|
||||
bangumi.title_aliases = json.dumps(unique_aliases, ensure_ascii=False)
|
||||
|
||||
|
||||
# Module-level TTL cache for search_all results
|
||||
_bangumi_cache: list[Bangumi] | None = None
|
||||
_bangumi_cache_time: float = 0
|
||||
@@ -26,6 +78,91 @@ class BangumiDatabase:
|
||||
def __init__(self, session: Session):
|
||||
self.session = session
|
||||
|
||||
def find_semantic_duplicate(self, data: Bangumi) -> Optional[Bangumi]:
    """
    Find existing bangumi that semantically matches the new one.

    This handles cases where subtitle groups change naming mid-season.
    A semantic match requires:
    - Same official_title
    - Same dpi (resolution)
    - Same subtitle type
    - Same source
    - Similar group_name (one contains the other)

    Returns the matching Bangumi if found, None otherwise.
    """
    rows = self.session.execute(
        select(Bangumi).where(
            and_(
                Bangumi.official_title == data.official_title,
                Bangumi.deleted == false(),
            )
        )
    ).scalars().all()

    for existing in rows:
        # An exact (title_raw, group_name) duplicate is handled by the
        # regular duplicate check; it is not a "semantic" duplicate.
        if (
            existing.title_raw == data.title_raw
            and existing.group_name == data.group_name
        ):
            continue

        same_release_profile = (
            existing.dpi == data.dpi
            and existing.subtitle == data.subtitle
            and existing.source == data.source
        )
        if same_release_profile and _groups_are_similar(
            existing.group_name, data.group_name
        ):
            logger.debug(
                f"[Database] Found semantic duplicate: '{data.title_raw}' matches "
                f"existing '{existing.title_raw}' (official: {data.official_title})"
            )
            return existing

    return None
|
||||
|
||||
def add_title_alias(self, bangumi_id: int, new_title_raw: str) -> bool:
    """
    Add a new title_raw alias to an existing bangumi.

    This allows a single bangumi entry to match multiple naming patterns.
    Returns True only when a new alias was actually persisted.
    """
    bangumi = self.session.get(Bangumi, bangumi_id)
    if bangumi is None:
        logger.warning(
            f"[Database] Cannot add alias: bangumi id {bangumi_id} not found"
        )
        return False

    current = _get_aliases_list(bangumi)
    # No-op when the alias adds no information: it either equals the main
    # title_raw or is already recorded.
    if new_title_raw == bangumi.title_raw or new_title_raw in current:
        return False

    _set_aliases_list(bangumi, current + [new_title_raw])

    self.session.add(bangumi)
    self.session.commit()
    # search_all() results are cached module-wide; drop them after a write.
    _invalidate_bangumi_cache()
    logger.info(
        f"[Database] Added alias '{new_title_raw}' to bangumi '{bangumi.official_title}' "
        f"(id: {bangumi_id})"
    )
    return True
|
||||
|
||||
def get_all_title_patterns(self, bangumi: Bangumi) -> list[str]:
    """Get all title patterns for matching (title_raw + all aliases)."""
    # The main title_raw always comes first; aliases follow in stored order.
    return [bangumi.title_raw, *_get_aliases_list(bangumi)]
|
||||
|
||||
def _is_duplicate(self, data: Bangumi) -> bool:
|
||||
"""Check if a bangumi rule already exists based on title_raw and group_name."""
|
||||
statement = select(Bangumi).where(
|
||||
@@ -43,6 +180,18 @@ class BangumiDatabase:
|
||||
f"[Database] Skipping duplicate: {data.official_title} ({data.group_name})"
|
||||
)
|
||||
return False
|
||||
|
||||
# Check for semantic duplicate (same anime, different naming pattern)
|
||||
semantic_match = self.find_semantic_duplicate(data)
|
||||
if semantic_match:
|
||||
# Add as alias instead of creating new entry
|
||||
self.add_title_alias(semantic_match.id, data.title_raw)
|
||||
logger.info(
|
||||
f"[Database] Merged '{data.title_raw}' as alias to existing "
|
||||
f"'{semantic_match.title_raw}' (official: {data.official_title})"
|
||||
)
|
||||
return False # Return False since we didn't add a new entry
|
||||
|
||||
self.session.add(data)
|
||||
self.session.commit()
|
||||
_invalidate_bangumi_cache()
|
||||
@@ -70,31 +219,54 @@ class BangumiDatabase:
|
||||
else:
|
||||
existing = set()
|
||||
|
||||
# Filter out duplicates
|
||||
# Filter out exact duplicates
|
||||
to_add = [d for d in datas if (d.title_raw, d.group_name) not in existing]
|
||||
|
||||
# Check for semantic duplicates and add as aliases
|
||||
semantic_merged = 0
|
||||
really_to_add = []
|
||||
for d in to_add:
|
||||
semantic_match = self.find_semantic_duplicate(d)
|
||||
if semantic_match:
|
||||
# Add as alias instead of creating new entry
|
||||
self.add_title_alias(semantic_match.id, d.title_raw)
|
||||
semantic_merged += 1
|
||||
logger.info(
|
||||
f"[Database] Merged '{d.title_raw}' as alias to existing "
|
||||
f"'{semantic_match.title_raw}' (official: {d.official_title})"
|
||||
)
|
||||
else:
|
||||
really_to_add.append(d)
|
||||
|
||||
# Also deduplicate within the batch itself
|
||||
seen = set()
|
||||
unique_to_add = []
|
||||
for d in to_add:
|
||||
for d in really_to_add:
|
||||
key = (d.title_raw, d.group_name)
|
||||
if key not in seen:
|
||||
seen.add(key)
|
||||
unique_to_add.append(d)
|
||||
|
||||
if not unique_to_add:
|
||||
logger.debug(
|
||||
f"[Database] All {len(datas)} bangumi already exist, skipping."
|
||||
)
|
||||
if semantic_merged > 0:
|
||||
logger.debug(
|
||||
f"[Database] {semantic_merged} bangumi merged as aliases, "
|
||||
f"rest were duplicates."
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
f"[Database] All {len(datas)} bangumi already exist, skipping."
|
||||
)
|
||||
return 0
|
||||
|
||||
self.session.add_all(unique_to_add)
|
||||
self.session.commit()
|
||||
_invalidate_bangumi_cache()
|
||||
skipped = len(datas) - len(unique_to_add)
|
||||
if skipped > 0:
|
||||
skipped = len(datas) - len(unique_to_add) - semantic_merged
|
||||
if skipped > 0 or semantic_merged > 0:
|
||||
logger.debug(
|
||||
f"[Database] Insert {len(unique_to_add)} bangumi, skipped {skipped} duplicates."
|
||||
f"[Database] Insert {len(unique_to_add)} bangumi, "
|
||||
f"skipped {skipped} duplicates, merged {semantic_merged} as aliases."
|
||||
)
|
||||
else:
|
||||
logger.debug(
|
||||
@@ -186,25 +358,19 @@ class BangumiDatabase:
|
||||
|
||||
def search_id(self, _id: int) -> Optional[Bangumi]:
    """Look up a bangumi by primary key; warn and return None when absent."""
    found = self.session.execute(
        select(Bangumi).where(Bangumi.id == _id)
    ).scalar_one_or_none()
    if found is None:
        logger.warning(f"[Database] Cannot find bangumi id: {_id}.")
        return None
    logger.debug(f"[Database] Find bangumi id: {_id}.")
    return found
|
||||
|
||||
def match_poster(self, bangumi_name: str) -> str:
    """Return the poster link of a bangumi whose official_title appears in *bangumi_name*, or ""."""
    # instr(haystack, needle) > 0 means official_title is a substring of the name.
    statement = select(Bangumi).where(
        func.instr(bangumi_name, Bangumi.official_title) > 0
    )
    found = self.session.execute(statement).scalar_one_or_none()
    if found is None:
        return ""
    return found.poster_link
|
||||
|
||||
def match_list(self, torrent_list: list, rss_link: str) -> list:
|
||||
match_datas = self.search_all()
|
||||
@@ -212,7 +378,14 @@ class BangumiDatabase:
|
||||
return torrent_list
|
||||
|
||||
# Build index for O(1) lookup after regex match
|
||||
title_index = {m.title_raw: m for m in match_datas}
|
||||
# Include both title_raw and all aliases
|
||||
title_index: dict[str, Bangumi] = {}
|
||||
for m in match_datas:
|
||||
# Add main title_raw
|
||||
title_index[m.title_raw] = m
|
||||
# Add all aliases
|
||||
for alias in _get_aliases_list(m):
|
||||
title_index[alias] = m
|
||||
|
||||
# Build compiled regex pattern for fast substring matching
|
||||
# Sort by length descending so longer (more specific) matches are found first
|
||||
@@ -226,12 +399,16 @@ class BangumiDatabase:
|
||||
for torrent in torrent_list:
|
||||
match = title_regex.search(torrent.name)
|
||||
if match:
|
||||
title_raw = match.group(0)
|
||||
match_data = title_index[title_raw]
|
||||
if rss_link not in match_data.rss_link and title_raw not in rss_updated:
|
||||
matched_title = match.group(0)
|
||||
match_data = title_index[matched_title]
|
||||
# Use the bangumi's main title_raw for rss_updated tracking
|
||||
if (
|
||||
rss_link not in match_data.rss_link
|
||||
and match_data.title_raw not in rss_updated
|
||||
):
|
||||
match_data.rss_link += f",{rss_link}"
|
||||
match_data.added = False
|
||||
rss_updated.add(title_raw)
|
||||
rss_updated.add(match_data.title_raw)
|
||||
else:
|
||||
unmatched.append(torrent)
|
||||
# Batch commit all rss_link updates
|
||||
@@ -244,20 +421,32 @@ class BangumiDatabase:
|
||||
return unmatched
|
||||
|
||||
def match_torrent(self, torrent_name: str) -> Optional[Bangumi]:
    """
    Match torrent name to a bangumi, checking both title_raw and title_aliases.

    Returns the bangumi with the longest matching pattern for specificity.
    """
    candidates = self.search_all()
    if not candidates:
        return None

    best: Optional[Bangumi] = None
    best_len = 0
    for bangumi in candidates:
        # search_all may include soft-deleted rows; never match them.
        if bangumi.deleted:
            continue
        # Consider the main title_raw and every stored alias; keep the
        # longest substring hit so more specific patterns win.
        for pattern in self.get_all_title_patterns(bangumi):
            if pattern in torrent_name and len(pattern) > best_len:
                best = bangumi
                best_len = len(pattern)
    return best
|
||||
|
||||
def not_complete(self) -> list[Bangumi]:
|
||||
condition = select(Bangumi).where(
|
||||
@@ -285,6 +474,7 @@ class BangumiDatabase:
|
||||
bangumi.deleted = True
|
||||
self.session.add(bangumi)
|
||||
self.session.commit()
|
||||
_invalidate_bangumi_cache()
|
||||
logger.debug(f"[Database] Disable rule {bangumi.title_raw}.")
|
||||
|
||||
def search_rss(self, rss_link: str) -> list[Bangumi]:
|
||||
|
||||
@@ -23,7 +23,7 @@ logger = logging.getLogger(__name__)
|
||||
TABLE_MODELS: list[type[SQLModel]] = [Bangumi, RSSItem, Torrent, User, Passkey]
|
||||
|
||||
# Increment this when adding new migrations to MIGRATIONS list.
|
||||
CURRENT_SCHEMA_VERSION = 7
|
||||
CURRENT_SCHEMA_VERSION = 8
|
||||
|
||||
# Each migration is a tuple of (version, description, list of SQL statements).
|
||||
# Migrations are applied in order. A migration at index i brings the schema
|
||||
@@ -96,6 +96,13 @@ MIGRATIONS = [
|
||||
"ALTER TABLE bangumi ADD COLUMN suggested_episode_offset INTEGER DEFAULT NULL",
|
||||
],
|
||||
),
|
||||
(
|
||||
8,
|
||||
"add title_aliases for mid-season naming changes",
|
||||
[
|
||||
"ALTER TABLE bangumi ADD COLUMN title_aliases TEXT DEFAULT NULL",
|
||||
],
|
||||
),
|
||||
]
|
||||
|
||||
|
||||
@@ -187,6 +194,10 @@ class Database(Session):
|
||||
columns = [col["name"] for col in inspector.get_columns("bangumi")]
|
||||
if "suggested_season_offset" in columns:
|
||||
needs_run = False
|
||||
if "bangumi" in tables and version == 8:
|
||||
columns = [col["name"] for col in inspector.get_columns("bangumi")]
|
||||
if "title_aliases" in columns:
|
||||
needs_run = False
|
||||
if needs_run:
|
||||
with self.engine.connect() as conn:
|
||||
for stmt in statements:
|
||||
|
||||
@@ -46,6 +46,9 @@ class Bangumi(SQLModel, table=True):
|
||||
suggested_episode_offset: Optional[int] = Field(
|
||||
default=None, alias="suggested_episode_offset", title="建议集数偏移"
|
||||
)
|
||||
title_aliases: Optional[str] = Field(
|
||||
default=None, alias="title_aliases", title="标题别名"
|
||||
) # JSON list: ["alt_title_1", "alt_title_2"]
|
||||
|
||||
|
||||
class BangumiUpdate(SQLModel):
|
||||
@@ -78,6 +81,9 @@ class BangumiUpdate(SQLModel):
|
||||
needs_review_reason: Optional[str] = Field(
|
||||
default=None, alias="needs_review_reason", title="检查原因"
|
||||
)
|
||||
title_aliases: Optional[str] = Field(
|
||||
default=None, alias="title_aliases", title="标题别名"
|
||||
)
|
||||
|
||||
|
||||
class Notification(BaseModel):
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import json
|
||||
|
||||
import pytest
|
||||
from sqlmodel import Session, SQLModel, create_engine
|
||||
|
||||
@@ -208,7 +210,9 @@ def test_torrent_qb_hash_index_efficient(db_session):
|
||||
|
||||
# Add multiple torrents
|
||||
torrents = [
|
||||
Torrent(name=f"Torrent {i}", url=f"https://example.com/{i}", qb_hash=f"hash_{i}")
|
||||
Torrent(
|
||||
name=f"Torrent {i}", url=f"https://example.com/{i}", qb_hash=f"hash_{i}"
|
||||
)
|
||||
for i in range(10)
|
||||
]
|
||||
db.add_all(torrents)
|
||||
@@ -225,3 +229,293 @@ def test_torrent_qb_hash_index_efficient(db_session):
|
||||
# Non-existent hash
|
||||
result = db.search_by_qb_hash("hash_100")
|
||||
assert result is None
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Title Alias Tests - for mid-season naming change handling
|
||||
# ============================================================
|
||||
|
||||
|
||||
def test_add_title_alias(db_session):
    """Test adding a title alias to an existing bangumi."""
    db = BangumiDatabase(db_session)

    bangumi = Bangumi(
        official_title="Test Anime",
        title_raw="Test Anime S1",
        group_name="TestGroup",
        dpi="1080p",
        source="Web",
        subtitle="CHT",
        rss_link="test",
    )
    db.add(bangumi)
    # Re-read to pick up the database-assigned primary key.
    bangumi_id = db.search_all()[0].id

    # Add an alias
    result = db.add_title_alias(bangumi_id, "Test Anime Season 1")
    assert result is True

    # Verify alias was added — the column stores a JSON-encoded list of strings.
    updated = db.search_id(bangumi_id)
    assert updated.title_aliases is not None
    aliases = json.loads(updated.title_aliases)
    assert "Test Anime Season 1" in aliases
|
||||
|
||||
|
||||
def test_add_title_alias_duplicate(db_session):
|
||||
"""Test that adding the same alias twice is a no-op."""
|
||||
db = BangumiDatabase(db_session)
|
||||
|
||||
bangumi = Bangumi(
|
||||
official_title="Test Anime",
|
||||
title_raw="Test Anime S1",
|
||||
group_name="TestGroup",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test",
|
||||
)
|
||||
db.add(bangumi)
|
||||
bangumi_id = db.search_all()[0].id
|
||||
|
||||
# Add same alias twice
|
||||
db.add_title_alias(bangumi_id, "Test Anime Season 1")
|
||||
result = db.add_title_alias(bangumi_id, "Test Anime Season 1")
|
||||
assert result is False # Second add should be a no-op
|
||||
|
||||
|
||||
def test_add_title_alias_same_as_title_raw(db_session):
|
||||
"""Test that adding title_raw as alias is a no-op."""
|
||||
db = BangumiDatabase(db_session)
|
||||
|
||||
bangumi = Bangumi(
|
||||
official_title="Test Anime",
|
||||
title_raw="Test Anime S1",
|
||||
group_name="TestGroup",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test",
|
||||
)
|
||||
db.add(bangumi)
|
||||
bangumi_id = db.search_all()[0].id
|
||||
|
||||
result = db.add_title_alias(bangumi_id, "Test Anime S1")
|
||||
assert result is False
|
||||
|
||||
|
||||
def test_match_torrent_with_alias(db_session):
|
||||
"""Test that match_torrent finds bangumi using aliases."""
|
||||
db = BangumiDatabase(db_session)
|
||||
|
||||
bangumi = Bangumi(
|
||||
official_title="Test Anime",
|
||||
title_raw="Test Anime S1",
|
||||
group_name="TestGroup",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test",
|
||||
deleted=False,
|
||||
)
|
||||
db.add(bangumi)
|
||||
bangumi_id = db.search_all()[0].id
|
||||
|
||||
# Add alias
|
||||
db.add_title_alias(bangumi_id, "Test Anime Season 1")
|
||||
|
||||
# Match using title_raw
|
||||
result = db.match_torrent("[TestGroup] Test Anime S1 - 01.mkv")
|
||||
assert result is not None
|
||||
assert result.official_title == "Test Anime"
|
||||
|
||||
# Match using alias
|
||||
result = db.match_torrent("[TestGroup] Test Anime Season 1 - 01.mkv")
|
||||
assert result is not None
|
||||
assert result.official_title == "Test Anime"
|
||||
|
||||
|
||||
def test_find_semantic_duplicate_same_official_title(db_session):
|
||||
"""Test finding semantic duplicates with same official title."""
|
||||
db = BangumiDatabase(db_session)
|
||||
|
||||
# Add first bangumi
|
||||
bangumi1 = Bangumi(
|
||||
official_title="Frieren",
|
||||
title_raw="Sousou no Frieren",
|
||||
group_name="LoliHouse",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test1",
|
||||
)
|
||||
db.add(bangumi1)
|
||||
|
||||
# Create a semantically similar bangumi (same anime, group changed naming)
|
||||
bangumi2 = Bangumi(
|
||||
official_title="Frieren",
|
||||
title_raw="Frieren Beyond Journey's End", # Different title_raw
|
||||
group_name="LoliHouse&动漫国", # Group changed mid-season
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test2",
|
||||
)
|
||||
|
||||
# Should find semantic duplicate
|
||||
result = db.find_semantic_duplicate(bangumi2)
|
||||
assert result is not None
|
||||
assert result.title_raw == "Sousou no Frieren"
|
||||
|
||||
|
||||
def test_find_semantic_duplicate_no_match_different_resolution(db_session):
|
||||
"""Test that different resolution is NOT a semantic match."""
|
||||
db = BangumiDatabase(db_session)
|
||||
|
||||
bangumi1 = Bangumi(
|
||||
official_title="Frieren",
|
||||
title_raw="Sousou no Frieren",
|
||||
group_name="LoliHouse",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test1",
|
||||
)
|
||||
db.add(bangumi1)
|
||||
|
||||
# Same anime but different resolution - should NOT be semantic duplicate
|
||||
bangumi2 = Bangumi(
|
||||
official_title="Frieren",
|
||||
title_raw="Sousou no Frieren 4K",
|
||||
group_name="LoliHouse",
|
||||
dpi="2160p", # Different resolution
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test2",
|
||||
)
|
||||
|
||||
result = db.find_semantic_duplicate(bangumi2)
|
||||
assert result is None
|
||||
|
||||
|
||||
def test_add_with_semantic_duplicate_creates_alias(db_session):
|
||||
"""Test that adding a semantic duplicate creates an alias instead."""
|
||||
db = BangumiDatabase(db_session)
|
||||
|
||||
# Add first bangumi
|
||||
bangumi1 = Bangumi(
|
||||
official_title="Frieren",
|
||||
title_raw="Sousou no Frieren",
|
||||
group_name="LoliHouse",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test1",
|
||||
)
|
||||
db.add(bangumi1)
|
||||
initial_count = len(db.search_all())
|
||||
assert initial_count == 1
|
||||
|
||||
# Try to add semantic duplicate
|
||||
bangumi2 = Bangumi(
|
||||
official_title="Frieren",
|
||||
title_raw="Frieren Beyond Journey's End",
|
||||
group_name="LoliHouse&动漫国",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test2",
|
||||
)
|
||||
result = db.add(bangumi2)
|
||||
assert result is False # Should not add new entry
|
||||
|
||||
# Count should still be 1
|
||||
final_count = len(db.search_all())
|
||||
assert final_count == 1
|
||||
|
||||
# But the new title_raw should be an alias
|
||||
original = db.search_all()[0]
|
||||
aliases = json.loads(original.title_aliases) if original.title_aliases else []
|
||||
assert "Frieren Beyond Journey's End" in aliases
|
||||
|
||||
|
||||
def test_groups_are_similar():
    """Test group name similarity detection.

    Pure unit test for the module-level helper; no database fixture needed.
    """
    from module.database.bangumi import _groups_are_similar

    # Exact match
    assert _groups_are_similar("LoliHouse", "LoliHouse") is True

    # Substring match (one contains the other) — the mid-season
    # "group joined a collaboration" case this feature targets.
    assert _groups_are_similar("LoliHouse", "LoliHouse&动漫国字幕组") is True
    assert _groups_are_similar("LoliHouse&动漫国字幕组", "LoliHouse") is True

    # Completely different groups
    assert _groups_are_similar("LoliHouse", "Sakurato") is False
    assert _groups_are_similar("字幕组A", "字幕组B") is False

    # Edge cases: missing names never match
    assert _groups_are_similar(None, "LoliHouse") is False
    assert _groups_are_similar("LoliHouse", None) is False
    assert _groups_are_similar(None, None) is False
|
||||
|
||||
|
||||
def test_get_all_title_patterns(db_session):
|
||||
"""Test getting all title patterns for a bangumi."""
|
||||
db = BangumiDatabase(db_session)
|
||||
|
||||
bangumi = Bangumi(
|
||||
official_title="Test Anime",
|
||||
title_raw="Test Anime S1",
|
||||
group_name="TestGroup",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="test",
|
||||
)
|
||||
db.add(bangumi)
|
||||
bangumi_id = db.search_all()[0].id
|
||||
|
||||
# Add aliases
|
||||
db.add_title_alias(bangumi_id, "Test Anime Season 1")
|
||||
db.add_title_alias(bangumi_id, "TA S1")
|
||||
|
||||
# Get all patterns
|
||||
updated = db.search_id(bangumi_id)
|
||||
patterns = db.get_all_title_patterns(updated)
|
||||
|
||||
assert len(patterns) == 3
|
||||
assert "Test Anime S1" in patterns
|
||||
assert "Test Anime Season 1" in patterns
|
||||
assert "TA S1" in patterns
|
||||
|
||||
|
||||
def test_match_list_with_aliases(db_session):
|
||||
"""Test match_list works with aliases."""
|
||||
db = BangumiDatabase(db_session)
|
||||
|
||||
bangumi = Bangumi(
|
||||
official_title="Test Anime",
|
||||
title_raw="Test Anime S1",
|
||||
group_name="TestGroup",
|
||||
dpi="1080p",
|
||||
source="Web",
|
||||
subtitle="CHT",
|
||||
rss_link="rss1",
|
||||
)
|
||||
db.add(bangumi)
|
||||
bangumi_id = db.search_all()[0].id
|
||||
db.add_title_alias(bangumi_id, "Test Anime Season 1")
|
||||
|
||||
# Create torrents with different naming patterns
|
||||
torrents = [
|
||||
Torrent(name="[TestGroup] Test Anime S1 - 01.mkv", url="url1"),
|
||||
Torrent(name="[TestGroup] Test Anime Season 1 - 02.mkv", url="url2"),
|
||||
Torrent(name="[OtherGroup] Different Anime - 01.mkv", url="url3"),
|
||||
]
|
||||
|
||||
# Only the third torrent should be unmatched
|
||||
unmatched = db.match_list(torrents, "rss2")
|
||||
assert len(unmatched) == 1
|
||||
assert unmatched[0].name == "[OtherGroup] Different Anime - 01.mkv"
|
||||
|
||||
Reference in New Issue
Block a user