perf(database): optimize N+1 queries and add caching

- Replace N individual _is_duplicate() calls with single batch SELECT query
  in add_all() method, reducing database round-trips
- Replace O(n*m) nested loop in match_list() with compiled regex alternation
  pattern for faster torrent-to-bangumi matching
- Add LRU cache (512 entries) to torrent_parser() to avoid redundant regex
  parsing for the same torrent paths
- Extend bangumi search_all() cache TTL from 60s to 300s (5 minutes)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: EstrellaXD
Date: 2026-01-26 14:30:16 +01:00
Parent: ebd58531b5
Commit: d6e89f62ed
2 changed files with 67 additions and 21 deletions


@@ -1,4 +1,5 @@
 import logging
+import re
 import time
 from typing import Optional
@@ -12,7 +13,7 @@ logger = logging.getLogger(__name__)
 # Module-level TTL cache for search_all results
 _bangumi_cache: list[Bangumi] | None = None
 _bangumi_cache_time: float = 0
-_BANGUMI_CACHE_TTL: float = 60.0  # seconds
+_BANGUMI_CACHE_TTL: float = 300.0  # 5 minutes - extended from 60s to reduce DB queries
 def _invalidate_bangumi_cache():
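
For context (the diff does not show the search_all() body): a module-level TTL cache like this is typically read along the following lines. This is a minimal sketch, not the repository's actual code; the query shape and session API are assumptions.

def search_all(self) -> list[Bangumi]:
    global _bangumi_cache, _bangumi_cache_time
    # Serve the cached list while it is younger than the TTL (now 300s)
    if _bangumi_cache is not None and time.time() - _bangumi_cache_time < _BANGUMI_CACHE_TTL:
        return _bangumi_cache
    # Stale or empty: refresh from the database and restamp the cache
    _bangumi_cache = self.session.execute(select(Bangumi)).scalars().all()
    _bangumi_cache_time = time.time()
    return _bangumi_cache

Write paths presumably call _invalidate_bangumi_cache(), so the longer 300s TTL only bounds staleness, it does not delay the process's own updates.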
@@ -53,11 +54,21 @@ class BangumiDatabase:
         if not datas:
             return 0
-        # Get existing title_raw + group_name combinations
-        existing = set()
-        for data in datas:
-            if self._is_duplicate(data):
-                existing.add((data.title_raw, data.group_name))
+        # Batch query: get all existing (title_raw, group_name) combinations in one query
+        # This replaces N individual _is_duplicate() calls with a single SELECT
+        keys_to_check = [(d.title_raw, d.group_name) for d in datas]
+        conditions = [
+            and_(Bangumi.title_raw == tr, Bangumi.group_name == gn)
+            for tr, gn in keys_to_check
+        ]
+        if conditions:
+            statement = select(Bangumi.title_raw, Bangumi.group_name).where(
+                or_(*conditions)
+            )
+            result = self.session.execute(statement)
+            existing = set(result.all())
+        else:
+            existing = set()
         # Filter out duplicates
         to_add = [d for d in datas if (d.title_raw, d.group_name) not in existing]
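
The or_() of and_() pairs emits two bound parameters per candidate, which gets bulky for very large batches. A more compact sketch of the same batch lookup, assuming a backend with row-value IN support (e.g. PostgreSQL, MySQL, modern SQLite) — an alternative, not what this commit uses:

from sqlalchemy import select, tuple_

statement = select(Bangumi.title_raw, Bangumi.group_name).where(
    tuple_(Bangumi.title_raw, Bangumi.group_name).in_(keys_to_check)
)
existing = set(self.session.execute(statement).all())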
@@ -199,24 +210,29 @@ class BangumiDatabase:
         match_datas = self.search_all()
         if not match_datas:
             return torrent_list
-        # Build index for faster lookup
+        # Build index for O(1) lookup after regex match
         title_index = {m.title_raw: m for m in match_datas}
+        # Build compiled regex pattern for fast substring matching
+        # Sort by length descending so longer (more specific) matches are found first
+        sorted_titles = sorted(title_index.keys(), key=len, reverse=True)
+        # Escape special regex characters and join with alternation
+        pattern = "|".join(re.escape(title) for title in sorted_titles)
+        title_regex = re.compile(pattern)
         unmatched = []
         rss_updated = set()
         for torrent in torrent_list:
-            matched = False
-            for title_raw, match_data in title_index.items():
-                if title_raw in torrent.name:
-                    if (
-                        rss_link not in match_data.rss_link
-                        and title_raw not in rss_updated
-                    ):
-                        match_data.rss_link += f",{rss_link}"
-                        match_data.added = False
-                        rss_updated.add(title_raw)
-                    matched = True
-                    break
-            if not matched:
+            match = title_regex.search(torrent.name)
+            if match:
+                title_raw = match.group(0)
+                match_data = title_index[title_raw]
+                if rss_link not in match_data.rss_link and title_raw not in rss_updated:
+                    match_data.rss_link += f",{rss_link}"
+                    match_data.added = False
+                    rss_updated.add(title_raw)
+            else:
                 unmatched.append(torrent)
+        # Batch commit all rss_link updates
+        if rss_updated:
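
Why the titles are sorted longest-first: Python's re module tries alternation branches left to right and takes the first one that matches at a given position. A self-contained demonstration with hypothetical title_raw values:

import re

titles = ["Frieren", "Frieren Season 2"]  # hypothetical title_raw values
sorted_titles = sorted(titles, key=len, reverse=True)
pattern = re.compile("|".join(re.escape(t) for t in sorted_titles))
print(pattern.search("[SubGroup] Frieren Season 2 - 05.mkv").group(0))
# -> "Frieren Season 2"; without the sort, the shorter "Frieren" would win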


@@ -1,11 +1,16 @@
 import logging
 import re
+from collections import OrderedDict
 from pathlib import Path
 from module.models import EpisodeFile, SubtitleFile
 logger = logging.getLogger(__name__)
+# LRU cache for torrent_parser results to avoid repeated regex parsing
+_PARSER_CACHE_MAX_SIZE = 512
+_parser_cache: OrderedDict[tuple, EpisodeFile | SubtitleFile | None] = OrderedDict()
+
 PLATFORM = "Unix"
 RULES = [
@@ -70,7 +75,31 @@ def torrent_parser(
     torrent_name: str | None = None,
     season: int | None = None,
     file_type: str = "media",
-) -> EpisodeFile | SubtitleFile:
+) -> EpisodeFile | SubtitleFile | None:
+    # Check cache first to avoid repeated regex parsing
+    cache_key = (torrent_path, torrent_name, season, file_type)
+    if cache_key in _parser_cache:
+        # Move to end to mark as recently used
+        _parser_cache.move_to_end(cache_key)
+        return _parser_cache[cache_key]
+    result = _torrent_parser_impl(torrent_path, torrent_name, season, file_type)
+    # Store in cache with LRU eviction
+    _parser_cache[cache_key] = result
+    if len(_parser_cache) > _PARSER_CACHE_MAX_SIZE:
+        _parser_cache.popitem(last=False)  # Remove oldest item
+    return result
+
+
+def _torrent_parser_impl(
+    torrent_path: str,
+    torrent_name: str | None = None,
+    season: int | None = None,
+    file_type: str = "media",
+) -> EpisodeFile | SubtitleFile | None:
+    """Internal implementation of torrent_parser without caching."""
     media_path = get_path_basename(torrent_path)
     match_names = [torrent_name, media_path]
     if torrent_name is None:
@@ -106,6 +135,7 @@ def torrent_parser(
             episode=episode,
             suffix=suffix,
         )
+    return None
 if __name__ == "__main__":
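
The hand-rolled OrderedDict cache is close in behavior to functools.lru_cache. A sketch of the stdlib form for comparison (an assumption about intent, not the commit's code); the explicit version keeps the cache as a plain module global that can be inspected or cleared, e.g. between tests:

import functools

# Roughly equivalent stdlib caching wrapper around the uncached implementation
@functools.lru_cache(maxsize=512)
def torrent_parser(torrent_path, torrent_name=None, season=None, file_type="media"):
    return _torrent_parser_impl(torrent_path, torrent_name, season, file_type)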