perf(database): optimize N+1 queries and add caching

- Replace N individual _is_duplicate() calls with single batch SELECT query
  in add_all() method, reducing database round-trips
- Replace O(n*m) nested loop in match_list() with compiled regex alternation
  pattern for faster torrent-to-bangumi matching
- Add LRU cache (512 entries) to torrent_parser() to avoid redundant regex
  parsing for the same torrent paths
- Extend bangumi search_all() cache TTL from 60s to 300s (5 minutes)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Author: EstrellaXD
Date: 2026-01-26 14:30:16 +01:00
Parent: ebd58531b5
Commit: d6e89f62ed
2 changed files with 67 additions and 21 deletions


@@ -1,4 +1,5 @@
 import logging
+import re
 import time
 from typing import Optional
@@ -12,7 +13,7 @@ logger = logging.getLogger(__name__)
 # Module-level TTL cache for search_all results
 _bangumi_cache: list[Bangumi] | None = None
 _bangumi_cache_time: float = 0
-_BANGUMI_CACHE_TTL: float = 60.0  # seconds
+_BANGUMI_CACHE_TTL: float = 300.0  # 5 minutes - extended from 60s to reduce DB queries
 def _invalidate_bangumi_cache():
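
For context (the diff does not show the search_all() body): a module-level TTL cache like this is typically read along the following lines. This is a minimal sketch, not the repository's actual code; the query shape and session API are assumptions.

def search_all(self) -> list[Bangumi]:
    global _bangumi_cache, _bangumi_cache_time
    # Serve the cached list while it is younger than the TTL (now 300s)
    if _bangumi_cache is not None and time.time() - _bangumi_cache_time < _BANGUMI_CACHE_TTL:
        return _bangumi_cache
    # Stale or empty: refresh from the database and restamp the cache
    _bangumi_cache = self.session.execute(select(Bangumi)).scalars().all()
    _bangumi_cache_time = time.time()
    return _bangumi_cache

Write paths presumably call _invalidate_bangumi_cache(), so the longer 300s TTL only bounds staleness, it does not delay the process's own updates.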
@@ -53,11 +54,21 @@ class BangumiDatabase:
         if not datas:
             return 0
-        # Get existing title_raw + group_name combinations
-        existing = set()
-        for data in datas:
-            if self._is_duplicate(data):
-                existing.add((data.title_raw, data.group_name))
+        # Batch query: get all existing (title_raw, group_name) combinations in one query
+        # This replaces N individual _is_duplicate() calls with a single SELECT
+        keys_to_check = [(d.title_raw, d.group_name) for d in datas]
+        conditions = [
+            and_(Bangumi.title_raw == tr, Bangumi.group_name == gn)
+            for tr, gn in keys_to_check
+        ]
+        if conditions:
+            statement = select(Bangumi.title_raw, Bangumi.group_name).where(
+                or_(*conditions)
+            )
+            result = self.session.execute(statement)
+            existing = set(result.all())
+        else:
+            existing = set()
         # Filter out duplicates
         to_add = [d for d in datas if (d.title_raw, d.group_name) not in existing]
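
The or_() of and_() pairs emits two bound parameters per candidate, which gets bulky for very large batches. A more compact sketch of the same batch lookup, assuming a backend with row-value IN support (e.g. PostgreSQL, MySQL, modern SQLite) — an alternative, not what this commit uses:

from sqlalchemy import select, tuple_

statement = select(Bangumi.title_raw, Bangumi.group_name).where(
    tuple_(Bangumi.title_raw, Bangumi.group_name).in_(keys_to_check)
)
existing = set(self.session.execute(statement).all())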
@@ -199,24 +210,29 @@ class BangumiDatabase:
         match_datas = self.search_all()
         if not match_datas:
             return torrent_list
-        # Build index for faster lookup
+        # Build index for O(1) lookup after regex match
         title_index = {m.title_raw: m for m in match_datas}
+        # Build compiled regex pattern for fast substring matching
+        # Sort by length descending so longer (more specific) matches are found first
+        sorted_titles = sorted(title_index.keys(), key=len, reverse=True)
+        # Escape special regex characters and join with alternation
+        pattern = "|".join(re.escape(title) for title in sorted_titles)
+        title_regex = re.compile(pattern)
         unmatched = []
         rss_updated = set()
         for torrent in torrent_list:
-            matched = False
-            for title_raw, match_data in title_index.items():
-                if title_raw in torrent.name:
-                    if (
-                        rss_link not in match_data.rss_link
-                        and title_raw not in rss_updated
-                    ):
-                        match_data.rss_link += f",{rss_link}"
-                        match_data.added = False
-                        rss_updated.add(title_raw)
-                    matched = True
-                    break
-            if not matched:
+            match = title_regex.search(torrent.name)
+            if match:
+                title_raw = match.group(0)
+                match_data = title_index[title_raw]
+                if rss_link not in match_data.rss_link and title_raw not in rss_updated:
+                    match_data.rss_link += f",{rss_link}"
+                    match_data.added = False
+                    rss_updated.add(title_raw)
+            else:
                 unmatched.append(torrent)
+        # Batch commit all rss_link updates
+        if rss_updated:
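
Why the titles are sorted longest-first: Python's re module tries alternation branches left to right and takes the first one that matches at a given position. A self-contained demonstration with hypothetical title_raw values:

import re

titles = ["Frieren", "Frieren Season 2"]  # hypothetical title_raw values
sorted_titles = sorted(titles, key=len, reverse=True)
pattern = re.compile("|".join(re.escape(t) for t in sorted_titles))
print(pattern.search("[SubGroup] Frieren Season 2 - 05.mkv").group(0))
# -> "Frieren Season 2"; without the sort, the shorter "Frieren" would win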


@@ -1,11 +1,16 @@
 import logging
 import re
+from collections import OrderedDict
 from pathlib import Path
 from module.models import EpisodeFile, SubtitleFile
 logger = logging.getLogger(__name__)
+# LRU cache for torrent_parser results to avoid repeated regex parsing
+_PARSER_CACHE_MAX_SIZE = 512
+_parser_cache: OrderedDict[tuple, EpisodeFile | SubtitleFile | None] = OrderedDict()
+
 PLATFORM = "Unix"
 RULES = [
@@ -70,7 +75,31 @@ def torrent_parser(
     torrent_name: str | None = None,
     season: int | None = None,
     file_type: str = "media",
-) -> EpisodeFile | SubtitleFile:
+) -> EpisodeFile | SubtitleFile | None:
+    # Check cache first to avoid repeated regex parsing
+    cache_key = (torrent_path, torrent_name, season, file_type)
+    if cache_key in _parser_cache:
+        # Move to end to mark as recently used
+        _parser_cache.move_to_end(cache_key)
+        return _parser_cache[cache_key]
+    result = _torrent_parser_impl(torrent_path, torrent_name, season, file_type)
+    # Store in cache with LRU eviction
+    _parser_cache[cache_key] = result
+    if len(_parser_cache) > _PARSER_CACHE_MAX_SIZE:
+        _parser_cache.popitem(last=False)  # Remove oldest item
+    return result
+
+
+def _torrent_parser_impl(
+    torrent_path: str,
+    torrent_name: str | None = None,
+    season: int | None = None,
+    file_type: str = "media",
+) -> EpisodeFile | SubtitleFile | None:
+    """Internal implementation of torrent_parser without caching."""
     media_path = get_path_basename(torrent_path)
     match_names = [torrent_name, media_path]
     if torrent_name is None:
@@ -106,6 +135,7 @@ def torrent_parser(
             episode=episode,
             suffix=suffix,
         )
+    return None
 if __name__ == "__main__":
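
The hand-rolled OrderedDict cache is close in behavior to functools.lru_cache. A sketch of the stdlib form for comparison (an assumption about intent, not the commit's code); the explicit version keeps the cache as a plain module global that can be inspected or cleared, e.g. between tests:

import functools

# Roughly equivalent stdlib caching wrapper around the uncached implementation
@functools.lru_cache(maxsize=512)
def torrent_parser(torrent_path, torrent_name=None, season=None, file_type="media"):
    return _torrent_parser_impl(torrent_path, torrent_name, season, file_type)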