# MoviePilot/app/helper/format.py

import re
from collections import defaultdict
from dataclasses import dataclass
from functools import lru_cache
from pathlib import Path
from typing import Dict, Iterable, List, Match, Optional, Tuple, Union

import anitopy

from app.core.config import settings
from app.core.metainfo import MetaInfoPath
from app.core.meta.metabase import MetaBase
from app.log import logger
from app.schemas import EpisodeFormatRule, FileItem


@dataclass(frozen=True)
class _TemplateParseResult:
    """Immutable result of matching a text against a compiled template."""

    # Captured text of each named placeholder, keyed by placeholder name.
    named: Dict[str, str]
    # (start, end) offsets of each placeholder capture within the matched text.
    spans: Dict[str, Tuple[int, int]]


@lru_cache(maxsize=256)
def _compile_template_pattern(
    template: str,
    ep_group_name: Optional[str] = None,
):
    """
    Compile a ``{placeholder}`` template into an anchored regular expression.

    ``{{`` and ``}}`` stand for literal braces, ``{name}`` becomes a named,
    non-greedy capture group and every other character is matched literally.
    The placeholder equal to ``ep_group_name`` must capture at least one
    character; all other placeholders may capture the empty string.

    :param template: template text, e.g. ``Show - {ep}.mkv``
    :param ep_group_name: name of the placeholder that holds the episode
    :return: compiled pattern anchored with ``^`` and ``$``
    :raises ValueError: unclosed placeholder, invalid or repeated placeholder
        name, or an unescaped ``}``
    """
    parts: List[str] = ["^"]
    seen_names = set()
    cursor = 0
    while cursor < len(template):
        if template.startswith("{{", cursor):
            parts.append(re.escape("{"))
            cursor += 2
            continue
        if template.startswith("}}", cursor):
            parts.append(re.escape("}"))
            cursor += 2
            continue
        if template[cursor] == "{":
            end = template.find("}", cursor + 1)
            if end < 0:
                raise ValueError(f"模板存在未闭合占位符：{template}")
            group_name = template[cursor + 1:end]
            if not re.fullmatch(r"[A-Za-z_]\w*", group_name):
                raise ValueError(f"模板占位符名称无效：{template}")
            if group_name in seen_names:
                # Without this check re.compile() would fail with re.error
                # ("redefinition of group name"), which callers handling the
                # ValueError raised for other bad templates would miss.
                raise ValueError(f"模板存在重复的占位符名称：{template}")
            seen_names.add(group_name)
            quantifier = ".+?" if group_name == ep_group_name else ".*?"
            parts.append(f"(?P<{group_name}>{quantifier})")
            cursor = end + 1
            continue
        if template[cursor] == "}":
            raise ValueError(f"模板存在未转义的右花括号：{template}")

        # Consume the literal run up to the next brace in one step.
        literal_end = cursor
        while literal_end < len(template) and template[literal_end] not in "{}":
            literal_end += 1
        parts.append(re.escape(template[cursor:literal_end]))
        cursor = literal_end
    parts.append("$")
    return re.compile("".join(parts))


def _match_template(
    template: str,
    text: str,
    ep_group_name: Optional[str] = None,
) -> Optional[_TemplateParseResult]:
    """
    Match ``text`` against ``template`` and collect the placeholder captures.

    :param template: placeholder template handed to the pattern compiler
    :param text: text to match from its first character
    :param ep_group_name: name of the placeholder that holds the episode
    :return: captured values and their spans, or ``None`` when the text does
        not match the template
    """
    matched = _compile_template_pattern(template, ep_group_name).match(text)
    if matched is None:
        return None
    captured = matched.groupdict()
    positions = {name: matched.span(name) for name in captured}
    return _TemplateParseResult(named=captured, spans=positions)


class FormatParser(object):
    """
    Extract episode numbers from file names with a placeholder template.

    The template (``eformat``) is compiled once; the placeholder named by
    ``key`` captures the episode token (``01``, ``EP01``, ``E01-E02`` ...).
    ``details`` optionally pins or bounds the episodes and ``offset`` is an
    arithmetic expression in which ``EP`` stands for the episode number.
    """
    _key = ""
    # Delimiter alternation kept as a class attribute for backward
    # compatibility; the episode token is now decomposed by _EPISODE_TOKEN_RE.
    _split_chars = r"\.|\s+|\(|\)|\[|]|-|\+|【|】|/|～|;|&|\||#|_|「|」|~"
    # Patterns are compiled once here instead of on every call.
    _DETAILS_RANGE_RE = re.compile("\\d{1,4}-\\d{1,4}")
    _EPISODE_TOKEN_RE = re.compile(
        r"^([Ee][Pp]?)?(\d{1,4})(-([Ee][Pp]?)?(\d{1,4}))?$",
        re.IGNORECASE,
    )

    def __init__(self, eformat: str, details: Optional[str] = None, part: Optional[str] = None,
                 offset: Optional[str] = None, key: Optional[str] = "ep"):
        """
        :param eformat: template used to locate the episode in a file name
        :param details: explicit episodes: ``X`` (single), ``X,Y`` (bounds)
            or ``X-Y`` (range, kept as the original string)
        :param part: part label returned unchanged by ``split_episode``
        :param offset: offset expression, e.g. ``-10`` or ``EP*2``
        :param key: name of the template placeholder that holds the episode
        :raises ValueError: when the template is invalid or ``details`` is not numeric
        """
        self._format = eformat
        self._start_ep = None
        self._end_ep = None
        # Normalize the offset so that it always contains the "EP" marker.
        if not offset:
            self.__offset = "EP"
        elif "EP" in offset:
            self.__offset = offset
        elif offset.startswith(("-", "+")):
            self.__offset = f"EP{offset}"
        else:
            self.__offset = f"EP+{offset}"
        self._key = key
        self._part = None
        self._compiled_pattern = (
            _compile_template_pattern(self._format, self._key)
            if self._format
            else None
        )
        if part:
            self._part = part
        if details:
            if self._DETAILS_RANGE_RE.match(details):
                # "X-Y": both bounds keep the raw string; it is parsed on use.
                self._start_ep = details
                self._end_ep = details
            else:
                bounds = details.split(",")
                first = int(bounds[0])
                if len(bounds) > 1:
                    self._start_ep = first
                    self._end_ep = max(first, int(bounds[1]))
                else:
                    self._start_ep = self._end_ep = first

    @property
    def format(self):
        """Template string given at construction."""
        return self._format

    @property
    def start_ep(self):
        """First explicit episode: an int, the raw ``X-Y`` string, or None."""
        return self._start_ep

    @property
    def end_ep(self):
        """Last explicit episode: an int, the raw ``X-Y`` string, or None."""
        return self._end_ep

    @property
    def part(self):
        """Part label given at construction, or None."""
        return self._part

    @property
    def offset(self):
        """Normalized offset expression (always contains ``EP``)."""
        return self.__offset

    def match(self, file: str) -> bool:
        """
        Tell whether ``file`` fits the template and the explicit episode bounds.

        Without a template every file matches. A file whose episode cannot be
        located (or is located as 0) never matches.
        """
        if not self._format:
            return True
        located, _ = self.__handle_single(file)
        if not located:
            return False
        if self._start_ep is None:
            return True
        if isinstance(self._start_ep, str):
            # "X-Y" details are stored as text; comparing them with the int
            # episode used to raise TypeError. Compare against the parsed
            # bounds instead.
            first, last = self.__explicit_range()
            return min(first, last) <= located <= max(first, last)
        return self._start_ep <= located <= self._end_ep

    def split_episode(self, file_name: str, file_meta: "MetaBase") -> Tuple[Optional[int], Optional[int], Optional[str]]:
        """
        Resolve the episodes of a file.

        :param file_name: file name matched against the template
        :param file_meta: recognized metadata; only ``begin_episode`` and
            ``end_episode`` are read, and only when no template is set
        :return: (start episode, end episode, part) with the offset applied
        """
        # Explicitly specified episodes are returned directly.
        if self._start_ep is not None:
            if self._start_ep == self._end_ep:
                if isinstance(self._start_ep, str):
                    # `details` has the form `X-Y`. The bounds are converted
                    # to int first so that zero-padded input such as "01-02"
                    # does not reach eval() as the invalid literal "01".
                    first, last = self.__explicit_range()
                    if first == last:
                        return int(self.__apply_offset(first)), None, self.part
                    return (
                        int(self.__apply_offset(first)),
                        int(self.__apply_offset(last)),
                        self.part,
                    )
                # `details` has the form `X`.
                return int(self.__apply_offset(self._start_ep)), None, self.part
            elif not self._format:
                # `details` has the form `X,Y`.
                return (
                    int(self.__apply_offset(self._start_ep)),
                    int(self.__apply_offset(self._end_ep)),
                    self.part,
                )
        if not self._format:
            # Neither a template nor explicit episodes: only apply the offset
            # to the recognized metadata. A falsy result (0) is reported as None.
            start_ep = self.__apply_offset(file_meta.begin_episode) if file_meta.begin_episode else None
            end_ep = self.__apply_offset(file_meta.end_episode) if file_meta.end_episode else None
            return int(start_ep) if start_ep else None, int(end_ep) if end_ep else None, self.part
        # A template is set: locate the episode in the file name.
        located_start, located_end = self.__handle_single(file_name)
        return (
            int(self.__apply_offset(located_start)) if located_start else None,
            int(self.__apply_offset(located_end)) if located_end else None,
            self.part,
        )

    def __explicit_range(self) -> Tuple[int, int]:
        """Parse the ``X-Y`` details string into integer bounds."""
        first, last = self._start_ep.split("-")
        return int(first), int(last)

    def __apply_offset(self, episode):
        """Evaluate the offset expression with ``EP`` replaced by ``episode``."""
        # NOTE(security): eval() executes the configured offset expression
        # verbatim. It must only ever come from a trusted source; consider a
        # restricted arithmetic evaluator if offsets can be user-supplied.
        return eval(self.__offset.replace("EP", str(episode)))

    def __handle_single(self, file: str) -> Tuple[Optional[int], Optional[int]]:
        """
        Locate the episode token of one file.

        :return: (start, end); end is None for a single episode and both are
            None when the file does not match or the token is not an episode
        """
        if not self._format or not self._compiled_pattern:
            return None, None
        ret = self._compiled_pattern.match(file)
        if not ret or self._key not in ret.groupdict():
            return None, None
        token = self._EPISODE_TOKEN_RE.match(ret.group(self._key))
        if not token:
            return None, None
        # Group 2 is the first number, group 5 the optional second one.
        end_text = token.group(5)
        return int(token.group(2)), (int(end_text) if end_text is not None else None)


@dataclass(frozen=True)
class _AutoRecommendSample:
    """One sample file accepted as input for automatic template recommendation."""

    # Name of the sample file.
    file_name: str
    # (start, end) offsets of the episode token inside file_name.
    ep_span: Tuple[int, int]
    # Normalized episode value the generated template is expected to extract.
    expected_episode: str
    # File kind as returned by EpisodeFormatRuleHelper._get_file_kind.
    source_kind: str = "media"
    # Episode reported by the native recognizer, if any.
    native_episode: Optional[str] = None
    # True when the native episode equals the detected episode.
    native_verified: bool = False
    # True when the native episode was used because nothing else was detected.
    used_native_fallback: bool = False


class EpisodeFormatRuleHelper:
    """
    Helper for episode-locating format rules.
    """

    # Main-video files smaller than this (100 MiB) are dropped by
    # _filter_by_extension_and_size unless the size check is relaxed.
    _MIN_MEDIA_FILE_SIZE_BYTES = 100 * 1024 * 1024
    # Minimum share of main-video candidates that must yield a valid sample
    # for _auto_recommend to continue.
    _MIN_AUTO_VALID_MEDIA_COVERAGE = 0.6
    # Empty metadata object; not referenced in the visible part of this file.
    _EMPTY_META = MetaBase(title="")

    # "E"/"EP"-prefixed episode range (e.g. EP01-02, E01-E02) that is not
    # preceded by an ASCII letter or digit; group 1 is the range text.
    _EP_RANGE_RE = re.compile(
        r"(?<![A-Za-z0-9])[Ee][Pp]?(\d{1,4}(?:-[Ee]?[Pp]?\d{1,4})+)(?!\d)"
    )
    # Single "E"/"EP"-prefixed episode; group 1 is the number.
    _EP_PREFIX_RE = re.compile(r"(?<![A-Za-z0-9])[Ee][Pp]?(\d{1,4})(?!\d)")
    # Season/episode range such as S01E01-E02; group 1 is the episode range.
    _SEASON_EP_RANGE_RE = re.compile(
        r"[Ss]\d{1,4}[Ee][Pp]?(\d{1,4}(?:-[Ee]?[Pp]?\d{1,4})+)(?!\d)"
    )
    # Season/episode token such as S01E05; group 1 is the episode number.
    _SEASON_EP_RE = re.compile(r"[Ss]\d{1,4}[Ee][Pp]?(\d{1,4})(?!\d)")
    # Hash-prefixed number such as #12.
    _HASH_EP_RE = re.compile(r"(?<!\d)#(\d{1,4})(?!\d)")
    # A 1-4 digit number standing alone inside [] or 【】.
    _BRACKET_EP_RE = re.compile(r"[\[【](\d{1,4})[\]】]")
    # Same as above but limited to 1-3 digits; used by the fallback extractor.
    _FALLBACK_BRACKET_EP_RE = re.compile(r"[\[【](\d{1,3})[\]】]")
    # "第N話" / "第N话".
    _FALLBACK_EPISODE_RE = re.compile(r"第(\d{1,4})[話话]")
    # "第N集".
    _FALLBACK_EPISODE_JI_RE = re.compile(r"第(\d{1,4})集")
    # Ideographic full stop followed by a number and whitespace.
    _FALLBACK_PERIOD_RE = re.compile(r"。(\d{1,4})\s")
    # "第N" followed by 話, 话 or 集; used when locating an episode token.
    _CJK_EP_RE = re.compile(r"第(\d{1,4})(?:[話话集])")
    # Bracketed tags of extras (SP, NCOP/NCED, OP/ED, menus, OVA/OAD, PV, CM,
    # trailers, previews ...); matched case-insensitively.
    _SPECIAL_SAMPLE_RE = re.compile(
        r"\[(?:"
        r"SP\d+"
        r"|NC(?:OP|ED)(?:[_\s-]*EP\d+)?(?:\s+VER\.\d+)?"
        r"|OP"
        r"|ED"
        r"|MENU(?:\d+|OVA)?"
        r"|OVA(?:\s+TRAILER)?"
        r"|OAD"
        r"|PV\d*"
        r"|CM(?:\d+| COLLECTION)?"
        r"|TRAILER"
        r"|WEB PREVIEW(?:\s+\d+)?"
        r"|SERIES REVIEW"
        r"|TABLE GAME"
        r"|TV SPOTS?"
        r"|\d+\((?:OVA|OAD|SP)\d+\)"
        r")\]",
        re.IGNORECASE,
    )

    def recommend(
        self,
        rules: List[EpisodeFormatRule],
        sample_files: List[FileItem],
    ) -> Tuple[bool, str, Optional[dict]]:
        """
        推荐集数定位模板
        """
        if not rules:
            return self._auto_recommend(sample_files)

        if not sample_files:
            return False, "目录中没有可用于识别的媒体文件", None

        for index, rule in enumerate(rules):
            matched_samples = self._match_rule(rule, sample_files)
            if not matched_samples:
                continue

            sample_file, match_result = matched_samples[0]
            episode_format = self._build_template(sample_file.name, match_result)
            if not episode_format:
                continue
            if not self._validate_template(episode_format, matched_samples):
                logger.warn(f"集数定位规则 {rule.name} 模板校验失败")
                continue
            compatibility_samples = self._build_detected_samples(
                self._filter_by_extension_and_size(sample_files),
            )
            if compatibility_samples and not self._validate_auto_template(
                episode_format,
                compatibility_samples,
            ):
                logger.warn(f"集数定位规则 {rule.name} 附加文件兼容性校验失败")
                continue

            logger.info(
                f"集数定位规则命中：{rule.name}，样本文件：{sample_file.name}"
            )
            return True, "", {
                "rule_name": rule.name,
                "rule_index": index,
                "pattern": rule.pattern,
                "episode_format": episode_format,
                "sample_file": sample_file.name,
                "min_file_size_mb": rule.min_file_size_mb,
                "message": "已根据预定义规则生成集数定位模板",
            }

        return self._auto_recommend(sample_files)

    def _auto_recommend(
        self,
        sample_files: List[FileItem],
    ) -> Tuple[bool, str, Optional[dict]]:
        """
        自动生成集数定位模板：anitopy 反向定位 + 多文件对比
        """
        if not sample_files:
            return False, "目录中没有可用于识别的媒体文件", None

        candidates = self._filter_by_extension_and_size(sample_files)
        size_filter_relaxed = False
        if not candidates:
            candidates = self._filter_by_extension_and_size(
                sample_files, ignore_size=True
            )
            size_filter_relaxed = bool(candidates)
        if not candidates:
            return False, "无匹配自定义定位规则，智能生成失败", None

        valid_samples = self._build_detected_samples(candidates)
        native_verified_count = 0
        native_fallback_count = 0
        native_conflict_count = 0
        episode_not_detected_count = 0
        for item in valid_samples:
            if item.native_verified:
                native_verified_count += 1
            if item.used_native_fallback:
                native_fallback_count += 1
        for item in sorted(
            candidates,
            key=lambda entry: (
                self._sample_kind_priority(self._get_file_kind(entry)),
                (entry.name or ""),
                (entry.path or ""),
            ),
        ):
            file_name = item.name or ""
            if self._is_special_sample(file_name):
                continue
            normalized_episode, native_episode, used_native_fallback, native_verified = (
                self._extract_episode_with_native_fallback(item)
            )
            if normalized_episode and native_episode and not (
                used_native_fallback or native_verified
            ):
                native_conflict_count += 1
                logger.warn(
                    "自动推荐样本与原生集数识别冲突，跳过："
                    f"{file_name} - auto={normalized_episode}, native={native_episode}"
                )
                continue
            expected_start, _ = self._parse_episode_value(normalized_episode)
            if expected_start is None:
                episode_not_detected_count += 1
                continue
            if expected_start <= 0:
                continue
            if self._locate_episode(file_name, normalized_episode) is None:
                episode_not_detected_count += 1

        if not valid_samples:
            if native_conflict_count:
                return (
                    False,
                    "样本命名与原生识别结果冲突，建议补充集数定位规则",
                    None,
                )
            if episode_not_detected_count:
                return False, "样本未识别到有效集数，智能生成失败", None
            return False, "无匹配自定义定位规则，智能生成失败", None

        if native_conflict_count and len(valid_samples) < len(candidates):
            return (
                False,
                "样本命名与原生识别结果冲突，建议补充集数定位规则",
                None,
            )

        candidate_media_count = 0
        for item in candidates:
            if (
                    self._get_file_kind(item) == "media"
                    and not self._is_special_sample(item.name or "")
            ):
                candidate_media_count += 1

        valid_media_count = 0
        for item in valid_samples:
            if item.source_kind == "media":
                valid_media_count += 1

        if (
                candidate_media_count > 1
                and valid_media_count / candidate_media_count
                < self._MIN_AUTO_VALID_MEDIA_COVERAGE
        ):
            logger.warn(
                "有效正片样本覆盖率不足，放弃智能生成："
                f"valid_media={valid_media_count}, candidate_media={candidate_media_count}"
            )
            return False, "有效正片样本覆盖率不足，建议补充集数定位规则", None

        majority_samples, clear_majority = self._select_base_samples(valid_samples)
        logger.debug(
            "自动推荐多数派样本："
            f"valid={len(valid_samples)}, majority={len(majority_samples)}, "
            f"clear_majority={clear_majority}, files="
            f"{[(sample.file_name, sample.expected_episode, sample.ep_span) for sample in majority_samples]}"
        )
        if len(valid_samples) > 1 and not clear_majority:
            logger.warn("自动生成样本未形成明确多数派，放弃推荐")
            return False, "样本命名差异过大，建议补充集数定位规则", None

        majority_names = [sample.file_name for sample in majority_samples]
        majority_spans = [sample.ep_span for sample in majority_samples]

        episode_format = self._build_ep_only_template(
            majority_names, majority_spans, use_majority=False
        )
        logger.debug(
            "自动推荐基础模板："
            f"sample={majority_names[0] if majority_names else None}, "
            f"span={majority_spans[0] if majority_spans else None}, template={episode_format}"
        )
        if not self._validate_auto_template(episode_format, majority_samples):
            diff_result = self._build_template_with_diff(
                majority_names, majority_spans, use_majority=False
            )
            logger.debug(
                "自动推荐差异模板尝试："
                f"base={episode_format}, diff={diff_result}"
            )
            if diff_result and self._validate_auto_template(
                diff_result, majority_samples
            ):
                episode_format = diff_result
            else:
                logger.warn("多文件对比未通过模板校验，自动生成失败")
                return False, "无匹配自定义定位规则，智能生成失败", None

        sample_file = majority_names[0]
        low_confidence = len(majority_samples) == 1 or size_filter_relaxed
        reasons = self._build_auto_reasons(
            sample_count=len(valid_samples),
            majority_count=len(majority_samples),
            size_filter_relaxed=size_filter_relaxed,
            native_fallback_count=native_fallback_count,
            native_verified_count=native_verified_count,
        )
        logger.info(f"智能分析生成集数定位模板：{sample_file} -> {episode_format}")

        return True, "", {
            "rule_name": "智能分析",
            "episode_format": episode_format,
            "sample_file": sample_file,
            "pattern": None,
            "sample_count": len(valid_samples),
            "majority_count": len(majority_samples),
            "confidence": "low" if low_confidence else "high",
            "size_filter_relaxed": size_filter_relaxed,
            "native_verified_count": native_verified_count,
            "native_fallback_count": native_fallback_count,
            "native_conflict_count": native_conflict_count,
            "reason": reasons[0] if reasons else None,
            "reasons": reasons,
            "message": self._build_auto_message(
                sample_count=len(valid_samples),
                majority_count=len(majority_samples),
                size_filter_relaxed=size_filter_relaxed,
                native_fallback_count=native_fallback_count,
            ),
        }

    @staticmethod
    def _build_auto_message(
        sample_count: int,
        majority_count: int,
        size_filter_relaxed: bool,
        native_fallback_count: int,
    ) -> str:
        if majority_count <= 1:
            return "样本不足，仅基于单文件智能生成（仅供参考）"
        if size_filter_relaxed:
            return "已放宽体积限制智能生成模板（仅供参考）"
        if native_fallback_count:
            return "已结合原生集数识别智能生成模板（仅供参考）"
        if sample_count != majority_count:
            return "已根据多数派样本智能生成模板（仅供参考）"
        return "无匹配自定义定位规则，已智能生成（仅供参考）"

    @staticmethod
    def _build_auto_reasons(
        sample_count: int,
        majority_count: int,
        size_filter_relaxed: bool,
        native_fallback_count: int,
        native_verified_count: int,
    ) -> List[str]:
        reasons: List[str] = []
        if majority_count <= 1:
            reasons.append("single_sample_only")
        if size_filter_relaxed:
            reasons.append("small_files_fallback")
        if native_fallback_count:
            reasons.append("native_meta_fallback")
        elif native_verified_count:
            reasons.append("native_meta_verified")
        if sample_count != majority_count:
            reasons.append("majority_samples_only")
        if not reasons:
            reasons.append("auto_recommendation")
        return reasons

    @staticmethod
    def _filter_by_extension_and_size(
        files: List[FileItem],
        ignore_size: bool = False,
    ) -> List[FileItem]:
        """
        第一轮筛选：主视频扩展名白名单 + 体积门槛，字幕/外挂音频始终允许参与
        """
        candidates: List[FileItem] = []
        for item in files:
            file_kind = EpisodeFormatRuleHelper._get_file_kind(item)
            if file_kind == "other":
                continue
            if (
                file_kind == "media"
                and not ignore_size
                and (item.size or 0) < EpisodeFormatRuleHelper._MIN_MEDIA_FILE_SIZE_BYTES
            ):
                continue
            candidates.append(item)
        return candidates

    @staticmethod
    def _get_file_kind(item: FileItem) -> str:
        """
        Classify a file by extension as "media", "subtitle", "audio" or "other".

        The extension is lower-cased and normalized to a single leading dot
        before it is looked up in the configured extension lists.
        """
        raw_extension = item.extension
        suffix = "." + raw_extension.lower().lstrip(".") if raw_extension else ""
        # Checked in priority order; each list is only read when needed.
        for kind, setting_name in (
            ("media", "RMT_MEDIAEXT"),
            ("subtitle", "RMT_SUBEXT"),
            ("audio", "RMT_AUDIOEXT"),
        ):
            if suffix in getattr(settings, setting_name):
                return kind
        return "other"

    @staticmethod
    def _sample_kind_priority(kind: str) -> int:
        return {
            "media": 0,
            "subtitle": 1,
            "audio": 2,
        }.get(kind, 9)

    @classmethod
    def _is_special_sample(cls, file_name: str) -> bool:
        return bool(cls._SPECIAL_SAMPLE_RE.search(file_name or ""))

    def _build_detected_samples(
        self,
        candidates: List[FileItem],
    ) -> List[_AutoRecommendSample]:
        valid_samples: List[_AutoRecommendSample] = []
        for item in sorted(
            candidates,
            key=lambda entry: (
                self._sample_kind_priority(self._get_file_kind(entry)),
                (entry.name or ""),
                (entry.path or ""),
            ),
        ):
            file_name = item.name or ""
            if self._is_special_sample(file_name):
                # SP/NCOP/NCED/OP/ED/MENU 等明显特典样本不参与正片模板自动推荐。
                continue
            normalized_episode, native_episode, used_native_fallback, native_verified = (
                self._extract_episode_with_native_fallback(item)
            )
            if normalized_episode and native_episode and not (
                used_native_fallback or native_verified
            ):
                continue
            expected_start, _ = self._parse_episode_value(normalized_episode)
            if expected_start is None:
                continue
            if expected_start <= 0:
                # 00 集通常归属于特殊季，不参与正片模板自动推荐。
                continue
            if normalized_episode and not normalized_episode.isdigit():
                # 非纯整数的特殊集数当前不在 FormatParser 消费契约内，
                # 继续参与推荐只会把正片模板生成带偏。
                continue

            ep_span = self._locate_episode(file_name, normalized_episode)
            if ep_span is None:
                logger.debug(
                    "自动推荐样本跳过：未定位到集数 token - "
                    f"{file_name} - episode={normalized_episode}"
                )
                continue

            logger.debug(
                "自动推荐样本入选："
                f"{file_name} - episode={normalized_episode}, span={ep_span}, "
                f"matched={file_name[ep_span[0]:ep_span[1]]}, "
                f"kind={self._get_file_kind(item)}"
            )
            valid_samples.append(
                _AutoRecommendSample(
                    file_name=file_name,
                    ep_span=ep_span,
                    expected_episode=normalized_episode,
                    source_kind=self._get_file_kind(item),
                    native_episode=native_episode,
                    native_verified=native_verified,
                    used_native_fallback=used_native_fallback,
                )
            )
        return valid_samples

    @classmethod
    def _locate_episode(
        cls,
        file_name: str,
        episode_value: str,
    ) -> Optional[Tuple[int, int]]:
        """
        三级策略反向定位 episode_number 在文件名中的位置
        """
        normalized_episode_value = cls._normalize_episode_value(episode_value)
        for matcher in (
            cls._EP_RANGE_RE,
            cls._EP_PREFIX_RE,
            cls._SEASON_EP_RANGE_RE,
            cls._SEASON_EP_RE,
            cls._HASH_EP_RE,
            cls._BRACKET_EP_RE,
            cls._CJK_EP_RE,
        ):
            for match in matcher.finditer(file_name):
                if cls._episode_value_equals(
                    match.group(1),
                    normalized_episode_value,
                ):
                    return match.span(1)

        for candidate in cls._build_episode_candidates(normalized_episode_value):
            token_pattern = re.compile(
                rf"(?:(?<=^)|(?<=[\s._\-\[\]【】()「」『』《》〈〉〔〕]))"
                rf"{re.escape(candidate)}"
                rf"(?:(?=$)|(?=[\s._\-\[\]【】()「」『』《》〈〉〔〕]))"
            )
            matches = list(token_pattern.finditer(file_name))
            if matches:
                return matches[-1].span()
        return None

    @staticmethod
    def _normalize_episode_value(episode_value) -> str:
        if isinstance(episode_value, list):
            parts = [str(part) for part in episode_value]
        else:
            parts = str(episode_value).split("-")
        normalized_parts = [
            re.sub(r"^[Ee][Pp]?", "", part.strip())
            for part in parts
            if str(part).strip()
        ]
        return "-".join(normalized_parts)

    @staticmethod
    def _parse_episode_value(
        expected_episode: Optional[str],
    ) -> Tuple[Optional[int], Optional[int]]:
        if not expected_episode:
            return None, None
        parts = []
        for part in str(expected_episode).split("-"):
            cleaned = re.sub(r"^[Ee][Pp]?", "", part.strip())
            number_match = re.search(r"\d{1,4}", cleaned)
            if not number_match:
                return None, None
            parts.append(int(number_match.group()))
        if not parts:
            return None, None
        if len(parts) == 1 or parts[-1] == parts[0]:
            return parts[0], None
        return parts[0], parts[-1]

    @classmethod
    def _episode_value_equals(
        cls,
        actual_episode: Optional[str],
        expected_episode: Optional[str],
    ) -> bool:
        if not actual_episode or not expected_episode:
            return False
        return cls._parse_episode_value(actual_episode) == cls._parse_episode_value(
            expected_episode
        )

    @classmethod
    def _build_episode_candidates(
        cls,
        episode_value: Optional[str],
    ) -> List[str]:
        start_episode, end_episode = cls._parse_episode_value(episode_value)
        if start_episode is None:
            return []
        candidates: List[str] = []
        if end_episode is None:
            for width in range(1, 5):
                candidates.append(str(start_episode).zfill(width))
        else:
            for width in range(1, 5):
                start_text = str(start_episode).zfill(width)
                end_text = str(end_episode).zfill(width)
                candidates.append(f"{start_text}-{end_text}")
                candidates.append(f"{start_text}-E{end_text}")
                candidates.append(f"{start_text}-EP{end_text}")
        # 保证顺序稳定，同时去重
        return list(dict.fromkeys(candidates))

    @classmethod
    def _extract_native_episode(cls, item: FileItem) -> Optional[str]:
        source_path = item.path or item.name
        if not source_path:
            return None
        try:
            meta = MetaInfoPath(Path(source_path))
        except Exception as err:
            logger.warn(f"原生集数识别失败：{source_path} - {err}")
            return None
        if meta.begin_episode is None:
            return None
        if meta.end_episode is not None and meta.end_episode != meta.begin_episode:
            return f"{meta.begin_episode}-{meta.end_episode}"
        return str(meta.begin_episode)

    @classmethod
    def _should_degrade_native_conflict(
        cls,
        file_name: str,
        normalized_episode: Optional[str],
        native_episode: Optional[str],
    ) -> bool:
        """
        判断原生集数冲突是否应降级处理。

        当自动定位到的集数 token 明确出现在文件名后部，而原生识别出来的数字
        只出现在更靠前的位置时，通常是标题续作号或目录序号误判，不应继续作
        为自动推荐的否决条件。
        """
        if not file_name or not normalized_episode or not native_episode:
            return False

        auto_span = cls._locate_episode(file_name, normalized_episode)
        native_span = cls._locate_episode(file_name, native_episode)
        if not auto_span or not native_span:
            return False
        return native_span[1] <= auto_span[0]

    @classmethod
    def _should_prefer_fallback_episode(
        cls,
        file_name: str,
        anitopy_episode: Optional[Union[str, List[str]]],
        fallback_episode: Optional[Union[str, List[str]]],
    ) -> bool:
        """
        当 anitopy 命中了标题前部数字，而 fallback 命中了更靠后的显式集数 token 时，
        优先使用 fallback 结果。
        """
        if not file_name or not anitopy_episode or not fallback_episode:
            return False
        normalized_anitopy_episode = cls._normalize_episode_value(anitopy_episode)
        normalized_fallback_episode = cls._normalize_episode_value(fallback_episode)
        if cls._episode_value_equals(
            normalized_anitopy_episode,
            normalized_fallback_episode,
        ):
            return False
        _, anitopy_end_episode = cls._parse_episode_value(normalized_anitopy_episode)
        if anitopy_end_episode is not None:
            return False

        anitopy_span = cls._locate_episode(file_name, normalized_anitopy_episode)
        fallback_span = cls._locate_episode(file_name, normalized_fallback_episode)
        if not anitopy_span or not fallback_span:
            return False
        return anitopy_span[1] <= fallback_span[0]

    def _extract_episode_with_native_fallback(
        self,
        item: FileItem,
    ) -> Tuple[Optional[str], Optional[str], bool, bool]:
        """
        Detect the episode of a file and cross-check it with the native result.

        anitopy is tried first; the regex fallback is used when anitopy finds
        nothing or when the fallback token is judged more plausible. The
        native episode then either confirms the result, is discarded as a
        likely title number, is left as an unresolved conflict, or supplies
        the value when nothing else was detected.

        :return: (normalized episode, native episode, used native fallback,
            native verified)
        """
        file_name = item.name or ""
        native_episode = self._extract_native_episode(item)

        anitopy_episode = None
        try:
            anitopy_episode = anitopy.parse(file_name).get("episode_number")
        except Exception as err:
            logger.warn(f"anitopy 解析失败：{file_name} - {err}")

        fallback_episode = self._extract_episode_fallback(file_name)
        episode_number = anitopy_episode
        if not episode_number or self._should_prefer_fallback_episode(
            file_name,
            anitopy_episode,
            fallback_episode,
        ):
            episode_number = fallback_episode

        normalized_episode = None
        if episode_number:
            normalized_episode = self._normalize_episode_value(episode_number)
        logger.debug(
            "自动推荐集数提取："
            f"{file_name} - anitopy={anitopy_episode}, "
            f"fallback={fallback_episode}, normalized={normalized_episode}, "
            f"native={native_episode}"
        )

        used_native_fallback = False
        native_verified = False
        if native_episode:
            if not normalized_episode:
                # Nothing detected from the name: adopt the native episode.
                normalized_episode = native_episode
                used_native_fallback = True
            elif self._episode_value_equals(normalized_episode, native_episode):
                native_verified = True
            elif self._should_degrade_native_conflict(
                file_name,
                normalized_episode,
                native_episode,
            ):
                logger.info(
                    "原生集数识别疑似命中标题序号，降级冲突权重："
                    f"{file_name} - auto={normalized_episode}, native={native_episode}"
                )
                native_episode = None
            # Otherwise the conflict stays unresolved: both flags remain False
            # while both episode values are returned.
        return normalized_episode, native_episode, used_native_fallback, native_verified

    @classmethod
    def _extract_episode_fallback(cls, file_name: str) -> Optional[str]:
        """
        Fallback episode extraction for names anitopy could not resolve.

        The more structured season-episode / hash / bracket forms are tried
        first, then the common CJK wordings. Patterns are tried in a fixed
        order; the first pattern that hits decides the result and capture
        group 1 of the chosen match is returned. ``None`` is returned when no
        pattern matches.
        """
        # Season-episode patterns: the first occurrence in the name wins.
        for pattern in (cls._SEASON_EP_RANGE_RE, cls._SEASON_EP_RE):
            hit = pattern.search(file_name)
            if hit:
                return hit.group(1)

        # Hash / bracket patterns: the last occurrence in the name wins.
        for pattern in (cls._HASH_EP_RE, cls._FALLBACK_BRACKET_EP_RE):
            hits = list(pattern.finditer(file_name))
            if hits:
                return hits[-1].group(1)

        # Remaining wording-based patterns: the first occurrence wins again.
        for pattern in (
            cls._FALLBACK_EPISODE_RE,
            cls._FALLBACK_EPISODE_JI_RE,
            cls._FALLBACK_PERIOD_RE,
        ):
            hit = pattern.search(file_name)
            if hit:
                return hit.group(1)
        return None

    @staticmethod
    def _select_base_samples(
        samples: Iterable[_AutoRecommendSample],
    ) -> Tuple[List[_AutoRecommendSample], bool]:
        """
        Pick the base samples by majority vote on the text before the episode.

        Samples are bucketed by ``file_name[:ep_span[0]]``; the largest bucket
        wins (ties broken by the prefix text itself), which filters out
        outliers such as OADs.

        :return: the winning bucket sorted by (source-kind priority, file
                 name, episode span) and a flag that is True when there is
                 only one bucket or the winner is strictly larger than the
                 runner-up
        """
        buckets: Dict[str, List[_AutoRecommendSample]] = defaultdict(list)
        for sample in samples:
            prefix = sample.file_name[: sample.ep_span[0]]
            buckets[prefix].append(sample)

        def _bucket_rank(entry):
            prefix, members = entry
            return -len(members), prefix

        def _member_rank(member):
            return (
                EpisodeFormatRuleHelper._sample_kind_priority(member.source_kind),
                member.file_name,
                member.ep_span[0],
                member.ep_span[1],
            )

        ranked = sorted(buckets.items(), key=_bucket_rank)
        winners = sorted(ranked[0][1], key=_member_rank)
        if len(ranked) == 1:
            return winners, True
        return winners, len(winners) > len(ranked[1][1])

    def _build_ep_only_template(
        self,
        file_names: List[str],
        ep_spans: List[Tuple[int, int]],
        use_majority: bool = True,
    ) -> str:
        """
        基于多数派文件生成仅含 {ep} 的模板
        """
        if use_majority:
            majority_samples, _ = self._select_base_samples(
                _AutoRecommendSample(
                    file_name=name,
                    ep_span=span,
                    expected_episode="",
                )
                for name, span in zip(file_names, ep_spans)
            )
            file_names = [sample.file_name for sample in majority_samples]
            ep_spans = [sample.ep_span for sample in majority_samples]
        return self._build_ep_template_from_file(file_names[0], ep_spans[0])

    def _build_ep_template_from_file(
        self,
        file_name: str,
        ep_span: Tuple[int, int],
    ) -> str:
        start, end = ep_span
        return (
            self._escape_literal(file_name[:start])
            + "{ep}"
            + self._escape_literal(file_name[end:])
        )

    def _build_template_with_diff(
        self,
        file_names: List[str],
        ep_spans: List[Tuple[int, int]],
        use_majority: bool = True,
    ) -> Optional[str]:
        """
        多文件对比生成含 {a}/{b}/{c} 占位符的模板
        """
        if use_majority:
            majority_samples, _ = self._select_base_samples(
                _AutoRecommendSample(
                    file_name=name,
                    ep_span=span,
                    expected_episode="",
                )
                for name, span in zip(file_names, ep_spans)
            )
            file_names = [sample.file_name for sample in majority_samples]
            ep_spans = [sample.ep_span for sample in majority_samples]
        if len(file_names) < 2:
            return None

        before_ep_set = {name[: span[0]] for name, span in zip(file_names, ep_spans)}
        if len(before_ep_set) != 1:
            return None

        after_ep_list = [name[span[1]:] for name, span in zip(file_names, ep_spans)]
        if len(set(after_ep_list)) == 1:
            return None

        template = self._build_ep_template_from_file(file_names[0], ep_spans[0])
        placeholders = ["a", "b", "c"]
        placeholder_idx = 0

        while placeholder_idx < len(placeholders):
            failed = self._find_unmatched(template, file_names)
            if not failed:
                break
            updated_template = self._insert_variable_placeholder(
                template,
                failed,
                after_ep_list,
                file_names,
                placeholders[placeholder_idx],
            )
            if updated_template == template:
                break
            template = updated_template
            placeholder_idx += 1
        return template

    @staticmethod
    def _find_unmatched(
        template: str,
        file_names: List[str],
    ) -> List[str]:
        """
        Return the file names that the template fails to match.

        When no parser can be created for the template, every file name is
        reported as unmatched (a copy of ``file_names`` is returned).
        """
        context = "多文件对比预校验"
        parser = EpisodeFormatRuleHelper._create_format_parser(
            template,
            context=context,
        )
        if not parser:
            return list(file_names)
        return [
            name
            for name in file_names
            if not EpisodeFormatRuleHelper._safe_match_template(
                parser,
                name,
                context=context,
            )
        ]

    def _insert_variable_placeholder(
        self,
        template: str,
        failed_files: List[str],
        after_ep_list: List[str],
        all_file_names: List[str],
        placeholder: str,
    ) -> str:
        ep_marker = "{ep}"
        ep_pos = template.find(ep_marker)
        if ep_pos < 0:
            return template

        current_after_ep_template = template[ep_pos + len(ep_marker):]
        base_after_ep = after_ep_list[0]
        existing_spans = self._collect_placeholder_spans(
            current_after_ep_template, base_after_ep
        )
        failed_after_ep_list = [
            after_ep
            for name, after_ep in zip(all_file_names, after_ep_list)
            if name in failed_files
        ]
        next_span = self._find_next_variable_span(
            base_after_ep,
            failed_after_ep_list,
            existing_spans,
        )
        if next_span is None:
            return template

        updated_spans = existing_spans + [
            (next_span[0], next_span[1], placeholder)
        ]
        before_ep = template[:ep_pos]
        return before_ep + ep_marker + self._render_after_ep_template(
            base_after_ep,
            updated_spans,
        )

    @staticmethod
    def _collect_placeholder_spans(
        after_ep_template: str,
        base_after_ep: str,
    ) -> List[Tuple[int, int, str]]:
        if not after_ep_template or "{" not in after_ep_template:
            return []
        result = EpisodeFormatRuleHelper._safe_parse_template(
            after_ep_template,
            base_after_ep,
            context="占位符区间收集",
        )
        if not result:
            return []
        spans: List[Tuple[int, int, str]] = []
        for name, span in result.spans.items():
            spans.append((span[0], span[1], name))
        spans.sort(key=lambda item: item[0])
        return spans

    def _find_next_variable_span(
        self,
        base_after_ep: str,
        failed_after_ep_list: List[str],
        existing_spans: List[Tuple[int, int, str]],
    ) -> Optional[Tuple[int, int]]:
        cursor = 0
        literal_gaps: List[Tuple[int, int]] = []
        for start, end, _ in existing_spans:
            if cursor < start:
                literal_gaps.append((cursor, start))
            cursor = end
        if cursor < len(base_after_ep):
            literal_gaps.append((cursor, len(base_after_ep)))

        for gap_start, gap_end in literal_gaps:
            if gap_start >= gap_end:
                continue
            probe_template = self._render_after_ep_template(
                base_after_ep,
                existing_spans + [(gap_start, gap_end, "probe")],
            )
            probe_values: List[str] = []
            base_gap = base_after_ep[gap_start:gap_end]
            for failed_after_ep in failed_after_ep_list:
                result = self._safe_parse_template(
                    probe_template,
                    failed_after_ep,
                    context="变量区间探测",
                )
                if not result:
                    continue
                probe_value = result.named.get("probe")
                if probe_value is None or probe_value == base_gap:
                    continue
                probe_values.append(probe_value)
            if not probe_values:
                continue

            relative_span = self._calculate_variable_span(base_gap, probe_values)
            if relative_span is None:
                continue
            return gap_start + relative_span[0], gap_start + relative_span[1]
        return None

    def _calculate_variable_span(
        self,
        base_text: str,
        compare_texts: List[str],
    ) -> Optional[Tuple[int, int]]:
        candidates = [base_text] + compare_texts
        prefix_len = self._common_prefix_length(candidates)
        suffix_len = self._common_suffix_length(candidates, prefix_len)
        end_pos = len(base_text) - suffix_len
        if prefix_len >= end_pos:
            base_part = base_text[prefix_len:end_pos]
            compare_parts = [
                text[
                    prefix_len:
                    len(text) - suffix_len if suffix_len else len(text)
                ]
                for text in compare_texts
            ]
            if not base_part and any(compare_parts):
                return prefix_len, prefix_len
            return None

        base_part = base_text[prefix_len:end_pos]
        compare_parts = [
            text[
                prefix_len:
                len(text) - suffix_len if suffix_len else len(text)
            ]
            for text in compare_texts
        ]
        if any(not part for part in [base_part] + compare_parts):
            if not base_part and any(compare_parts):
                return prefix_len, prefix_len
            if base_part and any(part == "" for part in compare_parts):
                return prefix_len, end_pos
            return None
        return prefix_len, end_pos

    @staticmethod
    def _common_prefix_length(texts: List[str]) -> int:
        if not texts:
            return 0
        min_len = min(len(text) for text in texts)
        prefix_len = 0
        while prefix_len < min_len:
            current_char = texts[0][prefix_len]
            if any(text[prefix_len] != current_char for text in texts[1:]):
                break
            prefix_len += 1
        return prefix_len

    @staticmethod
    def _common_suffix_length(
        texts: List[str],
        prefix_len: int = 0,
    ) -> int:
        if not texts:
            return 0
        suffix_len = 0
        min_len = min(len(text) for text in texts)
        while suffix_len < min_len - prefix_len:
            current_char = texts[0][-suffix_len - 1]
            if any(text[-suffix_len - 1] != current_char for text in texts[1:]):
                break
            suffix_len += 1
        return suffix_len

    def _render_after_ep_template(
        self,
        base_after_ep: str,
        spans: List[Tuple[int, int, str]],
    ) -> str:
        template_parts: List[str] = []
        cursor = 0
        for start, end, name in sorted(spans, key=lambda item: item[0]):
            if start < cursor or end < start:
                continue
            template_parts.append(
                self._escape_literal(base_after_ep[cursor:start])
            )
            template_parts.append(f"{{{name}}}")
            cursor = end
        template_parts.append(self._escape_literal(base_after_ep[cursor:]))
        return "".join(template_parts)

    def _validate_auto_template(
        self,
        episode_format: str,
        samples: List[_AutoRecommendSample],
    ) -> bool:
        """
        用 FormatParser 校验自动生成的模板
        """
        if not episode_format:
            return False
        parser = self._create_format_parser(
            episode_format,
            context="自动模板校验",
        )
        if not parser:
            return False
        for sample in samples:
            if not self._safe_match_template(
                parser,
                sample.file_name,
                context="自动模板校验",
            ):
                logger.debug(
                    "自动模板校验失败：模板未命中文件 - "
                    f"template={episode_format}, file={sample.file_name}"
                )
                return False
            start_episode, end_episode, _ = self._safe_split_episode(
                parser,
                sample.file_name,
                context="自动模板校验",
            )
            if not self._episode_matches(
                start_episode,
                end_episode,
                sample.expected_episode,
            ):
                logger.debug(
                    "自动模板校验失败：集数不匹配 - "
                    f"template={episode_format}, file={sample.file_name}, "
                    f"expected={sample.expected_episode}, actual={start_episode}-{end_episode}"
                )
                return False
            if sample.native_episode and not self._episode_matches(
                start_episode,
                end_episode,
                sample.native_episode,
            ):
                logger.debug(
                    "自动模板校验失败：与原生集数不一致 - "
                    f"template={episode_format}, file={sample.file_name}, "
                    f"native={sample.native_episode}, actual={start_episode}-{end_episode}"
                )
                return False
        return True

    @staticmethod
    def _match_rule(
        rule: EpisodeFormatRule,
        sample_files: List[FileItem],
    ) -> List[Tuple[FileItem, Match[str]]]:
        """
        获取规则命中的样本文件
        """
        try:
            compiled_pattern = re.compile(
                EpisodeFormatRuleHelper._normalize_pattern(rule.pattern)
            )
        except Exception as err:
            logger.warn(f"集数定位规则 {rule.name} 编译失败：{err}")
            return []

        matched_samples: List[Tuple[FileItem, Match[str]]] = []
        for item in sample_files:
            if (
                rule.min_file_size_mb
                and EpisodeFormatRuleHelper._get_file_kind(item) == "media"
                and (item.size or 0) < rule.min_file_size_mb * 1024 * 1024
            ):
                continue
            match_result = compiled_pattern.search(item.name or "")
            if not match_result or "ep" not in match_result.groupdict():
                continue
            matched_samples.append((item, match_result))
        return matched_samples

    def _build_template(
        self,
        file_name: str,
        match_result: Match[str],
    ) -> Optional[str]:
        """
        根据命中的样本生成模板
        """
        group_items = []
        for group_name, group_value in match_result.groupdict().items():
            if group_value is None:
                continue
            start, end = match_result.span(group_name)
            if start < 0 or end < 0:
                continue
            if start == end:
                continue
            group_items.append((start, end, group_name))

        if not group_items or not any(
            group_name == "ep"
            for _, _, group_name in group_items
        ):
            return None

        group_items.sort(key=lambda item: (item[0], -(item[1] - item[0])))
        template_parts: List[str] = []
        cursor = 0
        for start, end, group_name in group_items:
            if start < cursor:
                continue
            template_parts.append(self._escape_literal(file_name[cursor:start]))
            template_parts.append(f"{{{group_name}}}")
            cursor = end
        template_parts.append(self._escape_literal(file_name[cursor:]))
        return "".join(template_parts)

    def _validate_template(
        self,
        episode_format: str,
        matched_samples: List[Tuple[FileItem, Match[str]]],
    ) -> bool:
        """
        Check that the generated template is consumed reliably by the format parser.

        Validation fails when no parser can be created, when the parser does
        not match a sample's file name, or when the episode it extracts does
        not equal the ``ep`` group captured by the rule for that sample.
        """
        context = "规则模板校验"
        parser = self._create_format_parser(episode_format, context=context)
        if not parser:
            return False

        for item, match_result in matched_samples:
            file_name = item.name or ""
            if not self._safe_match_template(parser, file_name, context=context):
                return False
            start_episode, end_episode, _ = self._safe_split_episode(
                parser,
                file_name,
                context=context,
            )
            if not self._episode_matches(
                start_episode,
                end_episode,
                match_result.groupdict().get("ep"),
            ):
                return False
        return True

    @staticmethod
    def _create_format_parser(
        episode_format: str,
        context: str,
    ) -> Optional[FormatParser]:
        """
        Create a FormatParser for the template without letting errors escape.

        Any exception raised by the constructor is logged as a warning,
        prefixed with ``context``, and ``None`` is returned instead.
        """
        try:
            parser = FormatParser(eformat=episode_format)
        except Exception as err:
            logger.warn(f"{context} 创建模板解析器失败：{episode_format} - {err}")
            return None
        else:
            return parser

    @staticmethod
    def _safe_match_template(
        parser: FormatParser,
        file_name: str,
        context: str,
    ) -> bool:
        try:
            return parser.match(file_name)
        except Exception as err:
            logger.warn(f"{context} 模板匹配失败：{file_name} - {err}")
            return False

    @classmethod
    def _safe_split_episode(
        cls,
        parser: FormatParser,
        file_name: str,
        context: str,
    ) -> Tuple[Optional[int], Optional[int], Optional[str]]:
        try:
            return parser.split_episode(
                file_name=file_name,
                file_meta=cls._EMPTY_META,
            )
        except Exception as err:
            logger.warn(f"{context} 集数拆分失败：{file_name} - {err}")
            return None, None, None

    @staticmethod
    def _safe_parse_template(
        template: str,
        file_name: str,
        context: str,
    ) -> Optional[_TemplateParseResult]:
        """
        Run ``_match_template`` on the text without letting errors escape.

        Any exception is logged as a warning, prefixed with ``context``, and
        ``None`` is returned instead.
        """
        try:
            parsed = _match_template(template, file_name)
        except Exception as err:
            logger.warn(f"{context} parse 模板解析失败：{template} <- {file_name} - {err}")
            parsed = None
        return parsed

    @classmethod
    def _episode_matches(
        cls,
        actual_start: Optional[int],
        actual_end: Optional[int],
        expected_episode: Optional[str],
    ) -> bool:
        """
        Check that the episode extracted by a template equals the expected value.

        ``expected_episode`` is parsed into a (start, end) pair. Both start
        values must be present and equal. When the expected value has no end,
        the actual end must be missing as well; otherwise both ends must be
        equal.
        """
        wanted_start, wanted_end = cls._parse_episode_value(expected_episode)
        starts_known = actual_start is not None and wanted_start is not None
        if not (starts_known and actual_start == wanted_start):
            return False
        if wanted_end is None:
            return actual_end is None
        return actual_end == wanted_end

    @staticmethod
    def _normalize_pattern(pattern: str) -> str:
        """
        将 PCRE 风格命名组转为 Python re 可识别的语法
        """
        return re.sub(
            r"\(\?<([a-zA-Z_][a-zA-Z0-9_]*)>",
            r"(?P<\1>",
            pattern,
        )

    def _escape_literal(self, text: str) -> str:
        """
        将样本文本转为 parse 模板中的字面量
        """
        escaped_parts: List[str] = []
        for char in text:
            if char in "{}":
                escaped_parts.append(char * 2)
            else:
                escaped_parts.append(char)
        return "".join(escaped_parts)