test: extend media-title detection coverage and clean up

Co-authored-by: jxxghp <51039935+jxxghp@users.noreply.github.com>
This commit is contained in:
copilot-swe-agent[bot]
2026-03-03 12:20:54 +00:00
parent 86000ea19a
commit a9f2b40529
2 changed files with 28 additions and 16 deletions

View File

@@ -23,6 +23,17 @@ _special_domains = [
# Maps release-stage labels to negative integer codes.
_version_map = {"stable": -1, "rc": -2, "beta": -3, "alpha": -4}
# Code for a version number that does not conform to the labels above.
_other_version = -5

# --- Constants for the media-title heuristic in StringUtils ---
# Text whose word count exceeds this limit is not treated as a media title.
_max_media_title_words = 10
# Minimum length the stripped candidate name must have to count as a title.
_min_media_title_length = 2
# Rejects text that starts with "#", starts with 请 followed by 问/帮/你,
# ends with "?", or consists solely of 继续.
# NOTE(review): "[?]" may originally have also held a full-width question
# mark that was lost in normalization - confirm against the repository.
_non_media_title_pattern = re.compile(r"^#|^请[问帮你]|[?]$|^继续$")
# Conversational keywords; any occurrence marks the text as a chat request
# rather than a title.
_chat_intent_pattern = re.compile(r"帮我|请问|怎么|如何|为什么|可以|能否|推荐|介绍|谢谢|想看|找一下|搜一下")
# Season/episode/year markers: an Arabic or Chinese numeral followed by 季/集,
# SxxEyy, Exx, or a four-digit year starting with 19 or 20 (case-insensitive).
# These are removed from the text before the core name is evaluated.
_media_feature_pattern = re.compile(
r"\s*[0-9一二三四五六七八九十百零]+\s*[季集]|S\d{1,2}(?:E\d{1,4})?|E\d{1,4}|(?:19|20)\d{2}",
re.IGNORECASE
)
# Runs of whitespace, separators, quotes and brackets removed from the
# candidate name.
# NOTE(review): the duplicated ":" looks like a full-width variant lost in
# normalization - confirm against the repository.
_media_separator_pattern = re.compile(r"[\s\-_.::·'\"()\[\]【】]+")
# Sentence punctuation; its presence marks the text as a sentence, not a title.
_media_sentence_punctuation_pattern = re.compile(r"[,。!?!?,;]")
# A single character in the \u4e00-\u9fff range or an ASCII letter; the
# candidate must contain at least one such character.
_media_title_char_pattern = re.compile(r"[\u4e00-\u9fffA-Za-z]")
class StringUtils:
@@ -541,27 +552,20 @@ class StringUtils:
text = re.sub(r'\s+', ' ', text).strip()
if not text:
return False
if text.startswith("#") \
or re.search(r"^请[问帮你]", text) \
or re.search(r"[?]$", text) \
or StringUtils.count_words(text) > 10 \
or "继续" in text:
if _non_media_title_pattern.search(text) \
or StringUtils.count_words(text) > _max_media_title_words:
return False
if StringUtils.is_link(text):
if "://" in text or text.startswith("magnet:?"):
return False
if re.search(r"(帮我|请问|怎么|如何|为什么|可以|能否|推荐|介绍|谢谢|想看|找一下|搜一下)", text):
if _chat_intent_pattern.search(text):
return False
if re.search(r"[,。!?!?,;]", text):
if _media_sentence_punctuation_pattern.search(text):
return False
candidate = re.sub(
r"\s*[0-9一二三四五六七八九十百零]+\s*[季集]|S\d{1,2}(?:E\d{1,4})?|E\d{1,4}|(?:19|20)\d{2}",
"",
text,
flags=re.IGNORECASE
)
candidate = re.sub(r"[\s\-_.::·'\"()\[\]【】]+", "", candidate)
return len(candidate) >= 2 and bool(re.search(r"[\u4e00-\u9fffA-Za-z]", candidate))
# 先移除季/集/年份等媒体特征,再移除分隔符,只保留核心名称用于最终判定
candidate = _media_feature_pattern.sub("", text)
candidate = _media_separator_pattern.sub("", candidate)
return len(candidate) >= _min_media_title_length and _media_title_char_pattern.search(candidate) is not None
@staticmethod
def split_text(text: str, max_length: int) -> Generator: