diff --git a/app/utils/string.py b/app/utils/string.py index 6f73736c..4fd216d9 100644 --- a/app/utils/string.py +++ b/app/utils/string.py @@ -23,6 +23,17 @@ _special_domains = [ _version_map = {"stable": -1, "rc": -2, "beta": -3, "alpha": -4} # 不符合的版本号 _other_version = -5 +_max_media_title_words = 10 +_min_media_title_length = 2 +_non_media_title_pattern = re.compile(r"^#|^请[问帮你]|[??]$|^继续$") +_chat_intent_pattern = re.compile(r"帮我|请问|怎么|如何|为什么|可以|能否|推荐|介绍|谢谢|想看|找一下|搜一下") +_media_feature_pattern = re.compile( + r"第\s*[0-9一二三四五六七八九十百零]+\s*[季集]|S\d{1,2}(?:E\d{1,4})?|E\d{1,4}|(?:19|20)\d{2}", + re.IGNORECASE +) +_media_separator_pattern = re.compile(r"[\s\-_.::·'\"()\[\]【】]+") +_media_sentence_punctuation_pattern = re.compile(r"[,。!?!?,;;]") +_media_title_char_pattern = re.compile(r"[\u4e00-\u9fffA-Za-z]") class StringUtils: @@ -541,27 +552,20 @@ class StringUtils: text = re.sub(r'\s+', ' ', text).strip() if not text: return False - if text.startswith("#") \ - or re.search(r"^请[问帮你]", text) \ - or re.search(r"[??]$", text) \ - or StringUtils.count_words(text) > 10 \ - or "继续" in text: + if _non_media_title_pattern.search(text) \ + or StringUtils.count_words(text) > _max_media_title_words: return False - if StringUtils.is_link(text): + if "://" in text or text.startswith("magnet:?"): return False - if re.search(r"(帮我|请问|怎么|如何|为什么|可以|能否|推荐|介绍|谢谢|想看|找一下|搜一下)", text): + if _chat_intent_pattern.search(text): return False - if re.search(r"[,。!?!?,;;]", text): + if _media_sentence_punctuation_pattern.search(text): return False - candidate = re.sub( - r"第\s*[0-9一二三四五六七八九十百零]+\s*[季集]|S\d{1,2}(?:E\d{1,4})?|E\d{1,4}|(?:19|20)\d{2}", - "", - text, - flags=re.IGNORECASE - ) - candidate = re.sub(r"[\s\-_.::·'\"()\[\]【】]+", "", candidate) - return len(candidate) >= 2 and bool(re.search(r"[\u4e00-\u9fffA-Za-z]", candidate)) + # 先移除季/集/年份等媒体特征,再移除分隔符,只保留核心名称用于最终判定 + candidate = _media_feature_pattern.sub("", text) + candidate = _media_separator_pattern.sub("", candidate) + return len(candidate) >= _min_media_title_length and _media_title_char_pattern.search(candidate) is not None @staticmethod def split_text(text: str, max_length: int) -> Generator: diff --git a/tests/test_string.py b/tests/test_string.py index beec6450..648aebb7 100644 --- a/tests/test_string.py +++ b/tests/test_string.py @@ -9,8 +9,16 @@ class StringUtilsTest(TestCase): self.assertTrue(StringUtils.is_media_title_like("盗梦空间")) self.assertTrue(StringUtils.is_media_title_like("The Lord of the Rings")) self.assertTrue(StringUtils.is_media_title_like("庆余年 第2季")) + self.assertTrue(StringUtils.is_media_title_like("The Office S01E01")) + self.assertTrue(StringUtils.is_media_title_like("权力的游戏 Game of Thrones")) + self.assertTrue(StringUtils.is_media_title_like("Spider-Man: No Way Home 2021")) def test_is_media_title_like_false(self): + self.assertFalse(StringUtils.is_media_title_like("")) + self.assertFalse(StringUtils.is_media_title_like(" ")) + self.assertFalse(StringUtils.is_media_title_like("a")) + self.assertFalse(StringUtils.is_media_title_like("第2季")) + self.assertFalse(StringUtils.is_media_title_like("S01E01")) self.assertFalse(StringUtils.is_media_title_like("#推荐电影")) self.assertFalse(StringUtils.is_media_title_like("请帮我推荐一部电影")) self.assertFalse(StringUtils.is_media_title_like("盗梦空间怎么样?"))