Note: line structure restored from a whitespace-collapsed copy of this patch.
The absolute indentation of the app/chain/message.py hunk could not be
recovered and is assumed; use `git apply --ignore-whitespace` if it does not
match the target file. Blob ids on the `index` lines predate the review edits.

diff --git a/app/chain/message.py b/app/chain/message.py
index d327a709..74b7817d 100644
--- a/app/chain/message.py
+++ b/app/chain/message.py
@@ -490,18 +490,14 @@ class MessageChain(ChainBase):
                     # 重新搜索/下载
                     content = re.sub(r"(搜索|下载)[::\s]*", "", text)
                     action = "ReSearch"
-                elif text.startswith("#") \
-                        or re.search(r"^请[问帮你]", text) \
-                        or re.search(r"[??]$", text) \
-                        or StringUtils.count_words(text) > 10 \
-                        or text.find("继续") != -1:
-                    # 聊天
-                    content = text
-                    action = "Chat"
                 elif StringUtils.is_link(text):
                     # 链接
                     content = text
                     action = "Link"
+                elif not StringUtils.is_media_title_like(text):
+                    # Chat: anything that does not look like a media title
+                    content = text
+                    action = "Chat"
                 else:
                     # 搜索
                     content = text
diff --git a/app/utils/string.py b/app/utils/string.py
index 1bd37b07..4fd216d9 100644
--- a/app/utils/string.py
+++ b/app/utils/string.py
@@ -23,6 +23,26 @@ _special_domains = [
 _version_map = {"stable": -1, "rc": -2, "beta": -3, "alpha": -4}
 # 不符合的版本号
 _other_version = -5
+# Heuristics used by StringUtils.is_media_title_like
+# Word-count ceiling above which a message is treated as chat, not a title
+_max_media_title_words = 10
+# Minimum length of the core name left after stripping features and separators
+_min_media_title_length = 2
+# Explicit non-title markers: leading "#", request openers, trailing "?", bare "继续"
+_non_media_title_pattern = re.compile(r"^#|^请[问帮你]|[??]$|^继续$")
+# Conversational phrases that signal a request or question rather than a title
+_chat_intent_pattern = re.compile(r"帮我|请问|怎么|如何|为什么|可以|能否|推荐|介绍|谢谢|想看|找一下|搜一下")
+# Season / episode / year decorations that may accompany a title
+_media_feature_pattern = re.compile(
+    r"第\s*[0-9一二三四五六七八九十百零]+\s*[季集]|S\d{1,2}(?:E\d{1,4})?|E\d{1,4}|(?:19|20)\d{2}",
+    re.IGNORECASE
+)
+# Separators and brackets that carry no title content
+_media_separator_pattern = re.compile(r"[\s\-_.::·'\"()\[\]【】]+")
+# Sentence punctuation; its presence marks the text as a sentence
+_media_sentence_punctuation_pattern = re.compile(r"[,。!?!?,;;]")
+# Letters that can form a title: CJK ideographs, kana, Hangul syllables, Latin
+_media_title_char_pattern = re.compile(r"[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7a3A-Za-z]")
 
 
 class StringUtils:
@@ -531,6 +551,45 @@ class StringUtils:
 
         return chinese_count + english_count
 
+    @staticmethod
+    def is_media_title_like(text: str) -> bool:
+        """
+        Heuristically decide whether the text looks like a movie/TV title.
+
+        The text is rejected when it is empty, matches an explicit non-title
+        marker, exceeds the word limit, contains a link/magnet marker, or
+        contains a chat-intent phrase or sentence punctuation. Otherwise the
+        season/episode/year decorations and separators are stripped, and the
+        remainder must keep at least _min_media_title_length characters,
+        one of which is a letter.
+
+        :param text: raw message text
+        :return: True when the text looks like a media title
+        """
+        if not text:
+            return False
+        # Collapse whitespace runs so the anchored patterns see trimmed text
+        text = re.sub(r'\s+', ' ', text).strip()
+        if not text:
+            return False
+        if _non_media_title_pattern.search(text) \
+                or StringUtils.count_words(text) > _max_media_title_words:
+            return False
+        # Links and magnet URIs are never titles
+        if "://" in text or text.startswith("magnet:?"):
+            return False
+        if _chat_intent_pattern.search(text):
+            return False
+        if _media_sentence_punctuation_pattern.search(text):
+            return False
+
+        # Strip season/episode/year features first, then separators, so only
+        # the core name is left for the final decision
+        candidate = _media_feature_pattern.sub("", text)
+        candidate = _media_separator_pattern.sub("", candidate)
+        return len(candidate) >= _min_media_title_length \
+            and _media_title_char_pattern.search(candidate) is not None
+
     @staticmethod
     def split_text(text: str, max_length: int) -> Generator:
         """
diff --git a/tests/test_string.py b/tests/test_string.py
new file mode 100644
index 00000000..648aebb7
--- /dev/null
+++ b/tests/test_string.py
@@ -0,0 +1,28 @@
+from unittest import TestCase
+
+from app.utils.string import StringUtils
+
+
+class StringUtilsTest(TestCase):
+
+    def test_is_media_title_like_true(self):
+        self.assertTrue(StringUtils.is_media_title_like("盗梦空间"))
+        self.assertTrue(StringUtils.is_media_title_like("The Lord of the Rings"))
+        self.assertTrue(StringUtils.is_media_title_like("庆余年 第2季"))
+        self.assertTrue(StringUtils.is_media_title_like("The Office S01E01"))
+        self.assertTrue(StringUtils.is_media_title_like("权力的游戏 Game of Thrones"))
+        self.assertTrue(StringUtils.is_media_title_like("Spider-Man: No Way Home 2021"))
+        self.assertTrue(StringUtils.is_media_title_like("ワンピース"))
+        self.assertTrue(StringUtils.is_media_title_like("오징어 게임"))
+
+    def test_is_media_title_like_false(self):
+        self.assertFalse(StringUtils.is_media_title_like(""))
+        self.assertFalse(StringUtils.is_media_title_like(" "))
+        self.assertFalse(StringUtils.is_media_title_like("a"))
+        self.assertFalse(StringUtils.is_media_title_like("第2季"))
+        self.assertFalse(StringUtils.is_media_title_like("S01E01"))
+        self.assertFalse(StringUtils.is_media_title_like("#推荐电影"))
+        self.assertFalse(StringUtils.is_media_title_like("请帮我推荐一部电影"))
+        self.assertFalse(StringUtils.is_media_title_like("盗梦空间怎么样?"))
+        self.assertFalse(StringUtils.is_media_title_like("我想看盗梦空间"))
+        self.assertFalse(StringUtils.is_media_title_like("继续"))