mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-03-20 03:57:30 +08:00
test: extend media-title detection coverage and cleanup
Co-authored-by: jxxghp <51039935+jxxghp@users.noreply.github.com>
This commit is contained in:
@@ -23,6 +23,17 @@ _special_domains = [
|
||||
# Numeric code assigned to each named release channel.
_version_map = {"stable": -1, "rc": -2, "beta": -3, "alpha": -4}
# Code for version strings that do not conform to the expected format.
_other_version = -5

# A text containing more words than this is rejected as a media title.
_max_media_title_words = 10
# Minimum length of the core name that remains once media features and
# separators have been stripped from a candidate title.
_min_media_title_length = 2

# NOTE: fullwidth punctuation in the patterns below is spelled with \uXXXX
# escapes (U+FF1F "?", U+FF01 "!", U+FF0C ",", U+FF1B ";", U+FF1A ":") next to
# its ASCII twin. Writing the escape instead of the literal character keeps the
# class intact when the file passes through tools that fold fullwidth forms to
# ASCII, which would otherwise leave a duplicated ASCII member and silently
# stop matching the fullwidth punctuation.

# Text that is clearly not a title: starts with "#", starts with a polite
# request prefix, ends with a question mark, or is exactly "继续".
_non_media_title_pattern = re.compile(r"^#|^请[问帮你]|[?\uff1f]$|^继续$")
# Conversational phrases; any occurrence marks the text as chat, not a title.
_chat_intent_pattern = re.compile(r"帮我|请问|怎么|如何|为什么|可以|能否|推荐|介绍|谢谢|想看|找一下|搜一下")
# Season / episode markers (Chinese "第N季/集", SxxEyy, Eyy) and 19xx/20xx years.
_media_feature_pattern = re.compile(
    r"第\s*[0-9一二三四五六七八九十百零]+\s*[季集]|S\d{1,2}(?:E\d{1,4})?|E\d{1,4}|(?:19|20)\d{2}",
    re.IGNORECASE
)
# Runs of whitespace, dashes, dots, colons, quotes and brackets between words.
_media_separator_pattern = re.compile(r"[\s\-_.:\uff1a·'\"()\[\]【】]+")
# Sentence-level punctuation in both ASCII and fullwidth forms.
_media_sentence_punctuation_pattern = re.compile(r"[,\uff0c。!\uff01?\uff1f;\uff1b]")
# At least one CJK ideograph (U+4E00-U+9FFF) or ASCII letter.
_media_title_char_pattern = re.compile(r"[\u4e00-\u9fffA-Za-z]")
|
||||
|
||||
|
||||
class StringUtils:
|
||||
@@ -541,27 +552,20 @@ class StringUtils:
|
||||
text = re.sub(r'\s+', ' ', text).strip()
|
||||
if not text:
|
||||
return False
|
||||
if text.startswith("#") \
|
||||
or re.search(r"^请[问帮你]", text) \
|
||||
or re.search(r"[??]$", text) \
|
||||
or StringUtils.count_words(text) > 10 \
|
||||
or "继续" in text:
|
||||
if _non_media_title_pattern.search(text) \
|
||||
or StringUtils.count_words(text) > _max_media_title_words:
|
||||
return False
|
||||
if StringUtils.is_link(text):
|
||||
if "://" in text or text.startswith("magnet:?"):
|
||||
return False
|
||||
if re.search(r"(帮我|请问|怎么|如何|为什么|可以|能否|推荐|介绍|谢谢|想看|找一下|搜一下)", text):
|
||||
if _chat_intent_pattern.search(text):
|
||||
return False
|
||||
if re.search(r"[,。!?!?,;;]", text):
|
||||
if _media_sentence_punctuation_pattern.search(text):
|
||||
return False
|
||||
|
||||
candidate = re.sub(
|
||||
r"第\s*[0-9一二三四五六七八九十百零]+\s*[季集]|S\d{1,2}(?:E\d{1,4})?|E\d{1,4}|(?:19|20)\d{2}",
|
||||
"",
|
||||
text,
|
||||
flags=re.IGNORECASE
|
||||
)
|
||||
candidate = re.sub(r"[\s\-_.::·'\"()\[\]【】]+", "", candidate)
|
||||
return len(candidate) >= 2 and bool(re.search(r"[\u4e00-\u9fffA-Za-z]", candidate))
|
||||
# 先移除季/集/年份等媒体特征,再移除分隔符,只保留核心名称用于最终判定
|
||||
candidate = _media_feature_pattern.sub("", text)
|
||||
candidate = _media_separator_pattern.sub("", candidate)
|
||||
return len(candidate) >= _min_media_title_length and _media_title_char_pattern.search(candidate) is not None
|
||||
|
||||
@staticmethod
|
||||
def split_text(text: str, max_length: int) -> Generator:
|
||||
|
||||
Reference in New Issue
Block a user