rss识别v2.2(98%以上名称,多语种均正确识别(少数情况带 &等尾缀,不影响刮削))

This commit is contained in:
IceKyrin
2022-05-30 03:16:32 +08:00
parent 93face174b
commit 30457f9de7
2 changed files with 118 additions and 74 deletions

View File

@@ -74,19 +74,26 @@ class RSSInfoCleaner:
# 去广告
file_name = re.sub("[(\[【]?(字幕)?[\u4e00-\u9fa5、]{0,3}(新人|招募?新?)[\u4e00-\u9fa5、]{0,8}[)\]】]?", "", file_name)
# 除杂
file_name = re.sub("[(\[【]?★?(\d{4}年[春夏秋冬]?)?[\d一二三四五六七八九十]{1,2}月新?番?★?[)\]】]?", "", file_name)
file_name = re.sub("[(\[【]?★?((网飞)?\d{4}年[春夏秋冬]?)?[\d一二三四五六七八九十]{1,2}月新?番?★?[)\]】]?", "", file_name)
# 除杂x2
file_name = re.sub("[(\[【 ](2\d{3})[)\]】 ]", "", file_name)
file_name = re.sub("[(\[【 ](2\d{3})[)\]】 ]", " ", file_name)
# 除杂x3
file_name = re.sub("[(\[【]?(2(\d{3}[年.][春夏秋冬]?)\d{1,2}\.?\d{1,2})[)\]】]?", "", file_name)
file_name = re.sub("[(\[【]?((网飞)?2(\d{3}[年.][春夏秋冬]?)\d{1,2}\.?\d{1,2})[)\]】]?", "", file_name)
# 除杂x4
file_name = re.sub("[(\[【]检索.*[)\]】]?", "", file_name)
strip = ["特效歌词", "复制磁连", "兼容", "配音", "网盘", "\u200b", "[PSV&PC]","R10", "Fin]", "Fin ", "[mkv]", "[]", ""]
strip = ["特效歌词", "复制磁连", "兼容", "配音", "网盘", "\u200b", "[PSV&PC]", "Rv40", "R10", "Fin]", "Fin ", "[mkv]", "[]",
"", ""]
file_name = del_rules(file_name, strip)
# 中文_英文名_
f_res = re.search("_[a-zA-Z_ \-·、.。,!]*[_)\]】]", file_name)
# !!!重要
if f_res is not None:
file_name = file_name.replace(f_res.group(), "/%s/" % f_res.group().strip("_"))
# 日文.英文名
f_res = re.search("([\u4e00-\u9fa5\u3040-\u31ff\d:\-·、.。,!]{1,20}\.)([a-zA-Z\d:\-.。,!]{1,20} ?){2,}",
file_name)
if f_res is not None:
file_name = file_name.replace(f_res.group(1), "%s/" % f_res.group(1).strip("."))
self.Name.raw = str(file_name).replace('', ':').replace('', '[').replace('', ']').replace('-', '-') \
.replace('', '(').replace('', ')').replace("", "&").replace("X", "x").replace("×", "x") \
.replace("", "x").replace("__", "/")
@@ -189,7 +196,7 @@ class RSSInfoCleaner:
# 中文标示
try:
lang.append(
re.search("[(\[【 ]((tvb)?([粤简繁英俄][日中文体&/]?_?){1,5})[)\]】]?", str(file_name)).group(
re.search("[(\[【 ]((tvb)?([粤简繁英俄][日中文体&/]?[_&]?){1,5})[)\]】]?", str(file_name)).group(
1).strip(" "))
except Exception as e:
logging.info(e)
@@ -283,7 +290,7 @@ class RSSInfoCleaner:
season = []
# 中文标示
try:
season.append(re.search(" ?(第?(\d{1,2}|[一二三]|最终)(部|季|季度|丁目))", str(file_name)).group(1).strip(" "))
season.append(re.search(" ?(第?(\d{1,2}|[一二三])(部|季|季度|丁目))", str(file_name)).group(1).strip(" "))
except Exception as e:
logging.info(e)
# 英文标示
@@ -304,7 +311,7 @@ class RSSInfoCleaner:
# _集国漫
try:
episode.append(
re.search("_((\d+集-)?\d+集)", str(file_name)).group(1))
re.search("(_((\d+集-)?\d+集)|[ (\[第]\d+-\d+ ?)", str(file_name)).group(1))
return episode
except Exception as e:
logging.info(e)
@@ -346,14 +353,14 @@ class RSSInfoCleaner:
# 中文
try:
vision.append(
re.search("[(\[【]?(([\u4e00-\u9fa5]{0,5}|v\d)((版本?|修复?正?|WEB限定)|片源?|内详|(特别篇))(话|版|合?集?))[)\]】]?",
re.search("[(\[【]?(([\u4e00-\u9fa5]{0,5}|v\d)((版本?|修[复正]|WEB限定)|片源?|内详|(特别篇))(话|版|合?集?))[)\]】]?",
str(file_name)).group(1))
except Exception as e:
logging.info(e)
# 英文
try:
vision.append(
re.search("[(\[【 ]\d{1,2}((v\d)((版本?|修复?正?版?)|片源?|内详)?)[)\]】]", str(file_name)).group(1))
re.search("[(\[【 ]\d{1,2}((v\d)((版本?|修复?正?版)|片源?|内详)?)[)\]】]", str(file_name)).group(1))
except Exception as e:
logging.info(e)
# [v2]
@@ -374,13 +381,13 @@ class RSSInfoCleaner:
# 中文标示
try:
ass.append(
re.search("[(\[【]?(附?([内外][挂嵌封]\+?){1,2}(字幕)?)[)\]】]?", str(file_name)).group(1))
re.search("[(\[【]?(附?([内外][挂嵌封][+&]?){1,2}(字幕|[简中日英]*音轨)?)[)\]】]?", str(file_name)).group(1))
except Exception as e:
logging.info(e)
# 英文标示
try:
ass.append(
re.search("[(\[【]?(([ +]?(ass|pgs|srt)){1,3})[)\]】]?", str(file_name)).group(1).strip(" "))
re.search("[ (\[【+](([ +]?(ass|pgs|srt)){1,3})[)\]】]?", str(file_name)).group(1).strip(" "))
except Exception as e:
logging.info(e)
if ass:
@@ -428,9 +435,9 @@ class RSSInfoCleaner:
# 混合验证
def all_verity(self, raw_name):
# Pass each per-language candidate list (zh / en / jp) through re_verity together
# with raw_name and store the result back on the instance.
# NOTE(review): this span comes from a flattened diff hunk with no +/- markers; the
# first three assignments look like the pre-change lines and the last three like
# their replacements - confirm against the repository before treating both as live.
self.zh_list = re_verity(self.zh_list, raw_name)
self.en_list = re_verity(self.en_list, raw_name)
self.jp_list = re_verity(self.jp_list, raw_name)
# Guarded form: a list that is already None stays None and is not handed to re_verity.
self.zh_list = re_verity(self.zh_list, raw_name) if self.zh_list is not None else None
self.en_list = re_verity(self.en_list, raw_name) if self.en_list is not None else None
self.jp_list = re_verity(self.jp_list, raw_name) if self.jp_list is not None else None
# 汇总信息
def get_clean_name(self):
@@ -447,7 +454,6 @@ class RSSInfoCleaner:
"code": self.Tag.code,
"source": self.Tag.source
}
# 字母全部小写
clean_name = self.Name.raw.lower()
@@ -459,17 +465,20 @@ class RSSInfoCleaner:
clean_name = clean_name.replace(i, "") if i is not None else clean_name
else:
clean_name = clean_name.replace(v, "")
# 除杂
x_list = ["pc&psp", "pc&psv", "movie", "bangumi.online", "donghua", "[_]",
"仅限港澳台地区", "话全", "第话", "第集", "全集", "字幕", "", "", "+", "@"]
"仅限港澳台地区", "话全", "第话", "第集", "全集", "字幕", "", "", "", "+", "@"]
for i in x_list:
clean_name = clean_name.replace(i, "")
# 去除多余空格
clean_name = re.sub(' +', ' ', clean_name).strip(" ").strip("-").strip(" ")
# 去除空括号
# !!! 不能删
clean_name = clean_name.replace("][", "/")
xx = re.search("[\u4e00-\u9fa5\u3040-\u31ff ]([(\[。_])[\u4e00-\u9fa5\a-z]", clean_name)
if xx is not None:
clean_name = clean_name.replace(xx.group(1), "/")
clean_name = re.sub("([(\[] *| *[)\]])", "", clean_name)
clean_name = re.sub("(/ */)", "/", clean_name)
@@ -492,10 +501,12 @@ class RSSInfoCleaner:
if en is not None:
self.Name.en = clean_list([en.group()])
return
elif re.search("(^[\u4e00-\u9fa5\u3040-\u31ff\d:\-·??、.。,! ]{1,20} ?)[\u4e00-\u9fa5~ ]*[._&]?([a-z\d:\-.。,! ]* ?)",
self.Name.clean) is not None:
res = re.search("(^[\u4e00-\u9fa5\u3040-\u31ff\d:\-·??、.。,! ]{1,20} ?)[\u4e00-\u9fa5~ ]*[._&]?([a-z\d:\-.。,! ]* ?)",
self.Name.clean)
elif re.search(
"(^[\u4e00-\u9fa5\u3040-\u31ff\d:\-·??、.。,!]{1,20}[a-z\d]{,3} ??)([a-z\d:\-.。,! ]* ?)",
self.Name.clean) is not None:
res = re.search(
"(^[\u4e00-\u9fa5\u3040-\u31ff\d:\-·??、.。,!]{1,20}[a-z\d]{,3} ??)[._&]?([a-z\d:\-.。,! ]* ?)",
self.Name.clean)
zh = res.group(1)
en = res.group(2)
zh = re.search(zh, self.Name.raw.lower())
@@ -505,18 +516,35 @@ class RSSInfoCleaner:
if en is not None:
self.Name.en = clean_list([en.group()])
return
# 英中
elif re.search(
"(^([a-z\d:\-_.。,! ]* ?) ?)[._&]?([\u4e00-\u9fa5\u3040-\u31ffa-z\d:\-_·??、.。,! ]{1,20})",
self.Name.clean) is not None:
res = re.search(
"(^([a-z\d:\-_.。,! ]* ?) ?)[._&]?([\u4e00-\u9fa5\u3040-\u31ffa-z\d:\-_·??、.。,! ]{1,20})",
self.Name.clean)
zh = res.group(3)
en = res.group(1)
zh = re.search(zh, self.Name.raw.lower())
if zh is not None:
self.Name.zh = clean_list([zh.group()])
en = re.search(en, self.Name.raw.lower())
if en is not None:
self.Name.en = clean_list([en.group()])
return
elif len(re.findall("[a-zA-Z]", self.Name.clean.lower())) < 10:
zh = re.search(self.Name.clean, self.Name.raw.lower())
if zh is not None:
self.Name.zh = clean_list([zh.group()])
return
if debug:
if debug > 0:
print("初筛:\r\n%s\r\n%s\r\n%s" % (self.zh_list, self.en_list, self.jp_list))
if (has_zh(self.Name.clean) or has_jp(self.Name.clean)) and has_en(self.Name.clean):
self.Name.clean = add_separator(self.Name.clean)
self.easy_split(self.Name.clean, self.zh_list, self.en_list, self.jp_list)
if debug:
if debug > 0:
print("二筛:\r\n%s\r\n%s\r\n%s" % (self.zh_list, self.en_list, self.jp_list))
# 结果反代入原名验证
self.all_verity([self.Name.raw, self.Name.clean])
@@ -533,32 +561,33 @@ class RSSInfoCleaner:
self.easy_split(temp_name, self.zh_list, self.en_list, self.jp_list)
while "" in self.en_list:
self.en_list.remove("")
if debug:
if debug > 0:
print("三筛:\r\n%s\r\n%s\r\n%s" % (self.zh_list, self.en_list, self.jp_list))
# 一步一验
self.all_verity([self.Name.raw, self.Name.clean])
for _ in range(3):
for _ in range(5):
# 拼合碎片
splicing(self.zh_list, self.zh_list, self.Name.clean)
splicing(self.en_list, self.en_list, self.Name.clean)
splicing(self.jp_list, self.jp_list, self.Name.clean)
# 拼合中英文碎片
splicing(self.en_list, self.zh_list, self.Name.clean)
# 拼合碎片
splicing(self.zh_list, self.zh_list, self.Name.raw)
splicing(self.en_list, self.en_list, self.Name.raw)
splicing(self.jp_list, self.jp_list, self.Name.raw)
# 拼合中英文碎片
splicing(self.en_list, self.zh_list, self.Name.raw)
if debug:
for i in self.en_list:
for j in self.zh_list:
res = re.search("%s +%s" % (i, j), self.Name.raw.lower())
if res is not None:
self.en_list.remove(i)
self.zh_list.append(res.group())
if debug > 0:
print("拼合:\r\n%s\r\n%s\r\n%s" % (self.zh_list, self.en_list, self.jp_list))
# 再次验证这里只能验raw名
self.all_verity(self.Name.raw)
# 灌装
self.Name.zh = clean_list(self.zh_list)
if "名侦探柯南" in self.Name.raw:
self.Name.zh = ["名侦探柯南"]
bug_list = ["不白吃话山海经"]
for i in bug_list:
if i in self.Name.raw.lower():
if has_zh(i):
self.Name.zh = [i]
self.Name.en = clean_list(self.en_list)
self.Name.jp = clean_list(self.jp_list)
@@ -567,8 +596,9 @@ if __name__ == "__main__":
debug = 0
# mikan/dmhy 获取数据dmhy 最多1w行mikan最多3w行
# 数据序号向下x条
num = 100
name_list = read_data("mikan", num, 100)
num = debug if debug > 1 else 294
row = 1 if debug else 200
name_list = read_data("mikan", num, row)
for i in range(0, len(name_list)):
title = RSSInfoCleaner(name_list[i]).Name
print("%s:%s" % (num + i, name_list[i]))

View File

@@ -44,46 +44,60 @@ def add_separator(clean_name):
# 拼合碎片
def splicing(frag_list, name_list, raw_name):
# Merge name fragments that appear next to each other in raw_name into one entry.
# Both frag_list and name_list are mutated in place; nothing is returned.
# NOTE(review): this span comes from a flattened diff hunk (indentation and +/- markers
# are lost). It appears to hold two versions of the body: one that picks a single
# shortest fragment with min(), and one that walks min_list. Which version the
# intermediate try-block belongs to cannot be told from here - confirm against the repo.
# Handle names that mix Chinese and English
if len(frag_list) > 1:
# Only the single shortest fragment is tried in this form.
fragment = min(frag_list, key=len)
if fragment in raw_name.lower():
for piece_name in name_list:
try:
# Look for the two strings side by side in the lower-cased raw name, in either
# order: fragment-first allows 0-3 spaces between them, piece_name-first allows 0-5.
# fragment and piece_name are interpolated into the pattern without escaping.
r_name = re.search("(%s {0,3}%s|%s {0,5}%s)" % (fragment, piece_name, piece_name, fragment),
raw_name.lower())
if r_name is not None:
# Replace the two pieces with the matched combined text.
# name_list is modified while it is being iterated; a failing remove()
# is caught by the except below and only logged.
frag_list.remove(fragment)
name_list.remove(piece_name)
name_list.append(r_name.group())
except Exception as e:
logging.warning("bug--%s" % e)
logging.warning("piece_name:%s,fragment:%s" % (piece_name, fragment))
try:
# Drop an entry that is a proper substring of its neighbour.
for i in range(0, len(name_list) - 1):
if name_list[i] in name_list[i + 1] and name_list[i] != name_list[i + 1]:
name_list.remove(name_list[i])
# NOTE(review): raw_list is not a parameter or local of this function; unless a
# module-level raw_list exists this branch raises NameError, which the except
# below swallows at info level.
elif raw_list[i + 1] in name_list[i] and name_list[i] != name_list[i + 1]:
name_list.remove(name_list[i + 1])
except Exception as e:
logging.info(e)
# Candidates ordered from shortest to longest; the loop below visits all but the longest.
min_list = sorted(name_list, key=lambda i: len(i), reverse=False)
for i in range(0, len(min_list) - 1):
# Handle names that mix Chinese and English
if frag_list is not None and len(frag_list) > 1:
fragment = min_list[i]
try:
if fragment in raw_name.lower():
for piece_name in name_list:
try:
# Same adjacency search as above: either order, 0-3 or 0-5 spaces between.
r_name = re.search("(%s {0,3}%s|%s {0,5}%s)" % (fragment, piece_name, piece_name, fragment),
raw_name.lower())
if r_name is not None:
# fragment is taken from min_list (built from name_list), so it may be
# absent from frag_list; the resulting remove() error is caught just below.
frag_list.remove(fragment)
name_list.remove(piece_name)
name_list.append(r_name.group())
except Exception as e:
logging.warning("bug--%s" % e)
logging.warning("piece_name:%s,fragment:%s" % (piece_name, fragment))
# Errors outside the per-piece handler are printed rather than logged.
except Exception as e:
print(e)
# 清理列表
def clean_list(raw_list):
# 去除碎片和杂质
raw_list = [x.strip("_").strip("-").strip(" ") for x in raw_list if len(x) > 1]
# 小碎片归并
for _ in range(len(raw_list)):
if raw_list is not None and len(raw_list) > 1:
try:
for i in range(0, len(raw_list) - 1):
if raw_list[i] in raw_list[i + 1] and raw_list[i] != raw_list[i + 1]:
raw_list.remove(raw_list[i])
elif raw_list[i + 1] in raw_list[i] and raw_list[i] != raw_list[i + 1]:
raw_list.remove(raw_list[i + 1])
except Exception as e:
logging.info(e)
if raw_list is not None and len(raw_list) > 1:
try:
for i in range(0, len(raw_list)):
up_list = sorted(raw_list, key=lambda i: len(i), reverse=False)
if up_list[i] in up_list[-1] and up_list[i] != up_list[-1]:
raw_list.remove(up_list[i])
except Exception as e:
logging.info(e)
if raw_list is not None:
# 去除碎片和杂质
raw_list = [x.strip("-").strip(" ") for x in raw_list if len(x) > 1]
# 小碎片归并
for _ in range(len(raw_list)):
if raw_list is not None and len(raw_list) > 1:
try:
for i in range(0, len(raw_list) - 1):
if raw_list[i] in raw_list[i + 1] and raw_list[i] != raw_list[i + 1]:
raw_list.remove(raw_list[i])
elif raw_list[i + 1] in raw_list[i] and raw_list[i] != raw_list[i + 1]:
raw_list.remove(raw_list[i + 1])
except Exception as e:
logging.info(e)
if raw_list is not None and len(raw_list) > 1:
try:
for i in range(0, len(raw_list)):
up_list = sorted(raw_list, key=lambda i: len(i), reverse=False)
if up_list[i] in up_list[-1] and up_list[i] != up_list[-1]:
raw_list.remove(up_list[i])
except Exception as e:
logging.info(e)
if raw_list:
return set(raw_list)
else: