rss识别v1.6(保留标点,支持日文,bug:中日文有时未分离)

This commit is contained in:
IceKyrin
2022-05-28 02:49:29 +08:00
parent 5e5cfa6f60
commit b976f5331c
2 changed files with 128 additions and 54 deletions

View File

@@ -6,27 +6,38 @@ import logging
class RSSInfoCleaner:
class Name:
def __init__(self):
pass
raw = None
dpi = None
conv = None
zh = None
en = None
jp = None
clean = None
class Info:
def __init__(self):
pass
group = None
season = None
episode = None
vision = None
class Tag:
lang = None
def __init__(self):
pass
dpi = None
ass = None
lang = None
type = None
code = None
source = None
def __init__(self, file_name):
self.Name.file_name = file_name # 接收文件名参数
self.Name.raw = file_name # 接收文件名参数
self.clean() # 清理广告等杂质
# 加载日志,匹配特征等
logging.basicConfig(level=logging.DEBUG,
@@ -47,7 +58,7 @@ class RSSInfoCleaner:
# 匹配字幕组特征
self.recognize_group()
self.Info.group = self.get_group()
self.Name.dpi = self.get_dpi()
self.Tag.dpi = self.get_dpi()
self.Info.season = self.get_season()
self.Info.episode = self.get_episode()
self.Info.vision = self.get_vision()
@@ -77,7 +88,7 @@ class RSSInfoCleaner:
# 清理原链接(中文字符替换为英文)
def clean(self):
file_name = zhconv.convert(self.Name.file_name, 'zh-cn')
file_name = zhconv.convert(self.Name.raw, 'zh-cn')
# 去广告
file_name = re.sub("[(\[【]?(字幕)?[\u4e00-\u9fa5]{0,3}(新人|招募?新?)[\u4e00-\u9fa5]{0,5}[)\]】]?", "", file_name)
# 除杂
@@ -89,7 +100,7 @@ class RSSInfoCleaner:
strip = ["复制磁连", "兼容", "配音", "网盘", "\u200b", "[]", ""]
for i in strip:
file_name = file_name.replace(i, "")
self.Name.file_name = str(file_name).replace('', ':').replace('', '[').replace('', ']').replace('-', '-') \
self.Name.raw = str(file_name).replace('', ':').replace('', '[').replace('', ']').replace('-', '-') \
.replace('', '(').replace('', ')').replace("", "&").replace("X", "x").replace("×", "x") \
.replace("", "x").replace("-", " ").replace("_", " ")
@@ -102,12 +113,12 @@ class RSSInfoCleaner:
character = group + character
# !强规则,人工录入标准名,区分大小写,优先匹配
for char in rule:
if "[%s]" % char in self.Name.file_name:
if "[%s]" % char in self.Name.raw:
self.pre_analyse = char.lower()
return "enforce"
# 如果文件名以 [字幕组名] 开头
if self.Name.file_name[0] == "[":
str_split = self.Name.file_name.lower().split("]")
if self.Name.raw[0] == "[":
str_split = self.Name.raw.lower().split("]")
# 检索特征值是否位于文件名第1、2、最后一段
for char in character:
if char in str_split[0] or char in str_split[1] or char in str_split[-1]:
@@ -122,16 +133,16 @@ class RSSInfoCleaner:
self.pre_analyse = None
return False
# 文件名以 -字幕组名 结尾
elif "-" in self.Name.file_name:
elif "-" in self.Name.raw:
for char in character:
if char in self.Name.file_name.lower().split("-")[-1]:
self.pre_analyse = self.Name.file_name.lower().split("-")[-1]
if char in self.Name.raw.lower().split("-")[-1]:
self.pre_analyse = self.Name.raw.lower().split("-")[-1]
return "reserve"
self.pre_analyse = None
return False
# 文件名以空格分隔 字幕组名为第一段
else:
first_str = self.Name.file_name.lower().split(" ")[0]
first_str = self.Name.raw.lower().split(" ")[0]
for char in character:
if char in first_str:
self.pre_analyse = first_str
@@ -152,14 +163,14 @@ class RSSInfoCleaner:
# 大部分情况
elif status == "success":
# 如果是 [字幕组名] 这么标准的格式直接else送走吧剩下的匹配一下
if "[%s]" % res_char not in self.Name.file_name.lower():
if self.Name.file_name[0] == "[":
if "[%s]" % res_char not in self.Name.raw.lower():
if self.Name.raw[0] == "[":
try:
# 以特征值为中心,匹配最近的中括号,八成就这个了
gp = self.get_gp(res_char, self.Name.file_name.lower())
gp = self.get_gp(res_char, self.Name.raw.lower())
return gp
except Exception as e:
logging.warning("bug -- res_char:%s,%s,%s" % (res_char, self.Name.file_name.lower(), e))
logging.warning("bug -- res_char:%s,%s,%s" % (res_char, self.Name.raw.lower(), e))
else:
return res_char
# 再见
@@ -167,7 +178,7 @@ class RSSInfoCleaner:
# 扒了6W数据硬找的参数没啥说的
def get_dpi(self):
file_name = self.Name.file_name
file_name = self.Name.raw
dpi_list = ["4k", "2160p", "1440p", "1080p", "1036p", "816p", "810p", "720p", "576p", "544P", "540p", "480p",
"1080i", "1080+",
"3840x2160", "1920x1080", "1920x1036", "1920x804", "1920x800", "1536x864", "1452x1080", "1440x1080",
@@ -181,7 +192,7 @@ class RSSInfoCleaner:
# 获取语种
def get_language(self):
file_name = self.Name.file_name
file_name = self.Name.raw
lang = []
# 中文标示
try:
@@ -203,7 +214,7 @@ class RSSInfoCleaner:
# 文件种类
def get_type(self):
file_name = self.Name.file_name
file_name = self.Name.raw
type_list = []
# 英文标示
try:
@@ -218,7 +229,7 @@ class RSSInfoCleaner:
# 编码格式
def get_code(self):
file_name = self.Name.file_name
file_name = self.Name.raw
code = []
# 英文标示
try:
@@ -233,7 +244,7 @@ class RSSInfoCleaner:
# 来源
def get_source(self):
file_name = str(self.Name.file_name).lower()
file_name = str(self.Name.raw).lower()
type_list = []
# 英文标示
for _ in range(3):
@@ -254,7 +265,7 @@ class RSSInfoCleaner:
# 获取季度
def get_season(self):
file_name = self.Name.file_name.lower()
file_name = self.Name.raw.lower()
season = []
# 中文标示
try:
@@ -274,7 +285,7 @@ class RSSInfoCleaner:
# 获取集数
def get_episode(self):
file_name = self.Name.file_name.lower()
file_name = self.Name.raw.lower()
episode = []
# [10 11]集点名批评这种命名方法,几个国漫的组
try:
@@ -309,7 +320,7 @@ class RSSInfoCleaner:
# 获取版本
def get_vision(self):
file_name = self.Name.file_name.lower()
file_name = self.Name.raw.lower()
vision = []
# 中文
try:
@@ -336,7 +347,7 @@ class RSSInfoCleaner:
# 获取字幕类型
def get_ass(self):
file_name = self.Name.file_name.lower()
file_name = self.Name.raw.lower()
ass = []
# 中文标示
try:
@@ -371,6 +382,14 @@ class RSSInfoCleaner:
else:
return False
def has_jp(self, str):
    """Report whether the given text contains a Japanese kana-range character.

    The check looks for at least one code point in U+3040-U+31FF, which
    starts at the Hiragana block and includes Katakana. The range is
    contiguous, so it also spans the few non-kana blocks that lie between
    them and U+31FF (for example Bopomofo).

    Args:
        str: Text to inspect. The name shadows the builtin but is kept
            unchanged so existing callers are unaffected.

    Returns:
        True if any character falls inside the range, otherwise False.
    """
    kana_hit = re.search(r'[\u3040-\u31ff]', str)
    return kana_hit is not None
# 粗略识别失败re强制匹配
def extract_title(self, raw_name):
title = {
@@ -435,13 +454,15 @@ class RSSInfoCleaner:
if "/" not in clean_name:
if '\u4e00' <= clean_name[0] <= '\u9fff':
try:
res = re.search("(^[a\u4e00-\u9fa5: ]{1,10} ?)([a-z:]{1,20} ?){1,10}", clean_name).group(1)
res = re.search("(^[\u4e00-\u9fa5\u3040-\u31ff: .。,!]{1,20} ?)([a-z: .。,!]{1,20} ?)*",
clean_name).group(1)
clean_name = clean_name.replace(res, res.strip(" ") + "/")
except Exception as e:
logging.info(e)
else:
try:
res = re.search("^(([a-z:]{1,20} ?){1,10} )[\u4e00-\u9fa5: a]{1,20}", clean_name).group(1)
res = re.search("^(([a-z: .。,!]{1,20} ?)* ?)[\u4e00-\u9fa5\u3040-\u31ff: .。,!]{1,20}",
clean_name).group(1)
clean_name = clean_name.replace(res, res.strip(" ") + "/")
except Exception as e:
logging.info(e)
@@ -450,30 +471,38 @@ class RSSInfoCleaner:
return clean_name
# 对以/分隔的多个翻译名,进行简单提取
def easy_split(self, clean_name, zh_list, en_list):
def easy_split(self, clean_name, zh_list, en_list, jp_list):
if "/" in clean_name:
n_list = clean_name.split("/")
for i in n_list:
if self.has_zh(i) is False:
en_list.append(i.strip(" "))
elif self.has_en(i) is False:
zh_list.append(i.strip(" "))
if self.has_jp(i):
jp_list.append(i.strip(" "))
else:
# 如果还是同时包含中英文的情况,递龟一下
i = self.add_separator(i)
self.easy_split(i, zh_list, en_list)
if self.has_zh(i) is False:
en_list.append(i.strip(" "))
elif self.has_en(i) is False:
zh_list.append(i.strip(" "))
elif self.has_zh(i) and self.has_en(i):
# 如果还是同时包含中英文的情况,递龟一下
i = self.add_separator(i)
self.easy_split(i, zh_list, en_list, jp_list)
else:
self.easy_split(i, zh_list, en_list, jp_list)
else:
if self.has_zh(clean_name) is False:
en_list.append(clean_name.strip(" "))
elif self.has_en(clean_name) is False:
zh_list.append(clean_name.strip(" "))
if self.has_jp(clean_name):
jp_list.append(clean_name.strip(" "))
else:
if self.has_zh(clean_name) is False:
en_list.append(clean_name.strip(" "))
elif self.has_en(clean_name) is False:
zh_list.append(clean_name.strip(" "))
# 汇总信息
def get_info(self):
# 获取到的信息
info = {
"group": self.Info.group,
"dpi": self.Name.dpi,
"dpi": self.Tag.dpi,
"season": self.Info.season,
"episode": self.Info.episode,
"vision": self.Info.vision,
@@ -485,7 +514,7 @@ class RSSInfoCleaner:
}
# 字母全部小写
clean_name = self.Name.file_name.lower()
clean_name = self.Name.raw.lower()
# 去除拿到的有效信息
for k, v in info.items():
@@ -507,32 +536,71 @@ class RSSInfoCleaner:
# 剩下来的几乎就是干净番名了,再刮不到不管了
info["clean_name"] = clean_name
clean_name = re.sub('[^a-zA-Z\u4e00-\u9fa5:@#$%^&*()\[\]/ ]', "", clean_name)
clean_name = re.sub('[^a-zA-Z\u4e00-\u9fa5\u3040-\u31ff:*()\[\]/ .。,!]', "", clean_name)
clean_name = re.sub(' +', ' ', clean_name).strip(" ")
clean_name = re.sub("([(\[] *| *[)\]])", "", clean_name)
zh_list = []
en_list = []
raw_zh_list = []
raw_jp_list = []
raw_en_list = []
clean_name = self.add_separator(clean_name)
clean_name = re.sub("(/ */)", "", clean_name)
self.easy_split(clean_name, zh_list, en_list)
self.easy_split(clean_name, raw_zh_list, raw_en_list, raw_jp_list)
self.Name.clean = clean_name
zh_list = []
en_list = []
jp_list = []
for res in raw_zh_list:
correct_res = re.search(res, self.Name.raw.lower())
if correct_res:
zh_list.append(correct_res.group())
for res in raw_en_list:
correct_res = re.search(res, self.Name.raw.lower())
if correct_res:
en_list.append(correct_res.group())
for res in raw_jp_list:
correct_res = re.search(res, self.Name.raw.lower())
if correct_res:
jp_list.append(correct_res.group())
if jp_list:
temp_name = self.Name.clean
for i in jp_list:
temp_name = temp_name.replace(i, "")
self.easy_split(temp_name, zh_list, en_list, jp_list)
if zh_list and en_list == []:
temp_name = self.Name.clean
for i in zh_list:
temp_name = temp_name.replace(i, "")
self.easy_split(temp_name, zh_list, en_list, jp_list)
elif zh_list == [] and en_list:
temp_name = self.Name.clean
for i in en_list:
temp_name = temp_name.replace(i, "")
self.easy_split(temp_name, zh_list, en_list, jp_list)
elif zh_list == [] and en_list == []:
self.extract_title(clean_name)
self.Name.zh = zh_list if zh_list else None
self.Name.en = en_list if en_list else None
if self.Name.zh is None and self.Name.en is None:
self.extract_title(clean_name)
self.Name.jp = jp_list if jp_list else None
return info
if __name__ == "__main__":
import csv
def read_data(name, rows):
if name == "mikan":
def read_data(file_name, rows):
if file_name == "mikan":
with open('mikan.csv', 'r', encoding='utf-8') as csv_file:
reader = csv.reader(csv_file)
raw_data = [row[3] for row in reader][0:rows]
return raw_data
elif name == "dmhy":
elif file_name == "dmhy":
with open('dmhy.csv', 'r', encoding='utf-8') as csv_file:
reader = csv.reader(csv_file)
raw_data = [row[4] for row in reader][1:rows + 1]
@@ -540,7 +608,12 @@ if __name__ == "__main__":
# mikan/dmhy 获取数据dmhy 最多1w行mikan最多3w行
name_list = read_data("dmhy", 1000)
name_list = read_data("dmhy", 100)
for name in name_list:
print(name)
print(RSSInfoCleaner(name).Name.zh)
print("pure_name:%s" % RSSInfoCleaner(name).Name.raw)
print("clean_name:%s" % RSSInfoCleaner(name).Name.clean)
print("zh:%s" % RSSInfoCleaner(name).Name.zh)
print("en:%s" % RSSInfoCleaner(name).Name.en)
print("jp:%s" % RSSInfoCleaner(name).Name.jp)
print()

View File

@@ -49,7 +49,8 @@
"nvacg",
"RHxDymy",
"PoInSu",
"DHR百合組"
"DHR百合組",
"东京不够热"
],
"name_position": 1
}