diff --git a/Windows/rename.py b/Windows/rename.py new file mode 100644 index 00000000..8423a692 --- /dev/null +++ b/Windows/rename.py @@ -0,0 +1,471 @@ +import re +import json +import time + +import zhconv +import requests +import logging +import pandas as pd + + +class Rename: + def __init__(self, file_name): + self.file_name = file_name + self.clean() + logging.basicConfig(level=logging.DEBUG, + filename='./rename_log.txt', + filemode='w', + format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s') + self.group_character = ['字幕社', '字幕组', '字幕屋', '发布组', '动漫', '国漫', '汉化', 'raw', 'works', '工作室', '压制', '合成', '制作', + '搬运', '委员会', '家族', '译制', '动画', '研究所', 'sub', '翻译', '联盟', 'dream', '-rip', 'neo', 'team'] + self.group_char = ['dmhy', "喵萌", 'lolihouse', 'vcb', '澄空学园', 'c.c动漫', '拨雪寻春', 'mingy', 'amor', 'moozzi2', + '酷漫', 'skytree', 'sweetsub', 'pcsub', 'ahu-sub', 'f宅', 'captions', 'dragsterps', 'onestar', + '卡通', '时雨初空', 'nyaa', 'ddd', 'koten', 'reinforce', '届恋对邦小队', 'cxraw'] + with open("rule.json", encoding='utf-8') as file_obj: + rule_json = json.load(file_obj)[0]["group_name"] + self.group_rule = [str(x).lower() for x in rule_json] + self.file_info = {} + + self.pre_analyse = None + self.regognize_group() + + # 获取字符串出现位置 + def getStrInfo(self, char, target): + locate = [] + for index, value in enumerate(char): + if target == value: + locate.append(index) + return locate + + # 匹配某字符串最近的括号 + def get_gp(self, char, string): + start = [x for x in self.getStrInfo(string, "[") if int(x) < int(string.find(char))][-1] + 1 + end = [x for x in self.getStrInfo(string, "]") if int(x) > int(string.find(char))][0] + return string[start:end] + + # 清理原链接(中文字符替换为英文) + def clean(self): + file_name = zhconv.convert(self.file_name, 'zh-cn') + # 去广告 + file_name = re.sub("[((\[【]?(字幕)?[\u4e00-\u9fa5]{0,3}(新人|招募?新?)[\u4e00-\u9fa5]{0,5}[))\]】]?", "", file_name) + # 除杂 + file_name = re.sub("[((\[【]?★?(\d{4}[年][春夏秋冬]?)?[\d一二三四五六七八九十]{1,2}月新?番?★?[))\]】]?", "", file_name) + # 除杂x2 + file_name = re.sub("[((\[【]?(2(\d{3}[年.][春夏秋冬]?)\d{1,2}\.?\d{1,2})[))\]】]?", "", file_name) + # 除杂x3 + file_name = re.sub("[((\[【]检索.*[))\]】]?", "", file_name) + strip = ["复制磁连", "兼容", "配音", "网盘", "\u200b", "[]", "★"] + for i in strip: + file_name = file_name.replace(i, "") + self.file_name = str(file_name).replace(':', ':').replace('【', '[').replace('】', ']').replace('-', '-') \ + .replace('(', '(').replace(')', ')').replace("&", "&").replace("X", "x").replace("×", "x") \ + .replace("Ⅹ", "x").replace("-", " ").replace("_", " ") + + # 检索string1列表元素是否在string2列表元素中 + def find_str(self, str1, str2): + for s1 in str1: + for s2 in str2: + if s1 in s2: + return [True, s2[1:]] + else: + return [False, name] + + # 检索字幕组特征 + def regognize_group(self): + character = self.group_character + group = self.group_char + rule = self.group_rule + # 字幕组(特例)特征优先级大于通用特征 + character = group + character + # !强规则,人工录入标准名,区分大小写,优先匹配 + if "[ANi]" in self.file_name: + self.pre_analyse = "[ani]" + return "enforce" + for char in rule: + if "[%s]" % char in self.file_name: + self.pre_analyse = char + return "enforce" + # 如果文件名以 [字幕组名] 开头 + if self.file_name[0] == "[": + str_split = self.file_name.lower().split("]") + # 检索特征值是否位于文件名第1、2、最后一段 + for char in character: + if char in str_split[0] or char in str_split[1] or char in str_split[-1]: + self.pre_analyse = char + return "success" + # 文件名是否为 [字幕组名&字幕组名&字幕组名] ,求求了,一集的工作量真的需要三个组一起做吗 + if self.find_str(["&", "@"], str_split)[0]: + res = self.find_str(["&", "@"], str_split)[1] + # 限制匹配长度,防止出bug + if len(res) < 10: + self.pre_analyse = res + return "special" + # 再匹配不上我就麻了 + self.pre_analyse = None + return False + # 文件名以 -字幕组名 结尾 + elif "-" in self.file_name: + for char in character: + if char in self.file_name.lower().split("-")[-1]: + self.pre_analyse = self.file_name.lower().split("-")[-1] + return "reserve" + self.pre_analyse = None + return False + # 文件名以空格分隔 字幕组名为第一段 + else: + for char in character: + if char in self.file_name.lower().split(" ")[0]: + self.pre_analyse = char + return "blank" + self.pre_analyse = None + return False + + # 获取字幕组名 + def get_group(self): + # 是否匹配成功(哪种方式匹配成功) + status = self.regognize_group() + # 检索到的特征值 + res_char = self.pre_analyse + # 强条 + if status == "enforce": + return res_char + # 大部分情况 + elif status == "success": + # 如果是 [字幕组名] ,这么标准的格式直接else送走吧,剩下的匹配一下 + if "[%s]" % res_char not in self.file_name.lower(): + if self.file_name[0] == "[": + try: + # 以特征值为中心,匹配最近的中括号,八成就这个了 + gp = self.get_gp(res_char, self.file_name.lower()) + # 防止太长炸了,一般不会这么长的字幕组名 + if len(gp) < 30: + pass + else: + print("name:%s\r\nchar:%s,gp:%s" % (self.file_name, res_char, gp)) + return gp + except Exception as e: + print("bug -- res_char:%s,%s,%s" % (res_char, self.file_name.lower(), e)) + else: + return res_char + # 文件名以空格分隔 字幕组名为第一段 + elif status == "blank": + if res_char in self.file_name.lower().split(" ")[0]: + res = self.file_name.lower().split(" ")[0] + return res + # 文件名为 [字幕组名&字幕组名&字幕组名] + elif status == "special": + return res_char + # -字幕组名 在结尾 + elif status == "reserve": + return res_char + # 再见 + else: + return None + + # 扒了6W数据,硬找的参数,没啥说的 + def get_dpi(self): + file_name = self.file_name + dpi_list = ["4k", "2160p", "1440p", "1080p", "1036p", "816p", "810p", "720p", "576p", "544P", "540p", "480p", + "1080i", "1080+", + "3840x2160", "1920x1080", "1920x1036", "1920x804", "1920x800", "1536x864", "1452x1080", "1440x1080", + "1280x720", "1272x720", "1255x940", "1024x768", "1024X576", "960x720", "948x720", "896x672", + "872x480", "848X480", "832x624", "704x528", "640x480", + "mp4_1080", "mp4_720"] + for i in dpi_list: + dpi = str(file_name).lower().find(i) + if dpi > 0: + return [str(i)] + return None + + # 获取语种 + def get_language(self): + file_name = self.file_name + lang = [] + # 中文标示 + try: + lang.append( + re.search("[((\[【]?((tvb)?(日?[粤中简繁英]日?(文|体|体?双?语)?/?){1,5}(双?字幕)?)[))\]】]?", str(file_name)).group( + 1).strip(" ")) + except Exception as e: + logging.info(e) + # 英文标示 + try: + lang = lang + re.search("[((\[【]?(((G?BIG5|CHT|CHS|GB|JP|CN)[/ _]?){1,3})[))\]】]?", str(file_name)).group( + 1).lower().strip(" ").split(" ") + except Exception as e: + logging.info(e) + if lang: + return lang + else: + return None + + # 文件种类 + def get_type(self): + file_name = self.file_name + type_list = [] + # 英文标示 + try: + type_list.append(re.search("[((\[【]?(((flac(x\d)?|mp4|mkv|mp3)[ -]?){1,3})[))\]】]?", + str(file_name).lower()).group(1).lower().strip(" ")) + except Exception as e: + logging.info(e) + if type_list: + return type_list + else: + return None + + # 编码格式 + def get_code(self): + file_name = self.file_name + code = [] + # 英文标示 + try: + code = code + re.search("[((\[【]?(((x26[45]|hevc|aac|avc|((10|8)[ -]?bit))[ -]?(x\d)?[ -]?){1,5})[ ))\]】]?", + str(file_name).lower()).group(1).lower().strip(" ").split(" ") + except Exception as e: + logging.info(e) + if code: + return code + else: + return None + + # 来源 + def get_source(self): + file_name = str(self.file_name).lower() + type_list = [] + # 英文标示 + for i in range(3): + try: + res = re.search( + "[((\[【]?((bd|remux|(viu)?tvb?|bilibili|b ?global|baha|web[ -]?(dl|rip))[ -]?(iso|mut|rip)?)[))\]】]?", + file_name).group(1).lower().strip(" ") + if res not in type_list: + type_list.append(res) + except Exception as e: + logging.info(e) + for res in type_list: + file_name = file_name.replace(res, "") + if type_list: + return type_list + else: + return None + + # 获取季度 + def get_season(self): + file_name = self.file_name.lower() + season = [] + # 中文标示 + try: + season.append(re.search(" ?(第?(\d{1,2}|[一二三]|最终)(部|季|季度|丁目))", str(file_name)).group(1).strip(" ")) + except Exception as e: + logging.info(e) + # 英文标示 + try: + season.append( + re.search("((final ?)?(season|[ \[]s) ?\d{1,2})", str(file_name)).group(1).strip(" ")) + except Exception as e: + logging.info(e) + if season: + return season + else: + return None + + # 获取集数 + def get_episode(self): + file_name = self.file_name.lower() + episode = [] + # [10 11]集点名批评这种命名方法,几个国漫的组 + try: + episode.append( + re.search("[\[( ](\d{1,3}[- &]\d{1,3}) ?(fin| Fin|\(全集\))?[ )\]]", str(file_name)).group(1)) + return episode + except Exception as e: + logging.info(e) + # 这里匹配ova 剧场版 不带集数的合集 之类的 + try: + episode.append( + re.search("[\[ 第](_\d{1,3}集|ova|剧场版|全|OVA ?\d{0,2}|合|[一二三四五六七八九十])[集话章 \]\[]", str(file_name)).group(1)) + return episode + except Exception as e: + logging.info(e) + # 标准单集 sp单集 + try: + episode.append( + re.search("[\[ 第e]((sp|(数码)?重映)?(1?\d{1,3}(\.\d)?|1?\d{1,3}\(1?\d{1,3}\)))(v\d)?[集话章 \]\[]", + str(file_name)).group(1)) + return episode + except Exception as e: + logging.info(e) + # xx-xx集 + try: + episode.append( + re.search("[\[ 第(]((合集)?\\\)?(\d{1,3}[ &]\d{1,3})(话| |]|\(全集\)|全集|fin)", str(file_name)).group(1)) + return episode + except Exception as e: + logging.info(e) + return None + + # 获取版本 + def get_vision(self): + file_name = self.file_name.lower() + vision = [] + # 中文 + try: + vision.append( + re.search("[((\[【]?(([\u4e00-\u9fa5]{0,2}|v\d)((版本?|修复?正?)|片源?|内详))[))\]】]?", str(file_name)).group(1)) + except Exception as e: + logging.info(e) + # 英文 + try: + vision.append( + re.search("[((\[【 ]\d{1,2}((v\d)((版本?|修复?正?版?)|片源?|内详)?)[))\]】]", str(file_name)).group(1)) + except Exception as e: + logging.info(e) + # [v2] + try: + vision.append( + re.search("[((\[【 ](v\d)[))\]】]", str(file_name)).group(1)) + except Exception as e: + logging.info(e) + if vision: + return vision + else: + return None + + # 获取字幕类型 + def get_ass(self): + file_name = self.file_name.lower() + ass = [] + # 中文标示 + try: + ass.append( + re.search("[((\[【]?(附?([内外][挂嵌封]\+?){1,2}(字幕)?)[))\]】]?", str(file_name)).group(1)) + except Exception as e: + logging.info(e) + # 英文标示 + try: + ass.append( + re.search("[((\[【]?(([ +]?(ass|pgs|srt)){1,3})[))\]】]?", str(file_name)).group(1).strip(" ")) + except Exception as e: + logging.info(e) + if ass: + return ass + else: + return None + + def has_en(self, str): + my_re = re.compile(r'[a-z]', re.S) + res = re.findall(my_re, str) + if len(res): + return True + else: + return False + + def has_zh(self, str): + my_re = re.compile(r'[\u4e00-\u9fa5]', re.S) + res = re.findall(my_re, str) + if len(res): + return True + else: + return False + + # 拿到的数据挨个测试 + def get_info(self): + # 获取到的信息 + info = { + "group": self.get_group(), + "dpi": self.get_dpi(), + "season": self.get_season(), + "episode": self.get_episode(), + "vision": self.get_vision(), + "lang": self.get_language(), + "ass": self.get_ass(), + "type": self.get_type(), + "code": self.get_code(), + "source": self.get_source() + } + + # 字母全部小写 + clean_name = self.file_name.lower() + # 去除拿到的有效信息 + for k, v in info.items(): + if v is not None: + if type(v) is list: + for i in v: + if i is not None: + clean_name = clean_name.replace(i, "") + else: + clean_name = clean_name.replace(v, "") + # 除杂 + clean_list = ["pc&psp", "pc&psv", "fin", "opus", "movie", "tvb", "end", "bangumi.online", "donghua", + "话全", "第话", "第集", "全集", " 话", " 集", "+", "@", "。"] + for i in clean_list: + clean_name = clean_name.replace(i, "").replace(" ]", "]").replace("[ ", "[").replace(" ", "") + # 分隔各字段 + clean_name = clean_name.replace("[", "").replace("]", " ").replace("()", "").replace("( )", "") + # 去除多余空格 + clean_name = re.sub(' +', ' ', clean_name).strip(" ") + # 剩下来的几乎就是干净番名了,再刮不到不管了 + info["clean_name"] = clean_name + + title = { + "zh": None, + "en": None, + } + clean_name = re.sub('[^a-zA-Z\u4e00-\u9fa5:@#$%^&*()\[\]/ ]', "", clean_name) + clean_name = re.sub(' +', ' ', clean_name).strip(" ") + + if self.has_en(clean_name) and self.has_zh(clean_name): + # 中英 + try: + res = re.search("(([\u4e00-\u9fa5]{2,12}[ /:]{0,3}){1,5}) {0,5}(( ?[a-z':]{1,15}){1,15})", clean_name) + title["zh"] = res.group(1).strip(" ") + title["en"] = res.group(3).strip(" ") + except Exception as e: + logging.info(e) + # 本程序依赖此bug运行,这行不能删 + if title["zh"] is None: + # 中英 + try: + res = re.search( + "(([\u4e00-\u9fa5a]{1,12}[ /:]{0,3}){1,5})[&/ (]{0,5}(( ?[a-z':]{1,15}){1,15})[ )/]{0,3}", + clean_name) + title["zh"] = res.group(1).strip(" ") + title["en"] = res.group(3).strip(" ") + except Exception as e: + logging.info(e) + # 英中 + try: + res = re.search( + "(([ a-z'.:]{1,20}){1,8})[&/ (]{0,5}(([\u4e00-\u9fa5a]{2,10}[a-z]{0,3} ?){1,5})[ )/]{0,3}", + clean_name) + title["en"] = res.group(1).strip(" ") + title["zh"] = res.group(3).strip(" ") + except Exception as e: + logging.info(e) + else: + if self.has_zh(clean_name): + # 中文 + try: + res = re.search("(([\u4e00-\u9fa5:]{2,15}[ /]?){1,5}) *", clean_name) + title["zh"] = res.group(1).strip(" ") + except Exception as e: + logging.info(e) + elif self.has_en(clean_name): + # 英文 + try: + res = re.search("(([a-z:]{2,15}[ /]?){1,15}) *", clean_name) + title["en"] = res.group(1).strip(" ") + except Exception as e: + logging.info(e) + for k, v in title.items(): + if v is not None and "/" in v: + zh_list = v.split("/") + title[k] = zh_list[0].strip(" ") + info["title"] = title + return info + + +if __name__ == "__main__": + # 使用方法 + print(Rename(name).get_info()) diff --git a/Windows/rule.json b/Windows/rule.json new file mode 100644 index 00000000..ac2c8588 --- /dev/null +++ b/Windows/rule.json @@ -0,0 +1,56 @@ +[ + { + "group_name": [ + "Lilith-Raws", + "NC-Raws", + "Skymoon-Raws", + "天月搬运组", + "LoliHouse", + "猎户不鸽发布组", + "NaN-Raws", + "猎户随缘发布组", + "桜都字幕组", + "澄空学园&雪飘工作室", + "千夏字幕组", + "IET字幕组", + "离谱Sub", + "酷漫404", + "星空字幕组", + "轻之国度字幕组", + "枫叶字幕组", + "雪飘工作室", + "豌豆字幕组", + "云光字幕组", + "悠哈璃羽字幕社", + "桜都字幕组", + "ANi", + "❀拨雪寻春❀", + "极彩字幕组", + "悠哈璃羽字幕社", + "爱恋&漫猫字幕组", + "MingY", + "VCB-Studio", + "喵萌奶茶屋", + "爱恋字母社", + "诸神字幕组", + "驯兽师联盟", + "夏沐字幕组", + "幻樱", + "动漫国字幕组", + "SweetSub&圆环记录攻略组", + "动漫萌", + "极影字幕社", + "喵萌Production", + "50yrs ago", + "40yrs ago", + "30yrs ago", + "20yrs ago", + "s5291s", + "nvacg", + "RHxDymy", + "PoInSu", + "DHR百合組" + ], + "name_position": 1 + } +] \ No newline at end of file