Files
Auto_Bangumi/module/parser/analyser/raw_parser.py
EstrellaXD b0ccb47dd9 Fix parser.
Close #157
Close #121
Close #152
2023-03-05 15:14:52 +08:00

196 lines
6.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import logging
import re
from dataclasses import dataclass
# from parser.episode import Episode
logger = logging.getLogger(__name__)

# First run of digits in the episode token, e.g. "[02]" -> "02".
EPISODE_RE = re.compile(r"\d+")
# Splits a raw release title into (series info, episode token, trailing tags).
TITLE_RE = re.compile(
    r"(.*|\[.*])( -? \d+|\[\d+]|\[\d+.?[vV]\d{1}]|[第]?\d+[话話集]|\[\d+.?END])(.*)"
)
RESOLUTION_RE = re.compile(r"1080|720|2160|4K")
SOURCE_RE = re.compile(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web")
SUB_RE = re.compile(r"[简繁日字幕]|CH|BIG5|GB")
# Chinese numerals -> int, used for season markers such as 「第二季」.
# NOTE: a previous revision had these key characters stripped, leaving ten
# duplicate "" keys that collapsed to {"": 10}; the intended mapping is
# restored here.
CHINESE_NUMBER_MAP = {
    "一": 1,
    "二": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "十": 10,
}
@dataclass
class Episode:
title_en: str or None
title_zh: str or None
title_jp: str or None
season: int
season_raw: str
episode: int
sub: str
group: str
resolution: str
source: str
class RawParser:
    """Parse a raw torrent/release title into its structured components.

    Entry point is :meth:`analyse`, which returns an :class:`Episode`
    (or ``None`` when the title cannot be matched at all).
    """

    @staticmethod
    def get_group(name: str) -> str:
        """Return the release-group name: the text inside the first ``[...]``."""
        return re.split(r"[\[\]]", name)[1]

    @staticmethod
    def pre_process(raw_name: str) -> str:
        """Normalize full-width CJK brackets to ASCII brackets.

        NOTE: a previous revision had the bracket literals stripped to empty
        strings; ``replace("", "[")`` inserts ``[`` between every character
        and corrupted every title. The intended 【】 -> [] mapping is restored.
        """
        return raw_name.replace("【", "[").replace("】", "]")

    def preffix_process(self, raw: str, group: str) -> str:
        """Strip group-name and seasonal-listing noise from the title prefix.

        Punctuation outside word chars / CJK ranges is collapsed into "/"
        separators; segments that are pure noise (the group tag, short
        "新番" seasonal markers, the region-restriction note) are dropped.
        If exactly one meaningful segment remains it is returned alone,
        otherwise the original segmentation is rejoined unchanged.
        """
        raw_process = re.sub(r"[^\w\s\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff-]", "/", raw)
        arg_group = raw_process.split("/")
        title_list = []
        for arg in arg_group:
            # NOTE(review): `group` is used as a regex pattern; a group name
            # containing metacharacters (e.g. "+") could raise re.error —
            # consider re.escape(group). Left as-is to preserve behavior.
            if re.search(group, arg):
                pass  # the release-group tag itself
            elif re.search(r"新番|月?番", arg) and len(arg) <= 5:
                pass  # short seasonal-listing marker, e.g. "10月新番"
            elif re.search(r"港澳台地区", arg):
                pass  # region-restriction note
            elif arg == "":
                pass
            else:
                title_list.append(arg)
        if len(title_list) == 1:
            return title_list[0]
        return "/".join(arg_group)

    @staticmethod
    def season_process(season_info: str):
        """Extract the season from the name segment.

        Returns ``(name_without_season, season_raw, season)`` where
        ``season`` defaults to 1 (and ``season_raw`` to "") when no season
        marker is present.
        """
        name_season = season_info
        # Drop the seasonal-listing ("新番") prefix.
        if re.search(r"新番|月?番", season_info):
            name_season = re.sub(".*新番.", "", season_info)
        # Drop a leading bracketed tag (up to the first closing bracket).
        name_season = re.sub(r"^[^]】]*[]】]", "", name_season).strip()
        season_rule = r"S\d{1,2}|Season \d{1,2}|[第].[季期]"
        name_season = re.sub(r"[\[\]]", " ", name_season)
        seasons = re.findall(season_rule, name_season)
        if not seasons:
            return name_season, "", 1
        name = re.sub(season_rule, "", name_season)
        for season in seasons:
            season_raw = season
            if re.search(r"Season|S", season) is not None:
                season = int(re.sub(r"Season|S", "", season))
                break
            elif re.search(r"[第 ].*[季期(部分)]|部分", season) is not None:
                season_pro = re.sub(r"[第季期 ]", "", season)
                try:
                    season = int(season_pro)
                except ValueError:
                    # Chinese numerals, e.g. 第二季 -> 2.
                    season = CHINESE_NUMBER_MAP[season_pro]
                break
        return name, season_raw, season

    @staticmethod
    def name_process(name: str):
        """Split the title into (english, chinese, japanese) variants.

        Any of the three may be ``None`` when that language is absent.
        """
        name_en, name_zh, name_jp = None, None, None
        name = name.strip()
        name = re.sub(r"[(]仅限港澳台地区[)]", "", name)
        split = re.split(r"/|\s{2}|-\s{2}", name)
        while "" in split:
            split.remove("")
        if len(split) == 1:
            # Single segment: try secondary separators.
            if re.search("_{1}", name) is not None:
                split = re.split("_", name)
            elif re.search(" - {1}", name) is not None:
                split = re.split("-", name)
        if len(split) == 1:
            # Still one segment: peel off a leading CJK word as its own part.
            split_space = split[0].split(" ")
            for idx, item in enumerate(split_space):
                if re.search(r"^[\u4e00-\u9fa5]{2,}", item) is not None:
                    split_space.remove(item)
                    split = [item.strip(), " ".join(split_space).strip()]
                    break
        for item in split:
            if re.search(r"[\u0800-\u4e00]{2,}", item) and not name_jp:
                name_jp = item.strip()
            elif re.search(r"[\u4e00-\u9fa5]{2,}", item) and not name_zh:
                name_zh = item.strip()
            elif re.search(r"[a-zA-Z]{3,}", item) and not name_en:
                name_en = item.strip()
        return name_en, name_zh, name_jp

    @staticmethod
    def find_tags(other):
        """Scan the trailing tag section for subtitle / resolution / source."""
        elements = re.sub(r"[\[\]()]", " ", other).split(" ")
        sub, resolution, source = None, None, None
        for element in filter(lambda x: x != "", elements):
            if SUB_RE.search(element):
                sub = element
            elif RESOLUTION_RE.search(element):
                resolution = element
            elif SOURCE_RE.search(element):
                source = element
        return RawParser.clean_sub(sub), resolution, source

    @staticmethod
    def clean_sub(sub: str | None) -> str | None:
        """Strip container-name suffixes (``_MP4`` / ``_MKV``) from a sub tag."""
        if sub is None:
            return sub
        return re.sub(r"_MP4|_MKV", "", sub)

    def process(self, raw_title: str):
        """Run the full pipeline on one raw title.

        Returns a 10-tuple of parsed fields, or ``None`` when the title does
        not match TITLE_RE at all (``analyse`` already expects ``None`` here;
        previously this path crashed with AttributeError instead).
        """
        raw_title = raw_title.strip()
        content_title = self.pre_process(raw_title)   # normalize brackets
        group = self.get_group(content_title)         # fansub group name
        match_obj = TITLE_RE.match(content_title)     # split into 3 sections
        if match_obj is None:
            return None
        season_info, episode_info, other = list(map(
            lambda x: x.strip(), match_obj.groups()
        ))
        process_raw = self.preffix_process(season_info, group)  # strip prefix
        raw_name, season_raw, season = self.season_process(process_raw)  # season
        name_en, name_zh, name_jp = "", "", ""
        try:
            name_en, name_zh, name_jp = self.name_process(raw_name)  # names
        except ValueError:
            pass
        # Episode number.
        raw_episode = EPISODE_RE.search(episode_info)
        episode = 0
        if raw_episode is not None:
            episode = int(raw_episode.group())
        sub, dpi, source = self.find_tags(other)  # remaining tag info
        return name_en, name_zh, name_jp, season, season_raw, episode, sub, dpi, source, group

    def analyse(self, raw: str) -> Episode | None:
        """Parse ``raw`` into an :class:`Episode`; ``None`` on failure."""
        ret = self.process(raw)
        if ret is None:
            logger.error(f"Parser cannot analyse {raw}")
            return None
        name_en, name_zh, name_jp, season, sr, episode, \
            sub, dpi, source, group = ret
        return Episode(name_en, name_zh, name_jp, season, sr, episode, sub, group, dpi, source)
if __name__ == "__main__":
    # Quick manual smoke test: parse a couple of known-tricky raw titles.
    parser = RawParser()
    samples = (
        "[ANi] Urusei Yatsura - 她来自烦星(仅限港澳台地区) - 19 [1080P][Bilibili][WEB-DL][AAC AVC][CHT CHS][MP4]",
        "[织梦字幕组][尼尔:机械纪元 NieR Automata Ver1.1a][第02集][1080P][AVC][简日双语]",
    )
    for raw_title in samples:
        episode = parser.analyse(raw_title)
        print(f"en:{episode.title_en}, zh:{episode.title_zh}, jp:{episode.title_jp}, group:{episode.group}")