mirror of
https://github.com/EstrellaXD/Auto_Bangumi.git
synced 2026-05-01 22:12:18 +08:00
2.6.0
- refactor - change config from config.ini to config.json
This commit is contained in:
1
module/parser/__init__.py
Normal file
1
module/parser/__init__.py
Normal file
@@ -0,0 +1 @@
|
||||
from .title_parser import TitleParser
|
||||
4
module/parser/analyser/__init__.py
Normal file
4
module/parser/analyser/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .raw_parser import RawParser
|
||||
from .rename_parser import DownloadParser
|
||||
from .tmdb_parser import TMDBMatcher
|
||||
|
||||
17
module/parser/analyser/bgm_parser.py
Normal file
17
module/parser/analyser/bgm_parser.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from module.network import RequestContent
|
||||
|
||||
|
||||
class BgmAPI:
    """Small client for the bgm.tv subject search endpoint."""

    def __init__(self):
        self._request = RequestContent()

    @staticmethod
    def search_url(e):
        """Return the subject-search URL for keyword ``e``."""
        return f"https://api.bgm.tv/search/subject/{e}?type=2"

    @staticmethod
    def info_url(e):
        """Return the subject-detail URL for subject id ``e``."""
        return f"https://api.bgm.tv/subject/{e}"

    def search(self, title):
        """Look up ``title`` and return the first hit.

        Returns:
            ``(name, name_cn)`` of the first entry in the response's ``list``,
            or ``None`` when the response carries no entries.
        """
        url = self.search_url(title)
        # NOTE(review): assumes get_json returns a dict (or None on failure);
        # a response without a "list" key is treated as "no result" instead
        # of raising KeyError.
        payload = self._request.get_json(url) or {}
        contents = payload.get("list")
        if not contents:
            return None
        first = contents[0]
        return first["name"], first["name_cn"]
|
||||
168
module/parser/analyser/raw_parser.py
Normal file
168
module/parser/analyser/raw_parser.py
Normal file
@@ -0,0 +1,168 @@
|
||||
import logging
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
|
||||
# from parser.episode import Episode
|
||||
|
||||
# Module-level logger used by RawParser.analyse.
logger = logging.getLogger(__name__)

# First run of digits inside an episode token such as " - 02" or "[716]".
EPISODE_RE = re.compile(r"\d+")

# Splits a title into (text before the episode, episode token, remaining tags).
TITLE_RE = re.compile(
    r"(.*|\[.*])( -? \d+|\[\d+]|\[\d+.?[vV]\d{1}]|[第]\d+[话話集]|\[\d+.?END])(.*)"
)

# Tag classifiers applied by RawParser.find_tags to each whitespace-separated tag.
RESOLUTION_RE = re.compile(r"1080|720|2160|4K")
SOURCE_RE = re.compile(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web")
SUB_RE = re.compile(r"[简繁日字幕]|CH|BIG5|GB")

# Chinese numerals one..ten mapped to their integer values (for "第二季" style markers).
CHINESE_NUMBER_MAP = dict(zip("一二三四五六七八九十", range(1, 11)))
|
||||
|
||||
|
||||
@dataclass
|
||||
class Episode:
|
||||
title_en: str or None
|
||||
title_zh: str or None
|
||||
title_jp: str or None
|
||||
season: int
|
||||
season_raw: str
|
||||
episode: int
|
||||
sub: str
|
||||
group: str
|
||||
resolution: str
|
||||
source: str
|
||||
|
||||
|
||||
class RawParser:
|
||||
@staticmethod
|
||||
def get_group(name: str) -> str:
|
||||
return re.split(r"[\[\]]", name)[1]
|
||||
|
||||
@staticmethod
|
||||
def pre_process(raw_name: str) -> str:
|
||||
return raw_name.replace("【", "[").replace("】", "]")
|
||||
|
||||
@staticmethod
|
||||
def season_process(season_info: str):
|
||||
if re.search(r"新番|月?番", season_info):
|
||||
name_season = re.sub(".*新番.", "", season_info)
|
||||
else:
|
||||
name_season = re.sub(r"^[^]】]*[]】]", "", season_info).strip()
|
||||
season_rule = r"S\d{1,2}|Season \d{1,2}|[第].[季期]"
|
||||
name_season = re.sub(r"[\[\]]", " ", name_season)
|
||||
seasons = re.findall(season_rule, name_season)
|
||||
if not seasons:
|
||||
return name_season, "", 1
|
||||
name = re.sub(season_rule, "", name_season)
|
||||
for season in seasons:
|
||||
season_raw = season
|
||||
if re.search(r"S|Season", season) is not None:
|
||||
season = int(re.sub(r"S|Season", "", season))
|
||||
break
|
||||
elif re.search(r"[第 ].*[季期]", season) is not None:
|
||||
season_pro = re.sub(r"[第季期 ]", "", season)
|
||||
try:
|
||||
season = int(season_pro)
|
||||
except ValueError:
|
||||
season = CHINESE_NUMBER_MAP[season_pro]
|
||||
break
|
||||
return name, season_raw, season
|
||||
|
||||
@staticmethod
|
||||
def name_process(name: str):
|
||||
name_en, name_zh, name_jp = None, None, None
|
||||
name = name.strip()
|
||||
name = re.sub(r"[((]仅限港澳台地区[))]", "", name)
|
||||
split = re.split("/|\s{2}|-\s{2}", name)
|
||||
while "" in split:
|
||||
split.remove("")
|
||||
if len(split) == 1:
|
||||
if re.search("_{1}", name) is not None:
|
||||
split = re.split("_", name)
|
||||
elif re.search(" - {1}", name) is not None:
|
||||
split = re.split("-", name)
|
||||
if len(split) == 1:
|
||||
split_space = name.split(" ")
|
||||
for idx, item in enumerate(split_space):
|
||||
if re.search(r"^[\u4e00-\u9fa5]{2,}", item) is not None:
|
||||
split_space.remove(item)
|
||||
split = [item.strip(), " ".join(split_space).strip()]
|
||||
break
|
||||
for item in split:
|
||||
if re.search(r"[\u0800-\u4e00]{2,}", item) and not name_jp:
|
||||
name_jp = item.strip()
|
||||
elif re.search(r"[\u4e00-\u9fa5]{2,}", item) and not name_zh:
|
||||
name_zh = item.strip()
|
||||
elif re.search(r"[a-zA-Z]{3,}", item) and not name_en:
|
||||
name_en = item.strip()
|
||||
return name_en, name_zh, name_jp
|
||||
|
||||
@staticmethod
|
||||
def find_tags(other):
|
||||
elements = re.sub(r"[\[\]()()]", " ", other).split(" ")
|
||||
# find CHT
|
||||
sub, resolution, source = None, None, None
|
||||
for element in filter(lambda x: x != "", elements):
|
||||
if SUB_RE.search(element):
|
||||
sub = element
|
||||
elif RESOLUTION_RE.search(element):
|
||||
resolution = element
|
||||
elif SOURCE_RE.search(element):
|
||||
source = element
|
||||
return RawParser.clean_sub(sub), resolution, source
|
||||
|
||||
@staticmethod
|
||||
def clean_sub(sub: str | None) -> str | None:
|
||||
if sub is None:
|
||||
return sub
|
||||
return re.sub(r"_MP4|_MKV", "", sub)
|
||||
|
||||
def process(self, raw_title: str):
|
||||
raw_title = raw_title.strip()
|
||||
content_title = self.pre_process(raw_title) # 预处理标题
|
||||
group = self.get_group(content_title) # 翻译组的名字
|
||||
match_obj = TITLE_RE.match(content_title) # 处理标题
|
||||
season_info, episode_info, other = list(map(
|
||||
lambda x: x.strip(), match_obj.groups()
|
||||
))
|
||||
raw_name, season_raw, season = self.season_process(season_info) # 处理 第n季
|
||||
name_en, name_zh, name_jp = "", "", ""
|
||||
try:
|
||||
name_en, name_zh, name_jp = self.name_process(raw_name) # 处理 名字
|
||||
except ValueError:
|
||||
pass
|
||||
# 处理 集数
|
||||
raw_episode = EPISODE_RE.search(episode_info)
|
||||
episode = 0
|
||||
if raw_episode is not None:
|
||||
episode = int(raw_episode.group())
|
||||
sub, dpi, source = self.find_tags(other) # 剩余信息处理
|
||||
return name_en, name_zh, name_jp, season, season_raw, episode, sub, dpi, source, group
|
||||
|
||||
def analyse(self, raw: str) -> Episode or None:
|
||||
try:
|
||||
ret = self.process(raw)
|
||||
if ret is None:
|
||||
return None
|
||||
name_en, name_zh, name_jp, season, sr, episode, \
|
||||
sub, dpi, source, group = ret
|
||||
except Exception as e:
|
||||
logger.error(f"Parser cannot analyse {raw} {e}")
|
||||
return None
|
||||
return Episode(name_en, name_zh, name_jp, season, sr, episode, sub, group, dpi, source)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: parse one sample title and print the extracted names.
    sample_title = "[梦蓝字幕组]New Doraemon 哆啦A梦新番[716][2022.07.23][AVC][10080P][GB_JP]"
    parser = RawParser()
    ep = parser.analyse(sample_title)
    print(f"en:{ep.title_en}, zh:{ep.title_zh}, jp:{ep.title_jp}, group:{ep.group}")
|
||||
102
module/parser/analyser/rename_parser.py
Normal file
102
module/parser/analyser/rename_parser.py
Normal file
@@ -0,0 +1,102 @@
|
||||
import re
|
||||
import logging
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
logger = logging.getLogger(__name__)


@dataclass
class DownloadInfo:
    """Inputs shared by every rename strategy of ``DownloadParser``."""

    # Original file name as downloaded.
    name: str
    # Season number, zero-padded to two digits (e.g. "01", "12").
    season: str
    # File extension to append to the new name.
    suffix: str
    # ``name`` with the first "[...]" tag removed.
    file_name: str
    # Name of the folder the file lives in.
    folder_name: str


class DownloadParser:
    """Build a ``<title> SxxEyy`` style file name from a downloaded file name."""

    def __init__(self):
        # Tried in order; group 1 is the title, group 2 the episode number,
        # group 3 whatever follows the episode.
        self.rules = [
            r"(.*) - (\d{1,4}|\d{1,4}\.\d{1,2})(?:v\d{1,2})?(?: )?(?:END)?(.*)",
            r"(.*)[\[ E](\d{1,3}|\d{1,3}\.\d{1,2})(?:v\d{1,2})?(?: )?(?:END)?[\] ](.*)",
            r"(.*)\[第(\d*\.*\d*)话(?:END)?\](.*)",
            r"(.*)\[第(\d*\.*\d*)話(?:END)?\](.*)",
            r"(.*)第(\d*\.*\d*)话(?:END)?(.*)",
            r"(.*)第(\d*\.*\d*)話(?:END)?(.*)",
        ]

    @staticmethod
    def rename_init(name, folder_name, season, suffix) -> DownloadInfo:
        """Collect the rename inputs into a ``DownloadInfo``.

        ``suffix`` falls back to the text after the last bracket when it is
        ``None``. A name without any bracket is kept unchanged as
        ``file_name`` instead of raising ``IndexError``.
        """
        # Full-width parentheses are spelled as \uXXXX escapes so they survive
        # width-normalising tools; the ASCII forms are accepted as well.
        parts = re.split(r"[\[\]()\uff08\uff09【】]", name)
        suffix = suffix if suffix is not None else parts[-1]
        file_name = name.replace(f"[{parts[1]}]", "") if len(parts) > 1 else name
        # Always a two-digit string, so the season has one type for all values.
        season = f"{int(season):02d}"
        return DownloadInfo(name, season, suffix, file_name, folder_name)

    def _first_match(self, text):
        """Return the match of the first rule that fits ``text``, else ``None``."""
        for rule in self.rules:
            match_obj = re.match(rule, text, re.I)
            if match_obj is not None:
                return match_obj
        return None

    def rename_normal(self, info: DownloadInfo):
        """Keep the original name, replacing its season/episode part.

        Returns ``None`` when no rule matches ``info.name``.
        """
        match_obj = self._first_match(info.name)
        if match_obj is None:
            return None
        title = re.sub(r"([Ss]|Season )\d{1,3}", "", match_obj.group(1)).strip()
        return f"{title} S{info.season}E{match_obj.group(2)}{match_obj.group(3)}"

    def rename_pn(self, info: DownloadInfo):
        """Return ``<title> SxxEyy<suffix>`` using the title found in the file name.

        The folder name is used when the file name yields an empty title.
        Returns ``None`` when no rule matches ``info.file_name``.
        """
        match_obj = self._first_match(info.file_name)
        if match_obj is None:
            return None
        title = re.sub(r"([Ss]|Season )\d{1,3}", "", match_obj.group(1)).strip()
        title = title if title != "" else info.folder_name
        return re.sub(
            r"[\[\]]",
            "",
            f"{title} S{info.season}E{match_obj.group(2)}{info.suffix}",
        )

    def rename_advance(self, info: DownloadInfo):
        """Return ``<folder name> SxxEyy<suffix>``.

        Returns ``None`` when no rule matches ``info.file_name``.
        """
        match_obj = self._first_match(info.file_name)
        if match_obj is None:
            return None
        return re.sub(
            r"[\[\]]",
            "",
            f"{info.folder_name} S{info.season}E{match_obj.group(2)}{info.suffix}",
        )

    def rename_no_season_pn(self, info: DownloadInfo):
        """Return ``<title> Eyy<suffix>`` without a season part.

        Returns ``None`` when no rule matches ``info.file_name``.
        """
        match_obj = self._first_match(info.file_name)
        if match_obj is None:
            return None
        title = match_obj.group(1).strip()
        return re.sub(
            r"[\[\]]",
            "",
            f"{title} E{match_obj.group(2)}{info.suffix}",
        )

    @staticmethod
    def rename_none(info: DownloadInfo):
        """Return the original name untouched."""
        return info.name

    def download_rename(self, name, folder_name, season, suffix, method):
        """Rename ``name`` with the strategy called ``method`` (case-insensitive).

        Raises:
            KeyError: if ``method`` is not one of ``normal``, ``pn``,
                ``advance``, ``no_season_pn`` or ``none``.
        """
        rename_info = self.rename_init(name, folder_name, season, suffix)
        method_dict = {
            "normal": self.rename_normal,
            "pn": self.rename_pn,
            "advance": self.rename_advance,
            "no_season_pn": self.rename_no_season_pn,
            "none": self.rename_none,
        }
        return method_dict[method.lower()](rename_info)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: rename one sample file name with the "normal" strategy.
    sample_name = "[Lilith-Raws] Tate no Yuusha no Nariagari S02 - 02 [Baha][WEB-DL][1080p][AVC AAC][CHT][MP4]"
    renamer = DownloadParser()
    print(renamer.download_rename(sample_name, "异世界舅舅(2022)", 1, ".mp4", "normal"))
|
||||
73
module/parser/analyser/tmdb_parser.py
Normal file
73
module/parser/analyser/tmdb_parser.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import re
|
||||
import time
|
||||
from dataclasses import dataclass
|
||||
|
||||
from module.network import RequestContent
|
||||
from module.conf import TMDB_API
|
||||
|
||||
|
||||
@dataclass
|
||||
class TMDBInfo:
|
||||
id: int
|
||||
title_jp: str
|
||||
title_zh: str
|
||||
season: dict
|
||||
last_season: int
|
||||
year_number: int
|
||||
|
||||
|
||||
class TMDBMatcher:
|
||||
def __init__(self):
|
||||
self.search_url = lambda e: \
|
||||
f"https://api.themoviedb.org/3/search/tv?api_key={TMDB_API}&page=1&query={e}&include_adult=false"
|
||||
self.info_url = lambda e: \
|
||||
f"https://api.themoviedb.org/3/tv/{e}?api_key={TMDB_API}&language=zh-CN"
|
||||
self._request = RequestContent()
|
||||
|
||||
def is_animation(self, tv_id) -> bool:
|
||||
url_info = self.info_url(tv_id)
|
||||
type_id = self._request.get_json(url_info)["genres"]
|
||||
for type in type_id:
|
||||
if type.get("id") == 16:
|
||||
return True
|
||||
return False
|
||||
|
||||
# def get_zh_title(self, id):
|
||||
# alt_title_url = self.alt_title_url(id)
|
||||
# titles = self._request.get_content(alt_title_url, content="json")
|
||||
# for title in titles:
|
||||
# if title["iso_3166_1"] == "CN":
|
||||
# return title["title"]
|
||||
# return None
|
||||
|
||||
@staticmethod
|
||||
def get_season(seasons: list) -> int:
|
||||
for season in seasons:
|
||||
if re.search(r"第 \d 季", season.get("season")) is not None:
|
||||
date = season.get("air_date").split("-")
|
||||
[year, _ , _] = date
|
||||
now_year = time.localtime().tm_year
|
||||
if int(year) == now_year:
|
||||
return int(re.findall(r"\d", season.get("season"))[0])
|
||||
|
||||
def tmdb_search(self, title) -> TMDBInfo:
|
||||
url = self.search_url(title)
|
||||
contents = self._request.get_json(url).get("results")
|
||||
if contents.__len__() == 0:
|
||||
url = self.search_url(title.replace(" ", ""))
|
||||
contents = self._request.get_json(url).get("results")
|
||||
# 判断动画
|
||||
for content in contents:
|
||||
id = content["id"]
|
||||
if self.is_animation(id):
|
||||
break
|
||||
url_info = self.info_url(id)
|
||||
info_content = self._request.get_json(url_info)
|
||||
# 关闭链接
|
||||
self._request.close()
|
||||
season = [{"season": s.get("name"), "air_date": s.get("air_date")} for s in info_content.get("seasons")]
|
||||
last_season = self.get_season(season)
|
||||
title_jp = info_content.get("original_name")
|
||||
title_zh = info_content.get("name")
|
||||
year_number = info_content.get("first_air_date").split("-")[0]
|
||||
return TMDBInfo(id, title_jp, title_zh, season, last_season, year_number)
|
||||
52
module/parser/fuzz_match.py
Normal file
52
module/parser/fuzz_match.py
Normal file
@@ -0,0 +1,52 @@
|
||||
from thefuzz import fuzz
|
||||
import logging
|
||||
from utils import json_config
|
||||
from conf import settings
|
||||
|
||||
logger = logging.getLogger(__name__)


class FuzzMatch:
    """Fuzzy-match a raw title against a list of known title records."""

    def __init__(self):
        # Prefer freshly fetched data and refresh the local copy with it;
        # fall back to the local copy when fetching or saving fails.
        try:
            anidb_data = json_config.get(settings.anidb_url)
            json_config.save(settings.anidb_path, anidb_data)
        except Exception as e:
            logger.debug(e)
            logger.info("Fail to get anidb data, reading local data")
            anidb_data = json_config.load(settings.anidb_path)
        self.match_data = anidb_data

    @staticmethod
    def match(title_raw, info: dict):
        """Return the best ``token_sort_ratio`` between ``title_raw`` and ``info``.

        Every non-None title under the language keys plus every entry of
        ``info["other"]`` is compared case-insensitively.

        Returns:
            The highest score, or ``0`` when ``info`` holds no title at all.
        """
        needle = title_raw.lower()
        candidates = [
            info[tag]
            for tag in ("main", "en", "ja", "zh-Hans", "zh-Hant")
            if info[tag] is not None
        ]
        candidates.extend(info["other"] or [])
        return max(
            (fuzz.token_sort_ratio(needle, candidate.lower()) for candidate in candidates),
            default=0,
        )

    def find_max_name(self, title_raw):
        """Return ``(score, main title)`` of the best-matching record.

        Returns:
            ``(0, None)`` when there is no data or no record scores above 0.
        """
        max_value = 0
        max_info = None
        for info in self.match_data:
            score = self.match(title_raw, info)
            if score > max_value:
                max_value = score
                max_info = info
        if max_info is None:
            return max_value, None
        return max_value, max_info["main"]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test against the development settings.
    from conf.const_dev import DEV_SETTINGS

    settings.init(DEV_SETTINGS)
    matcher = FuzzMatch()
    name = "勇者、辞职不干了"
    value, title = matcher.find_max_name(name)
    report = (
        f"Raw Name: {name} \n"
        f"Match Name: {title} \n"
        f"Match Value: {value}"
    )
    print(report)
|
||||
64
module/parser/title_parser.py
Normal file
64
module/parser/title_parser.py
Normal file
@@ -0,0 +1,64 @@
|
||||
import logging
|
||||
|
||||
from .analyser import RawParser, DownloadParser, TMDBMatcher
|
||||
from module.conf import settings
|
||||
|
||||
# Module-level logger for the title parser.
logger = logging.getLogger(__name__)

# Title language from the RSS parser settings, read once when this module is
# imported; TitleParser compares it against "zh" and "jp".
LANGUAGE = settings.rss_parser.language
|
||||
|
||||
class TitleParser:
    """Facade over the raw-title, download-rename and TMDB parsers."""

    def __init__(self):
        self._raw_parser = RawParser()
        self._download_parser = DownloadParser()
        self._tmdb_parser = TMDBMatcher()

    def raw_parser(self, raw: str):
        """Parse a raw release title; returns whatever ``RawParser.analyse`` returns."""
        return self._raw_parser.analyse(raw)

    def download_parser(self, download_raw, folder_name, season, suffix, method=None):
        """Rename a downloaded file name.

        ``method`` defaults to ``settings.bangumi_manage.method``, looked up at
        call time so the current setting is used rather than the value that
        happened to be loaded when this module was imported.
        """
        if method is None:
            method = settings.bangumi_manage.method
        return self._download_parser.download_rename(download_raw, folder_name, season, suffix, method)

    def tmdb_parser(self, title: str, season: int):
        """Resolve the official title and season through TMDB.

        Returns:
            ``(official_title, season)``. The inputs are returned unchanged
            when the TMDB lookup fails; ``title`` is also kept when
            ``LANGUAGE`` is neither ``"zh"`` nor ``"jp"``.
        """
        try:
            tmdb_info = self._tmdb_parser.tmdb_search(title)
            logger.debug(f"TMDB Matched, official title is {tmdb_info.title_zh}")
        except Exception as e:
            logger.debug(e)
            logger.warning("Not Matched with TMDB")
            return title, season
        official_title = None
        if LANGUAGE == "zh":
            official_title = f"{tmdb_info.title_zh} ({tmdb_info.year_number})"
        elif LANGUAGE == "jp":
            official_title = f"{tmdb_info.title_jp} ({tmdb_info.year_number})"
        tmdb_season = tmdb_info.last_season if tmdb_info.last_season else season
        return (official_title if official_title else title), tmdb_season

    def return_dict(self, _raw: str):
        """Parse ``_raw`` into the dict stored for one bangumi entry.

        Returns:
            The entry dict, or ``None`` when the title cannot be parsed or
            building the entry fails.
        """
        try:
            episode = self.raw_parser(_raw)
            if episode is None:
                # Unparseable title; nothing to build.
                return None
            title_search = episode.title_zh if episode.title_zh else episode.title_en
            title_raw = episode.title_en if episode.title_en else episode.title_zh
            if settings.rss_parser.enable_tmdb:
                official_title, _season = self.tmdb_parser(title_search, episode.season)
            else:
                official_title = title_search if LANGUAGE == "zh" else title_raw
                _season = episode.season
            data = {
                "official_title": official_title,
                "title_raw": title_raw,
                "season": _season,
                "season_raw": episode.season_raw,
                "group": episode.group,
                "dpi": episode.resolution,
                "source": episode.source,
                "subtitle": episode.sub,
                "added": False,
                "eps_collect": episode.episode > 1,
            }
            logger.debug(f"RAW:{_raw} >> {episode.title_en}")
            return data
        except Exception as e:
            logger.debug(e)
            return None
|
||||
Reference in New Issue
Block a user