Merge pull request #43 from RanKKI/unittest

修改了下 Raw Parser 的格式,加了单元测试
This commit is contained in:
Estrella Pan
2022-06-27 20:39:08 +08:00
committed by GitHub
3 changed files with 168 additions and 79 deletions

View File

@@ -4,66 +4,67 @@ from parser.episode import Episode
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# A run of one to three digits; searched inside the episode token captured by
# TITLE_RE to obtain the episode number.
EPISODE_RE = re.compile(r"\d{1,3}")
# Splits a release title into three capture groups:
#   1. everything before the episode token (greedy),
#   2. the episode token itself, one of: " 12 " / " - 12 ", "[12]", "[12v2]",
#      "第12话" / "第12話" / "第12集", or "[12END]",
#   3. everything after the episode token.
TITLE_RE = re.compile(
r"(.*|\[.*])( -? \d{1,3} |\[\d{1,3}]|\[\d{1,3}.?[vV]\d{1}]|[第]\d{1,3}[话話集]|\[\d{1,3}.?END])(.*)"
)
# Resolution tags. The pattern is unanchored, so e.g. "1920X1080" matches via
# its "1080" substring.
RESOLUTION_RE = re.compile(r"1080|720|2160|4K")
# Source tags looked for in the trailing part of the title.
SOURCE_RE = re.compile(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web")
# Subtitle tags: any one of the listed CJK characters, or "CH", "BIG5", "GB"
# appearing anywhere in the element.
SUB_RE = re.compile(r"[简繁日字幕]|CH|BIG5|GB")
# Maps a single Chinese numeral to its integer value. Used as the fallback
# when a season token such as "第二季" does not contain an ASCII digit.
# Every key must be distinct: with empty-string keys the literal collapses to
# a single entry and no numeral can be looked up.
CHINESE_NUMBER_MAP = {
    "一": 1,
    "二": 2,
    "三": 3,
    "四": 4,
    "五": 5,
    "六": 6,
    "七": 7,
    "八": 8,
    "九": 9,
    "十": 10,
}
class RawParser:
def __init__(self) -> None:
self._info = Episode()
@staticmethod
def pre_process(raw_name):
pro_name = raw_name.replace("", "[").replace("", "]")
return pro_name
def get_group(self, name):
self._info.group = re.split(r"[\[\]]", name)[1]
def get_group(name: str) -> str:
    """Return the second fragment of *name* after splitting on square brackets.

    For a title that starts with ``[group]`` this is the group name, because
    the first fragment is the empty string before the opening bracket.

    Raises:
        IndexError: if *name* contains no ``[`` or ``]`` at all.
    """
    fragments = re.split(r"[\[\]]", name)
    return fragments[1]
@staticmethod
def second_process(raw_name):
if re.search(r"新番|月?番", raw_name):
pro_name = re.sub(".*新番.", "", raw_name)
def pre_process(raw_name: str) -> str:
    """Normalise full-width lenticular brackets to ASCII square brackets.

    Replaces every ``【`` with ``[`` and every ``】`` with ``]`` so that the
    bracket-based regexes used later only have to deal with one bracket style.
    All other characters are returned unchanged.
    """
    # The search strings must be the bracket characters themselves: replacing
    # the empty string would insert a bracket between every character.
    return raw_name.replace("【", "[").replace("】", "]")
@staticmethod
def season_process(season_info: str):
if re.search(r"新番|月?番", season_info):
name_season = re.sub(".*新番.", "", season_info)
else:
pro_name = re.sub(r"^[^]】]*[]】]", "", raw_name).strip()
return pro_name
name_season = re.sub(r"^[^]】]*[]】]", "", season_info).strip()
@staticmethod
def season_process(name_season):
season_rule = r"S\d{1,2}|Season \d{1,2}|[第].[季期]"
season_map = {
"": 1,
"": 2,
"": 3,
"": 4,
"": 5,
"": 6,
"": 7,
"": 8,
"": 9,
"": 10,
}
name_season = re.sub(r"[\[\]]", " ", name_season)
seasons = re.findall(season_rule, name_season)
if not seasons:
name = name_season
season_raw = ""
season = 1
else:
name = re.sub(season_rule, "", name_season)
for season in seasons:
season_raw = season
if re.search(r"S|Season", season) is not None:
season = int(re.sub(r"S|Season", "", season))
return name_season, "", 1
name = re.sub(season_rule, "", name_season)
for season in seasons:
season_raw = season
if re.search(r"S|Season", season) is not None:
season = int(re.sub(r"S|Season", "", season))
break
elif re.search(r"[第 ].*[季期]", season) is not None:
season_pro = re.sub(r"[第季期 ]", "", season)
try:
season = int(season_pro)
except ValueError:
season = CHINESE_NUMBER_MAP[season_pro]
break
elif re.search(r"[第 ].*[季期]", season) is not None:
season_pro = re.sub(r"[第季期 ]", "", season)
try:
season = int(season_pro)
except ValueError:
season = season_map[season_pro]
break
return name, season_raw, season
@staticmethod
def name_process(name):
def name_process(name: str):
name = name.strip()
split = re.split("/| |- ", name.replace("(仅限港澳台地区)", ""))
while "" in split:
@@ -74,7 +75,8 @@ class RawParser:
elif re.search(" - {1}", name) is not None:
split = re.split("-", name)
if len(split) == 1:
match_obj = re.match(r"([^\x00-\xff]{1,})(\s)([\x00-\xff]{4,})", name)
match_obj = re.match(
r"([^\x00-\xff]{1,})(\s)([\x00-\xff]{4,})", name)
if match_obj is not None:
return match_obj.group(3), split
compare = 0
@@ -85,49 +87,80 @@ class RawParser:
for name in split:
if re.findall("[aA-zZ]{1}", name).__len__() == compare:
return name.strip(), split
raise ValueError()
@staticmethod
def find_tags(other):
elements = re.sub(r"[\[\]()]", " ", other).split(" ")
while "" in elements:
elements.remove("")
# find CHT
sub = None
dpi = None
source = None
for element in elements:
if re.search(r"[简繁日字幕]|CH|BIG5|GB", element) is not None:
sub = element.replace("_MP4","")
elif re.search(r"1080|720|2160|4K", element) is not None:
dpi = element
elif re.search(r"B-Global|[Bb]aha|[Bb]ilibili|AT-X|Web", element) is not None:
source = element
return sub, dpi, source
sub, resolution, source = None, None, None
def process(self, raw_name):
raw_name = self.pre_process(raw_name)
self.get_group(raw_name)
match_obj = re.match(
r"(.*|\[.*])( -? \d{1,3} |\[\d{1,3}]|\[\d{1,3}.?[vV]\d{1}]|[第]\d{1,3}[话話集]|\[\d{1,3}.?END])(.*)",
raw_name,
)
name_season = self.second_process(match_obj.group(1))
name, season_raw, season = self.season_process(name_season)
name, name_group = self.name_process(name)
episode = int(re.findall(r"\d{1,3}", match_obj.group(2))[0])
other = match_obj.group(3).strip()
sub, dpi, source= self.find_tags(other)
return name, season, season_raw, episode, sub, dpi, source, name_group
for element in filter(lambda x: x != "", elements):
if SUB_RE.search(element):
sub = element
elif RESOLUTION_RE.search(element):
resolution = element
elif SOURCE_RE.search(element):
source = element
return RawParser.clean_sub(sub), resolution, source
@staticmethod
def clean_sub(sub: str | None) -> str | None:
if sub is None:
return sub
# TODO: 这里需要改成更精准的匹配,可能不止 _MP4 ?
return sub.replace("_MP4", "")
def process(self, raw_title: str):
    """Parse a raw release title into its components.

    Args:
        raw_title: the title string as published; surrounding whitespace is
            ignored.

    Returns:
        A 9-tuple ``(name, season, season_raw, episode, sub, dpi, source,
        name_group, group)``, or ``None`` when ``TITLE_RE`` finds no episode
        token in the title. ``name`` and ``name_group`` are empty strings
        when ``name_process`` raises ``ValueError``; ``episode`` is 0 when
        the episode token contains no digits.

    Raises:
        IndexError: propagated from ``get_group`` when the title contains no
            square brackets after pre-processing.
    """
    content_title = self.pre_process(raw_title.strip())  # normalise brackets
    group = self.get_group(content_title)  # first bracketed fragment
    match_obj = TITLE_RE.match(content_title)
    if match_obj is None:
        # Without this guard ``match_obj.groups()`` raises AttributeError.
        # ``analyse`` already treats a ``None`` result as "not parseable".
        logger.warning("No episode token found in title: %s", raw_title)
        return None

    season_info, episode_info, other = (
        part.strip() for part in match_obj.groups()
    )
    # Split the leading part into the bare name and its season marker.
    raw_name, season_raw, season = self.season_process(season_info)

    try:
        name, name_group = self.name_process(raw_name)
    except ValueError:
        # Best effort: keep the remaining fields even if no name was found.
        name, name_group = "", ""

    # Episode number.
    raw_episode = EPISODE_RE.search(episode_info)
    episode = int(raw_episode.group()) if raw_episode is not None else 0

    # Whatever follows the episode token carries the tags.
    sub, dpi, source = self.find_tags(other)
    return (
        name,
        season,
        season_raw,
        episode,
        sub,
        dpi,
        source,
        name_group,
        group,
    )
def analyse(self, raw) -> Episode:
try:
self._info.title, self._info.season_info.number, \
self._info.season_info.raw, self._info.ep_info.number,\
self._info.subtitle, self._info.dpi, self._info.source, \
self._info.title_info.group = self.process(raw)
return self._info
except:
logger.warning(f"ERROR match {raw}")
ret = self.process(raw)
if ret is None:
return None
name, season, sr, episode, \
sub, dpi, source, ng, group = ret
except Exception as e:
logger.error(f"ERROR match {raw} {e}")
return None
info = Episode()
info.title = name
info.season_info.number = season
info.season_info.raw = sr
info.ep_info.number = episode
info.subtitle = sub
info.dpi = dpi
info.source = source
info.title_info.group = ng
info.group = group
return info
if __name__ == "__main__":

3
auto_bangumi/test.sh Executable file
View File

@@ -0,0 +1,3 @@
#!/bin/bash
# Run unittest discovery on the "tests" directory, resolved relative to the
# current working directory. exec replaces this shell with the Python process,
# so the script's exit status is the test runner's exit status.
exec python -m unittest discover tests

View File

@@ -0,0 +1,53 @@
import unittest
from parser.analyser import RawParser
class TestRawParser(unittest.TestCase):
    """Unit tests for RawParser and its static helper methods."""

    def test_raw_parser(self):
        """analyse() extracts title, resolution, episode and season."""
        parser = RawParser()
        content = "【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"
        info = parser.analyse(content)

        self.assertEqual(info.title, "Komi-san wa, Komyushou Desu.")
        self.assertEqual(info.dpi, "1920X1080")
        self.assertEqual(info.ep_info.number, 22)
        self.assertEqual(info.season_info.number, 2)

        content = "【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第一季 Komi-san wa, Komyushou Desu. S01】【23】【GB_MP4】【4K】"
        info = parser.analyse(content)

        self.assertEqual(info.title, "Komi-san wa, Komyushou Desu.")
        self.assertEqual(info.dpi, "4K")
        self.assertEqual(info.ep_info.number, 23)
        self.assertEqual(info.season_info.number, 1)

    def test_pre_process(self):
        """pre_process() turns full-width brackets into ASCII brackets."""
        content = "【幻樱字幕组】【4月新番】"
        expected_content = "[幻樱字幕组][4月新番]"

        self.assertEqual(RawParser.pre_process(content), expected_content)

    def test_get_group(self):
        """get_group() returns the first bracketed fragment of the title."""
        content = "【幻樱字幕组】【4月新番】【古见同学有交流障碍症 第二季 Komi-san wa, Komyushou Desu. S02】【22】【GB_MP4】【1920X1080】"
        content = RawParser.pre_process(content)
        expected_content = "幻樱字幕组"

        self.assertEqual(RawParser.get_group(content), expected_content)

    def test_find_tags(self):
        """find_tags() returns (subtitle, resolution, source) for each case."""
        cases = [
            ("[GB_MP4] [1920X1080] [bilibili]", ["GB", "1920X1080", "bilibili"]),
            ("[GB_MP4] [1920X1080]", ["GB", "1920X1080", None]),
            # "_MP4" is stripped from the subtitle tag, leaving the bare marker.
            ("[简_MP4] [Bilibili]", ["简", None, "Bilibili"]),
            ("[简_MP4] [Bilibili] [Web]", ["简", None, "Web"]),
            ("dfkajflkdaj dfkadjlkfa [Web]", [None, None, "Web"]),
            ("dfkajflkdaj dfkadjlkfa [Web] [Web]", [None, None, "Web"]),
        ]

        for content, expected in cases:
            # subTest reports every failing case instead of stopping at the
            # first one; comparing whole lists checks length and items at once.
            with self.subTest(content=content):
                self.assertEqual(list(RawParser.find_tags(content)), expected)