Files
Auto_Bangumi/test/test.py
2022-05-19 17:20:55 +08:00

93 lines
3.5 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import re
import sys
import time
import requests
from bs4 import BeautifulSoup
import json
class CollectRSS:
    """Collect bangumi titles from an RSS feed and merge them into bangumi.json.

    On construction the matching rules are loaded from ``rule.json`` and the
    feed's ``<item>`` elements are fetched.  ``get_info_list`` then extracts a
    cleaned title from every item whose name matches a rule, and
    ``put_info_json`` appends the titles that are not yet present in ``info``
    and writes the result to ``bangumi.json``.
    """

    # Delimiters used to cut a release name into fields.
    # NOTE(review): `\|\|` matches a literal "||"; this looks like it may have
    # been two other escaped characters lost in transit -- confirm against the
    # upstream file before changing it.
    _SPLIT_RULE = re.compile(r"\[|\]|\【|\】|\★|\|\|\(|\)")
    # Trailing " -" left over once the episode number has been removed.
    _LAST_RULE = re.compile(r"(.*)( \-)", re.I)
    # Non-Latin-1 runs, " <1-2 digits>" tokens and middle dots are dropped.
    _NOISE_RULE = re.compile(r"[^\x00-\xff]{1,}| \d{1,2}|\·")
    # Separators between alternative names inside one field.
    _ALT_NAME_RULE = re.compile(r"\/|\_")
    # A leading "S<1-2 digits>" token; group 2 is whatever follows it.
    _LEADING_SEASON_RULE = re.compile(r"(S\d{1,2}(.*))", re.I)
    # Splits "<title><season marker>" into title (group 1) and season (group 2).
    _SEASON_RULE = re.compile(r"(.*)(Season \d{1,2}|S\d{1,2}|第.*季)", re.I)

    def __init__(self, info):
        """Load the rules and download the RSS items.

        Args:
            info: list of ``{"title": ..., "season": ...}`` dicts already
                known; it is extended in place by ``put_info_json``.
        """
        self.bangumi_list = []
        # Read as UTF-8 explicitly, matching how this class writes JSON,
        # instead of relying on the platform's default encoding.
        with open("rule.json", encoding="utf8") as f:
            self.rules = json.load(f)
        url = "https://mikanani.me/RSS/Classic"
        # The second positional argument of requests.get is `params` (the
        # query string), not an encoding, so the encoding is set on the
        # response instead.  A timeout keeps the script from hanging forever.
        rss = requests.get(url, timeout=30)
        rss.encoding = 'utf-8'
        soup = BeautifulSoup(rss.text, 'xml')
        self.items = soup.find_all('item')
        self.info = info

    def _clean_title(self, bangumi_title):
        """Reduce one raw name field to the title that gets stored."""
        sub_title = self._NOISE_RULE.sub("", bangumi_title)
        candidates = [part for part in self._ALT_NAME_RULE.split(sub_title) if part != '']
        # Of the alternative names, keep the longest one; if nothing survives
        # the clean-up, fall back to the untouched field.
        pre_name = max(candidates, key=len, default='').strip()
        if pre_name != '':
            bangumi_title = pre_name
        # Applied twice so that two stacked " -" suffixes are both removed.
        for _ in range(2):
            match_obj = self._LAST_RULE.match(bangumi_title)
            if match_obj is not None:
                bangumi_title = match_obj.group(1).strip()
        match_obj = self._LEADING_SEASON_RULE.match(bangumi_title)
        if match_obj is not None:
            bangumi_title = match_obj.group(2).strip()
        return bangumi_title

    def get_info_list(self):
        """Fill ``self.bangumi_list`` with the unique titles found in ``self.items``.

        For every item, the first rule with a ``group_name`` pattern found in
        the item name decides which field (``name_position``) holds the title.
        Items that match no rule are reported on stdout; items without a
        string title are skipped.
        """
        for item in self.items:
            title_tag = getattr(item, "title", None)
            name = title_tag.string if title_tag is not None else None
            if name is None:
                # No usable title text: nothing to match against.
                continue
            exit_flag = False
            for rule in self.rules:
                for group in rule["group_name"]:
                    if not re.search(group, name):
                        continue
                    exit_flag = True
                    fields = [
                        part for part in self._SPLIT_RULE.split(name)
                        if part not in ('', ' ')
                    ]
                    try:
                        raw_title = fields[rule['name_position']].strip()
                    except IndexError:
                        # The name has fewer fields than the rule expects;
                        # move on to the next pattern of this rule.
                        continue
                    bangumi_title = self._clean_title(raw_title)
                    if bangumi_title not in self.bangumi_list:
                        self.bangumi_list.append(bangumi_title)
                    break
                if exit_flag:
                    # A rule claimed this item; later rules are not consulted.
                    break
            if not exit_flag:
                print(f"ERROR Not match with {name}")

    def put_info_json(self):
        """Append titles not yet in ``self.info`` and write it to ``bangumi.json``.

        Each collected title is split into a title and a season marker
        (empty string when there is none).  Every added entry is logged to
        stdout with a timestamp.
        """
        # NOTE(review): only titles present before this call are checked, so
        # two collected entries that reduce to the same title are both added
        # in one run -- confirm whether that is intended.
        had_data = {data["title"] for data in self.info}
        for title in self.bangumi_list:
            match_title_season = self._SEASON_RULE.match(title)
            if match_title_season is not None:
                json_title = match_title_season.group(1).strip()
                json_season = match_title_season.group(2)
            else:
                json_season = ''
                json_title = title
            if json_title not in had_data:
                self.info.append({
                    "title": json_title,
                    "season": json_season
                })
                sys.stdout.write(f"[{time.strftime('%Y-%m-%d %X')}] add {json_title} {json_season}" + "\n")
                sys.stdout.flush()
        with open("bangumi.json", 'w', encoding='utf8') as f:
            json.dump(self.info, f, indent=4, separators=(',', ': '), ensure_ascii=False)
if __name__ == "__main__":
    # Script entry point: load the known entries, collect new titles from the
    # feed, then write the merged list back to bangumi.json.
    # bangumi.json is written as UTF-8 with ensure_ascii=False, so it has to
    # be read back as UTF-8 too instead of with the platform default encoding.
    with open("bangumi.json", encoding="utf8") as f:
        info = json.load(f)
    cr = CollectRSS(info)
    cr.get_info_list()
    cr.put_info_json()