Fix bugs, add an AniDB database scraper

EstrellaXD
2022-06-07 10:46:33 +08:00
parent 3c2e0f463c
commit ca3ec58b4b
10 changed files with 102 additions and 7 deletions

.gitignore (vendored, 2 changes)

@@ -163,5 +163,5 @@ cython_debug/
 /auto_bangumi/const_dev.py
 /config/bangumi.json
 /auto_bangumi/tester.py
-/source/names.txt
+/resource/names.txt


@@ -4,7 +4,7 @@
   <content url="file://$MODULE_DIR$">
     <sourceFolder url="file://$MODULE_DIR$/auto_bangumi" isTestSource="false" />
   </content>
-  <orderEntry type="jdk" jdkName="Python 3.10" jdkType="Python SDK" />
+  <orderEntry type="jdk" jdkName="Python 3.10 (np_veclib)" jdkType="Python SDK" />
   <orderEntry type="sourceFolder" forTests="false" />
 </component>
</module>

.idea/misc.xml (generated, 2 changes)

@@ -1,4 +1,4 @@
 <?xml version="1.0" encoding="UTF-8"?>
 <project version="4">
-  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10" project-jdk-type="Python SDK" />
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.10 (np_veclib)" project-jdk-type="Python SDK" />
 </project>


@@ -29,6 +29,7 @@ def load_data_file():
     if bangumi_data["data_version"] != settings.data_version or bangumi_data["rss_link"] != settings.rss_link:
         bangumi_data["bangumi_info"] = []
         bangumi_data["rss_link"] = settings.rss_link
+        logger.info("Rebuilding data information...")
     return bangumi_data


@@ -114,5 +114,13 @@ class DownloadClient:
 if __name__ == "__main__":
-    put = DownloadClient()
-    put.add_rules()
+    try:
+        from const_dev import DEV_SETTINGS
+    except ModuleNotFoundError:
+        logger.debug("Please copy `const_dev.py` to `const_dev.py` to use custom settings")
+    settings.init(DEV_SETTINGS)
+    client = getClient()
+    try:
+        client.rss_remove_item(item_path="Mikan_RSS")
+    except ConflictError:
+        logger.info("No feed exists, start adding feed.")


@@ -33,7 +33,7 @@ class FullSeasonGet:
             season = ""
         else:
             season = self.season
-        search_str = re.sub(r"[& ]", "+",
+        search_str = re.sub(r"[\W_]", "+",
                             f"{self.group} {self.bangumi_name} {season} {self.subtitle} {self.source} {self.dpi}")
         season = requests.get(
             f"https://mikanani.me/RSS/Search?searchstr={search_str}"


@@ -63,7 +63,18 @@ class QbDownloader:
             self._client.rss_remove_item(item_path)
         except Conflict409Error as e:
             logger.exception(e)
             logger.info("Add new RSS")
+            raise ConflictError()

     def rss_set_rule(self, rule_name, rule_def):
         self._client.rss_set_rule(rule_name, rule_def)
+
+if __name__ == "__main__":
+    try:
+        from const_dev import DEV_SETTINGS
+    except ModuleNotFoundError:
+        logger.debug("Please copy `const_dev.py` to `const_dev.py` to use custom settings")
+    settings.init(DEV_SETTINGS)
+    client = QbDownloader(settings.host_ip, settings.user_name, settings.password)
+    client.rss_remove_item("Mikan_RSS")


@@ -2,5 +2,4 @@ qbittorrent-api
 bs4
 requests
 lxml
-zhconv

resource/anidb.py (new file, 76 lines)

@@ -0,0 +1,76 @@
#! /usr/bin/python
import re

import requests
from bs4 import BeautifulSoup

from utils import json_config

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
}


def get_html(url):
    requests.adapters.DEFAULT_RETRIES = 5  # raise the retry count
    s = requests.session()
    s.keep_alive = False  # drop idle connections
    html = s.get(url=url, headers=header).text
    print("get html success")
    return html


def get_list(year, season):
    # season: 1 = spring, 2 = summer, 3 = autumn, 4 = winter
    season = ["spring", "summer", "autumn", "winter"][season - 1]
    url = "https://anidb.net/anime/season/%s/%s/" % (year, season)
    html = get_html(url)
    # Pull the numeric anime ids out of the season listing page.
    ids = re.findall(r'<a href="/anime/(\d+)"><picture>', html)
    return ids


def get_title(id):
    url = "https://anidb.net/anime/%s" % id
    soup = BeautifulSoup(get_html(url), "lxml")
    titles = soup.find("div", id="tab_2_pane")
    g = titles.findAll("th")
    v = titles.findAll("td")
    t_dic = {
        "id": id,
        "main": None,
        "verified": None,
        "en": None,
        "chs": None,
        "cht": None,
        "jp": None,
        "synonym": None,
        "kana": None
    }
    for i in range(0, len(g)):
        if g[i].text == "Main Title":
            t_dic["main"] = re.sub(r"\(a\d+\)", "", v[i].text).strip("\n\t")
        elif g[i].text == "Official Title":
            if re.search("verified", str(v[i])):
                t_dic["verified"] = v[i].find("label").text
            if re.search("language: english", str(v[i])):
                t_dic["en"] = v[i].find("label").text
            elif re.search("span>zh-Hant", str(v[i])):
                t_dic["cht"] = v[i].find("label").text
            elif re.search("span>zh-Hans", str(v[i])):
                t_dic["chs"] = v[i].find("label").text
            elif re.search("language: japanese", str(v[i])):
                t_dic["jp"] = v[i].find("label").text
        elif g[i].text == "Synonym":
            t_dic["synonym"] = v[i].text
        elif g[i].text == "Kana":
            t_dic["kana"] = v[i].text
    return t_dic


if __name__ == "__main__":
    print("start")
    # year, season (1 = spring)
    id_list = get_list(2022, 1)
    for i in id_list:
        url = f"http://api.anidb.net:9001/httpapi?request=anime&client=autobangumi&clientver=1&protover=1&aid={i}"
        req = requests.get(url)
        soup = BeautifulSoup(req.text, "xml")
        # Official titles are <title type="official"> elements inside the
        # <titles> block of the HTTP API response.
        titles = soup.find_all("title", attrs={"type": "official"})
        for item in titles:
            print(item.text)
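
For reference, a minimal driver for the two scraper functions above; the year and season values are examples only, and the printed fields just echo keys from get_title's result dict:

# Example only: list the spring 2022 season and print a few title fields.
season_ids = get_list(2022, 1)  # 1 = spring
for aid in season_ids:
    info = get_title(aid)
    print(info["id"], info["main"], info["chs"], info["jp"])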