Move rss_analyser.py to database

This commit is contained in:
EstrellaXD
2023-05-05 10:04:49 +08:00
parent 7dbdda2d67
commit 3ac9049627
5 changed files with 59 additions and 36 deletions

View File

@@ -115,7 +115,7 @@ class DataOperator(DataConnector):
self._conn.commit()
return self._cursor.rowcount == 1
def search(self, _id: int) -> BangumiData | None:
def search_id(self, _id: int) -> BangumiData | None:
self._cursor.execute('''
SELECT * FROM bangumi WHERE id = :id
''', {"id": _id})
@@ -126,6 +126,17 @@ class DataOperator(DataConnector):
dict_data = dict(zip(keys, values))
return self.db_to_data(dict_data)
def search_official_title(self, official_title: str) -> BangumiData | None:
    """Look up a single bangumi row by its official (Chinese) title.

    Returns the row decoded into a BangumiData via ``db_to_data``, or
    None when no row matches *official_title*.
    """
    self._cursor.execute('''
        SELECT * FROM bangumi WHERE official_title = :official_title
        ''', {"official_title": official_title})
    row = self._cursor.fetchone()
    if row is None:
        return None
    # Pair each column name from the cursor metadata with its value.
    column_names = [column[0] for column in self._cursor.description]
    return self.db_to_data(dict(zip(column_names, row)))
def match_title(self, title: str) -> bool:
# Select all title_raw
self._cursor.execute('''
@@ -138,6 +149,19 @@ class DataOperator(DataConnector):
return True
return False
def not_exist_titles(self, titles: list[str]) -> list[str]:
    """Return the titles that are not yet known to the database.

    A title is considered "known" when any stored ``title_raw`` occurs as
    a substring of it.  The input list is not modified; a new list of the
    remaining titles is returned.
    """
    # Fetch every stored raw title once, up front.
    self._cursor.execute('''
        SELECT title_raw FROM bangumi
        ''')
    title_raws = [row[0] for row in self._cursor.fetchall()]
    # BUGFIX: the previous version called titles.remove(...) while
    # iterating `titles`, which skips the element following each removal
    # (and mutated the caller's list).  Build a fresh list instead.
    return [
        title for title in titles
        if not any(title_raw in title for title_raw in title_raws)
    ]
def gen_id(self) -> int:
self._cursor.execute('''
SELECT id FROM bangumi ORDER BY id DESC LIMIT 1

View File

@@ -3,7 +3,7 @@ from dataclasses import dataclass
class BangumiData(BaseModel):
id: int | None = Field(None, alias="id", title="番剧ID")
id: int = Field(..., alias="id", title="番剧ID")
official_title: str = Field(..., alias="official_title", title="番剧中文名")
year: int | None = Field(None, alias="year", title="番剧年份")
title_raw: str = Field(..., alias="title_raw", title="番剧原名")
@@ -13,11 +13,11 @@ class BangumiData(BaseModel):
dpi: str | None = Field(None, alias="dpi", title="分辨率")
source: str | None = Field(None, alias="source", title="来源")
subtitle: str | None = Field(None, alias="subtitle", title="字幕")
added: bool = Field(False, alias="added", title="是否已添加")
eps_collect: bool = Field(False, alias="eps_collect", title="是否已收集")
offset: int = Field(0, alias="offset", title="番剧偏移量")
filter: list[str] = Field(..., alias="filter", title="番剧过滤器")
rss: list[str] = Field(None, alias="rss", title="番剧RSS链接")
poster_link: str | None = Field(None, alias="poster_link", title="番剧海报链接")
class ProgramData(BaseModel):

View File

@@ -1,6 +1,7 @@
import re
import xml.etree.ElementTree
from dataclasses import dataclass
from bs4 import BeautifulSoup
from .request_url import RequestURL
from module.conf import settings
@@ -20,10 +21,12 @@ class RequestContent(RequestURL):
soup = self.get_xml(_url)
torrent_titles = []
torrent_urls = []
torrent_homepage = []
for item in soup.findall("./channel/item"):
torrent_titles.append(item.find("title").text)
torrent_urls.append(item.find("enclosure").attrib['url'])
torrent_homepage.append(item.find("link").text)
torrents = []
for _title, torrent_url in zip(torrent_titles, torrent_urls):
@@ -34,7 +37,16 @@ class RequestContent(RequestURL):
torrents.append(TorrentInfo(_title, torrent_url))
return torrents
def get_xml(self, _url) -> xml.etree.ElementTree.ElementTree:
def get_poster(self, _url):
    """Scrape the poster image URL from a bangumi detail page.

    Fetches *_url*, finds the ``div.bangumi-poster`` element and extracts
    the URL embedded in its inline ``style`` (``url('...')``).  Returns
    None when the page has no such element or no url() value.
    """
    content = self.get_html(_url).text
    soup = BeautifulSoup(content, 'html.parser')
    div = soup.find('div', {'class': 'bangumi-poster'})
    # BUGFIX: find() returns None when the element is missing; the old
    # code would raise AttributeError on div.get(...).
    if div is None:
        return None
    style = div.get('style')
    # BUGFIX: guard against a style attribute without a url('...') part,
    # which previously raised IndexError on split()[1].
    if style and "url('" in style:
        return style.split("url('")[1].split("')")[0]
    return None
def get_xml(self, _url) -> xml.etree.ElementTree.Element:
    """Fetch *_url* and parse the response body as XML, returning the root element."""
    response_text = self.get_url(_url).text
    return xml.etree.ElementTree.fromstring(response_text)
# API JSON
@@ -52,3 +64,4 @@ class RequestContent(RequestURL):
def get_content(self, _url):
return self.get_url(_url).content

View File

@@ -43,7 +43,7 @@ class TitleParser:
self,
raw: str,
settings: Config,
_id: int | None = None
_id: int = 0
) -> BangumiData:
language = settings.rss_parser.language
try:
@@ -74,10 +74,10 @@ class TitleParser:
dpi=episode.resolution,
source=episode.source,
subtitle=episode.sub,
added=False,
eps_collect=True if episode.episode > 1 else False,
offset=0,
filter=settings.rss_parser.filter
filter=settings.rss_parser.filter,
rss=rss_link,
)
logger.debug(f"RAW:{raw} >> {episode.title_en}")
return data

View File

@@ -3,8 +3,8 @@ import logging
from module.network import RequestContent
from module.parser import TitleParser
from module.core import DownloadClient
from module.models import BangumiData, Config
from module.database import DataOperator
logger = logging.getLogger(__name__)
@@ -14,37 +14,24 @@ class RSSAnalyser:
self._title_analyser = TitleParser()
self.settings = settings
@staticmethod
def find_id(bangumi_info: list[BangumiData]) -> int:
    """Return the largest bangumi id in *bangumi_info*, or 0 when the list is empty."""
    largest = 0
    for entry in bangumi_info:
        largest = max(largest, entry.id)
    return largest
def rss_to_datas(self, bangumi_info: list[BangumiData], rss_link: str) -> list[BangumiData]:
def rss_to_datas(self, rss_link: str) -> list[BangumiData]:
with RequestContent() as req:
rss_torrents = req.get_torrents(rss_link)
# Find largest bangumi id
_id = self.find_id(bangumi_info)
for torrent in rss_torrents:
raw_title = torrent.name
extra_add = True
if bangumi_info is not []:
for info in bangumi_info:
if re.search(info.title_raw, raw_title) is not None:
logger.debug(f"Had added {info.official_title} in auto_download rule before")
extra_add = False
break
if extra_add:
_id += 1
title_list = [torrent.name for torrent in rss_torrents]
data_list = []
with DataOperator() as op:
add_title_list = op.not_exist_titles(title_list)
_id = op.gen_id()
for raw_title in add_title_list:
data = self._title_analyser.raw_parser(
raw=raw_title,
_id=_id,
settings=self.settings)
if data is not None and data.official_title not in bangumi_info:
bangumi_info.append(data)
return bangumi_info
if data is not None and op.match_title(data.official_title) is None:
data_list.append(data)
_id += 1
op.insert_list(data_list)
return data_list
def rss_to_data(self, url, _filter: bool = True) -> BangumiData:
with RequestContent() as req:
@@ -59,10 +46,9 @@ class RSSAnalyser:
except Exception as e:
logger.debug(e)
def run(self, bangumi_info: list[BangumiData], rss_link: str):
def run(self, rss_link: str):
logger.info("Start collecting RSS info.")
try:
self.rss_to_datas(bangumi_info, rss_link)
return self.rss_to_datas(rss_link)
except Exception as e:
logger.debug(e)
logger.info("Finished")