From 737d2f3bc615a0576d9ec0c262a2135fa5015561 Mon Sep 17 00:00:00 2001 From: Reaper Date: Thu, 2 Oct 2025 20:03:28 +0800 Subject: [PATCH 1/2] =?UTF-8?q?=E4=BC=98=E5=8C=96=E7=9F=A5=E8=A1=8C=20?= =?UTF-8?q?=E6=9E=81=E9=80=9F=E4=B9=8B=E6=98=9F=20=E6=A1=86=E6=9E=B6?= =?UTF-8?q?=E8=A7=A3=E6=9E=90?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/modules/indexer/parser/bitpt.py | 135 +++++++++++++------------- app/modules/indexer/parser/zhixing.py | 80 +++++++++------ 2 files changed, 115 insertions(+), 100 deletions(-) diff --git a/app/modules/indexer/parser/bitpt.py b/app/modules/indexer/parser/bitpt.py index 082dd088..bb12b26b 100644 --- a/app/modules/indexer/parser/bitpt.py +++ b/app/modules/indexer/parser/bitpt.py @@ -1,21 +1,22 @@ +# +# 极速之星 https://bitpt.cn/ +# author: ThedoRap +# time: 2025-10-02 +# # -*- coding: utf-8 -*- -import json -from typing import Optional, Tuple import re +from typing import Optional, Tuple +from urllib.parse import urljoin, urlencode +from bs4 import BeautifulSoup from app.modules.indexer.parser import SiteParserBase, SiteSchema from app.utils.string import StringUtils -from bs4 import BeautifulSoup -from urllib.parse import urljoin class BitptSiteUserInfo(SiteParserBase): schema = SiteSchema.Bitpt def _parse_site_page(self, html_text: str): - """ - 获取站点页面地址 - """ - self._user_basic_page = "userdetails.php?uid={uid}" # uid 需要在解析时替换 + self._user_basic_page = "userdetails.php?uid={uid}" self._user_detail_page = None self._user_basic_params = {} self._user_traffic_page = None @@ -23,24 +24,15 @@ class BitptSiteUserInfo(SiteParserBase): self._user_mail_unread_page = None self._mail_unread_params = {} self._torrent_seeding_page = "browse.php?t=myseed" - self._torrent_seeding_params = { - "st": "2", - "d": "desc" - } + self._torrent_seeding_params = {"st": "2", "d": "desc"} self._torrent_seeding_headers = {} self._addition_headers = {} def _parse_logged_in(self, html_text): - """ - 判断是否登录成功, 通过判断是否存在用户信息 - """ soup = BeautifulSoup(html_text, 'html.parser') return bool(soup.find(id='userinfotop')) def _parse_user_base_info(self, html_text: str): - """ - 解析用户基本信息,这里把_parse_user_traffic_info和_parse_user_detail_info合并到这里 - """ if not html_text: return None soup = BeautifulSoup(html_text, 'html.parser') @@ -67,80 +59,55 @@ class BitptSiteUserInfo(SiteParserBase): self.ratio = float(info_dict.get('共享率')) if '共享率' in info_dict else 0 bonus_str = info_dict.get('星辰', '') self.bonus = float(re.search(r'累计([\d\.]+)', bonus_str).group(1)) if re.search(r'累计([\d\.]+)', bonus_str) else 0 - self.message_unread = 0 # 暂无消息解析 + self.message_unread = 0 - # 做种信息从页面底部提取 - seeding_info = soup.find('div', style="margin:0 auto;width:90%;font-size:14px;margin-top:10px;margin-bottom:10px;text-align:center;") - if seeding_info: - seeding_link = seeding_info.find_all('a')[1].text if len(seeding_info.find_all('a')) > 1 else '' - match = re.search(r'当前上传的种子\((\d+)个, 共([\d\.]+ [KMGT]B)\)', seeding_link) - if match: - self.seeding = int(match.group(1)) - self.seeding_size = StringUtils.num_filesize(match.group(2)) - else: - self.seeding = 0 - self.seeding_size = 0 + if self._torrent_seeding_page: + self.seeding = 0 + self.seeding_size = 0 + else: + seeding_info = soup.find('div', style="margin:0 auto;width:90%;font-size:14px;margin-top:10px;margin-bottom:10px;text-align:center;") + if seeding_info: + seeding_link = seeding_info.find_all('a')[1].text if len(seeding_info.find_all('a')) > 1 else '' + match = re.search(r'当前上传的种子\((\d+)个, 共([\d\.]+ [KMGT]B)\)', seeding_link) + if match: + self.seeding = int(match.group(1)) + self.seeding_size = StringUtils.num_filesize(match.group(2)) + else: + self.seeding = 0 + self.seeding_size = 0 def _parse_user_traffic_info(self, html_text: str): - """ - 解析用户流量信息 - """ pass def _parse_user_detail_info(self, html_text: str): - """ - 解析用户详细信息 - """ pass - def _parse_user_torrent_seeding_info(self, html_text: str, multi_page: Optional[bool] = False) -> Optional[str]: - """ - 解析用户做种信息 - """ + def _parse_user_torrent_seeding_page_info(self, html_text: str) -> Tuple[int, int]: if not html_text: - return None + return 0, 0 soup = BeautifulSoup(html_text, 'html.parser') - torrents = soup.find_all('tr', class_=['btr0', 'btr1']) + torrents = soup.find_all('tr', id=re.compile(r'^t\d+')) page_seeding = 0 page_seeding_size = 0 for torrent in torrents: size_td = torrent.find('td', class_='r') if size_td: - size_text = size_td.find('a').text if size_td.find('a') else size_td.text + size_text = size_td.find('a').text if size_td.find('a') else size_td.text.strip() page_seeding += 1 page_seeding_size += StringUtils.num_filesize(size_text) - - self.seeding += page_seeding - self.seeding_size += page_seeding_size - - # 是否存在下页数据 - pager = soup.find('div', class_='pager') - next_page = None - if pager: - next_link = pager.find('a', string=re.compile('下一页')) - if next_link: - next_page = next_link['href'] - - return next_page + return page_seeding, page_seeding_size def _parse_message_unread_links(self, html_text: str, msg_links: list) -> Optional[str]: - """ - 解析未读消息链接,这里直接读出详情 - """ pass def _parse_message_content(self, html_text) -> Tuple[Optional[str], Optional[str], Optional[str]]: - """ - 解析消息内容 - """ + pass + + def _parse_user_torrent_seeding_info(self, html_text: str): pass def parse(self): - """ - 解析站点信息 - """ super().parse() - # 先从首页解析userid if self._index_html: soup = BeautifulSoup(self._index_html, 'html.parser') user_link = soup.find('a', href=re.compile(r'userdetails\.php\?uid=\d+')) @@ -148,8 +115,40 @@ class BitptSiteUserInfo(SiteParserBase): uid_match = re.search(r'uid=(\d+)', user_link['href']) if uid_match: self.userid = uid_match.group(1) - # 如果有userid,则格式化_user_basic_page + if self.userid and self._user_basic_page: basic_url = self._user_basic_page.format(uid=self.userid) basic_html = self._get_page_content(url=urljoin(self._base_url, basic_url)) - self._parse_user_base_info(basic_html) \ No newline at end of file + self._parse_user_base_info(basic_html) + + if self._torrent_seeding_page: + seeding_base = self._torrent_seeding_page + seeding_base_url = urljoin(self._base_url, seeding_base) + params = self._torrent_seeding_params.copy() + page_num = 1 + while True: + params['p'] = page_num + query_string = urlencode(params) + full_url = f"{seeding_base_url}?{query_string}" + seeding_html = self._get_page_content(url=full_url) + page_seeding, page_seeding_size = self._parse_user_torrent_seeding_page_info(seeding_html) + self.seeding += page_seeding + self.seeding_size += page_seeding_size + if page_seeding == 0: + break + page_num += 1 + + # 🔑 最终对外统一转字符串 + self.userid = str(self.userid or "") + self.username = str(self.username or "") + self.user_level = str(self.user_level or "") + self.join_at = str(self.join_at or "") + + self.upload = str(self.upload or 0) + self.download = str(self.download or 0) + self.ratio = str(self.ratio or 0) + self.bonus = str(self.bonus or 0.0) + self.message_unread = str(self.message_unread or 0) + + self.seeding = str(self.seeding or 0) + self.seeding_size = str(self.seeding_size or 0) \ No newline at end of file diff --git a/app/modules/indexer/parser/zhixing.py b/app/modules/indexer/parser/zhixing.py index 719f2588..e7234a10 100644 --- a/app/modules/indexer/parser/zhixing.py +++ b/app/modules/indexer/parser/zhixing.py @@ -1,3 +1,8 @@ +# +# 知行 http://pt.zhixing.bjtu.edu.cn/ +# author: ThedoRap +# time: 2025-10-02 +# # -*- coding: utf-8 -*- import re from typing import Optional, Tuple @@ -22,7 +27,7 @@ class ZhixingSiteUserInfo(SiteParserBase): self._sys_mail_unread_page = None self._user_mail_unread_page = None self._mail_unread_params = {} - self._torrent_seeding_page = "user/{uid}/seeding" + self._torrent_seeding_base = "user/{uid}/seeding/" self._torrent_seeding_params = {} self._torrent_seeding_headers = {} self._addition_headers = {} @@ -76,27 +81,25 @@ class ZhixingSiteUserInfo(SiteParserBase): self.bonus = float(info_dict.get('保种积分')) if '保种积分' in info_dict else 0.0 self.message_unread = 0 # 暂无消息解析 - self.seeding = int(info_dict.get('当前保种数量')) if '当前保种数量' in info_dict else 0 - self.seeding_size = num_filesize_safe(info_dict.get('当前保种容量')) if '当前保种容量' in info_dict else 0 + if hasattr(self, '_torrent_seeding_base') and self._torrent_seeding_base: + self.seeding = 0 + self.seeding_size = 0 + else: + self.seeding = int(info_dict.get('当前保种数量')) if '当前保种数量' in info_dict else 0 + self.seeding_size = num_filesize_safe(info_dict.get('当前保种容量')) if '当前保种容量' in info_dict else 0 def _parse_user_traffic_info(self, html_text: str): - """ - 解析用户流量信息 - """ pass def _parse_user_detail_info(self, html_text: str): - """ - 解析用户详细信息 - """ pass - def _parse_user_torrent_seeding_info(self, html_text: str, multi_page: Optional[bool] = False) -> Optional[str]: + def _parse_user_torrent_seeding_page_info(self, html_text: str) -> Tuple[int, int]: """ - 解析用户做种信息 + 解析用户做种信息单页,返回本页数量和大小 """ if not html_text: - return None + return 0, 0 soup = BeautifulSoup(html_text, 'html.parser') torrents = soup.find_all('tr', id=re.compile(r'^t\d+')) page_seeding = 0 @@ -107,30 +110,17 @@ class ZhixingSiteUserInfo(SiteParserBase): size_text = size_td.find('a').text if size_td.find('a') else size_td.text.strip() page_seeding += 1 page_seeding_size += StringUtils.num_filesize(size_text) - - self.seeding += page_seeding - self.seeding_size += page_seeding_size - - # 是否存在下页数据 - next_page = None - # 假设有分页元素,类似
中的 下一页 - pager = soup.find('div', class_='pager') - if pager: - next_link = pager.find('a', string=re.compile('下一页')) - if next_link: - next_page = next_link['href'] - - return next_page + return page_seeding, page_seeding_size def _parse_message_unread_links(self, html_text: str, msg_links: list) -> Optional[str]: - """ - 解析未读消息链接,这里直接读出详情 - """ pass def _parse_message_content(self, html_text) -> Tuple[Optional[str], Optional[str], Optional[str]]: + pass + + def _parse_user_torrent_seeding_info(self, html_text: str): """ - 解析消息内容 + 占位,避免抽象类报错 """ pass @@ -153,5 +143,31 @@ class ZhixingSiteUserInfo(SiteParserBase): basic_url = self._user_basic_page.format(uid=self.userid) basic_html = self._get_page_content(url=urljoin(self._base_url, basic_url)) self._parse_user_base_info(basic_html) - if self._torrent_seeding_page: - self._torrent_seeding_page = self._torrent_seeding_page.format(uid=self.userid) + if hasattr(self, '_torrent_seeding_base') and self._torrent_seeding_base: + seeding_base = self._torrent_seeding_base.format(uid=self.userid) + seeding_base_url = urljoin(self._base_url, seeding_base) + page_num = 1 + while True: + seeding_url = f"{seeding_base_url}/p{page_num}" + seeding_html = self._get_page_content(url=seeding_url) + page_seeding, page_seeding_size = self._parse_user_torrent_seeding_page_info(seeding_html) + self.seeding += page_seeding + self.seeding_size += page_seeding_size + if page_seeding == 0: + break + page_num += 1 + + # 🔑 最终对外统一转字符串,避免 join 报错 + self.userid = str(self.userid or "") + self.username = str(self.username or "") + self.user_level = str(self.user_level or "") + self.join_at = str(self.join_at or "") + + self.upload = str(self.upload or 0) + self.download = str(self.download or 0) + self.ratio = str(self.ratio or 0) + self.bonus = str(self.bonus or 0.0) + self.message_unread = str(self.message_unread or 0) + + self.seeding = str(self.seeding or 0) + self.seeding_size = str(self.seeding_size or 0) From b128b0ede26b908e4ae97c85fe87b3f0b6342835 Mon Sep 17 00:00:00 2001 From: Reaper Date: Thu, 2 Oct 2025 20:43:06 +0800 Subject: [PATCH 2/2] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E7=9F=A5=E8=A1=8C=20?= =?UTF-8?q?=E6=9E=81=E9=80=9F=E4=B9=8B=E6=98=9F=20=E6=A1=86=E6=9E=B6?= =?UTF-8?q?=E8=A7=A3=E6=9E=90=20=E5=81=9A=E7=A7=8D=E4=BF=A1=E6=81=AF?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/modules/indexer/parser/bitpt.py | 27 +++++++++++++++++---------- app/modules/indexer/parser/zhixing.py | 27 +++++++++++++++++++-------- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/app/modules/indexer/parser/bitpt.py b/app/modules/indexer/parser/bitpt.py index bb12b26b..b5d0102c 100644 --- a/app/modules/indexer/parser/bitpt.py +++ b/app/modules/indexer/parser/bitpt.py @@ -23,8 +23,8 @@ class BitptSiteUserInfo(SiteParserBase): self._sys_mail_unread_page = None self._user_mail_unread_page = None self._mail_unread_params = {} - self._torrent_seeding_page = "browse.php?t=myseed" - self._torrent_seeding_params = {"st": "2", "d": "desc"} + self._torrent_seeding_base = "browse.php" + self._torrent_seeding_params = {"t": "myseed", "st": "2", "d": "desc"} self._torrent_seeding_headers = {} self._addition_headers = {} @@ -61,7 +61,7 @@ class BitptSiteUserInfo(SiteParserBase): self.bonus = float(re.search(r'累计([\d\.]+)', bonus_str).group(1)) if re.search(r'累计([\d\.]+)', bonus_str) else 0 self.message_unread = 0 - if self._torrent_seeding_page: + if hasattr(self, '_torrent_seeding_base') and self._torrent_seeding_base: self.seeding = 0 self.seeding_size = 0 else: @@ -86,15 +86,23 @@ class BitptSiteUserInfo(SiteParserBase): if not html_text: return 0, 0 soup = BeautifulSoup(html_text, 'html.parser') - torrents = soup.find_all('tr', id=re.compile(r'^t\d+')) + torrent_table = soup.find('table', class_='torrenttable') + if not torrent_table: + return 0, 0 + rows = torrent_table.find_all('tr') + if len(rows) <= 1: + return 0, 0 + torrents = [row for row in rows[1:] if 'btr' in row.get('class', [])] page_seeding = 0 page_seeding_size = 0 for torrent in torrents: size_td = torrent.find('td', class_='r') if size_td: - size_text = size_td.find('a').text if size_td.find('a') else size_td.text.strip() - page_seeding += 1 - page_seeding_size += StringUtils.num_filesize(size_text) + size_a = size_td.find('a') + size_text = size_a.text.strip() if size_a else size_td.text.strip() + if size_text: + page_seeding += 1 + page_seeding_size += StringUtils.num_filesize(size_text) return page_seeding, page_seeding_size def _parse_message_unread_links(self, html_text: str, msg_links: list) -> Optional[str]: @@ -121,9 +129,8 @@ class BitptSiteUserInfo(SiteParserBase): basic_html = self._get_page_content(url=urljoin(self._base_url, basic_url)) self._parse_user_base_info(basic_html) - if self._torrent_seeding_page: - seeding_base = self._torrent_seeding_page - seeding_base_url = urljoin(self._base_url, seeding_base) + if hasattr(self, '_torrent_seeding_base') and self._torrent_seeding_base: + seeding_base_url = urljoin(self._base_url, self._torrent_seeding_base) params = self._torrent_seeding_params.copy() page_num = 1 while True: diff --git a/app/modules/indexer/parser/zhixing.py b/app/modules/indexer/parser/zhixing.py index e7234a10..d92a85cc 100644 --- a/app/modules/indexer/parser/zhixing.py +++ b/app/modules/indexer/parser/zhixing.py @@ -27,7 +27,7 @@ class ZhixingSiteUserInfo(SiteParserBase): self._sys_mail_unread_page = None self._user_mail_unread_page = None self._mail_unread_params = {} - self._torrent_seeding_base = "user/{uid}/seeding/" + self._torrent_seeding_base = "user/{uid}/seeding" self._torrent_seeding_params = {} self._torrent_seeding_headers = {} self._addition_headers = {} @@ -63,6 +63,8 @@ class ZhixingSiteUserInfo(SiteParserBase): value = re.split(r'\s*\(', value_text)[0].strip().split('查看')[0].strip() info_dict[key] = value + self._basic_info = info_dict # Save for fallback + self.userid = info_dict.get('UID') self.username = info_dict.get('用户名') self.user_level = info_dict.get('用户组') @@ -81,12 +83,9 @@ class ZhixingSiteUserInfo(SiteParserBase): self.bonus = float(info_dict.get('保种积分')) if '保种积分' in info_dict else 0.0 self.message_unread = 0 # 暂无消息解析 - if hasattr(self, '_torrent_seeding_base') and self._torrent_seeding_base: - self.seeding = 0 - self.seeding_size = 0 - else: - self.seeding = int(info_dict.get('当前保种数量')) if '当前保种数量' in info_dict else 0 - self.seeding_size = num_filesize_safe(info_dict.get('当前保种容量')) if '当前保种容量' in info_dict else 0 + # Temporarily set seeding from basic, will override or fallback later + self.seeding = int(info_dict.get('当前保种数量')) if '当前保种数量' in info_dict else 0 + self.seeding_size = num_filesize_safe(info_dict.get('当前保种容量')) if '当前保种容量' in info_dict else 0 def _parse_user_traffic_info(self, html_text: str): pass @@ -144,6 +143,8 @@ class ZhixingSiteUserInfo(SiteParserBase): basic_html = self._get_page_content(url=urljoin(self._base_url, basic_url)) self._parse_user_base_info(basic_html) if hasattr(self, '_torrent_seeding_base') and self._torrent_seeding_base: + self.seeding = 0 # Reset to sum from pages + self.seeding_size = 0 seeding_base = self._torrent_seeding_base.format(uid=self.userid) seeding_base_url = urljoin(self._base_url, seeding_base) page_num = 1 @@ -156,6 +157,16 @@ class ZhixingSiteUserInfo(SiteParserBase): if page_seeding == 0: break page_num += 1 + # Fallback to basic if no seeding found from pages + if self.seeding == 0 and hasattr(self, '_basic_info'): + def num_filesize_safe(s: str): + if s: + s = s.strip() + if re.match(r'^\d+(\.\d+)?$', s): + s += ' B' + return StringUtils.num_filesize(s) if s else 0 + self.seeding = int(self._basic_info.get('当前保种数量', 0)) + self.seeding_size = num_filesize_safe(self._basic_info.get('当前保种容量', '')) # 🔑 最终对外统一转字符串,避免 join 报错 self.userid = str(self.userid or "") @@ -170,4 +181,4 @@ class ZhixingSiteUserInfo(SiteParserBase): self.message_unread = str(self.message_unread or 0) self.seeding = str(self.seeding or 0) - self.seeding_size = str(self.seeding_size or 0) + self.seeding_size = str(self.seeding_size or 0) \ No newline at end of file