diff --git a/app/modules/indexer/parser/nexus_audiences.py b/app/modules/indexer/parser/nexus_audiences.py index f5b58c20..70096c51 100644 --- a/app/modules/indexer/parser/nexus_audiences.py +++ b/app/modules/indexer/parser/nexus_audiences.py @@ -24,6 +24,7 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): self._sys_mail_unread_page = None self.__next_mail_page = 1 self.__seen_unread_message_links = set() + self.__message_list_previews = {} def _parse_message_unread(self, html_text): """ @@ -64,10 +65,8 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): if not StringUtils.is_valid_html_element(html): return None - message_links = html.xpath( - '//tr[.//img[contains(concat(" ", normalize-space(@class), " "), " unreadpm ") ' - 'or @alt="Unread" or @title="未读"]]/td/a[contains(@href, "viewmessage")]/@href' - ) + message_links = self.__parse_table_unread_message_links(html) + message_links.extend(self.__parse_pm_item_unread_message_links(html)) new_message_links = self.__filter_new_message_links(message_links) if message_links and not new_message_links: logger.warn(f"{self._site_name} 未读消息页只发现重复消息链接,停止后续翻页") @@ -81,6 +80,30 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): return next_page + def _parse_message_content(self, html_text): + """ + 解析 Audiences 新版短消息详情页。 + """ + html = etree.HTML(html_text) + try: + if StringUtils.is_valid_html_element(html): + head = self.__extract_first_text( + html, + '//*[contains(concat(" ", normalize-space(@class), " "), " pm-hero__title ")]' + ) + date = self.__extract_pm_view_meta(html, "日期") + content = self.__extract_first_text( + html, + '//*[contains(concat(" ", normalize-space(@class), " "), " pm-view__body ")]' + ) + if not self.__is_empty_message_content(head, date, content): + return head, date, content + finally: + if html is not None: + del html + + return super()._parse_message_content(html_text) + def _pase_unread_msgs(self): """ 解析 Audiences 未读消息,避免异常分页重复通知和空详情通知。 @@ -110,6 +133,7 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): headers=self._mail_content_headers ) ) + head, date, content = self.__fill_empty_message_content_from_list(msg_link, head, date, content) logger.debug(f"{self._site_name} 标题 {head} 时间 {date} 内容 {content}") if self.__is_empty_message_content(head, date, content): logger.warn(f"{self._site_name} 信息链接 {msg_link} 解析结果为空,跳过消息通知") @@ -137,6 +161,7 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): """ self.__next_mail_page = 1 self.__seen_unread_message_links.clear() + self.__message_list_previews.clear() def __filter_new_message_links(self, message_links: list) -> list: """ @@ -151,6 +176,118 @@ class NexusAudiencesSiteUserInfo(NexusPhpSiteUserInfo): new_message_links.append(message_link) return new_message_links + @staticmethod + def __parse_table_unread_message_links(html) -> list: + """ + 解析 Audiences 旧版表格消息列表中的未读消息链接。 + """ + return html.xpath( + '//tr[.//img[contains(concat(" ", normalize-space(@class), " "), " unreadpm ") ' + 'or @alt="Unread" or @title="未读"]]/td/a[contains(@href, "viewmessage")]/@href' + ) + + def __parse_pm_item_unread_message_links(self, html) -> list: + """ + 解析 Audiences 新版 pm-item 私信列表中的未读消息链接。 + """ + message_links = [] + unread_rows = html.xpath( + '//*[contains(concat(" ", normalize-space(@class), " "), " pm-item-row ") ' + 'and contains(concat(" ", normalize-space(@class), " "), " is-unread ")]' + ) + if not unread_rows: + unread_rows = html.xpath( + '//*[contains(concat(" ", normalize-space(@class), " "), " pm-item-row ") ' + 'and .//*[contains(concat(" ", normalize-space(@class), " "), " pm-item__status--unread ") ' + 'or @title="未读"]]' + ) + + for row in unread_rows: + row_links = row.xpath('.//a[contains(@href, "viewmessage")]/@href') + if not row_links: + continue + message_link = row_links[0].strip() + if not message_link: + continue + message_links.append(message_link) + self.__cache_pm_item_preview(message_link, row) + return message_links + + def __cache_pm_item_preview(self, message_link: str, row): + """ + 缓存新版列表页预览,用于详情页结构变化时兜底生成站点消息。 + """ + head = self.__extract_pm_item_text( + row, + './/*[contains(concat(" ", normalize-space(@class), " "), " pm-item__subject ")]' + ) + date = self.__extract_pm_item_text( + row, + './/*[contains(concat(" ", normalize-space(@class), " "), " pm-item__time ")]' + ) + content = self.__extract_pm_item_text( + row, + './/*[contains(concat(" ", normalize-space(@class), " "), " pm-item__preview ")]' + ) + self.__message_list_previews[urljoin(self._base_url, message_link)] = (head, date, content) + + @staticmethod + def __extract_pm_item_text(row, xpath: str): + """ + 提取新版私信列表节点文本并规整空白字符。 + """ + nodes = row.xpath(xpath) + if not nodes: + return None + text = nodes[0].xpath("string(.)") + text = re.sub(r"\s+", " ", text.replace("\xa0", " ")).strip() + return text or None + + @staticmethod + def __extract_first_text(html, xpath: str): + """ + 提取第一个匹配节点的规整文本。 + """ + nodes = html.xpath(xpath) + if not nodes: + return None + return NexusAudiencesSiteUserInfo.__normalize_text(nodes[0].xpath("string(.)")) + + @staticmethod + def __extract_pm_view_meta(html, label: str): + """ + 按标签提取 Audiences 新版短消息详情页中的元信息。 + """ + values = html.xpath( + '//*[contains(concat(" ", normalize-space(@class), " "), " pm-view__meta ") ' + f'and .//*[contains(concat(" ", normalize-space(@class), " "), " pm-view__label ") ' + f'and normalize-space()="{label}"]]' + '//*[contains(concat(" ", normalize-space(@class), " "), " pm-view__value ")]' + ) + if not values: + return None + return NexusAudiencesSiteUserInfo.__normalize_text(values[0].xpath("string(.)")) + + @staticmethod + def __normalize_text(text: str): + """ + 规整 Audiences 新版消息页文本空白字符。 + """ + if not text: + return None + text = re.sub(r"\s+", " ", text.replace("\xa0", " ")).strip() + return text or None + + def __fill_empty_message_content_from_list(self, msg_link: str, head, date, content): + """ + 使用列表页预览填补详情页解析不到的字段。 + """ + preview = self.__message_list_previews.get(urljoin(self._base_url, msg_link)) + if not preview: + return head, date, content + preview_head, preview_date, preview_content = preview + return head or preview_head, date or preview_date, content or preview_content + def __should_fetch_next_unread_page(self, new_message_links: list) -> bool: """ 判断是否还需要继续请求 Audiences 下一页未读消息列表。 diff --git a/tests/test_nexus_audiences_parser.py b/tests/test_nexus_audiences_parser.py index 3d7a5604..79322b6b 100644 --- a/tests/test_nexus_audiences_parser.py +++ b/tests/test_nexus_audiences_parser.py @@ -269,6 +269,129 @@ def test_audiences_readpm_row_is_not_unread_message(): assert msg_links == [] +def test_audiences_pm_item_unread_links_use_list_preview_when_detail_empty(): + """ + Audiences 新版 div 私信列表应能识别未读行,并在详情页不可解析时使用列表预览通知。 + """ + parser = NexusAudiencesSiteUserInfo( + site_name="Audiences", + url="https://audiences.me/", + site_cookie="", + apikey=None, + token=None, + ) + parser.message_unread = 7 + unread_rows = "".join( + f""" +
+
+ + + 种子被删除 + 系统 + 2026-06-22 22:32:11 +
+
+ 你下载的种子'Wonder Wall S01E{index:02d} 2026 1080p WEB-DL H265 AAC-ADWeb'被管理员删除。 +
+
+ """ + for index in range(1, 8) + ) + list_html = f""" + + +
+
+ {unread_rows} +
+
+ + 已读消息 + 2026-06-07 14:27:45 +
+
+
+
+ + + """ + requested_urls = [] + + def fake_get_page_content(url, params=None, headers=None): + """ + 模拟新版列表页可读,但详情页结构暂不兼容导致解析为空。 + """ + requested_urls.append(url) + return "" if "viewmessage" in url else list_html + + parser._get_page_content = fake_get_page_content + + parser._pase_unread_msgs() + + detail_requests = [url for url in requested_urls if "viewmessage" in url] + assert len(detail_requests) == 7 + assert len(parser.message_unread_contents) == 7 + assert parser.message_unread_contents[0] == ( + "种子被删除", + "2026-06-22 22:32:11", + "你下载的种子'Wonder Wall S01E01 2026 1080p WEB-DL H265 AAC-ADWeb'被管理员删除。", + ) + assert "已读消息" not in [item[0] for item in parser.message_unread_contents] + + +def test_audiences_pm_view_message_content_is_parsed(): + """ + Audiences 新版短消息详情页应解析 pm-view 中的标题、日期和正文。 + """ + parser = NexusAudiencesSiteUserInfo( + site_name="Audiences", + url="https://audiences.me/", + site_cookie="", + apikey=None, + token=None, + ) + html_text = """ + + + +
+
+
+

种子被删除

+

自 系统

+
+
+
+
+
+ + 系统 +
+
+ 日期 + 2026-06-22 22:32:11 +
+
+
+ 你下载的种子'Wonder Wall S01E20 2026 1080p WEB-DL H265 AAC-ADWeb'被管理员删除。原因:已完结剧集,清理单集。 +
+
+
+ + + + """ + + head, date, content = parser._parse_message_content(html_text) + + assert head == "种子被删除" + assert date == "2026-06-22 22:32:11" + assert content == "你下载的种子'Wonder Wall S01E20 2026 1080p WEB-DL H265 AAC-ADWeb'被管理员删除。原因:已完结剧集,清理单集。" + + def test_audiences_unread_mailbox_only_uses_user_box(): """ Audiences 只使用用户消息箱,首页不传 page,page=1 实际表示第二页。