From cb0755038805d7c7b7edf8e813ef6892d0ec1031 Mon Sep 17 00:00:00 2001 From: zhzero Date: Sat, 21 Dec 2024 14:29:08 +0800 Subject: [PATCH] =?UTF-8?q?TorrentSpider=E6=B7=BB=E5=8A=A0encoding=20key?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- app/modules/indexer/spider/__init__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py index 5f67d9c6..b5d5d8f8 100644 --- a/app/modules/indexer/spider/__init__.py +++ b/app/modules/indexer/spider/__init__.py @@ -64,6 +64,8 @@ class TorrentSpider: torrents_info_array: list = [] # 搜索超时, 默认: 15秒 _timeout = 15 + # 站点解析时是否需要编码 + encoding: bool = False def __init__(self, indexer: CommentedMap, @@ -95,6 +97,7 @@ class TorrentSpider: self.domain = indexer.get('domain') self.result_num = int(indexer.get('result_num') or 100) self._timeout = int(indexer.get('timeout') or 15) + self.encoding = indexer.get('encoding', False) self.page = page if self.domain and not str(self.domain).endswith("/"): self.domain = self.domain + "/" @@ -728,7 +731,10 @@ class TorrentSpider: self.torrents_info_array = [] try: # 解析站点文本对象 - html_doc = PyQuery(html_text) + if self.encoding: + html_doc = PyQuery(html_text.encode('utf-8')) + else: + html_doc = PyQuery(html_text) # 种子筛选器 torrents_selector = self.list.get('selector', '') # 遍历种子html列表