diff --git a/app/core/config.py b/app/core/config.py index 58e2f890..0671eb31 100644 --- a/app/core/config.py +++ b/app/core/config.py @@ -219,6 +219,10 @@ class ConfigModel(BaseModel): BIG_MEMORY_MODE: bool = False # 全局图片缓存,将媒体图片缓存到本地 GLOBAL_IMAGE_CACHE: bool = False + # 是否启用编码探测的兼容模式 + ENCODING_DETECTION_COMPATIBLE_MODE: bool = True + # 编码探测的最低置信度阈值 + ENCODING_DETECTION_MIN_CONFIDENCE: float = 0.8 # 允许的图片缓存域名 SECURITY_IMAGE_DOMAINS: List[str] = Field( default_factory=lambda: ["image.tmdb.org", diff --git a/app/modules/indexer/parser/__init__.py b/app/modules/indexer/parser/__init__.py index d3baaafe..9feb1c1f 100644 --- a/app/modules/indexer/parser/__init__.py +++ b/app/modules/indexer/parser/__init__.py @@ -344,11 +344,9 @@ class SiteParserBase(metaclass=ABCMeta): logger.warn( f"{self._site_name} 检测到Cloudflare,请更新Cookie和UA") return "" - if re.search(r"charset=\"?utf-8\"?", res.text, re.IGNORECASE): - res.encoding = "utf-8" - else: - res.encoding = res.apparent_encoding - return res.text + return RequestUtils.get_decoded_html_content(res, + settings.ENCODING_DETECTION_COMPATIBLE_MODE, + settings.ENCODING_DETECTION_MIN_CONFIDENCE) return "" diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py index 9027ef39..34c23a6d 100644 --- a/app/modules/indexer/spider/__init__.py +++ b/app/modules/indexer/spider/__init__.py @@ -5,7 +5,6 @@ import traceback from typing import List from urllib.parse import quote, urlencode, urlparse, parse_qs -import chardet from jinja2 import Template from pyquery import PyQuery from ruamel.yaml import CommentedMap @@ -250,27 +249,9 @@ class TorrentSpider: referer=self.referer, proxies=self.proxies ).get_res(searchurl, allow_redirects=True) - if ret is not None: - # 使用chardet检测字符编码 - raw_data = ret.content - if raw_data: - try: - result = chardet.detect(raw_data) - encoding = result['encoding'] - # 解码为字符串 - page_source = raw_data.decode(encoding) - except Exception as e: - 
logger.debug(f"chardet解码失败:{str(e)}") - # 探测utf-8解码 - if re.search(r"charset=\"?utf-8\"?", ret.text, re.IGNORECASE): - ret.encoding = "utf-8" - else: - ret.encoding = ret.apparent_encoding - page_source = ret.text - else: - page_source = ret.text - else: - page_source = "" + page_source = RequestUtils.get_decoded_html_content(ret, + settings.ENCODING_DETECTION_COMPATIBLE_MODE, + settings.ENCODING_DETECTION_MIN_CONFIDENCE) # 解析 return self.parse(page_source) diff --git a/app/utils/http.py b/app/utils/http.py index e1f13cf0..98f75c56 100644 --- a/app/utils/http.py +++ b/app/utils/http.py @@ -1,5 +1,7 @@ +import re from typing import Any, Optional, Union +import chardet import requests import urllib3 from requests import Response, Session @@ -273,3 +275,108 @@ class RequestUtils: cache_headers["Cache-Control"] = f"max-age={max_age}" return cache_headers + + @staticmethod + def detect_encoding_from_html_response(response: Response, + compatible_mode: bool = False, confidence_threshold: float = 0.8): + """ + 根据HTML响应内容探测编码信息 + + :param response: HTTP 响应对象 + :param compatible_mode: 是否使用兼容模式,默认为 False (性能模式) + :param confidence_threshold: chardet 检测置信度阈值,默认为 0.8 + :return: 解析得到的字符编码 + """ + fallback_encoding = None + try: + if compatible_mode: + # 兼容模式:使用chardet分析后,再处理 BOM 和 meta 信息 + # 1. 使用 chardet 库进一步分析内容 + detection = chardet.detect(response.content) + if detection["confidence"] > confidence_threshold: + return detection.get("encoding") + # 保存 chardet 的结果备用 + fallback_encoding = detection.get("encoding") + + # 2. 检查响应体中的 BOM 标记(例如 UTF-8 BOM) + if response.content[:3] == b"\xef\xbb\xbf": # UTF-8 BOM + return "utf-8" + + # 3. 如果是 HTML 响应体,检查其中的 <meta> 标签 + if re.search(r"charset=[\"']?utf-8[\"']?", response.text, re.IGNORECASE): + return "utf-8" + + # 4. 
尝试从 response headers 中获取编码信息 + content_type = response.headers.get("Content-Type", "") + if re.search(r"charset=[\"']?utf-8[\"']?", content_type, re.IGNORECASE): + return "utf-8" + + else: + # 性能模式:优先从 headers 和 BOM 标记获取,最后使用 chardet 分析 + # 1. 尝试从 response headers 中获取编码信息 + content_type = response.headers.get("Content-Type", "") + if re.search(r"charset=[\"']?utf-8[\"']?", content_type, re.IGNORECASE): + return "utf-8" + # 暂不支持直接提取字符集,仅提取UTF8 + # match = re.search(r"charset=[\"']?([^\"';\s]+)", content_type, re.IGNORECASE) + # if match: + # return match.group(1) + + # 2. 检查响应体中的 BOM 标记(例如 UTF-8 BOM) + if response.content[:3] == b"\xef\xbb\xbf": + return "utf-8" + + # 3. 如果是 HTML 响应体,检查其中的 <meta> 标签 + if re.search(r"charset=[\"']?utf-8[\"']?", response.text, re.IGNORECASE): + return "utf-8" + # 暂不支持直接提取字符集,仅提取UTF8 + # match = re.search(r"<meta[^>]+charset=[\"']?([^\"'>\s]+)", response.text, re.IGNORECASE) + # if match: + # return match.group(1) + + # 4. 使用 chardet 库进一步分析内容 + detection = chardet.detect(response.content) + if detection.get("confidence", 0) > confidence_threshold: + return detection.get("encoding") + # 保存 chardet 的结果备用 + fallback_encoding = detection.get("encoding") + + # 5. 如果上述方法都无法确定,信任 chardet 的结果(即使置信度较低),否则返回默认字符集 + return fallback_encoding or "utf-8" + except Exception as e: + logger.debug(f"Error when detect_encoding_from_response: {str(e)}") + return fallback_encoding or "utf-8" + + @staticmethod + def get_decoded_html_content(response: Response, + compatible_mode: bool = False, confidence_threshold: float = 0.8) -> str: + """ + 获取HTML响应的解码文本内容 + + :param response: HTTP 响应对象 + :param compatible_mode: 是否使用兼容模式,默认为 False (性能模式) + :param confidence_threshold: chardet 检测置信度阈值,默认为 0.8 + :return: 解码后的响应文本内容 + """ + try: + if not response: + return "" + if response.content: + # 1. 获取编码信息 + encoding = (RequestUtils.detect_encoding_from_html_response(response, compatible_mode, + confidence_threshold) + or response.apparent_encoding) + # 2. 
根据解析得到的编码进行解码 + try: + # 尝试用推测的编码解码 + return response.content.decode(encoding) + except Exception as e: + logger.debug(f"Decoding failed, error message: {str(e)}") + # 如果解码失败,尝试 fallback 使用 apparent_encoding + response.encoding = response.apparent_encoding + return response.text + else: + return response.text + except Exception as e: + logger.debug(f"Error when getting decoded content: {str(e)}") + return response.text