diff --git a/app/core/config.py b/app/core/config.py
index 58e2f890..0671eb31 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -219,6 +219,10 @@ class ConfigModel(BaseModel):
BIG_MEMORY_MODE: bool = False
# 全局图片缓存,将媒体图片缓存到本地
GLOBAL_IMAGE_CACHE: bool = False
+ # 是否启用编码探测的兼容模式
+ ENCODING_DETECTION_COMPATIBLE_MODE: bool = True
+ # 编码探测的最低置信度阈值
+ ENCODING_DETECTION_MIN_CONFIDENCE: float = 0.8
# 允许的图片缓存域名
SECURITY_IMAGE_DOMAINS: List[str] = Field(
default_factory=lambda: ["image.tmdb.org",
diff --git a/app/modules/indexer/parser/__init__.py b/app/modules/indexer/parser/__init__.py
index d3baaafe..9feb1c1f 100644
--- a/app/modules/indexer/parser/__init__.py
+++ b/app/modules/indexer/parser/__init__.py
@@ -344,11 +344,9 @@ class SiteParserBase(metaclass=ABCMeta):
logger.warn(
f"{self._site_name} 检测到Cloudflare,请更新Cookie和UA")
return ""
- if re.search(r"charset=\"?utf-8\"?", res.text, re.IGNORECASE):
- res.encoding = "utf-8"
- else:
- res.encoding = res.apparent_encoding
- return res.text
+ return RequestUtils.get_decoded_html_content(res,
+ settings.ENCODING_DETECTION_COMPATIBLE_MODE,
+ settings.ENCODING_DETECTION_MIN_CONFIDENCE)
return ""
diff --git a/app/modules/indexer/spider/__init__.py b/app/modules/indexer/spider/__init__.py
index 9027ef39..34c23a6d 100644
--- a/app/modules/indexer/spider/__init__.py
+++ b/app/modules/indexer/spider/__init__.py
@@ -5,7 +5,6 @@ import traceback
from typing import List
from urllib.parse import quote, urlencode, urlparse, parse_qs
-import chardet
from jinja2 import Template
from pyquery import PyQuery
from ruamel.yaml import CommentedMap
@@ -250,27 +249,9 @@ class TorrentSpider:
referer=self.referer,
proxies=self.proxies
).get_res(searchurl, allow_redirects=True)
- if ret is not None:
- # 使用chardet检测字符编码
- raw_data = ret.content
- if raw_data:
- try:
- result = chardet.detect(raw_data)
- encoding = result['encoding']
- # 解码为字符串
- page_source = raw_data.decode(encoding)
- except Exception as e:
- logger.debug(f"chardet解码失败:{str(e)}")
- # 探测utf-8解码
- if re.search(r"charset=\"?utf-8\"?", ret.text, re.IGNORECASE):
- ret.encoding = "utf-8"
- else:
- ret.encoding = ret.apparent_encoding
- page_source = ret.text
- else:
- page_source = ret.text
- else:
- page_source = ""
+ page_source = RequestUtils.get_decoded_html_content(ret,
+ settings.ENCODING_DETECTION_COMPATIBLE_MODE,
+ settings.ENCODING_DETECTION_MIN_CONFIDENCE)
# 解析
return self.parse(page_source)
diff --git a/app/utils/http.py b/app/utils/http.py
index e1f13cf0..98f75c56 100644
--- a/app/utils/http.py
+++ b/app/utils/http.py
@@ -1,5 +1,7 @@
+import re
from typing import Any, Optional, Union
+import chardet
import requests
import urllib3
from requests import Response, Session
@@ -273,3 +275,108 @@ class RequestUtils:
cache_headers["Cache-Control"] = f"max-age={max_age}"
return cache_headers
+
+ @staticmethod
+ def detect_encoding_from_html_response(response: Response,
+ compatible_mode: bool = False, confidence_threshold: float = 0.8):
+ """
+ 根据HTML响应内容探测编码信息
+
+ :param response: HTTP 响应对象
+ :param compatible_mode: 是否使用兼容模式,默认为 False (性能模式)
+ :param confidence_threshold: chardet 检测置信度阈值,默认为 0.8
+ :return: 解析得到的字符编码
+ """
+ fallback_encoding = None
+ try:
+ if compatible_mode:
+ # 兼容模式:使用chardet分析后,再处理 BOM 和 meta 信息
+ # 1. 使用 chardet 库进一步分析内容
+ detection = chardet.detect(response.content)
+                if detection.get("confidence", 0) > confidence_threshold:
+ return detection.get("encoding")
+ # 保存 chardet 的结果备用
+ fallback_encoding = detection.get("encoding")
+
+ # 2. 检查响应体中的 BOM 标记(例如 UTF-8 BOM)
+ if response.content[:3] == b"\xef\xbb\xbf": # UTF-8 BOM
+ return "utf-8"
+
+            # 3. 如果是 HTML 响应体,检查其中的 <meta> 标签
+ if re.search(r"charset=[\"']?utf-8[\"']?", response.text, re.IGNORECASE):
+ return "utf-8"
+
+ # 4. 尝试从 response headers 中获取编码信息
+ content_type = response.headers.get("Content-Type", "")
+ if re.search(r"charset=[\"']?utf-8[\"']?", content_type, re.IGNORECASE):
+ return "utf-8"
+
+ else:
+ # 性能模式:优先从 headers 和 BOM 标记获取,最后使用 chardet 分析
+ # 1. 尝试从 response headers 中获取编码信息
+ content_type = response.headers.get("Content-Type", "")
+ if re.search(r"charset=[\"']?utf-8[\"']?", content_type, re.IGNORECASE):
+ return "utf-8"
+ # 暂不支持直接提取字符集,仅提取UTF8
+ # match = re.search(r"charset=[\"']?([^\"';\s]+)", content_type, re.IGNORECASE)
+ # if match:
+ # return match.group(1)
+
+ # 2. 检查响应体中的 BOM 标记(例如 UTF-8 BOM)
+ if response.content[:3] == b"\xef\xbb\xbf":
+ return "utf-8"
+
+            # 3. 如果是 HTML 响应体,检查其中的 <meta> 标签
+ if re.search(r"charset=[\"']?utf-8[\"']?", response.text, re.IGNORECASE):
+ return "utf-8"
+ # 暂不支持直接提取字符集,仅提取UTF8
+            # match = re.search(r"<meta[^>]+charset=[\"']?([^\"'>\s]+)", response.text, re.IGNORECASE)
+ # if match:
+ # return match.group(1)
+
+ # 4. 使用 chardet 库进一步分析内容
+ detection = chardet.detect(response.content)
+ if detection.get("confidence", 0) > confidence_threshold:
+ return detection.get("encoding")
+ # 保存 chardet 的结果备用
+ fallback_encoding = detection.get("encoding")
+
+ # 5. 如果上述方法都无法确定,信任 chardet 的结果(即使置信度较低),否则返回默认字符集
+ return fallback_encoding or "utf-8"
+ except Exception as e:
+            logger.debug(f"Error when detect_encoding_from_html_response: {str(e)}")
+ return fallback_encoding or "utf-8"
+
+ @staticmethod
+ def get_decoded_html_content(response: Response,
+ compatible_mode: bool = False, confidence_threshold: float = 0.8) -> str:
+ """
+ 获取HTML响应的解码文本内容
+
+ :param response: HTTP 响应对象
+ :param compatible_mode: 是否使用兼容模式,默认为 False (性能模式)
+ :param confidence_threshold: chardet 检测置信度阈值,默认为 0.8
+ :return: 解码后的响应文本内容
+ """
+ try:
+ if not response:
+ return ""
+ if response.content:
+ # 1. 获取编码信息
+ encoding = (RequestUtils.detect_encoding_from_html_response(response, compatible_mode,
+ confidence_threshold)
+ or response.apparent_encoding)
+ # 2. 根据解析得到的编码进行解码
+ try:
+ # 尝试用推测的编码解码
+ return response.content.decode(encoding)
+ except Exception as e:
+ logger.debug(f"Decoding failed, error message: {str(e)}")
+ # 如果解码失败,尝试 fallback 使用 apparent_encoding
+ response.encoding = response.apparent_encoding
+ return response.text
+ else:
+ return response.text
+ except Exception as e:
+ logger.debug(f"Error when getting decoded content: {str(e)}")
+ return response.text