feat(encoding): enhance encoding detection with confidence threshold

This commit is contained in:
InfinityPacer
2024-11-27 12:33:57 +08:00
parent 2086651dbe
commit 83fc474dbe
4 changed files with 117 additions and 27 deletions

View File

@@ -219,6 +219,10 @@ class ConfigModel(BaseModel):
BIG_MEMORY_MODE: bool = False
# 全局图片缓存,将媒体图片缓存到本地
GLOBAL_IMAGE_CACHE: bool = False
# 是否启用编码探测的兼容模式
ENCODING_DETECTION_COMPATIBLE_MODE: bool = True
# 编码探测的最低置信度阈值
ENCODING_DETECTION_MIN_CONFIDENCE: float = 0.8
# 允许的图片缓存域名
SECURITY_IMAGE_DOMAINS: List[str] = Field(
default_factory=lambda: ["image.tmdb.org",

View File

@@ -344,11 +344,9 @@ class SiteParserBase(metaclass=ABCMeta):
logger.warn(
f"{self._site_name} 检测到Cloudflare请更新Cookie和UA")
return ""
if re.search(r"charset=\"?utf-8\"?", res.text, re.IGNORECASE):
res.encoding = "utf-8"
else:
res.encoding = res.apparent_encoding
return res.text
return RequestUtils.get_decoded_html_content(res,
settings.ENCODING_DETECTION_COMPATIBLE_MODE,
settings.ENCODING_DETECTION_MIN_CONFIDENCE)
return ""

View File

@@ -5,7 +5,6 @@ import traceback
from typing import List
from urllib.parse import quote, urlencode, urlparse, parse_qs
import chardet
from jinja2 import Template
from pyquery import PyQuery
from ruamel.yaml import CommentedMap
@@ -250,27 +249,9 @@ class TorrentSpider:
referer=self.referer,
proxies=self.proxies
).get_res(searchurl, allow_redirects=True)
if ret is not None:
# 使用chardet检测字符编码
raw_data = ret.content
if raw_data:
try:
result = chardet.detect(raw_data)
encoding = result['encoding']
# 解码为字符串
page_source = raw_data.decode(encoding)
except Exception as e:
logger.debug(f"chardet解码失败{str(e)}")
# 探测utf-8解码
if re.search(r"charset=\"?utf-8\"?", ret.text, re.IGNORECASE):
ret.encoding = "utf-8"
else:
ret.encoding = ret.apparent_encoding
page_source = ret.text
else:
page_source = ret.text
else:
page_source = ""
page_source = RequestUtils.get_decoded_html_content(ret,
settings.ENCODING_DETECTION_COMPATIBLE_MODE,
settings.ENCODING_DETECTION_MIN_CONFIDENCE)
# 解析
return self.parse(page_source)

View File

@@ -1,5 +1,7 @@
import re
from typing import Any, Optional, Union
import chardet
import requests
import urllib3
from requests import Response, Session
@@ -273,3 +275,108 @@ class RequestUtils:
cache_headers["Cache-Control"] = f"max-age={max_age}"
return cache_headers
@staticmethod
def detect_encoding_from_html_response(response: Response,
                                       compatible_mode: bool = False,
                                       confidence_threshold: float = 0.8) -> Optional[str]:
    """
    Detect the character encoding of an HTML response.

    :param response: HTTP response object
    :param compatible_mode: when True, run chardet first and then fall back to
        BOM / meta / header checks; when False (performance mode, default),
        check the cheap signals first and invoke chardet only as a last resort
    :param confidence_threshold: minimum chardet confidence required to trust
        its result directly, default 0.8
    :return: the detected character encoding; falls back to chardet's
        low-confidence guess or "utf-8" when nothing conclusive is found
    """
    fallback_encoding = None
    try:
        if compatible_mode:
            # Compatible mode: analyse with chardet first, then handle BOM and meta info
            # 1. Analyse the raw content with chardet
            detection = chardet.detect(response.content)
            # Use .get() with a None-safe default: chardet may report a missing
            # or None confidence for empty/undecidable content, and indexing
            # would raise KeyError on a malformed result
            if (detection.get("confidence") or 0) > confidence_threshold:
                return detection.get("encoding")
            # Keep chardet's low-confidence guess as a fallback
            fallback_encoding = detection.get("encoding")
            # 2. Check for a BOM marker in the body (e.g. UTF-8 BOM)
            if response.content[:3] == b"\xef\xbb\xbf":  # UTF-8 BOM
                return "utf-8"
            # 3. For HTML bodies, check the <meta charset="..."> declaration
            if re.search(r"charset=[\"']?utf-8[\"']?", response.text, re.IGNORECASE):
                return "utf-8"
            # 4. Try to get the encoding from the response headers
            content_type = response.headers.get("Content-Type", "")
            if re.search(r"charset=[\"']?utf-8[\"']?", content_type, re.IGNORECASE):
                return "utf-8"
        else:
            # Performance mode: prefer headers and BOM markers, use chardet last
            # 1. Try to get the encoding from the response headers
            content_type = response.headers.get("Content-Type", "")
            if re.search(r"charset=[\"']?utf-8[\"']?", content_type, re.IGNORECASE):
                return "utf-8"
            # Extracting arbitrary charsets is not supported yet; only UTF-8 is recognised
            # match = re.search(r"charset=[\"']?([^\"';\s]+)", content_type, re.IGNORECASE)
            # if match:
            #     return match.group(1)
            # 2. Check for a BOM marker in the body (e.g. UTF-8 BOM)
            if response.content[:3] == b"\xef\xbb\xbf":
                return "utf-8"
            # 3. For HTML bodies, check the <meta charset="..."> declaration
            if re.search(r"charset=[\"']?utf-8[\"']?", response.text, re.IGNORECASE):
                return "utf-8"
            # Extracting arbitrary charsets is not supported yet; only UTF-8 is recognised
            # match = re.search(r"<meta[^>]+charset=[\"']?([^\"'>\s]+)", response.text, re.IGNORECASE)
            # if match:
            #     return match.group(1)
            # 4. Analyse the raw content with chardet
            detection = chardet.detect(response.content)
            if (detection.get("confidence") or 0) > confidence_threshold:
                return detection.get("encoding")
            # Keep chardet's low-confidence guess as a fallback
            fallback_encoding = detection.get("encoding")
        # 5. Nothing conclusive: trust chardet's guess even at low confidence,
        #    otherwise default to utf-8
        return fallback_encoding or "utf-8"
    except Exception as e:
        logger.debug(f"Error when detect_encoding_from_response: {str(e)}")
        return fallback_encoding or "utf-8"
@staticmethod
def get_decoded_html_content(response: Response,
                             compatible_mode: bool = False,
                             confidence_threshold: float = 0.8) -> str:
    """
    Return the decoded text content of an HTML response.

    :param response: HTTP response object (may be None/falsy)
    :param compatible_mode: passed through to encoding detection; False
        (default) selects the faster header/BOM-first strategy
    :param confidence_threshold: chardet confidence threshold, default 0.8
    :return: the decoded response text, or "" when no response is available
    """
    # Guard outside the try: a missing response is an expected condition,
    # not an error to be logged
    if not response:
        return ""
    try:
        if response.content:
            # 1. Work out the most likely encoding for the raw bytes
            encoding = (RequestUtils.detect_encoding_from_html_response(response, compatible_mode,
                                                                        confidence_threshold)
                        or response.apparent_encoding)
            # 2. Decode using the detected encoding
            try:
                return response.content.decode(encoding)
            except Exception as e:
                logger.debug(f"Decoding failed, error message: {str(e)}")
                # Detected encoding failed — fall back to requests' own
                # charset guess (apparent_encoding)
                response.encoding = response.apparent_encoding
                return response.text
        else:
            return response.text
    except Exception as e:
        logger.debug(f"Error when getting decoded content: {str(e)}")
        # The fallback access to response.text may itself raise (e.g. the
        # same decoding problem); guard it so this handler can never
        # propagate a second failure to the caller
        try:
            return response.text
        except Exception:
            return ""