from types import SimpleNamespace from app.utils.http import RequestUtils def test_xml_decoding_prefers_xml_declaration_over_http_default(): """ XML 声明应优先于 HTTP 默认编码,避免 UTF-8 RSS 标题被 latin1 类编码解坏。 """ xml = '警察故事4:简单任务' response = SimpleNamespace( content=xml.encode("utf-8"), encoding="ISO-8859-1", apparent_encoding="utf-8", text=xml.encode("utf-8").decode("ISO-8859-1"), ) decoded = RequestUtils.get_decoded_xml_content(response, performance_mode=True) assert "警察故事4:简单任务" in decoded assert "è­¦" not in decoded def test_xml_decoding_uses_declared_non_utf8_encoding(): """ XML 声明为非 UTF-8 时应按声明解码,兼容旧站点的 GBK/Big5 类响应。 """ xml = '中文标题' response = SimpleNamespace( content=xml.encode("gbk"), encoding="ISO-8859-1", apparent_encoding="ISO-8859-1", text=xml.encode("gbk").decode("ISO-8859-1"), ) decoded = RequestUtils.get_decoded_xml_content(response, performance_mode=True) assert "中文标题" in decoded def test_xml_decoding_skips_low_confidence_apparent_encoding(): """ apparent_encoding 为 latin1 类编码时不应抢先解码,避免无 XML 声明的中文 RSS 被吞成乱码。 """ xml = "中文标题" response = SimpleNamespace( content=xml.encode("gbk"), encoding="ISO-8859-1", apparent_encoding="ISO-8859-1", text=xml.encode("gbk").decode("ISO-8859-1"), ) decoded = RequestUtils.get_decoded_xml_content(response, performance_mode=True) assert "中文标题" in decoded assert "ÖÐÎÄ" not in decoded def test_latin1_http_encoding_is_low_confidence(): """ latin1 类编码常由 HTTP 客户端默认填充,不能作为 XML/RSS 解码的优先依据。 """ assert RequestUtils.is_low_confidence_http_encoding("ISO-8859-1") assert RequestUtils.is_low_confidence_http_encoding("latin-1") assert not RequestUtils.is_low_confidence_http_encoding("utf-8")