from types import SimpleNamespace
from app.utils.http import RequestUtils
def test_xml_decoding_prefers_xml_declaration_over_http_default():
    """
    The XML declaration should take priority over the HTTP default encoding.

    This keeps a UTF-8 RSS title from being mangled when the response reports
    a latin-1 style encoding.
    """
    title = "警察故事4:简单任务"
    # The payload carries an explicit declaration so that the behavior named
    # by this test (declaration beats HTTP default) is what gets exercised.
    xml = (
        '<?xml version="1.0" encoding="UTF-8"?>'
        f"<rss><channel><item><title>{title}</title></item></channel></rss>"
    )
    payload = xml.encode("utf-8")
    response = SimpleNamespace(
        content=payload,
        encoding="ISO-8859-1",
        apparent_encoding="utf-8",
        text=payload.decode("ISO-8859-1"),
    )
    decoded = RequestUtils.get_decoded_xml_content(response, performance_mode=True)
    assert title in decoded
    # Derive the mojibake marker instead of hard-coding it: the latin-1
    # misdecoding of the first character contains a soft hyphen (U+00AD)
    # that is invisible and easily lost when a literal is copied around.
    mojibake = title[0].encode("utf-8").decode("ISO-8859-1")
    assert mojibake not in decoded
def test_xml_decoding_uses_declared_non_utf8_encoding():
    """
    A non-UTF-8 XML declaration should be honored when decoding.

    This covers legacy sites that serve GBK/Big5 style responses.
    """
    title = "中文标题"
    # Declare GBK explicitly; without the declaration this fixture would be
    # indistinguishable from the "no declaration" fallback scenario.
    xml = (
        '<?xml version="1.0" encoding="GBK"?>'
        f"<rss><channel><item><title>{title}</title></item></channel></rss>"
    )
    payload = xml.encode("gbk")
    response = SimpleNamespace(
        content=payload,
        encoding="ISO-8859-1",
        apparent_encoding="ISO-8859-1",
        text=payload.decode("ISO-8859-1"),
    )
    decoded = RequestUtils.get_decoded_xml_content(response, performance_mode=True)
    assert title in decoded
def test_xml_decoding_skips_low_confidence_apparent_encoding():
    """
    A latin-1 style apparent_encoding should not be tried first.

    Otherwise Chinese RSS content without an XML declaration would be
    swallowed into mojibake.
    """
    title = "中文标题"
    gbk_bytes = title.encode("gbk")
    # Both encoding hints on the fake response point at latin-1.
    fake_response = SimpleNamespace(
        content=gbk_bytes,
        encoding="ISO-8859-1",
        apparent_encoding="ISO-8859-1",
        text=gbk_bytes.decode("ISO-8859-1"),
    )
    result = RequestUtils.get_decoded_xml_content(
        fake_response, performance_mode=True
    )
    assert "中文标题" in result
    assert "ÖÐÎÄ" not in result
def test_latin1_http_encoding_is_low_confidence():
    """
    Latin-1 style encodings are often filled in by HTTP clients as a default.

    They therefore cannot serve as the preferred basis for XML/RSS decoding.
    """
    check = RequestUtils.is_low_confidence_http_encoding
    # Two spellings of latin-1 are expected to be flagged; utf-8 is not.
    for latin1_name in ("ISO-8859-1", "latin-1"):
        assert check(latin1_name)
    assert not check("utf-8")