diff --git a/app/chain/site.py b/app/chain/site.py
index 0ce99ab8..9786bae6 100644
--- a/app/chain/site.py
+++ b/app/chain/site.py
@@ -697,7 +697,7 @@ class SiteChain(ChainBase):
                 username=username,
                 password=password,
                 two_step_code=two_step_code,
-                proxies=settings.PROXY_HOST if site_info.proxy else None
+                proxies=settings.PROXY_SERVER if site_info.proxy else None
             )
             if result:
                 cookie, ua, msg = result
diff --git a/app/core/config.py b/app/core/config.py
index fc4360a9..9d5f0c98 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -8,6 +8,7 @@ import sys
 import threading
 from pathlib import Path
 from typing import Any, Dict, List, Optional, Tuple, Type
+from urllib.parse import unquote, urlparse
 
 from dotenv import set_key
 from pydantic import BaseModel, BaseSettings, validator, Field
@@ -319,6 +320,10 @@ class ConfigModel(BaseModel):
     RCLONE_SNAPSHOT_CHECK_FOLDER_MODTIME = True
     # 对OpenList进行快照对比时,是否检查文件夹的修改时间
     OPENLIST_SNAPSHOT_CHECK_FOLDER_MODTIME = True
+    # Browser emulation type: playwright or flaresolverr
+    BROWSER_EMULATION: str = "playwright"
+    # FlareSolverr service address, e.g. http://127.0.0.1:8191
+    FLARESOLVERR_URL: Optional[str] = None
 
 
 class Settings(BaseSettings, ConfigModel, LogConfigModel):
@@ -615,9 +620,30 @@ class Settings(BaseSettings, ConfigModel, LogConfigModel):
     @property
     def PROXY_SERVER(self):
+        """
+        Proxy settings dict built from PROXY_HOST: "server" plus "username"/"password" when present
+        """
         if self.PROXY_HOST:
-            return {
-                "server": self.PROXY_HOST
-            }
+            try:
+                parsed = urlparse(self.PROXY_HOST)
+                # A bare "host:port" is parsed as scheme="host" without a hostname,
+                # so both parts are required before the address is rebuilt
+                if not parsed.scheme or not parsed.hostname:
+                    return {"server": self.PROXY_HOST}
+                host = parsed.hostname
+                # hostname drops the brackets of an IPv6 literal, put them back
+                if ":" in host:
+                    host = f"[{host}]"
+                port = f":{parsed.port}" if parsed.port else ""
+                proxy = {"server": f"{parsed.scheme}://{host}{port}"}
+                # Credentials are percent-encoded inside a URL, decode them for separate use
+                if parsed.username:
+                    proxy["username"] = unquote(parsed.username)
+                if parsed.password:
+                    proxy["password"] = unquote(parsed.password)
+                return proxy
+            except ValueError as err:
+                logger.error(f"解析代理服务器地址 '{self.PROXY_HOST}' 时出错: {err}")
+                return {"server": self.PROXY_HOST}
         return None
 
     @property
diff --git a/app/helper/browser.py b/app/helper/browser.py
index dc598849..6bd3a4e0 100644
--- a/app/helper/browser.py
+++ b/app/helper/browser.py
@@ -3,7 +3,9 @@ from typing import Callable, Any, Optional
 from cf_clearance import sync_cf_retry, sync_stealth
 from playwright.sync_api import sync_playwright, Page
 
+from app.core.config import settings
 from app.log import logger
+from app.utils.http import RequestUtils, cookie_parse
 
 
 class PlaywrightHelper:
@@ -19,6 +21,67 @@ class PlaywrightHelper:
         page.goto(url)
         return sync_cf_retry(page)[0]
 
+    @staticmethod
+    def __fs_cookie_str(cookies: list) -> str:
+        """
+        Join the cookie dicts returned by FlareSolverr into a "name=value; ..." header string
+        """
+        if not cookies:
+            return ""
+        return "; ".join([f"{c.get('name')}={c.get('value')}" for c in cookies if c and c.get('name') is not None])
+
+    @staticmethod
+    def __flaresolverr_request(url: str,
+                               cookies: Optional[str] = None,
+                               proxies: Optional[dict] = None,
+                               timeout: Optional[int] = 30) -> Optional[dict]:
+        """
+        Ask FlareSolverr to solve the Cloudflare challenge and return its "solution"
+        Reference: https://github.com/FlareSolverr/FlareSolverr
+        :param url: page to request
+        :param cookies: cookie header string sent along with the request
+        :param proxies: proxy settings dict (server / username / password)
+        :param timeout: timeout in seconds, raised to 10 when lower
+        :return: the "solution" dict of the response, None on any failure
+        """
+        if not settings.FLARESOLVERR_URL:
+            logger.warn("未配置 FLARESOLVERR_URL,无法使用 FlareSolverr")
+            return None
+
+        payload = {
+            "cmd": "request.get",
+            "url": url,
+            "maxTimeout": max(10, int(timeout or 30)) * 1000,
+        }
+        # FlareSolverr takes cookies as a list of objects
+        if cookies:
+            try:
+                payload["cookies"] = cookie_parse(cookies, array=True)
+            except Exception as e:
+                logger.debug(f"解析 cookies 失败,忽略: {str(e)}")
+        # Credentials are separate keys of the proxy dict, forward them with the address
+        if isinstance(proxies, dict) and proxies.get("server"):
+            fs_proxy = {"url": proxies["server"]}
+            for key in ("username", "password"):
+                if proxies.get(key):
+                    fs_proxy[key] = proxies[key]
+            payload["proxy"] = fs_proxy
+
+        try:
+            fs_api = settings.FLARESOLVERR_URL.rstrip("/") + "/v1"
+            # NOTE(review): the HTTP timeout of this call should exceed maxTimeout - confirm RequestUtils default
+            data = RequestUtils(content_type="application/json").post_json(url=fs_api, json=payload)
+            if not data:
+                logger.error("FlareSolverr 返回空响应")
+                return None
+            if data.get("status") != "ok":
+                logger.error(f"FlareSolverr 调用失败: {data.get('message')}")
+                return None
+            return data.get("solution")
+        except Exception as e:
+            logger.error(f"调用 FlareSolverr 失败: {str(e)}")
+            return None
+
     def action(self, url: str,
                callback: Callable,
                cookies: Optional[str] = None,
@@ -43,15 +106,31 @@ class PlaywrightHelper:
                 context = None
                 page = None
                 try:
+                    # In FlareSolverr mode, fetch the cleared cookies and UA through it first
+                    fs_cookie_header = None
+                    fs_ua = None
+                    if settings.BROWSER_EMULATION == "flaresolverr":
+                        # Same proxy as the browser below, so both reach the site the same way
+                        solution = self.__flaresolverr_request(url=url, cookies=cookies,
+                                                               proxies=proxies, timeout=timeout)
+                        if solution:
+                            fs_cookie_header = self.__fs_cookie_str(solution.get("cookies", []))
+                            fs_ua = solution.get("userAgent")
+
                     browser = playwright[self.browser_type].launch(headless=headless)
-                    context = browser.new_context(user_agent=ua, proxy=proxies)
+                    context = browser.new_context(user_agent=fs_ua or ua, proxy=proxies)
                     page = context.new_page()
 
-                    if cookies:
-                        page.set_extra_http_headers({"cookie": cookies})
+                    # Prefer the cookies returned by FlareSolverr, then the ones passed in
+                    merged_cookie = fs_cookie_header or cookies
+                    if merged_cookie:
+                        page.set_extra_http_headers({"cookie": merged_cookie})
 
-                    if not self.__pass_cloudflare(url, page):
-                        logger.warn("cloudflare challenge fail!")
+                    if fs_cookie_header:
+                        # FlareSolverr delivered cookies, open the page directly
+                        page.goto(url)
+                    elif not self.__pass_cloudflare(url, page):
+                        logger.warn("cloudflare challenge fail!")
                     page.wait_for_load_state("networkidle", timeout=timeout * 1000)
 
                     # 回调函数
@@ -87,6 +166,15 @@ class PlaywrightHelper:
         :param timeout: 超时时间
         """
         source = None
+        # In FlareSolverr mode, take the page source from its response; otherwise use Playwright below
+        if settings.BROWSER_EMULATION == "flaresolverr":
+            try:
+                solution = self.__flaresolverr_request(url=url, cookies=cookies,
+                                                       proxies=proxies, timeout=timeout)
+                if solution and solution.get("response"):
+                    return solution.get("response")
+            except Exception as e:
+                logger.error(f"FlareSolverr 获取源码失败: {str(e)}")
         try:
             with sync_playwright() as playwright:
                 browser = None
diff --git a/app/utils/http.py b/app/utils/http.py
index 2eb49e0b..3336add2 100644
--- a/app/utils/http.py
+++ b/app/utils/http.py
@@ -2,7 +2,7 @@ import re
 import sys
 from contextlib import contextmanager, asynccontextmanager
 from pathlib import Path
-from typing import Any, Optional, Union
+from typing import Any, Optional, Tuple, Union
 
 import chardet
 import httpx
@@ -395,7 +395,7 @@ class RequestUtils:
         return None
 
     @staticmethod
-    def parse_cache_control(header: str) -> (str, int):
+    def parse_cache_control(header: str) -> Tuple[str, Optional[int]]:
         """
         解析 Cache-Control 头,返回 cache_directive 和 max_age
         :param header: Cache-Control 头部的字符串