diff --git a/app/agent/tools/impl/search_web.py b/app/agent/tools/impl/search_web.py
index 02f0430e..3a79ebf1 100644
--- a/app/agent/tools/impl/search_web.py
+++ b/app/agent/tools/impl/search_web.py
@@ -1,11 +1,10 @@
-"""搜索网络内容工具"""
-
 import asyncio
 import json
 import re
-from typing import Optional, Type
+from typing import Optional, Type, List, Dict
 
-from duckduckgo_search import DDGS
+import httpx
+from ddgs import DDGS
 from pydantic import BaseModel, Field
 
 from app.agent.tools.base import MoviePilotTool
@@ -20,7 +19,8 @@ class SearchWebInput(BaseModel):
     """搜索网络内容工具的输入参数模型"""
     explanation: str = Field(..., description="Clear explanation of why this tool is being used in the current context")
     query: str = Field(..., description="The search query string to search for on the web")
-    max_results: Optional[int] = Field(5, description="Maximum number of search results to return (default: 5, max: 10)")
+    max_results: Optional[int] = Field(5,
+                                       description="Maximum number of search results to return (default: 5, max: 10)")
 
 
 class SearchWebTool(MoviePilotTool):
@@ -37,149 +37,137 @@ class SearchWebTool(MoviePilotTool):
     async def run(self, query: str, max_results: Optional[int] = 5, **kwargs) -> str:
         """
         执行网络搜索
-        
-        Args:
-            query: 搜索查询字符串
-            max_results: 最大返回结果数(默认5,最大10)
-        
-        Returns:
-            格式化的搜索结果JSON字符串
         """
         logger.info(f"执行工具: {self.name}, 参数: query={query}, max_results={max_results}")
 
         try:
             # 限制最大结果数
             max_results = min(max(1, max_results or 5), 10)
-            
-            # 使用 duckduckgo-search 库进行搜索
-            search_results = await self._search_duckduckgo(query, max_results)
-            
-            if not search_results:
+            results = []
+
+            # 1. Prefer Tavily when an API key is configured
+            if settings.TAVILY_API_KEY:
+                logger.info("使用 Tavily 进行搜索...")
+                results = await self._search_tavily(query, max_results)
+
+            # 2. Fall back to DuckDuckGo when Tavily is not configured or returned nothing
+            if not results:
+                logger.info("使用 DuckDuckGo 进行搜索...")
+                results = await self._search_duckduckgo(query, max_results)
+
+            if not results:
                 return f"未找到与 '{query}' 相关的搜索结果"
-            
-            # 裁剪结果以避免占用过多上下文
-            formatted_results = self._format_and_truncate_results(search_results, max_results)
-            
-            result_json = json.dumps(formatted_results, ensure_ascii=False, indent=2)
-            return result_json
-            
+
+            # Format and truncate the results before serializing them
+            formatted_results = self._format_and_truncate_results(results, max_results)
+            return json.dumps(formatted_results, ensure_ascii=False, indent=2)
+
         except Exception as e:
             error_message = f"搜索网络内容失败: {str(e)}"
             logger.error(f"搜索网络内容失败: {e}", exc_info=True)
             return error_message
 
     @staticmethod
-    def _get_proxy_url(proxy_setting) -> Optional[str]:
-        """
-        从代理设置中提取代理URL
-        
-        Args:
-            proxy_setting: 代理设置,可以是字符串或字典
-            
-        Returns:
-            代理URL字符串,如果没有配置则返回None
-        """
-        if not proxy_setting:
-            return None
-        
-        if isinstance(proxy_setting, dict):
-            return proxy_setting.get('http') or proxy_setting.get('https')
-        
-        return proxy_setting
+    async def _search_tavily(query: str, max_results: int) -> List[Dict]:
+        """Search with the Tavily API; any failure is logged and yields an empty list."""
+        try:
+            async with httpx.AsyncClient(timeout=SEARCH_TIMEOUT) as client:
+                response = await client.post(
+                    "https://api.tavily.com/search",
+                    json={
+                        "api_key": settings.TAVILY_API_KEY,
+                        "query": query,
+                        "search_depth": "basic",
+                        "max_results": max_results,
+                        "include_answer": False,
+                        "include_images": False,
+                        "include_raw_content": False,
+                    }
+                )
+                response.raise_for_status()
+                data = response.json()
+
+                results = []
+                for result in data.get("results", []):
+                    results.append({
+                        'title': result.get('title', ''),
+                        'snippet': result.get('content', ''),
+                        'url': result.get('url', ''),
+                        'source': 'Tavily'
+                    })
+                return results
+        except Exception as e:
+            logger.warning(f"Tavily 搜索失败: {e}")
+            return []
 
     @staticmethod
-    async def _search_duckduckgo(query: str, max_results: int) -> list:
-        """
-        使用 duckduckgo-search 库进行搜索
-        
-        Args:
-            query: 搜索查询
-            max_results: 最大结果数
-            
-        Returns:
-            搜索结果列表
-        """
+    def _get_proxy_url(proxy_setting) -> Optional[str]:
+        """Extract the proxy URL from the proxy setting (a plain value or an http/https dict)."""
+        if not proxy_setting:
+            return None
+        if isinstance(proxy_setting, dict):
+            return proxy_setting.get('http') or proxy_setting.get('https')
+        return proxy_setting
+
+    async def _search_duckduckgo(self, query: str, max_results: int) -> List[Dict]:
+        """Search DuckDuckGo through the ddgs package, running the sync client in an executor."""
         try:
-            # duckduckgo-search 是同步库,需要在 executor 中运行
             def sync_search():
                 results = []
+                ddgs_kwargs = {
+                    'timeout': SEARCH_TIMEOUT
+                }
+                proxy_url = self._get_proxy_url(settings.PROXY)
+                if proxy_url:
+                    ddgs_kwargs['proxy'] = proxy_url
+
                 try:
-                    # 使用代理(如果配置了)
-                    ddgs_kwargs = {}
-                    proxy_url = SearchWebTool._get_proxy_url(settings.PROXY)
-                    if proxy_url:
-                        ddgs_kwargs['proxy'] = proxy_url
-                    
-                    # 设置超时
-                    ddgs_kwargs['timeout'] = SEARCH_TIMEOUT
-                    
                     with DDGS(**ddgs_kwargs) as ddgs:
-                        # 使用 text 方法进行搜索
-                        search_results = list(ddgs.text(
-                            keywords=query,
+                        ddgs_gen = ddgs.text(
+                            query,
                             max_results=max_results
-                        ))
-                        
-                        for result in search_results:
-                            results.append({
-                                'title': result.get('title', ''),
-                                'snippet': result.get('body', ''),
-                                'url': result.get('href', ''),
-                                'source': 'DuckDuckGo'
-                            })
-                            
-                except Exception as e:
-                    logger.warning(f"duckduckgo-search 搜索失败: {e}")
-                    raise
-                
+                        )
+                        if ddgs_gen:
+                            for result in ddgs_gen:
+                                results.append({
+                                    'title': result.get('title', ''),
+                                    'snippet': result.get('body', ''),
+                                    'url': result.get('href', ''),
+                                    'source': 'DuckDuckGo'
+                                })
+                except Exception as err:
+                    logger.warning(f"DuckDuckGo search process failed: {err}")
                 return results
-            
-            # 在线程池中运行同步搜索
+
             loop = asyncio.get_running_loop()
-            results = await loop.run_in_executor(None, sync_search)
-            return results
-            
-        except ImportError:
-            logger.error("duckduckgo-search 库未安装,请在 requirements.in 中添加依赖后重新构建")
-            return []
+            return await loop.run_in_executor(None, sync_search)
+
         except Exception as e:
             logger.warning(f"DuckDuckGo 搜索失败: {e}")
             return []
 
     @staticmethod
-    def _format_and_truncate_results(results: list, max_results: int) -> dict:
-        """
-        格式化并裁剪搜索结果以避免占用过多上下文
-        
-        Args:
-            results: 原始搜索结果列表
-            max_results: 最大结果数
-            
-        Returns:
-            格式化后的结果字典
-        """
+    def _format_and_truncate_results(results: List[Dict], max_results: int) -> Dict:
+        """Format the search results and truncate them to keep the tool output small."""
         formatted = {
             "total_results": len(results),
             "results": []
         }
-        
-        # 限制结果数量
-        limited_results = results[:max_results]
-        
-        for idx, result in enumerate(limited_results, 1):
-            title = result.get("title", "")[:200]  # 限制标题长度
+
+        for idx, result in enumerate(results[:max_results], 1):
+            title = result.get("title", "")[:200]
             snippet = result.get("snippet", "")
             url = result.get("url", "")
             source = result.get("source", "Unknown")
-            
-            # 裁剪摘要,避免过长
-            max_snippet_length = 300  # 每个摘要最多300字符
+
+            # Truncate the snippet
+            max_snippet_length = 500  # at most 500 characters per snippet
             if len(snippet) > max_snippet_length:
                 snippet = snippet[:max_snippet_length] + "..."
-            
-            # 清理文本,移除多余的空白字符
+
+            # Collapse runs of whitespace into single spaces
             snippet = re.sub(r'\s+', ' ', snippet).strip()
-            
+
             formatted["results"].append({
                 "rank": idx,
                 "title": title,
@@ -187,9 +175,8 @@ class SearchWebTool(MoviePilotTool):
                 "url": url,
                 "source": source
             })
-        
-        # 添加提示信息
+
         if len(results) > max_results:
-            formatted["note"] = f"注意:共找到 {len(results)} 条结果,为节省上下文空间,仅显示前 {max_results} 条结果。"
-        
+            formatted["note"] = f"仅显示前 {max_results} 条结果。"
+
         return formatted
diff --git a/app/core/config.py b/app/core/config.py
index c180bcbe..02b50655 100644
--- a/app/core/config.py
+++ b/app/core/config.py
@@ -447,10 +447,13 @@ class ConfigModel(BaseModel):
     AI_RECOMMEND_ENABLED: bool = False
     # AI推荐用户偏好
     AI_RECOMMEND_USER_PREFERENCE: str = ""
+    # Tavily API key for web search; empty means DuckDuckGo only. Supply it via configuration, never commit a real key.
+    TAVILY_API_KEY: str = ""
+
     # AI推荐条目数量限制
     AI_RECOMMEND_MAX_ITEMS: int = 50
 
 
 class Settings(BaseSettings, ConfigModel, LogConfigModel):
     """
     系统配置类
diff --git a/requirements.in b/requirements.in
index 451e4cda..c929c610 100644
--- a/requirements.in
+++ b/requirements.in
@@ -91,4 +91,4 @@ langchain-deepseek~=0.1.4
 langchain-experimental~=0.3.4
 openai~=1.108.2
 google-generativeai~=0.8.5
-duckduckgo-search~=7.2.1
+ddgs~=9.10.0