fix search_web tool

This commit is contained in:
jxxghp
2026-01-24 07:39:07 +08:00
parent 379bff0622
commit b896b07a08
3 changed files with 106 additions and 115 deletions

View File

@@ -1,11 +1,10 @@
"""搜索网络内容工具"""
import asyncio
import json
import re
from typing import Optional, Type
from typing import Optional, Type, List, Dict
from duckduckgo_search import DDGS
import httpx
from ddgs import DDGS
from pydantic import BaseModel, Field
from app.agent.tools.base import MoviePilotTool
@@ -20,7 +19,8 @@ class SearchWebInput(BaseModel):
"""搜索网络内容工具的输入参数模型"""
explanation: str = Field(..., description="Clear explanation of why this tool is being used in the current context")
query: str = Field(..., description="The search query string to search for on the web")
max_results: Optional[int] = Field(5, description="Maximum number of search results to return (default: 5, max: 10)")
max_results: Optional[int] = Field(5,
description="Maximum number of search results to return (default: 5, max: 10)")
class SearchWebTool(MoviePilotTool):
@@ -37,149 +37,137 @@ class SearchWebTool(MoviePilotTool):
async def run(self, query: str, max_results: Optional[int] = 5, **kwargs) -> str:
"""
执行网络搜索
Args:
query: 搜索查询字符串
max_results: 最大返回结果数默认5最大10
Returns:
格式化的搜索结果JSON字符串
"""
logger.info(f"执行工具: {self.name}, 参数: query={query}, max_results={max_results}")
try:
# 限制最大结果数
max_results = min(max(1, max_results or 5), 10)
# 使用 duckduckgo-search 库进行搜索
search_results = await self._search_duckduckgo(query, max_results)
if not search_results:
results = []
# 1. 优先使用 Tavily (如果配置了 API Key)
if settings.TAVILY_API_KEY:
logger.info("使用 Tavily 进行搜索...")
results = await self._search_tavily(query, max_results)
# 2. 如果没有结果或未配置 Tavily使用 DuckDuckGo
if not results:
logger.info("使用 DuckDuckGo 进行搜索...")
results = await self._search_duckduckgo(query, max_results)
if not results:
return f"未找到与 '{query}' 相关的搜索结果"
# 裁剪结果以避免占用过多上下文
formatted_results = self._format_and_truncate_results(search_results, max_results)
result_json = json.dumps(formatted_results, ensure_ascii=False, indent=2)
return result_json
# 格式化并裁剪结果
formatted_results = self._format_and_truncate_results(results, max_results)
return json.dumps(formatted_results, ensure_ascii=False, indent=2)
except Exception as e:
error_message = f"搜索网络内容失败: {str(e)}"
logger.error(f"搜索网络内容失败: {e}", exc_info=True)
return error_message
@staticmethod
def _get_proxy_url(proxy_setting) -> Optional[str]:
"""
从代理设置中提取代理URL
Args:
proxy_setting: 代理设置,可以是字符串或字典
Returns:
代理URL字符串如果没有配置则返回None
"""
if not proxy_setting:
return None
if isinstance(proxy_setting, dict):
return proxy_setting.get('http') or proxy_setting.get('https')
return proxy_setting
async def _search_tavily(query: str, max_results: int) -> List[Dict]:
"""使用 Tavily API 进行搜索"""
try:
async with httpx.AsyncClient(timeout=SEARCH_TIMEOUT) as client:
response = await client.post(
"https://api.tavily.com/search",
json={
"api_key": settings.TAVILY_API_KEY,
"query": query,
"search_depth": "basic",
"max_results": max_results,
"include_answer": False,
"include_images": False,
"include_raw_content": False,
}
)
response.raise_for_status()
data = response.json()
results = []
for result in data.get("results", []):
results.append({
'title': result.get('title', ''),
'snippet': result.get('content', ''),
'url': result.get('url', ''),
'source': 'Tavily'
})
return results
except Exception as e:
logger.warning(f"Tavily 搜索失败: {e}")
return []
@staticmethod
async def _search_duckduckgo(query: str, max_results: int) -> list:
"""
使用 duckduckgo-search 库进行搜索
Args:
query: 搜索查询
max_results: 最大结果数
Returns:
搜索结果列表
"""
def _get_proxy_url(proxy_setting) -> Optional[str]:
"""从代理设置中提取代理URL"""
if not proxy_setting:
return None
if isinstance(proxy_setting, dict):
return proxy_setting.get('http') or proxy_setting.get('https')
return proxy_setting
async def _search_duckduckgo(self, query: str, max_results: int) -> List[Dict]:
"""使用 duckduckgo-search (DDGS) 进行搜索"""
try:
# duckduckgo-search 是同步库,需要在 executor 中运行
def sync_search():
results = []
ddgs_kwargs = {
'timeout': SEARCH_TIMEOUT
}
proxy_url = self._get_proxy_url(settings.PROXY)
if proxy_url:
ddgs_kwargs['proxy'] = proxy_url
try:
# 使用代理(如果配置了)
ddgs_kwargs = {}
proxy_url = SearchWebTool._get_proxy_url(settings.PROXY)
if proxy_url:
ddgs_kwargs['proxy'] = proxy_url
# 设置超时
ddgs_kwargs['timeout'] = SEARCH_TIMEOUT
with DDGS(**ddgs_kwargs) as ddgs:
# 使用 text 方法进行搜索
search_results = list(ddgs.text(
keywords=query,
ddgs_gen = ddgs.text(
query,
max_results=max_results
))
for result in search_results:
results.append({
'title': result.get('title', ''),
'snippet': result.get('body', ''),
'url': result.get('href', ''),
'source': 'DuckDuckGo'
})
except Exception as e:
logger.warning(f"duckduckgo-search 搜索失败: {e}")
raise
)
if ddgs_gen:
for result in ddgs_gen:
results.append({
'title': result.get('title', ''),
'snippet': result.get('body', ''),
'url': result.get('href', ''),
'source': 'DuckDuckGo'
})
except Exception as err:
logger.warning(f"DuckDuckGo search process failed: {err}")
return results
# 在线程池中运行同步搜索
loop = asyncio.get_running_loop()
results = await loop.run_in_executor(None, sync_search)
return results
except ImportError:
logger.error("duckduckgo-search 库未安装,请在 requirements.in 中添加依赖后重新构建")
return []
return await loop.run_in_executor(None, sync_search)
except Exception as e:
logger.warning(f"DuckDuckGo 搜索失败: {e}")
return []
@staticmethod
def _format_and_truncate_results(results: list, max_results: int) -> dict:
"""
格式化并裁剪搜索结果以避免占用过多上下文
Args:
results: 原始搜索结果列表
max_results: 最大结果数
Returns:
格式化后的结果字典
"""
def _format_and_truncate_results(results: List[Dict], max_results: int) -> Dict:
"""格式化并裁剪搜索结果"""
formatted = {
"total_results": len(results),
"results": []
}
# 限制结果数量
limited_results = results[:max_results]
for idx, result in enumerate(limited_results, 1):
title = result.get("title", "")[:200] # 限制标题长度
for idx, result in enumerate(results[:max_results], 1):
title = result.get("title", "")[:200]
snippet = result.get("snippet", "")
url = result.get("url", "")
source = result.get("source", "Unknown")
# 裁剪摘要,避免过长
max_snippet_length = 300 # 每个摘要最多300字符
# 裁剪摘要
max_snippet_length = 500 # 增加到500字符提供更多上下文
if len(snippet) > max_snippet_length:
snippet = snippet[:max_snippet_length] + "..."
# 清理文本,移除多余的空白字符
# 清理文本
snippet = re.sub(r'\s+', ' ', snippet).strip()
formatted["results"].append({
"rank": idx,
"title": title,
@@ -187,9 +175,8 @@ class SearchWebTool(MoviePilotTool):
"url": url,
"source": source
})
# 添加提示信息
if len(results) > max_results:
formatted["note"] = f"注意:共找到 {len(results)} 条结果,为节省上下文空间,仅显示前 {max_results} 条结果。"
formatted["note"] = f"仅显示前 {max_results} 条结果。"
return formatted

View File

@@ -447,10 +447,14 @@ class ConfigModel(BaseModel):
AI_RECOMMEND_ENABLED: bool = False
# AI推荐用户偏好
AI_RECOMMEND_USER_PREFERENCE: str = ""
# Tavily API密钥用于网络搜索
TAVILY_API_KEY: str = "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"
# AI推荐条目数量限制
AI_RECOMMEND_MAX_ITEMS: int = 50
class Settings(BaseSettings, ConfigModel, LogConfigModel):
"""
系统配置类

View File

@@ -91,4 +91,4 @@ langchain-deepseek~=0.1.4
langchain-experimental~=0.3.4
openai~=1.108.2
google-generativeai~=0.8.5
duckduckgo-search~=7.2.1
ddgs~=9.10.0