mirror of
https://github.com/jxxghp/MoviePilot.git
synced 2026-06-29 16:26:41 +08:00
feat: enhance WebAgent audio handling with format conversion and transcription support
This commit is contained in:
@@ -2,6 +2,8 @@ import asyncio
|
||||
import hashlib
|
||||
import json
|
||||
import mimetypes
|
||||
import shutil
|
||||
import subprocess
|
||||
import time
|
||||
import uuid
|
||||
from pathlib import Path
|
||||
@@ -12,6 +14,7 @@ from fastapi.responses import FileResponse, StreamingResponse
|
||||
|
||||
from app import schemas
|
||||
from app.agent import MoviePilotAgent, ReplyMode, StreamingHandler
|
||||
from app.agent.llm.capability import AgentCapabilityManager
|
||||
from app.core.config import global_vars, settings
|
||||
from app.db.models import User
|
||||
from app.db.user_oper import UserOper, get_current_active_user
|
||||
@@ -27,6 +30,7 @@ WEB_AGENT_FILE_TTL_SECONDS = 6 * 60 * 60
|
||||
WEB_AGENT_FILE_MAX_ITEMS = 256
|
||||
WEB_AGENT_UPLOAD_MAX_BYTES = 32 * 1024 * 1024
|
||||
WEB_AGENT_UPLOAD_CHUNK_SIZE = 1024 * 1024
|
||||
WEB_AGENT_BROWSER_AUDIO_SUFFIXES = {".aac", ".m4a", ".mp3", ".mp4", ".wav", ".wave"}
|
||||
_WEB_AGENT_FILE_REGISTRY: dict[str, dict[str, Any]] = {}
|
||||
|
||||
|
||||
@@ -342,6 +346,137 @@ def _register_web_agent_file(
|
||||
}
|
||||
|
||||
|
||||
def _get_web_agent_audio_mime_type(audio_path: Path) -> Optional[str]:
    """
    Pick a browser-friendly MIME type for an audio file.

    The lookup is driven by the (case-insensitive) file suffix. Common audio
    containers are pinned explicitly because ``mimetypes`` is not reliable for
    them: it reports ``audio/x-wav`` for WAV and ``video/webm`` for WebM, and
    its answer for Ogg/FLAC depends on the Python version and the host's
    ``mime.types`` database. Every file that reaches this helper is audio, so
    an ``audio/*`` type is always the right answer for those containers.

    :param audio_path: path of the audio file (only its name/suffix is used)
    :return: MIME type usable for FileResponse / the ``<audio>`` tag, or None
             when the suffix is unknown to both the table and ``mimetypes``
    """
    known_audio_types = {
        ".wav": "audio/wav",
        ".wave": "audio/wav",
        ".mp3": "audio/mpeg",
        ".m4a": "audio/mp4",
        ".mp4": "audio/mp4",
        ".aac": "audio/aac",
        ".ogg": "audio/ogg",
        ".oga": "audio/ogg",
        # Ogg Opus files use the audio/ogg media type (RFC 7845).
        ".opus": "audio/ogg",
        ".webm": "audio/webm",
        ".flac": "audio/flac",
    }
    mime_type = known_audio_types.get(audio_path.suffix.lower())
    if mime_type is not None:
        return mime_type

    # Unknown suffix: defer to the standard library's best guess.
    return mimetypes.guess_type(audio_path.name)[0]
|
||||
|
||||
|
||||
def _prepare_web_agent_audio_attachment_path(
    voice_path: str,
    *,
    timeout: Optional[float] = 60.0,
) -> Path:
    """
    Prepare an Agent voice reply as an audio file the Web panel can play reliably.

    Some TTS providers emit Opus/Ogg. Desktop Chromium usually plays it, but
    iOS/Safari support is unreliable; WebAgent audio is only played inside the
    browser, so such files are converted to mono 24 kHz WAV with ffmpeg.

    The function never raises for conversion problems: whenever the source
    cannot be resolved, ffmpeg is unavailable, or the conversion fails or
    times out, the original file path is returned as a fallback.

    :param voice_path: path of the audio file produced for the voice reply
    :param timeout: maximum number of seconds ffmpeg may run; None disables the limit
    :return: path of the file to expose to the browser (converted WAV, or the
             original file when no conversion was needed or possible)
    """
    try:
        source_path = Path(voice_path).expanduser().resolve(strict=True)
    except (OSError, RuntimeError):
        # Missing file, unresolvable home directory or symlink loop: hand the
        # raw path back and let the caller deal with it.
        return Path(voice_path)
    if source_path.suffix.lower() in WEB_AGENT_BROWSER_AUDIO_SUFFIXES:
        return source_path
    if not shutil.which("ffmpeg"):
        logger.warning("WebAgent 语音转 WAV 跳过:ffmpeg 不可用,path=%s", source_path)
        return source_path

    voice_dir = settings.TEMP_PATH / "voice"
    voice_dir.mkdir(parents=True, exist_ok=True)
    # Random suffix keeps concurrent conversions of same-named files apart.
    output_path = voice_dir / f"{source_path.stem}_web_{uuid.uuid4().hex[:8]}.wav"
    cmd = [
        "ffmpeg",
        "-y",
        "-i",
        str(source_path),
        "-ar",
        "24000",
        "-ac",
        "1",
        "-f",
        "wav",
        str(output_path),
    ]

    try:
        result = subprocess.run(
            cmd,
            # ffmpeg must not inherit (and block on) the server's stdin.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            # ffmpeg may echo non-UTF-8 metadata; never fail on decoding stderr.
            errors="replace",
            check=False,
            timeout=timeout,
        )
    except (OSError, subprocess.SubprocessError) as err:
        # Covers launch failures and subprocess.TimeoutExpired.
        logger.warning("WebAgent 语音转 WAV 失败,将回退原文件: error=%s", err)
        converted = False
    else:
        converted = result.returncode == 0 and output_path.exists()
        if not converted:
            logger.warning(
                "WebAgent 语音转 WAV 失败,将回退原文件: returncode=%s, stderr=%s",
                result.returncode,
                (result.stderr or "").strip()[:500],
            )

    if converted:
        return output_path

    # Best-effort removal of a partially written output file.
    try:
        output_path.unlink(missing_ok=True)
    except OSError as err:
        logger.warning(
            "WebAgent 临时语音文件清理失败: path=%s, error=%s", output_path, err
        )
    return source_path
|
||||
|
||||
|
||||
def _get_web_agent_registered_file(ref: str) -> Optional[dict[str, Any]]:
    """
    Look up the WebAgent temporary-file registry entry behind an attachment reference.

    :param ref: short-lived reference of the form message/agent/file/{file_id}
    :return: the registry entry, or None when the reference is invalid or expired
    """
    ref_prefix = "message/agent/file/"
    candidate = (ref or "").strip()
    if candidate.startswith(ref_prefix):
        # Run registry cleanup before the lookup, as the lookup must see its result.
        _cleanup_web_agent_file_registry()
        # Anything after the first "/" following the id is ignored.
        file_id, _, _ = candidate[len(ref_prefix):].partition("/")
        return _WEB_AGENT_FILE_REGISTRY.get(file_id)
    return None
|
||||
|
||||
|
||||
def _transcribe_web_agent_audio_refs(audio_refs: list[str]) -> Optional[str]:
    """
    Transcribe local audio recordings uploaded through WebAgent.

    Audio uploaded from the Web panel already lives in the short-term file
    registry, so it cannot go through the module download logic used for
    third-party channels; the temporary file is read directly and handed to
    the current audio-input provider.

    :param audio_refs: attachment references of the uploaded recordings
    :return: the non-empty transcripts joined by newlines, or None when there
             is nothing to transcribe, audio input is unavailable, or no
             recording produced a transcript
    """
    if not audio_refs:
        return None
    if not AgentCapabilityManager.is_audio_input_available():
        logger.warning("WebAgent 音频输入能力未配置或未启用,跳过语音识别")
        return None

    pieces = []
    for ref in audio_refs:
        entry = _get_web_agent_registered_file(ref)
        if entry:
            audio_file = Path(entry["path"])
            try:
                raw_audio = audio_file.read_bytes()
            except OSError as err:
                # Unreadable recording: skip it, keep processing the others.
                logger.warning("WebAgent 语音文件读取失败: ref=%s, error=%s", ref, err)
            else:
                text = AgentCapabilityManager.transcribe_audio(
                    content=raw_audio,
                    filename=entry.get("name") or audio_file.name,
                )
                if text:
                    pieces.append(text)
        else:
            logger.warning("WebAgent 语音引用不存在或已过期: ref=%s", ref)

    if not pieces:
        return None
    return "\n".join(pieces).strip()
|
||||
|
||||
|
||||
def _merge_web_agent_prompt_with_transcript(prompt: str, transcript: Optional[str]) -> str:
    """
    Merge the typed prompt with the speech transcript without repeating content.

    Both parts are stripped; empty parts are dropped, and a part whose stripped
    text equals an earlier one is emitted only once. A ``None`` value is
    treated as empty for either argument.

    :param prompt: text typed by the user
    :param transcript: text recognised from the uploaded audio, if any
    :return: the remaining parts joined by a newline (prompt first), or an
             empty string when both are empty
    """
    stripped_parts = ((part or "").strip() for part in (prompt, transcript))
    # dict.fromkeys drops duplicates while preserving first-seen order.
    unique_parts = dict.fromkeys(part for part in stripped_parts if part)
    return "\n".join(unique_parts)
|
||||
|
||||
|
||||
def _parse_web_agent_choice_callback(callback_data: str) -> Optional[tuple[str, int]]:
|
||||
"""
|
||||
解析 Web Agent 按钮选择回调数据。
|
||||
@@ -484,10 +619,12 @@ def _build_web_agent_notification_events(
|
||||
events.append({"type": "attachment", "attachment": attachment})
|
||||
|
||||
if notification.voice_path:
|
||||
audio_path = _prepare_web_agent_audio_attachment_path(notification.voice_path)
|
||||
attachment = _register_web_agent_file(
|
||||
notification.voice_path,
|
||||
file_name=Path(notification.voice_path).name,
|
||||
str(audio_path),
|
||||
file_name=audio_path.name,
|
||||
kind="audio",
|
||||
mime_type=_get_web_agent_audio_mime_type(audio_path),
|
||||
)
|
||||
if attachment:
|
||||
events.append({"type": "attachment", "attachment": attachment})
|
||||
@@ -679,6 +816,19 @@ async def web_agent_stream(
|
||||
)
|
||||
|
||||
prompt = payload.text.strip()
|
||||
transcript = _transcribe_web_agent_audio_refs(payload.audio_refs or [])
|
||||
prompt = _merge_web_agent_prompt_with_transcript(prompt, transcript)
|
||||
has_audio_input = bool(transcript)
|
||||
if not prompt and payload.audio_refs and not payload.images and not payload.files:
|
||||
return StreamingResponse(
|
||||
iter([
|
||||
_build_web_agent_sse(
|
||||
"error",
|
||||
{"message": "语音识别失败,请稍后重试。"},
|
||||
)
|
||||
]),
|
||||
media_type="text/event-stream",
|
||||
)
|
||||
if not prompt and not payload.images and not payload.files and not payload.audio_refs:
|
||||
return StreamingResponse(
|
||||
iter([
|
||||
@@ -715,9 +865,11 @@ async def web_agent_stream(
|
||||
"""
|
||||
生成前端 Agent SSE 事件。
|
||||
"""
|
||||
audio_ref_set = set(payload.audio_refs or [])
|
||||
files = [
|
||||
file.model_dump(exclude_none=True)
|
||||
for file in (payload.files or [])
|
||||
if file.ref not in audio_ref_set
|
||||
]
|
||||
for audio_ref in payload.audio_refs or []:
|
||||
files.append({"ref": audio_ref, "mime_type": "audio/*"})
|
||||
@@ -742,7 +894,7 @@ async def web_agent_stream(
|
||||
message=prompt,
|
||||
images=payload.images or [],
|
||||
files=files or None,
|
||||
has_audio_input=bool(payload.audio_refs),
|
||||
has_audio_input=has_audio_input,
|
||||
)
|
||||
except Exception as err:
|
||||
logger.error(f"Web智能助手执行失败: {str(err)}")
|
||||
|
||||
@@ -182,6 +182,11 @@ class AgentCapabilityManagerTest(unittest.TestCase):
|
||||
self.assertTrue(
|
||||
AgentCapabilityManager.supports_native_voice_reply("Feishu", None)
|
||||
)
|
||||
self.assertTrue(
|
||||
AgentCapabilityManager.supports_native_voice_reply(
|
||||
MessageChannel.WebAgent.value, None
|
||||
)
|
||||
)
|
||||
self.assertFalse(
|
||||
AgentCapabilityManager.supports_native_voice_reply("Slack", None)
|
||||
)
|
||||
@@ -219,6 +224,7 @@ class AgentCapabilityManagerTest(unittest.TestCase):
|
||||
MessageChannel.Telegram,
|
||||
MessageChannel.Feishu,
|
||||
MessageChannel.Wechat,
|
||||
MessageChannel.WebAgent,
|
||||
):
|
||||
self.assertTrue(
|
||||
ChannelCapabilityManager.supports_capability(
|
||||
|
||||
@@ -463,7 +463,7 @@ class TestAgentToolStreaming:
|
||||
result = await tool.run("你好")
|
||||
return result, synthesize_speech, send_notification_message
|
||||
|
||||
for channel in (MessageChannel.Telegram, MessageChannel.Feishu):
|
||||
for channel in (MessageChannel.Telegram, MessageChannel.Feishu, MessageChannel.WebAgent):
|
||||
result, synthesize_speech, send_notification_message = asyncio.run(
|
||||
_run(channel)
|
||||
)
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import time
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import AsyncMock, patch
|
||||
|
||||
@@ -6,8 +7,12 @@ from app import schemas
|
||||
from app.agent import ReplyMode
|
||||
from app.api.endpoints.agent import (
|
||||
_WebAgentMoviePilotAgent,
|
||||
_WEB_AGENT_FILE_REGISTRY,
|
||||
_build_web_agent_notification_events,
|
||||
_build_web_agent_session_id,
|
||||
_prepare_web_agent_audio_attachment_path,
|
||||
_transcribe_web_agent_audio_refs,
|
||||
web_agent_stream,
|
||||
_resolve_web_agent_choice_payload,
|
||||
_split_web_agent_output,
|
||||
)
|
||||
@@ -162,6 +167,112 @@ def test_build_web_agent_notification_events_registers_local_file(tmp_path):
|
||||
assert attachment["url"].startswith("message/agent/file/")
|
||||
|
||||
|
||||
def test_build_web_agent_notification_events_registers_voice_attachment(tmp_path):
    """A voice reply sent by an Agent tool becomes a playable audio attachment event."""
    wav_bytes = b"wav-bytes"
    wav_file = tmp_path / "reply.wav"
    wav_file.write_bytes(wav_bytes)
    notification = schemas.Notification(
        channel=MessageChannel.WebAgent,
        mtype=NotificationType.Agent,
        text="你好",
        voice_path=str(wav_file),
    )

    events = _build_web_agent_notification_events(notification)

    # Exactly one text delta followed by one attachment event.
    assert len(events) == 2
    text_event, audio_event = events
    assert text_event == {"type": "delta", "content": "你好"}
    attachment = audio_event["attachment"]
    assert audio_event["type"] == "attachment"

    expected_fields = {
        "kind": "audio",
        "name": "reply.wav",
        "mime_type": "audio/wav",
        "size": len(wav_bytes),
    }
    assert {key: attachment[key] for key in expected_fields} == expected_fields
    assert attachment["url"].startswith("message/agent/file/")
|
||||
|
||||
|
||||
def test_prepare_web_agent_audio_attachment_converts_unsupported_audio(tmp_path):
    """WebAgent converts voice formats with unreliable browser support to WAV for the panel."""
    opus_file = tmp_path / "reply.opus"
    opus_file.write_bytes(b"opus-bytes")
    # uuid4 is pinned below, so the generated output file name is predictable.
    expected_wav = tmp_path / "voice" / "reply_web_abcdef12.wav"

    def fake_ffmpeg(*args, **kwargs):
        # Stand in for ffmpeg: write the converted file and report success.
        expected_wav.write_bytes(b"wav-bytes")
        return SimpleNamespace(returncode=0, stderr="")

    with patch("app.api.endpoints.agent.shutil.which", return_value="/usr/bin/ffmpeg"):
        with patch(
            "app.api.endpoints.agent.uuid.uuid4",
            return_value=SimpleNamespace(hex="abcdef1234567890"),
        ):
            with patch("app.api.endpoints.agent.subprocess.run", side_effect=fake_ffmpeg):
                with patch(
                    "app.api.endpoints.agent.settings",
                    SimpleNamespace(TEMP_PATH=tmp_path),
                ):
                    result_path = _prepare_web_agent_audio_attachment_path(str(opus_file))

    assert result_path == expected_wav
    assert result_path.read_bytes() == b"wav-bytes"
|
||||
|
||||
|
||||
def test_transcribe_web_agent_audio_refs_reads_registered_upload(tmp_path):
    """An uploaded WebAgent recording is read from the temp-file registry and transcribed."""
    recording = tmp_path / "recording.webm"
    recording.write_bytes(b"webm-bytes")
    registry_entry = {
        "path": recording,
        "name": "recording.webm",
        "mime_type": "audio/webm",
        "created_at": time.time(),
    }
    _WEB_AGENT_FILE_REGISTRY["audio-test"] = registry_entry

    audio_input_enabled = patch(
        "app.api.endpoints.agent.AgentCapabilityManager.is_audio_input_available",
        return_value=True,
    )
    fake_transcriber = patch(
        "app.api.endpoints.agent.AgentCapabilityManager.transcribe_audio",
        return_value="帮我推荐一部电影",
    )
    try:
        with audio_input_enabled, fake_transcriber as transcribe_audio:
            transcript = _transcribe_web_agent_audio_refs(
                ["message/agent/file/audio-test"]
            )
    finally:
        # Always remove the entry so the module-level registry stays clean.
        _WEB_AGENT_FILE_REGISTRY.pop("audio-test", None)

    assert transcript == "帮我推荐一部电影"
    transcribe_audio.assert_called_once_with(
        content=b"webm-bytes",
        filename="recording.webm",
    )
|
||||
|
||||
|
||||
def test_web_agent_stream_returns_error_when_voice_transcription_fails():
    """A voice-only message whose transcription fails yields an error event right away."""
    voice_only_payload = schemas.AgentWebChatRequest(
        text="",
        session_id="browser-session",
        audio_refs=["message/agent/file/missing"],
    )
    fake_request = SimpleNamespace()
    current_user = SimpleNamespace(id=1, name="admin")

    agent_enabled = patch("app.api.endpoints.agent.settings.AI_AGENT_ENABLE", True)
    failed_transcription = patch(
        "app.api.endpoints.agent._transcribe_web_agent_audio_refs",
        return_value=None,
    )
    with agent_enabled, failed_transcription:
        response = asyncio.run(
            web_agent_stream(voice_only_payload, fake_request, current_user)
        )
        chunks = asyncio.run(_collect_streaming_response(response))
        body = "".join(chunks)

    assert "error" in body
    assert "语音识别失败" in body
|
||||
|
||||
|
||||
async def _collect_streaming_response(response):
    """Drain a StreamingResponse body into a list of strings so SSE content can be asserted."""
    # Byte chunks are decoded as UTF-8; anything else is passed through untouched.
    return [
        chunk.decode("utf-8") if isinstance(chunk, bytes) else chunk
        async for chunk in response.body_iterator
    ]
|
||||
|
||||
|
||||
def test_build_web_agent_notification_events_extracts_choice_card():
|
||||
"""Agent 按钮通知应转换为 Web 选择卡片事件而非普通文本。"""
|
||||
events = _build_web_agent_notification_events(
|
||||
|
||||
Reference in New Issue
Block a user