add FastAPI实时性能监控

This commit is contained in:
jxxghp
2025-08-01 17:47:55 +08:00
parent 8ab233baef
commit 1dd3af44b5
10 changed files with 857 additions and 21 deletions

View File

@@ -1,8 +1,8 @@
from fastapi import APIRouter
from app.api.endpoints import login, user, site, message, webhook, subscribe, \
from app.api.endpoints import login, user, webhook, message, site, subscribe, \
media, douban, search, plugin, tmdb, history, system, download, dashboard, \
transfer, mediaserver, bangumi, storage, discover, recommend, workflow, torrent
transfer, mediaserver, bangumi, storage, discover, recommend, workflow, torrent, monitoring
api_router = APIRouter()
api_router.include_router(login.router, prefix="/login", tags=["login"])
@@ -28,3 +28,4 @@ api_router.include_router(discover.router, prefix="/discover", tags=["discover"]
api_router.include_router(recommend.router, prefix="/recommend", tags=["recommend"])
api_router.include_router(workflow.router, prefix="/workflow", tags=["workflow"])
api_router.include_router(torrent.router, prefix="/torrent", tags=["torrent"])
api_router.include_router(monitoring.router, prefix="/monitoring", tags=["monitoring"])

View File

@@ -0,0 +1,409 @@
from typing import Any, List
from fastapi import APIRouter, Depends, Query
from fastapi.responses import HTMLResponse
from app import schemas
from app.core.security import verify_apitoken
from app.monitoring import monitor, get_metrics_response
from app.schemas.monitoring import (
PerformanceSnapshot,
EndpointStats,
ErrorRequest,
MonitoringOverview
)
router = APIRouter()
@router.get("/overview", summary="获取监控概览", response_model=schemas.MonitoringOverview)
def get_overview(_: str = Depends(verify_apitoken)) -> Any:
    """
    Return the complete monitoring overview: current performance snapshot,
    the most active endpoints, recent error requests and active alerts.
    """
    snap = monitor.get_performance_snapshot()
    # Mirror the monitor's dataclass snapshot into the response schema
    perf = PerformanceSnapshot(
        timestamp=snap.timestamp,
        cpu_usage=snap.cpu_usage,
        memory_usage=snap.memory_usage,
        active_requests=snap.active_requests,
        request_rate=snap.request_rate,
        avg_response_time=snap.avg_response_time,
        error_rate=snap.error_rate,
        slow_requests=snap.slow_requests,
    )
    return MonitoringOverview(
        performance=perf,
        top_endpoints=[EndpointStats(**item) for item in monitor.get_top_endpoints(limit=10)],
        recent_errors=[ErrorRequest(**item) for item in monitor.get_recent_errors(limit=20)],
        alerts=monitor.check_alerts(),
    )
@router.get("/performance", summary="获取性能快照", response_model=schemas.PerformanceSnapshot)
def get_performance(_: str = Depends(verify_apitoken)) -> Any:
    """
    Return the current performance snapshot.
    """
    snap = monitor.get_performance_snapshot()
    # The monitor returns a plain dataclass whose fields match the schema
    # one-to-one, so its instance dict maps straight onto the model.
    return PerformanceSnapshot(**vars(snap))
@router.get("/endpoints", summary="获取端点统计", response_model=List[schemas.EndpointStats])
def get_endpoints(
        limit: int = Query(10, ge=1, le=50, description="返回的端点数量"),
        _: str = Depends(verify_apitoken)
) -> Any:
    """
    Return statistics for the most active API endpoints.
    """
    return [EndpointStats(**stats) for stats in monitor.get_top_endpoints(limit=limit)]
@router.get("/errors", summary="获取错误请求", response_model=List[schemas.ErrorRequest])
def get_errors(
        limit: int = Query(20, ge=1, le=100, description="返回的错误数量"),
        _: str = Depends(verify_apitoken)
) -> Any:
    """
    Return the most recent error-request records.
    """
    return [ErrorRequest(**record) for record in monitor.get_recent_errors(limit=limit)]
@router.get("/alerts", summary="获取告警信息", response_model=List[str])
def get_alerts(_: str = Depends(verify_apitoken)) -> Any:
    """
    Return the currently active alert messages.
    """
    active_alerts = monitor.check_alerts()
    return active_alerts
@router.get("/metrics", summary="Prometheus指标")
def get_prometheus_metrics(_: str = Depends(verify_apitoken)) -> Any:
    """
    Return the monitoring metrics in Prometheus text exposition format.
    """
    metrics = get_metrics_response()
    return metrics
@router.get("/dashboard", summary="监控仪表板", response_class=HTMLResponse)
def get_dashboard(_: str = Depends(verify_apitoken)) -> Any:
    """
    Return the self-contained real-time monitoring dashboard as an HTML page.

    The page polls `/api/v1/monitoring/overview` every 5 seconds and renders
    metric cards plus two Chart.js charts (loaded from the jsDelivr CDN).

    NOTE(review): the embedded JavaScript passes the API token as a URL query
    parameter (`?token=...`), which can end up in browser history and server
    logs — confirm this is acceptable for this deployment.
    """
    return HTMLResponse(content="""
    <!DOCTYPE html>
    <html lang="zh-CN">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>MoviePilot 性能监控仪表板</title>
        <script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
        <style>
            body {
                font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
                margin: 0;
                padding: 20px;
                background-color: #f5f5f5;
            }
            .container {
                max-width: 1200px;
                margin: 0 auto;
            }
            .header {
                text-align: center;
                margin-bottom: 30px;
                color: #333;
            }
            .metrics-grid {
                display: grid;
                grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
                gap: 20px;
                margin-bottom: 30px;
            }
            .metric-card {
                background: white;
                padding: 20px;
                border-radius: 10px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                text-align: center;
            }
            .metric-value {
                font-size: 2em;
                font-weight: bold;
                color: #2196F3;
            }
            .metric-label {
                color: #666;
                margin-top: 5px;
            }
            .chart-container {
                background: white;
                padding: 20px;
                border-radius: 10px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                margin-bottom: 20px;
            }
            .alerts {
                background: #fff3cd;
                border: 1px solid #ffeaa7;
                border-radius: 5px;
                padding: 15px;
                margin-bottom: 20px;
            }
            .alert-item {
                color: #856404;
                margin: 5px 0;
            }
            .refresh-btn {
                background: #2196F3;
                color: white;
                border: none;
                padding: 10px 20px;
                border-radius: 5px;
                cursor: pointer;
                margin-bottom: 20px;
            }
            .refresh-btn:hover {
                background: #1976D2;
            }
        </style>
    </head>
    <body>
        <div class="container">
            <div class="header">
                <h1>🎬 MoviePilot 性能监控仪表板</h1>
                <button class="refresh-btn" onclick="refreshData()">刷新数据</button>
            </div>
            <div id="alerts" class="alerts" style="display: none;">
                <h3>⚠️ 告警信息</h3>
                <div id="alerts-list"></div>
            </div>
            <div class="metrics-grid">
                <div class="metric-card">
                    <div class="metric-value" id="cpu-usage">--</div>
                    <div class="metric-label">CPU使用率 (%)</div>
                </div>
                <div class="metric-card">
                    <div class="metric-value" id="memory-usage">--</div>
                    <div class="metric-label">内存使用率 (%)</div>
                </div>
                <div class="metric-card">
                    <div class="metric-value" id="active-requests">--</div>
                    <div class="metric-label">活跃请求数</div>
                </div>
                <div class="metric-card">
                    <div class="metric-value" id="request-rate">--</div>
                    <div class="metric-label">请求率 (req/min)</div>
                </div>
                <div class="metric-card">
                    <div class="metric-value" id="avg-response-time">--</div>
                    <div class="metric-label">平均响应时间 (s)</div>
                </div>
                <div class="metric-card">
                    <div class="metric-value" id="error-rate">--</div>
                    <div class="metric-label">错误率 (%)</div>
                </div>
            </div>
            <div class="chart-container">
                <h3>📊 性能趋势</h3>
                <canvas id="performanceChart" width="400" height="200"></canvas>
            </div>
            <div class="chart-container">
                <h3>🔥 最活跃端点</h3>
                <canvas id="endpointsChart" width="400" height="200"></canvas>
            </div>
        </div>
        <script>
            let performanceChart, endpointsChart;
            let performanceData = {
                labels: [],
                cpu: [],
                memory: [],
                requests: []
            };
            // 初始化图表
            function initCharts() {
                const ctx1 = document.getElementById('performanceChart').getContext('2d');
                performanceChart = new Chart(ctx1, {
                    type: 'line',
                    data: {
                        labels: performanceData.labels,
                        datasets: [{
                            label: 'CPU使用率 (%)',
                            data: performanceData.cpu,
                            borderColor: '#2196F3',
                            backgroundColor: 'rgba(33, 150, 243, 0.1)',
                            tension: 0.4
                        }, {
                            label: '内存使用率 (%)',
                            data: performanceData.memory,
                            borderColor: '#4CAF50',
                            backgroundColor: 'rgba(76, 175, 80, 0.1)',
                            tension: 0.4
                        }, {
                            label: '活跃请求数',
                            data: performanceData.requests,
                            borderColor: '#FF9800',
                            backgroundColor: 'rgba(255, 152, 0, 0.1)',
                            tension: 0.4
                        }]
                    },
                    options: {
                        responsive: true,
                        scales: {
                            y: {
                                beginAtZero: true
                            }
                        }
                    }
                });
                const ctx2 = document.getElementById('endpointsChart').getContext('2d');
                endpointsChart = new Chart(ctx2, {
                    type: 'bar',
                    data: {
                        labels: [],
                        datasets: [{
                            label: '请求数',
                            data: [],
                            backgroundColor: 'rgba(33, 150, 243, 0.8)'
                        }]
                    },
                    options: {
                        responsive: true,
                        scales: {
                            y: {
                                beginAtZero: true
                            }
                        }
                    }
                });
            }
            // 更新性能数据
            function updatePerformanceData(data) {
                const now = new Date().toLocaleTimeString();
                performanceData.labels.push(now);
                performanceData.cpu.push(data.performance.cpu_usage);
                performanceData.memory.push(data.performance.memory_usage);
                performanceData.requests.push(data.performance.active_requests);
                // 保持最近20个数据点
                if (performanceData.labels.length > 20) {
                    performanceData.labels.shift();
                    performanceData.cpu.shift();
                    performanceData.memory.shift();
                    performanceData.requests.shift();
                }
                // 更新图表
                performanceChart.data.labels = performanceData.labels;
                performanceChart.data.datasets[0].data = performanceData.cpu;
                performanceChart.data.datasets[1].data = performanceData.memory;
                performanceChart.data.datasets[2].data = performanceData.requests;
                performanceChart.update();
                // 更新端点图表
                const endpointLabels = data.top_endpoints.map(e => e.endpoint.substring(0, 20));
                const endpointData = data.top_endpoints.map(e => e.count);
                endpointsChart.data.labels = endpointLabels;
                endpointsChart.data.datasets[0].data = endpointData;
                endpointsChart.update();
            }
            // 更新指标显示
            function updateMetrics(data) {
                document.getElementById('cpu-usage').textContent = data.performance.cpu_usage.toFixed(1);
                document.getElementById('memory-usage').textContent = data.performance.memory_usage.toFixed(1);
                document.getElementById('active-requests').textContent = data.performance.active_requests;
                document.getElementById('request-rate').textContent = data.performance.request_rate.toFixed(0);
                document.getElementById('avg-response-time').textContent = data.performance.avg_response_time.toFixed(3);
                document.getElementById('error-rate').textContent = (data.performance.error_rate * 100).toFixed(2);
            }
            // 更新告警
            function updateAlerts(alerts) {
                const alertsDiv = document.getElementById('alerts');
                const alertsList = document.getElementById('alerts-list');
                if (alerts.length > 0) {
                    alertsDiv.style.display = 'block';
                    alertsList.innerHTML = alerts.map(alert =>
                        `<div class="alert-item">⚠️ ${alert}</div>`
                    ).join('');
                } else {
                    alertsDiv.style.display = 'none';
                }
            }
            // 获取URL中的token参数
            function getTokenFromUrl() {
                const urlParams = new URLSearchParams(window.location.search);
                return urlParams.get('token');
            }
            // 刷新数据
            async function refreshData() {
                try {
                    const token = getTokenFromUrl();
                    if (!token) {
                        console.error('未找到token参数');
                        return;
                    }
                    const response = await fetch(`/api/v1/monitoring/overview?token=${token}`);
                    if (response.ok) {
                        const data = await response.json();
                        updateMetrics(data);
                        updatePerformanceData(data);
                        updateAlerts(data.alerts);
                    }
                } catch (error) {
                    console.error('获取监控数据失败:', error);
                }
            }
            // 页面加载完成后初始化
            document.addEventListener('DOMContentLoaded', function() {
                initCharts();
                refreshData();
                // 每5秒自动刷新
                setInterval(refreshData, 5000);
            });
        </script>
    </body>
    </html>
    """)

View File

@@ -280,6 +280,8 @@ class ConfigModel(BaseModel):
REPO_GITHUB_TOKEN: Optional[str] = None
# 大内存模式
BIG_MEMORY_MODE: bool = False
# FastApi性能监控
PERFORMANCE_MONITOR_ENABLE: bool = False
# 全局图片缓存,将媒体图片缓存到本地
GLOBAL_IMAGE_CACHE: bool = False
# 是否启用编码探测的性能模式

View File

@@ -2,6 +2,7 @@ from fastapi import FastAPI
from fastapi.middleware.cors import CORSMiddleware
from app.core.config import settings
from app.monitoring import setup_prometheus_metrics
from app.startup.lifecycle import lifespan
@@ -24,6 +25,9 @@ def create_app() -> FastAPI:
allow_headers=["*"],
)
# 设置性能监控
setup_prometheus_metrics(_app)
return _app

340
app/monitoring.py Normal file
View File

@@ -0,0 +1,340 @@
import threading
import time
from collections import defaultdict, deque
from dataclasses import dataclass
from datetime import datetime, timedelta
from typing import Dict, List, Any
import psutil
from fastapi import Request, Response
from fastapi.responses import PlainTextResponse
from prometheus_client import Counter, Histogram, Gauge, generate_latest, CONTENT_TYPE_LATEST
from prometheus_fastapi_instrumentator import Instrumentator
from app.core.config import settings
from app.log import logger
@dataclass
class RequestMetrics:
    """
    Metrics captured for a single handled HTTP request.
    """
    path: str             # request URL path
    method: str           # HTTP method (GET, POST, ...)
    status_code: int      # response status code
    response_time: float  # handler wall-clock time in seconds
    timestamp: datetime   # when the request was recorded
    client_ip: str        # client host, "unknown" if unavailable
    user_agent: str       # User-Agent header, "unknown" if missing
@dataclass
class PerformanceSnapshot:
    """
    Point-in-time performance snapshot of the application.
    """
    timestamp: datetime        # when the snapshot was taken
    cpu_usage: float           # system CPU usage, percent
    memory_usage: float        # system memory usage, percent
    active_requests: int       # requests currently in flight
    request_rate: float        # requests per minute over the window
    avg_response_time: float   # average response time in seconds
    error_rate: float          # fraction of requests with status >= 400
    slow_requests: int         # total requests slower than the threshold
class FastAPIMonitor:
    """
    In-process FastAPI performance monitor.

    Keeps a bounded history of per-request metrics plus rolling counters,
    and derives aggregate views from them: a performance snapshot, the most
    active endpoints, recent error requests and threshold-based alerts.
    All mutable state is guarded by a single lock so the request middleware
    can feed it concurrently.
    """

    def __init__(self, max_history: int = 1000, window_size: int = 60):
        """
        :param max_history: maximum number of request records kept in history
        :param window_size: rolling statistics window length, in seconds
        """
        self.max_history = max_history
        self.window_size = window_size  # seconds
        # Bounded request history; oldest entries are evicted automatically
        self.request_history: deque = deque(maxlen=max_history)
        # Live counters
        self.active_requests = 0
        self.total_requests = 0
        self.error_requests = 0
        self.slow_requests = 0  # requests slower than slow_request_threshold
        # Rolling-window samples.
        # NOTE(review): maxlen == window_size caps these at `window_size`
        # ENTRIES, not `window_size` seconds, so at more than one request per
        # second the computed request rate understates the real rate —
        # confirm whether this capping is intended.
        self.window_requests: deque = deque(maxlen=window_size)
        self.window_response_times: deque = deque(maxlen=window_size)
        # Guards every counter and deque above
        self._lock = threading.Lock()
        # Alert thresholds
        self.slow_request_threshold = 1.0  # seconds
        self.error_threshold = 0.05        # fraction (5%)
        self.cpu_threshold = 80.0          # percent
        self.memory_threshold = 80.0       # percent
        # Last computed alert messages
        self.alerts: List[str] = []
        logger.info("FastAPI性能监控器已初始化")

    def record_request(self, request: Request, response: Response, response_time: float):
        """
        Record one finished request and update all counters.

        :param request: the incoming request
        :param response: the produced response
        :param response_time: handler wall-clock time in seconds
        """
        with self._lock:
            metrics = RequestMetrics(
                path=str(request.url.path),
                method=request.method,
                status_code=response.status_code,
                response_time=response_time,
                timestamp=datetime.now(),
                client_ip=request.client.host if request.client else "unknown",
                user_agent=request.headers.get("user-agent", "unknown")
            )
            # Append to the bounded history
            self.request_history.append(metrics)
            # Update aggregate counters
            self.total_requests += 1
            if response.status_code >= 400:
                self.error_requests += 1
            if response_time > self.slow_request_threshold:
                self.slow_requests += 1
            # Feed the rolling window
            self.window_requests.append(metrics)
            self.window_response_times.append(response_time)

    def start_request(self):
        """
        Mark the start of a request (increments the in-flight counter).
        """
        with self._lock:
            self.active_requests += 1

    def end_request(self):
        """
        Mark the end of a request (decrements the in-flight counter,
        clamped at zero to survive unbalanced calls).
        """
        with self._lock:
            self.active_requests = max(0, self.active_requests - 1)

    def get_performance_snapshot(self) -> PerformanceSnapshot:
        """
        Build a point-in-time performance snapshot.

        :return: snapshot of CPU/memory usage and request statistics
        """
        # Sample system resources BEFORE taking the lock:
        # psutil.cpu_percent(interval=0.1) blocks for ~100 ms and must not
        # stall concurrent record_request() calls.
        cpu_usage = psutil.cpu_percent(interval=0.1)
        memory_usage = psutil.virtual_memory().percent
        with self._lock:
            now = datetime.now()
            # Requests per minute, counting only entries still inside the window
            recent_requests = [
                req for req in self.window_requests
                if now - req.timestamp < timedelta(seconds=self.window_size)
            ]
            request_rate = len(recent_requests) / (self.window_size / 60)
            # Average response time over the window (the original filtered the
            # deque with a constant predicate, which was a no-op copy)
            response_times = list(self.window_response_times)
            avg_response_time = (
                sum(response_times) / len(response_times) if response_times else 0
            )
            # Lifetime error rate
            error_rate = (
                self.error_requests / self.total_requests if self.total_requests > 0 else 0
            )
            return PerformanceSnapshot(
                timestamp=now,
                cpu_usage=cpu_usage,
                memory_usage=memory_usage,
                active_requests=self.active_requests,
                request_rate=request_rate,
                avg_response_time=avg_response_time,
                error_rate=error_rate,
                slow_requests=self.slow_requests
            )

    def get_top_endpoints(self, limit: int = 10) -> List[Dict[str, Any]]:
        """
        Aggregate the request history per "METHOD path" endpoint.

        :param limit: maximum number of endpoints to return
        :return: endpoints sorted by request count, descending
        """
        with self._lock:
            endpoint_stats = defaultdict(lambda: {
                'count': 0,
                'total_time': 0,
                'errors': 0,
                'avg_time': 0
            })
            for req in self.request_history:
                key = f"{req.method} {req.path}"
                endpoint_stats[key]['count'] += 1
                endpoint_stats[key]['total_time'] += req.response_time
                if req.status_code >= 400:
                    endpoint_stats[key]['errors'] += 1
            # Derive the per-endpoint average response time
            for stats in endpoint_stats.values():
                if stats['count'] > 0:
                    stats['avg_time'] = stats['total_time'] / stats['count']
            # Sort by request count, busiest first
            sorted_endpoints = sorted(
                [{'endpoint': k, **v} for k, v in endpoint_stats.items()],
                key=lambda x: x['count'],
                reverse=True
            )
            return sorted_endpoints[:limit]

    def get_recent_errors(self, limit: int = 20) -> List[Dict[str, Any]]:
        """
        Return the most recent error requests (status >= 400) from history.

        :param limit: maximum number of error records to return
        """
        with self._lock:
            errors = [
                {
                    'timestamp': req.timestamp.isoformat(),
                    'method': req.method,
                    'path': req.path,
                    'status_code': req.status_code,
                    'response_time': req.response_time,
                    'client_ip': req.client_ip
                }
                for req in self.request_history
                if req.status_code >= 400
            ]
            # History is chronological, so the tail holds the newest errors
            return errors[-limit:]

    def check_alerts(self) -> List[str]:
        """
        Evaluate all alert thresholds against a fresh snapshot.

        :return: human-readable alert messages (also stored on self.alerts)
        """
        snapshot = self.get_performance_snapshot()
        alerts = []
        if snapshot.error_rate > self.error_threshold:
            alerts.append(f"错误率过高: {snapshot.error_rate:.2%}")
        if snapshot.cpu_usage > self.cpu_threshold:
            alerts.append(f"CPU使用率过高: {snapshot.cpu_usage:.1f}%")
        if snapshot.memory_usage > self.memory_threshold:
            alerts.append(f"内存使用率过高: {snapshot.memory_usage:.1f}%")
        if snapshot.avg_response_time > self.slow_request_threshold:
            alerts.append(f"平均响应时间过长: {snapshot.avg_response_time:.2f}s")
        if snapshot.request_rate > 1000:  # more than 1000 requests per minute
            alerts.append(f"请求率过高: {snapshot.request_rate:.0f} req/min")
        self.alerts = alerts
        return alerts
# Global monitor instance shared by the middleware and the API endpoints
monitor = FastAPIMonitor()
def setup_prometheus_metrics(app):
    """
    Install Prometheus metrics and the monitoring middleware on *app*.

    Does nothing unless settings.PERFORMANCE_MONITOR_ENABLE is true.

    NOTE(review): the Counter/Histogram/Gauge below are created on every
    call; prometheus_client raises on duplicate registration, so this must
    only ever be called once per process — presumably from create_app;
    confirm.

    :param app: the FastAPI application to instrument
    """
    if not settings.PERFORMANCE_MONITOR_ENABLE:
        return
    # Create the Prometheus metric objects
    request_counter = Counter(
        "http_requests_total",
        "Total number of HTTP requests",
        ["method", "endpoint", "status"]
    )
    request_duration = Histogram(
        "http_request_duration_seconds",
        "HTTP request duration in seconds",
        ["method", "endpoint"]
    )
    active_requests = Gauge(
        "http_active_requests",
        "Number of active HTTP requests"
    )

    # Fold one finished request into the custom Prometheus metrics
    def custom_metrics(request: Request, response: Response, response_time: float):
        request_counter.labels(
            method=request.method,
            endpoint=request.url.path,
            status=response.status_code
        ).inc()
        request_duration.labels(
            method=request.method,
            endpoint=request.url.path
        ).observe(response_time)
        active_requests.set(monitor.active_requests)

    # Attach the default instrumentator (exposes /metrics on the app)
    Instrumentator().instrument(app).expose(app, include_in_schema=False, should_gzip=True)

    # Middleware that feeds both the in-process monitor and Prometheus
    @app.middleware("http")
    async def monitor_middleware(request: Request, call_next):
        start_time = time.time()
        # Mark the request as in flight
        monitor.start_request()
        try:
            response = await call_next(request)
            response_time = time.time() - start_time
            # Record request metrics in the in-process monitor
            monitor.record_request(request, response, response_time)
            # Update the Prometheus metrics
            custom_metrics(request, response, response_time)
            return response
        except Exception as e:
            response_time = time.time() - start_time
            logger.error(f"请求处理异常: {e}")
            # NOTE(review): converting every exception into a plain-text 500
            # here bypasses FastAPI's registered exception handlers, and
            # str(e) may leak internal details to the client — verify this
            # is the intended behavior.
            response = Response(
                content=str(e),
                status_code=500,
                media_type="text/plain"
            )
            # Record the failed request as well
            monitor.record_request(request, response, response_time)
            return response
        finally:
            # Always release the in-flight counter
            monitor.end_request()
    logger.info("Prometheus指标监控已设置")
def get_metrics_response():
    """
    Render the current Prometheus registry as a plain-text HTTP response
    in the standard exposition format.
    """
    payload = generate_latest()
    return PlainTextResponse(payload, media_type=CONTENT_TYPE_LATEST)

View File

@@ -1,23 +1,24 @@
from .token import *
from .user import *
from .response import *
from .site import *
from .subscribe import *
from .context import *
from .servarr import *
from .servcookie import *
from .plugin import *
from .history import *
from .dashboard import *
from .download import *
from .event import *
from .exception import *
from .file import *
from .history import *
from .mediaserver import *
from .message import *
from .tmdb import *
from .transfer import *
from .monitoring import *
from .plugin import *
from .response import *
from .rule import *
from .servarr import *
from .servcookie import *
from .site import *
from .subscribe import *
from .system import *
from .file import *
from .exception import *
from .system import *
from .event import *
from .tmdb import *
from .token import *
from .transfer import *
from .user import *
from .workflow import *
from .download import *

View File

@@ -3,7 +3,8 @@ from typing import Optional, Dict, Any, List, Set, Callable
from pydantic import BaseModel, Field, root_validator
from app.schemas import MessageChannel, FileItem
from app.schemas.message import MessageChannel
from app.schemas.file import FileItem
class Event(BaseModel):

76
app/schemas/monitoring.py Normal file
View File

@@ -0,0 +1,76 @@
from datetime import datetime
from typing import List
from pydantic import BaseModel
class RequestMetrics(BaseModel):
    """
    API schema for a single request's metrics.
    """
    path: str             # request URL path
    method: str           # HTTP method
    status_code: int      # response status code
    response_time: float  # handler time in seconds
    timestamp: datetime   # when the request was recorded
    client_ip: str        # client host
    user_agent: str       # User-Agent header value
class PerformanceSnapshot(BaseModel):
    """
    API schema for a point-in-time performance snapshot.
    """
    timestamp: datetime        # when the snapshot was taken
    cpu_usage: float           # system CPU usage, percent
    memory_usage: float        # system memory usage, percent
    active_requests: int       # requests currently in flight
    request_rate: float        # requests per minute
    avg_response_time: float   # average response time in seconds
    error_rate: float          # fraction of requests with status >= 400
    slow_requests: int         # count of requests over the slow threshold
class EndpointStats(BaseModel):
    """
    API schema for per-endpoint aggregate statistics.
    """
    endpoint: str      # "METHOD path" identifier
    count: int         # number of requests seen
    total_time: float  # summed response time, seconds
    errors: int        # requests with status >= 400
    avg_time: float    # average response time, seconds
class ErrorRequest(BaseModel):
    """
    API schema for one recorded error request (status >= 400).
    """
    timestamp: str        # ISO-formatted time of the request
    method: str           # HTTP method
    path: str             # request URL path
    status_code: int      # response status code
    response_time: float  # handler time in seconds
    client_ip: str        # client host
class MonitoringOverview(BaseModel):
    """
    API schema for the combined monitoring overview response.
    """
    performance: PerformanceSnapshot   # current performance snapshot
    top_endpoints: List[EndpointStats] # busiest endpoints
    recent_errors: List[ErrorRequest]  # latest error requests
    alerts: List[str]                  # active alert messages
class MonitoringConfig(BaseModel):
    """
    API schema for the monitor's configuration and alert thresholds.
    """
    slow_request_threshold: float = 1.0  # seconds
    error_threshold: float = 0.05        # fraction (5%)
    cpu_threshold: float = 80.0          # percent
    memory_threshold: float = 80.0       # percent
    max_history: int = 1000              # retained request records
    window_size: int = 60                # rolling window length, seconds

View File

@@ -494,11 +494,11 @@ class SystemUtils:
time.sleep(1) # 等待1秒
# 获取1秒后的网络统计
net_io_2 = psutil.net_io_counters()
# 计算1秒内的流量变化
upload_speed = net_io_2.bytes_sent - net_io_1.bytes_sent
download_speed = net_io_2.bytes_recv - net_io_1.bytes_recv
return [upload_speed, download_speed]
@staticmethod

View File

@@ -76,4 +76,6 @@ setuptools~=78.1.0
pympler~=1.1
smbprotocol~=1.15.0
setproctitle~=1.3.6
httpx~=0.28.1
httpx~=0.28.1
prometheus-client~=0.22.1
prometheus-fastapi-instrumentator~=7.1.0