94 lines
3.5 KiB
Python
94 lines
3.5 KiB
Python
"""
|
|
Health Check Blueprint
|
|
Provides API health monitoring and system status endpoints.
|
|
"""
|
|
|
|
import os
|
|
import time
|
|
import logging
|
|
from flask import Blueprint, request
|
|
|
|
from ..app import Config, stats_collector
|
|
from ..shared import task_status, TASK_STATUS_PENDING, TASK_STATUS_PROCESSING
|
|
from ..shared import _format_response, log_performance, logger
|
|
|
|
# Blueprint that groups the health-monitoring routes; its URL prefix is /health.
health_bp = Blueprint(
    "health",
    __name__,
    url_prefix="/health",
)
|
|
|
|
|
|
# Active-task count above which a high-load notice is added to the report.
_HIGH_LOAD_ACTIVE_TASKS = 20
# Error rate (percent) above which the service is reported as degraded.
_MAX_ERROR_RATE_PERCENT = 10


@health_bp.route('', methods=['GET'])
@log_performance
def health_check():
    """Report API health: storage writability, task load and request stats.

    The response payload always carries ``status``, ``version``,
    ``timestamp``, ``uptime``, ``storage``, ``tasks`` and ``performance``;
    an ``issues`` list is added when at least one problem was detected.

    Returns:
        The value built by ``_format_response`` for one of:

        * 200 / status ``"healthy"`` -- no degrading issue found. A high
          active-task count is listed under ``issues`` but does not, on its
          own, change the status.
        * 503 / status ``"degraded"`` -- the upload or output folder is not
          writable, or the error rate exceeds the threshold.
        * 500 / status ``"unhealthy"`` -- gathering the data raised.
    """
    logger.debug("Health check requested")

    try:
        # Check storage accessibility.
        uploads_writable = os.access(Config.UPLOAD_FOLDER, os.W_OK)
        outputs_writable = os.access(Config.OUTPUT_FOLDER, os.W_OK)

        # Count active tasks. Snapshot the values first: iterating a live
        # dict view raises RuntimeError if the dict is resized meanwhile,
        # which would make this endpoint report a bogus failure.
        # NOTE(review): assumes task_status may be mutated by worker
        # threads while requests are served -- confirm against ..shared.
        active_states = (TASK_STATUS_PENDING, TASK_STATUS_PROCESSING)
        active_tasks = sum(
            1
            for task in list(task_status.values())
            if task.get("status") in active_states
        )

        # Basic stats for the health report.
        stats_summary = stats_collector.get_summary()

        health_data = {
            "status": "healthy",
            "version": "1.0.0",
            "timestamp": time.time(),
            "uptime": stats_summary['summary']['uptime_formatted'],
            "storage": {
                "uploads_writable": uploads_writable,
                "outputs_writable": outputs_writable
            },
            "tasks": {
                "active_count": active_tasks,
                "total_tracked": len(task_status),
                "total_processed": stats_summary['tasks']['total_completed'] + stats_summary['tasks']['total_failed'],
                "success_rate_percent": stats_summary['tasks']['success_rate_percent']
            },
            "performance": {
                "requests_per_second": stats_summary['summary']['requests_per_second'],
                "avg_response_time_ms": stats_summary['performance']['avg_response_time_ms'],
                "error_rate_percent": stats_summary['summary']['error_rate_percent']
            }
        }

        # Determine health status based on the collected metrics.
        error_rate = stats_summary['summary']['error_rate_percent']
        is_healthy = True
        issues = []

        if not uploads_writable:
            issues.append("上传文件夹不可写")
            is_healthy = False
        if not outputs_writable:
            issues.append("输出文件夹不可写")
            is_healthy = False
        if active_tasks > _HIGH_LOAD_ACTIVE_TASKS:
            # High load is advisory only: reported, but status is unchanged.
            issues.append(f"活跃任务数量过多 ({active_tasks})")
        if error_rate > _MAX_ERROR_RATE_PERCENT:
            issues.append(f"错误率过高 ({error_rate:.1f}%)")
            is_healthy = False

        health_data["status"] = "healthy" if is_healthy else "degraded"
        if issues:
            health_data["issues"] = issues

        # Log a warning for every detected issue.
        for issue in issues:
            logger.warning(f"Health check issue: {issue}")

        status_level = logging.DEBUG if is_healthy else logging.WARNING
        logger.log(status_level, f"Health check: {health_data['status']} (active tasks: {active_tasks})")

        # 503 Service Unavailable signals the degraded state to callers.
        status_code = 200 if is_healthy else 503
        return _format_response(status_code, "健康检查完成" if is_healthy else "服务不可用", health_data)

    except Exception as e:
        # Top-level boundary: any failure while gathering data becomes a
        # 500 response instead of propagating out of the view.
        logger.error(f"Health check failed: {str(e)}", exc_info=True)
        return _format_response(500, "健康检查失败", {
            "status": "unhealthy",
            "error": str(e),
            "timestamp": time.time()
        })