Files
UAV-CO2/src/gasflux/blueprints/health.py
2026-02-05 15:13:54 +08:00

94 lines
3.5 KiB
Python

"""
Health Check Blueprint
Provides API health monitoring and system status endpoints.
"""
import os
import time
import logging
from flask import Blueprint, request
from ..app import Config, stats_collector
from ..shared import task_status, TASK_STATUS_PENDING, TASK_STATUS_PROCESSING
from ..shared import _format_response, log_performance, logger
# Blueprint that groups the health endpoints; routes registered on it are
# mounted under the /health URL prefix.
health_bp = Blueprint(
    "health",
    __name__,
    url_prefix="/health",
)
@health_bp.route('', methods=['GET'])
@log_performance
def health_check():
    """Report the current health of the service.

    Gathers three groups of facts -- whether the upload and output folders
    are writable, how many tracked tasks are still pending or processing,
    and the summary produced by ``stats_collector`` -- and hands them to
    ``_format_response``.

    The status code passed to ``_format_response`` is:

    * 200 when no degrading condition was found,
    * 503 when a folder is not writable or the error rate exceeds 10%,
    * 500 when gathering the data raised an exception.

    More than 20 active tasks is listed as an issue but does not, on its
    own, mark the service as degraded.
    """
    logger.debug("Health check requested")

    try:
        # Storage: can this process write to both working folders?
        can_write_uploads = os.access(Config.UPLOAD_FOLDER, os.W_OK)
        can_write_outputs = os.access(Config.OUTPUT_FOLDER, os.W_OK)

        # Tasks that are queued or currently running.
        in_flight_states = [TASK_STATUS_PENDING, TASK_STATUS_PROCESSING]
        active_tasks = sum(
            1
            for entry in task_status.values()
            if entry.get("status") in in_flight_states
        )

        # Aggregated request/task statistics.
        stats_summary = stats_collector.get_summary()
        overall = stats_summary['summary']

        # Assemble the payload section by section, in the order the keys
        # should appear in the response.
        health_data = {
            "status": "healthy",  # placeholder, finalised further down
            "version": "1.0.0",
            "timestamp": time.time(),
            "uptime": overall['uptime_formatted'],
            "storage": {
                "uploads_writable": can_write_uploads,
                "outputs_writable": can_write_outputs,
            },
        }

        task_totals = stats_summary['tasks']
        health_data["tasks"] = {
            "active_count": active_tasks,
            "total_tracked": len(task_status),
            "total_processed": (
                task_totals['total_completed'] + task_totals['total_failed']
            ),
            "success_rate_percent": task_totals['success_rate_percent'],
        }

        health_data["performance"] = {
            "requests_per_second": overall['requests_per_second'],
            "avg_response_time_ms": stats_summary['performance']['avg_response_time_ms'],
            "error_rate_percent": overall['error_rate_percent'],
        }

        # Evaluate the individual checks and collect a message for each
        # one that trips.
        issues = []

        if not can_write_uploads:
            issues.append("上传文件夹不可写")

        if not can_write_outputs:
            issues.append("输出文件夹不可写")

        # High load is informational only: it is reported, not degrading.
        if active_tasks > 20:
            issues.append(f"活跃任务数量过多 ({active_tasks})")

        error_rate = overall['error_rate_percent']
        error_rate_too_high = bool(error_rate > 10)
        if error_rate_too_high:
            issues.append(f"错误率过高 ({error_rate:.1f}%)")

        # Only storage problems and a high error rate degrade the service.
        is_healthy = (
            can_write_uploads
            and can_write_outputs
            and not error_rate_too_high
        )

        health_data["status"] = "healthy" if is_healthy else "degraded"

        if issues:
            health_data["issues"] = issues

        # Surface every detected issue in the log as a warning.
        for issue in issues:
            logger.warning(f"Health check issue: {issue}")

        # Log level, HTTP status and message all follow the verdict.
        if is_healthy:
            status_level = logging.DEBUG
            status_code = 200
            message = "健康检查完成"
        else:
            status_level = logging.WARNING
            status_code = 503  # Service Unavailable while degraded
            message = "服务不可用"

        logger.log(
            status_level,
            f"Health check: {health_data['status']} (active tasks: {active_tasks})",
        )
        return _format_response(status_code, message, health_data)

    except Exception as exc:
        # Any failure while collecting the data is reported as unhealthy.
        logger.error(f"Health check failed: {str(exc)}", exc_info=True)
        failure_payload = {
            "status": "unhealthy",
            "error": str(exc),
            "timestamp": time.time(),
        }
        return _format_response(500, "健康检查失败", failure_payload)