# geo/backend/app/services/health_checker.py
#
# 184 lines
# 5.7 KiB
# Python

"""详细健康检查服务"""
import time
from dataclasses import dataclass
from typing import Optional
import redis.asyncio as aioredis
from sqlalchemy import text
from sqlalchemy.ext.asyncio import AsyncSession
@dataclass
class HealthCheckResult:
    """Outcome of a single health probe.

    Only ``name`` and ``healthy`` are required; the remaining fields are
    optional extras that default to ``None`` when a probe has nothing to
    report for them.
    """

    # Identifier of the probed component (e.g. "database", "redis").
    name: str
    # True when the probe succeeded, False otherwise.
    healthy: bool
    # Probe duration in milliseconds, when the probe measured one.
    latency_ms: Optional[float] = None
    # Short human-readable status or error text.
    message: Optional[str] = None
    # Free-form, probe-specific payload.
    details: Optional[dict] = None
class HealthChecker:
    """Run availability probes against the service's dependencies.

    Every ``check_*`` coroutine returns a ``HealthCheckResult``.
    ``check_all`` runs them concurrently and folds the results into one
    JSON-friendly report.

    NOTE(review): failure messages embed the raw exception text. Confirm that
    text cannot carry credentials (e.g. a connection URL) before exposing the
    report to untrusted clients.
    """

    def __init__(self, db: AsyncSession, redis_url: str):
        """Store the handles used by the probes.

        Args:
            db: Async SQLAlchemy session used by ``check_database``.
            redis_url: Redis connection URL. It is only stored on the
                instance; ``check_redis`` obtains its client from
                ``app.core.redis.get_redis``.
        """
        self.db = db
        self.redis_url = redis_url

    @staticmethod
    def _elapsed_ms(start: float) -> float:
        """Return the milliseconds elapsed since ``start``, rounded to 2 places.

        ``start`` must be a ``time.perf_counter()`` reading.
        """
        return round((time.perf_counter() - start) * 1000, 2)

    async def check_database(self) -> HealthCheckResult:
        """Probe the database by executing ``SELECT 1`` on the session.

        Returns:
            A result named ``"database"`` with the measured latency. When the
            query raises, ``healthy`` is ``False`` and ``message`` carries the
            error text.
        """
        start = time.perf_counter()
        try:
            await self.db.execute(text("SELECT 1"))
        except Exception as e:
            return HealthCheckResult(
                name="database",
                healthy=False,
                latency_ms=self._elapsed_ms(start),
                message=f"Connection failed: {str(e)}",
            )
        return HealthCheckResult(
            name="database",
            healthy=True,
            latency_ms=self._elapsed_ms(start),
            message="Connection OK",
        )

    async def check_redis(self) -> HealthCheckResult:
        """Probe Redis by obtaining the shared client and sending ``PING``.

        Returns:
            A result named ``"redis"`` with the measured latency. When the
            import, client lookup or ping raises, ``healthy`` is ``False``
            and ``message`` carries the error text.
        """
        start = time.perf_counter()
        try:
            # Imported lazily and inside the ``try`` so that an import
            # failure is reported as an unhealthy result, not raised.
            from app.core.redis import get_redis

            client = await get_redis()
            await client.ping()
        except Exception as e:
            return HealthCheckResult(
                name="redis",
                healthy=False,
                latency_ms=self._elapsed_ms(start),
                message=f"Connection failed: {str(e)}",
            )
        return HealthCheckResult(
            name="redis",
            healthy=True,
            latency_ms=self._elapsed_ms(start),
            message="Connection OK",
        )

    @staticmethod
    def _probe_provider(factory, name) -> dict:
        """Try to instantiate provider ``name`` and describe the outcome.

        A provider counts as healthy when ``factory.create(name)`` returns
        without raising; the created object itself is discarded.
        """
        try:
            factory.create(name)
        except Exception as e:
            return {"healthy": False, "error": str(e)}
        return {"healthy": True, "available": True}

    async def check_llm_providers(self) -> HealthCheckResult:
        """Check that every LLM provider can be instantiated.

        The configured default provider (``settings.DEFAULT_LLM_PROVIDER``,
        falling back to ``"openai"``) is probed first, followed by every
        other name returned by ``LLMFactory.list_providers()``.

        Returns:
            A result named ``"llm_providers"`` whose ``details["providers"]``
            maps each provider name to its individual outcome. ``healthy`` is
            ``True`` only when every provider could be created.
        """
        from app.config import settings
        from app.services.llm.factory import LLMFactory

        default_name = getattr(settings, 'DEFAULT_LLM_PROVIDER', 'openai')
        # ``dict.fromkeys`` drops duplicates while keeping first-seen order,
        # so the default provider is probed once and stays first.
        names = dict.fromkeys([default_name, *LLMFactory.list_providers()])
        providers = {
            name: self._probe_provider(LLMFactory, name) for name in names
        }
        all_healthy = all(entry["healthy"] for entry in providers.values())

        return HealthCheckResult(
            name="llm_providers",
            healthy=all_healthy,
            message="All providers healthy" if all_healthy else "Some providers unhealthy",
            details={"providers": providers},
        )

    @staticmethod
    def _probe_storage(storage_path: str) -> bool:
        """Verify that ``storage_path`` is writable (blocking).

        Returns:
            ``False`` when the path does not exist; ``True`` after a probe
            file has been written to it and removed again.

        Raises:
            OSError: When the probe file cannot be created or written.
        """
        import os
        import tempfile

        if not os.path.exists(storage_path):
            return False
        # A uniquely named temporary file keeps concurrent health checks from
        # deleting each other's probe file; the context manager removes the
        # file on exit even if the write fails.
        with tempfile.NamedTemporaryFile(
            mode="w", dir=storage_path, prefix=".health_check"
        ) as probe:
            probe.write("ok")
            probe.flush()
        return True

    async def check_storage(self, storage_path: str = "/data/documents") -> HealthCheckResult:
        """Check the local document storage directory.

        Args:
            storage_path: Directory to probe. Defaults to the path the
                service previously had hard-coded.

        Returns:
            A result named ``"storage"``. An existing, writable path and a
            missing path are both reported as healthy; any error raised
            while probing yields ``healthy=False`` with the error text.
        """
        import asyncio

        try:
            # File-system calls block, so run them in the default executor
            # to keep the event loop responsive.
            loop = asyncio.get_running_loop()
            exists = await loop.run_in_executor(
                None, self._probe_storage, storage_path
            )
        except Exception as e:
            return HealthCheckResult(
                name="storage",
                healthy=False,
                message=f"Storage check failed: {str(e)}",
            )

        if exists:
            return HealthCheckResult(
                name="storage",
                healthy=True,
                message=f"Storage path {storage_path} is writable",
                details={"path": storage_path},
            )
        # NOTE(review): nothing in this class creates the directory; the
        # "will be created" wording and ``created`` flag are kept unchanged
        # for backward compatibility with existing consumers of the report.
        return HealthCheckResult(
            name="storage",
            healthy=True,
            message=f"Storage path {storage_path} does not exist (will be created)",
            details={"path": storage_path, "created": True},
        )

    async def _run_guarded(self, name: str, probe) -> HealthCheckResult:
        """Await ``probe()`` and turn any escaping exception into a result.

        Args:
            name: Name to use for the result if the probe raises.
            probe: Zero-argument callable returning an awaitable
                ``HealthCheckResult``.
        """
        try:
            return await probe()
        except Exception as e:
            return HealthCheckResult(
                name=name,
                healthy=False,
                message=f"Check failed: {str(e)}",
            )

    async def check_all(self) -> dict:
        """Run every probe concurrently and build the aggregate report.

        Returns:
            A dict with ``status`` (``"healthy"`` when every probe passed,
            otherwise ``"degraded"``), ``timestamp`` (``time.time()``) and
            ``checks``, which maps each probe name to its ``healthy``,
            ``latency_ms``, ``message`` and ``details`` values.
        """
        import asyncio

        probes = {
            "database": self.check_database,
            "redis": self.check_redis,
            "llm_providers": self.check_llm_providers,
            "storage": self.check_storage,
        }
        # Each probe is guarded so that one probe raising unexpectedly is
        # reported as unhealthy and does not abort the whole report.
        results = await asyncio.gather(
            *(self._run_guarded(name, probe) for name, probe in probes.items())
        )

        all_healthy = all(r.healthy for r in results)
        return {
            "status": "healthy" if all_healthy else "degraded",
            "timestamp": time.time(),
            "checks": {
                r.name: {
                    "healthy": r.healthy,
                    "latency_ms": r.latency_ms,
                    "message": r.message,
                    "details": r.details,
                }
                for r in results
            },
        }