Redesign health service + create backup service

This commit is contained in:
2025-12-18 03:35:13 +07:00
parent e43e579329
commit 57bad3b4a8
16 changed files with 1486 additions and 98 deletions

View File

@@ -1,6 +1,7 @@
"""Status monitoring service with persistence and alerting."""
import os
import asyncio
from datetime import datetime, timedelta
from datetime import datetime
from typing import Optional
from contextlib import asynccontextmanager
@@ -8,13 +9,16 @@ from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from monitors import ServiceMonitor, ServiceStatus
from monitors import ServiceMonitor
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
# Configuration
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks
PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))
# Initialize monitor
@@ -22,38 +26,64 @@ monitor = ServiceMonitor()
# Background task reference
background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None
async def periodic_health_check():
"""Background task to check services periodically"""
"""Background task to check services periodically."""
while True:
await monitor.check_all_services(
backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL,
bot_url=BOT_URL
)
try:
await monitor.check_all_services(
backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL,
bot_url=BOT_URL,
external_url=EXTERNAL_URL,
public_url=PUBLIC_URL
)
except Exception as e:
print(f"Health check error: {e}")
await asyncio.sleep(CHECK_INTERVAL)
async def periodic_cleanup():
"""Background task to cleanup old metrics (daily)."""
while True:
await asyncio.sleep(86400) # 24 hours
try:
deleted = cleanup_old_metrics(days=7)
print(f"Cleaned up {deleted} old metrics")
except Exception as e:
print(f"Cleanup error: {e}")
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Startup and shutdown events"""
global background_task
"""Startup and shutdown events."""
global background_task, cleanup_task
# Initialize database
init_db()
print("Database initialized")
# Start background health checks
background_task = asyncio.create_task(periodic_health_check())
cleanup_task = asyncio.create_task(periodic_cleanup())
yield
# Cancel background task on shutdown
if background_task:
background_task.cancel()
try:
await background_task
except asyncio.CancelledError:
pass
# Cancel background tasks on shutdown
for task in [background_task, cleanup_task]:
if task:
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
app = FastAPI(
title="Status Monitor",
description="Service health monitoring",
description="Service health monitoring with persistence and alerting",
lifespan=lifespan
)
@@ -62,9 +92,11 @@ templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse)
async def status_page(request: Request):
"""Main status page"""
"""Main status page."""
services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
incidents = get_recent_incidents(limit=5)
return templates.TemplateResponse(
"index.html",
@@ -72,6 +104,8 @@ async def status_page(request: Request):
"request": request,
"services": services,
"overall_status": overall_status,
"ssl_status": ssl_status,
"incidents": incidents,
"last_check": monitor.last_check,
"check_interval": CHECK_INTERVAL
}
@@ -80,30 +114,52 @@ async def status_page(request: Request):
@app.get("/api/status")
async def api_status():
"""API endpoint for service statuses"""
"""API endpoint for service statuses."""
services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
return {
"overall_status": overall_status,
"overall_status": overall_status.value,
"services": {name: status.to_dict() for name, status in services.items()},
"ssl": ssl_status,
"last_check": monitor.last_check.isoformat() if monitor.last_check else None,
"check_interval_seconds": CHECK_INTERVAL
}
@app.get("/api/history/{service_name}")
async def api_history(service_name: str, hours: int = 24):
"""API endpoint for service latency history."""
history = get_latency_history(service_name, hours=hours)
return {
"service": service_name,
"hours": hours,
"data": history
}
@app.get("/api/incidents")
async def api_incidents(limit: int = 20):
"""API endpoint for recent incidents."""
incidents = get_recent_incidents(limit=limit)
return {"incidents": incidents}
@app.get("/api/health")
async def health():
"""Health check for this service"""
"""Health check for this service."""
return {"status": "ok", "service": "status-monitor"}
@app.post("/api/refresh")
async def refresh_status():
"""Force refresh all service statuses"""
"""Force refresh all service statuses."""
await monitor.check_all_services(
backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL,
bot_url=BOT_URL
bot_url=BOT_URL,
external_url=EXTERNAL_URL,
public_url=PUBLIC_URL
)
return {"status": "refreshed"}