Files
game-marathon/status-service/main.py
2025-12-20 02:28:41 +07:00

219 lines
7.2 KiB
Python

"""Status monitoring service with persistence and alerting."""
import os
import asyncio
from datetime import datetime
from typing import Optional
from contextlib import asynccontextmanager
from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from monitors import ServiceMonitor, Status
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
# Configuration: every value below is read from the environment once at import
# time, falling back to the default given as the second argument.
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks (empty string when unset)
PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks (empty string when unset)
# The three values below are passed to int(); a non-numeric value raises
# ValueError at import time. The intervals are used as asyncio.sleep() delays,
# i.e. they are expressed in seconds.
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "60")) # Normal polling interval in seconds (default: 1 minute)
FAST_CHECK_INTERVAL = int(os.getenv("FAST_CHECK_INTERVAL", "5")) # Polling interval in seconds while has_issues() is true
STARTUP_GRACE_PERIOD = int(os.getenv("STARTUP_GRACE_PERIOD", "60")) # Seconds after startup during which alerts are suppressed
# Initialize monitor (single shared instance used by the background task and all endpoints)
monitor = ServiceMonitor()
startup_time: Optional[datetime] = None # Assigned in lifespan(); None until the app has started
# Background task references, assigned in lifespan() and cancelled on shutdown
background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None
def has_issues() -> bool:
    """Return True when at least one monitored service is DOWN or DEGRADED.

    The "external" entry is ignored while its status is UNKNOWN, which is
    how an unconfigured external check shows up.
    """
    unhealthy = (Status.DOWN, Status.DEGRADED)
    return any(
        svc.status in unhealthy
        for name, svc in monitor.services.items()
        # Skip the external check when it has not been configured.
        if not (name == "external" and svc.status == Status.UNKNOWN)
    )
async def periodic_health_check():
    """Background task: check all services forever, with adaptive polling.

    Each iteration runs ``monitor.check_all_services`` with alerts suppressed
    while the startup grace period is active, then sleeps for
    ``FAST_CHECK_INTERVAL`` seconds if ``has_issues()`` reports a problem and
    ``CHECK_INTERVAL`` seconds otherwise.

    Any ``Exception`` raised by a check is printed and the loop continues, so
    a single failing check never stops monitoring. The coroutine only ends
    when its task is cancelled.
    """
    while True:
        try:
            # Suppress alerts during startup grace period
            suppress_alerts = is_in_grace_period()
            # is_in_grace_period() also returns True when startup_time is
            # still None; only compute the remaining time when it is set so
            # the log line cannot raise and skip the check below.
            if suppress_alerts and startup_time is not None:
                elapsed = (datetime.now() - startup_time).total_seconds()
                remaining = STARTUP_GRACE_PERIOD - elapsed
                print(f"Grace period: {remaining:.0f}s remaining (alerts suppressed)")

            await monitor.check_all_services(
                backend_url=BACKEND_URL,
                frontend_url=FRONTEND_URL,
                bot_url=BOT_URL,
                external_url=EXTERNAL_URL,
                public_url=PUBLIC_URL,
                suppress_alerts=suppress_alerts,
            )
        except Exception as e:
            # Top-level boundary of the background task: report and keep going.
            print(f"Health check error: {e}")

        # Adaptive polling: check more frequently when issues detected
        interval = FAST_CHECK_INTERVAL if has_issues() else CHECK_INTERVAL
        await asyncio.sleep(interval)
async def periodic_cleanup():
    """Background task: purge old metrics right away, then once per hour.

    Metrics older than 24 hours are removed through ``cleanup_old_metrics``.
    Errors are printed and the loop keeps running; the coroutine only ends
    when its task is cancelled.
    """
    retention_hours = 24   # keep only the last 24 hours of metrics
    pause_seconds = 3600   # one hour between cleanup runs

    while True:
        try:
            removed = cleanup_old_metrics(hours=retention_hours)
            if removed > 0:
                print(f"Cleaned up {removed} old metrics")
        except Exception as exc:
            print(f"Cleanup error: {exc}")

        await asyncio.sleep(pause_seconds)
def is_in_grace_period() -> bool:
    """Tell whether the startup grace period is still running.

    Returns True when no startup time has been recorded yet, or when fewer
    than ``STARTUP_GRACE_PERIOD`` seconds have passed since ``startup_time``.
    """
    started_at = startup_time
    if started_at is None:
        # Startup has not been recorded yet: treat as "still starting".
        return True

    age = datetime.now() - started_at
    return age.total_seconds() < STARTUP_GRACE_PERIOD
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan: start background work on startup, stop it on shutdown.

    Startup: initialise the database, record ``startup_time`` (which starts
    the alert grace period) and launch the health-check and cleanup tasks.

    Shutdown: cancel both tasks and wait for them to finish. This runs in a
    ``finally`` block so the tasks are also stopped when an exception is
    thrown into the context manager at ``yield``.
    """
    global background_task, cleanup_task, startup_time

    # Initialize database
    init_db()
    print("Database initialized")

    # Mark startup time
    startup_time = datetime.now()
    print(f"Startup grace period: {STARTUP_GRACE_PERIOD}s (no alerts until services stabilize)")

    # Start background health checks
    background_task = asyncio.create_task(periodic_health_check())
    cleanup_task = asyncio.create_task(periodic_cleanup())

    try:
        yield
    finally:
        # Cancel every task first, then wait for all of them together.
        tasks = [task for task in (background_task, cleanup_task) if task]
        for task in tasks:
            task.cancel()
        # return_exceptions=True collects each task's CancelledError (or any
        # other error it ended with) as a result, so one failing task cannot
        # prevent the other from being awaited.
        await asyncio.gather(*tasks, return_exceptions=True)
# FastAPI application object. The `lifespan` context manager defined in this
# module is registered here, so its startup/shutdown code runs with the app.
app = FastAPI(
title="Status Monitor",
description="Service health monitoring with persistence and alerting",
lifespan=lifespan
)
# Jinja2 template loader; the "templates" directory path is relative, so it is
# resolved against the process working directory.
templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse)
async def status_page(request: Request, period: int = 24):
    """Render the HTML status page.

    ``period`` is the history window in hours passed to
    ``monitor.get_all_statuses``. Only 1, 12 and 24 are accepted; any other
    value falls back to 24.
    """
    allowed_periods = (1, 12, 24)
    if period not in allowed_periods:
        period = 24

    services = monitor.get_all_statuses(period_hours=period)
    overall_status = monitor.get_overall_status()
    ssl_status = monitor.get_ssl_status()
    incidents = get_recent_incidents(limit=5)

    # Report the polling interval the background checker is currently using.
    fast_mode = has_issues()
    if fast_mode:
        current_interval = FAST_CHECK_INTERVAL
    else:
        current_interval = CHECK_INTERVAL

    grace_period_active = is_in_grace_period()
    if startup_time:
        seconds_up = (datetime.now() - startup_time).total_seconds()
        grace_period_remaining = max(0, STARTUP_GRACE_PERIOD - seconds_up)
    else:
        grace_period_remaining = 0

    context = {
        "request": request,
        "services": services,
        "overall_status": overall_status,
        "ssl_status": ssl_status,
        "incidents": incidents,
        "last_check": monitor.last_check,
        "check_interval": current_interval,
        "fast_mode": fast_mode,
        "grace_period_active": grace_period_active,
        "grace_period_remaining": int(grace_period_remaining),
        "period": period,
    }
    return templates.TemplateResponse("index.html", context)
@app.get("/api/status")
async def api_status():
    """Return the current service statuses as JSON.

    The response contains the overall status value, one ``to_dict()`` entry
    per service, the SSL status, the ISO-formatted time of the last check
    (``None`` if no check has run yet), the polling interval currently in
    effect and whether fast polling mode is active.
    """
    services = monitor.get_all_statuses()
    overall_status = monitor.get_overall_status()
    ssl_status = monitor.get_ssl_status()

    # Evaluate once so the interval and the fast_mode flag always agree.
    fast_mode = has_issues()
    current_interval = FAST_CHECK_INTERVAL if fast_mode else CHECK_INTERVAL
    last_check = monitor.last_check

    return {
        "overall_status": overall_status.value,
        "services": {name: status.to_dict() for name, status in services.items()},
        "ssl": ssl_status,
        "last_check": last_check.isoformat() if last_check else None,
        "check_interval_seconds": current_interval,
        "fast_mode": fast_mode,
    }
@app.get("/api/history/{service_name}")
async def api_history(service_name: str, hours: int = 24):
    """Return the latency history of one service as JSON.

    ``service_name`` comes from the URL path and ``hours`` from the query
    string (default 24); both are forwarded unchanged to
    ``get_latency_history`` and echoed back in the response.
    """
    return {
        "service": service_name,
        "hours": hours,
        "data": get_latency_history(service_name, hours=hours),
    }
@app.get("/api/incidents")
async def api_incidents(limit: int = 20):
    """Return the most recent incidents as JSON.

    ``limit`` (default 20) is forwarded unchanged to ``get_recent_incidents``.
    """
    return {"incidents": get_recent_incidents(limit=limit)}
@app.get("/api/health")
async def health():
    """Liveness probe for the status monitor itself; always reports "ok"."""
    payload = {
        "status": "ok",
        "service": "status-monitor",
    }
    return payload
@app.post("/api/refresh")
async def refresh_status():
    """Force an immediate check of all services.

    Runs the same ``monitor.check_all_services`` call as the periodic
    background task, including its ``suppress_alerts`` handling, so a manual
    refresh during the startup grace period does not bypass alert
    suppression.

    Returns ``{"status": "refreshed"}`` once the check has completed.
    """
    await monitor.check_all_services(
        backend_url=BACKEND_URL,
        frontend_url=FRONTEND_URL,
        bot_url=BOT_URL,
        external_url=EXTERNAL_URL,
        public_url=PUBLIC_URL,
        # Match periodic_health_check(): no alerts while services are still starting.
        suppress_alerts=is_in_grace_period(),
    )
    return {"status": "refreshed"}