Files
game-marathon/status-service/main.py

166 lines
4.8 KiB
Python

"""Status monitoring service with persistence and alerting."""
import asyncio
import os
from contextlib import asynccontextmanager
from datetime import datetime
from typing import Optional

from fastapi import FastAPI, HTTPException, Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates

from database import (
    cleanup_old_metrics,
    get_latency_history,
    get_recent_incidents,
    init_db,
)
from monitors import ServiceMonitor
# Configuration
# All settings are read from the environment once, at import time.
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "")  # Public URL for external checks
PUBLIC_URL = os.getenv("PUBLIC_URL", "")  # Public HTTPS URL for SSL checks
# Seconds to wait between health-check rounds (default 30).
# Clamped to at least 1: asyncio.sleep() does not pause for a zero or negative
# delay, so such a value would make the background loop probe the services
# back-to-back. A non-integer value still raises ValueError at import time.
CHECK_INTERVAL = max(1, int(os.getenv("CHECK_INTERVAL", "30")))
# Initialize monitor
# Single module-level ServiceMonitor instance; the background health-check
# loop and the HTTP handlers in this module all use this same object.
monitor = ServiceMonitor()
# Background task reference
# Both stay None until lifespan() creates the tasks at application startup;
# lifespan() uses these references again to cancel the tasks on shutdown.
background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None
async def periodic_health_check():
    """Probe every configured service in an endless loop.

    Each round asks the shared monitor to check all targets and then waits
    ``CHECK_INTERVAL`` seconds. A round that raises is reported on stdout
    and the loop carries on with the next round.
    """
    while True:
        targets = {
            "backend_url": BACKEND_URL,
            "frontend_url": FRONTEND_URL,
            "bot_url": BOT_URL,
            "external_url": EXTERNAL_URL,
            "public_url": PUBLIC_URL,
        }
        try:
            await monitor.check_all_services(**targets)
        except Exception as e:
            # Boundary of a long-running task: report the failure, keep looping.
            print(f"Health check error: {e}")

        await asyncio.sleep(CHECK_INTERVAL)
async def periodic_cleanup():
    """Run the metrics cleanup once every 24 hours, forever.

    The loop sleeps first, so the first cleanup happens a full day after the
    task starts. Each pass calls ``cleanup_old_metrics(days=7)`` and prints
    the value it returns; a failing pass is printed and the loop continues.
    """
    pause_seconds = 86400  # 24 hours
    retention_days = 7

    while True:
        await asyncio.sleep(pause_seconds)
        try:
            deleted = cleanup_old_metrics(days=retention_days)
            print(f"Cleaned up {deleted} old metrics")
        except Exception as e:
            print(f"Cleanup error: {e}")
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work, hand control to the application, then shut down.

    Startup: initialise the database and start the two background tasks
    (periodic health checks and periodic metric cleanup), storing them in the
    module-level ``background_task`` / ``cleanup_task`` references.

    Shutdown: cancel both tasks and wait for them to finish. This runs in a
    ``finally`` block so the tasks are also stopped when an exception (or a
    cancellation) is thrown into the context manager at the ``yield`` point.

    Args:
        app: The FastAPI application this lifespan belongs to (not used here).

    Yields:
        None, once startup has completed.
    """
    global background_task, cleanup_task

    # Initialize database
    init_db()
    print("Database initialized")

    # Start background health checks
    background_task = asyncio.create_task(periodic_health_check())
    cleanup_task = asyncio.create_task(periodic_cleanup())

    try:
        yield
    finally:
        # Cancel background tasks on shutdown
        pending = [
            task for task in (background_task, cleanup_task) if task is not None
        ]
        for task in pending:
            task.cancel()

        # return_exceptions=True collects each task's outcome instead of
        # raising, so one task's failure cannot prevent waiting on the other.
        outcomes = await asyncio.gather(*pending, return_exceptions=True)
        for outcome in outcomes:
            # A CancelledError is the expected result of cancel(); anything
            # else is a real failure and should not vanish silently.
            if isinstance(outcome, Exception) and not isinstance(
                outcome, asyncio.CancelledError
            ):
                print(f"Background task error during shutdown: {outcome}")
# ASGI application object; startup and shutdown work is handled by lifespan().
app = FastAPI(
    title="Status Monitor",
    description="Service health monitoring with persistence and alerting",
    lifespan=lifespan,
)

# Template loader for the HTML status page. "templates" is a relative path,
# so it is resolved against the process working directory.
templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse)
async def status_page(request: Request):
    """Render the HTML status dashboard.

    The template ``index.html`` receives the current per-service statuses,
    the overall status, the SSL status, the five most recent incidents, the
    time of the last check and the configured check interval.
    """
    context = {
        "request": request,
        "services": monitor.get_all_statuses(),
        "overall_status": monitor.get_overall_status(),
        "ssl_status": monitor.get_ssl_status(),
        "incidents": get_recent_incidents(limit=5),
        "last_check": monitor.last_check,
        "check_interval": CHECK_INTERVAL,
    }
    return templates.TemplateResponse("index.html", context)
@app.get("/api/status")
async def api_status():
    """Return the current monitoring snapshot as JSON-serialisable data.

    The response carries the overall status value, one ``to_dict()`` entry
    per service, the SSL status, the ISO-formatted time of the last check
    (``None`` if no check has been recorded) and the check interval.
    """
    statuses = monitor.get_all_statuses()
    overall = monitor.get_overall_status()
    ssl_info = monitor.get_ssl_status()

    overall_value = overall.value

    per_service = {}
    for name, status in statuses.items():
        per_service[name] = status.to_dict()

    checked_at = monitor.last_check
    if checked_at:
        last_check = checked_at.isoformat()
    else:
        last_check = None

    return {
        "overall_status": overall_value,
        "services": per_service,
        "ssl": ssl_info,
        "last_check": last_check,
        "check_interval_seconds": CHECK_INTERVAL,
    }
@app.get("/api/history/{service_name}")
async def api_history(service_name: str, hours: int = 24):
    """Return the latency history of one service.

    ``service_name`` comes from the URL path and ``hours`` from the query
    string (default 24); both are passed to ``get_latency_history`` and
    echoed back next to the returned data.
    """
    samples = get_latency_history(service_name, hours=hours)
    response = {"service": service_name, "hours": hours, "data": samples}
    return response
@app.get("/api/incidents")
async def api_incidents(limit: int = 20):
    """Return the most recent incidents.

    Args:
        limit: Maximum number of incidents to return (query parameter,
            default 20). Must not be negative.

    Returns:
        A dict with the single key ``"incidents"`` holding whatever
        ``get_recent_incidents`` returns for this limit.

    Raises:
        HTTPException: With status 422 when ``limit`` is negative.
    """
    # The value comes straight from the client. A negative row limit is never
    # a meaningful request. NOTE(review): some SQL engines treat a negative
    # LIMIT as "no limit" -- confirm how database.get_recent_incidents uses it.
    if limit < 0:
        raise HTTPException(status_code=422, detail="limit must not be negative")

    incidents = get_recent_incidents(limit=limit)
    return {"incidents": incidents}
@app.get("/api/health")
async def health():
    """Report that this service itself is up.

    Always returns the same fixed payload; it does not look at the state of
    the monitored services.
    """
    payload = {"status": "ok", "service": "status-monitor"}
    return payload
@app.post("/api/refresh")
async def refresh_status():
    """Run one check of all services right now and confirm completion.

    The check uses the same configured URLs as the periodic background loop.
    The response is sent only after the check has finished.
    """
    targets = {
        "backend_url": BACKEND_URL,
        "frontend_url": FRONTEND_URL,
        "bot_url": BOT_URL,
        "external_url": EXTERNAL_URL,
        "public_url": PUBLIC_URL,
    }
    await monitor.check_all_services(**targets)
    return {"status": "refreshed"}