# game-marathon/status-service/main.py

"""Status monitoring service with persistence and alerting."""
import os
import asyncio
from datetime import datetime
from typing import Optional
from contextlib import asynccontextmanager

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates

from monitors import ServiceMonitor, Status
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics


# --- Configuration (all values overridable through environment variables) ---

# Base URLs of the monitored services.
BACKEND_URL = os.environ.get("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.environ.get("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.environ.get("BOT_URL", "http://bot:8080")

# Public URL for external checks (empty string when unset).
EXTERNAL_URL = os.environ.get("EXTERNAL_URL", "")
# Public HTTPS URL for SSL checks (empty string when unset).
PUBLIC_URL = os.environ.get("PUBLIC_URL", "")

# Seconds between checks in normal operation (default: 1 minute).
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", "60"))
# Seconds between checks while an issue is detected.
FAST_CHECK_INTERVAL = int(os.environ.get("FAST_CHECK_INTERVAL", "5"))
# Seconds to wait after startup before alerting.
STARTUP_GRACE_PERIOD = int(os.environ.get("STARTUP_GRACE_PERIOD", "60"))

# Handles to the background loops; populated by lifespan() on startup.
background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None

# Moment the service started; stays None until lifespan() records it.
startup_time: Optional[datetime] = None

# Single shared monitor instance used by the background loop and the routes.
monitor = ServiceMonitor()


def has_issues() -> bool:
    """Return True when at least one monitored service is DOWN or DEGRADED.

    The "external" entry is ignored while its status is UNKNOWN, which is
    how an unconfigured external check shows up.
    """
    return any(
        entry.status in (Status.DOWN, Status.DEGRADED)
        for key, entry in monitor.services.items()
        # Skip external if not configured
        if not (key == "external" and entry.status == Status.UNKNOWN)
    )


async def periodic_health_check():
    """Background task that checks all services forever with adaptive polling.

    Each iteration runs ``monitor.check_all_services`` with alerts suppressed
    while the startup grace period is active, then sleeps for
    ``FAST_CHECK_INTERVAL`` seconds if ``has_issues()`` reports a problem and
    ``CHECK_INTERVAL`` seconds otherwise. Exceptions raised by a check are
    printed and do not stop the loop.
    """
    while True:
        try:
            # Suppress alerts during startup grace period
            suppress_alerts = is_in_grace_period()

            # is_in_grace_period() also returns True while startup_time is
            # still None; guard the subtraction so a missing start time
            # cannot raise TypeError and cause the check itself to be skipped.
            if suppress_alerts and startup_time is not None:
                elapsed = (datetime.now() - startup_time).total_seconds()
                # Clamp at zero: the clock advances between the two reads.
                remaining = max(0.0, STARTUP_GRACE_PERIOD - elapsed)
                print(f"Grace period: {remaining:.0f}s remaining (alerts suppressed)")

            await monitor.check_all_services(
                backend_url=BACKEND_URL,
                frontend_url=FRONTEND_URL,
                bot_url=BOT_URL,
                external_url=EXTERNAL_URL,
                public_url=PUBLIC_URL,
                suppress_alerts=suppress_alerts
            )
        except Exception as e:
            print(f"Health check error: {e}")

        # Adaptive polling: check more frequently when issues detected
        if has_issues():
            await asyncio.sleep(FAST_CHECK_INTERVAL)
        else:
            await asyncio.sleep(CHECK_INTERVAL)


async def periodic_cleanup():
    """Background task that prunes old metrics: once right away, then hourly.

    Metrics older than 24 hours are removed through ``cleanup_old_metrics``.
    Any exception is printed and the loop keeps running.
    """
    retention_hours = 24    # Keep only last 24 hours
    pause_seconds = 3600    # Wait 1 hour before next cleanup

    while True:
        try:
            removed = cleanup_old_metrics(hours=retention_hours)
            # Only report runs that actually removed something.
            if removed > 0:
                print(f"Cleaned up {removed} old metrics")
        except Exception as exc:
            print(f"Cleanup error: {exc}")
        await asyncio.sleep(pause_seconds)


def is_in_grace_period() -> bool:
    """Tell whether the startup grace period is still running.

    Returns True when no startup time has been recorded yet, or when fewer
    than ``STARTUP_GRACE_PERIOD`` seconds have passed since ``startup_time``.
    """
    return (
        startup_time is None
        or (datetime.now() - startup_time).total_seconds() < STARTUP_GRACE_PERIOD
    )


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run startup work, yield while the app serves, then stop background tasks.

    Startup: initializes the database, records ``startup_time`` (which starts
    the alert grace period) and launches the health-check and cleanup loops.

    Shutdown: cancels both loops and waits for them to finish. This runs in a
    ``finally`` block so the tasks are also stopped when the lifespan context
    exits through an exception.
    """
    global background_task, cleanup_task, startup_time

    # Initialize database
    init_db()
    print("Database initialized")

    # Mark startup time
    startup_time = datetime.now()
    print(f"Startup grace period: {STARTUP_GRACE_PERIOD}s (no alerts until services stabilize)")

    # Start background health checks
    background_task = asyncio.create_task(periodic_health_check())
    cleanup_task = asyncio.create_task(periodic_cleanup())

    try:
        yield
    finally:
        # Cancel background tasks on shutdown
        tasks = [task for task in (background_task, cleanup_task) if task is not None]
        for task in tasks:
            task.cancel()

        # return_exceptions=True collects each task's outcome instead of
        # raising, so a task that already died with an error cannot prevent
        # the other one from being awaited.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            # Cancellation is the expected outcome; report anything else.
            if isinstance(result, Exception) and not isinstance(result, asyncio.CancelledError):
                print(f"Background task error during shutdown: {result}")


# ASGI application; `lifespan` handles startup and shutdown of the
# background tasks.
app = FastAPI(
    lifespan=lifespan,
    title="Status Monitor",
    description="Service health monitoring with persistence and alerting",
)

# Template loader pointed at the relative "templates" directory.
templates = Jinja2Templates(directory="templates")


@app.get("/", response_class=HTMLResponse)
async def status_page(request: Request, period: int = 24):
    """Render the HTML status page.

    ``period`` selects the history window in hours. Only 1, 12 and 24 are
    accepted; any other value falls back to 24.
    """
    # Validate period (1, 12, or 24 hours)
    period = period if period in (1, 12, 24) else 24

    services = monitor.get_all_statuses(period_hours=period)
    overall_status = monitor.get_overall_status()
    ssl_status = monitor.get_ssl_status()
    incidents = get_recent_incidents(limit=5)

    # The page shows whichever polling interval the background loop is using.
    fast_mode = has_issues()
    if fast_mode:
        current_interval = FAST_CHECK_INTERVAL
    else:
        current_interval = CHECK_INTERVAL

    grace_period_active = is_in_grace_period()
    if startup_time:
        seconds_up = (datetime.now() - startup_time).total_seconds()
        grace_period_remaining = max(0, STARTUP_GRACE_PERIOD - seconds_up)
    else:
        # No recorded start time: report nothing remaining.
        grace_period_remaining = 0

    context = {
        "request": request,
        "services": services,
        "overall_status": overall_status,
        "ssl_status": ssl_status,
        "incidents": incidents,
        "last_check": monitor.last_check,
        "check_interval": current_interval,
        "fast_mode": fast_mode,
        "grace_period_active": grace_period_active,
        "grace_period_remaining": int(grace_period_remaining),
        "period": period,
    }
    return templates.TemplateResponse("index.html", context)


@app.get("/api/status")
async def api_status():
    """Return the current service statuses as JSON.

    The response contains the overall status value, each service's
    ``to_dict()`` representation keyed by name, the SSL status, the ISO
    timestamp of the last check (``None`` if no check has run), the polling
    interval currently in effect, and whether fast polling is active.
    """
    services = monitor.get_all_statuses()
    overall_status = monitor.get_overall_status()
    ssl_status = monitor.get_ssl_status()

    # Evaluate once so "check_interval_seconds" and "fast_mode" are derived
    # from the same observation, as the HTML status page does.
    fast_mode = has_issues()
    current_interval = FAST_CHECK_INTERVAL if fast_mode else CHECK_INTERVAL

    # Read once so the None test and the formatting use the same value.
    last_check = monitor.last_check

    return {
        "overall_status": overall_status.value,
        "services": {name: status.to_dict() for name, status in services.items()},
        "ssl": ssl_status,
        "last_check": last_check.isoformat() if last_check else None,
        "check_interval_seconds": current_interval,
        "fast_mode": fast_mode
    }


@app.get("/api/history/{service_name}")
async def api_history(service_name: str, hours: int = 24):
    """Return latency history for one service.

    Echoes the requested ``service_name`` and ``hours`` back together with
    whatever ``get_latency_history`` returns for them under ``"data"``.
    """
    return dict(
        service=service_name,
        hours=hours,
        data=get_latency_history(service_name, hours=hours),
    )


@app.get("/api/incidents")
async def api_incidents(limit: int = 20):
    """Return recent incidents, passing ``limit`` to ``get_recent_incidents``."""
    return {"incidents": get_recent_incidents(limit=limit)}


@app.get("/api/health")
async def health():
    """Liveness probe for the status monitor itself; always reports ok."""
    return dict(status="ok", service="status-monitor")


@app.post("/api/refresh")
async def refresh_status():
    """Force an immediate check of all services.

    Alerts are suppressed while the startup grace period is active, matching
    the periodic background check, so a manual refresh right after startup
    does not alert on services that are still coming up.

    Returns ``{"status": "refreshed"}`` once the check has completed.
    """
    await monitor.check_all_services(
        backend_url=BACKEND_URL,
        frontend_url=FRONTEND_URL,
        bot_url=BOT_URL,
        external_url=EXTERNAL_URL,
        public_url=PUBLIC_URL,
        # Same grace-period rule the background loop applies.
        suppress_alerts=is_in_grace_period()
    )
    return {"status": "refreshed"}