"""Status monitoring service with persistence and alerting.""" import os import asyncio from datetime import datetime from typing import Optional from contextlib import asynccontextmanager from fastapi import FastAPI, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates from monitors import ServiceMonitor, Status from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics # Configuration BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000") FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80") BOT_URL = os.getenv("BOT_URL", "http://bot:8080") EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "60")) # Normal interval (1 minute) FAST_CHECK_INTERVAL = int(os.getenv("FAST_CHECK_INTERVAL", "5")) # Fast interval when issues detected STARTUP_GRACE_PERIOD = int(os.getenv("STARTUP_GRACE_PERIOD", "60")) # Wait before alerting after startup # Initialize monitor monitor = ServiceMonitor() startup_time: Optional[datetime] = None # Track when service started # Background task reference background_task: Optional[asyncio.Task] = None cleanup_task: Optional[asyncio.Task] = None def has_issues() -> bool: """Check if any monitored service has issues.""" for name, svc in monitor.services.items(): # Skip external if not configured if name == "external" and svc.status == Status.UNKNOWN: continue if svc.status in (Status.DOWN, Status.DEGRADED): return True return False async def periodic_health_check(): """Background task to check services periodically with adaptive polling.""" while True: try: # Suppress alerts during startup grace period suppress_alerts = is_in_grace_period() if suppress_alerts: remaining = STARTUP_GRACE_PERIOD - (datetime.now() - startup_time).total_seconds() print(f"Grace period: {remaining:.0f}s remaining (alerts suppressed)") await monitor.check_all_services( backend_url=BACKEND_URL, frontend_url=FRONTEND_URL, bot_url=BOT_URL, external_url=EXTERNAL_URL, public_url=PUBLIC_URL, suppress_alerts=suppress_alerts ) except Exception as e: print(f"Health check error: {e}") # Adaptive polling: check more frequently when issues detected if has_issues(): await asyncio.sleep(FAST_CHECK_INTERVAL) else: await asyncio.sleep(CHECK_INTERVAL) async def periodic_cleanup(): """Background task to cleanup old metrics (runs immediately, then hourly).""" while True: try: deleted = cleanup_old_metrics(hours=24) # Keep only last 24 hours if deleted > 0: print(f"Cleaned up {deleted} old metrics") except Exception as e: print(f"Cleanup error: {e}") await asyncio.sleep(3600) # Wait 1 hour before next cleanup def is_in_grace_period() -> bool: """Check if we're still in startup grace period.""" if startup_time is None: return True elapsed = (datetime.now() - startup_time).total_seconds() return elapsed < STARTUP_GRACE_PERIOD @asynccontextmanager async def lifespan(app: FastAPI): """Startup and shutdown events.""" global background_task, cleanup_task, startup_time # Initialize database init_db() print("Database initialized") # Mark startup time startup_time = datetime.now() print(f"Startup grace period: {STARTUP_GRACE_PERIOD}s (no alerts until services stabilize)") # Start background health checks background_task = asyncio.create_task(periodic_health_check()) cleanup_task = asyncio.create_task(periodic_cleanup()) yield # Cancel background tasks on shutdown for task in [background_task, cleanup_task]: if task: task.cancel() try: await task except asyncio.CancelledError: pass app = FastAPI( title="Status Monitor", description="Service health monitoring with persistence and alerting", lifespan=lifespan ) templates = Jinja2Templates(directory="templates") @app.get("/", response_class=HTMLResponse) async def status_page(request: Request, period: int = 24): """Main status page.""" # Validate period (1, 12, or 24 hours) if period not in (1, 12, 24): period = 24 services = monitor.get_all_statuses(period_hours=period) overall_status = monitor.get_overall_status() ssl_status = monitor.get_ssl_status() incidents = get_recent_incidents(limit=5) fast_mode = has_issues() current_interval = FAST_CHECK_INTERVAL if fast_mode else CHECK_INTERVAL grace_period_active = is_in_grace_period() grace_period_remaining = max(0, STARTUP_GRACE_PERIOD - (datetime.now() - startup_time).total_seconds()) if startup_time else 0 return templates.TemplateResponse( "index.html", { "request": request, "services": services, "overall_status": overall_status, "ssl_status": ssl_status, "incidents": incidents, "last_check": monitor.last_check, "check_interval": current_interval, "fast_mode": fast_mode, "grace_period_active": grace_period_active, "grace_period_remaining": int(grace_period_remaining), "period": period } ) @app.get("/api/status") async def api_status(): """API endpoint for service statuses.""" services = monitor.get_all_statuses() overall_status = monitor.get_overall_status() ssl_status = monitor.get_ssl_status() current_interval = FAST_CHECK_INTERVAL if has_issues() else CHECK_INTERVAL return { "overall_status": overall_status.value, "services": {name: status.to_dict() for name, status in services.items()}, "ssl": ssl_status, "last_check": monitor.last_check.isoformat() if monitor.last_check else None, "check_interval_seconds": current_interval, "fast_mode": has_issues() } @app.get("/api/history/{service_name}") async def api_history(service_name: str, hours: int = 24): """API endpoint for service latency history.""" history = get_latency_history(service_name, hours=hours) return { "service": service_name, "hours": hours, "data": history } @app.get("/api/incidents") async def api_incidents(limit: int = 20): """API endpoint for recent incidents.""" incidents = get_recent_incidents(limit=limit) return {"incidents": incidents} @app.get("/api/health") async def health(): """Health check for this service.""" return {"status": "ok", "service": "status-monitor"} @app.post("/api/refresh") async def refresh_status(): """Force refresh all service statuses.""" await monitor.check_all_services( backend_url=BACKEND_URL, frontend_url=FRONTEND_URL, bot_url=BOT_URL, external_url=EXTERNAL_URL, public_url=PUBLIC_URL ) return {"status": "refreshed"}