"""Status monitoring service with persistence and alerting.""" import os import asyncio from datetime import datetime from typing import Optional from contextlib import asynccontextmanager from fastapi import FastAPI, Request from fastapi.responses import HTMLResponse from fastapi.templating import Jinja2Templates from monitors import ServiceMonitor from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics # Configuration BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000") FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80") BOT_URL = os.getenv("BOT_URL", "http://bot:8080") EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "600")) # 10 minutes # Initialize monitor monitor = ServiceMonitor() # Background task reference background_task: Optional[asyncio.Task] = None cleanup_task: Optional[asyncio.Task] = None async def periodic_health_check(): """Background task to check services periodically.""" while True: try: await monitor.check_all_services( backend_url=BACKEND_URL, frontend_url=FRONTEND_URL, bot_url=BOT_URL, external_url=EXTERNAL_URL, public_url=PUBLIC_URL ) except Exception as e: print(f"Health check error: {e}") await asyncio.sleep(CHECK_INTERVAL) async def periodic_cleanup(): """Background task to cleanup old metrics (hourly).""" while True: await asyncio.sleep(3600) # 1 hour try: deleted = cleanup_old_metrics(days=1) # Keep only last 24 hours print(f"Cleaned up {deleted} old metrics") except Exception as e: print(f"Cleanup error: {e}") @asynccontextmanager async def lifespan(app: FastAPI): """Startup and shutdown events.""" global background_task, cleanup_task # Initialize database init_db() print("Database initialized") # Start background health checks background_task = asyncio.create_task(periodic_health_check()) cleanup_task = asyncio.create_task(periodic_cleanup()) yield # Cancel background tasks on shutdown for task in [background_task, cleanup_task]: if task: task.cancel() try: await task except asyncio.CancelledError: pass app = FastAPI( title="Status Monitor", description="Service health monitoring with persistence and alerting", lifespan=lifespan ) templates = Jinja2Templates(directory="templates") @app.get("/", response_class=HTMLResponse) async def status_page(request: Request): """Main status page.""" services = monitor.get_all_statuses() overall_status = monitor.get_overall_status() ssl_status = monitor.get_ssl_status() incidents = get_recent_incidents(limit=5) return templates.TemplateResponse( "index.html", { "request": request, "services": services, "overall_status": overall_status, "ssl_status": ssl_status, "incidents": incidents, "last_check": monitor.last_check, "check_interval": CHECK_INTERVAL } ) @app.get("/api/status") async def api_status(): """API endpoint for service statuses.""" services = monitor.get_all_statuses() overall_status = monitor.get_overall_status() ssl_status = monitor.get_ssl_status() return { "overall_status": overall_status.value, "services": {name: status.to_dict() for name, status in services.items()}, "ssl": ssl_status, "last_check": monitor.last_check.isoformat() if monitor.last_check else None, "check_interval_seconds": CHECK_INTERVAL } @app.get("/api/history/{service_name}") async def api_history(service_name: str, hours: int = 24): """API endpoint for service latency history.""" history = get_latency_history(service_name, hours=hours) return { "service": service_name, "hours": hours, "data": history } @app.get("/api/incidents") async def api_incidents(limit: int = 20): """API endpoint for recent incidents.""" incidents = get_recent_incidents(limit=limit) return {"incidents": incidents} @app.get("/api/health") async def health(): """Health check for this service.""" return {"status": "ok", "service": "status-monitor"} @app.post("/api/refresh") async def refresh_status(): """Force refresh all service statuses.""" await monitor.check_all_services( backend_url=BACKEND_URL, frontend_url=FRONTEND_URL, bot_url=BOT_URL, external_url=EXTERNAL_URL, public_url=PUBLIC_URL ) return {"status": "refreshed"}