219 lines
7.2 KiB
Python
219 lines
7.2 KiB
Python
"""Status monitoring service with persistence and alerting."""
|
|
import os
|
|
import asyncio
|
|
from datetime import datetime
|
|
from typing import Optional
|
|
from contextlib import asynccontextmanager
|
|
|
|
from fastapi import FastAPI, Request
|
|
from fastapi.responses import HTMLResponse
|
|
from fastapi.templating import Jinja2Templates
|
|
|
|
from monitors import ServiceMonitor, Status
|
|
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
|
|
|
|
|
|
# Configuration -- every value below can be overridden via an environment variable.

# Internal service endpoints handed to the monitor on each check.
BACKEND_URL = os.environ.get("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.environ.get("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.environ.get("BOT_URL", "http://bot:8080")

# Public-facing endpoints; both default to the empty string.
EXTERNAL_URL = os.environ.get("EXTERNAL_URL", "")  # public URL for external checks
PUBLIC_URL = os.environ.get("PUBLIC_URL", "")  # public HTTPS URL for SSL checks

# Timing knobs, in seconds. A non-integer value raises ValueError at import time.
CHECK_INTERVAL = int(os.environ.get("CHECK_INTERVAL", "60"))  # normal polling interval
FAST_CHECK_INTERVAL = int(os.environ.get("FAST_CHECK_INTERVAL", "5"))  # interval while issues are detected
STARTUP_GRACE_PERIOD = int(os.environ.get("STARTUP_GRACE_PERIOD", "60"))  # no alerts for this long after startup
|
|
|
|
# Single monitor instance shared by the background loop and the HTTP handlers.
monitor = ServiceMonitor()

# Moment the service started; stays None until lifespan() assigns it.
startup_time: Optional[datetime] = None

# Handles to the background asyncio tasks, kept so lifespan() can cancel them
# on shutdown. Both are None until the application has started.
background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None
|
|
|
|
|
|
def has_issues() -> bool:
    """Report whether at least one monitored service is DOWN or DEGRADED.

    The entry named "external" is ignored while its status is UNKNOWN
    (the original author's note: external is skipped when not configured).

    Returns:
        True if any remaining service has status ``Status.DOWN`` or
        ``Status.DEGRADED``; False otherwise.
    """
    problem_states = (Status.DOWN, Status.DEGRADED)
    return any(
        service.status in problem_states
        for label, service in monitor.services.items()
        if not (label == "external" and service.status == Status.UNKNOWN)
    )
|
|
|
|
|
|
async def periodic_health_check():
    """Background task: check all services in an endless loop with adaptive pacing.

    Each cycle calls ``monitor.check_all_services`` with the configured URLs,
    passing ``suppress_alerts=True`` while ``is_in_grace_period()`` is true.
    Any ``Exception`` raised during a cycle is printed and the loop carries
    on. After every cycle the task sleeps ``FAST_CHECK_INTERVAL`` seconds if
    ``has_issues()`` reports a problem, otherwise ``CHECK_INTERVAL`` seconds.

    The coroutine never returns on its own; it ends only when cancelled.
    """
    while True:
        try:
            # Suppress alerts during the startup grace period.
            suppress_alerts = is_in_grace_period()

            # is_in_grace_period() is also True while startup_time is still
            # None. There is nothing to count down from in that case, so skip
            # the message instead of raising TypeError on the subtraction --
            # that error would be swallowed below and the whole check cycle
            # would be lost.
            if suppress_alerts and startup_time is not None:
                elapsed = (datetime.now() - startup_time).total_seconds()
                # Clamp: the clock may tick past the deadline between the
                # grace-period test above and this measurement.
                remaining = max(0.0, STARTUP_GRACE_PERIOD - elapsed)
                print(f"Grace period: {remaining:.0f}s remaining (alerts suppressed)")

            await monitor.check_all_services(
                backend_url=BACKEND_URL,
                frontend_url=FRONTEND_URL,
                bot_url=BOT_URL,
                external_url=EXTERNAL_URL,
                public_url=PUBLIC_URL,
                suppress_alerts=suppress_alerts,
            )
        except Exception as e:
            # Keep the loop alive: one failed cycle must not stop monitoring.
            print(f"Health check error: {e}")

        # Adaptive polling: check more frequently while issues are detected.
        if has_issues():
            await asyncio.sleep(FAST_CHECK_INTERVAL)
        else:
            await asyncio.sleep(CHECK_INTERVAL)
|
|
|
|
|
|
async def periodic_cleanup():
    """Background task: purge old metrics right away, then once per hour.

    Every pass calls ``cleanup_old_metrics(hours=24)`` and prints the number
    of removed rows when it is positive. Any ``Exception`` raised during a
    pass is printed and the loop continues. The coroutine never returns on
    its own; it ends only when cancelled.
    """
    retention_hours = 24  # keep only the last 24 hours of metrics
    pause_seconds = 3600  # one hour between passes

    while True:
        try:
            deleted = cleanup_old_metrics(hours=retention_hours)
            if deleted > 0:
                print(f"Cleaned up {deleted} old metrics")
        except Exception as e:
            print(f"Cleanup error: {e}")

        await asyncio.sleep(pause_seconds)
|
|
|
|
|
|
def is_in_grace_period() -> bool:
    """Tell whether the startup grace period is still running.

    Returns:
        True when ``startup_time`` has not been set yet, or when fewer than
        ``STARTUP_GRACE_PERIOD`` seconds have passed since it; False otherwise.
    """
    if startup_time is not None:
        seconds_since_start = (datetime.now() - startup_time).total_seconds()
        return seconds_since_start < STARTUP_GRACE_PERIOD

    # No recorded start yet: treat the service as still starting up.
    return True
|
|
|
|
|
|
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Run application startup and shutdown work around the serving phase.

    Startup: initialises the database, records ``startup_time`` (the start of
    the alert grace period) and launches the health-check and cleanup
    background tasks, storing their handles in the module-level globals.

    Shutdown: cancels both background tasks and waits for them to finish.
    This runs in a ``finally`` block so the tasks are also stopped when an
    exception is thrown into the context manager at the ``yield``.

    Args:
        app: The FastAPI application instance (not used by the body).
    """
    global background_task, cleanup_task, startup_time

    # Initialize database
    init_db()
    print("Database initialized")

    # Mark startup time
    startup_time = datetime.now()
    print(f"Startup grace period: {STARTUP_GRACE_PERIOD}s (no alerts until services stabilize)")

    # Start background tasks
    background_task = asyncio.create_task(periodic_health_check())
    cleanup_task = asyncio.create_task(periodic_cleanup())

    try:
        yield
    finally:
        # Request cancellation of every task first, then wait for all of them,
        # so a problem with one task cannot leave the other one running.
        tasks = [task for task in (background_task, cleanup_task) if task]
        for task in tasks:
            task.cancel()

        # return_exceptions=True collects each task's outcome instead of
        # raising: CancelledError is the expected result, and an error from a
        # task that had already failed must not abort shutdown.
        results = await asyncio.gather(*tasks, return_exceptions=True)
        for result in results:
            if isinstance(result, BaseException) and not isinstance(result, asyncio.CancelledError):
                print(f"Background task error during shutdown: {result}")
|
|
|
|
|
|
# FastAPI application; startup and shutdown work is delegated to lifespan().
app = FastAPI(
    lifespan=lifespan,
    title="Status Monitor",
    description="Service health monitoring with persistence and alerting",
)

# Jinja2 template loader rooted at the "templates" directory.
templates = Jinja2Templates(directory="templates")
|
|
|
|
|
|
@app.get("/", response_class=HTMLResponse)
async def status_page(request: Request, period: int = 24):
    """Render the HTML status dashboard from ``index.html``.

    Args:
        request: Incoming request, passed through to the template context.
        period: History window in hours. Only 1, 12 and 24 are accepted;
            any other value falls back to 24.

    Returns:
        The template response carrying service statuses, overall and SSL
        status, the five most recent incidents, polling information and the
        grace-period state.
    """
    allowed_periods = (1, 12, 24)
    if period not in allowed_periods:
        period = 24

    context = {"request": request}
    context["services"] = monitor.get_all_statuses(period_hours=period)
    context["overall_status"] = monitor.get_overall_status()
    context["ssl_status"] = monitor.get_ssl_status()
    context["incidents"] = get_recent_incidents(limit=5)

    # Same rule the background loop uses to pick its sleep interval.
    fast_mode = has_issues()
    if fast_mode:
        current_interval = FAST_CHECK_INTERVAL
    else:
        current_interval = CHECK_INTERVAL

    grace_period_active = is_in_grace_period()

    # Seconds left in the grace period, never negative; 0 when no start
    # time has been recorded.
    if startup_time:
        uptime_seconds = (datetime.now() - startup_time).total_seconds()
        grace_period_remaining = max(0, STARTUP_GRACE_PERIOD - uptime_seconds)
    else:
        grace_period_remaining = 0

    context["last_check"] = monitor.last_check
    context["check_interval"] = current_interval
    context["fast_mode"] = fast_mode
    context["grace_period_active"] = grace_period_active
    context["grace_period_remaining"] = int(grace_period_remaining)
    context["period"] = period

    return templates.TemplateResponse("index.html", context)
|
|
|
|
|
|
@app.get("/api/status")
async def api_status():
    """Return the current service statuses as JSON.

    Returns:
        A dict with the overall status value, each service's ``to_dict()``
        output keyed by service name, the SSL status, the ISO-formatted time
        of the last check (``None`` if no check has run yet), the polling
        interval currently in effect and whether fast mode is active.
    """
    services = monitor.get_all_statuses()
    overall_status = monitor.get_overall_status()
    ssl_status = monitor.get_ssl_status()

    # Evaluate has_issues() once so "fast_mode" and "check_interval_seconds"
    # are always derived from the same result (matches status_page).
    fast_mode = has_issues()
    current_interval = FAST_CHECK_INTERVAL if fast_mode else CHECK_INTERVAL

    # Read the attribute once so the None test and the formatting use the same value.
    last_check = monitor.last_check

    return {
        "overall_status": overall_status.value,
        "services": {name: status.to_dict() for name, status in services.items()},
        "ssl": ssl_status,
        "last_check": last_check.isoformat() if last_check else None,
        "check_interval_seconds": current_interval,
        "fast_mode": fast_mode,
    }
|
|
|
|
|
|
@app.get("/api/history/{service_name}")
async def api_history(service_name: str, hours: int = 24):
    """Return the latency history of one service as JSON.

    Args:
        service_name: Service identifier taken from the URL path.
        hours: Size of the history window in hours (default 24).

    Returns:
        A dict echoing ``service`` and ``hours`` plus the ``data`` returned
        by ``get_latency_history``.
    """
    # NOTE(review): unlike `period` in status_page, `hours` is passed through
    # unvalidated (zero/negative/very large values) -- confirm that
    # get_latency_history tolerates that.
    payload = {"service": service_name, "hours": hours}
    payload["data"] = get_latency_history(service_name, hours=hours)
    return payload
|
|
|
|
|
|
@app.get("/api/incidents")
async def api_incidents(limit: int = 20):
    """Return recent incidents as JSON.

    Args:
        limit: Maximum number of incidents requested (default 20).

    Returns:
        A dict with a single ``incidents`` key holding the result of
        ``get_recent_incidents``.
    """
    # NOTE(review): `limit` is forwarded unvalidated -- confirm how
    # get_recent_incidents treats zero or negative values.
    return {"incidents": get_recent_incidents(limit=limit)}
|
|
|
|
|
|
@app.get("/api/health")
async def health():
    """Liveness probe for the status monitor itself.

    Returns:
        A constant payload; it does not depend on the monitored services.
    """
    payload = dict(status="ok", service="status-monitor")
    return payload
|
|
|
|
|
|
@app.post("/api/refresh")
async def refresh_status():
    """Force an immediate check of all services.

    Runs the same ``monitor.check_all_services`` call as the background loop,
    including alert suppression while the startup grace period is active, so
    a manual refresh right after startup cannot bypass the grace period.

    Returns:
        ``{"status": "refreshed"}`` once the check has completed.
    """
    await monitor.check_all_services(
        backend_url=BACKEND_URL,
        frontend_url=FRONTEND_URL,
        bot_url=BOT_URL,
        external_url=EXTERNAL_URL,
        public_url=PUBLIC_URL,
        # Keep in step with periodic_health_check: no alerts during the grace period.
        suppress_alerts=is_in_grace_period(),
    )
    return {"status": "refreshed"}
|