Fix service status

This commit is contained in:
2025-12-20 02:28:41 +07:00
parent c645171671
commit 243abe55b5
5 changed files with 210 additions and 33 deletions

View File

@@ -9,7 +9,7 @@ from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from monitors import ServiceMonitor
from monitors import ServiceMonitor, Status
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
@@ -19,52 +19,91 @@ FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "")  # Public URL for external checks
PUBLIC_URL = os.getenv("PUBLIC_URL", "")  # Public HTTPS URL for SSL checks

# Timing settings, all in seconds and overridable through the environment.
# CHECK_INTERVAL is assigned exactly once; the earlier 600s default was dead
# code because it was immediately overwritten by the 60s one.
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "60"))  # Normal interval (1 minute)
FAST_CHECK_INTERVAL = int(os.getenv("FAST_CHECK_INTERVAL", "5"))  # Fast interval when issues detected
STARTUP_GRACE_PERIOD = int(os.getenv("STARTUP_GRACE_PERIOD", "60"))  # Wait before alerting after startup

# Initialize monitor
monitor = ServiceMonitor()
startup_time: Optional[datetime] = None  # Track when service started

# Background task reference
background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None
def has_issues() -> bool:
    """Report whether any monitored service is currently DOWN or DEGRADED.

    The "external" entry is ignored while its status is UNKNOWN, which is how
    an unconfigured external check is treated here.
    """
    problem_states = (Status.DOWN, Status.DEGRADED)
    return any(
        service.status in problem_states
        for label, service in monitor.services.items()
        # Skip external if not configured
        if not (label == "external" and service.status == Status.UNKNOWN)
    )
async def periodic_health_check():
    """Background task to check services periodically with adaptive polling.

    Loops forever. Each cycle calls ``monitor.check_all_services`` with the
    configured URLs, passing ``suppress_alerts=True`` while the startup grace
    period is active. It then sleeps ``FAST_CHECK_INTERVAL`` seconds when
    ``has_issues()`` reports a problem, otherwise ``CHECK_INTERVAL`` seconds.
    """
    while True:
        try:
            # Suppress alerts during startup grace period
            suppress_alerts = is_in_grace_period()
            # The grace period is also reported as active while startup_time is
            # still None; in that case there is no countdown to print, and the
            # subtraction would raise and skip the health check for this cycle.
            if suppress_alerts and startup_time is not None:
                remaining = STARTUP_GRACE_PERIOD - (datetime.now() - startup_time).total_seconds()
                print(f"Grace period: {remaining:.0f}s remaining (alerts suppressed)")

            await monitor.check_all_services(
                backend_url=BACKEND_URL,
                frontend_url=FRONTEND_URL,
                bot_url=BOT_URL,
                external_url=EXTERNAL_URL,
                public_url=PUBLIC_URL,
                suppress_alerts=suppress_alerts,
            )
        except Exception as e:
            # A failed cycle is logged and the loop keeps running.
            print(f"Health check error: {e}")

        # Adaptive polling: check more frequently when issues detected
        interval = FAST_CHECK_INTERVAL if has_issues() else CHECK_INTERVAL
        await asyncio.sleep(interval)
async def periodic_cleanup():
    """Background task to cleanup old metrics (runs immediately, then hourly).

    Loops forever. Each cycle calls ``cleanup_old_metrics(hours=24)``, logs the
    number of deleted rows when it is non-zero, and then sleeps for one hour.
    Errors are logged and do not stop the loop.
    """
    while True:
        # Clean first and sleep afterwards, so the first pass happens right at
        # startup instead of one hour in.
        try:
            deleted = cleanup_old_metrics(hours=24)  # Keep only last 24 hours
            if deleted > 0:
                print(f"Cleaned up {deleted} old metrics")
        except Exception as e:
            print(f"Cleanup error: {e}")

        await asyncio.sleep(3600)  # Wait 1 hour before next cleanup
def is_in_grace_period() -> bool:
    """Tell whether the startup grace period is still running.

    Returns True when ``startup_time`` has not been set yet, or when fewer than
    ``STARTUP_GRACE_PERIOD`` seconds have elapsed since ``startup_time``.
    """
    if startup_time is not None:
        seconds_up = (datetime.now() - startup_time).total_seconds()
        return seconds_up < STARTUP_GRACE_PERIOD
    # No recorded start time yet: treat the service as still starting up.
    return True
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Startup and shutdown events."""
global background_task, cleanup_task
global background_task, cleanup_task, startup_time
# Initialize database
init_db()
print("Database initialized")
# Mark startup time
startup_time = datetime.now()
print(f"Startup grace period: {STARTUP_GRACE_PERIOD}s (no alerts until services stabilize)")
# Start background health checks
background_task = asyncio.create_task(periodic_health_check())
cleanup_task = asyncio.create_task(periodic_cleanup())
@@ -91,12 +130,20 @@ templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse)
async def status_page(request: Request):
async def status_page(request: Request, period: int = 24):
"""Main status page."""
services = monitor.get_all_statuses()
# Validate period (1, 12, or 24 hours)
if period not in (1, 12, 24):
period = 24
services = monitor.get_all_statuses(period_hours=period)
overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
incidents = get_recent_incidents(limit=5)
fast_mode = has_issues()
current_interval = FAST_CHECK_INTERVAL if fast_mode else CHECK_INTERVAL
grace_period_active = is_in_grace_period()
grace_period_remaining = max(0, STARTUP_GRACE_PERIOD - (datetime.now() - startup_time).total_seconds()) if startup_time else 0
return templates.TemplateResponse(
"index.html",
@@ -107,7 +154,11 @@ async def status_page(request: Request):
"ssl_status": ssl_status,
"incidents": incidents,
"last_check": monitor.last_check,
"check_interval": CHECK_INTERVAL
"check_interval": current_interval,
"fast_mode": fast_mode,
"grace_period_active": grace_period_active,
"grace_period_remaining": int(grace_period_remaining),
"period": period
}
)
@@ -118,13 +169,15 @@ async def api_status():
services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
current_interval = FAST_CHECK_INTERVAL if has_issues() else CHECK_INTERVAL
return {
"overall_status": overall_status.value,
"services": {name: status.to_dict() for name, status in services.items()},
"ssl": ssl_status,
"last_check": monitor.last_check.isoformat() if monitor.last_check else None,
"check_interval_seconds": CHECK_INTERVAL
"check_interval_seconds": current_interval,
"fast_mode": has_issues()
}