"""Health monitoring for the backend, database, frontend and Telegram bot.

Each service is probed over HTTP; the latest result, the measured latency and
a running uptime percentage are kept in memory on a ``ServiceMonitor``.
"""

import asyncio
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Optional

import httpx

# Per-request timeout, in seconds, applied to every health probe.
_REQUEST_TIMEOUT_S = 10.0
# Error messages stored on a service are truncated to this many characters.
_MAX_MESSAGE_LEN = 100


class Status(str, Enum):
    """Health state of a single service or of the system as a whole."""

    OPERATIONAL = "operational"
    DEGRADED = "degraded"
    DOWN = "down"
    UNKNOWN = "unknown"


@dataclass
class ServiceStatus:
    """Latest known state of one monitored service plus uptime counters."""

    name: str
    display_name: str
    status: Status = Status.UNKNOWN
    latency_ms: Optional[float] = None
    last_check: Optional[datetime] = None
    last_incident: Optional[datetime] = None
    uptime_percent: float = 100.0
    message: Optional[str] = None
    version: Optional[str] = None

    # For uptime calculation
    total_checks: int = 0
    successful_checks: int = 0

    def to_dict(self) -> dict:
        """Return a JSON-serializable snapshot of this service.

        The uptime counters (``total_checks`` / ``successful_checks``) are
        not included; only the derived ``uptime_percent`` is.
        """
        return {
            "name": self.name,
            "display_name": self.display_name,
            "status": self.status.value,
            # Compare against None explicitly: a measured latency of 0.0 is a
            # real value and must not be reported as "no measurement".
            "latency_ms": (
                round(self.latency_ms, 2)
                if self.latency_ms is not None
                else None
            ),
            "last_check": (
                self.last_check.isoformat() if self.last_check else None
            ),
            "last_incident": (
                self.last_incident.isoformat() if self.last_incident else None
            ),
            "uptime_percent": round(self.uptime_percent, 2),
            "message": self.message,
            "version": self.version,
        }

    def update_uptime(self, is_success: bool):
        """Record one check outcome and recompute ``uptime_percent``."""
        self.total_checks += 1
        if is_success:
            self.successful_checks += 1
        if self.total_checks > 0:
            self.uptime_percent = (
                self.successful_checks / self.total_checks
            ) * 100


def _extract_version(response: httpx.Response) -> Optional[str]:
    """Return the ``version`` field of a JSON health payload, if present.

    A body that is not valid JSON, or whose top-level value is not an
    object, yields ``None`` instead of raising, so that an otherwise healthy
    HTTP 200 response is not misreported as an outage.
    """
    try:
        payload = response.json()
    except ValueError:
        return None
    if isinstance(payload, dict):
        return payload.get("version")
    return None


class ServiceMonitor:
    """Probe a fixed set of services and keep their latest status in memory."""

    def __init__(self):
        self.services: dict[str, ServiceStatus] = {
            "backend": ServiceStatus(
                name="backend",
                display_name="Backend API",
            ),
            "database": ServiceStatus(
                name="database",
                display_name="Database",
            ),
            "frontend": ServiceStatus(
                name="frontend",
                display_name="Frontend",
            ),
            "bot": ServiceStatus(
                name="bot",
                display_name="Telegram Bot",
            ),
        }
        self.last_check: Optional[datetime] = None

    async def _probe(
        self, url: str
    ) -> tuple[Status, Optional[float], Optional[str], Optional[httpx.Response]]:
        """GET ``url`` once and classify the outcome.

        Returns:
            ``(status, latency_ms, message, response)``. HTTP 200 maps to
            OPERATIONAL, any other HTTP status to DEGRADED with an
            ``"HTTP <code>"`` message. A timeout or any other error maps to
            DOWN with no latency and ``response`` set to ``None``.
        """
        try:
            async with httpx.AsyncClient(timeout=_REQUEST_TIMEOUT_S) as client:
                # perf_counter() is monotonic, so the measured latency cannot
                # be skewed by wall-clock adjustments during the request.
                started = time.perf_counter()
                response = await client.get(url)
                latency = (time.perf_counter() - started) * 1000
        except httpx.TimeoutException:
            return Status.DOWN, None, "Timeout", None
        except Exception as exc:
            return Status.DOWN, None, str(exc)[:_MAX_MESSAGE_LEN], None

        if response.status_code == 200:
            return Status.OPERATIONAL, latency, None, response
        return Status.DEGRADED, latency, f"HTTP {response.status_code}", response

    async def check_backend(
        self, url: str
    ) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
        """Check backend API health.

        Returns:
            ``(status, latency_ms, message, version)``. ``version`` is read
            from the JSON body of a 200 response and is ``None`` when the
            body has no usable ``version`` field or the check did not
            succeed.
        """
        status, latency, message, response = await self._probe(f"{url}/health")
        version = None
        if status is Status.OPERATIONAL and response is not None:
            version = _extract_version(response)
        return status, latency, message, version

    async def check_database(
        self, backend_url: str
    ) -> tuple[Status, Optional[float], Optional[str]]:
        """Check database through backend.

        Returns:
            ``(status, latency_ms, message)``. Any non-200 answer from the
            backend health endpoint is reported as DOWN, as is a failure to
            reach the backend at all.
        """
        # We check database indirectly - if backend is up, DB is likely up
        # Could add a specific /health/db endpoint to backend later
        status, latency, _, response = await self._probe(
            f"{backend_url}/health"
        )
        if response is None:
            return Status.DOWN, None, "Cannot reach backend"
        if status is Status.OPERATIONAL:
            return Status.OPERATIONAL, latency, None
        return Status.DOWN, latency, "Backend reports unhealthy"

    async def check_frontend(
        self, url: str
    ) -> tuple[Status, Optional[float], Optional[str]]:
        """Check frontend availability by requesting ``url`` itself.

        Returns:
            ``(status, latency_ms, message)``.
        """
        status, latency, message, _ = await self._probe(url)
        return status, latency, message

    async def check_bot(
        self, url: str
    ) -> tuple[Status, Optional[float], Optional[str]]:
        """Check Telegram bot health via its ``/health`` endpoint.

        Returns:
            ``(status, latency_ms, message)``.
        """
        status, latency, message, _ = await self._probe(f"{url}/health")
        return status, latency, message

    @staticmethod
    def _record(
        svc: ServiceStatus,
        checked_at: datetime,
        status: Status,
        latency: Optional[float],
        message: Optional[str],
    ) -> None:
        """Store one check result on ``svc`` and update its uptime counters."""
        was_down = svc.status == Status.DOWN
        svc.status = status
        svc.latency_ms = latency
        svc.message = message
        svc.last_check = checked_at
        svc.update_uptime(status == Status.OPERATIONAL)
        # The incident timestamp is set on any non-operational result unless
        # the service was already DOWN before this check.
        # NOTE(review): a service that stays DEGRADED therefore refreshes
        # last_incident on every check - confirm that this is intended.
        if status != Status.OPERATIONAL and not was_down:
            svc.last_incident = checked_at

    async def check_all_services(
        self, backend_url: str, frontend_url: str, bot_url: str
    ):
        """Check all services concurrently and store the results.

        Args:
            backend_url: Base URL of the backend; ``/health`` is appended.
            frontend_url: URL of the frontend, requested as given.
            bot_url: Base URL of the bot; ``/health`` is appended.
        """
        now = datetime.now()

        # Run all checks concurrently
        backend, database, frontend, bot = await asyncio.gather(
            self.check_backend(backend_url),
            self.check_database(backend_url),
            self.check_frontend(frontend_url),
            self.check_bot(bot_url),
            return_exceptions=True,
        )

        # gather() may hand back BaseException instances such as
        # CancelledError, which are not Exception subclasses. A failed check
        # leaves the previously recorded state of that service untouched.
        if not isinstance(backend, BaseException):
            status, latency, message, version = backend
            self._record(self.services["backend"], now, status, latency, message)
            self.services["backend"].version = version

        outcomes = (
            ("database", database),
            ("frontend", frontend),
            ("bot", bot),
        )
        for service_name, outcome in outcomes:
            if isinstance(outcome, BaseException):
                continue
            status, latency, message = outcome
            self._record(
                self.services[service_name], now, status, latency, message
            )

        self.last_check = now

    def get_all_statuses(self) -> dict[str, ServiceStatus]:
        """Return the live mapping of service name to its current status."""
        return self.services

    def get_overall_status(self) -> Status:
        """Get overall system status based on all services.

        OPERATIONAL when every service is operational; otherwise DOWN if any
        service is down, then DEGRADED if any is degraded, else UNKNOWN.
        """
        statuses = [svc.status for svc in self.services.values()]

        if all(s == Status.OPERATIONAL for s in statuses):
            return Status.OPERATIONAL
        if Status.DOWN in statuses:
            return Status.DOWN
        if Status.DEGRADED in statuses:
            return Status.DEGRADED
        return Status.UNKNOWN