service
This commit is contained in:
227
status-service/monitors.py
Normal file
227
status-service/monitors.py
Normal file
@@ -0,0 +1,227 @@
|
||||
import asyncio
import time
from dataclasses import dataclass, field
from datetime import datetime, timedelta
from enum import Enum
from typing import Optional

import httpx
|
||||
|
||||
|
||||
class Status(str, Enum):
    """Health state reported for a monitored service.

    The ``str`` mixin makes each member compare equal to its plain string
    value, and ``.value`` yields that string directly.
    """

    # Health check succeeded.
    OPERATIONAL = "operational"
    # Service answered, but not with the expected healthy response.
    DEGRADED = "degraded"
    # Service could not be reached or reported itself unhealthy.
    DOWN = "down"
    # No check result has been recorded yet.
    UNKNOWN = "unknown"
|
||||
|
||||
|
||||
@dataclass
class ServiceStatus:
    """Latest health snapshot and running uptime counters for one service.

    ``name`` and ``display_name`` identify the service; every other field
    starts from a default and is filled in as checks are recorded.
    """

    name: str
    display_name: str
    status: Status = Status.UNKNOWN
    latency_ms: Optional[float] = None
    last_check: Optional[datetime] = None
    last_incident: Optional[datetime] = None
    uptime_percent: float = 100.0
    message: Optional[str] = None
    version: Optional[str] = None

    # For uptime calculation
    total_checks: int = 0
    successful_checks: int = 0

    def to_dict(self) -> dict:
        """Return a plain-dict view of the status.

        Timestamps are rendered with ``isoformat()``; latency and uptime are
        rounded to two decimals. Unset optional fields are emitted as
        ``None``. The check counters are not included.
        """
        # Compare against None explicitly: a measured latency of 0.0 is a
        # valid reading and must not be reported as "no measurement".
        latency = round(self.latency_ms, 2) if self.latency_ms is not None else None
        last_check = self.last_check.isoformat() if self.last_check is not None else None
        last_incident = (
            self.last_incident.isoformat() if self.last_incident is not None else None
        )
        return {
            "name": self.name,
            "display_name": self.display_name,
            "status": self.status.value,
            "latency_ms": latency,
            "last_check": last_check,
            "last_incident": last_incident,
            "uptime_percent": round(self.uptime_percent, 2),
            "message": self.message,
            "version": self.version,
        }

    def update_uptime(self, is_success: bool) -> None:
        """Record one check outcome and recompute ``uptime_percent``.

        Args:
            is_success: Whether the check counts as successful.
        """
        self.total_checks += 1
        if is_success:
            self.successful_checks += 1
        # Guard kept so an externally reset/negative counter cannot cause a
        # division by zero.
        if self.total_checks > 0:
            self.uptime_percent = (self.successful_checks / self.total_checks) * 100
|
||||
|
||||
|
||||
class ServiceMonitor:
    """Polls the backend, database, frontend and bot and tracks their status.

    ``services`` maps a service key to its ``ServiceStatus``; ``last_check``
    holds the timestamp of the most recent ``check_all_services`` run.
    """

    # Per-request timeout, in seconds, applied to every health probe.
    _TIMEOUT_SECONDS = 10.0

    # Order of the results produced by ``check_all_services``.
    _SERVICE_ORDER = ("backend", "database", "frontend", "bot")

    def __init__(self) -> None:
        self.services: dict[str, ServiceStatus] = {
            "backend": ServiceStatus(name="backend", display_name="Backend API"),
            "database": ServiceStatus(name="database", display_name="Database"),
            "frontend": ServiceStatus(name="frontend", display_name="Frontend"),
            "bot": ServiceStatus(name="bot", display_name="Telegram Bot"),
        }
        self.last_check: Optional[datetime] = None

    async def _probe(
        self, url: str
    ) -> tuple[Optional["httpx.Response"], Optional[float], Optional[str]]:
        """Issue a GET to ``url`` and time it.

        Returns:
            ``(response, latency_ms, None)`` when a response was received, or
            ``(None, None, error_message)`` when the request failed.
        """
        try:
            async with httpx.AsyncClient(timeout=self._TIMEOUT_SECONDS) as client:
                # perf_counter is monotonic, so the measured latency cannot be
                # skewed (or go negative) when the wall clock is adjusted.
                started = time.perf_counter()
                response = await client.get(url)
                latency = (time.perf_counter() - started) * 1000
        except httpx.TimeoutException:
            return None, None, "Timeout"
        except Exception as exc:
            # Some exceptions stringify to ""; fall back to the class name so
            # the failure is still identifiable.
            return None, None, str(exc)[:100] or type(exc).__name__
        return response, latency, None

    async def _http_check(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Map a GET on ``url`` to ``(status, latency_ms, message)``.

        HTTP 200 is OPERATIONAL, any other response is DEGRADED, and a failed
        request is DOWN.
        """
        response, latency, error = await self._probe(url)
        if response is None:
            return Status.DOWN, None, error
        if response.status_code == 200:
            return Status.OPERATIONAL, latency, None
        return Status.DEGRADED, latency, f"HTTP {response.status_code}"

    @staticmethod
    def _extract_version(response) -> Optional[str]:
        """Return the ``version`` field of a JSON-object body, else ``None``."""
        try:
            payload = response.json()
        except ValueError:
            # Body is not valid JSON; the service still answered 200, so the
            # caller treats it as healthy with an unknown version.
            return None
        if isinstance(payload, dict):
            return payload.get("version")
        return None

    async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
        """Check backend API health via ``{url}/health``.

        Returns:
            ``(status, latency_ms, message, version)``. ``version`` is taken
            from the JSON body of a 200 response when available.
        """
        response, latency, error = await self._probe(f"{url}/health")
        if response is None:
            return Status.DOWN, None, error, None
        if response.status_code != 200:
            return Status.DEGRADED, latency, f"HTTP {response.status_code}", None
        return Status.OPERATIONAL, latency, None, self._extract_version(response)

    async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check database through backend.

        Returns:
            ``(status, latency_ms, message)``; anything other than an HTTP 200
            from the backend is reported as DOWN.
        """
        # We check database indirectly - if backend is up, DB is likely up
        # Could add a specific /health/db endpoint to backend later
        response, latency, _ = await self._probe(f"{backend_url}/health")
        if response is None:
            return Status.DOWN, None, "Cannot reach backend"
        if response.status_code == 200:
            return Status.OPERATIONAL, latency, None
        return Status.DOWN, latency, "Backend reports unhealthy"

    async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check frontend availability by fetching ``url`` itself."""
        return await self._http_check(url)

    async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check Telegram bot health via ``{url}/health``."""
        return await self._http_check(f"{url}/health")

    def _apply_result(self, svc: ServiceStatus, result, checked_at: datetime) -> None:
        """Store one check result on ``svc``.

        ``result`` is either the tuple returned by a ``check_*`` coroutine or
        the exception object ``asyncio.gather`` captured for it.
        """
        if isinstance(result, BaseException):
            # BaseException (not Exception) so a captured CancelledError is
            # handled here instead of failing tuple unpacking below. The
            # outcome is unknown, so it is not counted towards uptime.
            svc.status = Status.UNKNOWN
            svc.latency_ms = None
            svc.message = f"Check failed: {result!r}"[:100]
            svc.last_check = checked_at
            return

        # Backend results carry a fourth element (version); the others do not.
        status, latency, message, *extra = result
        was_down = svc.status == Status.DOWN
        svc.status = status
        svc.latency_ms = latency
        svc.message = message
        if extra:
            svc.version = extra[0]
        svc.last_check = checked_at
        svc.update_uptime(status == Status.OPERATIONAL)
        # NOTE(review): with this rule a service that stays DEGRADED has
        # last_incident refreshed on every check, while a DOWN -> DEGRADED
        # transition does not update it. Kept as-is; confirm the intent.
        if status != Status.OPERATIONAL and not was_down:
            svc.last_incident = checked_at

    async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str):
        """Check all services concurrently and update ``self.services``.

        Args:
            backend_url: Base URL of the backend (also used for the database check).
            frontend_url: URL of the frontend.
            bot_url: Base URL of the bot service.
        """
        now = datetime.now()

        # Run all checks concurrently; order must match _SERVICE_ORDER.
        results = await asyncio.gather(
            self.check_backend(backend_url),
            self.check_database(backend_url),
            self.check_frontend(frontend_url),
            self.check_bot(bot_url),
            return_exceptions=True,
        )

        for key, result in zip(self._SERVICE_ORDER, results):
            self._apply_result(self.services[key], result, now)

        self.last_check = now

    def get_all_statuses(self) -> dict[str, ServiceStatus]:
        """Return the live mapping of service key to ``ServiceStatus``."""
        return self.services

    def get_overall_status(self) -> Status:
        """Get overall system status based on all services.

        OPERATIONAL when every service is operational; otherwise DOWN if any
        service is down, else DEGRADED if any is degraded, else UNKNOWN.
        """
        statuses = [svc.status for svc in self.services.values()]

        if all(s == Status.OPERATIONAL for s in statuses):
            return Status.OPERATIONAL
        if Status.DOWN in statuses:
            return Status.DOWN
        if Status.DEGRADED in statuses:
            return Status.DEGRADED
        return Status.UNKNOWN
|
||||
Reference in New Issue
Block a user