This commit is contained in:
2025-12-17 22:10:01 +07:00
parent f371178518
commit 967176fab8
7 changed files with 794 additions and 1 deletions

227
status-service/monitors.py Normal file
View File

@@ -0,0 +1,227 @@
import asyncio
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import Optional
from enum import Enum
import httpx
class Status(str, Enum):
OPERATIONAL = "operational"
DEGRADED = "degraded"
DOWN = "down"
UNKNOWN = "unknown"
@dataclass
class ServiceStatus:
name: str
display_name: str
status: Status = Status.UNKNOWN
latency_ms: Optional[float] = None
last_check: Optional[datetime] = None
last_incident: Optional[datetime] = None
uptime_percent: float = 100.0
message: Optional[str] = None
version: Optional[str] = None
# For uptime calculation
total_checks: int = 0
successful_checks: int = 0
def to_dict(self) -> dict:
return {
"name": self.name,
"display_name": self.display_name,
"status": self.status.value,
"latency_ms": round(self.latency_ms, 2) if self.latency_ms else None,
"last_check": self.last_check.isoformat() if self.last_check else None,
"last_incident": self.last_incident.isoformat() if self.last_incident else None,
"uptime_percent": round(self.uptime_percent, 2),
"message": self.message,
"version": self.version
}
def update_uptime(self, is_success: bool):
self.total_checks += 1
if is_success:
self.successful_checks += 1
if self.total_checks > 0:
self.uptime_percent = (self.successful_checks / self.total_checks) * 100
class ServiceMonitor:
def __init__(self):
self.services: dict[str, ServiceStatus] = {
"backend": ServiceStatus(
name="backend",
display_name="Backend API"
),
"database": ServiceStatus(
name="database",
display_name="Database"
),
"frontend": ServiceStatus(
name="frontend",
display_name="Frontend"
),
"bot": ServiceStatus(
name="bot",
display_name="Telegram Bot"
)
}
self.last_check: Optional[datetime] = None
async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
"""Check backend API health"""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now()
response = await client.get(f"{url}/health")
latency = (datetime.now() - start).total_seconds() * 1000
if response.status_code == 200:
data = response.json()
return Status.OPERATIONAL, latency, None, data.get("version")
else:
return Status.DEGRADED, latency, f"HTTP {response.status_code}", None
except httpx.TimeoutException:
return Status.DOWN, None, "Timeout", None
except Exception as e:
return Status.DOWN, None, str(e)[:100], None
async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check database through backend"""
# We check database indirectly - if backend is up, DB is likely up
# Could add a specific /health/db endpoint to backend later
try:
async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now()
response = await client.get(f"{backend_url}/health")
latency = (datetime.now() - start).total_seconds() * 1000
if response.status_code == 200:
return Status.OPERATIONAL, latency, None
else:
return Status.DOWN, latency, "Backend reports unhealthy"
except Exception as e:
return Status.DOWN, None, "Cannot reach backend"
async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check frontend availability"""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now()
response = await client.get(url)
latency = (datetime.now() - start).total_seconds() * 1000
if response.status_code == 200:
return Status.OPERATIONAL, latency, None
else:
return Status.DEGRADED, latency, f"HTTP {response.status_code}"
except httpx.TimeoutException:
return Status.DOWN, None, "Timeout"
except Exception as e:
return Status.DOWN, None, str(e)[:100]
async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check Telegram bot health"""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now()
response = await client.get(f"{url}/health")
latency = (datetime.now() - start).total_seconds() * 1000
if response.status_code == 200:
return Status.OPERATIONAL, latency, None
else:
return Status.DEGRADED, latency, f"HTTP {response.status_code}"
except httpx.TimeoutException:
return Status.DOWN, None, "Timeout"
except Exception as e:
return Status.DOWN, None, str(e)[:100]
async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str):
"""Check all services concurrently"""
now = datetime.now()
# Run all checks concurrently
results = await asyncio.gather(
self.check_backend(backend_url),
self.check_database(backend_url),
self.check_frontend(frontend_url),
self.check_bot(bot_url),
return_exceptions=True
)
# Process backend result
if not isinstance(results[0], Exception):
status, latency, message, version = results[0]
svc = self.services["backend"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.version = version
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process database result
if not isinstance(results[1], Exception):
status, latency, message = results[1]
svc = self.services["database"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process frontend result
if not isinstance(results[2], Exception):
status, latency, message = results[2]
svc = self.services["frontend"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process bot result
if not isinstance(results[3], Exception):
status, latency, message = results[3]
svc = self.services["bot"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
self.last_check = now
def get_all_statuses(self) -> dict[str, ServiceStatus]:
return self.services
def get_overall_status(self) -> Status:
"""Get overall system status based on all services"""
statuses = [svc.status for svc in self.services.values()]
if all(s == Status.OPERATIONAL for s in statuses):
return Status.OPERATIONAL
elif any(s == Status.DOWN for s in statuses):
return Status.DOWN
elif any(s == Status.DEGRADED for s in statuses):
return Status.DEGRADED
else:
return Status.UNKNOWN