Redesign health service + create backup service
This commit is contained in:
@@ -1,11 +1,19 @@
|
||||
"""Service monitoring with persistence and alerting."""
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from enum import Enum
|
||||
|
||||
import httpx
|
||||
|
||||
from database import (
|
||||
save_metric, get_latency_history, get_uptime_stats, get_avg_latency,
|
||||
create_incident, resolve_incident, get_open_incident, mark_incident_notified
|
||||
)
|
||||
from alerts import alert_service_down, alert_service_recovered
|
||||
from ssl_monitor import check_and_alert_ssl, SSLInfo
|
||||
|
||||
|
||||
class Status(str, Enum):
|
||||
OPERATIONAL = "operational"
|
||||
@@ -25,11 +33,17 @@ class ServiceStatus:
|
||||
uptime_percent: float = 100.0
|
||||
message: Optional[str] = None
|
||||
version: Optional[str] = None
|
||||
avg_latency_24h: Optional[float] = None
|
||||
latency_history: list = None
|
||||
|
||||
# For uptime calculation
|
||||
# For uptime calculation (in-memory, backed by DB)
|
||||
total_checks: int = 0
|
||||
successful_checks: int = 0
|
||||
|
||||
def __post_init__(self):
|
||||
if self.latency_history is None:
|
||||
self.latency_history = []
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"name": self.name,
|
||||
@@ -40,7 +54,8 @@ class ServiceStatus:
|
||||
"last_incident": self.last_incident.isoformat() if self.last_incident else None,
|
||||
"uptime_percent": round(self.uptime_percent, 2),
|
||||
"message": self.message,
|
||||
"version": self.version
|
||||
"version": self.version,
|
||||
"avg_latency_24h": round(self.avg_latency_24h, 2) if self.avg_latency_24h else None,
|
||||
}
|
||||
|
||||
def update_uptime(self, is_success: bool):
|
||||
@@ -69,12 +84,17 @@ class ServiceMonitor:
|
||||
"bot": ServiceStatus(
|
||||
name="bot",
|
||||
display_name="Telegram Bot"
|
||||
)
|
||||
),
|
||||
"external": ServiceStatus(
|
||||
name="external",
|
||||
display_name="External Access"
|
||||
),
|
||||
}
|
||||
self.last_check: Optional[datetime] = None
|
||||
self.ssl_info: Optional[SSLInfo] = None
|
||||
|
||||
async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
|
||||
"""Check backend API health"""
|
||||
"""Check backend API health."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
start = datetime.now()
|
||||
@@ -92,9 +112,7 @@ class ServiceMonitor:
|
||||
return Status.DOWN, None, str(e)[:100], None
|
||||
|
||||
async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||
"""Check database through backend"""
|
||||
# We check database indirectly - if backend is up, DB is likely up
|
||||
# Could add a specific /health/db endpoint to backend later
|
||||
"""Check database through backend."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
start = datetime.now()
|
||||
@@ -109,7 +127,7 @@ class ServiceMonitor:
|
||||
return Status.DOWN, None, "Cannot reach backend"
|
||||
|
||||
async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||
"""Check frontend availability"""
|
||||
"""Check frontend availability."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
start = datetime.now()
|
||||
@@ -126,7 +144,7 @@ class ServiceMonitor:
|
||||
return Status.DOWN, None, str(e)[:100]
|
||||
|
||||
async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||
"""Check Telegram bot health"""
|
||||
"""Check Telegram bot health."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
start = datetime.now()
|
||||
@@ -142,8 +160,93 @@ class ServiceMonitor:
|
||||
except Exception as e:
|
||||
return Status.DOWN, None, str(e)[:100]
|
||||
|
||||
async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str):
|
||||
"""Check all services concurrently"""
|
||||
async def check_external(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
    """Probe the external (public) URL and classify its availability.

    Args:
        url: Address to request. An empty/falsy value means the check is
            not configured and no request is made.

    Returns:
        A ``(status, latency_ms, message)`` tuple:

        * ``(Status.UNKNOWN, None, "Not configured")`` when ``url`` is falsy;
        * ``(Status.OPERATIONAL, latency, None)`` on an HTTP 200 response;
        * ``(Status.DEGRADED, latency, "HTTP <code>")`` on any other status;
        * ``(Status.DOWN, None, "Timeout")`` on an httpx timeout;
        * ``(Status.DOWN, None, <error text, max 100 chars>)`` on any
          other exception.
    """
    # Guard clause: nothing to probe when no URL was supplied.
    if not url:
        return Status.UNKNOWN, None, "Not configured"

    try:
        # Redirects are followed, so the status code examined below is the
        # one from the final response.
        async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
            started = datetime.now()
            reply = await client.get(url)
            # Elapsed wall-clock time, converted from seconds to milliseconds.
            elapsed_ms = (datetime.now() - started).total_seconds() * 1000

            if reply.status_code != 200:
                return Status.DEGRADED, elapsed_ms, f"HTTP {reply.status_code}"
            return Status.OPERATIONAL, elapsed_ms, None
    except httpx.TimeoutException:
        return Status.DOWN, None, "Timeout"
    except Exception as exc:
        # Any other failure is reported as DOWN with a truncated error text.
        return Status.DOWN, None, str(exc)[:100]
|
||||
|
||||
async def _process_check_result(
|
||||
self,
|
||||
service_name: str,
|
||||
result: tuple,
|
||||
now: datetime
|
||||
):
|
||||
"""Process check result with DB persistence and alerting."""
|
||||
if isinstance(result, Exception):
|
||||
return
|
||||
|
||||
if len(result) == 4:
|
||||
status, latency, message, version = result
|
||||
else:
|
||||
status, latency, message = result
|
||||
version = None
|
||||
|
||||
svc = self.services[service_name]
|
||||
was_down = svc.status in (Status.DOWN, Status.DEGRADED)
|
||||
is_down = status in (Status.DOWN, Status.DEGRADED)
|
||||
|
||||
# Update service status
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
if version:
|
||||
svc.version = version
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
|
||||
# Save metric to database
|
||||
save_metric(service_name, status.value, latency, message)
|
||||
|
||||
# Load historical data
|
||||
svc.latency_history = get_latency_history(service_name, hours=24)
|
||||
svc.avg_latency_24h = get_avg_latency(service_name, hours=24)
|
||||
|
||||
# Update uptime from DB
|
||||
stats = get_uptime_stats(service_name, hours=24)
|
||||
if stats["total_checks"] > 0:
|
||||
svc.uptime_percent = stats["uptime_percent"]
|
||||
|
||||
# Handle incident tracking and alerting
|
||||
if is_down and not was_down:
|
||||
# Service just went down
|
||||
svc.last_incident = now
|
||||
incident_id = create_incident(service_name, status.value, message)
|
||||
await alert_service_down(service_name, svc.display_name, message)
|
||||
mark_incident_notified(incident_id)
|
||||
|
||||
elif not is_down and was_down:
|
||||
# Service recovered
|
||||
open_incident = get_open_incident(service_name)
|
||||
if open_incident:
|
||||
started_at = datetime.fromisoformat(open_incident["started_at"])
|
||||
downtime_minutes = int((now - started_at).total_seconds() / 60)
|
||||
resolve_incident(service_name)
|
||||
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
|
||||
|
||||
async def check_all_services(
|
||||
self,
|
||||
backend_url: str,
|
||||
frontend_url: str,
|
||||
bot_url: str,
|
||||
external_url: str = "",
|
||||
public_url: str = ""
|
||||
):
|
||||
"""Check all services concurrently."""
|
||||
now = datetime.now()
|
||||
|
||||
# Run all checks concurrently
|
||||
@@ -152,61 +255,18 @@ class ServiceMonitor:
|
||||
self.check_database(backend_url),
|
||||
self.check_frontend(frontend_url),
|
||||
self.check_bot(bot_url),
|
||||
self.check_external(external_url),
|
||||
return_exceptions=True
|
||||
)
|
||||
|
||||
# Process backend result
|
||||
if not isinstance(results[0], Exception):
|
||||
status, latency, message, version = results[0]
|
||||
svc = self.services["backend"]
|
||||
was_down = svc.status == Status.DOWN
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
svc.version = version
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
if status != Status.OPERATIONAL and not was_down:
|
||||
svc.last_incident = now
|
||||
# Process results
|
||||
service_names = ["backend", "database", "frontend", "bot", "external"]
|
||||
for i, service_name in enumerate(service_names):
|
||||
await self._process_check_result(service_name, results[i], now)
|
||||
|
||||
# Process database result
|
||||
if not isinstance(results[1], Exception):
|
||||
status, latency, message = results[1]
|
||||
svc = self.services["database"]
|
||||
was_down = svc.status == Status.DOWN
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
if status != Status.OPERATIONAL and not was_down:
|
||||
svc.last_incident = now
|
||||
|
||||
# Process frontend result
|
||||
if not isinstance(results[2], Exception):
|
||||
status, latency, message = results[2]
|
||||
svc = self.services["frontend"]
|
||||
was_down = svc.status == Status.DOWN
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
if status != Status.OPERATIONAL and not was_down:
|
||||
svc.last_incident = now
|
||||
|
||||
# Process bot result
|
||||
if not isinstance(results[3], Exception):
|
||||
status, latency, message = results[3]
|
||||
svc = self.services["bot"]
|
||||
was_down = svc.status == Status.DOWN
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
if status != Status.OPERATIONAL and not was_down:
|
||||
svc.last_incident = now
|
||||
# Check SSL certificate (if public URL is HTTPS)
|
||||
if public_url and public_url.startswith("https://"):
|
||||
self.ssl_info = await check_and_alert_ssl(public_url)
|
||||
|
||||
self.last_check = now
|
||||
|
||||
@@ -214,8 +274,12 @@ class ServiceMonitor:
|
||||
return self.services
|
||||
|
||||
def get_overall_status(self) -> Status:
|
||||
"""Get overall system status based on all services"""
|
||||
statuses = [svc.status for svc in self.services.values()]
|
||||
"""Get overall system status based on all services."""
|
||||
# Exclude external from overall status if not configured
|
||||
statuses = [
|
||||
svc.status for name, svc in self.services.items()
|
||||
if name != "external" or svc.status != Status.UNKNOWN
|
||||
]
|
||||
|
||||
if all(s == Status.OPERATIONAL for s in statuses):
|
||||
return Status.OPERATIONAL
|
||||
@@ -225,3 +289,17 @@ class ServiceMonitor:
|
||||
return Status.DEGRADED
|
||||
else:
|
||||
return Status.UNKNOWN
|
||||
|
||||
def get_ssl_status(self) -> Optional[dict]:
    """Return the stored SSL certificate details as a plain dict.

    Returns:
        ``None`` when ``self.ssl_info`` is unset or falsy. Otherwise a dict
        with the keys ``domain``, ``issuer``, ``expires_at`` (ISO-8601
        string, or ``None`` when no expiry date is stored),
        ``days_until_expiry``, ``is_valid`` and ``error``.
    """
    info = self.ssl_info
    if not info:
        return None

    # NOTE(review): SSLInfo is defined in ssl_monitor and is not visible
    # here; presumably a failed certificate check fills `error` and leaves
    # `expires_at` unset - confirm. Guarding keeps this method from raising
    # AttributeError in that case.
    expires_at = info.expires_at

    return {
        "domain": info.domain,
        "issuer": info.issuer,
        "expires_at": expires_at.isoformat() if expires_at is not None else None,
        "days_until_expiry": info.days_until_expiry,
        "is_valid": info.is_valid,
        "error": info.error,
    }
|
||||
|
||||
Reference in New Issue
Block a user