Fix service status

This commit is contained in:
2025-12-20 02:28:41 +07:00
parent c645171671
commit 243abe55b5
5 changed files with 210 additions and 33 deletions

View File

@@ -184,7 +184,8 @@ class ServiceMonitor:
self,
service_name: str,
result: tuple,
now: datetime
now: datetime,
suppress_alerts: bool = False
):
"""Process check result with DB persistence and alerting."""
if isinstance(result, Exception):
@@ -221,13 +222,14 @@ class ServiceMonitor:
if stats["total_checks"] > 0:
svc.uptime_percent = stats["uptime_percent"]
# Handle incident tracking and alerting
# Handle incident tracking and alerting (skip alerts during grace period)
if is_down and not was_down:
# Service just went down
svc.last_incident = now
incident_id = create_incident(service_name, status.value, message)
await alert_service_down(service_name, svc.display_name, message)
mark_incident_notified(incident_id)
if not suppress_alerts:
await alert_service_down(service_name, svc.display_name, message)
mark_incident_notified(incident_id)
elif not is_down and was_down:
# Service recovered
@@ -236,7 +238,8 @@ class ServiceMonitor:
started_at = datetime.fromisoformat(open_incident["started_at"])
downtime_minutes = int((now - started_at).total_seconds() / 60)
resolve_incident(service_name)
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
if not suppress_alerts:
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
async def check_all_services(
self,
@@ -244,7 +247,8 @@ class ServiceMonitor:
frontend_url: str,
bot_url: str,
external_url: str = "",
public_url: str = ""
public_url: str = "",
suppress_alerts: bool = False
):
"""Check all services concurrently."""
now = datetime.now()
@@ -262,7 +266,7 @@ class ServiceMonitor:
# Process results
service_names = ["backend", "database", "frontend", "bot", "external"]
for i, service_name in enumerate(service_names):
await self._process_check_result(service_name, results[i], now)
await self._process_check_result(service_name, results[i], now, suppress_alerts)
# Check SSL certificate (if public URL is HTTPS)
if public_url and public_url.startswith("https://"):
@@ -270,7 +274,15 @@ class ServiceMonitor:
self.last_check = now
def get_all_statuses(self) -> dict[str, ServiceStatus]:
def get_all_statuses(self, period_hours: int = 24) -> dict[str, ServiceStatus]:
    """Return all tracked services after refreshing their period-based metrics.

    For every entry in ``self.services`` the latency history, the average
    latency and the uptime percentage are reloaded for the last
    ``period_hours`` hours. The service objects are updated in place and
    the same ``self.services`` mapping is returned.

    Note that ``avg_latency_24h`` receives the average for the requested
    period, whatever ``period_hours`` is, despite the attribute's name.
    """
    for service_name, service in self.services.items():
        service.latency_history = get_latency_history(
            service_name, hours=period_hours
        )
        service.avg_latency_24h = get_avg_latency(
            service_name, hours=period_hours
        )

        uptime = get_uptime_stats(service_name, hours=period_hours)
        if not (uptime["total_checks"] > 0):
            # No checks recorded for this period: leave the existing
            # uptime_percent value untouched.
            continue
        service.uptime_percent = uptime["uptime_percent"]

    return self.services
def get_overall_status(self) -> Status: