Fix service status
This commit is contained in:
@@ -184,7 +184,8 @@ class ServiceMonitor:
|
||||
self,
|
||||
service_name: str,
|
||||
result: tuple,
|
||||
now: datetime
|
||||
now: datetime,
|
||||
suppress_alerts: bool = False
|
||||
):
|
||||
"""Process check result with DB persistence and alerting."""
|
||||
if isinstance(result, Exception):
|
||||
@@ -221,13 +222,14 @@ class ServiceMonitor:
|
||||
if stats["total_checks"] > 0:
|
||||
svc.uptime_percent = stats["uptime_percent"]
|
||||
|
||||
# Handle incident tracking and alerting
|
||||
# Handle incident tracking and alerting (skip alerts during grace period)
|
||||
if is_down and not was_down:
|
||||
# Service just went down
|
||||
svc.last_incident = now
|
||||
incident_id = create_incident(service_name, status.value, message)
|
||||
await alert_service_down(service_name, svc.display_name, message)
|
||||
mark_incident_notified(incident_id)
|
||||
if not suppress_alerts:
|
||||
await alert_service_down(service_name, svc.display_name, message)
|
||||
mark_incident_notified(incident_id)
|
||||
|
||||
elif not is_down and was_down:
|
||||
# Service recovered
|
||||
@@ -236,7 +238,8 @@ class ServiceMonitor:
|
||||
started_at = datetime.fromisoformat(open_incident["started_at"])
|
||||
downtime_minutes = int((now - started_at).total_seconds() / 60)
|
||||
resolve_incident(service_name)
|
||||
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
|
||||
if not suppress_alerts:
|
||||
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
|
||||
|
||||
async def check_all_services(
|
||||
self,
|
||||
@@ -244,7 +247,8 @@ class ServiceMonitor:
|
||||
frontend_url: str,
|
||||
bot_url: str,
|
||||
external_url: str = "",
|
||||
public_url: str = ""
|
||||
public_url: str = "",
|
||||
suppress_alerts: bool = False
|
||||
):
|
||||
"""Check all services concurrently."""
|
||||
now = datetime.now()
|
||||
@@ -262,7 +266,7 @@ class ServiceMonitor:
|
||||
# Process results
|
||||
service_names = ["backend", "database", "frontend", "bot", "external"]
|
||||
for i, service_name in enumerate(service_names):
|
||||
await self._process_check_result(service_name, results[i], now)
|
||||
await self._process_check_result(service_name, results[i], now, suppress_alerts)
|
||||
|
||||
# Check SSL certificate (if public URL is HTTPS)
|
||||
if public_url and public_url.startswith("https://"):
|
||||
@@ -270,7 +274,15 @@ class ServiceMonitor:
|
||||
|
||||
self.last_check = now
|
||||
|
||||
def get_all_statuses(self) -> dict[str, ServiceStatus]:
|
||||
def get_all_statuses(self, period_hours: int = 24) -> dict[str, ServiceStatus]:
|
||||
"""Get all service statuses with data for specified period."""
|
||||
# Update historical data for requested period
|
||||
for name, svc in self.services.items():
|
||||
svc.latency_history = get_latency_history(name, hours=period_hours)
|
||||
svc.avg_latency_24h = get_avg_latency(name, hours=period_hours)
|
||||
stats = get_uptime_stats(name, hours=period_hours)
|
||||
if stats["total_checks"] > 0:
|
||||
svc.uptime_percent = stats["uptime_percent"]
|
||||
return self.services
|
||||
|
||||
def get_overall_status(self) -> Status:
|
||||
|
||||
Reference in New Issue
Block a user