Redesign health service + create backup service

2025-12-18 03:35:13 +07:00
parent e43e579329
commit 57bad3b4a8
16 changed files with 1486 additions and 98 deletions

status-service/Dockerfile

@@ -6,6 +6,9 @@ WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Create data directory for SQLite
RUN mkdir -p /app/data
# Copy application
COPY . .

status-service/alerts.py (new file, 85 lines)

@@ -0,0 +1,85 @@
"""Telegram alerting for status changes."""
import os
from datetime import datetime
from typing import Optional
import httpx
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_ADMIN_ID = os.getenv("TELEGRAM_ADMIN_ID", "")
async def send_telegram_alert(message: str, is_recovery: bool = False) -> bool:
"""Send alert to Telegram."""
if not TELEGRAM_BOT_TOKEN or not TELEGRAM_ADMIN_ID:
print("Telegram alerting not configured")
return False
emoji = "\u2705" if is_recovery else "\u26a0\ufe0f"
text = f"{emoji} *Status Alert*\n\n{message}"
url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
data = {
"chat_id": TELEGRAM_ADMIN_ID,
"text": text,
"parse_mode": "Markdown",
}
try:
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.post(url, json=data)
response.raise_for_status()
print(f"Telegram alert sent: {message[:50]}...")
return True
except Exception as e:
print(f"Failed to send Telegram alert: {e}")
return False
async def alert_service_down(service_name: str, display_name: str, message: Optional[str]):
"""Alert when service goes down."""
now = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
text = (
f"*{display_name}* is DOWN\n\n"
f"Time: `{now}`\n"
)
if message:
text += f"Error: `{message}`"
await send_telegram_alert(text, is_recovery=False)
async def alert_service_recovered(service_name: str, display_name: str, downtime_minutes: int):
"""Alert when service recovers."""
now = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
text = (
f"*{display_name}* is back ONLINE\n\n"
f"Time: `{now}`\n"
f"Downtime: `{downtime_minutes} min`"
)
await send_telegram_alert(text, is_recovery=True)
async def alert_ssl_expiring(domain: str, days_left: int):
"""Alert when SSL certificate is expiring soon."""
text = (
f"*SSL Certificate Expiring*\n\n"
f"Domain: `{domain}`\n"
f"Days left: `{days_left}`\n\n"
f"Please renew the certificate!"
)
await send_telegram_alert(text, is_recovery=False)
async def alert_ssl_expired(domain: str):
"""Alert when SSL certificate has expired."""
text = (
f"*SSL Certificate EXPIRED*\n\n"
f"Domain: `{domain}`\n\n"
f"Certificate has expired! Site may show security warnings."
)
await send_telegram_alert(text, is_recovery=False)
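A minimal local smoke test for these helpers, assuming TELEGRAM_BOT_TOKEN and TELEGRAM_ADMIN_ID are exported before import (the module reads them at import time); note this sends real messages to the configured chat:

import asyncio
from alerts import alert_service_down, alert_service_recovered

async def demo():
    # Simulate an outage alert followed by the matching recovery alert
    await alert_service_down("backend", "Backend API", "HTTP 503")
    await alert_service_recovered("backend", "Backend API", downtime_minutes=12)

asyncio.run(demo())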

status-service/database.py (new file, 261 lines)

@@ -0,0 +1,261 @@
"""SQLite database for storing metrics history."""
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional
import json
DB_PATH = Path("/app/data/metrics.db")
def get_connection() -> sqlite3.Connection:
"""Get database connection."""
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(DB_PATH))
conn.row_factory = sqlite3.Row
return conn
def init_db():
"""Initialize database tables."""
conn = get_connection()
cursor = conn.cursor()
# Metrics history table
cursor.execute("""
CREATE TABLE IF NOT EXISTS metrics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
service_name TEXT NOT NULL,
status TEXT NOT NULL,
latency_ms REAL,
message TEXT,
checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Incidents table
cursor.execute("""
CREATE TABLE IF NOT EXISTS incidents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
service_name TEXT NOT NULL,
status TEXT NOT NULL,
message TEXT,
started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
resolved_at TIMESTAMP,
notified BOOLEAN DEFAULT FALSE
)
""")
# SSL certificates table
cursor.execute("""
CREATE TABLE IF NOT EXISTS ssl_certificates (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT NOT NULL UNIQUE,
issuer TEXT,
expires_at TIMESTAMP,
days_until_expiry INTEGER,
checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Create indexes
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_metrics_service_time
ON metrics(service_name, checked_at DESC)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_incidents_service
ON incidents(service_name, started_at DESC)
""")
conn.commit()
conn.close()
def save_metric(service_name: str, status: str, latency_ms: Optional[float], message: Optional[str]):
"""Save a metric record."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute(
"INSERT INTO metrics (service_name, status, latency_ms, message) VALUES (?, ?, ?, ?)",
(service_name, status, latency_ms, message)
)
conn.commit()
conn.close()
def get_latency_history(service_name: str, hours: int = 24) -> list[dict]:
"""Get latency history for a service."""
conn = get_connection()
cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours)
cursor.execute("""
SELECT latency_ms, status, checked_at
FROM metrics
WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL
ORDER BY checked_at ASC
""", (service_name, since.isoformat()))
rows = cursor.fetchall()
conn.close()
return [
{
"latency_ms": row["latency_ms"],
"status": row["status"],
"checked_at": row["checked_at"]
}
for row in rows
]
def get_uptime_stats(service_name: str, hours: int = 24) -> dict:
"""Calculate uptime statistics for a service."""
conn = get_connection()
cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours)
cursor.execute("""
SELECT COUNT(*) as total,
SUM(CASE WHEN status = 'operational' THEN 1 ELSE 0 END) as successful
FROM metrics
WHERE service_name = ? AND checked_at > ?
""", (service_name, since.isoformat()))
row = cursor.fetchone()
conn.close()
total = row["total"] or 0
successful = row["successful"] or 0
return {
"total_checks": total,
"successful_checks": successful,
"uptime_percent": (successful / total * 100) if total > 0 else 100.0
}
def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]:
"""Get average latency for a service."""
conn = get_connection()
cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours)
cursor.execute("""
SELECT AVG(latency_ms) as avg_latency
FROM metrics
WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL
""", (service_name, since.isoformat()))
row = cursor.fetchone()
conn.close()
return row["avg_latency"]
def create_incident(service_name: str, status: str, message: Optional[str]) -> int:
"""Create a new incident."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute(
"INSERT INTO incidents (service_name, status, message) VALUES (?, ?, ?)",
(service_name, status, message)
)
incident_id = cursor.lastrowid
conn.commit()
conn.close()
return incident_id
def resolve_incident(service_name: str):
"""Resolve open incidents for a service."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("""
UPDATE incidents
SET resolved_at = CURRENT_TIMESTAMP
WHERE service_name = ? AND resolved_at IS NULL
""", (service_name,))
conn.commit()
conn.close()
def get_open_incident(service_name: str) -> Optional[dict]:
"""Get open incident for a service."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT * FROM incidents
WHERE service_name = ? AND resolved_at IS NULL
ORDER BY started_at DESC LIMIT 1
""", (service_name,))
row = cursor.fetchone()
conn.close()
if row:
return dict(row)
return None
def mark_incident_notified(incident_id: int):
"""Mark incident as notified."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("UPDATE incidents SET notified = TRUE WHERE id = ?", (incident_id,))
conn.commit()
conn.close()
def get_recent_incidents(limit: int = 10) -> list[dict]:
"""Get recent incidents."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT * FROM incidents
ORDER BY started_at DESC
LIMIT ?
""", (limit,))
rows = cursor.fetchall()
conn.close()
return [dict(row) for row in rows]
def save_ssl_info(domain: str, issuer: str, expires_at: datetime, days_until_expiry: int):
"""Save SSL certificate info."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("""
INSERT OR REPLACE INTO ssl_certificates
(domain, issuer, expires_at, days_until_expiry, checked_at)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
""", (domain, issuer, expires_at.isoformat(), days_until_expiry))
conn.commit()
conn.close()
def get_ssl_info(domain: str) -> Optional[dict]:
"""Get SSL certificate info."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("SELECT * FROM ssl_certificates WHERE domain = ?", (domain,))
row = cursor.fetchone()
conn.close()
if row:
return dict(row)
return None
def cleanup_old_metrics(days: int = 7):
"""Delete metrics older than specified days."""
conn = get_connection()
cursor = conn.cursor()
cutoff = datetime.now() - timedelta(days=days)
cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (cutoff.isoformat(),))
deleted = cursor.rowcount
conn.commit()
conn.close()
return deleted
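A usage sketch for the module above; note that DB_PATH is hard-coded to /app/data/metrics.db, so outside the container you would repoint it first:

from database import init_db, save_metric, get_uptime_stats, get_avg_latency

init_db()

# Record two checks for one service, one healthy and one failed
save_metric("backend", "operational", 42.5, None)
save_metric("backend", "down", None, "HTTP 503")

print(get_uptime_stats("backend", hours=24))
# {'total_checks': 2, 'successful_checks': 1, 'uptime_percent': 50.0}
print(get_avg_latency("backend", hours=24))  # 42.5, since down checks carry no latency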

status-service/main.py

@@ -1,6 +1,7 @@
"""Status monitoring service with persistence and alerting."""
import os
import asyncio
from datetime import datetime, timedelta
from datetime import datetime
from typing import Optional
from contextlib import asynccontextmanager
@@ -8,13 +9,16 @@ from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates
from monitors import ServiceMonitor, ServiceStatus
from monitors import ServiceMonitor
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
# Configuration
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks
PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))
# Initialize monitor
@@ -22,38 +26,64 @@ monitor = ServiceMonitor()
# Background task reference
background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None
async def periodic_health_check():
"""Background task to check services periodically"""
"""Background task to check services periodically."""
while True:
await monitor.check_all_services(
backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL,
bot_url=BOT_URL
)
try:
await monitor.check_all_services(
backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL,
bot_url=BOT_URL,
external_url=EXTERNAL_URL,
public_url=PUBLIC_URL
)
except Exception as e:
print(f"Health check error: {e}")
await asyncio.sleep(CHECK_INTERVAL)
async def periodic_cleanup():
"""Background task to cleanup old metrics (daily)."""
while True:
await asyncio.sleep(86400) # 24 hours
try:
deleted = cleanup_old_metrics(days=7)
print(f"Cleaned up {deleted} old metrics")
except Exception as e:
print(f"Cleanup error: {e}")
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Startup and shutdown events"""
global background_task
"""Startup and shutdown events."""
global background_task, cleanup_task
# Initialize database
init_db()
print("Database initialized")
# Start background health checks
background_task = asyncio.create_task(periodic_health_check())
cleanup_task = asyncio.create_task(periodic_cleanup())
yield
# Cancel background task on shutdown
if background_task:
background_task.cancel()
try:
await background_task
except asyncio.CancelledError:
pass
# Cancel background tasks on shutdown
for task in [background_task, cleanup_task]:
if task:
task.cancel()
try:
await task
except asyncio.CancelledError:
pass
app = FastAPI(
title="Status Monitor",
description="Service health monitoring",
description="Service health monitoring with persistence and alerting",
lifespan=lifespan
)
@@ -62,9 +92,11 @@ templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse)
async def status_page(request: Request):
"""Main status page"""
"""Main status page."""
services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
incidents = get_recent_incidents(limit=5)
return templates.TemplateResponse(
"index.html",
@@ -72,6 +104,8 @@ async def status_page(request: Request):
"request": request,
"services": services,
"overall_status": overall_status,
"ssl_status": ssl_status,
"incidents": incidents,
"last_check": monitor.last_check,
"check_interval": CHECK_INTERVAL
}
@@ -80,30 +114,52 @@ async def status_page(request: Request):
@app.get("/api/status")
async def api_status():
"""API endpoint for service statuses"""
"""API endpoint for service statuses."""
services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
return {
"overall_status": overall_status,
"overall_status": overall_status.value,
"services": {name: status.to_dict() for name, status in services.items()},
"ssl": ssl_status,
"last_check": monitor.last_check.isoformat() if monitor.last_check else None,
"check_interval_seconds": CHECK_INTERVAL
}
@app.get("/api/history/{service_name}")
async def api_history(service_name: str, hours: int = 24):
"""API endpoint for service latency history."""
history = get_latency_history(service_name, hours=hours)
return {
"service": service_name,
"hours": hours,
"data": history
}
@app.get("/api/incidents")
async def api_incidents(limit: int = 20):
"""API endpoint for recent incidents."""
incidents = get_recent_incidents(limit=limit)
return {"incidents": incidents}
@app.get("/api/health")
async def health():
"""Health check for this service"""
"""Health check for this service."""
return {"status": "ok", "service": "status-monitor"}
@app.post("/api/refresh")
async def refresh_status():
"""Force refresh all service statuses"""
"""Force refresh all service statuses."""
await monitor.check_all_services(
backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL,
bot_url=BOT_URL
bot_url=BOT_URL,
external_url=EXTERNAL_URL,
public_url=PUBLIC_URL
)
return {"status": "refreshed"}

status-service/monitors.py

@@ -1,11 +1,19 @@
"""Service monitoring with persistence and alerting."""
import asyncio
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from dataclasses import dataclass
from typing import Optional
from enum import Enum
import httpx
from database import (
save_metric, get_latency_history, get_uptime_stats, get_avg_latency,
create_incident, resolve_incident, get_open_incident, mark_incident_notified
)
from alerts import alert_service_down, alert_service_recovered
from ssl_monitor import check_and_alert_ssl, SSLInfo
class Status(str, Enum):
OPERATIONAL = "operational"
@@ -25,11 +33,17 @@ class ServiceStatus:
uptime_percent: float = 100.0
message: Optional[str] = None
version: Optional[str] = None
avg_latency_24h: Optional[float] = None
latency_history: list = None
# For uptime calculation
# For uptime calculation (in-memory, backed by DB)
total_checks: int = 0
successful_checks: int = 0
def __post_init__(self):
if self.latency_history is None:
self.latency_history = []
def to_dict(self) -> dict:
return {
"name": self.name,
@@ -40,7 +54,8 @@ class ServiceStatus:
"last_incident": self.last_incident.isoformat() if self.last_incident else None,
"uptime_percent": round(self.uptime_percent, 2),
"message": self.message,
"version": self.version
"version": self.version,
"avg_latency_24h": round(self.avg_latency_24h, 2) if self.avg_latency_24h else None,
}
def update_uptime(self, is_success: bool):
@@ -69,12 +84,17 @@ class ServiceMonitor:
"bot": ServiceStatus(
name="bot",
display_name="Telegram Bot"
)
),
"external": ServiceStatus(
name="external",
display_name="External Access"
),
}
self.last_check: Optional[datetime] = None
self.ssl_info: Optional[SSLInfo] = None
async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
"""Check backend API health"""
"""Check backend API health."""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now()
@@ -92,9 +112,7 @@ class ServiceMonitor:
return Status.DOWN, None, str(e)[:100], None
async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check database through backend"""
# We check database indirectly - if backend is up, DB is likely up
# Could add a specific /health/db endpoint to backend later
"""Check database through backend."""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now()
@@ -109,7 +127,7 @@ class ServiceMonitor:
return Status.DOWN, None, "Cannot reach backend"
async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check frontend availability"""
"""Check frontend availability."""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now()
@@ -126,7 +144,7 @@ class ServiceMonitor:
return Status.DOWN, None, str(e)[:100]
async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check Telegram bot health"""
"""Check Telegram bot health."""
try:
async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now()
@@ -142,8 +160,93 @@ class ServiceMonitor:
except Exception as e:
return Status.DOWN, None, str(e)[:100]
async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str):
"""Check all services concurrently"""
async def check_external(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check external (public) URL availability."""
if not url:
return Status.UNKNOWN, None, "Not configured"
try:
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
start = datetime.now()
response = await client.get(url)
latency = (datetime.now() - start).total_seconds() * 1000
if response.status_code == 200:
return Status.OPERATIONAL, latency, None
else:
return Status.DEGRADED, latency, f"HTTP {response.status_code}"
except httpx.TimeoutException:
return Status.DOWN, None, "Timeout"
except Exception as e:
return Status.DOWN, None, str(e)[:100]
async def _process_check_result(
self,
service_name: str,
result: tuple,
now: datetime
):
"""Process check result with DB persistence and alerting."""
if isinstance(result, Exception):
return
if len(result) == 4:
status, latency, message, version = result
else:
status, latency, message = result
version = None
svc = self.services[service_name]
was_down = svc.status in (Status.DOWN, Status.DEGRADED)
is_down = status in (Status.DOWN, Status.DEGRADED)
# Update service status
svc.status = status
svc.latency_ms = latency
svc.message = message
if version:
svc.version = version
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
# Save metric to database
save_metric(service_name, status.value, latency, message)
# Load historical data
svc.latency_history = get_latency_history(service_name, hours=24)
svc.avg_latency_24h = get_avg_latency(service_name, hours=24)
# Update uptime from DB
stats = get_uptime_stats(service_name, hours=24)
if stats["total_checks"] > 0:
svc.uptime_percent = stats["uptime_percent"]
# Handle incident tracking and alerting
if is_down and not was_down:
# Service just went down
svc.last_incident = now
incident_id = create_incident(service_name, status.value, message)
await alert_service_down(service_name, svc.display_name, message)
mark_incident_notified(incident_id)
elif not is_down and was_down:
# Service recovered
open_incident = get_open_incident(service_name)
if open_incident:
started_at = datetime.fromisoformat(open_incident["started_at"])
downtime_minutes = int((now - started_at).total_seconds() / 60)
resolve_incident(service_name)
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
async def check_all_services(
self,
backend_url: str,
frontend_url: str,
bot_url: str,
external_url: str = "",
public_url: str = ""
):
"""Check all services concurrently."""
now = datetime.now()
# Run all checks concurrently
@@ -152,61 +255,18 @@ class ServiceMonitor:
self.check_database(backend_url),
self.check_frontend(frontend_url),
self.check_bot(bot_url),
self.check_external(external_url),
return_exceptions=True
)
# Process backend result
if not isinstance(results[0], Exception):
status, latency, message, version = results[0]
svc = self.services["backend"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.version = version
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process results
service_names = ["backend", "database", "frontend", "bot", "external"]
for i, service_name in enumerate(service_names):
await self._process_check_result(service_name, results[i], now)
# Process database result
if not isinstance(results[1], Exception):
status, latency, message = results[1]
svc = self.services["database"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process frontend result
if not isinstance(results[2], Exception):
status, latency, message = results[2]
svc = self.services["frontend"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process bot result
if not isinstance(results[3], Exception):
status, latency, message = results[3]
svc = self.services["bot"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Check SSL certificate (if public URL is HTTPS)
if public_url and public_url.startswith("https://"):
self.ssl_info = await check_and_alert_ssl(public_url)
self.last_check = now
@@ -214,8 +274,12 @@ class ServiceMonitor:
return self.services
def get_overall_status(self) -> Status:
"""Get overall system status based on all services"""
statuses = [svc.status for svc in self.services.values()]
"""Get overall system status based on all services."""
# Exclude external from overall status if not configured
statuses = [
svc.status for name, svc in self.services.items()
if name != "external" or svc.status != Status.UNKNOWN
]
if all(s == Status.OPERATIONAL for s in statuses):
return Status.OPERATIONAL
@@ -225,3 +289,17 @@ class ServiceMonitor:
return Status.DEGRADED
else:
return Status.UNKNOWN
def get_ssl_status(self) -> Optional[dict]:
"""Get SSL certificate status."""
if not self.ssl_info:
return None
return {
"domain": self.ssl_info.domain,
"issuer": self.ssl_info.issuer,
"expires_at": self.ssl_info.expires_at.isoformat(),
"days_until_expiry": self.ssl_info.days_until_expiry,
"is_valid": self.ssl_info.is_valid,
"error": self.ssl_info.error
}
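A sketch of driving the monitor outside FastAPI, e.g. from a one-off script; the URLs are placeholders, and init_db must run first so the persistence calls inside _process_check_result find their tables:

import asyncio
from database import init_db
from monitors import ServiceMonitor

async def main():
    init_db()
    monitor = ServiceMonitor()
    await monitor.check_all_services(
        backend_url="http://localhost:8000",   # placeholder URLs
        frontend_url="http://localhost:8080",
        bot_url="http://localhost:8081",
    )
    for name, svc in monitor.get_all_statuses().items():
        print(name, svc.status.value, svc.latency_ms, svc.uptime_percent)
    print("overall:", monitor.get_overall_status().value)

asyncio.run(main())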

status-service/ssl_monitor.py

@@ -0,0 +1,140 @@
"""SSL certificate monitoring."""
import ssl
import socket
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urlparse
from database import save_ssl_info, get_ssl_info
from alerts import alert_ssl_expiring, alert_ssl_expired
@dataclass
class SSLInfo:
domain: str
issuer: str
expires_at: datetime
days_until_expiry: int
is_valid: bool
error: Optional[str] = None
def check_ssl_certificate(url: str) -> Optional[SSLInfo]:
"""Check SSL certificate for a URL."""
try:
parsed = urlparse(url)
hostname = parsed.hostname
if not hostname:
return None
# Skip non-HTTPS or localhost
if parsed.scheme != "https" or hostname in ("localhost", "127.0.0.1"):
return None
context = ssl.create_default_context()
conn = context.wrap_socket(
socket.socket(socket.AF_INET),
server_hostname=hostname
)
conn.settimeout(10.0)
try:
conn.connect((hostname, parsed.port or 443))
cert = conn.getpeercert()
finally:
conn.close()
if not cert:
return SSLInfo(
domain=hostname,
issuer="Unknown",
expires_at=datetime.now(timezone.utc),
days_until_expiry=0,
is_valid=False,
error="No certificate found"
)
# Parse expiry date
not_after = cert.get("notAfter", "")
expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z")
expires_at = expires_at.replace(tzinfo=timezone.utc)
# Calculate days until expiry
now = datetime.now(timezone.utc)
days_until_expiry = (expires_at - now).days
# Get issuer
issuer_parts = cert.get("issuer", ())
issuer = "Unknown"
for part in issuer_parts:
for key, value in part:
if key == "organizationName":
issuer = value
break
return SSLInfo(
domain=hostname,
issuer=issuer,
expires_at=expires_at,
days_until_expiry=days_until_expiry,
is_valid=days_until_expiry > 0
)
except ssl.SSLCertVerificationError as e:
hostname = urlparse(url).hostname or url
return SSLInfo(
domain=hostname,
issuer="Invalid",
expires_at=datetime.now(timezone.utc),
days_until_expiry=0,
is_valid=False,
error=f"SSL verification failed: {str(e)[:100]}"
)
except Exception as e:
hostname = urlparse(url).hostname or url
return SSLInfo(
domain=hostname,
issuer="Unknown",
expires_at=datetime.now(timezone.utc),
days_until_expiry=0,
is_valid=False,
error=str(e)[:100]
)
async def check_and_alert_ssl(url: str, warn_days: int = 14) -> Optional[SSLInfo]:
"""Check SSL and send alerts if needed."""
ssl_info = check_ssl_certificate(url)
if not ssl_info:
return None
# Read the previous record before overwriting it, so the once-per-day
# throttle below compares against the prior check rather than the row
# we are about to save
prev_info = get_ssl_info(ssl_info.domain)
# Save to database
save_ssl_info(
domain=ssl_info.domain,
issuer=ssl_info.issuer,
expires_at=ssl_info.expires_at,
days_until_expiry=ssl_info.days_until_expiry
)
# Check if we need to alert
if ssl_info.days_until_expiry <= 0:
# Certificate expired
await alert_ssl_expired(ssl_info.domain)
elif ssl_info.days_until_expiry <= warn_days:
# Certificate expiring soon - alert once per day
should_alert = True
if prev_info and prev_info.get("checked_at"):
# Check if we already alerted today
last_check = datetime.fromisoformat(prev_info["checked_at"])
# SQLite CURRENT_TIMESTAMP is UTC, so compare in UTC as well
if (datetime.now(timezone.utc) - last_check.replace(tzinfo=timezone.utc)).days < 1:
should_alert = False
if should_alert:
await alert_ssl_expiring(ssl_info.domain, ssl_info.days_until_expiry)
return ssl_info
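The synchronous checker also works standalone as a quick smoke test against any public HTTPS site:

from ssl_monitor import check_ssl_certificate

info = check_ssl_certificate("https://example.com")
if info:
    print(info.domain, info.issuer, info.days_until_expiry, info.is_valid)
    if info.error:
        print("error:", info.error)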

status-service/templates/index.html

@@ -4,6 +4,7 @@
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>System Status</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style>
* {
margin: 0;
@@ -19,7 +20,7 @@
}
.container {
max-width: 900px;
max-width: 1100px;
margin: 0 auto;
padding: 40px 20px;
}
@@ -39,6 +40,13 @@
background-clip: text;
}
h2 {
font-size: 1.3rem;
font-weight: 600;
margin: 30px 0 16px 0;
color: #94a3b8;
}
.overall-status {
display: inline-flex;
align-items: center;
@@ -174,8 +182,9 @@
.service-metrics {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 12px;
margin-bottom: 16px;
}
.metric {
@@ -212,6 +221,132 @@
color: #fca5a5;
}
/* Latency chart */
.latency-chart {
height: 60px;
margin-top: 12px;
}
/* SSL Card */
.ssl-card {
background: rgba(30, 41, 59, 0.5);
border: 1px solid rgba(100, 116, 139, 0.2);
border-radius: 16px;
padding: 20px;
margin-bottom: 20px;
}
.ssl-card.warning {
border-color: rgba(250, 204, 21, 0.3);
}
.ssl-card.danger {
border-color: rgba(239, 68, 68, 0.3);
}
.ssl-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 12px;
}
.ssl-title {
font-size: 1.1rem;
font-weight: 600;
color: #f1f5f9;
}
.ssl-badge {
padding: 4px 12px;
border-radius: 20px;
font-size: 0.8rem;
font-weight: 500;
}
.ssl-badge.valid {
background: rgba(34, 197, 94, 0.15);
color: #22c55e;
}
.ssl-badge.expiring {
background: rgba(250, 204, 21, 0.15);
color: #facc15;
}
.ssl-badge.expired {
background: rgba(239, 68, 68, 0.15);
color: #ef4444;
}
.ssl-info {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 12px;
}
/* Incidents */
.incidents-list {
background: rgba(30, 41, 59, 0.5);
border: 1px solid rgba(100, 116, 139, 0.2);
border-radius: 16px;
overflow: hidden;
}
.incident-item {
padding: 16px 20px;
border-bottom: 1px solid rgba(100, 116, 139, 0.1);
display: flex;
justify-content: space-between;
align-items: center;
}
.incident-item:last-child {
border-bottom: none;
}
.incident-info {
display: flex;
align-items: center;
gap: 12px;
}
.incident-dot {
width: 10px;
height: 10px;
border-radius: 50%;
}
.incident-dot.resolved {
background: #22c55e;
}
.incident-dot.open {
background: #ef4444;
animation: pulse 2s infinite;
}
.incident-service {
font-weight: 500;
color: #f1f5f9;
}
.incident-message {
font-size: 0.85rem;
color: #94a3b8;
}
.incident-time {
font-size: 0.85rem;
color: #64748b;
}
.no-incidents {
padding: 30px;
text-align: center;
color: #64748b;
}
.refresh-btn {
display: inline-flex;
align-items: center;
@@ -292,8 +427,42 @@
</p>
</header>
{% if ssl_status %}
<div class="ssl-card {% if ssl_status.days_until_expiry <= 0 %}danger{% elif ssl_status.days_until_expiry <= 14 %}warning{% endif %}">
<div class="ssl-header">
<span class="ssl-title">SSL Certificate</span>
<span class="ssl-badge {% if ssl_status.days_until_expiry <= 0 %}expired{% elif ssl_status.days_until_expiry <= 14 %}expiring{% else %}valid{% endif %}">
{% if ssl_status.days_until_expiry <= 0 %}
Expired
{% elif ssl_status.days_until_expiry <= 14 %}
Expiring Soon
{% else %}
Valid
{% endif %}
</span>
</div>
<div class="ssl-info">
<div class="metric">
<div class="metric-label">Domain</div>
<div class="metric-value">{{ ssl_status.domain }}</div>
</div>
<div class="metric">
<div class="metric-label">Issuer</div>
<div class="metric-value">{{ ssl_status.issuer }}</div>
</div>
<div class="metric">
<div class="metric-label">Days Left</div>
<div class="metric-value {% if ssl_status.days_until_expiry <= 0 %}bad{% elif ssl_status.days_until_expiry <= 14 %}warning{% else %}good{% endif %}">
{{ ssl_status.days_until_expiry }}
</div>
</div>
</div>
</div>
{% endif %}
<div class="services-grid">
{% for name, service in services.items() %}
{% if service.status.value != 'unknown' or name != 'external' %}
<div class="service-card">
<div class="service-header">
<span class="service-name">{{ service.display_name }}</span>
@@ -322,7 +491,17 @@
</div>
</div>
<div class="metric">
<div class="metric-label">Uptime</div>
<div class="metric-label">Avg 24h</div>
<div class="metric-value {% if service.avg_latency_24h and service.avg_latency_24h < 200 %}good{% elif service.avg_latency_24h and service.avg_latency_24h < 500 %}warning{% elif service.avg_latency_24h %}bad{% endif %}">
{% if service.avg_latency_24h %}
{{ "%.0f"|format(service.avg_latency_24h) }} ms
{% else %}
{% endif %}
</div>
</div>
<div class="metric">
<div class="metric-label">Uptime 24h</div>
<div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}">
{{ "%.1f"|format(service.uptime_percent) }}%
</div>
@@ -333,20 +512,49 @@
<div class="metric-value">{{ service.version }}</div>
</div>
{% endif %}
{% if service.last_incident %}
<div class="metric">
<div class="metric-label">Last Incident</div>
<div class="metric-value warning">{{ service.last_incident.strftime('%d.%m %H:%M') }}</div>
</div>
{% endif %}
</div>
{% if service.latency_history and service.latency_history|length > 1 %}
<div class="latency-chart">
<canvas id="chart-{{ name }}"></canvas>
</div>
{% endif %}
{% if service.message %}
<div class="service-message">{{ service.message }}</div>
{% endif %}
</div>
{% endif %}
{% endfor %}
</div>
<h2>Recent Incidents</h2>
<div class="incidents-list">
{% if incidents and incidents|length > 0 %}
{% for incident in incidents %}
<div class="incident-item">
<div class="incident-info">
<span class="incident-dot {% if incident.resolved_at %}resolved{% else %}open{% endif %}"></span>
<div>
<div class="incident-service">{{ incident.service_name | title }}</div>
<div class="incident-message">{{ incident.message or 'Service unavailable' }}</div>
</div>
</div>
<div class="incident-time">
{{ incident.started_at[:16].replace('T', ' ') }}
{% if incident.resolved_at %}
- Resolved
{% else %}
- Ongoing
{% endif %}
</div>
</div>
{% endfor %}
{% else %}
<div class="no-incidents">
No recent incidents
</div>
{% endif %}
</div>
<center>
<button class="refresh-btn" onclick="refreshStatus(this)">
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
@@ -363,6 +571,55 @@
</div>
<script>
// Initialize latency charts
{% for name, service in services.items() %}
{% if service.latency_history and service.latency_history|length > 1 %}
(function() {
const ctx = document.getElementById('chart-{{ name }}').getContext('2d');
const data = {{ service.latency_history | tojson }};
new Chart(ctx, {
type: 'line',
data: {
labels: data.map(d => ''),
datasets: [{
data: data.map(d => d.latency_ms),
borderColor: '#00d4ff',
backgroundColor: 'rgba(0, 212, 255, 0.1)',
fill: true,
tension: 0.4,
pointRadius: 0,
borderWidth: 2
}]
},
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: { display: false },
tooltip: {
callbacks: {
label: (ctx) => ctx.raw.toFixed(0) + ' ms'
}
}
},
scales: {
x: { display: false },
y: {
display: false,
beginAtZero: true
}
},
interaction: {
intersect: false,
mode: 'index'
}
}
});
})();
{% endif %}
{% endfor %}
async function refreshStatus(btn) {
btn.classList.add('loading');
btn.disabled = true;