Redesign health service + create backup service

This commit is contained in:
2025-12-18 03:35:13 +07:00
parent e43e579329
commit 57bad3b4a8
16 changed files with 1486 additions and 98 deletions

View File

@@ -20,5 +20,14 @@ S3_SECRET_ACCESS_KEY=your-secret-access-key
S3_ENDPOINT_URL=https://s3.firstvds.ru S3_ENDPOINT_URL=https://s3.firstvds.ru
S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru
# Backup Service
TELEGRAM_ADMIN_ID=947392854
S3_BACKUP_PREFIX=backups/
BACKUP_RETENTION_DAYS=14
# Status Service (optional - for external monitoring)
EXTERNAL_URL=https://your-domain.com
PUBLIC_URL=https://your-domain.com
# Frontend (for build) # Frontend (for build)
VITE_API_URL=/api/v1 VITE_API_URL=/api/v1

View File

@@ -31,6 +31,12 @@ help:
@echo " make shell - Open backend shell" @echo " make shell - Open backend shell"
@echo " make frontend-sh - Open frontend shell" @echo " make frontend-sh - Open frontend shell"
@echo "" @echo ""
@echo " Backup:"
@echo " make backup-now - Run backup immediately"
@echo " make backup-list - List available backups in S3"
@echo " make backup-restore - Restore from backup (interactive)"
@echo " make backup-logs - Show backup service logs"
@echo ""
@echo " Cleanup:" @echo " Cleanup:"
@echo " make clean - Stop and remove containers, volumes" @echo " make clean - Stop and remove containers, volumes"
@echo " make prune - Remove unused Docker resources" @echo " make prune - Remove unused Docker resources"
@@ -137,3 +143,20 @@ test-backend:
# Production # Production
prod: prod:
$(DC) -f docker-compose.yml up -d --build $(DC) -f docker-compose.yml up -d --build
# Backup
backup-now:
$(DC) exec backup python /app/backup.py
backup-list:
$(DC) exec backup python /app/restore.py
backup-restore:
@read -p "Backup filename: " file; \
$(DC) exec -it backup python /app/restore.py "$$file"
backup-logs:
$(DC) logs -f backup
backup-shell:
$(DC) exec backup bash

30
backup-service/Dockerfile Normal file
View File

@@ -0,0 +1,30 @@
FROM python:3.11-slim
WORKDIR /app
# Install PostgreSQL client (for pg_dump and psql) and cron
RUN apt-get update && apt-get install -y \
postgresql-client \
cron \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application
COPY . .
# Make scripts executable
RUN chmod +x backup.py restore.py
# Setup cron
COPY crontab /etc/cron.d/backup-cron
RUN chmod 0644 /etc/cron.d/backup-cron
RUN crontab /etc/cron.d/backup-cron
# Create log file
RUN touch /var/log/cron.log
# Start cron in foreground and tail logs
CMD ["sh", "-c", "printenv > /etc/environment && cron && tail -f /var/log/cron.log"]

217
backup-service/backup.py Normal file
View File

@@ -0,0 +1,217 @@
#!/usr/bin/env python3
"""
PostgreSQL Backup Service for WebApp.
- Creates pg_dump backup
- Compresses with gzip
- Uploads to S3 FirstVDS
- Rotates old backups (configurable retention)
- Sends Telegram notifications
"""
import gzip
import os
import subprocess
import sys
from datetime import datetime, timedelta, timezone
import boto3
import httpx
from botocore.config import Config as BotoConfig
from botocore.exceptions import ClientError
from config import config
def create_s3_client():
"""Initialize S3 client (same pattern as backend storage.py)."""
return boto3.client(
"s3",
endpoint_url=config.S3_ENDPOINT_URL,
aws_access_key_id=config.S3_ACCESS_KEY_ID,
aws_secret_access_key=config.S3_SECRET_ACCESS_KEY,
region_name=config.S3_REGION or "us-east-1",
config=BotoConfig(signature_version="s3v4"),
)
def send_telegram_notification(message: str, is_error: bool = False) -> None:
"""Send notification to Telegram admin."""
if not config.TELEGRAM_BOT_TOKEN or not config.TELEGRAM_ADMIN_ID:
print("Telegram not configured, skipping notification")
return
emoji = "\u274c" if is_error else "\u2705"
text = f"{emoji} *Database Backup*\n\n{message}"
url = f"https://api.telegram.org/bot{config.TELEGRAM_BOT_TOKEN}/sendMessage"
data = {
"chat_id": config.TELEGRAM_ADMIN_ID,
"text": text,
"parse_mode": "Markdown",
}
try:
response = httpx.post(url, json=data, timeout=30)
response.raise_for_status()
print("Telegram notification sent")
except Exception as e:
print(f"Failed to send Telegram notification: {e}")
def create_backup() -> tuple[str, bytes]:
"""Create pg_dump backup and compress it."""
timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
filename = f"marathon_backup_{timestamp}.sql.gz"
# Build pg_dump command
env = os.environ.copy()
env["PGPASSWORD"] = config.DB_PASSWORD
cmd = [
"pg_dump",
"-h",
config.DB_HOST,
"-p",
config.DB_PORT,
"-U",
config.DB_USER,
"-d",
config.DB_NAME,
"--no-owner",
"--no-acl",
"-F",
"p", # plain SQL format
]
print(f"Running pg_dump for database {config.DB_NAME}...")
result = subprocess.run(
cmd,
env=env,
capture_output=True,
)
if result.returncode != 0:
raise Exception(f"pg_dump failed: {result.stderr.decode()}")
# Compress the output
print("Compressing backup...")
compressed = gzip.compress(result.stdout, compresslevel=9)
return filename, compressed
def upload_to_s3(s3_client, filename: str, data: bytes) -> str:
"""Upload backup to S3."""
key = f"{config.S3_BACKUP_PREFIX}{filename}"
print(f"Uploading to S3: {key}...")
s3_client.put_object(
Bucket=config.S3_BUCKET_NAME,
Key=key,
Body=data,
ContentType="application/gzip",
)
return key
def rotate_old_backups(s3_client) -> int:
"""Delete backups older than BACKUP_RETENTION_DAYS."""
cutoff_date = datetime.now(timezone.utc) - timedelta(
days=config.BACKUP_RETENTION_DAYS
)
deleted_count = 0
print(f"Rotating backups older than {config.BACKUP_RETENTION_DAYS} days...")
# List all objects with backup prefix
try:
paginator = s3_client.get_paginator("list_objects_v2")
pages = paginator.paginate(
Bucket=config.S3_BUCKET_NAME,
Prefix=config.S3_BACKUP_PREFIX,
)
for page in pages:
for obj in page.get("Contents", []):
last_modified = obj["LastModified"]
if last_modified.tzinfo is None:
last_modified = last_modified.replace(tzinfo=timezone.utc)
if last_modified < cutoff_date:
s3_client.delete_object(
Bucket=config.S3_BUCKET_NAME,
Key=obj["Key"],
)
deleted_count += 1
print(f"Deleted old backup: {obj['Key']}")
except ClientError as e:
print(f"Error during rotation: {e}")
return deleted_count
def main() -> int:
"""Main backup routine."""
start_time = datetime.now()
print(f"{'=' * 50}")
print(f"Backup started at {start_time}")
print(f"{'=' * 50}")
try:
# Validate configuration
if not config.S3_BUCKET_NAME:
raise Exception("S3_BUCKET_NAME is not configured")
if not config.S3_ACCESS_KEY_ID:
raise Exception("S3_ACCESS_KEY_ID is not configured")
if not config.S3_SECRET_ACCESS_KEY:
raise Exception("S3_SECRET_ACCESS_KEY is not configured")
if not config.S3_ENDPOINT_URL:
raise Exception("S3_ENDPOINT_URL is not configured")
# Create S3 client
s3_client = create_s3_client()
# Create backup
filename, data = create_backup()
size_mb = len(data) / (1024 * 1024)
print(f"Backup created: {filename} ({size_mb:.2f} MB)")
# Upload to S3
s3_key = upload_to_s3(s3_client, filename, data)
print(f"Uploaded to S3: {s3_key}")
# Rotate old backups
deleted_count = rotate_old_backups(s3_client)
print(f"Deleted {deleted_count} old backups")
# Calculate duration
duration = datetime.now() - start_time
# Send success notification
message = (
f"Backup completed successfully!\n\n"
f"*File:* `{filename}`\n"
f"*Size:* {size_mb:.2f} MB\n"
f"*Duration:* {duration.seconds}s\n"
f"*Deleted old:* {deleted_count} files"
)
send_telegram_notification(message, is_error=False)
print(f"{'=' * 50}")
print("Backup completed successfully!")
print(f"{'=' * 50}")
return 0
except Exception as e:
error_msg = f"Backup failed!\n\n*Error:* `{str(e)}`"
send_telegram_notification(error_msg, is_error=True)
print(f"{'=' * 50}")
print(f"Backup failed: {e}")
print(f"{'=' * 50}")
return 1
if __name__ == "__main__":
sys.exit(main())

33
backup-service/config.py Normal file
View File

@@ -0,0 +1,33 @@
"""Configuration for backup service."""
import os
from dataclasses import dataclass
@dataclass
class Config:
"""Backup service configuration from environment variables."""
# Database
DB_HOST: str = os.getenv("DB_HOST", "db")
DB_PORT: str = os.getenv("DB_PORT", "5432")
DB_NAME: str = os.getenv("DB_NAME", "marathon")
DB_USER: str = os.getenv("DB_USER", "marathon")
DB_PASSWORD: str = os.getenv("DB_PASSWORD", "123")
# S3
S3_BUCKET_NAME: str = os.getenv("S3_BUCKET_NAME", "")
S3_REGION: str = os.getenv("S3_REGION", "ru-1")
S3_ACCESS_KEY_ID: str = os.getenv("S3_ACCESS_KEY_ID", "")
S3_SECRET_ACCESS_KEY: str = os.getenv("S3_SECRET_ACCESS_KEY", "")
S3_ENDPOINT_URL: str = os.getenv("S3_ENDPOINT_URL", "")
S3_BACKUP_PREFIX: str = os.getenv("S3_BACKUP_PREFIX", "backups/")
# Telegram
TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_ADMIN_ID: str = os.getenv("TELEGRAM_ADMIN_ID", "947392854")
# Backup settings
BACKUP_RETENTION_DAYS: int = int(os.getenv("BACKUP_RETENTION_DAYS", "14"))
config = Config()

4
backup-service/crontab Normal file
View File

@@ -0,0 +1,4 @@
# Backup cron job
# Run backup daily at 3:00 AM UTC
0 3 * * * /usr/local/bin/python /app/backup.py >> /var/log/cron.log 2>&1
# Empty line required at end of crontab

View File

@@ -0,0 +1,2 @@
boto3==1.34.0
httpx==0.26.0

158
backup-service/restore.py Normal file
View File

@@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""
Restore PostgreSQL database from S3 backup.
Usage:
python restore.py - List available backups
python restore.py <filename> - Restore from specific backup
"""
import gzip
import os
import subprocess
import sys
import boto3
from botocore.config import Config as BotoConfig
from botocore.exceptions import ClientError
from config import config
def create_s3_client():
"""Initialize S3 client."""
return boto3.client(
"s3",
endpoint_url=config.S3_ENDPOINT_URL,
aws_access_key_id=config.S3_ACCESS_KEY_ID,
aws_secret_access_key=config.S3_SECRET_ACCESS_KEY,
region_name=config.S3_REGION or "us-east-1",
config=BotoConfig(signature_version="s3v4"),
)
def list_backups(s3_client) -> list[tuple[str, float, str]]:
"""List all available backups."""
print("Available backups:\n")
try:
paginator = s3_client.get_paginator("list_objects_v2")
pages = paginator.paginate(
Bucket=config.S3_BUCKET_NAME,
Prefix=config.S3_BACKUP_PREFIX,
)
backups = []
for page in pages:
for obj in page.get("Contents", []):
filename = obj["Key"].replace(config.S3_BACKUP_PREFIX, "")
size_mb = obj["Size"] / (1024 * 1024)
modified = obj["LastModified"].strftime("%Y-%m-%d %H:%M:%S")
backups.append((filename, size_mb, modified))
# Sort by date descending (newest first)
backups.sort(key=lambda x: x[2], reverse=True)
for filename, size_mb, modified in backups:
print(f" {filename} ({size_mb:.2f} MB) - {modified}")
return backups
except ClientError as e:
print(f"Error listing backups: {e}")
return []
def restore_backup(s3_client, filename: str) -> None:
"""Download and restore backup."""
key = f"{config.S3_BACKUP_PREFIX}{filename}"
print(f"Downloading {filename} from S3...")
try:
response = s3_client.get_object(
Bucket=config.S3_BUCKET_NAME,
Key=key,
)
compressed_data = response["Body"].read()
except ClientError as e:
raise Exception(f"Failed to download backup: {e}")
print("Decompressing...")
sql_data = gzip.decompress(compressed_data)
print(f"Restoring to database {config.DB_NAME}...")
# Build psql command
env = os.environ.copy()
env["PGPASSWORD"] = config.DB_PASSWORD
cmd = [
"psql",
"-h",
config.DB_HOST,
"-p",
config.DB_PORT,
"-U",
config.DB_USER,
"-d",
config.DB_NAME,
]
result = subprocess.run(
cmd,
env=env,
input=sql_data,
capture_output=True,
)
if result.returncode != 0:
stderr = result.stderr.decode()
# psql may return warnings that aren't fatal errors
if "ERROR" in stderr:
raise Exception(f"psql restore failed: {stderr}")
else:
print(f"Warnings: {stderr}")
print("Restore completed successfully!")
def main() -> int:
"""Main restore routine."""
# Validate configuration
if not config.S3_BUCKET_NAME:
print("Error: S3_BUCKET_NAME is not configured")
return 1
s3_client = create_s3_client()
if len(sys.argv) < 2:
# List available backups
backups = list_backups(s3_client)
if backups:
print(f"\nTo restore, run: python restore.py <filename>")
else:
print("No backups found.")
return 0
filename = sys.argv[1]
# Confirm restore
print(f"WARNING: This will restore database from {filename}")
print("This may overwrite existing data!")
print()
confirm = input("Type 'yes' to continue: ")
if confirm.lower() != "yes":
print("Restore cancelled.")
return 0
try:
restore_backup(s3_client, filename)
return 0
except Exception as e:
print(f"Restore failed: {e}")
return 1
if __name__ == "__main__":
sys.exit(main())

View File

@@ -94,7 +94,13 @@ services:
BACKEND_URL: http://backend:8000 BACKEND_URL: http://backend:8000
FRONTEND_URL: http://frontend:80 FRONTEND_URL: http://frontend:80
BOT_URL: http://bot:8080 BOT_URL: http://bot:8080
EXTERNAL_URL: ${EXTERNAL_URL:-}
PUBLIC_URL: ${PUBLIC_URL:-}
CHECK_INTERVAL: "30" CHECK_INTERVAL: "30"
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854}
volumes:
- status_data:/app/data
ports: ports:
- "8001:8001" - "8001:8001"
depends_on: depends_on:
@@ -103,5 +109,31 @@ services:
- bot - bot
restart: unless-stopped restart: unless-stopped
backup:
build:
context: ./backup-service
dockerfile: Dockerfile
container_name: marathon-backup
environment:
DB_HOST: db
DB_PORT: "5432"
DB_NAME: marathon
DB_USER: marathon
DB_PASSWORD: ${DB_PASSWORD:-marathon}
S3_BUCKET_NAME: ${S3_BUCKET_NAME:-}
S3_REGION: ${S3_REGION:-ru-1}
S3_ACCESS_KEY_ID: ${S3_ACCESS_KEY_ID:-}
S3_SECRET_ACCESS_KEY: ${S3_SECRET_ACCESS_KEY:-}
S3_ENDPOINT_URL: ${S3_ENDPOINT_URL:-}
S3_BACKUP_PREFIX: ${S3_BACKUP_PREFIX:-backups/}
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854}
BACKUP_RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-14}
depends_on:
db:
condition: service_healthy
restart: unless-stopped
volumes: volumes:
postgres_data: postgres_data:
status_data:

View File

@@ -6,6 +6,9 @@ WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
# Create data directory for SQLite
RUN mkdir -p /app/data
# Copy application # Copy application
COPY . . COPY . .

85
status-service/alerts.py Normal file
View File

@@ -0,0 +1,85 @@
"""Telegram alerting for status changes."""
import os
from datetime import datetime
from typing import Optional
import httpx
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_ADMIN_ID = os.getenv("TELEGRAM_ADMIN_ID", "")
async def send_telegram_alert(message: str, is_recovery: bool = False) -> bool:
"""Send alert to Telegram."""
if not TELEGRAM_BOT_TOKEN or not TELEGRAM_ADMIN_ID:
print("Telegram alerting not configured")
return False
emoji = "\u2705" if is_recovery else "\u26a0\ufe0f"
text = f"{emoji} *Status Alert*\n\n{message}"
url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
data = {
"chat_id": TELEGRAM_ADMIN_ID,
"text": text,
"parse_mode": "Markdown",
}
try:
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.post(url, json=data)
response.raise_for_status()
print(f"Telegram alert sent: {message[:50]}...")
return True
except Exception as e:
print(f"Failed to send Telegram alert: {e}")
return False
async def alert_service_down(service_name: str, display_name: str, message: Optional[str]):
"""Alert when service goes down."""
now = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
text = (
f"*{display_name}* is DOWN\n\n"
f"Time: `{now}`\n"
)
if message:
text += f"Error: `{message}`"
await send_telegram_alert(text, is_recovery=False)
async def alert_service_recovered(service_name: str, display_name: str, downtime_minutes: int):
"""Alert when service recovers."""
now = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
text = (
f"*{display_name}* is back ONLINE\n\n"
f"Time: `{now}`\n"
f"Downtime: `{downtime_minutes} min`"
)
await send_telegram_alert(text, is_recovery=True)
async def alert_ssl_expiring(domain: str, days_left: int):
"""Alert when SSL certificate is expiring soon."""
text = (
f"*SSL Certificate Expiring*\n\n"
f"Domain: `{domain}`\n"
f"Days left: `{days_left}`\n\n"
f"Please renew the certificate!"
)
await send_telegram_alert(text, is_recovery=False)
async def alert_ssl_expired(domain: str):
"""Alert when SSL certificate has expired."""
text = (
f"*SSL Certificate EXPIRED*\n\n"
f"Domain: `{domain}`\n\n"
f"Certificate has expired! Site may show security warnings."
)
await send_telegram_alert(text, is_recovery=False)

261
status-service/database.py Normal file
View File

@@ -0,0 +1,261 @@
"""SQLite database for storing metrics history."""
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional
import json
DB_PATH = Path("/app/data/metrics.db")
def get_connection() -> sqlite3.Connection:
"""Get database connection."""
DB_PATH.parent.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(str(DB_PATH))
conn.row_factory = sqlite3.Row
return conn
def init_db():
"""Initialize database tables."""
conn = get_connection()
cursor = conn.cursor()
# Metrics history table
cursor.execute("""
CREATE TABLE IF NOT EXISTS metrics (
id INTEGER PRIMARY KEY AUTOINCREMENT,
service_name TEXT NOT NULL,
status TEXT NOT NULL,
latency_ms REAL,
message TEXT,
checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Incidents table
cursor.execute("""
CREATE TABLE IF NOT EXISTS incidents (
id INTEGER PRIMARY KEY AUTOINCREMENT,
service_name TEXT NOT NULL,
status TEXT NOT NULL,
message TEXT,
started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
resolved_at TIMESTAMP,
notified BOOLEAN DEFAULT FALSE
)
""")
# SSL certificates table
cursor.execute("""
CREATE TABLE IF NOT EXISTS ssl_certificates (
id INTEGER PRIMARY KEY AUTOINCREMENT,
domain TEXT NOT NULL UNIQUE,
issuer TEXT,
expires_at TIMESTAMP,
days_until_expiry INTEGER,
checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
)
""")
# Create indexes
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_metrics_service_time
ON metrics(service_name, checked_at DESC)
""")
cursor.execute("""
CREATE INDEX IF NOT EXISTS idx_incidents_service
ON incidents(service_name, started_at DESC)
""")
conn.commit()
conn.close()
def save_metric(service_name: str, status: str, latency_ms: Optional[float], message: Optional[str]):
"""Save a metric record."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute(
"INSERT INTO metrics (service_name, status, latency_ms, message) VALUES (?, ?, ?, ?)",
(service_name, status, latency_ms, message)
)
conn.commit()
conn.close()
def get_latency_history(service_name: str, hours: int = 24) -> list[dict]:
"""Get latency history for a service."""
conn = get_connection()
cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours)
cursor.execute("""
SELECT latency_ms, status, checked_at
FROM metrics
WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL
ORDER BY checked_at ASC
""", (service_name, since.isoformat()))
rows = cursor.fetchall()
conn.close()
return [
{
"latency_ms": row["latency_ms"],
"status": row["status"],
"checked_at": row["checked_at"]
}
for row in rows
]
def get_uptime_stats(service_name: str, hours: int = 24) -> dict:
"""Calculate uptime statistics for a service."""
conn = get_connection()
cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours)
cursor.execute("""
SELECT COUNT(*) as total,
SUM(CASE WHEN status = 'operational' THEN 1 ELSE 0 END) as successful
FROM metrics
WHERE service_name = ? AND checked_at > ?
""", (service_name, since.isoformat()))
row = cursor.fetchone()
conn.close()
total = row["total"] or 0
successful = row["successful"] or 0
return {
"total_checks": total,
"successful_checks": successful,
"uptime_percent": (successful / total * 100) if total > 0 else 100.0
}
def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]:
"""Get average latency for a service."""
conn = get_connection()
cursor = conn.cursor()
since = datetime.now() - timedelta(hours=hours)
cursor.execute("""
SELECT AVG(latency_ms) as avg_latency
FROM metrics
WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL
""", (service_name, since.isoformat()))
row = cursor.fetchone()
conn.close()
return row["avg_latency"]
def create_incident(service_name: str, status: str, message: Optional[str]) -> int:
"""Create a new incident."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute(
"INSERT INTO incidents (service_name, status, message) VALUES (?, ?, ?)",
(service_name, status, message)
)
incident_id = cursor.lastrowid
conn.commit()
conn.close()
return incident_id
def resolve_incident(service_name: str):
"""Resolve open incidents for a service."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("""
UPDATE incidents
SET resolved_at = CURRENT_TIMESTAMP
WHERE service_name = ? AND resolved_at IS NULL
""", (service_name,))
conn.commit()
conn.close()
def get_open_incident(service_name: str) -> Optional[dict]:
"""Get open incident for a service."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT * FROM incidents
WHERE service_name = ? AND resolved_at IS NULL
ORDER BY started_at DESC LIMIT 1
""", (service_name,))
row = cursor.fetchone()
conn.close()
if row:
return dict(row)
return None
def mark_incident_notified(incident_id: int):
"""Mark incident as notified."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("UPDATE incidents SET notified = TRUE WHERE id = ?", (incident_id,))
conn.commit()
conn.close()
def get_recent_incidents(limit: int = 10) -> list[dict]:
"""Get recent incidents."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("""
SELECT * FROM incidents
ORDER BY started_at DESC
LIMIT ?
""", (limit,))
rows = cursor.fetchall()
conn.close()
return [dict(row) for row in rows]
def save_ssl_info(domain: str, issuer: str, expires_at: datetime, days_until_expiry: int):
"""Save SSL certificate info."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("""
INSERT OR REPLACE INTO ssl_certificates
(domain, issuer, expires_at, days_until_expiry, checked_at)
VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
""", (domain, issuer, expires_at.isoformat(), days_until_expiry))
conn.commit()
conn.close()
def get_ssl_info(domain: str) -> Optional[dict]:
"""Get SSL certificate info."""
conn = get_connection()
cursor = conn.cursor()
cursor.execute("SELECT * FROM ssl_certificates WHERE domain = ?", (domain,))
row = cursor.fetchone()
conn.close()
if row:
return dict(row)
return None
def cleanup_old_metrics(days: int = 7):
"""Delete metrics older than specified days."""
conn = get_connection()
cursor = conn.cursor()
cutoff = datetime.now() - timedelta(days=days)
cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (cutoff.isoformat(),))
deleted = cursor.rowcount
conn.commit()
conn.close()
return deleted

View File

@@ -1,6 +1,7 @@
"""Status monitoring service with persistence and alerting."""
import os import os
import asyncio import asyncio
from datetime import datetime, timedelta from datetime import datetime
from typing import Optional from typing import Optional
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
@@ -8,13 +9,16 @@ from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from monitors import ServiceMonitor, ServiceStatus from monitors import ServiceMonitor
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
# Configuration # Configuration
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000") BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80") FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080") BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks
PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30")) CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))
# Initialize monitor # Initialize monitor
@@ -22,38 +26,64 @@ monitor = ServiceMonitor()
# Background task reference # Background task reference
background_task: Optional[asyncio.Task] = None background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None
async def periodic_health_check(): async def periodic_health_check():
"""Background task to check services periodically""" """Background task to check services periodically."""
while True: while True:
try:
await monitor.check_all_services( await monitor.check_all_services(
backend_url=BACKEND_URL, backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL, frontend_url=FRONTEND_URL,
bot_url=BOT_URL bot_url=BOT_URL,
external_url=EXTERNAL_URL,
public_url=PUBLIC_URL
) )
except Exception as e:
print(f"Health check error: {e}")
await asyncio.sleep(CHECK_INTERVAL) await asyncio.sleep(CHECK_INTERVAL)
async def periodic_cleanup():
"""Background task to cleanup old metrics (daily)."""
while True:
await asyncio.sleep(86400) # 24 hours
try:
deleted = cleanup_old_metrics(days=7)
print(f"Cleaned up {deleted} old metrics")
except Exception as e:
print(f"Cleanup error: {e}")
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
"""Startup and shutdown events""" """Startup and shutdown events."""
global background_task global background_task, cleanup_task
# Initialize database
init_db()
print("Database initialized")
# Start background health checks # Start background health checks
background_task = asyncio.create_task(periodic_health_check()) background_task = asyncio.create_task(periodic_health_check())
cleanup_task = asyncio.create_task(periodic_cleanup())
yield yield
# Cancel background task on shutdown
if background_task: # Cancel background tasks on shutdown
background_task.cancel() for task in [background_task, cleanup_task]:
if task:
task.cancel()
try: try:
await background_task await task
except asyncio.CancelledError: except asyncio.CancelledError:
pass pass
app = FastAPI( app = FastAPI(
title="Status Monitor", title="Status Monitor",
description="Service health monitoring", description="Service health monitoring with persistence and alerting",
lifespan=lifespan lifespan=lifespan
) )
@@ -62,9 +92,11 @@ templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse) @app.get("/", response_class=HTMLResponse)
async def status_page(request: Request): async def status_page(request: Request):
"""Main status page""" """Main status page."""
services = monitor.get_all_statuses() services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status() overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
incidents = get_recent_incidents(limit=5)
return templates.TemplateResponse( return templates.TemplateResponse(
"index.html", "index.html",
@@ -72,6 +104,8 @@ async def status_page(request: Request):
"request": request, "request": request,
"services": services, "services": services,
"overall_status": overall_status, "overall_status": overall_status,
"ssl_status": ssl_status,
"incidents": incidents,
"last_check": monitor.last_check, "last_check": monitor.last_check,
"check_interval": CHECK_INTERVAL "check_interval": CHECK_INTERVAL
} }
@@ -80,30 +114,52 @@ async def status_page(request: Request):
@app.get("/api/status") @app.get("/api/status")
async def api_status(): async def api_status():
"""API endpoint for service statuses""" """API endpoint for service statuses."""
services = monitor.get_all_statuses() services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status() overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
return { return {
"overall_status": overall_status, "overall_status": overall_status.value,
"services": {name: status.to_dict() for name, status in services.items()}, "services": {name: status.to_dict() for name, status in services.items()},
"ssl": ssl_status,
"last_check": monitor.last_check.isoformat() if monitor.last_check else None, "last_check": monitor.last_check.isoformat() if monitor.last_check else None,
"check_interval_seconds": CHECK_INTERVAL "check_interval_seconds": CHECK_INTERVAL
} }
@app.get("/api/history/{service_name}")
async def api_history(service_name: str, hours: int = 24):
"""API endpoint for service latency history."""
history = get_latency_history(service_name, hours=hours)
return {
"service": service_name,
"hours": hours,
"data": history
}
@app.get("/api/incidents")
async def api_incidents(limit: int = 20):
"""API endpoint for recent incidents."""
incidents = get_recent_incidents(limit=limit)
return {"incidents": incidents}
@app.get("/api/health") @app.get("/api/health")
async def health(): async def health():
"""Health check for this service""" """Health check for this service."""
return {"status": "ok", "service": "status-monitor"} return {"status": "ok", "service": "status-monitor"}
@app.post("/api/refresh") @app.post("/api/refresh")
async def refresh_status(): async def refresh_status():
"""Force refresh all service statuses""" """Force refresh all service statuses."""
await monitor.check_all_services( await monitor.check_all_services(
backend_url=BACKEND_URL, backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL, frontend_url=FRONTEND_URL,
bot_url=BOT_URL bot_url=BOT_URL,
external_url=EXTERNAL_URL,
public_url=PUBLIC_URL
) )
return {"status": "refreshed"} return {"status": "refreshed"}

View File

@@ -1,11 +1,19 @@
"""Service monitoring with persistence and alerting."""
import asyncio import asyncio
from datetime import datetime, timedelta from datetime import datetime, timedelta
from dataclasses import dataclass, field from dataclasses import dataclass
from typing import Optional from typing import Optional
from enum import Enum from enum import Enum
import httpx import httpx
from database import (
save_metric, get_latency_history, get_uptime_stats, get_avg_latency,
create_incident, resolve_incident, get_open_incident, mark_incident_notified
)
from alerts import alert_service_down, alert_service_recovered
from ssl_monitor import check_and_alert_ssl, SSLInfo
class Status(str, Enum): class Status(str, Enum):
OPERATIONAL = "operational" OPERATIONAL = "operational"
@@ -25,11 +33,17 @@ class ServiceStatus:
uptime_percent: float = 100.0 uptime_percent: float = 100.0
message: Optional[str] = None message: Optional[str] = None
version: Optional[str] = None version: Optional[str] = None
avg_latency_24h: Optional[float] = None
latency_history: list = None
# For uptime calculation # For uptime calculation (in-memory, backed by DB)
total_checks: int = 0 total_checks: int = 0
successful_checks: int = 0 successful_checks: int = 0
def __post_init__(self):
if self.latency_history is None:
self.latency_history = []
def to_dict(self) -> dict: def to_dict(self) -> dict:
return { return {
"name": self.name, "name": self.name,
@@ -40,7 +54,8 @@ class ServiceStatus:
"last_incident": self.last_incident.isoformat() if self.last_incident else None, "last_incident": self.last_incident.isoformat() if self.last_incident else None,
"uptime_percent": round(self.uptime_percent, 2), "uptime_percent": round(self.uptime_percent, 2),
"message": self.message, "message": self.message,
"version": self.version "version": self.version,
"avg_latency_24h": round(self.avg_latency_24h, 2) if self.avg_latency_24h else None,
} }
def update_uptime(self, is_success: bool): def update_uptime(self, is_success: bool):
@@ -69,12 +84,17 @@ class ServiceMonitor:
"bot": ServiceStatus( "bot": ServiceStatus(
name="bot", name="bot",
display_name="Telegram Bot" display_name="Telegram Bot"
) ),
"external": ServiceStatus(
name="external",
display_name="External Access"
),
} }
self.last_check: Optional[datetime] = None self.last_check: Optional[datetime] = None
self.ssl_info: Optional[SSLInfo] = None
async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]: async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
"""Check backend API health""" """Check backend API health."""
try: try:
async with httpx.AsyncClient(timeout=10.0) as client: async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now() start = datetime.now()
@@ -92,9 +112,7 @@ class ServiceMonitor:
return Status.DOWN, None, str(e)[:100], None return Status.DOWN, None, str(e)[:100], None
async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]: async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check database through backend""" """Check database through backend."""
# We check database indirectly - if backend is up, DB is likely up
# Could add a specific /health/db endpoint to backend later
try: try:
async with httpx.AsyncClient(timeout=10.0) as client: async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now() start = datetime.now()
@@ -109,7 +127,7 @@ class ServiceMonitor:
return Status.DOWN, None, "Cannot reach backend" return Status.DOWN, None, "Cannot reach backend"
async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check frontend availability""" """Check frontend availability."""
try: try:
async with httpx.AsyncClient(timeout=10.0) as client: async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now() start = datetime.now()
@@ -126,7 +144,7 @@ class ServiceMonitor:
return Status.DOWN, None, str(e)[:100] return Status.DOWN, None, str(e)[:100]
async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check Telegram bot health""" """Check Telegram bot health."""
try: try:
async with httpx.AsyncClient(timeout=10.0) as client: async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now() start = datetime.now()
@@ -142,8 +160,93 @@ class ServiceMonitor:
except Exception as e: except Exception as e:
return Status.DOWN, None, str(e)[:100] return Status.DOWN, None, str(e)[:100]
async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str): async def check_external(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check all services concurrently""" """Check external (public) URL availability."""
if not url:
return Status.UNKNOWN, None, "Not configured"
try:
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
start = datetime.now()
response = await client.get(url)
latency = (datetime.now() - start).total_seconds() * 1000
if response.status_code == 200:
return Status.OPERATIONAL, latency, None
else:
return Status.DEGRADED, latency, f"HTTP {response.status_code}"
except httpx.TimeoutException:
return Status.DOWN, None, "Timeout"
except Exception as e:
return Status.DOWN, None, str(e)[:100]
async def _process_check_result(
self,
service_name: str,
result: tuple,
now: datetime
):
"""Process check result with DB persistence and alerting."""
if isinstance(result, Exception):
return
if len(result) == 4:
status, latency, message, version = result
else:
status, latency, message = result
version = None
svc = self.services[service_name]
was_down = svc.status in (Status.DOWN, Status.DEGRADED)
is_down = status in (Status.DOWN, Status.DEGRADED)
# Update service status
svc.status = status
svc.latency_ms = latency
svc.message = message
if version:
svc.version = version
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
# Save metric to database
save_metric(service_name, status.value, latency, message)
# Load historical data
svc.latency_history = get_latency_history(service_name, hours=24)
svc.avg_latency_24h = get_avg_latency(service_name, hours=24)
# Update uptime from DB
stats = get_uptime_stats(service_name, hours=24)
if stats["total_checks"] > 0:
svc.uptime_percent = stats["uptime_percent"]
# Handle incident tracking and alerting
if is_down and not was_down:
# Service just went down
svc.last_incident = now
incident_id = create_incident(service_name, status.value, message)
await alert_service_down(service_name, svc.display_name, message)
mark_incident_notified(incident_id)
elif not is_down and was_down:
# Service recovered
open_incident = get_open_incident(service_name)
if open_incident:
started_at = datetime.fromisoformat(open_incident["started_at"])
downtime_minutes = int((now - started_at).total_seconds() / 60)
resolve_incident(service_name)
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
async def check_all_services(
self,
backend_url: str,
frontend_url: str,
bot_url: str,
external_url: str = "",
public_url: str = ""
):
"""Check all services concurrently."""
now = datetime.now() now = datetime.now()
# Run all checks concurrently # Run all checks concurrently
@@ -152,61 +255,18 @@ class ServiceMonitor:
self.check_database(backend_url), self.check_database(backend_url),
self.check_frontend(frontend_url), self.check_frontend(frontend_url),
self.check_bot(bot_url), self.check_bot(bot_url),
self.check_external(external_url),
return_exceptions=True return_exceptions=True
) )
# Process backend result # Process results
if not isinstance(results[0], Exception): service_names = ["backend", "database", "frontend", "bot", "external"]
status, latency, message, version = results[0] for i, service_name in enumerate(service_names):
svc = self.services["backend"] await self._process_check_result(service_name, results[i], now)
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.version = version
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process database result # Check SSL certificate (if public URL is HTTPS)
if not isinstance(results[1], Exception): if public_url and public_url.startswith("https://"):
status, latency, message = results[1] self.ssl_info = await check_and_alert_ssl(public_url)
svc = self.services["database"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process frontend result
if not isinstance(results[2], Exception):
status, latency, message = results[2]
svc = self.services["frontend"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process bot result
if not isinstance(results[3], Exception):
status, latency, message = results[3]
svc = self.services["bot"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
self.last_check = now self.last_check = now
@@ -214,8 +274,12 @@ class ServiceMonitor:
return self.services return self.services
def get_overall_status(self) -> Status: def get_overall_status(self) -> Status:
"""Get overall system status based on all services""" """Get overall system status based on all services."""
statuses = [svc.status for svc in self.services.values()] # Exclude external from overall status if not configured
statuses = [
svc.status for name, svc in self.services.items()
if name != "external" or svc.status != Status.UNKNOWN
]
if all(s == Status.OPERATIONAL for s in statuses): if all(s == Status.OPERATIONAL for s in statuses):
return Status.OPERATIONAL return Status.OPERATIONAL
@@ -225,3 +289,17 @@ class ServiceMonitor:
return Status.DEGRADED return Status.DEGRADED
else: else:
return Status.UNKNOWN return Status.UNKNOWN
def get_ssl_status(self) -> Optional[dict]:
"""Get SSL certificate status."""
if not self.ssl_info:
return None
return {
"domain": self.ssl_info.domain,
"issuer": self.ssl_info.issuer,
"expires_at": self.ssl_info.expires_at.isoformat(),
"days_until_expiry": self.ssl_info.days_until_expiry,
"is_valid": self.ssl_info.is_valid,
"error": self.ssl_info.error
}

View File

@@ -0,0 +1,140 @@
"""SSL certificate monitoring."""
import ssl
import socket
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urlparse
from database import save_ssl_info, get_ssl_info
from alerts import alert_ssl_expiring, alert_ssl_expired
@dataclass
class SSLInfo:
domain: str
issuer: str
expires_at: datetime
days_until_expiry: int
is_valid: bool
error: Optional[str] = None
def check_ssl_certificate(url: str) -> Optional[SSLInfo]:
"""Check SSL certificate for a URL."""
try:
parsed = urlparse(url)
hostname = parsed.hostname
if not hostname:
return None
# Skip non-HTTPS or localhost
if parsed.scheme != "https" or hostname in ("localhost", "127.0.0.1"):
return None
context = ssl.create_default_context()
conn = context.wrap_socket(
socket.socket(socket.AF_INET),
server_hostname=hostname
)
conn.settimeout(10.0)
try:
conn.connect((hostname, parsed.port or 443))
cert = conn.getpeercert()
finally:
conn.close()
if not cert:
return SSLInfo(
domain=hostname,
issuer="Unknown",
expires_at=datetime.now(timezone.utc),
days_until_expiry=0,
is_valid=False,
error="No certificate found"
)
# Parse expiry date
not_after = cert.get("notAfter", "")
expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z")
expires_at = expires_at.replace(tzinfo=timezone.utc)
# Calculate days until expiry
now = datetime.now(timezone.utc)
days_until_expiry = (expires_at - now).days
# Get issuer
issuer_parts = cert.get("issuer", ())
issuer = "Unknown"
for part in issuer_parts:
for key, value in part:
if key == "organizationName":
issuer = value
break
return SSLInfo(
domain=hostname,
issuer=issuer,
expires_at=expires_at,
days_until_expiry=days_until_expiry,
is_valid=days_until_expiry > 0
)
except ssl.SSLCertVerificationError as e:
hostname = urlparse(url).hostname or url
return SSLInfo(
domain=hostname,
issuer="Invalid",
expires_at=datetime.now(timezone.utc),
days_until_expiry=0,
is_valid=False,
error=f"SSL verification failed: {str(e)[:100]}"
)
except Exception as e:
hostname = urlparse(url).hostname or url
return SSLInfo(
domain=hostname,
issuer="Unknown",
expires_at=datetime.now(timezone.utc),
days_until_expiry=0,
is_valid=False,
error=str(e)[:100]
)
async def check_and_alert_ssl(url: str, warn_days: int = 14) -> Optional[SSLInfo]:
"""Check SSL and send alerts if needed."""
ssl_info = check_ssl_certificate(url)
if not ssl_info:
return None
# Save to database
save_ssl_info(
domain=ssl_info.domain,
issuer=ssl_info.issuer,
expires_at=ssl_info.expires_at,
days_until_expiry=ssl_info.days_until_expiry
)
# Check if we need to alert
prev_info = get_ssl_info(ssl_info.domain)
if ssl_info.days_until_expiry <= 0:
# Certificate expired
await alert_ssl_expired(ssl_info.domain)
elif ssl_info.days_until_expiry <= warn_days:
# Certificate expiring soon - alert once per day
should_alert = True
if prev_info and prev_info.get("checked_at"):
# Check if we already alerted today
last_check = datetime.fromisoformat(prev_info["checked_at"])
if (datetime.now() - last_check).days < 1:
should_alert = False
if should_alert:
await alert_ssl_expiring(ssl_info.domain, ssl_info.days_until_expiry)
return ssl_info

View File

@@ -4,6 +4,7 @@
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>System Status</title> <title>System Status</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style> <style>
* { * {
margin: 0; margin: 0;
@@ -19,7 +20,7 @@
} }
.container { .container {
max-width: 900px; max-width: 1100px;
margin: 0 auto; margin: 0 auto;
padding: 40px 20px; padding: 40px 20px;
} }
@@ -39,6 +40,13 @@
background-clip: text; background-clip: text;
} }
h2 {
font-size: 1.3rem;
font-weight: 600;
margin: 30px 0 16px 0;
color: #94a3b8;
}
.overall-status { .overall-status {
display: inline-flex; display: inline-flex;
align-items: center; align-items: center;
@@ -174,8 +182,9 @@
.service-metrics { .service-metrics {
display: grid; display: grid;
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 12px; gap: 12px;
margin-bottom: 16px;
} }
.metric { .metric {
@@ -212,6 +221,132 @@
color: #fca5a5; color: #fca5a5;
} }
/* Latency chart */
.latency-chart {
height: 60px;
margin-top: 12px;
}
/* SSL Card */
.ssl-card {
background: rgba(30, 41, 59, 0.5);
border: 1px solid rgba(100, 116, 139, 0.2);
border-radius: 16px;
padding: 20px;
margin-bottom: 20px;
}
.ssl-card.warning {
border-color: rgba(250, 204, 21, 0.3);
}
.ssl-card.danger {
border-color: rgba(239, 68, 68, 0.3);
}
.ssl-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 12px;
}
.ssl-title {
font-size: 1.1rem;
font-weight: 600;
color: #f1f5f9;
}
.ssl-badge {
padding: 4px 12px;
border-radius: 20px;
font-size: 0.8rem;
font-weight: 500;
}
.ssl-badge.valid {
background: rgba(34, 197, 94, 0.15);
color: #22c55e;
}
.ssl-badge.expiring {
background: rgba(250, 204, 21, 0.15);
color: #facc15;
}
.ssl-badge.expired {
background: rgba(239, 68, 68, 0.15);
color: #ef4444;
}
.ssl-info {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 12px;
}
/* Incidents */
.incidents-list {
background: rgba(30, 41, 59, 0.5);
border: 1px solid rgba(100, 116, 139, 0.2);
border-radius: 16px;
overflow: hidden;
}
.incident-item {
padding: 16px 20px;
border-bottom: 1px solid rgba(100, 116, 139, 0.1);
display: flex;
justify-content: space-between;
align-items: center;
}
.incident-item:last-child {
border-bottom: none;
}
.incident-info {
display: flex;
align-items: center;
gap: 12px;
}
.incident-dot {
width: 10px;
height: 10px;
border-radius: 50%;
}
.incident-dot.resolved {
background: #22c55e;
}
.incident-dot.open {
background: #ef4444;
animation: pulse 2s infinite;
}
.incident-service {
font-weight: 500;
color: #f1f5f9;
}
.incident-message {
font-size: 0.85rem;
color: #94a3b8;
}
.incident-time {
font-size: 0.85rem;
color: #64748b;
}
.no-incidents {
padding: 30px;
text-align: center;
color: #64748b;
}
.refresh-btn { .refresh-btn {
display: inline-flex; display: inline-flex;
align-items: center; align-items: center;
@@ -292,8 +427,42 @@
</p> </p>
</header> </header>
{% if ssl_status %}
<div class="ssl-card {% if ssl_status.days_until_expiry <= 0 %}danger{% elif ssl_status.days_until_expiry <= 14 %}warning{% endif %}">
<div class="ssl-header">
<span class="ssl-title">SSL Certificate</span>
<span class="ssl-badge {% if ssl_status.days_until_expiry <= 0 %}expired{% elif ssl_status.days_until_expiry <= 14 %}expiring{% else %}valid{% endif %}">
{% if ssl_status.days_until_expiry <= 0 %}
Expired
{% elif ssl_status.days_until_expiry <= 14 %}
Expiring Soon
{% else %}
Valid
{% endif %}
</span>
</div>
<div class="ssl-info">
<div class="metric">
<div class="metric-label">Domain</div>
<div class="metric-value">{{ ssl_status.domain }}</div>
</div>
<div class="metric">
<div class="metric-label">Issuer</div>
<div class="metric-value">{{ ssl_status.issuer }}</div>
</div>
<div class="metric">
<div class="metric-label">Days Left</div>
<div class="metric-value {% if ssl_status.days_until_expiry <= 0 %}bad{% elif ssl_status.days_until_expiry <= 14 %}warning{% else %}good{% endif %}">
{{ ssl_status.days_until_expiry }}
</div>
</div>
</div>
</div>
{% endif %}
<div class="services-grid"> <div class="services-grid">
{% for name, service in services.items() %} {% for name, service in services.items() %}
{% if service.status.value != 'unknown' or name != 'external' %}
<div class="service-card"> <div class="service-card">
<div class="service-header"> <div class="service-header">
<span class="service-name">{{ service.display_name }}</span> <span class="service-name">{{ service.display_name }}</span>
@@ -322,7 +491,17 @@
</div> </div>
</div> </div>
<div class="metric"> <div class="metric">
<div class="metric-label">Uptime</div> <div class="metric-label">Avg 24h</div>
<div class="metric-value {% if service.avg_latency_24h and service.avg_latency_24h < 200 %}good{% elif service.avg_latency_24h and service.avg_latency_24h < 500 %}warning{% elif service.avg_latency_24h %}bad{% endif %}">
{% if service.avg_latency_24h %}
{{ "%.0f"|format(service.avg_latency_24h) }} ms
{% else %}
{% endif %}
</div>
</div>
<div class="metric">
<div class="metric-label">Uptime 24h</div>
<div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}"> <div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}">
{{ "%.1f"|format(service.uptime_percent) }}% {{ "%.1f"|format(service.uptime_percent) }}%
</div> </div>
@@ -333,20 +512,49 @@
<div class="metric-value">{{ service.version }}</div> <div class="metric-value">{{ service.version }}</div>
</div> </div>
{% endif %} {% endif %}
{% if service.last_incident %} </div>
<div class="metric"> {% if service.latency_history and service.latency_history|length > 1 %}
<div class="metric-label">Last Incident</div> <div class="latency-chart">
<div class="metric-value warning">{{ service.last_incident.strftime('%d.%m %H:%M') }}</div> <canvas id="chart-{{ name }}"></canvas>
</div> </div>
{% endif %} {% endif %}
</div>
{% if service.message %} {% if service.message %}
<div class="service-message">{{ service.message }}</div> <div class="service-message">{{ service.message }}</div>
{% endif %} {% endif %}
</div> </div>
{% endif %}
{% endfor %} {% endfor %}
</div> </div>
<h2>Recent Incidents</h2>
<div class="incidents-list">
{% if incidents and incidents|length > 0 %}
{% for incident in incidents %}
<div class="incident-item">
<div class="incident-info">
<span class="incident-dot {% if incident.resolved_at %}resolved{% else %}open{% endif %}"></span>
<div>
<div class="incident-service">{{ incident.service_name | title }}</div>
<div class="incident-message">{{ incident.message or 'Service unavailable' }}</div>
</div>
</div>
<div class="incident-time">
{{ incident.started_at[:16].replace('T', ' ') }}
{% if incident.resolved_at %}
- Resolved
{% else %}
- Ongoing
{% endif %}
</div>
</div>
{% endfor %}
{% else %}
<div class="no-incidents">
No recent incidents
</div>
{% endif %}
</div>
<center> <center>
<button class="refresh-btn" onclick="refreshStatus(this)"> <button class="refresh-btn" onclick="refreshStatus(this)">
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
@@ -363,6 +571,55 @@
</div> </div>
<script> <script>
// Initialize latency charts
{% for name, service in services.items() %}
{% if service.latency_history and service.latency_history|length > 1 %}
(function() {
const ctx = document.getElementById('chart-{{ name }}').getContext('2d');
const data = {{ service.latency_history | tojson }};
new Chart(ctx, {
type: 'line',
data: {
labels: data.map(d => ''),
datasets: [{
data: data.map(d => d.latency_ms),
borderColor: '#00d4ff',
backgroundColor: 'rgba(0, 212, 255, 0.1)',
fill: true,
tension: 0.4,
pointRadius: 0,
borderWidth: 2
}]
},
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: { display: false },
tooltip: {
callbacks: {
label: (ctx) => ctx.raw.toFixed(0) + ' ms'
}
}
},
scales: {
x: { display: false },
y: {
display: false,
beginAtZero: true
}
},
interaction: {
intersect: false,
mode: 'index'
}
}
});
})();
{% endif %}
{% endfor %}
async function refreshStatus(btn) { async function refreshStatus(btn) {
btn.classList.add('loading'); btn.classList.add('loading');
btn.disabled = true; btn.disabled = true;