Redesign health service + create backup service
This commit is contained in:
@@ -20,5 +20,14 @@ S3_SECRET_ACCESS_KEY=your-secret-access-key
|
||||
S3_ENDPOINT_URL=https://s3.firstvds.ru
|
||||
S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru
|
||||
|
||||
# Backup Service
|
||||
TELEGRAM_ADMIN_ID=947392854
|
||||
S3_BACKUP_PREFIX=backups/
|
||||
BACKUP_RETENTION_DAYS=14
|
||||
|
||||
# Status Service (optional - for external monitoring)
|
||||
EXTERNAL_URL=https://your-domain.com
|
||||
PUBLIC_URL=https://your-domain.com
|
||||
|
||||
# Frontend (for build)
|
||||
VITE_API_URL=/api/v1
|
||||
|
||||
23
Makefile
23
Makefile
@@ -31,6 +31,12 @@ help:
|
||||
@echo " make shell - Open backend shell"
|
||||
@echo " make frontend-sh - Open frontend shell"
|
||||
@echo ""
|
||||
@echo " Backup:"
|
||||
@echo " make backup-now - Run backup immediately"
|
||||
@echo " make backup-list - List available backups in S3"
|
||||
@echo " make backup-restore - Restore from backup (interactive)"
|
||||
@echo " make backup-logs - Show backup service logs"
|
||||
@echo ""
|
||||
@echo " Cleanup:"
|
||||
@echo " make clean - Stop and remove containers, volumes"
|
||||
@echo " make prune - Remove unused Docker resources"
|
||||
@@ -137,3 +143,20 @@ test-backend:
|
||||
# Production
|
||||
prod:
|
||||
$(DC) -f docker-compose.yml up -d --build
|
||||
|
||||
# Backup
|
||||
backup-now:
|
||||
$(DC) exec backup python /app/backup.py
|
||||
|
||||
backup-list:
|
||||
$(DC) exec backup python /app/restore.py
|
||||
|
||||
backup-restore:
|
||||
@read -p "Backup filename: " file; \
|
||||
$(DC) exec -it backup python /app/restore.py "$$file"
|
||||
|
||||
backup-logs:
|
||||
$(DC) logs -f backup
|
||||
|
||||
backup-shell:
|
||||
$(DC) exec backup bash
|
||||
|
||||
30
backup-service/Dockerfile
Normal file
30
backup-service/Dockerfile
Normal file
@@ -0,0 +1,30 @@
|
||||
FROM python:3.11-slim
|
||||
|
||||
WORKDIR /app
|
||||
|
||||
# Install PostgreSQL client (for pg_dump and psql) and cron
|
||||
RUN apt-get update && apt-get install -y \
|
||||
postgresql-client \
|
||||
cron \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Python dependencies
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Copy application
|
||||
COPY . .
|
||||
|
||||
# Make scripts executable
|
||||
RUN chmod +x backup.py restore.py
|
||||
|
||||
# Setup cron
|
||||
COPY crontab /etc/cron.d/backup-cron
|
||||
RUN chmod 0644 /etc/cron.d/backup-cron
|
||||
RUN crontab /etc/cron.d/backup-cron
|
||||
|
||||
# Create log file
|
||||
RUN touch /var/log/cron.log
|
||||
|
||||
# Start cron in foreground and tail logs
|
||||
CMD ["sh", "-c", "printenv > /etc/environment && cron && tail -f /var/log/cron.log"]
|
||||
217
backup-service/backup.py
Normal file
217
backup-service/backup.py
Normal file
@@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
PostgreSQL Backup Service for WebApp.
|
||||
|
||||
- Creates pg_dump backup
|
||||
- Compresses with gzip
|
||||
- Uploads to S3 FirstVDS
|
||||
- Rotates old backups (configurable retention)
|
||||
- Sends Telegram notifications
|
||||
"""
|
||||
import gzip
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timedelta, timezone
|
||||
|
||||
import boto3
|
||||
import httpx
|
||||
from botocore.config import Config as BotoConfig
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
from config import config
|
||||
|
||||
|
||||
def create_s3_client():
    """Build a boto3 S3 client from the service configuration.

    Mirrors the client setup used by the backend's storage.py so both
    components talk to the same endpoint with the same signing scheme.
    """
    # Fall back to a generic region when none is configured; S3-compatible
    # providers generally accept any region string.
    region = config.S3_REGION if config.S3_REGION else "us-east-1"
    boto_cfg = BotoConfig(signature_version="s3v4")
    return boto3.client(
        "s3",
        endpoint_url=config.S3_ENDPOINT_URL,
        aws_access_key_id=config.S3_ACCESS_KEY_ID,
        aws_secret_access_key=config.S3_SECRET_ACCESS_KEY,
        region_name=region,
        config=boto_cfg,
    )
|
||||
|
||||
|
||||
def send_telegram_notification(message: str, is_error: bool = False) -> None:
    """Notify the Telegram admin about a backup result.

    No-ops when the bot token or admin chat id is missing. Network or API
    failures are printed but never raised, so a broken notification can
    never fail the backup run itself.
    """
    if not (config.TELEGRAM_BOT_TOKEN and config.TELEGRAM_ADMIN_ID):
        print("Telegram not configured, skipping notification")
        return

    prefix = "\u2705" if not is_error else "\u274c"
    endpoint = f"https://api.telegram.org/bot{config.TELEGRAM_BOT_TOKEN}/sendMessage"
    payload = {
        "chat_id": config.TELEGRAM_ADMIN_ID,
        "text": f"{prefix} *Database Backup*\n\n{message}",
        "parse_mode": "Markdown",
    }

    try:
        resp = httpx.post(endpoint, json=payload, timeout=30)
        resp.raise_for_status()
    except Exception as exc:
        print(f"Failed to send Telegram notification: {exc}")
    else:
        print("Telegram notification sent")
|
||||
|
||||
|
||||
def create_backup() -> tuple[str, bytes]:
    """Dump the configured database with pg_dump and gzip the result.

    Returns:
        (filename, compressed_bytes) where the filename embeds a UTC
        timestamp so successive backups never collide.

    Raises:
        Exception: when pg_dump exits non-zero (stderr is included).
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"marathon_backup_{stamp}.sql.gz"

    # pg_dump reads the password from the environment, never from argv.
    dump_env = os.environ.copy()
    dump_env["PGPASSWORD"] = config.DB_PASSWORD

    cmd = [
        "pg_dump",
        "-h", config.DB_HOST,
        "-p", config.DB_PORT,
        "-U", config.DB_USER,
        "-d", config.DB_NAME,
        "--no-owner",
        "--no-acl",
        "-F", "p",  # plain SQL format
    ]

    print(f"Running pg_dump for database {config.DB_NAME}...")
    proc = subprocess.run(cmd, env=dump_env, capture_output=True)
    if proc.returncode != 0:
        raise Exception(f"pg_dump failed: {proc.stderr.decode()}")

    # NOTE: the whole dump is held in memory before compression; fine for
    # modest databases, worth streaming if the DB grows large.
    print("Compressing backup...")
    payload = gzip.compress(proc.stdout, compresslevel=9)

    return filename, payload
|
||||
|
||||
|
||||
def upload_to_s3(s3_client, filename: str, data: bytes) -> str:
    """Upload a compressed backup to S3 under the configured prefix.

    Args:
        s3_client: boto3 S3 client (see create_s3_client()).
        filename: backup file name, appended to S3_BACKUP_PREFIX.
        data: gzip-compressed SQL dump bytes.

    Returns:
        The full S3 object key the backup was stored under.
    """
    # Fix: the object key must interpolate the actual backup filename;
    # a literal placeholder had been rendered here instead of {filename},
    # which would overwrite a single bogus key on every run.
    key = f"{config.S3_BACKUP_PREFIX}{filename}"

    print(f"Uploading to S3: {key}...")
    s3_client.put_object(
        Bucket=config.S3_BUCKET_NAME,
        Key=key,
        Body=data,
        ContentType="application/gzip",
    )

    return key
|
||||
|
||||
|
||||
def rotate_old_backups(s3_client) -> int:
    """Delete backups older than BACKUP_RETENTION_DAYS.

    Listing/deletion errors are logged and swallowed — rotation is
    best-effort and must not fail the backup run.

    Returns:
        Number of objects deleted.
    """
    cutoff = datetime.now(timezone.utc) - timedelta(days=config.BACKUP_RETENTION_DAYS)
    removed = 0

    print(f"Rotating backups older than {config.BACKUP_RETENTION_DAYS} days...")

    try:
        pages = s3_client.get_paginator("list_objects_v2").paginate(
            Bucket=config.S3_BUCKET_NAME,
            Prefix=config.S3_BACKUP_PREFIX,
        )
        for page in pages:
            for entry in page.get("Contents", []):
                modified = entry["LastModified"]
                # Some S3-compatible endpoints return naive datetimes;
                # normalize to UTC before comparing against the cutoff.
                if modified.tzinfo is None:
                    modified = modified.replace(tzinfo=timezone.utc)
                if modified >= cutoff:
                    continue
                s3_client.delete_object(
                    Bucket=config.S3_BUCKET_NAME,
                    Key=entry["Key"],
                )
                removed += 1
                print(f"Deleted old backup: {entry['Key']}")
    except ClientError as exc:
        print(f"Error during rotation: {exc}")

    return removed
|
||||
|
||||
|
||||
def main() -> int:
    """Run one full backup cycle: dump, upload, rotate, notify.

    Returns:
        0 on success, 1 on any failure (a failure notification is sent
        to Telegram before returning).
    """
    start_time = datetime.now()

    print("=" * 50)
    print(f"Backup started at {start_time}")
    print("=" * 50)

    try:
        # Fail fast on missing S3 settings before doing any work.
        if not config.S3_BUCKET_NAME:
            raise Exception("S3_BUCKET_NAME is not configured")
        if not config.S3_ACCESS_KEY_ID:
            raise Exception("S3_ACCESS_KEY_ID is not configured")
        if not config.S3_SECRET_ACCESS_KEY:
            raise Exception("S3_SECRET_ACCESS_KEY is not configured")
        if not config.S3_ENDPOINT_URL:
            raise Exception("S3_ENDPOINT_URL is not configured")

        s3_client = create_s3_client()

        # Create backup
        filename, data = create_backup()
        size_mb = len(data) / (1024 * 1024)
        # Fix: interpolate the real filename (a literal placeholder had
        # been rendered here instead of {filename}).
        print(f"Backup created: {filename} ({size_mb:.2f} MB)")

        # Upload to S3
        s3_key = upload_to_s3(s3_client, filename, data)
        print(f"Uploaded to S3: {s3_key}")

        # Rotate old backups
        deleted_count = rotate_old_backups(s3_client)
        print(f"Deleted {deleted_count} old backups")

        duration = datetime.now() - start_time

        # Success notification with the key facts for the admin.
        message = (
            f"Backup completed successfully!\n\n"
            f"*File:* `{filename}`\n"
            f"*Size:* {size_mb:.2f} MB\n"
            f"*Duration:* {duration.seconds}s\n"
            f"*Deleted old:* {deleted_count} files"
        )
        send_telegram_notification(message, is_error=False)

        print("=" * 50)
        print("Backup completed successfully!")
        print("=" * 50)
        return 0

    except Exception as e:
        error_msg = f"Backup failed!\n\n*Error:* `{str(e)}`"
        send_telegram_notification(error_msg, is_error=True)
        print("=" * 50)
        print(f"Backup failed: {e}")
        print("=" * 50)
        return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Exit with the backup routine's status (0 = success, 1 = failure)
    # so cron / shell callers can detect failed runs.
    sys.exit(main())
|
||||
33
backup-service/config.py
Normal file
33
backup-service/config.py
Normal file
@@ -0,0 +1,33 @@
|
||||
"""Configuration for backup service."""
|
||||
import os
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class Config:
    """Backup service configuration from environment variables.

    NOTE: the os.getenv defaults are evaluated once, at class-definition
    time — the environment is sampled at import, not per instance.
    """

    # Database connection (matches the docker-compose 'db' service)
    DB_HOST: str = os.getenv("DB_HOST", "db")
    DB_PORT: str = os.getenv("DB_PORT", "5432")
    DB_NAME: str = os.getenv("DB_NAME", "marathon")
    DB_USER: str = os.getenv("DB_USER", "marathon")
    # NOTE(review): weak fallback password — intended for local dev only;
    # production must set DB_PASSWORD explicitly.
    DB_PASSWORD: str = os.getenv("DB_PASSWORD", "123")

    # S3 target for backup uploads (empty defaults are validated at runtime
    # by backup.py/restore.py before any S3 call)
    S3_BUCKET_NAME: str = os.getenv("S3_BUCKET_NAME", "")
    S3_REGION: str = os.getenv("S3_REGION", "ru-1")
    S3_ACCESS_KEY_ID: str = os.getenv("S3_ACCESS_KEY_ID", "")
    S3_SECRET_ACCESS_KEY: str = os.getenv("S3_SECRET_ACCESS_KEY", "")
    S3_ENDPOINT_URL: str = os.getenv("S3_ENDPOINT_URL", "")
    S3_BACKUP_PREFIX: str = os.getenv("S3_BACKUP_PREFIX", "backups/")

    # Telegram notifications
    TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "")
    # NOTE(review): a hard-coded personal chat id as fallback — consider
    # requiring this to be set via the environment instead.
    TELEGRAM_ADMIN_ID: str = os.getenv("TELEGRAM_ADMIN_ID", "947392854")

    # Backup settings
    BACKUP_RETENTION_DAYS: int = int(os.getenv("BACKUP_RETENTION_DAYS", "14"))


# Module-level singleton imported by backup.py and restore.py.
config = Config()
|
||||
4
backup-service/crontab
Normal file
4
backup-service/crontab
Normal file
@@ -0,0 +1,4 @@
|
||||
# Backup cron job
|
||||
# Run backup daily at 3:00 AM UTC
|
||||
0 3 * * * /usr/local/bin/python /app/backup.py >> /var/log/cron.log 2>&1
|
||||
# Empty line required at end of crontab
|
||||
2
backup-service/requirements.txt
Normal file
2
backup-service/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
||||
boto3==1.34.0
|
||||
httpx==0.26.0
|
||||
158
backup-service/restore.py
Normal file
158
backup-service/restore.py
Normal file
@@ -0,0 +1,158 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Restore PostgreSQL database from S3 backup.
|
||||
|
||||
Usage:
|
||||
python restore.py - List available backups
|
||||
python restore.py <filename> - Restore from specific backup
|
||||
"""
|
||||
import gzip
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import boto3
|
||||
from botocore.config import Config as BotoConfig
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
from config import config
|
||||
|
||||
|
||||
def create_s3_client():
    """Construct the boto3 S3 client used for downloading backups."""
    # Same client settings as backup.py so restore reads exactly where
    # backup wrote.
    return boto3.client(
        "s3",
        endpoint_url=config.S3_ENDPOINT_URL,
        aws_access_key_id=config.S3_ACCESS_KEY_ID,
        aws_secret_access_key=config.S3_SECRET_ACCESS_KEY,
        region_name=(config.S3_REGION if config.S3_REGION else "us-east-1"),
        config=BotoConfig(signature_version="s3v4"),
    )
|
||||
|
||||
|
||||
def list_backups(s3_client) -> list[tuple[str, float, str]]:
    """Print and return all backups stored under the configured prefix.

    Returns:
        List of (filename, size_mb, modified_str) tuples, newest first.
        Empty list on listing errors (which are printed, not raised).
    """
    print("Available backups:\n")

    try:
        paginator = s3_client.get_paginator("list_objects_v2")
        pages = paginator.paginate(
            Bucket=config.S3_BUCKET_NAME,
            Prefix=config.S3_BACKUP_PREFIX,
        )

        backups = []
        for page in pages:
            for obj in page.get("Contents", []):
                filename = obj["Key"].replace(config.S3_BACKUP_PREFIX, "")
                size_mb = obj["Size"] / (1024 * 1024)
                modified = obj["LastModified"].strftime("%Y-%m-%d %H:%M:%S")
                backups.append((filename, size_mb, modified))

        # Sort by date descending (newest first); the timestamp string
        # format sorts lexicographically in chronological order.
        backups.sort(key=lambda x: x[2], reverse=True)

        for filename, size_mb, modified in backups:
            # Fix: print the actual filename (a literal placeholder had
            # been rendered here instead of {filename}).
            print(f"  {filename} ({size_mb:.2f} MB) - {modified}")

        return backups

    except ClientError as e:
        print(f"Error listing backups: {e}")
        return []
|
||||
|
||||
|
||||
def restore_backup(s3_client, filename: str) -> None:
    """Download a backup from S3, decompress it, and feed it to psql.

    Args:
        s3_client: boto3 S3 client.
        filename: backup file name (without the S3 prefix).

    Raises:
        Exception: when the download fails or psql reports an ERROR.
            Non-fatal psql warnings are printed and tolerated.
    """
    # Fix: interpolate the real filename into the S3 key and the log line
    # (a literal placeholder had been rendered instead of {filename}).
    key = f"{config.S3_BACKUP_PREFIX}{filename}"

    print(f"Downloading {filename} from S3...")
    try:
        response = s3_client.get_object(
            Bucket=config.S3_BUCKET_NAME,
            Key=key,
        )
        compressed_data = response["Body"].read()
    except ClientError as e:
        raise Exception(f"Failed to download backup: {e}")

    print("Decompressing...")
    sql_data = gzip.decompress(compressed_data)

    print(f"Restoring to database {config.DB_NAME}...")

    # psql reads the password from the environment, never from argv.
    env = os.environ.copy()
    env["PGPASSWORD"] = config.DB_PASSWORD

    cmd = [
        "psql",
        "-h", config.DB_HOST,
        "-p", config.DB_PORT,
        "-U", config.DB_USER,
        "-d", config.DB_NAME,
    ]

    result = subprocess.run(
        cmd,
        env=env,
        input=sql_data,
        capture_output=True,
    )

    if result.returncode != 0:
        stderr = result.stderr.decode()
        # psql may return warnings that aren't fatal errors
        if "ERROR" in stderr:
            raise Exception(f"psql restore failed: {stderr}")
        else:
            print(f"Warnings: {stderr}")

    print("Restore completed successfully!")
|
||||
|
||||
|
||||
def main() -> int:
    """CLI entry point: list backups, or restore the one named in argv.

    Usage:
        python restore.py              - list available backups
        python restore.py <filename>   - restore from a specific backup

    Returns:
        0 on success or user cancellation, 1 on failure.
    """
    # Validate configuration before touching S3.
    if not config.S3_BUCKET_NAME:
        print("Error: S3_BUCKET_NAME is not configured")
        return 1

    s3_client = create_s3_client()

    if len(sys.argv) < 2:
        # No filename given: just list what is available.
        backups = list_backups(s3_client)
        if backups:
            print("\nTo restore, run: python restore.py <filename>")
        else:
            print("No backups found.")
        return 0

    filename = sys.argv[1]

    # Destructive operation — require explicit confirmation.
    # Fix: show the actual filename (a literal placeholder had been
    # rendered here instead of {filename}).
    print(f"WARNING: This will restore database from {filename}")
    print("This may overwrite existing data!")
    print()

    confirm = input("Type 'yes' to continue: ")

    if confirm.lower() != "yes":
        print("Restore cancelled.")
        return 0

    try:
        restore_backup(s3_client, filename)
        return 0
    except Exception as e:
        print(f"Restore failed: {e}")
        return 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Propagate the restore routine's status code to the shell.
    sys.exit(main())
|
||||
@@ -94,7 +94,13 @@ services:
|
||||
BACKEND_URL: http://backend:8000
|
||||
FRONTEND_URL: http://frontend:80
|
||||
BOT_URL: http://bot:8080
|
||||
EXTERNAL_URL: ${EXTERNAL_URL:-}
|
||||
PUBLIC_URL: ${PUBLIC_URL:-}
|
||||
CHECK_INTERVAL: "30"
|
||||
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
|
||||
TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854}
|
||||
volumes:
|
||||
- status_data:/app/data
|
||||
ports:
|
||||
- "8001:8001"
|
||||
depends_on:
|
||||
@@ -103,5 +109,31 @@ services:
|
||||
- bot
|
||||
restart: unless-stopped
|
||||
|
||||
backup:
|
||||
build:
|
||||
context: ./backup-service
|
||||
dockerfile: Dockerfile
|
||||
container_name: marathon-backup
|
||||
environment:
|
||||
DB_HOST: db
|
||||
DB_PORT: "5432"
|
||||
DB_NAME: marathon
|
||||
DB_USER: marathon
|
||||
DB_PASSWORD: ${DB_PASSWORD:-marathon}
|
||||
S3_BUCKET_NAME: ${S3_BUCKET_NAME:-}
|
||||
S3_REGION: ${S3_REGION:-ru-1}
|
||||
S3_ACCESS_KEY_ID: ${S3_ACCESS_KEY_ID:-}
|
||||
S3_SECRET_ACCESS_KEY: ${S3_SECRET_ACCESS_KEY:-}
|
||||
S3_ENDPOINT_URL: ${S3_ENDPOINT_URL:-}
|
||||
S3_BACKUP_PREFIX: ${S3_BACKUP_PREFIX:-backups/}
|
||||
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
|
||||
TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854}
|
||||
BACKUP_RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-14}
|
||||
depends_on:
|
||||
db:
|
||||
condition: service_healthy
|
||||
restart: unless-stopped
|
||||
|
||||
volumes:
|
||||
postgres_data:
|
||||
status_data:
|
||||
|
||||
@@ -6,6 +6,9 @@ WORKDIR /app
|
||||
COPY requirements.txt .
|
||||
RUN pip install --no-cache-dir -r requirements.txt
|
||||
|
||||
# Create data directory for SQLite
|
||||
RUN mkdir -p /app/data
|
||||
|
||||
# Copy application
|
||||
COPY . .
|
||||
|
||||
|
||||
85
status-service/alerts.py
Normal file
85
status-service/alerts.py
Normal file
@@ -0,0 +1,85 @@
|
||||
"""Telegram alerting for status changes."""
|
||||
import os
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
|
||||
import httpx
|
||||
|
||||
|
||||
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
|
||||
TELEGRAM_ADMIN_ID = os.getenv("TELEGRAM_ADMIN_ID", "")
|
||||
|
||||
|
||||
async def send_telegram_alert(message: str, is_recovery: bool = False) -> bool:
    """Deliver a status alert to the Telegram admin chat.

    Returns:
        True when the message was accepted by the API; False when
        Telegram is not configured or the request failed. Failures are
        printed, never raised, so alerting cannot crash the monitor.
    """
    if not (TELEGRAM_BOT_TOKEN and TELEGRAM_ADMIN_ID):
        print("Telegram alerting not configured")
        return False

    icon = "\u2705" if is_recovery else "\u26a0\ufe0f"
    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    payload = {
        "chat_id": TELEGRAM_ADMIN_ID,
        "text": f"{icon} *Status Alert*\n\n{message}",
        "parse_mode": "Markdown",
    }

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(endpoint, json=payload)
            resp.raise_for_status()
    except Exception as exc:
        print(f"Failed to send Telegram alert: {exc}")
        return False

    print(f"Telegram alert sent: {message[:50]}...")
    return True
|
||||
|
||||
|
||||
async def alert_service_down(service_name: str, display_name: str, message: Optional[str]):
    """Send a DOWN alert for a service, including the error when known."""
    stamp = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
    parts = [
        f"*{display_name}* is DOWN\n\n",
        f"Time: `{stamp}`\n",
    ]
    if message:
        parts.append(f"Error: `{message}`")

    await send_telegram_alert("".join(parts), is_recovery=False)
|
||||
|
||||
|
||||
async def alert_service_recovered(service_name: str, display_name: str, downtime_minutes: int):
    """Send a recovery alert with the total downtime in minutes."""
    stamp = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
    body = (
        f"*{display_name}* is back ONLINE\n\n"
        f"Time: `{stamp}`\n"
        f"Downtime: `{downtime_minutes} min`"
    )
    await send_telegram_alert(body, is_recovery=True)
|
||||
|
||||
|
||||
async def alert_ssl_expiring(domain: str, days_left: int):
    """Warn the admin that a domain's SSL certificate expires soon."""
    body = (
        f"*SSL Certificate Expiring*\n\n"
        f"Domain: `{domain}`\n"
        f"Days left: `{days_left}`\n\n"
        f"Please renew the certificate!"
    )
    await send_telegram_alert(body, is_recovery=False)
|
||||
|
||||
|
||||
async def alert_ssl_expired(domain: str):
    """Alert the admin that a domain's SSL certificate has already expired."""
    body = (
        f"*SSL Certificate EXPIRED*\n\n"
        f"Domain: `{domain}`\n\n"
        f"Certificate has expired! Site may show security warnings."
    )
    await send_telegram_alert(body, is_recovery=False)
|
||||
261
status-service/database.py
Normal file
261
status-service/database.py
Normal file
@@ -0,0 +1,261 @@
|
||||
"""SQLite database for storing metrics history."""
|
||||
import sqlite3
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import json
|
||||
|
||||
|
||||
DB_PATH = Path("/app/data/metrics.db")
|
||||
|
||||
|
||||
def get_connection() -> sqlite3.Connection:
    """Open a connection to the metrics DB, creating its directory first.

    Rows are returned as sqlite3.Row so callers can index by column name.
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
|
||||
|
||||
|
||||
def init_db():
    """Create the metrics, incidents and SSL tables plus their indexes.

    All statements are IF NOT EXISTS, so this is safe to call on every
    startup.
    """
    ddl_statements = (
        # Metrics history table
        """
        CREATE TABLE IF NOT EXISTS metrics (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            service_name TEXT NOT NULL,
            status TEXT NOT NULL,
            latency_ms REAL,
            message TEXT,
            checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Incidents table
        """
        CREATE TABLE IF NOT EXISTS incidents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            service_name TEXT NOT NULL,
            status TEXT NOT NULL,
            message TEXT,
            started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            resolved_at TIMESTAMP,
            notified BOOLEAN DEFAULT FALSE
        )
        """,
        # SSL certificates table
        """
        CREATE TABLE IF NOT EXISTS ssl_certificates (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            domain TEXT NOT NULL UNIQUE,
            issuer TEXT,
            expires_at TIMESTAMP,
            days_until_expiry INTEGER,
            checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Indexes for the hot query paths (per-service history lookups)
        """
        CREATE INDEX IF NOT EXISTS idx_metrics_service_time
        ON metrics(service_name, checked_at DESC)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_incidents_service
        ON incidents(service_name, started_at DESC)
        """,
    )

    conn = get_connection()
    cursor = conn.cursor()
    for statement in ddl_statements:
        cursor.execute(statement)
    conn.commit()
    conn.close()
|
||||
|
||||
|
||||
def save_metric(service_name: str, status: str, latency_ms: Optional[float], message: Optional[str]):
    """Persist one health-check result to the metrics table.

    Args:
        service_name: monitored service identifier.
        status: check outcome (e.g. 'operational').
        latency_ms: measured latency, or None when not applicable.
        message: optional error/status detail.
    """
    conn = get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO metrics (service_name, status, latency_ms, message) VALUES (?, ?, ?, ?)",
            (service_name, status, latency_ms, message)
        )
        conn.commit()
    finally:
        # Fix: previously the connection leaked if the INSERT raised;
        # always close it.
        conn.close()
|
||||
|
||||
|
||||
def get_latency_history(service_name: str, hours: int = 24) -> list[dict]:
    """Return latency samples for *service_name* over the last *hours* hours.

    Only rows with a non-NULL latency are returned, oldest first.
    """
    window_start = datetime.now() - timedelta(hours=hours)

    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("""
        SELECT latency_ms, status, checked_at
        FROM metrics
        WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL
        ORDER BY checked_at ASC
    """, (service_name, window_start.isoformat()))
    records = cursor.fetchall()
    conn.close()

    history = []
    for record in records:
        history.append({
            "latency_ms": record["latency_ms"],
            "status": record["status"],
            "checked_at": record["checked_at"]
        })
    return history
|
||||
|
||||
|
||||
def get_uptime_stats(service_name: str, hours: int = 24) -> dict:
    """Compute check counts and uptime percentage for the given window.

    With no recorded checks the uptime is reported as 100% rather than
    dividing by zero.
    """
    window_start = datetime.now() - timedelta(hours=hours)

    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("""
        SELECT COUNT(*) as total,
               SUM(CASE WHEN status = 'operational' THEN 1 ELSE 0 END) as successful
        FROM metrics
        WHERE service_name = ? AND checked_at > ?
    """, (service_name, window_start.isoformat()))
    row = cursor.fetchone()
    conn.close()

    checks = row["total"] or 0
    healthy = row["successful"] or 0
    uptime = (healthy / checks * 100) if checks > 0 else 100.0

    return {
        "total_checks": checks,
        "successful_checks": healthy,
        "uptime_percent": uptime
    }
|
||||
|
||||
|
||||
def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]:
    """Average latency over the window, or None when no samples exist.

    (SQLite's AVG over zero rows yields NULL, surfaced here as None.)
    """
    window_start = datetime.now() - timedelta(hours=hours)

    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("""
        SELECT AVG(latency_ms) as avg_latency
        FROM metrics
        WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL
    """, (service_name, window_start.isoformat()))
    result = cursor.fetchone()
    conn.close()

    return result["avg_latency"]
|
||||
|
||||
|
||||
def create_incident(service_name: str, status: str, message: Optional[str]) -> int:
    """Record a new open incident for a service.

    Returns:
        The autoincrement id of the inserted incident row.
    """
    conn = get_connection()
    try:
        cursor = conn.cursor()
        cursor.execute(
            "INSERT INTO incidents (service_name, status, message) VALUES (?, ?, ?)",
            (service_name, status, message)
        )
        incident_id = cursor.lastrowid
        conn.commit()
    finally:
        # Fix: previously the connection leaked if the INSERT raised;
        # always close it.
        conn.close()
    return incident_id
|
||||
|
||||
|
||||
def resolve_incident(service_name: str):
    """Mark every open incident for *service_name* as resolved now."""
    conn = get_connection()
    conn.cursor().execute("""
        UPDATE incidents
        SET resolved_at = CURRENT_TIMESTAMP
        WHERE service_name = ? AND resolved_at IS NULL
    """, (service_name,))
    conn.commit()
    conn.close()
|
||||
|
||||
|
||||
def get_open_incident(service_name: str) -> Optional[dict]:
    """Return the most recent unresolved incident for a service, if any."""
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("""
        SELECT * FROM incidents
        WHERE service_name = ? AND resolved_at IS NULL
        ORDER BY started_at DESC LIMIT 1
    """, (service_name,))
    record = cursor.fetchone()
    conn.close()

    return dict(record) if record else None
|
||||
|
||||
|
||||
def mark_incident_notified(incident_id: int):
    """Flag an incident as already alerted, to avoid duplicate alerts."""
    conn = get_connection()
    conn.cursor().execute(
        "UPDATE incidents SET notified = TRUE WHERE id = ?",
        (incident_id,),
    )
    conn.commit()
    conn.close()
|
||||
|
||||
|
||||
def get_recent_incidents(limit: int = 10) -> list[dict]:
    """Return up to *limit* incidents, newest first, as plain dicts."""
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("""
        SELECT * FROM incidents
        ORDER BY started_at DESC
        LIMIT ?
    """, (limit,))
    records = cursor.fetchall()
    conn.close()
    return [dict(record) for record in records]
|
||||
|
||||
|
||||
def save_ssl_info(domain: str, issuer: str, expires_at: datetime, days_until_expiry: int):
    """Upsert certificate details for a domain (one row per domain)."""
    conn = get_connection()
    # INSERT OR REPLACE relies on the UNIQUE constraint on `domain`.
    conn.cursor().execute("""
        INSERT OR REPLACE INTO ssl_certificates
        (domain, issuer, expires_at, days_until_expiry, checked_at)
        VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
    """, (domain, issuer, expires_at.isoformat(), days_until_expiry))
    conn.commit()
    conn.close()
|
||||
|
||||
|
||||
def get_ssl_info(domain: str) -> Optional[dict]:
    """Return stored certificate details for a domain, or None."""
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("SELECT * FROM ssl_certificates WHERE domain = ?", (domain,))
    record = cursor.fetchone()
    conn.close()

    return dict(record) if record else None
|
||||
|
||||
|
||||
def cleanup_old_metrics(days: int = 7):
    """Purge metric rows older than *days* days; return the delete count."""
    threshold = datetime.now() - timedelta(days=days)

    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (threshold.isoformat(),))
    removed = cursor.rowcount
    conn.commit()
    conn.close()
    return removed
|
||||
@@ -1,6 +1,7 @@
|
||||
"""Status monitoring service with persistence and alerting."""
|
||||
import os
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from contextlib import asynccontextmanager
|
||||
|
||||
@@ -8,13 +9,16 @@ from fastapi import FastAPI, Request
|
||||
from fastapi.responses import HTMLResponse
|
||||
from fastapi.templating import Jinja2Templates
|
||||
|
||||
from monitors import ServiceMonitor, ServiceStatus
|
||||
from monitors import ServiceMonitor
|
||||
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
|
||||
|
||||
|
||||
# Configuration
|
||||
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
|
||||
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
|
||||
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
|
||||
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks
|
||||
PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks
|
||||
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))
|
||||
|
||||
# Initialize monitor
|
||||
@@ -22,38 +26,64 @@ monitor = ServiceMonitor()
|
||||
|
||||
# Background task reference
|
||||
background_task: Optional[asyncio.Task] = None
|
||||
cleanup_task: Optional[asyncio.Task] = None
|
||||
|
||||
|
||||
async def periodic_health_check():
|
||||
"""Background task to check services periodically"""
|
||||
"""Background task to check services periodically."""
|
||||
while True:
|
||||
try:
|
||||
await monitor.check_all_services(
|
||||
backend_url=BACKEND_URL,
|
||||
frontend_url=FRONTEND_URL,
|
||||
bot_url=BOT_URL
|
||||
bot_url=BOT_URL,
|
||||
external_url=EXTERNAL_URL,
|
||||
public_url=PUBLIC_URL
|
||||
)
|
||||
except Exception as e:
|
||||
print(f"Health check error: {e}")
|
||||
await asyncio.sleep(CHECK_INTERVAL)
|
||||
|
||||
|
||||
async def periodic_cleanup():
|
||||
"""Background task to cleanup old metrics (daily)."""
|
||||
while True:
|
||||
await asyncio.sleep(86400) # 24 hours
|
||||
try:
|
||||
deleted = cleanup_old_metrics(days=7)
|
||||
print(f"Cleaned up {deleted} old metrics")
|
||||
except Exception as e:
|
||||
print(f"Cleanup error: {e}")
|
||||
|
||||
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""Startup and shutdown events"""
|
||||
global background_task
|
||||
"""Startup and shutdown events."""
|
||||
global background_task, cleanup_task
|
||||
|
||||
# Initialize database
|
||||
init_db()
|
||||
print("Database initialized")
|
||||
|
||||
# Start background health checks
|
||||
background_task = asyncio.create_task(periodic_health_check())
|
||||
cleanup_task = asyncio.create_task(periodic_cleanup())
|
||||
|
||||
yield
|
||||
# Cancel background task on shutdown
|
||||
if background_task:
|
||||
background_task.cancel()
|
||||
|
||||
# Cancel background tasks on shutdown
|
||||
for task in [background_task, cleanup_task]:
|
||||
if task:
|
||||
task.cancel()
|
||||
try:
|
||||
await background_task
|
||||
await task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
|
||||
app = FastAPI(
|
||||
title="Status Monitor",
|
||||
description="Service health monitoring",
|
||||
description="Service health monitoring with persistence and alerting",
|
||||
lifespan=lifespan
|
||||
)
|
||||
|
||||
@@ -62,9 +92,11 @@ templates = Jinja2Templates(directory="templates")
|
||||
|
||||
@app.get("/", response_class=HTMLResponse)
|
||||
async def status_page(request: Request):
|
||||
"""Main status page"""
|
||||
"""Main status page."""
|
||||
services = monitor.get_all_statuses()
|
||||
overall_status = monitor.get_overall_status()
|
||||
ssl_status = monitor.get_ssl_status()
|
||||
incidents = get_recent_incidents(limit=5)
|
||||
|
||||
return templates.TemplateResponse(
|
||||
"index.html",
|
||||
@@ -72,6 +104,8 @@ async def status_page(request: Request):
|
||||
"request": request,
|
||||
"services": services,
|
||||
"overall_status": overall_status,
|
||||
"ssl_status": ssl_status,
|
||||
"incidents": incidents,
|
||||
"last_check": monitor.last_check,
|
||||
"check_interval": CHECK_INTERVAL
|
||||
}
|
||||
@@ -80,30 +114,52 @@ async def status_page(request: Request):
|
||||
|
||||
@app.get("/api/status")
|
||||
async def api_status():
|
||||
"""API endpoint for service statuses"""
|
||||
"""API endpoint for service statuses."""
|
||||
services = monitor.get_all_statuses()
|
||||
overall_status = monitor.get_overall_status()
|
||||
ssl_status = monitor.get_ssl_status()
|
||||
|
||||
return {
|
||||
"overall_status": overall_status,
|
||||
"overall_status": overall_status.value,
|
||||
"services": {name: status.to_dict() for name, status in services.items()},
|
||||
"ssl": ssl_status,
|
||||
"last_check": monitor.last_check.isoformat() if monitor.last_check else None,
|
||||
"check_interval_seconds": CHECK_INTERVAL
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/history/{service_name}")
|
||||
async def api_history(service_name: str, hours: int = 24):
|
||||
"""API endpoint for service latency history."""
|
||||
history = get_latency_history(service_name, hours=hours)
|
||||
return {
|
||||
"service": service_name,
|
||||
"hours": hours,
|
||||
"data": history
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/incidents")
|
||||
async def api_incidents(limit: int = 20):
|
||||
"""API endpoint for recent incidents."""
|
||||
incidents = get_recent_incidents(limit=limit)
|
||||
return {"incidents": incidents}
|
||||
|
||||
|
||||
@app.get("/api/health")
|
||||
async def health():
|
||||
"""Health check for this service"""
|
||||
"""Health check for this service."""
|
||||
return {"status": "ok", "service": "status-monitor"}
|
||||
|
||||
|
||||
@app.post("/api/refresh")
|
||||
async def refresh_status():
|
||||
"""Force refresh all service statuses"""
|
||||
"""Force refresh all service statuses."""
|
||||
await monitor.check_all_services(
|
||||
backend_url=BACKEND_URL,
|
||||
frontend_url=FRONTEND_URL,
|
||||
bot_url=BOT_URL
|
||||
bot_url=BOT_URL,
|
||||
external_url=EXTERNAL_URL,
|
||||
public_url=PUBLIC_URL
|
||||
)
|
||||
return {"status": "refreshed"}
|
||||
|
||||
@@ -1,11 +1,19 @@
|
||||
"""Service monitoring with persistence and alerting."""
|
||||
import asyncio
|
||||
from datetime import datetime, timedelta
|
||||
from dataclasses import dataclass, field
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from enum import Enum
|
||||
|
||||
import httpx
|
||||
|
||||
from database import (
|
||||
save_metric, get_latency_history, get_uptime_stats, get_avg_latency,
|
||||
create_incident, resolve_incident, get_open_incident, mark_incident_notified
|
||||
)
|
||||
from alerts import alert_service_down, alert_service_recovered
|
||||
from ssl_monitor import check_and_alert_ssl, SSLInfo
|
||||
|
||||
|
||||
class Status(str, Enum):
|
||||
OPERATIONAL = "operational"
|
||||
@@ -25,11 +33,17 @@ class ServiceStatus:
|
||||
uptime_percent: float = 100.0
|
||||
message: Optional[str] = None
|
||||
version: Optional[str] = None
|
||||
avg_latency_24h: Optional[float] = None
|
||||
latency_history: list = None
|
||||
|
||||
# For uptime calculation
|
||||
# For uptime calculation (in-memory, backed by DB)
|
||||
total_checks: int = 0
|
||||
successful_checks: int = 0
|
||||
|
||||
def __post_init__(self):
|
||||
if self.latency_history is None:
|
||||
self.latency_history = []
|
||||
|
||||
def to_dict(self) -> dict:
|
||||
return {
|
||||
"name": self.name,
|
||||
@@ -40,7 +54,8 @@ class ServiceStatus:
|
||||
"last_incident": self.last_incident.isoformat() if self.last_incident else None,
|
||||
"uptime_percent": round(self.uptime_percent, 2),
|
||||
"message": self.message,
|
||||
"version": self.version
|
||||
"version": self.version,
|
||||
"avg_latency_24h": round(self.avg_latency_24h, 2) if self.avg_latency_24h else None,
|
||||
}
|
||||
|
||||
def update_uptime(self, is_success: bool):
|
||||
@@ -69,12 +84,17 @@ class ServiceMonitor:
|
||||
"bot": ServiceStatus(
|
||||
name="bot",
|
||||
display_name="Telegram Bot"
|
||||
)
|
||||
),
|
||||
"external": ServiceStatus(
|
||||
name="external",
|
||||
display_name="External Access"
|
||||
),
|
||||
}
|
||||
self.last_check: Optional[datetime] = None
|
||||
self.ssl_info: Optional[SSLInfo] = None
|
||||
|
||||
async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
|
||||
"""Check backend API health"""
|
||||
"""Check backend API health."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
start = datetime.now()
|
||||
@@ -92,9 +112,7 @@ class ServiceMonitor:
|
||||
return Status.DOWN, None, str(e)[:100], None
|
||||
|
||||
async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||
"""Check database through backend"""
|
||||
# We check database indirectly - if backend is up, DB is likely up
|
||||
# Could add a specific /health/db endpoint to backend later
|
||||
"""Check database through backend."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
start = datetime.now()
|
||||
@@ -109,7 +127,7 @@ class ServiceMonitor:
|
||||
return Status.DOWN, None, "Cannot reach backend"
|
||||
|
||||
async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||
"""Check frontend availability"""
|
||||
"""Check frontend availability."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
start = datetime.now()
|
||||
@@ -126,7 +144,7 @@ class ServiceMonitor:
|
||||
return Status.DOWN, None, str(e)[:100]
|
||||
|
||||
async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||
"""Check Telegram bot health"""
|
||||
"""Check Telegram bot health."""
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||
start = datetime.now()
|
||||
@@ -142,8 +160,93 @@ class ServiceMonitor:
|
||||
except Exception as e:
|
||||
return Status.DOWN, None, str(e)[:100]
|
||||
|
||||
async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str):
|
||||
"""Check all services concurrently"""
|
||||
async def check_external(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||
"""Check external (public) URL availability."""
|
||||
if not url:
|
||||
return Status.UNKNOWN, None, "Not configured"
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
||||
start = datetime.now()
|
||||
response = await client.get(url)
|
||||
latency = (datetime.now() - start).total_seconds() * 1000
|
||||
|
||||
if response.status_code == 200:
|
||||
return Status.OPERATIONAL, latency, None
|
||||
else:
|
||||
return Status.DEGRADED, latency, f"HTTP {response.status_code}"
|
||||
except httpx.TimeoutException:
|
||||
return Status.DOWN, None, "Timeout"
|
||||
except Exception as e:
|
||||
return Status.DOWN, None, str(e)[:100]
|
||||
|
||||
async def _process_check_result(
|
||||
self,
|
||||
service_name: str,
|
||||
result: tuple,
|
||||
now: datetime
|
||||
):
|
||||
"""Process check result with DB persistence and alerting."""
|
||||
if isinstance(result, Exception):
|
||||
return
|
||||
|
||||
if len(result) == 4:
|
||||
status, latency, message, version = result
|
||||
else:
|
||||
status, latency, message = result
|
||||
version = None
|
||||
|
||||
svc = self.services[service_name]
|
||||
was_down = svc.status in (Status.DOWN, Status.DEGRADED)
|
||||
is_down = status in (Status.DOWN, Status.DEGRADED)
|
||||
|
||||
# Update service status
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
if version:
|
||||
svc.version = version
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
|
||||
# Save metric to database
|
||||
save_metric(service_name, status.value, latency, message)
|
||||
|
||||
# Load historical data
|
||||
svc.latency_history = get_latency_history(service_name, hours=24)
|
||||
svc.avg_latency_24h = get_avg_latency(service_name, hours=24)
|
||||
|
||||
# Update uptime from DB
|
||||
stats = get_uptime_stats(service_name, hours=24)
|
||||
if stats["total_checks"] > 0:
|
||||
svc.uptime_percent = stats["uptime_percent"]
|
||||
|
||||
# Handle incident tracking and alerting
|
||||
if is_down and not was_down:
|
||||
# Service just went down
|
||||
svc.last_incident = now
|
||||
incident_id = create_incident(service_name, status.value, message)
|
||||
await alert_service_down(service_name, svc.display_name, message)
|
||||
mark_incident_notified(incident_id)
|
||||
|
||||
elif not is_down and was_down:
|
||||
# Service recovered
|
||||
open_incident = get_open_incident(service_name)
|
||||
if open_incident:
|
||||
started_at = datetime.fromisoformat(open_incident["started_at"])
|
||||
downtime_minutes = int((now - started_at).total_seconds() / 60)
|
||||
resolve_incident(service_name)
|
||||
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
|
||||
|
||||
async def check_all_services(
|
||||
self,
|
||||
backend_url: str,
|
||||
frontend_url: str,
|
||||
bot_url: str,
|
||||
external_url: str = "",
|
||||
public_url: str = ""
|
||||
):
|
||||
"""Check all services concurrently."""
|
||||
now = datetime.now()
|
||||
|
||||
# Run all checks concurrently
|
||||
@@ -152,61 +255,18 @@ class ServiceMonitor:
|
||||
self.check_database(backend_url),
|
||||
self.check_frontend(frontend_url),
|
||||
self.check_bot(bot_url),
|
||||
self.check_external(external_url),
|
||||
return_exceptions=True
|
||||
)
|
||||
|
||||
# Process backend result
|
||||
if not isinstance(results[0], Exception):
|
||||
status, latency, message, version = results[0]
|
||||
svc = self.services["backend"]
|
||||
was_down = svc.status == Status.DOWN
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
svc.version = version
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
if status != Status.OPERATIONAL and not was_down:
|
||||
svc.last_incident = now
|
||||
# Process results
|
||||
service_names = ["backend", "database", "frontend", "bot", "external"]
|
||||
for i, service_name in enumerate(service_names):
|
||||
await self._process_check_result(service_name, results[i], now)
|
||||
|
||||
# Process database result
|
||||
if not isinstance(results[1], Exception):
|
||||
status, latency, message = results[1]
|
||||
svc = self.services["database"]
|
||||
was_down = svc.status == Status.DOWN
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
if status != Status.OPERATIONAL and not was_down:
|
||||
svc.last_incident = now
|
||||
|
||||
# Process frontend result
|
||||
if not isinstance(results[2], Exception):
|
||||
status, latency, message = results[2]
|
||||
svc = self.services["frontend"]
|
||||
was_down = svc.status == Status.DOWN
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
if status != Status.OPERATIONAL and not was_down:
|
||||
svc.last_incident = now
|
||||
|
||||
# Process bot result
|
||||
if not isinstance(results[3], Exception):
|
||||
status, latency, message = results[3]
|
||||
svc = self.services["bot"]
|
||||
was_down = svc.status == Status.DOWN
|
||||
svc.status = status
|
||||
svc.latency_ms = latency
|
||||
svc.message = message
|
||||
svc.last_check = now
|
||||
svc.update_uptime(status == Status.OPERATIONAL)
|
||||
if status != Status.OPERATIONAL and not was_down:
|
||||
svc.last_incident = now
|
||||
# Check SSL certificate (if public URL is HTTPS)
|
||||
if public_url and public_url.startswith("https://"):
|
||||
self.ssl_info = await check_and_alert_ssl(public_url)
|
||||
|
||||
self.last_check = now
|
||||
|
||||
@@ -214,8 +274,12 @@ class ServiceMonitor:
|
||||
return self.services
|
||||
|
||||
def get_overall_status(self) -> Status:
|
||||
"""Get overall system status based on all services"""
|
||||
statuses = [svc.status for svc in self.services.values()]
|
||||
"""Get overall system status based on all services."""
|
||||
# Exclude external from overall status if not configured
|
||||
statuses = [
|
||||
svc.status for name, svc in self.services.items()
|
||||
if name != "external" or svc.status != Status.UNKNOWN
|
||||
]
|
||||
|
||||
if all(s == Status.OPERATIONAL for s in statuses):
|
||||
return Status.OPERATIONAL
|
||||
@@ -225,3 +289,17 @@ class ServiceMonitor:
|
||||
return Status.DEGRADED
|
||||
else:
|
||||
return Status.UNKNOWN
|
||||
|
||||
def get_ssl_status(self) -> Optional[dict]:
|
||||
"""Get SSL certificate status."""
|
||||
if not self.ssl_info:
|
||||
return None
|
||||
|
||||
return {
|
||||
"domain": self.ssl_info.domain,
|
||||
"issuer": self.ssl_info.issuer,
|
||||
"expires_at": self.ssl_info.expires_at.isoformat(),
|
||||
"days_until_expiry": self.ssl_info.days_until_expiry,
|
||||
"is_valid": self.ssl_info.is_valid,
|
||||
"error": self.ssl_info.error
|
||||
}
|
||||
|
||||
140
status-service/ssl_monitor.py
Normal file
140
status-service/ssl_monitor.py
Normal file
@@ -0,0 +1,140 @@
|
||||
"""SSL certificate monitoring."""
|
||||
import ssl
|
||||
import socket
|
||||
from datetime import datetime, timezone
|
||||
from dataclasses import dataclass
|
||||
from typing import Optional
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from database import save_ssl_info, get_ssl_info
|
||||
from alerts import alert_ssl_expiring, alert_ssl_expired
|
||||
|
||||
|
||||
@dataclass
|
||||
class SSLInfo:
|
||||
domain: str
|
||||
issuer: str
|
||||
expires_at: datetime
|
||||
days_until_expiry: int
|
||||
is_valid: bool
|
||||
error: Optional[str] = None
|
||||
|
||||
|
||||
def check_ssl_certificate(url: str) -> Optional[SSLInfo]:
|
||||
"""Check SSL certificate for a URL."""
|
||||
try:
|
||||
parsed = urlparse(url)
|
||||
hostname = parsed.hostname
|
||||
|
||||
if not hostname:
|
||||
return None
|
||||
|
||||
# Skip non-HTTPS or localhost
|
||||
if parsed.scheme != "https" or hostname in ("localhost", "127.0.0.1"):
|
||||
return None
|
||||
|
||||
context = ssl.create_default_context()
|
||||
conn = context.wrap_socket(
|
||||
socket.socket(socket.AF_INET),
|
||||
server_hostname=hostname
|
||||
)
|
||||
conn.settimeout(10.0)
|
||||
|
||||
try:
|
||||
conn.connect((hostname, parsed.port or 443))
|
||||
cert = conn.getpeercert()
|
||||
finally:
|
||||
conn.close()
|
||||
|
||||
if not cert:
|
||||
return SSLInfo(
|
||||
domain=hostname,
|
||||
issuer="Unknown",
|
||||
expires_at=datetime.now(timezone.utc),
|
||||
days_until_expiry=0,
|
||||
is_valid=False,
|
||||
error="No certificate found"
|
||||
)
|
||||
|
||||
# Parse expiry date
|
||||
not_after = cert.get("notAfter", "")
|
||||
expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z")
|
||||
expires_at = expires_at.replace(tzinfo=timezone.utc)
|
||||
|
||||
# Calculate days until expiry
|
||||
now = datetime.now(timezone.utc)
|
||||
days_until_expiry = (expires_at - now).days
|
||||
|
||||
# Get issuer
|
||||
issuer_parts = cert.get("issuer", ())
|
||||
issuer = "Unknown"
|
||||
for part in issuer_parts:
|
||||
for key, value in part:
|
||||
if key == "organizationName":
|
||||
issuer = value
|
||||
break
|
||||
|
||||
return SSLInfo(
|
||||
domain=hostname,
|
||||
issuer=issuer,
|
||||
expires_at=expires_at,
|
||||
days_until_expiry=days_until_expiry,
|
||||
is_valid=days_until_expiry > 0
|
||||
)
|
||||
|
||||
except ssl.SSLCertVerificationError as e:
|
||||
hostname = urlparse(url).hostname or url
|
||||
return SSLInfo(
|
||||
domain=hostname,
|
||||
issuer="Invalid",
|
||||
expires_at=datetime.now(timezone.utc),
|
||||
days_until_expiry=0,
|
||||
is_valid=False,
|
||||
error=f"SSL verification failed: {str(e)[:100]}"
|
||||
)
|
||||
except Exception as e:
|
||||
hostname = urlparse(url).hostname or url
|
||||
return SSLInfo(
|
||||
domain=hostname,
|
||||
issuer="Unknown",
|
||||
expires_at=datetime.now(timezone.utc),
|
||||
days_until_expiry=0,
|
||||
is_valid=False,
|
||||
error=str(e)[:100]
|
||||
)
|
||||
|
||||
|
||||
async def check_and_alert_ssl(url: str, warn_days: int = 14) -> Optional[SSLInfo]:
|
||||
"""Check SSL and send alerts if needed."""
|
||||
ssl_info = check_ssl_certificate(url)
|
||||
|
||||
if not ssl_info:
|
||||
return None
|
||||
|
||||
# Save to database
|
||||
save_ssl_info(
|
||||
domain=ssl_info.domain,
|
||||
issuer=ssl_info.issuer,
|
||||
expires_at=ssl_info.expires_at,
|
||||
days_until_expiry=ssl_info.days_until_expiry
|
||||
)
|
||||
|
||||
# Check if we need to alert
|
||||
prev_info = get_ssl_info(ssl_info.domain)
|
||||
|
||||
if ssl_info.days_until_expiry <= 0:
|
||||
# Certificate expired
|
||||
await alert_ssl_expired(ssl_info.domain)
|
||||
elif ssl_info.days_until_expiry <= warn_days:
|
||||
# Certificate expiring soon - alert once per day
|
||||
should_alert = True
|
||||
if prev_info and prev_info.get("checked_at"):
|
||||
# Check if we already alerted today
|
||||
last_check = datetime.fromisoformat(prev_info["checked_at"])
|
||||
if (datetime.now() - last_check).days < 1:
|
||||
should_alert = False
|
||||
|
||||
if should_alert:
|
||||
await alert_ssl_expiring(ssl_info.domain, ssl_info.days_until_expiry)
|
||||
|
||||
return ssl_info
|
||||
@@ -4,6 +4,7 @@
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>System Status</title>
|
||||
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||
<style>
|
||||
* {
|
||||
margin: 0;
|
||||
@@ -19,7 +20,7 @@
|
||||
}
|
||||
|
||||
.container {
|
||||
max-width: 900px;
|
||||
max-width: 1100px;
|
||||
margin: 0 auto;
|
||||
padding: 40px 20px;
|
||||
}
|
||||
@@ -39,6 +40,13 @@
|
||||
background-clip: text;
|
||||
}
|
||||
|
||||
h2 {
|
||||
font-size: 1.3rem;
|
||||
font-weight: 600;
|
||||
margin: 30px 0 16px 0;
|
||||
color: #94a3b8;
|
||||
}
|
||||
|
||||
.overall-status {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
@@ -174,8 +182,9 @@
|
||||
|
||||
.service-metrics {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
||||
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
|
||||
gap: 12px;
|
||||
margin-bottom: 16px;
|
||||
}
|
||||
|
||||
.metric {
|
||||
@@ -212,6 +221,132 @@
|
||||
color: #fca5a5;
|
||||
}
|
||||
|
||||
/* Latency chart */
|
||||
.latency-chart {
|
||||
height: 60px;
|
||||
margin-top: 12px;
|
||||
}
|
||||
|
||||
/* SSL Card */
|
||||
.ssl-card {
|
||||
background: rgba(30, 41, 59, 0.5);
|
||||
border: 1px solid rgba(100, 116, 139, 0.2);
|
||||
border-radius: 16px;
|
||||
padding: 20px;
|
||||
margin-bottom: 20px;
|
||||
}
|
||||
|
||||
.ssl-card.warning {
|
||||
border-color: rgba(250, 204, 21, 0.3);
|
||||
}
|
||||
|
||||
.ssl-card.danger {
|
||||
border-color: rgba(239, 68, 68, 0.3);
|
||||
}
|
||||
|
||||
.ssl-header {
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
margin-bottom: 12px;
|
||||
}
|
||||
|
||||
.ssl-title {
|
||||
font-size: 1.1rem;
|
||||
font-weight: 600;
|
||||
color: #f1f5f9;
|
||||
}
|
||||
|
||||
.ssl-badge {
|
||||
padding: 4px 12px;
|
||||
border-radius: 20px;
|
||||
font-size: 0.8rem;
|
||||
font-weight: 500;
|
||||
}
|
||||
|
||||
.ssl-badge.valid {
|
||||
background: rgba(34, 197, 94, 0.15);
|
||||
color: #22c55e;
|
||||
}
|
||||
|
||||
.ssl-badge.expiring {
|
||||
background: rgba(250, 204, 21, 0.15);
|
||||
color: #facc15;
|
||||
}
|
||||
|
||||
.ssl-badge.expired {
|
||||
background: rgba(239, 68, 68, 0.15);
|
||||
color: #ef4444;
|
||||
}
|
||||
|
||||
.ssl-info {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
/* Incidents */
|
||||
.incidents-list {
|
||||
background: rgba(30, 41, 59, 0.5);
|
||||
border: 1px solid rgba(100, 116, 139, 0.2);
|
||||
border-radius: 16px;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.incident-item {
|
||||
padding: 16px 20px;
|
||||
border-bottom: 1px solid rgba(100, 116, 139, 0.1);
|
||||
display: flex;
|
||||
justify-content: space-between;
|
||||
align-items: center;
|
||||
}
|
||||
|
||||
.incident-item:last-child {
|
||||
border-bottom: none;
|
||||
}
|
||||
|
||||
.incident-info {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 12px;
|
||||
}
|
||||
|
||||
.incident-dot {
|
||||
width: 10px;
|
||||
height: 10px;
|
||||
border-radius: 50%;
|
||||
}
|
||||
|
||||
.incident-dot.resolved {
|
||||
background: #22c55e;
|
||||
}
|
||||
|
||||
.incident-dot.open {
|
||||
background: #ef4444;
|
||||
animation: pulse 2s infinite;
|
||||
}
|
||||
|
||||
.incident-service {
|
||||
font-weight: 500;
|
||||
color: #f1f5f9;
|
||||
}
|
||||
|
||||
.incident-message {
|
||||
font-size: 0.85rem;
|
||||
color: #94a3b8;
|
||||
}
|
||||
|
||||
.incident-time {
|
||||
font-size: 0.85rem;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.no-incidents {
|
||||
padding: 30px;
|
||||
text-align: center;
|
||||
color: #64748b;
|
||||
}
|
||||
|
||||
.refresh-btn {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
@@ -292,8 +427,42 @@
|
||||
</p>
|
||||
</header>
|
||||
|
||||
{% if ssl_status %}
|
||||
<div class="ssl-card {% if ssl_status.days_until_expiry <= 0 %}danger{% elif ssl_status.days_until_expiry <= 14 %}warning{% endif %}">
|
||||
<div class="ssl-header">
|
||||
<span class="ssl-title">SSL Certificate</span>
|
||||
<span class="ssl-badge {% if ssl_status.days_until_expiry <= 0 %}expired{% elif ssl_status.days_until_expiry <= 14 %}expiring{% else %}valid{% endif %}">
|
||||
{% if ssl_status.days_until_expiry <= 0 %}
|
||||
Expired
|
||||
{% elif ssl_status.days_until_expiry <= 14 %}
|
||||
Expiring Soon
|
||||
{% else %}
|
||||
Valid
|
||||
{% endif %}
|
||||
</span>
|
||||
</div>
|
||||
<div class="ssl-info">
|
||||
<div class="metric">
|
||||
<div class="metric-label">Domain</div>
|
||||
<div class="metric-value">{{ ssl_status.domain }}</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-label">Issuer</div>
|
||||
<div class="metric-value">{{ ssl_status.issuer }}</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-label">Days Left</div>
|
||||
<div class="metric-value {% if ssl_status.days_until_expiry <= 0 %}bad{% elif ssl_status.days_until_expiry <= 14 %}warning{% else %}good{% endif %}">
|
||||
{{ ssl_status.days_until_expiry }}
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
<div class="services-grid">
|
||||
{% for name, service in services.items() %}
|
||||
{% if service.status.value != 'unknown' or name != 'external' %}
|
||||
<div class="service-card">
|
||||
<div class="service-header">
|
||||
<span class="service-name">{{ service.display_name }}</span>
|
||||
@@ -322,7 +491,17 @@
|
||||
</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-label">Uptime</div>
|
||||
<div class="metric-label">Avg 24h</div>
|
||||
<div class="metric-value {% if service.avg_latency_24h and service.avg_latency_24h < 200 %}good{% elif service.avg_latency_24h and service.avg_latency_24h < 500 %}warning{% elif service.avg_latency_24h %}bad{% endif %}">
|
||||
{% if service.avg_latency_24h %}
|
||||
{{ "%.0f"|format(service.avg_latency_24h) }} ms
|
||||
{% else %}
|
||||
—
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
<div class="metric">
|
||||
<div class="metric-label">Uptime 24h</div>
|
||||
<div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}">
|
||||
{{ "%.1f"|format(service.uptime_percent) }}%
|
||||
</div>
|
||||
@@ -333,20 +512,49 @@
|
||||
<div class="metric-value">{{ service.version }}</div>
|
||||
</div>
|
||||
{% endif %}
|
||||
{% if service.last_incident %}
|
||||
<div class="metric">
|
||||
<div class="metric-label">Last Incident</div>
|
||||
<div class="metric-value warning">{{ service.last_incident.strftime('%d.%m %H:%M') }}</div>
|
||||
</div>
|
||||
{% if service.latency_history and service.latency_history|length > 1 %}
|
||||
<div class="latency-chart">
|
||||
<canvas id="chart-{{ name }}"></canvas>
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% if service.message %}
|
||||
<div class="service-message">{{ service.message }}</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
||||
<h2>Recent Incidents</h2>
|
||||
<div class="incidents-list">
|
||||
{% if incidents and incidents|length > 0 %}
|
||||
{% for incident in incidents %}
|
||||
<div class="incident-item">
|
||||
<div class="incident-info">
|
||||
<span class="incident-dot {% if incident.resolved_at %}resolved{% else %}open{% endif %}"></span>
|
||||
<div>
|
||||
<div class="incident-service">{{ incident.service_name | title }}</div>
|
||||
<div class="incident-message">{{ incident.message or 'Service unavailable' }}</div>
|
||||
</div>
|
||||
</div>
|
||||
<div class="incident-time">
|
||||
{{ incident.started_at[:16].replace('T', ' ') }}
|
||||
{% if incident.resolved_at %}
|
||||
- Resolved
|
||||
{% else %}
|
||||
- Ongoing
|
||||
{% endif %}
|
||||
</div>
|
||||
</div>
|
||||
{% endfor %}
|
||||
{% else %}
|
||||
<div class="no-incidents">
|
||||
No recent incidents
|
||||
</div>
|
||||
{% endif %}
|
||||
</div>
|
||||
|
||||
<center>
|
||||
<button class="refresh-btn" onclick="refreshStatus(this)">
|
||||
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
||||
@@ -363,6 +571,55 @@
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Initialize latency charts
|
||||
{% for name, service in services.items() %}
|
||||
{% if service.latency_history and service.latency_history|length > 1 %}
|
||||
(function() {
|
||||
const ctx = document.getElementById('chart-{{ name }}').getContext('2d');
|
||||
const data = {{ service.latency_history | tojson }};
|
||||
|
||||
new Chart(ctx, {
|
||||
type: 'line',
|
||||
data: {
|
||||
labels: data.map(d => ''),
|
||||
datasets: [{
|
||||
data: data.map(d => d.latency_ms),
|
||||
borderColor: '#00d4ff',
|
||||
backgroundColor: 'rgba(0, 212, 255, 0.1)',
|
||||
fill: true,
|
||||
tension: 0.4,
|
||||
pointRadius: 0,
|
||||
borderWidth: 2
|
||||
}]
|
||||
},
|
||||
options: {
|
||||
responsive: true,
|
||||
maintainAspectRatio: false,
|
||||
plugins: {
|
||||
legend: { display: false },
|
||||
tooltip: {
|
||||
callbacks: {
|
||||
label: (ctx) => ctx.raw.toFixed(0) + ' ms'
|
||||
}
|
||||
}
|
||||
},
|
||||
scales: {
|
||||
x: { display: false },
|
||||
y: {
|
||||
display: false,
|
||||
beginAtZero: true
|
||||
}
|
||||
},
|
||||
interaction: {
|
||||
intersect: false,
|
||||
mode: 'index'
|
||||
}
|
||||
}
|
||||
});
|
||||
})();
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
||||
async function refreshStatus(btn) {
|
||||
btn.classList.add('loading');
|
||||
btn.disabled = true;
|
||||
|
||||
Reference in New Issue
Block a user