Redesign health service + create backup service
This commit is contained in:
@@ -20,5 +20,14 @@ S3_SECRET_ACCESS_KEY=your-secret-access-key
|
|||||||
S3_ENDPOINT_URL=https://s3.firstvds.ru
|
S3_ENDPOINT_URL=https://s3.firstvds.ru
|
||||||
S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru
|
S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru
|
||||||
|
|
||||||
|
# Backup Service
|
||||||
|
TELEGRAM_ADMIN_ID=947392854
|
||||||
|
S3_BACKUP_PREFIX=backups/
|
||||||
|
BACKUP_RETENTION_DAYS=14
|
||||||
|
|
||||||
|
# Status Service (optional - for external monitoring)
|
||||||
|
EXTERNAL_URL=https://your-domain.com
|
||||||
|
PUBLIC_URL=https://your-domain.com
|
||||||
|
|
||||||
# Frontend (for build)
|
# Frontend (for build)
|
||||||
VITE_API_URL=/api/v1
|
VITE_API_URL=/api/v1
|
||||||
|
|||||||
23
Makefile
23
Makefile
@@ -31,6 +31,12 @@ help:
|
|||||||
@echo " make shell - Open backend shell"
|
@echo " make shell - Open backend shell"
|
||||||
@echo " make frontend-sh - Open frontend shell"
|
@echo " make frontend-sh - Open frontend shell"
|
||||||
@echo ""
|
@echo ""
|
||||||
|
@echo " Backup:"
|
||||||
|
@echo " make backup-now - Run backup immediately"
|
||||||
|
@echo " make backup-list - List available backups in S3"
|
||||||
|
@echo " make backup-restore - Restore from backup (interactive)"
|
||||||
|
@echo " make backup-logs - Show backup service logs"
|
||||||
|
@echo ""
|
||||||
@echo " Cleanup:"
|
@echo " Cleanup:"
|
||||||
@echo " make clean - Stop and remove containers, volumes"
|
@echo " make clean - Stop and remove containers, volumes"
|
||||||
@echo " make prune - Remove unused Docker resources"
|
@echo " make prune - Remove unused Docker resources"
|
||||||
@@ -137,3 +143,20 @@ test-backend:
|
|||||||
# Production
|
# Production
|
||||||
prod:
|
prod:
|
||||||
$(DC) -f docker-compose.yml up -d --build
|
$(DC) -f docker-compose.yml up -d --build
|
||||||
|
|
||||||
|
# Backup
|
||||||
|
backup-now:
|
||||||
|
$(DC) exec backup python /app/backup.py
|
||||||
|
|
||||||
|
backup-list:
|
||||||
|
$(DC) exec backup python /app/restore.py
|
||||||
|
|
||||||
|
backup-restore:
|
||||||
|
@read -p "Backup filename: " file; \
|
||||||
|
$(DC) exec -it backup python /app/restore.py "$$file"
|
||||||
|
|
||||||
|
backup-logs:
|
||||||
|
$(DC) logs -f backup
|
||||||
|
|
||||||
|
backup-shell:
|
||||||
|
$(DC) exec backup bash
|
||||||
|
|||||||
30
backup-service/Dockerfile
Normal file
30
backup-service/Dockerfile
Normal file
@@ -0,0 +1,30 @@
|
|||||||
|
FROM python:3.11-slim
|
||||||
|
|
||||||
|
WORKDIR /app
|
||||||
|
|
||||||
|
# Install PostgreSQL client (for pg_dump and psql) and cron
|
||||||
|
RUN apt-get update && apt-get install -y \
|
||||||
|
postgresql-client \
|
||||||
|
cron \
|
||||||
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
|
# Install Python dependencies
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Copy application
|
||||||
|
COPY . .
|
||||||
|
|
||||||
|
# Make scripts executable
|
||||||
|
RUN chmod +x backup.py restore.py
|
||||||
|
|
||||||
|
# Setup cron
|
||||||
|
COPY crontab /etc/cron.d/backup-cron
|
||||||
|
RUN chmod 0644 /etc/cron.d/backup-cron
|
||||||
|
RUN crontab /etc/cron.d/backup-cron
|
||||||
|
|
||||||
|
# Create log file
|
||||||
|
RUN touch /var/log/cron.log
|
||||||
|
|
||||||
|
# Start cron in foreground and tail logs
|
||||||
|
CMD ["sh", "-c", "printenv > /etc/environment && cron && tail -f /var/log/cron.log"]
|
||||||
217
backup-service/backup.py
Normal file
217
backup-service/backup.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
PostgreSQL Backup Service for WebApp.
|
||||||
|
|
||||||
|
- Creates pg_dump backup
|
||||||
|
- Compresses with gzip
|
||||||
|
- Uploads to S3 FirstVDS
|
||||||
|
- Rotates old backups (configurable retention)
|
||||||
|
- Sends Telegram notifications
|
||||||
|
"""
|
||||||
|
import gzip
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timedelta, timezone
|
||||||
|
|
||||||
|
import boto3
|
||||||
|
import httpx
|
||||||
|
from botocore.config import Config as BotoConfig
|
||||||
|
from botocore.exceptions import ClientError
|
||||||
|
|
||||||
|
from config import config
|
||||||
|
|
||||||
|
|
||||||
|
def create_s3_client():
    """Build the boto3 S3 client from the service configuration.

    Mirrors the backend's storage.py setup; falls back to "us-east-1"
    when S3_REGION is unset.
    """
    boto_cfg = BotoConfig(signature_version="s3v4")
    region = config.S3_REGION if config.S3_REGION else "us-east-1"
    client = boto3.client(
        "s3",
        endpoint_url=config.S3_ENDPOINT_URL,
        aws_access_key_id=config.S3_ACCESS_KEY_ID,
        aws_secret_access_key=config.S3_SECRET_ACCESS_KEY,
        region_name=region,
        config=boto_cfg,
    )
    return client
|
||||||
|
|
||||||
|
|
||||||
|
def send_telegram_notification(message: str, is_error: bool = False) -> None:
    """Best-effort Telegram notification to the configured admin chat.

    Returns silently when the bot token or admin id is missing; HTTP
    failures are logged to stdout instead of raised.
    """
    if not (config.TELEGRAM_BOT_TOKEN and config.TELEGRAM_ADMIN_ID):
        print("Telegram not configured, skipping notification")
        return

    status_emoji = "\u2705" if not is_error else "\u274c"
    payload = {
        "chat_id": config.TELEGRAM_ADMIN_ID,
        "text": f"{status_emoji} *Database Backup*\n\n{message}",
        "parse_mode": "Markdown",
    }
    endpoint = f"https://api.telegram.org/bot{config.TELEGRAM_BOT_TOKEN}/sendMessage"

    try:
        resp = httpx.post(endpoint, json=payload, timeout=30)
        resp.raise_for_status()
    except Exception as e:
        print(f"Failed to send Telegram notification: {e}")
    else:
        print("Telegram notification sent")
|
||||||
|
|
||||||
|
|
||||||
|
def create_backup() -> tuple[str, bytes]:
    """Dump the configured database with pg_dump and gzip the result.

    Returns:
        (filename, compressed_bytes); the filename embeds a UTC timestamp.

    Raises:
        Exception: if pg_dump exits non-zero.
    """
    stamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"marathon_backup_{stamp}.sql.gz"

    # pg_dump authenticates via the PGPASSWORD environment variable.
    dump_env = dict(os.environ, PGPASSWORD=config.DB_PASSWORD)
    dump_cmd = [
        "pg_dump",
        "-h", config.DB_HOST,
        "-p", config.DB_PORT,
        "-U", config.DB_USER,
        "-d", config.DB_NAME,
        "--no-owner",
        "--no-acl",
        "-F", "p",  # plain SQL format
    ]

    print(f"Running pg_dump for database {config.DB_NAME}...")
    proc = subprocess.run(dump_cmd, env=dump_env, capture_output=True)
    if proc.returncode != 0:
        raise Exception(f"pg_dump failed: {proc.stderr.decode()}")

    print("Compressing backup...")
    return filename, gzip.compress(proc.stdout, compresslevel=9)
|
||||||
|
|
||||||
|
|
||||||
|
def upload_to_s3(s3_client, filename: str, data: bytes) -> str:
    """Upload a compressed backup to S3.

    Args:
        s3_client: boto3 S3 client.
        filename: backup file name, appended to S3_BACKUP_PREFIX.
        data: gzip-compressed dump bytes.

    Returns:
        The full S3 object key the backup was stored under.
    """
    # The object key must include the backup's filename; without it every
    # upload would land on the same key and overwrite the previous backup.
    key = f"{config.S3_BACKUP_PREFIX}{filename}"

    print(f"Uploading to S3: {key}...")
    s3_client.put_object(
        Bucket=config.S3_BUCKET_NAME,
        Key=key,
        Body=data,
        ContentType="application/gzip",
    )

    return key
|
||||||
|
|
||||||
|
|
||||||
|
def rotate_old_backups(s3_client) -> int:
    """Delete S3 backups older than BACKUP_RETENTION_DAYS.

    Returns the number of objects deleted. S3 listing/deletion errors
    are logged and rotation stops at that point.
    """
    retention = timedelta(days=config.BACKUP_RETENTION_DAYS)
    cutoff = datetime.now(timezone.utc) - retention
    removed = 0

    print(f"Rotating backups older than {config.BACKUP_RETENTION_DAYS} days...")

    try:
        lister = s3_client.get_paginator("list_objects_v2")
        for page in lister.paginate(
            Bucket=config.S3_BUCKET_NAME,
            Prefix=config.S3_BACKUP_PREFIX,
        ):
            for entry in page.get("Contents", []):
                modified = entry["LastModified"]
                # Normalize naive timestamps to UTC before comparing.
                if modified.tzinfo is None:
                    modified = modified.replace(tzinfo=timezone.utc)
                if modified < cutoff:
                    s3_client.delete_object(
                        Bucket=config.S3_BUCKET_NAME,
                        Key=entry["Key"],
                    )
                    removed += 1
                    print(f"Deleted old backup: {entry['Key']}")
    except ClientError as e:
        print(f"Error during rotation: {e}")

    return removed
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Run one backup cycle: dump, upload, rotate, notify.

    Returns:
        0 on success, 1 on any failure (after a Telegram error alert).
    """
    start_time = datetime.now()

    print(f"{'=' * 50}")
    print(f"Backup started at {start_time}")
    print(f"{'=' * 50}")

    try:
        # Fail fast on missing S3 configuration.
        if not config.S3_BUCKET_NAME:
            raise Exception("S3_BUCKET_NAME is not configured")
        if not config.S3_ACCESS_KEY_ID:
            raise Exception("S3_ACCESS_KEY_ID is not configured")
        if not config.S3_SECRET_ACCESS_KEY:
            raise Exception("S3_SECRET_ACCESS_KEY is not configured")
        if not config.S3_ENDPOINT_URL:
            raise Exception("S3_ENDPOINT_URL is not configured")

        s3_client = create_s3_client()

        # Dump + compress.
        filename, data = create_backup()
        size_mb = len(data) / (1024 * 1024)
        # Report the actual backup filename (placeholder was lost before).
        print(f"Backup created: {filename} ({size_mb:.2f} MB)")

        # Upload to S3.
        s3_key = upload_to_s3(s3_client, filename, data)
        print(f"Uploaded to S3: {s3_key}")

        # Apply the retention policy.
        deleted_count = rotate_old_backups(s3_client)
        print(f"Deleted {deleted_count} old backups")

        duration = datetime.now() - start_time

        # Success notification carries the real filename so the admin can
        # identify the backup for later restore.
        message = (
            f"Backup completed successfully!\n\n"
            f"*File:* `{filename}`\n"
            f"*Size:* {size_mb:.2f} MB\n"
            f"*Duration:* {duration.seconds}s\n"
            f"*Deleted old:* {deleted_count} files"
        )
        send_telegram_notification(message, is_error=False)

        print(f"{'=' * 50}")
        print("Backup completed successfully!")
        print(f"{'=' * 50}")
        return 0

    except Exception as e:
        error_msg = f"Backup failed!\n\n*Error:* `{str(e)}`"
        send_telegram_notification(error_msg, is_error=True)
        print(f"{'=' * 50}")
        print(f"Backup failed: {e}")
        print(f"{'=' * 50}")
        return 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
33
backup-service/config.py
Normal file
33
backup-service/config.py
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
"""Configuration for backup service."""
|
||||||
|
import os
|
||||||
|
from dataclasses import dataclass
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Config:
    """Backup service configuration, resolved from environment variables.

    Defaults target the docker-compose service/database names; values are
    read once, when the class body is evaluated at import time.
    """

    # Database connection
    DB_HOST: str = os.environ.get("DB_HOST", "db")
    DB_PORT: str = os.environ.get("DB_PORT", "5432")
    DB_NAME: str = os.environ.get("DB_NAME", "marathon")
    DB_USER: str = os.environ.get("DB_USER", "marathon")
    DB_PASSWORD: str = os.environ.get("DB_PASSWORD", "123")

    # S3 target for backup uploads
    S3_BUCKET_NAME: str = os.environ.get("S3_BUCKET_NAME", "")
    S3_REGION: str = os.environ.get("S3_REGION", "ru-1")
    S3_ACCESS_KEY_ID: str = os.environ.get("S3_ACCESS_KEY_ID", "")
    S3_SECRET_ACCESS_KEY: str = os.environ.get("S3_SECRET_ACCESS_KEY", "")
    S3_ENDPOINT_URL: str = os.environ.get("S3_ENDPOINT_URL", "")
    S3_BACKUP_PREFIX: str = os.environ.get("S3_BACKUP_PREFIX", "backups/")

    # Telegram notifications
    TELEGRAM_BOT_TOKEN: str = os.environ.get("TELEGRAM_BOT_TOKEN", "")
    TELEGRAM_ADMIN_ID: str = os.environ.get("TELEGRAM_ADMIN_ID", "947392854")

    # Retention policy for rotate_old_backups
    BACKUP_RETENTION_DAYS: int = int(os.environ.get("BACKUP_RETENTION_DAYS", "14"))


# Shared module-level configuration instance.
config = Config()
|
||||||
4
backup-service/crontab
Normal file
4
backup-service/crontab
Normal file
@@ -0,0 +1,4 @@
|
|||||||
|
# Backup cron job
|
||||||
|
# Run backup daily at 3:00 AM UTC
|
||||||
|
0 3 * * * /usr/local/bin/python /app/backup.py >> /var/log/cron.log 2>&1
|
||||||
|
# Empty line required at end of crontab
|
||||||
2
backup-service/requirements.txt
Normal file
2
backup-service/requirements.txt
Normal file
@@ -0,0 +1,2 @@
|
|||||||
|
boto3==1.34.0
|
||||||
|
httpx==0.26.0
|
||||||
158
backup-service/restore.py
Normal file
158
backup-service/restore.py
Normal file
@@ -0,0 +1,158 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Restore PostgreSQL database from S3 backup.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python restore.py - List available backups
|
||||||
|
python restore.py <filename> - Restore from specific backup
|
||||||
|
"""
|
||||||
|
import gzip
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import boto3
|
||||||
|
from botocore.config import Config as BotoConfig
|
||||||
|
from botocore.exceptions import ClientError
|
||||||
|
|
||||||
|
from config import config
|
||||||
|
|
||||||
|
|
||||||
|
def create_s3_client():
    """Construct the boto3 S3 client used for backup listing/retrieval."""
    client_kwargs = {
        "endpoint_url": config.S3_ENDPOINT_URL,
        "aws_access_key_id": config.S3_ACCESS_KEY_ID,
        "aws_secret_access_key": config.S3_SECRET_ACCESS_KEY,
        "region_name": config.S3_REGION or "us-east-1",
        "config": BotoConfig(signature_version="s3v4"),
    }
    return boto3.client("s3", **client_kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
def list_backups(s3_client) -> list[tuple[str, float, str]]:
    """List available backups under the configured S3 prefix.

    Prints each backup and returns (filename, size_mb, modified) tuples
    sorted newest-first. Returns an empty list on S3 errors.
    """
    print("Available backups:\n")

    try:
        paginator = s3_client.get_paginator("list_objects_v2")
        pages = paginator.paginate(
            Bucket=config.S3_BUCKET_NAME,
            Prefix=config.S3_BACKUP_PREFIX,
        )

        backups = []
        for page in pages:
            for obj in page.get("Contents", []):
                filename = obj["Key"].replace(config.S3_BACKUP_PREFIX, "")
                size_mb = obj["Size"] / (1024 * 1024)
                modified = obj["LastModified"].strftime("%Y-%m-%d %H:%M:%S")
                backups.append((filename, size_mb, modified))

        # Sort by date descending (the timestamp string sorts lexicographically).
        backups.sort(key=lambda x: x[2], reverse=True)

        for filename, size_mb, modified in backups:
            # Show the backup's filename so the user can pass it to restore.
            print(f"  {filename} ({size_mb:.2f} MB) - {modified}")

        return backups

    except ClientError as e:
        print(f"Error listing backups: {e}")
        return []
|
||||||
|
|
||||||
|
|
||||||
|
def restore_backup(s3_client, filename: str) -> None:
    """Download a backup from S3, decompress it, and pipe it into psql.

    Args:
        s3_client: boto3 S3 client.
        filename: backup filename, relative to S3_BACKUP_PREFIX.

    Raises:
        Exception: on download failure or fatal psql errors.
    """
    # Build the real object key from the requested filename; without the
    # filename component the download would target the bare prefix.
    key = f"{config.S3_BACKUP_PREFIX}{filename}"

    print(f"Downloading {filename} from S3...")
    try:
        response = s3_client.get_object(
            Bucket=config.S3_BUCKET_NAME,
            Key=key,
        )
        compressed_data = response["Body"].read()
    except ClientError as e:
        raise Exception(f"Failed to download backup: {e}") from e

    print("Decompressing...")
    sql_data = gzip.decompress(compressed_data)

    print(f"Restoring to database {config.DB_NAME}...")

    # psql authenticates via the PGPASSWORD environment variable.
    env = os.environ.copy()
    env["PGPASSWORD"] = config.DB_PASSWORD

    cmd = [
        "psql",
        "-h", config.DB_HOST,
        "-p", config.DB_PORT,
        "-U", config.DB_USER,
        "-d", config.DB_NAME,
    ]

    result = subprocess.run(
        cmd,
        env=env,
        input=sql_data,
        capture_output=True,
    )

    if result.returncode != 0:
        stderr = result.stderr.decode()
        # psql may return warnings that aren't fatal errors; only treat
        # genuine ERROR output as a failed restore.
        if "ERROR" in stderr:
            raise Exception(f"psql restore failed: {stderr}")
        else:
            print(f"Warnings: {stderr}")

    print("Restore completed successfully!")
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> int:
    """Entry point: list backups (no args) or restore a named backup.

    Returns:
        0 on success or user cancel, 1 on configuration/restore errors.
    """
    # Validate configuration before touching S3.
    if not config.S3_BUCKET_NAME:
        print("Error: S3_BUCKET_NAME is not configured")
        return 1

    s3_client = create_s3_client()

    if len(sys.argv) < 2:
        # No filename given: just list what is available.
        backups = list_backups(s3_client)
        if backups:
            print("\nTo restore, run: python restore.py <filename>")
        else:
            print("No backups found.")
        return 0

    filename = sys.argv[1]

    # Destructive operation: show which backup will be applied and require
    # explicit confirmation (the filename placeholder was lost before).
    print(f"WARNING: This will restore database from {filename}")
    print("This may overwrite existing data!")
    print()

    confirm = input("Type 'yes' to continue: ")
    if confirm.lower() != "yes":
        print("Restore cancelled.")
        return 0

    try:
        restore_backup(s3_client, filename)
        return 0
    except Exception as e:
        print(f"Restore failed: {e}")
        return 1
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
sys.exit(main())
|
||||||
@@ -94,7 +94,13 @@ services:
|
|||||||
BACKEND_URL: http://backend:8000
|
BACKEND_URL: http://backend:8000
|
||||||
FRONTEND_URL: http://frontend:80
|
FRONTEND_URL: http://frontend:80
|
||||||
BOT_URL: http://bot:8080
|
BOT_URL: http://bot:8080
|
||||||
|
EXTERNAL_URL: ${EXTERNAL_URL:-}
|
||||||
|
PUBLIC_URL: ${PUBLIC_URL:-}
|
||||||
CHECK_INTERVAL: "30"
|
CHECK_INTERVAL: "30"
|
||||||
|
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
|
||||||
|
TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854}
|
||||||
|
volumes:
|
||||||
|
- status_data:/app/data
|
||||||
ports:
|
ports:
|
||||||
- "8001:8001"
|
- "8001:8001"
|
||||||
depends_on:
|
depends_on:
|
||||||
@@ -103,5 +109,31 @@ services:
|
|||||||
- bot
|
- bot
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
|
|
||||||
|
backup:
|
||||||
|
build:
|
||||||
|
context: ./backup-service
|
||||||
|
dockerfile: Dockerfile
|
||||||
|
container_name: marathon-backup
|
||||||
|
environment:
|
||||||
|
DB_HOST: db
|
||||||
|
DB_PORT: "5432"
|
||||||
|
DB_NAME: marathon
|
||||||
|
DB_USER: marathon
|
||||||
|
DB_PASSWORD: ${DB_PASSWORD:-marathon}
|
||||||
|
S3_BUCKET_NAME: ${S3_BUCKET_NAME:-}
|
||||||
|
S3_REGION: ${S3_REGION:-ru-1}
|
||||||
|
S3_ACCESS_KEY_ID: ${S3_ACCESS_KEY_ID:-}
|
||||||
|
S3_SECRET_ACCESS_KEY: ${S3_SECRET_ACCESS_KEY:-}
|
||||||
|
S3_ENDPOINT_URL: ${S3_ENDPOINT_URL:-}
|
||||||
|
S3_BACKUP_PREFIX: ${S3_BACKUP_PREFIX:-backups/}
|
||||||
|
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
|
||||||
|
TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854}
|
||||||
|
BACKUP_RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-14}
|
||||||
|
depends_on:
|
||||||
|
db:
|
||||||
|
condition: service_healthy
|
||||||
|
restart: unless-stopped
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
postgres_data:
|
postgres_data:
|
||||||
|
status_data:
|
||||||
|
|||||||
@@ -6,6 +6,9 @@ WORKDIR /app
|
|||||||
COPY requirements.txt .
|
COPY requirements.txt .
|
||||||
RUN pip install --no-cache-dir -r requirements.txt
|
RUN pip install --no-cache-dir -r requirements.txt
|
||||||
|
|
||||||
|
# Create data directory for SQLite
|
||||||
|
RUN mkdir -p /app/data
|
||||||
|
|
||||||
# Copy application
|
# Copy application
|
||||||
COPY . .
|
COPY . .
|
||||||
|
|
||||||
|
|||||||
85
status-service/alerts.py
Normal file
85
status-service/alerts.py
Normal file
@@ -0,0 +1,85 @@
|
|||||||
|
"""Telegram alerting for status changes."""
|
||||||
|
import os
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import httpx
|
||||||
|
|
||||||
|
|
||||||
|
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
|
||||||
|
TELEGRAM_ADMIN_ID = os.getenv("TELEGRAM_ADMIN_ID", "")
|
||||||
|
|
||||||
|
|
||||||
|
async def send_telegram_alert(message: str, is_recovery: bool = False) -> bool:
    """Post a status alert to the admin's Telegram chat.

    Returns True when the message was delivered, False when alerting is
    not configured or the HTTP call fails.
    """
    if not (TELEGRAM_BOT_TOKEN and TELEGRAM_ADMIN_ID):
        print("Telegram alerting not configured")
        return False

    prefix = "\u2705" if is_recovery else "\u26a0\ufe0f"
    payload = {
        "chat_id": TELEGRAM_ADMIN_ID,
        "text": f"{prefix} *Status Alert*\n\n{message}",
        "parse_mode": "Markdown",
    }
    endpoint = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"

    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            resp = await client.post(endpoint, json=payload)
            resp.raise_for_status()
    except Exception as e:
        print(f"Failed to send Telegram alert: {e}")
        return False

    print(f"Telegram alert sent: {message[:50]}...")
    return True
|
||||||
|
|
||||||
|
|
||||||
|
async def alert_service_down(service_name: str, display_name: str, message: Optional[str]):
    """Notify the admin that a monitored service went down."""
    stamp = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
    body = f"*{display_name}* is DOWN\n\nTime: `{stamp}`\n"
    if message:
        body += f"Error: `{message}`"
    await send_telegram_alert(body, is_recovery=False)
|
||||||
|
|
||||||
|
|
||||||
|
async def alert_service_recovered(service_name: str, display_name: str, downtime_minutes: int):
    """Notify the admin that a previously-down service is healthy again."""
    stamp = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
    body = (
        f"*{display_name}* is back ONLINE\n\n"
        f"Time: `{stamp}`\n"
        f"Downtime: `{downtime_minutes} min`"
    )
    await send_telegram_alert(body, is_recovery=True)
|
||||||
|
|
||||||
|
|
||||||
|
async def alert_ssl_expiring(domain: str, days_left: int):
    """Warn the admin that a domain's SSL certificate expires soon."""
    body = (
        f"*SSL Certificate Expiring*\n\n"
        f"Domain: `{domain}`\n"
        f"Days left: `{days_left}`\n\n"
        f"Please renew the certificate!"
    )
    await send_telegram_alert(body, is_recovery=False)
|
||||||
|
|
||||||
|
|
||||||
|
async def alert_ssl_expired(domain: str):
    """Alert the admin that a domain's SSL certificate has already expired."""
    body = (
        f"*SSL Certificate EXPIRED*\n\n"
        f"Domain: `{domain}`\n\n"
        f"Certificate has expired! Site may show security warnings."
    )
    await send_telegram_alert(body, is_recovery=False)
|
||||||
261
status-service/database.py
Normal file
261
status-service/database.py
Normal file
@@ -0,0 +1,261 @@
|
|||||||
|
"""SQLite database for storing metrics history."""
|
||||||
|
import sqlite3
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
import json
|
||||||
|
|
||||||
|
|
||||||
|
DB_PATH = Path("/app/data/metrics.db")
|
||||||
|
|
||||||
|
|
||||||
|
def get_connection() -> sqlite3.Connection:
    """Open a connection to the metrics database, creating its directory.

    Rows come back as sqlite3.Row so columns can be read by name.
    """
    DB_PATH.parent.mkdir(parents=True, exist_ok=True)
    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
|
||||||
|
|
||||||
|
|
||||||
|
def init_db():
    """Create the metrics, incidents, and ssl_certificates tables plus indexes.

    Idempotent: all DDL uses IF NOT EXISTS.
    """
    ddl_statements = [
        # Per-check metric samples.
        """
        CREATE TABLE IF NOT EXISTS metrics (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            service_name TEXT NOT NULL,
            status TEXT NOT NULL,
            latency_ms REAL,
            message TEXT,
            checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Open/resolved incident records.
        """
        CREATE TABLE IF NOT EXISTS incidents (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            service_name TEXT NOT NULL,
            status TEXT NOT NULL,
            message TEXT,
            started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
            resolved_at TIMESTAMP,
            notified BOOLEAN DEFAULT FALSE
        )
        """,
        # Latest SSL certificate state per domain.
        """
        CREATE TABLE IF NOT EXISTS ssl_certificates (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            domain TEXT NOT NULL UNIQUE,
            issuer TEXT,
            expires_at TIMESTAMP,
            days_until_expiry INTEGER,
            checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Indexes for the common "per-service, newest first" queries.
        """
        CREATE INDEX IF NOT EXISTS idx_metrics_service_time
        ON metrics(service_name, checked_at DESC)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_incidents_service
        ON incidents(service_name, started_at DESC)
        """,
    ]

    conn = get_connection()
    cursor = conn.cursor()
    for statement in ddl_statements:
        cursor.execute(statement)
    conn.commit()
    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def save_metric(service_name: str, status: str, latency_ms: Optional[float], message: Optional[str]):
    """Append one health-check sample to the metrics table."""
    conn = get_connection()
    conn.execute(
        "INSERT INTO metrics (service_name, status, latency_ms, message) VALUES (?, ?, ?, ?)",
        (service_name, status, latency_ms, message),
    )
    conn.commit()
    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_latency_history(service_name: str, hours: int = 24) -> list[dict]:
    """Return latency samples for *service_name* over the last *hours* hours.

    Only rows with a non-NULL latency are included, oldest first.
    """
    cutoff = (datetime.now() - timedelta(hours=hours)).isoformat()
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute(
        """
        SELECT latency_ms, status, checked_at
        FROM metrics
        WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL
        ORDER BY checked_at ASC
        """,
        (service_name, cutoff),
    )
    history = [
        {
            "latency_ms": row["latency_ms"],
            "status": row["status"],
            "checked_at": row["checked_at"],
        }
        for row in cursor.fetchall()
    ]
    conn.close()
    return history
|
||||||
|
|
||||||
|
|
||||||
|
def get_uptime_stats(service_name: str, hours: int = 24) -> dict:
    """Return check counts and uptime percent for a service over a window.

    With no recorded checks the uptime defaults to 100%.
    """
    cutoff = (datetime.now() - timedelta(hours=hours)).isoformat()
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute(
        """
        SELECT COUNT(*) as total,
               SUM(CASE WHEN status = 'operational' THEN 1 ELSE 0 END) as successful
        FROM metrics
        WHERE service_name = ? AND checked_at > ?
        """,
        (service_name, cutoff),
    )
    row = cursor.fetchone()
    conn.close()

    total = row["total"] or 0
    ok = row["successful"] or 0
    uptime = (ok / total * 100) if total > 0 else 100.0
    return {
        "total_checks": total,
        "successful_checks": ok,
        "uptime_percent": uptime,
    }
|
||||||
|
|
||||||
|
|
||||||
|
def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]:
    """Return the mean latency (ms) over the window, or None with no samples."""
    cutoff = (datetime.now() - timedelta(hours=hours)).isoformat()
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute(
        """
        SELECT AVG(latency_ms) as avg_latency
        FROM metrics
        WHERE service_name = ? AND checked_at > ? AND latency_ms IS NOT NULL
        """,
        (service_name, cutoff),
    )
    row = cursor.fetchone()
    conn.close()
    # AVG over zero rows yields NULL, which maps to None here.
    return row["avg_latency"]
|
||||||
|
|
||||||
|
|
||||||
|
def create_incident(service_name: str, status: str, message: Optional[str]) -> int:
    """Insert a new open incident and return its row id."""
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute(
        "INSERT INTO incidents (service_name, status, message) VALUES (?, ?, ?)",
        (service_name, status, message),
    )
    new_id = cursor.lastrowid
    conn.commit()
    conn.close()
    return new_id
|
||||||
|
|
||||||
|
|
||||||
|
def resolve_incident(service_name: str):
    """Stamp resolved_at on every open incident for *service_name*."""
    conn = get_connection()
    conn.execute(
        """
        UPDATE incidents
        SET resolved_at = CURRENT_TIMESTAMP
        WHERE service_name = ? AND resolved_at IS NULL
        """,
        (service_name,),
    )
    conn.commit()
    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_open_incident(service_name: str) -> Optional[dict]:
    """Return the most recent unresolved incident for a service, if any."""
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute(
        """
        SELECT * FROM incidents
        WHERE service_name = ? AND resolved_at IS NULL
        ORDER BY started_at DESC LIMIT 1
        """,
        (service_name,),
    )
    row = cursor.fetchone()
    conn.close()
    return dict(row) if row else None
|
||||||
|
|
||||||
|
|
||||||
|
def mark_incident_notified(incident_id: int):
    """Flag an incident as already alerted so it is not re-notified."""
    conn = get_connection()
    conn.execute("UPDATE incidents SET notified = TRUE WHERE id = ?", (incident_id,))
    conn.commit()
    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_recent_incidents(limit: int = 10) -> list[dict]:
    """Return the newest incidents (open or resolved), newest first."""
    conn = get_connection()
    cursor = conn.cursor()
    cursor.execute(
        """
        SELECT * FROM incidents
        ORDER BY started_at DESC
        LIMIT ?
        """,
        (limit,),
    )
    recent = [dict(row) for row in cursor.fetchall()]
    conn.close()
    return recent
|
||||||
|
|
||||||
|
|
||||||
|
def save_ssl_info(domain: str, issuer: str, expires_at: datetime, days_until_expiry: int):
    """Upsert the latest certificate details for *domain*.

    INSERT OR REPLACE keyed on the UNIQUE domain column keeps one row
    per domain.
    """
    conn = get_connection()
    conn.execute(
        """
        INSERT OR REPLACE INTO ssl_certificates
        (domain, issuer, expires_at, days_until_expiry, checked_at)
        VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
        """,
        (domain, issuer, expires_at.isoformat(), days_until_expiry),
    )
    conn.commit()
    conn.close()
|
||||||
|
|
||||||
|
|
||||||
|
def get_ssl_info(domain: str) -> Optional[dict]:
|
||||||
|
"""Get SSL certificate info."""
|
||||||
|
conn = get_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cursor.execute("SELECT * FROM ssl_certificates WHERE domain = ?", (domain,))
|
||||||
|
row = cursor.fetchone()
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if row:
|
||||||
|
return dict(row)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def cleanup_old_metrics(days: int = 7):
|
||||||
|
"""Delete metrics older than specified days."""
|
||||||
|
conn = get_connection()
|
||||||
|
cursor = conn.cursor()
|
||||||
|
cutoff = datetime.now() - timedelta(days=days)
|
||||||
|
cursor.execute("DELETE FROM metrics WHERE checked_at < ?", (cutoff.isoformat(),))
|
||||||
|
deleted = cursor.rowcount
|
||||||
|
conn.commit()
|
||||||
|
conn.close()
|
||||||
|
return deleted
|
||||||
@@ -1,6 +1,7 @@
|
|||||||
|
"""Status monitoring service with persistence and alerting."""
|
||||||
import os
|
import os
|
||||||
import asyncio
|
import asyncio
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
|
|
||||||
@@ -8,13 +9,16 @@ from fastapi import FastAPI, Request
|
|||||||
from fastapi.responses import HTMLResponse
|
from fastapi.responses import HTMLResponse
|
||||||
from fastapi.templating import Jinja2Templates
|
from fastapi.templating import Jinja2Templates
|
||||||
|
|
||||||
from monitors import ServiceMonitor, ServiceStatus
|
from monitors import ServiceMonitor
|
||||||
|
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
|
||||||
|
|
||||||
|
|
||||||
# Configuration
|
# Configuration
|
||||||
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
|
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
|
||||||
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
|
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
|
||||||
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
|
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
|
||||||
|
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks
|
||||||
|
PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks
|
||||||
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))
|
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))
|
||||||
|
|
||||||
# Initialize monitor
|
# Initialize monitor
|
||||||
@@ -22,38 +26,64 @@ monitor = ServiceMonitor()
|
|||||||
|
|
||||||
# Background task reference
|
# Background task reference
|
||||||
background_task: Optional[asyncio.Task] = None
|
background_task: Optional[asyncio.Task] = None
|
||||||
|
cleanup_task: Optional[asyncio.Task] = None
|
||||||
|
|
||||||
|
|
||||||
async def periodic_health_check():
|
async def periodic_health_check():
|
||||||
"""Background task to check services periodically"""
|
"""Background task to check services periodically."""
|
||||||
while True:
|
while True:
|
||||||
await monitor.check_all_services(
|
try:
|
||||||
backend_url=BACKEND_URL,
|
await monitor.check_all_services(
|
||||||
frontend_url=FRONTEND_URL,
|
backend_url=BACKEND_URL,
|
||||||
bot_url=BOT_URL
|
frontend_url=FRONTEND_URL,
|
||||||
)
|
bot_url=BOT_URL,
|
||||||
|
external_url=EXTERNAL_URL,
|
||||||
|
public_url=PUBLIC_URL
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Health check error: {e}")
|
||||||
await asyncio.sleep(CHECK_INTERVAL)
|
await asyncio.sleep(CHECK_INTERVAL)
|
||||||
|
|
||||||
|
|
||||||
|
async def periodic_cleanup():
|
||||||
|
"""Background task to cleanup old metrics (daily)."""
|
||||||
|
while True:
|
||||||
|
await asyncio.sleep(86400) # 24 hours
|
||||||
|
try:
|
||||||
|
deleted = cleanup_old_metrics(days=7)
|
||||||
|
print(f"Cleaned up {deleted} old metrics")
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Cleanup error: {e}")
|
||||||
|
|
||||||
|
|
||||||
@asynccontextmanager
|
@asynccontextmanager
|
||||||
async def lifespan(app: FastAPI):
|
async def lifespan(app: FastAPI):
|
||||||
"""Startup and shutdown events"""
|
"""Startup and shutdown events."""
|
||||||
global background_task
|
global background_task, cleanup_task
|
||||||
|
|
||||||
|
# Initialize database
|
||||||
|
init_db()
|
||||||
|
print("Database initialized")
|
||||||
|
|
||||||
# Start background health checks
|
# Start background health checks
|
||||||
background_task = asyncio.create_task(periodic_health_check())
|
background_task = asyncio.create_task(periodic_health_check())
|
||||||
|
cleanup_task = asyncio.create_task(periodic_cleanup())
|
||||||
|
|
||||||
yield
|
yield
|
||||||
# Cancel background task on shutdown
|
|
||||||
if background_task:
|
# Cancel background tasks on shutdown
|
||||||
background_task.cancel()
|
for task in [background_task, cleanup_task]:
|
||||||
try:
|
if task:
|
||||||
await background_task
|
task.cancel()
|
||||||
except asyncio.CancelledError:
|
try:
|
||||||
pass
|
await task
|
||||||
|
except asyncio.CancelledError:
|
||||||
|
pass
|
||||||
|
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="Status Monitor",
|
title="Status Monitor",
|
||||||
description="Service health monitoring",
|
description="Service health monitoring with persistence and alerting",
|
||||||
lifespan=lifespan
|
lifespan=lifespan
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -62,9 +92,11 @@ templates = Jinja2Templates(directory="templates")
|
|||||||
|
|
||||||
@app.get("/", response_class=HTMLResponse)
|
@app.get("/", response_class=HTMLResponse)
|
||||||
async def status_page(request: Request):
|
async def status_page(request: Request):
|
||||||
"""Main status page"""
|
"""Main status page."""
|
||||||
services = monitor.get_all_statuses()
|
services = monitor.get_all_statuses()
|
||||||
overall_status = monitor.get_overall_status()
|
overall_status = monitor.get_overall_status()
|
||||||
|
ssl_status = monitor.get_ssl_status()
|
||||||
|
incidents = get_recent_incidents(limit=5)
|
||||||
|
|
||||||
return templates.TemplateResponse(
|
return templates.TemplateResponse(
|
||||||
"index.html",
|
"index.html",
|
||||||
@@ -72,6 +104,8 @@ async def status_page(request: Request):
|
|||||||
"request": request,
|
"request": request,
|
||||||
"services": services,
|
"services": services,
|
||||||
"overall_status": overall_status,
|
"overall_status": overall_status,
|
||||||
|
"ssl_status": ssl_status,
|
||||||
|
"incidents": incidents,
|
||||||
"last_check": monitor.last_check,
|
"last_check": monitor.last_check,
|
||||||
"check_interval": CHECK_INTERVAL
|
"check_interval": CHECK_INTERVAL
|
||||||
}
|
}
|
||||||
@@ -80,30 +114,52 @@ async def status_page(request: Request):
|
|||||||
|
|
||||||
@app.get("/api/status")
|
@app.get("/api/status")
|
||||||
async def api_status():
|
async def api_status():
|
||||||
"""API endpoint for service statuses"""
|
"""API endpoint for service statuses."""
|
||||||
services = monitor.get_all_statuses()
|
services = monitor.get_all_statuses()
|
||||||
overall_status = monitor.get_overall_status()
|
overall_status = monitor.get_overall_status()
|
||||||
|
ssl_status = monitor.get_ssl_status()
|
||||||
|
|
||||||
return {
|
return {
|
||||||
"overall_status": overall_status,
|
"overall_status": overall_status.value,
|
||||||
"services": {name: status.to_dict() for name, status in services.items()},
|
"services": {name: status.to_dict() for name, status in services.items()},
|
||||||
|
"ssl": ssl_status,
|
||||||
"last_check": monitor.last_check.isoformat() if monitor.last_check else None,
|
"last_check": monitor.last_check.isoformat() if monitor.last_check else None,
|
||||||
"check_interval_seconds": CHECK_INTERVAL
|
"check_interval_seconds": CHECK_INTERVAL
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/history/{service_name}")
|
||||||
|
async def api_history(service_name: str, hours: int = 24):
|
||||||
|
"""API endpoint for service latency history."""
|
||||||
|
history = get_latency_history(service_name, hours=hours)
|
||||||
|
return {
|
||||||
|
"service": service_name,
|
||||||
|
"hours": hours,
|
||||||
|
"data": history
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
@app.get("/api/incidents")
|
||||||
|
async def api_incidents(limit: int = 20):
|
||||||
|
"""API endpoint for recent incidents."""
|
||||||
|
incidents = get_recent_incidents(limit=limit)
|
||||||
|
return {"incidents": incidents}
|
||||||
|
|
||||||
|
|
||||||
@app.get("/api/health")
|
@app.get("/api/health")
|
||||||
async def health():
|
async def health():
|
||||||
"""Health check for this service"""
|
"""Health check for this service."""
|
||||||
return {"status": "ok", "service": "status-monitor"}
|
return {"status": "ok", "service": "status-monitor"}
|
||||||
|
|
||||||
|
|
||||||
@app.post("/api/refresh")
|
@app.post("/api/refresh")
|
||||||
async def refresh_status():
|
async def refresh_status():
|
||||||
"""Force refresh all service statuses"""
|
"""Force refresh all service statuses."""
|
||||||
await monitor.check_all_services(
|
await monitor.check_all_services(
|
||||||
backend_url=BACKEND_URL,
|
backend_url=BACKEND_URL,
|
||||||
frontend_url=FRONTEND_URL,
|
frontend_url=FRONTEND_URL,
|
||||||
bot_url=BOT_URL
|
bot_url=BOT_URL,
|
||||||
|
external_url=EXTERNAL_URL,
|
||||||
|
public_url=PUBLIC_URL
|
||||||
)
|
)
|
||||||
return {"status": "refreshed"}
|
return {"status": "refreshed"}
|
||||||
|
|||||||
@@ -1,11 +1,19 @@
|
|||||||
|
"""Service monitoring with persistence and alerting."""
|
||||||
import asyncio
|
import asyncio
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from dataclasses import dataclass, field
|
from dataclasses import dataclass
|
||||||
from typing import Optional
|
from typing import Optional
|
||||||
from enum import Enum
|
from enum import Enum
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
|
|
||||||
|
from database import (
|
||||||
|
save_metric, get_latency_history, get_uptime_stats, get_avg_latency,
|
||||||
|
create_incident, resolve_incident, get_open_incident, mark_incident_notified
|
||||||
|
)
|
||||||
|
from alerts import alert_service_down, alert_service_recovered
|
||||||
|
from ssl_monitor import check_and_alert_ssl, SSLInfo
|
||||||
|
|
||||||
|
|
||||||
class Status(str, Enum):
|
class Status(str, Enum):
|
||||||
OPERATIONAL = "operational"
|
OPERATIONAL = "operational"
|
||||||
@@ -25,11 +33,17 @@ class ServiceStatus:
|
|||||||
uptime_percent: float = 100.0
|
uptime_percent: float = 100.0
|
||||||
message: Optional[str] = None
|
message: Optional[str] = None
|
||||||
version: Optional[str] = None
|
version: Optional[str] = None
|
||||||
|
avg_latency_24h: Optional[float] = None
|
||||||
|
latency_history: list = None
|
||||||
|
|
||||||
# For uptime calculation
|
# For uptime calculation (in-memory, backed by DB)
|
||||||
total_checks: int = 0
|
total_checks: int = 0
|
||||||
successful_checks: int = 0
|
successful_checks: int = 0
|
||||||
|
|
||||||
|
def __post_init__(self):
|
||||||
|
if self.latency_history is None:
|
||||||
|
self.latency_history = []
|
||||||
|
|
||||||
def to_dict(self) -> dict:
|
def to_dict(self) -> dict:
|
||||||
return {
|
return {
|
||||||
"name": self.name,
|
"name": self.name,
|
||||||
@@ -40,7 +54,8 @@ class ServiceStatus:
|
|||||||
"last_incident": self.last_incident.isoformat() if self.last_incident else None,
|
"last_incident": self.last_incident.isoformat() if self.last_incident else None,
|
||||||
"uptime_percent": round(self.uptime_percent, 2),
|
"uptime_percent": round(self.uptime_percent, 2),
|
||||||
"message": self.message,
|
"message": self.message,
|
||||||
"version": self.version
|
"version": self.version,
|
||||||
|
"avg_latency_24h": round(self.avg_latency_24h, 2) if self.avg_latency_24h else None,
|
||||||
}
|
}
|
||||||
|
|
||||||
def update_uptime(self, is_success: bool):
|
def update_uptime(self, is_success: bool):
|
||||||
@@ -69,12 +84,17 @@ class ServiceMonitor:
|
|||||||
"bot": ServiceStatus(
|
"bot": ServiceStatus(
|
||||||
name="bot",
|
name="bot",
|
||||||
display_name="Telegram Bot"
|
display_name="Telegram Bot"
|
||||||
)
|
),
|
||||||
|
"external": ServiceStatus(
|
||||||
|
name="external",
|
||||||
|
display_name="External Access"
|
||||||
|
),
|
||||||
}
|
}
|
||||||
self.last_check: Optional[datetime] = None
|
self.last_check: Optional[datetime] = None
|
||||||
|
self.ssl_info: Optional[SSLInfo] = None
|
||||||
|
|
||||||
async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
|
async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
|
||||||
"""Check backend API health"""
|
"""Check backend API health."""
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||||
start = datetime.now()
|
start = datetime.now()
|
||||||
@@ -92,9 +112,7 @@ class ServiceMonitor:
|
|||||||
return Status.DOWN, None, str(e)[:100], None
|
return Status.DOWN, None, str(e)[:100], None
|
||||||
|
|
||||||
async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||||
"""Check database through backend"""
|
"""Check database through backend."""
|
||||||
# We check database indirectly - if backend is up, DB is likely up
|
|
||||||
# Could add a specific /health/db endpoint to backend later
|
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||||
start = datetime.now()
|
start = datetime.now()
|
||||||
@@ -109,7 +127,7 @@ class ServiceMonitor:
|
|||||||
return Status.DOWN, None, "Cannot reach backend"
|
return Status.DOWN, None, "Cannot reach backend"
|
||||||
|
|
||||||
async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||||
"""Check frontend availability"""
|
"""Check frontend availability."""
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||||
start = datetime.now()
|
start = datetime.now()
|
||||||
@@ -126,7 +144,7 @@ class ServiceMonitor:
|
|||||||
return Status.DOWN, None, str(e)[:100]
|
return Status.DOWN, None, str(e)[:100]
|
||||||
|
|
||||||
async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||||
"""Check Telegram bot health"""
|
"""Check Telegram bot health."""
|
||||||
try:
|
try:
|
||||||
async with httpx.AsyncClient(timeout=10.0) as client:
|
async with httpx.AsyncClient(timeout=10.0) as client:
|
||||||
start = datetime.now()
|
start = datetime.now()
|
||||||
@@ -142,8 +160,93 @@ class ServiceMonitor:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
return Status.DOWN, None, str(e)[:100]
|
return Status.DOWN, None, str(e)[:100]
|
||||||
|
|
||||||
async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str):
|
async def check_external(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
|
||||||
"""Check all services concurrently"""
|
"""Check external (public) URL availability."""
|
||||||
|
if not url:
|
||||||
|
return Status.UNKNOWN, None, "Not configured"
|
||||||
|
|
||||||
|
try:
|
||||||
|
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
|
||||||
|
start = datetime.now()
|
||||||
|
response = await client.get(url)
|
||||||
|
latency = (datetime.now() - start).total_seconds() * 1000
|
||||||
|
|
||||||
|
if response.status_code == 200:
|
||||||
|
return Status.OPERATIONAL, latency, None
|
||||||
|
else:
|
||||||
|
return Status.DEGRADED, latency, f"HTTP {response.status_code}"
|
||||||
|
except httpx.TimeoutException:
|
||||||
|
return Status.DOWN, None, "Timeout"
|
||||||
|
except Exception as e:
|
||||||
|
return Status.DOWN, None, str(e)[:100]
|
||||||
|
|
||||||
|
async def _process_check_result(
|
||||||
|
self,
|
||||||
|
service_name: str,
|
||||||
|
result: tuple,
|
||||||
|
now: datetime
|
||||||
|
):
|
||||||
|
"""Process check result with DB persistence and alerting."""
|
||||||
|
if isinstance(result, Exception):
|
||||||
|
return
|
||||||
|
|
||||||
|
if len(result) == 4:
|
||||||
|
status, latency, message, version = result
|
||||||
|
else:
|
||||||
|
status, latency, message = result
|
||||||
|
version = None
|
||||||
|
|
||||||
|
svc = self.services[service_name]
|
||||||
|
was_down = svc.status in (Status.DOWN, Status.DEGRADED)
|
||||||
|
is_down = status in (Status.DOWN, Status.DEGRADED)
|
||||||
|
|
||||||
|
# Update service status
|
||||||
|
svc.status = status
|
||||||
|
svc.latency_ms = latency
|
||||||
|
svc.message = message
|
||||||
|
if version:
|
||||||
|
svc.version = version
|
||||||
|
svc.last_check = now
|
||||||
|
svc.update_uptime(status == Status.OPERATIONAL)
|
||||||
|
|
||||||
|
# Save metric to database
|
||||||
|
save_metric(service_name, status.value, latency, message)
|
||||||
|
|
||||||
|
# Load historical data
|
||||||
|
svc.latency_history = get_latency_history(service_name, hours=24)
|
||||||
|
svc.avg_latency_24h = get_avg_latency(service_name, hours=24)
|
||||||
|
|
||||||
|
# Update uptime from DB
|
||||||
|
stats = get_uptime_stats(service_name, hours=24)
|
||||||
|
if stats["total_checks"] > 0:
|
||||||
|
svc.uptime_percent = stats["uptime_percent"]
|
||||||
|
|
||||||
|
# Handle incident tracking and alerting
|
||||||
|
if is_down and not was_down:
|
||||||
|
# Service just went down
|
||||||
|
svc.last_incident = now
|
||||||
|
incident_id = create_incident(service_name, status.value, message)
|
||||||
|
await alert_service_down(service_name, svc.display_name, message)
|
||||||
|
mark_incident_notified(incident_id)
|
||||||
|
|
||||||
|
elif not is_down and was_down:
|
||||||
|
# Service recovered
|
||||||
|
open_incident = get_open_incident(service_name)
|
||||||
|
if open_incident:
|
||||||
|
started_at = datetime.fromisoformat(open_incident["started_at"])
|
||||||
|
downtime_minutes = int((now - started_at).total_seconds() / 60)
|
||||||
|
resolve_incident(service_name)
|
||||||
|
await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
|
||||||
|
|
||||||
|
async def check_all_services(
|
||||||
|
self,
|
||||||
|
backend_url: str,
|
||||||
|
frontend_url: str,
|
||||||
|
bot_url: str,
|
||||||
|
external_url: str = "",
|
||||||
|
public_url: str = ""
|
||||||
|
):
|
||||||
|
"""Check all services concurrently."""
|
||||||
now = datetime.now()
|
now = datetime.now()
|
||||||
|
|
||||||
# Run all checks concurrently
|
# Run all checks concurrently
|
||||||
@@ -152,61 +255,18 @@ class ServiceMonitor:
|
|||||||
self.check_database(backend_url),
|
self.check_database(backend_url),
|
||||||
self.check_frontend(frontend_url),
|
self.check_frontend(frontend_url),
|
||||||
self.check_bot(bot_url),
|
self.check_bot(bot_url),
|
||||||
|
self.check_external(external_url),
|
||||||
return_exceptions=True
|
return_exceptions=True
|
||||||
)
|
)
|
||||||
|
|
||||||
# Process backend result
|
# Process results
|
||||||
if not isinstance(results[0], Exception):
|
service_names = ["backend", "database", "frontend", "bot", "external"]
|
||||||
status, latency, message, version = results[0]
|
for i, service_name in enumerate(service_names):
|
||||||
svc = self.services["backend"]
|
await self._process_check_result(service_name, results[i], now)
|
||||||
was_down = svc.status == Status.DOWN
|
|
||||||
svc.status = status
|
|
||||||
svc.latency_ms = latency
|
|
||||||
svc.message = message
|
|
||||||
svc.version = version
|
|
||||||
svc.last_check = now
|
|
||||||
svc.update_uptime(status == Status.OPERATIONAL)
|
|
||||||
if status != Status.OPERATIONAL and not was_down:
|
|
||||||
svc.last_incident = now
|
|
||||||
|
|
||||||
# Process database result
|
# Check SSL certificate (if public URL is HTTPS)
|
||||||
if not isinstance(results[1], Exception):
|
if public_url and public_url.startswith("https://"):
|
||||||
status, latency, message = results[1]
|
self.ssl_info = await check_and_alert_ssl(public_url)
|
||||||
svc = self.services["database"]
|
|
||||||
was_down = svc.status == Status.DOWN
|
|
||||||
svc.status = status
|
|
||||||
svc.latency_ms = latency
|
|
||||||
svc.message = message
|
|
||||||
svc.last_check = now
|
|
||||||
svc.update_uptime(status == Status.OPERATIONAL)
|
|
||||||
if status != Status.OPERATIONAL and not was_down:
|
|
||||||
svc.last_incident = now
|
|
||||||
|
|
||||||
# Process frontend result
|
|
||||||
if not isinstance(results[2], Exception):
|
|
||||||
status, latency, message = results[2]
|
|
||||||
svc = self.services["frontend"]
|
|
||||||
was_down = svc.status == Status.DOWN
|
|
||||||
svc.status = status
|
|
||||||
svc.latency_ms = latency
|
|
||||||
svc.message = message
|
|
||||||
svc.last_check = now
|
|
||||||
svc.update_uptime(status == Status.OPERATIONAL)
|
|
||||||
if status != Status.OPERATIONAL and not was_down:
|
|
||||||
svc.last_incident = now
|
|
||||||
|
|
||||||
# Process bot result
|
|
||||||
if not isinstance(results[3], Exception):
|
|
||||||
status, latency, message = results[3]
|
|
||||||
svc = self.services["bot"]
|
|
||||||
was_down = svc.status == Status.DOWN
|
|
||||||
svc.status = status
|
|
||||||
svc.latency_ms = latency
|
|
||||||
svc.message = message
|
|
||||||
svc.last_check = now
|
|
||||||
svc.update_uptime(status == Status.OPERATIONAL)
|
|
||||||
if status != Status.OPERATIONAL and not was_down:
|
|
||||||
svc.last_incident = now
|
|
||||||
|
|
||||||
self.last_check = now
|
self.last_check = now
|
||||||
|
|
||||||
@@ -214,8 +274,12 @@ class ServiceMonitor:
|
|||||||
return self.services
|
return self.services
|
||||||
|
|
||||||
def get_overall_status(self) -> Status:
|
def get_overall_status(self) -> Status:
|
||||||
"""Get overall system status based on all services"""
|
"""Get overall system status based on all services."""
|
||||||
statuses = [svc.status for svc in self.services.values()]
|
# Exclude external from overall status if not configured
|
||||||
|
statuses = [
|
||||||
|
svc.status for name, svc in self.services.items()
|
||||||
|
if name != "external" or svc.status != Status.UNKNOWN
|
||||||
|
]
|
||||||
|
|
||||||
if all(s == Status.OPERATIONAL for s in statuses):
|
if all(s == Status.OPERATIONAL for s in statuses):
|
||||||
return Status.OPERATIONAL
|
return Status.OPERATIONAL
|
||||||
@@ -225,3 +289,17 @@ class ServiceMonitor:
|
|||||||
return Status.DEGRADED
|
return Status.DEGRADED
|
||||||
else:
|
else:
|
||||||
return Status.UNKNOWN
|
return Status.UNKNOWN
|
||||||
|
|
||||||
|
def get_ssl_status(self) -> Optional[dict]:
|
||||||
|
"""Get SSL certificate status."""
|
||||||
|
if not self.ssl_info:
|
||||||
|
return None
|
||||||
|
|
||||||
|
return {
|
||||||
|
"domain": self.ssl_info.domain,
|
||||||
|
"issuer": self.ssl_info.issuer,
|
||||||
|
"expires_at": self.ssl_info.expires_at.isoformat(),
|
||||||
|
"days_until_expiry": self.ssl_info.days_until_expiry,
|
||||||
|
"is_valid": self.ssl_info.is_valid,
|
||||||
|
"error": self.ssl_info.error
|
||||||
|
}
|
||||||
|
|||||||
140
status-service/ssl_monitor.py
Normal file
140
status-service/ssl_monitor.py
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
"""SSL certificate monitoring."""
|
||||||
|
import ssl
|
||||||
|
import socket
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from dataclasses import dataclass
|
||||||
|
from typing import Optional
|
||||||
|
from urllib.parse import urlparse
|
||||||
|
|
||||||
|
from database import save_ssl_info, get_ssl_info
|
||||||
|
from alerts import alert_ssl_expiring, alert_ssl_expired
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
|
||||||
|
class SSLInfo:
|
||||||
|
domain: str
|
||||||
|
issuer: str
|
||||||
|
expires_at: datetime
|
||||||
|
days_until_expiry: int
|
||||||
|
is_valid: bool
|
||||||
|
error: Optional[str] = None
|
||||||
|
|
||||||
|
|
||||||
|
def check_ssl_certificate(url: str) -> Optional[SSLInfo]:
|
||||||
|
"""Check SSL certificate for a URL."""
|
||||||
|
try:
|
||||||
|
parsed = urlparse(url)
|
||||||
|
hostname = parsed.hostname
|
||||||
|
|
||||||
|
if not hostname:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Skip non-HTTPS or localhost
|
||||||
|
if parsed.scheme != "https" or hostname in ("localhost", "127.0.0.1"):
|
||||||
|
return None
|
||||||
|
|
||||||
|
context = ssl.create_default_context()
|
||||||
|
conn = context.wrap_socket(
|
||||||
|
socket.socket(socket.AF_INET),
|
||||||
|
server_hostname=hostname
|
||||||
|
)
|
||||||
|
conn.settimeout(10.0)
|
||||||
|
|
||||||
|
try:
|
||||||
|
conn.connect((hostname, parsed.port or 443))
|
||||||
|
cert = conn.getpeercert()
|
||||||
|
finally:
|
||||||
|
conn.close()
|
||||||
|
|
||||||
|
if not cert:
|
||||||
|
return SSLInfo(
|
||||||
|
domain=hostname,
|
||||||
|
issuer="Unknown",
|
||||||
|
expires_at=datetime.now(timezone.utc),
|
||||||
|
days_until_expiry=0,
|
||||||
|
is_valid=False,
|
||||||
|
error="No certificate found"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Parse expiry date
|
||||||
|
not_after = cert.get("notAfter", "")
|
||||||
|
expires_at = datetime.strptime(not_after, "%b %d %H:%M:%S %Y %Z")
|
||||||
|
expires_at = expires_at.replace(tzinfo=timezone.utc)
|
||||||
|
|
||||||
|
# Calculate days until expiry
|
||||||
|
now = datetime.now(timezone.utc)
|
||||||
|
days_until_expiry = (expires_at - now).days
|
||||||
|
|
||||||
|
# Get issuer
|
||||||
|
issuer_parts = cert.get("issuer", ())
|
||||||
|
issuer = "Unknown"
|
||||||
|
for part in issuer_parts:
|
||||||
|
for key, value in part:
|
||||||
|
if key == "organizationName":
|
||||||
|
issuer = value
|
||||||
|
break
|
||||||
|
|
||||||
|
return SSLInfo(
|
||||||
|
domain=hostname,
|
||||||
|
issuer=issuer,
|
||||||
|
expires_at=expires_at,
|
||||||
|
days_until_expiry=days_until_expiry,
|
||||||
|
is_valid=days_until_expiry > 0
|
||||||
|
)
|
||||||
|
|
||||||
|
except ssl.SSLCertVerificationError as e:
|
||||||
|
hostname = urlparse(url).hostname or url
|
||||||
|
return SSLInfo(
|
||||||
|
domain=hostname,
|
||||||
|
issuer="Invalid",
|
||||||
|
expires_at=datetime.now(timezone.utc),
|
||||||
|
days_until_expiry=0,
|
||||||
|
is_valid=False,
|
||||||
|
error=f"SSL verification failed: {str(e)[:100]}"
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
hostname = urlparse(url).hostname or url
|
||||||
|
return SSLInfo(
|
||||||
|
domain=hostname,
|
||||||
|
issuer="Unknown",
|
||||||
|
expires_at=datetime.now(timezone.utc),
|
||||||
|
days_until_expiry=0,
|
||||||
|
is_valid=False,
|
||||||
|
error=str(e)[:100]
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
async def check_and_alert_ssl(url: str, warn_days: int = 14) -> Optional[SSLInfo]:
|
||||||
|
"""Check SSL and send alerts if needed."""
|
||||||
|
ssl_info = check_ssl_certificate(url)
|
||||||
|
|
||||||
|
if not ssl_info:
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Save to database
|
||||||
|
save_ssl_info(
|
||||||
|
domain=ssl_info.domain,
|
||||||
|
issuer=ssl_info.issuer,
|
||||||
|
expires_at=ssl_info.expires_at,
|
||||||
|
days_until_expiry=ssl_info.days_until_expiry
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check if we need to alert
|
||||||
|
prev_info = get_ssl_info(ssl_info.domain)
|
||||||
|
|
||||||
|
if ssl_info.days_until_expiry <= 0:
|
||||||
|
# Certificate expired
|
||||||
|
await alert_ssl_expired(ssl_info.domain)
|
||||||
|
elif ssl_info.days_until_expiry <= warn_days:
|
||||||
|
# Certificate expiring soon - alert once per day
|
||||||
|
should_alert = True
|
||||||
|
if prev_info and prev_info.get("checked_at"):
|
||||||
|
# Check if we already alerted today
|
||||||
|
last_check = datetime.fromisoformat(prev_info["checked_at"])
|
||||||
|
if (datetime.now() - last_check).days < 1:
|
||||||
|
should_alert = False
|
||||||
|
|
||||||
|
if should_alert:
|
||||||
|
await alert_ssl_expiring(ssl_info.domain, ssl_info.days_until_expiry)
|
||||||
|
|
||||||
|
return ssl_info
|
||||||
@@ -4,6 +4,7 @@
|
|||||||
<meta charset="UTF-8">
|
<meta charset="UTF-8">
|
||||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||||
<title>System Status</title>
|
<title>System Status</title>
|
||||||
|
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
|
||||||
<style>
|
<style>
|
||||||
* {
|
* {
|
||||||
margin: 0;
|
margin: 0;
|
||||||
@@ -19,7 +20,7 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
.container {
|
.container {
|
||||||
max-width: 900px;
|
max-width: 1100px;
|
||||||
margin: 0 auto;
|
margin: 0 auto;
|
||||||
padding: 40px 20px;
|
padding: 40px 20px;
|
||||||
}
|
}
|
||||||
@@ -39,6 +40,13 @@
|
|||||||
background-clip: text;
|
background-clip: text;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
h2 {
|
||||||
|
font-size: 1.3rem;
|
||||||
|
font-weight: 600;
|
||||||
|
margin: 30px 0 16px 0;
|
||||||
|
color: #94a3b8;
|
||||||
|
}
|
||||||
|
|
||||||
.overall-status {
|
.overall-status {
|
||||||
display: inline-flex;
|
display: inline-flex;
|
||||||
align-items: center;
|
align-items: center;
|
||||||
@@ -174,8 +182,9 @@
|
|||||||
|
|
||||||
.service-metrics {
|
.service-metrics {
|
||||||
display: grid;
|
display: grid;
|
||||||
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr));
|
grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
|
||||||
gap: 12px;
|
gap: 12px;
|
||||||
|
margin-bottom: 16px;
|
||||||
}
|
}
|
||||||
|
|
||||||
.metric {
|
.metric {
|
||||||
@@ -212,6 +221,132 @@
|
|||||||
color: #fca5a5;
|
color: #fca5a5;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/* Latency chart */
|
||||||
|
.latency-chart {
|
||||||
|
height: 60px;
|
||||||
|
margin-top: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* SSL Card */
|
||||||
|
.ssl-card {
|
||||||
|
background: rgba(30, 41, 59, 0.5);
|
||||||
|
border: 1px solid rgba(100, 116, 139, 0.2);
|
||||||
|
border-radius: 16px;
|
||||||
|
padding: 20px;
|
||||||
|
margin-bottom: 20px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-card.warning {
|
||||||
|
border-color: rgba(250, 204, 21, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-card.danger {
|
||||||
|
border-color: rgba(239, 68, 68, 0.3);
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-header {
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
margin-bottom: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-title {
|
||||||
|
font-size: 1.1rem;
|
||||||
|
font-weight: 600;
|
||||||
|
color: #f1f5f9;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-badge {
|
||||||
|
padding: 4px 12px;
|
||||||
|
border-radius: 20px;
|
||||||
|
font-size: 0.8rem;
|
||||||
|
font-weight: 500;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-badge.valid {
|
||||||
|
background: rgba(34, 197, 94, 0.15);
|
||||||
|
color: #22c55e;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-badge.expiring {
|
||||||
|
background: rgba(250, 204, 21, 0.15);
|
||||||
|
color: #facc15;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-badge.expired {
|
||||||
|
background: rgba(239, 68, 68, 0.15);
|
||||||
|
color: #ef4444;
|
||||||
|
}
|
||||||
|
|
||||||
|
.ssl-info {
|
||||||
|
display: grid;
|
||||||
|
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
|
||||||
|
gap: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Incidents */
|
||||||
|
.incidents-list {
|
||||||
|
background: rgba(30, 41, 59, 0.5);
|
||||||
|
border: 1px solid rgba(100, 116, 139, 0.2);
|
||||||
|
border-radius: 16px;
|
||||||
|
overflow: hidden;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-item {
|
||||||
|
padding: 16px 20px;
|
||||||
|
border-bottom: 1px solid rgba(100, 116, 139, 0.1);
|
||||||
|
display: flex;
|
||||||
|
justify-content: space-between;
|
||||||
|
align-items: center;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-item:last-child {
|
||||||
|
border-bottom: none;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-info {
|
||||||
|
display: flex;
|
||||||
|
align-items: center;
|
||||||
|
gap: 12px;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-dot {
|
||||||
|
width: 10px;
|
||||||
|
height: 10px;
|
||||||
|
border-radius: 50%;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-dot.resolved {
|
||||||
|
background: #22c55e;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-dot.open {
|
||||||
|
background: #ef4444;
|
||||||
|
animation: pulse 2s infinite;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-service {
|
||||||
|
font-weight: 500;
|
||||||
|
color: #f1f5f9;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-message {
|
||||||
|
font-size: 0.85rem;
|
||||||
|
color: #94a3b8;
|
||||||
|
}
|
||||||
|
|
||||||
|
.incident-time {
|
||||||
|
font-size: 0.85rem;
|
||||||
|
color: #64748b;
|
||||||
|
}
|
||||||
|
|
||||||
|
.no-incidents {
|
||||||
|
padding: 30px;
|
||||||
|
text-align: center;
|
||||||
|
color: #64748b;
|
||||||
|
}
|
||||||
|
|
||||||
.refresh-btn {
|
.refresh-btn {
|
||||||
display: inline-flex;
|
display: inline-flex;
|
||||||
align-items: center;
|
align-items: center;
|
||||||
@@ -292,8 +427,42 @@
|
|||||||
</p>
|
</p>
|
||||||
</header>
|
</header>
|
||||||
|
|
||||||
|
{% if ssl_status %}
|
||||||
|
<div class="ssl-card {% if ssl_status.days_until_expiry <= 0 %}danger{% elif ssl_status.days_until_expiry <= 14 %}warning{% endif %}">
|
||||||
|
<div class="ssl-header">
|
||||||
|
<span class="ssl-title">SSL Certificate</span>
|
||||||
|
<span class="ssl-badge {% if ssl_status.days_until_expiry <= 0 %}expired{% elif ssl_status.days_until_expiry <= 14 %}expiring{% else %}valid{% endif %}">
|
||||||
|
{% if ssl_status.days_until_expiry <= 0 %}
|
||||||
|
Expired
|
||||||
|
{% elif ssl_status.days_until_expiry <= 14 %}
|
||||||
|
Expiring Soon
|
||||||
|
{% else %}
|
||||||
|
Valid
|
||||||
|
{% endif %}
|
||||||
|
</span>
|
||||||
|
</div>
|
||||||
|
<div class="ssl-info">
|
||||||
|
<div class="metric">
|
||||||
|
<div class="metric-label">Domain</div>
|
||||||
|
<div class="metric-value">{{ ssl_status.domain }}</div>
|
||||||
|
</div>
|
||||||
|
<div class="metric">
|
||||||
|
<div class="metric-label">Issuer</div>
|
||||||
|
<div class="metric-value">{{ ssl_status.issuer }}</div>
|
||||||
|
</div>
|
||||||
|
<div class="metric">
|
||||||
|
<div class="metric-label">Days Left</div>
|
||||||
|
<div class="metric-value {% if ssl_status.days_until_expiry <= 0 %}bad{% elif ssl_status.days_until_expiry <= 14 %}warning{% else %}good{% endif %}">
|
||||||
|
{{ ssl_status.days_until_expiry }}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
<div class="services-grid">
|
<div class="services-grid">
|
||||||
{% for name, service in services.items() %}
|
{% for name, service in services.items() %}
|
||||||
|
{% if service.status.value != 'unknown' or name != 'external' %}
|
||||||
<div class="service-card">
|
<div class="service-card">
|
||||||
<div class="service-header">
|
<div class="service-header">
|
||||||
<span class="service-name">{{ service.display_name }}</span>
|
<span class="service-name">{{ service.display_name }}</span>
|
||||||
@@ -322,7 +491,17 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
<div class="metric">
|
<div class="metric">
|
||||||
<div class="metric-label">Uptime</div>
|
<div class="metric-label">Avg 24h</div>
|
||||||
|
<div class="metric-value {% if service.avg_latency_24h and service.avg_latency_24h < 200 %}good{% elif service.avg_latency_24h and service.avg_latency_24h < 500 %}warning{% elif service.avg_latency_24h %}bad{% endif %}">
|
||||||
|
{% if service.avg_latency_24h %}
|
||||||
|
{{ "%.0f"|format(service.avg_latency_24h) }} ms
|
||||||
|
{% else %}
|
||||||
|
—
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="metric">
|
||||||
|
<div class="metric-label">Uptime 24h</div>
|
||||||
<div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}">
|
<div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}">
|
||||||
{{ "%.1f"|format(service.uptime_percent) }}%
|
{{ "%.1f"|format(service.uptime_percent) }}%
|
||||||
</div>
|
</div>
|
||||||
@@ -333,20 +512,49 @@
|
|||||||
<div class="metric-value">{{ service.version }}</div>
|
<div class="metric-value">{{ service.version }}</div>
|
||||||
</div>
|
</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
{% if service.last_incident %}
|
|
||||||
<div class="metric">
|
|
||||||
<div class="metric-label">Last Incident</div>
|
|
||||||
<div class="metric-value warning">{{ service.last_incident.strftime('%d.%m %H:%M') }}</div>
|
|
||||||
</div>
|
|
||||||
{% endif %}
|
|
||||||
</div>
|
</div>
|
||||||
|
{% if service.latency_history and service.latency_history|length > 1 %}
|
||||||
|
<div class="latency-chart">
|
||||||
|
<canvas id="chart-{{ name }}"></canvas>
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
{% if service.message %}
|
{% if service.message %}
|
||||||
<div class="service-message">{{ service.message }}</div>
|
<div class="service-message">{{ service.message }}</div>
|
||||||
{% endif %}
|
{% endif %}
|
||||||
</div>
|
</div>
|
||||||
|
{% endif %}
|
||||||
{% endfor %}
|
{% endfor %}
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
<h2>Recent Incidents</h2>
|
||||||
|
<div class="incidents-list">
|
||||||
|
{% if incidents and incidents|length > 0 %}
|
||||||
|
{% for incident in incidents %}
|
||||||
|
<div class="incident-item">
|
||||||
|
<div class="incident-info">
|
||||||
|
<span class="incident-dot {% if incident.resolved_at %}resolved{% else %}open{% endif %}"></span>
|
||||||
|
<div>
|
||||||
|
<div class="incident-service">{{ incident.service_name | title }}</div>
|
||||||
|
<div class="incident-message">{{ incident.message or 'Service unavailable' }}</div>
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
<div class="incident-time">
|
||||||
|
{{ incident.started_at[:16].replace('T', ' ') }}
|
||||||
|
{% if incident.resolved_at %}
|
||||||
|
- Resolved
|
||||||
|
{% else %}
|
||||||
|
- Ongoing
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
</div>
|
||||||
|
{% endfor %}
|
||||||
|
{% else %}
|
||||||
|
<div class="no-incidents">
|
||||||
|
No recent incidents
|
||||||
|
</div>
|
||||||
|
{% endif %}
|
||||||
|
</div>
|
||||||
|
|
||||||
<center>
|
<center>
|
||||||
<button class="refresh-btn" onclick="refreshStatus(this)">
|
<button class="refresh-btn" onclick="refreshStatus(this)">
|
||||||
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
|
||||||
@@ -363,6 +571,55 @@
|
|||||||
</div>
|
</div>
|
||||||
|
|
||||||
<script>
|
<script>
|
||||||
|
// Initialize latency charts
|
||||||
|
{% for name, service in services.items() %}
|
||||||
|
{% if service.latency_history and service.latency_history|length > 1 %}
|
||||||
|
(function() {
|
||||||
|
const ctx = document.getElementById('chart-{{ name }}').getContext('2d');
|
||||||
|
const data = {{ service.latency_history | tojson }};
|
||||||
|
|
||||||
|
new Chart(ctx, {
|
||||||
|
type: 'line',
|
||||||
|
data: {
|
||||||
|
labels: data.map(d => ''),
|
||||||
|
datasets: [{
|
||||||
|
data: data.map(d => d.latency_ms),
|
||||||
|
borderColor: '#00d4ff',
|
||||||
|
backgroundColor: 'rgba(0, 212, 255, 0.1)',
|
||||||
|
fill: true,
|
||||||
|
tension: 0.4,
|
||||||
|
pointRadius: 0,
|
||||||
|
borderWidth: 2
|
||||||
|
}]
|
||||||
|
},
|
||||||
|
options: {
|
||||||
|
responsive: true,
|
||||||
|
maintainAspectRatio: false,
|
||||||
|
plugins: {
|
||||||
|
legend: { display: false },
|
||||||
|
tooltip: {
|
||||||
|
callbacks: {
|
||||||
|
label: (ctx) => ctx.raw.toFixed(0) + ' ms'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
scales: {
|
||||||
|
x: { display: false },
|
||||||
|
y: {
|
||||||
|
display: false,
|
||||||
|
beginAtZero: true
|
||||||
|
}
|
||||||
|
},
|
||||||
|
interaction: {
|
||||||
|
intersect: false,
|
||||||
|
mode: 'index'
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
})();
|
||||||
|
{% endif %}
|
||||||
|
{% endfor %}
|
||||||
|
|
||||||
async function refreshStatus(btn) {
|
async function refreshStatus(btn) {
|
||||||
btn.classList.add('loading');
|
btn.classList.add('loading');
|
||||||
btn.disabled = true;
|
btn.disabled = true;
|
||||||
|
|||||||
Reference in New Issue
Block a user