Compare commits

2 Commits

Author SHA1 Message Date
33f49f4e47 Fix security 2025-12-18 17:15:21 +07:00
57bad3b4a8 Redesign health service + create backup service 2025-12-18 03:35:13 +07:00
31 changed files with 1667 additions and 147 deletions

View File

@@ -10,6 +10,7 @@ OPENAI_API_KEY=sk-...
# Telegram Bot # Telegram Bot
TELEGRAM_BOT_TOKEN=123456:ABC-DEF... TELEGRAM_BOT_TOKEN=123456:ABC-DEF...
BOT_API_SECRET=change_me_random_secret_for_bot_api
# S3 Storage - FirstVDS (set S3_ENABLED=true to use) # S3 Storage - FirstVDS (set S3_ENABLED=true to use)
S3_ENABLED=false S3_ENABLED=false
@@ -20,5 +21,14 @@ S3_SECRET_ACCESS_KEY=your-secret-access-key
S3_ENDPOINT_URL=https://s3.firstvds.ru S3_ENDPOINT_URL=https://s3.firstvds.ru
S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru S3_PUBLIC_URL=https://your-bucket-name.s3.firstvds.ru
# Backup Service
TELEGRAM_ADMIN_ID=947392854
S3_BACKUP_PREFIX=backups/
BACKUP_RETENTION_DAYS=14
# Status Service (optional - for external monitoring)
EXTERNAL_URL=https://your-domain.com
PUBLIC_URL=https://your-domain.com
# Frontend (for build) # Frontend (for build)
VITE_API_URL=/api/v1 VITE_API_URL=/api/v1

View File

@@ -31,6 +31,12 @@ help:
@echo " make shell - Open backend shell" @echo " make shell - Open backend shell"
@echo " make frontend-sh - Open frontend shell" @echo " make frontend-sh - Open frontend shell"
@echo "" @echo ""
@echo " Backup:"
@echo " make backup-now - Run backup immediately"
@echo " make backup-list - List available backups in S3"
@echo " make backup-restore - Restore from backup (interactive)"
@echo " make backup-logs - Show backup service logs"
@echo ""
@echo " Cleanup:" @echo " Cleanup:"
@echo " make clean - Stop and remove containers, volumes" @echo " make clean - Stop and remove containers, volumes"
@echo " make prune - Remove unused Docker resources" @echo " make prune - Remove unused Docker resources"
@@ -137,3 +143,20 @@ test-backend:
# Production # Production
prod: prod:
$(DC) -f docker-compose.yml up -d --build $(DC) -f docker-compose.yml up -d --build
# Backup
backup-now:
$(DC) exec backup python /app/backup.py
backup-list:
$(DC) exec backup python /app/restore.py
backup-restore:
@read -p "Backup filename: " file; \
$(DC) exec -it backup python /app/restore.py "$$file"
backup-logs:
$(DC) logs -f backup
backup-shell:
$(DC) exec backup bash

View File

@@ -1,10 +1,11 @@
from typing import Annotated from typing import Annotated
from fastapi import Depends, HTTPException, status from fastapi import Depends, HTTPException, status, Header
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from sqlalchemy import select from sqlalchemy import select
from sqlalchemy.ext.asyncio import AsyncSession from sqlalchemy.ext.asyncio import AsyncSession
from app.core.config import settings
from app.core.database import get_db from app.core.database import get_db
from app.core.security import decode_access_token from app.core.security import decode_access_token
from app.models import User, Participant, Marathon, UserRole, ParticipantRole from app.models import User, Participant, Marathon, UserRole, ParticipantRole
@@ -145,3 +146,21 @@ async def require_creator(
# Type aliases for cleaner dependency injection # Type aliases for cleaner dependency injection
CurrentUser = Annotated[User, Depends(get_current_user)] CurrentUser = Annotated[User, Depends(get_current_user)]
DbSession = Annotated[AsyncSession, Depends(get_db)] DbSession = Annotated[AsyncSession, Depends(get_db)]
async def verify_bot_secret(
    x_bot_secret: str | None = Header(None, alias="X-Bot-Secret")
) -> None:
    """Verify that the request comes from the trusted bot using a shared secret.

    Args:
        x_bot_secret: Value of the ``X-Bot-Secret`` request header, or ``None``
            when the header is absent.

    Raises:
        HTTPException: 403 when a secret is configured and the header is
            missing or does not match it.
    """
    # Local import keeps this block self-contained; hmac is standard library.
    import hmac

    expected = settings.BOT_API_SECRET
    if not expected:
        # If secret is not configured, skip check (for development).
        # NOTE(review): this is fail-open - a deployment that forgets to set
        # BOT_API_SECRET leaves every bot endpoint unauthenticated.
        return

    supplied = x_bot_secret or ""
    # Constant-time comparison so response timing does not leak how many
    # leading characters of the secret were guessed correctly. Comparing bytes
    # avoids the TypeError compare_digest raises for non-ASCII str arguments.
    if not hmac.compare_digest(supplied.encode("utf-8"), expected.encode("utf-8")):
        raise HTTPException(
            status_code=status.HTTP_403_FORBIDDEN,
            detail="Invalid or missing bot secret"
        )
BotSecretDep = Annotated[None, Depends(verify_bot_secret)]

View File

@@ -1,16 +1,18 @@
from fastapi import APIRouter, HTTPException, status from fastapi import APIRouter, HTTPException, status, Request
from sqlalchemy import select from sqlalchemy import select
from app.api.deps import DbSession, CurrentUser from app.api.deps import DbSession, CurrentUser
from app.core.security import verify_password, get_password_hash, create_access_token from app.core.security import verify_password, get_password_hash, create_access_token
from app.core.rate_limit import limiter
from app.models import User from app.models import User
from app.schemas import UserRegister, UserLogin, TokenResponse, UserPublic from app.schemas import UserRegister, UserLogin, TokenResponse, UserPrivate
router = APIRouter(prefix="/auth", tags=["auth"]) router = APIRouter(prefix="/auth", tags=["auth"])
@router.post("/register", response_model=TokenResponse) @router.post("/register", response_model=TokenResponse)
async def register(data: UserRegister, db: DbSession): @limiter.limit("5/minute")
async def register(request: Request, data: UserRegister, db: DbSession):
# Check if login already exists # Check if login already exists
result = await db.execute(select(User).where(User.login == data.login.lower())) result = await db.execute(select(User).where(User.login == data.login.lower()))
if result.scalar_one_or_none(): if result.scalar_one_or_none():
@@ -34,12 +36,13 @@ async def register(data: UserRegister, db: DbSession):
return TokenResponse( return TokenResponse(
access_token=access_token, access_token=access_token,
user=UserPublic.model_validate(user), user=UserPrivate.model_validate(user),
) )
@router.post("/login", response_model=TokenResponse) @router.post("/login", response_model=TokenResponse)
async def login(data: UserLogin, db: DbSession): @limiter.limit("10/minute")
async def login(request: Request, data: UserLogin, db: DbSession):
# Find user # Find user
result = await db.execute(select(User).where(User.login == data.login.lower())) result = await db.execute(select(User).where(User.login == data.login.lower()))
user = result.scalar_one_or_none() user = result.scalar_one_or_none()
@@ -55,10 +58,11 @@ async def login(data: UserLogin, db: DbSession):
return TokenResponse( return TokenResponse(
access_token=access_token, access_token=access_token,
user=UserPublic.model_validate(user), user=UserPrivate.model_validate(user),
) )
@router.get("/me", response_model=UserPublic) @router.get("/me", response_model=UserPrivate)
async def get_me(current_user: CurrentUser): async def get_me(current_user: CurrentUser):
return UserPublic.model_validate(current_user) """Get current user's full profile (including private data)"""
return UserPrivate.model_validate(current_user)

View File

@@ -1,7 +1,8 @@
from datetime import timedelta from datetime import timedelta
import secrets import secrets
import string import string
from fastapi import APIRouter, HTTPException, status from fastapi import APIRouter, HTTPException, status, Depends
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from sqlalchemy import select, func from sqlalchemy import select, func
from sqlalchemy.orm import selectinload from sqlalchemy.orm import selectinload
@@ -10,6 +11,10 @@ from app.api.deps import (
require_participant, require_organizer, require_creator, require_participant, require_organizer, require_creator,
get_participant, get_participant,
) )
from app.core.security import decode_access_token
# Optional auth for endpoints that need it conditionally
optional_auth = HTTPBearer(auto_error=False)
from app.models import ( from app.models import (
Marathon, Participant, MarathonStatus, Game, GameStatus, Challenge, Marathon, Participant, MarathonStatus, Game, GameStatus, Challenge,
Assignment, AssignmentStatus, Activity, ActivityType, ParticipantRole, Assignment, AssignmentStatus, Activity, ActivityType, ParticipantRole,
@@ -188,6 +193,15 @@ async def create_marathon(
async def get_marathon(marathon_id: int, current_user: CurrentUser, db: DbSession): async def get_marathon(marathon_id: int, current_user: CurrentUser, db: DbSession):
marathon = await get_marathon_or_404(db, marathon_id) marathon = await get_marathon_or_404(db, marathon_id)
# For private marathons, require participation (or admin/creator)
if not marathon.is_public and not current_user.is_admin:
participation = await get_participation(db, current_user.id, marathon_id)
if not participation:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="You are not a participant of this private marathon",
)
# Count participants and approved games # Count participants and approved games
participants_count = await db.scalar( participants_count = await db.scalar(
select(func.count()).select_from(Participant).where(Participant.marathon_id == marathon_id) select(func.count()).select_from(Participant).where(Participant.marathon_id == marathon_id)
@@ -428,7 +442,16 @@ async def join_public_marathon(marathon_id: int, current_user: CurrentUser, db:
@router.get("/{marathon_id}/participants", response_model=list[ParticipantWithUser]) @router.get("/{marathon_id}/participants", response_model=list[ParticipantWithUser])
async def get_participants(marathon_id: int, current_user: CurrentUser, db: DbSession): async def get_participants(marathon_id: int, current_user: CurrentUser, db: DbSession):
await get_marathon_or_404(db, marathon_id) marathon = await get_marathon_or_404(db, marathon_id)
# For private marathons, require participation (or admin)
if not marathon.is_public and not current_user.is_admin:
participation = await get_participation(db, current_user.id, marathon_id)
if not participation:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="You are not a participant of this private marathon",
)
result = await db.execute( result = await db.execute(
select(Participant) select(Participant)
@@ -497,8 +520,42 @@ async def set_participant_role(
@router.get("/{marathon_id}/leaderboard", response_model=list[LeaderboardEntry]) @router.get("/{marathon_id}/leaderboard", response_model=list[LeaderboardEntry])
async def get_leaderboard(marathon_id: int, db: DbSession): async def get_leaderboard(
await get_marathon_or_404(db, marathon_id) marathon_id: int,
db: DbSession,
credentials: HTTPAuthorizationCredentials | None = Depends(optional_auth),
):
"""
Get marathon leaderboard.
Public marathons: no auth required.
Private marathons: requires auth + participation check.
"""
marathon = await get_marathon_or_404(db, marathon_id)
# For private marathons, require authentication and participation
if not marathon.is_public:
if not credentials:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Authentication required for private marathon leaderboard",
headers={"WWW-Authenticate": "Bearer"},
)
payload = decode_access_token(credentials.credentials)
if not payload:
raise HTTPException(
status_code=status.HTTP_401_UNAUTHORIZED,
detail="Invalid or expired token",
headers={"WWW-Authenticate": "Bearer"},
)
user_id = int(payload.get("sub"))
participant = await get_participant(db, user_id, marathon_id)
if not participant:
raise HTTPException(
status_code=status.HTTP_403_FORBIDDEN,
detail="You are not a participant of this marathon",
)
result = await db.execute( result = await db.execute(
select(Participant) select(Participant)

View File

@@ -5,7 +5,7 @@ from pydantic import BaseModel
from sqlalchemy import select, func from sqlalchemy import select, func
from sqlalchemy.orm import selectinload from sqlalchemy.orm import selectinload
from app.api.deps import DbSession, CurrentUser from app.api.deps import DbSession, CurrentUser, BotSecretDep
from app.core.config import settings from app.core.config import settings
from app.core.security import create_telegram_link_token, verify_telegram_link_token from app.core.security import create_telegram_link_token, verify_telegram_link_token
from app.models import User, Participant, Marathon, Assignment, Challenge, Event, Game from app.models import User, Participant, Marathon, Assignment, Challenge, Event, Game
@@ -94,7 +94,7 @@ async def generate_link_token(current_user: CurrentUser):
@router.post("/confirm-link", response_model=TelegramLinkResponse) @router.post("/confirm-link", response_model=TelegramLinkResponse)
async def confirm_telegram_link(data: TelegramConfirmLink, db: DbSession): async def confirm_telegram_link(data: TelegramConfirmLink, db: DbSession, _: BotSecretDep):
"""Confirm Telegram account linking (called by bot).""" """Confirm Telegram account linking (called by bot)."""
logger.info(f"[TG_CONFIRM] ========== CONFIRM LINK REQUEST ==========") logger.info(f"[TG_CONFIRM] ========== CONFIRM LINK REQUEST ==========")
logger.info(f"[TG_CONFIRM] telegram_id: {data.telegram_id}") logger.info(f"[TG_CONFIRM] telegram_id: {data.telegram_id}")
@@ -145,7 +145,7 @@ async def confirm_telegram_link(data: TelegramConfirmLink, db: DbSession):
@router.get("/user/{telegram_id}", response_model=TelegramUserResponse | None) @router.get("/user/{telegram_id}", response_model=TelegramUserResponse | None)
async def get_user_by_telegram_id(telegram_id: int, db: DbSession): async def get_user_by_telegram_id(telegram_id: int, db: DbSession, _: BotSecretDep):
"""Get user by Telegram ID.""" """Get user by Telegram ID."""
logger.info(f"[TG_USER] Looking up user by telegram_id={telegram_id}") logger.info(f"[TG_USER] Looking up user by telegram_id={telegram_id}")
@@ -168,7 +168,7 @@ async def get_user_by_telegram_id(telegram_id: int, db: DbSession):
@router.post("/unlink/{telegram_id}", response_model=TelegramLinkResponse) @router.post("/unlink/{telegram_id}", response_model=TelegramLinkResponse)
async def unlink_telegram(telegram_id: int, db: DbSession): async def unlink_telegram(telegram_id: int, db: DbSession, _: BotSecretDep):
"""Unlink Telegram account.""" """Unlink Telegram account."""
result = await db.execute( result = await db.execute(
select(User).where(User.telegram_id == telegram_id) select(User).where(User.telegram_id == telegram_id)
@@ -187,7 +187,7 @@ async def unlink_telegram(telegram_id: int, db: DbSession):
@router.get("/marathons/{telegram_id}", response_model=list[TelegramMarathonResponse]) @router.get("/marathons/{telegram_id}", response_model=list[TelegramMarathonResponse])
async def get_user_marathons(telegram_id: int, db: DbSession): async def get_user_marathons(telegram_id: int, db: DbSession, _: BotSecretDep):
"""Get user's marathons by Telegram ID.""" """Get user's marathons by Telegram ID."""
# Get user # Get user
result = await db.execute( result = await db.execute(
@@ -231,7 +231,7 @@ async def get_user_marathons(telegram_id: int, db: DbSession):
@router.get("/marathon/{marathon_id}", response_model=TelegramMarathonDetails | None) @router.get("/marathon/{marathon_id}", response_model=TelegramMarathonDetails | None)
async def get_marathon_details(marathon_id: int, telegram_id: int, db: DbSession): async def get_marathon_details(marathon_id: int, telegram_id: int, db: DbSession, _: BotSecretDep):
"""Get marathon details for user by Telegram ID.""" """Get marathon details for user by Telegram ID."""
# Get user # Get user
result = await db.execute( result = await db.execute(
@@ -341,7 +341,7 @@ async def get_marathon_details(marathon_id: int, telegram_id: int, db: DbSession
@router.get("/stats/{telegram_id}", response_model=TelegramStatsResponse | None) @router.get("/stats/{telegram_id}", response_model=TelegramStatsResponse | None)
async def get_user_stats(telegram_id: int, db: DbSession): async def get_user_stats(telegram_id: int, db: DbSession, _: BotSecretDep):
"""Get user's overall statistics by Telegram ID.""" """Get user's overall statistics by Telegram ID."""
# Get user # Get user
result = await db.execute( result = await db.execute(

View File

@@ -8,7 +8,7 @@ from app.models import User, Participant, Assignment, Marathon
from app.models.assignment import AssignmentStatus from app.models.assignment import AssignmentStatus
from app.models.marathon import MarathonStatus from app.models.marathon import MarathonStatus
from app.schemas import ( from app.schemas import (
UserPublic, UserUpdate, TelegramLink, MessageResponse, UserPublic, UserPrivate, UserUpdate, TelegramLink, MessageResponse,
PasswordChange, UserStats, UserProfilePublic, PasswordChange, UserStats, UserProfilePublic,
) )
from app.services.storage import storage_service from app.services.storage import storage_service
@@ -17,7 +17,8 @@ router = APIRouter(prefix="/users", tags=["users"])
@router.get("/{user_id}", response_model=UserPublic) @router.get("/{user_id}", response_model=UserPublic)
async def get_user(user_id: int, db: DbSession): async def get_user(user_id: int, db: DbSession, current_user: CurrentUser):
"""Get user profile. Requires authentication."""
result = await db.execute(select(User).where(User.id == user_id)) result = await db.execute(select(User).where(User.id == user_id))
user = result.scalar_one_or_none() user = result.scalar_one_or_none()
@@ -58,23 +59,25 @@ async def get_user_avatar(user_id: int, db: DbSession):
) )
@router.patch("/me", response_model=UserPublic) @router.patch("/me", response_model=UserPrivate)
async def update_me(data: UserUpdate, current_user: CurrentUser, db: DbSession): async def update_me(data: UserUpdate, current_user: CurrentUser, db: DbSession):
"""Update current user's profile"""
if data.nickname is not None: if data.nickname is not None:
current_user.nickname = data.nickname current_user.nickname = data.nickname
await db.commit() await db.commit()
await db.refresh(current_user) await db.refresh(current_user)
return UserPublic.model_validate(current_user) return UserPrivate.model_validate(current_user)
@router.post("/me/avatar", response_model=UserPublic) @router.post("/me/avatar", response_model=UserPrivate)
async def upload_avatar( async def upload_avatar(
current_user: CurrentUser, current_user: CurrentUser,
db: DbSession, db: DbSession,
file: UploadFile = File(...), file: UploadFile = File(...),
): ):
"""Upload current user's avatar"""
# Validate file # Validate file
if not file.content_type.startswith("image/"): if not file.content_type.startswith("image/"):
raise HTTPException( raise HTTPException(
@@ -115,7 +118,7 @@ async def upload_avatar(
await db.commit() await db.commit()
await db.refresh(current_user) await db.refresh(current_user)
return UserPublic.model_validate(current_user) return UserPrivate.model_validate(current_user)
@router.post("/me/telegram", response_model=MessageResponse) @router.post("/me/telegram", response_model=MessageResponse)
@@ -193,8 +196,8 @@ async def get_my_stats(current_user: CurrentUser, db: DbSession):
@router.get("/{user_id}/stats", response_model=UserStats) @router.get("/{user_id}/stats", response_model=UserStats)
async def get_user_stats(user_id: int, db: DbSession): async def get_user_stats(user_id: int, db: DbSession, current_user: CurrentUser):
"""Получить статистику пользователя""" """Получить статистику пользователя. Requires authentication."""
result = await db.execute(select(User).where(User.id == user_id)) result = await db.execute(select(User).where(User.id == user_id))
user = result.scalar_one_or_none() user = result.scalar_one_or_none()
if not user: if not user:
@@ -207,8 +210,8 @@ async def get_user_stats(user_id: int, db: DbSession):
@router.get("/{user_id}/profile", response_model=UserProfilePublic) @router.get("/{user_id}/profile", response_model=UserProfilePublic)
async def get_user_profile(user_id: int, db: DbSession): async def get_user_profile(user_id: int, db: DbSession, current_user: CurrentUser):
"""Получить публичный профиль пользователя со статистикой""" """Получить публичный профиль пользователя со статистикой. Requires authentication."""
result = await db.execute(select(User).where(User.id == user_id)) result = await db.execute(select(User).where(User.id == user_id))
user = result.scalar_one_or_none() user = result.scalar_one_or_none()

View File

@@ -22,6 +22,7 @@ class Settings(BaseSettings):
TELEGRAM_BOT_TOKEN: str = "" TELEGRAM_BOT_TOKEN: str = ""
TELEGRAM_BOT_USERNAME: str = "" TELEGRAM_BOT_USERNAME: str = ""
TELEGRAM_LINK_TOKEN_EXPIRE_MINUTES: int = 10 TELEGRAM_LINK_TOKEN_EXPIRE_MINUTES: int = 10
BOT_API_SECRET: str = "" # Secret key for bot-to-backend communication
# Frontend # Frontend
FRONTEND_URL: str = "http://localhost:3000" FRONTEND_URL: str = "http://localhost:3000"

View File

@@ -0,0 +1,5 @@
from slowapi import Limiter
from slowapi.util import get_remote_address
# Rate limiter using client IP address as key.
# NOTE(review): presumably, when the app runs behind a reverse proxy, the
# remote address seen here is the proxy's, so all clients would share one
# rate-limit bucket - confirm how forwarded headers are handled in deployment.
limiter = Limiter(key_func=get_remote_address)

View File

@@ -1,7 +1,10 @@
import logging import logging
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from fastapi import FastAPI from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse
from slowapi import _rate_limit_exceeded_handler
from slowapi.errors import RateLimitExceeded
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
@@ -14,6 +17,7 @@ from pathlib import Path
from app.core.config import settings from app.core.config import settings
from app.core.database import engine, Base, async_session_maker from app.core.database import engine, Base, async_session_maker
from app.core.rate_limit import limiter
from app.api.v1 import router as api_router from app.api.v1 import router as api_router
from app.services.event_scheduler import event_scheduler from app.services.event_scheduler import event_scheduler
from app.services.dispute_scheduler import dispute_scheduler from app.services.dispute_scheduler import dispute_scheduler
@@ -49,6 +53,10 @@ app = FastAPI(
lifespan=lifespan, lifespan=lifespan,
) )
# Rate limiting
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)
# CORS # CORS
app.add_middleware( app.add_middleware(
CORSMiddleware, CORSMiddleware,

View File

@@ -3,7 +3,7 @@ from app.schemas.user import (
UserLogin, UserLogin,
UserUpdate, UserUpdate,
UserPublic, UserPublic,
UserWithTelegram, UserPrivate,
TokenResponse, TokenResponse,
TelegramLink, TelegramLink,
PasswordChange, PasswordChange,
@@ -88,7 +88,7 @@ __all__ = [
"UserLogin", "UserLogin",
"UserUpdate", "UserUpdate",
"UserPublic", "UserPublic",
"UserWithTelegram", "UserPrivate",
"TokenResponse", "TokenResponse",
"TelegramLink", "TelegramLink",
"PasswordChange", "PasswordChange",

View File

@@ -29,30 +29,30 @@ class UserUpdate(BaseModel):
class UserPublic(UserBase): class UserPublic(UserBase):
"""Public user info visible to other users - minimal data"""
id: int id: int
login: str
avatar_url: str | None = None avatar_url: str | None = None
role: str = "user" role: str = "user"
telegram_id: int | None = None telegram_avatar_url: str | None = None # Only TG avatar is public
telegram_username: str | None = None
telegram_first_name: str | None = None
telegram_last_name: str | None = None
telegram_avatar_url: str | None = None
created_at: datetime created_at: datetime
class Config: class Config:
from_attributes = True from_attributes = True
class UserWithTelegram(UserPublic): class UserPrivate(UserPublic):
"""Full user info visible only to the user themselves"""
login: str
telegram_id: int | None = None telegram_id: int | None = None
telegram_username: str | None = None telegram_username: str | None = None
telegram_first_name: str | None = None
telegram_last_name: str | None = None
class TokenResponse(BaseModel): class TokenResponse(BaseModel):
access_token: str access_token: str
token_type: str = "bearer" token_type: str = "bearer"
user: UserPublic user: UserPrivate
class TelegramLink(BaseModel): class TelegramLink(BaseModel):

View File

@@ -31,5 +31,8 @@ python-magic==0.4.27
# S3 Storage # S3 Storage
boto3==1.34.0 boto3==1.34.0
# Rate limiting
slowapi==0.1.9
# Utils # Utils
python-dotenv==1.0.0 python-dotenv==1.0.0

30
backup-service/Dockerfile Normal file
View File

@@ -0,0 +1,30 @@
FROM python:3.11-slim
WORKDIR /app
# Install PostgreSQL client (for pg_dump and psql) and cron
RUN apt-get update && apt-get install -y \
postgresql-client \
cron \
&& rm -rf /var/lib/apt/lists/*
# Install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
# Copy application
COPY . .
# Make scripts executable
RUN chmod +x backup.py restore.py
# Setup cron
COPY crontab /etc/cron.d/backup-cron
RUN chmod 0644 /etc/cron.d/backup-cron
RUN crontab /etc/cron.d/backup-cron
# Create log file
RUN touch /var/log/cron.log
# Start cron in foreground and tail logs
CMD ["sh", "-c", "printenv > /etc/environment && cron && tail -f /var/log/cron.log"]

217
backup-service/backup.py Normal file
View File

@@ -0,0 +1,217 @@
#!/usr/bin/env python3
"""
PostgreSQL Backup Service for WebApp.
- Creates pg_dump backup
- Compresses with gzip
- Uploads to S3 FirstVDS
- Rotates old backups (configurable retention)
- Sends Telegram notifications
"""
import gzip
import os
import subprocess
import sys
from datetime import datetime, timedelta, timezone
import boto3
import httpx
from botocore.config import Config as BotoConfig
from botocore.exceptions import ClientError
from config import config
def create_s3_client():
    """Build a boto3 S3 client from the backup-service configuration.

    The region falls back to ``us-east-1`` when ``config.S3_REGION`` is empty,
    and requests are signed with signature version ``s3v4``.
    """
    region = config.S3_REGION or "us-east-1"
    signing = BotoConfig(signature_version="s3v4")
    client_kwargs = {
        "endpoint_url": config.S3_ENDPOINT_URL,
        "aws_access_key_id": config.S3_ACCESS_KEY_ID,
        "aws_secret_access_key": config.S3_SECRET_ACCESS_KEY,
        "region_name": region,
        "config": signing,
    }
    return boto3.client("s3", **client_kwargs)
def send_telegram_notification(message: str, is_error: bool = False) -> None:
    """Post a backup status message to the configured Telegram admin chat.

    Best-effort: when the bot token or admin id is not configured the call
    only prints a notice, and any failure while sending is printed rather
    than raised.

    Args:
        message: Text placed under the "Database Backup" heading.
        is_error: Use the cross-mark prefix instead of the check-mark prefix.
    """
    configured = config.TELEGRAM_BOT_TOKEN and config.TELEGRAM_ADMIN_ID
    if not configured:
        print("Telegram not configured, skipping notification")
        return

    if is_error:
        emoji = "\u274c"
    else:
        emoji = "\u2705"

    endpoint = f"https://api.telegram.org/bot{config.TELEGRAM_BOT_TOKEN}/sendMessage"
    payload = {
        "chat_id": config.TELEGRAM_ADMIN_ID,
        "text": f"{emoji} *Database Backup*\n\n{message}",
        "parse_mode": "Markdown",
    }

    try:
        reply = httpx.post(endpoint, json=payload, timeout=30)
        reply.raise_for_status()
        print("Telegram notification sent")
    except Exception as exc:
        # Notification problems must never abort the backup itself.
        print(f"Failed to send Telegram notification: {exc}")
def create_backup() -> tuple[str, bytes]:
    """Run pg_dump against the configured database and gzip the result.

    Returns:
        A ``(filename, data)`` pair: a UTC-timestamped
        ``marathon_backup_<YYYYmmdd_HHMMSS>.sql.gz`` name and the
        gzip-compressed plain-SQL dump.

    Raises:
        RuntimeError: pg_dump exited with a non-zero status; the message
            carries pg_dump's stderr output.
    """
    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
    filename = f"marathon_backup_{timestamp}.sql.gz"

    # The password is passed through the environment of the child process.
    env = os.environ.copy()
    env["PGPASSWORD"] = config.DB_PASSWORD

    cmd = [
        "pg_dump",
        "-h",
        config.DB_HOST,
        "-p",
        config.DB_PORT,
        "-U",
        config.DB_USER,
        "-d",
        config.DB_NAME,
        "--no-owner",
        "--no-acl",
        "-F",
        "p",  # plain SQL format
    ]

    print(f"Running pg_dump for database {config.DB_NAME}...")
    # NOTE(review): the whole dump is captured in memory before compression.
    result = subprocess.run(
        cmd,
        env=env,
        capture_output=True,
    )

    if result.returncode != 0:
        # Decode leniently: a strict decode could raise UnicodeDecodeError and
        # hide the actual pg_dump error message.
        stderr = result.stderr.decode("utf-8", errors="replace")
        raise RuntimeError(f"pg_dump failed: {stderr}")

    print("Compressing backup...")
    compressed = gzip.compress(result.stdout, compresslevel=9)

    return filename, compressed
def upload_to_s3(s3_client, filename: str, data: bytes) -> str:
    """Store a backup archive in the configured bucket and return its key.

    Args:
        s3_client: Client object exposing ``put_object``.
        filename: Archive name; ``config.S3_BACKUP_PREFIX`` is prepended.
        data: Archive bytes, uploaded with content type ``application/gzip``.

    Returns:
        The object key the archive was written to.
    """
    object_key = f"{config.S3_BACKUP_PREFIX}{filename}"
    print(f"Uploading to S3: {object_key}...")

    put_kwargs = {
        "Bucket": config.S3_BUCKET_NAME,
        "Key": object_key,
        "Body": data,
        "ContentType": "application/gzip",
    }
    s3_client.put_object(**put_kwargs)

    return object_key
def rotate_old_backups(s3_client) -> int:
    """Delete backups older than BACKUP_RETENTION_DAYS.

    Every object under ``config.S3_BACKUP_PREFIX`` whose ``LastModified`` is
    earlier than the cutoff is deleted. A ``ClientError`` during listing or
    deletion is printed and rotation stops; objects deleted so far still
    count.

    Args:
        s3_client: Client object exposing ``get_paginator`` and
            ``delete_object``.

    Returns:
        The number of objects deleted.
    """
    retention_days = config.BACKUP_RETENTION_DAYS
    if retention_days <= 0:
        # A non-positive retention puts the cutoff at (or after) "now", which
        # would delete every backup, including the one just uploaded.
        print(
            f"Skipping rotation: BACKUP_RETENTION_DAYS={retention_days} "
            "is not positive"
        )
        return 0

    cutoff_date = datetime.now(timezone.utc) - timedelta(days=retention_days)
    deleted_count = 0

    print(f"Rotating backups older than {retention_days} days...")

    # List all objects with backup prefix
    try:
        paginator = s3_client.get_paginator("list_objects_v2")
        pages = paginator.paginate(
            Bucket=config.S3_BUCKET_NAME,
            Prefix=config.S3_BACKUP_PREFIX,
        )

        for page in pages:
            for obj in page.get("Contents", []):
                last_modified = obj["LastModified"]
                # Treat naive timestamps as UTC so they compare with the cutoff.
                if last_modified.tzinfo is None:
                    last_modified = last_modified.replace(tzinfo=timezone.utc)

                if last_modified < cutoff_date:
                    s3_client.delete_object(
                        Bucket=config.S3_BUCKET_NAME,
                        Key=obj["Key"],
                    )
                    deleted_count += 1
                    print(f"Deleted old backup: {obj['Key']}")
    except ClientError as e:
        # Best-effort: a rotation failure must not fail the backup run.
        print(f"Error during rotation: {e}")

    return deleted_count
def main() -> int:
    """Run one backup cycle: validate config, dump, upload, rotate, notify.

    Returns:
        ``0`` on success, ``1`` when any step raised; in the failure case the
        error is printed and also sent as a Telegram notification.
    """
    start_time = datetime.now()
    print(f"{'=' * 50}")
    print(f"Backup started at {start_time}")
    print(f"{'=' * 50}")

    try:
        # Validate configuration: all four S3 settings must be non-empty.
        for setting in (
            "S3_BUCKET_NAME",
            "S3_ACCESS_KEY_ID",
            "S3_SECRET_ACCESS_KEY",
            "S3_ENDPOINT_URL",
        ):
            if not getattr(config, setting):
                raise Exception(f"{setting} is not configured")

        # Create S3 client
        s3_client = create_s3_client()

        # Create backup
        filename, data = create_backup()
        size_mb = len(data) / (1024 * 1024)
        print(f"Backup created: {filename} ({size_mb:.2f} MB)")

        # Upload to S3
        s3_key = upload_to_s3(s3_client, filename, data)
        print(f"Uploaded to S3: {s3_key}")

        # Rotate old backups
        deleted_count = rotate_old_backups(s3_client)
        print(f"Deleted {deleted_count} old backups")

        # total_seconds() covers the whole span; timedelta.seconds would
        # silently drop the days component.
        duration_s = int((datetime.now() - start_time).total_seconds())

        # Send success notification
        message = (
            f"Backup completed successfully!\n\n"
            f"*File:* `{filename}`\n"
            f"*Size:* {size_mb:.2f} MB\n"
            f"*Duration:* {duration_s}s\n"
            f"*Deleted old:* {deleted_count} files"
        )
        send_telegram_notification(message, is_error=False)

        print(f"{'=' * 50}")
        print("Backup completed successfully!")
        print(f"{'=' * 50}")
        return 0

    except Exception as e:
        # Top-level boundary: report the failure and turn it into an exit code.
        error_msg = f"Backup failed!\n\n*Error:* `{str(e)}`"
        send_telegram_notification(error_msg, is_error=True)
        print(f"{'=' * 50}")
        print(f"Backup failed: {e}")
        print(f"{'=' * 50}")
        return 1
if __name__ == "__main__":
sys.exit(main())

33
backup-service/config.py Normal file
View File

@@ -0,0 +1,33 @@
"""Configuration for backup service."""
import os
from dataclasses import dataclass
@dataclass
class Config:
    """Backup service configuration from environment variables.

    Each default is an ``os.getenv`` call in the class body, so the
    environment is read once, when this module is imported.
    """

    # Database
    DB_HOST: str = os.getenv("DB_HOST", "db")
    DB_PORT: str = os.getenv("DB_PORT", "5432")  # kept as a string
    DB_NAME: str = os.getenv("DB_NAME", "marathon")
    DB_USER: str = os.getenv("DB_USER", "marathon")
    # NOTE(review): hard-coded fallback password - consider requiring
    # DB_PASSWORD to be set explicitly.
    DB_PASSWORD: str = os.getenv("DB_PASSWORD", "123")

    # S3
    S3_BUCKET_NAME: str = os.getenv("S3_BUCKET_NAME", "")
    S3_REGION: str = os.getenv("S3_REGION", "ru-1")
    S3_ACCESS_KEY_ID: str = os.getenv("S3_ACCESS_KEY_ID", "")
    S3_SECRET_ACCESS_KEY: str = os.getenv("S3_SECRET_ACCESS_KEY", "")
    S3_ENDPOINT_URL: str = os.getenv("S3_ENDPOINT_URL", "")
    # Prefix prepended to backup file names to form object keys.
    S3_BACKUP_PREFIX: str = os.getenv("S3_BACKUP_PREFIX", "backups/")

    # Telegram
    TELEGRAM_BOT_TOKEN: str = os.getenv("TELEGRAM_BOT_TOKEN", "")
    # NOTE(review): the fallback is a concrete chat id baked into the code -
    # confirm that notifying this id by default is intended.
    TELEGRAM_ADMIN_ID: str = os.getenv("TELEGRAM_ADMIN_ID", "947392854")

    # Backup settings
    # int() raises ValueError at import time if the variable is not numeric.
    BACKUP_RETENTION_DAYS: int = int(os.getenv("BACKUP_RETENTION_DAYS", "14"))


# Module-level configuration instance.
config = Config()

4
backup-service/crontab Normal file
View File

@@ -0,0 +1,4 @@
# Backup cron job
# Run backup daily at 3:00 AM UTC
0 3 * * * /usr/local/bin/python /app/backup.py >> /var/log/cron.log 2>&1
# Empty line required at end of crontab

View File

@@ -0,0 +1,2 @@
boto3==1.34.0
httpx==0.26.0

158
backup-service/restore.py Normal file
View File

@@ -0,0 +1,158 @@
#!/usr/bin/env python3
"""
Restore PostgreSQL database from S3 backup.
Usage:
python restore.py - List available backups
python restore.py <filename> - Restore from specific backup
"""
import gzip
import os
import subprocess
import sys
import boto3
from botocore.config import Config as BotoConfig
from botocore.exceptions import ClientError
from config import config
def create_s3_client():
    """Create the boto3 S3 client used to list and download backups.

    Uses the endpoint and credentials from ``config``; the region defaults to
    ``us-east-1`` when ``config.S3_REGION`` is empty.
    """
    options = dict(
        endpoint_url=config.S3_ENDPOINT_URL,
        aws_access_key_id=config.S3_ACCESS_KEY_ID,
        aws_secret_access_key=config.S3_SECRET_ACCESS_KEY,
        region_name=config.S3_REGION or "us-east-1",
        config=BotoConfig(signature_version="s3v4"),
    )
    return boto3.client("s3", **options)
def list_backups(s3_client) -> list[tuple[str, float, str]]:
    """Print and return the backups stored under the configured S3 prefix.

    Args:
        s3_client: S3 client exposing ``get_paginator("list_objects_v2")``.

    Returns:
        ``(filename, size_mb, modified)`` tuples, newest first, where
        ``modified`` is formatted as ``YYYY-MM-DD HH:MM:SS``. An empty list is
        returned (after printing the error) when listing raises ``ClientError``.
    """
    print("Available backups:\n")
    prefix = config.S3_BACKUP_PREFIX
    try:
        paginator = s3_client.get_paginator("list_objects_v2")
        pages = paginator.paginate(
            Bucket=config.S3_BUCKET_NAME,
            Prefix=prefix,
        )
        backups = []
        for page in pages:
            for obj in page.get("Contents", []):
                # Strip only the leading prefix. str.replace() would also remove
                # later occurrences inside the key, producing a name that
                # restore_backup() could not map back to the real object.
                filename = obj["Key"].removeprefix(prefix)
                if not filename:
                    # The key is the prefix itself (a "folder" placeholder),
                    # not a backup file.
                    continue
                size_mb = obj["Size"] / (1024 * 1024)
                modified = obj["LastModified"].strftime("%Y-%m-%d %H:%M:%S")
                backups.append((filename, size_mb, modified))

        # Newest first; the timestamp format sorts chronologically as text.
        backups.sort(key=lambda item: item[2], reverse=True)

        for filename, size_mb, modified in backups:
            print(f" {filename} ({size_mb:.2f} MB) - {modified}")
        return backups
    except ClientError as e:
        print(f"Error listing backups: {e}")
        return []
def restore_backup(s3_client, filename: str) -> None:
    """Download a gzip-compressed SQL dump from S3 and replay it through psql.

    Args:
        s3_client: S3 client exposing ``get_object``.
        filename: Backup name relative to ``config.S3_BACKUP_PREFIX``.

    Raises:
        Exception: If the download fails, if psql exits with a non-zero
            status, or if psql's stderr reports SQL errors.
    """
    key = f"{config.S3_BACKUP_PREFIX}{filename}"
    print(f"Downloading {filename} from S3...")
    try:
        response = s3_client.get_object(
            Bucket=config.S3_BUCKET_NAME,
            Key=key,
        )
        compressed_data = response["Body"].read()
    except ClientError as e:
        # Chain the cause so the original S3 error stays in the traceback.
        raise Exception(f"Failed to download backup: {e}") from e

    print("Decompressing...")
    sql_data = gzip.decompress(compressed_data)

    print(f"Restoring to database {config.DB_NAME}...")
    # The password is passed through the environment, not the command line.
    env = os.environ.copy()
    env["PGPASSWORD"] = config.DB_PASSWORD
    cmd = [
        "psql",
        "-h",
        config.DB_HOST,
        "-p",
        config.DB_PORT,
        "-U",
        config.DB_USER,
        "-d",
        config.DB_NAME,
    ]
    result = subprocess.run(
        cmd,
        env=env,
        input=sql_data,
        capture_output=True,
    )
    # errors="replace": a stray non-UTF-8 byte in psql output must not mask
    # the real outcome with a UnicodeDecodeError.
    stderr = result.stderr.decode(errors="replace")

    if result.returncode != 0:
        # A non-zero exit means psql itself failed (e.g. it could not connect
        # or authenticate). Those messages read "psql: error: ... FATAL: ..."
        # and do not contain upper-case "ERROR", so the exit status decides.
        raise Exception(f"psql restore failed: {stderr}")
    if "ERROR" in stderr:
        # Without ON_ERROR_STOP psql keeps going after a failed statement and
        # still exits 0, so SQL errors are only visible in stderr.
        raise Exception(f"psql restore failed: {stderr}")
    if stderr.strip():
        print(f"Warnings: {stderr}")
    print("Restore completed successfully!")
def main() -> int:
    """List backups, or restore the one named on the command line.

    Returns:
        Process exit code: 1 when the bucket is not configured or the restore
        fails, 0 otherwise (listing, cancelled restore, successful restore).
    """
    if not config.S3_BUCKET_NAME:
        print("Error: S3_BUCKET_NAME is not configured")
        return 1

    s3_client = create_s3_client()
    cli_args = sys.argv[1:]

    if not cli_args:
        # No filename given: only show what is available.
        if list_backups(s3_client):
            print(f"\nTo restore, run: python restore.py <filename>")
        else:
            print("No backups found.")
        return 0

    filename = cli_args[0]

    # Require an explicit confirmation before touching the database.
    print(f"WARNING: This will restore database from {filename}")
    print("This may overwrite existing data!")
    print()
    answer = input("Type 'yes' to continue: ")
    if answer.lower() != "yes":
        print("Restore cancelled.")
        return 0

    try:
        restore_backup(s3_client, filename)
    except Exception as e:
        print(f"Restore failed: {e}")
        return 1
    return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -5,6 +5,7 @@ class Settings(BaseSettings):
TELEGRAM_BOT_TOKEN: str TELEGRAM_BOT_TOKEN: str
API_URL: str = "http://backend:8000" API_URL: str = "http://backend:8000"
BOT_USERNAME: str = "" # Will be set dynamically on startup BOT_USERNAME: str = "" # Will be set dynamically on startup
BOT_API_SECRET: str = "" # Secret for backend API communication
class Config: class Config:
env_file = ".env" env_file = ".env"

View File

@@ -32,6 +32,11 @@ class APIClient:
session = await self._get_session() session = await self._get_session()
url = f"{self.base_url}/api/v1{endpoint}" url = f"{self.base_url}/api/v1{endpoint}"
# Add bot secret header for authentication
headers = kwargs.pop("headers", {})
if settings.BOT_API_SECRET:
headers["X-Bot-Secret"] = settings.BOT_API_SECRET
logger.info(f"[APIClient] {method} {url}") logger.info(f"[APIClient] {method} {url}")
if 'json' in kwargs: if 'json' in kwargs:
logger.info(f"[APIClient] Request body: {kwargs['json']}") logger.info(f"[APIClient] Request body: {kwargs['json']}")
@@ -39,7 +44,7 @@ class APIClient:
logger.info(f"[APIClient] Request params: {kwargs['params']}") logger.info(f"[APIClient] Request params: {kwargs['params']}")
try: try:
async with session.request(method, url, **kwargs) as response: async with session.request(method, url, headers=headers, **kwargs) as response:
logger.info(f"[APIClient] Response status: {response.status}") logger.info(f"[APIClient] Response status: {response.status}")
response_text = await response.text() response_text = await response.text()
logger.info(f"[APIClient] Response body: {response_text[:500]}") logger.info(f"[APIClient] Response body: {response_text[:500]}")

View File

@@ -28,6 +28,7 @@ services:
OPENAI_API_KEY: ${OPENAI_API_KEY} OPENAI_API_KEY: ${OPENAI_API_KEY}
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN} TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
TELEGRAM_BOT_USERNAME: ${TELEGRAM_BOT_USERNAME:-GameMarathonBot} TELEGRAM_BOT_USERNAME: ${TELEGRAM_BOT_USERNAME:-GameMarathonBot}
BOT_API_SECRET: ${BOT_API_SECRET:-}
DEBUG: ${DEBUG:-false} DEBUG: ${DEBUG:-false}
# S3 Storage # S3 Storage
S3_ENABLED: ${S3_ENABLED:-false} S3_ENABLED: ${S3_ENABLED:-false}
@@ -81,6 +82,7 @@ services:
environment: environment:
- TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN} - TELEGRAM_BOT_TOKEN=${TELEGRAM_BOT_TOKEN}
- API_URL=http://backend:8000 - API_URL=http://backend:8000
- BOT_API_SECRET=${BOT_API_SECRET:-}
depends_on: depends_on:
- backend - backend
restart: unless-stopped restart: unless-stopped
@@ -94,7 +96,13 @@ services:
BACKEND_URL: http://backend:8000 BACKEND_URL: http://backend:8000
FRONTEND_URL: http://frontend:80 FRONTEND_URL: http://frontend:80
BOT_URL: http://bot:8080 BOT_URL: http://bot:8080
EXTERNAL_URL: ${EXTERNAL_URL:-}
PUBLIC_URL: ${PUBLIC_URL:-}
CHECK_INTERVAL: "30" CHECK_INTERVAL: "30"
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854}
volumes:
- status_data:/app/data
ports: ports:
- "8001:8001" - "8001:8001"
depends_on: depends_on:
@@ -103,5 +111,31 @@ services:
- bot - bot
restart: unless-stopped restart: unless-stopped
backup:
build:
context: ./backup-service
dockerfile: Dockerfile
container_name: marathon-backup
environment:
DB_HOST: db
DB_PORT: "5432"
DB_NAME: marathon
DB_USER: marathon
DB_PASSWORD: ${DB_PASSWORD:-marathon}
S3_BUCKET_NAME: ${S3_BUCKET_NAME:-}
S3_REGION: ${S3_REGION:-ru-1}
S3_ACCESS_KEY_ID: ${S3_ACCESS_KEY_ID:-}
S3_SECRET_ACCESS_KEY: ${S3_SECRET_ACCESS_KEY:-}
S3_ENDPOINT_URL: ${S3_ENDPOINT_URL:-}
S3_BACKUP_PREFIX: ${S3_BACKUP_PREFIX:-backups/}
TELEGRAM_BOT_TOKEN: ${TELEGRAM_BOT_TOKEN}
TELEGRAM_ADMIN_ID: ${TELEGRAM_ADMIN_ID:-947392854}
BACKUP_RETENTION_DAYS: ${BACKUP_RETENTION_DAYS:-14}
depends_on:
db:
condition: service_healthy
restart: unless-stopped
volumes: volumes:
postgres_data: postgres_data:
status_data:

View File

@@ -1,20 +1,25 @@
// User types // User types
export type UserRole = 'user' | 'admin' export type UserRole = 'user' | 'admin'
export interface User { // Public user info (visible to other users)
export interface UserPublic {
id: number id: number
login: string
nickname: string nickname: string
avatar_url: string | null avatar_url: string | null
role: UserRole role: UserRole
telegram_id: number | null
telegram_username: string | null
telegram_first_name: string | null
telegram_last_name: string | null
telegram_avatar_url: string | null telegram_avatar_url: string | null
created_at: string created_at: string
} }
// Full user info (only for own profile from /auth/me)
export interface User extends UserPublic {
login?: string // Only visible to self
telegram_id?: number | null // Only visible to self
telegram_username?: string | null // Only visible to self
telegram_first_name?: string | null // Only visible to self
telegram_last_name?: string | null // Only visible to self
}
export interface TokenResponse { export interface TokenResponse {
access_token: string access_token: string
token_type: string token_type: string

View File

@@ -17,6 +17,10 @@ http {
# File upload limit (15 MB) # File upload limit (15 MB)
client_max_body_size 15M; client_max_body_size 15M;
# Rate limiting zones
limit_req_zone $binary_remote_addr zone=api_auth:10m rate=10r/m;
limit_req_zone $binary_remote_addr zone=api_general:10m rate=60r/m;
upstream backend { upstream backend {
server backend:8000; server backend:8000;
} }
@@ -37,8 +41,22 @@ http {
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
} }
# Backend API # Auth API - strict rate limit (10 req/min with burst of 5)
location /api/v1/auth {
limit_req zone=api_auth burst=5 nodelay;
limit_req_status 429;
proxy_pass http://backend;
proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr;
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
}
# Backend API - general rate limit (60 req/min with burst of 20)
location /api { location /api {
limit_req zone=api_general burst=20 nodelay;
limit_req_status 429;
proxy_pass http://backend; proxy_pass http://backend;
proxy_set_header Host $host; proxy_set_header Host $host;
proxy_set_header X-Real-IP $remote_addr; proxy_set_header X-Real-IP $remote_addr;

View File

@@ -6,6 +6,9 @@ WORKDIR /app
COPY requirements.txt . COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt RUN pip install --no-cache-dir -r requirements.txt
# Create data directory for SQLite
RUN mkdir -p /app/data
# Copy application # Copy application
COPY . . COPY . .

85
status-service/alerts.py Normal file
View File

@@ -0,0 +1,85 @@
"""Telegram alerting for status changes."""
import os
from datetime import datetime
from typing import Optional
import httpx
TELEGRAM_BOT_TOKEN = os.getenv("TELEGRAM_BOT_TOKEN", "")
TELEGRAM_ADMIN_ID = os.getenv("TELEGRAM_ADMIN_ID", "")
async def send_telegram_alert(message: str, is_recovery: bool = False) -> bool:
    """Send a status alert to the configured Telegram admin chat.

    Args:
        message: Alert body, formatted with Telegram Markdown.
        is_recovery: Selects the check-mark emoji instead of the warning sign.

    Returns:
        True if Telegram accepted the message; False if alerting is not
        configured or the request failed (the failure is printed, not raised).
    """
    if not TELEGRAM_BOT_TOKEN or not TELEGRAM_ADMIN_ID:
        print("Telegram alerting not configured")
        return False

    emoji = "\u2705" if is_recovery else "\u26a0\ufe0f"
    text = f"{emoji} *Status Alert*\n\n{message}"
    url = f"https://api.telegram.org/bot{TELEGRAM_BOT_TOKEN}/sendMessage"
    data = {
        "chat_id": TELEGRAM_ADMIN_ID,
        "text": text,
        "parse_mode": "Markdown",
    }
    try:
        async with httpx.AsyncClient(timeout=10.0) as client:
            response = await client.post(url, json=data)
            if response.status_code == 400:
                # Telegram answers 400 when, among other things, it cannot
                # parse the Markdown (e.g. an error text with a stray "_" or
                # backtick). Retry as plain text so the alert is not lost.
                data.pop("parse_mode")
                response = await client.post(url, json=data)
            response.raise_for_status()
        print(f"Telegram alert sent: {message[:50]}...")
        return True
    except Exception as e:
        # The request URL embeds the bot token and may appear in the exception
        # text; redact it before it reaches the logs.
        safe_error = str(e).replace(TELEGRAM_BOT_TOKEN, "***")
        print(f"Failed to send Telegram alert: {safe_error}")
        return False
async def alert_service_down(service_name: str, display_name: str, message: Optional[str]):
    """Notify the admin chat that a service has gone down.

    ``service_name`` is accepted for signature symmetry but only
    ``display_name`` appears in the text. The error line is appended only
    when ``message`` is non-empty.
    """
    timestamp = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
    parts = [
        f"*{display_name}* is DOWN\n\n",
        f"Time: `{timestamp}`\n",
    ]
    if message:
        parts.append(f"Error: `{message}`")
    await send_telegram_alert("".join(parts), is_recovery=False)
async def alert_service_recovered(service_name: str, display_name: str, downtime_minutes: int):
    """Notify the admin chat that a service is reachable again.

    ``service_name`` is accepted for signature symmetry but only
    ``display_name`` appears in the text.
    """
    timestamp = datetime.now().strftime("%d.%m.%Y %H:%M:%S")
    parts = (
        f"*{display_name}* is back ONLINE\n\n",
        f"Time: `{timestamp}`\n",
        f"Downtime: `{downtime_minutes} min`",
    )
    await send_telegram_alert("".join(parts), is_recovery=True)
async def alert_ssl_expiring(domain: str, days_left: int):
    """Warn the admin chat that the certificate for ``domain`` expires soon."""
    parts = (
        "*SSL Certificate Expiring*\n\n",
        f"Domain: `{domain}`\n",
        f"Days left: `{days_left}`\n\n",
        "Please renew the certificate!",
    )
    await send_telegram_alert("".join(parts), is_recovery=False)
async def alert_ssl_expired(domain: str):
    """Tell the admin chat that the certificate for ``domain`` has expired."""
    parts = (
        "*SSL Certificate EXPIRED*\n\n",
        f"Domain: `{domain}`\n\n",
        "Certificate has expired! Site may show security warnings.",
    )
    await send_telegram_alert("".join(parts), is_recovery=False)

261
status-service/database.py Normal file
View File

@@ -0,0 +1,261 @@
"""SQLite database for storing metrics history."""
import sqlite3
from datetime import datetime, timedelta
from pathlib import Path
from typing import Optional
import json
DB_PATH = Path("/app/data/metrics.db")
def get_connection() -> sqlite3.Connection:
    """Open the metrics database, creating its parent directory if missing.

    Rows fetched through the returned connection are ``sqlite3.Row`` objects,
    so columns can be read by name.
    """
    data_dir = DB_PATH.parent
    data_dir.mkdir(parents=True, exist_ok=True)

    connection = sqlite3.connect(str(DB_PATH))
    connection.row_factory = sqlite3.Row
    return connection
def init_db():
    """Create the metrics, incidents and ssl_certificates tables and indexes.

    Every statement uses IF NOT EXISTS, so existing objects are left in place
    when this runs against an already initialised database.
    """
    schema_statements = (
        # Metrics history table
        """
        CREATE TABLE IF NOT EXISTS metrics (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        service_name TEXT NOT NULL,
        status TEXT NOT NULL,
        latency_ms REAL,
        message TEXT,
        checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Incidents table
        """
        CREATE TABLE IF NOT EXISTS incidents (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        service_name TEXT NOT NULL,
        status TEXT NOT NULL,
        message TEXT,
        started_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
        resolved_at TIMESTAMP,
        notified BOOLEAN DEFAULT FALSE
        )
        """,
        # SSL certificates table
        """
        CREATE TABLE IF NOT EXISTS ssl_certificates (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        domain TEXT NOT NULL UNIQUE,
        issuer TEXT,
        expires_at TIMESTAMP,
        days_until_expiry INTEGER,
        checked_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
        )
        """,
        # Indexes
        """
        CREATE INDEX IF NOT EXISTS idx_metrics_service_time
        ON metrics(service_name, checked_at DESC)
        """,
        """
        CREATE INDEX IF NOT EXISTS idx_incidents_service
        ON incidents(service_name, started_at DESC)
        """,
    )

    db = get_connection()
    cur = db.cursor()
    # Statements run in declaration order: tables first, then their indexes.
    for statement in schema_statements:
        cur.execute(statement)
    db.commit()
    db.close()
def save_metric(service_name: str, status: str, latency_ms: Optional[float], message: Optional[str]):
    """Insert one check result into the metrics table.

    ``checked_at`` is not supplied, so the column's CURRENT_TIMESTAMP default
    is used.
    """
    insert_sql = "INSERT INTO metrics (service_name, status, latency_ms, message) VALUES (?, ?, ?, ?)"
    values = (service_name, status, latency_ms, message)

    db = get_connection()
    db.execute(insert_sql, values)
    db.commit()
    db.close()
def get_latency_history(service_name: str, hours: int = 24) -> list[dict]:
    """Return latency samples for a service over the last ``hours`` hours.

    The cutoff is computed inside SQLite with ``datetime('now', ...)`` so it
    has the same UTC ``YYYY-MM-DD HH:MM:SS`` text form as the
    CURRENT_TIMESTAMP defaults stored in ``checked_at``. A Python
    ``isoformat()`` string (local time, ``T`` separator) does not compare
    correctly against those values as text.

    Args:
        service_name: Service whose samples are requested.
        hours: Size of the look-back window in hours.

    Returns:
        Dicts with ``latency_ms``, ``status`` and ``checked_at``, oldest
        first. Rows without a latency value are excluded.
    """
    conn = get_connection()
    try:
        rows = conn.execute(
            """
            SELECT latency_ms, status, checked_at
            FROM metrics
            WHERE service_name = ?
              AND checked_at > datetime('now', ?)
              AND latency_ms IS NOT NULL
            ORDER BY checked_at ASC
            """,
            (service_name, f"{-hours} hours"),
        ).fetchall()
    finally:
        # Close even when the query fails, so the connection is not leaked.
        conn.close()

    return [
        {
            "latency_ms": row["latency_ms"],
            "status": row["status"],
            "checked_at": row["checked_at"],
        }
        for row in rows
    ]
def get_uptime_stats(service_name: str, hours: int = 24) -> dict:
    """Calculate uptime statistics for a service over the last ``hours`` hours.

    The cutoff is computed inside SQLite with ``datetime('now', ...)`` so it
    has the same UTC ``YYYY-MM-DD HH:MM:SS`` text form as the
    CURRENT_TIMESTAMP defaults stored in ``checked_at``. A Python
    ``isoformat()`` string does not compare correctly against those values.

    Args:
        service_name: Service whose checks are counted.
        hours: Size of the look-back window in hours.

    Returns:
        Dict with ``total_checks``, ``successful_checks`` (rows whose status
        is ``'operational'``) and ``uptime_percent``; the percentage is 100.0
        when the window contains no checks.
    """
    conn = get_connection()
    try:
        row = conn.execute(
            """
            SELECT COUNT(*) as total,
                   SUM(CASE WHEN status = 'operational' THEN 1 ELSE 0 END) as successful
            FROM metrics
            WHERE service_name = ? AND checked_at > datetime('now', ?)
            """,
            (service_name, f"{-hours} hours"),
        ).fetchone()
    finally:
        # Close even when the query fails, so the connection is not leaked.
        conn.close()

    # SUM() yields NULL over an empty set; normalise both counters to 0.
    total = row["total"] or 0
    successful = row["successful"] or 0
    return {
        "total_checks": total,
        "successful_checks": successful,
        "uptime_percent": (successful / total * 100) if total > 0 else 100.0,
    }
def get_avg_latency(service_name: str, hours: int = 24) -> Optional[float]:
    """Return the mean latency of a service over the last ``hours`` hours.

    The cutoff is computed inside SQLite with ``datetime('now', ...)`` so it
    has the same UTC ``YYYY-MM-DD HH:MM:SS`` text form as the
    CURRENT_TIMESTAMP defaults stored in ``checked_at``. A Python
    ``isoformat()`` string does not compare correctly against those values.

    Args:
        service_name: Service whose latency is averaged.
        hours: Size of the look-back window in hours.

    Returns:
        Average of the non-NULL ``latency_ms`` values, or None when the
        window contains none.
    """
    conn = get_connection()
    try:
        row = conn.execute(
            """
            SELECT AVG(latency_ms) as avg_latency
            FROM metrics
            WHERE service_name = ?
              AND checked_at > datetime('now', ?)
              AND latency_ms IS NOT NULL
            """,
            (service_name, f"{-hours} hours"),
        ).fetchone()
    finally:
        # Close even when the query fails, so the connection is not leaked.
        conn.close()
    return row["avg_latency"]
def create_incident(service_name: str, status: str, message: Optional[str]) -> int:
    """Insert a new open incident and return its row id.

    ``started_at`` is left to the column's CURRENT_TIMESTAMP default.
    """
    db = get_connection()
    inserted = db.execute(
        "INSERT INTO incidents (service_name, status, message) VALUES (?, ?, ?)",
        (service_name, status, message),
    )
    # Read the generated id before the connection is closed.
    new_id = inserted.lastrowid
    db.commit()
    db.close()
    return new_id
def resolve_incident(service_name: str):
    """Stamp ``resolved_at`` on every still-open incident of a service."""
    close_open_sql = """
        UPDATE incidents
        SET resolved_at = CURRENT_TIMESTAMP
        WHERE service_name = ? AND resolved_at IS NULL
    """
    db = get_connection()
    db.execute(close_open_sql, (service_name,))
    db.commit()
    db.close()
def get_open_incident(service_name: str) -> Optional[dict]:
    """Return the most recently started unresolved incident, or None."""
    latest_open_sql = """
        SELECT * FROM incidents
        WHERE service_name = ? AND resolved_at IS NULL
        ORDER BY started_at DESC LIMIT 1
    """
    db = get_connection()
    record = db.execute(latest_open_sql, (service_name,)).fetchone()
    db.close()
    return dict(record) if record else None
def mark_incident_notified(incident_id: int):
    """Set the ``notified`` flag on the incident with the given id."""
    db = get_connection()
    db.execute("UPDATE incidents SET notified = TRUE WHERE id = ?", (incident_id,))
    db.commit()
    db.close()
def get_recent_incidents(limit: int = 10) -> list[dict]:
    """Return up to ``limit`` incidents as dicts, most recently started first."""
    recent_sql = """
        SELECT * FROM incidents
        ORDER BY started_at DESC
        LIMIT ?
    """
    db = get_connection()
    records = db.execute(recent_sql, (limit,)).fetchall()
    db.close()

    incidents = []
    for record in records:
        incidents.append(dict(record))
    return incidents
def save_ssl_info(domain: str, issuer: str, expires_at: datetime, days_until_expiry: int):
    """Insert or replace the stored certificate record for ``domain``.

    ``expires_at`` is stored as its ISO-8601 string; ``checked_at`` is set to
    the database's current timestamp.
    """
    upsert_sql = """
        INSERT OR REPLACE INTO ssl_certificates
        (domain, issuer, expires_at, days_until_expiry, checked_at)
        VALUES (?, ?, ?, ?, CURRENT_TIMESTAMP)
    """
    values = (domain, issuer, expires_at.isoformat(), days_until_expiry)

    db = get_connection()
    db.execute(upsert_sql, values)
    db.commit()
    db.close()
def get_ssl_info(domain: str) -> Optional[dict]:
    """Return the stored certificate record for ``domain``, or None if absent."""
    db = get_connection()
    record = db.execute(
        "SELECT * FROM ssl_certificates WHERE domain = ?", (domain,)
    ).fetchone()
    db.close()
    return dict(record) if record else None
def cleanup_old_metrics(days: int = 7):
    """Delete metrics older than ``days`` days and return how many were removed.

    The cutoff is computed inside SQLite with ``datetime('now', ...)`` so it
    has the same UTC ``YYYY-MM-DD HH:MM:SS`` text form as the
    CURRENT_TIMESTAMP defaults stored in ``checked_at``. A Python
    ``isoformat()`` string does not compare correctly against those values.

    Args:
        days: Age threshold in days.

    Returns:
        Number of deleted rows.
    """
    conn = get_connection()
    try:
        cursor = conn.execute(
            "DELETE FROM metrics WHERE checked_at < datetime('now', ?)",
            (f"{-days} days",),
        )
        deleted = cursor.rowcount
        conn.commit()
    finally:
        # Close even when the delete fails, so the connection is not leaked.
        conn.close()
    return deleted

View File

@@ -1,6 +1,7 @@
"""Status monitoring service with persistence and alerting."""
import os import os
import asyncio import asyncio
from datetime import datetime, timedelta from datetime import datetime
from typing import Optional from typing import Optional
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
@@ -8,13 +9,16 @@ from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates from fastapi.templating import Jinja2Templates
from monitors import ServiceMonitor, ServiceStatus from monitors import ServiceMonitor
from database import init_db, get_recent_incidents, get_latency_history, cleanup_old_metrics
# Configuration # Configuration
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000") BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80") FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080") BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
EXTERNAL_URL = os.getenv("EXTERNAL_URL", "") # Public URL for external checks
PUBLIC_URL = os.getenv("PUBLIC_URL", "") # Public HTTPS URL for SSL checks
CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30")) CHECK_INTERVAL = int(os.getenv("CHECK_INTERVAL", "30"))
# Initialize monitor # Initialize monitor
@@ -22,38 +26,64 @@ monitor = ServiceMonitor()
# Background task reference # Background task reference
background_task: Optional[asyncio.Task] = None background_task: Optional[asyncio.Task] = None
cleanup_task: Optional[asyncio.Task] = None
async def periodic_health_check(): async def periodic_health_check():
"""Background task to check services periodically""" """Background task to check services periodically."""
while True: while True:
await monitor.check_all_services( try:
backend_url=BACKEND_URL, await monitor.check_all_services(
frontend_url=FRONTEND_URL, backend_url=BACKEND_URL,
bot_url=BOT_URL frontend_url=FRONTEND_URL,
) bot_url=BOT_URL,
external_url=EXTERNAL_URL,
public_url=PUBLIC_URL
)
except Exception as e:
print(f"Health check error: {e}")
await asyncio.sleep(CHECK_INTERVAL) await asyncio.sleep(CHECK_INTERVAL)
async def periodic_cleanup():
    """Background task: every 24 hours, purge metrics older than 7 days.

    The loop sleeps before each purge, so the first purge happens one day
    after the task starts. Errors are printed and the loop continues.
    """
    one_day_seconds = 86400  # 24 hours
    while True:
        await asyncio.sleep(one_day_seconds)
        try:
            removed = cleanup_old_metrics(days=7)
            print(f"Cleaned up {removed} old metrics")
        except Exception as e:
            print(f"Cleanup error: {e}")
@asynccontextmanager @asynccontextmanager
async def lifespan(app: FastAPI): async def lifespan(app: FastAPI):
"""Startup and shutdown events""" """Startup and shutdown events."""
global background_task global background_task, cleanup_task
# Initialize database
init_db()
print("Database initialized")
# Start background health checks # Start background health checks
background_task = asyncio.create_task(periodic_health_check()) background_task = asyncio.create_task(periodic_health_check())
cleanup_task = asyncio.create_task(periodic_cleanup())
yield yield
# Cancel background task on shutdown
if background_task: # Cancel background tasks on shutdown
background_task.cancel() for task in [background_task, cleanup_task]:
try: if task:
await background_task task.cancel()
except asyncio.CancelledError: try:
pass await task
except asyncio.CancelledError:
pass
app = FastAPI( app = FastAPI(
title="Status Monitor", title="Status Monitor",
description="Service health monitoring", description="Service health monitoring with persistence and alerting",
lifespan=lifespan lifespan=lifespan
) )
@@ -62,9 +92,11 @@ templates = Jinja2Templates(directory="templates")
@app.get("/", response_class=HTMLResponse) @app.get("/", response_class=HTMLResponse)
async def status_page(request: Request): async def status_page(request: Request):
"""Main status page""" """Main status page."""
services = monitor.get_all_statuses() services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status() overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
incidents = get_recent_incidents(limit=5)
return templates.TemplateResponse( return templates.TemplateResponse(
"index.html", "index.html",
@@ -72,6 +104,8 @@ async def status_page(request: Request):
"request": request, "request": request,
"services": services, "services": services,
"overall_status": overall_status, "overall_status": overall_status,
"ssl_status": ssl_status,
"incidents": incidents,
"last_check": monitor.last_check, "last_check": monitor.last_check,
"check_interval": CHECK_INTERVAL "check_interval": CHECK_INTERVAL
} }
@@ -80,30 +114,52 @@ async def status_page(request: Request):
@app.get("/api/status") @app.get("/api/status")
async def api_status(): async def api_status():
"""API endpoint for service statuses""" """API endpoint for service statuses."""
services = monitor.get_all_statuses() services = monitor.get_all_statuses()
overall_status = monitor.get_overall_status() overall_status = monitor.get_overall_status()
ssl_status = monitor.get_ssl_status()
return { return {
"overall_status": overall_status, "overall_status": overall_status.value,
"services": {name: status.to_dict() for name, status in services.items()}, "services": {name: status.to_dict() for name, status in services.items()},
"ssl": ssl_status,
"last_check": monitor.last_check.isoformat() if monitor.last_check else None, "last_check": monitor.last_check.isoformat() if monitor.last_check else None,
"check_interval_seconds": CHECK_INTERVAL "check_interval_seconds": CHECK_INTERVAL
} }
@app.get("/api/history/{service_name}")
async def api_history(service_name: str, hours: int = 24):
    """Return the stored latency history of one service as JSON.

    ``hours`` is the size of the look-back window passed to the database
    query.
    """
    samples = get_latency_history(service_name, hours=hours)
    payload = {"service": service_name, "hours": hours}
    payload["data"] = samples
    return payload
@app.get("/api/incidents")
async def api_incidents(limit: int = 20):
    """Return the most recent incidents (at most ``limit``) as JSON."""
    return {"incidents": get_recent_incidents(limit=limit)}
@app.get("/api/health") @app.get("/api/health")
async def health(): async def health():
"""Health check for this service""" """Health check for this service."""
return {"status": "ok", "service": "status-monitor"} return {"status": "ok", "service": "status-monitor"}
@app.post("/api/refresh") @app.post("/api/refresh")
async def refresh_status(): async def refresh_status():
"""Force refresh all service statuses""" """Force refresh all service statuses."""
await monitor.check_all_services( await monitor.check_all_services(
backend_url=BACKEND_URL, backend_url=BACKEND_URL,
frontend_url=FRONTEND_URL, frontend_url=FRONTEND_URL,
bot_url=BOT_URL bot_url=BOT_URL,
external_url=EXTERNAL_URL,
public_url=PUBLIC_URL
) )
return {"status": "refreshed"} return {"status": "refreshed"}

View File

@@ -1,11 +1,19 @@
"""Service monitoring with persistence and alerting."""
import asyncio import asyncio
from datetime import datetime, timedelta from datetime import datetime, timedelta
from dataclasses import dataclass, field from dataclasses import dataclass
from typing import Optional from typing import Optional
from enum import Enum from enum import Enum
import httpx import httpx
from database import (
save_metric, get_latency_history, get_uptime_stats, get_avg_latency,
create_incident, resolve_incident, get_open_incident, mark_incident_notified
)
from alerts import alert_service_down, alert_service_recovered
from ssl_monitor import check_and_alert_ssl, SSLInfo
class Status(str, Enum): class Status(str, Enum):
OPERATIONAL = "operational" OPERATIONAL = "operational"
@@ -25,11 +33,17 @@ class ServiceStatus:
uptime_percent: float = 100.0 uptime_percent: float = 100.0
message: Optional[str] = None message: Optional[str] = None
version: Optional[str] = None version: Optional[str] = None
avg_latency_24h: Optional[float] = None
latency_history: list = None
# For uptime calculation # For uptime calculation (in-memory, backed by DB)
total_checks: int = 0 total_checks: int = 0
successful_checks: int = 0 successful_checks: int = 0
def __post_init__(self):
if self.latency_history is None:
self.latency_history = []
def to_dict(self) -> dict: def to_dict(self) -> dict:
return { return {
"name": self.name, "name": self.name,
@@ -40,7 +54,8 @@ class ServiceStatus:
"last_incident": self.last_incident.isoformat() if self.last_incident else None, "last_incident": self.last_incident.isoformat() if self.last_incident else None,
"uptime_percent": round(self.uptime_percent, 2), "uptime_percent": round(self.uptime_percent, 2),
"message": self.message, "message": self.message,
"version": self.version "version": self.version,
"avg_latency_24h": round(self.avg_latency_24h, 2) if self.avg_latency_24h else None,
} }
def update_uptime(self, is_success: bool): def update_uptime(self, is_success: bool):
@@ -69,12 +84,17 @@ class ServiceMonitor:
"bot": ServiceStatus( "bot": ServiceStatus(
name="bot", name="bot",
display_name="Telegram Bot" display_name="Telegram Bot"
) ),
"external": ServiceStatus(
name="external",
display_name="External Access"
),
} }
self.last_check: Optional[datetime] = None self.last_check: Optional[datetime] = None
self.ssl_info: Optional[SSLInfo] = None
async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]: async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
"""Check backend API health""" """Check backend API health."""
try: try:
async with httpx.AsyncClient(timeout=10.0) as client: async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now() start = datetime.now()
@@ -92,9 +112,7 @@ class ServiceMonitor:
return Status.DOWN, None, str(e)[:100], None return Status.DOWN, None, str(e)[:100], None
async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]: async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check database through backend""" """Check database through backend."""
# We check database indirectly - if backend is up, DB is likely up
# Could add a specific /health/db endpoint to backend later
try: try:
async with httpx.AsyncClient(timeout=10.0) as client: async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now() start = datetime.now()
@@ -109,7 +127,7 @@ class ServiceMonitor:
return Status.DOWN, None, "Cannot reach backend" return Status.DOWN, None, "Cannot reach backend"
async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check frontend availability""" """Check frontend availability."""
try: try:
async with httpx.AsyncClient(timeout=10.0) as client: async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now() start = datetime.now()
@@ -126,7 +144,7 @@ class ServiceMonitor:
return Status.DOWN, None, str(e)[:100] return Status.DOWN, None, str(e)[:100]
async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]: async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check Telegram bot health""" """Check Telegram bot health."""
try: try:
async with httpx.AsyncClient(timeout=10.0) as client: async with httpx.AsyncClient(timeout=10.0) as client:
start = datetime.now() start = datetime.now()
@@ -142,8 +160,93 @@ class ServiceMonitor:
except Exception as e: except Exception as e:
return Status.DOWN, None, str(e)[:100] return Status.DOWN, None, str(e)[:100]
async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str): async def check_external(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
"""Check all services concurrently""" """Check external (public) URL availability."""
if not url:
return Status.UNKNOWN, None, "Not configured"
try:
async with httpx.AsyncClient(timeout=15.0, follow_redirects=True) as client:
start = datetime.now()
response = await client.get(url)
latency = (datetime.now() - start).total_seconds() * 1000
if response.status_code == 200:
return Status.OPERATIONAL, latency, None
else:
return Status.DEGRADED, latency, f"HTTP {response.status_code}"
except httpx.TimeoutException:
return Status.DOWN, None, "Timeout"
except Exception as e:
return Status.DOWN, None, str(e)[:100]
async def _process_check_result(
    self,
    service_name: str,
    result: tuple,
    now: datetime
):
    """Apply one check result: update state, persist it, alert on transitions.

    Args:
        service_name: Key into ``self.services``.
        result: ``(status, latency, message)`` or
            ``(status, latency, message, version)``. An exception object
            captured by the caller is logged and otherwise ignored.
        now: Timestamp of the current check round, stored as ``last_check``
            and ``last_incident``.
    """
    if isinstance(result, BaseException):
        # BaseException also covers a CancelledError handed back by gather();
        # log it so a failing check is visible instead of leaving stale state.
        print(f"Health check for {service_name} raised: {result!r}")
        return

    if len(result) == 4:
        status, latency, message, version = result
    else:
        status, latency, message = result
        version = None

    svc = self.services[service_name]
    # DEGRADED counts as "down" for incident tracking; UNKNOWN does not.
    was_down = svc.status in (Status.DOWN, Status.DEGRADED)
    is_down = status in (Status.DOWN, Status.DEGRADED)

    # Update service status
    svc.status = status
    svc.latency_ms = latency
    svc.message = message
    if version:
        svc.version = version
    svc.last_check = now
    svc.update_uptime(status == Status.OPERATIONAL)

    # Save metric to database
    save_metric(service_name, status.value, latency, message)

    # Load historical data
    svc.latency_history = get_latency_history(service_name, hours=24)
    svc.avg_latency_24h = get_avg_latency(service_name, hours=24)

    # Update uptime from DB
    stats = get_uptime_stats(service_name, hours=24)
    if stats["total_checks"] > 0:
        svc.uptime_percent = stats["uptime_percent"]

    # Handle incident tracking and alerting
    if is_down and not was_down:
        # Service just went down
        svc.last_incident = now
        incident_id = create_incident(service_name, status.value, message)
        await alert_service_down(service_name, svc.display_name, message)
        mark_incident_notified(incident_id)
    elif not is_down and was_down:
        # Service recovered
        open_incident = get_open_incident(service_name)
        if open_incident:
            # started_at comes from SQLite's CURRENT_TIMESTAMP, which is UTC,
            # while `now` is a naive local datetime. Measure downtime against
            # the current UTC time so a non-UTC host zone does not skew it.
            from datetime import timezone

            started_at = datetime.fromisoformat(
                open_incident["started_at"]
            ).replace(tzinfo=timezone.utc)
            elapsed = datetime.now(timezone.utc) - started_at
            downtime_minutes = max(0, int(elapsed.total_seconds() / 60))
            resolve_incident(service_name)
            await alert_service_recovered(service_name, svc.display_name, downtime_minutes)
async def check_all_services(
self,
backend_url: str,
frontend_url: str,
bot_url: str,
external_url: str = "",
public_url: str = ""
):
"""Check all services concurrently."""
now = datetime.now() now = datetime.now()
# Run all checks concurrently # Run all checks concurrently
@@ -152,61 +255,18 @@ class ServiceMonitor:
self.check_database(backend_url), self.check_database(backend_url),
self.check_frontend(frontend_url), self.check_frontend(frontend_url),
self.check_bot(bot_url), self.check_bot(bot_url),
self.check_external(external_url),
return_exceptions=True return_exceptions=True
) )
# Process backend result # Process results
if not isinstance(results[0], Exception): service_names = ["backend", "database", "frontend", "bot", "external"]
status, latency, message, version = results[0] for i, service_name in enumerate(service_names):
svc = self.services["backend"] await self._process_check_result(service_name, results[i], now)
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.version = version
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process database result # Check SSL certificate (if public URL is HTTPS)
if not isinstance(results[1], Exception): if public_url and public_url.startswith("https://"):
status, latency, message = results[1] self.ssl_info = await check_and_alert_ssl(public_url)
svc = self.services["database"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process frontend result
if not isinstance(results[2], Exception):
status, latency, message = results[2]
svc = self.services["frontend"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
# Process bot result
if not isinstance(results[3], Exception):
status, latency, message = results[3]
svc = self.services["bot"]
was_down = svc.status == Status.DOWN
svc.status = status
svc.latency_ms = latency
svc.message = message
svc.last_check = now
svc.update_uptime(status == Status.OPERATIONAL)
if status != Status.OPERATIONAL and not was_down:
svc.last_incident = now
self.last_check = now self.last_check = now
@@ -214,8 +274,12 @@ class ServiceMonitor:
return self.services return self.services
def get_overall_status(self) -> Status: def get_overall_status(self) -> Status:
"""Get overall system status based on all services""" """Get overall system status based on all services."""
statuses = [svc.status for svc in self.services.values()] # Exclude external from overall status if not configured
statuses = [
svc.status for name, svc in self.services.items()
if name != "external" or svc.status != Status.UNKNOWN
]
if all(s == Status.OPERATIONAL for s in statuses): if all(s == Status.OPERATIONAL for s in statuses):
return Status.OPERATIONAL return Status.OPERATIONAL
@@ -225,3 +289,17 @@ class ServiceMonitor:
return Status.DEGRADED return Status.DEGRADED
else: else:
return Status.UNKNOWN return Status.UNKNOWN
def get_ssl_status(self) -> Optional[dict]:
    """Return the most recent SSL check as a plain dict, or None if absent.

    The expiry timestamp is rendered as an ISO 8601 string; all other
    fields are copied from ``self.ssl_info`` unchanged.
    """
    info = self.ssl_info
    if info:
        return dict(
            domain=info.domain,
            issuer=info.issuer,
            expires_at=info.expires_at.isoformat(),
            days_until_expiry=info.days_until_expiry,
            is_valid=info.is_valid,
            error=info.error,
        )
    return None

View File

@@ -0,0 +1,140 @@
"""SSL certificate monitoring."""
import ssl
import socket
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Optional
from urllib.parse import urlparse
from database import save_ssl_info, get_ssl_info
from alerts import alert_ssl_expiring, alert_ssl_expired
@dataclass
class SSLInfo:
    """Outcome of one TLS certificate check for a single host."""

    # Hostname the certificate was requested for.
    domain: str
    # Issuer organizationName, or a placeholder such as "Unknown"/"Invalid".
    issuer: str
    # Certificate expiry; set to the time of the check when the check failed.
    expires_at: datetime
    # Whole days remaining until expires_at (0 when the check failed).
    days_until_expiry: int
    # False for failed checks as well as for certificates past expiry.
    is_valid: bool
    # Short description of what went wrong; None for a successful check.
    error: Optional[str] = None
def check_ssl_certificate(url: str) -> Optional[SSLInfo]:
    """Connect to ``url`` over TLS and report on the certificate it serves.

    Args:
        url: Address to check. Only ``https`` URLs with a non-local hostname
            are inspected; the port defaults to 443.

    Returns:
        ``None`` when there is nothing to check (no hostname, a non-HTTPS
        scheme, ``localhost`` or ``127.0.0.1``). Otherwise an ``SSLInfo``.
        If the check itself fails (malformed URL, connection error,
        verification failure, unreadable certificate) the result has
        ``is_valid=False``, ``days_until_expiry=0``, ``expires_at`` set to
        the current UTC time and ``error`` describing the failure.

    Note:
        This performs blocking network I/O; each socket operation times out
        after 10 seconds.
    """

    def failed(domain: str, issuer: str, error: str) -> SSLInfo:
        # Placeholder result for a check that could not be completed.
        return SSLInfo(
            domain=domain,
            issuer=issuer,
            expires_at=datetime.now(timezone.utc),
            days_until_expiry=0,
            is_valid=False,
            error=error,
        )

    # Parse once, up front. urlparse raises ValueError for some malformed
    # URLs (e.g. an unbalanced IPv6 bracket); re-parsing inside the error
    # handlers would raise a second time and escape this function.
    try:
        parsed = urlparse(url)
        hostname = parsed.hostname
    except ValueError as e:
        return failed(url, "Unknown", str(e)[:100])

    if not hostname:
        return None

    # Skip non-HTTPS or localhost
    if parsed.scheme != "https" or hostname in ("localhost", "127.0.0.1"):
        return None

    try:
        context = ssl.create_default_context()

        # create_connection tries every resolved address (IPv4 and IPv6),
        # and the context managers close both sockets on every path.
        with socket.create_connection(
            (hostname, parsed.port or 443), timeout=10.0
        ) as sock:
            with context.wrap_socket(sock, server_hostname=hostname) as conn:
                cert = conn.getpeercert()

        if not cert:
            return failed(hostname, "Unknown", "No certificate found")

        # cert_time_to_seconds parses the certificate date format
        # independently of the process locale and interprets it as UTC.
        expires_at = datetime.fromtimestamp(
            ssl.cert_time_to_seconds(cert.get("notAfter", "")),
            tz=timezone.utc,
        )

        # Whole days remaining; this is 0 during the final 24 hours.
        now = datetime.now(timezone.utc)
        days_until_expiry = (expires_at - now).days

        # The issuer is a sequence of RDNs, each a sequence of (key, value).
        issuer = "Unknown"
        for rdn in cert.get("issuer", ()):
            for key, value in rdn:
                if key == "organizationName":
                    issuer = value
                    break

        return SSLInfo(
            domain=hostname,
            issuer=issuer,
            expires_at=expires_at,
            days_until_expiry=days_until_expiry,
            # Compare timestamps rather than the floored day count so a
            # certificate with less than a day left still counts as valid.
            is_valid=expires_at > now,
        )

    except ssl.SSLCertVerificationError as e:
        return failed(
            hostname, "Invalid", f"SSL verification failed: {str(e)[:100]}"
        )
    except Exception as e:
        # Best-effort boundary: any other failure (DNS, timeout, refused
        # connection, bad port, unparsable date) becomes an error result.
        return failed(hostname, "Unknown", str(e)[:100])
async def check_and_alert_ssl(url: str, warn_days: int = 14) -> Optional[SSLInfo]:
    """Check the certificate for ``url``, store the result and alert if needed.

    Args:
        url: Address whose certificate should be checked.
        warn_days: Send an "expiring" alert when at most this many days
            remain before expiry.

    Returns:
        The ``SSLInfo`` from ``check_ssl_certificate``, or ``None`` when that
        function had nothing to check.
    """
    import asyncio  # local import: only needed to off-load the blocking check

    # check_ssl_certificate does blocking socket I/O. Run it in the default
    # executor so other coroutines keep running while it waits.
    loop = asyncio.get_running_loop()
    ssl_info = await loop.run_in_executor(None, check_ssl_certificate, url)
    if not ssl_info:
        return None

    # Read the previous record BEFORE saving the new one, so the throttle
    # below is not compared against the row written by this same call.
    prev_info = get_ssl_info(ssl_info.domain)

    # Save to database
    save_ssl_info(
        domain=ssl_info.domain,
        issuer=ssl_info.issuer,
        expires_at=ssl_info.expires_at,
        days_until_expiry=ssl_info.days_until_expiry
    )

    if ssl_info.days_until_expiry <= 0:
        # Certificate expired
        # NOTE(review): failed checks (timeouts, DNS errors) also come back
        # with days_until_expiry == 0 and are reported as "expired" here;
        # consider inspecting ssl_info.error before alerting.
        await alert_ssl_expired(ssl_info.domain)
    elif ssl_info.days_until_expiry <= warn_days:
        # Certificate expiring soon - alert once per day
        # NOTE(review): "checked_at" presumably records the last check, not
        # the last alert. If so, alerts stay suppressed whenever checks run
        # more often than daily - confirm, and consider storing the alert
        # time separately.
        should_alert = True
        if prev_info and prev_info.get("checked_at"):
            last_check = datetime.fromisoformat(prev_info["checked_at"])
            if (datetime.now() - last_check).days < 1:
                should_alert = False

        if should_alert:
            await alert_ssl_expiring(ssl_info.domain, ssl_info.days_until_expiry)

    return ssl_info

View File

@@ -4,6 +4,7 @@
<meta charset="UTF-8"> <meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0"> <meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>System Status</title> <title>System Status</title>
<script src="https://cdn.jsdelivr.net/npm/chart.js"></script>
<style> <style>
* { * {
margin: 0; margin: 0;
@@ -19,7 +20,7 @@
} }
.container { .container {
max-width: 900px; max-width: 1100px;
margin: 0 auto; margin: 0 auto;
padding: 40px 20px; padding: 40px 20px;
} }
@@ -39,6 +40,13 @@
background-clip: text; background-clip: text;
} }
h2 {
font-size: 1.3rem;
font-weight: 600;
margin: 30px 0 16px 0;
color: #94a3b8;
}
.overall-status { .overall-status {
display: inline-flex; display: inline-flex;
align-items: center; align-items: center;
@@ -174,8 +182,9 @@
.service-metrics { .service-metrics {
display: grid; display: grid;
grid-template-columns: repeat(auto-fit, minmax(140px, 1fr)); grid-template-columns: repeat(auto-fit, minmax(120px, 1fr));
gap: 12px; gap: 12px;
margin-bottom: 16px;
} }
.metric { .metric {
@@ -212,6 +221,132 @@
color: #fca5a5; color: #fca5a5;
} }
/* Latency chart */
.latency-chart {
height: 60px;
margin-top: 12px;
}
/* SSL Card */
.ssl-card {
background: rgba(30, 41, 59, 0.5);
border: 1px solid rgba(100, 116, 139, 0.2);
border-radius: 16px;
padding: 20px;
margin-bottom: 20px;
}
.ssl-card.warning {
border-color: rgba(250, 204, 21, 0.3);
}
.ssl-card.danger {
border-color: rgba(239, 68, 68, 0.3);
}
.ssl-header {
display: flex;
justify-content: space-between;
align-items: center;
margin-bottom: 12px;
}
.ssl-title {
font-size: 1.1rem;
font-weight: 600;
color: #f1f5f9;
}
.ssl-badge {
padding: 4px 12px;
border-radius: 20px;
font-size: 0.8rem;
font-weight: 500;
}
.ssl-badge.valid {
background: rgba(34, 197, 94, 0.15);
color: #22c55e;
}
.ssl-badge.expiring {
background: rgba(250, 204, 21, 0.15);
color: #facc15;
}
.ssl-badge.expired {
background: rgba(239, 68, 68, 0.15);
color: #ef4444;
}
.ssl-info {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(150px, 1fr));
gap: 12px;
}
/* Incidents */
.incidents-list {
background: rgba(30, 41, 59, 0.5);
border: 1px solid rgba(100, 116, 139, 0.2);
border-radius: 16px;
overflow: hidden;
}
.incident-item {
padding: 16px 20px;
border-bottom: 1px solid rgba(100, 116, 139, 0.1);
display: flex;
justify-content: space-between;
align-items: center;
}
.incident-item:last-child {
border-bottom: none;
}
.incident-info {
display: flex;
align-items: center;
gap: 12px;
}
.incident-dot {
width: 10px;
height: 10px;
border-radius: 50%;
}
.incident-dot.resolved {
background: #22c55e;
}
.incident-dot.open {
background: #ef4444;
animation: pulse 2s infinite;
}
.incident-service {
font-weight: 500;
color: #f1f5f9;
}
.incident-message {
font-size: 0.85rem;
color: #94a3b8;
}
.incident-time {
font-size: 0.85rem;
color: #64748b;
}
.no-incidents {
padding: 30px;
text-align: center;
color: #64748b;
}
.refresh-btn { .refresh-btn {
display: inline-flex; display: inline-flex;
align-items: center; align-items: center;
@@ -292,8 +427,42 @@
</p> </p>
</header> </header>
{% if ssl_status %}
<div class="ssl-card {% if ssl_status.days_until_expiry <= 0 %}danger{% elif ssl_status.days_until_expiry <= 14 %}warning{% endif %}">
<div class="ssl-header">
<span class="ssl-title">SSL Certificate</span>
<span class="ssl-badge {% if ssl_status.days_until_expiry <= 0 %}expired{% elif ssl_status.days_until_expiry <= 14 %}expiring{% else %}valid{% endif %}">
{% if ssl_status.days_until_expiry <= 0 %}
Expired
{% elif ssl_status.days_until_expiry <= 14 %}
Expiring Soon
{% else %}
Valid
{% endif %}
</span>
</div>
<div class="ssl-info">
<div class="metric">
<div class="metric-label">Domain</div>
<div class="metric-value">{{ ssl_status.domain }}</div>
</div>
<div class="metric">
<div class="metric-label">Issuer</div>
<div class="metric-value">{{ ssl_status.issuer }}</div>
</div>
<div class="metric">
<div class="metric-label">Days Left</div>
<div class="metric-value {% if ssl_status.days_until_expiry <= 0 %}bad{% elif ssl_status.days_until_expiry <= 14 %}warning{% else %}good{% endif %}">
{{ ssl_status.days_until_expiry }}
</div>
</div>
</div>
</div>
{% endif %}
<div class="services-grid"> <div class="services-grid">
{% for name, service in services.items() %} {% for name, service in services.items() %}
{% if service.status.value != 'unknown' or name != 'external' %}
<div class="service-card"> <div class="service-card">
<div class="service-header"> <div class="service-header">
<span class="service-name">{{ service.display_name }}</span> <span class="service-name">{{ service.display_name }}</span>
@@ -322,7 +491,17 @@
</div> </div>
</div> </div>
<div class="metric"> <div class="metric">
<div class="metric-label">Uptime</div> <div class="metric-label">Avg 24h</div>
<div class="metric-value {% if service.avg_latency_24h and service.avg_latency_24h < 200 %}good{% elif service.avg_latency_24h and service.avg_latency_24h < 500 %}warning{% elif service.avg_latency_24h %}bad{% endif %}">
{% if service.avg_latency_24h %}
{{ "%.0f"|format(service.avg_latency_24h) }} ms
{% else %}
{% endif %}
</div>
</div>
<div class="metric">
<div class="metric-label">Uptime 24h</div>
<div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}"> <div class="metric-value {% if service.uptime_percent >= 99 %}good{% elif service.uptime_percent >= 95 %}warning{% else %}bad{% endif %}">
{{ "%.1f"|format(service.uptime_percent) }}% {{ "%.1f"|format(service.uptime_percent) }}%
</div> </div>
@@ -333,20 +512,49 @@
<div class="metric-value">{{ service.version }}</div> <div class="metric-value">{{ service.version }}</div>
</div> </div>
{% endif %} {% endif %}
{% if service.last_incident %}
<div class="metric">
<div class="metric-label">Last Incident</div>
<div class="metric-value warning">{{ service.last_incident.strftime('%d.%m %H:%M') }}</div>
</div>
{% endif %}
</div> </div>
{% if service.latency_history and service.latency_history|length > 1 %}
<div class="latency-chart">
<canvas id="chart-{{ name }}"></canvas>
</div>
{% endif %}
{% if service.message %} {% if service.message %}
<div class="service-message">{{ service.message }}</div> <div class="service-message">{{ service.message }}</div>
{% endif %} {% endif %}
</div> </div>
{% endif %}
{% endfor %} {% endfor %}
</div> </div>
<h2>Recent Incidents</h2>
<div class="incidents-list">
{% if incidents and incidents|length > 0 %}
{% for incident in incidents %}
<div class="incident-item">
<div class="incident-info">
<span class="incident-dot {% if incident.resolved_at %}resolved{% else %}open{% endif %}"></span>
<div>
<div class="incident-service">{{ incident.service_name | title }}</div>
<div class="incident-message">{{ incident.message or 'Service unavailable' }}</div>
</div>
</div>
<div class="incident-time">
{{ incident.started_at[:16].replace('T', ' ') }}
{% if incident.resolved_at %}
- Resolved
{% else %}
- Ongoing
{% endif %}
</div>
</div>
{% endfor %}
{% else %}
<div class="no-incidents">
No recent incidents
</div>
{% endif %}
</div>
<center> <center>
<button class="refresh-btn" onclick="refreshStatus(this)"> <button class="refresh-btn" onclick="refreshStatus(this)">
<svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"> <svg width="18" height="18" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
@@ -363,6 +571,55 @@
</div> </div>
<script> <script>
// Initialize latency charts
{% for name, service in services.items() %}
{% if service.latency_history and service.latency_history|length > 1 %}
(function() {
const ctx = document.getElementById('chart-{{ name }}').getContext('2d');
const data = {{ service.latency_history | tojson }};
new Chart(ctx, {
type: 'line',
data: {
labels: data.map(d => ''),
datasets: [{
data: data.map(d => d.latency_ms),
borderColor: '#00d4ff',
backgroundColor: 'rgba(0, 212, 255, 0.1)',
fill: true,
tension: 0.4,
pointRadius: 0,
borderWidth: 2
}]
},
options: {
responsive: true,
maintainAspectRatio: false,
plugins: {
legend: { display: false },
tooltip: {
callbacks: {
label: (ctx) => ctx.raw.toFixed(0) + ' ms'
}
}
},
scales: {
x: { display: false },
y: {
display: false,
beginAtZero: true
}
},
interaction: {
intersect: false,
mode: 'index'
}
}
});
})();
{% endif %}
{% endfor %}
async function refreshStatus(btn) { async function refreshStatus(btn) {
btn.classList.add('loading'); btn.classList.add('loading');
btn.disabled = true; btn.disabled = true;