diff --git a/bot/main.py b/bot/main.py index 1debf3a..92906a7 100644 --- a/bot/main.py +++ b/bot/main.py @@ -5,6 +5,7 @@ import sys from aiogram import Bot, Dispatcher from aiogram.client.default import DefaultBotProperties from aiogram.enums import ParseMode +from aiohttp import web from config import settings from handlers import start, marathons, link @@ -23,14 +24,41 @@ logger = logging.getLogger(__name__) # Set aiogram logging level logging.getLogger("aiogram").setLevel(logging.INFO) +# Health check state +bot_running = False + + +async def health_handler(request): + """Health check endpoint""" + if bot_running: + return web.json_response({"status": "ok", "service": "telegram-bot"}) + return web.json_response({"status": "starting"}, status=503) + + +async def start_health_server(): + """Start health check HTTP server""" + app = web.Application() + app.router.add_get("/health", health_handler) + runner = web.AppRunner(app) + await runner.setup() + site = web.TCPSite(runner, "0.0.0.0", 8080) + await site.start() + logger.info("Health check server started on port 8080") + return runner + async def main(): + global bot_running + logger.info("="*50) logger.info("Starting Game Marathon Bot...") logger.info(f"API_URL: {settings.API_URL}") logger.info(f"BOT_TOKEN: {settings.TELEGRAM_BOT_TOKEN[:20]}...") logger.info("="*50) + # Start health check server + health_runner = await start_health_server() + bot = Bot( token=settings.TELEGRAM_BOT_TOKEN, default=DefaultBotProperties(parse_mode=ParseMode.HTML) @@ -54,11 +82,18 @@ async def main(): dp.include_router(marathons.router) logger.info("Routers registered: start, link, marathons") + # Mark bot as running + bot_running = True + # Start polling logger.info("Deleting webhook and starting polling...") await bot.delete_webhook(drop_pending_updates=True) logger.info("Polling started! 
Waiting for messages...") - await dp.start_polling(bot) + try: + await dp.start_polling(bot) + finally: + bot_running = False + await health_runner.cleanup() if __name__ == "__main__": diff --git a/docker-compose.yml b/docker-compose.yml index 1893eeb..c733bd4 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -85,5 +85,23 @@ services: - backend restart: unless-stopped + status: + build: + context: ./status-service + dockerfile: Dockerfile + container_name: marathon-status + environment: + BACKEND_URL: http://backend:8000 + FRONTEND_URL: http://frontend:80 + BOT_URL: http://bot:8080 + CHECK_INTERVAL: "30" + ports: + - "8001:8001" + depends_on: + - backend + - frontend + - bot + restart: unless-stopped + volumes: postgres_data: diff --git a/status-service/Dockerfile b/status-service/Dockerfile new file mode 100644 index 0000000..943ecff --- /dev/null +++ b/status-service/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application +COPY . . 
"""Status page service: polls the other services and serves their health."""
import asyncio
import logging
import os
from contextlib import asynccontextmanager, suppress
from datetime import datetime, timedelta
from typing import Optional

from fastapi import FastAPI, Request
from fastapi.responses import HTMLResponse
from fastapi.templating import Jinja2Templates

from monitors import ServiceMonitor, ServiceStatus


logger = logging.getLogger(__name__)

# Configuration
BACKEND_URL = os.getenv("BACKEND_URL", "http://backend:8000")
FRONTEND_URL = os.getenv("FRONTEND_URL", "http://frontend:80")
BOT_URL = os.getenv("BOT_URL", "http://bot:8080")
# Seconds between background check cycles. Clamped to at least one second so a
# zero or negative value cannot turn the background loop into a busy loop.
CHECK_INTERVAL = max(1, int(os.getenv("CHECK_INTERVAL", "30")))

# Initialize monitor
monitor = ServiceMonitor()

# Background task reference
background_task: Optional[asyncio.Task] = None


async def _check_services() -> None:
    """Run one check cycle against the configured service URLs."""
    await monitor.check_all_services(
        backend_url=BACKEND_URL,
        frontend_url=FRONTEND_URL,
        bot_url=BOT_URL,
    )


async def periodic_health_check():
    """Background task: check all services every CHECK_INTERVAL seconds.

    An unexpected error in one cycle is logged and the loop keeps running, so
    a single failure cannot stop monitoring for the rest of the process
    lifetime. Cancellation is not caught and ends the loop.
    """
    while True:
        try:
            await _check_services()
        except Exception:
            logger.exception(
                "Health check cycle failed; retrying in %ss", CHECK_INTERVAL
            )
        await asyncio.sleep(CHECK_INTERVAL)


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Start the background checker on startup and cancel it on shutdown."""
    global background_task
    # Start background health checks
    background_task = asyncio.create_task(periodic_health_check())
    yield
    # Cancel background task on shutdown and wait until it has unwound
    if background_task:
        background_task.cancel()
        with suppress(asyncio.CancelledError):
            await background_task


app = FastAPI(
    title="Status Monitor",
    description="Service health monitoring",
    lifespan=lifespan,
)

templates = Jinja2Templates(directory="templates")


@app.get("/", response_class=HTMLResponse)
async def status_page(request: Request):
    """Render the HTML status page from the monitor's latest results."""
    return templates.TemplateResponse(
        "index.html",
        {
            "request": request,
            "services": monitor.get_all_statuses(),
            "overall_status": monitor.get_overall_status(),
            "last_check": monitor.last_check,
            "check_interval": CHECK_INTERVAL,
        },
    )


@app.get("/api/status")
async def api_status():
    """Return the latest service statuses as JSON."""
    services = monitor.get_all_statuses()
    last_check = monitor.last_check

    return {
        "overall_status": monitor.get_overall_status(),
        "services": {name: status.to_dict() for name, status in services.items()},
        # No check has completed yet while last_check is still None
        "last_check": last_check.isoformat() if last_check else None,
        "check_interval_seconds": CHECK_INTERVAL,
    }


@app.get("/api/health")
async def health():
    """Health check for this service itself."""
    return {"status": "ok", "service": "status-monitor"}


@app.post("/api/refresh")
async def refresh_status():
    """Force an immediate check of all services."""
    await _check_services()
    return {"status": "refreshed"}


# --- status-service/monitors.py: module imports (definitions follow) ---------
import asyncio
from datetime import datetime, timedelta
from dataclasses import dataclass, field
from typing import Optional
from enum import Enum

import httpx
class Status(str, Enum):
    """Health state reported for a monitored service."""

    OPERATIONAL = "operational"
    DEGRADED = "degraded"
    DOWN = "down"
    UNKNOWN = "unknown"


@dataclass
class ServiceStatus:
    """Latest known state and running uptime counters for one service."""

    name: str
    display_name: str
    status: Status = Status.UNKNOWN
    latency_ms: Optional[float] = None
    last_check: Optional[datetime] = None
    last_incident: Optional[datetime] = None
    uptime_percent: float = 100.0
    message: Optional[str] = None
    version: Optional[str] = None

    # For uptime calculation
    total_checks: int = 0
    successful_checks: int = 0

    def to_dict(self) -> dict:
        """Return a JSON-serialisable snapshot of this status."""
        latency = self.latency_ms
        return {
            "name": self.name,
            "display_name": self.display_name,
            "status": self.status.value,
            # Compare with None: a measured latency of 0.0 is still a value
            # and must not be reported as "no measurement".
            "latency_ms": round(latency, 2) if latency is not None else None,
            "last_check": self.last_check.isoformat() if self.last_check else None,
            "last_incident": (
                self.last_incident.isoformat() if self.last_incident else None
            ),
            "uptime_percent": round(self.uptime_percent, 2),
            "message": self.message,
            "version": self.version,
        }

    def update_uptime(self, is_success: bool):
        """Count one check and recompute ``uptime_percent`` from the totals."""
        self.total_checks += 1
        if is_success:
            self.successful_checks += 1
        # total_checks is at least 1 here, so the division is always defined
        self.uptime_percent = (self.successful_checks / self.total_checks) * 100


class ServiceMonitor:
    """Probe the backend, database, frontend and bot and keep their statuses."""

    # Per-request timeout, in seconds, used for every probe.
    REQUEST_TIMEOUT = 10.0
    # Maximum number of characters of error text kept in a status message.
    MAX_MESSAGE_LENGTH = 100

    def __init__(self):
        self.services: dict[str, ServiceStatus] = {
            name: ServiceStatus(name=name, display_name=display_name)
            for name, display_name in (
                ("backend", "Backend API"),
                ("database", "Database"),
                ("frontend", "Frontend"),
                ("bot", "Telegram Bot"),
            )
        }
        self.last_check: Optional[datetime] = None

    def _describe_error(self, exc: BaseException) -> str:
        """Return a short, never-empty description of *exc*."""
        # Some exceptions stringify to "", which would hide the failure
        text = str(exc) or type(exc).__name__
        return text[: self.MAX_MESSAGE_LENGTH]

    async def _request(self, url: str):
        """GET *url* and time it.

        Returns ``(response, latency_ms, None)`` when a response was received
        and ``(None, None, error_message)`` when the request failed.
        """
        import time  # monotonic timer; the wall clock can jump between reads

        try:
            async with httpx.AsyncClient(timeout=self.REQUEST_TIMEOUT) as client:
                started = time.perf_counter()
                response = await client.get(url)
                latency_ms = (time.perf_counter() - started) * 1000
        except httpx.TimeoutException:
            return None, None, "Timeout"
        except Exception as exc:
            # Deliberate catch-all: any failure to get a response means the
            # service is reported as down, with the reason in the message.
            return None, None, self._describe_error(exc)
        return response, latency_ms, None

    async def _probe(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Classify *url*: 200 is operational, other codes degraded, no reply down."""
        response, latency, error = await self._request(url)
        if response is None:
            return Status.DOWN, None, error
        if response.status_code == 200:
            return Status.OPERATIONAL, latency, None
        return Status.DEGRADED, latency, f"HTTP {response.status_code}"

    @staticmethod
    def _extract_version(response) -> Optional[str]:
        """Return the ``version`` field of a JSON object body, else ``None``."""
        try:
            payload = response.json()
        except ValueError:
            # A 200 with a non-JSON body is still a healthy backend
            return None
        return payload.get("version") if isinstance(payload, dict) else None

    async def check_backend(self, url: str) -> tuple[Status, Optional[float], Optional[str], Optional[str]]:
        """Check backend API health via ``{url}/health``.

        Returns ``(status, latency_ms, message, version)``. The version is
        read from the JSON body of a 200 response when one is present.
        """
        response, latency, error = await self._request(f"{url}/health")
        if response is None:
            return Status.DOWN, None, error, None
        if response.status_code != 200:
            return Status.DEGRADED, latency, f"HTTP {response.status_code}", None
        return Status.OPERATIONAL, latency, None, self._extract_version(response)

    async def check_database(self, backend_url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check the database indirectly through the backend.

        The backend's ``/health`` endpoint is used as a proxy: if the backend
        answers 200 the database is assumed to be up. A dedicated
        ``/health/db`` endpoint could replace this later.
        """
        response, latency, _ = await self._request(f"{backend_url}/health")
        if response is None:
            return Status.DOWN, None, "Cannot reach backend"
        if response.status_code == 200:
            return Status.OPERATIONAL, latency, None
        return Status.DOWN, latency, "Backend reports unhealthy"

    async def check_frontend(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check frontend availability by requesting *url* itself."""
        return await self._probe(url)

    async def check_bot(self, url: str) -> tuple[Status, Optional[float], Optional[str]]:
        """Check Telegram bot health via ``{url}/health``."""
        return await self._probe(f"{url}/health")

    def _record_result(self, name: str, result, checked_at: datetime) -> None:
        """Store one check outcome on the service called *name*.

        *result* is the tuple returned by the matching ``check_*`` method, or
        the exception raised by it. A raised exception is recorded as DOWN
        rather than ignored, so a stale status is never shown as current.
        """
        if isinstance(result, BaseException):
            failure = f"Check failed: {result!r}"[: self.MAX_MESSAGE_LENGTH]
            result = (Status.DOWN, None, failure, None)
        status, latency, message, *rest = result

        svc = self.services[name]
        incident_ongoing = svc.status in (Status.DEGRADED, Status.DOWN)

        svc.status = status
        svc.latency_ms = latency
        svc.message = message
        if rest:
            # Only the backend check reports a version
            svc.version = rest[0]
        svc.last_check = checked_at
        svc.update_uptime(status == Status.OPERATIONAL)

        # Stamp the start of an incident only. Consecutive degraded or down
        # results belong to the incident that is already recorded.
        if status != Status.OPERATIONAL and not incident_ongoing:
            svc.last_incident = checked_at

    async def check_all_services(self, backend_url: str, frontend_url: str, bot_url: str):
        """Check all services concurrently and record the outcomes.

        Every service gets the same ``last_check`` timestamp, taken when the
        cycle starts.
        """
        now = datetime.now()

        results = await asyncio.gather(
            self.check_backend(backend_url),
            self.check_database(backend_url),
            self.check_frontend(frontend_url),
            self.check_bot(bot_url),
            return_exceptions=True,
        )

        # Order matches the gather() call above
        names = ("backend", "database", "frontend", "bot")
        for name, result in zip(names, results):
            self._record_result(name, result, now)

        self.last_check = now

    def get_all_statuses(self) -> dict[str, ServiceStatus]:
        """Return the mapping of service name to its current status."""
        return self.services

    def get_overall_status(self) -> Status:
        """Get overall system status based on all services.

        All operational gives OPERATIONAL. Otherwise any DOWN gives DOWN, then
        any DEGRADED gives DEGRADED, and anything else is UNKNOWN.
        """
        statuses = [svc.status for svc in self.services.values()]

        if all(s == Status.OPERATIONAL for s in statuses):
            return Status.OPERATIONAL
        for severity in (Status.DOWN, Status.DEGRADED):
            if severity in statuses:
                return severity
        return Status.UNKNOWN
+
+

System Status

+
+ + {% if overall_status.value == 'operational' %} + All Systems Operational + {% elif overall_status.value == 'degraded' %} + Partial System Outage + {% elif overall_status.value == 'down' %} + Major System Outage + {% else %} + Status Unknown + {% endif %} +
+

+ {% if last_check %} + Last updated: {{ last_check.strftime('%d.%m.%Y %H:%M:%S') }} + {% else %} + Checking services... + {% endif %} + • Auto-refresh every {{ check_interval }}s +

+
+ +
+ {% for name, service in services.items() %} +
+
+ {{ service.display_name }} + + + {% if service.status.value == 'operational' %} + Operational + {% elif service.status.value == 'degraded' %} + Degraded + {% elif service.status.value == 'down' %} + Down + {% else %} + Unknown + {% endif %} + +
+
+
+
Latency
+
+ {% if service.latency_ms %} + {{ "%.0f"|format(service.latency_ms) }} ms + {% else %} + — + {% endif %} +
+
+
+
Uptime
+
+ {{ "%.1f"|format(service.uptime_percent) }}% +
+
+ {% if service.version %} +
+
Version
+
{{ service.version }}
+
+ {% endif %} + {% if service.last_incident %} +
+
Last Incident
+
{{ service.last_incident.strftime('%d.%m %H:%M') }}
+
+ {% endif %} +
+ {% if service.message %} +
{{ service.message }}
+ {% endif %} +
+ {% endfor %} +
+ +
+ +
+ + +
+ + + +