Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
237 lines
8.0 KiB
Bash
Executable File
#!/usr/bin/env bash
set -euo pipefail

# ===========================
# Config (tweak as desired)
# ===========================
PY=python3
VENV=".venv"

# Port for the REST service:
PORT="${PORT:-8000}"

# Force IPv4 (useful when the provider's IPv6 is flaky):
FORCE_IPV4="${FORCE_IPV4:-1}"

# API keys (optional, but improve stability/quotas):
# export OPENDOTA_API_KEY=...
# export STRATZ_TOKEN=...
OPENDOTA_API_KEY="${OPENDOTA_API_KEY:-}"
STRATZ_TOKEN="${STRATZ_TOKEN:-}"

# Use STRATZ instead of OpenDota
# only for step 1 (match list): USE_STRATZ_LIST=1
# only for step 2 (details/draft): USE_STRATZ_DETAILS=1
USE_STRATZ_LIST="${USE_STRATZ_LIST:-0}"
USE_STRATZ_DETAILS="${USE_STRATZ_DETAILS:-0}"

# Number of OpenDota pages to fetch for the pro-match list.
# FIX: PAGES was exported and interpolated later in the script but never
# assigned, which aborts under `set -u` ("PAGES: unbound variable") unless
# the caller happened to export it. Give it a default like the other knobs.
PAGES="${PAGES:-10}"

# Pause after every 100 requests (to soften 429 rate limits):
SLEEP_PER_100="${SLEEP_PER_100:-1.0}"
# ===========================
# Environment and dependencies
# ===========================
# Create the virtualenv on first run, then activate it.
[ -d "$VENV" ] || $PY -m venv "$VENV"
# shellcheck disable=SC1091
source "$VENV/bin/activate"

pip install -U pip
pip install pandas pyarrow requests httpx "urllib3>=2.2" certifi catboost scikit-learn fastapi uvicorn

mkdir -p data artifacts

# ===========================
# Helpers
# ===========================
# Make the configuration visible to the child Python processes spawned below.
export FORCE_IPV4 OPENDOTA_API_KEY STRATZ_TOKEN PAGES SLEEP_PER_100
# ===========================
# [1/7] List of pub matches
# ===========================
# [1b] Public (high-rank) matches
$PY educationML/fetch_public_matches.py
# [2b] Public match details (heroes taken from players)
$PY educationML/fetch_public_details.py

# ===========================
# [2/7] List of pro matches
# ===========================
# NOTE(review): the echoed "[1/6]" disagrees with the "[2/7]" header above —
# step numbering is inconsistent throughout the script; confirm intended order.
# PAGES is read by the fetch script from the environment (exported above).
echo "[1/6] Fetch pro matches via OpenDota (pages=$PAGES)"
$PY educationML/fetch_pro_matches_opendota.py
# =========================================
# [2/6] Match details + draft (robust)
# =========================================
if [ "$USE_STRATZ_DETAILS" = "1" ]; then
echo "[2/6] Fetch match details + draft via STRATZ"
$PY - <<'PYCODE'
# NOTE(review): this STRATZ branch is an empty stub — it fetches nothing, so
# the later steps that read data/matches.parquet will fail. TODO: implement
# the STRATZ details fetch or fail loudly here.

PYCODE

else
echo "[2/6] Fetch match details + draft via OpenDota (robust)"
$PY - <<'PYCODE'
import os, time, socket, sys, pandas as pd, requests, httpx
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Force IPv4-only DNS resolution (when FORCE_IPV4=1) by monkey-patching
# socket.getaddrinfo to always request AF_INET.
if os.getenv("FORCE_IPV4","1") == "1":
    _orig = socket.getaddrinfo
    def _v4(host, port, family=0, type=0, proto=0, flags=0):
        return _orig(host, port, socket.AF_INET, type, proto, flags)
    socket.getaddrinfo = _v4

API_KEY = os.getenv("OPENDOTA_API_KEY")
SLEEP_PER_100 = float(os.getenv("SLEEP_PER_100","1.0"))
BASE = "https://api.opendota.com/api/matches/{mid}"
headers = {"User-Agent":"korobkaGames/1.0","Accept":"application/json","Connection":"close"}

# match_id list produced by the previous step.
pro = pd.read_parquet("data/pro_matches.parquet")
match_ids = pro['match_id'].drop_duplicates().tolist()

# requests session with automatic retries on transient failures and 429s.
sess = requests.Session()
retries = Retry(total=6, connect=6, read=6, backoff_factor=0.7,
                status_forcelist=[429,500,502,503,504],
                allowed_methods=frozenset(["GET"]))
sess.mount("https://", HTTPAdapter(max_retries=retries))

def fetch_one(mid: int):
    """Fetch one match JSON from OpenDota; fall back to httpx on SSL errors."""
    url = BASE.format(mid=mid)
    if API_KEY:
        url += f"?api_key={API_KEY}"
    try:
        r = sess.get(url, headers=headers, timeout=(5,40))
        r.raise_for_status()
        return r.json()
    except requests.exceptions.SSLError:
        # fallback: httpx with HTTP/2 disabled
        with httpx.Client(http2=False, timeout=40, headers=headers) as client:
            resp = client.get(url)
            resp.raise_for_status()
            return resp.json()

match_rows, draft_rows, failed = [], [], []
for i, mid in enumerate(match_ids, 1):
    try:
        m = fetch_one(int(mid))
        match_rows.append({
            "match_id": int(mid),
            "date": pd.to_datetime(m.get("start_time",0), unit="s"),
            "patch": str(m.get("patch")),
            "radiant_win": bool(m.get("radiant_win")),
            "duration_sec": m.get("duration"),
            "league_id": (m.get("league") or {}).get("id"),
            "series_type": m.get("series_type"),
        })
        # One row per pick/ban event of the draft phase.
        for pb in (m.get("picks_bans") or []):
            draft_rows.append({
                "match_id": int(mid),
                "is_pick": pb.get("is_pick", False),
                "team": pb.get("team"),
                "hero_id": pb.get("hero_id"),
                "order": pb.get("order")
            })
    except Exception:
        # Best-effort by design: record the id and keep going; the failed
        # ids are persisted to matches_failed.csv below.
        failed.append(int(mid))
    # Throttle: brief pause after every 100 requests to soften rate limits.
    if i % 100 == 0:
        time.sleep(SLEEP_PER_100)

pd.DataFrame(match_rows).to_parquet("data/matches.parquet", index=False)
pd.DataFrame(draft_rows).to_parquet("data/draft.parquet", index=False)
pd.Series(failed, name="failed_match_id").to_csv("data/matches_failed.csv", index=False)
print(f"Saved via OpenDota: matches={len(match_rows)} draft_rows={len(draft_rows)} failed={len(failed)}")
# Abort the pipeline if nothing was downloaded at all.
if not match_rows:
    raise SystemExit("OpenDota details: ничего не скачано")
PYCODE
fi
# ===========================
# [3/6] Simple Elo baseline
# ===========================
echo "[3/6] Build Elo"
$PY - <<'PYCODE'
import pandas as pd

# Join chronological match outcomes with team names; unrated teams start at 1500.
games = pd.read_parquet("data/matches.parquet").sort_values("date")
names = pd.read_parquet("data/pro_matches.parquet")[['match_id','radiant_name','dire_name']]
merged = games.merge(names, on='match_id', how='left')

K_FACTOR = 24
ratings = {}

def win_prob(own, other):
    # Standard logistic Elo expectation.
    return 1.0/(1.0+10**((other-own)/400))

records = []
for _, game in merged.iterrows():
    rad, dire = game['radiant_name'], game['dire_name']
    rad_elo = ratings.get(rad, 1500)
    dire_elo = ratings.get(dire, 1500)
    exp_rad = win_prob(rad_elo, dire_elo)
    exp_dire = win_prob(dire_elo, rad_elo)
    outcome = 1.0 if game['radiant_win'] else 0.0
    # Record pre-game ratings as features for this match.
    records.append({
        'match_id': game['match_id'],
        'date': game['date'],
        'elo_radiant': rad_elo, 'elo_dire': dire_elo,
        'elo_diff_90': rad_elo - dire_elo  # simplified, no rolling window
    })
    # Post-game rating update for both teams.
    ratings[rad] = rad_elo + K_FACTOR*(outcome-exp_rad)
    ratings[dire] = dire_elo + K_FACTOR*((1-outcome)-exp_dire)

pd.DataFrame(records).to_parquet("data/elo.parquet", index=False)
print("Saved data/elo.parquet")
PYCODE
# [4] Build the mixed training dataset (replaces the old build_dataset_draft.py)
$PY educationML/build_dataset_mixed.py
# ===========================
# [5/6] Model training
# ===========================
echo "[5/6] Train CatBoost"
$PY - <<'PYCODE'
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss, brier_score_loss

# Dataset is sorted by date so TimeSeriesSplit respects chronology.
data = pd.read_parquet("data/dataset_mixed.parquet").sort_values("date")
cat_cols = ['patch','source','r_h1','r_h2','r_h3','r_h4','r_h5','d_h1','d_h2','d_h3','d_h4','d_h5']
drop_cols = ['y','date'] + (['match_id'] if 'match_id' in data.columns else [])
features = data.drop(columns=drop_cols)
target = data['y']
cat_idx = [features.columns.get_loc(col) for col in cat_cols]

# Walk-forward cross-validation over 5 chronological splits.
splitter = TimeSeriesSplit(n_splits=5)
logloss_scores, brier_scores = [], []
for train_idx, test_idx in splitter.split(features):
    fold_model = CatBoostClassifier(
        depth=8, iterations=1200, learning_rate=0.03,
        loss_function='Logloss', eval_metric='Logloss', verbose=False
    )
    fold_model.fit(features.iloc[train_idx], target.iloc[train_idx], cat_features=cat_idx)
    proba = fold_model.predict_proba(features.iloc[test_idx])[:,1]
    logloss_scores.append(log_loss(target.iloc[test_idx], proba))
    brier_scores.append(brier_score_loss(target.iloc[test_idx], proba))
print("CV LogLoss=", sum(logloss_scores)/len(logloss_scores), " Brier=", sum(brier_scores)/len(brier_scores))

# Refit on the full history with a few more iterations, then persist the
# model plus the exact feature order the serving side must reproduce.
final = CatBoostClassifier(depth=8, iterations=1500, learning_rate=0.03,
                           loss_function='Logloss', verbose=False)
final.fit(features, target, cat_features=cat_idx)
final.save_model("artifacts/model_draft.cbm")
pd.Series(features.columns).to_csv("artifacts/feature_order.csv", index=False)
print("Saved artifacts/model_draft.cbm and artifacts/feature_order.csv")
PYCODE
# ===========================
# [6/6] Prediction REST service
# ===========================
# NOTE(review): the server binds 0.0.0.0 (all interfaces) although the echoed
# URL says 127.0.0.1 — confirm the intended network exposure.
echo "[6/6] Start API → http://127.0.0.1:$PORT"

# exec replaces the shell so uvicorn becomes PID of this process and
# receives signals (Ctrl-C, SIGTERM) directly. Expects serve.py to define `app`.
exec uvicorn serve:app --host 0.0.0.0 --port "$PORT"