Files
predictV1/run.sh
mamonov.ep 8a134239d7 Initial commit: добавление проекта predictV1
Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2026-02-21 17:22:58 +03:00

237 lines
8.0 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env bash
set -euo pipefail
# ===========================
# Config (override via environment)
# ===========================
PY=python3
VENV=".venv"
# REST service port:
PORT="${PORT:-8000}"
# Force IPv4 (useful when the provider's IPv6 is flaky):
FORCE_IPV4="${FORCE_IPV4:-1}"
# API keys (optional, but improve stability/quotas):
# export OPENDOTA_API_KEY=...
# export STRATZ_TOKEN=...
OPENDOTA_API_KEY="${OPENDOTA_API_KEY:-}"
STRATZ_TOKEN="${STRATZ_TOKEN:-}"
# Use STRATZ instead of OpenDota
#   for step 1 only (match list):    USE_STRATZ_LIST=1
#   for step 2 only (details/draft): USE_STRATZ_DETAILS=1
USE_STRATZ_LIST="${USE_STRATZ_LIST:-0}"
USE_STRATZ_DETAILS="${USE_STRATZ_DETAILS:-0}"
# Pages of pro matches to fetch.
# FIX: PAGES was exported below but never assigned; referencing $PAGES later
# aborted the script under `set -u` ("PAGES: unbound variable").
PAGES="${PAGES:-10}"
# Pause after every 100 requests (softens HTTP 429 rate limiting):
SLEEP_PER_100="${SLEEP_PER_100:-1.0}"
# ===========================
# Environment & dependencies
# ===========================
if [ ! -d "$VENV" ]; then
  "$PY" -m venv "$VENV"
fi
# shellcheck disable=SC1091
source "$VENV/bin/activate"
pip install -U pip
pip install pandas pyarrow requests httpx "urllib3>=2.2" certifi catboost scikit-learn fastapi uvicorn
mkdir -p data artifacts
# ===========================
# Helpers: variables consumed by the child python/fetch scripts below
# ===========================
export FORCE_IPV4
export OPENDOTA_API_KEY
export STRATZ_TOKEN
export PAGES
export SLEEP_PER_100
# ===========================
# [1/6] Public (pub) match list
# ===========================
# [1a] Public matches (high-rank)
"$PY" educationML/fetch_public_matches.py
# [1b] Public match details (heroes taken from the players array)
"$PY" educationML/fetch_public_details.py
# ===========================
# [1/6] Pro match list
# ===========================
# FIX: plain $PAGES aborted under `set -u` when PAGES was not exported by the
# caller (it is never assigned in this script); default to a visible marker.
echo "[1/6] Fetch pro matches via OpenDota (pages=${PAGES:-unset})"
"$PY" educationML/fetch_pro_matches_opendota.py
# =========================================
# [2/6] Match details + draft (robust)
# =========================================
if [ "$USE_STRATZ_DETAILS" = "1" ]; then
  echo "[2/6] Fetch match details + draft via STRATZ"
  # FIX: this branch previously piped an EMPTY here-doc into python — a silent
  # no-op that left data/matches.parquet missing and made later steps crash
  # with a confusing error. Fail fast with an explicit message instead.
  echo "ERROR: STRATZ details fetch is not implemented; rerun with USE_STRATZ_DETAILS=0" >&2
  exit 1
else
  echo "[2/6] Fetch match details + draft via OpenDota (robust)"
  "$PY" - <<'PYCODE'
import os, time, socket, sys, pandas as pd, requests, httpx
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Fetch full match details (+ pick/ban draft) for every pro match id collected
# in step 1, with retries and an httpx fallback for flaky TLS connections.
# Outputs: data/matches.parquet, data/draft.parquet, data/matches_failed.csv

# IPv4-only resolution (when FORCE_IPV4=1): monkey-patch getaddrinfo so all
# sockets resolve AF_INET addresses only.
if os.getenv("FORCE_IPV4","1") == "1":
    _orig = socket.getaddrinfo
    def _v4(host, port, family=0, type=0, proto=0, flags=0):
        return _orig(host, port, socket.AF_INET, type, proto, flags)
    socket.getaddrinfo = _v4

API_KEY = os.getenv("OPENDOTA_API_KEY")
SLEEP_PER_100 = float(os.getenv("SLEEP_PER_100","1.0"))
BASE = "https://api.opendota.com/api/matches/{mid}"
headers = {"User-Agent":"korobkaGames/1.0","Accept":"application/json","Connection":"close"}

# match_id list produced by step 1
pro = pd.read_parquet("data/pro_matches.parquet")
match_ids = pro['match_id'].drop_duplicates().tolist()

# requests session with automatic retries on transient failures / 429s
sess = requests.Session()
retries = Retry(total=6, connect=6, read=6, backoff_factor=0.7,
                status_forcelist=[429,500,502,503,504],
                allowed_methods=frozenset(["GET"]))
sess.mount("https://", HTTPAdapter(max_retries=retries))

def fetch_one(mid: int):
    """GET one match's JSON; on SSLError retry once via httpx (HTTP/2 off)."""
    url = BASE.format(mid=mid)
    if API_KEY:
        url += f"?api_key={API_KEY}"
    try:
        r = sess.get(url, headers=headers, timeout=(5,40))
        r.raise_for_status()
        return r.json()
    except requests.exceptions.SSLError:
        # fallback: some middleboxes break requests' TLS; httpx often succeeds
        with httpx.Client(http2=False, timeout=40, headers=headers) as client:
            resp = client.get(url)
            resp.raise_for_status()
            return resp.json()

match_rows, draft_rows, failed = [], [], []
for i, mid in enumerate(match_ids, 1):
    try:
        m = fetch_one(int(mid))
        match_rows.append({
            "match_id": int(mid),
            "date": pd.to_datetime(m.get("start_time",0), unit="s"),
            "patch": str(m.get("patch")),
            "radiant_win": bool(m.get("radiant_win")),
            "duration_sec": m.get("duration"),
            "league_id": (m.get("league") or {}).get("id"),
            "series_type": m.get("series_type"),
        })
        for pb in (m.get("picks_bans") or []):
            draft_rows.append({
                "match_id": int(mid),
                "is_pick": pb.get("is_pick", False),
                "team": pb.get("team"),
                "hero_id": pb.get("hero_id"),
                "order": pb.get("order")
            })
    except Exception:
        # any failure (retries exhausted, bad JSON, ...) — record and move on
        failed.append(int(mid))
    if i % 100 == 0:
        time.sleep(SLEEP_PER_100)  # soften rate limiting

pd.DataFrame(match_rows).to_parquet("data/matches.parquet", index=False)
pd.DataFrame(draft_rows).to_parquet("data/draft.parquet", index=False)
pd.Series(failed, name="failed_match_id").to_csv("data/matches_failed.csv", index=False)
print(f"Saved via OpenDota: matches={len(match_rows)} draft_rows={len(draft_rows)} failed={len(failed)}")
if not match_rows:
    raise SystemExit("OpenDota details: ничего не скачано")
PYCODE
fi
# ===========================
# [3/6] Simple Elo baseline
# ===========================
echo "[3/6] Build Elo"
$PY - <<'PYCODE'
import pandas as pd

# Walk matches in chronological order and maintain a running Elo rating per
# team name. For each match, the PRE-match ratings of both teams are recorded
# (so the match's own outcome does not leak into its features), then ratings
# are updated from the result.
# Inputs : data/matches.parquet      (match_id, date, radiant_win, ...)
#          data/pro_matches.parquet  (match_id, radiant_name, dire_name)
# Output : data/elo.parquet
matches = pd.read_parquet("data/matches.parquet").sort_values("date")
pro = pd.read_parquet("data/pro_matches.parquet")[['match_id','radiant_name','dire_name']]
df = matches.merge(pro, on='match_id', how='left')
K = 24  # Elo K-factor: maximum rating shift per match
elo = {}
# NOTE(review): teams missing from pro_matches merge as NaN and all share one
# elo entry keyed by NaN — presumably acceptable noise; verify upstream join.
def get_elo(t): return elo.get(t, 1500)  # unseen teams start at 1500
def expect(a,b): return 1.0/(1.0+10**((b-a)/400))  # standard Elo win expectancy
rows=[]
for _, r in df.iterrows():
    A, B = r['radiant_name'], r['dire_name']
    ra, rb = get_elo(A), get_elo(B)
    ea, eb = expect(ra,rb), expect(rb,ra)
    y = 1.0 if r['radiant_win'] else 0.0
    rows.append({
        'match_id': r['match_id'],
        'date': r['date'],
        'elo_radiant': ra, 'elo_dire': rb,
        'elo_diff_90': ra - rb  # simplified: plain difference, no time window
    })
    # update ratings AFTER recording the pre-match snapshot
    elo[A] = ra + K*(y-ea)
    elo[B] = rb + K*((1-y)-eb)
pd.DataFrame(rows).to_parquet("data/elo.parquet", index=False)
print("Saved data/elo.parquet")
PYCODE
# [4] replaces the old build_dataset_draft.py
$PY educationML/build_dataset_mixed.py
# ===========================
# [5/6] Model training
# ===========================
echo "[5/6] Train CatBoost"
$PY - <<'PYCODE'
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss, brier_score_loss

# Train a CatBoost win-probability classifier on the mixed dataset.
# Cross-validates with a time-series split (no future leakage), then refits on
# all data and saves the model plus the exact feature column order the API
# service must reproduce at inference time.
df = pd.read_parquet("data/dataset_mixed.parquet").sort_values("date")
# Categorical features: patch, data source, and the 10 drafted hero slots.
cat_cols = ['patch','source','r_h1','r_h2','r_h3','r_h4','r_h5','d_h1','d_h2','d_h3','d_h4','d_h5']
X = df.drop(columns=['y','date','match_id']) if 'match_id' in df.columns else df.drop(columns=['y','date'])
y = df['y']
cat_idx = [X.columns.get_loc(c) for c in cat_cols]
# Time-ordered CV: each fold trains on the past, evaluates on the future.
tscv = TimeSeriesSplit(n_splits=5)
ll, br = [], []
for tr, te in tscv.split(X):
    model = CatBoostClassifier(
        depth=8, iterations=1200, learning_rate=0.03,
        loss_function='Logloss', eval_metric='Logloss', verbose=False
    )
    model.fit(X.iloc[tr], y.iloc[tr], cat_features=cat_idx)
    p = model.predict_proba(X.iloc[te])[:,1]
    ll.append(log_loss(y.iloc[te], p))
    br.append(brier_score_loss(y.iloc[te], p))
print("CV LogLoss=", sum(ll)/len(ll), " Brier=", sum(br)/len(br))
# Final model: refit on the full dataset with slightly more iterations.
final = CatBoostClassifier(depth=8, iterations=1500, learning_rate=0.03,
                           loss_function='Logloss', verbose=False)
final.fit(X, y, cat_features=cat_idx)
final.save_model("artifacts/model_draft.cbm")
# Persist column order — the serving layer must feed features in this order.
pd.Series(X.columns).to_csv("artifacts/feature_order.csv", index=False)
print("Saved artifacts/model_draft.cbm and artifacts/feature_order.csv")
PYCODE
# ===========================
# [6/6] Prediction REST service
# ===========================
# Announce the local URL, then replace this shell with uvicorn so signals
# (Ctrl-C, systemd stop) reach the server process directly.
printf '[6/6] Start API → http://127.0.0.1:%s\n' "$PORT"
exec uvicorn serve:app --host 0.0.0.0 --port "$PORT"