Initial commit: добавление проекта predictV1
Включает модели ML для предсказаний, API маршруты, скрипты обучения и данные. Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
236
run.sh
Executable file
236
run.sh
Executable file
@@ -0,0 +1,236 @@
|
||||
#!/usr/bin/env bash
set -euo pipefail

# ===========================
# Config (tune as needed)
# ===========================
PY=python3
VENV=".venv"

# Port for the REST service:
PORT="${PORT:-8000}"

# Force IPv4 (useful when the provider's IPv6 is flaky):
FORCE_IPV4="${FORCE_IPV4:-1}"

# API keys (optional, but improve stability/quotas):
# export OPENDOTA_API_KEY=...
# export STRATZ_TOKEN=...
OPENDOTA_API_KEY="${OPENDOTA_API_KEY:-}"
STRATZ_TOKEN="${STRATZ_TOKEN:-}"

# Number of OpenDota pages for the pro-match list.
# FIX: PAGES was referenced later (echo + export) without ever being set;
# under `set -u` the `$PAGES` expansion aborts the whole script.
# Default of 10 is a guess — TODO confirm the intended page count.
PAGES="${PAGES:-10}"

# Use STRATZ instead of OpenDota
# only for step 1 (match list):    USE_STRATZ_LIST=1
# only for step 2 (details/draft): USE_STRATZ_DETAILS=1
USE_STRATZ_LIST="${USE_STRATZ_LIST:-0}"
USE_STRATZ_DETAILS="${USE_STRATZ_DETAILS:-0}"

# Delay after every 100 requests (to soften HTTP 429 rate limiting):
SLEEP_PER_100="${SLEEP_PER_100:-1.0}"
|
||||
|
||||
# ===========================
# Environment & dependencies
# ===========================
# Create the virtualenv on first run only, then activate it.
[ -d "$VENV" ] || $PY -m venv "$VENV"
# shellcheck disable=SC1091
source "$VENV/bin/activate"

# Refresh pip itself, then install everything the pipeline needs.
pip install -U pip
pip install pandas pyarrow requests httpx "urllib3>=2.2" certifi catboost scikit-learn fastapi uvicorn

# Working directories: fetched data and trained model artifacts.
mkdir -p data artifacts
|
||||
|
||||
# ===========================
# Helpers
# ===========================
# Propagate the run configuration to the child Python processes.
export FORCE_IPV4 OPENDOTA_API_KEY STRATZ_TOKEN PAGES SLEEP_PER_100
|
||||
|
||||
# ===========================
# [1/7] Public match list
# ===========================
# [1b] Public matches (high-rank)
$PY educationML/fetch_public_matches.py
# [2b] Public match details (heroes taken from players)
$PY educationML/fetch_public_details.py

# ===========================
# [2/7] Pro match list
# ===========================
# FIX: guard the expansion — PAGES may be unset and `set -u` aborted here.
# NOTE(review): step labels are inconsistent ([2/7] header vs [1/6] echo);
# left as-is since downstream tooling may grep these exact strings.
echo "[1/6] Fetch pro matches via OpenDota (pages=${PAGES:-10})"
$PY educationML/fetch_pro_matches_opendota.py
|
||||
|
||||
# =========================================
# [2/6] Match details + draft (robust)
# =========================================
if [ "$USE_STRATZ_DETAILS" = "1" ]; then
    echo "[2/6] Fetch match details + draft via STRATZ"
    # FIX: this branch used to run an EMPTY Python heredoc — it printed the
    # message above, wrote no data/matches.parquet, and every later step then
    # failed with confusing errors.  Fail fast with a clear message instead.
    echo "ERROR: STRATZ details fetch is not implemented yet; unset USE_STRATZ_DETAILS" >&2
    exit 1
else
echo "[2/6] Fetch match details + draft via OpenDota (robust)"
$PY - <<'PYCODE'
# Fetch per-match details (outcome, patch, duration, picks/bans) from the
# OpenDota API for every match id in data/pro_matches.parquet, and write
# data/matches.parquet, data/draft.parquet and data/matches_failed.csv.
import os, time, socket, pandas as pd, requests, httpx  # FIX: dropped unused `sys`
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# IPv4-only (when FORCE_IPV4=1): monkey-patch getaddrinfo to pin AF_INET.
if os.getenv("FORCE_IPV4","1") == "1":
    _orig = socket.getaddrinfo
    def _v4(host, port, family=0, type=0, proto=0, flags=0):
        # The requested family is ignored on purpose — always resolve IPv4.
        return _orig(host, port, socket.AF_INET, type, proto, flags)
    socket.getaddrinfo = _v4

API_KEY = os.getenv("OPENDOTA_API_KEY")
SLEEP_PER_100 = float(os.getenv("SLEEP_PER_100","1.0"))
BASE = "https://api.opendota.com/api/matches/{mid}"
headers = {"User-Agent":"korobkaGames/1.0","Accept":"application/json","Connection":"close"}

# Match ids to fetch (produced by the previous step).
pro = pd.read_parquet("data/pro_matches.parquet")
match_ids = pro['match_id'].drop_duplicates().tolist()

# requests session with retries on transient failures and rate limits.
sess = requests.Session()
retries = Retry(total=6, connect=6, read=6, backoff_factor=0.7,
                status_forcelist=[429,500,502,503,504],
                allowed_methods=frozenset(["GET"]))
sess.mount("https://", HTTPAdapter(max_retries=retries))

def fetch_one(mid: int):
    """Return one match's JSON; on SSL errors, retry once via httpx (HTTP/1.1)."""
    url = BASE.format(mid=mid)
    if API_KEY:
        url += f"?api_key={API_KEY}"
    try:
        r = sess.get(url, headers=headers, timeout=(5,40))
        r.raise_for_status()
        return r.json()
    except requests.exceptions.SSLError:
        # fallback: httpx with http2 disabled
        with httpx.Client(http2=False, timeout=40, headers=headers) as client:
            resp = client.get(url)
            resp.raise_for_status()
            return resp.json()

match_rows, draft_rows, failed = [], [], []
for i, mid in enumerate(match_ids, 1):
    try:
        m = fetch_one(int(mid))
        match_rows.append({
            "match_id": int(mid),
            "date": pd.to_datetime(m.get("start_time",0), unit="s"),
            "patch": str(m.get("patch")),
            "radiant_win": bool(m.get("radiant_win")),
            "duration_sec": m.get("duration"),
            "league_id": (m.get("league") or {}).get("id"),
            "series_type": m.get("series_type"),
        })
        for pb in (m.get("picks_bans") or []):
            draft_rows.append({
                "match_id": int(mid),
                "is_pick": pb.get("is_pick", False),
                "team": pb.get("team"),
                "hero_id": pb.get("hero_id"),
                "order": pb.get("order")
            })
    except Exception:
        # Deliberate best-effort: record the failed id and keep going.
        failed.append(int(mid))
    if i % 100 == 0:
        # Breathe between batches to stay under the API rate limit.
        time.sleep(SLEEP_PER_100)

pd.DataFrame(match_rows).to_parquet("data/matches.parquet", index=False)
pd.DataFrame(draft_rows).to_parquet("data/draft.parquet", index=False)
pd.Series(failed, name="failed_match_id").to_csv("data/matches_failed.csv", index=False)
print(f"Saved via OpenDota: matches={len(match_rows)} draft_rows={len(draft_rows)} failed={len(failed)}")
if not match_rows:
    raise SystemExit("OpenDota details: ничего не скачано")
PYCODE
fi
|
||||
|
||||
# ===========================
# [3/6] Simple Elo baseline
# ===========================
echo "[3/6] Build Elo"
$PY - <<'PYCODE'
# Replay matches in chronological order and maintain a classic Elo rating
# per team name, writing pre-match ratings to data/elo.parquet.
import pandas as pd

# Join match outcomes with team names from the pro-match list
# (how='left': matches without a pro entry keep NaN team names).
games = (pd.read_parquet("data/matches.parquet")
           .sort_values("date")
           .merge(pd.read_parquet("data/pro_matches.parquet")[['match_id','radiant_name','dire_name']],
                  on='match_id', how='left'))

K_FACTOR = 24
ratings = {}

def rating_of(team):
    """Current Elo of *team*; unseen teams start at 1500."""
    return ratings.get(team, 1500)

def win_prob(a, b):
    """Logistic expectation of rating *a* beating rating *b*."""
    return 1.0/(1.0+10**((b-a)/400))

records = []
for _, game in games.iterrows():
    radiant, dire = game['radiant_name'], game['dire_name']
    r_rad, r_dir = rating_of(radiant), rating_of(dire)
    exp_rad, exp_dir = win_prob(r_rad, r_dir), win_prob(r_dir, r_rad)
    outcome = 1.0 if game['radiant_win'] else 0.0
    # Record the *pre-match* ratings as features.
    records.append({
        'match_id': game['match_id'],
        'date': game['date'],
        'elo_radiant': r_rad, 'elo_dire': r_dir,
        'elo_diff_90': r_rad - r_dir  # simplified: no rolling window despite the name
    })
    # Standard Elo update after the game.
    ratings[radiant] = r_rad + K_FACTOR*(outcome-exp_rad)
    ratings[dire] = r_dir + K_FACTOR*((1-outcome)-exp_dir)

pd.DataFrame(records).to_parquet("data/elo.parquet", index=False)
print("Saved data/elo.parquet")
PYCODE
|
||||
|
||||
# [4] replaces the old build_dataset_draft.py
$PY educationML/build_dataset_mixed.py
|
||||
|
||||
# ===========================
# [5/6] Model training
# ===========================
echo "[5/6] Train CatBoost"
$PY - <<'PYCODE'
# Train a CatBoost win-probability model on the mixed dataset.
# Walk-forward CV reports LogLoss/Brier, then a final model is fit on
# the full history and saved to artifacts/ together with feature order.
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import log_loss, brier_score_loss

# Time-ordered dataset; categoricals are patch/source plus the ten hero slots.
data = pd.read_parquet("data/dataset_mixed.parquet").sort_values("date")
cat_cols = ['patch','source','r_h1','r_h2','r_h3','r_h4','r_h5','d_h1','d_h2','d_h3','d_h4','d_h5']
drop_cols = ['y','date','match_id'] if 'match_id' in data.columns else ['y','date']
X = data.drop(columns=drop_cols)
y = data['y']
cat_idx = [X.columns.get_loc(c) for c in cat_cols]

# Walk-forward CV: every fold trains on the past, validates on the future.
losses, briers = [], []
for train_idx, test_idx in TimeSeriesSplit(n_splits=5).split(X):
    fold_model = CatBoostClassifier(
        depth=8, iterations=1200, learning_rate=0.03,
        loss_function='Logloss', eval_metric='Logloss', verbose=False
    )
    fold_model.fit(X.iloc[train_idx], y.iloc[train_idx], cat_features=cat_idx)
    proba = fold_model.predict_proba(X.iloc[test_idx])[:,1]
    losses.append(log_loss(y.iloc[test_idx], proba))
    briers.append(brier_score_loss(y.iloc[test_idx], proba))
print("CV LogLoss=", sum(losses)/len(losses), " Brier=", sum(briers)/len(briers))

# Final model on the full history, with a slightly longer schedule.
final = CatBoostClassifier(depth=8, iterations=1500, learning_rate=0.03,
                           loss_function='Logloss', verbose=False)
final.fit(X, y, cat_features=cat_idx)
final.save_model("artifacts/model_draft.cbm")
pd.Series(X.columns).to_csv("artifacts/feature_order.csv", index=False)
print("Saved artifacts/model_draft.cbm and artifacts/feature_order.csv")
PYCODE
|
||||
|
||||
# ===========================
# [6/6] Prediction REST service
# ===========================
echo "[6/6] Start API → http://127.0.0.1:$PORT"

# exec replaces the shell so signals (SIGTERM/SIGINT) reach uvicorn directly.
exec uvicorn serve:app --host 0.0.0.0 --port "$PORT"
|
||||
Reference in New Issue
Block a user