# Trains a CatBoost win-prediction model on Dota 2 draft data.
# Part of an ML prediction service (models, API routes, training scripts, data).
"""Train a CatBoost classifier on per-pick (long-format) draft data."""

import os

import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

print("Загрузка датасета...")

# One row per match; 'y' is 1 when Radiant won.
df = pd.read_parquet("data/dataset_from_db.parquet")

radiant_wins = df["y"].sum()
radiant_rate = df["y"].mean()

print(f"Всего записей (матчей): {len(df)}")
print(f"Radiant wins: {radiant_wins} ({radiant_rate*100:.1f}%)")
print(f"Dire wins: {len(df) - radiant_wins} ({(1-radiant_rate)*100:.1f}%)")
|
||
# --- Преобразование в long-format ---
|
||
print("\nПреобразование в long-format...")
|
||
|
||
hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
|
||
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]
|
||
pos_cols_r = [f"rp_h{i}" for i in range(1, 6)]
|
||
pos_cols_d = [f"dp_h{i}" for i in range(1, 6)]
|
||
|
||
rows = []
|
||
|
||
for idx, row in df.iterrows():
|
||
match_id = idx
|
||
is_first_pick_radiant = int(row.get("is_first_pick_radiant", 0))
|
||
radiant_win = int(row["y"])
|
||
|
||
# Radiant team (5 героев)
|
||
for i in range(5):
|
||
hero_id = int(row[hero_cols_r[i]])
|
||
position = int(row[pos_cols_r[i]])
|
||
|
||
if hero_id >= 0: # Только валидные герои
|
||
rows.append({
|
||
"match_id": match_id,
|
||
"is_first_pick_radiant": is_first_pick_radiant,
|
||
"team": 0, # Radiant
|
||
"hero_id": hero_id,
|
||
"position": position,
|
||
"radiant_win": radiant_win
|
||
})
|
||
|
||
# Dire team (5 героев)
|
||
for i in range(5):
|
||
hero_id = int(row[hero_cols_d[i]])
|
||
position = int(row[pos_cols_d[i]])
|
||
|
||
if hero_id >= 0: # Только валидные герои
|
||
rows.append({
|
||
"match_id": match_id,
|
||
"is_first_pick_radiant": is_first_pick_radiant,
|
||
"team": 1, # Dire
|
||
"hero_id": hero_id,
|
||
"position": position,
|
||
"radiant_win": radiant_win
|
||
})
|
||
|
||
df_long = pd.DataFrame(rows)
|
||
|
||
print(f"\nLong-format датасет создан:")
|
||
print(f"Всего записей (пиков): {len(df_long)}")
|
||
print(f"Уникальных матчей: {df_long['match_id'].nunique()}")
|
||
print(f"Средних пиков на матч: {len(df_long) / df_long['match_id'].nunique():.1f}")
|
||
|
||
# Целевая переменная
|
||
y = df_long["radiant_win"].astype(int)
|
||
|
||
# Признаки
|
||
feature_cols = ["team", "hero_id", "position"]
|
||
X = df_long[feature_cols].copy()
|
||
|
||
# Убедимся в правильных типах
|
||
X["team"] = X["team"].astype(int)
|
||
X["hero_id"] = X["hero_id"].astype(int)
|
||
X["position"] = X["position"].astype(int)
|
||
|
||
# Разбиение (важно: разбиваем по match_id, чтобы пики одного матча были в одном сплите)
|
||
unique_matches = df_long["match_id"].unique()
|
||
train_matches, test_matches = train_test_split(
|
||
unique_matches,
|
||
test_size=0.1,
|
||
random_state=42
|
||
)
|
||
|
||
train_mask = df_long["match_id"].isin(train_matches)
|
||
test_mask = df_long["match_id"].isin(test_matches)
|
||
|
||
X_train = X[train_mask].reset_index(drop=True)
|
||
y_train = y[train_mask].reset_index(drop=True)
|
||
X_test = X[test_mask].reset_index(drop=True)
|
||
y_test = y[test_mask].reset_index(drop=True)
|
||
|
||
print(f"\nTrain: {len(X_train)} пиков ({len(train_matches)} матчей)")
|
||
print(f"Test: {len(X_test)} пиков ({len(test_matches)} матчей)")
|
||
|
||
# Категориальные признаки
|
||
cat_features = ["team", "hero_id", "position"]
|
||
train_pool = Pool(X_train, y_train, cat_features=cat_features)
|
||
test_pool = Pool(X_test, y_test, cat_features=cat_features)
|
||
|
||
# Модель с более агрессивной регуляризацией для малого датасета
|
||
model = CatBoostClassifier(
|
||
iterations=1000,
|
||
learning_rate=0.1, # Увеличили learning rate
|
||
depth=4, # Уменьшили глубину
|
||
l2_leaf_reg=5, # Увеличили регуляризацию
|
||
min_data_in_leaf=20, # Добавили минимум данных в листе
|
||
bootstrap_type="Bayesian",
|
||
bagging_temperature=0.5, # Уменьшили для меньшего разброса
|
||
loss_function="Logloss",
|
||
eval_metric="AUC",
|
||
random_seed=42,
|
||
verbose=50,
|
||
od_type="Iter",
|
||
od_wait=50, # Уменьшили patience
|
||
use_best_model=True
|
||
)
|
||
|
||
print("\nНачало обучения...")
|
||
model.fit(train_pool, eval_set=test_pool, use_best_model=True)
|
||
|
||
# --- Оценка качества ---
|
||
best_scores = model.get_best_score()
|
||
train_auc_cb = best_scores.get("learn", {}).get("AUC", np.nan)
|
||
test_auc_cb = best_scores.get("validation", {}).get("AUC", np.nan)
|
||
|
||
y_train_proba = model.predict_proba(train_pool)[:, 1]
|
||
y_test_proba = model.predict_proba(test_pool)[:, 1]
|
||
train_auc = roc_auc_score(y_train, y_train_proba)
|
||
test_auc = roc_auc_score(y_test, y_test_proba)
|
||
|
||
print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
|
||
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")
|
||
|
||
# --- Сохранение ---
|
||
os.makedirs("artifacts", exist_ok=True)
|
||
model_path = "artifacts/model_from_db_pro_v3.cbm"
|
||
model.save_model(model_path)
|
||
print(f"\nМодель сохранена: {model_path}")
|
||
|
||
# Порядок фичей
|
||
pd.DataFrame(feature_cols, columns=["feature"]).to_csv(
|
||
"artifacts/feature_order_db.csv", index=False
|
||
)
|
||
print("Порядок фичей сохранен в artifacts/feature_order_db.csv")
|
||
|
||
# Важность признаков
|
||
importance = model.get_feature_importance(train_pool)
|
||
importance_df = (
|
||
pd.DataFrame({"feature": X_train.columns, "importance": importance})
|
||
.sort_values("importance", ascending=False)
|
||
.reset_index(drop=True)
|
||
)
|
||
|
||
print("\nВажность признаков:")
|
||
print(importance_df.to_string(index=False))
|
||
|
||
importance_df.to_csv("artifacts/feature_importance_db.csv", index=False)
|