"""Train a CatBoost classifier on bag-of-heroes draft features.

Reads ``data/dataset_from_db.parquet``, turns the ten hero-slot columns
(``r_h1..r_h5`` and ``d_h1..d_h5``) into per-hero 0/1 indicator columns for
each team, fits a CatBoost model on the ``y`` target, and writes the model,
the feature order and the feature importances under ``artifacts/``.
"""

import os

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

print("Загрузка датасета...")
df = pd.read_parquet("data/dataset_from_db.parquet")

# Class balance of the target; the labels printed below treat y == 1 as a
# Radiant win.
n_rows = len(df)
radiant_wins = df["y"].sum()
radiant_rate = df["y"].mean()
print(f"Всего записей: {n_rows}")
print(f"Radiant wins: {radiant_wins} ({radiant_rate*100:.1f}%)")
print(f"Dire wins: {n_rows - radiant_wins} ({(1-radiant_rate)*100:.1f}%)")

# --- Bag-of-heroes approach ---
# Build one binary feature per hero per team.

# Collect every distinct hero ID that occurs in any of the ten slot columns.
hero_cols_r = [f"r_h{i}" for i in range(1, 6)]
hero_cols_d = [f"d_h{i}" for i in range(1, 6)]

hero_id_pool = set()
for col in hero_cols_r + hero_cols_d:
    hero_id_pool.update(df[col].dropna().unique())
# Missing values are already dropped above; negative IDs are excluded here.
all_hero_ids = sorted([int(h) for h in hero_id_pool if h >= 0])

print(f"\nВсего уникальных героев: {len(all_hero_ids)}")

# Assemble all feature columns first and create the DataFrame in one call.
# Inserting hundreds of columns one at a time into an existing frame
# fragments it (pandas emits a PerformanceWarning) and is much slower.
radiant_slots = df[hero_cols_r]
dire_slots = df[hero_cols_d]

feature_data = {
    "is_first_pick_radiant": df["is_first_pick_radiant"].astype(int).to_numpy(),
}
# Two indicators per hero, interleaved as radiant_hero_{id}, dire_hero_{id};
# this column order is what gets written to the feature-order file below.
for hero_id in all_hero_ids:
    feature_data[f"radiant_hero_{hero_id}"] = (
        radiant_slots.eq(hero_id).any(axis=1).astype(np.int64).to_numpy()
    )
    feature_data[f"dire_hero_{hero_id}"] = (
        dire_slots.eq(hero_id).any(axis=1).astype(np.int64).to_numpy()
    )

# Plain arrays plus an explicit index: no label alignment takes place, and
# the rows keep the same index as the source frame.
X = pd.DataFrame(feature_data, index=df.index)

print(f"Количество признаков: {len(X.columns)}")
print(" - is_first_pick_radiant: 1")
print(f" - radiant_hero_*: {len(all_hero_ids)}")
print(f" - dire_hero_*: {len(all_hero_ids)}")

# Target variable
y = df["y"].astype(int).copy()

# Stratified 80/20 train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nTrain: {len(X_train)} записей")
print(f"Test: {len(X_test)} записей")

# In bag-of-heroes every feature is numeric (0 or 1); there are no
# categorical features to declare.
train_pool = Pool(X_train, y_train)
test_pool = Pool(X_test, y_test)

# Model
model = CatBoostClassifier(
    iterations=2500,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=2,
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=42,
    verbose=100,
    od_type="Iter",
    od_wait=200,
)

print("\nНачало обучения...")
# NOTE(review): the test split doubles as the eval set for the overfitting
# detector and best-iteration selection, so the test AUC reported below is
# likely optimistic. Consider a separate validation split.
model.fit(train_pool, eval_set=test_pool, use_best_model=True)

# --- Quality evaluation ---
best_scores = model.get_best_score()
train_auc_cb = best_scores.get("learn", {}).get("AUC", np.nan)
test_auc_cb = best_scores.get("validation", {}).get("AUC", np.nan)

# Recompute AUC with scikit-learn from the predicted positive-class
# probabilities, alongside the scores CatBoost reports.
y_train_proba = model.predict_proba(train_pool)[:, 1]
y_test_proba = model.predict_proba(test_pool)[:, 1]

train_auc = roc_auc_score(y_train, y_train_proba)
test_auc = roc_auc_score(y_test, y_test_proba)

print(f"\nCatBoost best AUC (learn/valid): {train_auc_cb:.4f} / {test_auc_cb:.4f}")
print(f"Recomputed AUC (train/test): {train_auc:.4f} / {test_auc:.4f}")

# --- Saving ---
os.makedirs("artifacts", exist_ok=True)

model_path = "artifacts/model_bag_of_heroes.cbm"
model.save_model(model_path)
print(f"\nМодель сохранена: {model_path}")

# Feature order, saved next to the model
feature_cols = list(X.columns)
pd.DataFrame(feature_cols, columns=["feature"]).to_csv(
    "artifacts/feature_order_bag_of_heroes.csv", index=False
)
print("Порядок фичей сохранен в artifacts/feature_order_bag_of_heroes.csv")

# Feature importance: print the top 30, save the full table
importance = model.get_feature_importance(train_pool)
importance_df = (
    pd.DataFrame({"feature": X_train.columns, "importance": importance})
    .sort_values("importance", ascending=False)
    .reset_index(drop=True)
)

print("\nВажность признаков (top 30):")
print(importance_df.head(30).to_string(index=False))

importance_df.to_csv("artifacts/feature_importance_bag_of_heroes.csv", index=False)