import pandas as pd import psycopg2 from psycopg2.extras import RealDictCursor def get_db_connection(): return psycopg2.connect( host="localhost", port=5432, database="korobka_db", user="postgres", password="postgres" ) print("Загрузка данных из БД...") conn = get_db_connection() # Загружаем матчи matches_df = pd.read_sql_query(""" SELECT id as match_id, radiant_team_id, dire_team_id, radiant_win FROM matches WHERE "source" = 'pro' ORDER BY id """, conn) print(f"Загружено {len(matches_df)} матчей") # Загружаем детали героев details_df = pd.read_sql_query(""" SELECT match_id, hero_id, team, players_id, pos FROM details_match WHERE "source" = 'pro' ORDER BY match_id """, conn) print(f"Загружено {len(details_df)} записей героев") conn.close() # Формируем слоты героев для каждого матча def slots_from_picks(group): # --- Radiant (team = 0) --- r_heroes = group[group['team'] == 0]['hero_id'].tolist()[:5] r_players = group[group['team'] == 0]['players_id'].tolist()[:5] r_pos = group[group['team'] == 0]['pos'].tolist()[:5] # --- Dire (team = 1) --- d_heroes = group[group['team'] == 1]['hero_id'].tolist()[:5] d_players = group[group['team'] == 1]['players_id'].tolist()[:5] d_pos = group[group['team'] == 1]['pos'].tolist()[:5] row = {} # --- Добавляем 5 слотов для каждой стороны --- for i in range(5): # Герои Radiant / Dire row[f"r_h{i+1}"] = r_heroes[i] if i < len(r_heroes) else -1 row[f"d_h{i+1}"] = d_heroes[i] if i < len(d_heroes) else -1 # Позиции героев row[f"rp_h{i+1}"] = r_pos[i] if i < len(r_pos) else -1 row[f"dp_h{i+1}"] = d_pos[i] if i < len(d_pos) else -1 # Игроки Radiant / Dire row[f"r_p{i+1}"] = r_players[i] if i < len(r_players) else -1 row[f"d_p{i+1}"] = d_players[i] if i < len(d_players) else -1 # Определяем, кто пикал первым (команда 0 = radiant) fp_team = group.iloc[0]['team'] if len(group) > 0 else 0 row["is_first_pick_radiant"] = 1 if fp_team == 0 else 0 return pd.Series(row) slots_df = details_df.groupby("match_id").apply(slots_from_picks).reset_index() # Объединяем с информацией о матчах dataset = matches_df.merge(slots_df, on="match_id", how="inner") # Добавляем целевую переменную dataset['y'] = dataset['radiant_win'].astype(int) # Выбираем нужные колонки в правильном порядке final_df = dataset[['match_id', 'is_first_pick_radiant', 'r_h1', 'r_h2', 'r_h3', 'r_h4', 'r_h5', 'd_h1', 'd_h2', 'd_h3', 'd_h4', 'd_h5', 'r_p1', 'r_p2', 'r_p3', 'r_p4', 'r_p5', 'd_p1', 'd_p2', 'd_p3', 'd_p4', 'd_p5', 'rp_h1', 'rp_h2', 'rp_h3', 'rp_h4', 'rp_h5', 'dp_h1', 'dp_h2', 'dp_h3', 'dp_h4', 'dp_h5', 'y']] # Сохраняем final_df.to_parquet("data/dataset_from_db.parquet", index=False) print(f"Сохранено {len(final_df)} записей в data/dataset_from_db.parquet") print(f"Radiant wins: {final_df['y'].sum()}, Dire wins: {len(final_df) - final_df['y'].sum()}")