import psycopg2 import pandas as pd import numpy as np print("Подключение к базе данных...") conn = psycopg2.connect( host="localhost", port=5432, database="korobka_db", user="postgres", password="postgres" ) print("Загрузка матчей с известными игроками...") # Получаем все матчи где есть хотя бы один известный игрок query = """ SELECT m.id as match_id, m.radiant_win, m.leagueid FROM matches m WHERE EXISTS ( SELECT 1 FROM details_match dm WHERE dm.match_id = m.id AND dm.players_id IS NOT NULL AND dm.players_id != 0 ) ORDER BY m.id """ matches_df = pd.read_sql(query, conn) print(f"Найдено матчей: {len(matches_df)}") # Получаем детали всех этих матчей query_details = """ SELECT dm.match_id, dm.hero_id, dm.team, dm.players_id, dm.pos, dm."order" FROM details_match dm WHERE dm.match_id IN ( SELECT DISTINCT m.id FROM matches m WHERE EXISTS ( SELECT 1 FROM details_match dm2 WHERE dm2.match_id = m.id AND dm2.players_id IS NOT NULL AND dm2.players_id != 0 ) ) ORDER BY dm.match_id, dm."order" """ details_df = pd.read_sql(query_details, conn) conn.close() print(f"Загружено {len(details_df)} записей деталей") # Преобразуем в wide-format print("\nПреобразование в wide-format...") rows = [] for match_id, group in details_df.groupby('match_id'): match_info = matches_df[matches_df['match_id'] == match_id].iloc[0] row = { 'match_id': match_id, 'y': int(match_info['radiant_win']), 'leagueid': int(match_info['leagueid']) } # Radiant (team=0) и Dire (team=1) radiant_picks = group[group['team'] == 0].sort_values('order') dire_picks = group[group['team'] == 1].sort_values('order') # Заполняем героев и игроков для Radiant (до 5) for i, (idx, pick) in enumerate(radiant_picks.iterrows(), 1): if i > 5: break row[f'r_h{i}'] = int(pick['hero_id']) row[f'r_p{i}'] = int(pick['players_id']) if pd.notna(pick['players_id']) and pick['players_id'] != 0 else -1 row[f'rp_h{i}'] = int(pick['pos']) if pd.notna(pick['pos']) else -1 # Заполняем пропуски для Radiant for i in range(len(radiant_picks) + 1, 6): row[f'r_h{i}'] = -1 row[f'r_p{i}'] = -1 row[f'rp_h{i}'] = -1 # Заполняем героев и игроков для Dire (до 5) for i, (idx, pick) in enumerate(dire_picks.iterrows(), 1): if i > 5: break row[f'd_h{i}'] = int(pick['hero_id']) row[f'd_p{i}'] = int(pick['players_id']) if pd.notna(pick['players_id']) and pick['players_id'] != 0 else -1 row[f'dp_h{i}'] = int(pick['pos']) if pd.notna(pick['pos']) else -1 # Заполняем пропуски для Dire for i in range(len(dire_picks) + 1, 6): row[f'd_h{i}'] = -1 row[f'd_p{i}'] = -1 row[f'dp_h{i}'] = -1 rows.append(row) df = pd.DataFrame(rows) print(f"Создано {len(df)} записей в wide-format") print(f"Radiant wins: {df['y'].sum()} ({df['y'].mean()*100:.1f}%)") print(f"Dire wins: {len(df) - df['y'].sum()} ({(1-df['y'].mean())*100:.1f}%)") # Статистика по игрокам player_cols = [f'r_p{i}' for i in range(1, 6)] + [f'd_p{i}' for i in range(1, 6)] all_players = [] for col in player_cols: all_players.extend(df[col][df[col] > 0].tolist()) unique_players = len(set(all_players)) print(f"\nУникальных игроков в датасете: {unique_players}") print(f"Всего записей игроков (не -1): {len(all_players)}") # Статистика по турнирам print(f"\nУникальных турниров (leagueid): {df['leagueid'].nunique()}") # Сохранение output_path = "data/dataset_with_players.parquet" df.to_parquet(output_path, index=False) print(f"\n✓ Датасет сохранён: {output_path}") # Пример первых записей print("\nПример данных (первые 3 матча):") print(df.head(3).to_string())