Initial commit: добавление проекта predictV1
Включает ML-модели для предсказаний, API-маршруты, скрипты обучения и данные.

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
139
educationML/build_dataset_with_players.py
Normal file
139
educationML/build_dataset_with_players.py
Normal file
@@ -0,0 +1,139 @@
|
||||
import os
from pathlib import Path

import numpy as np
import pandas as pd
import psycopg2
|
||||
|
||||
# Number of hero/player/position slots emitted per team in the wide row.
SLOTS_PER_TEAM = 5

# (team code in details_match, hero column prefix, player column prefix,
#  position column prefix).  Radiant is team=0 and Dire is team=1.
TEAM_LAYOUT = (
    (0, "r_h", "r_p", "rp_h"),
    (1, "d_h", "d_p", "dp_h"),
)

# Destination of the resulting wide-format dataset.
OUTPUT_PATH = "data/dataset_with_players.parquet"

# All matches that have at least one known (non-null, non-zero) player.
MATCHES_QUERY = """
    SELECT
        m.id as match_id,
        m.radiant_win,
        m.leagueid
    FROM matches m
    WHERE EXISTS (
        SELECT 1
        FROM details_match dm
        WHERE dm.match_id = m.id
          AND dm.players_id IS NOT NULL
          AND dm.players_id != 0
    )
    ORDER BY m.id
"""

# Per-pick details for exactly the matches selected by MATCHES_QUERY.
DETAILS_QUERY = """
    SELECT
        dm.match_id,
        dm.hero_id,
        dm.team,
        dm.players_id,
        dm.pos,
        dm."order"
    FROM details_match dm
    WHERE dm.match_id IN (
        SELECT DISTINCT m.id
        FROM matches m
        WHERE EXISTS (
            SELECT 1
            FROM details_match dm2
            WHERE dm2.match_id = m.id
              AND dm2.players_id IS NOT NULL
              AND dm2.players_id != 0
        )
    )
    ORDER BY dm.match_id, dm."order"
"""


def _as_int(value, default=-1):
    """Return *value* as ``int``, or *default* when it is missing (None/NaN)."""
    return int(value) if pd.notna(value) else default


def wide_columns():
    """Return the ordered column names of the wide-format dataset."""
    columns = ["match_id", "y", "leagueid"]
    for _team, hero_key, player_key, pos_key in TEAM_LAYOUT:
        for slot in range(1, SLOTS_PER_TEAM + 1):
            columns += [f"{hero_key}{slot}", f"{player_key}{slot}", f"{pos_key}{slot}"]
    return columns


def load_frames():
    """Load the matches and their pick details from PostgreSQL.

    Connection settings default to the original hard-coded values and can be
    overridden through the PGHOST / PGPORT / PGDATABASE / PGUSER / PGPASSWORD
    environment variables.

    Returns:
        A ``(matches_df, details_df)`` tuple of DataFrames.
    """
    print("Подключение к базе данных...")
    conn = psycopg2.connect(
        host=os.environ.get("PGHOST", "localhost"),
        port=int(os.environ.get("PGPORT", "5432")),
        database=os.environ.get("PGDATABASE", "korobka_db"),
        user=os.environ.get("PGUSER", "postgres"),
        password=os.environ.get("PGPASSWORD", "postgres"),
    )
    # try/finally so the connection is released even when a query fails.
    try:
        print("Загрузка матчей с известными игроками...")
        matches_df = pd.read_sql(MATCHES_QUERY, conn)
        print(f"Найдено матчей: {len(matches_df)}")
        details_df = pd.read_sql(DETAILS_QUERY, conn)
    finally:
        conn.close()

    print(f"Загружено {len(details_df)} записей деталей")
    return matches_df, details_df


def build_wide_frame(matches_df, details_df):
    """Pivot per-pick rows into one wide row per match.

    Args:
        matches_df: frame with ``match_id``, ``radiant_win`` and ``leagueid``.
        details_df: frame with ``match_id``, ``hero_id``, ``team``,
            ``players_id``, ``pos`` and ``order``.

    Returns:
        DataFrame with the columns from :func:`wide_columns`.  ``y`` is
        ``radiant_win`` as 0/1.  For each team the first ``SLOTS_PER_TEAM``
        picks (by ``order``) fill the hero / player / position slots; empty
        slots and missing values are ``-1``, and a player id of 0 is treated
        as unknown (``-1``).  Matches that are absent from *matches_df* or
        have no ``radiant_win`` label are skipped.
    """
    # One dict lookup per match instead of scanning the whole matches frame.
    # drop_duplicates keeps the first row per id, like the former `.iloc[0]`.
    match_meta = (
        matches_df.drop_duplicates("match_id")
        .set_index("match_id")[["radiant_win", "leagueid"]]
        .to_dict("index")
    )

    rows = []
    for match_id, group in details_df.groupby("match_id"):
        meta = match_meta.get(match_id)
        # The two frames come from separate queries, so a match may be present
        # in only one of them; a row without a label is useless for training.
        if meta is None or pd.isna(meta["radiant_win"]):
            continue

        row = {
            "match_id": match_id,
            "y": int(meta["radiant_win"]),
            "leagueid": _as_int(meta["leagueid"]),
        }

        for team, hero_key, player_key, pos_key in TEAM_LAYOUT:
            # Pre-fill every slot so teams with fewer picks are padded.
            for slot in range(1, SLOTS_PER_TEAM + 1):
                row[f"{hero_key}{slot}"] = -1
                row[f"{player_key}{slot}"] = -1
                row[f"{pos_key}{slot}"] = -1

            picks = group[group["team"] == team].sort_values("order").head(SLOTS_PER_TEAM)
            # Read the columns directly: iterrows() would coerce each row to a
            # single dtype and push integer ids through float.
            slots = zip(picks["hero_id"], picks["players_id"], picks["pos"])
            for slot, (hero, player, pos) in enumerate(slots, 1):
                player_id = _as_int(player)
                row[f"{hero_key}{slot}"] = _as_int(hero)
                row[f"{player_key}{slot}"] = player_id if player_id != 0 else -1
                row[f"{pos_key}{slot}"] = _as_int(pos)

        rows.append(row)

    # Explicit columns keep the schema stable even when no rows were produced.
    return pd.DataFrame(rows, columns=wide_columns())


def main():
    """Build the wide-format dataset, print summary statistics and save it."""
    matches_df, details_df = load_frames()

    # Convert to wide format.
    print("\nПреобразование в wide-format...")
    df = build_wide_frame(matches_df, details_df)

    total = len(df)
    radiant_wins = int(df["y"].sum()) if total else 0
    dire_wins = total - radiant_wins
    # Guard the percentages against an empty dataset.
    radiant_pct = radiant_wins / total * 100 if total else 0.0
    dire_pct = dire_wins / total * 100 if total else 0.0

    print(f"Создано {total} записей в wide-format")
    print(f"Radiant wins: {radiant_wins} ({radiant_pct:.1f}%)")
    print(f"Dire wins: {dire_wins} ({dire_pct:.1f}%)")

    # Player statistics: only positive ids count as known players.
    player_cols = [
        f"{player_key}{slot}"
        for _team, _hero_key, player_key, _pos_key in TEAM_LAYOUT
        for slot in range(1, SLOTS_PER_TEAM + 1)
    ]
    all_players = [
        player_id
        for col in player_cols
        for player_id in df.loc[df[col] > 0, col].tolist()
    ]
    unique_players = len(set(all_players))
    print(f"\nУникальных игроков в датасете: {unique_players}")
    print(f"Всего записей игроков (не -1): {len(all_players)}")

    # Tournament statistics.
    print(f"\nУникальных турниров (leagueid): {df['leagueid'].nunique()}")

    # Save; create the target directory first so a long run is not lost
    # to a missing folder.
    output_path = OUTPUT_PATH
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    df.to_parquet(output_path, index=False)
    print(f"\n✓ Датасет сохранён: {output_path}")

    # Sample of the first records.
    print("\nПример данных (первые 3 матча):")
    print(df.head(3).to_string())


if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user