model_v_3.py
from time import sleep

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm
def custom_scorer(y_true, y_pred, **kwargs):
    """MSE scorer that also advances the tqdm progress bar used during the grid search."""
    mse = mean_squared_error(y_true, y_pred)
    pbar.update(1)  # `pbar` is the global tqdm bar created just before grid.fit()
    return -mse  # negated because GridSearchCV maximizes the score


wrapped_scorer = make_scorer(custom_scorer, greater_is_better=True)
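
# The custom scorer above exists only to drive the progress bar through the
# global `pbar`. Without that requirement, a minimal equivalent sketch would
# use sklearn's built-in scoring string instead:
#
#     GridSearchCV(estimator=model, param_grid=param_grid, cv=5,
#                  scoring='neg_mean_squared_error')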
pd.set_option('display.max_columns', 30)

# Load the NBA game data into a Pandas DataFrame
games_data = pd.read_csv('games_complete.csv')
def generate_game_features(games_data, home_team_id, visitor_team_id, game_date='2023-04-04',
                           n_games=5, weight_n_games=0.7, weight_h2h=0.3):
    # Keep only each team's n most recent games before game_date.
    # NOTE: .head(n_games) yields the *last* n games only if games_data is
    # sorted by GAME_DATE_EST in descending order, which this script assumes.
    home_team_games = games_data[
        ((games_data['HOME_TEAM_ID'] == home_team_id) | (games_data['VISITOR_TEAM_ID'] == home_team_id)) &
        (games_data['GAME_DATE_EST'] < game_date)].head(n_games)
    visitor_team_games = games_data[
        ((games_data['HOME_TEAM_ID'] == visitor_team_id) | (games_data['VISITOR_TEAM_ID'] == visitor_team_id)) &
        (games_data['GAME_DATE_EST'] < game_date)].head(n_games)

    # Head-to-head games between the two teams before game_date (currently
    # unused; see the disabled weighting below).
    head_to_head_games = games_data[
        (((games_data['HOME_TEAM_ID'] == home_team_id) & (games_data['VISITOR_TEAM_ID'] == visitor_team_id)) |
         ((games_data['HOME_TEAM_ID'] == visitor_team_id) & (games_data['VISITOR_TEAM_ID'] == home_team_id))) &
        (games_data['GAME_DATE_EST'] < game_date)]

    def get_stats_from_games(games, team_id):
        # One row per game, seven stats per row: the team's own numbers come
        # from the *_home columns when it hosted and the *_away columns otherwise.
        stats = np.empty((len(games), 7))
        index_to_use = 0
        for _, row in games.iterrows():
            stats_row = []
            for stat in ['PTS_home', 'FG_PCT_home', 'FT_PCT_home', 'FG3_PCT_home', 'AST_home', 'REB_home',
                         'OREB_home',
                         'PTS_away', 'FG_PCT_away', 'FT_PCT_away', 'FG3_PCT_away', 'AST_away', 'REB_away',
                         'OREB_away']:
                if (row['HOME_TEAM_ID'] == team_id) and stat.endswith('_home'):
                    stats_row.append(row[stat])
                elif (row['VISITOR_TEAM_ID'] == team_id) and stat.endswith('_away'):
                    stats_row.append(row[stat])
            # Assign the seven collected values to the corresponding row in the stats array
            stats[index_to_use] = stats_row
            index_to_use += 1
        return stats

    home_team_stats = get_stats_from_games(home_team_games, home_team_id)
    visitor_team_stats = get_stats_from_games(visitor_team_games, visitor_team_id)

    # The head-to-head weighting below is disabled: averaging head_to_head_games
    # directly can't tell whether the team of interest was home or away, so the
    # blended features came out wrong.
    # weighted_features = []
    # for stat in [the same 14 columns listed above]:
    #     if stat.endswith('_home'):
    #         weighted_stat = weight_n_games * home_team_stats[stat] + weight_h2h * h2h_home_stats[stat]
    #     else:
    #         weighted_stat = weight_n_games * visitor_team_stats[stat] + weight_h2h * h2h_visitor_stats[stat]
    #     weighted_features.append(weighted_stat)
    # future_game_features = np.array(weighted_features)

    # nanmean tolerates missing box-score values; the result is a flat
    # length-14 vector: 7 home-team averages followed by 7 visitor-team averages.
    return np.concatenate((np.nanmean(home_team_stats, axis=0), np.nanmean(visitor_team_stats, axis=0)))
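
# If the head-to-head blend is ever revived, get_stats_from_games already
# resolves home/away correctly. A sketch (not the script's current behavior;
# it would sit just before the return inside generate_game_features, and
# assumes at least one head-to-head game exists):
#
#     h2h_home = np.nanmean(get_stats_from_games(head_to_head_games, home_team_id), axis=0)
#     h2h_visitor = np.nanmean(get_stats_from_games(head_to_head_games, visitor_team_id), axis=0)
#     return np.concatenate((
#         weight_n_games * np.nanmean(home_team_stats, axis=0) + weight_h2h * h2h_home,
#         weight_n_games * np.nanmean(visitor_team_stats, axis=0) + weight_h2h * h2h_visitor,
#     ))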
depth_of_games = 60  # look-back window (games per team) for feature averaging
max_index = 1000  # number of games turned into training rows

# Build one 14-feature row per game: home-team averages, then visitor-team averages
X_data_array = np.empty((max_index, 14))
for index, row in games_data.head(max_index).iterrows():
    X_data_array[index] = generate_game_features(games_data, row['HOME_TEAM_ID'], row['VISITOR_TEAM_ID'],
                                                 row['GAME_DATE_EST'], depth_of_games)

# Create a DataFrame from the NumPy array
X_data = pd.DataFrame(X_data_array)
# Define the input variables (X) and target variables (Y)
X = X_data  # two-dimensional: (max_index, 14)
Y = games_data[['spread', 'PTS_total', 'OREB_total']].head(max_index)
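
# If either team has no completed games before a given date, np.nanmean over an
# empty array produces NaN features for that row. A defensive sketch (assumption:
# dropping such games is acceptable here, rather than imputing them):
#
#     valid_rows = X.notna().all(axis=1)
#     X, Y = X[valid_rows], Y[valid_rows]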
# Normalize the input data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
y_scaler = StandardScaler()
Y_scaled = y_scaler.fit_transform(Y)
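
# Caveat: fitting both scalers on the full dataset before splitting leaks
# test-set statistics into training. A stricter sketch (same API, fitted on
# the training split only) would look like:
#
#     X_tr, X_te, Y_tr, Y_te = train_test_split(X.values, Y.values, test_size=0.2, random_state=42)
#     X_tr_s, X_te_s = scaler.fit_transform(X_tr), scaler.transform(X_te)
#     Y_tr_s, Y_te_s = y_scaler.fit_transform(Y_tr), y_scaler.transform(Y_te)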
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y_scaled, test_size=0.2, random_state=42)
# Print the shape of the training and testing sets
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)
print('Y_train shape:', Y_train.shape)
print('Y_test shape:', Y_test.shape)
sleep(0.1) # so that print finishes before status bar is printed
model = xgb.XGBRegressor()
param_grid = {
    'objective': ['reg:squarederror'],
    'max_depth': [2, 3, 4],
    'learning_rate': [0.005, 0.01, 0.02],
    'n_estimators': [200, 300, 400, 500],
    'n_jobs': [-1],
}
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring=wrapped_scorer)
# Total scorer calls: 3 max_depth x 3 learning_rate x 4 n_estimators = 36
# parameter combinations, each scored on 5 CV folds
total_iterations = (len(param_grid['max_depth']) * len(param_grid['learning_rate'])
                    * len(param_grid['n_estimators']) * 5)  # times 5 because cv = 5
# Create a progress bar; custom_scorer advances it once per fold evaluation
with tqdm(total=total_iterations) as pbar:
    # Fit the grid search to the training data. The search runs in-process here;
    # parallel workers would not be able to update this progress bar.
    grid.fit(X_train, Y_train)

best_params = grid.best_params_
print("Best parameters found: ", best_params)

# Retrain a fresh model on the full training split with the best parameters
model = xgb.XGBRegressor(**best_params)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(Y_test, y_pred)
print("Mean squared error: ", mse)
score = model.score(X_test, Y_test)
print("Model score: ", score)
# print("input importance: ", model.feature_importances_)
print("-------------------------------------\n\n\n")
print(scaler.inverse_transform(X_test[:10, :]))
print(y_scaler.inverse_transform(Y_test[:10, :]))
print(y_scaler.inverse_transform(y_pred[:10, :]))
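
# Per-target error in original units is usually more interpretable than the
# pooled scaled MSE above (a sketch; the column order follows Y's definition):
y_pred_orig = y_scaler.inverse_transform(y_pred)
Y_test_orig = y_scaler.inverse_transform(Y_test)
for i, name in enumerate(['spread', 'PTS_total', 'OREB_total']):
    rmse = np.sqrt(mean_squared_error(Y_test_orig[:, i], y_pred_orig[:, i]))
    print(f"{name} RMSE: {rmse:.2f}")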
# print("-------------------------------------\n\n\n")
home_team_id = 1610612766 # Charlotte Hornets
# home_team_id = 1610612738 # Boston Celtics
visitor_team_id = 1610612761 # Toronto Raptors
# Example: predicting the spread, PTS_total, and OREB_total for an upcoming game
# (game_date falls back to the function's default of '2023-04-04')
future_game_features = generate_game_features(games_data, home_team_id, visitor_team_id,
                                              n_games=depth_of_games).reshape(1, -1)
# Scale the inputs with the fitted scaler, then map the prediction back to original units
future_game_features_scaled = scaler.transform(future_game_features)
future_game_prediction = y_scaler.inverse_transform(model.predict(future_game_features_scaled))
print("Spread: ", future_game_prediction[0][0])
print("PTS_total: ", future_game_prediction[0][1])
print("OREB_total: ", future_game_prediction[0][2])