# March Madness Analysis
library(RSQLite)
library(ggplot2)
library(GGally)
library(ISLR)
library(stringr)
library(pscl)
# Flip a 0/1 result: 1 becomes 0 and 0 becomes 1
flip_result <- function(diff) {
  return((diff + 1) %% 2)
}
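# Illustrative check of flip_result:
# flip_result(1)  # 0
# flip_result(0)  # 1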
# Calculate the log loss of an individual prediction
# given the predicted probability and the actual result val (0 or 1).
# Log loss is the negative log-likelihood, so smaller is better.
log_loss <- function(prediction, val) {
  p1 <- val * log(prediction)
  p2 <- (1 - val) * log(1 - prediction)
  return(-(p1 + p2))
}
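# Illustrative values under the negative log-likelihood convention above:
# log_loss(0.9, 1)  # ~0.105, a confident correct prediction
# log_loss(0.1, 1)  # ~2.303, a confident wrong prediction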
# Bind two lists as rows onto data frame f1.
# All inputs must have the same number of columns;
# both lists are given the column names in p before binding.
bind <- function(f1, l1, l2, p) {
  t1 <- data.frame(l1)
  t2 <- data.frame(l2)
  colnames(t1) <- p
  colnames(t2) <- p
  t1 <- rbind(t1, t2)
  f1 <- rbind(f1, t1)
  return(f1)
}
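# Illustrative usage with made-up values: appends two rows to an empty frame
# whose columns are "seed_diff" and "result".
# ex <- data.frame(matrix(ncol = 2, nrow = 0))
# colnames(ex) <- c("seed_diff", "result")
# bind(ex, list(5, 1), list(-5, 0), c("seed_diff", "result"))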
# Build a 16x16 matrix of historical win probabilities for each seed
# matchup: entry [i, j] is the fraction of i-vs-j games won by seed i.
probability_matrix <- function(game_table) {
  mat <- matrix(0, nrow = 16, ncol = 16)
  winners <- game_table$winning_seed
  losers <- game_table$losing_seed
  for (i in seq_along(winners)) {
    mat[winners[i], losers[i]] <- mat[winners[i], losers[i]] + 1
  }
  prob_mat <- matrix(NA_real_, nrow = 16, ncol = 16)
  for (row in 1:16) {
    for (col in 1:16) {
      if (row == col) {
        # When both teams have the same seed and such games exist,
        # default the win rate to 50%
        if (mat[row, col] > 0) {
          prob_mat[row, col] <- 0.5
        }
      } else {
        wins <- mat[row, col]
        tot <- wins + mat[col, row]
        if (tot > 0) {
          prob_mat[row, col] <- round(wins / tot, digits = 3)
        }
      }
    }
  }
  return(prob_mat)
}
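# Illustrative usage (game_table is loaded from the database below):
# pm <- probability_matrix(game_table)
# pm[1, 16]  # historical probability that a 1 seed beats a 16 seed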
# Returns a data frame containing the average log loss per year
# over the time period from beg to end inclusive
loss_over_period <- function(model, beg, end, conn) {
  for (t in beg:end) {
    temp <- dbGetQuery(conn, paste("SELECT seed_diff FROM v3 WHERE year = ", toString(t)))
    cur <- 0
    for (i in temp$seed_diff) {
      p <- predict(model, data.frame(seed_diff = i), type = "response")
      # All games in the database are recorded such that the result is 1
      loss <- log_loss(p, 1)
      cur <- cur + loss
    }
    # Average log loss for year t
    tot <- cur / length(temp$seed_diff)
    # Create the data frame for the first year calculated,
    # otherwise append to the existing frame
    if (t == beg) {
      output <- data.frame(tot)
    } else {
      output <- rbind(output, data.frame(tot))
    }
  }
  row.names(output) <- beg:end
  return(output)
}
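# Illustrative usage with example years; assumes log_model and conn
# are defined as in the main section below:
# loss_over_period(log_model, 2010, 2015, conn)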
# The output of this function provides the data needed
# for our multiple logistic regression model
model_dataframe <- function(query) {
  # Initialize the data frame and column names.
  # Each column represents the difference between
  # the winner's rank and the loser's rank.
  # Assumes the winner's columns make up the first half of query
  # and the loser's columns make up the second half.
  predictors <- list()
  for (i in 1:(length(colnames(query)) / 2)) {
    parsed <- str_sub(colnames(query)[i], 9)
    predictors[i] <- parsed
  }
  predictors[(length(colnames(query)) / 2) + 1] <- "result"
  output <- data.frame(matrix(ncol = length(predictors), nrow = 0))
  colnames(output) <- predictors
  # Append to our data frame row by row.
  # For each row in the database, append two copies:
  # one whose result is 1 and another whose result is 0.
  n <- nrow(query)
  step <- length(query) / 2
  for (i in 1:n) {
    t1 <- list()
    t2 <- list()
    for (j in 1:step) {
      t1[j] <- query[i, j] - query[i, j + step]
      t2[j] <- query[i, j + step] - query[i, j]
    }
    t1[step + 1] <- 1
    t2[step + 1] <- 0
    output <- bind(output, t1, t2, predictors)
  }
  return(output)
}
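# Illustrative usage (m is loaded from the "model" table below); the output
# has two rows per game, one mirrored copy with result 0:
# head(model_dataframe(m))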
# Returns the fraction of games predicted correctly on the training frame.
# Does not account for later games whose matchups depend on
# the current game being predicted correctly.
games_correct <- function(model, frame) {
  prob <- predict(model, type = "response")
  correct <- 0
  for (i in 1:length(prob)) {
    t <- 1
    if (prob[i] < 0.5) {
      t <- 0
    }
    if (t == frame[i, ]$result) {
      correct <- correct + 1
    }
  }
  return(correct / length(prob))
}
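# Illustrative usage on the simple model and data defined below:
# games_correct(log_model, r)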
# Main
conn <- dbConnect(RSQLite::SQLite(), "bracket_data.db")
# Pull the data from the database needed for our probability matrix
# and our simple logistic regression model
game_table <- dbGetQuery(conn, "SELECT winning_seed, losing_seed FROM v3")
r <- dbGetQuery(conn, "SELECT seed_diff, result FROM v3 WHERE year > 2009")
prob_mat <- probability_matrix(game_table)
# All database results are 1, so we add mirrored rows with result 0
# as well so the regression model doesn't just always output 1
for (i in 1:length(r$seed_diff)) {
  de <- data.frame(-(r[i, 1]), flip_result(r[i, 2]))
  names(de) <- c("seed_diff", "result")
  r <- rbind(r, de)
}
# Model using only seed difference
log_model <- glm(result ~ seed_diff, data = r, family = binomial)
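# Optional inspection of the simple model's fitted coefficients, e.g.:
# summary(log_model)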
# Plot our simple logistic regression model.
# We can only plot it this way because we have one predictor;
# this won't be possible later with more predictors.
M.df <- data.frame(seed_diff = seq(-15, 15, 0.1))
M.df$result <- predict(log_model, newdata = M.df, type = "response")
ggplot(M.df, aes(x = seed_diff, y = result)) + geom_line()
#predict(log_model, data.frame(seed_diff=15), type="response")
# Data frame formatted in the correct way for our extended model:
# each row represents a game and the columns represent the difference
# in ranking between the two teams.
m <- dbGetQuery(conn, "SELECT * FROM model")
df <- model_dataframe(m)
# Create and inspect the extended model
improved_model <- glm(result ~ ., data = df, family = binomial)
anova(improved_model, test = "Chisq")
#test_model <- glm(result~dol+pgh+cng+pom+cng, data=df, family=binomial)
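# Illustrative evaluation of the extended model; pR2 (from pscl, loaded above)
# reports McFadden's pseudo-R^2, and games_correct is defined above:
# pR2(improved_model)
# games_correct(improved_model, df)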