-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathLogisticRegression.py
224 lines (145 loc) · 12 KB
/
LogisticRegression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
# -*- coding: utf-8 -*-
"""
Last update on Tue Oct 13 18:00:00 2020
@student-name: Martim Cevadinha Figueiredo
@student-email: [email protected]
@student-number: 52701
@student-name: Ruben Andre Barreiro
@student-email: [email protected]
@student-number: 42648
@degree: Master of Computer Science and Engineering (MIEI)
@college: NOVA School of Science and Technology (FCT NOVA)
@university: New University of Lisbon (UNL)
"""
# Import NumPy Python's Library as np
import numpy as np
# Import Logistic Regression Sub-Module, from SciKit-Learn Python's Library,
# as skl_logistic_regression
from sklearn.linear_model import LogisticRegression as skl_logistic_regression
# Import Model Selection Sub-Module, from SciKit-Learn Python's Library,
# as skl_model_selection
from sklearn import model_selection as skl_model_selection
# Import Brier Score Loss (Metrics) Sub-Module,
# from SciKit-Learn Python's Library, as skl_brier_score_loss
from sklearn.metrics import brier_score_loss as skl_brier_score_loss
# Import PyPlot Sub-Module, from Matplotlib Python's Library as plt
import matplotlib.pyplot as plt
# The number of folds (n_splits) used by the Stratified K-Fold
# cross-validation
NUM_FOLDS = 5
# The number of steps/variations used when adjusting the C regularization
# parameter of the Logistic Regression (also sizes the per-C error arrays)
NUM_STEPS_C_REGULARIZATION_LOGISTIC_REGRESSION = 15
# -----------------------------------------------------
# \ \
# \ Classifier 1) - Logistic Regression, \
# \ varying its Regularization C parameter \
# \___________________________________________________\
# The Function to Compute and Return the Errors for Training and Validation Sets, for the Logistic Regression Classifier
def compute_logReg_errors(xs, ys, train_idx, valid_idx, c_param_value, num_features, score_type = 'brier_score'):
    """Fit a Logistic Regression on one fold and return its errors.

    Parameters
    ----------
    xs : 2-D array of features; only the first ``num_features`` columns
        are used, both for fitting and for scoring.
    ys : array of class labels, aligned with the rows of ``xs``.
    train_idx, valid_idx : row indices of the training and validation
        samples of the current fold.
    c_param_value : value passed as ``C`` to the classifier.
    num_features : number of leading feature columns to use.
    score_type : ``'brier_score'`` (Brier score loss of the predicted
        probability of the second class) or
        ``'logistic_regression_score'`` (1 - accuracy).

    Returns
    -------
    tuple
        ``(training_error, validation_error)``.

    Raises
    ------
    ValueError
        If ``score_type`` is not one of the two supported values.
    """
    # Reject unsupported scoring up front, instead of failing at the
    # return statement with an unbound local variable.
    if score_type not in ('brier_score', 'logistic_regression_score'):
        raise ValueError(
            "score_type must be 'brier_score' or "
            "'logistic_regression_score', got {!r}".format(score_type)
        )

    # Every model call below must see the same columns the model is fit on.
    xs_selected = xs[:, :num_features]

    logReg = skl_logistic_regression(C=c_param_value, tol=1e-10)
    logReg.fit(xs_selected[train_idx], ys[train_idx])

    if score_type == 'brier_score':
        # Probability of the second class (column 1), for every sample.
        ys_logReg_predict_prob = logReg.predict_proba(xs_selected)[:, 1]
        logReg_train_error = skl_brier_score_loss(ys[train_idx], ys_logReg_predict_prob[train_idx])
        logReg_valid_error = skl_brier_score_loss(ys[valid_idx], ys_logReg_predict_prob[valid_idx])
    else:
        # Error = 1 - accuracy, scored on the selected feature columns only.
        logReg_train_error = 1 - logReg.score(xs_selected[train_idx], ys[train_idx])
        logReg_valid_error = 1 - logReg.score(xs_selected[valid_idx], ys[valid_idx])

    return logReg_train_error, logReg_valid_error
# The Function to Plot the Training and Validation Errors, for the Logistic Regression
def plot_train_valid_error_logistic_regression(train_error_values, valid_error_values):
    """Plot the training and validation error curves and save the figure.

    Parameters
    ----------
    train_error_values, valid_error_values : 2-D arrays whose column 0 is
        plotted on the X axis (labelled "log(C)") and whose column 1 is
        plotted on the Y axis (the error).

    The figure is saved to ``files/imgs/LR.png``, shown, and then closed.
    """
    import os

    plt.figure(figsize=(8, 8), frameon=True)

    # Training error in blue, validation error in red (as stated in the title).
    plt.plot(train_error_values[:, 0], train_error_values[:, 1], '-', color="blue")
    plt.plot(valid_error_values[:, 0], valid_error_values[:, 1], '-', color="red")

    # Size the axes from BOTH series, so that neither curve is clipped when
    # the training errors fall outside the range of the validation errors.
    x_min = min(min(train_error_values[:, 0]), min(valid_error_values[:, 0]))
    x_max = max(max(train_error_values[:, 0]), max(valid_error_values[:, 0]))
    y_min = min(min(train_error_values[:, 1]), min(valid_error_values[:, 1]))
    y_max = max(max(train_error_values[:, 1]), max(valid_error_values[:, 1]))
    plt.axis([x_min, x_max, y_min, y_max])

    plt.xlabel("log(C)")
    plt.ylabel("Training/Validation Errors")
    plt.title('Logistic Regression, varying the C parameter\n\nTraining Error (Blue) / Cross-Validation Error (Red)')

    # Make sure the output directory exists before saving the image.
    output_path = 'files/imgs/LR.png'
    os.makedirs(os.path.dirname(output_path), exist_ok=True)
    plt.savefig(output_path, dpi=600)

    plt.show()
    plt.close()
# The Function to Estimate the True/Test Error of the Testing Set,
# for the Logistic Regression Classifier
def estimate_logReg_true_test_error(xs_train, ys_train, xs_test, ys_test, num_features, best_c_param_value=1e12, score_type = 'brier_score'):
    """Train on the training set and estimate the error on the testing set.

    Parameters
    ----------
    xs_train, ys_train : features and class labels of the training set.
    xs_test, ys_test : features and class labels of the testing set.
    num_features : number of leading feature columns to use, for fitting,
        predicting and scoring alike.
    best_c_param_value : value passed as ``C`` to the classifier.
    score_type : ``'brier_score'`` (Brier score loss of the predicted
        probability of the second class) or
        ``'logistic_regression_score'`` (1 - accuracy).

    Returns
    -------
    tuple
        ``(predicted_classes, num_incorrect_predictions,
        estimated_true_test_error)``.

    Raises
    ------
    ValueError
        If ``score_type`` is not one of the two supported values.
    """
    # Reject unsupported scoring up front, instead of failing at the
    # return statement with an unbound local variable.
    if score_type not in ('brier_score', 'logistic_regression_score'):
        raise ValueError(
            "score_type must be 'brier_score' or "
            "'logistic_regression_score', got {!r}".format(score_type)
        )

    # The classifier is fit on the first num_features columns, so the
    # predictions and the score must use exactly the same columns.
    xs_train_selected = xs_train[:, :num_features]
    xs_test_selected = xs_test[:, :num_features]

    logReg = skl_logistic_regression(C=best_c_param_value, tol=1e-10)
    logReg.fit(xs_train_selected, ys_train)

    # Predicted classes of the samples of the testing set.
    logReg_predict_classes_xs_test = logReg.predict(xs_test_selected)

    if score_type == 'brier_score':
        # Probability of the second class (column 1), for each test sample.
        ys_logReg_predict_prob = logReg.predict_proba(xs_test_selected)[:, 1]
        estimated_true_test_error = skl_brier_score_loss(ys_test, ys_logReg_predict_prob)
    else:
        # Error = 1 - accuracy on the testing set.
        estimated_true_test_error = 1 - logReg.score(xs_test_selected, ys_test)

    # Count the test samples whose predicted class differs from the real one.
    logReg_num_incorrect_predictions = int(
        np.count_nonzero(logReg_predict_classes_xs_test != np.asarray(ys_test))
    )

    return logReg_predict_classes_xs_test, logReg_num_incorrect_predictions, estimated_true_test_error
# Perform the Classification Process for
# the Logistic Regression Classifier
def do_logistic_regression(xs_test_features_std, ys_train_classes, xs_train_features_std, ys_test_classes, num_features):
    """Select C by cross-validation, plot the errors and evaluate on the test set.

    For each of NUM_STEPS_C_REGULARIZATION_LOGISTIC_REGRESSION values of C
    (1e-2, 1e-1, 1e0, ...), the Brier-score training and validation errors
    are averaged over NUM_FOLDS stratified folds. The C with the lowest
    average validation error is then used to fit a classifier on the whole
    training set, which is evaluated on the testing set.

    Parameters
    ----------
    xs_test_features_std : features of the testing set.
    ys_train_classes : class labels of the training set.
    xs_train_features_std : features of the training set.
    ys_test_classes : class labels of the testing set.
    num_features : number of leading feature columns to use.

    Note the positional order: the TESTING features come first and the
    TRAINING features come third.

    Returns
    -------
    tuple
        ``(train_error_avg_folds, valid_error_avg_folds, best_c,
        best_valid_error_avg_folds, predicted_test_classes,
        num_incorrect_predictions, estimated_true_test_error)``.
        The first two values are the averages for the LAST C evaluated,
        not for the best C.
    """
    k_folds = skl_model_selection.StratifiedKFold(n_splits=NUM_FOLDS)

    logReg_best_c = 1e10                      # Best C found so far
    logReg_best_valid_error_avg_folds = 1e10  # Lowest average validation error so far

    initial_c_param_value = 1e-2              # First C tried; each step multiplies it by 10

    # Drive both the array sizes and the loop from the same constant, so
    # that they cannot get out of sync.
    num_steps = NUM_STEPS_C_REGULARIZATION_LOGISTIC_REGRESSION

    # One row per C: column 0 holds log(C), column 1 holds the average error.
    logReg_train_error_values = np.zeros((num_steps, 2))
    logReg_valid_error_values = np.zeros((num_steps, 2))

    for current_step in range(num_steps):
        current_c_param_value = initial_c_param_value * 10 ** current_step

        logReg_train_error_sum = 0
        logReg_valid_error_sum = 0

        # Stratify on the training labels; the features are passed as X.
        for train_idx, valid_idx in k_folds.split(xs_train_features_std, ys_train_classes):
            logReg_train_error, logReg_valid_error = compute_logReg_errors(
                xs_train_features_std, ys_train_classes, train_idx, valid_idx,
                current_c_param_value, num_features, 'brier_score'
            )
            logReg_train_error_sum += logReg_train_error
            logReg_valid_error_sum += logReg_valid_error

        # Average of the errors over the folds
        logReg_train_error_avg_folds = logReg_train_error_sum / NUM_FOLDS
        logReg_valid_error_avg_folds = logReg_valid_error_sum / NUM_FOLDS

        # Keep the C with the lowest average validation error
        if logReg_valid_error_avg_folds < logReg_best_valid_error_avg_folds:
            logReg_best_valid_error_avg_folds = logReg_valid_error_avg_folds
            logReg_best_c = current_c_param_value

        # Store the points (natural log of C, average error) for the plot
        log_c = np.log(current_c_param_value)
        logReg_train_error_values[current_step, 0] = log_c
        logReg_train_error_values[current_step, 1] = logReg_train_error_avg_folds
        logReg_valid_error_values[current_step, 0] = log_c
        logReg_valid_error_values[current_step, 1] = logReg_valid_error_avg_folds

    plot_train_valid_error_logistic_regression(logReg_train_error_values, logReg_valid_error_values)

    # Retrain with the best C on the whole training set and evaluate on the testing set
    logReg_predict_classes_xs_test, logReg_num_incorrect_predictions, estimated_logReg_true_test_error = estimate_logReg_true_test_error(
        xs_train_features_std, ys_train_classes, xs_test_features_std, ys_test_classes,
        num_features, logReg_best_c, 'brier_score'
    )

    # NOTE(review): the first two returned values are those of the last C
    # tried, kept for backward compatibility - confirm whether callers
    # expect the values at the best C instead.
    return logReg_train_error_avg_folds, logReg_valid_error_avg_folds, logReg_best_c, logReg_best_valid_error_avg_folds, logReg_predict_classes_xs_test, logReg_num_incorrect_predictions, estimated_logReg_true_test_error