-
Notifications
You must be signed in to change notification settings - Fork 1
/
logistic_regression.py
137 lines (104 loc) · 3.89 KB
/
logistic_regression.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 25 17:25:31 2019
@author: lussier
"""
import pandas as pd
import itertools
from pandas import DataFrame
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_selection import SelectFromModel, RFE
#designate input file (CSV expected to exist alongside the script; not created here)
input_file = "MLpain_old_vol_icvcontrol.csv"
#pandas read input csv (first row is the header)
dataset = pd.read_csv(input_file, header = 0, sep=',')
#select data by column position in the CSV
#X = dataset.iloc[:, 103:] #select column through end, predictors
X = dataset.iloc[:, 29:] #select column through end, predictors -- assumes predictor (volumetric) columns start at index 29; TODO confirm against the CSV layout
y = dataset.iloc[:, 17] #select column, target -- assumes column 17 is the pain/no-pain label; TODO confirm
#shuffle the data and split the sample into stratified 80/20 training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=.8, test_size=.2, stratify=y, shuffle=True)
#standardize features: fit the scaler on the training split only, then apply
#the same transform to the test split, so no test-set statistics leak into training
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
#L1-penalized logistic regression; C=1e4 makes the regularization very weak
logreg = LogisticRegression(penalty='l1', C=1e4, solver='liblinear', multi_class='auto')
#train model on the scaled training split
logreg.fit(X_train, y_train)
#in-sample (training) accuracy -- optimistic by construction; the held-out
#test accuracy is computed further below, so label this one explicitly
acc = logreg.score(X_train, y_train)
print("Training accuracy: %.4f" % acc)
# in-sample check: predict the same training data the model was fit on
train_pred = logreg.predict(X_train)
# per-class precision/recall/f1 on the training split
print(classification_report(y_train, train_pred))
# confusion matrix breaks the training-set accuracy down by class
print(confusion_matrix(y_true=y_train, y_pred=train_pred))
# fitted intercept of the decision function
print(logreg.intercept_)
# fitted coefficient (slope) for each predictor
print(logreg.coef_)
# 10-fold cross-validation on the training split (an int cv with a classifier
# selects a stratified K-fold splitter).
# NOTE: the original passed groups=y_train; `groups` is meant for group-aware
# splitters (e.g. GroupKFold) and must never be the target vector. The
# stratified splitter ignores it, so dropping it is behavior-identical and
# removes a latent leakage bug if cv were ever changed to a group splitter.
y_pred = cross_val_predict(logreg, X_train, y_train, cv=10)
# Evaluate a score for each cross-validation fold
acc = cross_val_score(logreg, X_train, y_train, cv=10)
for fold, fold_acc in enumerate(acc):
    print('Fold %s -- Acc = %s' % (fold, fold_acc))
# aggregate cross-validated scores over all folds
overall_acc = accuracy_score(y_pred=y_pred, y_true=y_train)
overall_cr = classification_report(y_pred=y_pred, y_true=y_train)
overall_cm = confusion_matrix(y_pred=y_pred, y_true=y_train)
print('Accuracy:', overall_acc)
print(overall_cr)
print('Confusion matrix:')
print(overall_cm)
# plot the cross-validated confusion matrix as an annotated heatmap
plt.figure()  # new figure so this heatmap does not stack on the later test-set plot
cmdf = DataFrame(overall_cm, index=['NoPain', 'Pain'], columns=['NoPain', 'Pain'])
sns.heatmap(cmdf, cmap='copper')
plt.xlabel('Predicted')
plt.ylabel('Observed')
# annotate each cell with its count; `thresh` picks a text color that
# contrasts with the cell (the original computed thresh but never used it,
# leaving white text on light cells)
thresh = overall_cm.max() / 2
for i, j in itertools.product(range(overall_cm.shape[0]), range(overall_cm.shape[1])):
    plt.text(j + 0.5, i + 0.5, format(overall_cm[i, j], 'd'),
             horizontalalignment="center",
             color="white" if overall_cm[i, j] > thresh else "black")
#test model: logreg was already fit on the training split above, and the
#cross-validation helpers work on clones, so the fitted model is intact --
#the original's second fit on identical data was redundant and is removed
y_pred = logreg.predict(X_test)  # classify pain group using held-out test data
acc = logreg.score(X_test, y_test)  # held-out accuracy
cr = classification_report(y_pred=y_pred, y_true=y_test)  # precision, recall & f1
cm = confusion_matrix(y_pred=y_pred, y_true=y_test)  # confusion matrix
# print results
print('accuracy =', acc)
print(cr)
print('confusion matrix:')
print(cm)
## plot the held-out test confusion matrix as an annotated heatmap
plt.figure()  # new figure so this heatmap does not stack on the earlier CV plot
cmdf = DataFrame(cm, index=['NoPain', 'Pain'], columns=['NoPain', 'Pain'])
sns.heatmap(cmdf, cmap='RdBu_r')
plt.xlabel('Predicted')
plt.ylabel('Observed')
# annotate each cell with its count; `thresh` picks a contrasting text color
# (the original computed thresh but never used it, hard-coding white)
thresh = cm.max() / 2
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
    plt.text(j + 0.5, i + 0.5, format(cm[i, j], 'd'),
             horizontalalignment="center",
             color="white" if cm[i, j] > thresh else "black")
#feature selection: keep predictors whose |coefficient| exceeds the default threshold
model = SelectFromModel(logreg, prefit=True)
# NOTE(review): logreg was fit on the *scaled* training data while X here is
# unscaled -- column selection is unaffected, but the transformed values are raw
X_new = model.transform(X)
print(X_new.shape)
#recursive feature elimination down to a single feature, ranking every predictor;
#n_features_to_select is keyword-only in scikit-learn >= 1.1 (positional raised TypeError)
selector = RFE(logreg, n_features_to_select=1)
selector = selector.fit(X_train, y_train)
#ranking_[i] == 1 marks the last-surviving (most important) feature
order = selector.ranking_
print(order)