-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdiabetes_classImbalance_GMM.py
138 lines (78 loc) · 3.7 KB
/
diabetes_classImbalance_GMM.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# -*- coding: utf-8 -*-
"""
Created on Wed Mar 14 11:43:16 2018
@author: David
parts of code adapted from code here:
https://www.kaggle.com/dbsnail/diabetes-prediction-over-0-86-accuracy
https://towardsdatascience.com/building-a-logistic-regression-in-python-step-by-step-becd4d56c9c8
class imbalance adapted partly from:
https://elitedatascience.com/imbalanced-classes
ROC Curve plotting adapted from:
https://datamize.wordpress.com/2015/01/24/how-to-plot-a-roc-curve-in-scikit-learn/
"""
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')
'''
import warnings
warnings.filterwarnings('ignore')
'''
### Load the Pima Indians diabetes dataset from the local CSV file
di_df = pd.read_csv('C:\\Users\\David\\Documents\\Data Science Related\\Datasets\\pima-indians-diabetes\\diabetes.csv')

### Class balance: row count for each level of the Outcome variable
### (a bare expression, so the result is only shown in an interactive session)
di_df['Outcome'].value_counts()

### Number of columns holding at least one missing value / total number of columns
print(di_df.isna().any(axis=0).sum(), ' / ', di_df.shape[1])

### Number of rows holding at least one missing value / total number of rows
print(di_df.isna().any(axis=1).sum(), ' / ', di_df.shape[0])
### function to replace particular value in a feature column with mean of subset
### of column grouped by target variable level (i.e. mean imputation by group)
def impute_mean_byGroup(df, field, val, target):
    """Replace a placeholder value in one column with its per-group mean.

    Every entry of ``df[field]`` equal to ``val`` is treated as missing and
    replaced by the mean of the remaining ``field`` values among the rows that
    share the same ``target`` level (mean imputation by group).

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; it is not modified.
    field : str
        Name of the column to impute.
    val : scalar
        Placeholder value that marks a missing entry (e.g. 0).
    target : str
        Name of the column whose levels define the groups.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` in which ``field`` has been imputed.  Entries of a
        group that consists solely of ``val`` stay NaN (no mean exists).
    """
    df_copy = df.copy()  # work on a copy so the caller's frame is untouched

    # Assign the result back instead of a chained ``inplace=True`` replace,
    # which is not guaranteed to write through to ``df_copy``.
    df_copy[field] = df_copy[field].replace(val, np.nan)

    # ``transform`` yields one mean per row, aligned on the original index.
    # Taking ``.values`` of a ``groupby.apply`` result can instead return the
    # values in group order and silently assign them to the wrong rows.
    group_means = df_copy.groupby(target)[field].transform('mean')
    df_copy[field] = df_copy[field].fillna(group_means)
    return df_copy
### Treat zeros in the selected columns as missing and replace each one with
### the mean of that column within the same Outcome class
di_df_imputed = di_df.copy()
for col in ('Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI'):
    di_df_imputed = impute_mean_byGroup(df=di_df_imputed, field=col, val=0, target='Outcome')

### Class counts (0 vs 1) after imputation
### (a bare expression, so the result is only shown in an interactive session)
di_df_imputed.Outcome.value_counts()

### Keep the imputed frame intact; the class-balancing steps work on a
### separate copy simply called 'df'
df = di_df_imputed.copy()
### using upsampling (resampling with replacement) of minority class (1) to address class imbalance
from sklearn.utils import resample

### Separate majority and minority classes into new dfs
df_maj = df[df.Outcome == 0]
df_min = df[df.Outcome == 1]

### Upsample minority class
df_min_upsamp = resample(df_min,
                         replace=True,            # sample with replacement
                         n_samples=len(df_maj),   # match the majority class size (previously hard-coded to 500)
                         random_state=123)        # reproducible results

### Combine majority class with upsampled minority class
df_combined_upsamp = pd.concat([df_maj, df_min_upsamp])

### Class counts after upsampling
### (a bare expression, so the result is only shown in an interactive session)
df_combined_upsamp.Outcome.value_counts()

### split data into features and target
### (select the target by name rather than assuming it is the last column)
X = df_combined_upsamp.drop(columns='Outcome')
y = df_combined_upsamp['Outcome']
### Project the feature matrix onto its first two principal components
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(X)
X_2D = pca.transform(X)

### Store the 2-D coordinates next to the data so they can be plotted
### (X was extracted earlier, so it does not pick up these extra columns)
df_combined_upsamp['PCA1'] = X_2D[:, 0]
df_combined_upsamp['PCA2'] = X_2D[:, 1]

### Scatter plot of the two components, coloured by the true Outcome class.
### x and y are passed by keyword: current seaborn releases no longer accept
### them positionally.
sns.lmplot(x='PCA1', y='PCA2', hue='Outcome', data=df_combined_upsamp, fit_reg=False)
### Cluster the feature matrix with a two-component Gaussian Mixture Model.
### sklearn.mixture.GMM was deprecated and later removed from scikit-learn;
### GaussianMixture is its replacement.
from sklearn.mixture import GaussianMixture

gmm = GaussianMixture(n_components=2,
                      covariance_type='full',
                      random_state=123)  # fixed seed so cluster assignments are reproducible
gmm.fit(X)
y_gmm = gmm.predict(X)

### Store the predicted cluster label of every row for plotting
df_combined_upsamp['cluster'] = y_gmm

### Same PCA scatter plot, coloured by GMM cluster instead of the true Outcome.
### x and y are passed by keyword: current seaborn releases no longer accept
### them positionally.
sns.lmplot(x='PCA1', y='PCA2', hue='cluster', data=df_combined_upsamp, fit_reg=False)