# COMP4388-1190652.py
# Sondos Aabed
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, roc_auc_score, confusion_matrix, precision_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler
# Load the CSV file into a DataFrame
dataframe = pd.read_csv('Customer Churn.csv')
# Check the number of observations and attributes
print(len(dataframe))
print(dataframe.columns.size)
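# Optional sanity check: dtypes reveal which columns are qualitative, and
# isna().sum() previews the missing values handled in the cleansing step below
print(dataframe.dtypes)
print(dataframe.isna().sum())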
# Part one: 7 General Tasks
# 1- Show the summary statistics for all attributes
# For the quantitative attributes, use the describe method
print(dataframe.describe())
# Plot the histograms (univariate)
figure, axes = plt.subplots(figsize=(20, 20))
dataframe.hist(ax=axes,color='green')
# For the qualitative attributes, plot pie charts to describe them
# Pie chart for Age Group (labels taken from value_counts so they match the slice order)
figure1, axes1 = plt.subplots()
age_counts = dataframe['Age Group'].value_counts()
axes1.pie(age_counts, labels=age_counts.index, autopct='%1.1f%%')
axes1.set_title('Age Group Distribution')
# Pie chart for Status
figure4, axes4 = plt.subplots()
status_counts = dataframe['Status'].value_counts()
axes4.pie(status_counts, labels=status_counts.index, autopct='%1.1f%%')
axes4.set_title('Status Distribution')
# Pie chart for Plan
figure5, axes5 = plt.subplots()
plan_counts = dataframe['Plan'].value_counts()
axes5.pie(plan_counts, labels=plan_counts.index, autopct='%1.1f%%')
axes5.set_title('Plan Distribution')
# Pie chart for Complains
figure6, axes6 = plt.subplots()
complains_counts = dataframe['Complains'].value_counts()
axes6.pie(complains_counts, labels=complains_counts.index, autopct='%1.1f%%')
axes6.set_title('Complains Distribution')
# Task 2
# Pie chart for Churn
figure3, axes3 = plt.subplots()
churn_counts = dataframe['Churn'].value_counts()
axes3.pie(churn_counts, labels=churn_counts.index, autopct='%1.1f%%')
axes3.set_title('Churn Distribution')
# Task 3
# For each age group, draw a histogram detailing the amount of churn in each sub-group
# Cross-tabulate age group against churn so churn and non-churn counts stack per group
age_group_churn = pd.crosstab(dataframe['Age Group'], dataframe['Churn'])
figure13, axes13 = plt.subplots()
age_group_churn.plot(kind='bar', stacked=True, ax=axes13, color=['orange', 'green'])
axes13.set_xlabel('Age group')
axes13.set_ylabel('Number of Customers')
axes13.set_title('Churn distribution by Age group')
# Task 4
# For each charge amount, draw a histogram detailing the amount of churn in each sub-group.
charge_amount_churn = pd.crosstab(dataframe['Charge Amount'], dataframe['Churn'])
figure13b, axes13b = plt.subplots()
charge_amount_churn.plot(kind='bar', stacked=True, ax=axes13b, color=['green', 'orange'])
axes13b.set_xlabel('Charge Amount')
axes13b.set_ylabel('Number of Customers')
axes13b.set_title('Churn distribution by Charge Amount')
# Task 5
# Show the details of the charge amount of customers.
print(dataframe['Charge Amount'].describe())
column3 = 'Charge Amount'
# Count the values in the column
counts = dataframe[column3].value_counts()
x_labels = counts.index
y_values = counts.values
figure2, axes2 = plt.subplots()
axes2.bar(x_labels, y_values, color='green')
axes2.set_xlabel(column3)
axes2.set_ylabel('Count')
axes2.set_title('Charge Amount Distribution')
# Task 6
# Visualize the correlation between all the features
# Convert qualitative attributes to quantitative codes so that the correlation
# matrix and heat map can include them, and to standardize the DataFrame
dataframe['Churn'] = dataframe['Churn'].astype('category').cat.codes
dataframe['Status'] = dataframe['Status'].astype('category').cat.codes
dataframe['Plan'] = dataframe['Plan'].astype('category').cat.codes
dataframe['Complains'] = dataframe['Complains'].astype('category').cat.codes
dataframe['Charge Amount'] = dataframe['Charge Amount'].astype('category').cat.codes
dataframe['Age Group'] = dataframe['Age Group'].astype('category').cat.codes
# Print the correlation matrix (computed after encoding so that every column is numeric)
print(dataframe.corr())
figure12, axes12 = plt.subplots(figsize=(16, 8))
sns.heatmap(dataframe.corr(), ax=axes12, annot=True, fmt='.2g', cmap='vlag')
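# Optional aid for the interpretation below: rank the features by their
# correlation with the target, reusing the matrix computed above
print(dataframe.corr()['Churn'].sort_values(ascending=False))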
# I chose to drop these after interpreting the correlation as specified in the document
dataframe = dataframe.drop(['ID', 'Age Group'], axis=1)
# After dropping these comes the data-cleansing part
# For data cleansing I chose to fill in the missing data with the mean of each column, and to remove duplicates
dataframe = dataframe.fillna(dataframe.mean())
dataframe = dataframe.drop_duplicates()
# Feature scaling using min-max normalization
scaler = MinMaxScaler()
scaled = scaler.fit_transform(dataframe)
dataframe = pd.DataFrame(scaled, columns=dataframe.columns)
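# MinMaxScaler rescales each column x to (x - min) / (max - min), so every
# feature should now lie in [0, 1]; a quick sanity check of the global range:
print(dataframe.min().min(), dataframe.max().max())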
# Task 7
# Split the dataset into training (80%) and test (20%), assuming the target is Churn;
# this split will be used in the classification tasks
x_train, x_test, y_train, y_test = train_test_split(dataframe.drop('Churn', axis=1), dataframe['Churn'], test_size=0.2)
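# Optional check that the split proportions are as expected
print(x_train.shape, x_test.shape)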
# Part two: Linear Regression
# Task 1: linear regression to learn the attribute “Customer Value” using all independent attributes
# Since Customer Value is the target here, drop it from the predictors
dataframe1 = dataframe
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(dataframe1.drop('Customer Value', axis=1), dataframe1['Customer Value'], test_size=0.2)
LRM1 = LinearRegression()
LRM1.fit(X_train_1, y_train_1)
y_pred_1 = LRM1.predict(X_test_1)
print(LRM1.score(X_test_1, y_test_1))
# Calculate the mean squared error
mse = mean_squared_error(y_test_1, y_pred_1)
print(f'Mean Squared Error For LRM1: {mse:.2f}')
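# The RMSE (square root of the MSE) is expressed in the same units as
# Customer Value, which makes the error easier to interpret (derived metric,
# not required by the task):
rmse = mse ** 0.5
print(f'Root Mean Squared Error For LRM1: {rmse:.2f}')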
# Plot the relation between actual and predicted values
figure14, axes14 = plt.subplots()
axes14.scatter(y_test_1, y_pred_1, color='green')
axes14.plot([y_test_1.min(), y_test_1.max()], [y_test_1.min(), y_test_1.max()])
axes14.set_title('LRM1 predicting Customer Value')
axes14.set_xlabel('Actual Customer values')
axes14.set_ylabel('Predicted Customer values')
axes14.grid(visible=False)
# Task 2: Linear regression using the three most important attributes
# 3 important features: Freq. of SMS, Subscription Length and Freq. of use
dataframe2 = dataframe
x = dataframe2[['Freq. of SMS', 'Subscription Length', 'Freq. of use']]
y = dataframe2['Customer Value']
# split the data into train and test
x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x, y, test_size=0.2)
LRM2 = LinearRegression()
LRM2.fit(x_train_2, y_train_2)
y_pred_2 = LRM2.predict(x_test_2)
print(LRM2.score(x_test_2, y_test_2))
# Calculate the mean squared error
mse2 = mean_squared_error(y_test_2, y_pred_2)
print(f'Mean Squared Error For LRM2: {mse2:.2f}')
# plot the relation between the actual values and the predicted values
figure15, axes15 = plt.subplots()
axes15.scatter(y_test_2, y_pred_2, color='green')
# add a diagonal line to the plot
axes15.plot([y_test_2.min(), y_test_2.max()], [y_test_2.min(), y_test_2.max()])
axes15.set_xlabel('Actual Customer values')
axes15.set_ylabel('Predicted Customer values')
axes15.grid(visible=False)
axes15.set_title('LRM2 predicting Customer Value')
# Task 3: Linear regression using another set of the most important attributes
dataframe3 = dataframe
x1 = dataframe3[['Freq. of SMS', 'Seconds of Use', 'Freq. of use']]
y1 = dataframe3['Customer Value']
# split into train and test
X_train_3, X_test_3, y_train_3, y_test_3 = train_test_split(x1, y1, test_size=0.2)
LRM3 = LinearRegression()
LRM3.fit(X_train_3, y_train_3)
y_pred_3 = LRM3.predict(X_test_3)
print(LRM3.score(X_test_3, y_test_3))
# calculate the mean squared error
mse3 = mean_squared_error(y_test_3, y_pred_3)
print(f'Mean Squared Error For LRM3: {mse3:.2f}')
# Plot the relation between actual and predicted values
figure16, axes16 = plt.subplots()
axes16.scatter(y_test_3, y_pred_3, color='green')
axes16.plot([y_test_3.min(), y_test_3.max()], [y_test_3.min(), y_test_3.max()])
axes16.set_title('LRM3 predicting Customer Value')
axes16.set_xlabel('Actual Customer values')
axes16.set_ylabel('Predicted Customer values')
axes16.grid(visible=False)
# Part three: Classification
# Plan and Status are dropped here as well; ID and Age Group were already dropped above
dataframe4 = dataframe.drop(['Plan', 'Status'], axis=1)
dataframe4 = dataframe4.fillna(dataframe4.mean())
dataframe4 = dataframe4.drop_duplicates()
# split into train and test for each model
x_train_4, x_test_4, y_train_4, y_test_4 = train_test_split(dataframe4.drop('Churn', axis=1), dataframe4['Churn'], test_size=0.2)
# Task 1: k-Nearest Neighbors classifier to predict Churn
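# Before fixing k, a small cross-validated sweep can sanity-check that the
# choice of n_neighbors=20 below is reasonable (an optional sketch; the range
# of k values tried here is an arbitrary assumption)
from sklearn.model_selection import cross_val_score
for k in (5, 10, 20, 40):
    cv_scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), x_train_4, y_train_4, cv=5)
    print(f'k={k}: mean CV accuracy {cv_scores.mean():.3f}')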
# Create the k-NN classifier
knn = KNeighborsClassifier(n_neighbors=20)
knn.fit(x_train_4, y_train_4)
y_pred_knn = knn.predict(x_test_4)
# Get the confusion matrix for KNN
confusion_matrix_knn = confusion_matrix(y_test_4, y_pred_knn)
print(confusion_matrix_knn)
# Obtain the predicted probabilities for the positive class
y_pred_proba = knn.predict_proba(x_test_4)[:, 1]
# Calculate the ROC/AUC score
roc_auc_knn = roc_auc_score(y_test_4, y_pred_proba)
print('ROC/AUC KNN: ',roc_auc_knn)
# Calculate precision
precision_knn = precision_score(y_test_4, y_pred_knn)
print('KNN precision: ', precision_knn)
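# Precision can also be read off the confusion matrix as TP / (TP + FP); this
# cross-check should match the value printed above
tn, fp, fn, tp = confusion_matrix_knn.ravel()
print('KNN precision from confusion matrix: ', tp / (tp + fp))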
# Task 2: Naive Bayes classifier to predict Churn
# Create the NB classifier
nv = GaussianNB()
nv.fit(x_train_4, y_train_4)
y_pred_nv = nv.predict(x_test_4)
# Get the confusion matrix for NB
confusion_matrix_nv = confusion_matrix(y_test_4, y_pred_nv)
print(confusion_matrix_nv)
# Obtain the predicted probabilities for the positive class
y_pred_proba = nv.predict_proba(x_test_4)[:, 1]
# Calculate the ROC/AUC score
roc_auc_nv = roc_auc_score(y_test_4, y_pred_proba)
print('ROC/AUC NV: ',roc_auc_nv)
# Calculate precision
precision_nv = precision_score(y_test_4, y_pred_nv)
print('NV precision: ', precision_nv)
# Task 3: Logistic Regression classifier to predict Churn
# create model
lr = LogisticRegression(max_iter=1000)
lr.fit(x_train_4, y_train_4)
y_pred_lr = lr.predict(x_test_4)
# Get the confusion matrix for LR
confusion_matrix_lr = confusion_matrix(y_test_4, y_pred_lr)
print(confusion_matrix_lr)
# Obtain the predicted probabilities for the positive class
y_pred_proba = lr.predict_proba(x_test_4)[:, 1]
# Calculate the ROC/AUC score
roc_auc_lr = roc_auc_score(y_test_4, y_pred_proba)
print('ROC/AUC LR: ',roc_auc_lr)
# Calculate precision
precision_lr = precision_score(y_test_4, y_pred_lr)
print('LR precision: ', precision_lr)
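# Side-by-side recap of the three classifiers, reusing the metrics computed above
print('Model comparison (ROC/AUC, precision):')
print(f'KNN: {roc_auc_knn:.3f}, {precision_knn:.3f}')
print(f'NV:  {roc_auc_nv:.3f}, {precision_nv:.3f}')
print(f'LR:  {roc_auc_lr:.3f}, {precision_lr:.3f}')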
# Cross-validation: plot the learning curve for each model
train_sizes, train_scores, test_scores = learning_curve(LogisticRegression(), dataframe4.drop('Churn', axis=1), dataframe4['Churn'], cv=5)
figure17, axes17 = plt.subplots()
axes17.plot(train_sizes, train_scores.mean(axis=1), label='Training score')
axes17.plot(train_sizes, test_scores.mean(axis=1), label='Test score')
axes17.set_xlabel('LR Training examples')
axes17.set_ylabel('Score')
axes17.legend(loc='best')
train_sizes_1, train_scores_1, test_scores_1= learning_curve(GaussianNB(), dataframe4.drop('Churn', axis=1), dataframe4['Churn'], cv=5)
figure18, axes18 = plt.subplots()
axes18.plot(train_sizes_1, train_scores_1.mean(axis=1), label='Training score')
axes18.plot(train_sizes_1, test_scores_1.mean(axis=1), label='Test score')
axes18.set_xlabel('NV Training examples')
axes18.set_ylabel('Score')
axes18.legend(loc='best')
train_sizes_2, train_scores_2, test_scores_2 = learning_curve(KNeighborsClassifier(), dataframe4.drop('Churn', axis=1), dataframe4['Churn'], cv=5)
figure19, axes19 = plt.subplots()
axes19.plot(train_sizes_2, train_scores_2.mean(axis=1), label='Training score')
axes19.plot(train_sizes_2, test_scores_2.mean(axis=1), label='Test score')
axes19.set_xlabel('KNN Training examples')
axes19.set_ylabel('Score')
axes19.legend(loc='best')
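# The mean curves hide fold-to-fold variance; one-standard-deviation bands can
# be shaded in, shown here for the LR curve only (optional sketch)
axes17.fill_between(train_sizes,
                    train_scores.mean(axis=1) - train_scores.std(axis=1),
                    train_scores.mean(axis=1) + train_scores.std(axis=1),
                    alpha=0.2)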
# Show all the figures
plt.show()