-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathEDA BF.Rmd
382 lines (283 loc) · 12.2 KB
/
EDA BF.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
---
title: "Black Friday Dataset from AV"
author: "KISHORE"
date: "January 28, 2017"
output: html_document
---
### The data explored here is taken from an Analytics Vidhya practice problem.
### The code to create new features is taken from another member (vopani)
loading libraries
```{r}
library(dummies)
library(plyr)
library(ggplot2)
```
```{r}
#train <- read.csv("C:/Users/Admin/Downloads/Black Friday/train_oSwQCTC/train.csv", stringsAsFactors=F)
# Read the training data from the working directory, keeping character
# columns as plain strings. FALSE is spelled out because the shorthand `F`
# is an ordinary variable that can be reassigned.
train <- read.csv("train.csv", stringsAsFactors = FALSE)
```
onehot-encoding city variable
```{r}
# Build X_train from train, expanding City_Category into one indicator
# column per level (column names are joined with "_").
X_train <- dummy.data.frame(train, names = "City_Category", sep = "_")
```
Converting age variable to numeric
```{r}
# Replace every known age bracket with a representative age (still text),
# then convert the whole column to integer.
age_brackets <- c("0-17", "18-25", "26-35", "36-45", "46-50", "51-55", "55+")
age_values <- c("15", "21", "30", "40", "48", "53", "60")
bracket_idx <- match(X_train$Age, age_brackets)
is_known <- !is.na(bracket_idx)
X_train$Age[is_known] <- age_values[bracket_idx[is_known]]
X_train$Age <- as.integer(X_train$Age)
```
Converting stay in current city to numeric
```{r}
# "4+" is recoded to "4" so that the column can be converted to integer.
stay_years <- X_train$Stay_In_Current_City_Years
X_train$Stay_In_Current_City_Years <- as.integer(
  replace(stay_years, stay_years == "4+", "4")
)
```
converting Gender to binary
```{r}
# Encode gender as a numeric indicator: 1 where Gender is "F", 0 otherwise.
X_train$Gender <- as.numeric(X_train$Gender == "F")
```
Feature representing the count of each user
```{r}
# Number of rows per User_ID, attached to every row of that user.
user_count <- setNames(
  ddply(X_train, "User_ID", nrow),
  c("User_ID", "User_Count")
)
X_train <- merge(x = X_train, y = user_count, by = "User_ID")
```
Feature representing the count of each product
```{r}
# Number of rows per Product_ID, attached to every row of that product.
product_count <- setNames(
  ddply(X_train, "Product_ID", nrow),
  c("Product_ID", "Product_Count")
)
X_train <- merge(x = X_train, y = product_count, by = "Product_ID")
```
Feature representing the average Purchase of each product
```{r}
# Mean Purchase per Product_ID, attached to every row of that product.
product_mean <- ddply(
  X_train, "Product_ID", summarize,
  Product_Mean = mean(Purchase)
)
X_train <- merge(x = X_train, y = product_mean, by = "Product_ID")
```
feature representing the proportion of times the user purchases the product more than the product's average
```{r}
# 1 when the row's Purchase is above the mean Purchase of its product, else 0.
X_train$flag_high <- as.numeric(X_train$Purchase > X_train$Product_Mean)
# Per user, the share of rows flagged as above the product mean.
user_high <- ddply(
  X_train, "User_ID", summarize,
  User_High = mean(flag_high)
)
X_train <- merge(x = X_train, y = user_high, by = "User_ID")
```
Add User_High column to the original dataset also
```{r}
# Flag the rows of `train` whose Purchase exceeds the mean Purchase of the
# same product. The per-product mean is computed from `train` itself (ave()
# returns the group mean for every row), so the flag lines up with train's
# own row order. X_train cannot be used row-for-row here: merge() sorts its
# result on the key by default, so X_train is no longer in train's order.
train$flag_high <- ifelse(train$Purchase > ave(train$Purchase, train$Product_ID), 1, 0)
# Per user, the share of rows flagged as above the product mean.
user_high <- ddply(train, .(User_ID), summarize, User_High = mean(flag_high))
train <- merge(train, user_high, by = "User_ID")
# Merging user_high into a data frame that already has a User_High column
# would create duplicated User_High.x / User_High.y columns, so the column is
# only added to X_train when it is missing.
if (!"User_High" %in% names(X_train)) {
  X_train <- merge(X_train, user_high, by = "User_ID")
}
```
Structure of the train dataset
```{r}
# Compact overview of the columns and types of train.
str(object = train)
```
Structure of the X_train dataset, which contains the newly created features
```{r}
# Compact overview of the columns and types of X_train.
str(object = X_train)
```
Randomly sample the data set. Just 5% of the data set was taken as a sample to explore.
```{r}
library(caTools)
# Fix the RNG seed so the same split is drawn on every run.
set.seed(3000)
# Logical row selector built with a split ratio of 0.05.
spl <- sample.split(Y = train$User_ID, SplitRatio = 0.05)
Pdata <- train[spl, ]
# Recode marital status: 1 becomes "M", other non-missing values become "UM".
Pdata$Marital_Status <- ifelse(Pdata$Marital_Status == 1, "M", "UM")
```
How are Purchases distributed?
The distribution is right-skewed, and the mean and median of the sample and of the population are approximately the same.
```{r}
# Summary of Purchase for the rows selected by spl, then for all rows.
summary(X_train$Purchase[spl])
summary(X_train[["Purchase"]])
```
Log transformation
```{r}
library(gridExtra)
# Raw purchase amounts, no transformation.
NoScale <- qplot(x = Purchase, data = Pdata)
# log10 is applied to the values themselves; the x axis stays linear.
logScale <- qplot(x = log10(Purchase), data = Pdata)
# Alternative that log-scales the x axis instead of the values (also works):
#logScale<-ggplot(aes(x=Purchase),data=Pdata)+geom_histogram()+scale_x_log10()
# Show both plots side by side.
grid.arrange(NoScale, logScale, ncol = 2)
```
Look at the gender imbalance in the data set
```{r}
# Row counts per gender in the sample.
table(Pdata[["Gender"]])
```
The frequency polygon below suggests that the proportion of male customers is higher than that of female customers.
```{r}
# Frequency polygon of purchase amounts, one line per gender. The y value is
# each bin's count divided by the sum of the counts, i.e. a proportion.
ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Gender))) +
geom_freqpoly(aes(color = Gender), binwidth = 10) +
# Ticks every 2500 units; a step of 50 over this range would draw about 555
# overlapping labels and make the axis unreadable.
scale_x_continuous(limits = c(0, 27723), breaks = seq(0, 27723, 2500)) +
xlab('Purchases') +
ylab('Proportion of customers with that Purchase amount')
```
Let's transform the data and look in more detail. Yes, the male proportions are higher in purchases.
```{r}
# Same frequency polygon per gender, drawn on a square-root x axis.
ggplot(
  data = subset(Pdata, !is.na(Gender)),
  mapping = aes(x = Purchase, y = ..count.. / sum(..count..))
) +
  geom_freqpoly(mapping = aes(color = Gender), binwidth = 10) +
  scale_x_sqrt() +
  xlab("Purchases") +
  ylab("Proportion of customers with that Purchase amount")
```
How are prices distributed?
The graph below shows that the prices follow a multimodal distribution. There are many peaks and troughs, so it is clear that the data is highly variable.
```{r}
# Kernel density estimate of Purchase over the full train data.
ggplot(data = train, mapping = aes(x = Purchase)) +
  geom_density()
```
Purchase by gender, statistical exploration. Females distribution is more skewed right
```{r}
# Numeric summary of Purchase for each gender in the sample.
by(data = Pdata$Purchase, INDICES = Pdata$Gender, FUN = summary)
```
There is a gender imbalance.
There are more outliers in the female purchases, and the average female shops less than the average male.
```{r}
# Box plot of Purchase per gender, skipping rows with a missing Gender.
qplot(
  x = Gender,
  y = Purchase,
  data = subset(Pdata, !is.na(Gender)),
  geom = "boxplot"
)
```
Outlier: a point lying more than 1.5 times the IQR beyond the quartiles (the edges of the box).
Set the limits to remove the outliers.
When we look at these plots, it is clear that female customers shop less than male customers.
```{r}
# Same box plot with the y axis limited to 0-17500 through qplot's ylim.
qplot(
  x = Gender,
  y = Purchase,
  data = subset(Pdata, !is.na(Gender)),
  geom = "boxplot",
  ylim = c(0, 17500)
)
```
```{r}
# Same box plot with the y limits set on the scale (0-15750).
qplot(
  x = Gender,
  y = Purchase,
  data = subset(Pdata, !is.na(Gender)),
  geom = "boxplot"
) +
  scale_y_continuous(limits = c(0, 15750))
```
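Set the limits to remove the outliers. Suppose we only want to hide the outliers in the visualization, without dropping them from the actual calculations (mean, median, etc.). Limits set through `ylim` or `scale_y_continuous` discard the out-of-range points before the box statistics are computed, whereas `coord_cartesian` only zooms the view. You can see that the graph details match the statistics above when the coord_cartesian function is used.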
```{r}
# Summarize the Purchases by Gender numerically.
by(data = Pdata$Purchase, INDICES = Pdata$Gender, FUN = summary)
# Box plot per gender, with the visible y range set through coord_cartesian.
qplot(
  x = Gender,
  y = Purchase,
  data = subset(Pdata, !is.na(Gender)),
  geom = "boxplot"
) +
  coord_cartesian(ylim = c(0, 15750))
```
Product_Count is the number of times the product appears.
This means products appearing 50 to 500 times are sold more frequently compared to those appearing 1000 times or more.
This means only a very small proportion of products is sold frequently.
```{r}
# Density of Product_Count on a linear x axis.
p1 <- ggplot(data = X_train, mapping = aes(x = Product_Count)) +
  geom_density() +
  xlab("Number of times a product sold")
# The same plot with a log10 x axis (a transform, just for fun).
p2 <- p1 + scale_x_log10()
grid.arrange(p1, p2)
```
Below plot shows the distribution of purchase frequencies and distribution of Occupation frequencies and age frequencies as well
```{r}
library(gridExtra)
# Rows of X_train selected by the split vector spl.
sample_rows <- X_train[spl, ]
# One density plot each for Purchase, Occupation and Age, stacked vertically.
pl1 <- ggplot(data = sample_rows, mapping = aes(x = Purchase)) +
  geom_density() +
  ylab("Frequancy")
pl2 <- ggplot(data = sample_rows, mapping = aes(x = Occupation)) +
  geom_density() +
  ylab("Shopper frequency")
pl3 <- ggplot(data = sample_rows, mapping = aes(x = Age)) +
  geom_density() +
  ylab("Frequancy")
grid.arrange(pl1, pl2, pl3, ncol = 1)
```
If we take product frequency (Product_Count), city C has a higher density of infrequently purchased products.
```{r}
# Density of Product_Count per city indicator, x axis limited to 0-2000.
# Each plot is faceted (rows) and filled by one City_Category_* column.
sample_rows <- X_train[spl, ]
x_limits <- c(0, 2000)
pl1 <- ggplot(
  data = sample_rows,
  mapping = aes(x = Product_Count, fill = City_Category_A)
) +
  geom_density() +
  facet_grid(City_Category_A ~ .) +
  scale_x_continuous(limits = x_limits)
pl2 <- ggplot(
  data = sample_rows,
  mapping = aes(x = Product_Count, fill = City_Category_B)
) +
  geom_density() +
  facet_grid(City_Category_B ~ .) +
  scale_x_continuous(limits = x_limits)
pl3 <- ggplot(
  data = sample_rows,
  mapping = aes(x = Product_Count, fill = City_Category_C)
) +
  geom_density() +
  facet_grid(City_Category_C ~ .) +
  scale_x_continuous(limits = x_limits)
grid.arrange(pl1, pl2, pl3, ncol = 1)
```
When we zoom in further on the right, City B has a higher product frequency among high-volume purchases.
```{r}
# Density of Product_Count per city indicator, x axis limited to 1500-1700.
# Each plot is faceted (rows) and filled by one City_Category_* column.
sample_rows <- X_train[spl, ]
x_limits <- c(1500, 1700)
pl1 <- ggplot(
  data = sample_rows,
  mapping = aes(x = Product_Count, fill = City_Category_A)
) +
  geom_density() +
  facet_grid(City_Category_A ~ .) +
  scale_x_continuous(limits = x_limits)
pl2 <- ggplot(
  data = sample_rows,
  mapping = aes(x = Product_Count, fill = City_Category_B)
) +
  geom_density() +
  facet_grid(City_Category_B ~ .) +
  scale_x_continuous(limits = x_limits)
pl3 <- ggplot(
  data = sample_rows,
  mapping = aes(x = Product_Count, fill = City_Category_C)
) +
  geom_density() +
  facet_grid(City_Category_C ~ .) +
  scale_x_continuous(limits = x_limits)
grid.arrange(pl1, pl2, pl3, ncol = 1)
```
How is marital status affecting the purchase?
Check for imbalance
```{r}
# Row counts per marital status in the sample.
table(Pdata[["Marital_Status"]])
```
Imbalance observed
```{r}
# Numeric summary of Purchase for each marital status in the sample.
by(data = Pdata$Purchase, INDICES = Pdata$Marital_Status, FUN = summary)
```
How about the purchases in the case of married and unmarried customers?
Unmarried customers are the frequent shoppers.
```{r}
# Frequency polygon of purchase amounts, one line per marital status. The y
# value is each bin's count divided by the sum of the counts, i.e. a proportion.
ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, !is.na(Marital_Status))) +
geom_freqpoly(aes(color = Marital_Status), binwidth = 10) +
# Ticks every 2500 units; a step of 50 over this range would draw about 555
# overlapping labels and make the axis unreadable.
scale_x_continuous(limits = c(0, 27723), breaks = seq(0, 27723, 2500)) +
xlab('Purchases') +
ylab('Proportion of customers Married and unmarried')
```
Not clear. Lets transform data and see!
```{r}
# Frequency polygon per marital status, drawn on a square-root x axis.
ggplot(
  data = subset(Pdata, !is.na(Marital_Status)),
  mapping = aes(x = Purchase, y = ..count.. / sum(..count..))
) +
  geom_freqpoly(mapping = aes(color = Marital_Status), binwidth = 10) +
  scale_x_sqrt() +
  xlab("Purchases") +
  ylab("Proportion of customers Married and unmarried")
```
## Conclusion
Looking at these plots, it is obvious that unmarried males are buying more! What is happening? Once they get married, are males buying less due to financial planning or something else?
```{r}
# Left: purchase proportions per gender. Right: per marital status.
# Both use a square-root x axis and are shown side by side.
p1 <- ggplot(
  data = subset(Pdata, !is.na(Gender)),
  mapping = aes(x = Purchase, y = ..count.. / sum(..count..))
) +
  geom_freqpoly(mapping = aes(color = Gender), binwidth = 10) +
  scale_x_sqrt() +
  xlab("Purchases") +
  ylab("Proportion of customers with that Purchase amount")
p2 <- ggplot(
  data = subset(Pdata, !is.na(Marital_Status)),
  mapping = aes(x = Purchase, y = ..count.. / sum(..count..))
) +
  geom_freqpoly(mapping = aes(color = Marital_Status), binwidth = 10) +
  scale_x_sqrt() +
  xlab("Purchases") +
  ylab("Proportion of customers Married and unmarried")
grid.arrange(p1, p2, ncol = 2)
```
Let's see how married males and married females are doing
```{r}
# Four frequency polygons on a square-root x axis, arranged in a 2-column grid.
# p1/p2 split one gender by marital status; p3/p4 split one marital status
# by gender.
p1 <- ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, Pdata$Gender == 'M')) +
  geom_freqpoly(aes(color = Marital_Status), binwidth = 10) +
  scale_x_sqrt() +
  xlab('Male Purchases') +
  ylab('Proportion of male customers Married and unmarried')
p2 <- ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, Pdata$Gender == 'F')) +
  geom_freqpoly(aes(color = Marital_Status), binwidth = 10) +
  scale_x_sqrt() +
  xlab('Female Purchases') +
  ylab('Proportion of female customers Married and unmarried')
# p3 and p4 are subset by marital status, not by gender, so their x labels
# name the marital group (they previously read 'Male Purchases' and
# 'Female Purchases', copied from p1 and p2).
p3 <- ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, Pdata$Marital_Status == 'M')) +
  geom_freqpoly(aes(color = Gender), binwidth = 10) +
  scale_x_sqrt() +
  xlab('Married Purchases') +
  ylab('Proportion of Married customers Male and Female')
p4 <- ggplot(aes(x = Purchase, y = ..count../sum(..count..)), data = subset(Pdata, Pdata$Marital_Status == 'UM')) +
  geom_freqpoly(aes(color = Gender), binwidth = 10) +
  scale_x_sqrt() +
  xlab('Unmarried Purchases') +
  ylab('Proportion of unmarried customers Male and Female')
grid.arrange(p1, p2, p3, p4, ncol = 2)
```
There are more changes in female purchase after marriage than male purchase.
```{r}
# One frequency polygon per (marital status, gender) combination, each on a
# square-root x axis. The two male plots are drawn first, then all four.
t1 <- ggplot(
  data = subset(Pdata, Marital_Status == "M" & Gender == "M"),
  mapping = aes(x = Purchase, y = ..count.. / sum(..count..))
) +
  geom_freqpoly(binwidth = 10) +
  scale_x_sqrt() +
  xlab("Male Purchases") +
  ylab("Sale Proportion of married Male ")
t2 <- ggplot(
  data = subset(Pdata, Marital_Status == "UM" & Gender == "M"),
  mapping = aes(x = Purchase, y = ..count.. / sum(..count..))
) +
  geom_freqpoly(binwidth = 10) +
  scale_x_sqrt() +
  xlab("Male Purchases") +
  ylab("Sale Proportion of unmarried Male")
grid.arrange(t1, t2)
t3 <- ggplot(
  data = subset(Pdata, Marital_Status == "M" & Gender == "F"),
  mapping = aes(x = Purchase, y = ..count.. / sum(..count..))
) +
  geom_freqpoly(binwidth = 10) +
  scale_x_sqrt() +
  xlab("Female Purchases") +
  ylab("Sale Proportion of Married Female")
t4 <- ggplot(
  data = subset(Pdata, Marital_Status == "UM" & Gender == "F"),
  mapping = aes(x = Purchase, y = ..count.. / sum(..count..))
) +
  geom_freqpoly(binwidth = 10) +
  scale_x_sqrt() +
  xlab("Female Purchases") +
  ylab("Sale Proportion of unmarried Female")
grid.arrange(t1, t2, t3, t4)
```
It is evident after looking at the below stats
```{r}
# Purchase summary by marital status, first for male rows, then for female rows.
data <- subset(Pdata, Gender == "M")
by(data = data$Purchase, INDICES = data$Marital_Status, FUN = summary)
data <- subset(Pdata, Gender == "F")
by(data = data$Purchase, INDICES = data$Marital_Status, FUN = summary)
```