```{r}
library(data.table)
library(h2o)
library(dplyr)
# localH2O = h2o.init()
localH2O = h2o.init(nthreads = -1, max_mem_size = '6g', assertion = FALSE)
cat("reading the train and test data (with data.table) \n")
train <- fread("../data/train.csv",stringsAsFactors = T)
test <- fread("../data/test.csv",stringsAsFactors = T)
store <- fread("../data/store.csv",stringsAsFactors = T)
train <- train[Sales > 0,] ## We are not judged on 0 sales records in test set
train <- merge(train,store,by="Store")
test <- merge(test,store,by="Store")
```
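# Optional sanity check (not part of the original pipeline): merge() with its default all = FALSE is an
# inner join, so any Store missing from store.csv silently drops rows. A minimal sketch to confirm the
# merged tables still look sensible:
```{r}
# row counts and distinct stores after the merge; NAs here usually come in from store.csv
nrow(train); nrow(test)
length(unique(train$Store))
colSums(is.na(train))
```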
# More care should be taken to ensure the dates in test can be projected from train.
# Decision trees do not extrapolate well, so you will want some strategy here if you use the dates
# (one optional sketch follows the next chunk).
```{r}
train[,Date:=as.Date(Date)]
test[,Date:=as.Date(Date)]
train[,Store:=as.factor(as.numeric(Store))]
test[,Store:=as.factor(as.numeric(Store))]
```
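# One possible way to expose the dates to the forest (a sketch, not run): derive simple calendar
# features with data.table's month()/year()/mday() helpers. The column names month/year/day are our
# own choice, and any column added before the `features` vector is built below would be picked up by
# the model, so the chunk is left with eval = FALSE.
```{r, eval=FALSE}
# Illustrative calendar features derived from Date
train[, `:=`(month = month(Date), year = year(Date), day = mday(Date))]
test[, `:=`(month = month(Date), year = year(Date), day = mday(Date))]
```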
# Log-transform Sales so the model is less sensitive to very high sales
```{r}
train[,logSales:=log1p(Sales)]
trainHex<-as.h2o(train)
features<-colnames(train)[!(colnames(train) %in% c( "V1","Id","Date","Sales","logSales",
"Customers","Open","type","SH_IMM_MEAS"))]
```
## Train a random forest (150 trees, max depth 50; other parameters left at their defaults)
```{r}
trees = 150
depth = 50
rfHex <- h2o.randomForest(x=features,
y="logSales",
ntrees = trees,
max_depth = depth,
nbins_cats = 1115, ## allow it to fit store ID
training_frame=trainHex)
```
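# Optional: a quick look at the fitted forest before saving it. h2o.performance() with no new data
# reports training metrics; note they are on the log scale, since the response is logSales.
```{r}
summary(rfHex)
h2o.performance(rfHex)
```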
```{r}
# Run this and save the output: these are the variable importances
(varimps = data.frame(h2o.varimp(rfHex)))
```
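# One way to keep the importances on disk next to the model (the file name and folder below are our
# own placeholders, so the chunk is not evaluated):
```{r, eval=FALSE}
# Hypothetical output path; point it at wherever you keep model artifacts
write.csv(varimps, "./H2O_models_v2/varimps_rf_150_50.csv", row.names = FALSE)
```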
## Saving our model:
```{r}
#########################################
# DON'T FORGET TO CHANGE THE PATH BELOW #
#########################################
rfHex_150_50_v2_data = rfHex
h2o.saveModel(rfHex_150_50_v2_data, path = '/Users/jfdarre/Documents/NYCDS/Project4/H2O_models_v2', force = FALSE)
```
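# To pick the model up again in a later session, h2o.loadModel() takes the full path to the file
# written by h2o.saveModel() above (the <model_id> part below is a placeholder):
```{r, eval=FALSE}
rfHex <- h2o.loadModel('/Users/jfdarre/Documents/NYCDS/Project4/H2O_models_v2/<model_id>')
```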
## Training score helper: per-row squared percentage error (used to compute the RMSPE below)
```{r}
# Despite its name, this helper returns the squared percentage error for each row;
# sqrt(sum(...)/n), taken below, turns it into the root mean squared percentage error (RMSPE)
rmse = function(predictions, targets) {
  return(((predictions - targets)/targets) ** 2)
}
```
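# For reference, a self-contained version of the full metric (our own helper, equivalent to the
# sqrt(sum(rmse(...))/nrow(...)) computed in the next chunk):
```{r}
rmspe <- function(predictions, targets) {
  sqrt(mean(((predictions - targets) / targets)^2))
}
```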
## Running training errors
```{r}
train_pred = as.data.frame(h2o.predict(rfHex,trainHex))
train_pred <- expm1(train_pred[,1])
train$pred = train_pred
train$rmse = rmse(train_pred, train$Sales)
train2 = filter(train, month(Date) %in% c(8, 9))  # data.table's month() extracts the month from Date
(total_rmse = sqrt(sum(train$rmse)/nrow(train)))
(partial_rmse = sqrt(sum(train2$rmse)/nrow(train2)))
sumup = as.data.frame(rbind(summary(train_pred), summary(train$Sales), summary(train2$pred), summary(train2$Sales)))
sumup$sd = c(round(sd(train_pred)), round(sd(train$Sales)), round(sd(train2$pred)), round(sd(train2$Sales)))
# Note the training error on the whole set (the first value) and the training error restricted
# to August and September (the second value), which are the months covered by the test set
sumup
```
## Load the test data into the H2O cluster, predict in H2O, and bring the predictions back into R with as.data.frame
```{r}
cat("Predicting Sales\n")
testHex<-as.h2o(test)
predictions<-as.data.frame(h2o.predict(rfHex,testHex))
pred <- expm1(predictions[,1])
submission <- data.frame(Id=test$Id, Sales=pred)
cat("saving the submission file\n")
write.csv(submission, "./H2O_submits/h2o_150_50_v2_data.csv",row.names=F)
```
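# Optional cleanup once the submission file is written (only run this if nothing else still needs
# the local cluster):
```{r, eval=FALSE}
# shuts the local H2O cluster down without asking for confirmation
h2o.shutdown(prompt = FALSE)
```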