# Random_forest_prediction.R
#Loading required R packages
library(tidyverse)
library(caret)
library(randomForest)
library(party)
library(MLmetrics)
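# caret drives model training and tuning, with the randomForest package as the backend for method = "rf";
# MLmetrics provides MAPE, and tidyverse supplies rownames_to_column() and the %>% pipe.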
path1 = '../../all_variables_and_GPI_monthly_all_countries'
path2 = '../../rf_results'
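# path1: directory with one input CSV per country (predictor variables plus the monthly GPI);
# path2: directory where the random-forest outputs (importances, predictions, metrics, plots) are written.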
country_files = list.files(path1, pattern="*.csv")
for (i in country_files){
  # Extract the country name from the file name (all_variables_<country>.csv)
  coun <- strsplit(i, "_")[[1]][[3]]
  country <- strsplit(coun, ".", fixed = TRUE)[[1]][[1]]
  print(country)
  results_analytics <- setNames(data.frame(matrix(ncol = 5, nrow = 0)),
                                c('country', 'RMSE', 'Rsquare', 'Mape', 'Pearson'))
  #Load the data
  file_df <- file.path(path1, paste('all_variables_', country, '.csv', sep = ''))
  if (file.exists(file_df)){
    df_country_initial <- read.csv(file_df, stringsAsFactors = FALSE)
    drops <- c("MonthYear")
    df_country_rf <- df_country_initial[ , !(names(df_country_initial) %in% drops)]
    #Split the data into training and test set
    train_set <- 0.5
    train.data <- head(df_country_rf, round(length(df_country_rf$GPI) * train_set))
    h <- length(df_country_rf$GPI) - length(train.data$GPI)
    test.data <- tail(df_country_rf, h)
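    # The split is chronological: head() keeps the first half of the monthly observations for
    # training and tail() keeps the remaining months as the test period, so no future data
    # leaks into the initial training set.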
    #Prediction model
    #The dataframe with the most important variables from each training
    df_important_var <- setNames(data.frame(matrix(ncol = 1, nrow = 0)), c("var_name"))
    mtry_var <- list()
    predictions <- double()
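    # Walk-forward (expanding-window) forecasting: for each test month the random forest is
    # retrained on all data seen so far, mtry is tuned by 10-fold CV over tuneLength = 10 values,
    # a one-step-ahead prediction is made, and the observed test row is then appended to the
    # training set before the next iteration.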
    for (j in (1:nrow(test.data))){
      model <- train(
        GPI ~ ., data = train.data,
        method = "rf",
        importance = TRUE,
        trControl = trainControl(method = "cv", number = 10),
        tuneLength = 10
      )
      #Variables' importance
      imp_var <- varImp(model)$importance
      imp_var <- rownames_to_column(imp_var, var = "var_name")
      df_important_var <- rbind(df_important_var, imp_var)
      # Best tuning parameter
      mtry <- model$bestTune
      mtry_var <- rbind(mtry_var, mtry)
      # Predict the next test observation, then add it to the training data
      predictions <- append(predictions, model %>% predict(test.data[j, , drop = FALSE]))
      train.data <- rbind(train.data, test.data[j, , drop = FALSE])
    }
    #Save the most important variables from each training
    write.csv(df_important_var,
              file.path(path2, paste(country, '_rf_', train_set, '_important_variables.csv', sep = '')))
    #Save mtry and the predictions in a dataframe
    predictions_mtry <- data.frame(mtry_var, predictions)
    rownames(predictions_mtry) <- NULL
    write.csv(predictions_mtry,
              file.path(path2, paste(country, '_rf_', train_set, '_predictions_mtry.csv', sep = '')),
              row.names = TRUE)
    # Model performance metrics
    results_analytics <- data.frame(
      country = country,
      RMSE = RMSE(predictions, test.data$GPI),
      Rsquare = R2(predictions, test.data$GPI),
      Mape = MAPE(predictions, test.data$GPI),
      Pearson = cor(test.data$GPI, predictions, method = "pearson")
    )
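    # All metrics compare the full vector of one-step-ahead predictions with the observed GPI
    # values over the test period.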
    write.csv(results_analytics,
              file.path(path2, paste(country, '_rf_', train_set, '_results.csv', sep = '')),
              row.names = TRUE)
    #Scatterplot of predicted vs actual data, with a fitted regression line
    pdf(file = file.path(path2, paste(country, '_rf_', train_set, '_scatterplot', '.pdf', sep = '')))
    plot(test.data$GPI, predictions,
         xlab = "Actual GPI", ylab = "Predicted GPI")
    abline(lm(predictions ~ test.data$GPI))
    dev.off()
    #Plot predicted vs actual data over the test months (red = actual, blue = predicted)
    pdf(file = file.path(path2, paste(country, '_rf_', train_set, '_trends_pred_actual', '.pdf', sep = '')))
    plot(test.data$GPI, type = "l", col = "red", main = "Predicted vs Actual GPI", sub = "",
         ylab = "GPI values", xlab = "months")
    lines(predictions, col = "blue")
    dev.off()
  }
  #Optionally remove all objects before moving on to the next iteration
  #rm(list = ls(all.names = TRUE))
}