-
Notifications
You must be signed in to change notification settings - Fork 3
/
rowVarImp.R
61 lines (51 loc) · 1.9 KB
/
rowVarImp.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
rowVarImp.h2o <- function(model, cols, data_frame, n) {
  # Calculates the top variable importances for each row.
  #
  # For every column named in `cols`, that column is dropped from
  # `data_frame`, the model is re-scored with h2o.predict(), and the change in
  # the third prediction column relative to the full-data prediction is used
  # as that column's per-row importance. The n highest-ranked columns per row
  # are then picked by get_high_rank_values().
  #
  # Args:
  #   model: A h2o model.
  #   cols: Names of the columns to be considered for calculating ranks.
  #     Every name must be a column of `data_frame`.
  #   data_frame: A H2O data frame.
  #   n: Number of most important variables to report per row. Values larger
  #     than length(cols) are capped at length(cols); must be at least 1.
  #
  # Returns:
  #   A R data.frame holding, for each i in 1..n, three adjacent columns:
  #   colName_VI<i> (variable name), value_VI<i> (its value) and imp_VI<i>
  #   (its importance, numeric).
  #
  # Raises:
  #   An error if `cols` is empty, names a column missing from `data_frame`,
  #   or if `n` is smaller than 1.
  all_names <- names(data_frame)
  if (length(cols) == 0L) {
    stop("`cols` must contain at least one column name.", call. = FALSE)
  }
  unknown <- setdiff(cols, all_names)
  if (length(unknown) > 0L) {
    stop(
      "Columns not found in `data_frame`: ",
      paste(unknown, collapse = ", "),
      call. = FALSE
    )
  }

  # n cannot be bigger than length(cols)
  n <- min(length(cols), n)
  if (n < 1) {
    stop("`n` must be at least 1.", call. = FALSE)
  }

  # Prediction when all co-variables are included.
  # NOTE(review): column 3 of the prediction frame is presumably the
  # positive-class probability of a binomial model -- confirm with callers.
  pred.orig <- h2o.predict(model, data_frame)

  # Drop one column at a time and record the difference in prediction.
  pred.diff <- NULL
  for (col_name in cols) {
    # Select the remaining columns by name: a negative `which()` index would
    # select nothing at all if it ever came back empty.
    tmp.df <- data_frame[, all_names[all_names != col_name]]
    pred <- h2o.predict(model, tmp.df)
    if (is.null(pred.diff)) {
      pred.diff <- pred.orig[, 3] - pred[, 3]
      names(pred.diff) <- col_name
    } else {
      pred.diff[, col_name] <- pred.orig[, 3] - pred[, 3]
    }
  }

  # Convert the H2O frames to matrices; values are compared as character.
  rankMat <- as.matrix(pred.diff)
  valMat <- as.matrix(data_frame[, cols])
  class(valMat) <- "character"

  # l[[1]] is bound below as the importances, l[[2]] as the values and
  # l[[3]] as the column names.
  l <- get_high_rank_values(rankMat, valMat, cols, n)

  # Convert importances to numeric while keeping the 2-d shape;
  # apply(x, 2, as.numeric) collapses to a vector for a single-row input.
  impMat <- as.matrix(l[[1]])
  impNum <- matrix(
    as.numeric(impMat),
    nrow = nrow(impMat),
    ncol = ncol(impMat)
  )

  # Bind all results.
  res <- cbind(
    as.data.frame(l[[3]]),
    as.data.frame(l[[2]]),
    as.data.frame(impNum)
  )

  # Give an appropriate name to each column.
  idx <- seq_len(n)
  colnames(res) <- c(
    paste0("colName_VI", idx),
    paste0("value_VI", idx),
    paste0("imp_VI", idx)
  )

  # Re-arrange so that name, value and importance of the same rank are
  # adjacent: (1, n + 1, 2n + 1, 2, n + 2, 2n + 2, ...).
  res[, as.vector(rbind(idx, n + idx, 2 * n + idx))]
}