-
Notifications
You must be signed in to change notification settings - Fork 0
/
remove_correlated_par.R
80 lines (62 loc) · 2.81 KB
/
remove_correlated_par.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
library(doParallel)
#' Parallel removal of highly correlated variables
#'
#' @description This function considers subsets (chunks) of features and
#' removes correlated features pair-wise within each chunk. If two variables
#' have a high correlation, `caret::findCorrelation()` looks at the mean
#' absolute correlation of each variable and removes the variable with the
#' largest mean absolute correlation. The chunks are processed in parallel.
#' Once a pass through the entire set of features is performed, the remaining
#' features are shuffled and the process is repeated, until either no feature
#' is removed during a pass, fewer than two features remain, or `maxiter`
#' passes have been done.
#'
#' A chunk whose correlation matrix contains any `NA` (or that holds a single
#' feature) is passed through unchanged for that pass.
#'
#' @param mat matrix or data frame, samples x features. It is transposed
#' internally so that features can be sliced by row.
#' @param dim_split integer. Number of features considered in each chunk
#' (def. 1000). A non-finite value (e.g. `Inf`) means "all features in one
#' chunk".
#' @param maxiter integer. Maximum number of passes (def. 10).
#' @param method string. Method used to compute correlation. Options are
#' c("pearson", "kendall", "spearman"). Default "kendall".
#' @param cutoff numeric. A numeric value for the pair-wise absolute
#' correlation cutoff (def. 0.6).
#' @param ncores integer. Upper bound on the number of worker processes. The
#' cluster uses at most `parallel::detectCores() - 1` workers and always at
#' least one. The default calls `my_detectCores()`, which is defined outside
#' this function.
#'
#' @return A matrix (samples x features) having only the selected subset of
#' variables.
#' @export
remove_correlated_par <- function(mat, dim_split = 1000, maxiter = 10,
                                  method = "kendall", cutoff = 0.6,
                                  ncores = my_detectCores()) {
  # Work on features x samples: every row of X is one feature.
  X <- t(mat)
  niter <- 0
  if (!is.finite(dim_split)) dim_split <- nrow(X)
  cat("dim(X) before starting remove correlation: ", dim(X), "\n")

  # The cluster is created lazily (only if at least one pass is needed) and
  # reused across passes; on.exit() guarantees it is released even when a
  # worker raises an error.
  cl <- NULL
  on.exit(if (!is.null(cl)) parallel::stopCluster(cl), add = TRUE)

  # With fewer than two features there is nothing to compare, so skip.
  while (niter < maxiter && nrow(X) > 1) {
    cat("niter = ", niter, "\n")

    if (is.null(cl)) {
      # detectCores() may return NA, and detectCores() - 1 is 0 on a
      # single-core machine; always keep at least one worker.
      available <- parallel::detectCores() - 1
      if (is.na(available)) available <- 1
      cl <- parallel::makeCluster(max(1, min(ncores, available)))
      doParallel::registerDoParallel(cl)
    }

    filtered_X <- foreach(
      nR = seq(1, nrow(X), by = dim_split),
      .combine = "rbind", .packages = "caret"
    ) %dopar% {
      # drop = FALSE keeps a matrix (and the feature names) even when the
      # chunk holds a single feature.
      subX <- X[nR:min(nrow(X), nR + dim_split - 1), , drop = FALSE]
      if (nrow(subX) >= 2) {
        cc <- cor(t(subX), use = "pairwise.complete.obs", method = method)
        # findCorrelation() cannot handle missing correlations; such chunks
        # are left untouched for this pass.
        if (!anyNA(cc)) {
          select_corr <- caret::findCorrelation(cc, cutoff = cutoff,
                                                exact = FALSE)
          if (length(select_corr) > 0) {
            subX <- subX[-select_corr, , drop = FALSE]
          }
        }
      }
      subX
    }

    no_removed <- nrow(X) - nrow(filtered_X)
    cat("Removed = ", no_removed, "\n")
    cat("dim filetered_X =", dim(filtered_X), "\n")
    # Shuffle the surviving features so that the next pass compares
    # different groups of features with each other.
    X <- filtered_X[sample(nrow(filtered_X)), , drop = FALSE]
    niter <- niter + 1
    cat("nrow(X) =", dim(X), "\n")
    if (no_removed == 0) break
  }

  cat("final dimension = ", dim(t(X)), "\n")
  t(X)
}