-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_other_models.R
140 lines (104 loc) · 4.28 KB
/
run_other_models.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
## Import libraries
suppressPackageStartupMessages({
library(dplyr)
library(reshape2)
library(purrr)
library(tidyr)
library(cassandRa)
library(progress)
library(foreach)
library(doSNOW) # parallel + progress bar fix
})
## Set important paths
# Path of the main folder
path_main_dir = "/gpfs0/shai/users/barryb/link-predict/" # HPC
# dir_path = dirname(rstudioapi::callFun("getActiveDocumentContext")$path) # Rstudio
# Paths of data by specific purpose
path_metadata = paste0(path_main_dir, "data/processed/") # Path of proccessed networks & features data, serving as complementary metadata
path_raw_results = paste0(path_main_dir, "results/raw/") # Path of raw results
## Set Variables
cores = as.numeric(Sys.getenv('NSLOTS'))
if (is.na(cores)){
cores = 2
}
parallel_processing = T
## Load data
# Load subsamples (edgelists) data
message('Loading data\n')
subsamples_edge_lists <- read.csv(paste0(path_metadata, "networks/subsamples_edge_lists.csv"))
# Define the test set
test_data <- subsamples_edge_lists
# Main function - fit models
fit_models <- function(edgelist, n=10) {
# Convert edgelist to matrix
matrix = bipartite::frame2webs(edgelist, varname = c("lower_level", "higher_level", "subsample_ID", "weight"), emptylist=FALSE)[[1]]
# Create list object
network_list <- CreateListObject(matrix)
# Fit SBM
SBM_ProbsMat <-
melt(FitSBM(network_list, n_SBM = n, G = NULL)$SBM_ProbsMat) %>%
rename('SBM_Prob' = value) %>%
select(SBM_Prob)
# Fit centrality
C_ProbsMatrix <-
melt(FitCentrality(network_list, N_runs = n, maxit = 10000, method = "Nelder-Mead")$C_ProbsMatrix) %>%
rename('C_Prob' = value) %>%
select(C_Prob)
# Create dataframe
result <- expand.grid(network_list$HostNames, network_list$WaspNames, stringsAsFactors = FALSE) %>%
rename('lower_level' = Var1, 'higher_level' = Var2) %>%
bind_cols(SBM_ProbsMat, C_ProbsMatrix)
return(result)
}
# Get subsamples ids
ids = unique(test_data$subsample_ID)#[1:3]
len = length(ids)
message('Fitting predictive models\n')
if (parallel_processing == TRUE){
cl <- makeCluster(cores, outfile="/dev/null") # outfile="/dev/null": surpress the "Type: EXEC" and "Type: DONE" output on HPC
registerDoSNOW(cl)
pb <- txtProgressBar(min = 1, max = len, style = 3)
progress <- function(i) setTxtProgressBar(pb, i)
opts <- list(progress = progress)
pred_df <- foreach (i=1:len,
.combine=rbind,
.options.snow = opts,
.packages = c("cassandRa", "bipartite", "dplyr", "reshape2")) %dopar% {
id = ids[[i]]
edgelist = fit_models(test_data[test_data$subsample_ID == id, ] %>% select(lower_level, higher_level, subsample_ID, weight))
edgelist$subsample_ID = id
return(edgelist)
}
close(pb)
stopCluster(cl)
} else { # not parallel processing
# Set up a manual progress bar
pb <- progress_bar$new(
format = "Calculating [:bar] :percent",
total = len, clear = FALSE, width = 60
)
# Apply fit_models function on each subsample
pred_df <- test_data %>%
filter(subsample_ID %in% ids) %>% # Filter only the subsamples in the test set
select(lower_level, higher_level, subsample_ID, weight) %>%
mutate(subsample_ID_copy = subsample_ID) %>% # Create a copy of subsample_ID
group_by(subsample_ID_copy) %>%
nest() %>%
mutate(model = map(data, ~ {
pb$tick() # Update the progress bar at each iteration
fit_models(as.data.frame(.x))
})) %>%
unnest(cols = c(model)) %>%
rename('subsample_ID' = subsample_ID_copy) %>%
select(-data)
# pb$close()
}
# restore link_ID column
pred_df <- merge(pred_df, test_data %>% select(link_ID, lower_level, higher_level, subsample_ID), by = c("subsample_ID", "lower_level", "higher_level"))
# Save the results
cat('Exporting new dataframe\n')
write.csv(pred_df %>% select(link_ID, SBM_Prob, C_Prob), paste0(path_raw_results, "other_models.csv"), row.names = FALSE)
cat('\nDone')
# Debugging
edgelist = test_data %>% filter(subsample_ID == 3) %>% select( lower_level, higher_level, subsample_ID, weight)
res = fit_models(edgelist)