-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfuncs.R
126 lines (97 loc) · 3.61 KB
/
funcs.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#' Plot a histogram of one statistic taken from the data frame `df`
#'
#' `df` is not an argument: it is looked up in the enclosing (typically
#' global) environment at call time.
#'
#' @param cur_stat A single string. Normally the name of a column of `df`.
#'   For backward compatibility, a string that is not a column name is parsed
#'   and evaluated as R code inside the plot's data context.
#' @return A ggplot object: a 40-bin histogram with `cur_stat` as the x-axis
#'   label and "Count" as the y-axis label.
plot_distribution <- function(cur_stat) {
  is_column <- length(cur_stat) == 1L && cur_stat %in% names(df)
  x_mapping <- if (is_column) {
    # Direct column lookup: no code is parsed, and column names that are not
    # syntactically valid R (spaces, leading digits, ...) work as well.
    aes(x = .data[[cur_stat]])
  } else {
    # Legacy path, kept so that callers passing an expression string
    # keep working.
    aes(x = eval(parse(text = cur_stat)))
  }
  ggplot(df, x_mapping) +
    geom_histogram(bins = 40) +
    labs(x = cur_stat, y = "Count") +
    theme_bw()
}
#' Extract the "Freq<con_base>" value(s) of a position as a numeric vector
#'
#' Selects every column of `cur_pos` whose name contains the string "Freq"
#' immediately followed by the value of `cur_pos$con_base`, and coerces the
#' selection to numeric.
#'
#' @param cur_pos Data frame with a `con_base` column and one or more
#'   "Freq..." columns. Presumably a single row -- TODO confirm with callers.
#' @return Numeric vector holding the values of the matching column(s).
freq_mapper <- function(cur_pos) {
  # Column-name fragment to search for (literal match, not a regex)
  freq_key <- paste0("Freq", cur_pos$con_base)
  matching_cols <- select(cur_pos, contains(freq_key))
  as.numeric(as.vector(matching_cols))
}
#' Extract the "<pattern>_<AbundMaxIdx>." value(s) of a position
#'
#' Selects every column of `cur_pos` whose name contains the literal string
#' `pattern`, an underscore, the value of `cur_pos$AbundMaxIdx` and a dot,
#' and coerces the selection to numeric.
#'
#' @param cur_pos Data frame with an `AbundMaxIdx` column. Presumably a
#'   single row -- TODO confirm with callers.
#' @param pattern String prefix of the column name(s) to pick.
#' @return Numeric vector holding the values of the matching column(s).
max_mapper <- function(cur_pos, pattern) {
  # The trailing "." is matched literally because contains() does not use
  # regular expressions.
  column_key <- paste0(pattern, "_", cur_pos$AbundMaxIdx, ".")
  matching_cols <- select(cur_pos, contains(column_key))
  as.numeric(as.vector(matching_cols))
}
#' Read one semicolon-separated sample file and label its rows
#'
#' The file is read with `read_csv2()`. Two columns are then set:
#' `sample` is `sample_name` with the substrings "data/" and ".csv" removed;
#' `genome` is the existing `genome` column with ".fa" and "matabat2bin."
#' removed.
#'
#' @param sample_name Path of the file to read.
#' @return The data frame read from `sample_name` with the `sample` column
#'   added and the `genome` column cleaned.
sample_loader <- function(sample_name) {
  # fixed = TRUE: the patterns are literal substrings. As regular expressions
  # the "." would match any character, so e.g. "mycsvfile.csv" would lose
  # "ycsv" as well as the ".csv" suffix.
  read_csv2(sample_name) %>%
    mutate(
      sample = gsub(
        ".csv", "",
        gsub("data/", "", sample_name, fixed = TRUE),
        fixed = TRUE
      ),
      genome = gsub(
        "matabat2bin.", "",
        gsub(".fa", "", genome, fixed = TRUE),
        fixed = TRUE
      )
    )
}
#' Read one tab-separated scaffold-info file and label its rows
#'
#' The file is read with `read_tsv()`. A `sample` column is added: it is
#' `sample_name` with the substrings "data/scaffold_info/reference.fa." and
#' "_1.paired.fastq.gz_profile.IS_scaffold_info.tsv" removed.
#'
#' @param sample_name Path of the file to read.
#' @return The data frame read from `sample_name` with the `sample` column
#'   added.
sample_loader2 <- function(sample_name) {
  # fixed = TRUE: both patterns are literal path fragments. As regular
  # expressions every "." in them would match any character.
  read_tsv(sample_name) %>%
    mutate(
      sample = gsub(
        "_1.paired.fastq.gz_profile.IS_scaffold_info.tsv", "",
        gsub(
          "data/scaffold_info/reference.fa.", "", sample_name,
          fixed = TRUE
        ),
        fixed = TRUE
      )
    )
}
#' Read one gene-info file, join the mapping table and filter by sample
#'
#' The file is read with `method`. A `sample` column is added: it is
#' `sample_name` with the substrings "data/gene_info/" and "_gene_info.tsv"
#' removed. The result is left-joined with `mapping`, an object that is not
#' an argument and is looked up in the enclosing environment, using the
#' columns the two tables have in common. Only rows where `AbundMaxSample`
#' equals `sample` are kept.
#'
#' @param sample_name Path of the file to read.
#' @param gen_list Currently unused; kept so that existing calls keep working.
#' @param method Reader function called as `method(sample_name)`.
#'   NOTE(review): the default is `read_csv` although the removed suffix is
#'   "_gene_info.tsv" -- confirm that callers pass a suitable reader.
#' @return The filtered, joined data frame.
gene_loader <- function(sample_name, gen_list = "", method = read_csv) {
  # fixed = TRUE: the patterns are literal path fragments. As a regular
  # expression the "." in "_gene_info.tsv" would match any character.
  method(sample_name) %>%
    mutate(
      sample = gsub(
        "data/gene_info/", "",
        gsub("_gene_info.tsv", "", sample_name, fixed = TRUE),
        fixed = TRUE
      )
    ) %>%
    left_join(mapping) %>%
    filter(AbundMaxSample == sample)
}
#' Identify rows and columns that share too few non-NA values with others
#'
#' Hclust cannot handle matrices in which, for some pairs of rows or columns,
#' only 1 or fewer shared values are non-NA. This function repeatedly
#' identifies the row (column) involved in the most problematic pairs,
#' marks it for exclusion, and checks whether more rows (columns) need to be
#' excluded. A row (column) is also compared with itself, so one that has too
#' few non-NA values of its own counts as problematic.
#'
#' Rows and columns are analysed independently, both on the full input
#' matrix.
#'
#' @param mat Matrix to investigate.
#' @param min_shared_fields A pair of vectors is flagged as problematic when
#'   the number of positions that are non-NA in both vectors is less than or
#'   equal to this value.
#' @return A list with elements `row` and `column`: integer vectors of the
#'   indices to exclude, in the order they were selected, or `NULL` when
#'   nothing needs to be excluded.
#'
identify_problematic_combs <- function(mat, min_shared_fields = 1) {
  stopifnot(is.matrix(mat))

  # Greedy selection on a symmetric matrix whose entry [i, j] is the number
  # of positions that are non-NA in both vector i and vector j.
  pick_exclusions <- function(shared_counts) {
    is_problem <- shared_counts <= min_shared_fields
    self_problem <- diag(is_problem)
    n_vectors <- nrow(is_problem)
    excluded <- NULL
    # seq_len() keeps the loop empty for a zero-length dimension
    for (k in seq_len(n_vectors)) {
      candidates <- setdiff(seq_len(n_vectors), excluded)
      # Number of problematic pairs each candidate takes part in; a
      # problematic self-pair counts twice, once per member of the pair.
      involvement <-
        rowSums(is_problem[candidates, candidates, drop = FALSE]) +
        self_problem[candidates]
      if (all(involvement == 0)) break
      # which.max() returns the first maximum, so ties go to the lowest index
      excluded <- c(excluded, candidates[which.max(involvement)])
    }
    excluded
  }

  # 1 where a value is present, 0 where it is NA; the cross-products of this
  # indicator matrix are the pairwise shared-value counts.
  present <- !is.na(mat)
  storage.mode(present) <- "double"

  list(
    "row" = pick_exclusions(tcrossprod(present)),
    "column" = pick_exclusions(crossprod(present))
  )
}
#' Drop the rows and columns flagged by `identify_problematic_combs()`
#'
#' The function is defined without arguments; the assignment right after the
#' definition copies the argument list of `identify_problematic_combs()`
#' onto it, so it takes the same arguments (`mat`, `min_shared_fields`) with
#' the same defaults.
#'
#' @return `mat` without the flagged rows and columns. The result is always
#'   a matrix, even when only one row or column remains.
remove_problematic_combs <- function() {
  problematic_combs <- identify_problematic_combs(
    mat = mat, min_shared_fields = min_shared_fields
  )
  # drop = FALSE: without it a single remaining row or column would be
  # returned as a plain vector instead of a matrix.
  if (!is.null(problematic_combs$row)) {
    mat <- mat[-problematic_combs$row, , drop = FALSE]
  }
  if (!is.null(problematic_combs$column)) {
    mat <- mat[, -problematic_combs$column, drop = FALSE]
  }
  mat
}
# Give the wrapper the same arguments and defaults as the function it wraps
formals(remove_problematic_combs) <- formals(identify_problematic_combs)