Binding for acc2lin.R, assign_job_queue.R, cleanup.R, ipr2viz.R, line…

…age.R, msa.R, plotting.R, summarize.R and tree.R
JRaviLab · Oct 6, 2024 · 6b03555 · 6b03555
1 parent 08c6d55
commit 6b03555
Show file tree

Hide file tree

Showing 10 changed files with 89 additions and 111 deletions.
diff --git a/MolEvolvR.Rproj b/MolEvolvR.Rproj
diff --git a/R/acc2lin.R b/R/acc2lin.R
@@ -197,12 +197,12 @@ efetch_ipg <- function(accnums, out_path, plan = "sequential") {
 ipg2lin <- function(accessions, ipg_file, assembly_path, lineagelookup_path) {
     ipg_dt <- fread(ipg_file, sep = "\t", fill = T)
 
-    ipg_dt <- ipg_dt[Protein %in% accessions]
+    ipg_dt <- ipg_dt[.data$Protein %in% accessions]
 
     ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID")
 
     lins <- GCA2Lins(prot_data = ipg_dt, assembly_path, lineagelookup_path)
-    lins <- lins[!is.na(Lineage)] %>% unique()
+    lins <- lins[!is.na(.data$Lineage)] %>% unique()
 
     return(lins)
 }

diff --git a/R/assign_job_queue.R b/R/assign_job_queue.R
@@ -94,7 +94,7 @@ get_proc_medians <- function(dir_job_results) {
         dplyr::summarise(
             dplyr::across(
                 dplyr::everything(),
-                \(x) median(x, na.rm = TRUE)
+                \(x) .data$median(x, na.rm = TRUE)
             )
         ) |>
         as.list()
@@ -126,7 +126,7 @@ write_proc_medians_table <- function(dir_job_results, filepath) {
             names_to = "process",
             values_to = "median_seconds"
         ) |>
-        dplyr::arrange(dplyr::desc(median_seconds))
+        dplyr::arrange(dplyr::desc(.data$median_seconds))
     readr::write_tsv(df_proc_medians, file = filepath)
     return(df_proc_medians)
 }

diff --git a/R/cleanup.R b/R/cleanup.R
@@ -342,7 +342,7 @@ remove_tails <- function(prot, by_column = "DomArch",
         # !! Insert line to read domains_keep
 
         # Contains all domains separated by "|"
-        domains_for_grep <- paste(domains_keep$domains, collapse = "|")
+        domains_for_grep <- paste(.data$domains_keep$domains, collapse = "|")
         # Remove rows with no domains contained within domains_keep
         # Redundant for ClustName since we already set the filter to only these doms.
         tails <- tails %>%
@@ -693,35 +693,35 @@ cleanup_GeneDesc <- function(prot, column) {
 pick_longer_duplicate <- function(prot, column) {
     col <- sym(column)
 
-    prot$row.orig <- 1:nrow(prot)
+    prot$.data$row.orig <- 1:nrow(prot)
 
     # Get list of duplicates
     dups <- prot %>%
-        group_by(AccNum) %>%
+        group_by(.data$AccNum) %>%
         summarize("count" = n()) %>%
         filter(count > 1) %>%
         arrange(-count) %>%
         merge(prot, by = "AccNum")
 
-    dup_acc <- dups$AccNum
+    dup_acc <- dups$.data$AccNum
 
     longest_rows <- c()
     remove_rows <- c()
     for (acc in dup_acc) {
-        dup_rows <- dups %>% filter(AccNum == acc)
+        dup_rows <- dups %>% filter(.data$AccNum == acc)
 
         longest <- dup_rows[which(nchar(pull(dup_rows, {{ col }})) == max(nchar(pull(dup_rows, {{ col }}))))[1], "row.orig"]
 
         longest_rows <- c(longest_rows, longest)
 
-        to_remove <- dup_rows[which(dup_rows$row.orig != longest), "row.orig"][]
+        to_remove <- dup_rows[which(dup_rows$.data$row.orig != longest), "row.orig"][]
 
         # dup_rows[which(nchar(pull(dup_rows,{{col}})) == max(nchar(pull(dup_rows,{{col}}))))[2:nrow(dup_rows)], "row.orig"]
         remove_rows <- c(remove_rows, to_remove)
     }
 
     # grab all the longest rows
-    unique_dups <- prot[-remove_rows, ] %>% select(-row.orig)
+    unique_dups <- prot[-remove_rows, ] %>% select(-.data$row.orig)
 
     return(unique_dups)
 }

diff --git a/R/ipr2viz.R b/R/ipr2viz.R
@@ -134,10 +134,10 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     ADDITIONAL_COLORS <- sample(CPCOLS, 1000, replace = TRUE)
     CPCOLS <- append(x = CPCOLS, values = ADDITIONAL_COLORS)
     ## Read IPR file
-    ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = iprscan_cols)
-    ipr_out <- ipr_out %>% filter(Name %in% accessions)
+    ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = .data$iprscan_cols)
+    ipr_out <- ipr_out %>% filter(.data$Name %in% accessions)
     analysis_cols <- paste0("DomArch.", analysis)
-    infile_full <- infile_full %>% select(analysis_cols, Lineage_short, QueryName, PcPositive, AccNum)
+    infile_full <- infile_full %>% select(analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum)
     ## To filter by Analysis
     analysis <- paste(analysis, collapse = "|")
     ## @SAM: This can't be set in stone since the analysis may change!
@@ -152,35 +152,35 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
     # Filter by Top Accessions per Accession per DomArch and Lineage
     ipr_out <- subset(
         ipr_out,
-        ipr_out$AccNum %in% top_acc
+        ipr_out$.data$AccNum %in% top_acc
     )
     ## Need to fix this eventually based on the 'real' gene orientation! :)
     ipr_out$Strand <- rep("forward", nrow(ipr_out))
 
-    ipr_out <- ipr_out %>% arrange(AccNum, StartLoc, StopLoc)
+    ipr_out <- ipr_out %>% arrange(.data$AccNum, .data$StartLoc, .data$StopLoc)
     ipr_out_sub <- filter(
         ipr_out,
-        grepl(pattern = analysis, x = Analysis)
+        grepl(pattern = analysis, x = .data$Analysis)
     )
     # dynamic analysis labeller
     analyses <- ipr_out_sub %>%
-        select(Analysis) %>%
+        select(.data$Analysis) %>%
         distinct()
     analysis_labeler <- analyses %>%
-        pivot_wider(names_from = Analysis, values_from = Analysis)
+        pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)
 
     lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv"
-    lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = lookup_table_cols)
+    lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = .data$lookup_table_cols)
 
-    lookup_tbl <- lookup_tbl %>% select(-ShortName) # Already has ShortName -- Just needs SignDesc
-    # ipr_out_sub = ipr_out_sub %>% select(-ShortName)
+    lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc
+    # ipr_out_sub = ipr_out_sub %>% select(-.data$ShortName)
     # TODO: Fix lookup table and uncomment below
     # ipr_out_sub <- merge(ipr_out_sub, lookup_tbl, by.x = "DB.ID", by.y = "DB.ID")
 
     ## PLOTTING
     ## domains as separate arrows
     # For odering with tree
-    # ipr_out_sub$Name <- paste0(" ", ipr_out_sub$Name)
+    # ipr_out_sub$.data$Name <- paste0(" ", ipr_out_sub$.data$Name)
     if (group_by == "Analysis") {
         plot <- ggplot(ipr_out_sub,
             aes_string(
@@ -195,7 +195,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
             ), color = "white") +
             geom_gene_arrow(fill = NA, color = "grey") +
             # geom_blank(data = dummies) +
-            facet_wrap(~Analysis,
+            facet_wrap(~.data$Analysis,
                 strip.position = "top", ncol = 5,
                 labeller = as_labeller(analysis_labeler)
             ) +
@@ -206,7 +206,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
             theme(
                 legend.position = "bottom",
                 legend.box = "horizontal",
-                legend.key.size = unit(0.02, "npc"),
+                legend.key.size = .data$unit(0.02, "npc"),
                 legend.box.margin = margin(),
                 text = element_text(size = text_size)
             ) +
@@ -216,9 +216,9 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
         plot <- ggplot(
             ipr_out_sub,
             aes(
-                xmin = 1, xmax = SLength,
-                y = Analysis, # y = AccNum
-                label = ShortName
+                xmin = 1, xmax = .data$SLength,
+                y = .data$Analysis, # y = .data$AccNum
+                label = .data$ShortName
             )
         ) +
             geom_subgene_arrow(data = ipr_out_sub, aes_string(
@@ -236,7 +236,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
             theme(
                 legend.position = "bottom",
                 legend.box = "horizontal",
-                legend.key.size = unit(0.02, "npc"),
+                legend.key.size = .data$unit(0.02, "npc"),
                 legend.box.margin = margin(),
                 text = element_text(size = text_size)
             ) +

diff --git a/R/lineage.R b/R/lineage.R
@@ -323,7 +323,7 @@ ipg2lin <- function(accessions, ipg_file,
     ipg_dt <- fread(ipg_file, sep = "\t", fill = T)
 
     accessions <- unique(accessions)
-    ipg_dt <- ipg_dt[Protein %in% accessions]
+    ipg_dt <- ipg_dt[.data$Protein %in% accessions]
 
     ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID")
 
@@ -335,10 +335,10 @@ ipg2lin <- function(accessions, ipg_file,
     {
         # browser()
         acc <- accessions[i]
-        acc_inds <- which(mergedTax$Protein == acc)
+        acc_inds <- which(.data$mergedTax$.data$Protein == acc)
         if (length(acc_inds) != 0) {
             # refseq inds take precedence
-            refseq_inds <- acc_inds[which(mergedTax[acc_inds, ]$Source == "RefSeq")]
+            refseq_inds <- acc_inds[which(.data$mergedTax[acc_inds, ]$Source == "RefSeq")]
             if (length(refseq_inds) != 0) {
                 # Take the first first row of the refseq (smallest index)
                 refseq_rows[i] <- refseq_inds[1]
@@ -358,21 +358,21 @@ ipg2lin <- function(accessions, ipg_file,
     if (length(refseq_rows) != 0) {
         refseq_ipg_dt <- ipg_dt[refseq_rows, ]
         refseq_lins <- GCA2lin(refseq_ipg_dt,
-            assembly_path = refseq_assembly_path,
+            .data$assembly_path = refseq_assembly_path,
             lineagelookup_path
         )
     }
     if (length(genbank_rows) != 0) {
         genbank_ipg_dt <- ipg_dt[genbank_rows, ]
-        genbank_lins <- GCA2lin(gca_ipg_dt,
-            assembly_path = genbank_assembly_path,
+        genbank_lins <- GCA2lin(.data$gca_ipg_dt,
+            .data$assembly_path = genbank_assembly_path,
             lineagelookup_path
         )
     }
 
 
-    lins <- GCA2lin(prot_data = ipg_dt, assembly_path, lineagelookup_path)
-    lins <- lins[!is.na(Lineage)] %>% unique()
+    lins <- GCA2lin(prot_data = ipg_dt, .data$assembly_path, lineagelookup_path)
+    lins <- lins[!is.na(.data$Lineage)] %>% unique()
 
     return(lins)
 }

diff --git a/R/msa.R b/R/msa.R
@@ -210,7 +210,7 @@ generate_msa <- function(fa_file = "", outfile = "") {
     # source("scripts/c2r.R")
 
     ## align the sequences
-    al <- kalign(prot_aa) # !! won't work!
+    al <- .data$kalign(prot_aa) # !! won't work!
     al
 }
 

diff --git a/R/plotting.R b/R/plotting.R
@@ -574,7 +574,7 @@ lineage.domain_repeats.plot <- function(query_data, colname) {
     # colname <- "SIG.TM.LADB"
 
     ## Create columns for domains/DAs and fill them with 1/0
-    for (i in query.DAdoms$domains)
+    for (i in .data$query.DAdoms$.data$domains)
     {
         j <- str_replace_all(string = i, pattern = "\\(", replacement = "\\\\(")
         j <- str_replace_all(string = j, pattern = "\\)", replacement = "\\\\)")
@@ -592,9 +592,9 @@ lineage.domain_repeats.plot <- function(query_data, colname) {
     ggplot.data <- query_data %>%
         # filter(grepl(queryname, Query)) %>%
         select(
-            DomArch.norep, Lineage, GenContext.norep,
-            SIG.TM.LADB, GenContext, AccNum,
-            query.DAdoms$domains
+            .data$DomArch.norep, .data$Lineage, .data$GenContext.norep,
+            .data$SIG.TM.LADB, .data$GenContext, .data$AccNum,
+            .data$query.DAdoms$.data$domains
         ) %>% # words.gecutoff$words
         # mutate_all(list(~ if (is.numeric(.)) as.integer(.) else .)) %>%
         mutate(across(where(is.numeric), as.integer)) %>%
@@ -606,16 +606,16 @@ lineage.domain_repeats.plot <- function(query_data, colname) {
 
     ## Gathering element/word columns
     ggplot.data.gather <- ggplot.data %>%
-        gather(key = domains, value = count, 7:ncol(ggplot.data)) # %>%
-    # select(DomArch.norep, Lineage, domains, count)
+        gather(key = .data$domains, value = count, 7:ncol(ggplot.data)) # %>%
+    # select(.data$DomArch.norep, .data$Lineage, .data$domains, count)
 
     # ## written on Sep 4
     # write_delim(ggplot.data.gather,
     # 						"toast-rack.domain_repeat_counts-gathered.v1-2.txt",
     # 						delim="\t", col_names=TRUE)
 
     ## Stacked column plot
-    ggplot(data = ggplot.data.gather, aes(x = Lineage, y = domains)) + # aes_string # plot <- (
+    ggplot(data = ggplot.data.gather, aes(x = .data$Lineage, y = domains)) + # aes_string # plot <- (
         # geom_col(position="fill") +
         geom_tile(
             data = subset(ggplot.data.gather, !is.na(count)),
@@ -868,16 +868,16 @@ stacked_lin_plot <- function(prot, column = "DomArch", cutoff, Lineage_col = "Li
                 xlab("Group") +
                 ylab("Number of proteins") +
                 theme_minimal() +
-                scale_fill_manual(values = cpcols, na.value = "#A9A9A9") +
+                scale_fill_manual(values = .data$cpcols, na.value = "#A9A9A9") +
                 theme(
                     legend.position = legend.position,
                     legend.background = element_rect(fill = "white", color = "white"),
                     legend.text = element_text(size = legend.text.size),
                     legend.title = element_text(size = legend.text.size + 2),
-                    legend.key.size = unit(legend.size, "cm"),
-                    # legend.key.height = unit(2, "cm"),
-                    # legend.key.width = unit(0.9, "cm"),
-                    legend.spacing = unit(0.4, "cm"),
+                    legend.key.size = .data$unit(legend.size, "cm"),
+                    # legend.key.height = .data$unit(2, "cm"),
+                    # legend.key.width = .data$unit(0.9, "cm"),
+                    legend.spacing = .data$unit(0.4, "cm"),
                     axis.text = element_text(size = label.size),
                     panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                     panel.background = element_blank(),
@@ -894,10 +894,10 @@ stacked_lin_plot <- function(prot, column = "DomArch", cutoff, Lineage_col = "Li
                 xlab("Group") +
                 ylab("Number of proteins") +
                 theme_minimal() +
-                scale_fill_manual(values = cpcols, na.value = "#A9A9A9") +
+                scale_fill_manual(values = .data$cpcols, na.value = "#A9A9A9") +
                 theme(
                     legend.position = "none",
-                    legend.spacing = unit(0.4, "cm"),
+                    legend.spacing = .data$unit(0.4, "cm"),
                     axis.text = element_text(size = label.size),
                     panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
                     panel.background = element_blank(),
@@ -918,7 +918,7 @@ stacked_lin_plot <- function(prot, column = "DomArch", cutoff, Lineage_col = "Li
                 legend.background = element_rect(fill = "white", color = "white"),
                 legend.text = element_text(size = legend.text.size),
                 legend.title = element_text(size = legend.text.size + 2),
-                legend.key.size = unit(legend.size, "cm"),
+                legend.key.size = .data$unit(legend.size, "cm"),
                 axis.text = element_text(size = label.size),
                 axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
                 panel.grid.major = element_blank(), panel.grid.minor = element_blank()
@@ -1145,24 +1145,24 @@ wordcloud2_element <- function(query_data = "prot",
         type <- "gc2da"
     }
 
-    words.tc <- query_data %>%
+    .data$words.tc <- query_data %>%
         elements2words(
             column = colname,
             conversion_type = type
         ) %>%
         words2wc()
 
-    names(words.tc) <- c("words", "freq")
+    names(.data$words.tc) <- c("words", "freq")
 
     # need a label column for actual frequencies, and frequencies will be the
     # normalized sizes
-    words.tc$label <- words.tc$freq
+    .data$words.tc$.data$label <- .data$words.tc$.data$freq
 
-    words.tc <- words.tc %>% mutate(freq = log10(freq))
+    .data$words.tc <- .data$words.tc %>% mutate(.data$freq = log10(.data$freq))
 
-    words.tc <- words.tc %>% select(words, freq, label)
+    .data$words.tc <- .data$words.tc %>% select(words, .data$freq, .data$label)
 
-    wordcloud3(words.tc, minSize = 0)
+    wordcloud3(.data$words.tc, minSize = 0)
 }