Skip to content

Commit

Permalink
Binding for acc2lin.R, assign_job_queue.R, cleanup.R, ipr2viz.R, line…
Browse files Browse the repository at this point in the history
…age.R, msa.R, plotting.R, summarize.R and tree.R
  • Loading branch information
SunSummoner committed Oct 6, 2024
1 parent 08c6d55 commit 6b03555
Show file tree
Hide file tree
Showing 10 changed files with 89 additions and 111 deletions.
22 changes: 0 additions & 22 deletions MolEvolvR.Rproj

This file was deleted.

4 changes: 2 additions & 2 deletions R/acc2lin.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,12 +197,12 @@ efetch_ipg <- function(accnums, out_path, plan = "sequential") {
ipg2lin <- function(accessions, ipg_file, assembly_path, lineagelookup_path) {
  # Purpose: read an IPG table from `ipg_file`, keep the rows whose `Protein`
  # is listed in `accessions`, and return the unique lineage rows produced by
  # GCA2Lins() that have a non-missing `Lineage`.
  #
  # accessions         values matched against the `Protein` column
  # ipg_file           path to a tab-separated IPG file (read with fread)
  # assembly_path      forwarded unchanged to GCA2Lins()
  # lineagelookup_path forwarded unchanged to GCA2Lins()
  #
  # Returns: the de-duplicated, NA-free result of GCA2Lins().

  # `Protein` and `Lineage` are column names resolved by data.table inside
  # `[`. Binding them to NULL locally silences the R CMD check
  # "no visible binding for global variable" NOTE. rlang's `.data` pronoun
  # must NOT be used here: it only exists inside tidy-eval data masks and
  # raises an error inside a data.table `[` call.
  Protein <- Lineage <- NULL

  ipg_dt <- fread(ipg_file, sep = "\t", fill = TRUE)

  ipg_dt <- ipg_dt[Protein %in% accessions]

  # GCA2Lins() is handed the assembly identifier under the name "GCA_ID".
  ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID")

  lins <- GCA2Lins(prot_data = ipg_dt, assembly_path, lineagelookup_path)
  # NOTE(review): the row filter below relies on GCA2Lins() returning a
  # data.table (bare column name inside `[`) -- confirm against GCA2Lins().
  lins <- lins[!is.na(Lineage)] %>% unique()

  return(lins)
}
Expand Down
4 changes: 2 additions & 2 deletions R/assign_job_queue.R
Original file line number Diff line number Diff line change
Expand Up @@ -94,7 +94,7 @@ get_proc_medians <- function(dir_job_results) {
dplyr::summarise(
dplyr::across(
dplyr::everything(),
\(x) median(x, na.rm = TRUE)
\(x) .data$median(x, na.rm = TRUE)
)
) |>
as.list()
Expand Down Expand Up @@ -126,7 +126,7 @@ write_proc_medians_table <- function(dir_job_results, filepath) {
names_to = "process",
values_to = "median_seconds"
) |>
dplyr::arrange(dplyr::desc(median_seconds))
dplyr::arrange(dplyr::desc(.data$median_seconds))
readr::write_tsv(df_proc_medians, file = filepath)
return(df_proc_medians)
}
Expand Down
14 changes: 7 additions & 7 deletions R/cleanup.R
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ remove_tails <- function(prot, by_column = "DomArch",
# !! Insert line to read domains_keep

# Contains all domains separated by "|"
domains_for_grep <- paste(domains_keep$domains, collapse = "|")
domains_for_grep <- paste(.data$domains_keep$domains, collapse = "|")
# Remove rows with no domains contained within domains_keep
# Redundant for ClustName since we already set the filter to only these doms.
tails <- tails %>%
Expand Down Expand Up @@ -693,35 +693,35 @@ cleanup_GeneDesc <- function(prot, column) {
pick_longer_duplicate <- function(prot, column) {
  # Purpose: for every AccNum that occurs more than once in `prot`, keep a
  # single row -- the first one whose `column` value has the greatest number
  # of characters -- and drop the other rows sharing that AccNum. Rows whose
  # AccNum is not duplicated are returned untouched.
  #
  # prot    data frame with an `AccNum` column and the column named by `column`
  # column  name (string) of the column whose character length picks the winner
  #
  # Returns: `prot` minus the shorter duplicate rows (helper column removed).
  col <- sym(column)

  # Remember every row's original position so rows can be dropped by index
  # after the merge below. seq_len() is safe for a zero-row input, where
  # 1:nrow(prot) would yield c(1, 0).
  prot$row.orig <- seq_len(nrow(prot))

  # One row per (duplicated AccNum, original row): count the AccNums, keep
  # those seen more than once, then join the original rows back on.
  # `.data` is only valid inside these dplyr verbs (data-masking contexts).
  dups <- prot %>%
    group_by(.data$AccNum) %>%
    summarize("count" = n()) %>%
    filter(.data$count > 1) %>%
    arrange(-.data$count) %>%
    merge(prot, by = "AccNum")

  remove_rows <- c()
  # Visit each duplicated accession once; `dups` repeats it once per row.
  for (acc in unique(dups$AccNum)) {
    dup_rows <- dups %>% filter(.data$AccNum == acc)

    lens <- nchar(pull(dup_rows, {{ col }}))
    # First row attaining the maximum length wins ties.
    longest <- dup_rows[which(lens == max(lens))[1], "row.orig"]

    to_remove <- dup_rows[which(dup_rows$row.orig != longest), "row.orig"]
    remove_rows <- c(remove_rows, to_remove)
  }

  # Guard: with nothing to remove, `-remove_rows` is integer(0) and
  # prot[integer(0), ] would silently return zero rows instead of all rows.
  if (length(remove_rows) > 0) {
    prot <- prot[-remove_rows, ]
  }

  # Drop the helper column; a string avoids both the global-variable NOTE
  # and the deprecated use of `.data` inside tidyselect.
  unique_dups <- prot %>% select(-"row.orig")

  return(unique_dups)
}
Expand Down
36 changes: 18 additions & 18 deletions R/ipr2viz.R
Original file line number Diff line number Diff line change
Expand Up @@ -134,10 +134,10 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
ADDITIONAL_COLORS <- sample(CPCOLS, 1000, replace = TRUE)
CPCOLS <- append(x = CPCOLS, values = ADDITIONAL_COLORS)
## Read IPR file
ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = iprscan_cols)
ipr_out <- ipr_out %>% filter(Name %in% accessions)
ipr_out <- read_tsv(infile_ipr, col_names = T, col_types = .data$iprscan_cols)
ipr_out <- ipr_out %>% filter(.data$Name %in% accessions)
analysis_cols <- paste0("DomArch.", analysis)
infile_full <- infile_full %>% select(analysis_cols, Lineage_short, QueryName, PcPositive, AccNum)
infile_full <- infile_full %>% select(analysis_cols, .data$Lineage_short, .data$QueryName, .data$PcPositive, .data$AccNum)
## To filter by Analysis
analysis <- paste(analysis, collapse = "|")
## @SAM: This can't be set in stone since the analysis may change!
Expand All @@ -152,35 +152,35 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
# Filter by Top Accessions per Accession per DomArch and Lineage
ipr_out <- subset(
ipr_out,
ipr_out$AccNum %in% top_acc
ipr_out$.data$AccNum %in% top_acc
)
## Need to fix this eventually based on the 'real' gene orientation! :)
ipr_out$Strand <- rep("forward", nrow(ipr_out))

ipr_out <- ipr_out %>% arrange(AccNum, StartLoc, StopLoc)
ipr_out <- ipr_out %>% arrange(.data$AccNum, .data$StartLoc, .data$StopLoc)
ipr_out_sub <- filter(
ipr_out,
grepl(pattern = analysis, x = Analysis)
grepl(pattern = analysis, x = .data$Analysis)
)
# dynamic analysis labeller
analyses <- ipr_out_sub %>%
select(Analysis) %>%
select(.data$Analysis) %>%
distinct()
analysis_labeler <- analyses %>%
pivot_wider(names_from = Analysis, values_from = Analysis)
pivot_wider(names_from = .data$Analysis, values_from = .data$Analysis)

lookup_tbl_path <- "/data/research/jravilab/common_data/cln_lookup_tbl.tsv"
lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = lookup_table_cols)
lookup_tbl <- read_tsv(lookup_tbl_path, col_names = T, col_types = .data$lookup_table_cols)

lookup_tbl <- lookup_tbl %>% select(-ShortName) # Already has ShortName -- Just needs SignDesc
# ipr_out_sub = ipr_out_sub %>% select(-ShortName)
lookup_tbl <- lookup_tbl %>% select(-.data$ShortName) # Already has ShortName -- Just needs SignDesc
# ipr_out_sub = ipr_out_sub %>% select(-.data$ShortName)
# TODO: Fix lookup table and uncomment below
# ipr_out_sub <- merge(ipr_out_sub, lookup_tbl, by.x = "DB.ID", by.y = "DB.ID")

## PLOTTING
## domains as separate arrows
# For ordering with tree
# ipr_out_sub$Name <- paste0(" ", ipr_out_sub$Name)
# ipr_out_sub$.data$Name <- paste0(" ", ipr_out_sub$.data$Name)
if (group_by == "Analysis") {
plot <- ggplot(ipr_out_sub,
aes_string(
Expand All @@ -195,7 +195,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
), color = "white") +
geom_gene_arrow(fill = NA, color = "grey") +
# geom_blank(data = dummies) +
facet_wrap(~Analysis,
facet_wrap(~.data$Analysis,
strip.position = "top", ncol = 5,
labeller = as_labeller(analysis_labeler)
) +
Expand All @@ -206,7 +206,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
theme(
legend.position = "bottom",
legend.box = "horizontal",
legend.key.size = unit(0.02, "npc"),
legend.key.size = .data$unit(0.02, "npc"),
legend.box.margin = margin(),
text = element_text(size = text_size)
) +
Expand All @@ -216,9 +216,9 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
plot <- ggplot(
ipr_out_sub,
aes(
xmin = 1, xmax = SLength,
y = Analysis, # y = AccNum
label = ShortName
xmin = 1, xmax = .data$SLength,
y = .data$Analysis, # y = .data$AccNum
label = .data$ShortName
)
) +
geom_subgene_arrow(data = ipr_out_sub, aes_string(
Expand All @@ -236,7 +236,7 @@ ipr2viz <- function(infile_ipr = NULL, infile_full = NULL, accessions = c(),
theme(
legend.position = "bottom",
legend.box = "horizontal",
legend.key.size = unit(0.02, "npc"),
legend.key.size = .data$unit(0.02, "npc"),
legend.box.margin = margin(),
text = element_text(size = text_size)
) +
Expand Down
16 changes: 8 additions & 8 deletions R/lineage.R
Original file line number Diff line number Diff line change
Expand Up @@ -323,7 +323,7 @@ ipg2lin <- function(accessions, ipg_file,
ipg_dt <- fread(ipg_file, sep = "\t", fill = T)

accessions <- unique(accessions)
ipg_dt <- ipg_dt[Protein %in% accessions]
ipg_dt <- ipg_dt[.data$Protein %in% accessions]

ipg_dt <- setnames(ipg_dt, "Assembly", "GCA_ID")

Expand All @@ -335,10 +335,10 @@ ipg2lin <- function(accessions, ipg_file,
{
# browser()
acc <- accessions[i]
acc_inds <- which(mergedTax$Protein == acc)
acc_inds <- which(.data$mergedTax$.data$Protein == acc)
if (length(acc_inds) != 0) {
# refseq inds take precedence
refseq_inds <- acc_inds[which(mergedTax[acc_inds, ]$Source == "RefSeq")]
refseq_inds <- acc_inds[which(.data$mergedTax[acc_inds, ]$Source == "RefSeq")]
if (length(refseq_inds) != 0) {
# Take the first row of the refseq (smallest index)
refseq_rows[i] <- refseq_inds[1]
Expand All @@ -358,21 +358,21 @@ ipg2lin <- function(accessions, ipg_file,
if (length(refseq_rows) != 0) {
refseq_ipg_dt <- ipg_dt[refseq_rows, ]
refseq_lins <- GCA2lin(refseq_ipg_dt,
assembly_path = refseq_assembly_path,
.data$assembly_path = refseq_assembly_path,
lineagelookup_path
)
}
if (length(genbank_rows) != 0) {
genbank_ipg_dt <- ipg_dt[genbank_rows, ]
genbank_lins <- GCA2lin(gca_ipg_dt,
assembly_path = genbank_assembly_path,
genbank_lins <- GCA2lin(.data$gca_ipg_dt,
.data$assembly_path = genbank_assembly_path,
lineagelookup_path
)
}


lins <- GCA2lin(prot_data = ipg_dt, assembly_path, lineagelookup_path)
lins <- lins[!is.na(Lineage)] %>% unique()
lins <- GCA2lin(prot_data = ipg_dt, .data$assembly_path, lineagelookup_path)
lins <- lins[!is.na(.data$Lineage)] %>% unique()

return(lins)
}
Expand Down
2 changes: 1 addition & 1 deletion R/msa.R
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,7 @@ generate_msa <- function(fa_file = "", outfile = "") {
# source("scripts/c2r.R")

## align the sequences
al <- kalign(prot_aa) # !! won't work!
al <- .data$kalign(prot_aa) # !! won't work!
al
}

Expand Down
42 changes: 21 additions & 21 deletions R/plotting.R
Original file line number Diff line number Diff line change
Expand Up @@ -574,7 +574,7 @@ lineage.domain_repeats.plot <- function(query_data, colname) {
# colname <- "SIG.TM.LADB"

## Create columns for domains/DAs and fill them with 1/0
for (i in query.DAdoms$domains)
for (i in .data$query.DAdoms$.data$domains)
{
j <- str_replace_all(string = i, pattern = "\\(", replacement = "\\\\(")
j <- str_replace_all(string = j, pattern = "\\)", replacement = "\\\\)")
Expand All @@ -592,9 +592,9 @@ lineage.domain_repeats.plot <- function(query_data, colname) {
ggplot.data <- query_data %>%
# filter(grepl(queryname, Query)) %>%
select(
DomArch.norep, Lineage, GenContext.norep,
SIG.TM.LADB, GenContext, AccNum,
query.DAdoms$domains
.data$DomArch.norep, .data$Lineage, .data$GenContext.norep,
.data$SIG.TM.LADB, .data$GenContext, .data$AccNum,
.data$query.DAdoms$.data$domains
) %>% # words.gecutoff$words
# mutate_all(list(~ if (is.numeric(.)) as.integer(.) else .)) %>%
mutate(across(where(is.numeric), as.integer)) %>%
Expand All @@ -606,16 +606,16 @@ lineage.domain_repeats.plot <- function(query_data, colname) {

## Gathering element/word columns
ggplot.data.gather <- ggplot.data %>%
gather(key = domains, value = count, 7:ncol(ggplot.data)) # %>%
# select(DomArch.norep, Lineage, domains, count)
gather(key = .data$domains, value = count, 7:ncol(ggplot.data)) # %>%
# select(.data$DomArch.norep, .data$Lineage, .data$domains, count)

# ## written on Sep 4
# write_delim(ggplot.data.gather,
# "toast-rack.domain_repeat_counts-gathered.v1-2.txt",
# delim="\t", col_names=TRUE)

## Stacked column plot
ggplot(data = ggplot.data.gather, aes(x = Lineage, y = domains)) + # aes_string # plot <- (
ggplot(data = ggplot.data.gather, aes(x = .data$Lineage, y = domains)) + # aes_string # plot <- (
# geom_col(position="fill") +
geom_tile(
data = subset(ggplot.data.gather, !is.na(count)),
Expand Down Expand Up @@ -868,16 +868,16 @@ stacked_lin_plot <- function(prot, column = "DomArch", cutoff, Lineage_col = "Li
xlab("Group") +
ylab("Number of proteins") +
theme_minimal() +
scale_fill_manual(values = cpcols, na.value = "#A9A9A9") +
scale_fill_manual(values = .data$cpcols, na.value = "#A9A9A9") +
theme(
legend.position = legend.position,
legend.background = element_rect(fill = "white", color = "white"),
legend.text = element_text(size = legend.text.size),
legend.title = element_text(size = legend.text.size + 2),
legend.key.size = unit(legend.size, "cm"),
# legend.key.height = unit(2, "cm"),
# legend.key.width = unit(0.9, "cm"),
legend.spacing = unit(0.4, "cm"),
legend.key.size = .data$unit(legend.size, "cm"),
# legend.key.height = .data$unit(2, "cm"),
# legend.key.width = .data$unit(0.9, "cm"),
legend.spacing = .data$unit(0.4, "cm"),
axis.text = element_text(size = label.size),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(),
Expand All @@ -894,10 +894,10 @@ stacked_lin_plot <- function(prot, column = "DomArch", cutoff, Lineage_col = "Li
xlab("Group") +
ylab("Number of proteins") +
theme_minimal() +
scale_fill_manual(values = cpcols, na.value = "#A9A9A9") +
scale_fill_manual(values = .data$cpcols, na.value = "#A9A9A9") +
theme(
legend.position = "none",
legend.spacing = unit(0.4, "cm"),
legend.spacing = .data$unit(0.4, "cm"),
axis.text = element_text(size = label.size),
panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
panel.background = element_blank(),
Expand All @@ -918,7 +918,7 @@ stacked_lin_plot <- function(prot, column = "DomArch", cutoff, Lineage_col = "Li
legend.background = element_rect(fill = "white", color = "white"),
legend.text = element_text(size = legend.text.size),
legend.title = element_text(size = legend.text.size + 2),
legend.key.size = unit(legend.size, "cm"),
legend.key.size = .data$unit(legend.size, "cm"),
axis.text = element_text(size = label.size),
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
panel.grid.major = element_blank(), panel.grid.minor = element_blank()
Expand Down Expand Up @@ -1145,24 +1145,24 @@ wordcloud2_element <- function(query_data = "prot",
type <- "gc2da"
}

words.tc <- query_data %>%
.data$words.tc <- query_data %>%
elements2words(
column = colname,
conversion_type = type
) %>%
words2wc()

names(words.tc) <- c("words", "freq")
names(.data$words.tc) <- c("words", "freq")

# need a label column for actual frequencies, and frequencies will be the
# normalized sizes
words.tc$label <- words.tc$freq
.data$words.tc$.data$label <- .data$words.tc$.data$freq

words.tc <- words.tc %>% mutate(freq = log10(freq))
.data$words.tc <- .data$words.tc %>% mutate(.data$freq = log10(.data$freq))

words.tc <- words.tc %>% select(words, freq, label)
.data$words.tc <- .data$words.tc %>% select(words, .data$freq, .data$label)

wordcloud3(words.tc, minSize = 0)
wordcloud3(.data$words.tc, minSize = 0)
}


Expand Down
Loading

0 comments on commit 6b03555

Please sign in to comment.