From fb401d6289433be78efe7be7840a7786e713ccae Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 15 Aug 2024 17:07:45 -0400 Subject: [PATCH 1/3] adding exceptions --- R/mod_gwas.R | 34 +++++++++++++++++++++++++++++++--- tests/testthat/test-GWAS.R | 23 ++++++++++++++++------- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/R/mod_gwas.R b/R/mod_gwas.R index 230b0fa..e1aef38 100644 --- a/R/mod_gwas.R +++ b/R/mod_gwas.R @@ -165,6 +165,13 @@ mod_gwas_server <- function(id){ #I think I can subset the read.GWAS file pheno and fixed categories (data@pheno[,c("trait")]) and data@fixed = phenotype_file[,c("List of fixed traits")] phenotype_file <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE) + # Remove empty lines + rm.empty <- which(apply(phenotype_file, 1, function(x) all(is.na(x) | x == ""))) + if(length(rm.empty) > 0){ + warning(paste("Removing", length(rm.empty),"empty lines")) + phenotype_file <- phenotype_file[-rm.empty,] + } + ids <- colnames(phenotype_file)[1] traits <- input$trait_info fixed <- input$fixed_info @@ -204,9 +211,6 @@ mod_gwas_server <- function(id){ #Save new phenotype file with selected traits and fixed effects write.csv(phenotype_file, file = temp_pheno_file, row.names = FALSE) - #Remove the phenotype_file from memory - rm(phenotype_file) - #Status updateProgressBar(session = session, id = "pb_gwas", value = 5, title = "Upload Complete: Now Formatting GWASpoly Data") @@ -215,6 +219,8 @@ mod_gwas_server <- function(id){ #Geno.file conversion if needed if (grepl("\\.csv$", file_path)) { + #TODO: Add check for matches of sample names in genotype and phenotype data + data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=input$gwas_file$datapath, format="numeric", n.traits=length(traits), delim=",") #only need to change files here @@ -231,6 +237,28 @@ mod_gwas_server <- function(id){ class(geno_mat) <- "numeric" info <- data.frame(vcf@fix) gpoly_df <- cbind(info[,c("ID","CHROM","POS")], geno_mat) + + if(!any(colnames(gpoly_df) %in% phenotype_file$Sample_ID)) { + shinyalert( + title = "Samples ID do not match", + text = paste("Check if passport/phenotype files have same sample ID as the VCF/genotype file."), + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + + } + validate( + need(any(colnames(gpoly_df) %in% phenotype_file$Sample_ID), "The selected traits must be numerical.") + ) + write.csv(gpoly_df, file = temp_geno_file, row.names = FALSE) data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=temp_geno_file, diff --git a/tests/testthat/test-GWAS.R b/tests/testthat/test-GWAS.R index 5d26dca..7d5833e 100644 --- a/tests/testthat/test-GWAS.R +++ b/tests/testthat/test-GWAS.R @@ -17,6 +17,13 @@ test_that("test GWAS",{ #I think I can subset the read.GWAS file pheno and fixed categories (data@pheno[,c("trait")]) and data@fixed = phenotype_file[,c("List of fixed traits")] phenotype_file <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE) + # Remove empty lines + rm.empty <- which(apply(phenotype_file, 1, function(x) all(is.na(x) | x == ""))) + if(length(rm.empty) > 0){ + warning(paste("Removing", length(rm.empty),"empty lines")) + phenotype_file <- phenotype_file[-rm.empty,] + } + ids <- colnames(phenotype_file)[1] traits <- input$trait_info fixed <- input$fixed_info @@ -36,9 +43,6 @@ test_that("test GWAS",{ #Save new phenotype file with selected traits and fixed effects write.csv(phenotype_file, file = temp_pheno_file, row.names = FALSE) - #Remove the phenotype_file from memory - rm(phenotype_file) - #Geno file path file_path <- input$gwas_file$datapath @@ -56,10 +60,15 @@ test_that("test GWAS",{ #Extract GT geno_mat <- extract.gt(vcf, element = "GT") - geno_mat <- apply(geno_mat, 2, convert_to_dosage) + geno_mat <- apply(geno_mat, 2, BIGapp:::convert_to_dosage) class(geno_mat) <- "numeric" info <- data.frame(vcf@fix) gpoly_df <- cbind(info[,c("ID","CHROM","POS")], geno_mat) + + if(!any(colnames(gpoly_df) %in% phenotype_file$Sample_ID)) { # Add + stop("Make sure passport and VCF samples have same name") + } + write.csv(gpoly_df, file = temp_geno_file, row.names = FALSE) data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=temp_geno_file, @@ -106,7 +115,7 @@ test_that("test GWAS",{ PC<-as.matrix(PCs) K=as.matrix(Kin) - kin.adj<-posdefmat(K) + kin.adj<-BIGapp:::posdefmat(K) kin.test<-as.matrix(kin.adj) for (i in 2:ncol(GE)){ @@ -114,7 +123,7 @@ test_that("test GWAS",{ #model selection y=as.numeric(GE[,i]) - BICs<-CalcBIC(y=y,PC=PC,K=kin.test) + BICs<- BIGapp:::CalcBIC(y=y,PC=PC,K=kin.test) plotBICs<-cbind(rbind.data.frame(BICs$BIC$withK,BICs$BIC$withoutK),rep(c("w/Kinship","no Kinship"),each=nrow(BICs$BIC$withK))) colnames(plotBICs)[ncol(plotBICs)]<-"RelationshipMatrix" @@ -165,7 +174,7 @@ test_that("test GWAS",{ #Save qq_plot info - CMplot_shiny(data_qq,plot.type="q",col=c(1:8), + BIGapp:::CMplot_shiny(data_qq,plot.type="q",col=c(1:8), ylab.pos=2, file.name=colnames(data@pheno[i]), conf.int=FALSE, From 7c885ec9d79a3bd065cd0e0ee760be138f99a9d5 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 16 Aug 2024 12:49:49 -0400 Subject: [PATCH 2/3] fixed issues #37 and #38 --- R/mod_Filtering.R | 201 ++++++++++++++++++++------------ R/utils.R | 12 ++ tests/testthat/test-filtering.R | 146 ++++++++++++++++++++++- 3 files changed, 283 insertions(+), 76 deletions(-) diff --git a/R/mod_Filtering.R b/R/mod_Filtering.R index 3e52daf..6cf92ca 100644 --- a/R/mod_Filtering.R +++ b/R/mod_Filtering.R @@ -58,14 +58,7 @@ mod_Filtering_ui <- function(id){ ) ), column(width = 6, - tabBox(width =12, collapsible = FALSE, status = "info", - id = "updog_tab", height = "600px", - tabPanel("Bias Histogram", icon = icon("image"), plotOutput(ns("bias_hist"), height = '550px')), - tabPanel("OD Histogram", icon = icon("image"), plotOutput(ns("od_hist"), height = '550px')), - tabPanel("Prop_mis Histogram", icon = icon("image"), plotOutput(ns("maxpostprob_hist"), height = '550px')), - tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), - tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) - ) + uiOutput(ns("din_tabs")), ), column(width = 3, valueBoxOutput(ns("snp_retained_box"), width = NULL), @@ -165,7 +158,37 @@ mod_Filtering_server <- function(id){ req(input$filter_ploidy, input$filter_output_name,input$updog_rdata) - if (input$use_updog) { + #Input file + vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + + # Identify if have updog parameters + format_fields <- unique(vcf@gt[,1]) + info_fields <- vcf@fix[1,8] + updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) & grepl("OD", info_fields) + + if(updog_par){ + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("Bias Histogram", icon = icon("image"), plotOutput(ns("bias_hist"), height = '550px')), + tabPanel("OD Histogram", icon = icon("image"), plotOutput(ns("od_hist"), height = '550px')), + tabPanel("Prop_mis Histogram", icon = icon("image"), plotOutput(ns("maxpostprob_hist"), height = '550px')), + tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), + tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) + ) + }) + } else { + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), + tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) + ) + }) + } + + + if (input$use_updog & updog_par) { # Use Updog filtering parameters OD_filter <- as.numeric(input$OD_filter) Prop_mis <- as.numeric(input$Prop_mis) @@ -193,8 +216,7 @@ mod_Filtering_server <- function(id){ maf_filter <- input$filter_maf updateProgressBar(session = session, id = "pb_filter", value = 10, title = "Processing VCF file") - #Input file - vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + #Starting SNPs starting_snps <- nrow(vcf) output$snp_removed_box <- renderValueBox({ @@ -226,6 +248,23 @@ mod_Filtering_server <- function(id){ filter.MAF = as.numeric(maf_filter), filter.MPP = max_post) + if (length(vcf@gt) == 0) { + shinyalert( + title = "All markers were filtered out", + text = "Loose the parameters to access results in this tab", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + #Getting missing data information #Add support for genotype matrix filtering? #Pb @@ -336,6 +375,8 @@ mod_Filtering_server <- function(id){ abline(v = median(as.numeric(filtering_output$df$BIAS)), col = "green", lty = 2) # Median line abline(v = 0.5, col = "black", lty = 2) # proposed lower line abline(v = 2, col = "black", lty = 2) # proposed upper line + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) } else if (input$filter_hist == "OD Histogram") { @@ -355,6 +396,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_output$df$OD)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_output$df$OD)), col = "green", lty = 2) # Median line abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) } else if (input$filter_hist == "Prop_mis Histogram") { @@ -372,6 +415,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_output$df$PMC)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_output$df$PMC)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_output$df$PMC), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } else if (input$filter_hist == "SNP_mis") { @@ -389,6 +434,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } else if (input$filter_hist == "Sample_mis") { @@ -406,6 +453,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } dev.off() } @@ -421,19 +470,6 @@ mod_Filtering_server <- function(id){ observeEvent(filtering_files$raw_vcf_df, { - - # Function to split INFO column and expand it into multiple columns - split_info_column <- function(info) { - # Split the INFO column by semicolon - info_split <- str_split(info, ";")[[1]] - - # Create a named list by splitting each element by equals sign - info_list <- set_names(map(info_split, ~ str_split(.x, "=")[[1]][2]), - map(info_split, ~ str_split(.x, "=")[[1]][1])) - - return(info_list) - } - # Apply the function to each row and bind the results into a new dataframe new_df <- data.frame(filtering_files$raw_vcf_df) %>% mutate(INFO_list = map(INFO, split_info_column)) %>% @@ -450,67 +486,80 @@ mod_Filtering_server <- function(id){ ###Bias #Histogram - output$bias_hist <- renderPlot({ - hist(as.numeric(new_df$BIAS), - main = "Unfiltered SNP bias histogram", - xlab = "bias", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,5), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks - abline(v = mean(as.numeric(new_df$BIAS)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$BIAS)), col = "green", lty = 2) # Median line - abline(v = 0.5, col = "black", lty = 2) # proposed lower line - abline(v = 2, col = "black", lty = 2) # proposed upper line - }) + if(any(grepl("BIAS", colnames(new_df)))){ + output$bias_hist <- renderPlot({ + hist(as.numeric(new_df$BIAS), + main = "Unfiltered SNP bias histogram", + xlab = "bias", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,5), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks + abline(v = mean(as.numeric(new_df$BIAS)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$BIAS)), col = "green", lty = 2) # Median line + abline(v = 0.5, col = "black", lty = 2) # proposed lower line + abline(v = 2, col = "black", lty = 2) # proposed upper line + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) + }) + } ###OD - quantile(as.numeric(new_df$OD), 0.95) - #Histogram - output$od_hist <- renderPlot({ - hist(as.numeric(new_df$OD), - main = "Unfiltered SNP overdispersion parameter histogram", - xlab = "OD", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,0.6), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + if(any(grepl("OD", colnames(new_df)))){ - # Add vertical lines - abline(v = mean(as.numeric(new_df$OD)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$OD)), col = "green", lty = 2) # Median line - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + quantile(as.numeric(new_df$OD), 0.95) + #Histogram + output$od_hist <- renderPlot({ + hist(as.numeric(new_df$OD), + main = "Unfiltered SNP overdispersion parameter histogram", + xlab = "OD", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,0.6), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog - }) + # Add vertical lines + abline(v = mean(as.numeric(new_df$OD)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$OD)), col = "green", lty = 2) # Median line + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) + + }) + } ##MAXPOSTPROB #Histogram + if(any(grepl("PMC", colnames(new_df)))){ - output$maxpostprob_hist <- renderPlot({ + output$maxpostprob_hist <- renderPlot({ - #Histogram - hist(as.numeric(new_df$PMC), - main = "The estimated proportion of individuals misclassified in the SNP from updog", - xlab = "Proportion of Misclassified Genotypes per SNP", - ylab = "Number of SNPs", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + #Histogram + hist(as.numeric(new_df$PMC), + main = "The estimated proportion of individuals misclassified in the SNP from updog", + xlab = "Proportion of Misclassified Genotypes per SNP", + ylab = "Number of SNPs", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - # Add vertical lines - abline(v = mean(as.numeric(new_df$PMC)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$PMC)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(new_df$PMC), 0.95), col = "blue", lty = 2) + # Add vertical lines + abline(v = mean(as.numeric(new_df$PMC)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$PMC)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(new_df$PMC), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) - }) + }) + } #Missing data output$missing_snp_hist <- renderPlot({ @@ -530,7 +579,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) - + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) }) output$missing_sample_hist <- renderPlot({ @@ -550,7 +600,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) - + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) }) ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) diff --git a/R/utils.R b/R/utils.R index d39d70c..811f485 100644 --- a/R/utils.R +++ b/R/utils.R @@ -280,3 +280,15 @@ posdefmat <- function(mat) { } return(g) } + +# Function to split INFO column and expand it into multiple columns +split_info_column <- function(info) { + # Split the INFO column by semicolon + info_split <- str_split(info, ";")[[1]] + + # Create a named list by splitting each element by equals sign + info_list <- set_names(map(info_split, ~ str_split(.x, "=")[[1]][2]), + map(info_split, ~ str_split(.x, "=")[[1]][1])) + + return(info_list) +} diff --git a/tests/testthat/test-filtering.R b/tests/testthat/test-filtering.R index aec9033..493c676 100644 --- a/tests/testthat/test-filtering.R +++ b/tests/testthat/test-filtering.R @@ -1,6 +1,14 @@ context("Filtering") -test_that("Filtering",{ +#library(vcfR) +#library(BIGr) +#library(testthat) +library(tidyr) +library(dplyr) +library(purrr) +library(stringr) + +test_that("Filtering with updog metrics",{ #Variables filter_ploidy <- 2 @@ -26,6 +34,81 @@ test_that("Filtering",{ temp_file <- tempfile(fileext = ".vcf.gz") + #Input file + vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + + # Identify if have updog parameters + format_fields <- unique(vcf@gt[,1]) + info_fields <- vcf@fix[1,8] + + updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) + + #Starting SNPs + starting_snps <- nrow(vcf) + #export INFO dataframe + filtering_files$raw_vcf_df <- data.frame(vcf@fix) + + #Filtering + vcf <- filterVCF(vcf.file = vcf, + ploidy=ploidy, + output.file=NULL, + filter.OD = OD_filter, + filter.BIAS.min = Bias_min, + filter.BIAS.max = Bias_max, + filter.DP = as.numeric(size_depth), + filter.PMC = Prop_mis, + filter.SAMPLE.miss = as.numeric(sample_miss), + filter.SNP.miss = as.numeric(snp_miss), + filter.MAF = as.numeric(maf_filter), + filter.MPP = max_post) + + #Getting missing data information + #Add support for genotype matrix filtering? + gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) + filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values + filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values + + expect_true(all(table(gt_matrix[,10]) == c(20,13,8))) + + rm(gt_matrix) #Remove gt matrix + + #Writing file + write.vcf(vcf, file = temp_file) + + #Get final_snps + final_snps <- nrow(vcf) + expect_equal(final_snps, 43) + +}) + + +test_that("Filtering without updog metrics",{ + + #Variables + filter_ploidy <- 2 + filter_maf <- 0.05 + size_depth <- 10 + snp_miss <- 100 + sample_miss <- 100 + OD_filter <- NULL + Bias <- NULL + Bias_min <- NULL + Bias_max <- NULL + Prop_mis <- 0.05 + maxpostprob_filter <- NULL + max_post <- maxpostprob_filter + output_name <- "out" + snp_miss <- snp_miss/100 + sample_miss <- sample_miss/100 + ploidy <- filter_ploidy + maf_filter <- filter_maf + input$hist_bins <- 50 + + input <- filtering_files <- list() + input$updog_rdata$datapath <- system.file("vcf_example_out.vcf.gz", package = "BIGapp") + + temp_file <- tempfile(fileext = ".vcf.gz") + #Input file vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) #Starting SNPs @@ -47,11 +130,14 @@ test_that("Filtering",{ filter.MAF = as.numeric(maf_filter), filter.MPP = max_post) + if(length(vcf@gt) == 0) stop("All markers were filtered. Loose the parameters to access results in this tab.") + #Getting missing data information #Add support for genotype matrix filtering? gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values + rm(gt_matrix) #Remove gt matrix #Writing file @@ -59,4 +145,62 @@ test_that("Filtering",{ #Get final_snps final_snps <- nrow(vcf) + + #export INFO dataframe + filtering_files$raw_vcf_df + + # Apply the function to each row and bind the results into a new dataframe + new_df <- data.frame(filtering_files$raw_vcf_df) %>% + mutate(INFO_list = map(INFO, split_info_column)) %>% + unnest_wider(INFO_list) + + #Save df to reactive value + filtering_output <- list() + filtering_output$df <- new_df + + ##Make plots + + #Missing data + + #Histogram + hist(as.numeric(filtering_files$snp_miss_df), + main = "Ratio of Missing Data per SNP After Filtering", + xlab = "Proportion of Missing Data per SNP", + ylab = "Number of SNPs", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + + # Add vertical lines + abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=1:2, cex=0.8) + + #Histogram + hist(as.numeric(filtering_files$sample_miss_df), + main = "Ratio of Missing Data per Sample After Filtering", + xlab = "Proportion of Missing Data per Sample", + ylab = "Number of Samples", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + + # Add vertical lines + abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=1:2, cex=0.8) + + + ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) + quantile(as.numeric(new_df$DP), 0.95) + + }) From fd8255f682eb098c909f8aca0fa077319d4ce493 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 28 Aug 2024 11:30:30 -0400 Subject: [PATCH 3/3] Add empty tab + fix tests --- R/mod_Filtering.R | 7 ++ tests/testthat/test-GWAS.R | 1 + tests/testthat/test-filtering.R | 123 -------------------------------- 3 files changed, 8 insertions(+), 123 deletions(-) diff --git a/R/mod_Filtering.R b/R/mod_Filtering.R index 6cf92ca..72ba828 100644 --- a/R/mod_Filtering.R +++ b/R/mod_Filtering.R @@ -132,6 +132,13 @@ mod_Filtering_server <- function(id){ disable("start_updog_filter") + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("Results", p("Upload VCF file to access results in this section.")) + ) + }) + vcf <- eventReactive(input$run_filters, { # Ensure the files are uploaded diff --git a/tests/testthat/test-GWAS.R b/tests/testthat/test-GWAS.R index 7d5833e..a2d9420 100644 --- a/tests/testthat/test-GWAS.R +++ b/tests/testthat/test-GWAS.R @@ -1,6 +1,7 @@ context("GWAS") test_that("test GWAS",{ + input <- list() input$cores <- 1 input$phenotype_file$datapath <- system.file("iris_passport_file.csv", package = "BIGapp") diff --git a/tests/testthat/test-filtering.R b/tests/testthat/test-filtering.R index 493c676..cc74fea 100644 --- a/tests/testthat/test-filtering.R +++ b/tests/testthat/test-filtering.R @@ -81,126 +81,3 @@ test_that("Filtering with updog metrics",{ }) - -test_that("Filtering without updog metrics",{ - - #Variables - filter_ploidy <- 2 - filter_maf <- 0.05 - size_depth <- 10 - snp_miss <- 100 - sample_miss <- 100 - OD_filter <- NULL - Bias <- NULL - Bias_min <- NULL - Bias_max <- NULL - Prop_mis <- 0.05 - maxpostprob_filter <- NULL - max_post <- maxpostprob_filter - output_name <- "out" - snp_miss <- snp_miss/100 - sample_miss <- sample_miss/100 - ploidy <- filter_ploidy - maf_filter <- filter_maf - input$hist_bins <- 50 - - input <- filtering_files <- list() - input$updog_rdata$datapath <- system.file("vcf_example_out.vcf.gz", package = "BIGapp") - - temp_file <- tempfile(fileext = ".vcf.gz") - - #Input file - vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) - #Starting SNPs - starting_snps <- nrow(vcf) - #export INFO dataframe - filtering_files$raw_vcf_df <- data.frame(vcf@fix) - - #Filtering - vcf <- filterVCF(vcf.file = vcf, - ploidy=ploidy, - output.file=NULL, - filter.OD = OD_filter, - filter.BIAS.min = Bias_min, - filter.BIAS.max = Bias_max, - filter.DP = as.numeric(size_depth), - filter.PMC = Prop_mis, - filter.SAMPLE.miss = as.numeric(sample_miss), - filter.SNP.miss = as.numeric(snp_miss), - filter.MAF = as.numeric(maf_filter), - filter.MPP = max_post) - - if(length(vcf@gt) == 0) stop("All markers were filtered. Loose the parameters to access results in this tab.") - - #Getting missing data information - #Add support for genotype matrix filtering? - gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) - filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values - filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values - - rm(gt_matrix) #Remove gt matrix - - #Writing file - write.vcf(vcf, file = temp_file) - - #Get final_snps - final_snps <- nrow(vcf) - - #export INFO dataframe - filtering_files$raw_vcf_df - - # Apply the function to each row and bind the results into a new dataframe - new_df <- data.frame(filtering_files$raw_vcf_df) %>% - mutate(INFO_list = map(INFO, split_info_column)) %>% - unnest_wider(INFO_list) - - #Save df to reactive value - filtering_output <- list() - filtering_output$df <- new_df - - ##Make plots - - #Missing data - - #Histogram - hist(as.numeric(filtering_files$snp_miss_df), - main = "Ratio of Missing Data per SNP After Filtering", - xlab = "Proportion of Missing Data per SNP", - ylab = "Number of SNPs", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - - # Add vertical lines - abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) - legend("topright", legend=c("mean", "median", "quantile"), - col=c("red", "green","blue"), lty=1:2, cex=0.8) - - #Histogram - hist(as.numeric(filtering_files$sample_miss_df), - main = "Ratio of Missing Data per Sample After Filtering", - xlab = "Proportion of Missing Data per Sample", - ylab = "Number of Samples", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - - # Add vertical lines - abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) - legend("topright", legend=c("mean", "median", "quantile"), - col=c("red", "green","blue"), lty=1:2, cex=0.8) - - - ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) - quantile(as.numeric(new_df$DP), 0.95) - - -})