diff --git a/R/mod_Filtering.R b/R/mod_Filtering.R index 3e52daf..72ba828 100644 --- a/R/mod_Filtering.R +++ b/R/mod_Filtering.R @@ -58,14 +58,7 @@ mod_Filtering_ui <- function(id){ ) ), column(width = 6, - tabBox(width =12, collapsible = FALSE, status = "info", - id = "updog_tab", height = "600px", - tabPanel("Bias Histogram", icon = icon("image"), plotOutput(ns("bias_hist"), height = '550px')), - tabPanel("OD Histogram", icon = icon("image"), plotOutput(ns("od_hist"), height = '550px')), - tabPanel("Prop_mis Histogram", icon = icon("image"), plotOutput(ns("maxpostprob_hist"), height = '550px')), - tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), - tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) - ) + uiOutput(ns("din_tabs")), ), column(width = 3, valueBoxOutput(ns("snp_retained_box"), width = NULL), @@ -139,6 +132,13 @@ mod_Filtering_server <- function(id){ disable("start_updog_filter") + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("Results", p("Upload VCF file to access results in this section.")) + ) + }) + vcf <- eventReactive(input$run_filters, { # Ensure the files are uploaded @@ -165,7 +165,37 @@ mod_Filtering_server <- function(id){ req(input$filter_ploidy, input$filter_output_name,input$updog_rdata) - if (input$use_updog) { + #Input file + vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + + # Identify if have updog parameters + format_fields <- unique(vcf@gt[,1]) + info_fields <- vcf@fix[1,8] + updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) & grepl("OD", info_fields) + + if(updog_par){ + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("Bias Histogram", icon = icon("image"), plotOutput(ns("bias_hist"), height = '550px')), + tabPanel("OD Histogram", icon = icon("image"), plotOutput(ns("od_hist"), height = '550px')), + tabPanel("Prop_mis Histogram", icon = icon("image"), plotOutput(ns("maxpostprob_hist"), height = '550px')), + tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), + tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) + ) + }) + } else { + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), + tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) + ) + }) + } + + + if (input$use_updog & updog_par) { # Use Updog filtering parameters OD_filter <- as.numeric(input$OD_filter) Prop_mis <- as.numeric(input$Prop_mis) @@ -193,8 +223,7 @@ mod_Filtering_server <- function(id){ maf_filter <- input$filter_maf updateProgressBar(session = session, id = "pb_filter", value = 10, title = "Processing VCF file") - #Input file - vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + #Starting SNPs starting_snps <- nrow(vcf) output$snp_removed_box <- renderValueBox({ @@ -226,6 +255,23 @@ mod_Filtering_server <- function(id){ filter.MAF = as.numeric(maf_filter), filter.MPP = max_post) + if (length(vcf@gt) == 0) { + shinyalert( + title = "All markers were filtered out", + text = "Loose the parameters to access results in this tab", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + #Getting missing data information #Add support for genotype matrix filtering? #Pb @@ -336,6 +382,8 @@ mod_Filtering_server <- function(id){ abline(v = median(as.numeric(filtering_output$df$BIAS)), col = "green", lty = 2) # Median line abline(v = 0.5, col = "black", lty = 2) # proposed lower line abline(v = 2, col = "black", lty = 2) # proposed upper line + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) } else if (input$filter_hist == "OD Histogram") { @@ -355,6 +403,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_output$df$OD)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_output$df$OD)), col = "green", lty = 2) # Median line abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) } else if (input$filter_hist == "Prop_mis Histogram") { @@ -372,6 +422,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_output$df$PMC)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_output$df$PMC)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_output$df$PMC), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } else if (input$filter_hist == "SNP_mis") { @@ -389,6 +441,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } else if (input$filter_hist == "Sample_mis") { @@ -406,6 +460,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } dev.off() } @@ -421,19 +477,6 @@ mod_Filtering_server <- function(id){ observeEvent(filtering_files$raw_vcf_df, { - - # Function to split INFO column and expand it into multiple columns - split_info_column <- function(info) { - # Split the INFO column by semicolon - info_split <- str_split(info, ";")[[1]] - - # Create a named list by splitting each element by equals sign - info_list <- set_names(map(info_split, ~ str_split(.x, "=")[[1]][2]), - map(info_split, ~ str_split(.x, "=")[[1]][1])) - - return(info_list) - } - # Apply the function to each row and bind the results into a new dataframe new_df <- data.frame(filtering_files$raw_vcf_df) %>% mutate(INFO_list = map(INFO, split_info_column)) %>% @@ -450,67 +493,80 @@ mod_Filtering_server <- function(id){ ###Bias #Histogram - output$bias_hist <- renderPlot({ - hist(as.numeric(new_df$BIAS), - main = "Unfiltered SNP bias histogram", - xlab = "bias", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,5), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks - abline(v = mean(as.numeric(new_df$BIAS)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$BIAS)), col = "green", lty = 2) # Median line - abline(v = 0.5, col = "black", lty = 2) # proposed lower line - abline(v = 2, col = "black", lty = 2) # proposed upper line - }) + if(any(grepl("BIAS", colnames(new_df)))){ + output$bias_hist <- renderPlot({ + hist(as.numeric(new_df$BIAS), + main = "Unfiltered SNP bias histogram", + xlab = "bias", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,5), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks + abline(v = mean(as.numeric(new_df$BIAS)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$BIAS)), col = "green", lty = 2) # Median line + abline(v = 0.5, col = "black", lty = 2) # proposed lower line + abline(v = 2, col = "black", lty = 2) # proposed upper line + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) + }) + } ###OD - quantile(as.numeric(new_df$OD), 0.95) - #Histogram - output$od_hist <- renderPlot({ - hist(as.numeric(new_df$OD), - main = "Unfiltered SNP overdispersion parameter histogram", - xlab = "OD", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,0.6), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + if(any(grepl("OD", colnames(new_df)))){ - # Add vertical lines - abline(v = mean(as.numeric(new_df$OD)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$OD)), col = "green", lty = 2) # Median line - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + quantile(as.numeric(new_df$OD), 0.95) + #Histogram + output$od_hist <- renderPlot({ + hist(as.numeric(new_df$OD), + main = "Unfiltered SNP overdispersion parameter histogram", + xlab = "OD", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,0.6), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog - }) + # Add vertical lines + abline(v = mean(as.numeric(new_df$OD)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$OD)), col = "green", lty = 2) # Median line + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) + + }) + } ##MAXPOSTPROB #Histogram + if(any(grepl("PMC", colnames(new_df)))){ - output$maxpostprob_hist <- renderPlot({ + output$maxpostprob_hist <- renderPlot({ - #Histogram - hist(as.numeric(new_df$PMC), - main = "The estimated proportion of individuals misclassified in the SNP from updog", - xlab = "Proportion of Misclassified Genotypes per SNP", - ylab = "Number of SNPs", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + #Histogram + hist(as.numeric(new_df$PMC), + main = "The estimated proportion of individuals misclassified in the SNP from updog", + xlab = "Proportion of Misclassified Genotypes per SNP", + ylab = "Number of SNPs", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - # Add vertical lines - abline(v = mean(as.numeric(new_df$PMC)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$PMC)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(new_df$PMC), 0.95), col = "blue", lty = 2) + # Add vertical lines + abline(v = mean(as.numeric(new_df$PMC)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$PMC)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(new_df$PMC), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) - }) + }) + } #Missing data output$missing_snp_hist <- renderPlot({ @@ -530,7 +586,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) - + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) }) output$missing_sample_hist <- renderPlot({ @@ -550,7 +607,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) - + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) }) ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) diff --git a/R/mod_gwas.R b/R/mod_gwas.R index 230b0fa..e1aef38 100644 --- a/R/mod_gwas.R +++ b/R/mod_gwas.R @@ -165,6 +165,13 @@ mod_gwas_server <- function(id){ #I think I can subset the read.GWAS file pheno and fixed categories (data@pheno[,c("trait")]) and data@fixed = phenotype_file[,c("List of fixed traits")] phenotype_file <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE) + # Remove empty lines + rm.empty <- which(apply(phenotype_file, 1, function(x) all(is.na(x) | x == ""))) + if(length(rm.empty) > 0){ + warning(paste("Removing", length(rm.empty),"empty lines")) + phenotype_file <- phenotype_file[-rm.empty,] + } + ids <- colnames(phenotype_file)[1] traits <- input$trait_info fixed <- input$fixed_info @@ -204,9 +211,6 @@ mod_gwas_server <- function(id){ #Save new phenotype file with selected traits and fixed effects write.csv(phenotype_file, file = temp_pheno_file, row.names = FALSE) - #Remove the phenotype_file from memory - rm(phenotype_file) - #Status updateProgressBar(session = session, id = "pb_gwas", value = 5, title = "Upload Complete: Now Formatting GWASpoly Data") @@ -215,6 +219,8 @@ mod_gwas_server <- function(id){ #Geno.file conversion if needed if (grepl("\\.csv$", file_path)) { + #TODO: Add check for matches of sample names in genotype and phenotype data + data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=input$gwas_file$datapath, format="numeric", n.traits=length(traits), delim=",") #only need to change files here @@ -231,6 +237,28 @@ mod_gwas_server <- function(id){ class(geno_mat) <- "numeric" info <- data.frame(vcf@fix) gpoly_df <- cbind(info[,c("ID","CHROM","POS")], geno_mat) + + if(!any(colnames(gpoly_df) %in% phenotype_file$Sample_ID)) { + shinyalert( + title = "Samples ID do not match", + text = paste("Check if passport/phenotype files have same sample ID as the VCF/genotype file."), + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + + } + validate( + need(any(colnames(gpoly_df) %in% phenotype_file$Sample_ID), "The selected traits must be numerical.") + ) + write.csv(gpoly_df, file = temp_geno_file, row.names = FALSE) data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=temp_geno_file, diff --git a/R/utils.R b/R/utils.R index d39d70c..811f485 100644 --- a/R/utils.R +++ b/R/utils.R @@ -280,3 +280,15 @@ posdefmat <- function(mat) { } return(g) } + +# Function to split INFO column and expand it into multiple columns +split_info_column <- function(info) { + # Split the INFO column by semicolon + info_split <- str_split(info, ";")[[1]] + + # Create a named list by splitting each element by equals sign + info_list <- set_names(map(info_split, ~ str_split(.x, "=")[[1]][2]), + map(info_split, ~ str_split(.x, "=")[[1]][1])) + + return(info_list) +} diff --git a/tests/testthat/test-GWAS.R b/tests/testthat/test-GWAS.R index 5d26dca..a2d9420 100644 --- a/tests/testthat/test-GWAS.R +++ b/tests/testthat/test-GWAS.R @@ -1,6 +1,7 @@ context("GWAS") test_that("test GWAS",{ + input <- list() input$cores <- 1 input$phenotype_file$datapath <- system.file("iris_passport_file.csv", package = "BIGapp") @@ -17,6 +18,13 @@ test_that("test GWAS",{ #I think I can subset the read.GWAS file pheno and fixed categories (data@pheno[,c("trait")]) and data@fixed = phenotype_file[,c("List of fixed traits")] phenotype_file <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE) + # Remove empty lines + rm.empty <- which(apply(phenotype_file, 1, function(x) all(is.na(x) | x == ""))) + if(length(rm.empty) > 0){ + warning(paste("Removing", length(rm.empty),"empty lines")) + phenotype_file <- phenotype_file[-rm.empty,] + } + ids <- colnames(phenotype_file)[1] traits <- input$trait_info fixed <- input$fixed_info @@ -36,9 +44,6 @@ test_that("test GWAS",{ #Save new phenotype file with selected traits and fixed effects write.csv(phenotype_file, file = temp_pheno_file, row.names = FALSE) - #Remove the phenotype_file from memory - rm(phenotype_file) - #Geno file path file_path <- input$gwas_file$datapath @@ -56,10 +61,15 @@ test_that("test GWAS",{ #Extract GT geno_mat <- extract.gt(vcf, element = "GT") - geno_mat <- apply(geno_mat, 2, convert_to_dosage) + geno_mat <- apply(geno_mat, 2, BIGapp:::convert_to_dosage) class(geno_mat) <- "numeric" info <- data.frame(vcf@fix) gpoly_df <- cbind(info[,c("ID","CHROM","POS")], geno_mat) + + if(!any(colnames(gpoly_df) %in% phenotype_file$Sample_ID)) { # Add + stop("Make sure passport and VCF samples have same name") + } + write.csv(gpoly_df, file = temp_geno_file, row.names = FALSE) data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=temp_geno_file, @@ -106,7 +116,7 @@ test_that("test GWAS",{ PC<-as.matrix(PCs) K=as.matrix(Kin) - kin.adj<-posdefmat(K) + kin.adj<-BIGapp:::posdefmat(K) kin.test<-as.matrix(kin.adj) for (i in 2:ncol(GE)){ @@ -114,7 +124,7 @@ test_that("test GWAS",{ #model selection y=as.numeric(GE[,i]) - BICs<-CalcBIC(y=y,PC=PC,K=kin.test) + BICs<- BIGapp:::CalcBIC(y=y,PC=PC,K=kin.test) plotBICs<-cbind(rbind.data.frame(BICs$BIC$withK,BICs$BIC$withoutK),rep(c("w/Kinship","no Kinship"),each=nrow(BICs$BIC$withK))) colnames(plotBICs)[ncol(plotBICs)]<-"RelationshipMatrix" @@ -165,7 +175,7 @@ test_that("test GWAS",{ #Save qq_plot info - CMplot_shiny(data_qq,plot.type="q",col=c(1:8), + BIGapp:::CMplot_shiny(data_qq,plot.type="q",col=c(1:8), ylab.pos=2, file.name=colnames(data@pheno[i]), conf.int=FALSE, diff --git a/tests/testthat/test-filtering.R b/tests/testthat/test-filtering.R index aec9033..cc74fea 100644 --- a/tests/testthat/test-filtering.R +++ b/tests/testthat/test-filtering.R @@ -1,6 +1,14 @@ context("Filtering") -test_that("Filtering",{ +#library(vcfR) +#library(BIGr) +#library(testthat) +library(tidyr) +library(dplyr) +library(purrr) +library(stringr) + +test_that("Filtering with updog metrics",{ #Variables filter_ploidy <- 2 @@ -28,6 +36,13 @@ test_that("Filtering",{ #Input file vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + + # Identify if have updog parameters + format_fields <- unique(vcf@gt[,1]) + info_fields <- vcf@fix[1,8] + + updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) + #Starting SNPs starting_snps <- nrow(vcf) #export INFO dataframe @@ -52,6 +67,9 @@ test_that("Filtering",{ gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values + + expect_true(all(table(gt_matrix[,10]) == c(20,13,8))) + rm(gt_matrix) #Remove gt matrix #Writing file @@ -59,4 +77,7 @@ test_that("Filtering",{ #Get final_snps final_snps <- nrow(vcf) + expect_equal(final_snps, 43) + }) +