From fb401d6289433be78efe7be7840a7786e713ccae Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 15 Aug 2024 17:07:45 -0400 Subject: [PATCH 01/40] adding exceptions --- R/mod_gwas.R | 34 +++++++++++++++++++++++++++++++--- tests/testthat/test-GWAS.R | 23 ++++++++++++++++------- 2 files changed, 47 insertions(+), 10 deletions(-) diff --git a/R/mod_gwas.R b/R/mod_gwas.R index 230b0fa..e1aef38 100644 --- a/R/mod_gwas.R +++ b/R/mod_gwas.R @@ -165,6 +165,13 @@ mod_gwas_server <- function(id){ #I think I can subset the read.GWAS file pheno and fixed categories (data@pheno[,c("trait")]) and data@fixed = phenotype_file[,c("List of fixed traits")] phenotype_file <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE) + # Remove empty lines + rm.empty <- which(apply(phenotype_file, 1, function(x) all(is.na(x) | x == ""))) + if(length(rm.empty) > 0){ + warning(paste("Removing", length(rm.empty),"empty lines")) + phenotype_file <- phenotype_file[-rm.empty,] + } + ids <- colnames(phenotype_file)[1] traits <- input$trait_info fixed <- input$fixed_info @@ -204,9 +211,6 @@ mod_gwas_server <- function(id){ #Save new phenotype file with selected traits and fixed effects write.csv(phenotype_file, file = temp_pheno_file, row.names = FALSE) - #Remove the phenotype_file from memory - rm(phenotype_file) - #Status updateProgressBar(session = session, id = "pb_gwas", value = 5, title = "Upload Complete: Now Formatting GWASpoly Data") @@ -215,6 +219,8 @@ mod_gwas_server <- function(id){ #Geno.file conversion if needed if (grepl("\\.csv$", file_path)) { + #TODO: Add check for matches of sample names in genotype and phenotype data + data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=input$gwas_file$datapath, format="numeric", n.traits=length(traits), delim=",") #only need to change files here @@ -231,6 +237,28 @@ mod_gwas_server <- function(id){ class(geno_mat) <- "numeric" info <- data.frame(vcf@fix) gpoly_df <- 
cbind(info[,c("ID","CHROM","POS")], geno_mat) + + if(!any(colnames(gpoly_df) %in% phenotype_file$Sample_ID)) { + shinyalert( + title = "Samples ID do not match", + text = paste("Check if passport/phenotype files have same sample ID as the VCF/genotype file."), + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + + } + validate( + need(any(colnames(gpoly_df) %in% phenotype_file$Sample_ID), "The selected traits must be numerical.") + ) + write.csv(gpoly_df, file = temp_geno_file, row.names = FALSE) data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=temp_geno_file, diff --git a/tests/testthat/test-GWAS.R b/tests/testthat/test-GWAS.R index 5d26dca..7d5833e 100644 --- a/tests/testthat/test-GWAS.R +++ b/tests/testthat/test-GWAS.R @@ -17,6 +17,13 @@ test_that("test GWAS",{ #I think I can subset the read.GWAS file pheno and fixed categories (data@pheno[,c("trait")]) and data@fixed = phenotype_file[,c("List of fixed traits")] phenotype_file <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE) + # Remove empty lines + rm.empty <- which(apply(phenotype_file, 1, function(x) all(is.na(x) | x == ""))) + if(length(rm.empty) > 0){ + warning(paste("Removing", length(rm.empty),"empty lines")) + phenotype_file <- phenotype_file[-rm.empty,] + } + ids <- colnames(phenotype_file)[1] traits <- input$trait_info fixed <- input$fixed_info @@ -36,9 +43,6 @@ test_that("test GWAS",{ #Save new phenotype file with selected traits and fixed effects write.csv(phenotype_file, file = temp_pheno_file, row.names = FALSE) - #Remove the phenotype_file from memory - rm(phenotype_file) - #Geno file path file_path <- input$gwas_file$datapath @@ -56,10 +60,15 @@ test_that("test GWAS",{ #Extract GT geno_mat <- extract.gt(vcf, element = "GT") - geno_mat <- apply(geno_mat, 2, 
convert_to_dosage) + geno_mat <- apply(geno_mat, 2, BIGapp:::convert_to_dosage) class(geno_mat) <- "numeric" info <- data.frame(vcf@fix) gpoly_df <- cbind(info[,c("ID","CHROM","POS")], geno_mat) + + if(!any(colnames(gpoly_df) %in% phenotype_file$Sample_ID)) { # Add + stop("Make sure passport and VCF samples have same name") + } + write.csv(gpoly_df, file = temp_geno_file, row.names = FALSE) data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=temp_geno_file, @@ -106,7 +115,7 @@ test_that("test GWAS",{ PC<-as.matrix(PCs) K=as.matrix(Kin) - kin.adj<-posdefmat(K) + kin.adj<-BIGapp:::posdefmat(K) kin.test<-as.matrix(kin.adj) for (i in 2:ncol(GE)){ @@ -114,7 +123,7 @@ test_that("test GWAS",{ #model selection y=as.numeric(GE[,i]) - BICs<-CalcBIC(y=y,PC=PC,K=kin.test) + BICs<- BIGapp:::CalcBIC(y=y,PC=PC,K=kin.test) plotBICs<-cbind(rbind.data.frame(BICs$BIC$withK,BICs$BIC$withoutK),rep(c("w/Kinship","no Kinship"),each=nrow(BICs$BIC$withK))) colnames(plotBICs)[ncol(plotBICs)]<-"RelationshipMatrix" @@ -165,7 +174,7 @@ test_that("test GWAS",{ #Save qq_plot info - CMplot_shiny(data_qq,plot.type="q",col=c(1:8), + BIGapp:::CMplot_shiny(data_qq,plot.type="q",col=c(1:8), ylab.pos=2, file.name=colnames(data@pheno[i]), conf.int=FALSE, From 7c885ec9d79a3bd065cd0e0ee760be138f99a9d5 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 16 Aug 2024 12:49:49 -0400 Subject: [PATCH 02/40] fixed issues #37 and #38 --- R/mod_Filtering.R | 201 ++++++++++++++++++++------------ R/utils.R | 12 ++ tests/testthat/test-filtering.R | 146 ++++++++++++++++++++++- 3 files changed, 283 insertions(+), 76 deletions(-) diff --git a/R/mod_Filtering.R b/R/mod_Filtering.R index 3e52daf..6cf92ca 100644 --- a/R/mod_Filtering.R +++ b/R/mod_Filtering.R @@ -58,14 +58,7 @@ mod_Filtering_ui <- function(id){ ) ), column(width = 6, - tabBox(width =12, collapsible = FALSE, status = "info", - id = "updog_tab", height = "600px", - tabPanel("Bias Histogram", icon = icon("image"), 
plotOutput(ns("bias_hist"), height = '550px')), - tabPanel("OD Histogram", icon = icon("image"), plotOutput(ns("od_hist"), height = '550px')), - tabPanel("Prop_mis Histogram", icon = icon("image"), plotOutput(ns("maxpostprob_hist"), height = '550px')), - tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), - tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) - ) + uiOutput(ns("din_tabs")), ), column(width = 3, valueBoxOutput(ns("snp_retained_box"), width = NULL), @@ -165,7 +158,37 @@ mod_Filtering_server <- function(id){ req(input$filter_ploidy, input$filter_output_name,input$updog_rdata) - if (input$use_updog) { + #Input file + vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + + # Identify if have updog parameters + format_fields <- unique(vcf@gt[,1]) + info_fields <- vcf@fix[1,8] + updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) & grepl("OD", info_fields) + + if(updog_par){ + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("Bias Histogram", icon = icon("image"), plotOutput(ns("bias_hist"), height = '550px')), + tabPanel("OD Histogram", icon = icon("image"), plotOutput(ns("od_hist"), height = '550px')), + tabPanel("Prop_mis Histogram", icon = icon("image"), plotOutput(ns("maxpostprob_hist"), height = '550px')), + tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), + tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) + ) + }) + } else { + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), + tabPanel("Sample_miss", icon = icon("image"), 
plotOutput(ns("missing_sample_hist"), height = '550px')) + ) + }) + } + + + if (input$use_updog & updog_par) { # Use Updog filtering parameters OD_filter <- as.numeric(input$OD_filter) Prop_mis <- as.numeric(input$Prop_mis) @@ -193,8 +216,7 @@ mod_Filtering_server <- function(id){ maf_filter <- input$filter_maf updateProgressBar(session = session, id = "pb_filter", value = 10, title = "Processing VCF file") - #Input file - vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + #Starting SNPs starting_snps <- nrow(vcf) output$snp_removed_box <- renderValueBox({ @@ -226,6 +248,23 @@ mod_Filtering_server <- function(id){ filter.MAF = as.numeric(maf_filter), filter.MPP = max_post) + if (length(vcf@gt) == 0) { + shinyalert( + title = "All markers were filtered out", + text = "Loose the parameters to access results in this tab", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + #Getting missing data information #Add support for genotype matrix filtering? 
#Pb @@ -336,6 +375,8 @@ mod_Filtering_server <- function(id){ abline(v = median(as.numeric(filtering_output$df$BIAS)), col = "green", lty = 2) # Median line abline(v = 0.5, col = "black", lty = 2) # proposed lower line abline(v = 2, col = "black", lty = 2) # proposed upper line + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) } else if (input$filter_hist == "OD Histogram") { @@ -355,6 +396,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_output$df$OD)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_output$df$OD)), col = "green", lty = 2) # Median line abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) } else if (input$filter_hist == "Prop_mis Histogram") { @@ -372,6 +415,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_output$df$PMC)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_output$df$PMC)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_output$df$PMC), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } else if (input$filter_hist == "SNP_mis") { @@ -389,6 +434,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } else if (input$filter_hist == "Sample_mis") { @@ -406,6 +453,8 @@ mod_Filtering_server <- function(id){ abline(v = 
mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) } dev.off() } @@ -421,19 +470,6 @@ mod_Filtering_server <- function(id){ observeEvent(filtering_files$raw_vcf_df, { - - # Function to split INFO column and expand it into multiple columns - split_info_column <- function(info) { - # Split the INFO column by semicolon - info_split <- str_split(info, ";")[[1]] - - # Create a named list by splitting each element by equals sign - info_list <- set_names(map(info_split, ~ str_split(.x, "=")[[1]][2]), - map(info_split, ~ str_split(.x, "=")[[1]][1])) - - return(info_list) - } - # Apply the function to each row and bind the results into a new dataframe new_df <- data.frame(filtering_files$raw_vcf_df) %>% mutate(INFO_list = map(INFO, split_info_column)) %>% @@ -450,67 +486,80 @@ mod_Filtering_server <- function(id){ ###Bias #Histogram - output$bias_hist <- renderPlot({ - hist(as.numeric(new_df$BIAS), - main = "Unfiltered SNP bias histogram", - xlab = "bias", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,5), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks - abline(v = mean(as.numeric(new_df$BIAS)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$BIAS)), col = "green", lty = 2) # Median line - abline(v = 0.5, col = "black", lty = 2) # proposed lower line - abline(v = 2, col = "black", lty = 2) # proposed upper line - }) + if(any(grepl("BIAS", colnames(new_df)))){ + output$bias_hist <- renderPlot({ + hist(as.numeric(new_df$BIAS), + main = "Unfiltered SNP bias histogram", + xlab = "bias", + ylab = "SNPs", + col = 
"lightblue", + border = "black", + xlim = c(0,5), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks + abline(v = mean(as.numeric(new_df$BIAS)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$BIAS)), col = "green", lty = 2) # Median line + abline(v = 0.5, col = "black", lty = 2) # proposed lower line + abline(v = 2, col = "black", lty = 2) # proposed upper line + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) + }) + } ###OD - quantile(as.numeric(new_df$OD), 0.95) - #Histogram - output$od_hist <- renderPlot({ - hist(as.numeric(new_df$OD), - main = "Unfiltered SNP overdispersion parameter histogram", - xlab = "OD", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,0.6), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + if(any(grepl("OD", colnames(new_df)))){ - # Add vertical lines - abline(v = mean(as.numeric(new_df$OD)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$OD)), col = "green", lty = 2) # Median line - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + quantile(as.numeric(new_df$OD), 0.95) + #Histogram + output$od_hist <- renderPlot({ + hist(as.numeric(new_df$OD), + main = "Unfiltered SNP overdispersion parameter histogram", + xlab = "OD", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,0.6), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog - }) + # Add vertical lines + abline(v = mean(as.numeric(new_df$OD)), col = "red", lty = 2) # Mean line + abline(v = 
median(as.numeric(new_df$OD)), col = "green", lty = 2) # Median line + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) + + }) + } ##MAXPOSTPROB #Histogram + if(any(grepl("PMC", colnames(new_df)))){ - output$maxpostprob_hist <- renderPlot({ + output$maxpostprob_hist <- renderPlot({ - #Histogram - hist(as.numeric(new_df$PMC), - main = "The estimated proportion of individuals misclassified in the SNP from updog", - xlab = "Proportion of Misclassified Genotypes per SNP", - ylab = "Number of SNPs", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + #Histogram + hist(as.numeric(new_df$PMC), + main = "The estimated proportion of individuals misclassified in the SNP from updog", + xlab = "Proportion of Misclassified Genotypes per SNP", + ylab = "Number of SNPs", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - # Add vertical lines - abline(v = mean(as.numeric(new_df$PMC)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$PMC)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(new_df$PMC), 0.95), col = "blue", lty = 2) + # Add vertical lines + abline(v = mean(as.numeric(new_df$PMC)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$PMC)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(new_df$PMC), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) - }) + }) + } #Missing data output$missing_snp_hist <- renderPlot({ @@ -530,7 +579,8 @@ mod_Filtering_server <- function(id){ 
abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) - + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) }) output$missing_sample_hist <- renderPlot({ @@ -550,7 +600,8 @@ mod_Filtering_server <- function(id){ abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) - + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) }) ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) diff --git a/R/utils.R b/R/utils.R index d39d70c..811f485 100644 --- a/R/utils.R +++ b/R/utils.R @@ -280,3 +280,15 @@ posdefmat <- function(mat) { } return(g) } + +# Function to split INFO column and expand it into multiple columns +split_info_column <- function(info) { + # Split the INFO column by semicolon + info_split <- str_split(info, ";")[[1]] + + # Create a named list by splitting each element by equals sign + info_list <- set_names(map(info_split, ~ str_split(.x, "=")[[1]][2]), + map(info_split, ~ str_split(.x, "=")[[1]][1])) + + return(info_list) +} diff --git a/tests/testthat/test-filtering.R b/tests/testthat/test-filtering.R index aec9033..493c676 100644 --- a/tests/testthat/test-filtering.R +++ b/tests/testthat/test-filtering.R @@ -1,6 +1,14 @@ context("Filtering") -test_that("Filtering",{ +#library(vcfR) +#library(BIGr) +#library(testthat) +library(tidyr) +library(dplyr) +library(purrr) +library(stringr) + +test_that("Filtering with updog metrics",{ 
#Variables filter_ploidy <- 2 @@ -26,6 +34,81 @@ test_that("Filtering",{ temp_file <- tempfile(fileext = ".vcf.gz") + #Input file + vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + + # Identify if have updog parameters + format_fields <- unique(vcf@gt[,1]) + info_fields <- vcf@fix[1,8] + + updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) + + #Starting SNPs + starting_snps <- nrow(vcf) + #export INFO dataframe + filtering_files$raw_vcf_df <- data.frame(vcf@fix) + + #Filtering + vcf <- filterVCF(vcf.file = vcf, + ploidy=ploidy, + output.file=NULL, + filter.OD = OD_filter, + filter.BIAS.min = Bias_min, + filter.BIAS.max = Bias_max, + filter.DP = as.numeric(size_depth), + filter.PMC = Prop_mis, + filter.SAMPLE.miss = as.numeric(sample_miss), + filter.SNP.miss = as.numeric(snp_miss), + filter.MAF = as.numeric(maf_filter), + filter.MPP = max_post) + + #Getting missing data information + #Add support for genotype matrix filtering? 
+ gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) + filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values + filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values + + expect_true(all(table(gt_matrix[,10]) == c(20,13,8))) + + rm(gt_matrix) #Remove gt matrix + + #Writing file + write.vcf(vcf, file = temp_file) + + #Get final_snps + final_snps <- nrow(vcf) + expect_equal(final_snps, 43) + +}) + + +test_that("Filtering without updog metrics",{ + + #Variables + filter_ploidy <- 2 + filter_maf <- 0.05 + size_depth <- 10 + snp_miss <- 100 + sample_miss <- 100 + OD_filter <- NULL + Bias <- NULL + Bias_min <- NULL + Bias_max <- NULL + Prop_mis <- 0.05 + maxpostprob_filter <- NULL + max_post <- maxpostprob_filter + output_name <- "out" + snp_miss <- snp_miss/100 + sample_miss <- sample_miss/100 + ploidy <- filter_ploidy + maf_filter <- filter_maf + input$hist_bins <- 50 + + input <- filtering_files <- list() + input$updog_rdata$datapath <- system.file("vcf_example_out.vcf.gz", package = "BIGapp") + + temp_file <- tempfile(fileext = ".vcf.gz") + #Input file vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) #Starting SNPs @@ -47,11 +130,14 @@ test_that("Filtering",{ filter.MAF = as.numeric(maf_filter), filter.MPP = max_post) + if(length(vcf@gt) == 0) stop("All markers were filtered. Loose the parameters to access results in this tab.") + #Getting missing data information #Add support for genotype matrix filtering? 
gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values + rm(gt_matrix) #Remove gt matrix #Writing file @@ -59,4 +145,62 @@ test_that("Filtering",{ #Get final_snps final_snps <- nrow(vcf) + + #export INFO dataframe + filtering_files$raw_vcf_df + + # Apply the function to each row and bind the results into a new dataframe + new_df <- data.frame(filtering_files$raw_vcf_df) %>% + mutate(INFO_list = map(INFO, split_info_column)) %>% + unnest_wider(INFO_list) + + #Save df to reactive value + filtering_output <- list() + filtering_output$df <- new_df + + ##Make plots + + #Missing data + + #Histogram + hist(as.numeric(filtering_files$snp_miss_df), + main = "Ratio of Missing Data per SNP After Filtering", + xlab = "Proportion of Missing Data per SNP", + ylab = "Number of SNPs", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + + # Add vertical lines + abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=1:2, cex=0.8) + + #Histogram + hist(as.numeric(filtering_files$sample_miss_df), + main = "Ratio of Missing Data per Sample After Filtering", + xlab = "Proportion of Missing Data per Sample", + ylab = "Number of Samples", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + + # Add vertical 
lines + abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=1:2, cex=0.8) + + + ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) + quantile(as.numeric(new_df$DP), 0.95) + + }) From 0e287cd6ccf04e55a00cc35a839445c5c665525e Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 16 Aug 2024 14:29:55 -0400 Subject: [PATCH 03/40] Fixed PCA image download bug --- .Rbuildignore | 1 + R/mod_PCA.R | 26 ++++++++++++-------------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/.Rbuildignore b/.Rbuildignore index 8a980d2..cb081eb 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -9,3 +9,4 @@ $run_dev.* ^\www$ ^LICENSE$ ^\.github$ +^Dockerfile$ diff --git a/R/mod_PCA.R b/R/mod_PCA.R index 0c0f93b..96b03b8 100644 --- a/R/mod_PCA.R +++ b/R/mod_PCA.R @@ -448,35 +448,33 @@ mod_PCA_server <- function(id){ #Download figures for PCA output$download_pca <- downloadHandler( - filename = function() { if (input$pca_image_type == "jpeg") { - paste("pca-", Sys.Date(), ".jpg", sep="") + paste("pca-", Sys.Date(), ".jpg", sep = "") } else if (input$pca_image_type == "png") { - paste("pca-", Sys.Date(), ".png", sep="") + paste("pca-", Sys.Date(), ".png", sep = "") } else { - paste("pca-", Sys.Date(), ".tiff", sep="") + paste("pca-", Sys.Date(), ".tiff", sep = "") } }, content = function(file) { req(input$pca_figure) - + if (input$pca_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res= as.numeric(input$pca_image_res), units = "in") + jpeg(file, 
width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") } else if (input$pca_image_type == "png") { - png(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res= as.numeric(input$pca_image_res), units = "in") + png(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") } else { - tiff(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res= as.numeric(input$pca_image_res), units = "in") + tiff(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") } - - # Conditional plotting based on input selection + + # Plot based on user selection if (input$pca_figure == "2D Plot") { - pca_2d() + print(pca_2d()) } else if (input$pca_figure == "Scree Plot") { - pca_scree() - } else { - plot(x = 1:10, y = 1:10, main = "Fallback Simple Test Plot") # Fallback simple test plot + print(pca_scree()) } + dev.off() } ) From fd8255f682eb098c909f8aca0fa077319d4ce493 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 28 Aug 2024 11:30:30 -0400 Subject: [PATCH 04/40] Add empty tab + fix tests --- R/mod_Filtering.R | 7 ++ tests/testthat/test-GWAS.R | 1 + tests/testthat/test-filtering.R | 123 -------------------------------- 3 files changed, 8 insertions(+), 123 deletions(-) diff --git a/R/mod_Filtering.R b/R/mod_Filtering.R index 6cf92ca..72ba828 100644 --- a/R/mod_Filtering.R +++ b/R/mod_Filtering.R @@ -132,6 +132,13 @@ mod_Filtering_server <- function(id){ disable("start_updog_filter") + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("Results", p("Upload VCF file to access results in this section.")) + ) + }) + vcf <- 
eventReactive(input$run_filters, { # Ensure the files are uploaded diff --git a/tests/testthat/test-GWAS.R b/tests/testthat/test-GWAS.R index 7d5833e..a2d9420 100644 --- a/tests/testthat/test-GWAS.R +++ b/tests/testthat/test-GWAS.R @@ -1,6 +1,7 @@ context("GWAS") test_that("test GWAS",{ + input <- list() input$cores <- 1 input$phenotype_file$datapath <- system.file("iris_passport_file.csv", package = "BIGapp") diff --git a/tests/testthat/test-filtering.R b/tests/testthat/test-filtering.R index 493c676..cc74fea 100644 --- a/tests/testthat/test-filtering.R +++ b/tests/testthat/test-filtering.R @@ -81,126 +81,3 @@ test_that("Filtering with updog metrics",{ }) - -test_that("Filtering without updog metrics",{ - - #Variables - filter_ploidy <- 2 - filter_maf <- 0.05 - size_depth <- 10 - snp_miss <- 100 - sample_miss <- 100 - OD_filter <- NULL - Bias <- NULL - Bias_min <- NULL - Bias_max <- NULL - Prop_mis <- 0.05 - maxpostprob_filter <- NULL - max_post <- maxpostprob_filter - output_name <- "out" - snp_miss <- snp_miss/100 - sample_miss <- sample_miss/100 - ploidy <- filter_ploidy - maf_filter <- filter_maf - input$hist_bins <- 50 - - input <- filtering_files <- list() - input$updog_rdata$datapath <- system.file("vcf_example_out.vcf.gz", package = "BIGapp") - - temp_file <- tempfile(fileext = ".vcf.gz") - - #Input file - vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) - #Starting SNPs - starting_snps <- nrow(vcf) - #export INFO dataframe - filtering_files$raw_vcf_df <- data.frame(vcf@fix) - - #Filtering - vcf <- filterVCF(vcf.file = vcf, - ploidy=ploidy, - output.file=NULL, - filter.OD = OD_filter, - filter.BIAS.min = Bias_min, - filter.BIAS.max = Bias_max, - filter.DP = as.numeric(size_depth), - filter.PMC = Prop_mis, - filter.SAMPLE.miss = as.numeric(sample_miss), - filter.SNP.miss = as.numeric(snp_miss), - filter.MAF = as.numeric(maf_filter), - filter.MPP = max_post) - - if(length(vcf@gt) == 0) stop("All markers were filtered. 
Loose the parameters to access results in this tab.") - - #Getting missing data information - #Add support for genotype matrix filtering? - gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) - filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values - filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values - - rm(gt_matrix) #Remove gt matrix - - #Writing file - write.vcf(vcf, file = temp_file) - - #Get final_snps - final_snps <- nrow(vcf) - - #export INFO dataframe - filtering_files$raw_vcf_df - - # Apply the function to each row and bind the results into a new dataframe - new_df <- data.frame(filtering_files$raw_vcf_df) %>% - mutate(INFO_list = map(INFO, split_info_column)) %>% - unnest_wider(INFO_list) - - #Save df to reactive value - filtering_output <- list() - filtering_output$df <- new_df - - ##Make plots - - #Missing data - - #Histogram - hist(as.numeric(filtering_files$snp_miss_df), - main = "Ratio of Missing Data per SNP After Filtering", - xlab = "Proportion of Missing Data per SNP", - ylab = "Number of SNPs", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - - # Add vertical lines - abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) - legend("topright", legend=c("mean", "median", "quantile"), - col=c("red", "green","blue"), lty=1:2, cex=0.8) - - #Histogram - hist(as.numeric(filtering_files$sample_miss_df), - main = "Ratio of Missing Data per Sample After Filtering", - xlab = "Proportion of Missing Data per Sample", - ylab = "Number of Samples", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks 
= as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - - # Add vertical lines - abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) - legend("topright", legend=c("mean", "median", "quantile"), - col=c("red", "green","blue"), lty=1:2, cex=0.8) - - - ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) - quantile(as.numeric(new_df$DP), 0.95) - - -}) From 3c4b1a01b57b81401c95e68470a673918761b60c Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:14:35 -0400 Subject: [PATCH 05/40] Removed AF plots and values from table --- R/mod_diversity.R | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/R/mod_diversity.R b/R/mod_diversity.R index 100080e..d756484 100644 --- a/R/mod_diversity.R +++ b/R/mod_diversity.R @@ -16,7 +16,6 @@ mod_diversity_ui <- function(id){ box(title="Inputs", width = 12, collapsible = TRUE, collapsed = FALSE, status = "info", solidHeader = TRUE, fileInput(ns("diversity_file"), "Choose VCF File", accept = c(".csv",".vcf",".gz")), numericInput(ns("diversity_ploidy"), "Species Ploidy", min = 1, value = NULL), - selectInput(ns("zero_value"), "What are the Dosage Calls?", choices = c("Reference Allele Counts", "Alternate Allele Counts"), selected = NULL), actionButton(ns("diversity_start"), "Run Analysis"), div(style="display:inline-block; float:right",dropdownButton( tags$h3("Diversity Parameters"), @@ -33,7 +32,6 @@ mod_diversity_ui <- function(id){ div(style="display:inline-block; float:left",dropdownButton( tags$h3("Save Image"), selectInput(inputId = 
ns('div_figure'), label = 'Figure', choices = c("Dosage Plot", - "AF Histogram", "MAF Histogram", "OHet Histogram", "Marker Plot")), @@ -56,7 +54,6 @@ mod_diversity_ui <- function(id){ title = "Plots", status = "info", solidHeader = FALSE, width = 12, height = 550, bs4Dash::tabsetPanel( tabPanel("Dosage Plot", plotOutput(ns('dosage_plot')),style = "overflow-y: auto; height: 500px"), - tabPanel("AF Plot", plotOutput(ns('af_plot')),style = "overflow-y: auto; height: 500px"), tabPanel("MAF Plot", plotOutput(ns('maf_plot')),style = "overflow-y: auto; height: 500px"), tabPanel("OHet Plot", plotOutput(ns('het_plot')),style = "overflow-y: auto; height: 500px"), tabPanel("Marker Plot", plotOutput(ns('marker_plot')),style = "overflow-y: auto; height: 500px"), #Can this be an interactive plotly? @@ -207,6 +204,7 @@ mod_diversity_server <- function(id){ print("Heterozygosity success") diversity_items$maf_df <- calculateMAF(geno_mat, ploidy = ploidy) + diversity_items$maf_df <- diversity_items$maf_df[, c(1,3)] print("MAF success") @@ -273,17 +271,17 @@ mod_diversity_server <- function(id){ }) #AF Plot - af_plot <- reactive({ - validate( - need(!is.null(diversity_items$maf_df) & !is.null(input$hist_bins), "Input VCF, define parameters and click `run analysis` to access results in this session.") - ) - hist(diversity_items$maf_df$AF, breaks = as.numeric(input$hist_bins), col = "grey", border = "black", xlab = "Alternate Allele Frequency", - ylab = "Frequency", main = "Alternate Allele Frequency Distribution") - }) - - output$af_plot <- renderPlot({ - af_plot() - }) + #af_plot <- reactive({ + # validate( + # need(!is.null(diversity_items$maf_df) & !is.null(input$hist_bins), "Input VCF, define parameters and click `run analysis` to access results in this session.") + # ) + # hist(diversity_items$maf_df$AF, breaks = as.numeric(input$hist_bins), col = "grey", border = "black", xlab = "Alternate Allele Frequency", + # ylab = "Frequency", main = "Alternate Allele Frequency 
Distribution") + #}) + + #output$af_plot <- renderPlot({ + # af_plot() + #}) #MAF plot maf_plot <- reactive({ From 91ca93c5cb1e071f1bfcaf24876b9764fa6cfd2c Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:23:33 -0400 Subject: [PATCH 06/40] Removed diversity tab input --- R/mod_diversity.R | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/R/mod_diversity.R b/R/mod_diversity.R index d756484..821e686 100644 --- a/R/mod_diversity.R +++ b/R/mod_diversity.R @@ -116,7 +116,7 @@ mod_diversity_server <- function(id){ observeEvent(input$diversity_start, { toggleClass(id = "diversity_ploidy", class = "borderred", condition = (is.na(input$diversity_ploidy) | is.null(input$diversity_ploidy))) - toggleClass(id = "zero_value", class = "borderred", condition = (is.na(input$zero_value) | is.null(input$zero_value))) + #toggleClass(id = "zero_value", class = "borderred", condition = (is.na(input$zero_value) | is.null(input$zero_value))) if (is.null(input$diversity_file$datapath)) { shinyalert( @@ -134,7 +134,7 @@ mod_diversity_server <- function(id){ animation = TRUE ) } - req(input$diversity_file, input$diversity_ploidy, input$zero_value) + req(input$diversity_file, input$diversity_ploidy) #Input variables (need to add support for VCF file) ploidy <- as.numeric(input$diversity_ploidy) @@ -173,7 +173,7 @@ mod_diversity_server <- function(id){ print(class(geno_mat)) #Convert genotypes to alternate counts if they are the reference allele counts #Importantly, the dosage plot is based on the input format NOT the converted genotypes - is_reference <- (input$zero_value == "Reference Allele Counts") + is_reference <- TRUE #(input$zero_value == "Reference Allele Counts") print("Genotype file successfully imported") ######Get MAF plot (Need to remember that the VCF genotypes are likely set as 0 = homozygous reference, where the dosage report is 0 = homozygous alternate) From 
5b398033a256a014e58b673eac1e3d895fdf4e62 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 28 Aug 2024 15:24:00 -0400 Subject: [PATCH 07/40] Updated reference to Predictive Ability --- R/app_ui.R | 2 +- R/mod_GSAcc.R | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/R/app_ui.R b/R/app_ui.R index fa1a5d8..42f741f 100644 --- a/R/app_ui.R +++ b/R/app_ui.R @@ -38,7 +38,7 @@ app_ui <- function(request) { menuItem("GWASpoly", tabName = "gwas", icon = icon("think-peaks")), tags$li(class = "header", style = "color: grey; margin-top: 18px; margin-bottom: 10px; padding-left: 15px;", "Genomic Selection"), menuItem( - span("Prediction Accuracy", bs4Badge("beta", position = "right", color = "success")), + span("Predictive Ability", bs4Badge("beta", position = "right", color = "success")), tabName = "prediction_accuracy", icon = icon("right-left")), menuItem( diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 18f544c..004cabe 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -729,7 +729,7 @@ mod_GSAcc_server <- function(id){ #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + geom_boxplot() + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Prediction Accuracy by Trait", + labs(title = "Predictive Ability by Trait", x = " ", y = "Pearson Correlation") + #theme_minimal() + # Using a minimal theme @@ -744,7 +744,7 @@ mod_GSAcc_server <- function(id){ geom_violin(trim = TRUE) + # Add violin plot geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Prediction Accuracy by Trait", + labs(title = "Predictive Ability by Trait", x = " ", # x-label is blank because it's not relevant per facet y = "Pearson Correlation") + theme(legend.position = "none", From 
f58e43d2d06a465b833511169c47b810bebf6514 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 28 Aug 2024 16:29:17 -0400 Subject: [PATCH 08/40] fixed chrom order bug --- R/mod_diversity.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/mod_diversity.R b/R/mod_diversity.R index 821e686..1f5493f 100644 --- a/R/mod_diversity.R +++ b/R/mod_diversity.R @@ -300,7 +300,12 @@ mod_diversity_server <- function(id){ ) #Order the Chr column diversity_items$pos_df$POS <- as.numeric(diversity_items$pos_df$POS) - # Sort the dataframe + # Sort the dataframe and pad with a 0 if only a single digit is provided + diversity_items$pos_df$CHROM <- ifelse( + nchar(diversity_items$pos_df$CHROM) == 1, + paste0("0", diversity_items$pos_df$CHROM), + diversity_items$pos_df$CHROM + ) diversity_items$pos_df <- diversity_items$pos_df[order(diversity_items$pos_df$CHROM), ] #Plot From a9daa92424f53ec05896fa09a5b1a4782735a3ed Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 28 Aug 2024 17:03:00 -0400 Subject: [PATCH 09/40] allows links between modules --- NAMESPACE | 1 + R/app_server.R | 51 +- R/app_ui.R | 2 +- R/mod_DosageCall.R | 299 +++++----- R/mod_Filtering.R | 893 +++++++++++++++--------------- R/mod_GS.R | 1060 ++++++++++++++++++------------------ R/mod_GSAcc.R | 1293 ++++++++++++++++++++++---------------------- R/mod_Home.R | 117 ++-- R/mod_PCA.R | 653 +++++++++++----------- R/mod_dapc.R | 569 ++++++++++--------- R/mod_diversity.R | 642 +++++++++++----------- R/mod_dosage2vcf.R | 217 ++++---- R/mod_gwas.R | 792 ++++++++++++++------------- R/mod_help.R | 93 +++- R/mod_mapping.R | 23 +- R/mod_slurm.R | 53 +- 16 files changed, 3421 insertions(+), 3337 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 36375a3..0618583 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,6 +43,7 @@ importFrom(bs4Dash,sidebarMenu) importFrom(bs4Dash,tabBox) importFrom(bs4Dash,tabItem) importFrom(bs4Dash,tabItems) 
+importFrom(bs4Dash,updateControlbarMenu) importFrom(bs4Dash,valueBox) importFrom(bs4Dash,valueBoxOutput) importFrom(config,get) diff --git a/R/app_server.R b/R/app_server.R index acf6dbd..afda17e 100644 --- a/R/app_server.R +++ b/R/app_server.R @@ -11,14 +11,45 @@ app_server <- function(input, output, session) { options(shiny.maxRequestSize = 10000 * 1024^2) # Set maximum upload size to 10GB #shiny.maxRequestSize = 10000 * 1024^2; # 10 GB <- This is for a future limit when using BI's server remotely - mod_DosageCall_server("DosageCall_1") - mod_Filtering_server("Filtering_1") - mod_dosage2vcf_server("dosage2vcf_1") - mod_PCA_server("PCA_1") - mod_dapc_server("dapc_1") - mod_gwas_server("gwas_1") - mod_diversity_server("diversity_1") - mod_GS_server("GS_1") - mod_GSAcc_server("GSAcc_1") - mod_slurm_server("slurm_1") + callModule(mod_DosageCall_server, + "DosageCall_1", + parent_session = session) + callModule(mod_Filtering_server, + "Filtering_1", + parent_session = session) + callModule(mod_dosage2vcf_server, + "dosage2vcf_1", + parent_session = session) + callModule(mod_PCA_server, + "PCA_1", + parent_session = session) + callModule(mod_dapc_server, + "dapc_1", + parent_session = session) + callModule(mod_gwas_server, + "gwas_1", + parent_session = session) + callModule(mod_diversity_server, + "diversity_1", + parent_session = session) + callModule(mod_GS_server, + "GS_1", + parent_session = session) + callModule(mod_GSAcc_server, + "GSAcc_1", + parent_session = session) + callModule(mod_slurm_server, + "slurm_1", + parent_session = session) + + # mod_DosageCall_server("DosageCall_1") + # mod_Filtering_server("Filtering_1") + # mod_dosage2vcf_server("dosage2vcf_1") + # mod_PCA_server("PCA_1") + # mod_dapc_server("dapc_1") + # mod_gwas_server("gwas_1") + # mod_diversity_server("diversity_1") + # mod_GS_server("GS_1") + # mod_GSAcc_server("GSAcc_1") + # mod_slurm_server("slurm_1") } diff --git a/R/app_ui.R b/R/app_ui.R index fa1a5d8..5efff36 100644 --- a/R/app_ui.R 
+++ b/R/app_ui.R @@ -21,7 +21,7 @@ app_ui <- function(request) { ), bs4DashSidebar( skin="light", status = "info", - sidebarMenu( + sidebarMenu(id = "MainMenu", flat = FALSE, tags$li(class = "header", style = "color: grey; margin-top: 10px; margin-bottom: 10px; padding-left: 15px;", "Menu"), menuItem("Home", tabName = "welcome", icon = icon("house")), diff --git a/R/mod_DosageCall.R b/R/mod_DosageCall.R index 09d0692..b8c1c9c 100644 --- a/R/mod_DosageCall.R +++ b/R/mod_DosageCall.R @@ -84,181 +84,180 @@ mod_DosageCall_ui <- function(id){ #' @importFrom shinyjs enable disable #' #' @noRd -mod_DosageCall_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns +mod_DosageCall_server <- function(input, output, session, parent_session){ - snp_number <- reactiveVal(0) + ns <- session$ns - #SNP counts value box - output$MADCsnps <- renderValueBox({ - valueBox(snp_number(), "Markers in uploaded file", icon = icon("dna"), color = "info") - }) - - disable("download_updog_vcf") - - ##This is for performing Updog Dosage Calling - updog_out <- eventReactive(input$run_analysis,{ - - # Missing input with red border and alerts - toggleClass(id = "ploidy", class = "borderred", condition = (is.na(input$ploidy) | is.null(input$ploidy))) - toggleClass(id = "output_name", class = "borderred", condition = (is.na(input$output_name) | is.null(input$output_name) | input$output_name == "")) - - if (is.null(input$madc_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF File", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$madc_file$datapath, input$output_name, input$ploidy) + snp_number <- reactiveVal(0) - # Get inputs - madc_file <- input$madc_file$datapath - output_name <- input$output_name - ploidy <- input$ploidy - 
cores <- input$cores - model_select <- input$updog_model + #SNP counts value box + output$MADCsnps <- renderValueBox({ + valueBox(snp_number(), "Markers in uploaded file", icon = icon("dna"), color = "info") + }) - # Status - updateProgressBar(session = session, id = "pb_madc", value = 0, title = "Formatting Input Files") - #Import genotype info if genotype matrix format - if (grepl("\\.csv$", madc_file)) { - # Call the get_counts function with the specified MADC file path and output file path - #Status - result_df <- get_counts(madc_file, output_name) + disable("download_updog_vcf") + + ##This is for performing Updog Dosage Calling + updog_out <- eventReactive(input$run_analysis,{ + + # Missing input with red border and alerts + toggleClass(id = "ploidy", class = "borderred", condition = (is.na(input$ploidy) | is.null(input$ploidy))) + toggleClass(id = "output_name", class = "borderred", condition = (is.na(input$output_name) | is.null(input$output_name) | input$output_name == "")) + + if (is.null(input$madc_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload VCF File", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + req(input$madc_file$datapath, input$output_name, input$ploidy) + + # Get inputs + madc_file <- input$madc_file$datapath + output_name <- input$output_name + ploidy <- input$ploidy + cores <- input$cores + model_select <- input$updog_model + + # Status + updateProgressBar(session = session, id = "pb_madc", value = 0, title = "Formatting Input Files") + #Import genotype info if genotype matrix format + if (grepl("\\.csv$", madc_file)) { + # Call the get_counts function with the specified MADC file path and output file path + #Status + result_df <- get_counts(madc_file, output_name) - #Call the get_matrices function - matrices <- 
get_matrices(result_df) + #Call the get_matrices function + matrices <- get_matrices(result_df) - #Number of SNPs - snp_number <- (nrow(result_df) / 2) + #Number of SNPs + snp_number <- (nrow(result_df) / 2) - #SNP counts value box - output$MADCsnps <- renderValueBox({ - valueBox(snp_number, "Markers in MADC File", icon = icon("dna"), color = "info") - }) + #SNP counts value box + output$MADCsnps <- renderValueBox({ + valueBox(snp_number, "Markers in MADC File", icon = icon("dna"), color = "info") + }) - } else { + } else { - #Initialize matrices list - matrices <- list() + #Initialize matrices list + matrices <- list() - #Import genotype information if in VCF format - vcf <- read.vcfR(madc_file, verbose = FALSE) + #Import genotype information if in VCF format + vcf <- read.vcfR(madc_file, verbose = FALSE) - #Get items in FORMAT column - info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT - chrom <- vcf@fix[,1] - pos <- vcf@fix[,2] + #Get items in FORMAT column + info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT + chrom <- vcf@fix[,1] + pos <- vcf@fix[,2] - info_ids <- extract_info_ids(info[1]) + info_ids <- extract_info_ids(info[1]) - if (("DP" %in% info_ids) && (("RA" %in% info_ids) | ("AD" %in% info_ids))) { - #Extract DP and RA and convert to matrices - matrices$size_matrix <- extract.gt(vcf, element = "DP") - if("RA" %in% info_ids){ - matrices$ref_matrix <- extract.gt(vcf, element = "RA") - } else { - ad_matrix <- extract.gt(vcf, element = "AD") - matrices$ref_matrix <- matrix(sapply(strsplit(ad_matrix, ","), "[[", 1), nrow = nrow(matrices$size_matrix)) - colnames(matrices$ref_matrix) <- colnames(matrices$size_matrix) - rownames(matrices$ref_matrix) <- rownames(matrices$size_matrix) - } + if (("DP" %in% info_ids) && (("RA" %in% info_ids) | ("AD" %in% info_ids))) { + #Extract DP and RA and convert to matrices + matrices$size_matrix <- extract.gt(vcf, element = "DP") + if("RA" %in% info_ids){ + matrices$ref_matrix <- extract.gt(vcf, element = "RA") + 
} else { + ad_matrix <- extract.gt(vcf, element = "AD") + matrices$ref_matrix <- matrix(sapply(strsplit(ad_matrix, ","), "[[", 1), nrow = nrow(matrices$size_matrix)) + colnames(matrices$ref_matrix) <- colnames(matrices$size_matrix) + rownames(matrices$ref_matrix) <- rownames(matrices$size_matrix) + } - class(matrices$size_matrix) <- "numeric" - class(matrices$ref_matrix) <- "numeric" - rownames(matrices$size_matrix) <- rownames(matrices$ref_matrix) <- paste0(chrom, "_", pos) + class(matrices$size_matrix) <- "numeric" + class(matrices$ref_matrix) <- "numeric" + rownames(matrices$size_matrix) <- rownames(matrices$ref_matrix) <- paste0(chrom, "_", pos) - rm(vcf) #Remove VCF + rm(vcf) #Remove VCF - snp_number <- (nrow(matrices$size_matrix)) + snp_number <- (nrow(matrices$size_matrix)) - #SNP counts value box - output$MADCsnps <- renderValueBox({ - valueBox(snp_number, "Markers in VCF File", icon = icon("dna"), color = "info") - }) + #SNP counts value box + output$MADCsnps <- renderValueBox({ + valueBox(snp_number, "Markers in VCF File", icon = icon("dna"), color = "info") + }) - }else{ - ##Add user warning about read depth and allele read depth not found - stop(safeError("Error: DP and RA/AD FORMAT flags not found in VCF file")) - } + }else{ + ##Add user warning about read depth and allele read depth not found + stop(safeError("Error: DP and RA/AD FORMAT flags not found in VCF file")) } + } + + #Run Updog + #I initially used the "norm" model + #I am also taking the ploidy from the max value in the + updateProgressBar(session = session, id = "pb_madc", value = 40, title = "Dosage Calling in Progress") + print('Performing Updog dosage calling') + mout <- multidog(refmat = matrices$ref_matrix, + sizemat = matrices$size_matrix, + ploidy = as.numeric(ploidy), + model = model_select, + nc = cores) + #Status + updateProgressBar(session = session, id = "pb_madc", value = 100, title = "Finished") + mout + }) - #Run Updog - #I initially used the "norm" model - #I am also taking 
the ploidy from the max value in the - updateProgressBar(session = session, id = "pb_madc", value = 40, title = "Dosage Calling in Progress") - print('Performing Updog dosage calling') - mout <- multidog(refmat = matrices$ref_matrix, - sizemat = matrices$size_matrix, - ploidy = as.numeric(ploidy), - model = model_select, - nc = cores) - #Status - updateProgressBar(session = session, id = "pb_madc", value = 100, title = "Finished") - mout - }) - - # Only make available the download button when analysis is finished - observe({ - if (!is.null(updog_out())) { - Sys.sleep(1) - # enable the download button - enable("download_updog_vcf") - } else { - disable("download_updog_vcf") - } - }) + # Only make available the download button when analysis is finished + observe({ + if (!is.null(updog_out())) { + Sys.sleep(1) + # enable the download button + enable("download_updog_vcf") + } else { + disable("download_updog_vcf") + } + }) - output$download_updog_vcf <- downloadHandler( - filename = function() { - paste0(input$output_name, ".vcf.gz") - }, - content = function(file) { - #Save Updog output as VCF file - temp <- tempfile() - updog2vcf( - multidog.object = updog_out(), - output.file = temp, - updog_version = packageVersion("updog"), - compress = TRUE - ) + output$download_updog_vcf <- downloadHandler( + filename = function() { + paste0(input$output_name, ".vcf.gz") + }, + content = function(file) { + #Save Updog output as VCF file + temp <- tempfile() + updog2vcf( + multidog.object = updog_out(), + output.file = temp, + updog_version = packageVersion("updog"), + compress = TRUE + ) - # Move the file to the path specified by 'file' - file.copy(paste0(temp, ".vcf.gz"), file) + # Move the file to the path specified by 'file' + file.copy(paste0(temp, ".vcf.gz"), file) - # Delete the temporary file - unlink(temp) - }) + # Delete the temporary file + unlink(temp) + }) - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - 
}, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) - }) + output$download_vcf <- downloadHandler( + filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) - output$download_madc <- downloadHandler( - filename = function() { - paste0("BIGapp_MADC_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_DArT_MADC.csv", package = "BIGapp") - file.copy(ex, file) - }) - }) + output$download_madc <- downloadHandler( + filename = function() { + paste0("BIGapp_MADC_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_DArT_MADC.csv", package = "BIGapp") + file.copy(ex, file) + }) } ## To be copied in the UI diff --git a/R/mod_Filtering.R b/R/mod_Filtering.R index 72ba828..7225f6c 100644 --- a/R/mod_Filtering.R +++ b/R/mod_Filtering.R @@ -99,479 +99,334 @@ mod_Filtering_ui <- function(id){ #' @importFrom graphics abline axis hist #' #' @noRd -mod_Filtering_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns +mod_Filtering_server <- function(input, output, session, parent_session){ - #vcf - filtering_files <- reactiveValues( - raw_vcf_df = NULL, - sample_miss_df = NULL, - snp_miss_df = NULL + ns <- session$ns + #vcf + filtering_files <- reactiveValues( + raw_vcf_df = NULL, + sample_miss_df = NULL, + snp_miss_df = NULL + + ) + + #Reactive boxes + output$snp_retained_box <- renderValueBox({ + valueBox( + value = 0, + subtitle = "SNPs Retained", + icon = icon("dna"), + color = "info" ) + }) - #Reactive boxes - output$snp_retained_box <- renderValueBox({ - valueBox( - value = 0, - subtitle = "SNPs Retained", - icon = icon("dna"), - color = "info" - ) - }) + output$snp_removed_box <- renderValueBox({ + valueBox( + value = 0, + subtitle = "Percent SNPs Removed", + icon = 
icon("filter-circle-xmark"), + color = "info" + ) + }) + + disable("start_updog_filter") + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("Results", p("Upload VCF file to access results in this section.")) + ) + }) + + vcf <- eventReactive(input$run_filters, { + + # Ensure the files are uploaded + # Missing input with red border and alerts + toggleClass(id = "filter_ploidy", class = "borderred", condition = (is.na(input$filter_ploidy) | is.null(input$filter_ploidy) | input$filter_ploidy == "")) + toggleClass(id = "filter_output_name", class = "borderred", condition = (is.na(input$filter_output_name) | is.null(input$filter_output_name) | input$filter_output_name == "")) + + if (is.null(input$updog_rdata$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload Dose Report and Counts Files", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + + req(input$filter_ploidy, input$filter_output_name,input$updog_rdata) + + #Input file + vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) + + # Identify if have updog parameters + format_fields <- unique(vcf@gt[,1]) + info_fields <- vcf@fix[1,8] + updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) & grepl("OD", info_fields) + + if(updog_par){ + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("Bias Histogram", icon = icon("image"), plotOutput(ns("bias_hist"), height = '550px')), + tabPanel("OD Histogram", icon = icon("image"), plotOutput(ns("od_hist"), height = '550px')), + tabPanel("Prop_mis Histogram", icon = icon("image"), plotOutput(ns("maxpostprob_hist"), height = '550px')), + 
tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), + tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) + ) + }) + } else { + output$din_tabs <- renderUI({ + tabBox(width =12, collapsible = FALSE, status = "info", + id = "updog_tab", height = "600px", + tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), + tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) + ) + }) + } + + + if (input$use_updog & updog_par) { + # Use Updog filtering parameters + OD_filter <- as.numeric(input$OD_filter) + Prop_mis <- as.numeric(input$Prop_mis) + Bias_min <- as.numeric(input$Bias[1]) + Bias_max <- as.numeric(input$Bias[2]) + max_post <- as.numeric(input$maxpostprob_filter) + + # Perform filtering with Updog parameters + # (insert your filtering code here) + } else { + # Do not use Updog filtering parameters + OD_filter = NULL + Prop_mis = NULL + Bias_min = NULL + Bias_max = NULL + max_post = NULL + } + + #Variables + size_depth <- input$size_depth + output_name <- input$filter_output_name + snp_miss <- input$snp_miss / 100 + sample_miss <- input$sample_miss / 100 + ploidy <- as.numeric(input$filter_ploidy) + maf_filter <- input$filter_maf + + updateProgressBar(session = session, id = "pb_filter", value = 10, title = "Processing VCF file") + + #Starting SNPs + starting_snps <- nrow(vcf) output$snp_removed_box <- renderValueBox({ valueBox( - value = 0, + value = round(((starting_snps - final_snps)/starting_snps*100),1), subtitle = "Percent SNPs Removed", - icon = icon("filter-circle-xmark"), + icon = icon("dna"), color = "info" ) }) - disable("start_updog_filter") - - output$din_tabs <- renderUI({ - tabBox(width =12, collapsible = FALSE, status = "info", - id = "updog_tab", height = "600px", - tabPanel("Results", p("Upload VCF file to access results in this section.")) + #export INFO 
dataframe + filtering_files$raw_vcf_df <- data.frame(vcf@fix) + + #Pb + updateProgressBar(session = session, id = "pb_filter", value = 40, title = "Filtering VCF file") + + #Filtering + vcf <- filterVCF(vcf.file = vcf, + ploidy=ploidy, + output.file=NULL, + filter.OD = OD_filter, + filter.BIAS.min = Bias_min, + filter.BIAS.max = Bias_max, + filter.DP = as.numeric(size_depth), + filter.PMC = Prop_mis, + filter.SAMPLE.miss = as.numeric(sample_miss), + filter.SNP.miss = as.numeric(snp_miss), + filter.MAF = as.numeric(maf_filter), + filter.MPP = max_post) + + if (length(vcf@gt) == 0) { + shinyalert( + title = "All markers were filtered out", + text = "Loose the parameters to access results in this tab", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE ) - }) - - vcf <- eventReactive(input$run_filters, { - - # Ensure the files are uploaded - # Missing input with red border and alerts - toggleClass(id = "filter_ploidy", class = "borderred", condition = (is.na(input$filter_ploidy) | is.null(input$filter_ploidy) | input$filter_ploidy == "")) - toggleClass(id = "filter_output_name", class = "borderred", condition = (is.na(input$filter_output_name) | is.null(input$filter_output_name) | input$filter_output_name == "")) - - if (is.null(input$updog_rdata$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload Dose Report and Counts Files", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } + } - req(input$filter_ploidy, input$filter_output_name,input$updog_rdata) - - #Input file - vcf <- read.vcfR(input$updog_rdata$datapath, verbose = FALSE) - - # Identify if have updog parameters - 
format_fields <- unique(vcf@gt[,1]) - info_fields <- vcf@fix[1,8] - updog_par <- grepl("MPP", format_fields) & grepl("PMC", info_fields) & grepl("BIAS", info_fields) & grepl("OD", info_fields) - - if(updog_par){ - output$din_tabs <- renderUI({ - tabBox(width =12, collapsible = FALSE, status = "info", - id = "updog_tab", height = "600px", - tabPanel("Bias Histogram", icon = icon("image"), plotOutput(ns("bias_hist"), height = '550px')), - tabPanel("OD Histogram", icon = icon("image"), plotOutput(ns("od_hist"), height = '550px')), - tabPanel("Prop_mis Histogram", icon = icon("image"), plotOutput(ns("maxpostprob_hist"), height = '550px')), - tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), - tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) - ) - }) - } else { - output$din_tabs <- renderUI({ - tabBox(width =12, collapsible = FALSE, status = "info", - id = "updog_tab", height = "600px", - tabPanel("SNP_miss", icon = icon("image"), plotOutput(ns("missing_snp_hist"), height = '550px')), - tabPanel("Sample_miss", icon = icon("image"), plotOutput(ns("missing_sample_hist"), height = '550px')) - ) - }) - } + #Getting missing data information + #Add support for genotype matrix filtering? 
+ #Pb + updateProgressBar(session = session, id = "pb_filter", value = 50, title = "Calculating Missing Data") + gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) + filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values + filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values + rm(gt_matrix) #Remove gt matrix - if (input$use_updog & updog_par) { - # Use Updog filtering parameters - OD_filter <- as.numeric(input$OD_filter) - Prop_mis <- as.numeric(input$Prop_mis) - Bias_min <- as.numeric(input$Bias[1]) - Bias_max <- as.numeric(input$Bias[2]) - max_post <- as.numeric(input$maxpostprob_filter) - - # Perform filtering with Updog parameters - # (insert your filtering code here) - } else { - # Do not use Updog filtering parameters - OD_filter = NULL - Prop_mis = NULL - Bias_min = NULL - Bias_max = NULL - max_post = NULL - } - - #Variables - size_depth <- input$size_depth - output_name <- input$filter_output_name - snp_miss <- input$snp_miss / 100 - sample_miss <- input$sample_miss / 100 - ploidy <- as.numeric(input$filter_ploidy) - maf_filter <- input$filter_maf - - updateProgressBar(session = session, id = "pb_filter", value = 10, title = "Processing VCF file") - - #Starting SNPs - starting_snps <- nrow(vcf) - output$snp_removed_box <- renderValueBox({ - valueBox( - value = round(((starting_snps - final_snps)/starting_snps*100),1), - subtitle = "Percent SNPs Removed", - icon = icon("dna"), - color = "info" - ) - }) + #Pb + updateProgressBar(session = session, id = "pb_filter", value = 80, title = "Exporting Filtered VCF") - #export INFO dataframe - filtering_files$raw_vcf_df <- data.frame(vcf@fix) - - #Pb - updateProgressBar(session = session, id = "pb_filter", value = 40, title = "Filtering VCF file") - - #Filtering - vcf <- filterVCF(vcf.file = vcf, - ploidy=ploidy, - output.file=NULL, - filter.OD = OD_filter, - filter.BIAS.min = Bias_min, - filter.BIAS.max = Bias_max, - filter.DP = 
as.numeric(size_depth), - filter.PMC = Prop_mis, - filter.SAMPLE.miss = as.numeric(sample_miss), - filter.SNP.miss = as.numeric(snp_miss), - filter.MAF = as.numeric(maf_filter), - filter.MPP = max_post) - - if (length(vcf@gt) == 0) { - shinyalert( - title = "All markers were filtered out", - text = "Loose the parameters to access results in this tab", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - - #Getting missing data information - #Add support for genotype matrix filtering? - #Pb - updateProgressBar(session = session, id = "pb_filter", value = 50, title = "Calculating Missing Data") - - gt_matrix <- extract.gt(vcf, element = "GT", as.numeric = FALSE) - filtering_files$snp_miss_df <- rowMeans(is.na(gt_matrix)) #SNP missing values - filtering_files$sample_miss_df <- as.numeric(colMeans(is.na(gt_matrix))) #Sample missing values - rm(gt_matrix) #Remove gt matrix - - #Pb - updateProgressBar(session = session, id = "pb_filter", value = 80, title = "Exporting Filtered VCF") - - #Get final_snps - final_snps <- nrow(vcf) - #Updating value boxes - output$snp_retained_box <- renderValueBox({ - valueBox( - value = final_snps, - subtitle = "SNPs Retained", - icon = icon("dna"), - color = "info" - ) - }) - - # Status - updateProgressBar(session = session, id = "pb_filter", value = 100, title = "Finished!") - - vcf + #Get final_snps + final_snps <- nrow(vcf) + #Updating value boxes + output$snp_retained_box <- renderValueBox({ + valueBox( + value = final_snps, + subtitle = "SNPs Retained", + icon = icon("dna"), + color = "info" + ) }) - # Only make available the download button when analysis is finished - observe({ - if (!is.null(vcf())) { - Sys.sleep(1) - # enable the download button - enable("start_updog_filter") - } else { - disable("start_updog_filter") - } - }) + # Status + 
updateProgressBar(session = session, id = "pb_filter", value = 100, title = "Finished!") + vcf + }) - #Updog filtering - output$start_updog_filter <- downloadHandler( - filename = function() { - paste0(input$filter_output_name, ".vcf.gz") - }, - content = function(file) { + # Only make available the download button when analysis is finished + observe({ + if (!is.null(vcf())) { + Sys.sleep(1) + # enable the download button + enable("start_updog_filter") + } else { + disable("start_updog_filter") + } + }) - #Writing file - temp_file <- tempfile(fileext = ".vcf.gz") - write.vcf(vcf(), file = temp_file) - # Check if the VCF file was created - if (file.exists(temp_file)) { - cat("VCF file created successfully.\n") + #Updog filtering + output$start_updog_filter <- downloadHandler( + filename = function() { + paste0(input$filter_output_name, ".vcf.gz") + }, + content = function(file) { - # Move the file to the path specified by 'file' - file.copy(temp_file, file, overwrite = TRUE) + #Writing file + temp_file <- tempfile(fileext = ".vcf.gz") + write.vcf(vcf(), file = temp_file) - # Delete the temporary file - unlink(temp_file) - } else { - stop("Error: Failed to create the VCF file.") - } + # Check if the VCF file was created + if (file.exists(temp_file)) { + cat("VCF file created successfully.\n") - } - ) + # Move the file to the path specified by 'file' + file.copy(temp_file, file, overwrite = TRUE) - #Download figures for VCF Filtering - output$download_filter_hist <- downloadHandler( - - filename = function() { - if (input$image_type == "jpeg") { - paste("VCF-histogram-", Sys.Date(), ".jpg", sep="") - } else if (input$image_type == "png") { - paste("VCF-histogram-", Sys.Date(), ".png", sep="") - } else { - paste("VCF-histogram-", Sys.Date(), ".tiff", sep="") - } - }, - content = function(file) { - req(input$image_type) - - if (input$image_type == "jpeg") { - jpeg(file, width = as.numeric(input$image_width), height = as.numeric(input$image_height), res= 
as.numeric(input$image_res), units = "in") - } else if (input$image_type == "png") { - png(file, width = as.numeric(input$image_width), height = as.numeric(input$image_height), res= as.numeric(input$image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$image_width), height = as.numeric(input$image_height), res= as.numeric(input$image_res), units = "in") - } - - # Conditional plotting based on input selection - req(filtering_output$df, filtering_files) - if (input$filter_hist == "Bias Histogram") { - - hist(as.numeric(filtering_output$df$BIAS), - main = "Unfiltered SNP bias histogram", - xlab = "bias", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,5), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks - abline(v = mean(as.numeric(filtering_output$df$BIAS)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_output$df$BIAS)), col = "green", lty = 2) # Median line - abline(v = 0.5, col = "black", lty = 2) # proposed lower line - abline(v = 2, col = "black", lty = 2) # proposed upper line - legend("topright", legend=c("mean", "median", "suggested threshold"), - col=c("red", "green","black"), lty=2, cex=0.8) - - } else if (input$filter_hist == "OD Histogram") { - - #Plot - hist(as.numeric(filtering_output$df$OD), - main = "Unfiltered SNP overdispersion parameter histogram", - xlab = "OD", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,0.6), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog - - # Add vertical lines - abline(v = mean(as.numeric(filtering_output$df$OD)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_output$df$OD)), col = "green", lty = 2) # Median line - abline(v = 0.05, col = "black", lty = 2) # 
proposed filter by updog - legend("topright", legend=c("mean", "median", "suggested threshold"), - col=c("red", "green","black"), lty=2, cex=0.8) - - } else if (input$filter_hist == "Prop_mis Histogram") { - - hist(as.numeric(filtering_output$df$PMC), - main = "The estimated proportion of individuals misclassified in the SNP from updog", - xlab = "Proportion of Misclassified Genotypes per SNP", - ylab = "Number of SNPs", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - - # Add vertical lines - abline(v = mean(as.numeric(filtering_output$df$PMC)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_output$df$PMC)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(filtering_output$df$PMC), 0.95), col = "blue", lty = 2) - legend("topright", legend=c("mean", "median", "quantile"), - col=c("red", "green","blue"), lty=2, cex=0.8) - - } else if (input$filter_hist == "SNP_mis") { - - hist(as.numeric(filtering_files$snp_miss_df), - main = "Ratio of Missing Data per SNP After Filtering", - xlab = "Proportion of Missing Data per SNP", - ylab = "Number of SNPs", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - - # Add vertical lines - abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) - legend("topright", legend=c("mean", "median", "quantile"), - col=c("red", "green","blue"), lty=2, cex=0.8) - - } else if (input$filter_hist == "Sample_mis") { - - hist(as.numeric(filtering_files$sample_miss_df), - main = "Ratio of 
Missing Data per Sample After Filtering", - xlab = "Proportion of Missing Data per Sample", - ylab = "Number of Samples", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - - # Add vertical lines - abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) - legend("topright", legend=c("mean", "median", "quantile"), - col=c("red", "green","blue"), lty=2, cex=0.8) - } - dev.off() + # Delete the temporary file + unlink(temp_file) + } else { + stop("Error: Failed to create the VCF file.") } - ) - - # Commented code - ##Updog file stats - #Consider Extracting the GT info or UD info if present as a datafrfame, - #Obtaining the info in the INFO column as it's own dataframe with a column for each value - #Then remove the VCF file and use the remaining dataframes for producing the figures - filtering_output <- reactiveValues(df = NULL) - - observeEvent(filtering_files$raw_vcf_df, { - - # Apply the function to each row and bind the results into a new dataframe - new_df <- data.frame(filtering_files$raw_vcf_df) %>% - mutate(INFO_list = map(INFO, split_info_column)) %>% - unnest_wider(INFO_list) - - #Save df to reactive value - filtering_output$df <- new_df + } + ) + #Download figures for VCF Filtering + output$download_filter_hist <- downloadHandler( - ##Make plots - #Number of SNPs - nrow(filtering_files$raw_vcf_df) + filename = function() { + if (input$image_type == "jpeg") { + paste("VCF-histogram-", Sys.Date(), ".jpg", sep="") + } else if (input$image_type == "png") { + paste("VCF-histogram-", Sys.Date(), ".png", sep="") + } else { + paste("VCF-histogram-", Sys.Date(), ".tiff", sep="") + } + }, + 
content = function(file) { + req(input$image_type) + + if (input$image_type == "jpeg") { + jpeg(file, width = as.numeric(input$image_width), height = as.numeric(input$image_height), res= as.numeric(input$image_res), units = "in") + } else if (input$image_type == "png") { + png(file, width = as.numeric(input$image_width), height = as.numeric(input$image_height), res= as.numeric(input$image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$image_width), height = as.numeric(input$image_height), res= as.numeric(input$image_res), units = "in") + } - ###Bias + # Conditional plotting based on input selection + req(filtering_output$df, filtering_files) + if (input$filter_hist == "Bias Histogram") { - #Histogram - if(any(grepl("BIAS", colnames(new_df)))){ - output$bias_hist <- renderPlot({ - hist(as.numeric(new_df$BIAS), - main = "Unfiltered SNP bias histogram", - xlab = "bias", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,5), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks - abline(v = mean(as.numeric(new_df$BIAS)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$BIAS)), col = "green", lty = 2) # Median line - abline(v = 0.5, col = "black", lty = 2) # proposed lower line - abline(v = 2, col = "black", lty = 2) # proposed upper line - legend("topright", legend=c("mean", "median", "suggested threshold"), - col=c("red", "green","black"), lty=2, cex=0.8) - }) - } + hist(as.numeric(filtering_output$df$BIAS), + main = "Unfiltered SNP bias histogram", + xlab = "bias", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,5), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks + abline(v = mean(as.numeric(filtering_output$df$BIAS)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_output$df$BIAS)), col = 
"green", lty = 2) # Median line + abline(v = 0.5, col = "black", lty = 2) # proposed lower line + abline(v = 2, col = "black", lty = 2) # proposed upper line + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) + + } else if (input$filter_hist == "OD Histogram") { + + #Plot + hist(as.numeric(filtering_output$df$OD), + main = "Unfiltered SNP overdispersion parameter histogram", + xlab = "OD", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,0.6), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog - ###OD - if(any(grepl("OD", colnames(new_df)))){ + # Add vertical lines + abline(v = mean(as.numeric(filtering_output$df$OD)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_output$df$OD)), col = "green", lty = 2) # Median line + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) - quantile(as.numeric(new_df$OD), 0.95) - #Histogram - output$od_hist <- renderPlot({ - hist(as.numeric(new_df$OD), - main = "Unfiltered SNP overdispersion parameter histogram", - xlab = "OD", - ylab = "SNPs", - col = "lightblue", - border = "black", - xlim = c(0,0.6), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog - - # Add vertical lines - abline(v = mean(as.numeric(new_df$OD)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$OD)), col = "green", lty = 2) # Median line - abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog - legend("topright", legend=c("mean", "median", "suggested 
threshold"), - col=c("red", "green","black"), lty=2, cex=0.8) - - }) - } + } else if (input$filter_hist == "Prop_mis Histogram") { - ##MAXPOSTPROB + hist(as.numeric(filtering_output$df$PMC), + main = "The estimated proportion of individuals misclassified in the SNP from updog", + xlab = "Proportion of Misclassified Genotypes per SNP", + ylab = "Number of SNPs", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - #Histogram - if(any(grepl("PMC", colnames(new_df)))){ - - output$maxpostprob_hist <- renderPlot({ - - #Histogram - hist(as.numeric(new_df$PMC), - main = "The estimated proportion of individuals misclassified in the SNP from updog", - xlab = "Proportion of Misclassified Genotypes per SNP", - ylab = "Number of SNPs", - col = "lightblue", - border = "black", - xlim = c(0,1), - breaks = as.numeric(input$hist_bins)) - axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks - - # Add vertical lines - abline(v = mean(as.numeric(new_df$PMC)), col = "red", lty = 2) # Mean line - abline(v = median(as.numeric(new_df$PMC)), col = "green", lty = 2) # Median line - abline(v = quantile(as.numeric(new_df$PMC), 0.95), col = "blue", lty = 2) - legend("topright", legend=c("mean", "median", "quantile"), - col=c("red", "green","blue"), lty=2, cex=0.8) - - }) - } + # Add vertical lines + abline(v = mean(as.numeric(filtering_output$df$PMC)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_output$df$PMC)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(filtering_output$df$PMC), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) - #Missing data - output$missing_snp_hist <- renderPlot({ + } else if (input$filter_hist == "SNP_mis") { - #Histogram 
hist(as.numeric(filtering_files$snp_miss_df), main = "Ratio of Missing Data per SNP After Filtering", xlab = "Proportion of Missing Data per SNP", @@ -588,11 +443,9 @@ mod_Filtering_server <- function(id){ abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) legend("topright", legend=c("mean", "median", "quantile"), col=c("red", "green","blue"), lty=2, cex=0.8) - }) - output$missing_sample_hist <- renderPlot({ + } else if (input$filter_hist == "Sample_mis") { - #Histogram hist(as.numeric(filtering_files$sample_miss_df), main = "Ratio of Missing Data per Sample After Filtering", xlab = "Proportion of Missing Data per Sample", @@ -609,21 +462,167 @@ mod_Filtering_server <- function(id){ abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) legend("topright", legend=c("mean", "median", "quantile"), col=c("red", "green","blue"), lty=2, cex=0.8) + } + dev.off() + } + ) + + # Commented code + ##Updog file stats + #Consider Extracting the GT info or UD info if present as a datafrfame, + #Obtaining the info in the INFO column as it's own dataframe with a column for each value + #Then remove the VCF file and use the remaining dataframes for producing the figures + + filtering_output <- reactiveValues(df = NULL) + + observeEvent(filtering_files$raw_vcf_df, { + + # Apply the function to each row and bind the results into a new dataframe + new_df <- data.frame(filtering_files$raw_vcf_df) %>% + mutate(INFO_list = map(INFO, split_info_column)) %>% + unnest_wider(INFO_list) + + #Save df to reactive value + filtering_output$df <- new_df + + + ##Make plots + #Number of SNPs + nrow(filtering_files$raw_vcf_df) + + ###Bias + + #Histogram + if(any(grepl("BIAS", colnames(new_df)))){ + output$bias_hist <- renderPlot({ + hist(as.numeric(new_df$BIAS), + main = "Unfiltered SNP bias histogram", + xlab = "bias", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,5), + breaks = 
as.numeric(input$hist_bins)) + axis(1, at = seq(0, 5, by = .2), labels = rep("", length(seq(0, 5, by = 0.2)))) # Add ticks + abline(v = mean(as.numeric(new_df$BIAS)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$BIAS)), col = "green", lty = 2) # Median line + abline(v = 0.5, col = "black", lty = 2) # proposed lower line + abline(v = 2, col = "black", lty = 2) # proposed upper line + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) }) + } - ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) - quantile(as.numeric(new_df$DP), 0.95) - }) + ###OD + if(any(grepl("OD", colnames(new_df)))){ + + quantile(as.numeric(new_df$OD), 0.95) + #Histogram + output$od_hist <- renderPlot({ + hist(as.numeric(new_df$OD), + main = "Unfiltered SNP overdispersion parameter histogram", + xlab = "OD", + ylab = "SNPs", + col = "lightblue", + border = "black", + xlim = c(0,0.6), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 0.6, by = .01), labels = rep("", length(seq(0, 0.6, by = 0.01)))) # Add ticks + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + + # Add vertical lines + abline(v = mean(as.numeric(new_df$OD)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$OD)), col = "green", lty = 2) # Median line + abline(v = 0.05, col = "black", lty = 2) # proposed filter by updog + legend("topright", legend=c("mean", "median", "suggested threshold"), + col=c("red", "green","black"), lty=2, cex=0.8) - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) }) + } + + ##MAXPOSTPROB + + #Histogram + if(any(grepl("PMC", colnames(new_df)))){ + + output$maxpostprob_hist <- renderPlot({ + + #Histogram + 
hist(as.numeric(new_df$PMC), + main = "The estimated proportion of individuals misclassified in the SNP from updog", + xlab = "Proportion of Misclassified Genotypes per SNP", + ylab = "Number of SNPs", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + + # Add vertical lines + abline(v = mean(as.numeric(new_df$PMC)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(new_df$PMC)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(new_df$PMC), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) + + }) + } + + #Missing data + output$missing_snp_hist <- renderPlot({ + + #Histogram + hist(as.numeric(filtering_files$snp_miss_df), + main = "Ratio of Missing Data per SNP After Filtering", + xlab = "Proportion of Missing Data per SNP", + ylab = "Number of SNPs", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) + axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + + # Add vertical lines + abline(v = mean(as.numeric(filtering_files$snp_miss_df)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_files$snp_miss_df)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(filtering_files$snp_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) + }) + + output$missing_sample_hist <- renderPlot({ + + #Histogram + hist(as.numeric(filtering_files$sample_miss_df), + main = "Ratio of Missing Data per Sample After Filtering", + xlab = "Proportion of Missing Data per Sample", + ylab = "Number of Samples", + col = "lightblue", + border = "black", + xlim = c(0,1), + breaks = as.numeric(input$hist_bins)) 
+ axis(1, at = seq(0, 1, by = .1), labels = rep("", length(seq(0, 1, by = 0.1)))) # Add ticks + + # Add vertical lines + abline(v = mean(as.numeric(filtering_files$sample_miss_df)), col = "red", lty = 2) # Mean line + abline(v = median(as.numeric(filtering_files$sample_miss_df)), col = "green", lty = 2) # Median line + abline(v = quantile(as.numeric(filtering_files$sample_miss_df), 0.95), col = "blue", lty = 2) + legend("topright", legend=c("mean", "median", "quantile"), + col=c("red", "green","blue"), lty=2, cex=0.8) + }) + + ##Read Depth (I would prefer that this show the mean depth for SNPs or Samples instead of all loci/sample cells) + quantile(as.numeric(new_df$DP), 0.95) }) + + output$download_vcf <- downloadHandler( + filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) } ## To be copied in the UI diff --git a/R/mod_GS.R b/R/mod_GS.R index c5bc319..ee9c9fd 100644 --- a/R/mod_GS.R +++ b/R/mod_GS.R @@ -103,625 +103,623 @@ mod_GS_ui <- function(id){ #' @import tidyr #' @importFrom DT renderDT #' @noRd -mod_GS_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns - ###Genomic Prediction - #This tab involved 3 observeEvents - #1) to get the traits listed in the phenotype file - #2) to input and validate the input files - #3) to perform the genomic prediction - - - #1) Get traits - observeEvent(input$pred_trait_file, { - info_df2 <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) - trait_var2 <- colnames(info_df2) - trait_var2 <- trait_var2[2:length(trait_var2)] - #updateSelectInput(session, "pred_trait_info", choices = c("All", trait_var)) - updateVirtualSelect("pred_fixed_info2", choices = trait_var2, session = session) - updateVirtualSelect("pred_trait_info2", choices = trait_var2, session = session) - - #output$passport_table <- 
renderDT({info_df}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 4) - #) - }) +mod_GS_server <- function(input, output, session, parent_session){ + + ns <- session$ns + ###Genomic Prediction + #This tab involved 3 observeEvents + #1) to get the traits listed in the phenotype file + #2) to input and validate the input files + #3) to perform the genomic prediction + + + #1) Get traits + observeEvent(input$pred_trait_file, { + info_df2 <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) + trait_var2 <- colnames(info_df2) + trait_var2 <- trait_var2[2:length(trait_var2)] + #updateSelectInput(session, "pred_trait_info", choices = c("All", trait_var)) + updateVirtualSelect("pred_fixed_info2", choices = trait_var2, session = session) + updateVirtualSelect("pred_trait_info2", choices = trait_var2, session = session) + + #output$passport_table <- renderDT({info_df}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 4) + #) + }) - #2) Error check for prediction and save input files - continue_prediction2 <- reactiveVal(NULL) - pred_inputs2 <- reactiveValues( - pheno_input = NULL, - train_geno_input = NULL, - est_geno_input = NULL, - shared_snps = NULL, - pred_genos = NULL, - pred_geno_pheno = NULL - ) + #2) Error check for prediction and save input files + continue_prediction2 <- reactiveVal(NULL) + pred_inputs2 <- reactiveValues( + pheno_input = NULL, + train_geno_input = NULL, + est_geno_input = NULL, + shared_snps = NULL, + pred_genos = NULL, + pred_geno_pheno = NULL + ) - pred_outputs2 <- reactiveValues( - corr_output = NULL, - box_plot = NULL, - violin_plot = NULL, - comb_output = NULL, - avg_GEBVs = NULL, - all_GEBVs = NULL, - colors = NULL, - trait_output = NULL + pred_outputs2 <- reactiveValues( + corr_output = NULL, + box_plot = NULL, + violin_plot = NULL, + comb_output = NULL, + avg_GEBVs = NULL, + all_GEBVs = NULL, + colors = NULL, + trait_output = NULL + ) + + #Reactive boxes + output$shared_snps 
<- renderValueBox({ + valueBox( + value = pred_inputs2$shared_snps, + subtitle = "Common SNPs in Genotype files", + icon = icon("dna"), + color = "info" ) + }) + + observeEvent(input$prediction_est_start, { + #req(pred_inputs$pheno_input, pred_inputs$geno_input) - #Reactive boxes - output$shared_snps <- renderValueBox({ - valueBox( - value = pred_inputs2$shared_snps, - subtitle = "Common SNPs in Genotype files", - icon = icon("dna"), - color = "info" + toggleClass(id = "pred_est_ploidy", class = "borderred", condition = (is.na(input$pred_est_ploidy) | is.null(input$pred_est_ploidy))) + + if (is.null(input$pred_known_file$datapath) | is.null(input$pred_est_file$datapath) | is.null(input$pred_trait_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload VCF and phenotype files", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE ) - }) + } + req(input$pred_known_file$datapath, input$pred_est_file$datapath, input$pred_trait_file$datapath, input$pred_est_ploidy) - observeEvent(input$prediction_est_start, { - #req(pred_inputs$pheno_input, pred_inputs$geno_input) - - toggleClass(id = "pred_est_ploidy", class = "borderred", condition = (is.na(input$pred_est_ploidy) | is.null(input$pred_est_ploidy))) - - if (is.null(input$pred_known_file$datapath) | is.null(input$pred_est_file$datapath) | is.null(input$pred_trait_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF and phenotype files", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$pred_known_file$datapath, input$pred_est_file$datapath, input$pred_trait_file$datapath, input$pred_est_ploidy) 
+ #Status + updateProgressBar(session = session, id = "pb_gp", value = 5, title = "Checking input files") - #Status - updateProgressBar(session = session, id = "pb_gp", value = 5, title = "Checking input files") - - #Variables - ploidy <- as.numeric(input$pred_est_ploidy) - train_geno_path <- input$pred_known_file$datapath - est_geno_path <- input$pred_est_file$datapath - pheno2 <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE) - row.names(pheno2) <- pheno2[,1] - traits <- input$pred_trait_info2 - #CVs <- as.numeric(input$pred_cv) - #train_perc <- as.numeric(input$pred_folds) - - - #Make sure at least one trait was input - if (length(traits) == 0) { - - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "No traits were selected", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - - # Stop the observeEvent gracefully - return() + #Variables + ploidy <- as.numeric(input$pred_est_ploidy) + train_geno_path <- input$pred_known_file$datapath + est_geno_path <- input$pred_est_file$datapath + pheno2 <- read.csv(input$pred_trait_file$datapath, header = TRUE, check.names = FALSE) + row.names(pheno2) <- pheno2[,1] + traits <- input$pred_trait_info2 + #CVs <- as.numeric(input$pred_cv) + #train_perc <- as.numeric(input$pred_folds) - } + #Make sure at least one trait was input + if (length(traits) == 0) { - #Getting genotype matrix + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No traits were selected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) - #Geno 
file path - file_path <- train_geno_path - #Geno.file conversion if needed - if (grepl("\\.csv$", file_path)) { - train_geno <- read.csv(train_geno_path, header = TRUE, row.names = 1, check.names = FALSE) - est_geno <- read.csv(est_geno_path, header = TRUE, row.names = 1, check.names = FALSE) + # Stop the observeEvent gracefully + return() - #Save number of SNPs - #pred_inputs$pred_snps <- nrow(geno) + } - } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { - #Function to convert GT to dosage calls (add to BIGr) - convert_to_dosage <- function(gt) { - # Split the genotype string - alleles <- strsplit(gt, "[|/]") - # Sum the alleles, treating NA values appropriately - sapply(alleles, function(x) { - if (any(is.na(x))) { - return(NA) - } else { - return(sum(as.numeric(x), na.rm = TRUE)) - } - }) - } + #Getting genotype matrix - #Convert VCF file if submitted - train_vcf <- vcfR::read.vcfR(train_geno_path) - est_vcf <- vcfR::read.vcfR(est_geno_path) + #Geno file path + file_path <- train_geno_path - #Get number of SNPs - #pred_inputs$pred_snps <- nrow(vcf) + #Geno.file conversion if needed + if (grepl("\\.csv$", file_path)) { + train_geno <- read.csv(train_geno_path, header = TRUE, row.names = 1, check.names = FALSE) + est_geno <- read.csv(est_geno_path, header = TRUE, row.names = 1, check.names = FALSE) - #Extract GT - train_geno <- extract.gt(train_vcf, element = "GT") - train_geno <- apply(train_geno, 2, convert_to_dosage) - est_geno <- extract.gt(est_vcf, element = "GT") - est_geno <- apply(est_geno, 2, convert_to_dosage) - class(train_geno) <- "numeric" - class(est_geno) <- "numeric" - rm(train_vcf) - rm(est_vcf) + #Save number of SNPs + #pred_inputs$pred_snps <- nrow(geno) - } else { + } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "No valid genotype file detected", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = 
FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - #Stop the analysis - return() + #Function to convert GT to dosage calls (add to BIGr) + convert_to_dosage <- function(gt) { + # Split the genotype string + alleles <- strsplit(gt, "[|/]") + # Sum the alleles, treating NA values appropriately + sapply(alleles, function(x) { + if (any(is.na(x))) { + return(NA) + } else { + return(sum(as.numeric(x), na.rm = TRUE)) + } + }) } - #Save number of samples in file - #pred_inputs$pred_genos <- ncol(geno) - - #Check that the ploidy entered is correct - if (ploidy != max(train_geno, na.rm = TRUE)) { - # If condition is met, show notification toast - shinyalert( - title = "Ploidy Mismatch", - text = paste0("The maximum value in the genotype file (",max(train_geno, na.rm = TRUE),") does not equal the ploidy entered"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - - # Stop the observeEvent gracefully - #return() - } + #Convert VCF file if submitted + train_vcf <- vcfR::read.vcfR(train_geno_path) + est_vcf <- vcfR::read.vcfR(est_geno_path) + #Get number of SNPs + #pred_inputs$pred_snps <- nrow(vcf) - # Function to convert genotype matrix according to ploidy - convert_genotype <- function(genotype_matrix, ploidy) { - normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 - return(normalized_matrix) - } + #Extract GT + train_geno <- extract.gt(train_vcf, element = "GT") + train_geno <- apply(train_geno, 2, convert_to_dosage) + est_geno <- extract.gt(est_vcf, element = "GT") + est_geno <- apply(est_geno, 2, convert_to_dosage) + class(train_geno) <- "numeric" + 
class(est_geno) <- "numeric" + rm(train_vcf) + rm(est_vcf) - #tranforming genotypes - train_geno_adj_init <- convert_genotype(train_geno, as.numeric(ploidy)) - est_geno_adj_init <- convert_genotype(est_geno, as.numeric(ploidy)) - - #Make sure the trait file and genotype file are in the same order - # Column names for geno (assuming these are the individual IDs) - colnames_geno <- colnames(train_geno) - # Assuming the first column in Pheno contains the matching IDs - ids_pheno <- pheno2[, 1] - # Find common identifiers - common_ids <- intersect(colnames_geno, ids_pheno) - #Get number of id - pred_inputs2$pred_geno_pheno <- length(common_ids) - - #Throw an error if there are less matching samples in the phenotype file than the genotype file - if (length(common_ids) == 0) { - - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "All samples were missing from the phenotype file", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - - # Stop the observeEvent gracefully - return() - - } else if (length(common_ids) < length(colnames_geno)) { - # If condition is met, show notification toast - shinyalert( - title = "Data Mismatch", - text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - - # Stop the observeEvent gracefully - #return() - } + } else { + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No valid genotype file 
detected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + #Stop the analysis + return() + } + #Save number of samples in file + #pred_inputs$pred_genos <- ncol(geno) - #Final check before performing analyses + #Check that the ploidy entered is correct + if (ploidy != max(train_geno, na.rm = TRUE)) { + # If condition is met, show notification toast shinyalert( - title = "Ready?", - text = "Inputs have been checked", + title = "Ploidy Mismatch", + text = paste0("The maximum value in the genotype file (",max(train_geno, na.rm = TRUE),") does not equal the ploidy entered"), size = "xs", closeOnEsc = FALSE, closeOnClickOutside = FALSE, html = TRUE, - type = "info", + type = "warning", showConfirmButton = TRUE, - confirmButtonText = "Proceed", + confirmButtonText = "OK", confirmButtonCol = "#004192", - showCancelButton = TRUE, + showCancelButton = FALSE, #closeOnConfirm = TRUE, #closeOnCancel = TRUE, imageUrl = "", - animation = TRUE, - callbackR = function(value) { - if (isTRUE(value)) { - # Proceed with adjusted data - continue_prediction2(TRUE) - } else { - # Stop or change the process - continue_prediction2(FALSE) - } - } + animation = TRUE ) - # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs - train_geno_adj <- train_geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs - pheno2 <- pheno2[match(common_ids, ids_pheno), ] + # Stop the observeEvent gracefully + #return() + } - #Save to reactive values - pred_inputs2$pheno_input <- pheno2 - #pred_inputs$geno_adj_input <- geno_adj - #Match training and testing genotype file SNPs - common_markers <- intersect(rownames(train_geno_adj), rownames(est_geno_adj_init)) - train_geno_adj <- train_geno_adj[common_markers, ] - est_geno_adj_init <- 
est_geno_adj_init[common_markers, ] + # Function to convert genotype matrix according to ploidy + convert_genotype <- function(genotype_matrix, ploidy) { + normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 + return(normalized_matrix) + } - #Save to reactive values - pred_inputs2$shared_snps <- length(common_markers) - pred_inputs2$train_geno_input <- train_geno_adj - pred_inputs2$est_geno_input <- est_geno_adj_init + #tranforming genotypes + train_geno_adj_init <- convert_genotype(train_geno, as.numeric(ploidy)) + est_geno_adj_init <- convert_genotype(est_geno, as.numeric(ploidy)) - }) + #Make sure the trait file and genotype file are in the same order + # Column names for geno (assuming these are the individual IDs) + colnames_geno <- colnames(train_geno) + # Assuming the first column in Pheno contains the matching IDs + ids_pheno <- pheno2[, 1] + # Find common identifiers + common_ids <- intersect(colnames_geno, ids_pheno) + #Get number of id + pred_inputs2$pred_geno_pheno <- length(common_ids) + + #Throw an error if there are less matching samples in the phenotype file than the genotype file + if (length(common_ids) == 0) { + + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "All samples were missing from the phenotype file", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) - #3) Analysis only proceeds once continue_prediction is converted to TRUE - observe({ - req(continue_prediction2(),pred_inputs2$pheno_input, pred_inputs2$train_geno_input) + # Stop the observeEvent gracefully + return() - # Stop analysis if cancel was selected - if (isFALSE(continue_prediction2())) { - return() - } + } else if (length(common_ids) < length(colnames_geno)) { + # If condition is met, show notification toast + shinyalert( + title 
= "Data Mismatch", + text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE + ) - #Variables - ploidy <- as.numeric(input$pred_est_ploidy) - train_geno_adj <- pred_inputs2$train_geno_input - est_geno_adj <- pred_inputs2$est_geno_input - pheno <- pred_inputs2$pheno_input - traits <- input$pred_trait_info2 - #CVs <- as.numeric(input$pred_cv) - #train_perc <- as.numeric(input$pred_folds) - fixed_traits <- input$pred_fixed_info2 - cores <- input$pred_cores - - ##Need to add ability for the use of parallelism for the for cross-validation - ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays - - # Function to perform genomic prediction - ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) - genomic_prediction2 <- function(train_geno,est_geno, Pheno, traits, fixed_effects = NULL, cores = 1) { - - # Define variables - traits <- traits - #cycles <- as.numeric(Iters) - #Folds <- as.numeric(Fold) - total_population <- ncol(train_geno) - #train_size <- floor(percentage / 100 * total_population) - fixed_traits <- fixed_effects - cores <- as.numeric(cores) - - # Initialize a list to store GEBVs for all traits and cycles - GEBVs <- list() - - #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) - pb_value = 10 - - #Remove the fixed traits from the Pheno file - if (length(fixed_traits) == 0) { - Pheno <- Pheno + + # Stop the observeEvent gracefully + #return() + } + + + + + #Final check before performing analyses + shinyalert( + title = 
"Ready?", + text = "Inputs have been checked", + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "Proceed", + confirmButtonCol = "#004192", + showCancelButton = TRUE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE, + callbackR = function(value) { + if (isTRUE(value)) { + # Proceed with adjusted data + continue_prediction2(TRUE) } else { - #Subset fixed traits - Fixed <- subset(Pheno, select = fixed_traits) - - #Pheno <- subset(Pheno, select = -fixed_traits) - convert_all_to_factor_if_not_numeric <- function(df) { - for (col in names(df)) { - if (!is.numeric(df[[col]]) && !is.integer(df[[col]])) { - df[[col]] <- as.factor(df[[col]]) - } + # Stop or change the process + continue_prediction2(FALSE) + } + } + ) + + + # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs + train_geno_adj <- train_geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs + pheno2 <- pheno2[match(common_ids, ids_pheno), ] + + #Save to reactive values + pred_inputs2$pheno_input <- pheno2 + #pred_inputs$geno_adj_input <- geno_adj + + #Match training and testing genotype file SNPs + common_markers <- intersect(rownames(train_geno_adj), rownames(est_geno_adj_init)) + train_geno_adj <- train_geno_adj[common_markers, ] + est_geno_adj_init <- est_geno_adj_init[common_markers, ] + + #Save to reactive values + pred_inputs2$shared_snps <- length(common_markers) + pred_inputs2$train_geno_input <- train_geno_adj + pred_inputs2$est_geno_input <- est_geno_adj_init + + }) + + #3) Analysis only proceeds once continue_prediction is converted to TRUE + observe({ + + req(continue_prediction2(),pred_inputs2$pheno_input, pred_inputs2$train_geno_input) + + # Stop analysis if cancel was selected + if (isFALSE(continue_prediction2())) { + return() + } + + #Variables + ploidy <- 
as.numeric(input$pred_est_ploidy) + train_geno_adj <- pred_inputs2$train_geno_input + est_geno_adj <- pred_inputs2$est_geno_input + pheno <- pred_inputs2$pheno_input + traits <- input$pred_trait_info2 + #CVs <- as.numeric(input$pred_cv) + #train_perc <- as.numeric(input$pred_folds) + fixed_traits <- input$pred_fixed_info2 + cores <- input$pred_cores + + ##Need to add ability for the use of parallelism for the for cross-validation + ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays + + # Function to perform genomic prediction + ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) + genomic_prediction2 <- function(train_geno,est_geno, Pheno, traits, fixed_effects = NULL, cores = 1) { + + # Define variables + traits <- traits + #cycles <- as.numeric(Iters) + #Folds <- as.numeric(Fold) + total_population <- ncol(train_geno) + #train_size <- floor(percentage / 100 * total_population) + fixed_traits <- fixed_effects + cores <- as.numeric(cores) + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + + #Remove the fixed traits from the Pheno file + if (length(fixed_traits) == 0) { + Pheno <- Pheno + } else { + #Subset fixed traits + Fixed <- subset(Pheno, select = fixed_traits) + + #Pheno <- subset(Pheno, select = -fixed_traits) + convert_all_to_factor_if_not_numeric <- function(df) { + for (col in names(df)) { + if (!is.numeric(df[[col]]) && !is.integer(df[[col]])) { + df[[col]] <- as.factor(df[[col]]) } - return(df) } - # Convert all columns to factor if they are not numeric or integer - Fixed <- convert_all_to_factor_if_not_numeric(Fixed) + return(df) + } + # Convert all columns to factor if they are not numeric or integer + Fixed <- convert_all_to_factor_if_not_numeric(Fixed) - #Fixed <- 
as.data.frame(lapply(Fixed, as.factor)) #convert to factor - row.names(Fixed) <- row.names(Pheno) + #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor + row.names(Fixed) <- row.names(Pheno) - #Make the matrix - formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) - formula <- as.formula(formula_str) + #Make the matrix + formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) + formula <- as.formula(formula_str) - # Create the design matrix using the constructed formula - Fixed <- model.matrix(formula, data = Fixed) - } + # Create the design matrix using the constructed formula + Fixed <- model.matrix(formula, data = Fixed) + } - #Make kinship matrix of all individuals? - #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy - #If wanting to use Kkinship matrix, will then need to see how to implement it here - - #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). 
- impute = (A.mat(t(train_geno), max.missing=1,impute.method="mean",return.imputed=TRUE)) - train_geno <- impute$imputed - impute = (A.mat(t(est_geno), max.missing=1,impute.method="mean",return.imputed=TRUE)) - est_geno <- impute$imputed - - #Match training and testing genotype file SNPs - common_markers <- intersect(colnames(train_geno), colnames(est_geno)) - train_geno <- train_geno[ ,common_markers] - est_geno <- est_geno[ ,common_markers] - - #Calculate predicted traits and GEBVs - #fold_ids <- sample(rep(1:Folds, length.out = total_population)) - #fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold - #fold_results <- matrix(nrow = Folds, ncol = length(traits)) - #colnames(fold_results) <- traits - - #Status - updateProgressBar(session = session, id = "pb_gp", value = 50, title = "Estimating Predicted Values") - - train <- row.names(train_geno) - - #Subset datasets - #if (length(fixed_traits) == 0) { - # Fixed_train = NULL - #} else{ - # Fixed_train <- data.frame(Fixed[train, ]) - # Fixed_train <- as.matrix(Fixed_train) - # row.names(Fixed_train) <- train - #colnames(Fixed_train) <- colnames(Fixed) - Fixed_train = NULL - - #Fixed (testing) - # Fixed_test<- data.frame(Fixed[test, ]) - # Fixed_test <- as.matrix(Fixed_test) - # row.names(Fixed_test) <- test - #colnames(Fixed_test) <- colnames(Fixed) - - Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set - m_train <- train_geno - #Pheno_test <- Pheno[test, ] - #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? 
- m_valid <- est_geno - - print(dim(m_train)) - print(dim(m_valid)) - - # Initialize a matrix to store GEBVs for this fold - GEBVs_fold <- matrix(nrow = nrow(est_geno), ncol = length(traits)) - colnames(GEBVs_fold) <- c(traits) - rownames(GEBVs_fold) <- row.names(est_geno) - - Pred_results <- matrix(nrow = nrow(est_geno), ncol = length(traits)) - colnames(Pred_results) <- c(traits) - rownames(Pred_results) <- row.names(est_geno) - - #Evaluate each trait using the same train and testing samples for each - for (trait_idx in 1:length(traits)) { - trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest - trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) - TRT <- trait_answer$u - e <- as.matrix(TRT) - pred_trait_test <- m_valid %*% e - pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits - Pred_results[, trait_idx] <- pred_trait #save to dataframe - - # Extract GEBVs - # Check if Fixed_train is not NULL and include beta if it is - if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { - # Calculate GEBVs including fixed effects - #GEBVs_fold[, trait_idx] <- m_train %*% trait_answer$u + Fixed_train %*% matrix(trait_answer$beta, nrow = length(trait_answer$beta), ncol = 1) - #GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% matrix(trait_answer$beta, nrow = length(trait_answer$beta), ncol = 1) - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta - } else { - # Calculate GEBVs without fixed effects - #GEBVs_fold[, trait_idx] <- m_train %*% trait_answer$u - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accuract to calculate the GEBVs for testing group from the trained model - } + #Make kinship matrix of all individuals? 
+ #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy + #If wanting to use Kkinship matrix, will then need to see how to implement it here + + #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). + impute = (A.mat(t(train_geno), max.missing=1,impute.method="mean",return.imputed=TRUE)) + train_geno <- impute$imputed + impute = (A.mat(t(est_geno), max.missing=1,impute.method="mean",return.imputed=TRUE)) + est_geno <- impute$imputed + + #Match training and testing genotype file SNPs + common_markers <- intersect(colnames(train_geno), colnames(est_geno)) + train_geno <- train_geno[ ,common_markers] + est_geno <- est_geno[ ,common_markers] - # Calculate heritability for the current trait - #Vu <- trait_answer$Vu - #Ve <- trait_answer$Ve - #heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + #Calculate predicted traits and GEBVs + #fold_ids <- sample(rep(1:Folds, length.out = total_population)) + #fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold + #fold_results <- matrix(nrow = Folds, ncol = length(traits)) + #colnames(fold_results) <- traits + #Status + updateProgressBar(session = session, id = "pb_gp", value = 50, title = "Estimating Predicted Values") + + train <- row.names(train_geno) + + #Subset datasets + #if (length(fixed_traits) == 0) { + # Fixed_train = NULL + #} else{ + # Fixed_train <- data.frame(Fixed[train, ]) + # Fixed_train <- as.matrix(Fixed_train) + # row.names(Fixed_train) <- train + #colnames(Fixed_train) <- colnames(Fixed) + Fixed_train = NULL + + #Fixed (testing) + # Fixed_test<- data.frame(Fixed[test, ]) + # Fixed_test <- as.matrix(Fixed_test) + # row.names(Fixed_test) <- test + #colnames(Fixed_test) <- colnames(Fixed) + + Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples 
from the training set + m_train <- train_geno + #Pheno_test <- Pheno[test, ] + #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? + m_valid <- est_geno + + print(dim(m_train)) + print(dim(m_valid)) + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = nrow(est_geno), ncol = length(traits)) + colnames(GEBVs_fold) <- c(traits) + rownames(GEBVs_fold) <- row.names(est_geno) + + Pred_results <- matrix(nrow = nrow(est_geno), ncol = length(traits)) + colnames(Pred_results) <- c(traits) + rownames(Pred_results) <- row.names(est_geno) + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest + trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) + TRT <- trait_answer$u + e <- as.matrix(TRT) + pred_trait_test <- m_valid %*% e + pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits + Pred_results[, trait_idx] <- pred_trait #save to dataframe + + # Extract GEBVs + # Check if Fixed_train is not NULL and include beta if it is + if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { + # Calculate GEBVs including fixed effects + #GEBVs_fold[, trait_idx] <- m_train %*% trait_answer$u + Fixed_train %*% matrix(trait_answer$beta, nrow = length(trait_answer$beta), ncol = 1) + #GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% matrix(trait_answer$beta, nrow = length(trait_answer$beta), ncol = 1) + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta + } else { + # Calculate GEBVs without fixed effects + #GEBVs_fold[, trait_idx] <- m_train %*% trait_answer$u + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accuract to calculate the GEBVs for testing group from the trained model } - #Add iter and fold information for 
each trait/result - #heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r - #heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - #Add sample, iteration, and fold information to GEBVs_fold - #GEBVs_fold[,"Iter"] = r - #GEBVs_fold[,"Fold"] = fold - #GEBVs_fold[,"Sample"] <- row.names(est_geno) + # Calculate heritability for the current trait + #Vu <- trait_answer$Vu + #Ve <- trait_answer$Ve + #heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) - # Store GEBVs for this fold - GEBVs_df <- data.frame(GEBVs_fold) + } + #Add iter and fold information for each trait/result + #heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + #heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - Pred_results <- data.frame(Pred_results) + #Add sample, iteration, and fold information to GEBVs_fold + #GEBVs_fold[,"Iter"] = r + #GEBVs_fold[,"Fold"] = fold + #GEBVs_fold[,"Sample"] <- row.names(est_geno) + # Store GEBVs for this fold + GEBVs_df <- data.frame(GEBVs_fold) - # Store GEBVs for this cycle - #GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + Pred_results <- data.frame(Pred_results) - # Combine all GEBVs into a single DataFrame - #GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + # Store GEBVs for this cycle + #GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) - #results <- as.data.frame(results) - #heritability_scores <- as.data.frame(heritability_scores) - # Combine results and heritability_scores using cbind - #combined_results <- cbind(results, heritability_scores) + # Combine all GEBVs into a single DataFrame + #GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) - return(list(GEBVs = GEBVs_df, Predictions = Pred_results)) - } + #results <- as.data.frame(results) + #heritability_scores <- as.data.frame(heritability_scores) - # Example call to the function - #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... 
- results <- genomic_prediction2(train_geno_adj, est_geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, cores = cores) + # Combine results and heritability_scores using cbind + #combined_results <- cbind(results, heritability_scores) - #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) - #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") + return(list(GEBVs = GEBVs_df, Predictions = Pred_results)) + } - #Save to reactive value - pred_outputs2$trait_output <- results$Predictions - pred_outputs2$all_GEBVs <- results$GEBVs - #TESTING!!! - #write.csv(results$GEBVs, "GEBVs_test.csv") + # Example call to the function + #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... + results <- genomic_prediction2(train_geno_adj, est_geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, cores = cores) - # Convert trait columns to numeric - results$GEBVs <- results$GEBVs %>% - mutate(across(all_of(traits), ~ as.numeric(.x))) + #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) + #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") + #Save to reactive value + pred_outputs2$trait_output <- results$Predictions + pred_outputs2$all_GEBVs <- results$GEBVs + #TESTING!!! 
+ #write.csv(results$GEBVs, "GEBVs_test.csv") - #Get average accuracy and h2 for each iter accross the 5 folds + # Convert trait columns to numeric + results$GEBVs <- results$GEBVs %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) - #columns <- setdiff(colnames(results$CombinedResults), c("Iter","Fold")) - #average_accuracy_df <- results$CombinedResults %>% - # group_by(Iter) %>% - # summarize(across(all_of(columns), mean, na.rm = TRUE)) + #Get average accuracy and h2 for each iter accross the 5 folds - #Status - updateProgressBar(session = session, id = "pb_gp", value = 90, title = "Generating Results") + #columns <- setdiff(colnames(results$CombinedResults), c("Iter","Fold")) + #average_accuracy_df <- results$CombinedResults %>% + # group_by(Iter) %>% + # summarize(across(all_of(columns), mean, na.rm = TRUE)) - ##Figures and Tables - #Status - updateProgressBar(session = session, id = "pb_gp", value = 100, title = "Finished!") + #Status + updateProgressBar(session = session, id = "pb_gp", value = 90, title = "Generating Results") - #End the event - continue_prediction2(NULL) - }) + ##Figures and Tables - #Output the prediction tables - all_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs2$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs2$all_GEBVs - }) + #Status + updateProgressBar(session = session, id = "pb_gp", value = 100, title = "Finished!") - #GEBVs from all iterations/folds - output$pred_gebvs_table2 <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + #End the event + continue_prediction2(NULL) + }) - trait_output <- reactive({ - validate( - need(!is.null(pred_outputs2$trait_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs2$trait_output - }) + #Output the prediction tables + all_GEBVs <- reactive({ + validate( + 
need(!is.null(pred_outputs2$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs2$all_GEBVs + }) - #GEBVs from all iterations/folds - output$pred_trait_table <- renderDT({trait_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - output$download_vcft <- downloadHandler( - filename = function() { - paste0("BIGapp_Training_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("test-dose.vcf.gz", package = "BIGapp") - file.copy(ex, file) - }) - - output$download_vcfp <- downloadHandler( - filename = function() { - paste0("BIGapp_Predict_VCF_Example_file.vcf") - }, - content = function(file) { - ex <- system.file("test-dose-use-for-prediction.vcf", package = "BIGapp") - file.copy(ex, file) - }) - - output$download_pheno <- downloadHandler( - filename = function() { - paste0("BIGapp_passport_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_passport_file.csv", package = "BIGapp") - file.copy(ex, file) - }) + #GEBVs from all iterations/folds + output$pred_gebvs_table2 <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + trait_output <- reactive({ + validate( + need(!is.null(pred_outputs2$trait_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs2$trait_output }) + + #GEBVs from all iterations/folds + output$pred_trait_table <- renderDT({trait_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + output$download_vcft <- downloadHandler( + filename = function() { + paste0("BIGapp_Training_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("test-dose.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) + + output$download_vcfp <- downloadHandler( + filename = function() { + paste0("BIGapp_Predict_VCF_Example_file.vcf") + }, + 
content = function(file) { + ex <- system.file("test-dose-use-for-prediction.vcf", package = "BIGapp") + file.copy(ex, file) + }) + + output$download_pheno <- downloadHandler( + filename = function() { + paste0("BIGapp_passport_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_passport_file.csv", package = "BIGapp") + file.copy(ex, file) + }) } ## To be copied in the UI diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 18f544c..57fe26e 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -114,782 +114,781 @@ mod_GSAcc_ui <- function(id){ #' @import ggplot2 #' @import tidyr #' @noRd -mod_GSAcc_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns - ####Genomic Prediction Accuracy - #This tab involved 3 observeEvents - #1) to get the traits listed in the phenotype file - #2) to input and validate the input files - #3) to perform the genomic prediction - - - #1) Get traits - observeEvent(input$trait_file, { - info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) - trait_var <- colnames(info_df) - trait_var <- trait_var[2:length(trait_var)] - updateVirtualSelect("pred_fixed_info", choices = trait_var, session = session) - updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) +mod_GSAcc_server <- function(input, output, session, parent_session){ - }) + ns <- session$ns + ####Genomic Prediction Accuracy + #This tab involved 3 observeEvents + #1) to get the traits listed in the phenotype file + #2) to input and validate the input files + #3) to perform the genomic prediction + + + #1) Get traits + observeEvent(input$trait_file, { + info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) + trait_var <- colnames(info_df) + trait_var <- trait_var[2:length(trait_var)] + updateVirtualSelect("pred_fixed_info", choices = trait_var, session = session) + updateVirtualSelect("pred_trait_info", choices = trait_var, 
session = session) + + }) - #2) Error check for prediction and save input files - continue_prediction <- reactiveVal(NULL) - pred_inputs <- reactiveValues( - pheno_input = NULL, - geno_input = NULL, - pred_snps = NULL, - pred_genos = NULL, - pred_geno_pheno = NULL + #2) Error check for prediction and save input files + continue_prediction <- reactiveVal(NULL) + pred_inputs <- reactiveValues( + pheno_input = NULL, + geno_input = NULL, + pred_snps = NULL, + pred_genos = NULL, + pred_geno_pheno = NULL + ) + + pred_outputs <- reactiveValues( + corr_output = NULL, + box_plot = NULL, + violin_plot = NULL, + comb_output = NULL, + avg_GEBVs = NULL, + all_GEBVs = NULL, + colors = NULL + ) + + #Reactive boxes + output$pred_snps <- renderValueBox({ + valueBox( + value = pred_inputs$pred_snps, + subtitle = "SNPs in Genotype File", + icon = icon("dna"), + color = "info" ) + }) - pred_outputs <- reactiveValues( - corr_output = NULL, - box_plot = NULL, - violin_plot = NULL, - comb_output = NULL, - avg_GEBVs = NULL, - all_GEBVs = NULL, - colors = NULL + output$pred_geno <- renderValueBox({ + valueBox( + value = pred_inputs$pred_geno_pheno, + subtitle = "Samples with Phenotype Information", + icon = icon("location-dot"), + color = "info" ) + }) - #Reactive boxes - output$pred_snps <- renderValueBox({ - valueBox( - value = pred_inputs$pred_snps, - subtitle = "SNPs in Genotype File", - icon = icon("dna"), - color = "info" - ) - }) + observe({ + # Update colors based on input + pred_outputs$colors <- switch(input$pred_color_select, + "red" = "#F8766D", + "blue" = "#00BFC4", + "green" = "#00BA38", + input$pred_color_select) + }) - output$pred_geno <- renderValueBox({ - valueBox( - value = pred_inputs$pred_geno_pheno, - subtitle = "Samples with Phenotype Information", - icon = icon("location-dot"), - color = "info" - ) - }) + observeEvent(input$prediction_start, { - observe({ - # Update colors based on input - pred_outputs$colors <- switch(input$pred_color_select, - "red" = "#F8766D", - 
"blue" = "#00BFC4", - "green" = "#00BA38", - input$pred_color_select) - }) + toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) - observeEvent(input$prediction_start, { - - toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) - - if (is.null(input$pred_file$datapath) | is.null(input$trait_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF and phenotype files", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$pred_file$datapath, input$pred_ploidy, input$trait_file$datapath) - - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 5, title = "Checking input files") - - #Variables - ploidy <- as.numeric(input$pred_ploidy) - geno_path <- input$pred_file$datapath - pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) - row.names(pheno) <- pheno[,1] - traits <- input$pred_trait_info - CVs <- as.numeric(input$pred_cv) - - #Make sure at least one trait was input - if (length(traits) == 0) { - - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "No traits were selected", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - - # Stop the observeEvent gracefully - return() + if (is.null(input$pred_file$datapath) | is.null(input$trait_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload VCF and phenotype files", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + 
type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + req(input$pred_file$datapath, input$pred_ploidy, input$trait_file$datapath) - } + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 5, title = "Checking input files") + #Variables + ploidy <- as.numeric(input$pred_ploidy) + geno_path <- input$pred_file$datapath + pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) + row.names(pheno) <- pheno[,1] + traits <- input$pred_trait_info + CVs <- as.numeric(input$pred_cv) - #Getting genotype matrix + #Make sure at least one trait was input + if (length(traits) == 0) { - #Geno file path - file_path <- geno_path + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No traits were selected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) - #Geno.file conversion if needed - if (grepl("\\.csv$", file_path)) { - geno <- read.csv(geno_path, header = TRUE, row.names = 1, check.names = FALSE) - #Save number of SNPs - pred_inputs$pred_snps <- nrow(geno) + # Stop the observeEvent gracefully + return() - } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { + } - #Function to convert GT to dosage calls (add to BIGr) - convert_to_dosage <- function(gt) { - # Split the genotype string - alleles <- strsplit(gt, "[|/]") - # Sum the alleles, treating NA values appropriately - sapply(alleles, function(x) { - if (any(is.na(x))) { - return(NA) - } else { - return(sum(as.numeric(x), na.rm = TRUE)) - } - }) - } - #Convert VCF file if submitted - vcf <- vcfR::read.vcfR(file_path) + #Getting genotype matrix - #Get number of SNPs - pred_inputs$pred_snps <- nrow(vcf) + #Geno 
file path + file_path <- geno_path - #Extract GT - geno <- extract.gt(vcf, element = "GT") - geno <- apply(geno, 2, convert_to_dosage) - class(geno) <- "numeric" - rm(vcf) + #Geno.file conversion if needed + if (grepl("\\.csv$", file_path)) { + geno <- read.csv(geno_path, header = TRUE, row.names = 1, check.names = FALSE) - } else { + #Save number of SNPs + pred_inputs$pred_snps <- nrow(geno) - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "No valid genotype file detected", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - #Stop the analysis - return() - } + } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { - #Save number of samples in file - pred_inputs$pred_genos <- ncol(geno) - - #Check that the ploidy entered is correct - if (ploidy != max(geno, na.rm = TRUE)) { - # If condition is met, show notification toast - shinyalert( - title = "Ploidy Mismatch", - text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - - # Stop the observeEvent gracefully - #return() + #Function to convert GT to dosage calls (add to BIGr) + convert_to_dosage <- function(gt) { + # Split the genotype string + alleles <- strsplit(gt, "[|/]") + # Sum the alleles, treating NA values appropriately + sapply(alleles, function(x) { + if (any(is.na(x))) { + return(NA) + } else { + return(sum(as.numeric(x), na.rm = TRUE)) + } + }) } + #Convert VCF file if 
submitted + vcf <- vcfR::read.vcfR(file_path) - # Function to convert genotype matrix according to ploidy - convert_genotype <- function(genotype_matrix, ploidy) { - normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 - return(normalized_matrix) - } + #Get number of SNPs + pred_inputs$pred_snps <- nrow(vcf) - #tranforming genotypes - geno_adj_init <- convert_genotype(geno, as.numeric(ploidy)) - - #Make sure the trait file and genotype file are in the same order - # Column names for geno (assuming these are the individual IDs) - colnames_geno <- colnames(geno) - # Assuming the first column in Pheno contains the matching IDs - ids_pheno <- pheno[, 1] - # Find common identifiers - common_ids <- intersect(colnames_geno, ids_pheno) - #Get number of id - pred_inputs$pred_geno_pheno <- length(common_ids) - - #Throw an error if there are less matching samples in the phenotype file than the genotype file - if (length(common_ids) == 0) { - - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "All samples were missing from the phenotype file", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - - # Stop the observeEvent gracefully - return() - - } else if (length(common_ids) < length(colnames_geno)) { - # If condition is met, show notification toast - shinyalert( - title = "Data Mismatch", - text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - - # Stop the 
observeEvent gracefully - #return() - } + #Extract GT + geno <- extract.gt(vcf, element = "GT") + geno <- apply(geno, 2, convert_to_dosage) + class(geno) <- "numeric" + rm(vcf) + } else { + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No valid genotype file detected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + + #Stop the analysis + return() + } + #Save number of samples in file + pred_inputs$pred_genos <- ncol(geno) - #Final check before performing analyses + #Check that the ploidy entered is correct + if (ploidy != max(geno, na.rm = TRUE)) { + # If condition is met, show notification toast shinyalert( - title = "Ready?", - text = "Inputs have been checked", + title = "Ploidy Mismatch", + text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), size = "xs", closeOnEsc = FALSE, closeOnClickOutside = FALSE, html = TRUE, - type = "info", + type = "warning", showConfirmButton = TRUE, - confirmButtonText = "Proceed", + confirmButtonText = "OK", confirmButtonCol = "#004192", - showCancelButton = TRUE, + showCancelButton = FALSE, #closeOnConfirm = TRUE, #closeOnCancel = TRUE, imageUrl = "", - animation = TRUE, - callbackR = function(value) { - if (isTRUE(value)) { - # Proceed with adjusted data - continue_prediction(TRUE) - } else { - # Stop or change the process - continue_prediction(FALSE) - } - } + animation = TRUE ) - # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs - geno_adj <- geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs - pheno <- pheno[match(common_ids, ids_pheno), ] + # Stop the observeEvent gracefully + #return() + } - #Save to reactive values - 
pred_inputs$pheno_input <- pheno - #pred_inputs$geno_adj_input <- geno_adj - pred_inputs$geno_input <- geno_adj - }) + # Function to convert genotype matrix according to ploidy + convert_genotype <- function(genotype_matrix, ploidy) { + normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 + return(normalized_matrix) + } - #3) Analysis only proceeds once continue_prediction is converted to TRUE - observe({ + #tranforming genotypes + geno_adj_init <- convert_genotype(geno, as.numeric(ploidy)) - req(continue_prediction(),pred_inputs$pheno_input, pred_inputs$geno_input) + #Make sure the trait file and genotype file are in the same order + # Column names for geno (assuming these are the individual IDs) + colnames_geno <- colnames(geno) + # Assuming the first column in Pheno contains the matching IDs + ids_pheno <- pheno[, 1] + # Find common identifiers + common_ids <- intersect(colnames_geno, ids_pheno) + #Get number of id + pred_inputs$pred_geno_pheno <- length(common_ids) - # Stop analysis if cancel was selected - if (isFALSE(continue_prediction())) { - return() - } + #Throw an error if there are less matching samples in the phenotype file than the genotype file + if (length(common_ids) == 0) { + + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "All samples were missing from the phenotype file", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) - #Variables - ploidy <- as.numeric(input$pred_ploidy) - geno_adj <- pred_inputs$geno_input - pheno <- pred_inputs$pheno_input - traits <- input$pred_trait_info - CVs <- as.numeric(input$pred_cv) - fixed_traits <- input$pred_fixed_info - cores <- input$pred_cores - - #Assign colors - if (input$pred_color_select == "red"){ - pred_outputs$colors <- "#F8766D" - } else if 
(input$pred_color_select == "blue") { - pred_outputs$colors <- "#00BFC4" - } else if (input$pred_color_select == "green") { - pred_outputs$colors <- "#00BA38" - } else{ - pred_outputs$colors <- input$pred_color_select - } - ##Need to add ability for the use of parallelism for the for cross-validation - ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays + # Stop the observeEvent gracefully + return() + + } else if (length(common_ids) < length(colnames_geno)) { + # If condition is met, show notification toast + shinyalert( + title = "Data Mismatch", + text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE + ) - # Function to perform genomic prediction - ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) - genomic_prediction <- function(geno, Pheno, traits, fixed_effects = NULL, Fold = 5, Iters = 5, cores = 1) { - # Define variables - traits <- traits - cycles <- as.numeric(Iters) - Folds <- as.numeric(Fold) - total_population <- ncol(geno) - #train_size <- floor(percentage / 100 * total_population) - fixed_traits <- fixed_effects - cores <- as.numeric(cores) + # Stop the observeEvent gracefully + #return() + } + + + + + #Final check before performing analyses + shinyalert( + title = "Ready?", + text = "Inputs have been checked", + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "Proceed", + confirmButtonCol = "#004192", + showCancelButton = TRUE, + #closeOnConfirm = TRUE, 
+ #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE, + callbackR = function(value) { + if (isTRUE(value)) { + # Proceed with adjusted data + continue_prediction(TRUE) + } else { + # Stop or change the process + continue_prediction(FALSE) + } + } + ) - # Establish accuracy results matrix - results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) - colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits - # Initialize a list to store GEBVs for all traits and cycles - GEBVs <- list() + # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs + geno_adj <- geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs + pheno <- pheno[match(common_ids, ids_pheno), ] - #Establish heritability_scores_df () Maybe get h2 values - # Establish results matrix - heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) - colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + #Save to reactive values + pred_inputs$pheno_input <- pheno + #pred_inputs$geno_adj_input <- geno_adj + pred_inputs$geno_input <- geno_adj - #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) - pb_value = 10 + }) - #Remove the fixed traits from the Pheno file - if (length(fixed_traits) == 0) { - Pheno <- Pheno - } else { - #Subset fixed traits - Fixed <- subset(Pheno, select = fixed_traits) - - #Pheno <- subset(Pheno, select = -fixed_traits) - convert_all_to_factor_if_not_numeric <- function(df) { - for (col in names(df)) { - if (!is.numeric(df[[col]]) && !is.integer(df[[col]])) { - df[[col]] <- as.factor(df[[col]]) - } + #3) Analysis only proceeds once continue_prediction is converted to TRUE + observe({ + + req(continue_prediction(),pred_inputs$pheno_input, pred_inputs$geno_input) + + # Stop analysis if cancel was selected + if (isFALSE(continue_prediction())) { + 
return() + } + + #Variables + ploidy <- as.numeric(input$pred_ploidy) + geno_adj <- pred_inputs$geno_input + pheno <- pred_inputs$pheno_input + traits <- input$pred_trait_info + CVs <- as.numeric(input$pred_cv) + fixed_traits <- input$pred_fixed_info + cores <- input$pred_cores + + #Assign colors + if (input$pred_color_select == "red"){ + pred_outputs$colors <- "#F8766D" + } else if (input$pred_color_select == "blue") { + pred_outputs$colors <- "#00BFC4" + } else if (input$pred_color_select == "green") { + pred_outputs$colors <- "#00BA38" + } else{ + pred_outputs$colors <- input$pred_color_select + } + + ##Need to add ability for the use of parallelism for the for cross-validation + ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays + + # Function to perform genomic prediction + ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) + genomic_prediction <- function(geno, Pheno, traits, fixed_effects = NULL, Fold = 5, Iters = 5, cores = 1) { + + # Define variables + traits <- traits + cycles <- as.numeric(Iters) + Folds <- as.numeric(Fold) + total_population <- ncol(geno) + #train_size <- floor(percentage / 100 * total_population) + fixed_traits <- fixed_effects + cores <- as.numeric(cores) + + # Establish accuracy results matrix + results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Establish heritability_scores_df () Maybe get h2 values + # Establish results matrix + heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + + #Cross validation number for progress bar (not involved in the calculations, 
just shiny visuals) + pb_value = 10 + + #Remove the fixed traits from the Pheno file + if (length(fixed_traits) == 0) { + Pheno <- Pheno + } else { + #Subset fixed traits + Fixed <- subset(Pheno, select = fixed_traits) + + #Pheno <- subset(Pheno, select = -fixed_traits) + convert_all_to_factor_if_not_numeric <- function(df) { + for (col in names(df)) { + if (!is.numeric(df[[col]]) && !is.integer(df[[col]])) { + df[[col]] <- as.factor(df[[col]]) } - return(df) } - # Convert all columns to factor if they are not numeric or integer - Fixed <- convert_all_to_factor_if_not_numeric(Fixed) - - #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor - row.names(Fixed) <- row.names(Pheno) + return(df) + } + # Convert all columns to factor if they are not numeric or integer + Fixed <- convert_all_to_factor_if_not_numeric(Fixed) - #Make the matrix - formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) - formula <- as.formula(formula_str) + #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor + row.names(Fixed) <- row.names(Pheno) - # Create the design matrix using the constructed formula - Fixed <- model.matrix(formula, data = Fixed) - } + #Make the matrix + formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) + formula <- as.formula(formula_str) - #Make kinship matrix of all individuals? - #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy - #If wanting to use Kkinship matrix, will then need to see how to implement it here + # Create the design matrix using the constructed formula + Fixed <- model.matrix(formula, data = Fixed) + } - #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). - impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) - geno <- impute$imputed + #Make kinship matrix of all individuals? 
+ #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy + #If wanting to use Kkinship matrix, will then need to see how to implement it here - # For loop - for (r in 1:cycles) { - set.seed(r) - fold_ids <- sample(rep(1:Folds, length.out = total_population)) - fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold - fold_results <- matrix(nrow = Folds, ncol = length(traits)) - colnames(fold_results) <- traits + #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). + impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) + geno <- impute$imputed - #Initialize GEBV object for each cycle - GEBVs_cycle <-list() + # For loop + for (r in 1:cycles) { + set.seed(r) + fold_ids <- sample(rep(1:Folds, length.out = total_population)) + fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_results <- matrix(nrow = Folds, ncol = length(traits)) + colnames(fold_results) <- traits - #Status - updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + #Initialize GEBV object for each cycle + GEBVs_cycle <-list() - for (fold in 1:Folds) { + #Status + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) - #Status bar length - pb_value = pb_value + (70 / as.numeric(cycles*Folds)) + for (fold in 1:Folds) { - train <- fold_df %>% - dplyr::filter(FoldID != fold) %>% - pull(Sample) - test <- setdiff(row.names(geno),train) + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*Folds)) - #Subset datasets - if (length(fixed_traits) == 0) { - Fixed_train = NULL - } else{ - Fixed_train <- 
data.frame(Fixed[train, ]) - Fixed_train <- as.matrix(Fixed_train) - row.names(Fixed_train) <- train + train <- fold_df %>% + dplyr::filter(FoldID != fold) %>% + pull(Sample) + test <- setdiff(row.names(geno),train) - #Fixed (testing) - Fixed_test<- data.frame(Fixed[test, ]) - Fixed_test <- as.matrix(Fixed_test) - row.names(Fixed_test) <- test + #Subset datasets + if (length(fixed_traits) == 0) { + Fixed_train = NULL + } else{ + Fixed_train <- data.frame(Fixed[train, ]) + Fixed_train <- as.matrix(Fixed_train) + row.names(Fixed_train) <- train - } + #Fixed (testing) + Fixed_test<- data.frame(Fixed[test, ]) + Fixed_test <- as.matrix(Fixed_test) + row.names(Fixed_test) <- test - Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set - m_train <- geno[train, ] - Pheno_test <- Pheno[test, ] - #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? - m_valid <- geno[test, ] - - # Initialize a matrix to store GEBVs for this fold - GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) - colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") - rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") - - #Evaluate each trait using the same train and testing samples for each - for (trait_idx in 1:length(traits)) { - trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest - trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) - TRT <- trait_answer$u - e <- as.matrix(TRT) - pred_trait_test <- m_valid %*% e - pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits - trait_test <- Pheno_test[, traits[trait_idx]] - results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") - results[(((r-1)*5)+fold), (length(traits)+1)] <- r - results[(((r-1)*5)+fold), (length(traits)+2)] <- fold - - # Extract GEBVs - # Check if Fixed_train is 
not NULL and include beta if it is - if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { - # Calculate GEBVs including fixed effects - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta - } else { - # Calculate GEBVs without fixed effects - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to calculate the GEBVs for testing group from the trained model - } - - # Calculate heritability for the current trait - Vu <- trait_answer$Vu - Ve <- trait_answer$Ve - heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + } + Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set + m_train <- geno[train, ] + Pheno_test <- Pheno[test, ] + #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? + m_valid <- geno[test, ] + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) + colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") + rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest + trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) + TRT <- trait_answer$u + e <- as.matrix(TRT) + pred_trait_test <- m_valid %*% e + pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits + trait_test <- Pheno_test[, traits[trait_idx]] + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") + results[(((r-1)*5)+fold), (length(traits)+1)] <- r + results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + # Extract GEBVs + # Check if Fixed_train is not NULL and include beta if it is + if (!is.null(Fixed_train) && 
!is.null(trait_answer$beta)) { + # Calculate GEBVs including fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta + } else { + # Calculate GEBVs without fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to calculate the GEBVs for testing group from the trained model } - #Add iter and fold information for each trait/result - heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r - heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - #Add sample, iteration, and fold information to GEBVs_fold - GEBVs_fold[,"Iter"] = r - GEBVs_fold[,"Fold"] = fold - GEBVs_fold[,"Sample"] <- test - - # Store GEBVs for this fold - GEBVs_cycle[[fold]] <- GEBVs_fold + # Calculate heritability for the current trait + Vu <- trait_answer$Vu + Ve <- trait_answer$Ve + heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) } + #Add iter and fold information for each trait/result + heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + #Add sample, iteration, and fold information to GEBVs_fold + GEBVs_fold[,"Iter"] = r + GEBVs_fold[,"Fold"] = fold + GEBVs_fold[,"Sample"] <- test - # Store GEBVs for this cycle - GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + # Store GEBVs for this fold + GEBVs_cycle[[fold]] <- GEBVs_fold } - # Combine all GEBVs into a single DataFrame - GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + # Store GEBVs for this cycle + GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) - results <- as.data.frame(results) - heritability_scores <- as.data.frame(heritability_scores) + } - # Combine results and heritability_scores using cbind - combined_results <- cbind(results, heritability_scores) + # Combine all GEBVs into a single DataFrame + GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) - return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) 
- } + results <- as.data.frame(results) + heritability_scores <- as.data.frame(heritability_scores) - # Example call to the function - #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... - results <- genomic_prediction(geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, Iters = input$pred_cv, cores = cores) + # Combine results and heritability_scores using cbind + combined_results <- cbind(results, heritability_scores) - #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) - #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") + return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) + } - #Save to reactive value - pred_outputs$corr_output <- results$PredictionAccuracy - pred_outputs$all_GEBVs <- results$GEBVs + # Example call to the function + #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... + results <- genomic_prediction(geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, Iters = input$pred_cv, cores = cores) - # Convert trait columns to numeric - results$GEBVs <- results$GEBVs %>% - mutate(across(all_of(traits), ~ as.numeric(.x))) + #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) 
+ #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") - # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold - average_gebvs_df <- results$GEBVs %>% - group_by(Sample) %>% - summarize(across(all_of(traits), mean, na.rm = TRUE)) + #Save to reactive value + pred_outputs$corr_output <- results$PredictionAccuracy + pred_outputs$all_GEBVs <- results$GEBVs - pred_outputs$avg_GEBVs <- average_gebvs_df + # Convert trait columns to numeric + results$GEBVs <- results$GEBVs %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) - #Get average accuracy and h2 for each iter accross the 5 folds + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- results$GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) - #columns <- setdiff(colnames(results$CombinedResults), c("Iter","Fold")) - #average_accuracy_df <- results$CombinedResults %>% - # group_by(Iter) %>% - # summarize(across(all_of(columns), mean, na.rm = TRUE)) + pred_outputs$avg_GEBVs <- average_gebvs_df - columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) - average_accuracy_df <- results$PredictionAccuracy %>% - group_by(Iter) %>% - summarize(across(all_of(columns), mean, na.rm = TRUE)) + #Get average accuracy and h2 for each iter accross the 5 folds + #columns <- setdiff(colnames(results$CombinedResults), c("Iter","Fold")) + #average_accuracy_df <- results$CombinedResults %>% + # group_by(Iter) %>% + # summarize(across(all_of(columns), mean, na.rm = TRUE)) - pred_outputs$comb_output <- average_accuracy_df + columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) + average_accuracy_df <- results$PredictionAccuracy %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 
90, title = "Generating Results") - ##Figures and Tables + pred_outputs$comb_output <- average_accuracy_df - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 90, title = "Generating Results") - #End the event - continue_prediction(NULL) - }) + ##Figures and Tables - plots <- reactive({ - validate( - need(!is.null(pred_outputs$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") - df <- pred_outputs$corr_output - df <- df %>% dplyr::select(-Fold, -Iter) + #End the event + continue_prediction(NULL) + }) - #Probably want to add the ability for the user to select which trait(s) to display here + plots <- reactive({ + validate( + need(!is.null(pred_outputs$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) - #Convert to long format for ggplot - df_long <- pivot_longer( - df, - cols = colnames(df), # Exclude the Cycle column from transformation - names_to = "Trait", # New column for trait names - values_to = "Correlation" # New column for correlation values - ) + df <- pred_outputs$corr_output + df <- df %>% dplyr::select(-Fold, -Iter) - #This can be adapted if we start comparing more than one GP model - #Also consider a violin plot to show each cor value - #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + - plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + - #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + - geom_boxplot() + - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Prediction Accuracy by Trait", - x = 
" ", - y = "Pearson Correlation") + - #theme_minimal() + # Using a minimal theme - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold")) - - plot_violin <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + - geom_violin(trim = TRUE) + # Add violin plot - geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Prediction Accuracy by Trait", - x = " ", # x-label is blank because it's not relevant per facet - y = "Pearson Correlation") + - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold")) - - list(plot, plot_violin) - }) + #Probably want to add the ability for the user to select which trait(s) to display here - #Output the genomic prediction correlation box plots - output$pred_box_plot <- renderPlot({ - plots()[[1]] + scale_fill_manual(values = pred_outputs$colors) - }) + #Convert to long format for ggplot + df_long <- pivot_longer( + df, + cols = colnames(df), # Exclude the Cycle column from transformation + names_to = "Trait", # New column for trait names + values_to = "Correlation" # New column for correlation values + ) - #Output the genomic prediction correlation box plots - output$pred_violin_plot <- renderPlot({ - plots()[[2]] + scale_fill_manual(values = pred_outputs$colors) - }) + #This can be adapted if we start comparing more than one GP model + #Also consider a violin plot to show each cor value + #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), 
fill = "red") + + plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + + #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + + geom_boxplot() + + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Prediction Accuracy by Trait", + x = " ", + y = "Pearson Correlation") + + #theme_minimal() + # Using a minimal theme + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold")) + + plot_violin <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + + geom_violin(trim = TRUE) + # Add violin plot + geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Prediction Accuracy by Trait", + x = " ", # x-label is blank because it's not relevant per facet + y = "Pearson Correlation") + + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold")) + + list(plot, plot_violin) + }) - #Output the prediction tables + #Output the genomic prediction correlation box plots + output$pred_box_plot <- renderPlot({ + plots()[[1]] + scale_fill_manual(values = pred_outputs$colors) + }) - all_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs$comb_output - }) + #Output the genomic prediction correlation box plots + output$pred_violin_plot <- renderPlot({ + 
plots()[[2]] + scale_fill_manual(values = pred_outputs$colors) + }) - output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + #Output the prediction tables - comb_output <- reactive({ - validate( - need(!is.null(pred_outputs$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs$comb_output - }) + all_GEBVs <- reactive({ + validate( + need(!is.null(pred_outputs$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$comb_output + }) - output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - avg_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs$avg_GEBVs - }) + comb_output <- reactive({ + validate( + need(!is.null(pred_outputs$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$comb_output + }) - output$pred_gebvs_table <- renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - #Download files for GP - output$download_pred_file <- downloadHandler( - filename = function() { - paste0("GS-results-", Sys.Date(), ".zip") - }, - content = function(file) { - # Temporary files list - temp_dir <- tempdir() - temp_files <- c() - - if (!is.null(pred_outputs$avg_GEBVs)) { - # Create a temporary file for assignments - gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) - write.csv(pred_outputs$avg_GEBVs, gebv_file, row.names = FALSE) - temp_files <- c(temp_files, gebv_file) - } + 
output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - if (!is.null(pred_outputs$comb_output)) { - # Create a temporary file for BIC data frame - acc_file <- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) - write.csv(pred_outputs$comb_output, acc_file, row.names = FALSE) - temp_files <- c(temp_files, acc_file) - } + avg_GEBVs <- reactive({ + validate( + need(!is.null(pred_outputs$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$avg_GEBVs + }) - # Zip files only if there's something to zip - if (length(temp_files) > 0) { - zip(file, files = temp_files, extras = "-j") # Using -j to junk paths - } + output$pred_gebvs_table <- renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + #Download files for GP + output$download_pred_file <- downloadHandler( + filename = function() { + paste0("GS-results-", Sys.Date(), ".zip") + }, + content = function(file) { + # Temporary files list + temp_dir <- tempdir() + temp_files <- c() + + if (!is.null(pred_outputs$avg_GEBVs)) { + # Create a temporary file for assignments + gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) + write.csv(pred_outputs$avg_GEBVs, gebv_file, row.names = FALSE) + temp_files <- c(temp_files, gebv_file) + } - # Optionally clean up - file.remove(temp_files) + if (!is.null(pred_outputs$comb_output)) { + # Create a temporary file for BIC data frame + acc_file <- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) + write.csv(pred_outputs$comb_output, acc_file, row.names = FALSE) + temp_files <- c(temp_files, acc_file) } - ) - #Download GP Figures - output$download_pred_figure <- downloadHandler( + # Zip files only if there's something to zip + if (length(temp_files) > 0) { + zip(file, files = temp_files, extras = "-j") # Using -j to junk paths + } - 
filename = function() { - if (input$pred_image_type == "jpeg") { - paste("GS-", Sys.Date(), ".jpg", sep="") - } else if (input$pred_image_type == "png") { - paste("GS-", Sys.Date(), ".png", sep="") - } else { - paste("GS-", Sys.Date(), ".tiff", sep="") - } - }, - content = function(file) { - #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots - req(input$pred_figures) - - if (input$pred_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } else if (input$pred_image_type == "png") { - png(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } + # Optionally clean up + file.remove(temp_files) + } + ) - # Conditional plotting based on input selection - if (input$pred_figures == "Violin Plot") { - req(pred_outputs$violin_plot) + #Download GP Figures + output$download_pred_figure <- downloadHandler( - print(pred_outputs$violin_plot + scale_fill_manual(values = pred_outputs$colors)) + filename = function() { + if (input$pred_image_type == "jpeg") { + paste("GS-", Sys.Date(), ".jpg", sep="") + } else if (input$pred_image_type == "png") { + paste("GS-", Sys.Date(), ".png", sep="") + } else { + paste("GS-", Sys.Date(), ".tiff", sep="") + } + }, + content = function(file) { + #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots + req(input$pred_figures) + + if (input$pred_image_type == "jpeg") { + jpeg(file, width = as.numeric(input$pred_image_width), height = 
as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } else if (input$pred_image_type == "png") { + png(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } - } else if (input$pred_figures == "Box Plot") { - req(pred_outputs$box_plot) - #Plot - print(pred_outputs$box_plot + scale_fill_manual(values = pred_outputs$colors)) + # Conditional plotting based on input selection + if (input$pred_figures == "Violin Plot") { + req(pred_outputs$violin_plot) - } + print(pred_outputs$violin_plot + scale_fill_manual(values = pred_outputs$colors)) + + } else if (input$pred_figures == "Box Plot") { + req(pred_outputs$box_plot) + #Plot + print(pred_outputs$box_plot + scale_fill_manual(values = pred_outputs$colors)) - dev.off() } - ) + dev.off() + } - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) - }) - - output$download_pheno <- downloadHandler( - filename = function() { - paste0("BIGapp_passport_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_passport_file.csv", package = "BIGapp") - file.copy(ex, file) - }) - }) + ) + + output$download_vcf <- downloadHandler( + filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) + + output$download_pheno <- downloadHandler( + filename = function() { + paste0("BIGapp_passport_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_passport_file.csv", package = "BIGapp") + 
file.copy(ex, file) + }) } ## To be copied in the UI diff --git a/R/mod_Home.R b/R/mod_Home.R index d77e4a0..9dd0b8e 100644 --- a/R/mod_Home.R +++ b/R/mod_Home.R @@ -17,12 +17,12 @@ mod_Home_ui <- function(id){ fluidPage( fluidRow( column(width = 4, - box( - title = "Breeding Insight Genomics App", status = "info", solidHeader = FALSE, width = 12, collapsible = FALSE, - HTML( - "The app is under development + box( + title = "Breeding Insight Genomics App", status = "info", solidHeader = FALSE, width = 12, collapsible = FALSE, + HTML( + "The app is under development

Breeding Insight provides bioinformatic processing support for our external collaborators. This R shiny app provides a web-based user friendly way for users to analyze genomic data without needing to use command-line tools.

- +

Supported Analyses

Initial supported analyses includes the mature genomics/bioinformatics pipelines developed within Breeding Insight:
    @@ -32,16 +32,16 @@ mod_Home_ui <- function(id){
  • GWAS
  • GS
" - ), - style = "overflow-y: auto; height: 500px" - - ) + ), + style = "overflow-y: auto; height: 500px" + + ) ), column(width = 4, - box( - title = "About Breeding Insight", status = "success", solidHeader = FALSE, width = 12, collapsible = FALSE, - HTML( - "We provide scientific consultation and data management software to the specialty crop and animal breeding communities. + box( + title = "About Breeding Insight", status = "success", solidHeader = FALSE, width = 12, collapsible = FALSE, + HTML( + "We provide scientific consultation and data management software to the specialty crop and animal breeding communities.
  • Genomics
  • Phenomics
  • @@ -53,50 +53,50 @@ mod_Home_ui <- function(id){
    Breeding Insight
    " - ), - style = "overflow-y: auto; height: 500px" - ) - ), - column(width = 4, - a( - href = "https://www.breedinginsight.org", # Replace with your desired URL - target = "_blank", # Optional: opens the link in a new tab - valueBox( - value = NULL, - subtitle = "Learn More About Breeding Insight", - icon = icon("link"), - color = "purple", - gradient = TRUE, - width = 11 - ), - style = "text-decoration: none; color: inherit;" # Optional: removes underline and retains original color - ), - a( - href = "https://breedinginsight.org/contact-us/", # Replace with your desired URL - target = "_blank", # Optional: opens the link in a new tab - valueBox( - value = NULL, - subtitle = "Contact Us", - icon = icon("envelope"), - color = "danger", - gradient = TRUE, - width = 11 + ), + style = "overflow-y: auto; height: 500px" + ) + ), + column(width = 4, + a( + href = "https://www.breedinginsight.org", # Replace with your desired URL + target = "_blank", # Optional: opens the link in a new tab + valueBox( + value = NULL, + subtitle = "Learn More About Breeding Insight", + icon = icon("link"), + color = "purple", + gradient = TRUE, + width = 11 + ), + style = "text-decoration: none; color: inherit;" # Optional: removes underline and retains original color ), - style = "text-decoration: none; color: inherit;" # Optional: removes underline and retains original color - ), - a( - href = "https://scribehow.com/page/BIGapp_Tutorials__FdLsY9ZxQsi6kgT9p-U2Zg", # Replace with your desired URL - target = "_blank", # Optional: opens the link in a new tab - valueBox( - value = NULL, - subtitle = "BIGapp Tutorials (in-progress)", - icon = icon("compass"), - color = "warning", - gradient = TRUE, - width = 11 + a( + href = "https://breedinginsight.org/contact-us/", # Replace with your desired URL + target = "_blank", # Optional: opens the link in a new tab + valueBox( + value = NULL, + subtitle = "Contact Us", + icon = icon("envelope"), + color = "danger", + gradient = TRUE, + width = 11 
+ ), + style = "text-decoration: none; color: inherit;" # Optional: removes underline and retains original color ), - style = "text-decoration: none; color: inherit;" # Optional: removes underline and retains original color - ) + a( + href = "https://scribehow.com/page/BIGapp_Tutorials__FdLsY9ZxQsi6kgT9p-U2Zg", # Replace with your desired URL + target = "_blank", # Optional: opens the link in a new tab + valueBox( + value = NULL, + subtitle = "BIGapp Tutorials (in-progress)", + icon = icon("compass"), + color = "warning", + gradient = TRUE, + width = 11 + ), + style = "text-decoration: none; color: inherit;" # Optional: removes underline and retains original color + ) ) ) ) @@ -107,11 +107,10 @@ mod_Home_ui <- function(id){ #' #' #' @noRd -mod_Home_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns +mod_Home_server <- function(input, output, session, parent_session){ + + ns <- session$ns - }) } ## To be copied in the UI diff --git a/R/mod_PCA.R b/R/mod_PCA.R index 96b03b8..9c2c797 100644 --- a/R/mod_PCA.R +++ b/R/mod_PCA.R @@ -61,15 +61,15 @@ mod_PCA_ui <- function(id){ selectInput(ns("grey_choice"), "Select Grey", choices = c("Light Grey", "Grey", "Dark Grey", "Black"), selected = "Grey") ), selectInput(ns("color_choice"), "Color Palette", choices = list("Standard Palettes" = c("Set1","Set3","Pastel2", - "Pastel1","Accent","Spectral", - "RdYlGn","RdGy"), - "Colorblind Friendly" = c("Set2","Paired","Dark2","YlOrRd","YlOrBr","YlGnBu","YlGn", - "Reds","RdPu","Purples","PuRd","PuBuGn","PuBu", - "OrRd","Oranges","Greys","Greens","GnBu","BuPu", - "BuGn","Blues","RdYlBu", - "RdBu", "PuOr","PRGn","PiYG","BrBG" - )), - selected = "Set1"), + "Pastel1","Accent","Spectral", + "RdYlGn","RdGy"), + "Colorblind Friendly" = c("Set2","Paired","Dark2","YlOrRd","YlOrBr","YlGnBu","YlGn", + "Reds","RdPu","Purples","PuRd","PuBuGn","PuBu", + "OrRd","Oranges","Greys","Greens","GnBu","BuPu", + "BuGn","Blues","RdYlBu", + "RdBu", 
"PuOr","PRGn","PiYG","BrBG" + )), + selected = "Set1"), selectInput(ns("pc_X"), "X-Axis (2D-Plot only)", choices = c("PC1","PC2","PC3","PC4","PC5"), selected = "PC1"), selectInput(ns("pc_Y"), "Y-Axis (2D-Plot only)", choices = c("PC1","PC2","PC3","PC4","PC5"), selected = "PC2"), div(style="display:inline-block; float:right",dropdownButton( @@ -115,59 +115,121 @@ mod_PCA_ui <- function(id){ #' @importFrom shinyjs toggleClass #' #' @noRd -mod_PCA_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns - - #PCA reactive values - pca_data <- reactiveValues( - pc_df_pop = NULL, - variance_explained = NULL, - my_palette = NULL +mod_PCA_server <- function(input, output, session, parent_session){ + + ns <- session$ns + + #PCA reactive values + pca_data <- reactiveValues( + pc_df_pop = NULL, + variance_explained = NULL, + my_palette = NULL + ) + + # Update dropdown menu choices based on uploaded passport file + passport_table <- reactive({ + validate( + need(!is.null(input$passport_file), "Upload passport file to access results in this section."), ) + info_df <- read.csv(input$passport_file$datapath, header = TRUE, check.names = FALSE) + info_df[,1] <- as.character(info_df[,1]) #Makes sure that the sample names are characters instead of numeric + + updateSelectInput(session, "group_info", choices = colnames(info_df)) + info_df + }) + + output$passport_table <- renderDT({ + passport_table()}, + options = list(scrollX = TRUE, + autoWidth = FALSE, + pageLength = 4)) + + #PCA specific category selection + observeEvent(input$group_info, { + #updateMaterialSwitch(session, inputId = "use_cat", status = "success") + + # Get selected column name + selected_col <- input$group_info - # Update dropdown menu choices based on uploaded passport file - passport_table <- reactive({ - validate( - need(!is.null(input$passport_file), "Upload passport file to access results in this section."), + # Extract unique values from the selected column + 
unique_values <- unique(passport_table()[[selected_col]]) + + #Add category selection + updateVirtualSelect("cat_color", choices = unique_values, session = session) + + }) + + #PCA events + observeEvent(input$pca_start, { + + # Missing input with red border and alerts + toggleClass(id = "pca_ploidy", class = "borderred", condition = is.na(input$pca_ploidy)) + if (is.null(input$dosage_file)) { + shinyalert( + title = "Missing input!", + text = "Upload Genotypes File", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE ) - info_df <- read.csv(input$passport_file$datapath, header = TRUE, check.names = FALSE) - info_df[,1] <- as.character(info_df[,1]) #Makes sure that the sample names are characters instead of numeric + } + req(input$pca_ploidy, input$dosage_file$datapath) - updateSelectInput(session, "group_info", choices = colnames(info_df)) - info_df - }) + # Get inputs + geno <- input$dosage_file$datapath + g_info <- as.character(input$group_info) + output_name <- input$output_name + ploidy <- input$pca_ploidy - output$passport_table <- renderDT({ - passport_table()}, - options = list(scrollX = TRUE, - autoWidth = FALSE, - pageLength = 4)) + #Notification + showNotification("PCA analysis in progress...") - #PCA specific category selection - observeEvent(input$group_info, { - #updateMaterialSwitch(session, inputId = "use_cat", status = "success") + #Import genotype info if genotype matrix format + if (grepl("\\.csv$", geno)) { + genomat <- read.csv(geno, header = TRUE, row.names = 1, check.names = FALSE) + } else{ - # Get selected column name - selected_col <- input$group_info + #Import genotype information if in VCF format + vcf <- read.vcfR(geno) - # Extract unique values from the selected column - unique_values <- unique(passport_table()[[selected_col]]) + #Get items in FORMAT column 
+ info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT - #Add category selection - updateVirtualSelect("cat_color", choices = unique_values, session = session) + # Apply the function to the first INFO string + info_ids <- extract_info_ids(info[1]) - }) + #Get the genotype values if the updog dosage calls are present + if ("UD" %in% info_ids) { + genomat <- extract.gt(vcf, element = "UD") + class(genomat) <- "numeric" + rm(vcf) #Remove vcf + }else{ + #Extract GT and convert to numeric calls + genomat <- extract.gt(vcf, element = "GT") + genomat <- apply(genomat, 2, convert_to_dosage) + rm(vcf) #Remove VCF + } + } - #PCA events - observeEvent(input$pca_start, { + #Start analysis - # Missing input with red border and alerts - toggleClass(id = "pca_ploidy", class = "borderred", condition = is.na(input$pca_ploidy)) - if (is.null(input$dosage_file)) { + # Passport info + if (!is.null(input$passport_file$datapath) && input$passport_file$datapath != "") { + info_df <- read.csv(input$passport_file$datapath, header = TRUE, check.names = FALSE) + + # Check for duplicates in the first column + duplicated_samples <- info_df[duplicated(info_df[, 1]), 1] + if (length(duplicated_samples) > 0) { shinyalert( - title = "Missing input!", - text = "Upload Genotypes File", + title = "Duplicate Samples Detected in Passport File", + text = paste("The following samples are duplicated:", paste(unique(duplicated_samples), collapse = ", ")), size = "s", closeOnEsc = TRUE, closeOnClickOutside = FALSE, @@ -179,325 +241,262 @@ mod_PCA_server <- function(id){ showCancelButton = FALSE, animation = TRUE ) - } - req(input$pca_ploidy, input$dosage_file$datapath) - - # Get inputs - geno <- input$dosage_file$datapath - g_info <- as.character(input$group_info) - output_name <- input$output_name - ploidy <- input$pca_ploidy - - #Notification - showNotification("PCA analysis in progress...") - - #Import genotype info if genotype matrix format - if (grepl("\\.csv$", geno)) { - genomat <- read.csv(geno, 
header = TRUE, row.names = 1, check.names = FALSE) - } else{ - - #Import genotype information if in VCF format - vcf <- read.vcfR(geno) - - #Get items in FORMAT column - info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT - - # Apply the function to the first INFO string - info_ids <- extract_info_ids(info[1]) - - #Get the genotype values if the updog dosage calls are present - if ("UD" %in% info_ids) { - genomat <- extract.gt(vcf, element = "UD") - class(genomat) <- "numeric" - rm(vcf) #Remove vcf - }else{ - #Extract GT and convert to numeric calls - genomat <- extract.gt(vcf, element = "GT") - genomat <- apply(genomat, 2, convert_to_dosage) - rm(vcf) #Remove VCF - } + req(length(duplicated_samples) == 0) # Stop the analysis if duplicates are found } - #Start analysis - - # Passport info - if (!is.null(input$passport_file$datapath) && input$passport_file$datapath != "") { - info_df <- read.csv(input$passport_file$datapath, header = TRUE, check.names = FALSE) - - # Check for duplicates in the first column - duplicated_samples <- info_df[duplicated(info_df[, 1]), 1] - if (length(duplicated_samples) > 0) { - shinyalert( - title = "Duplicate Samples Detected in Passport File", - text = paste("The following samples are duplicated:", paste(unique(duplicated_samples), collapse = ", ")), - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - req(length(duplicated_samples) == 0) # Stop the analysis if duplicates are found - } + } else { + info_df <- data.frame(SampleID = colnames(genomat)) + } - } else { - info_df <- data.frame(SampleID = colnames(genomat)) - } + # Print the modified dataframe + row.names(info_df) <- info_df[,1] - # Print the modified dataframe - row.names(info_df) <- info_df[,1] + #Plotting + #First build a relationship matrix using the genotype values + G.mat.updog <- 
Gmatrix(t(genomat), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") - #Plotting - #First build a relationship matrix using the genotype values - G.mat.updog <- Gmatrix(t(genomat), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") + #PCA + prin_comp <- prcomp(G.mat.updog, scale = TRUE) + eig <- get_eigenvalue(prin_comp) + round(sum(eig$variance.percent[1:3]),1) - #PCA - prin_comp <- prcomp(G.mat.updog, scale = TRUE) - eig <- get_eigenvalue(prin_comp) - round(sum(eig$variance.percent[1:3]),1) + ###Simple plots + # Extract the PC scores + pc_scores <- prin_comp$x - ###Simple plots - # Extract the PC scores - pc_scores <- prin_comp$x + # Create a data frame with PC scores + pc_df <- data.frame(PC1 = pc_scores[, 1], PC2 = pc_scores[, 2], + PC3 = pc_scores[, 3], PC4 = pc_scores[, 4], + PC5 = pc_scores[, 5], PC6 = pc_scores[, 6], + PC7 = pc_scores[, 7], PC8 = pc_scores[, 8], + PC9 = pc_scores[, 9], PC10 = pc_scores[, 10]) - # Create a data frame with PC scores - pc_df <- data.frame(PC1 = pc_scores[, 1], PC2 = pc_scores[, 2], - PC3 = pc_scores[, 3], PC4 = pc_scores[, 4], - PC5 = pc_scores[, 5], PC6 = pc_scores[, 6], - PC7 = pc_scores[, 7], PC8 = pc_scores[, 8], - PC9 = pc_scores[, 9], PC10 = pc_scores[, 10]) + # Compute the percentage of variance explained for each PC + variance_explained <- round(100 * prin_comp$sdev^2 / sum(prin_comp$sdev^2), 1) - # Compute the percentage of variance explained for each PC - variance_explained <- round(100 * prin_comp$sdev^2 / sum(prin_comp$sdev^2), 1) + # Retain only samples in common + row.names(info_df) <- info_df[,1] + info_df <- info_df[row.names(pc_df),] - # Retain only samples in common - row.names(info_df) <- info_df[,1] - info_df <- info_df[row.names(pc_df),] + #Add the information for each sample + pc_df_pop <- merge(pc_df, info_df, by.x = "row.names", by.y = "row.names", all.x = TRUE) - #Add the information for each sample - pc_df_pop <- merge(pc_df, info_df, by.x = "row.names", by.y 
= "row.names", all.x = TRUE) + # Ignore color input if none is entered by user + if (g_info != "") { + pc_df_pop[[g_info]] <- as.factor(pc_df_pop[[g_info]]) + } else { + g_info <- NULL + } - # Ignore color input if none is entered by user - if (g_info != "") { - pc_df_pop[[g_info]] <- as.factor(pc_df_pop[[g_info]]) - } else { - g_info <- NULL - } + #Update global variable + pca_dataframes <- pc_df_pop - #Update global variable - pca_dataframes <- pc_df_pop + # Generate a distinct color palette if g_info is provided + if (!is.null(g_info) && g_info != "") { + unique_countries <- unique(pc_df_pop[[g_info]]) + palette <- brewer.pal(length(unique_countries), input$color_choice) + my_palette <- colorRampPalette(palette)(length(unique_countries)) + } else { + unique_countries <- NULL + my_palette <- NULL + } - # Generate a distinct color palette if g_info is provided - if (!is.null(g_info) && g_info != "") { - unique_countries <- unique(pc_df_pop[[g_info]]) - palette <- brewer.pal(length(unique_countries), input$color_choice) - my_palette <- colorRampPalette(palette)(length(unique_countries)) - } else { - unique_countries <- NULL - my_palette <- NULL - } + # Store processed data in reactive values + pca_data$pc_df_pop <- pc_df_pop + pca_data$variance_explained <- variance_explained + pca_data$my_palette <- my_palette - # Store processed data in reactive values - pca_data$pc_df_pop <- pc_df_pop - pca_data$variance_explained <- variance_explained - pca_data$my_palette <- my_palette + #End of PCA section + }) - #End of PCA section - }) + ##2D PCA plotting + pca_2d <- reactive({ + validate( + need(!is.null(pca_data$pc_df_pop), "Input Genotype file, Species ploidy, and run the analysis to access results in this section.") + ) - ##2D PCA plotting - pca_2d <- reactive({ - validate( - need(!is.null(pca_data$pc_df_pop), "Input Genotype file, Species ploidy, and run the analysis to access results in this section.") + # Generate colors + if (!is.null(pca_data$my_palette)) { + 
unique_countries <- unique(pca_data$pc_df_pop[[input$group_info]]) + palette <- brewer.pal(length(unique_countries), input$color_choice) + my_palette <- colorRampPalette(palette)(length(unique_countries)) + } else { + unique_countries <- NULL + my_palette <- NULL + } + + # Define a named vector to map input labels to grey values + label_to_value <- c("Light Grey" = "grey80", + "Grey" = "grey60", + "Dark Grey" = "grey40", + "Black" = "black") + + # Get the corresponding value based on the selected grey + selected_grey <- label_to_value[[input$grey_choice]] + + #Set factor + if (!input$use_cat && is.null(my_palette)) { + print("No Color Info") + }else{ + pca_data$pc_df_pop[[input$group_info]] <- as.factor(pca_data$pc_df_pop[[input$group_info]]) + } + + # Similar plotting logic here + + cat_colors <- c(input$cat_color, "grey") + plot <- {if(!is.null(input$group_info) & input$group_info != "") + ggplot(pca_data$pc_df_pop, aes(x = pca_data$pc_df_pop[[input$pc_X]], + y = pca_data$pc_df_pop[[input$pc_Y]], + color = factor(pca_data$pc_df_pop[[input$group_info]]))) else + ggplot(pca_data$pc_df_pop, aes(x = pca_data$pc_df_pop[[input$pc_X]], + y = pca_data$pc_df_pop[[input$pc_Y]]))} + + geom_point(size = 2, alpha = 0.8) + + {if(input$use_cat) scale_color_manual(values = setNames(c(my_palette, "grey"), cat_colors), na.value = selected_grey) else + if(!is.null(my_palette)) scale_color_manual(values = my_palette)} + + guides(color = guide_legend(override.aes = list(size = 5.5), nrow = 17)) + + theme_minimal() + + theme( + panel.border = element_rect(color = "black", fill = NA), + legend.text = element_text(size = 14), + axis.title = element_text(size = 14), + axis.text = element_text(size = 12), + legend.title = element_text(size = 16) + ) + + labs( + x = paste0(input$pc_X, "(", pca_data$variance_explained[as.numeric(substr(input$pc_X, 3, 3))], "%)"), + y = paste0(input$pc_Y, "(", pca_data$variance_explained[as.numeric(substr(input$pc_Y, 3, 3))], "%)"), + color = 
input$group_info ) - # Generate colors - if (!is.null(pca_data$my_palette)) { - unique_countries <- unique(pca_data$pc_df_pop[[input$group_info]]) - palette <- brewer.pal(length(unique_countries), input$color_choice) - my_palette <- colorRampPalette(palette)(length(unique_countries)) - } else { - unique_countries <- NULL - my_palette <- NULL - } - - # Define a named vector to map input labels to grey values - label_to_value <- c("Light Grey" = "grey80", - "Grey" = "grey60", - "Dark Grey" = "grey40", - "Black" = "black") + plot # Assign the plot to your reactiveValues + }) - # Get the corresponding value based on the selected grey - selected_grey <- label_to_value[[input$grey_choice]] + #Plot the 2d plot + output$pca_plot_ggplot <- renderPlot({ + pca_2d() + }) - #Set factor - if (!input$use_cat && is.null(my_palette)) { - print("No Color Info") - }else{ - pca_data$pc_df_pop[[input$group_info]] <- as.factor(pca_data$pc_df_pop[[input$group_info]]) - } + #3D PCA plotting + pca_plot <- reactive({ + #Plotly + validate( + need(!is.null(pca_data$pc_df_pop), "Input Genotype file, Species ploidy, and run the analysis to access results in this section.") + ) - # Similar plotting logic here - - cat_colors <- c(input$cat_color, "grey") - plot <- {if(!is.null(input$group_info) & input$group_info != "") - ggplot(pca_data$pc_df_pop, aes(x = pca_data$pc_df_pop[[input$pc_X]], - y = pca_data$pc_df_pop[[input$pc_Y]], - color = factor(pca_data$pc_df_pop[[input$group_info]]))) else - ggplot(pca_data$pc_df_pop, aes(x = pca_data$pc_df_pop[[input$pc_X]], - y = pca_data$pc_df_pop[[input$pc_Y]]))} + - geom_point(size = 2, alpha = 0.8) + - {if(input$use_cat) scale_color_manual(values = setNames(c(my_palette, "grey"), cat_colors), na.value = selected_grey) else - if(!is.null(my_palette)) scale_color_manual(values = my_palette)} + - guides(color = guide_legend(override.aes = list(size = 5.5), nrow = 17)) + - theme_minimal() + - theme( - panel.border = element_rect(color = "black", fill = NA), - 
legend.text = element_text(size = 14), - axis.title = element_text(size = 14), - axis.text = element_text(size = 12), - legend.title = element_text(size = 16) - ) + - labs( - x = paste0(input$pc_X, "(", pca_data$variance_explained[as.numeric(substr(input$pc_X, 3, 3))], "%)"), - y = paste0(input$pc_Y, "(", pca_data$variance_explained[as.numeric(substr(input$pc_Y, 3, 3))], "%)"), - color = input$group_info - ) + #Generate colors + unique_countries <- unique(pca_data$pc_df_pop[[input$group_info]]) + palette <- brewer.pal(length(unique_countries),input$color_choice) + my_palette <- colorRampPalette(palette)(length(unique_countries)) - plot # Assign the plot to your reactiveValues - }) + tit = paste0('Total Explained Variance =', sum(pca_data$variance_explained[1:3])) - #Plot the 2d plot - output$pca_plot_ggplot <- renderPlot({ - pca_2d() - }) + fig <- plot_ly(pca_data$pc_df_pop, x = ~PC1, y = ~PC2, z = ~PC3, color = pca_data$pc_df_pop[[input$group_info]], + colors = my_palette) %>% + add_markers(size = 12, text = paste0("Sample:",pca_data$pc_df_pop$Row.names)) - #3D PCA plotting - pca_plot <- reactive({ - #Plotly - validate( - need(!is.null(pca_data$pc_df_pop), "Input Genotype file, Species ploidy, and run the analysis to access results in this section.") + fig <- fig %>% + layout( + title = tit, + scene = list(bgcolor = "white") ) - #Generate colors - unique_countries <- unique(pca_data$pc_df_pop[[input$group_info]]) - palette <- brewer.pal(length(unique_countries),input$color_choice) - my_palette <- colorRampPalette(palette)(length(unique_countries)) + fig # Return the Plotly object here + }) - tit = paste0('Total Explained Variance =', sum(pca_data$variance_explained[1:3])) + output$pca_plot <- renderPlotly({ + pca_plot() + }) - fig <- plot_ly(pca_data$pc_df_pop, x = ~PC1, y = ~PC2, z = ~PC3, color = pca_data$pc_df_pop[[input$group_info]], - colors = my_palette) %>% - add_markers(size = 12, text = paste0("Sample:",pca_data$pc_df_pop$Row.names)) + pca_scree <- 
reactive({ + #PCA scree plot + validate( + need(!is.null(pca_data$variance_explained), "Input Genotype file, Species ploidy, and run the analysis to access the results in this section.") + ) - fig <- fig %>% - layout( - title = tit, - scene = list(bgcolor = "white") - ) + var_explained <- pca_data$variance_explained + + # Create a data frame for plotting + plot_data <- data.frame(PC = 1:10, Variance_Explained = var_explained[1:10]) + + # Use ggplot for plotting + plot <- ggplot(plot_data, aes(x = PC, y = Variance_Explained)) + + geom_bar(stat = "identity", fill = "lightblue", alpha = 0.9, color = "black") + # Bars with some transparency + geom_line(color = "black") + # Connect points with a line + geom_point(color = "black") + # Add points on top of the line for emphasis + scale_x_continuous(breaks = 1:10, limits = c(0.5, 10.5)) + + xlab("Principal Component") + + ylab("% Variance Explained") + + ylim(0, 100) + + theme_bw() + + theme( + panel.border = element_rect(color = "black", fill = NA), + legend.text = element_text(size = 14), + axis.title = element_text(size = 14), + axis.text = element_text(size = 12), + legend.title = element_text(size = 16) + ) + plot + }) - fig # Return the Plotly object here - }) + #Scree plot + output$scree_plot <- renderPlot({ + pca_scree() + }) - output$pca_plot <- renderPlotly({ - pca_plot() - }) + #Download figures for PCA + output$download_pca <- downloadHandler( + filename = function() { + if (input$pca_image_type == "jpeg") { + paste("pca-", Sys.Date(), ".jpg", sep = "") + } else if (input$pca_image_type == "png") { + paste("pca-", Sys.Date(), ".png", sep = "") + } else { + paste("pca-", Sys.Date(), ".tiff", sep = "") + } + }, + content = function(file) { + req(input$pca_figure) + + if (input$pca_image_type == "jpeg") { + jpeg(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") + } else if (input$pca_image_type == "png") { + png(file, 
width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") + } - pca_scree <- reactive({ - #PCA scree plot - validate( - need(!is.null(pca_data$variance_explained), "Input Genotype file, Species ploidy, and run the analysis to access the results in this section.") - ) + # Plot based on user selection + if (input$pca_figure == "2D Plot") { + print(pca_2d()) + } else if (input$pca_figure == "Scree Plot") { + print(pca_scree()) + } - var_explained <- pca_data$variance_explained - - # Create a data frame for plotting - plot_data <- data.frame(PC = 1:10, Variance_Explained = var_explained[1:10]) - - # Use ggplot for plotting - plot <- ggplot(plot_data, aes(x = PC, y = Variance_Explained)) + - geom_bar(stat = "identity", fill = "lightblue", alpha = 0.9, color = "black") + # Bars with some transparency - geom_line(color = "black") + # Connect points with a line - geom_point(color = "black") + # Add points on top of the line for emphasis - scale_x_continuous(breaks = 1:10, limits = c(0.5, 10.5)) + - xlab("Principal Component") + - ylab("% Variance Explained") + - ylim(0, 100) + - theme_bw() + - theme( - panel.border = element_rect(color = "black", fill = NA), - legend.text = element_text(size = 14), - axis.title = element_text(size = 14), - axis.text = element_text(size = 12), - legend.title = element_text(size = 16) - ) - plot - }) + dev.off() + } + ) - #Scree plot - output$scree_plot <- renderPlot({ - pca_scree() + output$download_vcf <- downloadHandler( + filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) }) - #Download figures for PCA - output$download_pca <- downloadHandler( - filename = 
function() { - if (input$pca_image_type == "jpeg") { - paste("pca-", Sys.Date(), ".jpg", sep = "") - } else if (input$pca_image_type == "png") { - paste("pca-", Sys.Date(), ".png", sep = "") - } else { - paste("pca-", Sys.Date(), ".tiff", sep = "") - } - }, - content = function(file) { - req(input$pca_figure) - - if (input$pca_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") - } else if (input$pca_image_type == "png") { - png(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$pca_image_width), height = as.numeric(input$pca_image_height), res = as.numeric(input$pca_image_res), units = "in") - } - - # Plot based on user selection - if (input$pca_figure == "2D Plot") { - print(pca_2d()) - } else if (input$pca_figure == "Scree Plot") { - print(pca_scree()) - } - - dev.off() - } - ) - - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) - }) - - output$download_pheno <- downloadHandler( - filename = function() { - paste0("BIGapp_passport_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_passport_file.csv", package = "BIGapp") - file.copy(ex, file) - }) + output$download_pheno <- downloadHandler( + filename = function() { + paste0("BIGapp_passport_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_passport_file.csv", package = "BIGapp") + file.copy(ex, file) + }) - }) } ## To be copied in the UI diff --git a/R/mod_dapc.R b/R/mod_dapc.R index 7670e2d..02002bc 100644 --- a/R/mod_dapc.R +++ b/R/mod_dapc.R @@ -109,323 +109,322 @@ mod_dapc_ui <- function(id){ #' @importFrom 
vcfR read.vcfR extract.gt #' @importFrom stats BIC as.formula lm logLik median model.matrix na.omit prcomp qbeta quantile runif sd setNames #' @noRd -mod_dapc_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns +mod_dapc_server <- function(input, output, session, parent_session){ + ns <- session$ns - dapc_items <- reactiveValues( - grp = NULL, - bestK = NULL, - BIC = NULL, - assignments = NULL, - dapc = NULL - ) - - ##DAPC analysis - #Make it a two step process 1) estimate K, and 2) perform DAPC - observeEvent(input$K_start, { - - toggleClass(id = "dapc_ploidy", class = "borderred", condition = (is.na(input$dapc_ploidy) | is.null(input$dapc_ploidy))) - if (is.null(input$dosage_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF File", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$dosage_file$datapath, input$dapc_ploidy) - - ploidy <- as.numeric(input$dapc_ploidy) - maxK <- as.numeric(input$dapc_kmax) - geno <- input$dosage_file$datapath - - ##Add in VCF with the vcfR package (input VCF, then convert to genlight using vcf2genlight function) - - #Import genotype information if in VCF format - vcf <- read.vcfR(geno) - - #Get items in FORMAT column - info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT - - # Apply the function to the first INFO string - info_ids <- extract_info_ids(info[1]) - - #Get the genotype values if the updog dosage calls are present - if ("UD" %in% info_ids) { - genotypeMatrix <- extract.gt(vcf, element = "UD") - class(genotypeMatrix) <- "numeric" - rm(vcf) #Remove vcf - }else{ - #Extract GT and convert to numeric calls - genotypeMatrix <- extract.gt(vcf, element = "GT") - genotypeMatrix <- apply(genotypeMatrix, 2, convert_to_dosage) - rm(vcf) #Remove VCF - } - 
#Perform analysis - get_k <- findK(genotypeMatrix, maxK, ploidy) - - #Assign results to reactive values - dapc_items$grp <- get_k$grp - dapc_items$bestK <- get_k$bestK - dapc_items$BIC <- get_k$BIC - }) + dapc_items <- reactiveValues( + grp = NULL, + bestK = NULL, + BIC = NULL, + assignments = NULL, + dapc = NULL + ) - observeEvent(input$dapc_start, { - - toggleClass(id = "dapc_ploidy", class = "borderred", condition = (is.na(input$dapc_ploidy) | is.null(input$dapc_ploidy))) - toggleClass(id = "dapc_k", class = "borderred", condition = (is.na(input$dapc_k) | is.null(input$dapc_k))) - - if (is.null(input$dosage_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF File", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$dosage_file$datapath, input$dapc_ploidy, input$dapc_k) - - geno <- input$dosage_file$datapath - ploidy <- as.numeric(input$dapc_ploidy) - selected_K <- as.numeric(input$dapc_k) - - #Import genotype information if in VCF format - vcf <- read.vcfR(geno) - - #Get items in FORMAT column - info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT - - # Apply the function to the first INFO string - info_ids <- extract_info_ids(info[1]) - - #Get the genotype values if the updog dosage calls are present - if ("UD" %in% info_ids) { - genotypeMatrix <- extract.gt(vcf, element = "UD") - class(genotypeMatrix) <- "numeric" - rm(vcf) #Remove vcf - }else{ - #Extract GT and convert to numeric calls - genotypeMatrix <- extract.gt(vcf, element = "GT") - genotypeMatrix <- apply(genotypeMatrix, 2, convert_to_dosage) - rm(vcf) #Remove VCF - } + ##DAPC analysis + #Make it a two step process 1) estimate K, and 2) perform DAPC + observeEvent(input$K_start, { + + toggleClass(id = "dapc_ploidy", class = "borderred", condition = 
(is.na(input$dapc_ploidy) | is.null(input$dapc_ploidy))) + if (is.null(input$dosage_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload VCF File", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + req(input$dosage_file$datapath, input$dapc_ploidy) + + ploidy <- as.numeric(input$dapc_ploidy) + maxK <- as.numeric(input$dapc_kmax) + geno <- input$dosage_file$datapath + + ##Add in VCF with the vcfR package (input VCF, then convert to genlight using vcf2genlight function) + + #Import genotype information if in VCF format + vcf <- read.vcfR(geno) + + #Get items in FORMAT column + info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT + + # Apply the function to the first INFO string + info_ids <- extract_info_ids(info[1]) + + #Get the genotype values if the updog dosage calls are present + if ("UD" %in% info_ids) { + genotypeMatrix <- extract.gt(vcf, element = "UD") + class(genotypeMatrix) <- "numeric" + rm(vcf) #Remove vcf + }else{ + #Extract GT and convert to numeric calls + genotypeMatrix <- extract.gt(vcf, element = "GT") + genotypeMatrix <- apply(genotypeMatrix, 2, convert_to_dosage) + rm(vcf) #Remove VCF + } + + #Perform analysis + get_k <- findK(genotypeMatrix, maxK, ploidy) + + #Assign results to reactive values + dapc_items$grp <- get_k$grp + dapc_items$bestK <- get_k$bestK + dapc_items$BIC <- get_k$BIC + }) - #Perform analysis - clusters <- performDAPC(genotypeMatrix, selected_K, ploidy) + observeEvent(input$dapc_start, { + + toggleClass(id = "dapc_ploidy", class = "borderred", condition = (is.na(input$dapc_ploidy) | is.null(input$dapc_ploidy))) + toggleClass(id = "dapc_k", class = "borderred", condition = (is.na(input$dapc_k) | is.null(input$dapc_k))) + + if (is.null(input$dosage_file$datapath)) { + shinyalert( + title = "Missing input!", + 
text = "Upload VCF File", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + req(input$dosage_file$datapath, input$dapc_ploidy, input$dapc_k) + + geno <- input$dosage_file$datapath + ploidy <- as.numeric(input$dapc_ploidy) + selected_K <- as.numeric(input$dapc_k) + + #Import genotype information if in VCF format + vcf <- read.vcfR(geno) + + #Get items in FORMAT column + info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT + + # Apply the function to the first INFO string + info_ids <- extract_info_ids(info[1]) + + #Get the genotype values if the updog dosage calls are present + if ("UD" %in% info_ids) { + genotypeMatrix <- extract.gt(vcf, element = "UD") + class(genotypeMatrix) <- "numeric" + rm(vcf) #Remove vcf + }else{ + #Extract GT and convert to numeric calls + genotypeMatrix <- extract.gt(vcf, element = "GT") + genotypeMatrix <- apply(genotypeMatrix, 2, convert_to_dosage) + rm(vcf) #Remove VCF + } + + #Perform analysis + clusters <- performDAPC(genotypeMatrix, selected_K, ploidy) + + #Assign results to reactive value + dapc_items$assignments <- clusters$Q + dapc_items$dapc <- clusters$dapc + }) - #Assign results to reactive value - dapc_items$assignments <- clusters$Q - dapc_items$dapc <- clusters$dapc - }) + ###Outputs from DAPC + #Output the BIC plot + BIC_plot <- reactive({ + validate( + need(!is.null(dapc_items$BIC), "Input VCF, define parameters and click `run analysis` in Step 1:(K) to access results in this session.") + ) - ###Outputs from DAPC - #Output the BIC plot - BIC_plot <- reactive({ - validate( - need(!is.null(dapc_items$BIC), "Input VCF, define parameters and click `run analysis` in Step 1:(K) to access results in this session.") - ) + BIC <- dapc_items$BIC + selected_K <- as.numeric(dapc_items$bestK) + plot(BIC, type = "o", xaxt = 'n') + axis(1, at 
= seq(1, nrow(BIC), 1), labels = TRUE) - BIC <- dapc_items$BIC - selected_K <- as.numeric(dapc_items$bestK) + if (input$plot_BICX) { + plot(BIC, type = "o", xaxt = 'n') + axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) + points(selected_K, BIC[selected_K,2], pch = "x", col = "red", cex = 2) + } else { plot(BIC, type = "o", xaxt = 'n') axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) + } + }) - if (input$plot_BICX) { - plot(BIC, type = "o", xaxt = 'n') - axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) - points(selected_K, BIC[selected_K,2], pch = "x", col = "red", cex = 2) - } else { - plot(BIC, type = "o", xaxt = 'n') - axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) - } - }) + output$BIC_plot <- renderPlot({ + BIC_plot() + }) - output$BIC_plot <- renderPlot({ - BIC_plot() - }) + # #Output the DAPC scatter plot + DAPC_plot <- reactive({ + validate( + need(!is.null(dapc_items$dapc), "Input VCF, define parameters and click `run analysis` in Step 2:(DAPC) to access results in this session.") + ) - # #Output the DAPC scatter plot - DAPC_plot <- reactive({ - validate( - need(!is.null(dapc_items$dapc), "Input VCF, define parameters and click `run analysis` in Step 2:(DAPC) to access results in this session.") - ) + #Get colors + palette <- brewer.pal(as.numeric(input$dapc_k), input$color_choice) + my_palette <- colorRampPalette(palette)(as.numeric(input$dapc_k)) + + sc1 <- scatter.dapc(dapc_items$dapc, + bg = "white", solid = 1, cex = 1, # cex circle size + col = my_palette, + pch = 20, # shapes + cstar = 1, # 0 or 1, arrows from center of cluster + cell = 2, # size of elipse + scree.da = T, # plot da + scree.pca = T, # plot pca + posi.da = "topright", + posi.pca="bottomright", + mstree = F, # lines connecting clusters + lwd = 1, lty = 2, + leg = F, clab = 1) # legend and label of legend clusters. 
clab 0 or 1 + }) - #Get colors - palette <- brewer.pal(as.numeric(input$dapc_k), input$color_choice) - my_palette <- colorRampPalette(palette)(as.numeric(input$dapc_k)) - - sc1 <- scatter.dapc(dapc_items$dapc, - bg = "white", solid = 1, cex = 1, # cex circle size - col = my_palette, - pch = 20, # shapes - cstar = 1, # 0 or 1, arrows from center of cluster - cell = 2, # size of elipse - scree.da = T, # plot da - scree.pca = T, # plot pca - posi.da = "topright", - posi.pca="bottomright", - mstree = F, # lines connecting clusters - lwd = 1, lty = 2, - leg = F, clab = 1) # legend and label of legend clusters. clab 0 or 1 - }) + output$DAPC_plot <- renderPlot({ + DAPC_plot() + }) - output$DAPC_plot <- renderPlot({ - DAPC_plot() - }) + # #Output datatables - # #Output datatables + BIC_table <- reactive({ + validate( + need(!is.null(dapc_items$BIC), "Input VCF, define parameters and click `run analysis` in Step 1:(K) to access results in this session.") + ) + dapc_items$BIC + }) - BIC_table <- reactive({ - validate( - need(!is.null(dapc_items$BIC), "Input VCF, define parameters and click `run analysis` in Step 1:(K) to access results in this session.") - ) - dapc_items$BIC - }) + output$BIC_table <- renderDT({ + BIC_table() + }, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - output$BIC_table <- renderDT({ - BIC_table() - }, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + assignments_table <- reactive({ + validate( + need(!is.null(dapc_items$assignments), "Input VCF, define parameters and click `run analysis` in Step 2:(DAPC) to access results in this session.") + ) + dapc_items$assignments + }) - assignments_table <- reactive({ - validate( - need(!is.null(dapc_items$assignments), "Input VCF, define parameters and click `run analysis` in Step 2:(DAPC) to access results in this session.") - ) - dapc_items$assignments - }) + output$DAPC_table <- renderDT({ + assignments_table() + }, options = list(scrollX = TRUE,autoWidth = FALSE, 
pageLength = 5)) - output$DAPC_table <- renderDT({ - assignments_table() - }, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + #Download figures for DAPC (change this so that the figures were already saved as reactive values to then print() here) + output$download_dapc_image <- downloadHandler( - #Download figures for DAPC (change this so that the figures were already saved as reactive values to then print() here) - output$download_dapc_image <- downloadHandler( + filename = function() { + if (input$dapc_image_type == "jpeg") { + paste("dapc-", Sys.Date(), ".jpg", sep="") + } else if (input$dapc_image_type == "png") { + paste("dapc-", Sys.Date(), ".png", sep="") + } else { + paste("dapc-", Sys.Date(), ".tiff", sep="") + } + }, + content = function(file) { + #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots + req(input$dapc_figure) + + if (input$dapc_image_type == "jpeg") { + jpeg(file, width = as.numeric(input$dapc_image_width), height = as.numeric(input$dapc_image_height), res= as.numeric(input$dapc_image_res), units = "in") + } else if (input$dapc_image_type == "png") { + png(file, width = as.numeric(input$dapc_image_width), height = as.numeric(input$dapc_image_height), res= as.numeric(input$dapc_image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$dapc_image_width), height = as.numeric(input$dapc_image_height), res= as.numeric(input$dapc_image_res), units = "in") + } - filename = function() { - if (input$dapc_image_type == "jpeg") { - paste("dapc-", Sys.Date(), ".jpg", sep="") - } else if (input$dapc_image_type == "png") { - paste("dapc-", Sys.Date(), ".png", sep="") - } else { - paste("dapc-", Sys.Date(), ".tiff", sep="") - } - }, - content = function(file) { - #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get 
the plots - req(input$dapc_figure) - - if (input$dapc_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$dapc_image_width), height = as.numeric(input$dapc_image_height), res= as.numeric(input$dapc_image_res), units = "in") - } else if (input$dapc_image_type == "png") { - png(file, width = as.numeric(input$dapc_image_width), height = as.numeric(input$dapc_image_height), res= as.numeric(input$dapc_image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$dapc_image_width), height = as.numeric(input$dapc_image_height), res= as.numeric(input$dapc_image_res), units = "in") - } + # Conditional plotting based on input selection + if (input$dapc_figure == "DAPC Plot") { + req(dapc_items$dapc, input$dapc_k) + + #Get colors + palette <- brewer.pal(as.numeric(input$dapc_k), input$color_choice) + my_palette <- colorRampPalette(palette)(as.numeric(input$dapc_k)) + + sc1 <- scatter.dapc(dapc_items$dapc, + bg = "white", solid = 1, cex = 1, # cex circle size + col = my_palette, + pch = 20, # shapes + cstar = 1, # 0 or 1, arrows from center of cluster + cell = 2, # size of elipse + scree.da = T, # plot da + scree.pca = T, # plot pca + posi.da = "topright", + posi.pca="bottomright", + mstree = F, # lines connecting clusters + lwd = 1, lty = 2, + leg = F, clab = 1) # legend and label of legend clusters. 
clab 0 or 1 + + } else if (input$dapc_figure == "BIC Plot") { + req(dapc_items$BIC, dapc_items$bestK) + + BIC <- dapc_items$BIC + selected_K <- as.numeric(dapc_items$bestK) + plot(BIC, type = "o", xaxt = 'n') + axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) - # Conditional plotting based on input selection - if (input$dapc_figure == "DAPC Plot") { - req(dapc_items$dapc, input$dapc_k) - - #Get colors - palette <- brewer.pal(as.numeric(input$dapc_k), input$color_choice) - my_palette <- colorRampPalette(palette)(as.numeric(input$dapc_k)) - - sc1 <- scatter.dapc(dapc_items$dapc, - bg = "white", solid = 1, cex = 1, # cex circle size - col = my_palette, - pch = 20, # shapes - cstar = 1, # 0 or 1, arrows from center of cluster - cell = 2, # size of elipse - scree.da = T, # plot da - scree.pca = T, # plot pca - posi.da = "topright", - posi.pca="bottomright", - mstree = F, # lines connecting clusters - lwd = 1, lty = 2, - leg = F, clab = 1) # legend and label of legend clusters. clab 0 or 1 - - } else if (input$dapc_figure == "BIC Plot") { - req(dapc_items$BIC, dapc_items$bestK) - - BIC <- dapc_items$BIC - selected_K <- as.numeric(dapc_items$bestK) + if (input$plot_BICX) { + plot(BIC, type = "o", xaxt = 'n') + axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) + points(selected_K, BIC[selected_K,2], pch = "x", col = "red", cex = 2) + } else { plot(BIC, type = "o", xaxt = 'n') axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) - - if (input$plot_BICX) { - plot(BIC, type = "o", xaxt = 'n') - axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) - points(selected_K, BIC[selected_K,2], pch = "x", col = "red", cex = 2) - } else { - plot(BIC, type = "o", xaxt = 'n') - axis(1, at = seq(1, nrow(BIC), 1), labels = TRUE) - } } - dev.off() } - ) - - #Download files for DAPC - output$download_dapc_file <- downloadHandler( - filename = function() { - paste0("dapc-results-", Sys.Date(), ".zip") - }, - content = function(file) { - # Temporary files list - temp_dir <- tempdir() - temp_files 
<- c() - - if (!is.null(dapc_items$assignments)) { - # Create a temporary file for assignments - assignments_file <- file.path(temp_dir, paste0("DAPC-values-", Sys.Date(), ".csv")) - write.csv(dapc_items$assignments, assignments_file, row.names = TRUE) - temp_files <- c(temp_files, assignments_file) - } + dev.off() + } + ) - if (!is.null(dapc_items$BIC)) { - # Create a temporary file for BIC data frame - bicDF_file <- file.path(temp_dir, paste0("BIC-values-", Sys.Date(), ".csv")) - write.csv(dapc_items$BIC, bicDF_file, row.names = FALSE) - temp_files <- c(temp_files, bicDF_file) - } + #Download files for DAPC + output$download_dapc_file <- downloadHandler( + filename = function() { + paste0("dapc-results-", Sys.Date(), ".zip") + }, + content = function(file) { + # Temporary files list + temp_dir <- tempdir() + temp_files <- c() + + if (!is.null(dapc_items$assignments)) { + # Create a temporary file for assignments + assignments_file <- file.path(temp_dir, paste0("DAPC-values-", Sys.Date(), ".csv")) + write.csv(dapc_items$assignments, assignments_file, row.names = TRUE) + temp_files <- c(temp_files, assignments_file) + } - # Zip files only if there's something to zip - if (length(temp_files) > 0) { - zip(file, files = temp_files, extras = "-j") # Using -j to junk paths - } + if (!is.null(dapc_items$BIC)) { + # Create a temporary file for BIC data frame + bicDF_file <- file.path(temp_dir, paste0("BIC-values-", Sys.Date(), ".csv")) + write.csv(dapc_items$BIC, bicDF_file, row.names = FALSE) + temp_files <- c(temp_files, bicDF_file) + } - # Optionally clean up - file.remove(temp_files) + # Zip files only if there's something to zip + if (length(temp_files) > 0) { + zip(file, files = temp_files, extras = "-j") # Using -j to junk paths } - ) - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) - 
}) - }) + # Optionally clean up + file.remove(temp_files) + } + ) + + output$download_vcf <- downloadHandler( + filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) } ## To be copied in the UI diff --git a/R/mod_diversity.R b/R/mod_diversity.R index 100080e..f5ad69a 100644 --- a/R/mod_diversity.R +++ b/R/mod_diversity.R @@ -83,376 +83,376 @@ mod_diversity_ui <- function(id){ #' @importFrom scales comma_format #' #' @noRd -mod_diversity_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns - #######Genomic Diversity analysis - - #Genomic Diversity output files - diversity_items <- reactiveValues( - diversity_df = NULL, - dosage_df = NULL, - het_df = NULL, - maf_df = NULL, - pos_df = NULL, - markerPlot = NULL +mod_diversity_server <- function(input, output, session, parent_session){ + + ns <- session$ns + #######Genomic Diversity analysis + + #Genomic Diversity output files + diversity_items <- reactiveValues( + diversity_df = NULL, + dosage_df = NULL, + het_df = NULL, + maf_df = NULL, + pos_df = NULL, + markerPlot = NULL + ) + + #Reactive boxes + output$mean_het_box <- renderValueBox({ + valueBox( + value = 0, + subtitle = "Mean Heterozygosity", + icon = icon("dna"), + color = "info" + ) + }) + + output$mean_maf_box <- renderValueBox({ + valueBox( + value = 0, + subtitle = "Mean MAF", + icon = icon("dna"), + color = "info" ) + }) + + observeEvent(input$diversity_start, { + toggleClass(id = "diversity_ploidy", class = "borderred", condition = (is.na(input$diversity_ploidy) | is.null(input$diversity_ploidy))) + toggleClass(id = "zero_value", class = "borderred", condition = (is.na(input$zero_value) | is.null(input$zero_value))) + + if (is.null(input$diversity_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload VCF File", + size = "s", + closeOnEsc = TRUE, + 
closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + req(input$diversity_file, input$diversity_ploidy, input$zero_value) + + #Input variables (need to add support for VCF file) + ploidy <- as.numeric(input$diversity_ploidy) + geno <- input$diversity_file$datapath + + #Status + updateProgressBar(session = session, id = "pb_diversity", value = 20, title = "Importing VCF") + + #Import genotype information if in VCF format + vcf <- read.vcfR(geno) + + #Save position information + diversity_items$pos_df <- data.frame(vcf@fix[, 1:2]) + + #Get items in FORMAT column + info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT + + # Apply the function to the first INFO string + info_ids <- extract_info_ids(info[1]) + + #Status + updateProgressBar(session = session, id = "pb_diversity", value = 40, title = "Converting to Numeric") + + #Get the genotype values if the updog dosage calls are present + if ("UD" %in% info_ids) { + geno_mat <- extract.gt(vcf, element = "UD") + class(geno_mat) <- "numeric" + rm(vcf) #Remove vcf + }else{ + #Extract GT and convert to numeric calls + geno_mat <- extract.gt(vcf, element = "GT") + geno_mat <- apply(geno_mat, 2, convert_to_dosage) + rm(vcf) #Remove VCF + } + + print(class(geno_mat)) + #Convert genotypes to alternate counts if they are the reference allele counts + #Importantly, the dosage plot is based on the input format NOT the converted genotypes + is_reference <- (input$zero_value == "Reference Allele Counts") + + print("Genotype file successfully imported") + ######Get MAF plot (Need to remember that the VCF genotypes are likely set as 0 = homozygous reference, where the dosage report is 0 = homozygous alternate) + + print("Starting percentage calc") + #Status + updateProgressBar(session = session, id = "pb_diversity", value = 70, title = "Calculating...") + # Calculate percentages 
for both genotype matrices + percentages1 <- calculate_percentages(geno_mat, ploidy) + # Combine the data matrices into a single data frame + percentages1_df <- as.data.frame(t(percentages1)) + percentages1_df$Data <- "Dosages" + # Assuming my_data is your dataframe + print("Percentage Complete: melting dataframe") + melted_data <- percentages1_df %>% + pivot_longer(cols = -(Data),names_to = "Dosage", values_to = "Percentage") + + diversity_items$dosage_df <- melted_data + + print("Dosage calculations worked") + + #Convert the genotype calls prior to het,af, and maf calculation + geno_mat <- data.frame(convert_genotype_counts(df = geno_mat, ploidy = ploidy, is_reference), + check.names = FALSE) - #Reactive boxes + # Calculating heterozygosity for a tetraploid organism + diversity_items$het_df <- calculate_heterozygosity(geno_mat, ploidy = ploidy) + + print("Heterozygosity success") + diversity_items$maf_df <- calculateMAF(geno_mat, ploidy = ploidy) + + print("MAF success") + + #Updating value boxes output$mean_het_box <- renderValueBox({ valueBox( - value = 0, + value = round(mean(diversity_items$het_df$ObservedHeterozygosity),3), subtitle = "Mean Heterozygosity", icon = icon("dna"), color = "info" ) }) - output$mean_maf_box <- renderValueBox({ valueBox( - value = 0, + value = round(mean(diversity_items$maf_df$MAF),3), subtitle = "Mean MAF", icon = icon("dna"), color = "info" ) }) - observeEvent(input$diversity_start, { - toggleClass(id = "diversity_ploidy", class = "borderred", condition = (is.na(input$diversity_ploidy) | is.null(input$diversity_ploidy))) - toggleClass(id = "zero_value", class = "borderred", condition = (is.na(input$zero_value) | is.null(input$zero_value))) - - if (is.null(input$diversity_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF File", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = 
"#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$diversity_file, input$diversity_ploidy, input$zero_value) + #Status + updateProgressBar(session = session, id = "pb_diversity", value = 100, title = "Complete!") + }) - #Input variables (need to add support for VCF file) - ploidy <- as.numeric(input$diversity_ploidy) - geno <- input$diversity_file$datapath + box_plot <- reactive({ + validate( + need(!is.null(diversity_items$dosage_df), "Input VCF, define parameters and click `run analysis` to access results in this session.") + ) - #Status - updateProgressBar(session = session, id = "pb_diversity", value = 20, title = "Importing VCF") + #Plotting + box <- ggplot(diversity_items$dosage_df, aes(x=Dosage, y=Percentage, fill=Data)) + + #geom_point(aes(color = Data), position = position_dodge(width = 0.8), width = 0.2, alpha = 0.5) + # Add jittered points + geom_boxplot(position = position_dodge(width = 0.8), alpha = 0.9) + + labs(x = "\nDosage", y = "Percentage\n", title = "Genotype Distribution by Sample") + + theme_bw() + + theme( + axis.text = element_text(size = 14), + axis.title = element_text(size = 14) + ) - #Import genotype information if in VCF format - vcf <- read.vcfR(geno) + box + }) - #Save position information - diversity_items$pos_df <- data.frame(vcf@fix[, 1:2]) + output$dosage_plot <- renderPlot({ + box_plot() + }) - #Get items in FORMAT column - info <- vcf@gt[1,"FORMAT"] #Getting the first row FORMAT + #Het plot + het_plot <- reactive({ + validate( + need(!is.null(diversity_items$het_df) & !is.null(input$hist_bins), "Input VCF, define parameters and click `run analysis` to access results in this session.") + ) + hist(diversity_items$het_df$ObservedHeterozygosity, breaks = as.numeric(input$hist_bins), col = "tan3", border = "black", xlim= c(0,1), + xlab = "Observed Heterozygosity", + ylab = "Number of Samples", + main = "Sample Observed Heterozygosity") - # Apply the function to the first INFO string - info_ids <- 
extract_info_ids(info[1]) + axis(1, at = seq(0, 1, by = 0.1), labels = TRUE) + }) - #Status - updateProgressBar(session = session, id = "pb_diversity", value = 40, title = "Converting to Numeric") + output$het_plot <- renderPlot({ + het_plot() + }) - #Get the genotype values if the updog dosage calls are present - if ("UD" %in% info_ids) { - geno_mat <- extract.gt(vcf, element = "UD") - class(geno_mat) <- "numeric" - rm(vcf) #Remove vcf - }else{ - #Extract GT and convert to numeric calls - geno_mat <- extract.gt(vcf, element = "GT") - geno_mat <- apply(geno_mat, 2, convert_to_dosage) - rm(vcf) #Remove VCF - } + #AF Plot + af_plot <- reactive({ + validate( + need(!is.null(diversity_items$maf_df) & !is.null(input$hist_bins), "Input VCF, define parameters and click `run analysis` to access results in this session.") + ) + hist(diversity_items$maf_df$AF, breaks = as.numeric(input$hist_bins), col = "grey", border = "black", xlab = "Alternate Allele Frequency", + ylab = "Frequency", main = "Alternate Allele Frequency Distribution") + }) - print(class(geno_mat)) - #Convert genotypes to alternate counts if they are the reference allele counts - #Importantly, the dosage plot is based on the input format NOT the converted genotypes - is_reference <- (input$zero_value == "Reference Allele Counts") - - print("Genotype file successfully imported") - ######Get MAF plot (Need to remember that the VCF genotypes are likely set as 0 = homozygous reference, where the dosage report is 0 = homozygous alternate) - - print("Starting percentage calc") - #Status - updateProgressBar(session = session, id = "pb_diversity", value = 70, title = "Calculating...") - # Calculate percentages for both genotype matrices - percentages1 <- calculate_percentages(geno_mat, ploidy) - # Combine the data matrices into a single data frame - percentages1_df <- as.data.frame(t(percentages1)) - percentages1_df$Data <- "Dosages" - # Assuming my_data is your dataframe - print("Percentage Complete: melting 
dataframe") - melted_data <- percentages1_df %>% - pivot_longer(cols = -(Data),names_to = "Dosage", values_to = "Percentage") - - diversity_items$dosage_df <- melted_data - - print("Dosage calculations worked") - - #Convert the genotype calls prior to het,af, and maf calculation - geno_mat <- data.frame(convert_genotype_counts(df = geno_mat, ploidy = ploidy, is_reference), - check.names = FALSE) - - # Calculating heterozygosity for a tetraploid organism - diversity_items$het_df <- calculate_heterozygosity(geno_mat, ploidy = ploidy) - - print("Heterozygosity success") - diversity_items$maf_df <- calculateMAF(geno_mat, ploidy = ploidy) - - print("MAF success") - - #Updating value boxes - output$mean_het_box <- renderValueBox({ - valueBox( - value = round(mean(diversity_items$het_df$ObservedHeterozygosity),3), - subtitle = "Mean Heterozygosity", - icon = icon("dna"), - color = "info" - ) - }) - output$mean_maf_box <- renderValueBox({ - valueBox( - value = round(mean(diversity_items$maf_df$MAF),3), - subtitle = "Mean MAF", - icon = icon("dna"), - color = "info" - ) - }) + output$af_plot <- renderPlot({ + af_plot() + }) - #Status - updateProgressBar(session = session, id = "pb_diversity", value = 100, title = "Complete!") - }) + #MAF plot + maf_plot <- reactive({ + validate( + need(!is.null(diversity_items$maf_df) & !is.null(input$hist_bins), "Input VCF, define parameters and click `run analysis` to access results in this session.") + ) - box_plot <- reactive({ - validate( - need(!is.null(diversity_items$dosage_df), "Input VCF, define parameters and click `run analysis` to access results in this session.") - ) + hist(diversity_items$maf_df$MAF, breaks = as.numeric(input$hist_bins), col = "grey", border = "black", xlab = "Minor Allele Frequency (MAF)", + ylab = "Frequency", main = "Minor Allele Frequency Distribution") + }) - #Plotting - box <- ggplot(diversity_items$dosage_df, aes(x=Dosage, y=Percentage, fill=Data)) + - #geom_point(aes(color = Data), position = 
position_dodge(width = 0.8), width = 0.2, alpha = 0.5) + # Add jittered points - geom_boxplot(position = position_dodge(width = 0.8), alpha = 0.9) + - labs(x = "\nDosage", y = "Percentage\n", title = "Genotype Distribution by Sample") + - theme_bw() + - theme( - axis.text = element_text(size = 14), - axis.title = element_text(size = 14) + #Marker plot + marker_plot <- reactive({ + validate( + need(!is.null(diversity_items$pos_df), "Input VCF, define parameters and click `run analysis` to access results in this session.") + ) + #Order the Chr column + diversity_items$pos_df$POS <- as.numeric(diversity_items$pos_df$POS) + # Sort the dataframe + diversity_items$pos_df <- diversity_items$pos_df[order(diversity_items$pos_df$CHROM), ] + + #Plot + + # Create custom breaks for the x-axis labels (every 13Mb) + x_breaks <- seq(0, max(diversity_items$pos_df$POS), by = (max(diversity_items$pos_df$POS)/5)) + x_breaks <- c(x_breaks, max(diversity_items$pos_df$POS)) # Add 114Mb as a custom break + + # Create custom labels for the x-axis using the 'Mb' suffix + x_labels <- comma_format()(x_breaks / 1000000) + x_labels <- paste0(x_labels, "Mb") + + suppressWarnings({ + markerPlot <- ggplot(diversity_items$pos_df, aes(x = as.numeric(POS), y = CHROM, group = as.factor(CHROM))) + + geom_point(aes(color = as.factor(CHROM)), shape = 108, size = 5, show.legend = FALSE) + + xlab("Position") + + #ylab("Markers\n") + + theme(axis.text = element_text(size = 11, color = "black"), + axis.text.x.top = element_text(size = 11, color = "black"), + axis.title = element_blank(), + panel.grid = element_blank(), + axis.ticks.length.x = unit(-0.15, "cm"), + axis.ticks.margin = unit(0.1, "cm"), + axis.ticks.y = element_blank(), + axis.line.x.top = element_line(color="black"), + panel.background = element_rect(fill="white"), + plot.margin = margin(10, 25, 10, 10) + ) + + scale_x_continuous( + breaks = x_breaks, # Set custom breaks for x-axis labels + labels = x_labels, # Set custom labels with "Mb" 
suffixes + position = "top", # Move x-axis labels and ticks to the top + expand = c(0,0), + limits = c(0,max(diversity_items$pos_df$POS)) ) - - box - }) - - output$dosage_plot <- renderPlot({ - box_plot() }) + #Display plot + markerPlot + }) - #Het plot - het_plot <- reactive({ - validate( - need(!is.null(diversity_items$het_df) & !is.null(input$hist_bins), "Input VCF, define parameters and click `run analysis` to access results in this session.") - ) - hist(diversity_items$het_df$ObservedHeterozygosity, breaks = as.numeric(input$hist_bins), col = "tan3", border = "black", xlim= c(0,1), - xlab = "Observed Heterozygosity", - ylab = "Number of Samples", - main = "Sample Observed Heterozygosity") + output$marker_plot <- renderPlot({ + marker_plot() + }) - axis(1, at = seq(0, 1, by = 0.1), labels = TRUE) - }) + output$maf_plot <- renderPlot({ + maf_plot() + }) - output$het_plot <- renderPlot({ - het_plot() - }) + sample_table <- reactive({ + validate( + need(!is.null(diversity_items$het_df), "Input VCF, define parameters and click `run analysis` to access results in this session.") + ) + diversity_items$het_df + }) - #AF Plot - af_plot <- reactive({ - validate( - need(!is.null(diversity_items$maf_df) & !is.null(input$hist_bins), "Input VCF, define parameters and click `run analysis` to access results in this session.") - ) - hist(diversity_items$maf_df$AF, breaks = as.numeric(input$hist_bins), col = "grey", border = "black", xlab = "Alternate Allele Frequency", - ylab = "Frequency", main = "Alternate Allele Frequency Distribution") - }) + output$sample_table <- renderDT({sample_table()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - output$af_plot <- renderPlot({ - af_plot() - }) + snp_table <- reactive({ + validate( + need(!is.null(diversity_items$maf_df), "Input VCF, define parameters and click `run analysis` to access results in this session.") + ) + diversity_items$maf_df + }) - #MAF plot - maf_plot <- reactive({ - validate( - 
need(!is.null(diversity_items$maf_df) & !is.null(input$hist_bins), "Input VCF, define parameters and click `run analysis` to access results in this session.") - ) + output$snp_table <- renderDT({snp_table()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - hist(diversity_items$maf_df$MAF, breaks = as.numeric(input$hist_bins), col = "grey", border = "black", xlab = "Minor Allele Frequency (MAF)", - ylab = "Frequency", main = "Minor Allele Frequency Distribution") - }) + #Download Figures for Diversity Tab (Need to convert figures to ggplot) + output$download_div_figure <- downloadHandler( - #Marker plot - marker_plot <- reactive({ - validate( - need(!is.null(diversity_items$pos_df), "Input VCF, define parameters and click `run analysis` to access results in this session.") - ) - #Order the Chr column - diversity_items$pos_df$POS <- as.numeric(diversity_items$pos_df$POS) - # Sort the dataframe - diversity_items$pos_df <- diversity_items$pos_df[order(diversity_items$pos_df$CHROM), ] - - #Plot - - # Create custom breaks for the x-axis labels (every 13Mb) - x_breaks <- seq(0, max(diversity_items$pos_df$POS), by = (max(diversity_items$pos_df$POS)/5)) - x_breaks <- c(x_breaks, max(diversity_items$pos_df$POS)) # Add 114Mb as a custom break - - # Create custom labels for the x-axis using the 'Mb' suffix - x_labels <- comma_format()(x_breaks / 1000000) - x_labels <- paste0(x_labels, "Mb") - - suppressWarnings({ - markerPlot <- ggplot(diversity_items$pos_df, aes(x = as.numeric(POS), y = CHROM, group = as.factor(CHROM))) + - geom_point(aes(color = as.factor(CHROM)), shape = 108, size = 5, show.legend = FALSE) + - xlab("Position") + - #ylab("Markers\n") + - theme(axis.text = element_text(size = 11, color = "black"), - axis.text.x.top = element_text(size = 11, color = "black"), - axis.title = element_blank(), - panel.grid = element_blank(), - axis.ticks.length.x = unit(-0.15, "cm"), - axis.ticks.margin = unit(0.1, "cm"), - axis.ticks.y = element_blank(), - 
axis.line.x.top = element_line(color="black"), - panel.background = element_rect(fill="white"), - plot.margin = margin(10, 25, 10, 10) - ) + - scale_x_continuous( - breaks = x_breaks, # Set custom breaks for x-axis labels - labels = x_labels, # Set custom labels with "Mb" suffixes - position = "top", # Move x-axis labels and ticks to the top - expand = c(0,0), - limits = c(0,max(diversity_items$pos_df$POS)) - ) - }) - #Display plot - markerPlot - }) + filename = function() { + if (input$div_image_type == "jpeg") { + paste("genomic-diversity-", Sys.Date(), ".jpg", sep="") + } else if (input$div_image_type == "png") { + paste("genomic-diversity-", Sys.Date(), ".png", sep="") + } else { + paste("genomic-diversity-", Sys.Date(), ".tiff", sep="") + } + }, + content = function(file) { + req(input$div_figure) + + if (input$div_image_type == "jpeg") { + jpeg(file, width = as.numeric(input$div_image_width), height = as.numeric(input$div_image_height), res= as.numeric(input$div_image_res), units = "in") + } else if (input$div_image_type == "png") { + png(file, width = as.numeric(input$div_image_width), height = as.numeric(input$div_image_height), res= as.numeric(input$div_image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$div_image_width), height = as.numeric(input$div_image_height), res= as.numeric(input$div_image_res), units = "in") + } - output$marker_plot <- renderPlot({ - marker_plot() - }) + # Conditional plotting based on input selection + if (input$div_figure == "Dosage Plot") { + print(box_plot()) + } else if (input$div_figure == "AF Histogram") { + af_plot() + } else if (input$div_figure == "MAF Histogram") { + maf_plot() + } else if (input$div_figure == "OHet Histogram") { + het_plot() + } else if (input$div_figure == "Marker Plot") { + print(marker_plot()) + } - output$maf_plot <- renderPlot({ - maf_plot() - }) + dev.off() + } - sample_table <- reactive({ - validate( - need(!is.null(diversity_items$het_df), "Input VCF, define parameters 
and click `run analysis` to access results in this session.") - ) - diversity_items$het_df - }) + ) - output$sample_table <- renderDT({sample_table()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + #Download files for Genotype Diversity + output$download_div_file <- downloadHandler( + filename = function() { + paste0("genomic-diversity-results-", Sys.Date(), ".zip") + }, + content = function(file) { + # Temporary files list + temp_dir <- tempdir() + temp_files <- c() + + if (!is.null(diversity_items$het_df)) { + # Create a temporary file for assignments + het_file <- file.path(temp_dir, paste0("Sample-statistics-", Sys.Date(), ".csv")) + write.csv(diversity_items$het_df, het_file, row.names = FALSE) + temp_files <- c(temp_files, het_file) + } - snp_table <- reactive({ - validate( - need(!is.null(diversity_items$maf_df), "Input VCF, define parameters and click `run analysis` to access results in this session.") - ) - diversity_items$maf_df - }) + if (!is.null(diversity_items$maf_df)) { + # Create a temporary file for BIC data frame + maf_file <- file.path(temp_dir, paste0("SNP-statistics-", Sys.Date(), ".csv")) + write.csv(diversity_items$maf_df, maf_file, row.names = FALSE) + temp_files <- c(temp_files, maf_file) + } - output$snp_table <- renderDT({snp_table()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - #Download Figures for Diversity Tab (Need to convert figures to ggplot) - output$download_div_figure <- downloadHandler( - - filename = function() { - if (input$div_image_type == "jpeg") { - paste("genomic-diversity-", Sys.Date(), ".jpg", sep="") - } else if (input$div_image_type == "png") { - paste("genomic-diversity-", Sys.Date(), ".png", sep="") - } else { - paste("genomic-diversity-", Sys.Date(), ".tiff", sep="") - } - }, - content = function(file) { - req(input$div_figure) - - if (input$div_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$div_image_width), height = 
as.numeric(input$div_image_height), res= as.numeric(input$div_image_res), units = "in") - } else if (input$div_image_type == "png") { - png(file, width = as.numeric(input$div_image_width), height = as.numeric(input$div_image_height), res= as.numeric(input$div_image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$div_image_width), height = as.numeric(input$div_image_height), res= as.numeric(input$div_image_res), units = "in") - } - - # Conditional plotting based on input selection - if (input$div_figure == "Dosage Plot") { - print(box_plot()) - } else if (input$div_figure == "AF Histogram") { - af_plot() - } else if (input$div_figure == "MAF Histogram") { - maf_plot() - } else if (input$div_figure == "OHet Histogram") { - het_plot() - } else if (input$div_figure == "Marker Plot") { - print(marker_plot()) - } - - dev.off() + # Zip files only if there's something to zip + if (length(temp_files) > 0) { + zip(file, files = temp_files, extras = "-j") # Using -j to junk paths } - ) + # Optionally clean up + file.remove(temp_files) + } + ) - #Download files for Genotype Diversity - output$download_div_file <- downloadHandler( - filename = function() { - paste0("genomic-diversity-results-", Sys.Date(), ".zip") - }, - content = function(file) { - # Temporary files list - temp_dir <- tempdir() - temp_files <- c() - - if (!is.null(diversity_items$het_df)) { - # Create a temporary file for assignments - het_file <- file.path(temp_dir, paste0("Sample-statistics-", Sys.Date(), ".csv")) - write.csv(diversity_items$het_df, het_file, row.names = FALSE) - temp_files <- c(temp_files, het_file) - } - - if (!is.null(diversity_items$maf_df)) { - # Create a temporary file for BIC data frame - maf_file <- file.path(temp_dir, paste0("SNP-statistics-", Sys.Date(), ".csv")) - write.csv(diversity_items$maf_df, maf_file, row.names = FALSE) - temp_files <- c(temp_files, maf_file) - } - - # Zip files only if there's something to zip - if (length(temp_files) > 0) { - zip(file, 
files = temp_files, extras = "-j") # Using -j to junk paths - } - - # Optionally clean up - file.remove(temp_files) - } - ) + output$download_vcf <- downloadHandler( + filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) - }) - }) } ## To be copied in the UI diff --git a/R/mod_dosage2vcf.R b/R/mod_dosage2vcf.R index d6acf65..52889cc 100644 --- a/R/mod_dosage2vcf.R +++ b/R/mod_dosage2vcf.R @@ -53,136 +53,135 @@ mod_dosage2vcf_ui <- function(id){ #' @importFrom shinyjs enable disable #' #' @noRd -mod_dosage2vcf_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns +mod_dosage2vcf_server <- function(input, output, session, parent_session){ - snp_number <- reactiveVal(0) - disable("download_d2vcf") + ns <- session$ns + + snp_number <- reactiveVal(0) + disable("download_d2vcf") + + #SNP counts value box + output$ReportSnps <- renderValueBox({ + valueBox(snp_number(), "Number of Markers", icon = icon("dna"), color = "info") + }) + + observeEvent(input$run_analysis, { + # Missing input with red border and alerts + toggleClass(id = "d2v_output_name", class = "borderred", condition = (is.na(input$d2v_output_name) | is.null(input$d2v_output_name) | input$d2v_output_name == "")) + toggleClass(id = "dosage2vcf_ploidy", class = "borderred", condition = (is.na(input$dosage2vcf_ploidy) | is.null(input$dosage2vcf_ploidy) | input$dosage2vcf_ploidy == "")) + + if (is.null(input$report_file$datapath) | is.null(input$counts_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload Dose Report and Counts Files", + size = "s", + closeOnEsc = TRUE, + 
closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + req(input$report_file, input$counts_file, input$d2v_output_name, input$dosage2vcf_ploidy) + + dosage_file_df <- read.csv(input$report_file$datapath) + snp_number <- length(dosage_file_df$X.[-c(1:7)]) #SNP counts value box output$ReportSnps <- renderValueBox({ - valueBox(snp_number(), "Number of Markers", icon = icon("dna"), color = "info") + valueBox(snp_number, "Number of Markers", icon = icon("dna"), color = "info") }) - observeEvent(input$run_analysis, { - # Missing input with red border and alerts - toggleClass(id = "d2v_output_name", class = "borderred", condition = (is.na(input$d2v_output_name) | is.null(input$d2v_output_name) | input$d2v_output_name == "")) - toggleClass(id = "dosage2vcf_ploidy", class = "borderred", condition = (is.na(input$dosage2vcf_ploidy) | is.null(input$dosage2vcf_ploidy) | input$dosage2vcf_ploidy == "")) - - if (is.null(input$report_file$datapath) | is.null(input$counts_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload Dose Report and Counts Files", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } + enable("download_d2vcf") + }) + + output$download_dose <- downloadHandler( + filename = function() { + paste0("BIGapp_Dose_Report_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_DArT_Allele_Dose_Report.csv", package = "BIGapp") + file.copy(ex, file) + }) + + output$download_counts <- downloadHandler( + filename = function() { + paste0("BIGapp_Counts_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_DArT_Counts.csv", package = "BIGapp") + file.copy(ex, file) + }) 
+ + ##This is for the DArT files conversion to VCF + output$download_d2vcf <- downloadHandler( + filename = function() { + paste0(input$d2v_output_name, ".vcf.gz") + }, + content = function(file) { + # Ensure the files are uploaded req(input$report_file, input$counts_file, input$d2v_output_name, input$dosage2vcf_ploidy) - dosage_file_df <- read.csv(input$report_file$datapath) - snp_number <- length(dosage_file_df$X.[-c(1:7)]) + # Get the uploaded file paths + dosage_file <- input$report_file$datapath + counts_file <- input$counts_file$datapath + ploidy <- input$dosage2vcf_ploidy - #SNP counts value box - output$ReportSnps <- renderValueBox({ - valueBox(snp_number, "Number of Markers", icon = icon("dna"), color = "info") - }) + # Use a temporary file path without appending .vcf + temp_base <- tempfile() - enable("download_d2vcf") - }) + #Status + updateProgressBar(session = session, id = "dosage2vcf_pb", value = 50, title = "Converting DArT files to VCF") - output$download_dose <- downloadHandler( - filename = function() { - paste0("BIGapp_Dose_Report_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_DArT_Allele_Dose_Report.csv", package = "BIGapp") - file.copy(ex, file) - }) - - output$download_counts <- downloadHandler( - filename = function() { - paste0("BIGapp_Counts_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_DArT_Counts.csv", package = "BIGapp") - file.copy(ex, file) - }) - - ##This is for the DArT files conversion to VCF - output$download_d2vcf <- downloadHandler( - filename = function() { - paste0(input$d2v_output_name, ".vcf.gz") - }, - content = function(file) { - # Ensure the files are uploaded - req(input$report_file, input$counts_file, input$d2v_output_name, input$dosage2vcf_ploidy) - - # Get the uploaded file paths - dosage_file <- input$report_file$datapath - counts_file <- input$counts_file$datapath - ploidy <- input$dosage2vcf_ploidy - - # Use a temporary file path without appending 
.vcf - temp_base <- tempfile() - - #Status - updateProgressBar(session = session, id = "dosage2vcf_pb", value = 50, title = "Converting DArT files to VCF") - - # Convert to VCF using the BIGr package - cat("Running BIGr::dosage2vcf...\n") - dosage2vcf( - dart.report = dosage_file, - dart.counts = counts_file, - output.file = temp_base, - ploidy = as.numeric(ploidy) - ) + # Convert to VCF using the BIGr package + cat("Running BIGr::dosage2vcf...\n") + dosage2vcf( + dart.report = dosage_file, + dart.counts = counts_file, + output.file = temp_base, + ploidy = as.numeric(ploidy) + ) - # The output file should be temp_base.vcf - output_name <- paste0(temp_base, ".vcf") + # The output file should be temp_base.vcf + output_name <- paste0(temp_base, ".vcf") - # Check if the VCF file was created - if (file.exists(output_name)) { - cat("VCF file created successfully.\n") + # Check if the VCF file was created + if (file.exists(output_name)) { + cat("VCF file created successfully.\n") - # Compress the VCF file using gzip - gzip_file <- paste0(output_name, ".gz") - gz <- gzfile(gzip_file, "w") - writeLines(readLines(output_name), gz) - close(gz) + # Compress the VCF file using gzip + gzip_file <- paste0(output_name, ".gz") + gz <- gzfile(gzip_file, "w") + writeLines(readLines(output_name), gz) + close(gz) - # Check if the gzip file was created - if (file.exists(gzip_file)) { - cat("Gzip file created successfully.\n") + # Check if the gzip file was created + if (file.exists(gzip_file)) { + cat("Gzip file created successfully.\n") - # Move the compressed file to the path specified by 'file' - file.copy(gzip_file, file) + # Move the compressed file to the path specified by 'file' + file.copy(gzip_file, file) - # Delete the temporary files - unlink(gzip_file) - unlink(output_name) + # Delete the temporary files + unlink(gzip_file) + unlink(output_name) - cat("Temporary files deleted successfully.\n") - } else { - stop("Error: Failed to create the gzip file.") - } + cat("Temporary 
files deleted successfully.\n") } else { - stop("Error: Failed to create the VCF file.") + stop("Error: Failed to create the gzip file.") } - - #Status - updateProgressBar(session = session, id = "dosage2vcf_pb", value = 100, title = "Complete! - Downloading VCF") + } else { + stop("Error: Failed to create the VCF file.") } - ) - }) + + #Status + updateProgressBar(session = session, id = "dosage2vcf_pb", value = 100, title = "Complete! - Downloading VCF") + } + ) } ## To be copied in the UI diff --git a/R/mod_gwas.R b/R/mod_gwas.R index e1aef38..4929fea 100644 --- a/R/mod_gwas.R +++ b/R/mod_gwas.R @@ -35,10 +35,12 @@ mod_gwas_ui <- function(id){ sliderInput(ns("cores"), "Number of CPU Cores", min = 1, max = (availableCores() - 1), value = 1, step = 1), actionButton(ns("gwas_start"), "Run Analysis"), div(style="display:inline-block; float:right",dropdownButton( - tags$h3("GWAS Parameters"), - "You can download examples of the expected input input files here: \n", - downloadButton(ns('download_vcf'), "Download VCF Example File"), - downloadButton(ns('download_pheno'), "Download Passport Example File"), + HTML("Input files"), + p(downloadButton(ns('download_vcf'),""), "VCF Example File"), + p(downloadButton(ns('download_pheno'),""), "Passport Example File"), hr(), + p(HTML("Parameters description:"), actionButton(ns("goGWASpar"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("Graphics description:"), actionButton(ns("goGWASgraph"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("GWASpoly tutorial:"), actionButton(ns("goGWASpoly"), icon("arrow-up-right-from-square", verify_fa = FALSE), onclick ="window.open('https://jendelman.github.io/GWASpoly/GWASpoly.html', '_blank')" )), circle = FALSE, status = "warning", icon = icon("info"), width = "300px", @@ -95,97 +97,180 @@ mod_gwas_ui <- function(id){ #' @importFrom vcfR read.vcfR #' @importFrom Matrix nearPD #' @importFrom stats BIC as.formula lm logLik median 
model.matrix na.omit prcomp qbeta quantile runif sd setNames +#' @importFrom bs4Dash updatebs4TabItems #' @noRd -mod_gwas_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns - - #Call some plots to NULL so that the spinners do not show before analysis - output$bic_plot <- renderDT(NULL) - output$manhattan_plot <- renderDT(NULL) - output$qq_plot <- renderDT(NULL) - output$bic_table <- renderDT(NULL) - output$gwas_stats <- renderDT(NULL) - - ##GWAS items - gwas_vars <- reactiveValues( - gwas_df = NULL, - manhattan_plots = NULL, - qq_plots = NULL, - bic_df = NULL, - BIC_ggplot = NULL +mod_gwas_server <- function(input, output, session, parent_session){ + + ns <- session$ns + + # Help links + observeEvent(input$goGWASpar, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = parent_session, inputId = "GWAS_tabset", + selected = "GWAS_par") + # expand specific box + updateBox(id = "GWAS_box", action = "toggle", session = parent_session) + }) + + observeEvent(input$goGWASgraph, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = parent_session, inputId = "GWAS_tabset", + selected = "GWAS_results") + # expand specific box + updateBox(id = "GWAS_box", action = "toggle", session = parent_session) + }) + + + #Call some plots to NULL so that the spinners do not show before analysis + output$bic_plot <- renderDT(NULL) + output$manhattan_plot <- renderDT(NULL) + output$qq_plot <- renderDT(NULL) + output$bic_table <- renderDT(NULL) + output$gwas_stats <- renderDT(NULL) + + ##GWAS items + gwas_vars <- reactiveValues( + gwas_df = NULL, + manhattan_plots = NULL, + qq_plots = NULL, + bic_df = NULL, + BIC_ggplot = NULL + ) + + output$qtls_detected <- renderValueBox({ + valueBox( + value = 0, + subtitle = "QTLs 
Detected", + icon = icon("dna"), + color = "info" ) + }) + + observeEvent(input$phenotype_file, { + info_df <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) + trait_var <- colnames(info_df) + trait_var <- trait_var[2:length(trait_var)] + updateSelectInput(session, "trait_info", choices = c(trait_var)) + updateVirtualSelect("fixed_info", choices = trait_var, session = session) + }) - output$qtls_detected <- renderValueBox({ - valueBox( - value = 0, - subtitle = "QTLs Detected", - icon = icon("dna"), - color = "info" + #GWAS analysis (Shufen Chen and Meng Lin pipelines) + observeEvent(input$gwas_start, { + + toggleClass(id = "gwas_ploidy", class = "borderred", condition = (is.na(input$gwas_ploidy) | is.null(input$gwas_ploidy))) + toggleClass(id = "trait_info", class = "borderred", condition = (all(is.na(input$trait_info)) | all(is.null(input$trait_info)))) + + if (is.null(input$phenotype_file$datapath) | is.null(input$gwas_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload VCF and phenotype files", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE ) - }) + } + req(input$phenotype_file$datapath, input$gwas_file$datapath, input$gwas_ploidy, input$trait_info) + + cores <- input$cores + #Status + updateProgressBar(session = session, id = "pb_gwas", value = 0, title = "Uploading Data") + + #Make subset phenotype file (need to develop alternative method that does not save a new phenotype file each time.) 
+ #I think I can subset the read.GWAS file pheno and fixed categories (data@pheno[,c("trait")]) and data@fixed = phenotype_file[,c("List of fixed traits")] + phenotype_file <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE) + + # Remove empty lines + rm.empty <- which(apply(phenotype_file, 1, function(x) all(is.na(x) | x == ""))) + if(length(rm.empty) > 0){ + warning(paste("Removing", length(rm.empty),"empty lines")) + phenotype_file <- phenotype_file[-rm.empty,] + } + + ids <- colnames(phenotype_file)[1] + traits <- input$trait_info + fixed <- input$fixed_info + included_var <- c(ids, traits, fixed) + ploidy <- as.numeric(input$gwas_ploidy) + + # Check if traits are numerical + n_traits <- as.matrix(phenotype_file[,traits]) + n_traits <- apply(n_traits, 2, function(x) all(is.na(as.numeric(x)))) + + if(any(n_traits)){ + shinyalert( + title = "Input not supported", + text = paste("All selected traits must be numerical. Categorial traits found:",if(length(n_traits) > 1) names(which(n_traits)) else input$trait_info), + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } - observeEvent(input$phenotype_file, { - info_df <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) - trait_var <- colnames(info_df) - trait_var <- trait_var[2:length(trait_var)] - updateSelectInput(session, "trait_info", choices = c(trait_var)) - updateVirtualSelect("fixed_info", choices = trait_var, session = session) - }) + validate( + need(!any(n_traits), "The selected traits must be numerical.") + ) - #GWAS analysis (Shufen Chen and Meng Lin pipelines) - observeEvent(input$gwas_start, { + phenotype_file <- phenotype_file[,included_var] - toggleClass(id = "gwas_ploidy", class = "borderred", condition = (is.na(input$gwas_ploidy) | 
is.null(input$gwas_ploidy))) - toggleClass(id = "trait_info", class = "borderred", condition = (all(is.na(input$trait_info)) | all(is.null(input$trait_info)))) + # Create a temporary file for the selected phenotype data + temp_pheno_file <- tempfile(fileext = ".csv") - if (is.null(input$phenotype_file$datapath) | is.null(input$gwas_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF and phenotype files", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$phenotype_file$datapath, input$gwas_file$datapath, input$gwas_ploidy, input$trait_info) + #Save new phenotype file with selected traits and fixed effects + write.csv(phenotype_file, file = temp_pheno_file, row.names = FALSE) - cores <- input$cores - #Status - updateProgressBar(session = session, id = "pb_gwas", value = 0, title = "Uploading Data") + #Status + updateProgressBar(session = session, id = "pb_gwas", value = 5, title = "Upload Complete: Now Formatting GWASpoly Data") - #Make subset phenotype file (need to develop alternative method that does not save a new phenotype file each time.) 
- #I think I can subset the read.GWAS file pheno and fixed categories (data@pheno[,c("trait")]) and data@fixed = phenotype_file[,c("List of fixed traits")] - phenotype_file <- read.csv(input$phenotype_file$datapath, header = TRUE, check.names = FALSE) + #Geno file path + file_path <- input$gwas_file$datapath - # Remove empty lines - rm.empty <- which(apply(phenotype_file, 1, function(x) all(is.na(x) | x == ""))) - if(length(rm.empty) > 0){ - warning(paste("Removing", length(rm.empty),"empty lines")) - phenotype_file <- phenotype_file[-rm.empty,] - } + #Geno.file conversion if needed + if (grepl("\\.csv$", file_path)) { + #TODO: Add check for matches of sample names in genotype and phenotype data - ids <- colnames(phenotype_file)[1] - traits <- input$trait_info - fixed <- input$fixed_info - included_var <- c(ids, traits, fixed) - ploidy <- as.numeric(input$gwas_ploidy) + data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=input$gwas_file$datapath, + format="numeric", n.traits=length(traits), delim=",") #only need to change files here - # Check if traits are numerical - n_traits <- as.matrix(phenotype_file[,traits]) - n_traits <- apply(n_traits, 2, function(x) all(is.na(as.numeric(x)))) + } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { + # Create a temporary file for the selected phenotype data + temp_geno_file <- tempfile(fileext = ".csv") + + #Convert VCF file if submitted + vcf <- read.vcfR(input$gwas_file$datapath) + + #Extract GT + geno_mat <- extract.gt(vcf, element = "GT") + geno_mat <- apply(geno_mat, 2, convert_to_dosage) + class(geno_mat) <- "numeric" + info <- data.frame(vcf@fix) + gpoly_df <- cbind(info[,c("ID","CHROM","POS")], geno_mat) - if(any(n_traits)){ + if(!any(colnames(gpoly_df) %in% phenotype_file$Sample_ID)) { shinyalert( - title = "Input not supported", - text = paste("All selected traits must be numerical. 
Categorial traits found:",if(length(n_traits) > 1) names(which(n_traits)) else input$trait_info), + title = "Samples ID do not match", + text = paste("Check if passport/phenotype files have same sample ID as the VCF/genotype file."), size = "s", closeOnEsc = TRUE, closeOnClickOutside = FALSE, @@ -197,371 +282,314 @@ mod_gwas_server <- function(id){ showCancelButton = FALSE, animation = TRUE ) - } + } validate( - need(!any(n_traits), "The selected traits must be numerical.") + need(any(colnames(gpoly_df) %in% phenotype_file$Sample_ID), "The selected traits must be numerical.") ) - phenotype_file <- phenotype_file[,included_var] + write.csv(gpoly_df, file = temp_geno_file, row.names = FALSE) + + data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=temp_geno_file, + format="numeric", n.traits=length(traits), delim=",") + rm(geno_mat) + rm(gpoly_df) + rm(vcf) + + } else { + + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No valid genotype file detected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) - # Create a temporary file for the selected phenotype data - temp_pheno_file <- tempfile(fileext = ".csv") + #Stop the analysis + return() + } - #Save new phenotype file with selected traits and fixed effects - write.csv(phenotype_file, file = temp_pheno_file, row.names = FALSE) + data.loco <- set.K(data,LOCO=F,n.core= as.numeric(cores)) - #Status - updateProgressBar(session = session, id = "pb_gwas", value = 5, title = "Upload Complete: Now Formatting GWASpoly Data") - - #Geno file path - file_path <- input$gwas_file$datapath - - #Geno.file conversion if needed - if (grepl("\\.csv$", file_path)) { - #TODO: Add check for matches of sample names in genotype and phenotype data - - data <- 
read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=input$gwas_file$datapath, - format="numeric", n.traits=length(traits), delim=",") #only need to change files here - - } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { - # Create a temporary file for the selected phenotype data - temp_geno_file <- tempfile(fileext = ".csv") - - #Convert VCF file if submitted - vcf <- read.vcfR(input$gwas_file$datapath) - - #Extract GT - geno_mat <- extract.gt(vcf, element = "GT") - geno_mat <- apply(geno_mat, 2, convert_to_dosage) - class(geno_mat) <- "numeric" - info <- data.frame(vcf@fix) - gpoly_df <- cbind(info[,c("ID","CHROM","POS")], geno_mat) - - if(!any(colnames(gpoly_df) %in% phenotype_file$Sample_ID)) { - shinyalert( - title = "Samples ID do not match", - text = paste("Check if passport/phenotype files have same sample ID as the VCF/genotype file."), - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - - } - validate( - need(any(colnames(gpoly_df) %in% phenotype_file$Sample_ID), "The selected traits must be numerical.") - ) + #Delete temp pheno file + unlink(temp_pheno_file) - write.csv(gpoly_df, file = temp_geno_file, row.names = FALSE) + ####Pheno, kinship, PCs from results of GWASpoly + GE<- data@pheno + colnames(GE)[1]<-"Genotype" - data <- read.GWASpoly(ploidy= ploidy, pheno.file= temp_pheno_file, geno.file=temp_geno_file, - format="numeric", n.traits=length(traits), delim=",") - rm(geno_mat) - rm(gpoly_df) - rm(vcf) + ## kinship + Kin<- data.loco@K$all - } else { + ## PCs + PC_all<- eigen(data.loco@K$all)$vectors + rownames(PC_all) <- rownames(data.loco@K$all) + PCs<-PC_all[,1:10] + colnames(PCs)<-c(paste("PC",1:10,sep="")) - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "No valid genotype file 
detected", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) + ## + taxa<-intersect(GE$Genotype,intersect(rownames(PCs),rownames(Kin))) + PCs<-PCs[which(rownames(PCs) %in% taxa),] + PCs<-PCs[order(rownames(PCs)),] - #Stop the analysis - return() - } + GE<-GE[which(GE$Genotype %in% taxa),] + GE<-GE[order(GE$Genotype),] - data.loco <- set.K(data,LOCO=F,n.core= as.numeric(cores)) + Kin<-Kin[which(rownames(Kin) %in% taxa),which(rownames(Kin) %in% taxa)] # need check the matrix after this step + Kin<-Kin[order(rownames(Kin)),order(colnames(Kin))] - #Delete temp pheno file - unlink(temp_pheno_file) + #### calculate BIC + #Status + updateProgressBar(session = session, id = "pb_gwas", value = 20, title = "Formatting Complete: Now Calculating BIC") - ####Pheno, kinship, PCs from results of GWASpoly - GE<- data@pheno - colnames(GE)[1]<-"Genotype" + PC<-as.matrix(PCs) + K=as.matrix(Kin) - ## kinship - Kin<- data.loco@K$all + kin.adj<-posdefmat(K) + kin.test<-as.matrix(kin.adj) - ## PCs - PC_all<- eigen(data.loco@K$all)$vectors - rownames(PC_all) <- rownames(data.loco@K$all) - PCs<-PC_all[,1:10] - colnames(PCs)<-c(paste("PC",1:10,sep="")) + for (i in 2:ncol(GE)){ - ## - taxa<-intersect(GE$Genotype,intersect(rownames(PCs),rownames(Kin))) - PCs<-PCs[which(rownames(PCs) %in% taxa),] - PCs<-PCs[order(rownames(PCs)),] + #model selection + y=as.numeric(GE[,i]) - GE<-GE[which(GE$Genotype %in% taxa),] - GE<-GE[order(GE$Genotype),] + BICs<-CalcBIC(y=y,PC=PC,K=kin.test) + BICs$BIC$withK + BICs$BIC$withoutK - Kin<-Kin[which(rownames(Kin) %in% taxa),which(rownames(Kin) %in% taxa)] # need check the matrix after this step - Kin<-Kin[order(rownames(Kin)),order(colnames(Kin))] + plotBICs<-cbind(rbind.data.frame(BICs$BIC$withK,BICs$BIC$withoutK),rep(c("w/Kinship","no 
Kinship"),each=nrow(BICs$BIC$withK))) + colnames(plotBICs)[ncol(plotBICs)]<-"RelationshipMatrix" + plotBICs$n.PC<-factor(plotBICs$n.PC,levels=c("0","1","2","3","4","5", + "6","7","8","9","10")) + plotBICs_kinship <- subset(plotBICs,plotBICs$RelationshipMatrix =="w/Kinship") + output$bic_table <- renderDT({plotBICs_kinship}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5) + ) - #### calculate BIC - #Status - updateProgressBar(session = session, id = "pb_gwas", value = 20, title = "Formatting Complete: Now Calculating BIC") + p1<-ggplot(plotBICs_kinship, aes(x=n.PC, y=BIC,group=RelationshipMatrix)) + + geom_line(color="grey")+ + geom_point(shape=21, color="black", fill="#d95f0e", size=3)+ + theme(text=element_text(size=15),axis.text.x = element_text(angle =0), + panel.grid.major = element_blank(), panel.grid.minor = element_blank(), + panel.background = element_blank(), axis.line = element_line(colour = "black"))+ + labs(x = "Number of PCs",y="BIC") - PC<-as.matrix(PCs) - K=as.matrix(Kin) + #Save BIC plot + gwas_vars$BIC_ggplot <- p1 - kin.adj<-posdefmat(K) - kin.test<-as.matrix(kin.adj) + #Display BIC figure + output$bic_plot <- renderPlot({ + print(p1) + }) + #dev.off() - for (i in 2:ncol(GE)){ + #Save BIC plot info + gwas_vars$bic_df <- plotBICs_kinship - #model selection - y=as.numeric(GE[,i]) + #Status + updateProgressBar(session = session, id = "pb_gwas", value = 40, title = "BIC Complete: Now Performing GWAS") + + ##GWAS based on model selection + N <- nrow(data@pheno) #Population size + #Select models depending on ploidy + if (ploidy > 2) { + model <- c("additive","1-dom","2-dom","general","diplo-general","diplo-additive") + updateSelectInput(session, "model_select", choices = c("all", model)) + }else{ + model <- c("additive", "1-dom") + updateSelectInput(session, "model_select", choices = c("all", model)) + } - BICs<-CalcBIC(y=y,PC=PC,K=kin.test) - BICs$BIC$withK - BICs$BIC$withoutK + BIC_min <- 
plotBICs_kinship[which.min(plotBICs_kinship$BIC),] + if(BIC_min$n.PC == 0){params <- set.params(geno.freq = 1 - 5/N)}else{params <- set.params(geno.freq = 1 - 5/N,n.PC = as.numeric(levels(BIC_min$n.PC))[BIC_min$n.PC])} + data.loco.scan <- GWASpoly(data=data.loco,models=model,traits=colnames(data@pheno[i]),params=params,n.core=as.numeric(cores)) + #Consider adding options for different thresholds + data2 <- set.threshold(data.loco.scan,method=input$gwas_threshold,level=0.05) - plotBICs<-cbind(rbind.data.frame(BICs$BIC$withK,BICs$BIC$withoutK),rep(c("w/Kinship","no Kinship"),each=nrow(BICs$BIC$withK))) - colnames(plotBICs)[ncol(plotBICs)]<-"RelationshipMatrix" - plotBICs$n.PC<-factor(plotBICs$n.PC,levels=c("0","1","2","3","4","5", - "6","7","8","9","10")) - plotBICs_kinship <- subset(plotBICs,plotBICs$RelationshipMatrix =="w/Kinship") - output$bic_table <- renderDT({plotBICs_kinship}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5) - ) - p1<-ggplot(plotBICs_kinship, aes(x=n.PC, y=BIC,group=RelationshipMatrix)) + - geom_line(color="grey")+ - geom_point(shape=21, color="black", fill="#d95f0e", size=3)+ - theme(text=element_text(size=15),axis.text.x = element_text(angle =0), - panel.grid.major = element_blank(), panel.grid.minor = element_blank(), - panel.background = element_blank(), axis.line = element_line(colour = "black"))+ - labs(x = "Number of PCs",y="BIC") + #Save manhattan plots to list (only for single trait analysis) + #if length(traits) == 1 + manhattan_plot_list <- list() - #Save BIC plot - gwas_vars$BIC_ggplot <- p1 + #plot for six models per trait + manhattan_plot_list[["all"]] <- manhattan.plot(data2,traits=colnames(data@pheno[i]), models = model)+geom_point(size=3)+theme(text = element_text(size = 25),strip.text = element_text(face = "bold")) - #Display BIC figure - output$bic_plot <- renderPlot({ - print(p1) - }) - #dev.off() + #Output the manhattan plots + output$manhattan_plot <- renderPlot({ - #Save BIC plot info - gwas_vars$bic_df 
<- plotBICs_kinship + print(manhattan_plot_list[[input$model_select]]) - #Status - updateProgressBar(session = session, id = "pb_gwas", value = 40, title = "BIC Complete: Now Performing GWAS") + }) - ##GWAS based on model selection - N <- nrow(data@pheno) #Population size - #Select models depending on ploidy - if (ploidy > 2) { - model <- c("additive","1-dom","2-dom","general","diplo-general","diplo-additive") - updateSelectInput(session, "model_select", choices = c("all", model)) - }else{ - model <- c("additive", "1-dom") - updateSelectInput(session, "model_select", choices = c("all", model)) - } - BIC_min <- plotBICs_kinship[which.min(plotBICs_kinship$BIC),] - if(BIC_min$n.PC == 0){params <- set.params(geno.freq = 1 - 5/N)}else{params <- set.params(geno.freq = 1 - 5/N,n.PC = as.numeric(levels(BIC_min$n.PC))[BIC_min$n.PC])} - data.loco.scan <- GWASpoly(data=data.loco,models=model,traits=colnames(data@pheno[i]),params=params,n.core=as.numeric(cores)) - #Consider adding options for different thresholds - data2 <- set.threshold(data.loco.scan,method=input$gwas_threshold,level=0.05) + #get most significant SNPs per QTL file + qtl <- get.QTL(data=data2,traits=colnames(data@pheno[i]),bp.window=5e6) + qtl_d <- data.frame(qtl) + #Save QTL info + gwas_vars$gwas_df <- qtl_d - #Save manhattan plots to list (only for single trait analysis) - #if length(traits) == 1 - manhattan_plot_list <- list() + output$gwas_stats <- renderDT({qtl_d}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - #plot for six models per trait - manhattan_plot_list[["all"]] <- manhattan.plot(data2,traits=colnames(data@pheno[i]), models = model)+geom_point(size=3)+theme(text = element_text(size = 25),strip.text = element_text(face = "bold")) + #Updating value boxes + output$qtls_detected <- renderValueBox({ + valueBox( + value = length(unique(qtl_d$Position)), + subtitle = "QTLs Detected", + icon = icon("dna"), + color = "info" + ) + }) - #Output the manhattan plots - 
output$manhattan_plot <- renderPlot({ + #Status + updateProgressBar(session = session, id = "pb_gwas", value = 80, title = "GWAS Complete: Now Plotting Results") - print(manhattan_plot_list[[input$model_select]]) + #get qqplot + data_qq <- cbind.data.frame(SNP=data.loco.scan@map$Marker,Chr=data.loco.scan@map$Chrom, Pos=data.loco.scan@map$Position,10^(-data.loco.scan@scores[[colnames(data@pheno[i])]])) - }) + #Save qq_plot info + gwas_vars$qq_plots <- data_qq + output$qq_plot <- renderPlot({ + CMplot_shiny(data_qq,plot.type="q",col=c(1:8), + ylab.pos=2, + file.name=colnames(data@pheno[i]), + conf.int=FALSE, + box=F,multraits=TRUE,file.output=FALSE) + }) - #get most significant SNPs per QTL file - qtl <- get.QTL(data=data2,traits=colnames(data@pheno[i]),bp.window=5e6) - qtl_d <- data.frame(qtl) + #plot for each model per trait + for (j in 1:length(model)) { + print(j) - #Save QTL info - gwas_vars$gwas_df <- qtl_d + data.loco.scan_2 <- GWASpoly(data=data.loco,models=model[j], + traits=colnames(data@pheno[i]),params=params,n.core= as.numeric(cores)) - output$gwas_stats <- renderDT({qtl_d}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + data3 <- set.threshold(data.loco.scan_2,method="M.eff",level=0.05) + manhattan_plot_list[[model[j]]] <- manhattan.plot(data3,traits=colnames(data@pheno[i]))+geom_point(size=3)+theme(text = element_text(size = 25),strip.text = element_text(face = "bold")) + } - #Updating value boxes - output$qtls_detected <- renderValueBox({ - valueBox( - value = length(unique(qtl_d$Position)), - subtitle = "QTLs Detected", - icon = icon("dna"), - color = "info" - ) - }) + #Save manhattan plots + gwas_vars$manhattan_plots <- manhattan_plot_list - #Status - updateProgressBar(session = session, id = "pb_gwas", value = 80, title = "GWAS Complete: Now Plotting Results") + } - #get qqplot - data_qq <- cbind.data.frame(SNP=data.loco.scan@map$Marker,Chr=data.loco.scan@map$Chrom, 
Pos=data.loco.scan@map$Position,10^(-data.loco.scan@scores[[colnames(data@pheno[i])]])) + #Status + updateProgressBar(session = session, id = "pb_gwas", value = 100, status = "success", title = "Finished") - #Save qq_plot info - gwas_vars$qq_plots <- data_qq + }) - output$qq_plot <- renderPlot({ - CMplot_shiny(data_qq,plot.type="q",col=c(1:8), - ylab.pos=2, - file.name=colnames(data@pheno[i]), - conf.int=FALSE, - box=F,multraits=TRUE,file.output=FALSE) - }) + #Download files for GWAS + output$download_gwas_file <- downloadHandler( + filename = function() { + paste0("GWAS-results-", Sys.Date(), ".zip") + }, + content = function(file) { + # Temporary files list + temp_dir <- tempdir() + temp_files <- c() + + if (!is.null(gwas_vars$gwas_df)) { + # Create a temporary file for assignments + gwas_file <- file.path(temp_dir, paste0("QTL-statistics-", Sys.Date(), ".csv")) + write.csv(gwas_vars$gwas_df, gwas_file, row.names = FALSE) + temp_files <- c(temp_files, gwas_file) + } + + if (!is.null(gwas_vars$bic_df)) { + # Create a temporary file for BIC data frame + bic_file <- file.path(temp_dir, paste0("GWAS-BIC-statistics-", Sys.Date(), ".csv")) + write.csv(gwas_vars$bic_df, bic_file, row.names = FALSE) + temp_files <- c(temp_files, bic_file) + } - #plot for each model per trait - for (j in 1:length(model)) { - print(j) + # Zip files only if there's something to zip + if (length(temp_files) > 0) { + zip(file, files = temp_files, extras = "-j") # Using -j to junk paths + } - data.loco.scan_2 <- GWASpoly(data=data.loco,models=model[j], - traits=colnames(data@pheno[i]),params=params,n.core= as.numeric(cores)) + # Optionally clean up + file.remove(temp_files) + } + ) - data3 <- set.threshold(data.loco.scan_2,method="M.eff",level=0.05) - manhattan_plot_list[[model[j]]] <- manhattan.plot(data3,traits=colnames(data@pheno[i]))+geom_point(size=3)+theme(text = element_text(size = 25),strip.text = element_text(face = "bold")) - } - #Save manhattan plots - gwas_vars$manhattan_plots <- 
manhattan_plot_list + #Download Figures for GWAS Tab (Need to convert figures to ggplot) + output$download_gwas_figure <- downloadHandler( + filename = function() { + if (input$gwas_image_type == "jpeg") { + paste("GWAS-", Sys.Date(), ".jpg", sep="") + } else if (input$gwas_image_type == "png") { + paste("GWAS-", Sys.Date(), ".png", sep="") + } else { + paste("GWAS-", Sys.Date(), ".tiff", sep="") + } + }, + content = function(file) { + req(input$gwas_figures) + + if (input$gwas_image_type == "jpeg") { + jpeg(file, width = as.numeric(input$gwas_image_width), height = as.numeric(input$gwas_image_height), res= as.numeric(input$gwas_image_res), units = "in") + } else if (input$gwas_image_type == "png") { + png(file, width = as.numeric(input$gwas_image_width), height = as.numeric(input$gwas_image_height), res= as.numeric(input$gwas_image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$gwas_image_width), height = as.numeric(input$gwas_image_height), res= as.numeric(input$gwas_image_res), units = "in") } - #Status - updateProgressBar(session = session, id = "pb_gwas", value = 100, status = "success", title = "Finished") + # Conditional plotting based on input selection + if (input$gwas_figures == "BIC Plot") { + req(gwas_vars$BIC_ggplot) + print(gwas_vars$BIC_ggplot) - }) + } else if (input$gwas_figures == "Manhattan Plot") { + req(gwas_vars$manhattan_plots, input$model_select) + #Plot + print(gwas_vars$manhattan_plots[[input$model_select]]) - #Download files for GWAS - output$download_gwas_file <- downloadHandler( - filename = function() { - paste0("GWAS-results-", Sys.Date(), ".zip") - }, - content = function(file) { - # Temporary files list - temp_dir <- tempdir() - temp_files <- c() - - if (!is.null(gwas_vars$gwas_df)) { - # Create a temporary file for assignments - gwas_file <- file.path(temp_dir, paste0("QTL-statistics-", Sys.Date(), ".csv")) - write.csv(gwas_vars$gwas_df, gwas_file, row.names = FALSE) - temp_files <- c(temp_files, gwas_file) - 
} - - if (!is.null(gwas_vars$bic_df)) { - # Create a temporary file for BIC data frame - bic_file <- file.path(temp_dir, paste0("GWAS-BIC-statistics-", Sys.Date(), ".csv")) - write.csv(gwas_vars$bic_df, bic_file, row.names = FALSE) - temp_files <- c(temp_files, bic_file) - } - - # Zip files only if there's something to zip - if (length(temp_files) > 0) { - zip(file, files = temp_files, extras = "-j") # Using -j to junk paths - } - - # Optionally clean up - file.remove(temp_files) - } - ) + } else if (input$gwas_figures == "QQ Plot") { + req(gwas_vars$qq_plots) + + source("R/CMplot.r") #Obtained the CMplot code from GitHub and made edits to allow inline plotting for shiny app + #Plot + CMplot_shiny(gwas_vars$qq_plots,plot.type="q",col=c(1:8), + ylab.pos=2, + conf.int=FALSE, + box=F,multraits=TRUE,file.output=FALSE) - #Download Figures for GWAS Tab (Need to convert figures to ggplot) - output$download_gwas_figure <- downloadHandler( - - filename = function() { - if (input$gwas_image_type == "jpeg") { - paste("GWAS-", Sys.Date(), ".jpg", sep="") - } else if (input$gwas_image_type == "png") { - paste("GWAS-", Sys.Date(), ".png", sep="") - } else { - paste("GWAS-", Sys.Date(), ".tiff", sep="") - } - }, - content = function(file) { - req(input$gwas_figures) - - if (input$gwas_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$gwas_image_width), height = as.numeric(input$gwas_image_height), res= as.numeric(input$gwas_image_res), units = "in") - } else if (input$gwas_image_type == "png") { - png(file, width = as.numeric(input$gwas_image_width), height = as.numeric(input$gwas_image_height), res= as.numeric(input$gwas_image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$gwas_image_width), height = as.numeric(input$gwas_image_height), res= as.numeric(input$gwas_image_res), units = "in") - } - - # Conditional plotting based on input selection - if (input$gwas_figures == "BIC Plot") { - req(gwas_vars$BIC_ggplot) - print(gwas_vars$BIC_ggplot) - - 
} else if (input$gwas_figures == "Manhattan Plot") { - req(gwas_vars$manhattan_plots, input$model_select) - #Plot - print(gwas_vars$manhattan_plots[[input$model_select]]) - - } else if (input$gwas_figures == "QQ Plot") { - req(gwas_vars$qq_plots) - - source("R/CMplot.r") #Obtained the CMplot code from GitHub and made edits to allow inline plotting for shiny app - - #Plot - CMplot_shiny(gwas_vars$qq_plots,plot.type="q",col=c(1:8), - ylab.pos=2, - conf.int=FALSE, - box=F,multraits=TRUE,file.output=FALSE) - - } - - dev.off() } - ) - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) - }) + dev.off() + } + ) - output$download_pheno <- downloadHandler( - filename = function() { - paste0("BIGapp_passport_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_passport_file.csv", package = "BIGapp") - file.copy(ex, file) - }) - }) + output$download_vcf <- downloadHandler( + filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) + + output$download_pheno <- downloadHandler( + filename = function() { + paste0("BIGapp_passport_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_passport_file.csv", package = "BIGapp") + file.copy(ex, file) + }) } ## To be copied in the UI diff --git a/R/mod_help.R b/R/mod_help.R index b63e34f..9c8b7dc 100644 --- a/R/mod_help.R +++ b/R/mod_help.R @@ -13,36 +13,72 @@ mod_help_ui <- function(id){ fluidPage( column(width=12), column(width=12, - box(title="Dosage Calling", width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, - bs4Dash::tabsetPanel( - tabPanel("DArT Report2VCF", - "**Draft**This tab is designed to convert the DArT Dose Report and Counts 
files to a VCF file. **DArT Website**" - ), - tabPanel("Updog Dosage Calling", - "**Draft**This tab is designed to handle the process of dosage calling in genomic data. Dosage calling is essential for determining the number of copies of a particular allele at each genomic location. The app likely includes functionalities to upload raw genomic data, apply various filtering criteria, and generate plots to visualize the distribution of dosages. Users can examine histograms for SNP max post probabilities and read depths, which help in assessing the quality and accuracy of the dosage calls.**Updog**" - ), - tabPanel("SNP Filtering", - "Filtering the genotypes" - )) + box(title="DArT Report2VCF", id = "DArT_Report2VCF_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "DArT_Report2VCF_tabset", + tabPanel("Parameters description", value = "DArT_Report2VCF_par", + "**Draft**This tab is designed to convert the DArT Dose Report and Counts files to a VCF file. **DArT Website**" + ), + tabPanel("Results description", value = "DArT_Report2VCF_results", + )) ), - box(title="Population Structure", width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, - bs4Dash::tabsetPanel( - tabPanel("PCA", - "**Draft**This tab focuses on analyzing the population structure using Discriminant Analysis of Principal Components (DAPC) and Principal Component Analysis (PCA). These methods are used to identify and visualize genetic diversity and structure within the population. The app provides options to perform PCA to reduce the dimensionality of the genomic data and visualize principal components. DAPC is used to find clusters within the data and visualize these clusters, helping users understand the genetic relationships and structure in their dataset." 
- ), - tabPanel("DAPC")) - + box(title="Updog Dosage Calling", id = "Updog_Dosage_Calling_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "Updog_Dosage_Calling_tabset", + tabPanel("Parameters description", value = "Updog_Dosage_Calling_par", + ), + tabPanel("Results description", value = "Updog_Dosage_Calling_results", + "**Draft**This tab is designed to handle the process of dosage calling in genomic data. Dosage calling is essential for determining the number of copies of a particular allele at each genomic location. The app likely includes functionalities to upload raw genomic data, apply various filtering criteria, and generate plots to visualize the distribution of dosages. Users can examine histograms for SNP max post probabilities and read depths, which help in assessing the quality and accuracy of the dosage calls.**Updog**" + )) + ), + box(title="VCF Filtering", id = "VCF_Filtering_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "VCF_Filtering_tabset", + tabPanel("Parameters description", value = "VCF_Filtering_par", + "**Draft**This tab is designed to convert the DArT Dose Report and Counts files to a VCF file. **DArT Website**" + ), + tabPanel("Results description", value = "VCF_Filtering_results", + "**Draft**This tab is designed to handle the process of dosage calling in genomic data. Dosage calling is essential for determining the number of copies of a particular allele at each genomic location. The app likely includes functionalities to upload raw genomic data, apply various filtering criteria, and generate plots to visualize the distribution of dosages. 
Users can examine histograms for SNP max post probabilities and read depths, which help in assessing the quality and accuracy of the dosage calls.**Updog**" + )) ), - box(title="Genomic Diversity", width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, - "**Draft**This tab is dedicated to analyzing genomic diversity within the population. It calculates various diversity metrics such as heterozygosity and minor allele frequency (MAF). The app includes functionalities to visualize these metrics through histograms and other plots. Users can download the calculated diversity metrics as CSV files. This tab helps in understanding the genetic variability and distribution of alleles within the population." + box(title="PCA", id = "PCA_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "PCA_tabset", + tabPanel("Parameters description", value = "PCA_par", + ), + tabPanel("Results description", value = "PCA_results", + )) ), - box(title="GWAS", width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, - "**Draft**This tab is for conducting Genome-Wide Association Studies (GWAS) to identify associations between genetic variants and traits of interest. Users can input phenotypic data and specify parameters for the GWAS analysis. The app performs statistical tests to identify significant associations between SNPs and traits, and visualizes the results using Manhattan plots and Q-Q plots. 
This tab helps in identifying potential genetic markers linked to specific traits.**List R packages utilized" + box(title="DAPC", id = "DAPC_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "DAPC_tabset", + tabPanel("Parameters description", value = "DAPC_par", + ), + tabPanel("Results description", value = "DAPC_results", + )) + ), + box(title="Genomic Diversity", id = "Genomic_Diversity_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "Genomic_Diversity_tabset", + tabPanel("Parameters description", value = "Genomic_Diversity_par", + "**Draft**This tab is dedicated to analyzing genomic diversity within the population. It calculates various diversity metrics such as heterozygosity and minor allele frequency (MAF). The app includes functionalities to visualize these metrics through histograms and other plots. Users can download the calculated diversity metrics as CSV files. This tab helps in understanding the genetic variability and distribution of alleles within the population." + + ), + tabPanel("Results description", value = "Genomic_Diversity_results", + )) ), - box(title="Genomic Prediction/Selection", width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, - "**Draft**This tab provides functionalities for genomic prediction, which involves predicting phenotypic traits based on genomic data. Users can input phenotypic and genotypic data, and specify parameters such as the number of cross-validation folds, training percentage, and fixed effects. The app performs genomic prediction using methods such as rrBLUP, and displays the results including cross-validation performance metrics. 
Users can download the prediction results for further analysis.**List R packages utilized" + box(title="GWAS", id = "GWAS_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "GWAS_tabset", + tabPanel("Parameters description", value = "GWAS_par", + "**Draft**This tab is for conducting Genome-Wide Association Studies (GWAS) to identify associations between genetic variants and traits of interest. Users can input phenotypic data and specify parameters for the GWAS analysis. The app performs statistical tests to identify significant associations between SNPs and traits, and visualizes the results using Manhattan plots and Q-Q plots. This tab helps in identifying potential genetic markers linked to specific traits.**List R packages utilized", + ), + tabPanel("Results description", value = "GWAS_results", + )) ), - box(title="How to Cite", width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + box(title="Genomic Prediction/Selection", id = "Genomic_Prediction/Selection_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "Genomic_Prediction/Selection_tabset", + tabPanel("Parameters description", value = "Genomic_Prediction/Selection_par", + "**Draft**This tab is for conducting Genome-Wide Association Studies (GWAS) to identify associations between genetic variants and traits of interest. Users can input phenotypic data and specify parameters for the GWAS analysis. The app performs statistical tests to identify significant associations between SNPs and traits, and visualizes the results using Manhattan plots and Q-Q plots. 
This tab helps in identifying potential genetic markers linked to specific traits.**List R packages utilized", + ), + tabPanel("Results description", value = "Genomic_Prediction/Selection_results", + )) + ), + + box(title="How to Cite", id = "how_to_cite_box", width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, "**Draft**Instructions for citing the app and packages used in analyses" ), ), @@ -55,11 +91,10 @@ mod_help_ui <- function(id){ #' help Server Functions #' #' @noRd -mod_help_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns +mod_help_server <- function(input, output, session, parent_session){ + + ns <- session$ns - }) } ## To be copied in the UI diff --git a/R/mod_mapping.R b/R/mod_mapping.R index d9e0625..ca22d39 100644 --- a/R/mod_mapping.R +++ b/R/mod_mapping.R @@ -4,28 +4,27 @@ #' #' @param id,input,output,session Internal parameters for {shiny}. #' -#' @noRd +#' @noRd #' -#' @importFrom shiny NS tagList +#' @importFrom shiny NS tagList mod_mapping_ui <- function(id){ ns <- NS(id) tagList( - + ) } - + #' mapping Server Functions #' -#' @noRd -mod_mapping_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns - - }) +#' @noRd +mod_mapping_server <- function(input, output, session, parent_session){ + + ns <- session$ns + } - + ## To be copied in the UI # mod_mapping_ui("mapping_1") - + ## To be copied in the server # mod_mapping_server("mapping_1") diff --git a/R/mod_slurm.R b/R/mod_slurm.R index e51dabc..68cbf6e 100644 --- a/R/mod_slurm.R +++ b/R/mod_slurm.R @@ -29,40 +29,39 @@ mod_slurm_ui <- function(id){ #' #' @importFrom DT renderDT datatable #' @noRd -mod_slurm_server <- function(id){ - moduleServer( id, function(input, output, session){ - ns <- session$ns +mod_slurm_server <- function(input, output, session, parent_session){ - ####Command to get information from SLURM as dataframe - #add button to ui for user to cancel their 
scheduled jobs... - get_slurm_jobs <- reactive({ - # Run the SLURM squeue command to get job information - slurm_output <- system("squeue --Format=UserName,JobName,TimeUsed --noheader", intern = TRUE) + ns <- session$ns - # Process the output - job_list <- strsplit(slurm_output, "\\s+") + ####Command to get information from SLURM as dataframe + #add button to ui for user to cancel their scheduled jobs... + get_slurm_jobs <- reactive({ + # Run the SLURM squeue command to get job information + slurm_output <- system("squeue --Format=UserName,JobName,TimeUsed --noheader", intern = TRUE) - # Create a data frame - job_df <- do.call(rbind, job_list) - colnames(job_df) <- c("userID", "JobType", "Duration") + # Process the output + job_list <- strsplit(slurm_output, "\\s+") - # Convert to data frame - job_df <- as.data.frame(job_df, stringsAsFactors = FALSE) + # Create a data frame + job_df <- do.call(rbind, job_list) + colnames(job_df) <- c("userID", "JobType", "Duration") - return(job_df) - }) + # Convert to data frame + job_df <- as.data.frame(job_df, stringsAsFactors = FALSE) - #Display job queue to user - output$job_table <- renderDT({ - #job_df <- get_slurm_jobs() #Use this when I get the above code working on the server - job_df <- data.frame(userID = c("User1","User2","User3","User4","User5"), - JobID = c("000303","000312","000335","000342", "000348"), - JobType = c("Updog Dosage Calling","Updog Dosage Calling", "Updog Dosage Calling", "GWAS", "GWAS"), - Duration = c("Completed: Email notification sent","06:11:43", "03:31:01", "00:46:00", "Scheduled")) + return(job_df) + }) + + #Display job queue to user + output$job_table <- renderDT({ + #job_df <- get_slurm_jobs() #Use this when I get the above code working on the server + job_df <- data.frame(userID = c("User1","User2","User3","User4","User5"), + JobID = c("000303","000312","000335","000342", "000348"), + JobType = c("Updog Dosage Calling","Updog Dosage Calling", "Updog Dosage Calling", "GWAS", "GWAS"), + 
Duration = c("Completed: Email notification sent","06:11:43", "03:31:01", "00:46:00", "Scheduled")) - # Render the data table - datatable(job_df, options = list(pageLength = 10)) - }) + # Render the data table + datatable(job_df, options = list(pageLength = 10)) }) } From e1c559f13d7b1cb82cbe93c0b9a8d9b669a9f09f Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 29 Aug 2024 10:51:54 -0400 Subject: [PATCH 10/40] issues #43 and #44 --- R/mod_gwas.R | 14 +++++++++++++- R/mod_help.R | 12 +++++++++--- inst/help_files/GWAS_cite.Rmd | 12 ++++++++++++ inst/help_files/GWAS_par.Rmd | 22 ++++++++++++++++++++++ inst/help_files/GWAS_res.Rmd | 19 +++++++++++++++++++ 5 files changed, 75 insertions(+), 4 deletions(-) create mode 100644 inst/help_files/GWAS_cite.Rmd create mode 100644 inst/help_files/GWAS_par.Rmd create mode 100644 inst/help_files/GWAS_res.Rmd diff --git a/R/mod_gwas.R b/R/mod_gwas.R index 4929fea..737a3e0 100644 --- a/R/mod_gwas.R +++ b/R/mod_gwas.R @@ -23,7 +23,7 @@ mod_gwas_ui <- function(id){ fileInput(ns("phenotype_file"), "Choose Passport File", accept = ".csv"), numericInput(ns("gwas_ploidy"), "Species Ploidy", min = 1, value = NULL), selectInput(ns('gwas_threshold'), label='Significance Threshold Method', choices = c("M.eff","Bonferroni","FDR","permute"), selected="M.eff"), - selectInput(ns('trait_info'), label = 'Select Trait (eg, Color):', choices = NULL), + selectInput(ns('trait_info'), label = 'Select Trait (eg. 
Color):', choices = NULL), virtualSelectInput( inputId = ns("fixed_info"), label = "Select Fixed Effects (optional):", @@ -40,6 +40,7 @@ mod_gwas_ui <- function(id){ p(downloadButton(ns('download_pheno'),""), "Passport Example File"), hr(), p(HTML("Parameters description:"), actionButton(ns("goGWASpar"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), p(HTML("Graphics description:"), actionButton(ns("goGWASgraph"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("How to cite:"), actionButton(ns("goGWAScite"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), p(HTML("GWASpoly tutorial:"), actionButton(ns("goGWASpoly"), icon("arrow-up-right-from-square", verify_fa = FALSE), onclick ="window.open('https://jendelman.github.io/GWASpoly/GWASpoly.html', '_blank')" )), circle = FALSE, status = "warning", @@ -128,6 +129,17 @@ mod_gwas_server <- function(input, output, session, parent_session){ updateBox(id = "GWAS_box", action = "toggle", session = parent_session) }) + observeEvent(input$goGWAScite, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = parent_session, inputId = "GWAS_tabset", + selected = "GWAS_cite") + # expand specific box + updateBox(id = "GWAS_box", action = "toggle", session = parent_session) + }) #Call some plots to NULL so that the spinners do not show before analysis output$bic_plot <- renderDT(NULL) diff --git a/R/mod_help.R b/R/mod_help.R index 9c8b7dc..2e01e12 100644 --- a/R/mod_help.R +++ b/R/mod_help.R @@ -62,11 +62,17 @@ mod_help_ui <- function(id){ )) ), box(title="GWAS", id = "GWAS_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + "The tab is for conducting Genome-Wide Association Studies (GWAS) to identify associations between genetic variants and traits of interest. 
Users can input phenotypic data and specify parameters for the GWAS analysis. The app performs statistical tests to identify significant associations between SNPs and traits, and visualizes the results using Manhattan plots and Q-Q plots. The tab helps in identifying potential genetic markers linked to specific traits. GWASpoly package is used to perform the analysis.", + br(), br(), bs4Dash::tabsetPanel(id = "GWAS_tabset", - tabPanel("Parameters description", value = "GWAS_par", - "**Draft**This tab is for conducting Genome-Wide Association Studies (GWAS) to identify associations between genetic variants and traits of interest. Users can input phenotypic data and specify parameters for the GWAS analysis. The app performs statistical tests to identify significant associations between SNPs and traits, and visualizes the results using Manhattan plots and Q-Q plots. This tab helps in identifying potential genetic markers linked to specific traits.**List R packages utilized", + tabPanel("Parameters description", value = "GWAS_par", br(), + includeMarkdown(system.file("help_files/GWAS_par.Rmd", package = "BIGapp")) + ), + tabPanel("Results description", value = "GWAS_results", br(), + includeMarkdown(system.file("help_files/GWAS_res.Rmd", package = "BIGapp")) ), - tabPanel("Results description", value = "GWAS_results", + tabPanel("How to cite", value = "GWAS_cite", br(), + includeMarkdown(system.file("help_files/GWAS_cite.Rmd", package = "BIGapp")) )) ), box(title="Genomic Prediction/Selection", id = "Genomic_Prediction/Selection_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, diff --git a/inst/help_files/GWAS_cite.Rmd b/inst/help_files/GWAS_cite.Rmd new file mode 100644 index 0000000..837ece6 --- /dev/null +++ b/inst/help_files/GWAS_cite.Rmd @@ -0,0 +1,12 @@ +--- +title: "GWAS_cite" +output: html_document +date: "2024-08-29" +--- + +* BIGapp + + +* GWASpoly package + +Rosyara, U.R., De Jong, W.S., Douches, D.S. 
and Endelman, J.B. (2016), Software for Genome-Wide Association Studies in Autopolyploids and Its Application to Potato. The Plant Genome, 9: plantgenome2015.08.0073. https://doi.org/10.3835/plantgenome2015.08.0073 diff --git a/inst/help_files/GWAS_par.Rmd b/inst/help_files/GWAS_par.Rmd new file mode 100644 index 0000000..779a04d --- /dev/null +++ b/inst/help_files/GWAS_par.Rmd @@ -0,0 +1,22 @@ +--- +title: "GWAS parameters" +output: html_document +date: "2024-08-29" +--- + +* **VCF file**: Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see [this document](https://samtools.github.io/hts-specs/VCFv4.2.pdf). + +* **Passport file**: A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. + +* **Species Ploidy**: Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids. + +* **Significance Threshold Method**: Define the method to be used. The ones currently available in GWASpoly are: "M.eff","Bonferroni","FDR","permute". Description contained in GWASpoly package: "The default method, "M.eff", is a Bonferroni-type correction but using an effective number of markers that accounts for LD between markers (Moskvina and Schmidt, 2008). The FDR method is based on version 1.30.0 of the qvalue package." + +* **Select Trait**: Once the **Passport file** is uploaded, BIGapp updates this box with the phenotype column IDs. Select the target phenotype to be evaluated. Currently, BIGapp allows selecting only one phenotype at a time. + +* **Select Fixed Effects (optional)**: Specifies the Phenotype ID of traits with fixed effects that may influence the evaluated trait. 
+ +* **Window size (bp)**: Define the window size for the search of significant markers. Only the most significant marker within this region will be returned. We suggest to evaluate the **LD plot** (see Results description) to define the window size parameter. + +* **Number of CPU Cores**: Defines the number of CPU cores to be used for the GWAS analysis, enabling faster processing by splitting the workload across multiple cores. + diff --git a/inst/help_files/GWAS_res.Rmd b/inst/help_files/GWAS_res.Rmd new file mode 100644 index 0000000..80003a4 --- /dev/null +++ b/inst/help_files/GWAS_res.Rmd @@ -0,0 +1,19 @@ +--- +title: "GWAS_res" +output: html_document +date: "2024-08-29" +--- + +* BIC plot + +* BIC Table + +* LD plot + +* Manhattan Plot + +* QQ Plot + +* QTL - significant markers + +* Multiple QTL model results table From 26aa2d227a3d97e6c79005ec3feebd2479e1653b Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:51:07 -0400 Subject: [PATCH 11/40] Added GBLUP Support --- R/mod_GSAcc.R | 479 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 320 insertions(+), 159 deletions(-) diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 004cabe..3fae114 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -23,6 +23,7 @@ mod_GSAcc_ui <- function(id){ fileInput(ns("pred_file"), "Choose VCF File", accept = c(".csv",".vcf",".gz")), fileInput(ns("trait_file"), "Choose Passport File", accept = ".csv"), numericInput(ns("pred_ploidy"), "Species Ploidy", min = 1, value = NULL), + selectInput(inputId = ns('pred_model'), label = 'Model Choice', choices = c("rrBLUP","GBLUP"), selected = "rrBLUP"), numericInput(ns("pred_cv"), "Iterations", min = 1, max=20, value = 5), virtualSelectInput( inputId = ns("pred_trait_info"), @@ -40,6 +41,19 @@ mod_GSAcc_ui <- function(id){ search = TRUE, multiple = TRUE ), + conditionalPanel( + condition = "input.pred_fixed_info.length > 0", ns = ns, + div( + 
virtualSelectInput( + inputId = ns("pred_fixed_cat"), + label = "Select Categorical Fixed Effects (unselected will be considered covariates)", + choices = NULL, + showValueAsTags = TRUE, + search = TRUE, + multiple = TRUE + ) + ) + ), actionButton(ns("prediction_start"), "Run Analysis"), div(style="display:inline-block; float:right",dropdownButton( tags$h3("GP Parameters"), @@ -107,7 +121,7 @@ mod_GSAcc_ui <- function(id){ #' GS Server Functions #' #' @importFrom vcfR read.vcfR extract.gt -#' @importFrom rrBLUP mixed.solve A.mat +#' @importFrom rrBLUP mixed.solve A.mat kin.blup #' @importFrom stats cor #' @importFrom shinyalert shinyalert #' @import dplyr @@ -181,6 +195,10 @@ mod_GSAcc_server <- function(id){ "green" = "#00BA38", input$pred_color_select) }) + + observeEvent(input$pred_fixed_info, { + updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) + }) observeEvent(input$prediction_start, { @@ -463,6 +481,12 @@ mod_GSAcc_server <- function(id){ traits <- input$pred_trait_info CVs <- as.numeric(input$pred_cv) fixed_traits <- input$pred_fixed_info + fixed_cat <- input$pred_fixed_cat + fixed_cov <- if (is.null(input$pred_fixed_info) || length(input$pred_fixed_info) == length(input$pred_fixed_cat)) { + NULL + } else { + setdiff(input$pred_fixed_info, input$pred_fixed_cat) + } cores <- input$pred_cores #Assign colors @@ -475,223 +499,356 @@ mod_GSAcc_server <- function(id){ } else{ pred_outputs$colors <- input$pred_color_select } - - ##Need to add ability for the use of parallelism for the for cross-validation - ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays - - # Function to perform genomic prediction - ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) - genomic_prediction <- function(geno, Pheno, traits, fixed_effects = NULL, Fold = 5, Iters = 5, cores = 1) { - + + #Control whether 
rrBLUP or GBLUP run depending on user input + #Note, should add the GP functions to the utils.R file and then call them here... + if (input$pred_model == "rrBLUP"){ + ##Need to add ability for the use of parallelism for the for cross-validation + ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays + + # Function to perform genomic prediction + ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) + genomic_prediction <- function(geno, Pheno, traits, fixed_effects = NULL, Fold = 5, Iters = 5, cores = 1) { + + # Define variables + traits <- traits + cycles <- as.numeric(Iters) + Folds <- as.numeric(Fold) + total_population <- ncol(geno) + #train_size <- floor(percentage / 100 * total_population) + fixed_traits <- fixed_effects + cores <- as.numeric(cores) + + # Establish accuracy results matrix + results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Establish heritability_scores_df () Maybe get h2 values + # Establish results matrix + heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + + #Remove the fixed traits from the Pheno file + if (length(fixed_traits) == 0) { + Pheno <- Pheno + } else { + #Subset fixed traits + Fixed <- subset(Pheno, select = fixed_traits) + + #Pheno <- subset(Pheno, select = -fixed_traits) + convert_all_to_factor_if_not_numeric <- function(df) { + for (col in names(df)) { + if (!is.numeric(df[[col]]) && !is.integer(df[[col]])) { + df[[col]] <- 
as.factor(df[[col]]) + } + } + return(df) + } + # Convert all columns to factor if they are not numeric or integer + Fixed <- convert_all_to_factor_if_not_numeric(Fixed) + + #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor + row.names(Fixed) <- row.names(Pheno) + + #Make the matrix + formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) + formula <- as.formula(formula_str) + + # Create the design matrix using the constructed formula + Fixed <- model.matrix(formula, data = Fixed) + } + + #Make kinship matrix of all individuals? + #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy + #If wanting to use Kkinship matrix, will then need to see how to implement it here + + #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). + impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) + geno <- impute$imputed + + # For loop + for (r in 1:cycles) { + set.seed(r) + fold_ids <- sample(rep(1:Folds, length.out = total_population)) + fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_results <- matrix(nrow = Folds, ncol = length(traits)) + colnames(fold_results) <- traits + + #Initialize GEBV object for each cycle + GEBVs_cycle <-list() + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + + for (fold in 1:Folds) { + + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*Folds)) + + train <- fold_df %>% + dplyr::filter(FoldID != fold) %>% + pull(Sample) + test <- setdiff(row.names(geno),train) + + #Subset datasets + if (length(fixed_traits) == 0) { + Fixed_train = NULL + } else{ + Fixed_train <- data.frame(Fixed[train, ]) + Fixed_train <- as.matrix(Fixed_train) + 
row.names(Fixed_train) <- train + + #Fixed (testing) + Fixed_test<- data.frame(Fixed[test, ]) + Fixed_test <- as.matrix(Fixed_test) + row.names(Fixed_test) <- test + + } + + Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set + m_train <- geno[train, ] + Pheno_test <- Pheno[test, ] + #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? + m_valid <- geno[test, ] + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) + colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") + rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest + trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) + TRT <- trait_answer$u + e <- as.matrix(TRT) + pred_trait_test <- m_valid %*% e + pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits + trait_test <- Pheno_test[, traits[trait_idx]] + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") + results[(((r-1)*5)+fold), (length(traits)+1)] <- r + results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + # Extract GEBVs + # Check if Fixed_train is not NULL and include beta if it is + if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { + # Calculate GEBVs including fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta + } else { + # Calculate GEBVs without fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to calculate the GEBVs for testing group from the trained model + } + + # Calculate heritability for the current trait + Vu <- trait_answer$Vu + 
Ve <- trait_answer$Ve + heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + + } + #Add iter and fold information for each trait/result + heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + #Add sample, iteration, and fold information to GEBVs_fold + GEBVs_fold[,"Iter"] = r + GEBVs_fold[,"Fold"] = fold + GEBVs_fold[,"Sample"] <- test + + # Store GEBVs for this fold + GEBVs_cycle[[fold]] <- GEBVs_fold + + } + + # Store GEBVs for this cycle + GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + + } + + # Combine all GEBVs into a single DataFrame + GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + + results <- as.data.frame(results) + heritability_scores <- as.data.frame(heritability_scores) + + # Combine results and heritability_scores using cbind + combined_results <- cbind(results, heritability_scores) + + return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) + } + + # Example call to the function + #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... + results <- genomic_prediction(geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, Iters = input$pred_cv, cores = cores) + + #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) 
+ #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") + + #Save to reactive value + pred_outputs$corr_output <- results$PredictionAccuracy + pred_outputs$all_GEBVs <- results$GEBVs + + # Convert trait columns to numeric + results$GEBVs <- results$GEBVs %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) + + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- results$GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) + + pred_outputs$avg_GEBVs <- average_gebvs_df + + columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) + average_accuracy_df <- results$PredictionAccuracy %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) + + + pred_outputs$comb_output <- average_accuracy_df + + }else{ + #Note: should wrap the GBLUP into a function too # Define variables traits <- traits - cycles <- as.numeric(Iters) - Folds <- as.numeric(Fold) - total_population <- ncol(geno) + cycles <- input$pred_cv + Folds <- 5 + total_population <- ncol(pred_inputs$geno_input) #train_size <- floor(percentage / 100 * total_population) - fixed_traits <- fixed_effects + fixed_traits <- fixed_traits cores <- as.numeric(cores) - + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + + #Convert normalized genotypes to relationship matrix + #By default, it removes SNPs with more than 50% missing data and imputes using the mean + Geno.mat <- A.mat(t(pred_inputs$geno_input)) + # Establish accuracy results matrix results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits - + # Initialize a list to store GEBVs for all traits and cycles GEBVs <- list() - + #Establish heritability_scores_df () Maybe get h2 values # Establish results 
matrix heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits - - #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) - pb_value = 10 - - #Remove the fixed traits from the Pheno file - if (length(fixed_traits) == 0) { - Pheno <- Pheno - } else { - #Subset fixed traits - Fixed <- subset(Pheno, select = fixed_traits) - - #Pheno <- subset(Pheno, select = -fixed_traits) - convert_all_to_factor_if_not_numeric <- function(df) { - for (col in names(df)) { - if (!is.numeric(df[[col]]) && !is.integer(df[[col]])) { - df[[col]] <- as.factor(df[[col]]) - } - } - return(df) - } - # Convert all columns to factor if they are not numeric or integer - Fixed <- convert_all_to_factor_if_not_numeric(Fixed) - - #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor - row.names(Fixed) <- row.names(Pheno) - - #Make the matrix - formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) - formula <- as.formula(formula_str) - - # Create the design matrix using the constructed formula - Fixed <- model.matrix(formula, data = Fixed) - } - - #Make kinship matrix of all individuals? - #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy - #If wanting to use Kkinship matrix, will then need to see how to implement it here - - #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). 
- impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) - geno <- impute$imputed - + + # For loop for (r in 1:cycles) { set.seed(r) fold_ids <- sample(rep(1:Folds, length.out = total_population)) - fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_df <- data.frame(Sample = row.names(Geno.mat), FoldID = fold_ids) #Randomly assign each sample to a fold fold_results <- matrix(nrow = Folds, ncol = length(traits)) colnames(fold_results) <- traits - + #Initialize GEBV object for each cycle GEBVs_cycle <-list() - + #Status updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) - + for (fold in 1:Folds) { - + #Status bar length pb_value = pb_value + (70 / as.numeric(cycles*Folds)) - + + #Subset training and testing samples train <- fold_df %>% - dplyr::filter(FoldID != fold) %>% + filter(FoldID != fold) %>% pull(Sample) - test <- setdiff(row.names(geno),train) - - #Subset datasets - if (length(fixed_traits) == 0) { - Fixed_train = NULL - } else{ - Fixed_train <- data.frame(Fixed[train, ]) - Fixed_train <- as.matrix(Fixed_train) - row.names(Fixed_train) <- train - - #Fixed (testing) - Fixed_test<- data.frame(Fixed[test, ]) - Fixed_test <- as.matrix(Fixed_test) - row.names(Fixed_test) <- test - - } - - Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set - m_train <- geno[train, ] - Pheno_test <- Pheno[test, ] - #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? 
- m_valid <- geno[test, ] - + test <- setdiff(row.names(Geno.mat),train) + + Fixed_train = NULL + # Initialize a matrix to store GEBVs for this fold GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") - + #Evaluate each trait using the same train and testing samples for each for (trait_idx in 1:length(traits)) { - trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest - trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) - TRT <- trait_answer$u - e <- as.matrix(TRT) - pred_trait_test <- m_valid %*% e - pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits - trait_test <- Pheno_test[, traits[trait_idx]] - results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") + #Mask phenotypes in testing group + Pheno_test <- pred_inputs$pheno_input + Pheno_test[test, traits[trait_idx]] <- NA + #Kin.blup + traitpred <- kin.blup(data = Pheno_test, geno = names(pred_inputs$pheno_input)[1], pheno = traits[trait_idx], fixed = fixed_cat, covariate = fixed_cov, K=Geno.mat) + #Cor between test values and predicted breeding values + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_inputs$pheno_input[test, traits[trait_idx]], traitpred$g[test], use = "complete.obs") results[(((r-1)*5)+fold), (length(traits)+1)] <- r results[(((r-1)*5)+fold), (length(traits)+2)] <- fold - + # Extract GEBVs - # Check if Fixed_train is not NULL and include beta if it is - if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { - # Calculate GEBVs including fixed effects - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta - } else { - # Calculate GEBVs without fixed effects - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to 
calculate the GEBVs for testing group from the trained model - } - - # Calculate heritability for the current trait - Vu <- trait_answer$Vu - Ve <- trait_answer$Ve + GEBVs_fold[, trait_idx] <- traitpred$g[test] #Confirm it is accuract to calculate the GEBVs for testing group from the trained model + + + # Calculate heritability (these are wrong) + Vu <- traitpred$Vg + Ve <- traitpred$Ve heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) - + } #Add iter and fold information for each trait/result heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - + #Add sample, iteration, and fold information to GEBVs_fold GEBVs_fold[,"Iter"] = r GEBVs_fold[,"Fold"] = fold GEBVs_fold[,"Sample"] <- test - + # Store GEBVs for this fold GEBVs_cycle[[fold]] <- GEBVs_fold - + } - + # Store GEBVs for this cycle GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) - + } - + # Combine all GEBVs into a single DataFrame GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) - + results <- as.data.frame(results) heritability_scores <- as.data.frame(heritability_scores) - + # Combine results and heritability_scores using cbind combined_results <- cbind(results, heritability_scores) + + #Save to reactive value + pred_outputs$corr_output <- results + pred_outputs$all_GEBVs <- results$GEBVs_df + + # Convert trait columns to numeric + GEBVs <- GEBVs_df %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) + + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) + + pred_outputs$avg_GEBVs <- average_gebvs_df + + columns <- setdiff(colnames(results), c("Iter","Fold")) + average_accuracy_df <- results %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) + + + pred_outputs$comb_output <- average_accuracy_df - return(list(GEBVs = 
GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) } - # Example call to the function - #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... - results <- genomic_prediction(geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, Iters = input$pred_cv, cores = cores) - - #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) - #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") - - #Save to reactive value - pred_outputs$corr_output <- results$PredictionAccuracy - pred_outputs$all_GEBVs <- results$GEBVs - - # Convert trait columns to numeric - results$GEBVs <- results$GEBVs %>% - mutate(across(all_of(traits), ~ as.numeric(.x))) - - # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold - average_gebvs_df <- results$GEBVs %>% - group_by(Sample) %>% - summarize(across(all_of(traits), mean, na.rm = TRUE)) - - pred_outputs$avg_GEBVs <- average_gebvs_df - - #Get average accuracy and h2 for each iter accross the 5 folds - - #columns <- setdiff(colnames(results$CombinedResults), c("Iter","Fold")) - #average_accuracy_df <- results$CombinedResults %>% - # group_by(Iter) %>% - # summarize(across(all_of(columns), mean, na.rm = TRUE)) - - columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) - average_accuracy_df <- results$PredictionAccuracy %>% - group_by(Iter) %>% - summarize(across(all_of(columns), mean, na.rm = TRUE)) - - - pred_outputs$comb_output <- average_accuracy_df - #Status updateProgressBar(session = session, id = "pb_prediction", value = 90, title = "Generating Results") @@ -738,7 +895,9 @@ mod_GSAcc_server <- function(id){ axis.text = element_text(size = 12), axis.title = element_text(size = 14), axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold")) + strip.text.x = 
element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) plot_violin <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + geom_violin(trim = TRUE) + # Add violin plot @@ -752,7 +911,9 @@ mod_GSAcc_server <- function(id){ axis.text = element_text(size = 12), axis.title = element_text(size = 14), axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold")) + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) list(plot, plot_violin) }) From bd5ec81770857c12d2e5d00963ff7cc66141a588 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:51:44 -0400 Subject: [PATCH 12/40] added package --- R/mod_GSAcc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 3fae114..7d3c735 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -763,7 +763,7 @@ mod_GSAcc_server <- function(id){ #Subset training and testing samples train <- fold_df %>% - filter(FoldID != fold) %>% + dplyr::filter(FoldID != fold) %>% pull(Sample) test <- setdiff(row.names(Geno.mat),train) From a987d1d0d5eb992e52b2ebdac2685b5849064deb Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Thu, 29 Aug 2024 17:17:43 -0400 Subject: [PATCH 13/40] add help files --- NAMESPACE | 5 +- R/mod_DosageCall.R | 73 +++++++++++-------- R/mod_dosage2vcf.R | 49 +++++++++++-- R/mod_gwas.R | 3 +- R/mod_help.R | 72 ++++++++++++++---- inst/help_files/DAPC_cite.Rmd | 6 ++ inst/help_files/DAPC_par.Rmd | 6 ++ inst/help_files/DAPC_res.Rmd | 6 ++ inst/help_files/DArT_Report2VCF_cite.Rmd | 6 ++ inst/help_files/DArT_Report2VCF_par.Rmd | 6 ++ inst/help_files/DArT_Report2VCF_res.Rmd | 6 ++ inst/help_files/GWAS_par.Rmd | 17 ++++- inst/help_files/Genomic_Diversity_cite.Rmd | 6 ++ inst/help_files/Genomic_Diversity_par.Rmd | 6 ++ 
inst/help_files/Genomic_Diversity_res.Rmd | 6 ++ inst/help_files/Genomic_Prediction_cite.Rmd | 6 ++ inst/help_files/Genomic_Prediction_par.Rmd | 6 ++ inst/help_files/Genomic_Prediction_res.Rmd | 6 ++ inst/help_files/PCA_cite.Rmd | 6 ++ inst/help_files/PCA_par.Rmd | 6 ++ inst/help_files/PCA_res.Rmd | 6 ++ inst/help_files/Predictive_Ability_cite.Rmd | 6 ++ inst/help_files/Predictive_Ability_par.Rmd | 6 ++ inst/help_files/Predictive_Ability_res.Rmd | 6 ++ inst/help_files/Updog_Dosage_Calling_cite.Rmd | 6 ++ inst/help_files/Updog_Dosage_Calling_par.Rmd | 14 ++++ inst/help_files/Updog_Dosage_Calling_res.Rmd | 5 ++ inst/help_files/VCF_Filtering_cite.Rmd | 6 ++ inst/help_files/VCF_Filtering_par.Rmd | 5 ++ inst/help_files/VCF_Filtering_res.Rmd | 6 ++ 30 files changed, 316 insertions(+), 53 deletions(-) create mode 100644 inst/help_files/DAPC_cite.Rmd create mode 100644 inst/help_files/DAPC_par.Rmd create mode 100644 inst/help_files/DAPC_res.Rmd create mode 100644 inst/help_files/DArT_Report2VCF_cite.Rmd create mode 100644 inst/help_files/DArT_Report2VCF_par.Rmd create mode 100644 inst/help_files/DArT_Report2VCF_res.Rmd create mode 100644 inst/help_files/Genomic_Diversity_cite.Rmd create mode 100644 inst/help_files/Genomic_Diversity_par.Rmd create mode 100644 inst/help_files/Genomic_Diversity_res.Rmd create mode 100644 inst/help_files/Genomic_Prediction_cite.Rmd create mode 100644 inst/help_files/Genomic_Prediction_par.Rmd create mode 100644 inst/help_files/Genomic_Prediction_res.Rmd create mode 100644 inst/help_files/PCA_cite.Rmd create mode 100644 inst/help_files/PCA_par.Rmd create mode 100644 inst/help_files/PCA_res.Rmd create mode 100644 inst/help_files/Predictive_Ability_cite.Rmd create mode 100644 inst/help_files/Predictive_Ability_par.Rmd create mode 100644 inst/help_files/Predictive_Ability_res.Rmd create mode 100644 inst/help_files/Updog_Dosage_Calling_cite.Rmd create mode 100644 inst/help_files/Updog_Dosage_Calling_par.Rmd create mode 100644 
inst/help_files/Updog_Dosage_Calling_res.Rmd create mode 100644 inst/help_files/VCF_Filtering_cite.Rmd create mode 100644 inst/help_files/VCF_Filtering_par.Rmd create mode 100644 inst/help_files/VCF_Filtering_res.Rmd diff --git a/NAMESPACE b/NAMESPACE index 0618583..b57ce72 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -43,7 +43,8 @@ importFrom(bs4Dash,sidebarMenu) importFrom(bs4Dash,tabBox) importFrom(bs4Dash,tabItem) importFrom(bs4Dash,tabItems) -importFrom(bs4Dash,updateControlbarMenu) +importFrom(bs4Dash,updateBox) +importFrom(bs4Dash,updatebs4TabItems) importFrom(bs4Dash,valueBox) importFrom(bs4Dash,valueBoxOutput) importFrom(config,get) @@ -81,8 +82,10 @@ importFrom(rrBLUP,A.mat) importFrom(rrBLUP,mixed.solve) importFrom(scales,comma_format) importFrom(shiny,NS) +importFrom(shiny,includeMarkdown) importFrom(shiny,shinyApp) importFrom(shiny,tagList) +importFrom(shiny,updateTabsetPanel) importFrom(shinyWidgets,materialSwitch) importFrom(shinyWidgets,progressBar) importFrom(shinyWidgets,updateVirtualSelect) diff --git a/R/mod_DosageCall.R b/R/mod_DosageCall.R index b8c1c9c..79b2a7b 100644 --- a/R/mod_DosageCall.R +++ b/R/mod_DosageCall.R @@ -29,35 +29,13 @@ mod_DosageCall_ui <- function(id){ downloadButton(ns('download_updog_vcf'), "Download VCF File", class = "butt"), div(style="display:inline-block; float:right",dropdownButton( - - tags$h3("Updog Dosage Calling"), - "You can download examples of the expected files here: \n", - downloadButton(ns('download_vcf'), "Download VCF Example File", class = "butt"), - downloadButton(ns('download_madc'), "Download MADC Example File", class = "butt"), - # "About Population Models:\n", - # "Model: What form should the prior (genotype distribution) take?\n - # The following information is from the Updog manual:\n - # Possible values of the genotype distribution (values of model) are: \n - # `norm` A distribution whose genotype frequencies are proportional to the density value of a normal - # with some mean and some standard 
deviation. Unlike the `bb` and `hw` options, this will - # allow for distributions both more and less dispersed than a binomial. This seems to be the - # most robust to violations in modeling assumptions, and so is the default. This prior class was - # developed in Gerard and Ferrao (2020). - # `hw` A binomial distribution that results from assuming that the population is in Hardy-Weinberg - # equilibrium (HWE). This actually does pretty well even when there are minor to moderate - # deviations from HWE. Though it does not perform as well as the `norm` option when there - # are severe deviations from HWE. - # `bb` A beta-binomial distribution. This is an overdispersed version of `hw` and can be derived - # from a special case of the Balding-Nichols model. - # `s1` This prior assumes the individuals are all full-siblings resulting from one generation of selfing. I.e. there is only one parent. This model assumes a particular type of meiotic behavior: - # polysomic inheritance with bivalent, non-preferential pairing. - # `f1` This prior assumes the individuals are all full-siblings resulting from one generation of a - # bi-parental cross. This model assumes a particular type of meiotic behavior: polysomic inheritance with bivalent, non-preferential pairing. - # `f1pp` This prior allows for double reduction and preferential pairing in an F1 population of tretraploids. - # `s1pp` This prior allows for double reduction and preferential pairing in an S1 population of tretraploids. - # `flex` Generically any categorical distribution. Theoretically, this works well if you have a lot of - # individuals. In practice, it seems to be much less robust to violations in modeling assumptions. - # `uniform` A discrete uniform distribution. 
This should never be used in practice.", + HTML("Input files"), + p(downloadButton(ns('download_vcf'),""), "VCF Example File"), + p(downloadButton(ns('download_madc'),""), "MADC Example File"), hr(), + p(HTML("Parameters description:"), actionButton(ns("goPar"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("Graphics description:"), actionButton(ns("goRes"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("How to cite:"), actionButton(ns("goCite"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("Updog tutorial:"), actionButton(ns("goUpdog"), icon("arrow-up-right-from-square", verify_fa = FALSE), onclick ="window.open('https://dcgerard.github.io/updog/', '_blank')" )), circle = FALSE, status = "warning", icon = icon("info"), width = "500px", @@ -88,6 +66,43 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ ns <- session$ns + # Help links + observeEvent(input$goPar, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = parent_session, inputId = "Updog_Dosage_Calling_tabset", + selected = "Updog_Dosage_Calling_par") + # expand specific box + updateBox(id = "Updog_Dosage_Calling_box", action = "toggle", session = parent_session) + }) + + observeEvent(input$goRes, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = parent_session, inputId = "Updog_Dosage_Calling_tabset", + selected = "Updog_Dosage_Calling_results") + # expand specific box + updateBox(id = "Updog_Dosage_Calling_box", action = "toggle", session = parent_session) + }) + + observeEvent(input$goCite, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = 
parent_session, inputId = "Updog_Dosage_Calling_tabset", + selected = "Updog_Dosage_Calling_cite") + # expand specific box + updateBox(id = "Updog_Dosage_Calling_box", action = "toggle", session = parent_session) + }) + snp_number <- reactiveVal(0) #SNP counts value box diff --git a/R/mod_dosage2vcf.R b/R/mod_dosage2vcf.R index 52889cc..caa8434 100644 --- a/R/mod_dosage2vcf.R +++ b/R/mod_dosage2vcf.R @@ -24,12 +24,12 @@ mod_dosage2vcf_ui <- function(id){ useShinyjs(), downloadButton(ns('download_d2vcf'), "Download VCF File", class = "butt"), div(style="display:inline-block; float:right",dropdownButton( - - tags$h3("DArT File Conversion"), - "Converting DArT report files to VCF format. \n", - "You can download examples of the expected files here: \n", - downloadButton(ns('download_dose'), "Download Dose Report Example File"), - downloadButton(ns('download_counts'), "Download Counts Example File"), + HTML("Input files"), + p(downloadButton(ns('download_dose'), ""), "Dose Report Example File"), + p(downloadButton(ns('download_counts'), ""), "Counts Example File"), hr(), + p(HTML("Parameters description:"), actionButton(ns("goPar"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("Graphics description:"), actionButton(ns("goRes"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("How to cite:"), actionButton(ns("goCite"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), circle = FALSE, status = "warning", icon = icon("info"), width = "500px", @@ -57,6 +57,43 @@ mod_dosage2vcf_server <- function(input, output, session, parent_session){ ns <- session$ns + # Help links + observeEvent(input$goPar, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = parent_session, inputId = "DArT_Report2VCF_tabset", + selected = "DArT_Report2VCF_par") + # expand specific box + updateBox(id = 
"DArT_Report2VCF_box", action = "toggle", session = parent_session) + }) + + observeEvent(input$goRes, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = parent_session, inputId = "DArT_Report2VCF_tabset", + selected = "DArT_Report2VCF_results") + # expand specific box + updateBox(id = "DArT_Report2VCF_box", action = "toggle", session = parent_session) + }) + + observeEvent(input$goCite, { + # change to help tab + updatebs4TabItems(session = parent_session, inputId = "MainMenu", + selected = "help") + + # select specific tab + updateTabsetPanel(session = parent_session, inputId = "DArT_Report2VCF_tabset", + selected = "DArT_Report2VCF_cite") + # expand specific box + updateBox(id = "DArT_Report2VCF_box", action = "toggle", session = parent_session) + }) + snp_number <- reactiveVal(0) disable("download_d2vcf") diff --git a/R/mod_gwas.R b/R/mod_gwas.R index 737a3e0..a5b6282 100644 --- a/R/mod_gwas.R +++ b/R/mod_gwas.R @@ -98,7 +98,8 @@ mod_gwas_ui <- function(id){ #' @importFrom vcfR read.vcfR #' @importFrom Matrix nearPD #' @importFrom stats BIC as.formula lm logLik median model.matrix na.omit prcomp qbeta quantile runif sd setNames -#' @importFrom bs4Dash updatebs4TabItems +#' @importFrom bs4Dash updatebs4TabItems updateBox +#' @importFrom shiny updateTabsetPanel #' @noRd mod_gwas_server <- function(input, output, session, parent_session){ diff --git a/R/mod_help.R b/R/mod_help.R index 2e01e12..b6d5d41 100644 --- a/R/mod_help.R +++ b/R/mod_help.R @@ -6,7 +6,7 @@ #' #' @noRd #' -#' @importFrom shiny NS tagList +#' @importFrom shiny NS tagList includeMarkdown mod_help_ui <- function(id){ ns <- NS(id) tagList( @@ -14,51 +14,81 @@ mod_help_ui <- function(id){ column(width=12), column(width=12, box(title="DArT Report2VCF", id = "DArT_Report2VCF_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + "**Draft**This 
tab is designed to convert the DArT Dose Report and Counts files to a VCF file. **DArT Website**", + br(), br(), bs4Dash::tabsetPanel(id = "DArT_Report2VCF_tabset", tabPanel("Parameters description", value = "DArT_Report2VCF_par", - "**Draft**This tab is designed to convert the DArT Dose Report and Counts files to a VCF file. **DArT Website**" + includeMarkdown(system.file("help_files/DArT_Report2VCF_par.Rmd", package = "BIGapp")) ), tabPanel("Results description", value = "DArT_Report2VCF_results", + includeMarkdown(system.file("help_files/DArT_Report2VCF_res.Rmd", package = "BIGapp")) + ), + tabPanel("How to cite", value = "DArT_Report2VCF_cite", + includeMarkdown(system.file("help_files/DArT_Report2VCF_cite.Rmd", package = "BIGapp")) )) ), box(title="Updog Dosage Calling", id = "Updog_Dosage_Calling_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + "**Draft**This tab is designed to handle the process of dosage calling in genomic data. Dosage calling is essential for determining the number of copies of a particular allele at each genomic location. The app likely includes functionalities to upload raw genomic data, apply various filtering criteria, and generate plots to visualize the distribution of dosages. Users can examine histograms for SNP max post probabilities and read depths, which help in assessing the quality and accuracy of the dosage calls.**Updog**", + br(), br(), bs4Dash::tabsetPanel(id = "Updog_Dosage_Calling_tabset", tabPanel("Parameters description", value = "Updog_Dosage_Calling_par", + includeMarkdown(system.file("help_files/Updog_Dosage_Calling_par.Rmd", package = "BIGapp")) ), tabPanel("Results description", value = "Updog_Dosage_Calling_results", - "**Draft**This tab is designed to handle the process of dosage calling in genomic data. Dosage calling is essential for determining the number of copies of a particular allele at each genomic location. 
The app likely includes functionalities to upload raw genomic data, apply various filtering criteria, and generate plots to visualize the distribution of dosages. Users can examine histograms for SNP max post probabilities and read depths, which help in assessing the quality and accuracy of the dosage calls.**Updog**" + includeMarkdown(system.file("help_files/Updog_Dosage_Calling_res.Rmd", package = "BIGapp")) + ), + tabPanel("How to cite", value = "Updog_Dosage_Calling_cite", + includeMarkdown(system.file("help_files/Updog_Dosage_Calling_cite.Rmd", package = "BIGapp")) )) ), box(title="VCF Filtering", id = "VCF_Filtering_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, bs4Dash::tabsetPanel(id = "VCF_Filtering_tabset", tabPanel("Parameters description", value = "VCF_Filtering_par", - "**Draft**This tab is designed to convert the DArT Dose Report and Counts files to a VCF file. **DArT Website**" + includeMarkdown(system.file("help_files/VCF_Filtering_par.Rmd", package = "BIGapp")) ), tabPanel("Results description", value = "VCF_Filtering_results", - "**Draft**This tab is designed to handle the process of dosage calling in genomic data. Dosage calling is essential for determining the number of copies of a particular allele at each genomic location. The app likely includes functionalities to upload raw genomic data, apply various filtering criteria, and generate plots to visualize the distribution of dosages. 
Users can examine histograms for SNP max post probabilities and read depths, which help in assessing the quality and accuracy of the dosage calls.**Updog**" + includeMarkdown(system.file("help_files/VCF_Filtering_par.Rmd", package = "BIGapp")) + ), + tabPanel("How to cite", value = "Updog_Dosage_Calling_cite", + includeMarkdown(system.file("help_files/VCF_Filtering_cite.Rmd", package = "BIGapp")) )) ), box(title="PCA", id = "PCA_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, bs4Dash::tabsetPanel(id = "PCA_tabset", tabPanel("Parameters description", value = "PCA_par", + includeMarkdown(system.file("help_files/PCA_par.Rmd", package = "BIGapp")) ), tabPanel("Results description", value = "PCA_results", + includeMarkdown(system.file("help_files/PCA_res.Rmd", package = "BIGapp")) + ), + tabPanel("How to cite", value = "PCA_cite", + includeMarkdown(system.file("help_files/PCA_cite.Rmd", package = "BIGapp")) )) ), box(title="DAPC", id = "DAPC_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, bs4Dash::tabsetPanel(id = "DAPC_tabset", tabPanel("Parameters description", value = "DAPC_par", + includeMarkdown(system.file("help_files/DAPC_par.Rmd", package = "BIGapp")) ), tabPanel("Results description", value = "DAPC_results", + includeMarkdown(system.file("help_files/DAPC_res.Rmd", package = "BIGapp")) + ), + tabPanel("How to cite", value = "DAPC_cite", + includeMarkdown(system.file("help_files/DAPC_cite.Rmd", package = "BIGapp")) )) ), box(title="Genomic Diversity", id = "Genomic_Diversity_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + "**Draft**This tab is dedicated to analyzing genomic diversity within the population. It calculates various diversity metrics such as heterozygosity and minor allele frequency (MAF). The app includes functionalities to visualize these metrics through histograms and other plots. 
Users can download the calculated diversity metrics as CSV files. This tab helps in understanding the genetic variability and distribution of alleles within the population.", + br(), br(), bs4Dash::tabsetPanel(id = "Genomic_Diversity_tabset", tabPanel("Parameters description", value = "Genomic_Diversity_par", - "**Draft**This tab is dedicated to analyzing genomic diversity within the population. It calculates various diversity metrics such as heterozygosity and minor allele frequency (MAF). The app includes functionalities to visualize these metrics through histograms and other plots. Users can download the calculated diversity metrics as CSV files. This tab helps in understanding the genetic variability and distribution of alleles within the population." - + includeMarkdown(system.file("help_files/Genomic_Diversity_par.Rmd", package = "BIGapp")) ), tabPanel("Results description", value = "Genomic_Diversity_results", + includeMarkdown(system.file("help_files/Genomic_Diversity_res.Rmd", package = "BIGapp")) + ), + tabPanel("How to cite", value = "Genomic_Diversity_cite", + includeMarkdown(system.file("help_files/Genomic_Diversity_cite.Rmd", package = "BIGapp")) )) ), box(title="GWAS", id = "GWAS_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, @@ -75,17 +105,29 @@ mod_help_ui <- function(id){ includeMarkdown(system.file("help_files/GWAS_cite.Rmd", package = "BIGapp")) )) ), - box(title="Genomic Prediction/Selection", id = "Genomic_Prediction/Selection_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, - bs4Dash::tabsetPanel(id = "Genomic_Prediction/Selection_tabset", - tabPanel("Parameters description", value = "Genomic_Prediction/Selection_par", - "**Draft**This tab is for conducting Genome-Wide Association Studies (GWAS) to identify associations between genetic variants and traits of interest. Users can input phenotypic data and specify parameters for the GWAS analysis. 
The app performs statistical tests to identify significant associations between SNPs and traits, and visualizes the results using Manhattan plots and Q-Q plots. This tab helps in identifying potential genetic markers linked to specific traits.**List R packages utilized", + box(title="Predictive Ability", id = "Predictive_Ability_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "Predictive_Ability_tabset", + tabPanel("Parameters description", value = "Predictive_Ability_par", + includeMarkdown(system.file("help_files/Predictive_Ability_par.Rmd", package = "BIGapp")) ), - tabPanel("Results description", value = "Genomic_Prediction/Selection_results", + tabPanel("Results description", value = "Predictive_Ability_results", + includeMarkdown(system.file("help_files/Predictive_Ability_res.Rmd", package = "BIGapp")) + ), + tabPanel("How to cite", value = "Predictive_Ability_cite", + includeMarkdown(system.file("help_files/Predictive_Ability_cite.Rmd", package = "BIGapp")) )) ), - - box(title="How to Cite", id = "how_to_cite_box", width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, - "**Draft**Instructions for citing the app and packages used in analyses" + box(title="Genomic Prediction", id = "Genomic_Prediction_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, + bs4Dash::tabsetPanel(id = "Genomic_Prediction_tabset", + tabPanel("Parameters description", value = "Genomic_Prediction_par", + includeMarkdown(system.file("help_files/Genomic_Prediction_par.Rmd", package = "BIGapp")) + ), + tabPanel("Results description", value = "Genomic_Prediction_results", + includeMarkdown(system.file("help_files/Genomic_Prediction_res.Rmd", package = "BIGapp")) + ), + tabPanel("How to cite", value = "Genomic_Prediction_cite", + includeMarkdown(system.file("help_files/Genomic_Prediction_cite.Rmd", package = "BIGapp")) + )) ), ), column(width=2) 
diff --git a/inst/help_files/DAPC_cite.Rmd b/inst/help_files/DAPC_cite.Rmd new file mode 100644 index 0000000..1458b7c --- /dev/null +++ b/inst/help_files/DAPC_cite.Rmd @@ -0,0 +1,6 @@ +--- +title: "DAPC_cite" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/DAPC_par.Rmd b/inst/help_files/DAPC_par.Rmd new file mode 100644 index 0000000..b566e42 --- /dev/null +++ b/inst/help_files/DAPC_par.Rmd @@ -0,0 +1,6 @@ +--- +title: "DAPC_par" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/DAPC_res.Rmd b/inst/help_files/DAPC_res.Rmd new file mode 100644 index 0000000..0932956 --- /dev/null +++ b/inst/help_files/DAPC_res.Rmd @@ -0,0 +1,6 @@ +--- +title: "DAPC_res" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/DArT_Report2VCF_cite.Rmd b/inst/help_files/DArT_Report2VCF_cite.Rmd new file mode 100644 index 0000000..d3be88a --- /dev/null +++ b/inst/help_files/DArT_Report2VCF_cite.Rmd @@ -0,0 +1,6 @@ +--- +title: "DArT_Report2VCF_cite" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/DArT_Report2VCF_par.Rmd b/inst/help_files/DArT_Report2VCF_par.Rmd new file mode 100644 index 0000000..ccf8d3c --- /dev/null +++ b/inst/help_files/DArT_Report2VCF_par.Rmd @@ -0,0 +1,6 @@ +--- +title: "DArT_Report2VCF_par" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/DArT_Report2VCF_res.Rmd b/inst/help_files/DArT_Report2VCF_res.Rmd new file mode 100644 index 0000000..8f71982 --- /dev/null +++ b/inst/help_files/DArT_Report2VCF_res.Rmd @@ -0,0 +1,6 @@ +--- +title: "DArT_Report2VCF_res" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/GWAS_par.Rmd b/inst/help_files/GWAS_par.Rmd index 779a04d..8a8aaff 100644 --- a/inst/help_files/GWAS_par.Rmd +++ b/inst/help_files/GWAS_par.Rmd @@ -6,7 +6,22 @@ date: "2024-08-29" * **VCF file**: Variant Call Format (VCF) is a standard file format to store genetic variant information. 
The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see [this document](https://samtools.github.io/hts-specs/VCFv4.2.pdf). -* **Passport file**: A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. +* **Passport file**: A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. Example: + +
    + +|Sample_ID | Sepal.Length| Sepal.Width| Petal.Length| Petal.Width|Species | +|:---------:|:------------:|:-----------:|:------------:|:-----------:|:-------:| +|Sample_1 | 5.1| 3.5| 1.4| 0.2|setosa | +|Sample_2 | 4.9| 3.0| 1.4| 0.2|setosa | +|Sample_3 | 4.7| 3.2| 1.3| 0.2|setosa | +|Sample_4 | 4.6| 3.1| 1.5| 0.2|setosa | +|Sample_5 | 5.0| 3.6| 1.4| 0.2|setosa | +|Sample_6 | 5.4| 3.9| 1.7| 0.4|setosa | + +
    + +  * **Species Ploidy**: Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids. diff --git a/inst/help_files/Genomic_Diversity_cite.Rmd b/inst/help_files/Genomic_Diversity_cite.Rmd new file mode 100644 index 0000000..b85b481 --- /dev/null +++ b/inst/help_files/Genomic_Diversity_cite.Rmd @@ -0,0 +1,6 @@ +--- +title: "Genomic_Diversity_cite" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Genomic_Diversity_par.Rmd b/inst/help_files/Genomic_Diversity_par.Rmd new file mode 100644 index 0000000..b236539 --- /dev/null +++ b/inst/help_files/Genomic_Diversity_par.Rmd @@ -0,0 +1,6 @@ +--- +title: "Genomic_Diversity_par" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Genomic_Diversity_res.Rmd b/inst/help_files/Genomic_Diversity_res.Rmd new file mode 100644 index 0000000..752af92 --- /dev/null +++ b/inst/help_files/Genomic_Diversity_res.Rmd @@ -0,0 +1,6 @@ +--- +title: "Genomic_Diversity_res" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Genomic_Prediction_cite.Rmd b/inst/help_files/Genomic_Prediction_cite.Rmd new file mode 100644 index 0000000..55097f3 --- /dev/null +++ b/inst/help_files/Genomic_Prediction_cite.Rmd @@ -0,0 +1,6 @@ +--- +title: "Genomic_Prediction_cite" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Genomic_Prediction_par.Rmd b/inst/help_files/Genomic_Prediction_par.Rmd new file mode 100644 index 0000000..b4b18cb --- /dev/null +++ b/inst/help_files/Genomic_Prediction_par.Rmd @@ -0,0 +1,6 @@ +--- +title: "Genomic_Prediction_par" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Genomic_Prediction_res.Rmd b/inst/help_files/Genomic_Prediction_res.Rmd new file mode 100644 index 0000000..c48e1ed --- /dev/null +++ b/inst/help_files/Genomic_Prediction_res.Rmd @@ -0,0 +1,6 @@ +--- +title: "Genomic_Prediction_res" +output: html_document +date: "2024-08-29" 
+--- + diff --git a/inst/help_files/PCA_cite.Rmd b/inst/help_files/PCA_cite.Rmd new file mode 100644 index 0000000..45d5f7f --- /dev/null +++ b/inst/help_files/PCA_cite.Rmd @@ -0,0 +1,6 @@ +--- +title: "PCA_cite" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/PCA_par.Rmd b/inst/help_files/PCA_par.Rmd new file mode 100644 index 0000000..61a0f27 --- /dev/null +++ b/inst/help_files/PCA_par.Rmd @@ -0,0 +1,6 @@ +--- +title: "PCA_par" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/PCA_res.Rmd b/inst/help_files/PCA_res.Rmd new file mode 100644 index 0000000..1f2f534 --- /dev/null +++ b/inst/help_files/PCA_res.Rmd @@ -0,0 +1,6 @@ +--- +title: "PCA_res" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Predictive_Ability_cite.Rmd b/inst/help_files/Predictive_Ability_cite.Rmd new file mode 100644 index 0000000..92d6654 --- /dev/null +++ b/inst/help_files/Predictive_Ability_cite.Rmd @@ -0,0 +1,6 @@ +--- +title: "Predictive_Ability_cite" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Predictive_Ability_par.Rmd b/inst/help_files/Predictive_Ability_par.Rmd new file mode 100644 index 0000000..c3eab0f --- /dev/null +++ b/inst/help_files/Predictive_Ability_par.Rmd @@ -0,0 +1,6 @@ +--- +title: "Predictive_Ability_par" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Predictive_Ability_res.Rmd b/inst/help_files/Predictive_Ability_res.Rmd new file mode 100644 index 0000000..14e1952 --- /dev/null +++ b/inst/help_files/Predictive_Ability_res.Rmd @@ -0,0 +1,6 @@ +--- +title: "Predictive_Ability_res" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Updog_Dosage_Calling_cite.Rmd b/inst/help_files/Updog_Dosage_Calling_cite.Rmd new file mode 100644 index 0000000..9004582 --- /dev/null +++ b/inst/help_files/Updog_Dosage_Calling_cite.Rmd @@ -0,0 +1,6 @@ +--- +title: "Updog_Dosage_Calling_cite" +output: 
html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/Updog_Dosage_Calling_par.Rmd b/inst/help_files/Updog_Dosage_Calling_par.Rmd new file mode 100644 index 0000000..9ea5f87 --- /dev/null +++ b/inst/help_files/Updog_Dosage_Calling_par.Rmd @@ -0,0 +1,14 @@ +--- +title: "Updog_Dosage_Calling_par" +output: html_document +date: "2024-08-29" +--- + +:hammer: Under development + +About Population Models: +Model: What form should the prior (genotype distribution) take? +The following information is from the Updog manual: +Possible values of the genotype distribution (values of model) are: +`norm` A distribution whose genotype frequencies are proportional to the density value of a normal with some mean and some standard deviation. Unlike the `bb` and `hw` options, this will allow for distributions both more and less dispersed than a binomial. This seems to be the most robust to violations in modeling assumptions, and so is the default. This prior class was developed in Gerard and Ferrao (2020). `hw` A binomial distribution that results from assuming that the population is in Hardy-Weinberg equilibrium (HWE). This actually does pretty well even when there are minor to moderate deviations from HWE. Though it does not perform as well as the `norm` option when there are severe deviations from HWE. `bb` A beta-binomial distribution. This is an overdispersed version of `hw` and can be derived from a special case of the Balding-Nichols model. `s1` This prior assumes the individuals are all full-siblings resulting from one generation of selfing. I.e. there is only one parent. This model assumes a particular type of meiotic behavior: polysomic inheritance with bivalent, non-preferential pairing. +`f1` This prior assumes the individuals are all full-siblings resulting from one generation of a bi-parental cross. This model assumes a particular type of meiotic behavior: polysomic inheritance with bivalent, non-preferential pairing. 
`f1pp` This prior allows for double reduction and preferential pairing in an F1 population of tetraploids. `s1pp` This prior allows for double reduction and preferential pairing in an S1 population of tetraploids. `flex` Generically any categorical distribution. Theoretically, this works well if you have a lot of individuals. In practice, it seems to be much less robust to violations in modeling assumptions. `uniform` A discrete uniform distribution. This should never be used in practice. diff --git a/inst/help_files/Updog_Dosage_Calling_res.Rmd b/inst/help_files/Updog_Dosage_Calling_res.Rmd new file mode 100644 index 0000000..d7806fa --- /dev/null +++ b/inst/help_files/Updog_Dosage_Calling_res.Rmd @@ -0,0 +1,5 @@ +--- +title: "Updog_Dosage_Calling_res" +output: html_document +date: "2024-08-29" +--- diff --git a/inst/help_files/VCF_Filtering_cite.Rmd b/inst/help_files/VCF_Filtering_cite.Rmd new file mode 100644 index 0000000..0699277 --- /dev/null +++ b/inst/help_files/VCF_Filtering_cite.Rmd @@ -0,0 +1,6 @@ +--- +title: "VCF_Filtering" +output: html_document +date: "2024-08-29" +--- + diff --git a/inst/help_files/VCF_Filtering_par.Rmd b/inst/help_files/VCF_Filtering_par.Rmd new file mode 100644 index 0000000..2abbd30 --- /dev/null +++ b/inst/help_files/VCF_Filtering_par.Rmd @@ -0,0 +1,5 @@ +--- +title: "VCF_Filtering_par" +output: html_document +date: "2024-08-29" +--- diff --git a/inst/help_files/VCF_Filtering_res.Rmd b/inst/help_files/VCF_Filtering_res.Rmd new file mode 100644 index 0000000..e3430b6 --- /dev/null +++ b/inst/help_files/VCF_Filtering_res.Rmd @@ -0,0 +1,6 @@ +--- +title: "VCF_Filtering_res" +output: html_document +date: "2024-08-29" +--- + From 00abaf6bc6f87c7e0f2b0708c4bbb51319c1bf0f Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:38:23 -0400 Subject: [PATCH 14/40] Added A,G, and H mat for GBLUP --- NAMESPACE | 3 + R/mod_GSAcc.R | 215 
++++++++++++++++++++++++++++++++++++++++++-------- 2 files changed, 187 insertions(+), 31 deletions(-) diff --git a/NAMESPACE b/NAMESPACE index 36375a3..c99ddfa 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -14,7 +14,9 @@ import(updog) import(utils) import(vcfR) importClassesFrom(adegenet,genlight) +importFrom(AGHmatrix,Amatrix) importFrom(AGHmatrix,Gmatrix) +importFrom(AGHmatrix,Hmatrix) importFrom(BIGr,updog2vcf) importFrom(DT,DTOutput) importFrom(DT,datatable) @@ -77,6 +79,7 @@ importFrom(plotly,renderPlotly) importFrom(purrr,map) importFrom(purrr,set_names) importFrom(rrBLUP,A.mat) +importFrom(rrBLUP,kin.blup) importFrom(rrBLUP,mixed.solve) importFrom(scales,comma_format) importFrom(shiny,NS) diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 7d3c735..6e81d7e 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -23,11 +23,10 @@ mod_GSAcc_ui <- function(id){ fileInput(ns("pred_file"), "Choose VCF File", accept = c(".csv",".vcf",".gz")), fileInput(ns("trait_file"), "Choose Passport File", accept = ".csv"), numericInput(ns("pred_ploidy"), "Species Ploidy", min = 1, value = NULL), - selectInput(inputId = ns('pred_model'), label = 'Model Choice', choices = c("rrBLUP","GBLUP"), selected = "rrBLUP"), numericInput(ns("pred_cv"), "Iterations", min = 1, max=20, value = 5), virtualSelectInput( inputId = ns("pred_trait_info"), - label = "Select Trait (eg, Color):", + label = "Select Trait(s):", choices = NULL, showValueAsTags = TRUE, search = TRUE, @@ -55,18 +54,23 @@ mod_GSAcc_ui <- function(id){ ) ), actionButton(ns("prediction_start"), "Run Analysis"), - div(style="display:inline-block; float:right",dropdownButton( + div(style="display:inline-block; float:right", dropdownButton( tags$h3("GP Parameters"), "You can download examples of the expected input input files here: \n", downloadButton(ns('download_vcf'), "Download VCF Example File"), downloadButton(ns('download_pheno'), "Download Passport Example File"), - #"GP uses the rrBLUP package: It can impute missing data, adapt to 
different ploidy, perform 5-fold cross validations with different number of iterations, run multiple traits, and accept multiple fixed effects.", circle = FALSE, status = "warning", icon = icon("info"), width = "300px", tooltip = tooltipOptions(title = "Click to see info!") - )) - + )), + tags$hr(style="border-color: #d3d3d3; margin-top: 20px; margin-bottom: 20px;"), # Lighter grey line + div(style="text-align: left; margin-top: 10px;", + actionButton(ns("advanced_options"), + label = HTML(paste(icon("cog", style = "color: #007bff;"), "Advanced Options")), + style = "background-color: transparent; border: none; color: #007bff; font-size: smaller; text-decoration: underline; padding: 0;" + ) + ) ) ), @@ -124,6 +128,7 @@ mod_GSAcc_ui <- function(id){ #' @importFrom rrBLUP mixed.solve A.mat kin.blup #' @importFrom stats cor #' @importFrom shinyalert shinyalert +#' @importFrom AGHmatrix Gmatrix Amatrix Hmatrix #' @import dplyr #' @import ggplot2 #' @import tidyr @@ -131,13 +136,83 @@ mod_GSAcc_ui <- function(id){ mod_GSAcc_server <- function(id){ moduleServer( id, function(input, output, session){ ns <- session$ns + + #Default model choices + advanced_options <- reactiveValues( + pred_model = "rrBLUP", + pred_matrix = "Gmatrix", + ped_file = NULL + ) + + #List the ped file name if previously uploaded + output$uploaded_file_name <- renderText({ + if (!is.null(advanced_options$ped_file)) { + paste("Previously uploaded file:", advanced_options$ped_file$name) + } else { + "" # Return an empty string if no file has been uploaded + } + }) + + print("check1") + #UI popup window for input + observeEvent(input$advanced_options, { + showModal(modalDialog( + title = "Advanced Options (beta)", + selectInput( + inputId = ns('pred_model'), + label = 'Model Choice', + choices = c("rrBLUP", "GBLUP"), + selected = advanced_options$pred_model # Initialize with stored value + ), + conditionalPanel( + condition = "input.pred_model == 'GBLUP'", ns = ns, + div( + selectInput( + inputId = 
ns('pred_matrix'), + label = 'GBLUP Matrix Choice', + choices = c("Gmatrix", "Amatrix", "Hmatrix"), + selected = advanced_options$pred_matrix # Initialize with stored value + ) + ) + ), + conditionalPanel( + condition = "input.pred_matrix != 'Gmatrix'", ns = ns, + div( + fileInput(ns("ped_file"), "Choose Pedigree File", accept = ".csv"), + conditionalPanel( + condition = "output.uploaded_file_name !== ''", # Show only if there's content + textOutput(ns("uploaded_file_name")) # Display the uploaded file name + ) + ) + ), + footer = tagList( + modalButton("Close"), + actionButton(ns("save_advanced_options"), "Save") + ) + )) + }) + + + + #Close popup window when user "saves options" + observeEvent(input$save_advanced_options, { + advanced_options$pred_model <- input$pred_model + advanced_options$pred_matrix <- input$pred_matrix + advanced_options$ped_file <- input$ped_file + # Save other inputs as needed + + removeModal() # Close the modal after saving + }) + + + ####Genomic Prediction Accuracy #This tab involved 3 observeEvents #1) to get the traits listed in the phenotype file #2) to input and validate the input files #3) to perform the genomic prediction - + print("check2") #1) Get traits observeEvent(input$trait_file, { info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) @@ -147,7 +222,7 @@ mod_GSAcc_server <- function(id){ updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) }) - + print("check3") #2) Error check for prediction and save input files continue_prediction <- reactiveVal(NULL) pred_inputs <- reactiveValues( @@ -259,7 +334,7 @@ mod_GSAcc_server <- function(id){ } - + print("check4") #Getting genotype matrix #Geno file path @@ -310,7 +385,7 @@ mod_GSAcc_server <- function(id){ closeOnEsc = TRUE, closeOnClickOutside = FALSE, html = TRUE, - type = "info", + type = "warning", showConfirmButton = TRUE, confirmButtonText = "OK", confirmButtonCol = "#004192", @@ -322,7 +397,7 @@ 
mod_GSAcc_server <- function(id){ #Stop the analysis return() } - + print("check5") #Save number of samples in file pred_inputs$pred_genos <- ncol(geno) @@ -352,7 +427,7 @@ mod_GSAcc_server <- function(id){ #return() } - + print("check6") # Function to convert genotype matrix according to ploidy convert_genotype <- function(genotype_matrix, ploidy) { normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 @@ -395,7 +470,7 @@ mod_GSAcc_server <- function(id){ # Stop the observeEvent gracefully return() - + print("check7") } else if (length(common_ids) < length(colnames_geno)) { # If condition is met, show notification toast shinyalert( @@ -452,18 +527,23 @@ mod_GSAcc_server <- function(id){ } ) - + print("check8") # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs geno_adj <- geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs pheno <- pheno[match(common_ids, ids_pheno), ] - #Save to reactive values + ##Save to reactive values + #Gmatrix needs the original allele count values, so the user matrix selection determines the genotype matrix used pred_inputs$pheno_input <- pheno - #pred_inputs$geno_adj_input <- geno_adj - pred_inputs$geno_input <- geno_adj - + if (advanced_options$pred_matrix == "Gmatrix" || is.null(advanced_options$pred_matrix)) { + pred_inputs$geno_input <- geno_adj + } else if (advanced_options$pred_matrix == "Hmatrix") { + pred_inputs$geno_input <- geno[, common_ids] + } else { + pred_inputs$geno_input <- geno_adj + } }) - + print("check9") #3) Analysis only proceeds once continue_prediction is converted to TRUE observe({ @@ -473,7 +553,7 @@ mod_GSAcc_server <- function(id){ if (isFALSE(continue_prediction())) { return() } - + print("check10") #Variables ploidy <- as.numeric(input$pred_ploidy) geno_adj <- pred_inputs$geno_input @@ -502,7 +582,7 @@ mod_GSAcc_server <- function(id){ #Control whether rrBLUP or GBLUP run depending on user input #Note, should add the GP functions 
to the utils.R file and then call them here... - if (input$pred_model == "rrBLUP"){ + if (advanced_options$pred_model == "rrBLUP"){ ##Need to add ability for the use of parallelism for the for cross-validation ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays @@ -533,7 +613,7 @@ mod_GSAcc_server <- function(id){ #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) pb_value = 10 - + print("check11") #Remove the fixed traits from the Pheno file if (length(fixed_traits) == 0) { Pheno <- Pheno @@ -542,16 +622,16 @@ mod_GSAcc_server <- function(id){ Fixed <- subset(Pheno, select = fixed_traits) #Pheno <- subset(Pheno, select = -fixed_traits) - convert_all_to_factor_if_not_numeric <- function(df) { + convert_categorical_to_factor <- function(df, fixed_cat) { for (col in names(df)) { - if (!is.numeric(df[[col]]) && !is.integer(df[[col]])) { + if (col %in% fixed_cat) { df[[col]] <- as.factor(df[[col]]) } } return(df) } # Convert all columns to factor if they are not numeric or integer - Fixed <- convert_all_to_factor_if_not_numeric(Fixed) + Fixed <- convert_categorical_to_factor(Fixed, fixed_cat) #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor row.names(Fixed) <- row.names(Pheno) @@ -610,7 +690,7 @@ mod_GSAcc_server <- function(id){ row.names(Fixed_test) <- test } - + print("check12") Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set m_train <- geno[train, ] Pheno_test <- Pheno[test, ] @@ -654,7 +734,7 @@ mod_GSAcc_server <- function(id){ #Add iter and fold information for each trait/result heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - + print("check13") #Add sample, iteration, and fold information to GEBVs_fold GEBVs_fold[,"Iter"] = r GEBVs_fold[,"Fold"] = fold @@ -725,10 +805,83 @@ mod_GSAcc_server <- function(id){ #Cross validation 
number for progress bar (not involved in the calculations, just shiny visuals) pb_value = 10 - #Convert normalized genotypes to relationship matrix - #By default, it removes SNPs with more than 50% missing data and imputes using the mean - Geno.mat <- A.mat(t(pred_inputs$geno_input)) - + if (advanced_options$pred_matrix == "Gmatrix") { + #Convert normalized genotypes to relationship matrix + #By default, it removes SNPs with more than 50% missing data and imputes using the mean + Geno.mat <- A.mat(t(pred_inputs$geno_input)) + print("check14") + }else if (advanced_options$pred_matrix == "Amatrix") { + + #Import pedigree file, where pedigree data name (3-column way format). Unknown value should be equal 0 + ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "Sire", "Dam") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + #Ensure Sire and Dam are also listed as individuals + missing_parents <- unique(c(ped$Sire, ped$Dam)) + # Filter out parents already listed as individuals and non-zero values + missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] + # Create new rows for missing parents and setting their parents to 0 (unknown) + new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) + # Combine the original dataframe with the new rows and remove duplicates + ped_extended <- unique(rbind(ped, new_rows)) + + #Converting to Amatrix + #Using the default additive relationship options (Amatrix only works for even numbered ploidy) + Geno.mat <- Amatrix(data = ped_extended, ploidy = ploidy) + + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) + pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) + valid_ids <- intersect(pheno_ids, rownames(Geno.mat)) + pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] + Geno.mat <- Geno.mat[valid_ids, valid_ids] + + #Update 
variable + total_population <- ncol(Geno.mat) + print("check15") + }else if (advanced_options$pred_matrix == "Hmatrix") { + print("check16") + #Import pedigree file, where pedigree data name (3-column way format). Unknown value should be equal 0 + ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "Sire", "Dam") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + #Ensure Sire and Dam are also listed as individuals + missing_parents <- unique(c(ped$Sire, ped$Dam)) + # Filter out parents already listed as individuals and non-zero values + missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] + # Create new rows for missing parents and setting their parents to 0 (unknown) + new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) + # Combine the original dataframe with the new rows and remove duplicates + ped_extended <- unique(rbind(ped, new_rows)) + + #Converting to Amatrix + #Using the default additive relationship options (Amatrix only works for even numbered ploidy) + Ped.mat <- Amatrix(data = ped_extended, ploidy = ploidy) + + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) + pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) + valid_ids <- intersect(pheno_ids, rownames(Ped.mat)) + pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] + Ped.mat <- Ped.mat[valid_ids, valid_ids] + + #Update variable + total_population <- ncol(Ped.mat) + + #Using Gmatrix to get the Gmatrix instead of A.mat for consistency + #Should I be using the raw dosage values or is it okay to use the scaled genotype data that is used for A.mat()? 
+ G.mat <- Gmatrix(t(pred_inputs$geno_input[ ,valid_ids]), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") + G.mat <- round(G.mat,3) #to be easy to invert + + #Computing H matrix (Martini) - Using the name Geno.mat for consistency + Geno.mat <- Hmatrix(A=Ped.mat, G=G.mat, method="Martini", + ploidy= ploidy, + maf=0.05) + #Clean memory + rm(G.mat) + rm(Ped.mat) + rm(ped_filtered) + } # Establish accuracy results matrix results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits From a5e94ef91c0cc548f93e223bd445b05d8d622178 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 30 Aug 2024 14:41:37 -0400 Subject: [PATCH 15/40] Updated AO label --- R/mod_GSAcc.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 6e81d7e..9b60c0b 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -67,7 +67,7 @@ mod_GSAcc_ui <- function(id){ tags$hr(style="border-color: #d3d3d3; margin-top: 20px; margin-bottom: 20px;"), # Lighter grey line div(style="text-align: left; margin-top: 10px;", actionButton(ns("advanced_options"), - label = HTML(paste(icon("cog", style = "color: #007bff;"), "Advanced Options")), + label = HTML(paste(icon("cog", style = "color: #007bff;"), "Advanced Options (beta)")), style = "background-color: transparent; border: none; color: #007bff; font-size: smaller; text-decoration: underline; padding: 0;" ) ) From 5b503c1395768b78edebab93f2cda80f2b258984 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 3 Sep 2024 17:04:32 -0400 Subject: [PATCH 16/40] GS review (in progress) --- R/mod_GSAcc.R | 426 ++++++---------------------- R/utils.R | 257 +++++++++++++++++ tests/testthat/test-GSAcc.R | 546 ++++++++++++++++++++++++++++++++++++ 3 files changed, 895 insertions(+), 334 deletions(-) create mode 100644 tests/testthat/test-GSAcc.R 
diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index defb28f..abbdf7c 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -66,7 +66,7 @@ mod_GSAcc_ui <- function(id){ )), tags$hr(style="border-color: #d3d3d3; margin-top: 20px; margin-bottom: 20px;"), # Lighter grey line div(style="text-align: left; margin-top: 10px;", - actionButton(ns("advanced_options"), + actionButton(ns("advanced_options"), label = HTML(paste(icon("cog", style = "color: #007bff;"), "Advanced Options (beta)")), style = "background-color: transparent; border: none; color: #007bff; font-size: smaller; text-decoration: underline; padding: 0;" ) @@ -136,14 +136,14 @@ mod_GSAcc_ui <- function(id){ mod_GSAcc_server <- function(input, output, session, parent_session){ ns <- session$ns - + #Default model choices advanced_options <- reactiveValues( pred_model = "rrBLUP", pred_matrix = "Gmatrix", ped_file = NULL ) - + #List the ped file name if previously uploaded output$uploaded_file_name <- renderText({ if (!is.null(advanced_options$ped_file)) { @@ -152,25 +152,24 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ "" # Return an empty string if no file has been uploaded } }) - - print("check1") + #UI popup window for input observeEvent(input$advanced_options, { showModal(modalDialog( title = "Advanced Options (beta)", selectInput( - inputId = ns('pred_model'), - label = 'Model Choice', - choices = c("rrBLUP", "GBLUP"), + inputId = ns('pred_model'), + label = 'Model Choice', + choices = c("rrBLUP", "GBLUP"), selected = advanced_options$pred_model # Initialize with stored value ), conditionalPanel( condition = "input.pred_model == 'GBLUP'", ns = ns, div( selectInput( - inputId = ns('pred_matrix'), - label = 'GBLUP Matrix Choice', - choices = c("Gmatrix", "Amatrix", "Hmatrix"), + inputId = ns('pred_matrix'), + label = 'GBLUP Matrix Choice', + choices = c("Gmatrix", "Amatrix", "Hmatrix"), selected = advanced_options$pred_matrix # Initialize with stored value ) ) @@ -191,28 +190,27 @@ 
mod_GSAcc_server <- function(input, output, session, parent_session){ ) )) }) - - - + + + #Close popup window when user "saves options" observeEvent(input$save_advanced_options, { advanced_options$pred_model <- input$pred_model advanced_options$pred_matrix <- input$pred_matrix advanced_options$ped_file <- input$ped_file # Save other inputs as needed - + removeModal() # Close the modal after saving }) - - - + + + ####Genomic Prediction Accuracy #This tab involved 3 observeEvents #1) to get the traits listed in the phenotype file #2) to input and validate the input files #3) to perform the genomic prediction - print("check2") #1) Get traits observeEvent(input$trait_file, { info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) @@ -222,7 +220,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) }) - print("check3") + #2) Error check for prediction and save input files continue_prediction <- reactiveVal(NULL) pred_inputs <- reactiveValues( @@ -270,7 +268,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ "green" = "#00BA38", input$pred_color_select) }) - + observeEvent(input$pred_fixed_info, { updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) }) @@ -334,70 +332,16 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ } - print("check4") #Getting genotype matrix #Geno file path file_path <- geno_path #Geno.file conversion if needed - if (grepl("\\.csv$", file_path)) { - geno <- read.csv(geno_path, header = TRUE, row.names = 1, check.names = FALSE) - - #Save number of SNPs - pred_inputs$pred_snps <- nrow(geno) - - } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { - - #Function to convert GT to dosage calls (add to BIGr) - convert_to_dosage <- function(gt) { - # Split the genotype string - alleles <- strsplit(gt, "[|/]") - # Sum the 
alleles, treating NA values appropriately - sapply(alleles, function(x) { - if (any(is.na(x))) { - return(NA) - } else { - return(sum(as.numeric(x), na.rm = TRUE)) - } - }) - } - - #Convert VCF file if submitted - vcf <- vcfR::read.vcfR(file_path) - - #Get number of SNPs - pred_inputs$pred_snps <- nrow(vcf) - - #Extract GT - geno <- extract.gt(vcf, element = "GT") - geno <- apply(geno, 2, convert_to_dosage) - class(geno) <- "numeric" - rm(vcf) + geno_snps <- read_geno_file(geno_path, requires = "GT") + geno <- geno_snps[[1]] + pred_inputs$pred_snps <- geno_snps[[2]] - } else { - - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "No valid genotype file detected", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - #Stop the analysis - return() - } - print("check5") #Save number of samples in file pred_inputs$pred_genos <- ncol(geno) @@ -427,15 +371,8 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ #return() } - print("check6") - # Function to convert genotype matrix according to ploidy - convert_genotype <- function(genotype_matrix, ploidy) { - normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 - return(normalized_matrix) - } - - #tranforming genotypes - geno_adj_init <- convert_genotype(geno, as.numeric(ploidy)) + # Convert genotype matrix according to ploidy + geno_adj_init <- 2 * (geno / as.numeric(ploidy)) - 1 #Make sure the trait file and genotype file are in the same order # Column names for geno (assuming these are the individual IDs) @@ -470,7 +407,6 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ # Stop the observeEvent gracefully return() - print("check7") } else if (length(common_ids) < length(colnames_geno)) { # If condition is met, show notification 
toast shinyalert( @@ -527,7 +463,6 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ } ) - print("check8") # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs geno_adj <- geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs pheno <- pheno[match(common_ids, ids_pheno), ] @@ -543,7 +478,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ pred_inputs$geno_input <- geno_adj } }) - print("check9") + #3) Analysis only proceeds once continue_prediction is converted to TRUE observe({ @@ -553,7 +488,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ if (isFALSE(continue_prediction())) { return() } - print("check10") + #Variables ploidy <- as.numeric(input$pred_ploidy) geno_adj <- pred_inputs$geno_input @@ -579,239 +514,64 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ } else{ pred_outputs$colors <- input$pred_color_select } - + #Control whether rrBLUP or GBLUP run depending on user input #Note, should add the GP functions to the utils.R file and then call them here... 
if (advanced_options$pred_model == "rrBLUP"){ ##Need to add ability for the use of parallelism for the for cross-validation ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays - - # Function to perform genomic prediction - ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) - genomic_prediction <- function(geno, Pheno, traits, fixed_effects = NULL, Fold = 5, Iters = 5, cores = 1) { - - # Define variables - traits <- traits - cycles <- as.numeric(Iters) - Folds <- as.numeric(Fold) - total_population <- ncol(geno) - #train_size <- floor(percentage / 100 * total_population) - fixed_traits <- fixed_effects - cores <- as.numeric(cores) - - # Establish accuracy results matrix - results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) - colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits - - # Initialize a list to store GEBVs for all traits and cycles - GEBVs <- list() - - #Establish heritability_scores_df () Maybe get h2 values - # Establish results matrix - heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) - colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits - - #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) - pb_value = 10 - print("check11") - #Remove the fixed traits from the Pheno file - if (length(fixed_traits) == 0) { - Pheno <- Pheno - } else { - #Subset fixed traits - Fixed <- subset(Pheno, select = fixed_traits) - - #Pheno <- subset(Pheno, select = -fixed_traits) - convert_categorical_to_factor <- function(df, fixed_cat) { - for (col in names(df)) { - if (col %in% fixed_cat) { - df[[col]] <- as.factor(df[[col]]) - } - } - return(df) - } - # Convert all columns to factor if they are not numeric or integer - Fixed <- 
convert_categorical_to_factor(Fixed, fixed_cat) - - #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor - row.names(Fixed) <- row.names(Pheno) - - #Make the matrix - formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) - formula <- as.formula(formula_str) - - # Create the design matrix using the constructed formula - Fixed <- model.matrix(formula, data = Fixed) - } - - #Make kinship matrix of all individuals? - #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy - #If wanting to use Kkinship matrix, will then need to see how to implement it here - - #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). - impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) - geno <- impute$imputed - - # For loop - for (r in 1:cycles) { - set.seed(r) - fold_ids <- sample(rep(1:Folds, length.out = total_population)) - fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold - fold_results <- matrix(nrow = Folds, ncol = length(traits)) - colnames(fold_results) <- traits - - #Initialize GEBV object for each cycle - GEBVs_cycle <-list() - - #Status - updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) - - for (fold in 1:Folds) { - - #Status bar length - pb_value = pb_value + (70 / as.numeric(cycles*Folds)) - - train <- fold_df %>% - dplyr::filter(FoldID != fold) %>% - pull(Sample) - test <- setdiff(row.names(geno),train) - - #Subset datasets - if (length(fixed_traits) == 0) { - Fixed_train = NULL - } else{ - Fixed_train <- data.frame(Fixed[train, ]) - Fixed_train <- as.matrix(Fixed_train) - row.names(Fixed_train) <- train - - #Fixed (testing) - Fixed_test<- data.frame(Fixed[test, ]) - Fixed_test <- as.matrix(Fixed_test) - 
row.names(Fixed_test) <- test - - } - print("check12") - Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set - m_train <- geno[train, ] - Pheno_test <- Pheno[test, ] - #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? - m_valid <- geno[test, ] - - # Initialize a matrix to store GEBVs for this fold - GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) - colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") - rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") - - #Evaluate each trait using the same train and testing samples for each - for (trait_idx in 1:length(traits)) { - trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest - trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) - TRT <- trait_answer$u - e <- as.matrix(TRT) - pred_trait_test <- m_valid %*% e - pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits - trait_test <- Pheno_test[, traits[trait_idx]] - results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") - results[(((r-1)*5)+fold), (length(traits)+1)] <- r - results[(((r-1)*5)+fold), (length(traits)+2)] <- fold - - # Extract GEBVs - # Check if Fixed_train is not NULL and include beta if it is - if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { - # Calculate GEBVs including fixed effects - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta - } else { - # Calculate GEBVs without fixed effects - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to calculate the GEBVs for testing group from the trained model - } - - # Calculate heritability for the current trait - Vu <- trait_answer$Vu - Ve <- trait_answer$Ve - heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) - - } - #Add iter and fold 
information for each trait/result - heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r - heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - print("check13") - #Add sample, iteration, and fold information to GEBVs_fold - GEBVs_fold[,"Iter"] = r - GEBVs_fold[,"Fold"] = fold - GEBVs_fold[,"Sample"] <- test - - # Store GEBVs for this fold - GEBVs_cycle[[fold]] <- GEBVs_fold - - } - - # Store GEBVs for this cycle - GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) - - } - - # Combine all GEBVs into a single DataFrame - GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) - - results <- as.data.frame(results) - heritability_scores <- as.data.frame(heritability_scores) - - # Combine results and heritability_scores using cbind - combined_results <- cbind(results, heritability_scores) - - return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) - } - + # Example call to the function #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... - results <- genomic_prediction(geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, Iters = input$pred_cv, cores = cores) - + results <- genomic_prediction(geno_adj, pheno, + traits = traits, + fixed_effects = fixed_traits, + iters = input$pred_cv, + cores = cores) + #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) 
#results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") - + #Save to reactive value pred_outputs$corr_output <- results$PredictionAccuracy pred_outputs$all_GEBVs <- results$GEBVs - + # Convert trait columns to numeric results$GEBVs <- results$GEBVs %>% mutate(across(all_of(traits), ~ as.numeric(.x))) - + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold average_gebvs_df <- results$GEBVs %>% group_by(Sample) %>% summarize(across(all_of(traits), mean, na.rm = TRUE)) - + pred_outputs$avg_GEBVs <- average_gebvs_df - + columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) average_accuracy_df <- results$PredictionAccuracy %>% group_by(Iter) %>% summarize(across(all_of(columns), mean, na.rm = TRUE)) - - + pred_outputs$comb_output <- average_accuracy_df - + }else{ #Note: should wrap the GBLUP into a function too # Define variables - traits <- traits cycles <- input$pred_cv Folds <- 5 total_population <- ncol(pred_inputs$geno_input) #train_size <- floor(percentage / 100 * total_population) - fixed_traits <- fixed_traits cores <- as.numeric(cores) #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) pb_value = 10 - + if (advanced_options$pred_matrix == "Gmatrix") { #Convert normalized genotypes to relationship matrix #By default, it removes SNPs with more than 50% missing data and imputes using the mean Geno.mat <- A.mat(t(pred_inputs$geno_input)) - print("check14") + }else if (advanced_options$pred_matrix == "Amatrix") { - + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") colnames(ped) <- c("Ind", "Sire", "Dam") @@ -825,22 +585,22 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) # Combine the original dataframe with the new rows and remove duplicates ped_extended <- unique(rbind(ped, new_rows)) - + #Converting to Amatrix #Using the default additive relationship options (Amatrix only works for even numbered ploidy) Geno.mat <- Amatrix(data = ped_extended, ploidy = ploidy) - + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) valid_ids <- intersect(pheno_ids, rownames(Geno.mat)) pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] Geno.mat <- Geno.mat[valid_ids, valid_ids] - + #Update variable total_population <- ncol(Geno.mat) - print("check15") + }else if (advanced_options$pred_matrix == "Hmatrix") { - print("check16") + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") colnames(ped) <- c("Ind", "Sire", "Dam") @@ -854,28 +614,28 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) # Combine the original dataframe with the new rows and remove duplicates ped_extended <- unique(rbind(ped, new_rows)) - + #Converting to Amatrix #Using the default additive relationship options (Amatrix only works for even numbered ploidy) Ped.mat <- Amatrix(data = ped_extended, ploidy = ploidy) - + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) valid_ids <- intersect(pheno_ids, rownames(Ped.mat)) pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] Ped.mat <- Ped.mat[valid_ids, valid_ids] - + #Update variable total_population <- ncol(Ped.mat) - + #Using Gmatrix to get the Gmatrix instead of A.mat for consistency #Should I be using the raw dosage values or is it okay to use the scaled genotype data that is used for A.mat()? 
G.mat <- Gmatrix(t(pred_inputs$geno_input[ ,valid_ids]), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") G.mat <- round(G.mat,3) #to be easy to invert - + #Computing H matrix (Martini) - Using the name Geno.mat for consistency - Geno.mat <- Hmatrix(A=Ped.mat, G=G.mat, method="Martini", - ploidy= ploidy, + Geno.mat <- Hmatrix(A=Ped.mat, G=G.mat, method="Martini", + ploidy= ploidy, maf=0.05) #Clean memory rm(G.mat) @@ -885,16 +645,16 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ # Establish accuracy results matrix results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits - + # Initialize a list to store GEBVs for all traits and cycles GEBVs <- list() - + #Establish heritability_scores_df () Maybe get h2 values # Establish results matrix heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits - - + + # For loop for (r in 1:cycles) { set.seed(r) @@ -902,31 +662,31 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ fold_df <- data.frame(Sample = row.names(Geno.mat), FoldID = fold_ids) #Randomly assign each sample to a fold fold_results <- matrix(nrow = Folds, ncol = length(traits)) colnames(fold_results) <- traits - + #Initialize GEBV object for each cycle GEBVs_cycle <-list() - + #Status updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) - + for (fold in 1:Folds) { - + #Status bar length pb_value = pb_value + (70 / as.numeric(cycles*Folds)) - + #Subset training and testing samples train <- fold_df %>% dplyr::filter(FoldID != fold) %>% pull(Sample) test <- setdiff(row.names(Geno.mat),train) - + Fixed_train = NULL - + # Initialize a matrix to store GEBVs for this fold 
GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") - + #Evaluate each trait using the same train and testing samples for each for (trait_idx in 1:length(traits)) { #Mask phenotypes in testing group @@ -938,66 +698,66 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ results[(((r-1)*5)+fold), trait_idx] <- cor(pred_inputs$pheno_input[test, traits[trait_idx]], traitpred$g[test], use = "complete.obs") results[(((r-1)*5)+fold), (length(traits)+1)] <- r results[(((r-1)*5)+fold), (length(traits)+2)] <- fold - + # Extract GEBVs GEBVs_fold[, trait_idx] <- traitpred$g[test] #Confirm it is accuract to calculate the GEBVs for testing group from the trained model - - + + # Calculate heritability (these are wrong) Vu <- traitpred$Vg Ve <- traitpred$Ve heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) - + } #Add iter and fold information for each trait/result heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - + #Add sample, iteration, and fold information to GEBVs_fold GEBVs_fold[,"Iter"] = r GEBVs_fold[,"Fold"] = fold GEBVs_fold[,"Sample"] <- test - + # Store GEBVs for this fold GEBVs_cycle[[fold]] <- GEBVs_fold - + } - + # Store GEBVs for this cycle GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) - + } - + # Combine all GEBVs into a single DataFrame GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) - + results <- as.data.frame(results) heritability_scores <- as.data.frame(heritability_scores) - + # Combine results and heritability_scores using cbind combined_results <- cbind(results, heritability_scores) - + #Save to reactive value pred_outputs$corr_output <- results pred_outputs$all_GEBVs <- results$GEBVs_df - + # Convert trait columns to numeric GEBVs <- GEBVs_df %>% mutate(across(all_of(traits), ~ as.numeric(.x))) - + 
# Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold average_gebvs_df <- GEBVs %>% group_by(Sample) %>% summarize(across(all_of(traits), mean, na.rm = TRUE)) - + pred_outputs$avg_GEBVs <- average_gebvs_df - + columns <- setdiff(colnames(results), c("Iter","Fold")) average_accuracy_df <- results %>% group_by(Iter) %>% summarize(across(all_of(columns), mean, na.rm = TRUE)) - - + + pred_outputs$comb_output <- average_accuracy_df } @@ -1035,7 +795,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ #This can be adapted if we start comparing more than one GP model #Also consider a violin plot to show each cor value #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + - plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + + pred_outputs$box_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + geom_boxplot() + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales @@ -1052,7 +812,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ axis.text.x.bottom = element_blank(), axis.ticks.x.bottom = element_blank()) - plot_violin <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + + pred_outputs$violin_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + geom_violin(trim = TRUE) + # Add violin plot geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales @@ -1067,18 +827,16 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ strip.text.x = element_text(face = "bold"), axis.text.x.bottom = element_blank(), axis.ticks.x.bottom = element_blank()) - - 
list(plot, plot_violin) }) #Output the genomic prediction correlation box plots output$pred_box_plot <- renderPlot({ - plots()[[1]] + scale_fill_manual(values = pred_outputs$colors) + pred_outputs$box_plot + scale_fill_manual(values = pred_outputs$colors) }) #Output the genomic prediction correlation box plots output$pred_violin_plot <- renderPlot({ - plots()[[2]] + scale_fill_manual(values = pred_outputs$colors) + pred_outputs$violin_plot + scale_fill_manual(values = pred_outputs$colors) }) #Output the prediction tables @@ -1209,4 +967,4 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ # mod_GSAcc_ui("GSAcc_1") ## To be copied in the server -# mod_GSAcc_server("GSAcc_1") \ No newline at end of file +# mod_GSAcc_server("GSAcc_1") diff --git a/R/utils.R b/R/utils.R index 811f485..03f6121 100644 --- a/R/utils.R +++ b/R/utils.R @@ -292,3 +292,260 @@ split_info_column <- function(info) { return(info_list) } + +#' Read geno file +#' +#' @param file_path character indicanting path to file +#' @param requires which information is required from the VCF. Define the FORMAT or INFO letters. 
Example: c("GT", "DP", "PL") +#' +#' @importFrom vcfR read.vcfR +#' @importFrom shinyalert shinyalert +#' +read_geno_file <- function(file_path, requires = c("GT")){ + if (grepl("\\.csv$", file_path)) { + geno <- read.csv(geno_path, header = TRUE, row.names = 1, check.names = FALSE) + n_snps <- nrow(geno) + return(list(geno, n_snps)) + + } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { + + #Convert VCF file if submitted + vcf <- read.vcfR(file_path) + + all_requires <- vector() + for(i in 1:length(requires)) all_requires[i] <- grepl(requires[i], vcf@fix[1,8]) | grepl(requires[i], vcf@gt[1,1]) + + if(!all(all_requires)) { + shinyalert( + title = "Oops", + text = paste("The VCF file does not contain required information:", requires[which(!all_requires)]), + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + return() + } + + n_snps <- nrow(vcf@gt) + + #Extract GT + geno <- extract.gt(vcf, element = "GT") + geno <- apply(geno, 2, convert_to_dosage) + class(geno) <- "numeric" + + return(list(geno, n_snps)) + } else { + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No valid genotype file detected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + + return() + } +} + + +#' Function to perform genomic prediction +#' +#' @param geno ToDo +#' @param pheno ToDo +#' @param traits ToDo +#' @param fixed_effects ToDo +#' @param fold ToDo +#' @param iters ToDo +#' @param cores ToDo +#' +genomic_prediction <- function(geno, pheno, traits, fixed_effects = NULL, fold = 5, iters = 5, cores = 1) { + + # 
Define variables + traits <- traits + cycles <- as.numeric(iters) + folds <- as.numeric(fold) + total_population <- ncol(geno) + #train_size <- floor(percentage / 100 * total_population) + fixed_traits <- fixed_effects + cores <- as.numeric(cores) + + # Establish accuracy results matrix + results <- matrix(nrow = cycles*folds, ncol = length(traits) + 2) + colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Establish heritability_scores_df () Maybe get h2 values + # Establish results matrix + heritability_scores <- matrix(nrow = cycles*folds, ncol = length(traits) + 2) + colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + #Remove the fixed traits from the pheno file + if (length(fixed_traits) == 0) { + pheno <- pheno + } else { + #Subset fixed traits + Fixed <- subset(pheno, select = fixed_traits) + + #pheno <- subset(pheno, select = -fixed_traits) + convert_categorical_to_factor <- function(df, fixed_cat) { + for (col in names(df)) { + if (col %in% fixed_cat) { + df[[col]] <- as.factor(df[[col]]) + } + } + return(df) + } + # Convert all columns to factor if they are not numeric or integer + Fixed <- convert_categorical_to_factor(Fixed, fixed_cat) + + #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor + row.names(Fixed) <- row.names(pheno) + + #Make the matrix + formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) + formula <- as.formula(formula_str) + + # Create the design matrix using the constructed formula + Fixed <- model.matrix(formula, data = Fixed) + } + + #Make kinship matrix of all individuals? 
+ #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy + #If wanting to use Kkinship matrix, will then need to see how to implement it here + + #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). + impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) + geno <- impute$imputed + + # For loop + for (r in 1:cycles) { + set.seed(r) + fold_ids <- sample(rep(1:folds, length.out = total_population)) + fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_results <- matrix(nrow = folds, ncol = length(traits)) + colnames(fold_results) <- traits + + #Initialize GEBV object for each cycle + GEBVs_cycle <-list() + + for (fold in 1:folds) { + + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*folds)) + + train <- fold_df %>% + dplyr::filter(FoldID != fold) %>% + pull(Sample) + test <- setdiff(row.names(geno),train) + + #Subset datasets + if (length(fixed_traits) == 0) { + Fixed_train = NULL + } else{ + Fixed_train <- data.frame(Fixed[train, ]) + Fixed_train <- as.matrix(Fixed_train) + row.names(Fixed_train) <- train + + #Fixed (testing) + Fixed_test<- data.frame(Fixed[test, ]) + Fixed_test <- as.matrix(Fixed_test) + row.names(Fixed_test) <- test + + } + + pheno_train <- pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set + m_train <- geno[train, ] + pheno_test <- pheno[test, ] + #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? 
+ m_valid <- geno[test, ] + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) + colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") + rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + trait <- pheno_train[, traits[trait_idx]] # Get the trait of interest + trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) + TRT <- trait_answer$u + e <- as.matrix(TRT) + pred_trait_test <- m_valid %*% e + pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits + trait_test <- pheno_test[, traits[trait_idx]] + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") + results[(((r-1)*5)+fold), (length(traits)+1)] <- r + results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + # Extract GEBVs + # Check if Fixed_train is not NULL and include beta if it is + if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { + # Calculate GEBVs including fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta + } else { + # Calculate GEBVs without fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to calculate the GEBVs for testing group from the trained model + } + + # Calculate heritability for the current trait + Vu <- trait_answer$Vu + Ve <- trait_answer$Ve + heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + + } + #Add iter and fold information for each trait/result + heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + #Add sample, iteration, and fold information to GEBVs_fold + GEBVs_fold[,"Iter"] = r + GEBVs_fold[,"Fold"] = fold + 
GEBVs_fold[,"Sample"] <- test + + # Store GEBVs for this fold + GEBVs_cycle[[fold]] <- GEBVs_fold + + } + + # Store GEBVs for this cycle + GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + + } + + # Combine all GEBVs into a single DataFrame + GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + + results <- as.data.frame(results) + heritability_scores <- as.data.frame(heritability_scores) + + # Combine results and heritability_scores using cbind + combined_results <- cbind(results, heritability_scores) + + return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) +} + +# genomic_prediction_gblup <- function(geno, pred_matrix, ped_file = NULL){ +# +# } + diff --git a/tests/testthat/test-GSAcc.R b/tests/testthat/test-GSAcc.R new file mode 100644 index 0000000..84342db --- /dev/null +++ b/tests/testthat/test-GSAcc.R @@ -0,0 +1,546 @@ +context("GSAcc") + +test_that("test Predictive Ability",{ + + # packages + library(vcfR) + library(BIGapp) + library(rrBLUP) + library(dplyr) + library(tidyr) + library(ggplot2) + + # Inputs + input <- list() + + input$trait_file$datapath <- system.file("iris_passport_file.csv", package = "BIGapp") + input$pred_color_select <- "red" + input$pred_ploidy <- 2 + input$pred_file$datapath <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + + input$pred_trait_info <- "Petal.Length" + input$pred_cv <- 5 + + input$pred_fixed_info <- NULL + input$pred_fixed_cat <- NULL + input$pred_cores <- 3 + + input$pred_model <- "rrBLUP" + input$pred_matrix <- "Gmatrix" + input$ped_file <- NULL + + #Close popup window when user "saves options" + advanced_options <- list() + advanced_options$pred_model <- input$pred_model + advanced_options$pred_matrix <- input$pred_matrix + advanced_options$ped_file <- input$ped_file + + ####Genomic Prediction Accuracy + #This tab involved 3 observeEvents + #1) to get the traits listed in the phenotype file + #2) to input and validate the input files + #3) to perform the genomic 
prediction + + #1) Get traits + info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) + trait_var <- colnames(info_df) + trait_var <- trait_var[2:length(trait_var)] + + #2) Error check for prediction and save input files + continue_prediction <- NULL + pred_inputs <- list( + pheno_input = NULL, + geno_input = NULL, + pred_snps = NULL, + pred_genos = NULL, + pred_geno_pheno = NULL + ) + + pred_outputs <- list( + corr_output = NULL, + box_plot = NULL, + violin_plot = NULL, + comb_output = NULL, + avg_GEBVs = NULL, + all_GEBVs = NULL, + colors = NULL + ) + + # Update colors based on input + pred_outputs$colors <- switch(input$pred_color_select, + "red" = "#F8766D", + "blue" = "#00BFC4", + "green" = "#00BA38", + input$pred_color_select) + + + #Variables + ploidy <- as.numeric(input$pred_ploidy) + geno_path <- input$pred_file$datapath + pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) + row.names(pheno) <- pheno[,1] + traits <- input$pred_trait_info + CVs <- as.numeric(input$pred_cv) + + #Getting genotype matrix + + #Geno.file conversion if needed + geno_snps <- read_geno_file(geno_path, requires = "GT") + geno <- geno_snps[[1]] + pred_inputs$pred_snps <- geno_snps[[2]] + + #Save number of samples in file + pred_inputs$pred_genos <- ncol(geno) + + #Check that the ploidy entered is correct + if (ploidy != max(geno, na.rm = TRUE)) stop(paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered")) + + # Convert genotype matrix according to ploidy + geno_adj_init <- 2 * (geno / as.numeric(ploidy)) - 1 + + #Make sure the trait file and genotype file are in the same order + # Column names for geno (assuming these are the individual IDs) + colnames_geno <- colnames(geno) + # Assuming the first column in Pheno contains the matching IDs + ids_pheno <- pheno[, 1] + # Find common identifiers + common_ids <- intersect(colnames_geno, ids_pheno) + #Get number of 
id + pred_inputs$pred_geno_pheno <- length(common_ids) + + #Throw an error if there are less matching samples in the phenotype file than the genotype file + if (length(common_ids) == 0) { + stop("All samples were missing from the phenotype file") + } else { + if (length(common_ids) < length(colnames_geno)) stop(paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information")) + } + + # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs + geno_adj <- geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs + pheno <- pheno[match(common_ids, ids_pheno), ] + + ##Save to reactive values + #Gmatrix needs the original allele count values, so the user matrix selection determines the genotype matrix used + pred_inputs$pheno_input <- pheno + if (advanced_options$pred_matrix == "Gmatrix" || is.null(advanced_options$pred_matrix)) { + pred_inputs$geno_input <- geno_adj + } else if (advanced_options$pred_matrix == "Hmatrix") { + pred_inputs$geno_input <- geno[, common_ids] + } else { + pred_inputs$geno_input <- geno_adj + } + + #3) Analysis only proceeds once continue_prediction is converted to TRUE + #Variables + ploidy <- as.numeric(input$pred_ploidy) + geno_adj <- pred_inputs$geno_input + pheno <- pred_inputs$pheno_input + traits <- input$pred_trait_info + CVs <- as.numeric(input$pred_cv) + fixed_traits <- input$pred_fixed_info + fixed_cat <- input$pred_fixed_cat + fixed_cov <- if (is.null(input$pred_fixed_info) || length(input$pred_fixed_info) == length(input$pred_fixed_cat)) { + NULL + } else { + setdiff(input$pred_fixed_info, input$pred_fixed_cat) + } + cores <- input$pred_cores + + #Assign colors + if (input$pred_color_select == "red"){ + pred_outputs$colors <- "#F8766D" + } else if (input$pred_color_select == "blue") { + pred_outputs$colors <- "#00BFC4" + } else if (input$pred_color_select == "green") { + pred_outputs$colors <- "#00BA38" + } else{ + 
pred_outputs$colors <- input$pred_color_select + } + + #Control whether rrBLUP or GBLUP run depending on user input + + ####### rrBLUP + ##Need to add ability for the use of parallelism for the for cross-validation + ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays + + pred_inputs$geno_input <- geno_adj + + # Example call to the function + #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... + results <- genomic_prediction(geno = geno_adj, + pheno = pheno, + traits = traits, + fixed_effects = fixed_traits, + iters = input$pred_cv, + cores = cores) + + #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) + #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") + + #Save to reactive value + pred_outputs_rrBLUP <- pred_outputs + pred_outputs_rrBLUP$corr_output <- results$PredictionAccuracy + pred_outputs_rrBLUP$all_GEBVs <- results$GEBVs + + # Convert trait columns to numeric + results$GEBVs <- results$GEBVs %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) + + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- results$GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) + + pred_outputs_rrBLUP$avg_GEBVs <- average_gebvs_df + + columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) + average_accuracy_df <- results$PredictionAccuracy %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) + + pred_outputs_rrBLUP$comb_output <- average_accuracy_df + + df <- pred_outputs_rrBLUP$corr_output + df <- df %>% dplyr::select(-Fold, -Iter) + + #Probably want to add the ability for the user to select which trait(s) to display here + + #Convert to long format for ggplot + df_long <- pivot_longer( + df, + cols = colnames(df), # Exclude the Cycle column from 
transformation + names_to = "Trait", # New column for trait names + values_to = "Correlation" # New column for correlation values + ) + + #This can be adapted if we start comparing more than one GP model + #Also consider a violin plot to show each cor value + #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + + pred_outputs_rrBLUP$box_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + + #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + + geom_boxplot() + + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", + y = "Pearson Correlation") + + #theme_minimal() + # Using a minimal theme + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + pred_outputs_rrBLUP$violin_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + + geom_violin(trim = TRUE) + # Add violin plot + geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", # x-label is blank because it's not relevant per facet + y = "Pearson Correlation") + + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + 
plots <- list(pred_outputs_rrBLUP$box_plot, pred_outputs_rrBLUP$violin_plot) + + #Output the genomic prediction correlation box plots + plots[[1]] + scale_fill_manual(values = pred_outputs_rrBLUP$colors) + + #Output the genomic prediction correlation box plots + plots[[2]] + scale_fill_manual(values = pred_outputs_rrBLUP$colors) + + #Output the prediction tables + pred_outputs_rrBLUP$corr_output + pred_outputs_rrBLUP$comb_output + pred_outputs_rrBLUP$all_GEBVs + pred_outputs_rrBLUP$avg_GEBVs + + ######### + + ######### GBLUP + #Note: should wrap the GBLUP into a function too + # Define variables + cycles <- input$pred_cv + folds <- 5 + total_population <- ncol(pred_inputs$geno_input) + #train_size <- floor(percentage / 100 * total_population) + cores <- as.numeric(cores) + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + + if (advanced_options$pred_matrix == "Gmatrix") { + #Convert normalized genotypes to relationship matrix + #By default, it removes SNPs with more than 50% missing data and imputes using the mean + Geno.mat <- A.mat(t(pred_inputs$geno_input)) + + }else if (advanced_options$pred_matrix == "Amatrix") { + + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 + ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "Sire", "Dam") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + #Ensure Sire and Dam are also listed as individuals + missing_parents <- unique(c(ped$Sire, ped$Dam)) + # Filter out parents already listed as individuals and non-zero values + missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] + # Create new rows for missing parents and setting their parents to 0 (unknown) + new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) + # Combine the original dataframe with the new rows and remove duplicates + ped_extended <- unique(rbind(ped, new_rows)) + + #Converting to Amatrix + #Using the default additive relationship options (Amatrix only works for even numbered ploidy) + Geno.mat <- Amatrix(data = ped_extended, ploidy = ploidy) + + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) + pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) + valid_ids <- intersect(pheno_ids, rownames(Geno.mat)) + pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] + Geno.mat <- Geno.mat[valid_ids, valid_ids] + + #Update variable + total_population <- ncol(Geno.mat) + print("check15") + }else if (advanced_options$pred_matrix == "Hmatrix") { + print("check16") + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 + ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "Sire", "Dam") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + #Ensure Sire and Dam are also listed as individuals + missing_parents <- unique(c(ped$Sire, ped$Dam)) + # Filter out parents already listed as individuals and non-zero values + missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] + # Create new rows for missing parents and setting their parents to 0 (unknown) + new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) + # Combine the original dataframe with the new rows and remove duplicates + ped_extended <- unique(rbind(ped, new_rows)) + + #Converting to Amatrix + #Using the default additive relationship options (Amatrix only works for even numbered ploidy) + Ped.mat <- Amatrix(data = ped_extended, ploidy = ploidy) + + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) + pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) + valid_ids <- intersect(pheno_ids, rownames(Ped.mat)) + pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] + Ped.mat <- Ped.mat[valid_ids, valid_ids] + + #Update variable + total_population <- ncol(Ped.mat) + + #Using Gmatrix to get the Gmatrix instead of A.mat for consistency + #Should I be using the raw dosage values or is it okay to use the scaled genotype data that is used for A.mat()? 
+ G.mat <- Gmatrix(t(pred_inputs$geno_input[ ,valid_ids]), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") + G.mat <- round(G.mat,3) #to be easy to invert + + #Computing H matrix (Martini) - Using the name Geno.mat for consistency + Geno.mat <- Hmatrix(A=Ped.mat, G=G.mat, method="Martini", + ploidy= ploidy, + maf=0.05) + #Clean memory + rm(G.mat) + rm(Ped.mat) + rm(ped_filtered) + } + # Establish accuracy results matrix + results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Establish heritability_scores_df () Maybe get h2 values + # Establish results matrix + heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + + + # For loop + for (r in 1:cycles) { + set.seed(r) + fold_ids <- sample(rep(1:Folds, length.out = total_population)) + fold_df <- data.frame(Sample = row.names(Geno.mat), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_results <- matrix(nrow = Folds, ncol = length(traits)) + colnames(fold_results) <- traits + + #Initialize GEBV object for each cycle + GEBVs_cycle <-list() + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + + for (fold in 1:Folds) { + + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*Folds)) + + #Subset training and testing samples + train <- fold_df %>% + dplyr::filter(FoldID != fold) %>% + pull(Sample) + test <- setdiff(row.names(Geno.mat),train) + + Fixed_train = NULL + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) + colnames(GEBVs_fold) <- 
c(traits,"Sample","Iter","Fold") + rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + #Mask phenotypes in testing group + Pheno_test <- pred_inputs$pheno_input + Pheno_test[test, traits[trait_idx]] <- NA + #Kin.blup + traitpred <- kin.blup(data = Pheno_test, geno = names(pred_inputs$pheno_input)[1], pheno = traits[trait_idx], fixed = fixed_cat, covariate = fixed_cov, K=Geno.mat) + #Cor between test values and predicted breeding values + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_inputs$pheno_input[test, traits[trait_idx]], traitpred$g[test], use = "complete.obs") + results[(((r-1)*5)+fold), (length(traits)+1)] <- r + results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + # Extract GEBVs + GEBVs_fold[, trait_idx] <- traitpred$g[test] #Confirm it is accuract to calculate the GEBVs for testing group from the trained model + + + # Calculate heritability (these are wrong) + Vu <- traitpred$Vg + Ve <- traitpred$Ve + heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + + } + #Add iter and fold information for each trait/result + heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + #Add sample, iteration, and fold information to GEBVs_fold + GEBVs_fold[,"Iter"] = r + GEBVs_fold[,"Fold"] = fold + GEBVs_fold[,"Sample"] <- test + + # Store GEBVs for this fold + GEBVs_cycle[[fold]] <- GEBVs_fold + + } + + # Store GEBVs for this cycle + GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + + } + + # Combine all GEBVs into a single DataFrame + GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + + results <- as.data.frame(results) + heritability_scores <- as.data.frame(heritability_scores) + + # Combine results and heritability_scores using cbind + combined_results <- cbind(results, heritability_scores) + + #Save to reactive value + 
pred_outputs_gBLUP <- pred_outputs + pred_outputs_gBLUP$corr_output <- results + pred_outputs_gBLUP$all_GEBVs <- results$GEBVs_df + + # Convert trait columns to numeric + GEBVs <- GEBVs_df %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) + + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) + + pred_outputs_gBLUP$avg_GEBVs <- average_gebvs_df + + columns <- setdiff(colnames(results), c("Iter","Fold")) + average_accuracy_df <- results %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) + + + pred_outputs_gBLUP$comb_output <- average_accuracy_df + + df <- pred_outputs_gBLUP$corr_output + df <- df %>% dplyr::select(-Fold, -Iter) + + #Probably want to add the ability for the user to select which trait(s) to display here + + #Convert to long format for ggplot + df_long <- pivot_longer( + df, + cols = colnames(df), # Exclude the Cycle column from transformation + names_to = "Trait", # New column for trait names + values_to = "Correlation" # New column for correlation values + ) + + #This can be adapted if we start comparing more than one GP model + #Also consider a violin plot to show each cor value + #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + + plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + + #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + + geom_boxplot() + + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", + y = "Pearson Correlation") + + #theme_minimal() + # Using a minimal theme + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + 
axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + plot_violin <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + + geom_violin(trim = TRUE) + # Add violin plot + geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", # x-label is blank because it's not relevant per facet + y = "Pearson Correlation") + + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + + #Output the genomic prediction correlation box plots + plots()[[1]] + scale_fill_manual(values = pred_outputs_gBLUP$colors) + + #Output the genomic prediction correlation box plots + plots()[[2]] + scale_fill_manual(values = pred_outputs_gBLUP$colors) + + #Output the prediction tables + pred_outputs_gBLUP$comb_output + + all_GEBVs() + + pred_outputs_gBLUP$comb_output + + comb_output() + + pred_outputs_gBLUP$avg_GEBVs + +}) + + + + From 593633c93a30439b0797eb53073e87d0bf1d6ce1 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 6 Sep 2024 17:14:04 -0400 Subject: [PATCH 17/40] Updated PCA Window and summary --- R/mod_PCA.R | 45 +++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 43 insertions(+), 2 deletions(-) diff --git a/R/mod_PCA.R b/R/mod_PCA.R index 96b03b8..96d0883 100644 --- a/R/mod_PCA.R +++ b/R/mod_PCA.R @@ -79,7 +79,8 @@ mod_PCA_ui <- function(id){ sliderInput(inputId = ns('pca_image_res'), 
label = 'Resolution', value = 300, min = 50, max = 1000, step=50), sliderInput(inputId = ns('pca_image_width'), label = 'Width', value = 10, min = 1, max = 20, step=0.5), sliderInput(inputId = ns('pca_image_height'), label = 'Height', value = 6, min = 1, max = 20, step = 0.5), - downloadButton(ns("download_pca"), "Save"), + downloadButton(ns("download_pca"), "Save Image"), + downloadButton(ns("download_pca_summary"), "Save Summary"), circle = FALSE, status = "danger", icon = icon("floppy-disk"), width = "300px", @@ -93,7 +94,7 @@ mod_PCA_ui <- function(id){ style = "overflow-y: auto; height: 480px" ), box( - title = "PCA Plots", status = "info", solidHeader = FALSE, width = 12, height = 550, + title = "PCA Plots", status = "info", solidHeader = FALSE, width = 12, height = 550, maximizable = T, bs4Dash::tabsetPanel( tabPanel("3D-Plot",withSpinner(plotlyOutput(ns("pca_plot"), height = '460px'))), tabPanel("2D-Plot", withSpinner(plotOutput(ns("pca_plot_ggplot"), height = '460px'))), @@ -478,6 +479,46 @@ mod_PCA_server <- function(id){ dev.off() } ) + + output$download_pca_summary <- downloadHandler( + filename = function() { + paste("pca-summary-", Sys.Date(), ".txt", sep = "") + }, + + content = function(file) { + pca_param <- c( + "BIGapp PCA Summary", + " ", + paste0("Date: ", Sys.Date()), + " ", + version$version.string, + " ", + "### Input Files ###", + "", + paste("Input Genotype File:", input$dosage_file$name, sep = " "), + paste("Input Passport File:", input$passport_file$name, sep = " "), + "", + "### User Selected Parameters ###", + "", + paste("Selected Ploidy:", as.character(input$pca_ploidy), sep = " "), + "", + "### R Packages Used ###", + "", + paste("BIGapp:",packageVersion("BIGapp"),sep=" "), + paste("AGHmatrix:",packageVersion("AGHmatrix"), sep = " "), + paste("ggplot2:",packageVersion("ggplot2"), sep = " "), + paste("plotly:",packageVersion("plotly"), sep = " "), + paste("factoextra:",packageVersion("factoextra"), sep = " "), + 
paste("RColorBrewer:",packageVersion("RColorBrewer"), sep= " ") + ) + + #sink(file) # Open sink with the provided file path + #cat(pca_param, sep = "\n") # Print the pca_param vector with newlines between entries + #sink() # Close the sink + writeLines(pca_param, con = file) + } + ) + output$download_vcf <- downloadHandler( filename = function() { From a59511b0a5fba4db82f656b5af3a03de9e57354f Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Wed, 11 Sep 2024 16:02:06 -0400 Subject: [PATCH 18/40] Fixed Bug for genotype conversion --- R/mod_diversity.R | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/R/mod_diversity.R b/R/mod_diversity.R index 1f5493f..2864869 100644 --- a/R/mod_diversity.R +++ b/R/mod_diversity.R @@ -158,17 +158,12 @@ mod_diversity_server <- function(id){ #Status updateProgressBar(session = session, id = "pb_diversity", value = 40, title = "Converting to Numeric") - #Get the genotype values if the updog dosage calls are present - if ("UD" %in% info_ids) { - geno_mat <- extract.gt(vcf, element = "UD") - class(geno_mat) <- "numeric" - rm(vcf) #Remove vcf - }else{ - #Extract GT and convert to numeric calls - geno_mat <- extract.gt(vcf, element = "GT") - geno_mat <- apply(geno_mat, 2, convert_to_dosage) - rm(vcf) #Remove VCF - } + #Get the genotype values and convert to numeric format + #Extract GT and convert to numeric calls + geno_mat <- extract.gt(vcf, element = "GT") + geno_mat <- apply(geno_mat, 2, convert_to_dosage) + rm(vcf) #Remove VCF + print(class(geno_mat)) #Convert genotypes to alternate counts if they are the reference allele counts @@ -199,7 +194,7 @@ mod_diversity_server <- function(id){ geno_mat <- data.frame(convert_genotype_counts(df = geno_mat, ploidy = ploidy, is_reference), check.names = FALSE) - # Calculating heterozygosity for a tetraploid organism + # Calculating heterozygosity diversity_items$het_df <- 
# (patch text that shared a source line with the start of R/GS_functions.R, kept verbatim on the next comment line)
# calculate_heterozygosity(geno_mat, ploidy = ploidy) print("Heterozygosity success") From 915c02b0899902a03b8c73f809a5f8fc2518df82 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 11 Sep 2024 20:37:58 -0400 Subject: [PATCH 19/40] tests for GCAcc --- R/GS_functions.R | 425 ++++++++++++ R/mod_GSAcc.R | 1256 +++++++++++++++-------------------- R/utils.R | 184 ----- tests/testthat/test-GSAcc.R | 1103 +++++++++++++++++++----------- 4 files changed, 1674 insertions(+), 1294 deletions(-) create mode 100644 R/GS_functions.R diff --git a/R/GS_functions.R b/R/GS_functions.R new file mode 100644 index 0000000..846420e --- /dev/null +++ b/R/GS_functions.R @@ -0,0 +1,425 @@

#' Cross-validated genomic prediction with rrBLUP
#'
#' Runs `iters` repetitions of `folds`-fold cross-validation. In every fold the
#' marker effects are estimated with `mixed.solve()` on the training samples
#' and used to predict the held-out samples.
#'
#' @param geno numeric marker matrix. It is transposed before being handed to
#'   `A.mat()`, and `ncol(geno)` is used as the number of samples, so samples
#'   are expected in the columns (column names = sample IDs).
#' @param pheno data.frame of phenotypes; it is indexed by sample ID, so its
#'   row names must be the sample IDs.
#' @param traits character vector with the trait columns of `pheno` to predict
#' @param fixed_effects character vector with the columns of `pheno` used as
#'   fixed effects, or `NULL` for none
#' @param fixed_cat character vector naming which of `fixed_effects` are
#'   categorical (converted to factor before building the design matrix)
#' @param folds number of cross-validation folds
#' @param iters number of cross-validation repetitions
#' @param cores accepted for interface compatibility; not used by this function
#'
#' @return list with `GEBVs` (one row per test sample, iteration and fold),
#'   `PredictionAccuracy` (correlation per trait, iteration and fold) and
#'   `CombinedResults` (accuracy and variance-ratio columns side by side)
#'
rrBLUP_genomic_prediction <- function(geno, pheno, traits, fixed_effects = NULL, fixed_cat = NULL,
                                      folds = 5, iters = 5, cores = 1) {

  # Shiny inputs may arrive as character
  cycles <- as.numeric(iters)
  folds <- as.numeric(folds)
  cores <- as.numeric(cores)
  total_population <- ncol(geno)
  fixed_traits <- fixed_effects
  n_traits <- length(traits)

  # One row per (iteration, fold) combination
  results <- matrix(nrow = cycles * folds, ncol = n_traits + 2)
  colnames(results) <- c(paste0(traits), "Iter", "Fold")

  heritability_scores <- matrix(nrow = cycles * folds, ncol = n_traits + 2)
  colnames(heritability_scores) <- c(paste0(traits, "_h2"), "Iter", "Fold")

  GEBVs <- vector("list", cycles)

  # Design matrix of the fixed effects (NULL when none were requested)
  Fixed <- NULL
  if (length(fixed_traits) > 0) {
    Fixed <- pheno[, fixed_traits, drop = FALSE]
    cat_cols <- intersect(names(Fixed), fixed_cat)
    Fixed[cat_cols] <- lapply(Fixed[cat_cols], as.factor)
    row.names(Fixed) <- row.names(pheno)

    fixed_formula <- as.formula(paste("~", paste(fixed_traits, collapse = " + ")))
    Fixed <- model.matrix(fixed_formula, data = Fixed)
  }

  # Missing genotypes are imputed with the marker mean (EM would be more
  # accurate but slower). The imputed matrix has samples in rows.
  impute <- A.mat(t(geno), max.missing = 0.5, impute.method = "mean", return.imputed = TRUE)
  geno <- impute$imputed
  sample_ids <- row.names(geno)

  for (r in seq_len(cycles)) {
    set.seed(r)
    # Randomly assign each sample to a fold
    fold_ids <- sample(rep(1:folds, length.out = total_population))

    GEBVs_cycle <- vector("list", folds)

    for (fold in seq_len(folds)) {
      # Row of the result matrices for this iteration/fold. The stride must be
      # the number of folds (it used to be hard-coded to 5).
      row_idx <- (r - 1) * folds + fold

      train <- sample_ids[fold_ids != fold]
      test <- setdiff(sample_ids, train)

      if (is.null(Fixed)) {
        Fixed_train <- NULL
        Fixed_test <- NULL
      } else {
        Fixed_train <- Fixed[train, , drop = FALSE]
        Fixed_test <- Fixed[test, , drop = FALSE]
      }

      pheno_train <- pheno[train, , drop = FALSE]
      pheno_test <- pheno[test, , drop = FALSE]
      m_train <- geno[train, , drop = FALSE]
      m_valid <- geno[test, , drop = FALSE]

      # GEBVs of the test samples of this fold
      GEBVs_fold <- matrix(nrow = length(test), ncol = n_traits + 3)
      colnames(GEBVs_fold) <- c(traits, "Sample", "Iter", "Fold")
      rownames(GEBVs_fold) <- paste("Iter", r, "Fold", fold, "Ind", test, sep = "_")

      # Every trait is evaluated on the same train/test split
      for (trait_idx in seq_along(traits)) {
        trait_name <- traits[trait_idx]
        trait_answer <- mixed.solve(y = pheno_train[, trait_name], Z = m_train, K = NULL,
                                    X = Fixed_train, SE = FALSE, return.Hinv = FALSE)

        marker_part <- as.vector(m_valid %*% as.matrix(trait_answer$u))

        if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) {
          # With fixed effects, beta has one entry per design-matrix column and
          # has to go through the test design matrix (adding it directly would
          # recycle it across samples).
          pred_trait <- marker_part + as.vector(Fixed_test %*% trait_answer$beta)
          gebv <- pred_trait
        } else {
          # Intercept-only model: beta is the overall mean
          pred_trait <- marker_part + c(trait_answer$beta)
          gebv <- marker_part
        }

        results[row_idx, trait_idx] <- cor(pred_trait, pheno_test[, trait_name], use = "complete.obs")
        GEBVs_fold[, trait_idx] <- gebv

        # NOTE(review): ratio of the variance components reported by
        # mixed.solve; confirm this is the intended heritability estimate.
        Vu <- trait_answer$Vu
        Ve <- trait_answer$Ve
        heritability_scores[row_idx, trait_idx] <- Vu / (Vu + Ve)
      }

      results[row_idx, n_traits + 1] <- r
      results[row_idx, n_traits + 2] <- fold
      heritability_scores[row_idx, n_traits + 1] <- r
      heritability_scores[row_idx, n_traits + 2] <- fold

      # Writing the sample names turns the whole matrix into character
      GEBVs_fold[, "Iter"] <- r
      GEBVs_fold[, "Fold"] <- fold
      GEBVs_fold[, "Sample"] <- test

      GEBVs_cycle[[fold]] <- GEBVs_fold
    }

    GEBVs[[r]] <- do.call(rbind, GEBVs_cycle)
  }

  GEBVs_df <- as.data.frame(do.call(rbind, GEBVs))
  results <- as.data.frame(results)
  heritability_scores <- as.data.frame(heritability_scores)
  combined_results <- cbind(results, heritability_scores)

  return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results))
}


#' Compute relationship matrix
#'
#' @param geno_input marker matrix; it is transposed before being passed to
#'   `A.mat()` / `Gmatrix()` and its column names are matched against the
#'   pedigree matrix, so samples are expected in the columns
#' @param ped_file pedigree data passed to `Amatrix()`; only needed for the
#'   "Amatrix" and "Hmatrix" types
#' @param type character defining which matrix to build: "Gmatrix" (default),
#'   "Amatrix" or "Hmatrix"
#' @param ploidy numeric indicating species ploidy; only needed for the
#'   "Amatrix" and "Hmatrix" types
#'
#' @return the relationship matrix
#'
#' @importFrom rrBLUP A.mat
#' @importFrom AGHmatrix Gmatrix Amatrix Hmatrix
#'
#'
get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amatrix", "Hmatrix"), ploidy) {

  # Resolves the default to a single value and rejects unknown types
  type <- match.arg(type)

  if (type == "Gmatrix") {
    # By default A.mat removes SNPs with more than 50% missing data and
    # imputes using the mean (only diploids)
    return(A.mat(t(geno_input)))
  }

  # Pedigree-based additive matrix (Amatrix only works for even ploidy)
  Ped.mat <- Amatrix(data = ped_file, ploidy = ploidy)

  # Keep only the genotyped individuals, in genotype order
  geno_ids <- as.character(colnames(geno_input))
  valid_ids <- intersect(geno_ids, rownames(Ped.mat))
  Ped.mat <- Ped.mat[valid_ids, valid_ids, drop = FALSE]

  if (type == "Amatrix") {
    return(Ped.mat)
  }

  # Hmatrix: blend the pedigree matrix with a VanRaden G matrix
  G.mat <- Gmatrix(t(geno_input[, valid_ids, drop = FALSE]), method = "VanRaden",
                   ploidy = as.numeric(ploidy), missingValue = "NA")
  G.mat <- round(G.mat, 3) # to be easy to invert

  Geno.mat <- Hmatrix(A = Ped.mat, G = G.mat,
                      method = "Martini",
                      ploidy = ploidy,
                      maf = 0.05)
  return(Geno.mat)
}


#' Performs cross-validated GBLUP
#'
#' For every iteration and fold the phenotypes of the test samples are masked
#' and predicted with `kin.blup()` using the supplied relationship matrix.
#'
#' @param pheno_dat data.frame of phenotypes; its first column is passed to
#'   `kin.blup()` as the genotype ID column and its row names must be the
#'   sample IDs (rows are masked by ID)
#' @param Geno.mat relationship matrix with sample IDs as row names
#' @param cycles number of cross-validation repetitions
#' @param folds number of cross-validation folds
#' @param traits character vector with the trait columns to predict
#' @param cores number of cores passed to `kin.blup()`
#' @param fixed_cov numeric fixed effects (covariates)
#' @param fixed_cat categorical fixed effect
#'
#' @return list with `GEBVs`, `PredictionAccuracy` and `CombinedResults`
#'
GBLUP_genomic_prediction <- function(pheno_dat, Geno.mat, cycles, folds, traits, cores, fixed_cov = NULL, fixed_cat = NULL) {

  cycles <- as.numeric(cycles)
  folds <- as.numeric(folds)
  n_traits <- length(traits)
  sample_ids <- row.names(Geno.mat)

  results <- matrix(nrow = cycles * folds, ncol = n_traits + 2)
  colnames(results) <- c(paste0(traits), "Iter", "Fold")

  heritability_scores <- matrix(nrow = cycles * folds, ncol = n_traits + 2)
  colnames(heritability_scores) <- c(paste0(traits, "_h2"), "Iter", "Fold")

  GEBVs <- vector("list", cycles)

  for (r in seq_len(cycles)) {
    set.seed(r)
    # Randomly assign each sample to a fold
    fold_ids <- sample(rep(1:folds, length.out = nrow(Geno.mat)))

    GEBVs_cycle <- vector("list", folds)

    for (fold in seq_len(folds)) {
      # The stride must be the number of folds (it used to be hard-coded to 5)
      row_idx <- (r - 1) * folds + fold

      train <- sample_ids[fold_ids != fold]
      test <- setdiff(sample_ids, train)

      GEBVs_fold <- matrix(nrow = length(test), ncol = n_traits + 3)
      colnames(GEBVs_fold) <- c(traits, "Sample", "Iter", "Fold")
      rownames(GEBVs_fold) <- paste("Iter", r, "Fold", fold, "Ind", test, sep = "_")

      for (trait_idx in seq_along(traits)) {
        trait_name <- traits[trait_idx]

        # Mask phenotypes in testing group
        Pheno_test <- pheno_dat
        Pheno_test[test, trait_name] <- NA

        traitpred <- kin.blup(data = Pheno_test,
                              geno = colnames(pheno_dat)[1],
                              pheno = trait_name,
                              fixed = fixed_cat,
                              covariate = fixed_cov,
                              K = Geno.mat,
                              n.core = cores)

        # Correlation between observed test values and predicted breeding values
        results[row_idx, trait_idx] <- cor(pheno_dat[test, trait_name], traitpred$g[test], use = "complete.obs")
        GEBVs_fold[, trait_idx] <- traitpred$g[test]

        # Calculate heritability (flagged as wrong by the original authors)
        Vu <- traitpred$Vg
        Ve <- traitpred$Ve
        heritability_scores[row_idx, trait_idx] <- Vu / (Vu + Ve)
      }

      results[row_idx, n_traits + 1] <- r
      results[row_idx, n_traits + 2] <- fold
      heritability_scores[row_idx, n_traits + 1] <- r
      heritability_scores[row_idx, n_traits + 2] <- fold

      # Writing the sample names turns the whole matrix into character
      GEBVs_fold[, "Iter"] <- r
      GEBVs_fold[, "Fold"] <- fold
      GEBVs_fold[, "Sample"] <- test

      GEBVs_cycle[[fold]] <- GEBVs_fold
    }

    GEBVs[[r]] <- do.call(rbind, GEBVs_cycle)
  }

  GEBVs_df <- as.data.frame(do.call(rbind, GEBVs))
  results <- as.data.frame(results)
  heritability_scores <- as.data.frame(heritability_scores)
  combined_results <- cbind(results, heritability_scores)

  return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results))
}

# Map the color names offered in the UI to hex codes; any other value is
# returned unchanged.
assign_colors <- function(color) {
  switch(color,
         "red" = "#F8766D",
         "blue" = "#00BFC4",
         "green" = "#00BA38",
         color)
}

# Re-code the dosage matrix for the selected model: rrBLUP and GBLUP with a
# Gmatrix get dosages rescaled to the -1..1 range, every other combination
# keeps the raw dosages (0 1 2 3 ..).
format_geno_matrix <- function(geno, model, pred_matrix = NULL, ploidy) {

  if (is.null(pred_matrix)) pred_matrix <- "none_selected"

  if (model == "rrBLUP" || (model == "GBLUP" && pred_matrix == "Gmatrix")) {
    geno_formated <- 2 * (geno / as.numeric(ploidy)) - 1 # codification -1 0 1
  } else {
    geno_formated <- geno # codification 0 1 2 3 ..
  }

  return(geno_formated)
}

# Dispatch to the selected predictive model ("rrBLUP" or "GBLUP") and return
# its result list. For GBLUP the relationship matrix is computed with
# get_relationship_mat() unless `relationship_matrix` is supplied.
run_predictive_model <- function(geno, pheno, selected_traits, predictive_model, relationship_matrix_type, pedigree,
                                 fixed_effects, categorical_fixed_effects, ploidy, cores, cycles, folds, relationship_matrix = NULL) {

  # Callers may omit these arguments (they have no default); treat an omitted
  # or empty value as "none".
  fixed_cat <- NULL
  if (!missing(categorical_fixed_effects) && length(categorical_fixed_effects) > 0) {
    fixed_cat <- categorical_fixed_effects
  }

  if (predictive_model == "rrBLUP") {
    rr_args <- list(geno = geno,
                    pheno = pheno,
                    traits = selected_traits,
                    fixed_effects = fixed_effects,
                    fixed_cat = fixed_cat,
                    iters = cycles,
                    cores = cores)
    # Forward the requested number of folds; when it is not supplied the
    # default of rrBLUP_genomic_prediction() applies.
    if (!missing(folds) && !is.null(folds)) rr_args$folds <- folds

    results <- do.call(rrBLUP_genomic_prediction, rr_args)
    return(results)

  } else if (predictive_model == "GBLUP") {
    # Fixed effects that are not categorical are treated as covariates
    fixed_cov <- setdiff(fixed_effects, fixed_cat)
    if (length(fixed_cov) == 0) fixed_cov <- NULL

    if (is.null(relationship_matrix)) {
      Geno.mat <- get_relationship_mat(geno_input = geno,
                                       type = relationship_matrix_type,
                                       ped_file = pedigree,
                                       ploidy = ploidy)
    } else {
      Geno.mat <- relationship_matrix
    }

    results <- GBLUP_genomic_prediction(pheno_dat = pheno,
                                        Geno.mat = Geno.mat,
                                        cycles = cycles,
                                        folds = folds,
                                        traits = selected_traits,
                                        cores = cores,
                                        fixed_cov = fixed_cov,
                                        fixed_cat = fixed_cat)
    return(results)
  }

  stop("Unknown predictive_model: ", predictive_model, call. = FALSE)
}

# Remove individuals unrelated to the ones that have phenotype info
# Add line with zeros for older generation
#
# `pedigree` needs an `Ind` column; every other column is treated as a parent
# column in which "0" means unknown. Returns list(<reduced pedigree>,
# <number of passes needed to collect all ancestors>).
remove_unrelated <- function(pedigree, samples_with_trait_info) {

  # All IDs mentioned anywhere in a pedigree subset, without the "0" placeholder
  listed_ids <- function(ped) setdiff(as.character(unique(unlist(ped))), "0")

  parent_cols <- setdiff(names(pedigree), "Ind")

  common_ped <- intersect(pedigree$Ind, samples_with_trait_info)
  ped_test <- pedigree[pedigree$Ind %in% common_ped, ]
  all_gene <- listed_ids(ped_test)
  gen <- 1

  # Keep adding the previous generation until every mentioned ID has a row
  while (length(all_gene) != length(ped_test$Ind)) {
    gen <- gen + 1
    ped_test <- pedigree[pedigree$Ind %in% all_gene, ]

    founders <- all_gene[!all_gene %in% pedigree$Ind]
    if (length(founders) > 0) {
      # Founders get unknown parents; the row is built with the pedigree's own
      # column names so rbind() works whatever the parent columns are called
      add_previous_gen <- data.frame(Ind = founders)
      add_previous_gen[parent_cols] <- 0
      add_previous_gen <- add_previous_gen[names(pedigree)]
      ped_test <- rbind(ped_test, add_previous_gen)
    }

    all_gene <- listed_ids(ped_test)
  }

  return(list(ped_test, gen))
}

# (patch text that shared a source line with the end of R/GS_functions.R, kept verbatim on the next comment line)
# diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index abbdf7c..ff68065 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -135,222 +135,274 @@ mod_GSAcc_ui <- function(id){ #' @noRd mod_GSAcc_server <- function(input, output, session, parent_session){ - ns <- session$ns + ns <- session$ns - #Default model choices - advanced_options <- reactiveValues( - pred_model = "rrBLUP", - pred_matrix = "Gmatrix", - ped_file = NULL - ) - - #List the ped file name if previously uploaded - output$uploaded_file_name <- renderText({ - if (!is.null(advanced_options$ped_file)) { - paste("Previously uploaded file:", advanced_options$ped_file$name) - } else { - "" # Return an empty string if no file has been uploaded - } - }) + #Default model choices +
advanced_options <- reactiveValues( + pred_model = "rrBLUP", + pred_matrix = "Gmatrix", + ped_file = NULL + ) - #UI popup window for input - observeEvent(input$advanced_options, { - showModal(modalDialog( - title = "Advanced Options (beta)", - selectInput( - inputId = ns('pred_model'), - label = 'Model Choice', - choices = c("rrBLUP", "GBLUP"), - selected = advanced_options$pred_model # Initialize with stored value - ), - conditionalPanel( - condition = "input.pred_model == 'GBLUP'", ns = ns, - div( - selectInput( - inputId = ns('pred_matrix'), - label = 'GBLUP Matrix Choice', - choices = c("Gmatrix", "Amatrix", "Hmatrix"), - selected = advanced_options$pred_matrix # Initialize with stored value - ) + #List the ped file name if previously uploaded + output$uploaded_file_name <- renderText({ + if (!is.null(advanced_options$ped_file)) { + paste("Previously uploaded file:", advanced_options$ped_file$name) + } else { + "" # Return an empty string if no file has been uploaded + } + }) + + #UI popup window for input + observeEvent(input$advanced_options, { + showModal(modalDialog( + title = "Advanced Options (beta)", + selectInput( + inputId = ns('pred_model'), + label = 'Model Choice', + choices = c("rrBLUP", "GBLUP"), + selected = advanced_options$pred_model # Initialize with stored value + ), + conditionalPanel( + condition = "input.pred_model == 'GBLUP'", ns = ns, + div( + selectInput( + inputId = ns('pred_matrix'), + label = 'GBLUP Matrix Choice', + choices = c("Gmatrix", "Amatrix", "Hmatrix"), + selected = advanced_options$pred_matrix # Initialize with stored value ) - ), - conditionalPanel( - condition = "input.pred_matrix != 'Gmatrix'", ns = ns, - div( - fileInput(ns("ped_file"), "Choose Pedigree File", accept = ".csv"), - conditionalPanel( - condition = "output.uploaded_file_name !== ''", # Show only if there's content - textOutput(ns("uploaded_file_name")) # Display the uploaded file name - ) + ) + ), + conditionalPanel( + condition = "input.pred_matrix != 
'Gmatrix'", ns = ns, + div( + fileInput(ns("ped_file"), "Choose Pedigree File", accept = ".csv"), + conditionalPanel( + condition = "output.uploaded_file_name !== ''", # Show only if there's content + textOutput(ns("uploaded_file_name")) # Display the uploaded file name ) - ), - footer = tagList( - modalButton("Close"), - actionButton(ns("save_advanced_options"), "Save") ) - )) - }) + ), + footer = tagList( + modalButton("Close"), + actionButton(ns("save_advanced_options"), "Save") + ) + )) + }) - #Close popup window when user "saves options" - observeEvent(input$save_advanced_options, { - advanced_options$pred_model <- input$pred_model - advanced_options$pred_matrix <- input$pred_matrix - advanced_options$ped_file <- input$ped_file - # Save other inputs as needed + #Close popup window when user "saves options" + observeEvent(input$save_advanced_options, { + advanced_options$pred_model <- input$pred_model + advanced_options$pred_matrix <- input$pred_matrix + advanced_options$ped_file <- input$ped_file + # Save other inputs as needed - removeModal() # Close the modal after saving - }) + removeModal() # Close the modal after saving + }) - ####Genomic Prediction Accuracy - #This tab involved 3 observeEvents - #1) to get the traits listed in the phenotype file - #2) to input and validate the input files - #3) to perform the genomic prediction + ####Genomic Prediction Accuracy + #This tab involved 3 observeEvents + #1) to get the traits listed in the phenotype file + #2) to input and validate the input files + #3) to perform the genomic prediction - #1) Get traits - observeEvent(input$trait_file, { - info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) - trait_var <- colnames(info_df) - trait_var <- trait_var[2:length(trait_var)] - updateVirtualSelect("pred_fixed_info", choices = trait_var, session = session) - updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) + #1) Get traits + 
observeEvent(input$trait_file, { + info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) + trait_var <- colnames(info_df) + trait_var <- trait_var[2:length(trait_var)] + updateVirtualSelect("pred_fixed_info", choices = trait_var, session = session) + updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) - }) + }) - #2) Error check for prediction and save input files - continue_prediction <- reactiveVal(NULL) - pred_inputs <- reactiveValues( - pheno_input = NULL, - geno_input = NULL, - pred_snps = NULL, - pred_genos = NULL, - pred_geno_pheno = NULL - ) + #2) Error check for prediction and save input files + continue_prediction <- reactiveVal(FALSE) + pred_inputs <- reactiveValues( + pheno_input = NULL, + geno_input = NULL, + pred_snps = NULL, + pred_genos = NULL, + pred_geno_pheno = NULL, + ped_input = NULL + ) - pred_outputs <- reactiveValues( - corr_output = NULL, - box_plot = NULL, - violin_plot = NULL, - comb_output = NULL, - avg_GEBVs = NULL, - all_GEBVs = NULL, - colors = NULL + colors <- reactiveValues(colors = NULL) + + #Reactive boxes + output$pred_snps <- renderValueBox({ + valueBox( + value = pred_inputs$pred_snps, + subtitle = "SNPs in Genotype File", + icon = icon("dna"), + color = "info" + ) + }) + + output$pred_geno <- renderValueBox({ + valueBox( + value = pred_inputs$pred_geno_pheno, + subtitle = "Samples with Phenotype Information", + icon = icon("location-dot"), + color = "info" ) + }) - #Reactive boxes - output$pred_snps <- renderValueBox({ - valueBox( - value = pred_inputs$pred_snps, - subtitle = "SNPs in Genotype File", - icon = icon("dna"), - color = "info" - ) - }) + observe({ + # Update colors based on input + colors$colors <- assign_colors(input$pred_color_select) + }) + + observeEvent(input$pred_fixed_info, { + updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) + }) - output$pred_geno <- renderValueBox({ - valueBox( - value = 
pred_inputs$pred_geno_pheno, - subtitle = "Samples with Phenotype Information", - icon = icon("location-dot"), - color = "info" + observeEvent(input$prediction_start, { + + toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) + + if (is.null(input$pred_file$datapath) | is.null(input$trait_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload VCF and phenotype files", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE ) - }) + } + req(input$pred_file$datapath, input$pred_ploidy, input$trait_file$datapath) - observe({ - # Update colors based on input - pred_outputs$colors <- switch(input$pred_color_select, - "red" = "#F8766D", - "blue" = "#00BFC4", - "green" = "#00BA38", - input$pred_color_select) - }) + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 5, title = "Checking input files") - observeEvent(input$pred_fixed_info, { - updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) - }) + #Variables + pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) + row.names(pheno) <- pheno[,1] - observeEvent(input$prediction_start, { + #Make sure at least one trait was input + if (length(input$pred_trait_info) == 0) { - toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No traits were selected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) - if 
(is.null(input$pred_file$datapath) | is.null(input$trait_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF and phenotype files", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$pred_file$datapath, input$pred_ploidy, input$trait_file$datapath) + # Stop the observeEvent gracefully + return() + } + + #Getting genotype matrix + #Geno.file conversion if needed + geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") + geno <- geno_snps[[1]] + pred_inputs$pred_snps <- geno_snps[[2]] - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 5, title = "Checking input files") + #Save number of samples in file + pred_inputs$pred_genos <- ncol(geno) - #Variables - ploidy <- as.numeric(input$pred_ploidy) - geno_path <- input$pred_file$datapath - pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) - row.names(pheno) <- pheno[,1] - traits <- input$pred_trait_info - CVs <- as.numeric(input$pred_cv) + #Check that the ploidy entered is correct + if (input$pred_ploidy != max(geno, na.rm = TRUE)) { + # If condition is met, show notification toast + shinyalert( + title = "Ploidy Mismatch", + text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE + ) - #Make sure at least one trait was input - if (length(traits) == 0) { + # Stop the observeEvent gracefully + #return() + } + + #Make sure the trait file and genotype file are 
in the same order + # Column names for geno (assuming these are the individual IDs) + colnames_geno <- colnames(geno) + # Assuming the first column in Pheno contains the matching IDs + ids_pheno <- pheno[, 1] + # Find common identifiers + common_ids <- intersect(colnames_geno, ids_pheno) + #Get number of id + pred_inputs$pred_geno_pheno <- length(common_ids) + + #Throw an error if there are less matching samples in the phenotype file than the genotype file + if (length(common_ids) == 0) { + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "All samples were missing from the phenotype file", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) - # If condition is met, show notification toast + # Stop the observeEvent gracefully + return() + } else { + if (length(common_ids) < length(colnames_geno)) shinyalert( - title = "Oops", - text = "No traits were selected", + title = "Data Mismatch", + text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), size = "xs", - closeOnEsc = TRUE, + closeOnEsc = FALSE, closeOnClickOutside = FALSE, html = TRUE, - type = "info", + type = "warning", showConfirmButton = TRUE, confirmButtonText = "OK", confirmButtonCol = "#004192", showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, imageUrl = "", - animation = TRUE, + animation = TRUE ) - - - # Stop the observeEvent gracefully - return() - - } - - #Getting genotype matrix - - #Geno file path - file_path <- geno_path - - #Geno.file conversion if needed - geno_snps <- read_geno_file(geno_path, requires = "GT") - geno <- geno_snps[[1]] - pred_inputs$pred_snps <- geno_snps[[2]] - - #Save number of samples in file - pred_inputs$pred_genos <- ncol(geno) - - #Check that the 
ploidy entered is correct - if (ploidy != max(geno, na.rm = TRUE)) { - # If condition is met, show notification toast + if (length(common_ids) < length(ids_pheno)) shinyalert( - title = "Ploidy Mismatch", - text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), + title = "Data Mismatch", + text = paste0((length(ids_pheno)-length(common_ids))," samples were removed for not having genotypic information"), size = "xs", closeOnEsc = FALSE, closeOnClickOutside = FALSE, @@ -365,28 +417,23 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE ) - - - # Stop the observeEvent gracefully - #return() - } - - # Convert genotype matrix according to ploidy - geno_adj_init <- 2 * (geno / as.numeric(ploidy)) - 1 - - #Make sure the trait file and genotype file are in the same order - # Column names for geno (assuming these are the individual IDs) - colnames_geno <- colnames(geno) - # Assuming the first column in Pheno contains the matching IDs - ids_pheno <- pheno[, 1] - # Find common identifiers - common_ids <- intersect(colnames_geno, ids_pheno) - #Get number of id - pred_inputs$pred_geno_pheno <- length(common_ids) - - #Throw an error if there are less matching samples in the phenotype file than the genotype file - if (length(common_ids) == 0) { - + } + + # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs + geno_adj <- geno[, common_ids] # Assuming that the columns can be directly indexed by IDs + pheno <- pheno[match(common_ids, ids_pheno), ] # If there is pheno but not geno, the sample is also discarded + + # Check pedigree + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 + if(!is.null(advanced_options$ped_file$datapath)){ + ped <- read.csv(advanced_options$ped_file$datapath, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "P1", "P2") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + + common_ped <- intersect(ped$Ind, pheno[,1]) + #Throw an error if there are less matching samples in the phenotype file than the pedigree file + if (length(common_ped) == 0) { # If condition is met, show notification toast shinyalert( title = "Oops", @@ -404,563 +451,358 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ animation = TRUE, ) - - # Stop the observeEvent gracefully - return() - } else if (length(common_ids) < length(colnames_geno)) { - # If condition is met, show notification toast - shinyalert( - title = "Data Mismatch", - text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - # Stop the observeEvent gracefully - #return() - } - - - - - #Final check before performing analyses - shinyalert( - title = "Ready?", - text = "Inputs have been checked", - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "Proceed", - confirmButtonCol = "#004192", - showCancelButton = TRUE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE, - callbackR = function(value) { - if (isTRUE(value)) { - # Proceed with adjusted data - continue_prediction(TRUE) - } else { - # Stop or change the process - continue_prediction(FALSE) - } - } - ) - - # Subset and reorder geno and pheno to ensure they 
only contain and are ordered by common IDs - geno_adj <- geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs - pheno <- pheno[match(common_ids, ids_pheno), ] - - ##Save to reactive values - #Gmatrix needs the original allele count values, so the user matrix selection determines the genotype matrix used - pred_inputs$pheno_input <- pheno - if (advanced_options$pred_matrix == "Gmatrix" || is.null(advanced_options$pred_matrix)) { - pred_inputs$geno_input <- geno_adj - } else if (advanced_options$pred_matrix == "Hmatrix") { - pred_inputs$geno_input <- geno[, common_ids] - } else { - pred_inputs$geno_input <- geno_adj - } - }) - - #3) Analysis only proceeds once continue_prediction is converted to TRUE - observe({ - - req(continue_prediction(),pred_inputs$pheno_input, pred_inputs$geno_input) - - # Stop analysis if cancel was selected - if (isFALSE(continue_prediction())) { return() - } - - #Variables - ploidy <- as.numeric(input$pred_ploidy) - geno_adj <- pred_inputs$geno_input - pheno <- pred_inputs$pheno_input - traits <- input$pred_trait_info - CVs <- as.numeric(input$pred_cv) - fixed_traits <- input$pred_fixed_info - fixed_cat <- input$pred_fixed_cat - fixed_cov <- if (is.null(input$pred_fixed_info) || length(input$pred_fixed_info) == length(input$pred_fixed_cat)) { - NULL } else { - setdiff(input$pred_fixed_info, input$pred_fixed_cat) - } - cores <- input$pred_cores - - #Assign colors - if (input$pred_color_select == "red"){ - pred_outputs$colors <- "#F8766D" - } else if (input$pred_color_select == "blue") { - pred_outputs$colors <- "#00BFC4" - } else if (input$pred_color_select == "green") { - pred_outputs$colors <- "#00BA38" - } else{ - pred_outputs$colors <- input$pred_color_select - } - - #Control whether rrBLUP or GBLUP run depending on user input - #Note, should add the GP functions to the utils.R file and then call them here... 
- if (advanced_options$pred_model == "rrBLUP"){ - ##Need to add ability for the use of parallelism for the for cross-validation - ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays - - # Example call to the function - #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... - results <- genomic_prediction(geno_adj, pheno, - traits = traits, - fixed_effects = fixed_traits, - iters = input$pred_cv, - cores = cores) - - #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) - #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") - - #Save to reactive value - pred_outputs$corr_output <- results$PredictionAccuracy - pred_outputs$all_GEBVs <- results$GEBVs - - # Convert trait columns to numeric - results$GEBVs <- results$GEBVs %>% - mutate(across(all_of(traits), ~ as.numeric(.x))) - - # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold - average_gebvs_df <- results$GEBVs %>% - group_by(Sample) %>% - summarize(across(all_of(traits), mean, na.rm = TRUE)) - - pred_outputs$avg_GEBVs <- average_gebvs_df - - columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) - average_accuracy_df <- results$PredictionAccuracy %>% - group_by(Iter) %>% - summarize(across(all_of(columns), mean, na.rm = TRUE)) - - pred_outputs$comb_output <- average_accuracy_df - - }else{ - #Note: should wrap the GBLUP into a function too - # Define variables - cycles <- input$pred_cv - Folds <- 5 - total_population <- ncol(pred_inputs$geno_input) - #train_size <- floor(percentage / 100 * total_population) - cores <- as.numeric(cores) - #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) - pb_value = 10 - - if (advanced_options$pred_matrix == "Gmatrix") { - #Convert normalized genotypes to relationship matrix - #By default, it removes SNPs 
with more than 50% missing data and imputes using the mean - Geno.mat <- A.mat(t(pred_inputs$geno_input)) - - }else if (advanced_options$pred_matrix == "Amatrix") { - - #Import pedigree file, where pedigree data name (3-column way format). Unknown value should be equal 0 - ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") - colnames(ped) <- c("Ind", "Sire", "Dam") - #Convert NAs to 0 - ped[is.na(ped)] <- 0 - #Ensure Sire and Dam are also listed as individuals - missing_parents <- unique(c(ped$Sire, ped$Dam)) - # Filter out parents already listed as individuals and non-zero values - missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] - # Create new rows for missing parents and setting their parents to 0 (unknown) - new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) - # Combine the original dataframe with the new rows and remove duplicates - ped_extended <- unique(rbind(ped, new_rows)) - - #Converting to Amatrix - #Using the default additive relationship options (Amatrix only works for even numbered ploidy) - Geno.mat <- Amatrix(data = ped_extended, ploidy = ploidy) - - #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) - pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) - valid_ids <- intersect(pheno_ids, rownames(Geno.mat)) - pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] - Geno.mat <- Geno.mat[valid_ids, valid_ids] - - #Update variable - total_population <- ncol(Geno.mat) - - }else if (advanced_options$pred_matrix == "Hmatrix") { - - #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 - ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") - colnames(ped) <- c("Ind", "Sire", "Dam") - #Convert NAs to 0 - ped[is.na(ped)] <- 0 - #Ensure Sire and Dam are also listed as individuals - missing_parents <- unique(c(ped$Sire, ped$Dam)) - # Filter out parents already listed as individuals and non-zero values - missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] - # Create new rows for missing parents and setting their parents to 0 (unknown) - new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) - # Combine the original dataframe with the new rows and remove duplicates - ped_extended <- unique(rbind(ped, new_rows)) - - #Converting to Amatrix - #Using the default additive relationship options (Amatrix only works for even numbered ploidy) - Ped.mat <- Amatrix(data = ped_extended, ploidy = ploidy) - - #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) - pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) - valid_ids <- intersect(pheno_ids, rownames(Ped.mat)) - pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] - Ped.mat <- Ped.mat[valid_ids, valid_ids] - - #Update variable - total_population <- ncol(Ped.mat) - - #Using Gmatrix to get the Gmatrix instead of A.mat for consistency - #Should I be using the raw dosage values or is it okay to use the scaled genotype data that is used for A.mat()? 
- G.mat <- Gmatrix(t(pred_inputs$geno_input[ ,valid_ids]), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") - G.mat <- round(G.mat,3) #to be easy to invert - - #Computing H matrix (Martini) - Using the name Geno.mat for consistency - Geno.mat <- Hmatrix(A=Ped.mat, G=G.mat, method="Martini", - ploidy= ploidy, - maf=0.05) - #Clean memory - rm(G.mat) - rm(Ped.mat) - rm(ped_filtered) + rm_unr <- remove_unrelated(ped, samples_with_trait_info = pheno[,1]) + extended_ped <- rm_unr[[1]] + gen <- rm_unr[[2]] + cat(paste0("You have pedigree information until the ", gen,"th generation\n")) + + if (length(common_ped) < length(ids_pheno)){ + shinyalert( + title = "Data Mismatch", + text = paste0((length(ids_pheno)-length(common_ped))," samples were removed from the phenotype data for not having pedigree information"), + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE + ) + pheno <- pheno[-which(!pheno$Sample_ID %in% extended_ped$Ind),] + geno_adj <- geno_adj[,-which(!pheno$Sample_ID %in% extended_ped$Ind)] } - # Establish accuracy results matrix - results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) - colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits - - # Initialize a list to store GEBVs for all traits and cycles - GEBVs <- list() - - #Establish heritability_scores_df () Maybe get h2 values - # Establish results matrix - heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) - colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits - - - # For loop - for (r in 1:cycles) { - set.seed(r) - fold_ids <- sample(rep(1:Folds, length.out = total_population)) - fold_df <- 
data.frame(Sample = row.names(Geno.mat), FoldID = fold_ids) #Randomly assign each sample to a fold - fold_results <- matrix(nrow = Folds, ncol = length(traits)) - colnames(fold_results) <- traits + if (length(ped$Ind) > length(extended_ped$Ind)) + shinyalert( + title = "Data Mismatch", + text = paste0((length(ped$Ind)-length(extended_ped$Ind))," samples in the pedigree file were unrelated to the samples with phenotype information. They were removed from the analysis."), + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE + ) - #Initialize GEBV object for each cycle - GEBVs_cycle <-list() + ped_temp <- tempfile() + ped_temp_file <- extended_ped + colnames(ped_temp_file) <- c("id", "sire", "dam") + write.table(ped_temp_file, file = ped_temp) + ped_check <- BIGr::check_ped(ped_temp) + if(dim(ped_check$repeated_ids)[1] != 0){ + shinyalert( + title = "Oops", + text = "Check for repeated IDs in the pedigree file", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) - #Status - updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + # Stop the observeEvent gracefully + return() + } + if(dim(ped_check$messy_parents)[1] != 0){ + shinyalert( + title = "Oops", + text = paste("We found inconsistencies in the pedigree file for the individuals:", paste0(ped_check$messy_parents$id, collapse = ", ")), + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + 
confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + } + } + } else extended_ped <- NULL - for (fold in 1:Folds) { + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 20, title = "Inputs checked!") - #Status bar length - pb_value = pb_value + (70 / as.numeric(cycles*Folds)) + ## Make ouput as checked inputs pred_inputs + pred_inputs$pheno_input <- pheno + pred_inputs$geno_input <- geno_adj + pred_inputs$ped_input <- extended_ped - #Subset training and testing samples - train <- fold_df %>% - dplyr::filter(FoldID != fold) %>% - pull(Sample) - test <- setdiff(row.names(Geno.mat),train) + continue_prediction(TRUE) + }) - Fixed_train = NULL + #3) Analysis only proceeds once continue_prediction is converted to TRUE + pred_outputs <- reactive({ - # Initialize a matrix to store GEBVs for this fold - GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) - colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") - rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + req(continue_prediction(),pred_inputs$pheno_input, pred_inputs$geno_input) - #Evaluate each trait using the same train and testing samples for each - for (trait_idx in 1:length(traits)) { - #Mask phenotypes in testing group - Pheno_test <- pred_inputs$pheno_input - Pheno_test[test, traits[trait_idx]] <- NA - #Kin.blup - traitpred <- kin.blup(data = Pheno_test, geno = names(pred_inputs$pheno_input)[1], pheno = traits[trait_idx], fixed = fixed_cat, covariate = fixed_cov, K=Geno.mat) - #Cor between test values and predicted breeding values - results[(((r-1)*5)+fold), trait_idx] <- cor(pred_inputs$pheno_input[test, traits[trait_idx]], traitpred$g[test], use = "complete.obs") - results[(((r-1)*5)+fold), (length(traits)+1)] <- r - results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + # Stop analysis if cancel was selected + if (isFALSE(continue_prediction())) { + return() + } - # 
Extract GEBVs - GEBVs_fold[, trait_idx] <- traitpred$g[test] #Confirm it is accuract to calculate the GEBVs for testing group from the trained model + # Convert genotype matrix according to ploidy and model used + geno_formated <- format_geno_matrix(pred_inputs$geno_input,advanced_options$pred_model, advanced_options$pred_matrix, input$pred_ploidy) + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 30, title = paste("Genotype matrix formatted for", advanced_options$pred_model, advanced_options$pred_matrix)) - # Calculate heritability (these are wrong) - Vu <- traitpred$Vg - Ve <- traitpred$Ve - heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + results <- run_predictive_model(geno = geno_formated, + pheno = pred_inputs$pheno_input, + selected_traits = input$pred_trait_info, + predictive_model = advanced_options$pred_model, + relationship_matrix_type = advanced_options$pred_matrix, + pedigree = pred_inputs$ped_input, + fixed_effects = input$pred_fixed_info, + categorical_fixed_effects = input$pred_fixed_cat, + ploidy = input$pred_ploidy, + cores = input$pred_cores, + cycles = input$pred_cv, + folds = 5) - } - #Add iter and fold information for each trait/result - heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r - heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + updateProgressBar(session = session, id = "pb_prediction", value = 70, title = "Cross validation concluded") - #Add sample, iteration, and fold information to GEBVs_fold - GEBVs_fold[,"Iter"] = r - GEBVs_fold[,"Fold"] = fold - GEBVs_fold[,"Sample"] <- test + #Save to reactive value + pred_outputs <- list(corr_output = NULL, comb_output = NULL, all_GEBVs = NULL, avg_GEBVs = NULL) + pred_outputs$corr_output <- results$PredictionAccuracy + pred_outputs$all_GEBVs <- results$GEBVs - # Store GEBVs for this fold - GEBVs_cycle[[fold]] <- GEBVs_fold + # Convert trait columns to numeric + results$GEBVs <- results$GEBVs %>% + 
mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) - } + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- results$GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) - # Store GEBVs for this cycle - GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + pred_outputs$avg_GEBVs <- average_gebvs_df - } + columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) + average_accuracy_df <- results$PredictionAccuracy %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) - # Combine all GEBVs into a single DataFrame - GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + pred_outputs$comb_output <- average_accuracy_df + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") - results <- as.data.frame(results) - heritability_scores <- as.data.frame(heritability_scores) + pred_outputs + }) - # Combine results and heritability_scores using cbind - combined_results <- cbind(results, heritability_scores) + plots <- reactive({ + validate( + need(!is.null(pred_outputs()$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) - #Save to reactive value - pred_outputs$corr_output <- results - pred_outputs$all_GEBVs <- results$GEBVs_df + df <- pred_outputs()$corr_output + df <- df %>% dplyr::select(-Fold, -Iter) - # Convert trait columns to numeric - GEBVs <- GEBVs_df %>% - mutate(across(all_of(traits), ~ as.numeric(.x))) + #Probably want to add the ability for the user to select which trait(s) to display here - # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold - average_gebvs_df <- GEBVs %>% - group_by(Sample) %>% - summarize(across(all_of(traits), mean, na.rm = TRUE)) + #Convert to long format for ggplot + df_long <- pivot_longer( + df, + 
cols = colnames(df), # Exclude the Cycle column from transformation + names_to = "Trait", # New column for trait names + values_to = "Correlation" # New column for correlation values + ) - pred_outputs$avg_GEBVs <- average_gebvs_df + plots <- list(box_plot = NULL, violin_plot = NULL) + #This can be adapted if we start comparing more than one GP model + #Also consider a violin plot to show each cor value + #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + + plots$box_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + + #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + + geom_boxplot() + + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", + y = "Pearson Correlation") + + #theme_minimal() + # Using a minimal theme + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + plots$violin_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + + geom_violin(trim = TRUE) + # Add violin plot + geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", # x-label is blank because it's not relevant per facet + y = "Pearson Correlation") + + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + 
strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + plots + }) + + #Output the genomic prediction correlation box plots + output$pred_box_plot <- renderPlot({ + plots()$box_plot + scale_fill_manual(values = colors$colors) + }) + + #Output the genomic prediction correlation box plots + output$pred_violin_plot <- renderPlot({ + plots()$violin_plot + scale_fill_manual(values = colors$colors) + }) + + #Output the prediction tables + + all_GEBVs <- reactive({ + validate( + need(!is.null(pred_outputs()$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs()$comb_output + }) - columns <- setdiff(colnames(results), c("Iter","Fold")) - average_accuracy_df <- results %>% - group_by(Iter) %>% - summarize(across(all_of(columns), mean, na.rm = TRUE)) + output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + comb_output <- reactive({ + validate( + need(!is.null(pred_outputs()$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs()$comb_output + }) - pred_outputs$comb_output <- average_accuracy_df + output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + avg_GEBVs <- reactive({ + validate( + need(!is.null(pred_outputs()$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs()$avg_GEBVs + }) + + output$pred_gebvs_table <- renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + #Download files for GP + output$download_pred_file <- downloadHandler( + filename = function() { + paste0("GS-results-", Sys.Date(), ".zip") + }, + content = function(file) { + # Temporary files list + temp_dir <- tempdir() + 
temp_files <- c() + + if (!is.null(pred_outputs()$avg_GEBVs)) { + # Create a temporary file for assignments + gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) + write.csv(pred_outputs()$avg_GEBVs, gebv_file, row.names = FALSE) + temp_files <- c(temp_files, gebv_file) } - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 90, title = "Generating Results") - - ##Figures and Tables - - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") - - #End the event - continue_prediction(NULL) - }) + if (!is.null(pred_outputs()$comb_output)) { + # Create a temporary file for BIC data frame + acc_file <- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) + write.csv(pred_outputs()$comb_output, acc_file, row.names = FALSE) + temp_files <- c(temp_files, acc_file) + } - plots <- reactive({ - validate( - need(!is.null(pred_outputs$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) + # Zip files only if there's something to zip + if (length(temp_files) > 0) { + zip(file, files = temp_files, extras = "-j") # Using -j to junk paths + } - df <- pred_outputs$corr_output - df <- df %>% dplyr::select(-Fold, -Iter) + # Optionally clean up + file.remove(temp_files) + } + ) - #Probably want to add the ability for the user to select which trait(s) to display here + #Download GP Figures + output$download_pred_figure <- downloadHandler( - #Convert to long format for ggplot - df_long <- pivot_longer( - df, - cols = colnames(df), # Exclude the Cycle column from transformation - names_to = "Trait", # New column for trait names - values_to = "Correlation" # New column for correlation values - ) + filename = function() { + if (input$pred_image_type == "jpeg") { + paste("GS-", Sys.Date(), ".jpg", sep="") + } else if (input$pred_image_type == "png") { + paste("GS-", Sys.Date(), ".png", sep="") + } else { + 
paste("GS-", Sys.Date(), ".tiff", sep="") + } + }, + content = function(file) { + #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots + req(input$pred_figures) + + if (input$pred_image_type == "jpeg") { + jpeg(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } else if (input$pred_image_type == "png") { + png(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } - #This can be adapted if we start comparing more than one GP model - #Also consider a violin plot to show each cor value - #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + - pred_outputs$box_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + - #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + - geom_boxplot() + - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", - y = "Pearson Correlation") + - #theme_minimal() + # Using a minimal theme - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - - pred_outputs$violin_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + - geom_violin(trim = TRUE) + # Add 
violin plot - geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", # x-label is blank because it's not relevant per facet - y = "Pearson Correlation") + - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - }) + # Conditional plotting based on input selection + if (input$pred_figures == "Violin Plot") { + req(plots()$violin_plot) - #Output the genomic prediction correlation box plots - output$pred_box_plot <- renderPlot({ - pred_outputs$box_plot + scale_fill_manual(values = pred_outputs$colors) - }) + print(plots()$violin_plot + scale_fill_manual(values = colors$colors)) - #Output the genomic prediction correlation box plots - output$pred_violin_plot <- renderPlot({ - pred_outputs$violin_plot + scale_fill_manual(values = pred_outputs$colors) - }) + } else if (input$pred_figures == "Box Plot") { + req(plots()$box_plot) + #Plot + print(plots()$box_plot + scale_fill_manual(values = colors$colors)) - #Output the prediction tables + } - all_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs$comb_output - }) + dev.off() + } - output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + ) - comb_output <- reactive({ - validate( - need(!is.null(pred_outputs$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - 
pred_outputs$comb_output + output$download_vcf <- downloadHandler( + filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) }) - output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - avg_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs$avg_GEBVs + output$download_pheno <- downloadHandler( + filename = function() { + paste0("BIGapp_passport_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_passport_file.csv", package = "BIGapp") + file.copy(ex, file) }) - - output$pred_gebvs_table <- renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - #Download files for GP - output$download_pred_file <- downloadHandler( - filename = function() { - paste0("GS-results-", Sys.Date(), ".zip") - }, - content = function(file) { - # Temporary files list - temp_dir <- tempdir() - temp_files <- c() - - if (!is.null(pred_outputs$avg_GEBVs)) { - # Create a temporary file for assignments - gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) - write.csv(pred_outputs$avg_GEBVs, gebv_file, row.names = FALSE) - temp_files <- c(temp_files, gebv_file) - } - - if (!is.null(pred_outputs$comb_output)) { - # Create a temporary file for BIC data frame - acc_file <- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) - write.csv(pred_outputs$comb_output, acc_file, row.names = FALSE) - temp_files <- c(temp_files, acc_file) - } - - # Zip files only if there's something to zip - if (length(temp_files) > 0) { - zip(file, files = temp_files, extras = "-j") # Using -j to junk paths - } - - # Optionally clean up - file.remove(temp_files) - } - ) - - #Download 
GP Figures - output$download_pred_figure <- downloadHandler( - - filename = function() { - if (input$pred_image_type == "jpeg") { - paste("GS-", Sys.Date(), ".jpg", sep="") - } else if (input$pred_image_type == "png") { - paste("GS-", Sys.Date(), ".png", sep="") - } else { - paste("GS-", Sys.Date(), ".tiff", sep="") - } - }, - content = function(file) { - #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots - req(input$pred_figures) - - if (input$pred_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } else if (input$pred_image_type == "png") { - png(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } - - # Conditional plotting based on input selection - if (input$pred_figures == "Violin Plot") { - req(pred_outputs$violin_plot) - - print(pred_outputs$violin_plot + scale_fill_manual(values = pred_outputs$colors)) - - } else if (input$pred_figures == "Box Plot") { - req(pred_outputs$box_plot) - #Plot - print(pred_outputs$box_plot + scale_fill_manual(values = pred_outputs$colors)) - - } - - dev.off() - } - - ) - - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) - }) - - output$download_pheno <- downloadHandler( - filename = function() { - paste0("BIGapp_passport_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_passport_file.csv", package = "BIGapp") - 
file.copy(ex, file) - }) } ## To be copied in the UI diff --git a/R/utils.R b/R/utils.R index 03f6121..5622f74 100644 --- a/R/utils.R +++ b/R/utils.R @@ -365,187 +365,3 @@ read_geno_file <- function(file_path, requires = c("GT")){ } -#' Function to perform genomic prediction -#' -#' @param geno ToDo -#' @param pheno ToDo -#' @param traits ToDo -#' @param fixed_effects ToDo -#' @param fold ToDo -#' @param iters ToDo -#' @param cores ToDo -#' -genomic_prediction <- function(geno, pheno, traits, fixed_effects = NULL, fold = 5, iters = 5, cores = 1) { - - # Define variables - traits <- traits - cycles <- as.numeric(iters) - folds <- as.numeric(fold) - total_population <- ncol(geno) - #train_size <- floor(percentage / 100 * total_population) - fixed_traits <- fixed_effects - cores <- as.numeric(cores) - - # Establish accuracy results matrix - results <- matrix(nrow = cycles*folds, ncol = length(traits) + 2) - colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits - - # Initialize a list to store GEBVs for all traits and cycles - GEBVs <- list() - - #Establish heritability_scores_df () Maybe get h2 values - # Establish results matrix - heritability_scores <- matrix(nrow = cycles*folds, ncol = length(traits) + 2) - colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits - - #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) - pb_value = 10 - #Remove the fixed traits from the pheno file - if (length(fixed_traits) == 0) { - pheno <- pheno - } else { - #Subset fixed traits - Fixed <- subset(pheno, select = fixed_traits) - - #pheno <- subset(pheno, select = -fixed_traits) - convert_categorical_to_factor <- function(df, fixed_cat) { - for (col in names(df)) { - if (col %in% fixed_cat) { - df[[col]] <- as.factor(df[[col]]) - } - } - return(df) - } - # Convert all columns to factor if they are not numeric or integer - Fixed <- 
convert_categorical_to_factor(Fixed, fixed_cat) - - #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor - row.names(Fixed) <- row.names(pheno) - - #Make the matrix - formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) - formula <- as.formula(formula_str) - - # Create the design matrix using the constructed formula - Fixed <- model.matrix(formula, data = Fixed) - } - - #Make kinship matrix of all individuals? - #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy - #If wanting to use Kkinship matrix, will then need to see how to implement it here - - #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). - impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) - geno <- impute$imputed - - # For loop - for (r in 1:cycles) { - set.seed(r) - fold_ids <- sample(rep(1:folds, length.out = total_population)) - fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold - fold_results <- matrix(nrow = folds, ncol = length(traits)) - colnames(fold_results) <- traits - - #Initialize GEBV object for each cycle - GEBVs_cycle <-list() - - for (fold in 1:folds) { - - #Status bar length - pb_value = pb_value + (70 / as.numeric(cycles*folds)) - - train <- fold_df %>% - dplyr::filter(FoldID != fold) %>% - pull(Sample) - test <- setdiff(row.names(geno),train) - - #Subset datasets - if (length(fixed_traits) == 0) { - Fixed_train = NULL - } else{ - Fixed_train <- data.frame(Fixed[train, ]) - Fixed_train <- as.matrix(Fixed_train) - row.names(Fixed_train) <- train - - #Fixed (testing) - Fixed_test<- data.frame(Fixed[test, ]) - Fixed_test <- as.matrix(Fixed_test) - row.names(Fixed_test) <- test - - } - - pheno_train <- pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set - m_train 
<- geno[train, ] - pheno_test <- pheno[test, ] - #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? - m_valid <- geno[test, ] - - # Initialize a matrix to store GEBVs for this fold - GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) - colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") - rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") - - #Evaluate each trait using the same train and testing samples for each - for (trait_idx in 1:length(traits)) { - trait <- pheno_train[, traits[trait_idx]] # Get the trait of interest - trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) - TRT <- trait_answer$u - e <- as.matrix(TRT) - pred_trait_test <- m_valid %*% e - pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits - trait_test <- pheno_test[, traits[trait_idx]] - results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") - results[(((r-1)*5)+fold), (length(traits)+1)] <- r - results[(((r-1)*5)+fold), (length(traits)+2)] <- fold - - # Extract GEBVs - # Check if Fixed_train is not NULL and include beta if it is - if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { - # Calculate GEBVs including fixed effects - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta - } else { - # Calculate GEBVs without fixed effects - GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to calculate the GEBVs for testing group from the trained model - } - - # Calculate heritability for the current trait - Vu <- trait_answer$Vu - Ve <- trait_answer$Ve - heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) - - } - #Add iter and fold information for each trait/result - heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r - heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - - #Add 
sample, iteration, and fold information to GEBVs_fold - GEBVs_fold[,"Iter"] = r - GEBVs_fold[,"Fold"] = fold - GEBVs_fold[,"Sample"] <- test - - # Store GEBVs for this fold - GEBVs_cycle[[fold]] <- GEBVs_fold - - } - - # Store GEBVs for this cycle - GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) - - } - - # Combine all GEBVs into a single DataFrame - GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) - - results <- as.data.frame(results) - heritability_scores <- as.data.frame(heritability_scores) - - # Combine results and heritability_scores using cbind - combined_results <- cbind(results, heritability_scores) - - return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) -} - -# genomic_prediction_gblup <- function(geno, pred_matrix, ped_file = NULL){ -# -# } - diff --git a/tests/testthat/test-GSAcc.R b/tests/testthat/test-GSAcc.R index 84342db..238e78d 100644 --- a/tests/testthat/test-GSAcc.R +++ b/tests/testthat/test-GSAcc.R @@ -1,6 +1,6 @@ context("GSAcc") -test_that("test Predictive Ability",{ +test_that("test Predictive Ability iris",{ # packages library(vcfR) @@ -14,25 +14,22 @@ test_that("test Predictive Ability",{ input <- list() input$trait_file$datapath <- system.file("iris_passport_file.csv", package = "BIGapp") - input$pred_color_select <- "red" - input$pred_ploidy <- 2 input$pred_file$datapath <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + input$pred_color_select <- "red" + input$pred_ploidy <- 2 input$pred_trait_info <- "Petal.Length" input$pred_cv <- 5 - input$pred_fixed_info <- NULL input$pred_fixed_cat <- NULL input$pred_cores <- 3 - input$pred_model <- "rrBLUP" - input$pred_matrix <- "Gmatrix" input$ped_file <- NULL + fixed_traits <- input$pred_fixed_info + #Close popup window when user "saves options" advanced_options <- list() - advanced_options$pred_model <- input$pred_model - advanced_options$pred_matrix <- input$pred_matrix advanced_options$ped_file <- input$ped_file ####Genomic Prediction 
Accuracy @@ -41,11 +38,6 @@ test_that("test Predictive Ability",{ #2) to input and validate the input files #3) to perform the genomic prediction - #1) Get traits - info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) - trait_var <- colnames(info_df) - trait_var <- trait_var[2:length(trait_var)] - #2) Error check for prediction and save input files continue_prediction <- NULL pred_inputs <- list( @@ -66,37 +58,23 @@ test_that("test Predictive Ability",{ colors = NULL ) - # Update colors based on input - pred_outputs$colors <- switch(input$pred_color_select, - "red" = "#F8766D", - "blue" = "#00BFC4", - "green" = "#00BA38", - input$pred_color_select) - - #Variables - ploidy <- as.numeric(input$pred_ploidy) - geno_path <- input$pred_file$datapath pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) row.names(pheno) <- pheno[,1] - traits <- input$pred_trait_info - CVs <- as.numeric(input$pred_cv) #Getting genotype matrix - #Geno.file conversion if needed - geno_snps <- read_geno_file(geno_path, requires = "GT") + geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") geno <- geno_snps[[1]] - pred_inputs$pred_snps <- geno_snps[[2]] + pred_inputs$pred_snps <- geno_snps[[2]] # n markers + pred_inputs$pred_genos <- ncol(geno) # n samples - #Save number of samples in file - pred_inputs$pred_genos <- ncol(geno) + # Update colors based on input + pred_outputs$colors <- assign_colors(input$pred_color_select) + ##### input checks #Check that the ploidy entered is correct - if (ploidy != max(geno, na.rm = TRUE)) stop(paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered")) - - # Convert genotype matrix according to ploidy - geno_adj_init <- 2 * (geno / as.numeric(ploidy)) - 1 + if (input$pred_ploidy != max(geno, na.rm = TRUE)) stop(paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy 
entered")) #Make sure the trait file and genotype file are in the same order # Column names for geno (assuming these are the individual IDs) @@ -112,70 +90,44 @@ test_that("test Predictive Ability",{ if (length(common_ids) == 0) { stop("All samples were missing from the phenotype file") } else { - if (length(common_ids) < length(colnames_geno)) stop(paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information")) + if (length(common_ids) < length(colnames_geno)) + warning(paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information")) + if (length(common_ids) < length(ids_pheno)) + warning(paste0((length(ids_pheno)-length(common_ids))," samples were removed for not having genotypic information")) } # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs - geno_adj <- geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs - pheno <- pheno[match(common_ids, ids_pheno), ] + geno_adj <- geno[, common_ids] # Assuming that the columns can be directly indexed by IDs + pheno <- pheno[match(common_ids, ids_pheno), ] # If there is pheno but not geno, the sample is also discarted - ##Save to reactive values - #Gmatrix needs the original allele count values, so the user matrix selection determines the genotype matrix used - pred_inputs$pheno_input <- pheno - if (advanced_options$pred_matrix == "Gmatrix" || is.null(advanced_options$pred_matrix)) { - pred_inputs$geno_input <- geno_adj - } else if (advanced_options$pred_matrix == "Hmatrix") { - pred_inputs$geno_input <- geno[, common_ids] - } else { - pred_inputs$geno_input <- geno_adj - } + # Check pedigree - #3) Analysis only proceeds once continue_prediction is converted to TRUE - #Variables - ploidy <- as.numeric(input$pred_ploidy) - geno_adj <- pred_inputs$geno_input - pheno <- pred_inputs$pheno_input - traits <- input$pred_trait_info - CVs <- as.numeric(input$pred_cv) 
- fixed_traits <- input$pred_fixed_info - fixed_cat <- input$pred_fixed_cat - fixed_cov <- if (is.null(input$pred_fixed_info) || length(input$pred_fixed_info) == length(input$pred_fixed_cat)) { - NULL - } else { - setdiff(input$pred_fixed_info, input$pred_fixed_cat) - } - cores <- input$pred_cores - - #Assign colors - if (input$pred_color_select == "red"){ - pred_outputs$colors <- "#F8766D" - } else if (input$pred_color_select == "blue") { - pred_outputs$colors <- "#00BFC4" - } else if (input$pred_color_select == "green") { - pred_outputs$colors <- "#00BA38" - } else{ - pred_outputs$colors <- input$pred_color_select - } + ## Make ouput as checked inputs pred_inputs + pred_inputs$pheno_input <- pheno + pred_inputs$geno_input <- geno_adj - #Control whether rrBLUP or GBLUP run depending on user input ####### rrBLUP ##Need to add ability for the use of parallelism for the for cross-validation ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays - pred_inputs$geno_input <- geno_adj - - # Example call to the function - #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... - results <- genomic_prediction(geno = geno_adj, - pheno = pheno, - traits = traits, - fixed_effects = fixed_traits, - iters = input$pred_cv, - cores = cores) + input$pred_model <- "rrBLUP" - #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) 
- #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") + # Convert genotype matrix according to ploidy and model used + geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) + + results <- run_predictive_model(geno = geno_formated, + pheno = pred_inputs$pheno_input, + selected_traits = input$pred_trait_info, + predictive_model = input$pred_model, + relationship_matrix_type = input$pred_matrix, + pedigree = pred_inputs$ped_input, + fixed_effects = input$pred_fixed_info, + categorical_fixed_effects = input$pred_fixed_cat, + ploidy = input$pred_ploidy, + cores = input$pred_cores, + cycles = input$pred_cv, + folds = 5) #Save to reactive value pred_outputs_rrBLUP <- pred_outputs @@ -184,12 +136,12 @@ test_that("test Predictive Ability",{ # Convert trait columns to numeric results$GEBVs <- results$GEBVs %>% - mutate(across(all_of(traits), ~ as.numeric(.x))) + mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold average_gebvs_df <- results$GEBVs %>% group_by(Sample) %>% - summarize(across(all_of(traits), mean, na.rm = TRUE)) + summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) pred_outputs_rrBLUP$avg_GEBVs <- average_gebvs_df @@ -200,347 +152,692 @@ test_that("test Predictive Ability",{ pred_outputs_rrBLUP$comb_output <- average_accuracy_df - df <- pred_outputs_rrBLUP$corr_output - df <- df %>% dplyr::select(-Fold, -Iter) - - #Probably want to add the ability for the user to select which trait(s) to display here - - #Convert to long format for ggplot - df_long <- pivot_longer( - df, - cols = colnames(df), # Exclude the Cycle column from transformation - names_to = "Trait", # New column for trait names - values_to = "Correlation" # New column for correlation values - ) - - #This can be adapted if we start comparing more than one GP model 
- #Also consider a violin plot to show each cor value - #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + - pred_outputs_rrBLUP$box_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + - #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + - geom_boxplot() + - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", - y = "Pearson Correlation") + - #theme_minimal() + # Using a minimal theme - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - - pred_outputs_rrBLUP$violin_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + - geom_violin(trim = TRUE) + # Add violin plot - geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", # x-label is blank because it's not relevant per facet - y = "Pearson Correlation") + - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - - plots <- list(pred_outputs_rrBLUP$box_plot, pred_outputs_rrBLUP$violin_plot) - - #Output the genomic prediction correlation box plots - plots[[1]] + scale_fill_manual(values = pred_outputs_rrBLUP$colors) - 
- #Output the genomic prediction correlation box plots - plots[[2]] + scale_fill_manual(values = pred_outputs_rrBLUP$colors) - - #Output the prediction tables - pred_outputs_rrBLUP$corr_output - pred_outputs_rrBLUP$comb_output - pred_outputs_rrBLUP$all_GEBVs - pred_outputs_rrBLUP$avg_GEBVs - ######### ######### GBLUP #Note: should wrap the GBLUP into a function too # Define variables - cycles <- input$pred_cv - folds <- 5 - total_population <- ncol(pred_inputs$geno_input) #train_size <- floor(percentage / 100 * total_population) - cores <- as.numeric(cores) #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) - pb_value = 10 - - if (advanced_options$pred_matrix == "Gmatrix") { - #Convert normalized genotypes to relationship matrix - #By default, it removes SNPs with more than 50% missing data and imputes using the mean - Geno.mat <- A.mat(t(pred_inputs$geno_input)) - - }else if (advanced_options$pred_matrix == "Amatrix") { - - #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 - ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") - colnames(ped) <- c("Ind", "Sire", "Dam") - #Convert NAs to 0 - ped[is.na(ped)] <- 0 - #Ensure Sire and Dam are also listed as individuals - missing_parents <- unique(c(ped$Sire, ped$Dam)) - # Filter out parents already listed as individuals and non-zero values - missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] - # Create new rows for missing parents and setting their parents to 0 (unknown) - new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) - # Combine the original dataframe with the new rows and remove duplicates - ped_extended <- unique(rbind(ped, new_rows)) - - #Converting to Amatrix - #Using the default additive relationship options (Amatrix only works for even numbered ploidy) - Geno.mat <- Amatrix(data = ped_extended, ploidy = ploidy) - - #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) - pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) - valid_ids <- intersect(pheno_ids, rownames(Geno.mat)) - pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] - Geno.mat <- Geno.mat[valid_ids, valid_ids] - - #Update variable - total_population <- ncol(Geno.mat) - print("check15") - }else if (advanced_options$pred_matrix == "Hmatrix") { - print("check16") - #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 - ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") - colnames(ped) <- c("Ind", "Sire", "Dam") - #Convert NAs to 0 - ped[is.na(ped)] <- 0 - #Ensure Sire and Dam are also listed as individuals - missing_parents <- unique(c(ped$Sire, ped$Dam)) - # Filter out parents already listed as individuals and non-zero values - missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] - # Create new rows for missing parents and setting their parents to 0 (unknown) - new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) - # Combine the original dataframe with the new rows and remove duplicates - ped_extended <- unique(rbind(ped, new_rows)) - - #Converting to Amatrix - #Using the default additive relationship options (Amatrix only works for even numbered ploidy) - Ped.mat <- Amatrix(data = ped_extended, ploidy = ploidy) - - #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) - pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) - valid_ids <- intersect(pheno_ids, rownames(Ped.mat)) - pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] - Ped.mat <- Ped.mat[valid_ids, valid_ids] - - #Update variable - total_population <- ncol(Ped.mat) - - #Using Gmatrix to get the Gmatrix instead of A.mat for consistency - #Should I be using the raw dosage values or is it okay to use the scaled genotype data that is used for A.mat()? 
- G.mat <- Gmatrix(t(pred_inputs$geno_input[ ,valid_ids]), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") - G.mat <- round(G.mat,3) #to be easy to invert - - #Computing H matrix (Martini) - Using the name Geno.mat for consistency - Geno.mat <- Hmatrix(A=Ped.mat, G=G.mat, method="Martini", - ploidy= ploidy, - maf=0.05) - #Clean memory - rm(G.mat) - rm(Ped.mat) - rm(ped_filtered) - } - # Establish accuracy results matrix - results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) - colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits - # Initialize a list to store GEBVs for all traits and cycles - GEBVs <- list() - - #Establish heritability_scores_df () Maybe get h2 values - # Establish results matrix - heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) - colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits - - - # For loop - for (r in 1:cycles) { - set.seed(r) - fold_ids <- sample(rep(1:Folds, length.out = total_population)) - fold_df <- data.frame(Sample = row.names(Geno.mat), FoldID = fold_ids) #Randomly assign each sample to a fold - fold_results <- matrix(nrow = Folds, ncol = length(traits)) - colnames(fold_results) <- traits - - #Initialize GEBV object for each cycle - GEBVs_cycle <-list() - - #Status - updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) - - for (fold in 1:Folds) { - - #Status bar length - pb_value = pb_value + (70 / as.numeric(cycles*Folds)) - - #Subset training and testing samples - train <- fold_df %>% - dplyr::filter(FoldID != fold) %>% - pull(Sample) - test <- setdiff(row.names(Geno.mat),train) - - Fixed_train = NULL - - # Initialize a matrix to store GEBVs for this fold - GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) - colnames(GEBVs_fold) <- 
c(traits,"Sample","Iter","Fold") - rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") - - #Evaluate each trait using the same train and testing samples for each - for (trait_idx in 1:length(traits)) { - #Mask phenotypes in testing group - Pheno_test <- pred_inputs$pheno_input - Pheno_test[test, traits[trait_idx]] <- NA - #Kin.blup - traitpred <- kin.blup(data = Pheno_test, geno = names(pred_inputs$pheno_input)[1], pheno = traits[trait_idx], fixed = fixed_cat, covariate = fixed_cov, K=Geno.mat) - #Cor between test values and predicted breeding values - results[(((r-1)*5)+fold), trait_idx] <- cor(pred_inputs$pheno_input[test, traits[trait_idx]], traitpred$g[test], use = "complete.obs") - results[(((r-1)*5)+fold), (length(traits)+1)] <- r - results[(((r-1)*5)+fold), (length(traits)+2)] <- fold - - # Extract GEBVs - GEBVs_fold[, trait_idx] <- traitpred$g[test] #Confirm it is accuract to calculate the GEBVs for testing group from the trained model - - - # Calculate heritability (these are wrong) - Vu <- traitpred$Vg - Ve <- traitpred$Ve - heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) - - } - #Add iter and fold information for each trait/result - heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r - heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold - - #Add sample, iteration, and fold information to GEBVs_fold - GEBVs_fold[,"Iter"] = r - GEBVs_fold[,"Fold"] = fold - GEBVs_fold[,"Sample"] <- test - - # Store GEBVs for this fold - GEBVs_cycle[[fold]] <- GEBVs_fold - - } - - # Store GEBVs for this cycle - GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) - - } - - # Combine all GEBVs into a single DataFrame - GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) - - results <- as.data.frame(results) - heritability_scores <- as.data.frame(heritability_scores) + # Convert genotype matrix according to ploidy + input$pred_model <- "GBLUP" + input$pred_matrix <- "Gmatrix" + advanced_options$pred_matrix <- 
input$pred_matrix - # Combine results and heritability_scores using cbind - combined_results <- cbind(results, heritability_scores) + # Convert genotype matrix according to ploidy and model used + geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) + + # Main function + results <- run_predictive_model(geno = geno_formated, + pheno = pred_inputs$pheno_input, + selected_traits = input$pred_trait_info, + predictive_model = input$pred_model, + relationship_matrix_type = input$pred_matrix, + pedigree = pred_inputs$ped_input, + fixed_effects = input$pred_fixed_info, + categorical_fixed_effects = input$pred_fixed_cat, + ploidy = input$pred_ploidy, + cores = input$pred_cores, + cycles = input$pred_cv, + folds = 5) #Save to reactive value pred_outputs_gBLUP <- pred_outputs - pred_outputs_gBLUP$corr_output <- results - pred_outputs_gBLUP$all_GEBVs <- results$GEBVs_df + pred_outputs_gBLUP$corr_output <- results$PredictionAccuracy + pred_outputs_gBLUP$all_GEBVs <- results$GEBVs # Convert trait columns to numeric - GEBVs <- GEBVs_df %>% - mutate(across(all_of(traits), ~ as.numeric(.x))) + GEBVs <- results$GEBVs %>% + mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold average_gebvs_df <- GEBVs %>% group_by(Sample) %>% - summarize(across(all_of(traits), mean, na.rm = TRUE)) + summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) pred_outputs_gBLUP$avg_GEBVs <- average_gebvs_df - columns <- setdiff(colnames(results), c("Iter","Fold")) - average_accuracy_df <- results %>% + columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) + average_accuracy_df <- results$PredictionAccuracy %>% group_by(Iter) %>% summarize(across(all_of(columns), mean, na.rm = TRUE)) - pred_outputs_gBLUP$comb_output <- average_accuracy_df - df <- pred_outputs_gBLUP$corr_output - df <- df %>% 
dplyr::select(-Fold, -Iter) - - #Probably want to add the ability for the user to select which trait(s) to display here - - #Convert to long format for ggplot - df_long <- pivot_longer( - df, - cols = colnames(df), # Exclude the Cycle column from transformation - names_to = "Trait", # New column for trait names - values_to = "Correlation" # New column for correlation values - ) - - #This can be adapted if we start comparing more than one GP model - #Also consider a violin plot to show each cor value - #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + - plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + - #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + - geom_boxplot() + - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", - y = "Pearson Correlation") + - #theme_minimal() + # Using a minimal theme - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - - plot_violin <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + - geom_violin(trim = TRUE) + # Add violin plot - geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", # x-label is blank because it's not relevant per facet - y = "Pearson Correlation") + - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 
14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - - - #Output the genomic prediction correlation box plots - plots()[[1]] + scale_fill_manual(values = pred_outputs_gBLUP$colors) - - #Output the genomic prediction correlation box plots - plots()[[2]] + scale_fill_manual(values = pred_outputs_gBLUP$colors) - - #Output the prediction tables - pred_outputs_gBLUP$comb_output - - all_GEBVs() - - pred_outputs_gBLUP$comb_output - - comb_output() - - pred_outputs_gBLUP$avg_GEBVs + # Compare rrBLUP and GBLUP + expect_equal(pred_outputs_gBLUP$corr_output[,1], pred_outputs_rrBLUP$corr_output[,1], tolerance = 0.01) + expect_equal(pred_outputs_gBLUP$comb_output[,2], pred_outputs_rrBLUP$comb_output[,2], tolerance = 0.01) + # expect_equal(pred_outputs_gBLUP$avg_GEBVs[,2], pred_outputs_rrBLUP$avg_GEBVs[,2], tolerance = 0.01) # Different + # expect_equal(pred_outputs_gBLUP$all_GEBVs[,1], pred_outputs_rrBLUP$all_GEBVs[,1], tolerance = 0.1) # Different }) - - - +# test_that("test Predictive Ability wheat (BGLR dataset)",{ +# +# # packages +# library(vcfR) +# library(BIGapp) +# library(rrBLUP) +# library(dplyr) +# library(tidyr) +# library(ggplot2) +# +# library(BGLR) +# +# # Inputs +# input <- list() +# +# input$pred_color_select <- "red" +# input$pred_ploidy <- 2 +# input$pred_trait_info <- "Pheno2" +# input$pred_cv <- 5 +# input$pred_fixed_info <- NULL +# input$pred_fixed_cat <- NULL +# input$pred_cores <- 3 +# +# input$ped_file <- NULL +# +# fixed_traits <- input$pred_fixed_info +# +# #Close popup window when user "saves options" +# advanced_options <- list() +# advanced_options$ped_file <- input$ped_file +# +# ####Genomic Prediction Accuracy +# #This tab involved 3 observeEvents +# #1) to get the traits listed in the phenotype file +# #2) to input and validate the input files +# #3) to perform the genomic prediction +# +# #2) 
Error check for prediction and save input files +# continue_prediction <- NULL +# pred_inputs <- list( +# pheno_input = NULL, +# geno_input = NULL, +# pred_snps = NULL, +# pred_genos = NULL, +# pred_geno_pheno = NULL +# ) +# +# pred_outputs <- list( +# corr_output = NULL, +# box_plot = NULL, +# violin_plot = NULL, +# comb_output = NULL, +# avg_GEBVs = NULL, +# all_GEBVs = NULL, +# colors = NULL +# ) +# +# data(wheat) +# #Variables +# pheno <- wheat.Y +# #row.names(pheno) <- pheno[,1] +# colnames(pheno) <- paste0("Pheno", 1:4) +# pheno <- data.frame(Sample_ID = rownames(pheno), pheno) # First column as the sample names is required +# +# #Getting genotype matrix +# #Geno.file conversion if needed +# geno_raw <- wheat.X +# +# # Codification 0 homozygous ref, 1 homozygous alt +# geno <- matrix(as.numeric(gsub(1,2,geno_raw)), nrow = nrow(geno_raw)) +# colnames(geno) <- colnames(geno_raw) +# rownames(geno) <- rownames(pheno) +# geno <- t(geno) +# +# pred_inputs$pred_snps <- nrow(geno) # n markers +# pred_inputs$pred_genos <- ncol(geno) # n samples +# +# # Update colors based on input +# pred_outputs$colors <- assign_colors(input$pred_color_select) +# +# ##### input checks +# #Check that the ploidy entered is correct +# if (input$pred_ploidy != max(geno, na.rm = TRUE)) stop(paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered")) +# +# #Make sure the trait file and genotype file are in the same order +# # Column names for geno (assuming these are the individual IDs) +# colnames_geno <- colnames(geno) +# # Assuming the first column in Pheno contains the matching IDs +# ids_pheno <- rownames(pheno) +# # Find common identifiers +# common_ids <- intersect(colnames_geno, ids_pheno) +# #Get number of id +# pred_inputs$pred_geno_pheno <- length(common_ids) +# +# #Throw an error if there are less matching samples in the phenotype file than the genotype file +# if (length(common_ids) == 0) { +# stop("All samples were missing 
from the phenotype file") +# } else { +# if (length(common_ids) < length(colnames_geno)) +# warning(paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information")) +# if (length(common_ids) < length(ids_pheno)) +# warning(paste0((length(ids_pheno)-length(common_ids))," samples were removed for not having genotypic information")) +# } +# +# # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs +# geno_adj <- geno[, common_ids] # Assuming that the columns can be directly indexed by IDs +# pheno <- pheno[match(common_ids, ids_pheno), ] # If there is pheno but not geno, the sample is also discarted +# +# # Check pedigree +# +# ## Make ouput as checked inputs pred_inputs +# pred_inputs$pheno_input <- pheno +# pred_inputs$geno_input <- geno_adj +# +# ####### rrBLUP +# ##Need to add ability for the use of parallelism for the for cross-validation +# ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays +# +# input$pred_model <- "rrBLUP" +# +# # Convert genotype matrix according to ploidy and model used +# geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) +# +# results <- run_predictive_model(geno = geno_formated, +# pheno = pred_inputs$pheno_input, +# selected_traits = input$pred_trait_info, +# predictive_model = input$pred_model, +# relationship_matrix_type = input$pred_matrix, +# pedigree = pred_inputs$ped_input, +# fixed_effects = input$pred_fixed_info, +# categorical_fixed_effects = input$pred_fixed_cat, +# ploidy = input$pred_ploidy, +# cores = input$pred_cores, +# cycles = input$pred_cv, +# folds = 5) +# +# #Save to reactive value +# pred_outputs_rrBLUP <- pred_outputs +# pred_outputs_rrBLUP$corr_output <- results$PredictionAccuracy +# pred_outputs_rrBLUP$all_GEBVs <- results$GEBVs +# +# # Convert trait columns to numeric +# results$GEBVs <- results$GEBVs %>% +# 
mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) +# +# # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold +# average_gebvs_df <- results$GEBVs %>% +# group_by(Sample) %>% +# summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) +# +# pred_outputs_rrBLUP$avg_GEBVs <- average_gebvs_df +# +# columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) +# average_accuracy_df <- results$PredictionAccuracy %>% +# group_by(Iter) %>% +# summarize(across(all_of(columns), mean, na.rm = TRUE)) +# +# pred_outputs_rrBLUP$comb_output <- average_accuracy_df +# +# ######### +# +# ######### GBLUP +# #Note: should wrap the GBLUP into a function too +# # Define variables +# #train_size <- floor(percentage / 100 * total_population) +# #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) +# +# # Convert genotype matrix according to ploidy +# input$pred_model <- "GBLUP" +# input$pred_matrix <- "Gmatrix" +# advanced_options$pred_matrix <- input$pred_matrix +# +# # Convert genotype matrix according to ploidy and model used +# geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) +# +# # Main function +# results <- run_predictive_model(geno = geno_formated, +# pheno = pred_inputs$pheno_input, +# selected_traits = input$pred_trait_info, +# predictive_model = input$pred_model, +# relationship_matrix_type = input$pred_matrix, +# pedigree = pred_inputs$ped_input, +# fixed_effects = input$pred_fixed_info, +# categorical_fixed_effects = input$pred_fixed_cat, +# ploidy = input$pred_ploidy, +# cores = input$pred_cores, +# cycles = input$pred_cv, +# folds = 5) +# +# #Save to reactive value +# pred_outputs_gBLUP <- pred_outputs +# pred_outputs_gBLUP$corr_output <- results$PredictionAccuracy +# pred_outputs_gBLUP$all_GEBVs <- results$GEBVs +# +# # Convert trait columns to numeric +# GEBVs <- 
results$GEBVs %>% +# mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) +# +# # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold +# average_gebvs_df <- GEBVs %>% +# group_by(Sample) %>% +# summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUP$avg_GEBVs <- average_gebvs_df +# +# columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) +# average_accuracy_df <- results$PredictionAccuracy %>% +# group_by(Iter) %>% +# summarize(across(all_of(columns), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUP$comb_output <- average_accuracy_df +# +# # Checks +# expect_equal(pred_outputs_gBLUP$corr_output[,1], pred_outputs_rrBLUP$corr_output[,1], tolerance = 0.01) +# expect_equal(pred_outputs_gBLUP$comb_output[,2], pred_outputs_rrBLUP$comb_output[,2], tolerance = 0.01) +# expect_equal(sum(pred_outputs_gBLUP$corr_output[,1]), 12.44776, tolerance = 0.01) +# expect_equal(sum(pred_outputs_gBLUP$comb_output[,2]$Pheno2), 2.489551, tolerance = 0.01) +# expect_equal(sum(pred_outputs_gBLUP$avg_GEBVs[,2]$Pheno2), -2.913457, tolerance = 0.01) +# expect_equal(sum(as.numeric(pred_outputs_gBLUP$all_GEBVs[,1])), -14.56729, tolerance = 0.1) +# +# #### A matrix +# input$pred_matrix <- "Amatrix" +# advanced_options$pred_matrix <- input$pred_matrix +# +# # Convert genotype matrix according to ploidy and model used +# geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) +# +# # Main function +# results <- run_predictive_model(geno = geno_formated, +# pheno = pred_inputs$pheno_input, +# selected_traits = input$pred_trait_info, +# predictive_model = input$pred_model, +# relationship_matrix_type = input$pred_matrix, +# pedigree = pred_inputs$ped_input, +# fixed_effects = input$pred_fixed_info, +# categorical_fixed_effects = input$pred_fixed_cat, +# ploidy = input$pred_ploidy, +# cores = input$pred_cores, +# cycles = 
input$pred_cv, +# folds = 5, +# relationship_matrix = wheat.A) +# +# #Save to reactive value +# pred_outputs_gBLUPA <- pred_outputs +# pred_outputs_gBLUPA$corr_output <- results$PredictionAccuracy +# pred_outputs_gBLUPA$all_GEBVs <- results$GEBVs +# +# # Convert trait columns to numeric +# GEBVs <- results$GEBVs %>% +# mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) +# +# # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold +# average_gebvs_df <- GEBVs %>% +# group_by(Sample) %>% +# summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUPA$avg_GEBVs <- average_gebvs_df +# +# columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) +# average_accuracy_df <- results$PredictionAccuracy %>% +# group_by(Iter) %>% +# summarize(across(all_of(columns), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUPA$comb_output <- average_accuracy_df +# +# # Checks +# expect_equal(sum(pred_outputs_gBLUPA$corr_output[,1]), 10.335, tolerance = 0.01) +# expect_equal(sum(pred_outputs_gBLUPA$comb_output[,2]$Pheno2), 2.06, tolerance = 0.01) +# expect_equal(sum(pred_outputs_gBLUPA$avg_GEBVs[,2]$Pheno2), 318.62, tolerance = 0.01) +# expect_equal(sum(as.numeric(pred_outputs_gBLUPA$all_GEBVs[,1])), 1593.09, tolerance = 0.1) +# +# }) +# +# +# test_that("test Predictive Ability Josue",{ +# +# # packages +# library(vcfR) +# library(BIGapp) +# library(rrBLUP) +# library(dplyr) +# library(tidyr) +# library(ggplot2) +# +# # Inputs +# input <- list() +# +# input$trait_file$datapath <- "BIG_pheno.csv" +# input$pred_file$datapath <- "BIG_genos.vcf" +# input$ped_file$datapath <- "sealice_ped.csv" +# +# input$pred_color_select <- "red" +# input$pred_ploidy <- 2 +# input$pred_trait_info <- "Pheno3" +# input$pred_cv <- 5 +# input$pred_fixed_info <- NULL +# input$pred_fixed_cat <- NULL +# input$pred_cores <- 3 +# +# fixed_traits <- input$pred_fixed_info +# +# #Close popup window when user 
"saves options" +# advanced_options <- list() +# advanced_options$ped_file <- input$ped_file +# +# ####Genomic Prediction Accuracy +# #This tab involved 3 observeEvents +# #1) to get the traits listed in the phenotype file +# #2) to input and validate the input files +# #3) to perform the genomic prediction +# +# #2) Error check for prediction and save input files +# continue_prediction <- NULL +# pred_inputs <- list( +# pheno_input = NULL, +# geno_input = NULL, +# pred_snps = NULL, +# pred_genos = NULL, +# pred_geno_pheno = NULL, +# ped_input = NULL +# ) +# +# pred_outputs <- list( +# corr_output = NULL, +# box_plot = NULL, +# violin_plot = NULL, +# comb_output = NULL, +# avg_GEBVs = NULL, +# all_GEBVs = NULL, +# colors = NULL +# ) +# +# #Variables +# pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) +# pheno <- pheno[,-c(1:(which(colnames(pheno) == "Sample_ID") - 1))] # Sample_ID must be the first column +# rownames(pheno) <- pheno[,1] +# +# #Getting genotype matrix +# #Geno.file conversion if needed +# geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") +# geno <- geno_snps[[1]] +# pred_inputs$pred_snps <- geno_snps[[2]] # n markers +# pred_inputs$pred_genos <- ncol(geno) # n samples +# +# ##### input checks +# #Check that the ploidy entered is correct +# if (input$pred_ploidy != max(geno, na.rm = TRUE)) stop(paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered")) +# +# #Make sure the trait file and genotype file are in the same order +# # Column names for geno (assuming these are the individual IDs) +# colnames_geno <- colnames(geno) +# # Assuming the first column in Pheno contains the matching IDs +# ids_pheno <- pheno[, 1] +# # Find common identifiers +# common_ids <- intersect(colnames_geno, ids_pheno) +# #Get number of id +# pred_inputs$pred_geno_pheno <- length(common_ids) +# +# #Throw an error if there are less matching samples in the phenotype file 
than the genotype file +# if (length(common_ids) == 0) { +# stop("All samples were missing from the phenotype file") +# } else { +# if (length(common_ids) < length(colnames_geno)) +# warning(paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information")) +# if (length(common_ids) < length(ids_pheno)) +# warning(paste0((length(ids_pheno)-length(common_ids))," samples were removed for not having genotypic information")) +# } +# +# # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs +# geno_adj <- geno[, common_ids] # Assuming that the columns can be directly indexed by IDs +# pheno <- pheno[match(common_ids, ids_pheno), ] # If there is pheno but not geno, the sample is also discarted +# +# # Check pedigree +# #Import pedigree file, where pedigree data name (3-column way format). Unknown value should be equal 0 +# if(!is.null(advanced_options$ped_file$datapath)){ +# ped <- read.csv(advanced_options$ped_file$datapath, check.names = FALSE, colClasses = "factor") +# colnames(ped) <- c("Ind", "P1", "P2") +# #Convert NAs to 0 +# ped[is.na(ped)] <- 0 +# +# common_ped <- intersect(ped$Ind, pheno[,1]) +# #Throw an error if there are less matching samples in the phenotype file than the pedigree file +# if (length(common_ped) == 0) { +# stop("All samples were missing from the phenotype file") +# } else { +# rm_unr <- remove_unrelated(ped, samples_with_trait_info = pheno[,1]) +# extended_ped <- rm_unr[[1]] +# gen <- rm_unr[[2]] +# cat(paste0("You have pedigree information until the ", gen,"th generation\n")) +# +# if (length(common_ped) < length(ids_pheno)){ +# warning(paste0((length(ids_pheno)-length(common_ped))," samples were removed from the phenotype data for not having pedigree information")) +# pheno <- pheno[-which(!pheno$Sample_ID %in% extended_ped$Ind),] +# } +# if (length(ped$Ind) > length(extended_ped$Ind)) +# warning(paste0((length(ped$Ind)-length(extended_ped$Ind))," samples in 
the pedigree file were unrelated to the samples with phenotype information. They were removed from the analysis.")) # samples not removed +# +# ped_temp <- tempfile() +# ped_temp_file <- extended_ped +# colnames(ped_temp_file) <- c("id", "sire", "dam") +# write.table(ped_temp_file, file = ped_temp) +# ped_check <- BIGr::check_ped(ped_temp) +# if(dim(ped_check$repeated_ids)[1] != 0) stop("Check for repeated IDs in the pedigree file") +# if(dim(ped_check$messy_parents)[1] != 0) stop(paste("We found inconsistencies in the pedigree file for the individuals:", paste0(ped_check$messy_parents$id, collapse = ", "))) +# } +# +# } +# +# ## Make ouput as checked inputs pred_inputs +# pred_inputs$pheno_input <- pheno +# pred_inputs$geno_input <- geno_adj +# pred_inputs$ped_input <- extended_ped +# +# ####### rrBLUP +# ##Need to add ability for the use of parallelism for the for cross-validation +# ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays +# +# input$pred_model <- "rrBLUP" +# +# # Convert genotype matrix according to ploidy and model used +# geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) +# +# results <- run_predictive_model(geno = geno_formated, +# pheno = pred_inputs$pheno_input, +# selected_traits = input$pred_trait_info, +# predictive_model = input$pred_model, +# relationship_matrix_type = input$pred_matrix, +# pedigree = pred_inputs$ped_input, +# fixed_effects = input$pred_fixed_info, +# categorical_fixed_effects = input$pred_fixed_cat, +# ploidy = input$pred_ploidy, +# cores = input$pred_cores, +# cycles = input$pred_cv, +# folds = 5) +# +# #Save to reactive value +# pred_outputs_rrBLUP <- pred_outputs +# pred_outputs_rrBLUP$corr_output <- results$PredictionAccuracy +# pred_outputs_rrBLUP$all_GEBVs <- results$GEBVs +# +# # Convert trait columns to numeric +# results$GEBVs <- results$GEBVs %>% +# mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) +# +# # 
Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold +# average_gebvs_df <- results$GEBVs %>% +# group_by(Sample) %>% +# summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) +# +# pred_outputs_rrBLUP$avg_GEBVs <- average_gebvs_df +# +# columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) +# average_accuracy_df <- results$PredictionAccuracy %>% +# group_by(Iter) %>% +# summarize(across(all_of(columns), mean, na.rm = TRUE)) +# +# pred_outputs_rrBLUP$comb_output <- average_accuracy_df +# +# # Checks +# expect_equal(mean(pred_outputs_rrBLUP$corr_output[,1]), 0.1848, tolerance = 0.01) +# expect_equal(mean(pred_outputs_rrBLUP$comb_output[,2]$Pheno3), 0.1848, tolerance = 0.01) +# expect_equal(sum(pred_outputs_rrBLUP$avg_GEBVs[,2]$Pheno3), 19.378, tolerance = 0.01) +# expect_equal(sum(as.numeric(pred_outputs_rrBLUP$all_GEBVs[,1])), 96.89, tolerance = 0.1) +# ######### +# +# ######### GBLUP +# #Note: should wrap the GBLUP into a function too +# # Define variables +# #train_size <- floor(percentage / 100 * total_population) +# #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) +# +# # Convert genotype matrix according to ploidy +# input$pred_model <- "GBLUP" +# input$pred_matrix <- "Gmatrix" +# advanced_options$pred_matrix <- input$pred_matrix +# +# # Convert genotype matrix according to ploidy and model used +# geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) +# +# # Main function +# results <- run_predictive_model(geno = geno_formated, +# pheno = pred_inputs$pheno_input, +# selected_traits = input$pred_trait_info, +# predictive_model = input$pred_model, +# relationship_matrix_type = input$pred_matrix, +# pedigree = pred_inputs$ped_input, +# fixed_effects = input$pred_fixed_info, +# categorical_fixed_effects = input$pred_fixed_cat, +# ploidy = input$pred_ploidy, +# cores = 
input$pred_cores, +# cycles = input$pred_cv, +# folds = 5) +# +# #Save to reactive value +# pred_outputs_gBLUP <- pred_outputs +# pred_outputs_gBLUP$corr_output <- results$PredictionAccuracy +# pred_outputs_gBLUP$all_GEBVs <- results$GEBVs +# +# # Convert trait columns to numeric +# GEBVs <- results$GEBVs %>% +# mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) +# +# # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold +# average_gebvs_df <- GEBVs %>% +# group_by(Sample) %>% +# summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUP$avg_GEBVs <- average_gebvs_df +# +# columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) +# average_accuracy_df <- results$PredictionAccuracy %>% +# group_by(Iter) %>% +# summarize(across(all_of(columns), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUP$comb_output <- average_accuracy_df +# +# # Compare rrBLUP and GBLUP +# expect_equal(pred_outputs_gBLUP$corr_output[,1], pred_outputs_rrBLUP$corr_output[,1], tolerance = 0.01) +# expect_equal(pred_outputs_gBLUP$comb_output[,2], pred_outputs_rrBLUP$comb_output[,2], tolerance = 0.01) +# expect_equal(mean(pred_outputs_gBLUP$corr_output[,1]), 0.1848, tolerance = 0.01) +# expect_equal(mean(pred_outputs_gBLUP$comb_output[,2]$Pheno3), 0.1848, tolerance = 0.01) +# expect_equal(sum(pred_outputs_gBLUP$avg_GEBVs[,2]$Pheno3), -0.8889, tolerance = 0.01) +# expect_equal(sum(as.numeric(pred_outputs_gBLUP$all_GEBVs[,1])), -4.444, tolerance = 0.1) +# +# ## Using A matrix +# # Convert genotype matrix according to ploidy +# input$pred_model <- "GBLUP" +# input$pred_matrix <- "Amatrix" +# advanced_options$pred_matrix <- input$pred_matrix +# +# # Convert genotype matrix according to ploidy and model used +# geno_formated <- format_geno_matrix(pred_inputs$geno_input,input$pred_model, input$pred_matrix, input$pred_ploidy) +# +# # Main function +# results <- run_predictive_model(geno = 
geno_formated, +# pheno = pred_inputs$pheno_input, +# selected_traits = input$pred_trait_info, +# predictive_model = input$pred_model, +# relationship_matrix_type = input$pred_matrix, +# pedigree = pred_inputs$ped_input, +# fixed_effects = input$pred_fixed_info, +# categorical_fixed_effects = input$pred_fixed_cat, +# ploidy = input$pred_ploidy, +# cores = input$pred_cores, +# cycles = input$pred_cv, +# folds = 5) +# +# #Save to reactive value +# pred_outputs_gBLUPA <- pred_outputs +# pred_outputs_gBLUPA$corr_output <- results$PredictionAccuracy +# pred_outputs_gBLUPA$all_GEBVs <- results$GEBVs +# +# # Convert trait columns to numeric +# GEBVs <- results$GEBVs %>% +# mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) +# +# # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold +# average_gebvs_df <- GEBVs %>% +# group_by(Sample) %>% +# summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUPA$avg_GEBVs <- average_gebvs_df +# +# columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) +# average_accuracy_df <- results$PredictionAccuracy %>% +# group_by(Iter) %>% +# summarize(across(all_of(columns), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUPA$comb_output <- average_accuracy_df +# +# # Checks +# expect_equal(mean(pred_outputs_gBLUPA$corr_output[,1]), 0.153, tolerance = 0.01) +# expect_equal(mean(pred_outputs_gBLUPA$comb_output[,2]$Pheno3), 0.153, tolerance = 0.01) +# expect_equal(sum(pred_outputs_gBLUPA$avg_GEBVs[,2]$Pheno3), 2.165, tolerance = 0.01) +# expect_equal(sum(as.numeric(pred_outputs_gBLUPA$all_GEBVs[,1])), 10.826, tolerance = 0.1) +# +# ## Using H matrix +# # Convert genotype matrix according to ploidy +# input$pred_model <- "GBLUP" +# input$pred_matrix <- "Hmatrix" +# advanced_options$pred_matrix <- input$pred_matrix +# +# # Convert genotype matrix according to ploidy and model used +# geno_formated <- 
format_geno_matrix(pred_inputs$geno_input,input$pred_model,input$pred_matrix, input$pred_ploidy) +# +# # Main function +# results <- run_predictive_model(geno = geno_formated, +# pheno = pred_inputs$pheno_input, +# selected_traits = input$pred_trait_info, +# predictive_model = input$pred_model, +# relationship_matrix_type = input$pred_matrix, +# pedigree = pred_inputs$ped_input, +# fixed_effects = input$pred_fixed_info, +# categorical_fixed_effects = input$pred_fixed_cat, +# ploidy = input$pred_ploidy, +# cores = input$pred_cores, +# cycles = input$pred_cv, +# folds = 5) +# +# #Save to reactive value +# pred_outputs_gBLUPH <- pred_outputs +# pred_outputs_gBLUPH$corr_output <- results$PredictionAccuracy +# pred_outputs_gBLUPH$all_GEBVs <- results$GEBVs +# +# # Convert trait columns to numeric +# GEBVs <- results$GEBVs %>% +# mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) +# +# # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold +# average_gebvs_df <- GEBVs %>% +# group_by(Sample) %>% +# summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUPH$avg_GEBVs <- average_gebvs_df +# +# columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) +# average_accuracy_df <- results$PredictionAccuracy %>% +# group_by(Iter) %>% +# summarize(across(all_of(columns), mean, na.rm = TRUE)) +# +# pred_outputs_gBLUPH$comb_output <- average_accuracy_df +# +# # Checks +# expect_equal(mean(pred_outputs_gBLUPH$corr_output[,1]), 0.195, tolerance = 0.01) +# expect_equal(mean(pred_outputs_gBLUPH$comb_output[,2]$Pheno3), 0.195, tolerance = 0.01) +# expect_equal(sum(pred_outputs_gBLUPH$avg_GEBVs[,2]$Pheno3), -0.187, tolerance = 0.01) +# expect_equal(sum(as.numeric(pred_outputs_gBLUPH$all_GEBVs[,1])), -0.934, tolerance = 0.1) +# }) From 56c0d9ac22f638d0564c1aa67dab8d8181a478b4 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 11 Sep 2024 22:25:34 -0400 Subject: 
[PATCH 20/40] check fix --- R/GS_functions.R | 4 ++-- tests/testthat/test-GSAcc.R | 8 ++------ 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/R/GS_functions.R b/R/GS_functions.R index 846420e..77700d9 100644 --- a/R/GS_functions.R +++ b/R/GS_functions.R @@ -354,9 +354,9 @@ format_geno_matrix <- function(geno, model, pred_matrix = NULL, ploidy){ if(is.null(pred_matrix)) pred_matrix <- "none_selected" if(model == "rrBLUP" | (model == "GBLUP" & pred_matrix == "Gmatrix")) { #if(model == "rrBLUP") { - geno_formated <- 2 * (geno_adj / as.numeric(input$pred_ploidy)) - 1 # codification -1 0 1 + geno_formated <- 2 * (geno / as.numeric(input$pred_ploidy)) - 1 # codification -1 0 1 } else { - geno_formated <- geno_adj # codification 0 1 2 3 .. + geno_formated <- geno # codification 0 1 2 3 .. } return(geno_formated) diff --git a/tests/testthat/test-GSAcc.R b/tests/testthat/test-GSAcc.R index 238e78d..10e8f7f 100644 --- a/tests/testthat/test-GSAcc.R +++ b/tests/testthat/test-GSAcc.R @@ -69,9 +69,6 @@ test_that("test Predictive Ability iris",{ pred_inputs$pred_snps <- geno_snps[[2]] # n markers pred_inputs$pred_genos <- ncol(geno) # n samples - # Update colors based on input - pred_outputs$colors <- assign_colors(input$pred_color_select) - ##### input checks #Check that the ploidy entered is correct if (input$pred_ploidy != max(geno, na.rm = TRUE)) stop(paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered")) @@ -208,9 +205,8 @@ test_that("test Predictive Ability iris",{ # Compare rrBLUP and GBLUP expect_equal(pred_outputs_gBLUP$corr_output[,1], pred_outputs_rrBLUP$corr_output[,1], tolerance = 0.01) expect_equal(pred_outputs_gBLUP$comb_output[,2], pred_outputs_rrBLUP$comb_output[,2], tolerance = 0.01) - # expect_equal(pred_outputs_gBLUP$avg_GEBVs[,2], pred_outputs_rrBLUP$avg_GEBVs[,2], tolerance = 0.01) # Different - # expect_equal(pred_outputs_gBLUP$all_GEBVs[,1], pred_outputs_rrBLUP$all_GEBVs[,1], 
tolerance = 0.1) # Different - + expect_equal(sum(pred_outputs_gBLUP$avg_GEBVs[,2]), -0.594, tolerance = 0.01) + expect_equal(sum(as.numeric(pred_outputs_gBLUP$all_GEBVs[,1])), -2.971, tolerance = 0.1) }) # test_that("test Predictive Ability wheat (BGLR dataset)",{ From 418eb7347ec31babbbc34831d201b8c9959f1ac6 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Wed, 11 Sep 2024 22:39:14 -0400 Subject: [PATCH 21/40] fix --- R/GS_functions.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/GS_functions.R b/R/GS_functions.R index 77700d9..e88fdb1 100644 --- a/R/GS_functions.R +++ b/R/GS_functions.R @@ -354,7 +354,7 @@ format_geno_matrix <- function(geno, model, pred_matrix = NULL, ploidy){ if(is.null(pred_matrix)) pred_matrix <- "none_selected" if(model == "rrBLUP" | (model == "GBLUP" & pred_matrix == "Gmatrix")) { #if(model == "rrBLUP") { - geno_formated <- 2 * (geno / as.numeric(input$pred_ploidy)) - 1 # codification -1 0 1 + geno_formated <- 2 * (geno / as.numeric(ploidy)) - 1 # codification -1 0 1 } else { geno_formated <- geno # codification 0 1 2 3 .. 
} From 8161a2ded4a2de84a121f5578c7e4f50ca2a3b91 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Thu, 12 Sep 2024 10:04:16 -0400 Subject: [PATCH 22/40] MADC Matrix Checks --- R/utils.R | 46 +++++++++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 21 deletions(-) diff --git a/R/utils.R b/R/utils.R index 811f485..9d96aa6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -21,48 +21,52 @@ get_counts <- function(madc_file, output_name) { #Add functionality here to stop the script if indentical() is False get_matrices <- function(result_df) { #This function takes the dataframe of ref and alt counts for each sample, and converts them to ref, alt, and size(total count) matrices for Updog - + update_df <- result_df - + # Filter rows where 'AlleleID' ends with 'Ref' ref_df <- subset(update_df, grepl("Ref$", AlleleID)) - + # Filter rows where 'AlleleID' ends with 'Alt' alt_df <- subset(update_df, grepl("Alt$", AlleleID)) - - #remove alt or ref rows that do not have a counterpart in the other dataframe - if (nrow(ref_df) > nrow(alt_df)) { - ref_df <- ref_df[ref_df$CloneID %in% alt_df$CloneID,] - } else if (nrow(ref_df) < nrow(alt_df)) { - alt_df <- alt_df[alt_df$CloneID %in% ref_df$CloneID,] - } else { - alt_df <- alt_df[alt_df$CloneID %in% ref_df$CloneID,] - } - + #Ensure that each has the same SNPs and that they are in the same order - identical(alt_df$CloneID,ref_df$CloneID) - + same <- identical(alt_df$CloneID,ref_df$CloneID) + ###Convert the ref and alt counts into matrices with the CloneID as the index #Set SNP names as index row.names(ref_df) <- ref_df$CloneID row.names(alt_df) <- alt_df$CloneID - + + #Retain only the rows in common if they are not identical and provide warning + if (same == FALSE) { + warning("Mismatch between Ref and Alt Markers. MADC likely altered. 
Markers without a Ref or Alt match removed.") + # Find the common CloneIDs between the two dataframes + common_ids <- intersect(rownames(ref_df), rownames(alt_df)) + # Subset both dataframes to retain only the common rows + ref_df <- ref_df[common_ids, ] + alt_df <- alt_df[common_ids, ] + } + #Remove unwanted columns and convert to matrix - #Probably best to just remove the column names that aren't wanted instead of the first 16 columns. ref_matrix <- as.matrix(ref_df[, -c(1:16)]) alt_matrix <- as.matrix(alt_df[, -c(1:16)]) - + + #Convert elements to numeric + class(ref_matrix) <- "numeric" + class(alt_matrix) <- "numeric" + #Make the size matrix by combining the two matrices size_matrix <- (ref_matrix + alt_matrix) - + #Count the number of cells with 0 count to estimate missing data # Count the number of cells with the value 0 count_zeros <- sum(size_matrix == 0) - + # Print the result ratio_missing_data <- count_zeros / length(size_matrix) cat("Ratio of missing data =", ratio_missing_data, "\n") - + # Return the ref and alt matrices as a list matrices_list <- list(ref_matrix = ref_matrix, size_matrix = size_matrix) return(matrices_list) From 4be310e40763d38caada71a23dde824580192b18 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Thu, 12 Sep 2024 18:00:34 -0400 Subject: [PATCH 23/40] Revert GS_Acc to previous version --- R/app_server.R | 8 +- R/mod_GSAcc.R | 1639 ++++++++++++++++++++++++++++++------------------ 2 files changed, 1024 insertions(+), 623 deletions(-) diff --git a/R/app_server.R b/R/app_server.R index afda17e..74c2138 100644 --- a/R/app_server.R +++ b/R/app_server.R @@ -35,9 +35,9 @@ app_server <- function(input, output, session) { callModule(mod_GS_server, "GS_1", parent_session = session) - callModule(mod_GSAcc_server, - "GSAcc_1", - parent_session = session) + #callModule(mod_GSAcc_server, + # "GSAcc_1", + # parent_session = session) callModule(mod_slurm_server, "slurm_1", parent_session 
= session) @@ -50,6 +50,6 @@ app_server <- function(input, output, session) { # mod_gwas_server("gwas_1") # mod_diversity_server("diversity_1") # mod_GS_server("GS_1") - # mod_GSAcc_server("GSAcc_1") + mod_GSAcc_server("GSAcc_1") # mod_slurm_server("slurm_1") } diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index ff68065..7bc904f 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -66,14 +66,14 @@ mod_GSAcc_ui <- function(id){ )), tags$hr(style="border-color: #d3d3d3; margin-top: 20px; margin-bottom: 20px;"), # Lighter grey line div(style="text-align: left; margin-top: 10px;", - actionButton(ns("advanced_options"), + actionButton(ns("advanced_options"), label = HTML(paste(icon("cog", style = "color: #007bff;"), "Advanced Options (beta)")), style = "background-color: transparent; border: none; color: #007bff; font-size: smaller; text-decoration: underline; padding: 0;" ) ) ) ), - + column(width = 6, box( title = "Plots", status = "info", solidHeader = FALSE, width = 12, height = 600, @@ -82,13 +82,13 @@ mod_GSAcc_ui <- function(id){ tabPanel("Box Plot", plotOutput(ns("pred_box_plot"), height = "500px")), tabPanel("Accuracy Table", DTOutput(ns("pred_acc_table")), style = "overflow-y: auto; height: 500px"), tabPanel("GEBVs Table", DTOutput(ns("pred_gebvs_table")),style = "overflow-y: auto; height: 500px") - + ) - + ) - + ), - + column(width = 3, valueBoxOutput(ns("pred_snps"), width = NULL), valueBoxOutput(ns("pred_geno"), width = NULL), @@ -114,11 +114,11 @@ mod_GSAcc_ui <- function(id){ tooltip = tooltipOptions(title = "Click to see inputs!") )) ) - + ) - + ) - + ) } @@ -133,260 +133,256 @@ mod_GSAcc_ui <- function(id){ #' @import ggplot2 #' @import tidyr #' @noRd -mod_GSAcc_server <- function(input, output, session, parent_session){ - - ns <- session$ns - - #Default model choices - advanced_options <- reactiveValues( - pred_model = "rrBLUP", - pred_matrix = "Gmatrix", - ped_file = NULL - ) - - #List the ped file name if previously uploaded - output$uploaded_file_name <- 
renderText({ - if (!is.null(advanced_options$ped_file)) { - paste("Previously uploaded file:", advanced_options$ped_file$name) - } else { - "" # Return an empty string if no file has been uploaded - } - }) - - #UI popup window for input - observeEvent(input$advanced_options, { - showModal(modalDialog( - title = "Advanced Options (beta)", - selectInput( - inputId = ns('pred_model'), - label = 'Model Choice', - choices = c("rrBLUP", "GBLUP"), - selected = advanced_options$pred_model # Initialize with stored value - ), - conditionalPanel( - condition = "input.pred_model == 'GBLUP'", ns = ns, - div( - selectInput( - inputId = ns('pred_matrix'), - label = 'GBLUP Matrix Choice', - choices = c("Gmatrix", "Amatrix", "Hmatrix"), - selected = advanced_options$pred_matrix # Initialize with stored value +mod_GSAcc_server <- function(id){ + moduleServer( id, function(input, output, session){ + ns <- session$ns + + #Default model choices + advanced_options <- reactiveValues( + pred_model = "rrBLUP", + pred_matrix = "Gmatrix", + ped_file = NULL + ) + + #List the ped file name if previously uploaded + output$uploaded_file_name <- renderText({ + if (!is.null(advanced_options$ped_file)) { + paste("Previously uploaded file:", advanced_options$ped_file$name) + } else { + "" # Return an empty string if no file has been uploaded + } + }) + + print("check1") + #UI popup window for input + observeEvent(input$advanced_options, { + showModal(modalDialog( + title = "Advanced Options (beta)", + selectInput( + inputId = ns('pred_model'), + label = 'Model Choice', + choices = c("rrBLUP", "GBLUP"), + selected = advanced_options$pred_model # Initialize with stored value + ), + conditionalPanel( + condition = "input.pred_model == 'GBLUP'", ns = ns, + div( + selectInput( + inputId = ns('pred_matrix'), + label = 'GBLUP Matrix Choice', + choices = c("Gmatrix", "Amatrix", "Hmatrix"), + selected = advanced_options$pred_matrix # Initialize with stored value + ) ) - ) - ), - conditionalPanel( - condition 
= "input.pred_matrix != 'Gmatrix'", ns = ns, - div( - fileInput(ns("ped_file"), "Choose Pedigree File", accept = ".csv"), - conditionalPanel( - condition = "output.uploaded_file_name !== ''", # Show only if there's content - textOutput(ns("uploaded_file_name")) # Display the uploaded file name + ), + conditionalPanel( + condition = "input.pred_matrix != 'Gmatrix'", ns = ns, + div( + fileInput(ns("ped_file"), "Choose Pedigree File", accept = ".csv"), + conditionalPanel( + condition = "output.uploaded_file_name !== ''", # Show only if there's content + textOutput(ns("uploaded_file_name")) # Display the uploaded file name + ) ) + ), + footer = tagList( + modalButton("Close"), + actionButton(ns("save_advanced_options"), "Save") ) - ), - footer = tagList( - modalButton("Close"), - actionButton(ns("save_advanced_options"), "Save") - ) - )) - }) - - - - #Close popup window when user "saves options" - observeEvent(input$save_advanced_options, { - advanced_options$pred_model <- input$pred_model - advanced_options$pred_matrix <- input$pred_matrix - advanced_options$ped_file <- input$ped_file - # Save other inputs as needed - - removeModal() # Close the modal after saving - }) - - - - ####Genomic Prediction Accuracy - #This tab involved 3 observeEvents - #1) to get the traits listed in the phenotype file - #2) to input and validate the input files - #3) to perform the genomic prediction - - #1) Get traits - observeEvent(input$trait_file, { - info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) - trait_var <- colnames(info_df) - trait_var <- trait_var[2:length(trait_var)] - updateVirtualSelect("pred_fixed_info", choices = trait_var, session = session) - updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) - - }) - - #2) Error check for prediction and save input files - continue_prediction <- reactiveVal(FALSE) - pred_inputs <- reactiveValues( - pheno_input = NULL, - geno_input = NULL, - pred_snps = NULL, - 
pred_genos = NULL, - pred_geno_pheno = NULL, - ped_input = NULL - ) - - colors <- reactiveValues(colors = NULL) - - #Reactive boxes - output$pred_snps <- renderValueBox({ - valueBox( - value = pred_inputs$pred_snps, - subtitle = "SNPs in Genotype File", - icon = icon("dna"), - color = "info" + )) + }) + + + + #Close popup window when user "saves options" + observeEvent(input$save_advanced_options, { + advanced_options$pred_model <- input$pred_model + advanced_options$pred_matrix <- input$pred_matrix + advanced_options$ped_file <- input$ped_file + # Save other inputs as needed + + removeModal() # Close the modal after saving + }) + + + + ####Genomic Prediction Accuracy + #This tab involved 3 observeEvents + #1) to get the traits listed in the phenotype file + #2) to input and validate the input files + #3) to perform the genomic prediction + + print("check2") + #1) Get traits + observeEvent(input$trait_file, { + info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) + trait_var <- colnames(info_df) + trait_var <- trait_var[2:length(trait_var)] + updateVirtualSelect("pred_fixed_info", choices = trait_var, session = session) + updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) + + }) + print("check3") + #2) Error check for prediction and save input files + continue_prediction <- reactiveVal(NULL) + pred_inputs <- reactiveValues( + pheno_input = NULL, + geno_input = NULL, + pred_snps = NULL, + pred_genos = NULL, + pred_geno_pheno = NULL ) - }) - - output$pred_geno <- renderValueBox({ - valueBox( - value = pred_inputs$pred_geno_pheno, - subtitle = "Samples with Phenotype Information", - icon = icon("location-dot"), - color = "info" + + pred_outputs <- reactiveValues( + corr_output = NULL, + box_plot = NULL, + violin_plot = NULL, + comb_output = NULL, + avg_GEBVs = NULL, + all_GEBVs = NULL, + colors = NULL ) - }) - - observe({ - # Update colors based on input - colors$colors <- 
assign_colors(input$pred_color_select) - }) - - observeEvent(input$pred_fixed_info, { - updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) - }) - - observeEvent(input$prediction_start, { - - toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) - - if (is.null(input$pred_file$datapath) | is.null(input$trait_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF and phenotype files", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE - ) - } - req(input$pred_file$datapath, input$pred_ploidy, input$trait_file$datapath) - - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 5, title = "Checking input files") - - #Variables - pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) - row.names(pheno) <- pheno[,1] - - #Make sure at least one trait was input - if (length(input$pred_trait_info) == 0) { - - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "No traits were selected", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - # Stop the observeEvent gracefully - return() - } - - #Getting genotype matrix - #Geno.file conversion if needed - geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") - geno <- geno_snps[[1]] - pred_inputs$pred_snps <- geno_snps[[2]] - - #Save number of samples in file - pred_inputs$pred_genos <- ncol(geno) - - #Check that the ploidy entered is correct - if (input$pred_ploidy != max(geno, na.rm = TRUE)) { - # If condition is 
met, show notification toast - shinyalert( - title = "Ploidy Mismatch", - text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE + + #Reactive boxes + output$pred_snps <- renderValueBox({ + valueBox( + value = pred_inputs$pred_snps, + subtitle = "SNPs in Genotype File", + icon = icon("dna"), + color = "info" ) - - # Stop the observeEvent gracefully - #return() - } - - #Make sure the trait file and genotype file are in the same order - # Column names for geno (assuming these are the individual IDs) - colnames_geno <- colnames(geno) - # Assuming the first column in Pheno contains the matching IDs - ids_pheno <- pheno[, 1] - # Find common identifiers - common_ids <- intersect(colnames_geno, ids_pheno) - #Get number of id - pred_inputs$pred_geno_pheno <- length(common_ids) - - #Throw an error if there are less matching samples in the phenotype file than the genotype file - if (length(common_ids) == 0) { - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "All samples were missing from the phenotype file", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, + }) + + output$pred_geno <- renderValueBox({ + valueBox( + value = pred_inputs$pred_geno_pheno, + subtitle = "Samples with Phenotype Information", + icon = icon("location-dot"), + color = "info" ) - - # Stop the observeEvent gracefully - return() - } else { - if (length(common_ids) < length(colnames_geno)) + }) + + 
observe({ + # Update colors based on input + pred_outputs$colors <- switch(input$pred_color_select, + "red" = "#F8766D", + "blue" = "#00BFC4", + "green" = "#00BA38", + input$pred_color_select) + }) + + observeEvent(input$pred_fixed_info, { + updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) + }) + + observeEvent(input$prediction_start, { + + toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) + + if (is.null(input$pred_file$datapath) | is.null(input$trait_file$datapath)) { shinyalert( - title = "Data Mismatch", - text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), + title = "Missing input!", + text = "Upload VCF and phenotype files", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + req(input$pred_file$datapath, input$pred_ploidy, input$trait_file$datapath) + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 5, title = "Checking input files") + + #Variables + ploidy <- as.numeric(input$pred_ploidy) + geno_path <- input$pred_file$datapath + pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) + row.names(pheno) <- pheno[,1] + traits <- input$pred_trait_info + CVs <- as.numeric(input$pred_cv) + + #Make sure at least one trait was input + if (length(traits) == 0) { + + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No traits were selected", size = "xs", - closeOnEsc = FALSE, + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + 
) + + + # Stop the observeEvent gracefully + return() + + } + + print("check4") + #Getting genotype matrix + + #Geno file path + file_path <- geno_path + + #Geno.file conversion if needed + if (grepl("\\.csv$", file_path)) { + geno <- read.csv(geno_path, header = TRUE, row.names = 1, check.names = FALSE) + + #Save number of SNPs + pred_inputs$pred_snps <- nrow(geno) + + } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { + + #Function to convert GT to dosage calls (add to BIGr) + convert_to_dosage <- function(gt) { + # Split the genotype string + alleles <- strsplit(gt, "[|/]") + # Sum the alleles, treating NA values appropriately + sapply(alleles, function(x) { + if (any(is.na(x))) { + return(NA) + } else { + return(sum(as.numeric(x), na.rm = TRUE)) + } + }) + } + + #Convert VCF file if submitted + vcf <- vcfR::read.vcfR(file_path) + + #Get number of SNPs + pred_inputs$pred_snps <- nrow(vcf) + + #Extract GT + geno <- extract.gt(vcf, element = "GT") + geno <- apply(geno, 2, convert_to_dosage) + class(geno) <- "numeric" + rm(vcf) + + } else { + + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No valid genotype file detected", + size = "xs", + closeOnEsc = TRUE, closeOnClickOutside = FALSE, html = TRUE, type = "warning", @@ -394,15 +390,23 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ confirmButtonText = "OK", confirmButtonCol = "#004192", showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, imageUrl = "", - animation = TRUE + animation = TRUE, ) - if (length(common_ids) < length(ids_pheno)) + + #Stop the analysis + return() + } + print("check5") + #Save number of samples in file + pred_inputs$pred_genos <- ncol(geno) + + #Check that the ploidy entered is correct + if (ploidy != max(geno, na.rm = TRUE)) { + # If condition is met, show notification toast shinyalert( - title = "Data Mismatch", - text = paste0((length(ids_pheno)-length(common_ids))," samples 
were removed for not having genotypic information"), + title = "Ploidy Mismatch", + text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), size = "xs", closeOnEsc = FALSE, closeOnClickOutside = FALSE, @@ -417,23 +421,35 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE ) - } - - # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs - geno_adj <- geno[, common_ids] # Assuming that the columns can be directly indexed by IDs - pheno <- pheno[match(common_ids, ids_pheno), ] # If there is pheno but not geno, the sample is also discarded - - # Check pedigree - #Import pedigree file, where pedigree data name (3-column way format). Unknown value should be equal 0 - if(!is.null(advanced_options$ped_file$datapath)){ - ped <- read.csv(advanced_options$ped_file$datapath, check.names = FALSE, colClasses = "factor") - colnames(ped) <- c("Ind", "P1", "P2") - #Convert NAs to 0 - ped[is.na(ped)] <- 0 - - common_ped <- intersect(ped$Ind, pheno[,1]) - #Throw an error if there are less matching samples in the phenotype file than the pedigree file - if (length(common_ped) == 0) { + + + # Stop the observeEvent gracefully + #return() + } + + print("check6") + # Function to convert genotype matrix according to ploidy + convert_genotype <- function(genotype_matrix, ploidy) { + normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 + return(normalized_matrix) + } + + #tranforming genotypes + geno_adj_init <- convert_genotype(geno, as.numeric(ploidy)) + + #Make sure the trait file and genotype file are in the same order + # Column names for geno (assuming these are the individual IDs) + colnames_geno <- colnames(geno) + # Assuming the first column in Pheno contains the matching IDs + ids_pheno <- pheno[, 1] + # Find common identifiers + common_ids <- intersect(colnames_geno, ids_pheno) + #Get number of id + 
pred_inputs$pred_geno_pheno <- length(common_ids) + + #Throw an error if there are less matching samples in the phenotype file than the genotype file + if (length(common_ids) == 0) { + # If condition is met, show notification toast shinyalert( title = "Oops", @@ -450,359 +466,744 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE, ) - + + # Stop the observeEvent gracefully return() - } else { - rm_unr <- remove_unrelated(ped, samples_with_trait_info = pheno[,1]) - extended_ped <- rm_unr[[1]] - gen <- rm_unr[[2]] - cat(paste0("You have pedigree information until the ", gen,"th generation\n")) - - if (length(common_ped) < length(ids_pheno)){ - shinyalert( - title = "Data Mismatch", - text = paste0((length(ids_pheno)-length(common_ped))," samples were removed from the phenotype data for not having pedigree information"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - pheno <- pheno[-which(!pheno$Sample_ID %in% extended_ped$Ind),] - geno_adj <- geno_adj[,-which(!pheno$Sample_ID %in% extended_ped$Ind)] - } - if (length(ped$Ind) > length(extended_ped$Ind)) - shinyalert( - title = "Data Mismatch", - text = paste0((length(ped$Ind)-length(extended_ped$Ind))," samples in the pedigree file were unrelated to the samples with phenotype information. 
They were removed from the analysis."), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - ped_temp <- tempfile() - ped_temp_file <- extended_ped - colnames(ped_temp_file) <- c("id", "sire", "dam") - write.table(ped_temp_file, file = ped_temp) - ped_check <- BIGr::check_ped(ped_temp) - if(dim(ped_check$repeated_ids)[1] != 0){ - shinyalert( - title = "Oops", - text = "Check for repeated IDs in the pedigree file", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - # Stop the observeEvent gracefully - return() - } - if(dim(ped_check$messy_parents)[1] != 0){ - shinyalert( - title = "Oops", - text = paste("We found inconsistencies in the pedigree file for the individuals:", paste0(ped_check$messy_parents$id, collapse = ", ")), - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - } - } - } else extended_ped <- NULL - - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 20, title = "Inputs checked!") - - ## Make ouput as checked inputs pred_inputs - pred_inputs$pheno_input <- pheno - pred_inputs$geno_input <- geno_adj - pred_inputs$ped_input <- extended_ped - - continue_prediction(TRUE) - }) - - #3) Analysis only proceeds once continue_prediction is converted to TRUE - pred_outputs <- reactive({ - - req(continue_prediction(),pred_inputs$pheno_input, pred_inputs$geno_input) - - # 
Stop analysis if cancel was selected - if (isFALSE(continue_prediction())) { - return() - } - - # Convert genotype matrix according to ploidy and model used - geno_formated <- format_geno_matrix(pred_inputs$geno_input,advanced_options$pred_model, advanced_options$pred_matrix, input$pred_ploidy) - - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 30, title = paste("Genotype matrix formatted for", advanced_options$pred_model, advanced_options$pred_matrix)) - - results <- run_predictive_model(geno = geno_formated, - pheno = pred_inputs$pheno_input, - selected_traits = input$pred_trait_info, - predictive_model = advanced_options$pred_model, - relationship_matrix_type = advanced_options$pred_matrix, - pedigree = pred_inputs$ped_input, - fixed_effects = input$pred_fixed_info, - categorical_fixed_effects = input$pred_fixed_cat, - ploidy = input$pred_ploidy, - cores = input$pred_cores, - cycles = input$pred_cv, - folds = 5) - - updateProgressBar(session = session, id = "pb_prediction", value = 70, title = "Cross validation concluded") - - #Save to reactive value - pred_outputs <- list(corr_output = NULL, comb_output = NULL, all_GEBVs = NULL, avg_GEBVs = NULL) - pred_outputs$corr_output <- results$PredictionAccuracy - pred_outputs$all_GEBVs <- results$GEBVs - - # Convert trait columns to numeric - results$GEBVs <- results$GEBVs %>% - mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) - - # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold - average_gebvs_df <- results$GEBVs %>% - group_by(Sample) %>% - summarize(across(all_of(input$pred_trait_info), mean, na.rm = TRUE)) - - pred_outputs$avg_GEBVs <- average_gebvs_df - - columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) - average_accuracy_df <- results$PredictionAccuracy %>% - group_by(Iter) %>% - summarize(across(all_of(columns), mean, na.rm = TRUE)) - - pred_outputs$comb_output <- average_accuracy_df - 
#Status - updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") - - pred_outputs - }) - - plots <- reactive({ - validate( - need(!is.null(pred_outputs()$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - - df <- pred_outputs()$corr_output - df <- df %>% dplyr::select(-Fold, -Iter) - - #Probably want to add the ability for the user to select which trait(s) to display here - - #Convert to long format for ggplot - df_long <- pivot_longer( - df, - cols = colnames(df), # Exclude the Cycle column from transformation - names_to = "Trait", # New column for trait names - values_to = "Correlation" # New column for correlation values - ) - - plots <- list(box_plot = NULL, violin_plot = NULL) - #This can be adapted if we start comparing more than one GP model - #Also consider a violin plot to show each cor value - #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + - plots$box_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + - #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + - geom_boxplot() + - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", - y = "Pearson Correlation") + - #theme_minimal() + # Using a minimal theme - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - - plots$violin_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + - geom_violin(trim = TRUE) + # Add violin plot - geom_point(position = 
position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", # x-label is blank because it's not relevant per facet - y = "Pearson Correlation") + - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - plots - }) - - #Output the genomic prediction correlation box plots - output$pred_box_plot <- renderPlot({ - plots()$box_plot + scale_fill_manual(values = colors$colors) - }) - - #Output the genomic prediction correlation box plots - output$pred_violin_plot <- renderPlot({ - plots()$violin_plot + scale_fill_manual(values = colors$colors) - }) - - #Output the prediction tables - - all_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs()$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs()$comb_output - }) - - output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - comb_output <- reactive({ - validate( - need(!is.null(pred_outputs()$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs()$comb_output - }) - - output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - avg_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs()$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs()$avg_GEBVs - }) - - output$pred_gebvs_table <- 
renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - #Download files for GP - output$download_pred_file <- downloadHandler( - filename = function() { - paste0("GS-results-", Sys.Date(), ".zip") - }, - content = function(file) { - # Temporary files list - temp_dir <- tempdir() - temp_files <- c() - - if (!is.null(pred_outputs()$avg_GEBVs)) { - # Create a temporary file for assignments - gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) - write.csv(pred_outputs()$avg_GEBVs, gebv_file, row.names = FALSE) - temp_files <- c(temp_files, gebv_file) + print("check7") + } else if (length(common_ids) < length(colnames_geno)) { + # If condition is met, show notification toast + shinyalert( + title = "Data Mismatch", + text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE + ) + + + # Stop the observeEvent gracefully + #return() } - - if (!is.null(pred_outputs()$comb_output)) { - # Create a temporary file for BIC data frame - acc_file <- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) - write.csv(pred_outputs()$comb_output, acc_file, row.names = FALSE) - temp_files <- c(temp_files, acc_file) + + + + + #Final check before performing analyses + shinyalert( + title = "Ready?", + text = "Inputs have been checked", + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "Proceed", + confirmButtonCol = "#004192", + showCancelButton = TRUE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE, + callbackR = 
function(value) { + if (isTRUE(value)) { + # Proceed with adjusted data + continue_prediction(TRUE) + } else { + # Stop or change the process + continue_prediction(FALSE) + } + } + ) + + print("check8") + # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs + geno_adj <- geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs + pheno <- pheno[match(common_ids, ids_pheno), ] + + ##Save to reactive values + #Gmatrix needs the original allele count values, so the user matrix selection determines the genotype matrix used + pred_inputs$pheno_input <- pheno + if (advanced_options$pred_matrix == "Gmatrix" || is.null(advanced_options$pred_matrix)) { + pred_inputs$geno_input <- geno_adj + } else if (advanced_options$pred_matrix == "Hmatrix") { + pred_inputs$geno_input <- geno[, common_ids] + } else { + pred_inputs$geno_input <- geno_adj } - - # Zip files only if there's something to zip - if (length(temp_files) > 0) { - zip(file, files = temp_files, extras = "-j") # Using -j to junk paths + }) + print("check9") + #3) Analysis only proceeds once continue_prediction is converted to TRUE + observe({ + + req(continue_prediction(),pred_inputs$pheno_input, pred_inputs$geno_input) + + # Stop analysis if cancel was selected + if (isFALSE(continue_prediction())) { + return() } - - # Optionally clean up - file.remove(temp_files) - } - ) - - #Download GP Figures - output$download_pred_figure <- downloadHandler( - - filename = function() { - if (input$pred_image_type == "jpeg") { - paste("GS-", Sys.Date(), ".jpg", sep="") - } else if (input$pred_image_type == "png") { - paste("GS-", Sys.Date(), ".png", sep="") + print("check10") + #Variables + ploidy <- as.numeric(input$pred_ploidy) + geno_adj <- pred_inputs$geno_input + pheno <- pred_inputs$pheno_input + traits <- input$pred_trait_info + CVs <- as.numeric(input$pred_cv) + fixed_traits <- input$pred_fixed_info + fixed_cat <- input$pred_fixed_cat + fixed_cov <- if 
(is.null(input$pred_fixed_info) || length(input$pred_fixed_info) == length(input$pred_fixed_cat)) { + NULL } else { - paste("GS-", Sys.Date(), ".tiff", sep="") + setdiff(input$pred_fixed_info, input$pred_fixed_cat) } - }, - content = function(file) { - #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots - req(input$pred_figures) - - if (input$pred_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } else if (input$pred_image_type == "png") { - png(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + cores <- input$pred_cores + + #Assign colors + if (input$pred_color_select == "red"){ + pred_outputs$colors <- "#F8766D" + } else if (input$pred_color_select == "blue") { + pred_outputs$colors <- "#00BFC4" + } else if (input$pred_color_select == "green") { + pred_outputs$colors <- "#00BA38" + } else{ + pred_outputs$colors <- input$pred_color_select } - - # Conditional plotting based on input selection - if (input$pred_figures == "Violin Plot") { - req(plots()$violin_plot) - - print(plots()$violin_plot + scale_fill_manual(values = colors$colors)) - - } else if (input$pred_figures == "Box Plot") { - req(plots()$box_plot) - #Plot - print(plots()$box_plot + scale_fill_manual(values = colors$colors)) - + + #Control whether rrBLUP or GBLUP run depending on user input + #Note, should add the GP functions to the utils.R file and then call them here... 
+ if (advanced_options$pred_model == "rrBLUP"){ + ##Need to add ability for the use of parallelism for the for cross-validation + ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays + + # Function to perform genomic prediction + ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) + genomic_prediction <- function(geno, Pheno, traits, fixed_effects = NULL, Fold = 5, Iters = 5, cores = 1) { + + # Define variables + traits <- traits + cycles <- as.numeric(Iters) + Folds <- as.numeric(Fold) + total_population <- ncol(geno) + #train_size <- floor(percentage / 100 * total_population) + fixed_traits <- fixed_effects + cores <- as.numeric(cores) + + # Establish accuracy results matrix + results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Establish heritability_scores_df () Maybe get h2 values + # Establish results matrix + heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + print("check11") + #Remove the fixed traits from the Pheno file + if (length(fixed_traits) == 0) { + Pheno <- Pheno + } else { + #Subset fixed traits + Fixed <- subset(Pheno, select = fixed_traits) + + #Pheno <- subset(Pheno, select = -fixed_traits) + convert_categorical_to_factor <- function(df, fixed_cat) { + for (col in names(df)) { + if (col %in% fixed_cat) { + df[[col]] <- as.factor(df[[col]]) + } + } + return(df) + } + # Convert all columns to factor if they are not numeric or integer + Fixed <- 
convert_categorical_to_factor(Fixed, fixed_cat) + + #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor + row.names(Fixed) <- row.names(Pheno) + + #Make the matrix + formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) + formula <- as.formula(formula_str) + + # Create the design matrix using the constructed formula + Fixed <- model.matrix(formula, data = Fixed) + } + + #Make kinship matrix of all individuals? + #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy + #If wanting to use Kkinship matrix, will then need to see how to implement it here + + #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). + impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) + geno <- impute$imputed + + # For loop + for (r in 1:cycles) { + set.seed(r) + fold_ids <- sample(rep(1:Folds, length.out = total_population)) + fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_results <- matrix(nrow = Folds, ncol = length(traits)) + colnames(fold_results) <- traits + + #Initialize GEBV object for each cycle + GEBVs_cycle <-list() + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + + for (fold in 1:Folds) { + + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*Folds)) + + train <- fold_df %>% + dplyr::filter(FoldID != fold) %>% + pull(Sample) + test <- setdiff(row.names(geno),train) + + #Subset datasets + if (length(fixed_traits) == 0) { + Fixed_train = NULL + } else{ + Fixed_train <- data.frame(Fixed[train, ]) + Fixed_train <- as.matrix(Fixed_train) + row.names(Fixed_train) <- train + + #Fixed (testing) + Fixed_test<- data.frame(Fixed[test, ]) + Fixed_test <- as.matrix(Fixed_test) + 
row.names(Fixed_test) <- test + + } + print("check12") + Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set + m_train <- geno[train, ] + Pheno_test <- Pheno[test, ] + #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? + m_valid <- geno[test, ] + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) + colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") + rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest + trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) + TRT <- trait_answer$u + e <- as.matrix(TRT) + pred_trait_test <- m_valid %*% e + pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits + trait_test <- Pheno_test[, traits[trait_idx]] + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") + results[(((r-1)*5)+fold), (length(traits)+1)] <- r + results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + # Extract GEBVs + # Check if Fixed_train is not NULL and include beta if it is + if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { + # Calculate GEBVs including fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta + } else { + # Calculate GEBVs without fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to calculate the GEBVs for testing group from the trained model + } + + # Calculate heritability for the current trait + Vu <- trait_answer$Vu + Ve <- trait_answer$Ve + heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + + } + #Add iter and fold 
information for each trait/result + heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + print("check13") + #Add sample, iteration, and fold information to GEBVs_fold + GEBVs_fold[,"Iter"] = r + GEBVs_fold[,"Fold"] = fold + GEBVs_fold[,"Sample"] <- test + + # Store GEBVs for this fold + GEBVs_cycle[[fold]] <- GEBVs_fold + + } + + # Store GEBVs for this cycle + GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + + } + + # Combine all GEBVs into a single DataFrame + GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + + results <- as.data.frame(results) + heritability_scores <- as.data.frame(heritability_scores) + + # Combine results and heritability_scores using cbind + combined_results <- cbind(results, heritability_scores) + + return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) + } + + # Example call to the function + #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... + results <- genomic_prediction(geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, Iters = input$pred_cv, cores = cores) + + #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) 
+ #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") + + #Save to reactive value + pred_outputs$corr_output <- results$PredictionAccuracy + pred_outputs$all_GEBVs <- results$GEBVs + + # Convert trait columns to numeric + results$GEBVs <- results$GEBVs %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) + + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- results$GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) + + pred_outputs$avg_GEBVs <- average_gebvs_df + + columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) + average_accuracy_df <- results$PredictionAccuracy %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) + + + pred_outputs$comb_output <- average_accuracy_df + + }else{ + #Note: should wrap the GBLUP into a function too + # Define variables + traits <- traits + cycles <- input$pred_cv + Folds <- 5 + total_population <- ncol(pred_inputs$geno_input) + #train_size <- floor(percentage / 100 * total_population) + fixed_traits <- fixed_traits + cores <- as.numeric(cores) + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + + if (advanced_options$pred_matrix == "Gmatrix") { + #Convert normalized genotypes to relationship matrix + #By default, it removes SNPs with more than 50% missing data and imputes using the mean + Geno.mat <- A.mat(t(pred_inputs$geno_input)) + print("check14") + }else if (advanced_options$pred_matrix == "Amatrix") { + + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 + ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "Sire", "Dam") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + #Ensure Sire and Dam are also listed as individuals + missing_parents <- unique(c(ped$Sire, ped$Dam)) + # Filter out parents already listed as individuals and non-zero values + missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] + # Create new rows for missing parents and setting their parents to 0 (unknown) + new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) + # Combine the original dataframe with the new rows and remove duplicates + ped_extended <- unique(rbind(ped, new_rows)) + + #Converting to Amatrix + #Using the default additive relationship options (Amatrix only works for even numbered ploidy) + Geno.mat <- Amatrix(data = ped_extended, ploidy = ploidy) + + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) + pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) + valid_ids <- intersect(pheno_ids, rownames(Geno.mat)) + pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] + Geno.mat <- Geno.mat[valid_ids, valid_ids] + + #Update variable + total_population <- ncol(Geno.mat) + print("check15") + }else if (advanced_options$pred_matrix == "Hmatrix") { + print("check16") + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 + ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "Sire", "Dam") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + #Ensure Sire and Dam are also listed as individuals + missing_parents <- unique(c(ped$Sire, ped$Dam)) + # Filter out parents already listed as individuals and non-zero values + missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] + # Create new rows for missing parents and setting their parents to 0 (unknown) + new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) + # Combine the original dataframe with the new rows and remove duplicates + ped_extended <- unique(rbind(ped, new_rows)) + + #Converting to Amatrix + #Using the default additive relationship options (Amatrix only works for even numbered ploidy) + Ped.mat <- Amatrix(data = ped_extended, ploidy = ploidy) + + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) + pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) + valid_ids <- intersect(pheno_ids, rownames(Ped.mat)) + pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] + Ped.mat <- Ped.mat[valid_ids, valid_ids] + + #Update variable + total_population <- ncol(Ped.mat) + + #Using Gmatrix to get the Gmatrix instead of A.mat for consistency + #Should I be using the raw dosage values or is it okay to use the scaled genotype data that is used for A.mat()? 
+ G.mat <- Gmatrix(t(pred_inputs$geno_input[ ,valid_ids]), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") + G.mat <- round(G.mat,3) #to be easy to invert + + #Computing H matrix (Martini) - Using the name Geno.mat for consistency + Geno.mat <- Hmatrix(A=Ped.mat, G=G.mat, method="Martini", + ploidy= ploidy, + maf=0.05) + #Clean memory + rm(G.mat) + rm(Ped.mat) + rm(ped_filtered) + } + # Establish accuracy results matrix + results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Establish heritability_scores_df () Maybe get h2 values + # Establish results matrix + heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + + + # For loop + for (r in 1:cycles) { + set.seed(r) + fold_ids <- sample(rep(1:Folds, length.out = total_population)) + fold_df <- data.frame(Sample = row.names(Geno.mat), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_results <- matrix(nrow = Folds, ncol = length(traits)) + colnames(fold_results) <- traits + + #Initialize GEBV object for each cycle + GEBVs_cycle <-list() + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + + for (fold in 1:Folds) { + + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*Folds)) + + #Subset training and testing samples + train <- fold_df %>% + dplyr::filter(FoldID != fold) %>% + pull(Sample) + test <- setdiff(row.names(Geno.mat),train) + + Fixed_train = NULL + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) + colnames(GEBVs_fold) <- 
c(traits,"Sample","Iter","Fold") + rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + #Mask phenotypes in testing group + Pheno_test <- pred_inputs$pheno_input + Pheno_test[test, traits[trait_idx]] <- NA + #Kin.blup + traitpred <- kin.blup(data = Pheno_test, geno = names(pred_inputs$pheno_input)[1], pheno = traits[trait_idx], fixed = fixed_cat, covariate = fixed_cov, K=Geno.mat) + #Cor between test values and predicted breeding values + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_inputs$pheno_input[test, traits[trait_idx]], traitpred$g[test], use = "complete.obs") + results[(((r-1)*5)+fold), (length(traits)+1)] <- r + results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + # Extract GEBVs + GEBVs_fold[, trait_idx] <- traitpred$g[test] #Confirm it is accuract to calculate the GEBVs for testing group from the trained model + + + # Calculate heritability (these are wrong) + Vu <- traitpred$Vg + Ve <- traitpred$Ve + heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + + } + #Add iter and fold information for each trait/result + heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + #Add sample, iteration, and fold information to GEBVs_fold + GEBVs_fold[,"Iter"] = r + GEBVs_fold[,"Fold"] = fold + GEBVs_fold[,"Sample"] <- test + + # Store GEBVs for this fold + GEBVs_cycle[[fold]] <- GEBVs_fold + + } + + # Store GEBVs for this cycle + GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + + } + + # Combine all GEBVs into a single DataFrame + GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + + results <- as.data.frame(results) + heritability_scores <- as.data.frame(heritability_scores) + + # Combine results and heritability_scores using cbind + combined_results <- cbind(results, heritability_scores) + + #Save to reactive value + 
pred_outputs$corr_output <- results + pred_outputs$all_GEBVs <- results$GEBVs_df + + # Convert trait columns to numeric + GEBVs <- GEBVs_df %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) + + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) + + pred_outputs$avg_GEBVs <- average_gebvs_df + + columns <- setdiff(colnames(results), c("Iter","Fold")) + average_accuracy_df <- results %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) + + + pred_outputs$comb_output <- average_accuracy_df + } - - dev.off() - } - - ) - - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 90, title = "Generating Results") + + ##Figures and Tables + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") + + #End the event + continue_prediction(NULL) }) - - output$download_pheno <- downloadHandler( - filename = function() { - paste0("BIGapp_passport_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_passport_file.csv", package = "BIGapp") - file.copy(ex, file) + + plots <- reactive({ + validate( + need(!is.null(pred_outputs$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + + df <- pred_outputs$corr_output + df <- df %>% dplyr::select(-Fold, -Iter) + + #Probably want to add the ability for the user to select which trait(s) to display here + + #Convert to long format for ggplot + df_long <- pivot_longer( + df, + cols = colnames(df), # Exclude the Cycle column from transformation + 
names_to = "Trait", # New column for trait names + values_to = "Correlation" # New column for correlation values + ) + + #This can be adapted if we start comparing more than one GP model + #Also consider a violin plot to show each cor value + #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + + plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + + #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + + geom_boxplot() + + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", + y = "Pearson Correlation") + + #theme_minimal() + # Using a minimal theme + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + plot_violin <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + + geom_violin(trim = TRUE) + # Add violin plot + geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", # x-label is blank because it's not relevant per facet + y = "Pearson Correlation") + + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + list(plot, plot_violin) + }) + + #Output the genomic 
prediction correlation box plots + output$pred_box_plot <- renderPlot({ + plots()[[1]] + scale_fill_manual(values = pred_outputs$colors) }) + + #Output the genomic prediction correlation box plots + output$pred_violin_plot <- renderPlot({ + plots()[[2]] + scale_fill_manual(values = pred_outputs$colors) + }) + + #Output the prediction tables + + all_GEBVs <- reactive({ + validate( + need(!is.null(pred_outputs$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$comb_output + }) + + output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + comb_output <- reactive({ + validate( + need(!is.null(pred_outputs$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$comb_output + }) + + output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + avg_GEBVs <- reactive({ + validate( + need(!is.null(pred_outputs$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$avg_GEBVs + }) + + output$pred_gebvs_table <- renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + #Download files for GP + output$download_pred_file <- downloadHandler( + filename = function() { + paste0("GS-results-", Sys.Date(), ".zip") + }, + content = function(file) { + # Temporary files list + temp_dir <- tempdir() + temp_files <- c() + + if (!is.null(pred_outputs$avg_GEBVs)) { + # Create a temporary file for assignments + gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) + write.csv(pred_outputs$avg_GEBVs, gebv_file, row.names = FALSE) + temp_files <- c(temp_files, gebv_file) + } + + if (!is.null(pred_outputs$comb_output)) { + # Create a temporary file for BIC data frame + acc_file 
<- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) + write.csv(pred_outputs$comb_output, acc_file, row.names = FALSE) + temp_files <- c(temp_files, acc_file) + } + + # Zip files only if there's something to zip + if (length(temp_files) > 0) { + zip(file, files = temp_files, extras = "-j") # Using -j to junk paths + } + + # Optionally clean up + file.remove(temp_files) + } + ) + + #Download GP Figures + output$download_pred_figure <- downloadHandler( + + filename = function() { + if (input$pred_image_type == "jpeg") { + paste("GS-", Sys.Date(), ".jpg", sep="") + } else if (input$pred_image_type == "png") { + paste("GS-", Sys.Date(), ".png", sep="") + } else { + paste("GS-", Sys.Date(), ".tiff", sep="") + } + }, + content = function(file) { + #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots + req(input$pred_figures) + + if (input$pred_image_type == "jpeg") { + jpeg(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } else if (input$pred_image_type == "png") { + png(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } + + # Conditional plotting based on input selection + if (input$pred_figures == "Violin Plot") { + req(pred_outputs$violin_plot) + + print(pred_outputs$violin_plot + scale_fill_manual(values = pred_outputs$colors)) + + } else if (input$pred_figures == "Box Plot") { + req(pred_outputs$box_plot) + #Plot + print(pred_outputs$box_plot + scale_fill_manual(values = pred_outputs$colors)) + + } + + dev.off() + } + + ) + + output$download_vcf <- downloadHandler( + 
filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) + + output$download_pheno <- downloadHandler( + filename = function() { + paste0("BIGapp_passport_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_passport_file.csv", package = "BIGapp") + file.copy(ex, file) + }) + }) } ## To be copied in the UI From 32bbe6915ff7dc3a589f8f6c5de33e78d387c510 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:49:03 -0400 Subject: [PATCH 24/40] Updated NavBar and added SessionInfo --- R/app_server.R | 27 +++++++++++++++++++++++++++ R/app_ui.R | 26 +++++++++++++++++++++++--- 2 files changed, 50 insertions(+), 3 deletions(-) diff --git a/R/app_server.R b/R/app_server.R index 74c2138..1bf6ac2 100644 --- a/R/app_server.R +++ b/R/app_server.R @@ -52,4 +52,31 @@ app_server <- function(input, output, session) { # mod_GS_server("GS_1") mod_GSAcc_server("GSAcc_1") # mod_slurm_server("slurm_1") + + #Session info popup + observeEvent(input$session_info_button, { + showModal(modalDialog( + title = "Session Information", + size = "l", + easyClose = TRUE, + footer = tagList( + modalButton("Close"), + downloadButton("download_session_info", "Download") + ), + pre( + paste(capture.output(sessionInfo()), collapse = "\n") + ) + )) + }) + + #Download Session Info + output$download_session_info <- downloadHandler( + filename = function() { + paste("session_info_", Sys.Date(), ".txt", sep = "") + }, + content = function(file) { + writeLines(paste(capture.output(sessionInfo()), collapse = "\n"), file) + } + ) + } diff --git a/R/app_ui.R b/R/app_ui.R index a0634b6..f6aac72 100644 --- a/R/app_ui.R +++ b/R/app_ui.R @@ -15,10 +15,30 @@ app_ui <- function(request) { # Your application UI logic bs4DashPage( skin = "black", - bs4DashNavbar(title = tagList( - 
tags$img(src = 'www/BIG_R_logo.png', height = '40', width = '50'), - ) + bs4DashNavbar( + title = tagList( + tags$img(src = 'www/BIG_R_logo.png', height = '40', width = '50'), + ), + rightUi = tags$li( + class = "dropdown", + tags$a( + href = "#", + class = "nav-link", + `data-toggle` = "dropdown", + icon("info-circle") + ), + tags$div( + class = "dropdown-menu dropdown-menu-right", + tags$a( + class = "dropdown-item", + href = "#", + "Session Info", + onclick = "Shiny.setInputValue('session_info_button', Math.random())" + ) + ) + ) ), + help = NULL, #This is the default bs4Dash button to control the presence of tooltips and popovers, which can be added as a user help/info feature. bs4DashSidebar( skin="light", status = "info", sidebarMenu(id = "MainMenu", From 9863724234c43535212f52d726964659bc024b19 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:50:30 -0400 Subject: [PATCH 25/40] Updated Footer Version --- R/app_ui.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/app_ui.R b/R/app_ui.R index f6aac72..de2dd35 100644 --- a/R/app_ui.R +++ b/R/app_ui.R @@ -94,7 +94,7 @@ app_ui <- function(request) { ), left = div( style = "display: flex; align-items: center; height: 100%;", # Center the version text vertically - "v0.5.1") + "v0.6.0") ), dashboardBody( disconnectMessage(), #Adds generic error message for any error if not already accounted for From 661e311e14dd1d01867c4d97490eb386a0ff3bc8 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 13 Sep 2024 13:51:04 -0400 Subject: [PATCH 26/40] Updated Package Version --- DESCRIPTION | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DESCRIPTION b/DESCRIPTION index 29f411a..e995725 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: BIGapp Title: Breeding Insight Genomics Shiny Application -Version: 0.5.1 +Version: 0.6.0 Authors@R: 
c( person(c("Alexander", "M."), "Sandercock", From fde1cce3631ee994a98762b857d9358b1bf980ae Mon Sep 17 00:00:00 2001 From: Alexander Sandercock <39815775+alex-sandercock@users.noreply.github.com> Date: Fri, 13 Sep 2024 15:25:11 -0400 Subject: [PATCH 27/40] Added PCA Summary --- R/mod_PCA.R | 106 ++++++++++++++++++++++++++++++++-------------------- 1 file changed, 66 insertions(+), 40 deletions(-) diff --git a/R/mod_PCA.R b/R/mod_PCA.R index f67b754..9c871ba 100644 --- a/R/mod_PCA.R +++ b/R/mod_PCA.R @@ -36,6 +36,7 @@ mod_PCA_ui <- function(id){ "You can download examples of the expected files here: \n", downloadButton(ns('download_vcf'), "Download VCF Example File"), downloadButton(ns('download_pheno'), "Download Passport Example File"), + actionButton(ns("pca_summary"), "Summary"), circle = FALSE, status = "warning", icon = icon("info"), width = "300px", @@ -80,7 +81,6 @@ mod_PCA_ui <- function(id){ sliderInput(inputId = ns('pca_image_width'), label = 'Width', value = 10, min = 1, max = 20, step=0.5), sliderInput(inputId = ns('pca_image_height'), label = 'Height', value = 6, min = 1, max = 20, step = 0.5), downloadButton(ns("download_pca"), "Save Image"), - downloadButton(ns("download_pca_summary"), "Save Summary"), circle = FALSE, status = "danger", icon = icon("floppy-disk"), width = "300px", @@ -312,6 +312,7 @@ mod_PCA_server <- function(input, output, session, parent_session){ #End of PCA section }) + ##2D PCA plotting pca_2d <- reactive({ @@ -319,45 +320,6 @@ mod_PCA_server <- function(input, output, session, parent_session){ need(!is.null(pca_data$pc_df_pop), "Input Genotype file, Species ploidy, and run the analysis to access results in this section.") ) - output$download_pca_summary <- downloadHandler( - filename = function() { - paste("pca-summary-", Sys.Date(), ".txt", sep = "") - }, - - content = function(file) { - pca_param <- c( - "BIGapp PCA Summary", - " ", - paste0("Date: ", Sys.Date()), - " ", - version$version.string, - " ", - "### Input Files 
###", - "", - paste("Input Genotype File:", input$dosage_file$name, sep = " "), - paste("Input Passport File:", input$passport_file$name, sep = " "), - "", - "### User Selected Parameters ###", - "", - paste("Selected Ploidy:", as.character(input$pca_ploidy), sep = " "), - "", - "### R Packages Used ###", - "", - paste("BIGapp:",packageVersion("BIGapp"),sep=" "), - paste("AGHmatrix:",packageVersion("AGHmatrix"), sep = " "), - paste("ggplot2:",packageVersion("ggplot2"), sep = " "), - paste("plotly:",packageVersion("plotly"), sep = " "), - paste("factoextra:",packageVersion("factoextra"), sep = " "), - paste("RColorBrewer:",packageVersion("RColorBrewer"), sep= " ") - ) - - #sink(file) # Open sink with the provided file path - #cat(pca_param, sep = "\n") # Print the pca_param vector with newlines between entries - #sink() # Close the sink - writeLines(pca_param, con = file) - } - ) - # Generate colors if (!is.null(pca_data$my_palette)) { @@ -486,6 +448,70 @@ mod_PCA_server <- function(input, output, session, parent_session){ output$scree_plot <- renderPlot({ pca_scree() }) + + ##Summary Info + pca_summary_info <- function() { + # Handle possible NULL values for inputs + dosage_file_name <- if (!is.null(input$dosage_file$name)) input$dosage_file$name else "No file selected" + passport_file_name <- if (!is.null(input$passport_file$name)) input$passport_file$name else "No file selected" + selected_ploidy <- if (!is.null(input$pca_ploidy)) as.character(input$pca_ploidy) else "Not selected" + + # Print the summary information + cat( + "BIGapp PCA Summary\n", + "\n", + paste0("Date: ", Sys.Date()), "\n", + paste("R Version:", R.Version()$version.string), "\n", + "\n", + "### Input Files ###\n", + "\n", + paste("Input Genotype File:", dosage_file_name), "\n", + paste("Input Passport File:", passport_file_name), "\n", + "\n", + "### User Selected Parameters ###\n", + "\n", + paste("Selected Ploidy:", selected_ploidy), "\n", + "\n", + "### R Packages Used ###\n", + "\n", + 
paste("BIGapp:", packageVersion("BIGapp")), "\n", + paste("AGHmatrix:", packageVersion("AGHmatrix")), "\n", + paste("ggplot2:", packageVersion("ggplot2")), "\n", + paste("plotly:", packageVersion("plotly")), "\n", + paste("factoextra:", packageVersion("factoextra")), "\n", + paste("RColorBrewer:", packageVersion("RColorBrewer")), "\n", + sep = "" + ) + } + + # Popup for analysis summary + observeEvent(input$pca_summary, { + showModal(modalDialog( + title = "Summary Information", + size = "l", + easyClose = TRUE, + footer = tagList( + modalButton("Close"), + downloadButton("download_pca_info", "Download") + ), + pre( + paste(capture.output(pca_summary_info()), collapse = "\n") + ) + )) + }) + + + # Download Summary Info + output$download_pca_info <- downloadHandler( + filename = function() { + paste("PCA_summary_", Sys.Date(), ".txt", sep = "") + }, + content = function(file) { + # Write the summary info to a file + writeLines(paste(capture.output(pca_summary_info()), collapse = "\n"), file) + } + ) + #Download figures for PCA output$download_pca <- downloadHandler( From a5eb9ad99e4bf40c70f62f496a0046af980051aa Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 13 Sep 2024 15:51:16 -0400 Subject: [PATCH 28/40] adapting input controls --- R/GS_functions.R | 9 +- R/mod_GSAcc.R | 220 ++++++++++++++++++------------------ R/utils.R | 2 +- tests/testthat/test-GSAcc.R | 6 +- 4 files changed, 119 insertions(+), 118 deletions(-) diff --git a/R/GS_functions.R b/R/GS_functions.R index e88fdb1..8fbc5a4 100644 --- a/R/GS_functions.R +++ b/R/GS_functions.R @@ -187,7 +187,7 @@ rrBLUP_genomic_prediction <- function(geno, pheno, traits, fixed_effects = NULL, #' @importFrom AGHmatrix Gmatrix Amatrix Hmatrix #' #' -get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amatrix", "Hmatrix"), ploidy){ +get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amatrix", "Hmatrix"), ploidy, pheno){ if (type == "Gmatrix") { #Convert 
normalized genotypes to relationship matrix @@ -203,7 +203,7 @@ get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amat Geno.mat <- Amatrix(data = ped_file, ploidy = ploidy) #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) - pheno_ids <- as.character(colnames(geno_input)) + pheno_ids <- as.character(pheno[,1]) valid_ids <- intersect(pheno_ids, rownames(Geno.mat)) Geno.mat <- Geno.mat[valid_ids, valid_ids] @@ -216,7 +216,7 @@ get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amat Ped.mat <- Amatrix(data = ped_file, ploidy = ploidy) #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) - pheno_ids <- as.character(colnames(geno_input)) + pheno_ids <- as.character(pheno[,1]) valid_ids <- intersect(pheno_ids, rownames(Ped.mat)) Ped.mat <- Ped.mat[valid_ids, valid_ids] @@ -384,7 +384,8 @@ run_predictive_model <- function(geno, pheno, selected_traits, predictive_model, Geno.mat <- get_relationship_mat(geno_input = geno, type = relationship_matrix_type, ped_file = pedigree, - ploidy = ploidy) + ploidy = ploidy, + pheno = pheno) } else Geno.mat <- relationship_matrix results <- GBLUP_genomic_prediction(pheno_dat = pheno, diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index ff68065..1475224 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -139,7 +139,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ #Default model choices advanced_options <- reactiveValues( - pred_model = "rrBLUP", + pred_model = "GBLUP", pred_matrix = "Gmatrix", ped_file = NULL ) @@ -221,23 +221,12 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ }) - #2) Error check for prediction and save input files - continue_prediction <- reactiveVal(FALSE) - pred_inputs <- reactiveValues( - pheno_input = NULL, - geno_input = NULL, - pred_snps = NULL, - pred_genos = NULL, - pred_geno_pheno = NULL, - 
ped_input = NULL - ) - colors <- reactiveValues(colors = NULL) #Reactive boxes output$pred_snps <- renderValueBox({ valueBox( - value = pred_inputs$pred_snps, + value = pred_inputs()$pred_snps, subtitle = "SNPs in Genotype File", icon = icon("dna"), color = "info" @@ -246,7 +235,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ output$pred_geno <- renderValueBox({ valueBox( - value = pred_inputs$pred_geno_pheno, + value = pred_inputs()$pred_geno_pheno, subtitle = "Samples with Phenotype Information", icon = icon("location-dot"), color = "info" @@ -262,14 +251,18 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) }) - observeEvent(input$prediction_start, { + #2) Error check for prediction and save input files + pred_inputs <- eventReactive(input$prediction_start,{ toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) - if (is.null(input$pred_file$datapath) | is.null(input$trait_file$datapath)) { + if(is.null(advanced_options$pred_matrix)) advanced_options$pred_matrix <- "none_selected" + if (((is.null(input$pred_file$datapath) & advanced_options$pred_matrix != "Amatrix") | + (is.null(advanced_options$ped_file$datapath) & advanced_options$pred_matrix == "Amatrix")) | + is.null(input$trait_file$datapath)) { shinyalert( title = "Missing input!", - text = "Upload VCF and phenotype files", + text = "Upload VCF or a pedigree file and the phenotype file", size = "s", closeOnEsc = TRUE, closeOnClickOutside = FALSE, @@ -281,8 +274,8 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ showCancelButton = FALSE, animation = TRUE ) + return() } - req(input$pred_file$datapath, input$pred_ploidy, input$trait_file$datapath) #Status updateProgressBar(session = session, id = "pb_prediction", value = 5, title = "Checking input files") @@ -290,6 +283,8 @@ 
mod_GSAcc_server <- function(input, output, session, parent_session){ #Variables pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) row.names(pheno) <- pheno[,1] + # Assuming the first column in Pheno contains the matching IDs + ids_pheno <- pheno[, 1] #Make sure at least one trait was input if (length(input$pred_trait_info) == 0) { @@ -315,76 +310,31 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ return() } + pred_inputs <- list( + pheno_input = NULL, + geno_input = NULL, + pred_snps = NULL, + pred_genos = NULL, + pred_geno_pheno = NULL, + ped_input = NULL + ) + #Getting genotype matrix #Geno.file conversion if needed - geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") - geno <- geno_snps[[1]] - pred_inputs$pred_snps <- geno_snps[[2]] - - #Save number of samples in file - pred_inputs$pred_genos <- ncol(geno) - - #Check that the ploidy entered is correct - if (input$pred_ploidy != max(geno, na.rm = TRUE)) { - # If condition is met, show notification toast - shinyalert( - title = "Ploidy Mismatch", - text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - # Stop the observeEvent gracefully - #return() - } - - #Make sure the trait file and genotype file are in the same order - # Column names for geno (assuming these are the individual IDs) - colnames_geno <- colnames(geno) - # Assuming the first column in Pheno contains the matching IDs - ids_pheno <- pheno[, 1] - # Find common identifiers - common_ids <- intersect(colnames_geno, ids_pheno) - #Get number of id - pred_inputs$pred_geno_pheno <- length(common_ids) + 
if(!is.null(input$pred_file$datapath)){ + geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") + geno <- geno_snps[[1]] + pred_inputs$pred_snps <- geno_snps[[2]] - #Throw an error if there are less matching samples in the phenotype file than the genotype file - if (length(common_ids) == 0) { - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "All samples were missing from the phenotype file", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) + #Save number of samples in file + pred_inputs$pred_genos <- ncol(geno) - # Stop the observeEvent gracefully - return() - } else { - if (length(common_ids) < length(colnames_geno)) + #Check that the ploidy entered is correct + if (input$pred_ploidy != max(geno, na.rm = TRUE)) { + # If condition is met, show notification toast shinyalert( - title = "Data Mismatch", - text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), + title = "Ploidy Mismatch", + text = paste0("The maximum value in the genotype file (",max(geno, na.rm = TRUE),") does not equal the ploidy entered"), size = "xs", closeOnEsc = FALSE, closeOnClickOutside = FALSE, @@ -399,33 +349,88 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE ) - if (length(common_ids) < length(ids_pheno)) + + # Stop the observeEvent gracefully + #return() + } + + #Make sure the trait file and genotype file are in the same order + # Column names for geno (assuming these are the individual IDs) + colnames_geno <- colnames(geno) + + # Find common identifiers + common_ids <- intersect(colnames_geno, ids_pheno) + #Get number of id + pred_inputs$pred_geno_pheno <- length(common_ids) + + #Throw an error if there are less 
matching samples in the phenotype file than the genotype file + if (length(common_ids) == 0) { + # If condition is met, show notification toast shinyalert( - title = "Data Mismatch", - text = paste0((length(ids_pheno)-length(common_ids))," samples were removed for not having genotypic information"), + title = "Oops", + text = "All samples were missing from the phenotype file", size = "xs", - closeOnEsc = FALSE, + closeOnEsc = TRUE, closeOnClickOutside = FALSE, html = TRUE, - type = "warning", + type = "info", showConfirmButton = TRUE, confirmButtonText = "OK", confirmButtonCol = "#004192", showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, imageUrl = "", - animation = TRUE + animation = TRUE, ) - } - # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs - geno_adj <- geno[, common_ids] # Assuming that the columns can be directly indexed by IDs - pheno <- pheno[match(common_ids, ids_pheno), ] # If there is pheno but not geno, the sample is also discarded + # Stop the observeEvent gracefully + return() + } else { + if (length(common_ids) < length(colnames_geno)) + shinyalert( + title = "Data Mismatch", + text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE + ) + if (length(common_ids) < length(ids_pheno)) + shinyalert( + title = "Data Mismatch", + text = paste0((length(ids_pheno)-length(common_ids))," samples were removed for not having genotypic information"), + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = 
"#004192", + showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE + ) + } + + # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs + geno_adj <- geno[, common_ids] # Assuming that the columns can be directly indexed by IDs + pheno <- pheno[match(common_ids, ids_pheno), ] # If there is pheno but not geno, the sample is also discarded + } else geno_adj <- NULL # Check pedigree #Import pedigree file, where pedigree data name (3-column way format). Unknown value should be equal 0 - if(!is.null(advanced_options$ped_file$datapath)){ + if(!is.null(advanced_options$ped_file$datapath) & (advanced_options$pred_matrix == "Amatrix" | advanced_options$pred_matrix == "Hmatrix")){ ped <- read.csv(advanced_options$ped_file$datapath, check.names = FALSE, colClasses = "factor") colnames(ped) <- c("Ind", "P1", "P2") #Convert NAs to 0 @@ -478,7 +483,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ animation = TRUE ) pheno <- pheno[-which(!pheno$Sample_ID %in% extended_ped$Ind),] - geno_adj <- geno_adj[,-which(!pheno$Sample_ID %in% extended_ped$Ind)] + if(!is.null(geno_adj)) geno_adj <- geno_adj[,-which(!colnames(geno_adj) %in% extended_ped$Ind)] } if (length(ped$Ind) > length(extended_ped$Ind)) shinyalert( @@ -551,32 +556,25 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ pred_inputs$pheno_input <- pheno pred_inputs$geno_input <- geno_adj pred_inputs$ped_input <- extended_ped - - continue_prediction(TRUE) + pred_inputs }) - #3) Analysis only proceeds once continue_prediction is converted to TRUE - pred_outputs <- reactive({ - - req(continue_prediction(),pred_inputs$pheno_input, pred_inputs$geno_input) - - # Stop analysis if cancel was selected - if (isFALSE(continue_prediction())) { - return() - } + pred_outputs <- eventReactive(pred_inputs(), { # Convert genotype matrix according to ploidy and model used - geno_formated <- 
format_geno_matrix(pred_inputs$geno_input,advanced_options$pred_model, advanced_options$pred_matrix, input$pred_ploidy) + if(!is.null(pred_inputs()$geno_input)){ + geno_formated <- format_geno_matrix(pred_inputs()$geno_input,advanced_options$pred_model, advanced_options$pred_matrix, input$pred_ploidy) + } else geno_formated <- NULL #Status updateProgressBar(session = session, id = "pb_prediction", value = 30, title = paste("Genotype matrix formatted for", advanced_options$pred_model, advanced_options$pred_matrix)) results <- run_predictive_model(geno = geno_formated, - pheno = pred_inputs$pheno_input, + pheno = pred_inputs()$pheno_input, selected_traits = input$pred_trait_info, predictive_model = advanced_options$pred_model, relationship_matrix_type = advanced_options$pred_matrix, - pedigree = pred_inputs$ped_input, + pedigree = pred_inputs()$ped_input, fixed_effects = input$pred_fixed_info, categorical_fixed_effects = input$pred_fixed_cat, ploidy = input$pred_ploidy, diff --git a/R/utils.R b/R/utils.R index 5622f74..14b6fb1 100644 --- a/R/utils.R +++ b/R/utils.R @@ -310,7 +310,7 @@ read_geno_file <- function(file_path, requires = c("GT")){ } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { #Convert VCF file if submitted - vcf <- read.vcfR(file_path) + vcf <- read.vcfR(file_path, verbose = FALSE) all_requires <- vector() for(i in 1:length(requires)) all_requires[i] <- grepl(requires[i], vcf@fix[1,8]) | grepl(requires[i], vcf@gt[1,1]) diff --git a/tests/testthat/test-GSAcc.R b/tests/testthat/test-GSAcc.R index 10e8f7f..924f297 100644 --- a/tests/testthat/test-GSAcc.R +++ b/tests/testthat/test-GSAcc.R @@ -496,7 +496,7 @@ test_that("test Predictive Ability iris",{ # # Inputs # input <- list() # -# input$trait_file$datapath <- "BIG_pheno.csv" +# input$trait_file$datapath <- "BIG_pheno2.csv" # input$pred_file$datapath <- "BIG_genos.vcf" # input$ped_file$datapath <- "sealice_ped.csv" # @@ -543,7 +543,7 @@ test_that("test Predictive Ability iris",{ # 
# #Variables # pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) -# pheno <- pheno[,-c(1:(which(colnames(pheno) == "Sample_ID") - 1))] # Sample_ID must be the first column +# #pheno <- pheno[,-c(1:(which(colnames(pheno) == "Sample_ID") - 1))] # Sample_ID must be the first column # rownames(pheno) <- pheno[,1] # # #Getting genotype matrix @@ -602,6 +602,7 @@ test_that("test Predictive Ability iris",{ # if (length(common_ped) < length(ids_pheno)){ # warning(paste0((length(ids_pheno)-length(common_ped))," samples were removed from the phenotype data for not having pedigree information")) # pheno <- pheno[-which(!pheno$Sample_ID %in% extended_ped$Ind),] +# geno_adj <- geno_adj[,-which(!colnames(geno_adj) %in% extended_ped$Ind)] # } # if (length(ped$Ind) > length(extended_ped$Ind)) # warning(paste0((length(ped$Ind)-length(extended_ped$Ind))," samples in the pedigree file were unrelated to the samples with phenotype information. They were removed from the analysis.")) # samples not removed @@ -756,6 +757,7 @@ test_that("test Predictive Ability iris",{ # cycles = input$pred_cv, # folds = 5) # +# str(results) # #Save to reactive value # pred_outputs_gBLUPA <- pred_outputs # pred_outputs_gBLUPA$corr_output <- results$PredictionAccuracy From 0dec351660f4feee85dc39e67cb17e0446d8bc18 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 13 Sep 2024 16:23:24 -0400 Subject: [PATCH 29/40] add back folds progress bar --- R/GS_functions.R | 25 ++++++++++++++++++++----- R/mod_GSAcc.R | 23 +++++++++-------------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/R/GS_functions.R b/R/GS_functions.R index 8fbc5a4..2d81a3f 100644 --- a/R/GS_functions.R +++ b/R/GS_functions.R @@ -9,7 +9,7 @@ #' @param iters ToDo #' @param cores ToDo #' -rrBLUP_genomic_prediction <- function(geno, pheno, traits, fixed_effects = NULL, fixed_cat = NULL,folds = 5, iters = 5, cores = 1) { +rrBLUP_genomic_prediction <- function(geno, pheno, traits, 
fixed_effects = NULL, fixed_cat = NULL,folds = 5, iters = 5, cores = 1, session) { # Define variables cycles <- as.numeric(iters) @@ -82,8 +82,14 @@ rrBLUP_genomic_prediction <- function(geno, pheno, traits, fixed_effects = NULL, #Initialize GEBV object for each cycle GEBVs_cycle <-list() + #Status + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + for (fold in 1:folds) { + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*folds)) + train <- fold_df %>% dplyr::filter(FoldID != fold) %>% pull(Sample) @@ -239,11 +245,12 @@ get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amat #' #' @param fixed_cat categorical fixed effect #' -GBLUP_genomic_prediction <- function(pheno_dat, Geno.mat, cycles, folds, traits, cores, fixed_cov = NULL, fixed_cat = NULL){ +GBLUP_genomic_prediction <- function(pheno_dat, Geno.mat, cycles, folds, traits, cores, fixed_cov = NULL, fixed_cat = NULL, session){ # Establish accuracy results matrix results <- matrix(nrow = cycles*folds, ncol = length(traits) + 2) colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + pb_value <- 10 # Initialize a list to store GEBVs for all traits and cycles GEBVs <- list() @@ -264,8 +271,13 @@ GBLUP_genomic_prediction <- function(pheno_dat, Geno.mat, cycles, folds, traits, #Initialize GEBV object for each cycle GEBVs_cycle <-list() + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + for (fold in 1:folds) { + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*folds)) + #Subset training and testing samples train <- fold_df %>% dplyr::filter(FoldID != fold) %>% @@ -363,7 +375,8 @@ format_geno_matrix <- function(geno, model, pred_matrix = NULL, ploidy){ } run_predictive_model <- function(geno, pheno, selected_traits, 
predictive_model, relationship_matrix_type, pedigree, - fixed_effects, categorical_fixed_effects, ploidy, cores, cycles, folds, relationship_matrix = NULL){ + fixed_effects, categorical_fixed_effects, ploidy, cores, cycles, folds, relationship_matrix = NULL, + session){ if(predictive_model == "rrBLUP"){ results <- rrBLUP_genomic_prediction(geno = geno, @@ -371,7 +384,8 @@ run_predictive_model <- function(geno, pheno, selected_traits, predictive_model, traits = selected_traits, fixed_effects = fixed_effects, iters = cycles, - cores = cores) + cores = cores, + session = session) return(results) } else if(predictive_model == "GBLUP"){ fixed_cov <- if (is.null(fixed_effects) || length(fixed_effects) == length(categorical_fixed_effects)) { @@ -395,7 +409,8 @@ run_predictive_model <- function(geno, pheno, selected_traits, predictive_model, traits = selected_traits, cores = cores, fixed_cov = fixed_cov, - fixed_cat = categorical_fixed_effects) + fixed_cat = categorical_fixed_effects, + session = session) return(results) } } diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 1475224..3a302ea 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -222,11 +222,12 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ }) colors <- reactiveValues(colors = NULL) + values_boxes <- reactiveValues(pred_snps = 0, pred_geno_pheno = 0) #Reactive boxes output$pred_snps <- renderValueBox({ valueBox( - value = pred_inputs()$pred_snps, + value = values_boxes$pred_snps, subtitle = "SNPs in Genotype File", icon = icon("dna"), color = "info" @@ -235,7 +236,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ output$pred_geno <- renderValueBox({ valueBox( - value = pred_inputs()$pred_geno_pheno, + value = values_boxes$pred_geno_pheno, subtitle = "Samples with Phenotype Information", icon = icon("location-dot"), color = "info" @@ -313,9 +314,6 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ pred_inputs <- list( pheno_input = NULL, 
geno_input = NULL, - pred_snps = NULL, - pred_genos = NULL, - pred_geno_pheno = NULL, ped_input = NULL ) @@ -324,10 +322,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ if(!is.null(input$pred_file$datapath)){ geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") geno <- geno_snps[[1]] - pred_inputs$pred_snps <- geno_snps[[2]] - - #Save number of samples in file - pred_inputs$pred_genos <- ncol(geno) + values_boxes$pred_snps <- geno_snps[[2]] #Check that the ploidy entered is correct if (input$pred_ploidy != max(geno, na.rm = TRUE)) { @@ -361,7 +356,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ # Find common identifiers common_ids <- intersect(colnames_geno, ids_pheno) #Get number of id - pred_inputs$pred_geno_pheno <- length(common_ids) + values_boxes$pred_geno_pheno <- length(common_ids) #Throw an error if there are less matching samples in the phenotype file than the genotype file if (length(common_ids) == 0) { @@ -550,7 +545,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ } else extended_ped <- NULL #Status - updateProgressBar(session = session, id = "pb_prediction", value = 20, title = "Inputs checked!") + updateProgressBar(session = session, id = "pb_prediction", value = 8, title = "Inputs checked!") ## Make ouput as checked inputs pred_inputs pred_inputs$pheno_input <- pheno @@ -567,7 +562,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ } else geno_formated <- NULL #Status - updateProgressBar(session = session, id = "pb_prediction", value = 30, title = paste("Genotype matrix formatted for", advanced_options$pred_model, advanced_options$pred_matrix)) + updateProgressBar(session = session, id = "pb_prediction", value = 10, title = paste("Genotype matrix formatted for", advanced_options$pred_model, advanced_options$pred_matrix)) results <- run_predictive_model(geno = geno_formated, pheno = pred_inputs()$pheno_input, @@ -580,9 +575,9 @@ 
mod_GSAcc_server <- function(input, output, session, parent_session){ ploidy = input$pred_ploidy, cores = input$pred_cores, cycles = input$pred_cv, - folds = 5) + folds = 5, session = session) - updateProgressBar(session = session, id = "pb_prediction", value = 70, title = "Cross validation concluded") + updateProgressBar(session = session, id = "pb_prediction", value = 90, title = "Cross validation concluded") #Save to reactive value pred_outputs <- list(corr_output = NULL, comb_output = NULL, all_GEBVs = NULL, avg_GEBVs = NULL) From 20859fe9e20024024516e056b3dec81202aa19bd Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Fri, 13 Sep 2024 16:30:33 -0400 Subject: [PATCH 30/40] fix test --- R/GS_functions.R | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/GS_functions.R b/R/GS_functions.R index 2d81a3f..03b5e4c 100644 --- a/R/GS_functions.R +++ b/R/GS_functions.R @@ -83,7 +83,7 @@ rrBLUP_genomic_prediction <- function(geno, pheno, traits, fixed_effects = NULL, GEBVs_cycle <-list() #Status - updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + if(!is.null(session)) updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) for (fold in 1:folds) { @@ -245,7 +245,7 @@ get_relationship_mat <- function(geno_input, ped_file, type = c("Gmatrix", "Amat #' #' @param fixed_cat categorical fixed effect #' -GBLUP_genomic_prediction <- function(pheno_dat, Geno.mat, cycles, folds, traits, cores, fixed_cov = NULL, fixed_cat = NULL, session){ +GBLUP_genomic_prediction <- function(pheno_dat, Geno.mat, cycles, folds, traits, cores, fixed_cov = NULL, fixed_cat = NULL, session = NULL){ # Establish accuracy results matrix results <- matrix(nrow = cycles*folds, ncol = length(traits) + 2) @@ -271,7 +271,7 @@ GBLUP_genomic_prediction <- function(pheno_dat, Geno.mat, cycles, 
folds, traits, #Initialize GEBV object for each cycle GEBVs_cycle <-list() - updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + if(!is.null(session)) updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) for (fold in 1:folds) { @@ -376,7 +376,7 @@ format_geno_matrix <- function(geno, model, pred_matrix = NULL, ploidy){ run_predictive_model <- function(geno, pheno, selected_traits, predictive_model, relationship_matrix_type, pedigree, fixed_effects, categorical_fixed_effects, ploidy, cores, cycles, folds, relationship_matrix = NULL, - session){ + session = NULL){ if(predictive_model == "rrBLUP"){ results <- rrBLUP_genomic_prediction(geno = geno, From eb1730db2defbc1957ca47018a3778bd44da0224 Mon Sep 17 00:00:00 2001 From: Alexander Sandercock Date: Mon, 16 Sep 2024 09:37:15 -0400 Subject: [PATCH 31/40] Reverted file --- R/mod_GSAcc.R | 1600 +++++++++++++++++++++++++++++++------------------ 1 file changed, 1004 insertions(+), 596 deletions(-) diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 3a302ea..7bc904f 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -66,14 +66,14 @@ mod_GSAcc_ui <- function(id){ )), tags$hr(style="border-color: #d3d3d3; margin-top: 20px; margin-bottom: 20px;"), # Lighter grey line div(style="text-align: left; margin-top: 10px;", - actionButton(ns("advanced_options"), + actionButton(ns("advanced_options"), label = HTML(paste(icon("cog", style = "color: #007bff;"), "Advanced Options (beta)")), style = "background-color: transparent; border: none; color: #007bff; font-size: smaller; text-decoration: underline; padding: 0;" ) ) ) ), - + column(width = 6, box( title = "Plots", status = "info", solidHeader = FALSE, width = 12, height = 600, @@ -82,13 +82,13 @@ mod_GSAcc_ui <- function(id){ tabPanel("Box Plot", plotOutput(ns("pred_box_plot"), height = "500px")), 
tabPanel("Accuracy Table", DTOutput(ns("pred_acc_table")), style = "overflow-y: auto; height: 500px"), tabPanel("GEBVs Table", DTOutput(ns("pred_gebvs_table")),style = "overflow-y: auto; height: 500px") - + ) - + ) - + ), - + column(width = 3, valueBoxOutput(ns("pred_snps"), width = NULL), valueBoxOutput(ns("pred_geno"), width = NULL), @@ -114,11 +114,11 @@ mod_GSAcc_ui <- function(id){ tooltip = tooltipOptions(title = "Click to see inputs!") )) ) - + ) - + ) - + ) } @@ -133,199 +133,276 @@ mod_GSAcc_ui <- function(id){ #' @import ggplot2 #' @import tidyr #' @noRd -mod_GSAcc_server <- function(input, output, session, parent_session){ - - ns <- session$ns - - #Default model choices - advanced_options <- reactiveValues( - pred_model = "GBLUP", - pred_matrix = "Gmatrix", - ped_file = NULL - ) - - #List the ped file name if previously uploaded - output$uploaded_file_name <- renderText({ - if (!is.null(advanced_options$ped_file)) { - paste("Previously uploaded file:", advanced_options$ped_file$name) - } else { - "" # Return an empty string if no file has been uploaded - } - }) - - #UI popup window for input - observeEvent(input$advanced_options, { - showModal(modalDialog( - title = "Advanced Options (beta)", - selectInput( - inputId = ns('pred_model'), - label = 'Model Choice', - choices = c("rrBLUP", "GBLUP"), - selected = advanced_options$pred_model # Initialize with stored value - ), - conditionalPanel( - condition = "input.pred_model == 'GBLUP'", ns = ns, - div( - selectInput( - inputId = ns('pred_matrix'), - label = 'GBLUP Matrix Choice', - choices = c("Gmatrix", "Amatrix", "Hmatrix"), - selected = advanced_options$pred_matrix # Initialize with stored value +mod_GSAcc_server <- function(id){ + moduleServer( id, function(input, output, session){ + ns <- session$ns + + #Default model choices + advanced_options <- reactiveValues( + pred_model = "rrBLUP", + pred_matrix = "Gmatrix", + ped_file = NULL + ) + + #List the ped file name if previously uploaded + 
output$uploaded_file_name <- renderText({ + if (!is.null(advanced_options$ped_file)) { + paste("Previously uploaded file:", advanced_options$ped_file$name) + } else { + "" # Return an empty string if no file has been uploaded + } + }) + + print("check1") + #UI popup window for input + observeEvent(input$advanced_options, { + showModal(modalDialog( + title = "Advanced Options (beta)", + selectInput( + inputId = ns('pred_model'), + label = 'Model Choice', + choices = c("rrBLUP", "GBLUP"), + selected = advanced_options$pred_model # Initialize with stored value + ), + conditionalPanel( + condition = "input.pred_model == 'GBLUP'", ns = ns, + div( + selectInput( + inputId = ns('pred_matrix'), + label = 'GBLUP Matrix Choice', + choices = c("Gmatrix", "Amatrix", "Hmatrix"), + selected = advanced_options$pred_matrix # Initialize with stored value + ) ) - ) - ), - conditionalPanel( - condition = "input.pred_matrix != 'Gmatrix'", ns = ns, - div( - fileInput(ns("ped_file"), "Choose Pedigree File", accept = ".csv"), - conditionalPanel( - condition = "output.uploaded_file_name !== ''", # Show only if there's content - textOutput(ns("uploaded_file_name")) # Display the uploaded file name + ), + conditionalPanel( + condition = "input.pred_matrix != 'Gmatrix'", ns = ns, + div( + fileInput(ns("ped_file"), "Choose Pedigree File", accept = ".csv"), + conditionalPanel( + condition = "output.uploaded_file_name !== ''", # Show only if there's content + textOutput(ns("uploaded_file_name")) # Display the uploaded file name + ) ) + ), + footer = tagList( + modalButton("Close"), + actionButton(ns("save_advanced_options"), "Save") ) - ), - footer = tagList( - modalButton("Close"), - actionButton(ns("save_advanced_options"), "Save") - ) - )) - }) - - - - #Close popup window when user "saves options" - observeEvent(input$save_advanced_options, { - advanced_options$pred_model <- input$pred_model - advanced_options$pred_matrix <- input$pred_matrix - advanced_options$ped_file <- input$ped_file - # 
Save other inputs as needed - - removeModal() # Close the modal after saving - }) - - - - ####Genomic Prediction Accuracy - #This tab involved 3 observeEvents - #1) to get the traits listed in the phenotype file - #2) to input and validate the input files - #3) to perform the genomic prediction - - #1) Get traits - observeEvent(input$trait_file, { - info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) - trait_var <- colnames(info_df) - trait_var <- trait_var[2:length(trait_var)] - updateVirtualSelect("pred_fixed_info", choices = trait_var, session = session) - updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) - - }) - - colors <- reactiveValues(colors = NULL) - values_boxes <- reactiveValues(pred_snps = 0, pred_geno_pheno = 0) - - #Reactive boxes - output$pred_snps <- renderValueBox({ - valueBox( - value = values_boxes$pred_snps, - subtitle = "SNPs in Genotype File", - icon = icon("dna"), - color = "info" + )) + }) + + + + #Close popup window when user "saves options" + observeEvent(input$save_advanced_options, { + advanced_options$pred_model <- input$pred_model + advanced_options$pred_matrix <- input$pred_matrix + advanced_options$ped_file <- input$ped_file + # Save other inputs as needed + + removeModal() # Close the modal after saving + }) + + + + ####Genomic Prediction Accuracy + #This tab involved 3 observeEvents + #1) to get the traits listed in the phenotype file + #2) to input and validate the input files + #3) to perform the genomic prediction + + print("check2") + #1) Get traits + observeEvent(input$trait_file, { + info_df <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE, nrow = 0) + trait_var <- colnames(info_df) + trait_var <- trait_var[2:length(trait_var)] + updateVirtualSelect("pred_fixed_info", choices = trait_var, session = session) + updateVirtualSelect("pred_trait_info", choices = trait_var, session = session) + + }) + print("check3") + #2) Error check 
for prediction and save input files + continue_prediction <- reactiveVal(NULL) + pred_inputs <- reactiveValues( + pheno_input = NULL, + geno_input = NULL, + pred_snps = NULL, + pred_genos = NULL, + pred_geno_pheno = NULL ) - }) - - output$pred_geno <- renderValueBox({ - valueBox( - value = values_boxes$pred_geno_pheno, - subtitle = "Samples with Phenotype Information", - icon = icon("location-dot"), - color = "info" + + pred_outputs <- reactiveValues( + corr_output = NULL, + box_plot = NULL, + violin_plot = NULL, + comb_output = NULL, + avg_GEBVs = NULL, + all_GEBVs = NULL, + colors = NULL ) - }) - - observe({ - # Update colors based on input - colors$colors <- assign_colors(input$pred_color_select) - }) - - observeEvent(input$pred_fixed_info, { - updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) - }) - - #2) Error check for prediction and save input files - pred_inputs <- eventReactive(input$prediction_start,{ - - toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) - - if(is.null(advanced_options$pred_matrix)) advanced_options$pred_matrix <- "none_selected" - if (((is.null(input$pred_file$datapath) & advanced_options$pred_matrix != "Amatrix") | - (is.null(advanced_options$ped_file$datapath) & advanced_options$pred_matrix == "Amatrix")) | - is.null(input$trait_file$datapath)) { - shinyalert( - title = "Missing input!", - text = "Upload VCF or a pedigree file and the phenotype file", - size = "s", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "error", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - animation = TRUE + + #Reactive boxes + output$pred_snps <- renderValueBox({ + valueBox( + value = pred_inputs$pred_snps, + subtitle = "SNPs in Genotype File", + icon = icon("dna"), + color = "info" ) - return() - } - - #Status - updateProgressBar(session = session, 
id = "pb_prediction", value = 5, title = "Checking input files") - - #Variables - pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) - row.names(pheno) <- pheno[,1] - # Assuming the first column in Pheno contains the matching IDs - ids_pheno <- pheno[, 1] - - #Make sure at least one trait was input - if (length(input$pred_trait_info) == 0) { - - # If condition is met, show notification toast - shinyalert( - title = "Oops", - text = "No traits were selected", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, + }) + + output$pred_geno <- renderValueBox({ + valueBox( + value = pred_inputs$pred_geno_pheno, + subtitle = "Samples with Phenotype Information", + icon = icon("location-dot"), + color = "info" ) - - # Stop the observeEvent gracefully - return() - } - - pred_inputs <- list( - pheno_input = NULL, - geno_input = NULL, - ped_input = NULL - ) - - #Getting genotype matrix - #Geno.file conversion if needed - if(!is.null(input$pred_file$datapath)){ - geno_snps <- read_geno_file(input$pred_file$datapath, requires = "GT") - geno <- geno_snps[[1]] - values_boxes$pred_snps <- geno_snps[[2]] - + }) + + observe({ + # Update colors based on input + pred_outputs$colors <- switch(input$pred_color_select, + "red" = "#F8766D", + "blue" = "#00BFC4", + "green" = "#00BA38", + input$pred_color_select) + }) + + observeEvent(input$pred_fixed_info, { + updateVirtualSelect("pred_fixed_cat", choices = input$pred_fixed_info, session = session) + }) + + observeEvent(input$prediction_start, { + + toggleClass(id = "pred_ploidy", class = "borderred", condition = (is.na(input$pred_ploidy) | is.null(input$pred_ploidy))) + + if (is.null(input$pred_file$datapath) | is.null(input$trait_file$datapath)) { + shinyalert( + title = "Missing input!", + text = "Upload VCF and 
phenotype files", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + } + req(input$pred_file$datapath, input$pred_ploidy, input$trait_file$datapath) + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 5, title = "Checking input files") + + #Variables + ploidy <- as.numeric(input$pred_ploidy) + geno_path <- input$pred_file$datapath + pheno <- read.csv(input$trait_file$datapath, header = TRUE, check.names = FALSE) + row.names(pheno) <- pheno[,1] + traits <- input$pred_trait_info + CVs <- as.numeric(input$pred_cv) + + #Make sure at least one trait was input + if (length(traits) == 0) { + + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No traits were selected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + + + # Stop the observeEvent gracefully + return() + + } + + print("check4") + #Getting genotype matrix + + #Geno file path + file_path <- geno_path + + #Geno.file conversion if needed + if (grepl("\\.csv$", file_path)) { + geno <- read.csv(geno_path, header = TRUE, row.names = 1, check.names = FALSE) + + #Save number of SNPs + pred_inputs$pred_snps <- nrow(geno) + + } else if (grepl("\\.vcf$", file_path) || grepl("\\.gz$", file_path)) { + + #Function to convert GT to dosage calls (add to BIGr) + convert_to_dosage <- function(gt) { + # Split the genotype string + alleles <- strsplit(gt, "[|/]") + # Sum the alleles, treating NA values appropriately + sapply(alleles, function(x) { + if (any(is.na(x))) { + return(NA) + } else { + return(sum(as.numeric(x), na.rm = TRUE)) + } + }) + } + + #Convert VCF file if 
submitted + vcf <- vcfR::read.vcfR(file_path) + + #Get number of SNPs + pred_inputs$pred_snps <- nrow(vcf) + + #Extract GT + geno <- extract.gt(vcf, element = "GT") + geno <- apply(geno, 2, convert_to_dosage) + class(geno) <- "numeric" + rm(vcf) + + } else { + + # If condition is met, show notification toast + shinyalert( + title = "Oops", + text = "No valid genotype file detected", + size = "xs", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "warning", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + imageUrl = "", + animation = TRUE, + ) + + #Stop the analysis + return() + } + print("check5") + #Save number of samples in file + pred_inputs$pred_genos <- ncol(geno) + #Check that the ploidy entered is correct - if (input$pred_ploidy != max(geno, na.rm = TRUE)) { + if (ploidy != max(geno, na.rm = TRUE)) { # If condition is met, show notification toast shinyalert( title = "Ploidy Mismatch", @@ -344,22 +421,35 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE ) - + + # Stop the observeEvent gracefully #return() } - + + print("check6") + # Function to convert genotype matrix according to ploidy + convert_genotype <- function(genotype_matrix, ploidy) { + normalized_matrix <- 2 * (genotype_matrix / ploidy) - 1 + return(normalized_matrix) + } + + #tranforming genotypes + geno_adj_init <- convert_genotype(geno, as.numeric(ploidy)) + #Make sure the trait file and genotype file are in the same order # Column names for geno (assuming these are the individual IDs) colnames_geno <- colnames(geno) - + # Assuming the first column in Pheno contains the matching IDs + ids_pheno <- pheno[, 1] # Find common identifiers common_ids <- intersect(colnames_geno, ids_pheno) #Get number of id - values_boxes$pred_geno_pheno <- length(common_ids) - + pred_inputs$pred_geno_pheno <- length(common_ids) + #Throw an error if there are less 
matching samples in the phenotype file than the genotype file if (length(common_ids) == 0) { + # If condition is met, show notification toast shinyalert( title = "Oops", @@ -376,426 +466,744 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE, ) - + + # Stop the observeEvent gracefully return() - } else { - if (length(common_ids) < length(colnames_geno)) - shinyalert( - title = "Data Mismatch", - text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - if (length(common_ids) < length(ids_pheno)) - shinyalert( - title = "Data Mismatch", - text = paste0((length(ids_pheno)-length(common_ids))," samples were removed for not having genotypic information"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - } - - # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs - geno_adj <- geno[, common_ids] # Assuming that the columns can be directly indexed by IDs - pheno <- pheno[match(common_ids, ids_pheno), ] # If there is pheno but not geno, the sample is also discarded - } else geno_adj <- NULL - - # Check pedigree - #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 - if(!is.null(advanced_options$ped_file$datapath) & (advanced_options$pred_matrix == "Amatrix" | advanced_options$pred_matrix == "Hmatrix")){ - ped <- read.csv(advanced_options$ped_file$datapath, check.names = FALSE, colClasses = "factor") - colnames(ped) <- c("Ind", "P1", "P2") - #Convert NAs to 0 - ped[is.na(ped)] <- 0 - - common_ped <- intersect(ped$Ind, pheno[,1]) - #Throw an error if there are less matching samples in the phenotype file than the pedigree file - if (length(common_ped) == 0) { + print("check7") + } else if (length(common_ids) < length(colnames_geno)) { # If condition is met, show notification toast shinyalert( - title = "Oops", - text = "All samples were missing from the phenotype file", + title = "Data Mismatch", + text = paste0((length(colnames_geno)-length(common_ids))," samples were removed for not having trait information"), size = "xs", - closeOnEsc = TRUE, + closeOnEsc = FALSE, closeOnClickOutside = FALSE, html = TRUE, - type = "info", + type = "warning", showConfirmButton = TRUE, confirmButtonText = "OK", confirmButtonCol = "#004192", showCancelButton = FALSE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, imageUrl = "", - animation = TRUE, + animation = TRUE ) - + + # Stop the observeEvent gracefully - return() - } else { - rm_unr <- remove_unrelated(ped, samples_with_trait_info = pheno[,1]) - extended_ped <- rm_unr[[1]] - gen <- rm_unr[[2]] - cat(paste0("You have pedigree information until the ", gen,"th generation\n")) - - if (length(common_ped) < length(ids_pheno)){ - shinyalert( - title = "Data Mismatch", - text = paste0((length(ids_pheno)-length(common_ped))," samples were removed from the phenotype data for not having pedigree information"), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - 
#closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - pheno <- pheno[-which(!pheno$Sample_ID %in% extended_ped$Ind),] - if(!is.null(geno_adj)) geno_adj <- geno_adj[,-which(!colnames(geno_adj) %in% extended_ped$Ind)] - } - if (length(ped$Ind) > length(extended_ped$Ind)) - shinyalert( - title = "Data Mismatch", - text = paste0((length(ped$Ind)-length(extended_ped$Ind))," samples in the pedigree file were unrelated to the samples with phenotype information. They were removed from the analysis."), - size = "xs", - closeOnEsc = FALSE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "warning", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - #closeOnConfirm = TRUE, - #closeOnCancel = TRUE, - imageUrl = "", - animation = TRUE - ) - - ped_temp <- tempfile() - ped_temp_file <- extended_ped - colnames(ped_temp_file) <- c("id", "sire", "dam") - write.table(ped_temp_file, file = ped_temp) - ped_check <- BIGr::check_ped(ped_temp) - if(dim(ped_check$repeated_ids)[1] != 0){ - shinyalert( - title = "Oops", - text = "Check for repeated IDs in the pedigree file", - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - - # Stop the observeEvent gracefully - return() - } - if(dim(ped_check$messy_parents)[1] != 0){ - shinyalert( - title = "Oops", - text = paste("We found inconsistencies in the pedigree file for the individuals:", paste0(ped_check$messy_parents$id, collapse = ", ")), - size = "xs", - closeOnEsc = TRUE, - closeOnClickOutside = FALSE, - html = TRUE, - type = "info", - showConfirmButton = TRUE, - confirmButtonText = "OK", - confirmButtonCol = "#004192", - showCancelButton = FALSE, - imageUrl = "", - animation = TRUE, - ) - } - } - } else extended_ped <- NULL - - #Status - 
updateProgressBar(session = session, id = "pb_prediction", value = 8, title = "Inputs checked!") - - ## Make ouput as checked inputs pred_inputs - pred_inputs$pheno_input <- pheno - pred_inputs$geno_input <- geno_adj - pred_inputs$ped_input <- extended_ped - pred_inputs - }) - - pred_outputs <- eventReactive(pred_inputs(), { - - # Convert genotype matrix according to ploidy and model used - if(!is.null(pred_inputs()$geno_input)){ - geno_formated <- format_geno_matrix(pred_inputs()$geno_input,advanced_options$pred_model, advanced_options$pred_matrix, input$pred_ploidy) - } else geno_formated <- NULL - - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 10, title = paste("Genotype matrix formatted for", advanced_options$pred_model, advanced_options$pred_matrix)) - - results <- run_predictive_model(geno = geno_formated, - pheno = pred_inputs()$pheno_input, - selected_traits = input$pred_trait_info, - predictive_model = advanced_options$pred_model, - relationship_matrix_type = advanced_options$pred_matrix, - pedigree = pred_inputs()$ped_input, - fixed_effects = input$pred_fixed_info, - categorical_fixed_effects = input$pred_fixed_cat, - ploidy = input$pred_ploidy, - cores = input$pred_cores, - cycles = input$pred_cv, - folds = 5, session = session) - - updateProgressBar(session = session, id = "pb_prediction", value = 90, title = "Cross validation concluded") - - #Save to reactive value - pred_outputs <- list(corr_output = NULL, comb_output = NULL, all_GEBVs = NULL, avg_GEBVs = NULL) - pred_outputs$corr_output <- results$PredictionAccuracy - pred_outputs$all_GEBVs <- results$GEBVs - - # Convert trait columns to numeric - results$GEBVs <- results$GEBVs %>% - mutate(across(all_of(input$pred_trait_info), ~ as.numeric(.x))) - - # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold - average_gebvs_df <- results$GEBVs %>% - group_by(Sample) %>% - summarize(across(all_of(input$pred_trait_info), 
mean, na.rm = TRUE)) - - pred_outputs$avg_GEBVs <- average_gebvs_df - - columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) - average_accuracy_df <- results$PredictionAccuracy %>% - group_by(Iter) %>% - summarize(across(all_of(columns), mean, na.rm = TRUE)) - - pred_outputs$comb_output <- average_accuracy_df - #Status - updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") - - pred_outputs - }) - - plots <- reactive({ - validate( - need(!is.null(pred_outputs()$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - - df <- pred_outputs()$corr_output - df <- df %>% dplyr::select(-Fold, -Iter) - - #Probably want to add the ability for the user to select which trait(s) to display here - - #Convert to long format for ggplot - df_long <- pivot_longer( - df, - cols = colnames(df), # Exclude the Cycle column from transformation - names_to = "Trait", # New column for trait names - values_to = "Correlation" # New column for correlation values - ) - - plots <- list(box_plot = NULL, violin_plot = NULL) - #This can be adapted if we start comparing more than one GP model - #Also consider a violin plot to show each cor value - #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + - plots$box_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + - #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + - geom_boxplot() + - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", - y = "Pearson Correlation") + - #theme_minimal() + # Using a minimal theme - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, 
hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - - plots$violin_plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + - geom_violin(trim = TRUE) + # Add violin plot - geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points - facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales - labs(title = "Predictive Ability by Trait", - x = " ", # x-label is blank because it's not relevant per facet - y = "Pearson Correlation") + - theme(legend.position = "none", - strip.text = element_text(size = 12), - axis.text = element_text(size = 12), - axis.title = element_text(size = 14), - axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), - strip.text.x = element_text(face = "bold"), - axis.text.x.bottom = element_blank(), - axis.ticks.x.bottom = element_blank()) - plots - }) - - #Output the genomic prediction correlation box plots - output$pred_box_plot <- renderPlot({ - plots()$box_plot + scale_fill_manual(values = colors$colors) - }) - - #Output the genomic prediction correlation box plots - output$pred_violin_plot <- renderPlot({ - plots()$violin_plot + scale_fill_manual(values = colors$colors) - }) - - #Output the prediction tables - - all_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs()$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs()$comb_output - }) - - output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - comb_output <- reactive({ - validate( - need(!is.null(pred_outputs()$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs()$comb_output - }) - - output$pred_acc_table <- renderDT({comb_output()}, options = 
list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - avg_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs()$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") - ) - pred_outputs()$avg_GEBVs - }) - - output$pred_gebvs_table <- renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - - #Download files for GP - output$download_pred_file <- downloadHandler( - filename = function() { - paste0("GS-results-", Sys.Date(), ".zip") - }, - content = function(file) { - # Temporary files list - temp_dir <- tempdir() - temp_files <- c() - - if (!is.null(pred_outputs()$avg_GEBVs)) { - # Create a temporary file for assignments - gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) - write.csv(pred_outputs()$avg_GEBVs, gebv_file, row.names = FALSE) - temp_files <- c(temp_files, gebv_file) + #return() } - - if (!is.null(pred_outputs()$comb_output)) { - # Create a temporary file for BIC data frame - acc_file <- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) - write.csv(pred_outputs()$comb_output, acc_file, row.names = FALSE) - temp_files <- c(temp_files, acc_file) + + + + + #Final check before performing analyses + shinyalert( + title = "Ready?", + text = "Inputs have been checked", + size = "xs", + closeOnEsc = FALSE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "info", + showConfirmButton = TRUE, + confirmButtonText = "Proceed", + confirmButtonCol = "#004192", + showCancelButton = TRUE, + #closeOnConfirm = TRUE, + #closeOnCancel = TRUE, + imageUrl = "", + animation = TRUE, + callbackR = function(value) { + if (isTRUE(value)) { + # Proceed with adjusted data + continue_prediction(TRUE) + } else { + # Stop or change the process + continue_prediction(FALSE) + } + } + ) + + print("check8") + # Subset and reorder geno and pheno to ensure they only contain and are ordered by common IDs + geno_adj <- 
geno_adj_init[, common_ids] # Assuming that the columns can be directly indexed by IDs + pheno <- pheno[match(common_ids, ids_pheno), ] + + ##Save to reactive values + #Gmatrix needs the original allele count values, so the user matrix selection determines the genotype matrix used + pred_inputs$pheno_input <- pheno + if (advanced_options$pred_matrix == "Gmatrix" || is.null(advanced_options$pred_matrix)) { + pred_inputs$geno_input <- geno_adj + } else if (advanced_options$pred_matrix == "Hmatrix") { + pred_inputs$geno_input <- geno[, common_ids] + } else { + pred_inputs$geno_input <- geno_adj } - - # Zip files only if there's something to zip - if (length(temp_files) > 0) { - zip(file, files = temp_files, extras = "-j") # Using -j to junk paths + }) + print("check9") + #3) Analysis only proceeds once continue_prediction is converted to TRUE + observe({ + + req(continue_prediction(),pred_inputs$pheno_input, pred_inputs$geno_input) + + # Stop analysis if cancel was selected + if (isFALSE(continue_prediction())) { + return() } - - # Optionally clean up - file.remove(temp_files) - } - ) - - #Download GP Figures - output$download_pred_figure <- downloadHandler( - - filename = function() { - if (input$pred_image_type == "jpeg") { - paste("GS-", Sys.Date(), ".jpg", sep="") - } else if (input$pred_image_type == "png") { - paste("GS-", Sys.Date(), ".png", sep="") + print("check10") + #Variables + ploidy <- as.numeric(input$pred_ploidy) + geno_adj <- pred_inputs$geno_input + pheno <- pred_inputs$pheno_input + traits <- input$pred_trait_info + CVs <- as.numeric(input$pred_cv) + fixed_traits <- input$pred_fixed_info + fixed_cat <- input$pred_fixed_cat + fixed_cov <- if (is.null(input$pred_fixed_info) || length(input$pred_fixed_info) == length(input$pred_fixed_cat)) { + NULL } else { - paste("GS-", Sys.Date(), ".tiff", sep="") + setdiff(input$pred_fixed_info, input$pred_fixed_cat) } - }, - content = function(file) { - #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, 
input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots - req(input$pred_figures) - - if (input$pred_image_type == "jpeg") { - jpeg(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } else if (input$pred_image_type == "png") { - png(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") - } else { - tiff(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + cores <- input$pred_cores + + #Assign colors + if (input$pred_color_select == "red"){ + pred_outputs$colors <- "#F8766D" + } else if (input$pred_color_select == "blue") { + pred_outputs$colors <- "#00BFC4" + } else if (input$pred_color_select == "green") { + pred_outputs$colors <- "#00BA38" + } else{ + pred_outputs$colors <- input$pred_color_select } - - # Conditional plotting based on input selection - if (input$pred_figures == "Violin Plot") { - req(plots()$violin_plot) - - print(plots()$violin_plot + scale_fill_manual(values = colors$colors)) - - } else if (input$pred_figures == "Box Plot") { - req(plots()$box_plot) - #Plot - print(plots()$box_plot + scale_fill_manual(values = colors$colors)) - + + #Control whether rrBLUP or GBLUP run depending on user input + #Note, should add the GP functions to the utils.R file and then call them here... 
+ if (advanced_options$pred_model == "rrBLUP"){ + ##Need to add ability for the use of parallelism for the for cross-validation + ##Example at this tutorial also: https://www.youtube.com/watch?v=ARWjdQU6ays + + # Function to perform genomic prediction + ##Make sure this is correct (I think I need to be generating a relationship matrix A.mat() to account for missing data, but I am not sure how that works with this) + genomic_prediction <- function(geno, Pheno, traits, fixed_effects = NULL, Fold = 5, Iters = 5, cores = 1) { + + # Define variables + traits <- traits + cycles <- as.numeric(Iters) + Folds <- as.numeric(Fold) + total_population <- ncol(geno) + #train_size <- floor(percentage / 100 * total_population) + fixed_traits <- fixed_effects + cores <- as.numeric(cores) + + # Establish accuracy results matrix + results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Establish heritability_scores_df () Maybe get h2 values + # Establish results matrix + heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + print("check11") + #Remove the fixed traits from the Pheno file + if (length(fixed_traits) == 0) { + Pheno <- Pheno + } else { + #Subset fixed traits + Fixed <- subset(Pheno, select = fixed_traits) + + #Pheno <- subset(Pheno, select = -fixed_traits) + convert_categorical_to_factor <- function(df, fixed_cat) { + for (col in names(df)) { + if (col %in% fixed_cat) { + df[[col]] <- as.factor(df[[col]]) + } + } + return(df) + } + # Convert all columns to factor if they are not numeric or integer + Fixed <- 
convert_categorical_to_factor(Fixed, fixed_cat) + + #Fixed <- as.data.frame(lapply(Fixed, as.factor)) #convert to factor + row.names(Fixed) <- row.names(Pheno) + + #Make the matrix + formula_str <- paste("~", paste(fixed_traits, collapse = " + ")) + formula <- as.formula(formula_str) + + # Create the design matrix using the constructed formula + Fixed <- model.matrix(formula, data = Fixed) + } + + #Make kinship matrix of all individuals? + #Kin_mat <- A.mat(t(geno), n.core = 1) ##Need to explore whether or not to use a Kinship matrix and if it makes a significant improvement to accuracy + #If wanting to use Kkinship matrix, will then need to see how to implement it here + + #For now, I am just imputing the missing sites using mean, but EM is more accurate, but slower (can use multiple cores). + impute = (A.mat(t(geno), max.missing=0.5,impute.method="mean",return.imputed=TRUE)) + geno <- impute$imputed + + # For loop + for (r in 1:cycles) { + set.seed(r) + fold_ids <- sample(rep(1:Folds, length.out = total_population)) + fold_df <- data.frame(Sample = row.names(geno), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_results <- matrix(nrow = Folds, ncol = length(traits)) + colnames(fold_results) <- traits + + #Initialize GEBV object for each cycle + GEBVs_cycle <-list() + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + + for (fold in 1:Folds) { + + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*Folds)) + + train <- fold_df %>% + dplyr::filter(FoldID != fold) %>% + pull(Sample) + test <- setdiff(row.names(geno),train) + + #Subset datasets + if (length(fixed_traits) == 0) { + Fixed_train = NULL + } else{ + Fixed_train <- data.frame(Fixed[train, ]) + Fixed_train <- as.matrix(Fixed_train) + row.names(Fixed_train) <- train + + #Fixed (testing) + Fixed_test<- data.frame(Fixed[test, ]) + Fixed_test <- as.matrix(Fixed_test) + 
row.names(Fixed_test) <- test + + } + print("check12") + Pheno_train <- Pheno[train, ] # Subset the phenotype df to only retain the relevant samples from the training set + m_train <- geno[train, ] + Pheno_test <- Pheno[test, ] + #Fixed_test <- Fixed[test, ] #Where would the Fixed_test be used? + m_valid <- geno[test, ] + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) + colnames(GEBVs_fold) <- c(traits,"Sample","Iter","Fold") + rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + trait <- Pheno_train[, traits[trait_idx]] # Get the trait of interest + trait_answer <- mixed.solve(y= trait, Z = m_train, K = NULL, X = Fixed_train, SE = FALSE, return.Hinv = FALSE) + TRT <- trait_answer$u + e <- as.matrix(TRT) + pred_trait_test <- m_valid %*% e + pred_trait <- pred_trait_test[, 1] + c(trait_answer$beta) # Make sure this still works when using multiple traits + trait_test <- Pheno_test[, traits[trait_idx]] + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_trait, trait_test, use = "complete") + results[(((r-1)*5)+fold), (length(traits)+1)] <- r + results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + # Extract GEBVs + # Check if Fixed_train is not NULL and include beta if it is + if (!is.null(Fixed_train) && !is.null(trait_answer$beta)) { + # Calculate GEBVs including fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u + Fixed_test %*% trait_answer$beta + } else { + # Calculate GEBVs without fixed effects + GEBVs_fold[, trait_idx] <- m_valid %*% trait_answer$u #Confirm it is accurate to calculate the GEBVs for testing group from the trained model + } + + # Calculate heritability for the current trait + Vu <- trait_answer$Vu + Ve <- trait_answer$Ve + heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + + } + #Add iter and fold 
information for each trait/result + heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + print("check13") + #Add sample, iteration, and fold information to GEBVs_fold + GEBVs_fold[,"Iter"] = r + GEBVs_fold[,"Fold"] = fold + GEBVs_fold[,"Sample"] <- test + + # Store GEBVs for this fold + GEBVs_cycle[[fold]] <- GEBVs_fold + + } + + # Store GEBVs for this cycle + GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + + } + + # Combine all GEBVs into a single DataFrame + GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + + results <- as.data.frame(results) + heritability_scores <- as.data.frame(heritability_scores) + + # Combine results and heritability_scores using cbind + combined_results <- cbind(results, heritability_scores) + + return(list(GEBVs = GEBVs_df, PredictionAccuracy = results, CombinedResults = combined_results)) + } + + # Example call to the function + #This is slow when using 3k markers and 1.2k samples...will need to parallelize if using this script... + results <- genomic_prediction(geno_adj, pheno, traits = traits, fixed_effects = fixed_traits, Iters = input$pred_cv, cores = cores) + + #With fixed effects (need to inforporate the ability for fixed effects into the prediction?) 
+ #results <- genomic_prediction(geno_matrix, phenotype_df, c("height", "weight"), "~ age + sex") + + #Save to reactive value + pred_outputs$corr_output <- results$PredictionAccuracy + pred_outputs$all_GEBVs <- results$GEBVs + + # Convert trait columns to numeric + results$GEBVs <- results$GEBVs %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) + + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- results$GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) + + pred_outputs$avg_GEBVs <- average_gebvs_df + + columns <- setdiff(colnames(results$PredictionAccuracy), c("Iter","Fold")) + average_accuracy_df <- results$PredictionAccuracy %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) + + + pred_outputs$comb_output <- average_accuracy_df + + }else{ + #Note: should wrap the GBLUP into a function too + # Define variables + traits <- traits + cycles <- input$pred_cv + Folds <- 5 + total_population <- ncol(pred_inputs$geno_input) + #train_size <- floor(percentage / 100 * total_population) + fixed_traits <- fixed_traits + cores <- as.numeric(cores) + #Cross validation number for progress bar (not involved in the calculations, just shiny visuals) + pb_value = 10 + + if (advanced_options$pred_matrix == "Gmatrix") { + #Convert normalized genotypes to relationship matrix + #By default, it removes SNPs with more than 50% missing data and imputes using the mean + Geno.mat <- A.mat(t(pred_inputs$geno_input)) + print("check14") + }else if (advanced_options$pred_matrix == "Amatrix") { + + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 + ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "Sire", "Dam") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + #Ensure Sire and Dam are also listed as individuals + missing_parents <- unique(c(ped$Sire, ped$Dam)) + # Filter out parents already listed as individuals and non-zero values + missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] + # Create new rows for missing parents and setting their parents to 0 (unknown) + new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) + # Combine the original dataframe with the new rows and remove duplicates + ped_extended <- unique(rbind(ped, new_rows)) + + #Converting to Amatrix + #Using the default additive relationship options (Amatrix only works for even numbered ploidy) + Geno.mat <- Amatrix(data = ped_extended, ploidy = ploidy) + + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) + pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) + valid_ids <- intersect(pheno_ids, rownames(Geno.mat)) + pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] + Geno.mat <- Geno.mat[valid_ids, valid_ids] + + #Update variable + total_population <- ncol(Geno.mat) + print("check15") + }else if (advanced_options$pred_matrix == "Hmatrix") { + print("check16") + #Import pedigree file, where pedigree data name (3-column way format). 
Unknown value should be equal 0 + ped <- read.csv(advanced_options$ped_file$datapath, header = TRUE, check.names = FALSE, colClasses = "factor") + colnames(ped) <- c("Ind", "Sire", "Dam") + #Convert NAs to 0 + ped[is.na(ped)] <- 0 + #Ensure Sire and Dam are also listed as individuals + missing_parents <- unique(c(ped$Sire, ped$Dam)) + # Filter out parents already listed as individuals and non-zero values + missing_parents <- missing_parents[!missing_parents %in% ped$Ind & missing_parents != 0] + # Create new rows for missing parents and setting their parents to 0 (unknown) + new_rows <- data.frame(Ind = missing_parents, Sire = 0, Dam = 0) + # Combine the original dataframe with the new rows and remove duplicates + ped_extended <- unique(rbind(ped, new_rows)) + + #Converting to Amatrix + #Using the default additive relationship options (Amatrix only works for even numbered ploidy) + Ped.mat <- Amatrix(data = ped_extended, ploidy = ploidy) + + #Filter and order the ped file based on the phenotype file (make sure this is valid to subset after generating) + pheno_ids <- as.character(rownames(pred_inputs$pheno_input)) + valid_ids <- intersect(pheno_ids, rownames(Ped.mat)) + pred_inputs$pheno_input <- pred_inputs$pheno_input[valid_ids, ] + Ped.mat <- Ped.mat[valid_ids, valid_ids] + + #Update variable + total_population <- ncol(Ped.mat) + + #Using Gmatrix to get the Gmatrix instead of A.mat for consistency + #Should I be using the raw dosage values or is it okay to use the scaled genotype data that is used for A.mat()? 
+ G.mat <- Gmatrix(t(pred_inputs$geno_input[ ,valid_ids]), method = "VanRaden", ploidy = as.numeric(ploidy), missingValue = "NA") + G.mat <- round(G.mat,3) #to be easy to invert + + #Computing H matrix (Martini) - Using the name Geno.mat for consistency + Geno.mat <- Hmatrix(A=Ped.mat, G=G.mat, method="Martini", + ploidy= ploidy, + maf=0.05) + #Clean memory + rm(G.mat) + rm(Ped.mat) + rm(ped_filtered) + } + # Establish accuracy results matrix + results <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(results) <- c(paste0(traits), "Iter", "Fold") # Set the column names to be the traits + + # Initialize a list to store GEBVs for all traits and cycles + GEBVs <- list() + + #Establish heritability_scores_df () Maybe get h2 values + # Establish results matrix + heritability_scores <- matrix(nrow = cycles*Folds, ncol = length(traits) + 2) + colnames(heritability_scores) <- c(paste0(traits,"_h2"), "Iter", "Fold") # Set the column names to be the traits + + + # For loop + for (r in 1:cycles) { + set.seed(r) + fold_ids <- sample(rep(1:Folds, length.out = total_population)) + fold_df <- data.frame(Sample = row.names(Geno.mat), FoldID = fold_ids) #Randomly assign each sample to a fold + fold_results <- matrix(nrow = Folds, ncol = length(traits)) + colnames(fold_results) <- traits + + #Initialize GEBV object for each cycle + GEBVs_cycle <-list() + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = as.numeric(pb_value), title = paste0("Performing iteration:", r, "of", cycles)) + + for (fold in 1:Folds) { + + #Status bar length + pb_value = pb_value + (70 / as.numeric(cycles*Folds)) + + #Subset training and testing samples + train <- fold_df %>% + dplyr::filter(FoldID != fold) %>% + pull(Sample) + test <- setdiff(row.names(Geno.mat),train) + + Fixed_train = NULL + + # Initialize a matrix to store GEBVs for this fold + GEBVs_fold <- matrix(nrow = length(test), ncol = length(traits)+3) + colnames(GEBVs_fold) <- 
c(traits,"Sample","Iter","Fold") + rownames(GEBVs_fold) <- paste("Iter", r,"Fold",fold,"Ind", test, sep="_") + + #Evaluate each trait using the same train and testing samples for each + for (trait_idx in 1:length(traits)) { + #Mask phenotypes in testing group + Pheno_test <- pred_inputs$pheno_input + Pheno_test[test, traits[trait_idx]] <- NA + #Kin.blup + traitpred <- kin.blup(data = Pheno_test, geno = names(pred_inputs$pheno_input)[1], pheno = traits[trait_idx], fixed = fixed_cat, covariate = fixed_cov, K=Geno.mat) + #Cor between test values and predicted breeding values + results[(((r-1)*5)+fold), trait_idx] <- cor(pred_inputs$pheno_input[test, traits[trait_idx]], traitpred$g[test], use = "complete.obs") + results[(((r-1)*5)+fold), (length(traits)+1)] <- r + results[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + # Extract GEBVs + GEBVs_fold[, trait_idx] <- traitpred$g[test] #Confirm it is accuract to calculate the GEBVs for testing group from the trained model + + + # Calculate heritability (these are wrong) + Vu <- traitpred$Vg + Ve <- traitpred$Ve + heritability_scores[(((r-1)*5)+fold), trait_idx] <- Vu / (Vu + Ve) + + } + #Add iter and fold information for each trait/result + heritability_scores[(((r-1)*5)+fold), (length(traits)+1)] <- r + heritability_scores[(((r-1)*5)+fold), (length(traits)+2)] <- fold + + #Add sample, iteration, and fold information to GEBVs_fold + GEBVs_fold[,"Iter"] = r + GEBVs_fold[,"Fold"] = fold + GEBVs_fold[,"Sample"] <- test + + # Store GEBVs for this fold + GEBVs_cycle[[fold]] <- GEBVs_fold + + } + + # Store GEBVs for this cycle + GEBVs[[r]] <- do.call(rbind, GEBVs_cycle) + + } + + # Combine all GEBVs into a single DataFrame + GEBVs_df <- as.data.frame(do.call(rbind, GEBVs)) + + results <- as.data.frame(results) + heritability_scores <- as.data.frame(heritability_scores) + + # Combine results and heritability_scores using cbind + combined_results <- cbind(results, heritability_scores) + + #Save to reactive value + 
pred_outputs$corr_output <- results + pred_outputs$all_GEBVs <- results$GEBVs_df + + # Convert trait columns to numeric + GEBVs <- GEBVs_df %>% + mutate(across(all_of(traits), ~ as.numeric(.x))) + + # Calculate the average value for each column in the traits list for each SampleID, ignoring Iter and Fold + average_gebvs_df <- GEBVs %>% + group_by(Sample) %>% + summarize(across(all_of(traits), mean, na.rm = TRUE)) + + pred_outputs$avg_GEBVs <- average_gebvs_df + + columns <- setdiff(colnames(results), c("Iter","Fold")) + average_accuracy_df <- results %>% + group_by(Iter) %>% + summarize(across(all_of(columns), mean, na.rm = TRUE)) + + + pred_outputs$comb_output <- average_accuracy_df + } - - dev.off() - } - - ) - - output$download_vcf <- downloadHandler( - filename = function() { - paste0("BIGapp_VCF_Example_file.vcf.gz") - }, - content = function(file) { - ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") - file.copy(ex, file) + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 90, title = "Generating Results") + + ##Figures and Tables + + #Status + updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") + + #End the event + continue_prediction(NULL) }) - - output$download_pheno <- downloadHandler( - filename = function() { - paste0("BIGapp_passport_Example_file.csv") - }, - content = function(file) { - ex <- system.file("iris_passport_file.csv", package = "BIGapp") - file.copy(ex, file) + + plots <- reactive({ + validate( + need(!is.null(pred_outputs$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + + df <- pred_outputs$corr_output + df <- df %>% dplyr::select(-Fold, -Iter) + + #Probably want to add the ability for the user to select which trait(s) to display here + + #Convert to long format for ggplot + df_long <- pivot_longer( + df, + cols = colnames(df), # Exclude the Cycle column from transformation + 
names_to = "Trait", # New column for trait names + values_to = "Correlation" # New column for correlation values + ) + + #This can be adapted if we start comparing more than one GP model + #Also consider a violin plot to show each cor value + #plot <- ggplot(df_long, aes(x = factor(Trait), y = Correlation, fill = "red"), fill = "red") + + plot <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red"), fill = "red") + + #geom_boxplot(position = position_dodge(width = 0.8), color = "black", width = 0.7, outlier.size = 0.2) + + geom_boxplot() + + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", + y = "Pearson Correlation") + + #theme_minimal() + # Using a minimal theme + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + plot_violin <- ggplot(df_long, aes(x = "rrBLUP", y = Correlation, fill = "red")) + + geom_violin(trim = TRUE) + # Add violin plot + geom_point(position = position_jitter(width = 0.1), color = "black", size = 1.5) + # Add jittered points + facet_wrap(~ Trait, nrow = 1) + # Facet by trait, allowing different y-scales + labs(title = "Predictive Ability by Trait", + x = " ", # x-label is blank because it's not relevant per facet + y = "Pearson Correlation") + + theme(legend.position = "none", + strip.text = element_text(size = 12), + axis.text = element_text(size = 12), + axis.title = element_text(size = 14), + axis.text.x = element_text(angle = 90, hjust = 0.95, vjust = 0.2), + strip.text.x = element_text(face = "bold"), + axis.text.x.bottom = element_blank(), + axis.ticks.x.bottom = element_blank()) + + list(plot, plot_violin) + }) + + #Output the genomic 
prediction correlation box plots + output$pred_box_plot <- renderPlot({ + plots()[[1]] + scale_fill_manual(values = pred_outputs$colors) }) + + #Output the genomic prediction correlation box plots + output$pred_violin_plot <- renderPlot({ + plots()[[2]] + scale_fill_manual(values = pred_outputs$colors) + }) + + #Output the prediction tables + + all_GEBVs <- reactive({ + validate( + need(!is.null(pred_outputs$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$comb_output + }) + + output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + comb_output <- reactive({ + validate( + need(!is.null(pred_outputs$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$comb_output + }) + + output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + avg_GEBVs <- reactive({ + validate( + need(!is.null(pred_outputs$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + ) + pred_outputs$avg_GEBVs + }) + + output$pred_gebvs_table <- renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) + + #Download files for GP + output$download_pred_file <- downloadHandler( + filename = function() { + paste0("GS-results-", Sys.Date(), ".zip") + }, + content = function(file) { + # Temporary files list + temp_dir <- tempdir() + temp_files <- c() + + if (!is.null(pred_outputs$avg_GEBVs)) { + # Create a temporary file for assignments + gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) + write.csv(pred_outputs$avg_GEBVs, gebv_file, row.names = FALSE) + temp_files <- c(temp_files, gebv_file) + } + + if (!is.null(pred_outputs$comb_output)) { + # Create a temporary file for BIC data frame + acc_file 
<- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) + write.csv(pred_outputs$comb_output, acc_file, row.names = FALSE) + temp_files <- c(temp_files, acc_file) + } + + # Zip files only if there's something to zip + if (length(temp_files) > 0) { + zip(file, files = temp_files, extras = "-j") # Using -j to junk paths + } + + # Optionally clean up + file.remove(temp_files) + } + ) + + #Download GP Figures + output$download_pred_figure <- downloadHandler( + + filename = function() { + if (input$pred_image_type == "jpeg") { + paste("GS-", Sys.Date(), ".jpg", sep="") + } else if (input$pred_image_type == "png") { + paste("GS-", Sys.Date(), ".png", sep="") + } else { + paste("GS-", Sys.Date(), ".tiff", sep="") + } + }, + content = function(file) { + #req(all_plots$pca_2d, all_plots$pca3d, all_plots$scree, input$pca_image_type, input$pca_image_res, input$pca_image_width, input$pca_image_height) #Get the plots + req(input$pred_figures) + + if (input$pred_image_type == "jpeg") { + jpeg(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } else if (input$pred_image_type == "png") { + png(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } else { + tiff(file, width = as.numeric(input$pred_image_width), height = as.numeric(input$pred_image_height), res= as.numeric(input$pred_image_res), units = "in") + } + + # Conditional plotting based on input selection + if (input$pred_figures == "Violin Plot") { + req(pred_outputs$violin_plot) + + print(pred_outputs$violin_plot + scale_fill_manual(values = pred_outputs$colors)) + + } else if (input$pred_figures == "Box Plot") { + req(pred_outputs$box_plot) + #Plot + print(pred_outputs$box_plot + scale_fill_manual(values = pred_outputs$colors)) + + } + + dev.off() + } + + ) + + output$download_vcf <- downloadHandler( + 
filename = function() { + paste0("BIGapp_VCF_Example_file.vcf.gz") + }, + content = function(file) { + ex <- system.file("iris_DArT_VCF.vcf.gz", package = "BIGapp") + file.copy(ex, file) + }) + + output$download_pheno <- downloadHandler( + filename = function() { + paste0("BIGapp_passport_Example_file.csv") + }, + content = function(file) { + ex <- system.file("iris_passport_file.csv", package = "BIGapp") + file.copy(ex, file) + }) + }) } ## To be copied in the UI From 7037ea1f54389e6bfcafae4b5e50987d9aaca4c3 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Mon, 16 Sep 2024 12:34:30 -0400 Subject: [PATCH 32/40] adapt --- R/app_server.R | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/R/app_server.R b/R/app_server.R index 1bf6ac2..c74809a 100644 --- a/R/app_server.R +++ b/R/app_server.R @@ -35,9 +35,9 @@ app_server <- function(input, output, session) { callModule(mod_GS_server, "GS_1", parent_session = session) - #callModule(mod_GSAcc_server, - # "GSAcc_1", - # parent_session = session) + callModule(mod_GSAcc_server, + "GSAcc_1", + parent_session = session) callModule(mod_slurm_server, "slurm_1", parent_session = session) @@ -50,9 +50,9 @@ app_server <- function(input, output, session) { # mod_gwas_server("gwas_1") # mod_diversity_server("diversity_1") # mod_GS_server("GS_1") - mod_GSAcc_server("GSAcc_1") + # mod_GSAcc_server("GSAcc_1") # mod_slurm_server("slurm_1") - + #Session info popup observeEvent(input$session_info_button, { showModal(modalDialog( @@ -68,7 +68,7 @@ app_server <- function(input, output, session) { ) )) }) - + #Download Session Info output$download_session_info <- downloadHandler( filename = function() { @@ -78,5 +78,5 @@ app_server <- function(input, output, session) { writeLines(paste(capture.output(sessionInfo()), collapse = "\n"), file) } ) - + } From 036ddf193e623993b82653e8cf87500c4ed481cd Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Mon, 16 Sep 2024 16:00:48 -0400 Subject: [PATCH 33/40] 
bugfix --- R/mod_GSAcc.R | 6 +++--- tests/testthat/test-GSAcc.R | 20 +++++++++++--------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 3a302ea..8321d1d 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -462,7 +462,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ if (length(common_ped) < length(ids_pheno)){ shinyalert( title = "Data Mismatch", - text = paste0((length(ids_pheno)-length(common_ped))," samples were removed from the phenotype data for not having pedigree information"), + text = paste0((length(pheno[,1])-length(common_ped))," samples were removed from the phenotype data for not having pedigree information"), size = "xs", closeOnEsc = FALSE, closeOnClickOutside = FALSE, @@ -477,8 +477,8 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ imageUrl = "", animation = TRUE ) - pheno <- pheno[-which(!pheno$Sample_ID %in% extended_ped$Ind),] - if(!is.null(geno_adj)) geno_adj <- geno_adj[,-which(!colnames(geno_adj) %in% extended_ped$Ind)] + if(length(which(!pheno[,1] %in% extended_ped$Ind)) > 0) pheno <- pheno[-which(!pheno[,1] %in% extended_ped$Ind),] + if(!is.null(geno_adj) & length(which(!colnames(geno_adj) %in% extended_ped$Ind)) > 0) geno_adj <- geno_adj[,-which(!colnames(geno_adj) %in% extended_ped$Ind)] } if (length(ped$Ind) > length(extended_ped$Ind)) shinyalert( diff --git a/tests/testthat/test-GSAcc.R b/tests/testthat/test-GSAcc.R index 924f297..97051a1 100644 --- a/tests/testthat/test-GSAcc.R +++ b/tests/testthat/test-GSAcc.R @@ -496,13 +496,15 @@ test_that("test Predictive Ability iris",{ # # Inputs # input <- list() # -# input$trait_file$datapath <- "BIG_pheno2.csv" +# #input$trait_file$datapath <- "BIG_pheno2.csv" +# input$trait_file$datapath <- "BIG_phenos.csv" # input$pred_file$datapath <- "BIG_genos.vcf" -# input$ped_file$datapath <- "sealice_ped.csv" +# #input$ped_file$datapath <- "sealice_ped.csv" +# input$ped_file$datapath <- "BIG_ped.csv" # 
# input$pred_color_select <- "red" # input$pred_ploidy <- 2 -# input$pred_trait_info <- "Pheno3" +# input$pred_trait_info <- "licedensity" # input$pred_cv <- 5 # input$pred_fixed_info <- NULL # input$pred_fixed_cat <- NULL @@ -599,10 +601,10 @@ test_that("test Predictive Ability iris",{ # gen <- rm_unr[[2]] # cat(paste0("You have pedigree information until the ", gen,"th generation\n")) # -# if (length(common_ped) < length(ids_pheno)){ -# warning(paste0((length(ids_pheno)-length(common_ped))," samples were removed from the phenotype data for not having pedigree information")) -# pheno <- pheno[-which(!pheno$Sample_ID %in% extended_ped$Ind),] -# geno_adj <- geno_adj[,-which(!colnames(geno_adj) %in% extended_ped$Ind)] +# if (length(common_ped) < length(pheno[,1])){ +# warning(paste0((length(pheno[,1])-length(common_ped))," samples were removed from the phenotype data for not having pedigree information")) +# if(length(which(!pheno[,1] %in% extended_ped$Ind)) > 0) pheno <- pheno[-which(!pheno[,1] %in% extended_ped$Ind),] +# if(length(which(!colnames(geno_adj) %in% extended_ped$Ind)) > 0) geno_adj <- geno_adj[,-which(!colnames(geno_adj) %in% extended_ped$Ind)] # } # if (length(ped$Ind) > length(extended_ped$Ind)) # warning(paste0((length(ped$Ind)-length(extended_ped$Ind))," samples in the pedigree file were unrelated to the samples with phenotype information. 
They were removed from the analysis.")) # samples not removed @@ -670,8 +672,8 @@ test_that("test Predictive Ability iris",{ # # # Checks # expect_equal(mean(pred_outputs_rrBLUP$corr_output[,1]), 0.1848, tolerance = 0.01) -# expect_equal(mean(pred_outputs_rrBLUP$comb_output[,2]$Pheno3), 0.1848, tolerance = 0.01) -# expect_equal(sum(pred_outputs_rrBLUP$avg_GEBVs[,2]$Pheno3), 19.378, tolerance = 0.01) +# expect_equal(mean(pred_outputs_rrBLUP$comb_output[,2]$licedensity), 0.1848, tolerance = 0.01) +# expect_equal(sum(pred_outputs_rrBLUP$avg_GEBVs[,2]$licedensity), 19.378, tolerance = 0.01) # expect_equal(sum(as.numeric(pred_outputs_rrBLUP$all_GEBVs[,1])), 96.89, tolerance = 0.1) # ######### # From a436c298448b41fb60019da6c2364ce32ef3d9ce Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Mon, 16 Sep 2024 16:11:32 -0400 Subject: [PATCH 34/40] rm GEBV from CV --- R/mod_GSAcc.R | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 8321d1d..7e12183 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -80,9 +80,7 @@ mod_GSAcc_ui <- function(id){ bs4Dash::tabsetPanel( tabPanel("Violin Plot", plotOutput(ns("pred_violin_plot"), height = "500px")), tabPanel("Box Plot", plotOutput(ns("pred_box_plot"), height = "500px")), - tabPanel("Accuracy Table", DTOutput(ns("pred_acc_table")), style = "overflow-y: auto; height: 500px"), - tabPanel("GEBVs Table", DTOutput(ns("pred_gebvs_table")),style = "overflow-y: auto; height: 500px") - + tabPanel("Accuracy Table", DTOutput(ns("pred_acc_table")), style = "overflow-y: auto; height: 500px") ) ) @@ -694,15 +692,6 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - avg_GEBVs <- reactive({ - validate( - need(!is.null(pred_outputs()$avg_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this 
session.") - ) - pred_outputs()$avg_GEBVs - }) - - output$pred_gebvs_table <- renderDT({avg_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) - #Download files for GP output$download_pred_file <- downloadHandler( filename = function() { @@ -713,13 +702,6 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ temp_dir <- tempdir() temp_files <- c() - if (!is.null(pred_outputs()$avg_GEBVs)) { - # Create a temporary file for assignments - gebv_file <- file.path(temp_dir, paste0("GEBVs-", Sys.Date(), ".csv")) - write.csv(pred_outputs()$avg_GEBVs, gebv_file, row.names = FALSE) - temp_files <- c(temp_files, gebv_file) - } - if (!is.null(pred_outputs()$comb_output)) { # Create a temporary file for BIC data frame acc_file <- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) From 0da389d455c8c184aeff35d35b5ffa8bcce336e1 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Mon, 16 Sep 2024 16:56:34 -0400 Subject: [PATCH 35/40] add message back --- R/mod_GSAcc.R | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/R/mod_GSAcc.R b/R/mod_GSAcc.R index 7e12183..788f425 100644 --- a/R/mod_GSAcc.R +++ b/R/mod_GSAcc.R @@ -142,6 +142,11 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ ped_file = NULL ) + pred_outputs <- reactiveValues(corr_output = NULL, + comb_output = NULL, + all_GEBVs = NULL, + avg_GEBVs = NULL) + #List the ped file name if previously uploaded output$uploaded_file_name <- renderText({ if (!is.null(advanced_options$ped_file)) { @@ -552,7 +557,7 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ pred_inputs }) - pred_outputs <- eventReactive(pred_inputs(), { + observeEvent(pred_inputs(),{ # Convert genotype matrix according to ploidy and model used if(!is.null(pred_inputs()$geno_input)){ @@ -578,7 +583,6 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ updateProgressBar(session = 
session, id = "pb_prediction", value = 90, title = "Cross validation concluded") #Save to reactive value - pred_outputs <- list(corr_output = NULL, comb_output = NULL, all_GEBVs = NULL, avg_GEBVs = NULL) pred_outputs$corr_output <- results$PredictionAccuracy pred_outputs$all_GEBVs <- results$GEBVs @@ -602,15 +606,16 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ #Status updateProgressBar(session = session, id = "pb_prediction", value = 100, title = "Finished!") - pred_outputs + #pred_outputs }) plots <- reactive({ + validate( - need(!is.null(pred_outputs()$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + need(!is.null(pred_outputs$corr_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") ) - df <- pred_outputs()$corr_output + df <- pred_outputs$corr_output df <- df %>% dplyr::select(-Fold, -Iter) #Probably want to add the ability for the user to select which trait(s) to display here @@ -676,18 +681,18 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ all_GEBVs <- reactive({ validate( - need(!is.null(pred_outputs()$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + need(!is.null(pred_outputs$all_GEBVs), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") ) - pred_outputs()$comb_output + pred_outputs$comb_output }) output$pred_all_table <- renderDT({all_GEBVs()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) comb_output <- reactive({ validate( - need(!is.null(pred_outputs()$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") + need(!is.null(pred_outputs$comb_output), "Upload the input files, set the parameters and click 'run analysis' to access results in this session.") ) - 
pred_outputs()$comb_output + pred_outputs$comb_output }) output$pred_acc_table <- renderDT({comb_output()}, options = list(scrollX = TRUE,autoWidth = FALSE, pageLength = 5)) @@ -702,10 +707,10 @@ mod_GSAcc_server <- function(input, output, session, parent_session){ temp_dir <- tempdir() temp_files <- c() - if (!is.null(pred_outputs()$comb_output)) { + if (!is.null(pred_outputs$comb_output)) { # Create a temporary file for BIC data frame acc_file <- file.path(temp_dir, paste0("GS-accuracy-statistics-", Sys.Date(), ".csv")) - write.csv(pred_outputs()$comb_output, acc_file, row.names = FALSE) + write.csv(pred_outputs$comb_output, acc_file, row.names = FALSE) temp_files <- c(temp_files, acc_file) } From 3885c20f6e35762864d62a88475771ba6f31df04 Mon Sep 17 00:00:00 2001 From: Alex Sandercock Date: Mon, 16 Sep 2024 17:39:19 -0400 Subject: [PATCH 36/40] Added option to subset dosage calling samples --- R/mod_DosageCall.R | 96 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 96 insertions(+) diff --git a/R/mod_DosageCall.R b/R/mod_DosageCall.R index 79b2a7b..f6b8b3f 100644 --- a/R/mod_DosageCall.R +++ b/R/mod_DosageCall.R @@ -20,6 +20,30 @@ mod_DosageCall_ui <- function(id){ box( title = "Inputs", status = "info", solidHeader = TRUE, collapsible = FALSE, collapsed = FALSE, fileInput(ns("madc_file"), "Choose MADC or VCF File", accept = c(".csv",".vcf",".gz")), + fileInput(ns("madc_passport"), "Choose Passport File (optional)", accept = c(".csv")), + conditionalPanel( + condition = "output.passportTablePopulated", + ns = ns, + tags$div( + style = "padding-left: 20px;", # Add padding/indentation + virtualSelectInput( + inputId = ns("cat_madc"), + label = "Select Category Subset:", + choices = NULL, + showValueAsTags = TRUE, + search = TRUE, + multiple = FALSE + ), + virtualSelectInput( + inputId = ns("item_madc"), + label = "Select Subset Values:", + choices = NULL, + showValueAsTags = TRUE, + search = TRUE, + multiple = TRUE + ) + ) + ), 
textInput(ns("output_name"), "Output File Name"), numericInput(ns("ploidy"), "Species Ploidy", min = 1, value = NULL), selectInput(ns("updog_model"), "Updog Model", choices = c("norm","hw","bb","s1","s1pp","f1","f1pp","flex","uniform"), selected = "norm"), @@ -60,6 +84,7 @@ mod_DosageCall_ui <- function(id){ #' @import updog #' @importFrom BIGr updog2vcf #' @importFrom shinyjs enable disable +#' @import dplyr #' #' @noRd mod_DosageCall_server <- function(input, output, session, parent_session){ @@ -102,6 +127,38 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ # expand specific box updateBox(id = "Updog_Dosage_Calling_box", action = "toggle", session = parent_session) }) + + # Update dropdown menu choices based on uploaded passport file + passport_table <- reactive({ + validate( + need(!is.null(input$madc_passport), "Upload passport file to access results in this section."), + ) + info_df <- read.csv(input$madc_passport$datapath, header = TRUE, check.names = FALSE) + info_df[,1] <- as.character(info_df[,1]) #Makes sure that the sample names are characters instead of numeric + + updateVirtualSelect("cat_madc", choices = colnames(info_df), session = session) + info_df + }) + + # Server logic to check if passport_table() has data + output$passportTablePopulated <- reactive({ + !is.null(passport_table()) && nrow(passport_table()) > 0 # Check if the table has rows + }) + outputOptions(output, "passportTablePopulated", suspendWhenHidden = FALSE) + + #MADC specific category selection + observeEvent(input$cat_madc, { + + # Get selected column name + selected_col <- input$cat_madc + + # Extract unique values from the selected column + unique_values <- unique(passport_table()[[selected_col]]) + + #Add category selection + updateVirtualSelect("item_madc", choices = unique_values, session = session) + + }) snp_number <- reactiveVal(0) @@ -208,6 +265,45 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ 
stop(safeError("Error: DP and RA/AD FORMAT flags not found in VCF file")) } } + + #Subset samples from the matrices if the user selected items in the passport file + if (!is.null(input$item_madc) && length(input$item_madc) > 0){ + + #First getting the samples that are both in the passport and the MADC/VCF file + #**Assuming the first column of the passport table is the sample IDs + shared_samples <- intersect(passport_table()[[1]], colnames(matrices$ref_matrix)) + + # Filter the passport dataframe + filtered_shared_samples <- passport_table() %>% + filter(passport_table()[[1]] %in% shared_samples, + passport_table()[[input$cat_madc]] %in% input$item_madc) %>% + pull(1) + + #Give warning if no samples were subset + if (length(filtered_shared_samples) < 1) { + shinyalert( + title = "Data Warning!", + text = "No samples remain after subsetting options", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + + return() + } + + #Subset the matrices + matrices$ref_matrix <- matrices$ref_matrix[, filtered_shared_samples] + matrices$size_matrix <- matrices$size_matrix[, filtered_shared_samples] + + } #Run Updog #I initially used the "norm" model From 4750636b2b74296ae45237476c2bbced5fe4197e Mon Sep 17 00:00:00 2001 From: Alex Sandercock Date: Tue, 17 Sep 2024 08:42:33 -0400 Subject: [PATCH 37/40] Added MADC data check --- R/utils.R | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/R/utils.R b/R/utils.R index 14b6fb1..1ad1753 100644 --- a/R/utils.R +++ b/R/utils.R @@ -40,18 +40,32 @@ get_matrices <- function(result_df) { } #Ensure that each has the same SNPs and that they are in the same order - identical(alt_df$CloneID,ref_df$CloneID) - + same <- identical(alt_df$CloneID,ref_df$CloneID) + ###Convert the ref and alt counts into matrices with the 
CloneID as the index #Set SNP names as index row.names(ref_df) <- ref_df$CloneID row.names(alt_df) <- alt_df$CloneID - + + #Retain only the rows in common if they are not identical and provide warning + if (same == FALSE) { + warning("Mismatch between Ref and Alt Markers. MADC likely altered. Markers without a Ref or Alt match removed.") + # Find the common CloneIDs between the two dataframes + common_ids <- intersect(rownames(ref_df), rownames(alt_df)) + # Subset both dataframes to retain only the common rows + ref_df <- ref_df[common_ids, ] + alt_df <- alt_df[common_ids, ] + } + #Remove unwanted columns and convert to matrix - #Probably best to just remove the column names that aren't wanted instead of the first 16 columns. + #Consider list columnnames to be removed instead of the first 16 columns.. ref_matrix <- as.matrix(ref_df[, -c(1:16)]) alt_matrix <- as.matrix(alt_df[, -c(1:16)]) - + + #Convert elements to numeric + class(ref_matrix) <- "numeric" + class(alt_matrix) <- "numeric" + #Make the size matrix by combining the two matrices size_matrix <- (ref_matrix + alt_matrix) From 963186ee2ff7b276bcfc092c83ddeb4f0bd9b943 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 17 Sep 2024 13:42:28 -0400 Subject: [PATCH 38/40] fix #53 --- R/mod_DosageCall.R | 97 +++++++++++++++++++++----- R/mod_dosage2vcf.R | 9 ++- inst/vcf_example_out.vcf.gz | Bin 5868 -> 0 bytes tests/testthat/test-DosageCall.R | 115 +++++++++++++++++++++++++++++-- 4 files changed, 196 insertions(+), 25 deletions(-) delete mode 100644 inst/vcf_example_out.vcf.gz diff --git a/R/mod_DosageCall.R b/R/mod_DosageCall.R index f6b8b3f..64f6674 100644 --- a/R/mod_DosageCall.R +++ b/R/mod_DosageCall.R @@ -47,6 +47,35 @@ mod_DosageCall_ui <- function(id){ textInput(ns("output_name"), "Output File Name"), numericInput(ns("ploidy"), "Species Ploidy", min = 1, value = NULL), selectInput(ns("updog_model"), "Updog Model", choices = c("norm","hw","bb","s1","s1pp","f1","f1pp","flex","uniform"), selected = 
"norm"), + conditionalPanel( + condition = "input.updog_model == 'f1' | input.updog_model == 'f1pp'", + ns = ns, + tags$div( + style = "padding-left: 20px;", # Add padding/indentation + textInput( + inputId = ns("parent1"), + label = "Enter parent1 ID:", + value = NULL + ), + textInput( + inputId = ns("parent2"), + label = "Enter parent2 ID:", + value = NULL + ) + ) + ), + conditionalPanel( + condition = "input.updog_model == 's1' | input.updog_model == 's1pp'", + ns = ns, + tags$div( + style = "padding-left: 20px;", # Add padding/indentation + textInput( + inputId = ns("parent"), + label = "Enter parent ID:", + value = NULL + ) + ) + ), numericInput(ns("cores"), "Number of CPU Cores", min = 1, max = (future::availableCores() - 1), value = 1), actionButton(ns("run_analysis"), "Run Analysis"), useShinyjs(), @@ -127,7 +156,7 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ # expand specific box updateBox(id = "Updog_Dosage_Calling_box", action = "toggle", session = parent_session) }) - + # Update dropdown menu choices based on uploaded passport file passport_table <- reactive({ validate( @@ -135,29 +164,29 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ ) info_df <- read.csv(input$madc_passport$datapath, header = TRUE, check.names = FALSE) info_df[,1] <- as.character(info_df[,1]) #Makes sure that the sample names are characters instead of numeric - + updateVirtualSelect("cat_madc", choices = colnames(info_df), session = session) info_df }) - + # Server logic to check if passport_table() has data output$passportTablePopulated <- reactive({ !is.null(passport_table()) && nrow(passport_table()) > 0 # Check if the table has rows }) outputOptions(output, "passportTablePopulated", suspendWhenHidden = FALSE) - + #MADC specific category selection observeEvent(input$cat_madc, { # Get selected column name selected_col <- input$cat_madc - + # Extract unique values from the selected column unique_values <- 
unique(passport_table()[[selected_col]]) - + #Add category selection updateVirtualSelect("item_madc", choices = unique_values, session = session) - + }) snp_number <- reactiveVal(0) @@ -175,6 +204,9 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ # Missing input with red border and alerts toggleClass(id = "ploidy", class = "borderred", condition = (is.na(input$ploidy) | is.null(input$ploidy))) toggleClass(id = "output_name", class = "borderred", condition = (is.na(input$output_name) | is.null(input$output_name) | input$output_name == "")) + toggleClass(id = "parent", class = "borderred", condition = ((input$updog_model == "s1" | input$updog_model == "s1pp") & (is.null(input$parent) | input$parent == ""))) + toggleClass(id = "parent1", class = "borderred", condition = ((input$updog_model == "f1" | input$updog_model == "f1pp") & (is.null(input$parent1) | input$parent1 == ""))) + toggleClass(id = "parent2", class = "borderred", condition = ((input$updog_model == "f1" | input$updog_model == "f1pp") & (is.null(input$parent2) | input$parent2 == ""))) if (is.null(input$madc_file$datapath)) { shinyalert( @@ -199,7 +231,6 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ output_name <- input$output_name ploidy <- input$ploidy cores <- input$cores - model_select <- input$updog_model # Status updateProgressBar(session = session, id = "pb_madc", value = 0, title = "Formatting Input Files") @@ -265,20 +296,20 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ stop(safeError("Error: DP and RA/AD FORMAT flags not found in VCF file")) } } - + #Subset samples from the matrices if the user selected items in the passport file if (!is.null(input$item_madc) && length(input$item_madc) > 0){ - + #First getting the samples that are both in the passport and the MADC/VCF file #**Assuming the first column of the passport table is the sample IDs shared_samples <- intersect(passport_table()[[1]], 
colnames(matrices$ref_matrix)) - + # Filter the passport dataframe filtered_shared_samples <- passport_table() %>% - filter(passport_table()[[1]] %in% shared_samples, + filter(passport_table()[[1]] %in% shared_samples, passport_table()[[input$cat_madc]] %in% input$item_madc) %>% pull(1) - + #Give warning if no samples were subset if (length(filtered_shared_samples) < 1) { shinyalert( @@ -295,25 +326,53 @@ mod_DosageCall_server <- function(input, output, session, parent_session){ showCancelButton = FALSE, animation = TRUE ) - + return() } - + #Subset the matrices matrices$ref_matrix <- matrices$ref_matrix[, filtered_shared_samples] matrices$size_matrix <- matrices$size_matrix[, filtered_shared_samples] - + + } + + # Select parents + if(input$updog_model == "s1" | input$updog_model == "s1pp"){ + parents <- c(input$parent, NULL) + } else if(input$updog_model == "f1" | input$updog_model == "f1pp"){ + parents <- c(input$parent1, input$parent2) + } else { + parents <- c(NULL, NULL) + } + + if (!all(parents %in% colnames(matrices$size_matrix))) { + shinyalert( + title = "Data Warning!", + text = "Parent(s) not found. 
Check the genotype input file parent(s) ID, make sure they match with the input parent(s) ID.", + size = "s", + closeOnEsc = TRUE, + closeOnClickOutside = FALSE, + html = TRUE, + type = "error", + showConfirmButton = TRUE, + confirmButtonText = "OK", + confirmButtonCol = "#004192", + showCancelButton = FALSE, + animation = TRUE + ) + + return() } #Run Updog - #I initially used the "norm" model #I am also taking the ploidy from the max value in the updateProgressBar(session = session, id = "pb_madc", value = 40, title = "Dosage Calling in Progress") - print('Performing Updog dosage calling') mout <- multidog(refmat = matrices$ref_matrix, sizemat = matrices$size_matrix, ploidy = as.numeric(ploidy), - model = model_select, + p1_id = parents[1], + p2_id = if(is.na(parents[2])) NULL else parents[2], + model = input$updog_model, nc = cores) #Status updateProgressBar(session = session, id = "pb_madc", value = 100, title = "Finished") diff --git a/R/mod_dosage2vcf.R b/R/mod_dosage2vcf.R index caa8434..645c7f8 100644 --- a/R/mod_dosage2vcf.R +++ b/R/mod_dosage2vcf.R @@ -125,6 +125,8 @@ mod_dosage2vcf_server <- function(input, output, session, parent_session){ } req(input$report_file, input$counts_file, input$d2v_output_name, input$dosage2vcf_ploidy) + updateProgressBar(session = session, id = "dosage2vcf_pb", value = 10, title = "Input files evaluated.") + dosage_file_df <- read.csv(input$report_file$datapath) snp_number <- length(dosage_file_df$X.[-c(1:7)]) @@ -134,6 +136,9 @@ mod_dosage2vcf_server <- function(input, output, session, parent_session){ }) enable("download_d2vcf") + + updateProgressBar(session = session, id = "dosage2vcf_pb", value = 100, title = "Click in Download to continue.") + }) output$download_dose <- downloadHandler( @@ -172,7 +177,7 @@ mod_dosage2vcf_server <- function(input, output, session, parent_session){ temp_base <- tempfile() #Status - updateProgressBar(session = session, id = "dosage2vcf_pb", value = 50, title = "Converting DArT files to 
VCF") + updateProgressBar(session = session, id = "dosage2vcf_pb", value = 10, title = "Converting DArT files to VCF") # Convert to VCF using the BIGr package cat("Running BIGr::dosage2vcf...\n") @@ -186,6 +191,8 @@ mod_dosage2vcf_server <- function(input, output, session, parent_session){ # The output file should be temp_base.vcf output_name <- paste0(temp_base, ".vcf") + updateProgressBar(session = session, id = "dosage2vcf_pb", value = 50, title = "Writting vcf.") + # Check if the VCF file was created if (file.exists(output_name)) { cat("VCF file created successfully.\n") diff --git a/inst/vcf_example_out.vcf.gz b/inst/vcf_example_out.vcf.gz deleted file mode 100644 index 1a254f3edda3679b209088243625d2c667ecc04c..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 5868 zcmVLLYd?MZ!^P$LhwGc4 z&o{^4J$wGI=BRk`^y%68?d|&V_%AO{j$f>=t~bAZUZ0(Rf4N?Kx4yZ(xV}1W7bol6 z4>uQ|H}P*@y<7jVzFA*=SkKQdFV~mr`Ni!#44eOSb^Ytr(fsVwdj9kIKd*0g|NQIu z?fjS9^~ZVm|IOamul^b)@chlYZ&zn&0%wcYzx@3D`sTP?oP~)Tzu(+kT>W@|0=rr6 z4_=+@1|OxtFRwQ1AJ;ecN3Y_GFFwpau0L-+-Od9!=9@6(J2SfqQ@z}*Z?4WayLsNu zFRtvi>zj}3oB8F%?PmRP|NSSYdy|Lnf44sWI6sLmpa0?d>hiaFSd{RxK%blNFI!T@ zS$znv4&VOA360+Fz2^Dl_4x*qX#Vz>^UI6PZ~MYO(re@y$2$AGdG4o&WkNe9?S$9H?sa{N^}N^v}j7 z=bt|Z@<#@nUu@>L7n?5(^!>$oS{s|~%h%7}2xL9G-ke`1y2jzp{{HUGx3kkX?`JPh zX7B#?e71UZHhcSz)vMX_m#@zL_HGs*KT8yuotCrHDqJ}!<4+xbn)uVkpFaK!@n^JK z3j4J?%QVOals2NYA!TR3HnOy#rH!p@Y-M9B8(Z1f%0^W-srqjW%ku`$xNfwEIW9f3*8Yd)JtjeY@_(lX5nMo8G?; z{PJS16RB_Tdq!)C#TC7Zu@ztB3>OAWwCo(yPHH?*z3~dx!PZ)L|-E{F8eUtsXDLMJQ`yZ07R*8%IDzI_-xFL`y%^|!!&bR3(SHu~I zWvcDLh0!8x`y4r&Wq5RBq$sN|vq1da2WlqP>JwiK+vznsau)G=V%oSSWtv`TcXw5e zX(!8<VgGd7j~QEA#qAs0-N!VxxI;}Afs={t0Oc8os9XewrzM)U%^gZZ=B3-B+Foud z+Z8)58+O~X8EqORm4P{#i$8I?W&K@NYjm8iL|J^$1kjfw1mbO zd>R<{2(S+7K?SSjw-0^_TL8pJKu_>#px0p|hbhxC?Bae)C{w)+SA=XHO?Yy8LTlQw z+(8aWGU8{K;UQ_gM%%-AC{jg=6gyrB-%w>X3wsEjw_MTxzY=b3UlxHs+Ixi;z%Lk=RtFj{6DiY-~J>L*euXH6kAiWiDc1M@S!inlk%sCXs_`{WQ5 
z@^c`OqU&0OIgp>ksVZ_-IW|O*p~3eAGDN4m&p(67M6B!4cVM%FL2cx6;IqKM2eUZ# zI5MXi!Vp~UFk{E(x_A)0fzTzh8K{&}z%TaJ5?O@Zs>4o|8y@JnK}juiX1Vk`EoEOF zbXA7w*ha&6oyi4kA7G%NxHl9}fu=68Xut{yQIyY_dnT~|i+WnnGV%puuWjP>w95yB z;$=>odL1GWs&#kaZ!=M^ws3kFV}^b)_5LFXd)4&Wlw;R3ga&pt;4Lurs2zz;Pc?S z5UEpqZy;g%?8?MncKa~Wqo^-dy%;QdAwdovLGH%B?=nrdD(|?yaj=8!Qx>N{lAURi zKFniF<2#0hvUExaR|#I{;1CKC&E*LL#`{rXri-{gacVJXFhI!+%$tDN*uZ?ov?bQVO#fkiS8_R7&L(z`LSF?- zpI})jFsYH}W?(lDOHyDQ81`Edd|jkp#}x_pu`q!FKQC@pl#kt%rR{U~Q0^*x- za1c8-d7u);HCd605*DVg1??ytC1g7l0|tdkS_sDRN z1%UpXO;!K#pvj?G?~Fr~nQ(iYJwp+}YNrb9XT{+dckMgZMp}H7ZAav}Q{3<>LlI>> zu}#!OGK|CVl6Va=BCa9?8LP%1NLlRXpms0^wGV2;rq|r{UU8qX;(S#Sco_cLZ#I?7 zNS|1dVoU>C{7N4*$+Fj+E6kKtlB2!ZgnE*inzQds0LU2)UZkP~P&EsKD{G~*42}o+ zAwv#}AR5suBogGuJDzDKhM{1#sJFA5mH@_fCJPr}|AgOn#PJ!=jA552k6)##$7pF2aw!+pg-ovae2uDr*HEmkK zvR+7u66w7u=88T}Gy?U9sw(6`$;rd@Xa~PYzK!v83OGNe>{3=FatOQzpB#K7O%Rj>Cu#BWs)QkJl2HM+J8l~PFqyQhAV4tuGgh^?{f}nL za{mCa4CzYZM*6sPZL-5-(~q*PN<$oVKpgiR9$}SW0xGME!|IzPbhBbNoZz)zq)nWP zYyoguA%=EXtYCktw@pt@(_nDO>H(1c@t%Lf85bcpZ+J(xQlR$YS$So8agolVoqGU9l zh6F8W#>?2nsQ6N}o)?O(r?ya3d<2}>kbDeeD}FY<1kBrsbC!8ffdwI$+3gPj!uo4H z)&i4W$UzGX_z{5Tjp8QccfaKDNj0m+AOj=fn6x#mDjZ_*Cq?lk_E)3|IV%~p;rISA zfUNlyxkU|K&OVjTMUFq;SnCYP4E4j4S~-uh77I9!&M#~^oMaX1uq&lsiE8;i3Q^fr zsW&D`XM#{3Ipw)D*6J7hXIDv?2TB5?3Yi!zWN{Jvi3o@?K2kOuNj+2A^c@;l(Zn#` z@!;GN3A?bqNlMZUXBU5b+JD*?K?KC(rX~JFrPgfGjXk~MjnTMQl5G>r8Pg+Ms##ra z;l`w|X#wDd+q00{4gRFF!@G&HE`ls69xgab0K=hC_y|^Z#hi9HLMss8@1@n+-;owd z3{s$EP=4qY$_mF3R3?{MP9F6BGl=YW%&^>81v(rm#YIz5J86m_7}mTB`3IOpQ}mRU zLcnT(@Yw@6A__m89NpzFN!2U^<*Qi}IKSRMwBaQewwkv=i7z9L)9P??$-{z@&;S-s z^CERBD7BV-4{N?F&&sg+^g!N{Glahlp1hH+rG~V&`YEDe| zVR4)lQRl2lzgj*6|5X)1C`k>35=vJer67xLVh)Zm`I+!>yJIo_=iwLJ6cD8r@!sZ2 z+{)h#DsLJyhYm{z@i!?}%kj0=sN4?2FhlOrJW-bcf0RYh+xfNSFli|Ui)l4Ss@7EE zP}iU67E>O4c*mZcGGT}qdE9Usiod5kXDQwl7cZB;UPrr%BU=7YJO`+fu`0wOh}oeO zL#r-zR_fp{#Bv8{w^YJ`3q?&+6Ax}$Es2M1EhUjeafos`uC%IA)<8pyI1Hu)XmaHl z?36lTAxSt?>8-+f6{l?+dc 
zLs_pwV@Z064mLjgv}X6RbNQ5Y(M6b20Gji52w`;1#$m=K1A{_ZIr|oVXuH2v6FDDP z*^*rAAnl+zt#Ck#ZC|zT$%JJi3Mq$la9mYRyc-?v+u??;T%RL1Ez3FZ>$`)id@{;| zH=;v%S4`D-t7Z9lZ0xWW#%D@XTvY}r^)!me)zUQGk#e?Jh#emy@t@ge&NhC(1FOO6 zNyKxcW`nd0VxeGPyO^9h!^yr(3gR6B%}lLWh|v=}10SMVPbqsM8RjU_h3DL}-d5}^ zLR8nv`10_+L9I?sKik-)wBlhih-kHV#S4<%lCJlRirvVVBQYv$iO;j;eNqbS|}Z;wDTR9 z3CB)anfCu;&TQo@6IR9HJ9#_T0dPu-9KX0uVb@;Ag_Ym`p~POK>T{v#VUFS0W+g@J z_}EF*cuDIEha3M+n%gN^()>gcf60X(Me_#ujE%i5=yJvsZaBfqU{FbjAJ6Cbh@D`F zOKaJAE{Z24r&5&G&ffAFR>H#tu5v`6`sEmRj6=d5bwC~WPiG;x1E;qtCy(WveSXKZ z7PRFjv9KVaa>Nu<8@~2(`mEhWGh9lMN3buMim5}x7CG6D&nZ)iy0!2LL%fU{&XQHN zt*KHns3=xEw^GyUcKT^3mUj7M{x()7r}Tl~o7P(%sMf)1nvKaz3^GXm90|B|WnrB? zs%GEk(ZI85l@nb_6QU`eQ%k3nT1TE-$(kWe8HXZ@c}r#md%>@ z)lz>XZKkASIa5!Rui$uyA3KJ<9loC3xib*(xf}{9y!|_JkrEzthQ3*5Y5C4o$r~#y z^8rdEud_#$#i#|I;_1t6d4947M!>jBkM|fnZq(w_PQWpEn)_6-n0PJ~>*InU>?B0T z?JHJauysX#2+Nd`1N^m)4fl?E-Ewgqty zuj@UF_pqE-*us5&=>xRQP`X1hE67`9o9P&a6)BMx%$!iXrb={?R4r+BCqxwC6AziT z!X80f2B2`)Bx00nET*K2@m8CA9ay*?M1d;`An?bLn}L%9r_pf0$U15I!r!_H Date: Tue, 17 Sep 2024 15:28:43 -0400 Subject: [PATCH 39/40] add help material updog #43 --- R/mod_DosageCall.R | 2 +- R/mod_help.R | 8 +-- inst/help_files/Updog_Dosage_Calling_cite.Rmd | 10 ++++ inst/help_files/Updog_Dosage_Calling_par.Rmd | 52 ++++++++++++++++--- inst/help_files/Updog_Dosage_Calling_res.Rmd | 21 ++++++++ 5 files changed, 80 insertions(+), 13 deletions(-) diff --git a/R/mod_DosageCall.R b/R/mod_DosageCall.R index 64f6674..736978d 100644 --- a/R/mod_DosageCall.R +++ b/R/mod_DosageCall.R @@ -86,7 +86,7 @@ mod_DosageCall_ui <- function(id){ p(downloadButton(ns('download_vcf'),""), "VCF Example File"), p(downloadButton(ns('download_madc'),""), "MADC Example File"), hr(), p(HTML("Parameters description:"), actionButton(ns("goPar"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), - p(HTML("Graphics description:"), actionButton(ns("goRes"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), + p(HTML("Results description:"), 
actionButton(ns("goRes"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), p(HTML("How to cite:"), actionButton(ns("goCite"), icon("arrow-up-right-from-square", verify_fa = FALSE) )), hr(), p(HTML("Updog tutorial:"), actionButton(ns("goUpdog"), icon("arrow-up-right-from-square", verify_fa = FALSE), onclick ="window.open('https://dcgerard.github.io/updog/', '_blank')" )), circle = FALSE, diff --git a/R/mod_help.R b/R/mod_help.R index b6d5d41..19960f0 100644 --- a/R/mod_help.R +++ b/R/mod_help.R @@ -28,16 +28,16 @@ mod_help_ui <- function(id){ )) ), box(title="Updog Dosage Calling", id = "Updog_Dosage_Calling_box",width = 12, collapsible = TRUE, collapsed = TRUE, status = "info", solidHeader = TRUE, - "**Draft**This tab is designed to handle the process of dosage calling in genomic data. Dosage calling is essential for determining the number of copies of a particular allele at each genomic location. The app likely includes functionalities to upload raw genomic data, apply various filtering criteria, and generate plots to visualize the distribution of dosages. Users can examine histograms for SNP max post probabilities and read depths, which help in assessing the quality and accuracy of the dosage calls.**Updog**", + "This tab is designed to handle the process of dosage calling in genomic data. 
Dosage calling is essential for determining the number of copies of a particular allele at each genomic location.", br(), br(), bs4Dash::tabsetPanel(id = "Updog_Dosage_Calling_tabset", - tabPanel("Parameters description", value = "Updog_Dosage_Calling_par", + tabPanel("Parameters description", value = "Updog_Dosage_Calling_par", br(), includeMarkdown(system.file("help_files/Updog_Dosage_Calling_par.Rmd", package = "BIGapp")) ), - tabPanel("Results description", value = "Updog_Dosage_Calling_results", + tabPanel("Results description", value = "Updog_Dosage_Calling_results", br(), includeMarkdown(system.file("help_files/Updog_Dosage_Calling_res.Rmd", package = "BIGapp")) ), - tabPanel("How to cite", value = "Updog_Dosage_Calling_cite", + tabPanel("How to cite", value = "Updog_Dosage_Calling_cite", br(), includeMarkdown(system.file("help_files/Updog_Dosage_Calling_cite.Rmd", package = "BIGapp")) )) ), diff --git a/inst/help_files/Updog_Dosage_Calling_cite.Rmd b/inst/help_files/Updog_Dosage_Calling_cite.Rmd index 9004582..21e35d4 100644 --- a/inst/help_files/Updog_Dosage_Calling_cite.Rmd +++ b/inst/help_files/Updog_Dosage_Calling_cite.Rmd @@ -4,3 +4,13 @@ output: html_document date: "2024-08-29" --- +* **BIGapp** + + +* **Updog package** + +Gerard, D., Ferrão, L. F. V., Garcia, A. A. F., & Stephens, M. (2018). Genotyping Polyploids from Messy Sequencing Data. Genetics, 210(3), 789-807. doi: 10.1534/genetics.118.301468. + +If you used the “norm” model cite also: + +Gerard D, Ferrão L (2020). “Priors for Genotyping Polyploids.” Bioinformatics, 36(6), 1795-1800. ISSN 1367-4803, doi: 10.1093/bioinformatics/btz852. 
diff --git a/inst/help_files/Updog_Dosage_Calling_par.Rmd b/inst/help_files/Updog_Dosage_Calling_par.Rmd index 9ea5f87..40305c8 100644 --- a/inst/help_files/Updog_Dosage_Calling_par.Rmd +++ b/inst/help_files/Updog_Dosage_Calling_par.Rmd @@ -4,11 +4,47 @@ output: html_document date: "2024-08-29" --- -:hammer: Under development - -About Population Models: -Model: What form should the prior (genotype distribution) take? -The following information is from the Updog manual: -Possible values of the genotype distribution (values of model) are: -`norm` A distribution whose genotype frequencies are proportional to the density value of a normal with some mean and some standard deviation. Unlike the `bb` and `hw` options, this will allow for distributions both more and less dispersed than a binomial. This seems to be the most robust to violations in modeling assumptions, and so is the default. This prior class was developed in Gerard and Ferrao (2020). `hw` A binomial distribution that results from assuming that the population is in Hardy-Weinberg equilibrium (HWE). This actually does pretty well even when there are minor to moderate deviations from HWE. Though it does not perform as well as the `norm` option when there are severe deviations from HWE. `bb` A beta-binomial distribution. This is an overdispersed version of `hw` and can be derived from a special case of the Balding-Nichols model. `s1` This prior assumes the individuals are all full-siblings resulting from one generation of selfing. I.e. there is only one parent. This model assumes a particular type of meiotic behavior: polysomic inheritance with bivalent, non-preferential pairing. -`f1` This prior assumes the individuals are all full-siblings resulting from one generation of a bi-parental cross. This model assumes a particular type of meiotic behavior: polysomic inheritance with bivalent, non-preferential pairing. 
`f1pp` This prior allows for double reduction and preferential pairing in an F1 population of tretraploids. `s1pp` This prior allows for double reduction and preferential pairing in an S1 population of tretraploids. `flex` Generically any categorical distribution. Theoretically, this works well if you have a lot of individuals. In practice, it seems to be much less robust to violations in modeling assumptions.`uniform` A discrete uniform distribution. This should never be used in practice.", +* **MADC or VCF file** + + * **MADC file**: + + * **VCF file**: +Variant Call Format (VCF) is a standard file format to store genetic variant information. The genotype (GT) data within the VCF is required for the analysis in this tab. For more details about the VCF format, see this document: https://samtools.github.io/hts-specs/VCFv4.2.pdf. + +* **Passport File**: A comma-separated values (CSV) file containing individual names (Sample_ID) in the first column and phenotype values in the subsequent columns. The phenotype column names should correspond to the phenotype ID. + + * **Select Category Subset**: After loading the passport file, this option will be available. Select the passport column on which the sample subsetting will be based. + + * **Select Category Values**: Select the value(s) within the selected column that should be kept for the analysis. For example, if you select the column “Species” from the example below and select only “setosa” as the value, “Sample_1” will be removed from the analysis. + +
    + +|Sample_ID | Sepal.Length| Sepal.Width| Petal.Length| Petal.Width|Species | +|:---------:|:------------:|:-----------:|:------------:|:-----------:|:-------:| +|Sample_1 | 5.1| 3.5| 1.4| 0.2|versicolor | +|Sample_2 | 4.9| 3.0| 1.4| 0.2|setosa | +|Sample_3 | 4.7| 3.2| 1.3| 0.2|setosa | +|Sample_4 | 4.6| 3.1| 1.5| 0.2|setosa | +|Sample_5 | 5.0| 3.6| 1.4| 0.2|setosa | +|Sample_6 | 5.4| 3.9| 1.7| 0.4|setosa | + +
    + +  + +* **Output File Name**: Define output VCF file name + +* **Species Ploidy**: Specifies the ploidy level of the species. The current analysis supports both diploids and autopolyploids. + +* **Updog Model**: Select the model to be applied. + +The following information is from the Updog manual. Possible values of the genotype distribution (values of model) are: + +`norm` A distribution whose genotype frequencies are proportional to the density value of a normal with some mean and some standard deviation. Unlike the `bb` and `hw` options, this will allow for distributions both more and less dispersed than a binomial. This seems to be the most robust to violations in modeling assumptions, and so is the default. This prior class was developed in Gerard and Ferrao (2020). `hw` A binomial distribution that results from assuming that the population is in Hardy-Weinberg equilibrium (HWE). This actually does pretty well even when there are minor to moderate deviations from HWE. Though it does not perform as well as the `norm` option when there are severe deviations from HWE. `bb` A beta-binomial distribution. This is an overdispersed version of `hw` and can be derived from a special case of the Balding-Nichols model. `s1` This prior assumes the individuals are all full-siblings resulting from one generation of selfing. I.e. there is only one parent. This model assumes a particular type of meiotic behavior: polysomic inheritance with bivalent, non-preferential pairing. + +`f1` This prior assumes the individuals are all full-siblings resulting from one generation of a bi-parental cross. This model assumes a particular type of meiotic behavior: polysomic inheritance with bivalent, non-preferential pairing. `f1pp` This prior allows for double reduction and preferential pairing in an F1 population of tretraploids. `s1pp` This prior allows for double reduction and preferential pairing in an S1 population of tretraploids. `flex` Generically any categorical distribution. 
Theoretically, this works well if you have a lot of individuals. In practice, it seems to be much less robust to violations in modeling assumptions.`uniform` A discrete uniform distribution. This should never be used in practice. + + * **Parent**: If the “s1” or “s1pp” model is selected, you must specify which sample corresponds to the parent by entering its sample ID in this box. The input sample ID must match the sample ID in the input genotype file. + * **Parent1 and Parent2**: If the “f1” or “f1pp” model is selected, you must specify which samples correspond to parent1 and parent2 by entering their sample IDs in the respective boxes. The input sample IDs must match the sample IDs in the input genotype file. + +* **Number of CPU Cores**: Number of cores to be used in the multidog function parallelization diff --git a/inst/help_files/Updog_Dosage_Calling_res.Rmd b/inst/help_files/Updog_Dosage_Calling_res.Rmd index d7806fa..cd92fa3 100644 --- a/inst/help_files/Updog_Dosage_Calling_res.Rmd +++ b/inst/help_files/Updog_Dosage_Calling_res.Rmd @@ -3,3 +3,24 @@ title: "Updog_Dosage_Calling_res" output: html_document date: "2024-08-29" --- + +* **Download VCF file**: + +VCF file generated by the function updog2vcf from the package BIGr. The function adds lines to the VCF header specifying the updog and BIGr versions and the command line used. 
Updog information is kept in the INFO fields: + + * DP: Total Depth + * ADS: Depths for the ref and each alt allele in the order listed + * BIAS: The estimated allele bias of the SNP from updog + * OD: The estimated overdispersion parameter of the SNP from updog + * PMC: The estimated proportion of individuals misclassified in the SNP from updog + +And in the FORMAT fields: + + * GT: Genotype, where 1 is the count of alternate alleles + * UD: Dosage count of reference alleles from updog, where 0 = homozygous alternate + * DP: Read depth + * RA: Reference allele read depth + * AD: Allelic depths for the ref and alt alleles in the order listed + * MPP: Maximum posterior probability for that dosage call from updog + +If the “f1” or “f1pp” model is selected, the VCF will contain “parent1” and “parent2” as IDs for the parents defined. If the “s1” or “s1pp” model is selected, the VCF will contain “parent” as the ID for the input parent. From d186a56fe7291a407d0fd8dea69d5b97151a5ee0 Mon Sep 17 00:00:00 2001 From: Cristianetaniguti Date: Tue, 17 Sep 2024 15:50:43 -0400 Subject: [PATCH 40/40] fix test --- tests/testthat/test-DosageCall.R | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/testthat/test-DosageCall.R b/tests/testthat/test-DosageCall.R index 7034cc7..b49b881 100644 --- a/tests/testthat/test-DosageCall.R +++ b/tests/testthat/test-DosageCall.R @@ -210,7 +210,6 @@ test_that("Dosage Calling from VCF file f1 and s1 model",{ ) vcf_result <- read.vcfR(paste0(output_name,".vcf.gz")) - colnames(vcf_result@gt) DP <- sum(as.numeric(extract.gt(vcf_result, "DP"))) expect_equal(DP, 23618990)