From 40303079e81b16e662255fed72f26cd735e0fe8f Mon Sep 17 00:00:00 2001 From: nkwang Date: Thu, 29 Aug 2024 02:44:06 -0700 Subject: [PATCH 01/16] update Rscripts --- module/scripts/extract-VCF-features-SV.R | 278 ++++++++++++++++++ ...-vcf-features.R => extract-VCF-features.R} | 46 ++- module/scripts/extract-vcf-features-SV.R | 181 ------------ module/scripts/liftover-Delly2-vcf.R | 157 ---------- ...tability.R => predict-variant-stability.R} | 110 +++---- 5 files changed, 345 insertions(+), 427 deletions(-) create mode 100644 module/scripts/extract-VCF-features-SV.R rename module/scripts/{extract-vcf-features.R => extract-VCF-features.R} (87%) delete mode 100644 module/scripts/extract-vcf-features-SV.R delete mode 100644 module/scripts/liftover-Delly2-vcf.R rename module/scripts/{predict-liftover-stability.R => predict-variant-stability.R} (50%) diff --git a/module/scripts/extract-VCF-features-SV.R b/module/scripts/extract-VCF-features-SV.R new file mode 100644 index 0000000..b378a9c --- /dev/null +++ b/module/scripts/extract-VCF-features-SV.R @@ -0,0 +1,278 @@ +#!/usr/bin/env Rscript +# extract-VCF-features-SV.R +#################################################################################################### +# +# LiftOver Delly2 structural variants and annotate with gnomAD population allele frequency +# Extract VCF features and save as Rds for input to predict-variant-stability.R +# +#################################################################################################### + +suppressPackageStartupMessages({ + library(vcfR); + library(data.table); + library(argparse); + library(rtracklayer); + library(GenomicRanges); + }); + +################################################################################################### +# Input +################################################################################################### +# Define command line arguments +parser <- ArgumentParser(); +parser$add_argument('--input-vcf', type = 'character', 
help = 'Input Delly2 VCF'); +parser$add_argument('--source-build', type = 'character', help = 'One of {GRCh37, GRCh38}'); +parser$add_argument('--chain-file', type = 'character', help = 'Chain file for coordinate conversion'); +parser$add_argument('--header-contigs', type = 'character', help = 'Resource file with VCF header for target build'); +parser$add_argument('--gnomad-rds', type = 'character', help = 'gnomAD-SV v4 resource file'); +parser$add_argument('--output-vcf', type = 'character', help = 'VCF output'); +parser$add_argument('--output-rds', type = 'character', help = 'Rds output for input to RF model'); +args <- parser$parse_args(); + +# Save command line arguments +for (arg in names(args)) { + assign(gsub('_', '.', arg), args[[arg]]); + } +if (interactive()) { + input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/bcftools-merge/CPCG-40QC_GRCh38/CPCG-40QC_GRCh38_regenotype-gSV_delly_bcftools-merge_delly-filter-germline.vcf.gz'; + source.build <- 'GRCh38'; + chain.file <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/publish/resource/hg38ToHg19.over.chain'; + header.contigs <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/publish/resource/GRCh37-vcf-header-contigs.txt'; + gnomad.rds <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/publish/resource/gnomad.v4.0.sv.Rds'; + output.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/stableLift/train_CPCG-40QC_Delly2_38_test/CPCG-40QC_Delly2_LiftOver-GRCh37.vcf.gz'; + output.rds <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/stableLift/train_CPCG-40QC_Delly2_38_test/CPCG-40QC_Delly2_LiftOver-GRCh37_annotated.Rds'; + } +################################################################################################### +# Functions +################################################################################################### +# vcfR::getINFO() to data.table +vcf.info.to.dt <- function(vcf.info) { + vcf.info <-
lapply(vcf.info, function(x) vcf.info.string.to.list(x)); + feature.names <- unique(unlist(lapply(vcf.info, names))); + vcf.info <- do.call(mapply, c(FUN = list, lapply(vcf.info, `[`, feature.names))); + setNames(as.data.table(vcf.info), feature.names); + } + +# Split VCF info field to list +vcf.info.string.to.list <- function(vcf.info, keep.columns = NULL) { + list.out <- strsplit(vcf.info, split = ';'); + list.out <- lapply(list.out, function(x) strsplit(x, split = '=')); + labels <- sapply(list.out[[1]], function(x) x[[1]]); + values <- sapply(list.out[[1]], function(x) if (length(x) == 2) x[[2]] else x[[1]]); + names(values) <- labels; + if (is.null(keep.columns)) return(values); + values <- values[labels %in% keep.columns]; + return(values); + } + +calculate.VAF <- function(GT.row) { + total <- sum(GT.row %in% c('0/0', '0/1', '1/1'), na.rm = TRUE) * 2; + alt <- sum(GT.row == '0/1', na.rm = TRUE) + sum(GT.row == '1/1', na.rm = TRUE) * 2; + return(alt / total); + } + +get.overlap <- function(start1, end1, start2, end2) { + max.length <- pmax((end1 - start1), (end2 - start2)); + overlap.length <- pmin(end1, end2) - pmax(start1, start2); + return(overlap.length / max.length); + } + +find.SV.match <- function(this.ID, input, reference, overlap, offset) { + # Match SV type and CHR + this.variant <- input[ID == this.ID]; + reference <- reference[SVTYPE == this.variant$SVTYPE & CHROM == this.variant$CHROM]; + + if (this.variant$SVTYPE == 'BND') { + reference <- reference[CHR2 == this.variant$CHR2]; + reference[, OFFSET := abs(POS - this.variant$POS)]; + reference <- reference[OFFSET < offset][order(OFFSET)]; + } else { + reference[, OVERLAP := get.overlap(POS, END, this.variant$POS, this.variant$END)]; + reference <- reference[OVERLAP > overlap][order(OVERLAP, decreasing = TRUE)]; + } + + return(list(gnomad.match.ID = reference[1, ID], gnomad.matches = nrow(reference))); + } + +annotate.gnomad.features <- function(features.dt, features.dt.gnomad) { + gnomad.features 
<- c('ID', 'AF', 'POPMAX_AF'); + features.dt[, c('gnomad.match.ID', 'gnomad.matches') := rbindlist(lapply(ID, find.SV.match, input = features.dt, reference = features.dt.gnomad, overlap = 0.8, offset = 500))]; + features.dt <- merge(features.dt, features.dt.gnomad[, ..gnomad.features], all.x = TRUE, by.x = 'gnomad.match.ID', by.y = 'ID'); + } + +################################################################################################### +# Load files +################################################################################################### +input.vcf <- read.vcfR(input.vcf); +header.contigs <- scan(header.contigs, character()); +liftover.chain <- import.chain(chain.file); +features.dt.gnomad <- readRDS(gnomad.rds); + +################################################################################################### +# Data preprocessing +################################################################################################### +# Convert variant information into dt +if (any(duplicated(input.vcf@fix[, 'ID']))) input.vcf@fix[, 'ID'] <- paste0(substr(input.vcf@fix[, 'ID'], 1, 3), sprintf('%08d', seq_len(nrow(input.vcf@fix)))); +input.info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']); +input.fix <- as.data.table(input.vcf@fix); +features.dt <- cbind(input.fix[, -c('INFO')], input.info); + +# Format columns +features.dt[, CONSENSUS := NULL]; +numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'RDRATIO', 'SVLEN', 'POS2'); +character.columns <- names(features.dt)[!names(features.dt) %in% numeric.columns]; +features.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns]; +features.dt[, (character.columns) := lapply(.SD, as.character), .SDcols = character.columns]; + +# Extract and aggregate per sample GT fields +gt.fields <- c('GQ', 'RC', 'RDCN', 'DR', 'DV', 'RR', 'RV'); +for (field in gt.fields) { + features.dt[, (field) := apply(extract.gt(input.vcf, element = ..field, 
as.numeric = TRUE), 1, mean, na.rm = TRUE)]; + } +features.dt[, COHORT_AF := apply(extract.gt(input.vcf, element = 'GT'), 1, calculate.VAF)]; +features.dt[, CIPOS := as.numeric(sapply(CIPOS, function(x) unlist(strsplit(x, ','))[2]))]; + +if (source.build == 'GRCh37') { + features.dt[, CHROM := paste0('chr', CHROM)]; + features.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, paste0('chr', CHR2))]; +} else if (source.build == 'GRCh38') { + # Annotate with gnomAD before LiftOver if source build == GRCh38 + features.dt <- annotate.gnomad.features(features.dt, features.dt.gnomad); + } + +################################################################################################### +# LiftOver variants by breakpoint +################################################################################################### +# Create GRanges object +grange.source <- makeGRangesFromDataFrame( + df = features.dt, + seqnames.field = 'CHROM', + start.field = 'POS', + end.field = 'END', + keep.extra.columns = TRUE + ); + +# LiftOver using chain file +grange.target <- unlist(liftOver(grange.source, liftover.chain)); +grange.target.dt <- as.data.table(grange.target); + +# Create GRanges object using CHROM, CHR2, and POS2 from features.dt +grange.source.BND <- makeGRangesFromDataFrame( + df = features.dt[SVTYPE == 'BND', ], + seqnames.field = 'CHR2', + start.field = 'POS2', + end.field = 'POS2', + keep.extra.columns = TRUE + ); +grange.target.BND <- as.data.table(unlist(liftOver(grange.source.BND, liftover.chain))); + +# Remove multiple mappings +grange.target.dt <- grange.target.dt[!duplicated(ID)]; +grange.target.BND <- grange.target.BND[!duplicated(ID)]; +common <- intersect(grange.target.dt$ID, grange.target.BND$ID); + +grange.target.dt[ID %in% common, c('CHR2', 'POS2') := grange.target.BND[ID %in% common, .(seqnames, start)]]; + +if (source.build == 'GRCh38') { + grange.target.dt[, seqnames := sub('chr', '', seqnames)]; + grange.target.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, sub('chr', 
'', CHR2))]; + } + +################################################################################################### +# Write output VCF +################################################################################################### +pass.liftover <- as.data.table(input.vcf@fix)$ID %in% grange.target.dt$ID; +input.fix <- as.data.table(input.vcf@fix)[pass.liftover]; +input.gt <- as.data.table(input.vcf@gt)[pass.liftover]; +grange.target.dt <- grange.target.dt[match(input.fix$ID, grange.target.dt$ID)]; + +for (i in seq_len(nrow(input.fix))) { + this.ID <- input.fix[i, ID]; + this.INFO <- vcf.info.string.to.list(input.fix[i, INFO]); + this.INFO[['END']] <- grange.target.dt[i, end]; + if (this.INFO[['SVTYPE']] == 'BND') { + this.INFO[['CHR2']] <- grange.target.dt[i, CHR2]; + this.INFO[['POS2']] <- grange.target.dt[i, POS2]; + } + # Serialize INFO back to 'KEY=VALUE' pairs joined by ';' (flag fields are collapsed to bare keys below) + this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '=')); + this.INFO <- paste(this.INFO, collapse = ';'); + this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO); + this.INFO <- gsub('PRECISE=PRECISE', 'PRECISE', this.INFO); + this.INFO <- gsub('SOMATIC=SOMATIC', 'SOMATIC', this.INFO); + input.fix[i, c('CHROM', 'POS', 'INFO') := grange.target.dt[i, .(seqnames, start, ..this.INFO)]]; + } + +lifted.vcf <- input.vcf; +lifted.vcf@fix <- as.matrix(input.fix); +lifted.vcf@gt <- as.matrix(input.gt); +lifted.vcf@meta <- lifted.vcf@meta[!grepl("^##(contig|reference)", lifted.vcf@meta)]; +lifted.vcf@meta <- c(lifted.vcf@meta, header.contigs); + +write.vcf(lifted.vcf, output.vcf); + +################################################################################################### +# Format features for RF +################################################################################################### +features.dt <- features.dt[ID %in% grange.target.dt$ID]; +features.dt <- features.dt[match(input.fix$ID, features.dt$ID)]; +features.dt[, c('CHROM', 'POS', 'END',
'CHR2', 'POS2') := grange.target.dt[, .(seqnames, start, end, CHR2, POS2)]]; +features.dt[!SVTYPE %in% c('BND', 'INS'), SVLEN := END - POS + 1]; + +if (source.build == 'GRCh37') { + features.dt <- annotate.gnomad.features(features.dt, features.dt.gnomad); + } + +continuous.features <- c( + 'POS', + 'QUAL', + 'END', + 'PE', + 'MAPQ', + 'CIPOS', + 'SRMAPQ', + 'HOMLEN', + 'SR', + 'SRQ', + 'CE', + 'RDRATIO', + 'SVLEN', + 'GQ', + 'RC', + 'RDCN', + 'DR', + 'DV', + 'RR', + 'RV', + 'AF', + 'gnomad.matches', + 'POPMAX_AF' + ); + +categorical.features <- c( + 'CHROM', + 'SVTYPE', + 'CT' + ); + +# Extract features and format +continuous.features <- continuous.features[continuous.features %in% names(features.dt)]; +categorical.features <- categorical.features[categorical.features %in% names(features.dt)]; +all.features <- c(continuous.features, categorical.features, 'ID'); + +features.dt <- features.dt[, ..all.features]; +features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features]; +features.dt[, (continuous.features) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = continuous.features]; +features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features]; +features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features]; +names(features.dt) <- make.names(names(features.dt)); + +# Remove rows with NA +features.dt.rows <- nrow(features.dt); +features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ]; +cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n'); + +################################################################################################### +# Save features.dt for input to RF +################################################################################################### +saveRDS(features.dt, output.rds); diff --git a/module/scripts/extract-vcf-features.R 
b/module/scripts/extract-VCF-features.R similarity index 87% rename from module/scripts/extract-vcf-features.R rename to module/scripts/extract-VCF-features.R index 5aa3d75..641d653 100644 --- a/module/scripts/extract-vcf-features.R +++ b/module/scripts/extract-VCF-features.R @@ -1,10 +1,8 @@ #!/usr/bin/env Rscript -# extract-vcf-features.R +# extract-VCF-features.R #################################################################################################### # -# Extract features from vcf -# Extract Funcotator annotations if present -# Annotate with RepeatMasker regions if intersect file is provided +# Extract VCF features and save as Rds for input to predict-variant-stability.R # #################################################################################################### @@ -21,11 +19,11 @@ suppressPackageStartupMessages({ ################################################################################################### # Define command line arguments parser <- ArgumentParser(); -parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 vcf lifted to GRCh38 for feature extraction'); -parser$add_argument('--input-dir', type = 'character', help = 'Directory with vcf subsets'); -parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model'); -parser$add_argument('--variant-caller', type = 'character', help = ''); -parser$add_argument('--ncore', type = 'integer', help = 'Number of cores to use for parallelizing features extraction', default = 1); +parser$add_argument('--input-vcf', type = 'character', help = 'Input VCF for feature extraction, mutually exclusive with --input-dir'); +parser$add_argument('--input-dir', type = 'character', help = 'Directory with VCF subsets for parallelization, mutually exclusive with --input-vcf'); +parser$add_argument('--output-rds', type = 'character', help = 'Rds output for input to RF model'); +parser$add_argument('--variant-caller', type = 'character', help = 'One of 
{HaplotypeCaller, Mutect2, Strelka2, SomaticSniper, Muse2, Delly2}'); +parser$add_argument('--ncore', type = 'integer', help = 'Number of cores to use for processing VCF subsets in --input-dir', default = 1); args <- parser$parse_args(); # Save command line arguments @@ -33,13 +31,6 @@ for (arg in names(args)) { assign(gsub('_', '.', arg), args[[arg]]); } -# Set parameters for interactive runs -if (interactive()) { - variant.caller <- 'Strelka2'; - input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSNP/stableLift/validate_TCGA-SARC_WXS/TCGA-SARC_WXS_HaplotypeCaller_LiftOver-GRCh38_annotated_exome.vcf.gz'; - vcf.subset <- input.vcf; - } - if (!is.null(input.dir)) { vcf.subsets <- list.files(input.dir, full.names = TRUE, pattern = '(\\.vcf.gz|\\.vcf)$'); output.path <- output.rds; @@ -116,6 +107,21 @@ features.dt.subsets <- foreach(vcf.subset = vcf.subsets) %dopar% { info[input.vcf@fix[, 'REF'] == 'G', REFCOUNTS := apply(extract.gt(input.vcf, element = 'GU')[input.vcf@fix[, 'REF'] == 'G', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; info[input.vcf@fix[, 'ALT'] == 'G', ALTCOUNTS := apply(extract.gt(input.vcf, element = 'GU')[input.vcf@fix[, 'ALT'] == 'G', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; info[, AF := ALTCOUNTS / (REFCOUNTS + ALTCOUNTS)]; + } else if (variant.caller == 'SomaticSniper') { + # Calculate VAF from allelic depths + info$AF <- apply(extract.gt(input.vcf, element = 'DP4'), 1, function(x) mean( + sapply(strsplit(x, ','), function(y) { + y <- as.numeric(y); + return((y[3] + y[4]) / sum(y)); + }), + na.rm = TRUE + )); + info$AMQ <- apply(extract.gt(input.vcf, element = 'AMQ'), 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[2])), na.rm = TRUE)); + info$BQ <- apply(extract.gt(input.vcf, element = 'BQ'), 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[2])), na.rm = TRUE)); + info$GQ 
<- apply(extract.gt(input.vcf, element = 'GQ', as.numeric = TRUE), 1, mean, na.rm = TRUE); + info$MQ <- apply(extract.gt(input.vcf, element = 'MQ', as.numeric = TRUE), 1, mean, na.rm = TRUE); + info$SSC <- apply(extract.gt(input.vcf, element = 'SSC', as.numeric = TRUE), 1, mean, na.rm = TRUE); + info$VAQ <- apply(extract.gt(input.vcf, element = 'VAQ', as.numeric = TRUE), 1, mean, na.rm = TRUE); } # Get funcotation fields @@ -230,6 +236,14 @@ if (variant.caller == 'Strelka2') continuous.features <- c(continuous.features, if (variant.caller == 'Muse2') continuous.features <- c(continuous.features, 'Variant Base Quality (BQ)' = 'BQ' ); +if (variant.caller == 'SomaticSniper') continuous.features <- c(continuous.features, + 'Variant Mapping Quality (AMQ)' = 'AMQ', + 'Base Quality (BQ)' = 'BQ', + 'Genotype Quality (GQ)' = 'GQ', + 'Mapping Quality (MQ)' = 'MQ', + 'Somatic Score (SSC)' = 'SSC', + 'Variant Allele Quality (VAQ)' = 'VAQ' + ); categorical.features <- c( 'Chromosome (CHR)' = 'CHROM', diff --git a/module/scripts/extract-vcf-features-SV.R b/module/scripts/extract-vcf-features-SV.R deleted file mode 100644 index 9927a7e..0000000 --- a/module/scripts/extract-vcf-features-SV.R +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env Rscript -# extract-vcf-features-SV.R -#################################################################################################### -# -# Extract features from vcf -# Intersect and annotate with gnomAD-SV vcf -# -#################################################################################################### - -suppressPackageStartupMessages({ - library(vcfR); - library(data.table); - library(argparse); - library(GenomicRanges); - }); - -################################################################################################### -# Input -################################################################################################### -# Define command line arguments -parser <- ArgumentParser();
-parser$add_argument('--variant-caller', type = 'character', help = ''); -parser$add_argument('--input-vcf', type = 'character', help = 'Delly2 vcf'); -parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model'); -parser$add_argument('--gnomad-rds', type = 'character', help = 'gnomAD Rds file'); -args <- parser$parse_args(); - -# Save command line arguments -for (arg in names(args)) { - assign(gsub('_', '.', arg), args[[arg]]); - } - -# Set parameters for interactive runs -if (interactive()) { - variant.caller <- 'Delly2'; - input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz'; - gnomad.rds <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds'; - } - -################################################################################################### -# Functions -################################################################################################### -vcf.info.to.dt <- function(vcf.info) { - # Split each string by semicolon and convert to a list of key-value pairs - vcf.info <- strsplit(vcf.info, ';'); - vcf.info <- lapply(vcf.info, function(x) { - x <- strsplit(x, '='); - as.list(stats::setNames(sapply(x, `[`, 2), sapply(x, `[`, 1))); - }) - - # Combine the list of key-value pairs into a data table - rbindlist(vcf.info, fill = TRUE); - } - -calculate.VAF <- function(GT.row) { - total <- sum(GT.row %in% c('0/0', '0/1', '1/1'), na.rm = TRUE) * 2; - alt <- sum(GT.row == '0/1', na.rm = TRUE) + sum(GT.row == '1/1', na.rm = TRUE) * 2; - return(alt / total); - } - -get.overlap <- function(start1, end1, start2, end2) { - max.length <- pmax((end1 - start1), (end2 - start2)); - overlap.length <- pmin(end1, end2) - pmax(start1, start2); - return(overlap.length / max.length); - } - -find.SV.match <- function(this.ID, input, reference, 
overlap, offset) { - # Match SV type and CHR - this.variant <- input[ID == this.ID]; - reference <- reference[SVTYPE == this.variant$SVTYPE & CHROM == this.variant$CHROM]; - - if (this.variant$SVTYPE == 'BND') { - # reference[, OFFSET := abs(POS - this.variant$POS) + abs(POS2 - this.variant$POS2)]; - reference[, OFFSET := abs(POS - this.variant$POS)]; - reference <- reference[OFFSET < offset & CHR2 == this.variant$CHR2][order(OFFSET)]; - } else { - reference[, OVERLAP := get.overlap(POS, END, this.variant$POS, this.variant$END)]; - reference <- reference[OVERLAP > overlap][order(OVERLAP, decreasing = TRUE)]; - } - - return(list(gnomad.match.ID = reference[1, ID], gnomad.matches = nrow(reference))); - } - -################################################################################################### -# Load files -################################################################################################### -input.vcf <- read.vcfR(input.vcf); -features.dt.gnomad <- readRDS(gnomad.rds); - -################################################################################################### -# Data preprocessing -################################################################################################### -# Convert variant information into dt -input.info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']); -input.fix <- as.data.table(input.vcf@fix); -features.dt <- cbind(input.fix[, -c('INFO')], input.info); - -# Format columns -features.dt[, CONSENSUS := NULL]; -numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'RDRATIO', 'SVLEN', 'POS2'); -features.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns]; - -# Extract and aggregate per sample GT fields -gt.fields <- c('GQ', 'RC', 'RDCN', 'DR', 'DV', 'RR', 'RV'); -for (field in gt.fields) { - features.dt[, (field) := apply(extract.gt(input.vcf, element = ..field, as.numeric = TRUE), 1, mean, na.rm = TRUE)]; - } -features.dt[, COHORT_AF 
:= apply(extract.gt(input.vcf, element = 'GT'), 1, calculate.VAF)]; -features.dt[!SVTYPE %in% c('BND', 'INS'), SVLEN := END - POS + 1]; -features.dt[, CIPOS := as.numeric(sapply(CIPOS, function(x) unlist(strsplit(x, ','))[2]))]; - -################################################################################################### -# Intersect variants with gnomAD SVs -################################################################################################### -start.time <- Sys.time(); - -# features.dt <- features.dt[1:100]; -features.dt[, c('gnomad.match.ID', 'gnomad.matches') := rbindlist(lapply(ID, find.SV.match, input = features.dt, reference = features.dt.gnomad, overlap = 0.8, offset = 500))]; - -gnomad.features <- c('ID', 'AF', 'POPMAX_AF', 'NCR'); -features.dt <- merge(features.dt, features.dt.gnomad[, ..gnomad.features], all.x = TRUE, by.x = 'gnomad.match.ID', by.y = 'ID'); - -cat(format(Sys.time() - start.time, nsmall = 2), '\n'); - -################################################################################################### -# Format features for RF -################################################################################################### -continuous.features <- c( - 'POS', - 'QUAL', - 'END', - 'PE', - 'MAPQ', - 'CIPOS', - 'SRMAPQ', - 'HOMLEN', - 'SR', - 'SRQ', - 'CE', - 'RDRATIO', - 'SVLEN', - 'GQ', - 'RC', - 'RDCN', - 'DR', - 'DV', - 'RR', - 'RV', - 'gnomad.matches', - 'AF', - 'POPMAX_AF', - 'NCR' - ); - -categorical.features <- c( - 'CHROM', - 'SVTYPE', - 'CT' - ); - -# Extract features and format -continuous.features <- continuous.features[continuous.features %in% names(features.dt)]; -categorical.features <- categorical.features[categorical.features %in% names(features.dt)]; -all.features <- c(continuous.features, categorical.features, 'ID'); - -features.dt <- features.dt[, ..all.features]; -features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features]; -features.dt[, (continuous.features) := 
lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = continuous.features]; -features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features]; -features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features]; -names(features.dt) <- make.names(names(features.dt)); - -# Remove rows with NA -features.dt.rows <- nrow(features.dt); -features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ]; -cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n'); - -################################################################################################### -# Save features.dt for input to RF -################################################################################################### -saveRDS(features.dt, output.rds); diff --git a/module/scripts/liftover-Delly2-vcf.R b/module/scripts/liftover-Delly2-vcf.R deleted file mode 100644 index d473091..0000000 --- a/module/scripts/liftover-Delly2-vcf.R +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env Rscript -# liftover-Delly2-vcf.R -################################################################################################### -# -# -# -################################################################################################### - -suppressPackageStartupMessages({ - library(vcfR); - library(data.table); - library(argparse); - library(rtracklayer); - }); - -################################################################################################### -# Input -################################################################################################### -# Define command line arguments -parser <- ArgumentParser(); -parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 Delly2 vcf'); -parser$add_argument('--header-contigs', type = 'character', help = 'Directory with vcf subsets'); -parser$add_argument('--chain-file', type = 'character', help = 
'hg19ToHg38.over.chain file'); -parser$add_argument('--output', type = 'character', help = 'Where to write lifted vcf'); -args <- parser$parse_args(); - -# Save command line arguments -for (arg in names(args)) { - assign(gsub('_', '.', arg), args[[arg]]); - } - -# Set parameters for interactive runs -if (interactive()) { - # input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_regenotype-gSV_delly_bcftools-merge_delly-filter-germline.vcf.gz'; - input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_call-sSV_delly_bcftools-merge_somatic-only.vcf.gz'; - header.contigs <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt'; - chain.file <- '/hot/resource/genomics/liftover_chain_files/hg19ToHg38.over.chain'; - output <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz'; - } - -################################################################################################### -# Functions -################################################################################################### -vcf.fix.to.dt <- function(vcf.fix) { - vcf.fix <- as.data.table(vcf.fix); - vcf.info <- vcf.info.to.dt(vcf.fix$INFO); - cbind(vcf.fix[, -'INFO'], vcf.info); - } - -# vcfR::getINFO() to data.table -vcf.info.to.dt <- function(vcf.info) { - vcf.info <- lapply(vcf.info, function(x) vcf.info.string.to.list(x)); - feature.names <- unique(unlist(lapply(vcf.info, names))); - vcf.info <- do.call(mapply, c(FUN = list, lapply(vcf.info, `[`, feature.names))); - setNames(as.data.table(vcf.info), feature.names); - } - -# Split vcf info field to list -vcf.info.string.to.list <- function(vcf.info, keep.columns = NULL) { - list.out <- strsplit(vcf.info, split = ';'); - 
list.out <- lapply(list.out, function(x) strsplit(x, split = '=')); - labels <- sapply(list.out[[1]], function(x) x[[1]]); - values <- sapply(list.out[[1]], function(x) if (length(x) == 2) x[[2]] else x[[1]]); - names(values) <- labels; - if (is.null(keep.columns)) return(values); - values <- values[labels %in% keep.columns]; - return(values); - } - -################################################################################################### -# Load files -################################################################################################### -input.vcf.path <- input.vcf; -input.vcf <- read.vcfR(input.vcf); -header.contigs <- scan(header.contigs, character()); -liftover.chain <- import.chain(chain.file); - -################################################################################################### -# Data preprocessing -################################################################################################### -if (any(duplicated(input.vcf@fix[, 'ID']))) input.vcf@fix[, 'ID'] <- paste0(substr(input.vcf@fix[, 'ID'], 1, 3), sprintf('%08d', seq_len(nrow(input.vcf@fix)))); -# if (any(duplicated(input.vcf@fix[, 'ID']))) fix.dt[, ID := paste0(substr(ID, 1, 3), sprintf('%08d', seq_len(nrow(fix.dt))))]; -fix.dt <- as.data.table(input.vcf@fix); -gt.dt <- as.data.table(input.vcf@gt); - -fix.dt <- vcf.fix.to.dt(fix.dt); -fix.dt[, CHROM := paste0('chr', CHROM)]; -fix.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, paste0('chr', CHR2))]; - -fix.dt <- fix.dt[, -c('CONSENSUS')]; -numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'SVLEN', 'POS2'); -character.columns <- names(fix.dt)[!names(fix.dt) %in% numeric.columns]; -fix.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns]; -fix.dt[, (character.columns) := lapply(.SD, as.character), .SDcols = character.columns]; - -################################################################################################### -# 
Liftover -################################################################################################### -# Create GRanges object -granges.37 <- makeGRangesFromDataFrame( - df = fix.dt, - seqnames.field = 'CHROM', - start.field = 'POS', - end.field = 'END', - keep.extra.columns = TRUE - ); - -# Liftover using chain file -granges.38 <- unlist(liftOver(granges.37, liftover.chain)); -granges.38.dt <- as.data.table(granges.38); - -# Create GRanges object using CHROM, CHR2, and POS2 from fix.dt -granges.37.BND <- makeGRangesFromDataFrame( - df = fix.dt[SVTYPE == 'BND', ], - seqnames.field = 'CHR2', - start.field = 'POS2', - end.field = 'POS2', - keep.extra.columns = TRUE - ); -granges.38.BND <- as.data.table(unlist(liftOver(granges.37.BND, liftover.chain))); - -# Remove multiple mappings -granges.38.dt <- granges.38.dt[!duplicated(ID)]; -granges.38.BND <- granges.38.BND[!duplicated(ID)]; -common <- intersect(granges.38.dt$ID, granges.38.BND$ID); - -granges.38.dt[ID %in% common, c('CHR2', 'POS2') := granges.38.BND[ID %in% common, .(seqnames, start)]]; - -pass.liftover <- as.data.table(input.vcf@fix)$ID %in% granges.38.dt$ID; -fix.lifted <- as.data.table(input.vcf@fix)[pass.liftover]; -gt.dt <- gt.dt[pass.liftover]; -for (i in seq_len(nrow(fix.lifted))) { - this.ID <- fix.lifted[i, ID]; - this.INFO <- vcf.info.string.to.list(fix.lifted[i, INFO]); - this.INFO[['END']] <- granges.38.dt[i, end]; - if (this.INFO[['SVTYPE']] == 'BND') { - this.INFO[['CHR2']] <- granges.38.dt[i, CHR2]; - this.INFO[['POS2']] <- granges.38.dt[i, POS2]; - } - this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '=')); - this.INFO <- paste(this.INFO, collapse = ';'); - this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO); - this.INFO <- gsub('PRECISE=PRECISE', 'PRECISE', this.INFO); - this.INFO <- gsub('SOMATIC=SOMATIC', 'SOMATIC', this.INFO); - fix.lifted[i, c('CHROM', 'POS', 'INFO') := granges.38.dt[ID == ..this.ID, .(seqnames, start, ..this.INFO)]]; - 
} - -################################################################################################### -# Write output vcf -################################################################################################### -output.vcf <- input.vcf; -output.vcf@fix <- as.matrix(fix.lifted); -output.vcf@gt <- as.matrix(gt.dt); -output.vcf@meta <- output.vcf@meta[!grepl('^##(contig|reference)', output.vcf@meta)]; -output.vcf@meta <- c(output.vcf@meta, header.contigs); - -write.vcf(output.vcf, output); diff --git a/module/scripts/predict-liftover-stability.R b/module/scripts/predict-variant-stability.R similarity index 50% rename from module/scripts/predict-liftover-stability.R rename to module/scripts/predict-variant-stability.R index d8f5dd2..177a4d1 100644 --- a/module/scripts/predict-liftover-stability.R +++ b/module/scripts/predict-variant-stability.R @@ -2,13 +2,11 @@ # predict-liftover-stability.R #################################################################################################### # -# Apply random forest model to predict variant LiftOver stability -# Validate results and plot model performance with discordance file +# Apply random forest model to predict variant stability # #################################################################################################### suppressPackageStartupMessages({ - library(caret); library(ranger); library(argparse); library(ROCR); @@ -20,12 +18,11 @@ suppressPackageStartupMessages({ ################################################################################################### # Define command line arguments parser <- ArgumentParser(); -parser$add_argument('--variant-caller', type = 'character'); -parser$add_argument('--features-dt', type = 'character'); -parser$add_argument('--rf-model', type = 'character'); -parser$add_argument('--specificity', type = 'numeric', help = 'Target specificity, overrides `--threshold`'); -parser$add_argument('--threshold', type = 'numeric', help = 'Stability score 
threshold', default = 0.5); -parser$add_argument('--output-tsv', type = 'character', help = 'TSV output file'); +parser$add_argument('--variant-caller', type = 'character', help = 'One of {HaplotypeCaller, Mutect2, Strelka2, SomaticSniper, Muse2, Delly2}'); +parser$add_argument('--features-dt', type = 'character', help = 'Processed Rds file with variant info and annotations'); +parser$add_argument('--rf-model', type = 'character', help = 'Pre-trained random forest model Rds file'); +parser$add_argument('--specificity', type = 'numeric', help = 'Target specificity based on whole genome validation set, overrides `--threshold`'); +parser$add_argument('--threshold', type = 'numeric', help = 'Stability score threshold, default based on maximizing F1-score in whole genome validation set'); args <- parser$parse_args(); # Save command line arguments @@ -33,27 +30,6 @@ for (arg in names(args)) { assign(gsub('_', '.', arg), args[[arg]]); } -#################################################################################################### -# Functions -#################################################################################################### - -# Sort datatable by chr then position -sort.genomic.dt <- function(x, chr = 'CHROM', pos = 'POS') { - setDT(x); - x[, eval(chr) := gsub('chr', '', get(chr))]; - x[, eval(chr) := gsub('X', '23', get(chr))]; - x[, eval(chr) := gsub('Y', '24', get(chr))]; - x[, eval(chr) := as.numeric(get(chr))]; - - setorderv(x, c(chr, pos), c(1, 1)); - - x[, eval(chr) := gsub('23', 'X', get(chr))]; - x[, eval(chr) := gsub('24', 'Y', get(chr))]; - x[, eval(chr) := paste0('chr', get(chr))]; - - return(x); - } - ################################################################################################### # Load data ################################################################################################### @@ -88,62 +64,50 @@ print(dim(features.dt)); 
################################################################################################### # Apply random forest model ################################################################################################### -cat('\nPredicting liftover stability with', basename(rf.model.path), '\n'); +cat('\nPredicting variant stability with', basename(rf.model.path), '\n'); stability <- predict(rf.model, data = features.dt); - -# if (!is.null(specificity) && is.numeric(specificity)) { -# cat('Target specificity =', specificity, '\n'); -# operating.index <- max(which(unlist(rf.model$performance@x.values) < 1 - specificity)); -# sensitivity <- unlist(rf.model$performance@y.values)[operating.index]; -# cat('Projected sensitivity =', round(sensitivity, 3), '\n'); -# threshold <- 1 - unlist(rf.model$performance@alpha.values)[operating.index]; -# cat('Stability score threshold =', round(threshold, 3), '\n'); -# } else if (!is.null(threshold) && is.numeric(threshold)) { -# cat('Target threshold =', threshold, '\n'); -# operating.index <- min(which(unlist(rf.model$performance@alpha.values) <= 1 - threshold)); -# specificity <- 1 - unlist(rf.model$performance@x.values)[operating.index]; -# sensitivity <- unlist(rf.model$performance@y.values)[operating.index]; -# cat('Projected specificity =', round(specificity, 3), '\n'); -# cat('Projected sensitivity =', round(sensitivity, 3), '\n'); -# } else { -# performance.acc <- performance(prediction$train, measure = 'f'); #F1-score -# index <- which.max(unlist(performance.acc@y.values)); -# cutoff <- unlist(performance.acc@x.values)[index]; -# metric <- unlist(performance.acc@y.values)[index]; -# specificity <- 1 - unlist(performance$train@x.values)[index]; -# sensitivity <- unlist(performance$train@y.values)[index]; -# cat(sprintf('Projected F[0.5]-score = %.3f\n', metric)); -# cat(sprintf('Projected sensitivity = %.3f\n', sensitivity)); -# cat(sprintf('Projected specificity = %.3f\n', specificity)); -# } - -performance.f <- 
performance(rf.model$prediction, measure = 'f'); -index <- which.max(unlist(performance.f@y.values)); -threshold <- unlist(performance.f@x.values)[index]; - performance <- performance(rf.model$prediction, 'sens', 'spec'); -sensitivity <- unlist(performance@y.values)[index]; -specificity <- unlist(performance@x.values)[index]; - -# Convert to stability units -threshold.stability <- 1 - threshold; -cat(sprintf('Threshold = %.3f\n', threshold.stability)); -cat(sprintf('Training sensitivity = %.3f\n', sensitivity)); -cat(sprintf('Training specificity = %.3f\n', specificity)); +if (!is.null(specificity) && is.numeric(specificity)) { + cat('Target specificity =', specificity, '\n'); + operating.index <- max(which(unlist(performance@x.values) > specificity)); + threshold <- 1 - unlist(performance@alpha.values)[operating.index]; + sensitivity <- unlist(performance@y.values)[operating.index]; + + cat(sprintf('Threshold = %.3f\n', threshold)); + cat(sprintf('Projected sensitivity = %.3f\n', sensitivity)); +} else if (!is.null(threshold) && is.numeric(threshold)) { + cat('Target threshold =', threshold, '\n'); + operating.index <- min(which(unlist(performance@alpha.values) <= 1 - threshold)); + specificity <- unlist(performance@x.values)[operating.index]; + sensitivity <- unlist(performance@y.values)[operating.index]; + + cat(sprintf('Projected sensitivity = %.3f\n', sensitivity)); + cat(sprintf('Projected specificity = %.3f\n', specificity)); +} else { + performance.f <- performance(rf.model$prediction, measure = 'f'); + operating.index <- which.max(unlist(performance.f@y.values)); + threshold <- 1 - unlist(performance.f@x.values)[operating.index]; + sensitivity <- unlist(performance@y.values)[operating.index]; + specificity <- unlist(performance@x.values)[operating.index]; + + cat(sprintf('Default threshold = %.3f\n', threshold)); + cat(sprintf('Projected sensitivity = %.3f\n', sensitivity)); + cat(sprintf('Projected specificity = %.3f\n', specificity)); + } 
-stability.classification <- ifelse(stability$predictions[, 1] < threshold.stability, 1, 0); +stability.classification <- ifelse(stability$predictions[, 1] < threshold, 1, 0); cat(sprintf('Proportion predicted unstable = %.3f\n\n', mean(stability.classification))); stability.classification <- as.factor(stability.classification); ################################################################################################### # Output stability scores ################################################################################################### +this.filename <- sub('_LiftOver-(.*)_annotated.Rds', '_StableLift-\\1_stability-scores.tsv', features.dt.path); annotation.dt <- data.table( CHROM = features.dt$CHROM, POS = features.dt$POS, STABILITY_SCORE = format(round(stability$predictions[, 1], 4), nsmall = 4), STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE') ); -sort.genomic.dt(annotation.dt); -fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = FALSE); +fwrite(annotation.dt, file = this.filename, sep = '\t', col.names = FALSE); From 4482a22fabec5a3ee5ce9187ac0d9902a3d3679b Mon Sep 17 00:00:00 2001 From: nkwang Date: Thu, 29 Aug 2024 03:07:12 -0700 Subject: [PATCH 02/16] update script names --- module/predict_stability.nf | 4 ++-- module/scripts/predict-variant-stability.R | 4 ++-- module/snv_workflow.nf | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/module/predict_stability.nf b/module/predict_stability.nf index 1ae6987..d675fdd 100644 --- a/module/predict_stability.nf +++ b/module/predict_stability.nf @@ -19,10 +19,10 @@ process predict_stability_StableLift { script: """ - Rscript "${moduleDir}/scripts/predict-liftover-stability.R" \ + Rscript "${moduleDir}/scripts/predict-variant-stability.R" \ + --variant-caller "${variant_caller}" \ --features-dt "${features_rds}" \ --rf-model "${rf_model}" \ - --variant-caller "${variant_caller}" \ --output-tsv "stability.tsv" """ diff --git 
a/module/scripts/predict-variant-stability.R b/module/scripts/predict-variant-stability.R index 177a4d1..873f2c8 100644 --- a/module/scripts/predict-variant-stability.R +++ b/module/scripts/predict-variant-stability.R @@ -23,6 +23,7 @@ parser$add_argument('--features-dt', type = 'character', help = 'Processed Rds f parser$add_argument('--rf-model', type = 'character', help = 'Pre-trained random forest model Rds file'); parser$add_argument('--specificity', type = 'numeric', help = 'Target specificity based on whole genome validation set, overrides `--threshold`'); parser$add_argument('--threshold', type = 'numeric', help = 'Stability score threshold, default based on maximizing F1-score in whole genome validation set'); +parser$add_argument('--output-tsv', type = 'character', help = 'Output TSV with predicted Stability Scores'); args <- parser$parse_args(); # Save command line arguments @@ -103,11 +104,10 @@ stability.classification <- as.factor(stability.classification); ################################################################################################### # Output stability scores ################################################################################################### -this.filename <- sub('_LiftOver-(.*)_annotated.Rds', '_StableLift-\\1_stability-scores.tsv', features.dt.path); annotation.dt <- data.table( CHROM = features.dt$CHROM, POS = features.dt$POS, STABILITY_SCORE = format(round(stability$predictions[, 1], 4), nsmall = 4), STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE') ); -fwrite(annotation.dt, file = this.filename, sep = '\t', col.names = FALSE); +fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = FALSE); diff --git a/module/snv_workflow.nf b/module/snv_workflow.nf index f298b7e..d5f896b 100644 --- a/module/snv_workflow.nf +++ b/module/snv_workflow.nf @@ -64,7 +64,7 @@ process extract_VCF_features_StableLift { script: """ - Rscript "${moduleDir}/scripts/extract-vcf-features.R" \ + Rscript 
"${moduleDir}/scripts/extract-VCF-features.R" \ --input-vcf "${vcf}" \ --variant-caller ${params.variant_caller} \ --output-rds "features.Rds" From 7c8456e57b5c477573c98776f54db43c52e83a56 Mon Sep 17 00:00:00 2001 From: nkwang Date: Thu, 29 Aug 2024 03:14:44 -0700 Subject: [PATCH 03/16] remove test paths --- module/scripts/extract-VCF-features-SV.R | 8 -------- 1 file changed, 8 deletions(-) diff --git a/module/scripts/extract-VCF-features-SV.R b/module/scripts/extract-VCF-features-SV.R index b378a9c..5a9fea1 100644 --- a/module/scripts/extract-VCF-features-SV.R +++ b/module/scripts/extract-VCF-features-SV.R @@ -34,14 +34,6 @@ for (arg in names(args)) { assign(gsub('_', '.', arg), args[[arg]]); } -input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/bcftools-merge/CPCG-40QC_GRCh38/CPCG-40QC_GRCh38_regenotype-gSV_delly_bcftools-merge_delly-filter-germline.vcf.gz'; -source.build <- 'GRCh38'; -chain.file <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/publish/resource/hg38ToHg19.over.chain'; -header.contigs <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/publish/resource/GRCh37-vcf-header-contigs.txt'; -gnomad.rds <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/publish/resource/gnomad.v4.0.sv.Rds'; -output.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/stableLift/train_CPCG-40QC_Delly2_38_test/CPCG-40QC_Delly2_LiftOver-GRCh37.vcf.gz'; -output.rds <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/stableLift/train_CPCG-40QC_Delly2_38_test/CPCG-40QC_Delly2_LiftOver-GRCh37_annotated.Rds'; - ################################################################################################### # Functions ################################################################################################### From c6349c455d10dbe3eb6507dbed0c8752846e31d9 Mon Sep 17 00:00:00 2001 From: nkwang Date: Thu, 29 Aug 2024 03:23:17 -0700 Subject: [PATCH 04/16] 
remove line --- module/scripts/extract-VCF-features-SV.R | 1 - 1 file changed, 1 deletion(-) diff --git a/module/scripts/extract-VCF-features-SV.R b/module/scripts/extract-VCF-features-SV.R index 5a9fea1..fcdc297 100644 --- a/module/scripts/extract-VCF-features-SV.R +++ b/module/scripts/extract-VCF-features-SV.R @@ -186,7 +186,6 @@ for (i in seq_len(nrow(input.fix))) { this.INFO[['CHR2']] <- grange.target.dt[i, CHR2]; this.INFO[['POS2']] <- grange.target.dt[i, POS2]; } - if (this.INFO[['SVTYPE']] %in% ) this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '=')); this.INFO <- paste(this.INFO, collapse = ';'); this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO); From 471313cb55dec779223227b089dd831b02492ac8 Mon Sep 17 00:00:00 2001 From: nkwang Date: Thu, 29 Aug 2024 04:13:39 -0700 Subject: [PATCH 05/16] sort stability.tsv --- module/scripts/extract-VCF-features-SV.R | 2 +- module/scripts/predict-variant-stability.R | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/module/scripts/extract-VCF-features-SV.R b/module/scripts/extract-VCF-features-SV.R index fcdc297..264ce4e 100644 --- a/module/scripts/extract-VCF-features-SV.R +++ b/module/scripts/extract-VCF-features-SV.R @@ -197,7 +197,7 @@ for (i in seq_len(nrow(input.fix))) { lifted.vcf <- input.vcf; lifted.vcf@fix <- as.matrix(input.fix); lifted.vcf@gt <- as.matrix(input.gt); -lifted.vcf@meta <- lifted.vcf@meta[!grepl("^##(contig|reference)", lifted.vcf@meta)]; +lifted.vcf@meta <- lifted.vcf@meta[!grepl('^##(contig|reference)', lifted.vcf@meta)]; lifted.vcf@meta <- c(lifted.vcf@meta, header.contigs); write.vcf(lifted.vcf, output.vcf); diff --git a/module/scripts/predict-variant-stability.R b/module/scripts/predict-variant-stability.R index 873f2c8..db1b7ea 100644 --- a/module/scripts/predict-variant-stability.R +++ b/module/scripts/predict-variant-stability.R @@ -110,4 +110,11 @@ annotation.dt <- data.table( STABILITY_SCORE = 
format(round(stability$predictions[, 1], 4), nsmall = 4), STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE') ); +if (substr(1, 3, annotation.dt[1, CHROM]) == 'chr') { + annotation.dt[, CHROM := factor(CHROM, levels = paste0('chr', c(1:22, 'X', 'Y')))]; +} else { + annotation.dt[, CHROM := factor(CHROM, levels = c(1:22, 'X', 'Y'))]; + } +setorder(annotation.dt, CHROM, POS); + fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = FALSE); From 6599e85cbadd3efac875321a70ebf3a25a86ff77 Mon Sep 17 00:00:00 2001 From: nkwang Date: Thu, 29 Aug 2024 15:51:19 -0700 Subject: [PATCH 06/16] add optional parameters for predict-variant-stability.R --- README.md | 8 +++++--- config/default.config | 4 ++++ config/template.config | 15 ++++++++++++--- module/predict_stability.nf | 5 +++-- 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 67d27eb..548d356 100644 --- a/README.md +++ b/README.md @@ -98,6 +98,8 @@ input: | Optional Parameter | Type | Default | Description | | --------------------------- | ----------------------------------------------------------------------------------------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `target_threshold` | numeric | `""` | Target Stability Score threshold for variant filtering: [0, 1] | +| `target_specificity` | numeric | `""` | Target specificity based on whole genome validation set for variant filtering: [0, 1] | | `work_dir` | path | `/scratch/$SLURM_JOB_ID` | Path of working directory for Nextflow. 
When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With `ucla_cds`, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. | | `save_intermediate_files` | boolean | false | If set, save output files from intermediate pipeline processes. | | `min_cpus` | int | 1 | Minimum number of CPUs that can be assigned to each process. | @@ -117,13 +119,13 @@ The docker images in the following table are generally defined like `docker_imag * Change `params.docker_container_registry`. This will affect all of the images (except for GATK). * Change `params._version`. This will pull a different version of the same image from the registry. -* Change `params.docker_image_`. This will explicitly set the image to use, ignoring `docker_container_registry` and `_version`, and thus requires that the docker tag be explicitly set (e.g. `broadinstitute/gatk:4.2.4.1`). +* Change `params.docker_image_`. This will explicitly set the image to use, ignoring `docker_container_registry` and `_version`, and thus requires that the docker tag be explicitly set (e.g. `broadinstitute/gatk:4.4.0.0`). | Tool Parameter | Version Parameter | Default | Notes | | ------------------------ | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------------- | | `docker_image_bcftools` | `bcftools_version` | `ghcr.io/uclahs-cds/bcftools-score:1.20_score-1.20-20240505` | This image must have both BCFtools and the score plugins available. 
| | `docker_image_bedtools` | `bedtools_version` | `ghcr.io/uclahs-cds/bedtools:2.31.0` | | -| `docker_image_gatk` | `gatk_version` | `broadinstitute/gatk:4.2.4.1` | | +| `docker_image_gatk` | `gatk_version` | `broadinstitute/gatk:4.4.0.0` | | | `docker_image_pipeval` | `pipeval_version` | `ghcr.io/uclahs-cds/pipeval:5.0.0-rc.3` | | | `docker_image_samtools` | `samtools_version` | `ghcr.io/uclahs-cds/samtools:1.20` | | | `doker_image_stablelift` | `stablelift_version` | `ghcr.io/uclahs-cds/stablelift:FIXME` | This image is built and maintained via this repository. | @@ -191,7 +193,7 @@ Please see list of [Contributors](https://github.com/uclahs-cds/pipeline-StableL pipeline-StableLift is licensed under the GNU General Public License version 2. See the file LICENSE for the terms of the GNU GPL license. -StableLift is a machine learning approach designed to predict variant stability across reference genome builds, supplementing LiftOver coordinate conversion and increasing portability of variant calls. +StableLift is a machine learning approach designed to predict variant stability across reference genome builds, supplementing LiftOver coordinate conversion to increase the portability of variant calls. Copyright (C) 2024 University of California Los Angeles ("Boutros Lab") All rights reserved. 
diff --git a/config/default.config b/config/default.config index 4d808e9..1224c7d 100644 --- a/config/default.config +++ b/config/default.config @@ -38,6 +38,10 @@ params { src_fasta_dict = "${ -> Nextflow.file(params.src_fasta_ref).resolveSibling(Nextflow.file(params.src_fasta_ref).getBaseName() + '.dict') }" dest_fasta_dict = "${ -> Nextflow.file(params.dest_fasta_ref).resolveSibling(Nextflow.file(params.dest_fasta_ref).getBaseName() + '.dict') }" + + // Pass empty values to `predict-variant-stability.R` if not specified by user + target_threshold = "" + target_specificity = "" } // Process specific scope diff --git a/config/template.config b/config/template.config index e30e086..de829c0 100644 --- a/config/template.config +++ b/config/template.config @@ -5,17 +5,26 @@ includeConfig "${projectDir}/config/default.config" includeConfig "${projectDir}/config/methods.config" includeConfig "${projectDir}/nextflow.config" - // Inputs/parameters of the pipeline params { // input/output locations - output_dir = 'where/to/save/outputs/' + output_dir = "where/to/save/outputs/" - // Choices: ["Mutect2", "HaplotypeCaller"] + // Choices: ["HaplotypeCaller", "Mutect2", "Strelka2", "SomaticSniper", "Muse2", "Delly2"] variant_caller = "Mutect2" + // Path to pre-trained random forest model rf_model = "" + // Optional parameter specifying target Stability Score threshold for variant filtering + // Default behavior without `target_threshold` or `target_specificity` specified + // uses threshold maximizing F1-score in whole genome validation set + target_threshold = "" // numeric [0, 1] + + // Optional parameter specifying target specificity for variant filtering based on whole genome validation set + // Overrides `target_threshold` + target_specificity = "" // numeric [0, 1] + // Reference files funcotator_data { data_source = "/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s" diff --git a/module/predict_stability.nf 
b/module/predict_stability.nf index d675fdd..67252fa 100644 --- a/module/predict_stability.nf +++ b/module/predict_stability.nf @@ -23,7 +23,9 @@ process predict_stability_StableLift { --variant-caller "${variant_caller}" \ --features-dt "${features_rds}" \ --rf-model "${rf_model}" \ - --output-tsv "stability.tsv" + --output-tsv "stability.tsv" \ + --specificity "${params.target_specificity}" \ + --threshold "${params.target_threshold}" """ stub: @@ -43,7 +45,6 @@ process run_apply_stability_annotations { input: tuple val(sample_id), path(annotated_vcf, stageAs: 'inputs/*'), - // FIXME Should there be an annotated_vcf_tbi? path(stability_tsv, stageAs: 'inputs/*'), path(stability_tsv_tbi, stageAs: 'inputs/*') From a225618e5a513f7963a71b0cf29f0c0e316815ab Mon Sep 17 00:00:00 2001 From: Nicholas Wang <47401561+nkwang24@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:13:54 -0700 Subject: [PATCH 07/16] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 548d356..e52acd3 100644 --- a/README.md +++ b/README.md @@ -51,10 +51,11 @@ If you are using the UCLA Azure cluster, please use the [submission script](http - For SNVs, convert variant coordinates using the `BCFtools` LiftOver plugin with UCSC chain files. - For SVs, convert variant breakpoint coordinates using custom R script with UCSC chain files and `rtracklayer` and `GenomicRanges` R packages. -### 2. Variant annotation +### 2. Variant annotation* - For SNVs, add dbSNP, GENCODE, and HGNC annotations using GATK's Funcotator. Add trinucleotide context and RepeatMasker intervals with `bedtools`. - For SVs, annotate variants with population allele frequency from the gnomAD-SV v4 database. +- *Variant annotation occurs prior to LiftOver when converting from GRCh38 -> GRCh37 ### 3. 
Predict variant stability From 1ae5c8e3af0ab7f6b123c048fe70637120e55a7f Mon Sep 17 00:00:00 2001 From: Nicholas Wang <47401561+nkwang24@users.noreply.github.com> Date: Thu, 29 Aug 2024 16:17:48 -0700 Subject: [PATCH 08/16] Update pipeline.mmd --- docs/pipeline.mmd | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/pipeline.mmd b/docs/pipeline.mmd index 0bc792b..9db31c5 100644 --- a/docs/pipeline.mmd +++ b/docs/pipeline.mmd @@ -64,7 +64,7 @@ flowchart TD --> bcftools_annotate2["`bcftools annotate*Trinucleotide*`"]:::bcftools end - blocknote["`**Note:** Annotation is performed before Liftover when lifting backward`"] + blocknote["`**Note:** Annotation is performed prior to LiftOver when converting from GRCh38 -> GRCh37`"] bcftools_liftover ---> gatk_func bcftools_annotate2 --> r_extract_snv[extract-VCF-features.R]:::R @@ -79,7 +79,7 @@ flowchart TD joinpaths ---> r_predict_stability subgraph Predict Stability ["`        **Predict Stability**`"] - r_predict_stability[predict-liftover-stability.R]:::R + r_predict_stability[predict-variant-stability.R]:::R --> bcftools_annotate3["`bcftools annotate*Stability*`"]:::bcftools rf_model([rf_model]):::input .-> r_predict_stability From 690b8878146b2908e8b7a0540c1de8d9c8f0818b Mon Sep 17 00:00:00 2001 From: Nicholas Wiltsie Date: Fri, 30 Aug 2024 09:17:02 -0700 Subject: [PATCH 09/16] Incorporate new bi-directional liftover for SV branch --- main.nf | 6 ++- module/sv_workflow.nf | 85 ++++++++++++++----------------------------- 2 files changed, 32 insertions(+), 59 deletions(-) diff --git a/main.nf b/main.nf index abac9ca..7d9ffee 100644 --- a/main.nf +++ b/main.nf @@ -42,6 +42,8 @@ log.info """\ chain_file: ${params.chain_file} repeat_bed: ${params.repeat_bed} + header_contigs: ${params.header_contigs} + funcotator_data: data_source: ${params.funcotator_data.data_source} src_reference_id: ${params.funcotator_data.src_reference_id} @@ -149,10 +151,10 @@ workflow { // Take the SV branch 
workflow_extract_sv_annotations( validated_vcf_tuple, + input_ch_src_sequence, Channel.value(params.header_contigs), Channel.value(params.gnomad_rds), - Channel.value(params.chain_file), - Channel.value(params.variant_caller) + Channel.value(params.chain_file) ) workflow_extract_sv_annotations.out.liftover_vcf.set { liftover_vcf } diff --git a/module/sv_workflow.nf b/module/sv_workflow.nf index bdf2621..8cca9e8 100644 --- a/module/sv_workflow.nf +++ b/module/sv_workflow.nf @@ -1,35 +1,39 @@ - -process liftover_SV_StableLift{ +process run_SV_liftover_and_annotate { container params.docker_image_stablelift publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", - pattern: "liftover.vcf.gz", + pattern: "liftover.{vcf.gz,Rds}", mode: "copy", enabled: params.save_intermediate_files, saveAs: { "LiftOver-${sample_id}.vcf.gz" } input: - tuple val(sample_id), - path(vcf, stageAs: 'inputs/*'), - path(index, stageAs: 'inputs/*') + tuple val(sample_id), path(vcf, stageAs: 'inputs/*') + val(source_grch_label) path (header_contigs) path (chain_file) + path (gnomad_rds) output: - tuple val(sample_id), path('liftover.vcf.gz'), emit: liftover_vcf + tuple val(sample_id), path('annotations.vcf.gz'), emit: liftover_vcf + tuple val(sample_id), path('annotations.Rds'), emit: r_annotations script: """ - Rscript "${moduleDir}/scripts/liftover-Delly2-vcf.R" \ + Rscript "${moduleDir}/scripts/extract-VCF-features-SV.R" \ --input-vcf "${vcf}" \ - --header-contigs "${header_contigs}" \ + --source-build "${source_grch_label}" \ --chain-file "${chain_file}" \ - --output "liftover.vcf.gz" + --header-contigs "${header_contigs}" \ + --gnomad-rds ${gnomad_rds} \ + --output-vcf "annotations.vcf.gz" \ + --output-rds "annotations.Rds" """ stub: """ - touch "liftover.vcf.gz" + touch "annotations.Rds" + touch "annotations.vcf.gz" """ } @@ -62,66 +66,33 @@ process run_sort_BCFtools { """ } -process annotate_gnomAD_StableLift { - container params.docker_image_stablelift 
- - publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", - pattern: "annotations.Rds", - mode: "copy", - enabled: params.save_intermediate_files, - saveAs: { "LiftOver-${sample_id}-${variant_caller}.Rds" } - - input: - tuple val(sample_id), path(vcf, stageAs: 'inputs/*') - path (gnomad_rds) - val (variant_caller) - - output: - tuple val(sample_id), path('annotations.Rds'), emit: r_annotations - - script: - """ - Rscript "${moduleDir}/scripts/extract-vcf-features-SV.R" \ - --variant-caller "${variant_caller}" \ - --input-vcf "${vcf}" \ - --output-rds "annotations.Rds" \ - --gnomad-rds ${gnomad_rds} - """ - - stub: - """ - touch "annotations.Rds" - """ -} - workflow workflow_extract_sv_annotations { take: vcf_with_sample_id + src_sequence header_contigs gnomad_rds chain_file - variant_caller main: - // Step 1: Liftover - liftover_SV_StableLift( - vcf_with_sample_id, + run_SV_liftover_and_annotate( + // We don't need the index file + vcf_with_sample_id.map{ [it[0], it[1]] }, + + // We only need the sample ID + src_sequence.map{ ["hg19": "GRCh37", "hg38": "GRCh38"][it[0]] }, + header_contigs, - chain_file - ) - run_sort_BCFtools( - liftover_SV_StableLift.out.liftover_vcf + chain_file, + gnomad_rds ) - // Step 2: Extract features - annotate_gnomAD_StableLift( - run_sort_BCFtools.out.sorted_vcf, - gnomad_rds, - variant_caller + run_sort_BCFtools( + run_SV_liftover_and_annotate.out.liftover_vcf ) emit: liftover_vcf = run_sort_BCFtools.out.sorted_vcf - r_annotations = annotate_gnomAD_StableLift.out.r_annotations + r_annotations = run_SV_liftover_and_annotate.out.r_annotations } From 64440a12eabe60f7ae834680860995acfb19c368 Mon Sep 17 00:00:00 2001 From: Nicholas Wiltsie Date: Fri, 30 Aug 2024 09:28:24 -0700 Subject: [PATCH 10/16] Clean up logic for new optional parameters --- config/default.config | 4 ++-- config/schema.yaml | 17 +++++++++++++++++ config/template.config | 18 ++++++++++-------- module/predict_stability.nf | 7 +++++-- 
4 files changed, 34 insertions(+), 12 deletions(-) diff --git a/config/default.config b/config/default.config index 1224c7d..770eed6 100644 --- a/config/default.config +++ b/config/default.config @@ -40,8 +40,8 @@ params { dest_fasta_dict = "${ -> Nextflow.file(params.dest_fasta_ref).resolveSibling(Nextflow.file(params.dest_fasta_ref).getBaseName() + '.dict') }" // Pass empty values to `predict-variant-stability.R` if not specified by user - target_threshold = "" - target_specificity = "" + target_threshold = null + target_specificity = null } // Process specific scope diff --git a/config/schema.yaml b/config/schema.yaml index 76b8a2f..db9dd58 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -99,3 +99,20 @@ input: mode: 'r' required: true help: 'Input dataset supplied by input yaml' + +target_threshold: + type: 'Number' + required: false + help: >- + Optional parameter specifying target Stability Score threshold for variant + filtering Default behavior without `target_threshold` or + `target_specificity` specified uses threshold maximizing F1-score in whole + genome validation set'. Must be in the range [0, 1]. + +target_specificity: + type: 'Number' + required: false + help: >- + Optional parameter specifying target specificity for variant filtering + based on whole genome validation set. Overrides `target_threshold`. Must be + in the range [0, 1]. 
diff --git a/config/template.config b/config/template.config index de829c0..c811315 100644 --- a/config/template.config +++ b/config/template.config @@ -16,14 +16,16 @@ params { // Path to pre-trained random forest model rf_model = "" - // Optional parameter specifying target Stability Score threshold for variant filtering - // Default behavior without `target_threshold` or `target_specificity` specified - // uses threshold maximizing F1-score in whole genome validation set - target_threshold = "" // numeric [0, 1] - - // Optional parameter specifying target specificity for variant filtering based on whole genome validation set - // Overrides `target_threshold` - target_specificity = "" // numeric [0, 1] + // Optional parameter specifying target Stability Score threshold for + // variant filtering. Default behavior without `target_threshold` or + // `target_specificity` specified uses threshold maximizing F1-score in + // whole genome validation set. Must be in the range [0.0, 1.0]. + target_threshold = null + + // Optional parameter specifying target specificity for variant filtering + // based on whole genome validation set. Overrides `target_threshold`. Must + // be in the range [0.0, 1.0]. + target_specificity = null // Reference files funcotator_data { diff --git a/module/predict_stability.nf b/module/predict_stability.nf index 67252fa..cc59869 100644 --- a/module/predict_stability.nf +++ b/module/predict_stability.nf @@ -18,14 +18,17 @@ process predict_stability_StableLift { tuple val(sample_id), path("stability.tsv"), emit: stability_tsv script: + spec_arg = params.target_specificity ? "--specificity \"${params.target_specificity}\"" : "" + thresh_arg = params.target_threshold ? 
"--threshold \"${params.target_threshold}\"" : "" + """ Rscript "${moduleDir}/scripts/predict-variant-stability.R" \ --variant-caller "${variant_caller}" \ --features-dt "${features_rds}" \ --rf-model "${rf_model}" \ --output-tsv "stability.tsv" \ - --specificity "${params.target_specificity}" \ - --threshold "${params.target_threshold}" + ${spec_arg} \ + ${thresh_arg} """ stub: From c58e0fd8054c647fb3103521c225ceb89c3381b0 Mon Sep 17 00:00:00 2001 From: Nicholas Wiltsie Date: Fri, 30 Aug 2024 09:34:32 -0700 Subject: [PATCH 11/16] Never set variable to null --- config/default.config | 4 ---- config/template.config | 4 ++-- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/config/default.config b/config/default.config index 770eed6..4d808e9 100644 --- a/config/default.config +++ b/config/default.config @@ -38,10 +38,6 @@ params { src_fasta_dict = "${ -> Nextflow.file(params.src_fasta_ref).resolveSibling(Nextflow.file(params.src_fasta_ref).getBaseName() + '.dict') }" dest_fasta_dict = "${ -> Nextflow.file(params.dest_fasta_ref).resolveSibling(Nextflow.file(params.dest_fasta_ref).getBaseName() + '.dict') }" - - // Pass empty values to `predict-variant-stability.R` if not specified by user - target_threshold = null - target_specificity = null } // Process specific scope diff --git a/config/template.config b/config/template.config index c811315..486b3e6 100644 --- a/config/template.config +++ b/config/template.config @@ -20,12 +20,12 @@ params { // variant filtering. Default behavior without `target_threshold` or // `target_specificity` specified uses threshold maximizing F1-score in // whole genome validation set. Must be in the range [0.0, 1.0]. - target_threshold = null + // target_threshold = 0.5 // Optional parameter specifying target specificity for variant filtering // based on whole genome validation set. Overrides `target_threshold`. 
Must // be in the range [0.0, 1.0]. - target_specificity = null + // target_specificity = 0.5 // Reference files funcotator_data { From 6752903047ffd1e562b29c3fe5e966b357080897 Mon Sep 17 00:00:00 2001 From: nkwang Date: Fri, 30 Aug 2024 11:59:25 -0700 Subject: [PATCH 12/16] fix stability.tsv sorting --- module/scripts/predict-variant-stability.R | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/module/scripts/predict-variant-stability.R b/module/scripts/predict-variant-stability.R index db1b7ea..730567c 100644 --- a/module/scripts/predict-variant-stability.R +++ b/module/scripts/predict-variant-stability.R @@ -31,6 +31,11 @@ for (arg in names(args)) { assign(gsub('_', '.', arg), args[[arg]]); } +variant.caller <- 'Mutect2'; +features.dt <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/test/output/TCGA-SARC_10TN-WGS_GRCh37-to-GRCh38/TCGA-SARC_10TN-WGS_GRCh37_sSNV_Mutect2_LiftOver-GRCh38_annotated.Rds'; +rf.model <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/publish/model/GRCh37-to-GRCh38/RF-model_GRCh37-to-GRCh38_sSNV_Mutect2.Rds'; +output.tsv <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/test/output/TCGA-SARC_10TN-WGS_GRCh37-to-GRCh38/TCGA-SARC_10TN-WGS_GRCh37_sSNV_Mutect2_StableLift-GRCh38_stability-scores.tsv'; + ################################################################################################### # Load data ################################################################################################### @@ -110,11 +115,6 @@ annotation.dt <- data.table( STABILITY_SCORE = format(round(stability$predictions[, 1], 4), nsmall = 4), STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE') ); -if (substr(1, 3, annotation.dt[1, CHROM]) == 'chr') { - annotation.dt[, CHROM := factor(CHROM, levels = paste0('chr', c(1:22, 'X', 'Y')))]; -} else { - annotation.dt[, CHROM := factor(CHROM, levels = c(1:22, 'X', 'Y'))]; - } setorder(annotation.dt, CHROM, POS); 
fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = FALSE); From b9a0209e1dff47524ae5f13ff7bbbd8523b95c84 Mon Sep 17 00:00:00 2001 From: nkwang Date: Fri, 30 Aug 2024 12:11:05 -0700 Subject: [PATCH 13/16] remove test paths --- module/scripts/predict-variant-stability.R | 5 ----- 1 file changed, 5 deletions(-) diff --git a/module/scripts/predict-variant-stability.R b/module/scripts/predict-variant-stability.R index 730567c..c8b271d 100644 --- a/module/scripts/predict-variant-stability.R +++ b/module/scripts/predict-variant-stability.R @@ -31,11 +31,6 @@ for (arg in names(args)) { assign(gsub('_', '.', arg), args[[arg]]); } -variant.caller <- 'Mutect2'; -features.dt <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/test/output/TCGA-SARC_10TN-WGS_GRCh37-to-GRCh38/TCGA-SARC_10TN-WGS_GRCh37_sSNV_Mutect2_LiftOver-GRCh38_annotated.Rds'; -rf.model <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/publish/model/GRCh37-to-GRCh38/RF-model_GRCh37-to-GRCh38_sSNV_Mutect2.Rds'; -output.tsv <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/test/output/TCGA-SARC_10TN-WGS_GRCh37-to-GRCh38/TCGA-SARC_10TN-WGS_GRCh37_sSNV_Mutect2_StableLift-GRCh38_stability-scores.tsv'; - ################################################################################################### # Load data ################################################################################################### From 66831384b1ba3919f5ba9c74b32b2665e7486d84 Mon Sep 17 00:00:00 2001 From: Nicholas Wiltsie Date: Fri, 30 Aug 2024 14:26:58 -0700 Subject: [PATCH 14/16] Add RangedNumber custom type, fix bug when optional parameters are 0 --- config/custom_schema_types.config | 24 +++++++++++++++++++++++- config/schema.yaml | 13 ++++++++----- main.nf | 2 +- module/predict_stability.nf | 4 ++-- 4 files changed, 34 insertions(+), 9 deletions(-) diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config index bcf24fc..a38d8d9 100644 --- 
a/config/custom_schema_types.config +++ b/config/custom_schema_types.config @@ -43,7 +43,29 @@ custom_schema_types { } } + /** + * Check that the input is numeric in the appropriate range. + */ + ranged_number = { Map options, String name, Map properties -> + if (!(properties.containsKey('min') && properties['min'] in Number)) { + throw new Exception('`min` parameter misconfigured - must be a Number.') + } + + if (!(properties.containsKey('max') && properties['max'] in Number)) { + throw new Exception('`max` parameter misconfigured - must be a Number.') + } + + if (!(options[name] in Number)) { + throw new Exception("${name} must be a Number, not ${options[name].getClass()}") + } + + if (options[name] < properties.min || properties.max < options[name]) { + throw new Exception("${name}=${options[name]} is not in range [${properties.min}, ${properties.max}]") + } + } + types = [ - 'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source + 'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source, + 'RangedNumber': custom_schema_types.ranged_number ] } diff --git a/config/schema.yaml b/config/schema.yaml index db9dd58..8e21f40 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -101,18 +101,21 @@ input: help: 'Input dataset supplied by input yaml' target_threshold: - type: 'Number' + type: 'RangedNumber' required: false + min: 0 + max: 1 help: >- Optional parameter specifying target Stability Score threshold for variant filtering Default behavior without `target_threshold` or `target_specificity` specified uses threshold maximizing F1-score in whole - genome validation set'. Must be in the range [0, 1]. + genome validation set. target_specificity: - type: 'Number' + type: 'RangedNumber' required: false + min: 0 + max: 1 help: >- Optional parameter specifying target specificity for variant filtering - based on whole genome validation set. Overrides `target_threshold`. Must be - in the range [0, 1]. 
+ based on whole genome validation set. Overrides `target_threshold`. diff --git a/main.nf b/main.nf index 7d9ffee..0526d8f 100644 --- a/main.nf +++ b/main.nf @@ -42,7 +42,7 @@ log.info """\ chain_file: ${params.chain_file} repeat_bed: ${params.repeat_bed} - header_contigs: ${params.header_contigs} + header_contigs: ${params.getOrDefault('header_contigs', null)} funcotator_data: data_source: ${params.funcotator_data.data_source} diff --git a/module/predict_stability.nf b/module/predict_stability.nf index cc59869..5ab6dac 100644 --- a/module/predict_stability.nf +++ b/module/predict_stability.nf @@ -18,8 +18,8 @@ process predict_stability_StableLift { tuple val(sample_id), path("stability.tsv"), emit: stability_tsv script: - spec_arg = params.target_specificity ? "--specificity \"${params.target_specificity}\"" : "" - thresh_arg = params.target_threshold ? "--threshold \"${params.target_threshold}\"" : "" + spec_arg = (params.getOrDefault('target_specificity', null) != null) ? "--specificity \"${params.get('target_specificity')}\"" : "" + thresh_arg = (params.getOrDefault('target_threshold', null) != null) ? 
"--threshold \"${params.get('target_threshold')}\"" : "" """ Rscript "${moduleDir}/scripts/predict-variant-stability.R" \ From fa470209cb8db5056ef1cc49a0c9987f3a81980e Mon Sep 17 00:00:00 2001 From: Nicholas Wiltsie Date: Fri, 30 Aug 2024 14:28:06 -0700 Subject: [PATCH 15/16] s/run_SV_liftover_and_annotate/liftover_annotate_SV_StableLift --- module/sv_workflow.nf | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/module/sv_workflow.nf b/module/sv_workflow.nf index 8cca9e8..432da05 100644 --- a/module/sv_workflow.nf +++ b/module/sv_workflow.nf @@ -1,4 +1,4 @@ -process run_SV_liftover_and_annotate { +process liftover_annotate_SV_StableLift { container params.docker_image_stablelift publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", @@ -76,7 +76,7 @@ workflow workflow_extract_sv_annotations { main: - run_SV_liftover_and_annotate( + liftover_annotate_SV_StableLift( // We don't need the index file vcf_with_sample_id.map{ [it[0], it[1]] }, @@ -89,10 +89,10 @@ workflow workflow_extract_sv_annotations { ) run_sort_BCFtools( - run_SV_liftover_and_annotate.out.liftover_vcf + liftover_annotate_SV_StableLift.out.liftover_vcf ) emit: liftover_vcf = run_sort_BCFtools.out.sorted_vcf - r_annotations = run_SV_liftover_and_annotate.out.r_annotations + r_annotations = liftover_annotate_SV_StableLift.out.r_annotations } From 9418d9c999ac1f76d6f783fef236e31d9ae71d02 Mon Sep 17 00:00:00 2001 From: Nicholas Wiltsie Date: Fri, 30 Aug 2024 14:29:02 -0700 Subject: [PATCH 16/16] Update CHANGELOG --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 11aa2a1..1fe8ca2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Add workflow for SV caller (Delly2) - Add pipeline diagram - Add reverse liftover (GRCh38 -> GRCh37) for SNV branch +- Add reverse liftover (GRCh38 -> GRCh37) for SV branch 
+- Add optional `target_threshold` and `target_specificity` parameters ### Changed