diff --git a/CHANGELOG.md b/CHANGELOG.md index 11aa2a1..1fe8ca2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,6 +15,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Add workflow for SV caller (Delly2) - Add pipeline diagram - Add reverse liftover (GRCh38 -> GRCh37) for SNV branch +- Add reverse liftover (GRCh38 -> GRCh37) for SV branch +- Add optional `target_threshold` and `target_specificity` parameters ### Changed diff --git a/README.md b/README.md index 67d27eb..e52acd3 100644 --- a/README.md +++ b/README.md @@ -51,10 +51,11 @@ If you are using the UCLA Azure cluster, please use the [submission script](http - For SNVs, convert variant coordinates using the `BCFtools` LiftOver plugin with UCSC chain files. - For SVs, convert variant breakpoint coordinates using custom R script with UCSC chain files and `rtracklayer` and `GenomicRanges` R packages. -### 2. Variant annotation +### 2. Variant annotation* - For SNVs, add dbSNP, GENCODE, and HGNC annotations using GATK's Funcotator. Add trinucleotide context and RepeatMasker intervals with `bedtools`. - For SVs, annotate variants with population allele frequency from the gnomAD-SV v4 database. +- *Variant annotation occurs prior to LiftOver when converting from GRCh38 -> GRCh37 ### 3. 
Predict variant stability @@ -98,6 +99,8 @@ input: | Optional Parameter | Type | Default | Description | | --------------------------- | ----------------------------------------------------------------------------------------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `target_threshold` | numeric | `""` | Target Stability Score threshold for variant filtering: [0, 1] | +| `target_specificity` | numeric | `""` | Target specificity based on whole genome validation set for variant filtering: [0, 1] | | `work_dir` | path | `/scratch/$SLURM_JOB_ID` | Path of working directory for Nextflow. When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With `ucla_cds`, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. | | `save_intermediate_files` | boolean | false | If set, save output files from intermediate pipeline processes. | | `min_cpus` | int | 1 | Minimum number of CPUs that can be assigned to each process. | @@ -117,13 +120,13 @@ The docker images in the following table are generally defined like `docker_imag * Change `params.docker_container_registry`. This will affect all of the images (except for GATK). * Change `params._version`. This will pull a different version of the same image from the registry. -* Change `params.docker_image_`. This will explicitly set the image to use, ignoring `docker_container_registry` and `_version`, and thus requires that the docker tag be explicitly set (e.g. 
`broadinstitute/gatk:4.2.4.1`). +* Change `params.docker_image_`. This will explicitly set the image to use, ignoring `docker_container_registry` and `_version`, and thus requires that the docker tag be explicitly set (e.g. `broadinstitute/gatk:4.4.0.0`). | Tool Parameter | Version Parameter | Default | Notes | | ------------------------ | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------------- | | `docker_image_bcftools` | `bcftools_version` | `ghcr.io/uclahs-cds/bcftools-score:1.20_score-1.20-20240505` | This image must have both BCFtools and the score plugins available. | | `docker_image_bedtools` | `bedtools_version` | `ghcr.io/uclahs-cds/bedtools:2.31.0` | | -| `docker_image_gatk` | `gatk_version` | `broadinstitute/gatk:4.2.4.1` | | +| `docker_image_gatk` | `gatk_version` | `broadinstitute/gatk:4.4.0.0` | | | `docker_image_pipeval` | `pipeval_version` | `ghcr.io/uclahs-cds/pipeval:5.0.0-rc.3` | | | `docker_image_samtools` | `samtools_version` | `ghcr.io/uclahs-cds/samtools:1.20` | | | `doker_image_stablelift` | `stablelift_version` | `ghcr.io/uclahs-cds/stablelift:FIXME` | This image is built and maintained via this repository. | @@ -191,7 +194,7 @@ Please see list of [Contributors](https://github.com/uclahs-cds/pipeline-StableL pipeline-StableLift is licensed under the GNU General Public License version 2. See the file LICENSE for the terms of the GNU GPL license. -StableLift is a machine learning approach designed to predict variant stability across reference genome builds, supplementing LiftOver coordinate conversion and increasing portability of variant calls. +StableLift is a machine learning approach designed to predict variant stability across reference genome builds, supplementing LiftOver coordinate conversion to increase the portability of variant calls. Copyright (C) 2024 University of California Los Angeles ("Boutros Lab") All rights reserved. 
diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config index bcf24fc..a38d8d9 100644 --- a/config/custom_schema_types.config +++ b/config/custom_schema_types.config @@ -43,7 +43,29 @@ custom_schema_types { } } + /** + * Check that the input is numeric in the appropriate range. + */ + ranged_number = { Map options, String name, Map properties -> + if (!(properties.containsKey('min') && properties['min'] in Number)) { + throw new Exception('`min` parameter misconfigured - must be a Number.') + } + + if (!(properties.containsKey('max') && properties['max'] in Number)) { + throw new Exception('`max` parameter misconfigured - must be a Number.') + } + + if (!(options[name] in Number)) { + throw new Exception("${name} must be a Number, not ${options[name].getClass()}") + } + + if (options[name] < properties.min || properties.max < options[name]) { + throw new Exception("${name}=${options[name]} is not in range [${properties.min}, ${properties.max}]") + } + } + types = [ - 'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source + 'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source, + 'RangedNumber': custom_schema_types.ranged_number ] } diff --git a/config/schema.yaml b/config/schema.yaml index 76b8a2f..8e21f40 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -99,3 +99,23 @@ input: mode: 'r' required: true help: 'Input dataset supplied by input yaml' + +target_threshold: + type: 'RangedNumber' + required: false + min: 0 + max: 1 + help: >- + Optional parameter specifying target Stability Score threshold for variant + filtering. Default behavior without `target_threshold` or + `target_specificity` specified uses threshold maximizing F1-score in whole + genome validation set. + +target_specificity: + type: 'RangedNumber' + required: false + min: 0 + max: 1 + help: >- + Optional parameter specifying target specificity for variant filtering + based on whole genome validation set. 
Overrides `target_threshold`. diff --git a/config/template.config b/config/template.config index e30e086..486b3e6 100644 --- a/config/template.config +++ b/config/template.config @@ -5,17 +5,28 @@ includeConfig "${projectDir}/config/default.config" includeConfig "${projectDir}/config/methods.config" includeConfig "${projectDir}/nextflow.config" - // Inputs/parameters of the pipeline params { // input/output locations - output_dir = 'where/to/save/outputs/' + output_dir = "where/to/save/outputs/" - // Choices: ["Mutect2", "HaplotypeCaller"] + // Choices: ["HaplotypeCaller", "Mutect2", "Strelka2", "SomaticSniper", "Muse2", "Delly2"] variant_caller = "Mutect2" + // Path to pre-trained random forest model rf_model = "" + // Optional parameter specifying target Stability Score threshold for + // variant filtering. Default behavior without `target_threshold` or + // `target_specificity` specified uses threshold maximizing F1-score in + // whole genome validation set. Must be in the range [0.0, 1.0]. + // target_threshold = 0.5 + + // Optional parameter specifying target specificity for variant filtering + // based on whole genome validation set. Overrides `target_threshold`. 
Must + // be in the range [0.0, 1.0], + // target_specificity = 0.5 + // Reference files funcotator_data { data_source = "/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s" diff --git a/docs/pipeline.mmd b/docs/pipeline.mmd index 0bc792b..9db31c5 100644 --- a/docs/pipeline.mmd +++ b/docs/pipeline.mmd @@ -64,7 +64,7 @@ flowchart TD --> bcftools_annotate2["`bcftools annotate*Trinucleotide*`"]:::bcftools end - blocknote["`**Note:** Annotation is performed before Liftover when lifting backward`"] + blocknote["`**Note:** Annotation is performed prior to LiftOver when converting from GRCh38 -> GRCh37`"] bcftools_liftover ---> gatk_func bcftools_annotate2 --> r_extract_snv[extract-VCF-features.R]:::R @@ -79,7 +79,7 @@ flowchart TD joinpaths ---> r_predict_stability subgraph Predict Stability ["`        **Predict Stability**`"] - r_predict_stability[predict-liftover-stability.R]:::R + r_predict_stability[predict-variant-stability.R]:::R --> bcftools_annotate3["`bcftools annotate*Stability*`"]:::bcftools rf_model([rf_model]):::input .-> r_predict_stability diff --git a/main.nf b/main.nf index abac9ca..0526d8f 100644 --- a/main.nf +++ b/main.nf @@ -42,6 +42,8 @@ log.info """\ chain_file: ${params.chain_file} repeat_bed: ${params.repeat_bed} + header_contigs: ${params.getOrDefault('header_contigs', null)} + funcotator_data: data_source: ${params.funcotator_data.data_source} src_reference_id: ${params.funcotator_data.src_reference_id} @@ -149,10 +151,10 @@ workflow { // Take the SV branch workflow_extract_sv_annotations( validated_vcf_tuple, + input_ch_src_sequence, Channel.value(params.header_contigs), Channel.value(params.gnomad_rds), - Channel.value(params.chain_file), - Channel.value(params.variant_caller) + Channel.value(params.chain_file) ) workflow_extract_sv_annotations.out.liftover_vcf.set { liftover_vcf } diff --git a/module/predict_stability.nf b/module/predict_stability.nf index 1ae6987..5ab6dac 100644 --- 
a/module/predict_stability.nf +++ b/module/predict_stability.nf @@ -18,12 +18,17 @@ process predict_stability_StableLift { tuple val(sample_id), path("stability.tsv"), emit: stability_tsv script: + spec_arg = (params.getOrDefault('target_specificity', null) != null) ? "--specificity \"${params.get('target_specificity')}\"" : "" + thresh_arg = (params.getOrDefault('target_threshold', null) != null) ? "--threshold \"${params.get('target_threshold')}\"" : "" + """ - Rscript "${moduleDir}/scripts/predict-liftover-stability.R" \ + Rscript "${moduleDir}/scripts/predict-variant-stability.R" \ + --variant-caller "${variant_caller}" \ --features-dt "${features_rds}" \ --rf-model "${rf_model}" \ - --variant-caller "${variant_caller}" \ - --output-tsv "stability.tsv" + --output-tsv "stability.tsv" \ + ${spec_arg} \ + ${thresh_arg} """ stub: @@ -43,7 +48,6 @@ process run_apply_stability_annotations { input: tuple val(sample_id), path(annotated_vcf, stageAs: 'inputs/*'), - // FIXME Should there be an annotated_vcf_tbi? 
path(stability_tsv, stageAs: 'inputs/*'), path(stability_tsv_tbi, stageAs: 'inputs/*') diff --git a/module/scripts/extract-VCF-features-SV.R b/module/scripts/extract-VCF-features-SV.R new file mode 100644 index 0000000..264ce4e --- /dev/null +++ b/module/scripts/extract-VCF-features-SV.R @@ -0,0 +1,269 @@ +#!/usr/bin/env Rscript +# extract-VCF-features-SV.R +#################################################################################################### +# +# LiftOver Delly2 structural variants and annotate with gnomAD population allele frequency +# Extract VCF features and save as Rds for input to predict-variant-stability.R +# +#################################################################################################### + +suppressPackageStartupMessages({ + library(vcfR); + library(data.table); + library(argparse); + library(rtracklayer); + library(GenomicRanges); + }); + +################################################################################################### +# Input +################################################################################################### +# Define command line arguments +parser <- ArgumentParser(); +parser$add_argument('--input-vcf', type = 'character', help = 'Input Delly2 VCF'); +parser$add_argument('--source-build', type = 'character', help = 'One of {GRCh37, GRCh38}'); +parser$add_argument('--chain-file', type = 'character', help = 'Chain file for coordinate conversion'); +parser$add_argument('--header-contigs', type = 'character', help = 'Resource file with VCF header for target build'); +parser$add_argument('--gnomad-rds', type = 'character', help = 'gnomAD-SV v4 resource file'); +parser$add_argument('--output-vcf', type = 'character', help = 'VCF output'); +parser$add_argument('--output-rds', type = 'character', help = 'Rds output for input to RF model'); +args <- parser$parse_args(); + +# Save command line arguments +for (arg in names(args)) { + assign(gsub('_', '.', arg), args[[arg]]); + } + 
+################################################################################################### +# Functions +################################################################################################### +# vcfR::getINFO() to data.table +vcf.info.to.dt <- function(vcf.info) { + vcf.info <- lapply(vcf.info, function(x) vcf.info.string.to.list(x)); + feature.names <- unique(unlist(lapply(vcf.info, names))); + vcf.info <- do.call(mapply, c(FUN = list, lapply(vcf.info, `[`, feature.names))); + setNames(as.data.table(vcf.info), feature.names); + } + +# Split VCF info field to list +vcf.info.string.to.list <- function(vcf.info, keep.columns = NULL) { + list.out <- strsplit(vcf.info, split = ';'); + list.out <- lapply(list.out, function(x) strsplit(x, split = '=')); + labels <- sapply(list.out[[1]], function(x) x[[1]]); + values <- sapply(list.out[[1]], function(x) if (length(x) == 2) x[[2]] else x[[1]]); + names(values) <- labels; + if (is.null(keep.columns)) return(values); + values <- values[labels %in% keep.columns]; + return(values); + } + +calculate.VAF <- function(GT.row) { + total <- sum(GT.row %in% c('0/0', '0/1', '1/1'), na.rm = TRUE) * 2; + alt <- sum(GT.row == '0/1', na.rm = TRUE) + sum(GT.row == '1/1', na.rm = TRUE) * 2; + return(alt / total); + } + +get.overlap <- function(start1, end1, start2, end2) { + max.length <- pmax((end1 - start1), (end2 - start2)); + overlap.length <- pmin(end1, end2) - pmax(start1, start2); + return(overlap.length / max.length); + } + +find.SV.match <- function(this.ID, input, reference, overlap, offset) { + # Match SV type and CHR + this.variant <- input[ID == this.ID]; + reference <- reference[SVTYPE == this.variant$SVTYPE & CHROM == this.variant$CHROM]; + + if (this.variant$SVTYPE == 'BND') { + reference <- reference[CHR2 == this.variant$CHR2]; + reference[, OFFSET := abs(POS - this.variant$POS)]; + reference <- reference[OFFSET < offset][order(OFFSET)]; + } else { + reference[, OVERLAP := get.overlap(POS, END, 
this.variant$POS, this.variant$END)]; + reference <- reference[OVERLAP > overlap][order(OVERLAP, decreasing = TRUE)]; + } + + return(list(gnomad.match.ID = reference[1, ID], gnomad.matches = nrow(reference))); + } + +annotate.gnomad.features <- function(features.dt, features.dt.gnomad) { + gnomad.features <- c('ID', 'AF', 'POPMAX_AF'); + features.dt[, c('gnomad.match.ID', 'gnomad.matches') := rbindlist(lapply(ID, find.SV.match, input = features.dt, reference = features.dt.gnomad, overlap = 0.8, offset = 500))]; + features.dt <- merge(features.dt, features.dt.gnomad[, ..gnomad.features], all.x = TRUE, by.x = 'gnomad.match.ID', by.y = 'ID'); + } + +################################################################################################### +# Load files +################################################################################################### +input.vcf <- read.vcfR(input.vcf); +header.contigs <- scan(header.contigs, character()); +liftover.chain <- import.chain(chain.file); +features.dt.gnomad <- readRDS(gnomad.rds); + +################################################################################################### +# Data preprocessing +################################################################################################### +# Convert variant information into dt +if (any(duplicated(input.vcf@fix[, 'ID']))) input.vcf@fix[, 'ID'] <- paste0(substr(input.vcf@fix[, 'ID'], 1, 3), sprintf('%08d', seq_len(nrow(input.vcf@fix)))); +input.info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']); +input.fix <- as.data.table(input.vcf@fix); +features.dt <- cbind(input.fix[, -c('INFO')], input.info); + +# Format columns +features.dt[, CONSENSUS := NULL]; +numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'RDRATIO', 'SVLEN', 'POS2'); +character.columns <- names(features.dt)[!names(features.dt) %in% numeric.columns]; +features.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns]; 
+features.dt[, (character.columns) := lapply(.SD, as.character), .SDcols = character.columns]; + +# Extract and aggregate per sample GT fields +gt.fields <- c('GQ', 'RC', 'RDCN', 'DR', 'DV', 'RR', 'RV'); +for (field in gt.fields) { + features.dt[, (field) := apply(extract.gt(input.vcf, element = ..field, as.numeric = TRUE), 1, mean, na.rm = TRUE)]; + } +features.dt[, COHORT_AF := apply(extract.gt(input.vcf, element = 'GT'), 1, calculate.VAF)]; +features.dt[, CIPOS := as.numeric(sapply(CIPOS, function(x) unlist(strsplit(x, ','))[2]))]; + +if (source.build == 'GRCh37') { + features.dt[, CHROM := paste0('chr', CHROM)]; + features.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, paste0('chr', CHR2))]; +} else if (source.build == 'GRCh38') { + # Annotate with gnomAD before LiftOver if source build == GRCh38 + features.dt <- annotate.gnomad.features(features.dt, features.dt.gnomad); + } + +################################################################################################### +# LiftOver variants by breakpoint +################################################################################################### +# Create GRanges object +grange.source <- makeGRangesFromDataFrame( + df = features.dt, + seqnames.field = 'CHROM', + start.field = 'POS', + end.field = 'END', + keep.extra.columns = TRUE + ); + +# LiftOver using chain file +grange.target <- unlist(liftOver(grange.source, liftover.chain)); +grange.target.dt <- as.data.table(grange.target); + +# Create GRanges object using CHROM, CHR2, and POS2 from features.dt +grange.source.BND <- makeGRangesFromDataFrame( + df = features.dt[SVTYPE == 'BND', ], + seqnames.field = 'CHR2', + start.field = 'POS2', + end.field = 'POS2', + keep.extra.columns = TRUE + ); +grange.target.BND <- as.data.table(unlist(liftOver(grange.source.BND, liftover.chain))); + +# Remove multiple mappings +grange.target.dt <- grange.target.dt[!duplicated(ID)]; +grange.target.BND <- grange.target.BND[!duplicated(ID)]; +common <- 
intersect(grange.target.dt$ID, grange.target.BND$ID); + +grange.target.dt[ID %in% common, c('CHR2', 'POS2') := grange.target.BND[ID %in% common, .(seqnames, start)]]; + +if (source.build == 'GRCh38') { + grange.target.dt[, seqnames := sub('chr', '', seqnames)]; + grange.target.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, sub('chr', '', CHR2))]; + } + +################################################################################################### +# Write output VCF +################################################################################################### +pass.liftover <- as.data.table(input.vcf@fix)$ID %in% grange.target.dt$ID; +input.fix <- as.data.table(input.vcf@fix)[pass.liftover]; +input.gt <- as.data.table(input.vcf@gt)[pass.liftover]; +grange.target.dt <- grange.target.dt[match(input.fix$ID, grange.target.dt$ID)]; + +for (i in seq_len(nrow(input.fix))) { + this.ID <- input.fix[i, ID]; + this.INFO <- vcf.info.string.to.list(input.fix[i, INFO]); + this.INFO[['END']] <- grange.target.dt[i, end]; + if (this.INFO[['SVTYPE']] == 'BND') { + this.INFO[['CHR2']] <- grange.target.dt[i, CHR2]; + this.INFO[['POS2']] <- grange.target.dt[i, POS2]; + } + this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '=')); + this.INFO <- paste(this.INFO, collapse = ';'); + this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO); + this.INFO <- gsub('PRECISE=PRECISE', 'PRECISE', this.INFO); + this.INFO <- gsub('SOMATIC=SOMATIC', 'SOMATIC', this.INFO); + input.fix[i, c('CHROM', 'POS', 'INFO') := grange.target.dt[i, .(seqnames, start, ..this.INFO)]]; + } + +lifted.vcf <- input.vcf; +lifted.vcf@fix <- as.matrix(input.fix); +lifted.vcf@gt <- as.matrix(input.gt); +lifted.vcf@meta <- lifted.vcf@meta[!grepl('^##(contig|reference)', lifted.vcf@meta)]; +lifted.vcf@meta <- c(lifted.vcf@meta, header.contigs); + +write.vcf(lifted.vcf, output.vcf); + +################################################################################################### 
+# Format features for RF +################################################################################################### +features.dt <- features.dt[ID %in% grange.target.dt$ID]; +features.dt <- features.dt[match(input.fix$ID, features.dt$ID)]; +features.dt[, c('CHROM', 'POS', 'END', 'CHR2', 'POS2') := grange.target.dt[, .(seqnames, start, end, CHR2, POS2)]]; +features.dt[!SVTYPE %in% c('BND', 'INS'), SVLEN := END - POS + 1]; + +if (source.build == 'GRCh37') { + features.dt <- annotate.gnomad.features(features.dt, features.dt.gnomad); + } + +continuous.features <- c( + 'POS', + 'QUAL', + 'END', + 'PE', + 'MAPQ', + 'CIPOS', + 'SRMAPQ', + 'HOMLEN', + 'SR', + 'SRQ', + 'CE', + 'RDRATIO', + 'SVLEN', + 'GQ', + 'RC', + 'RDCN', + 'DR', + 'DV', + 'RR', + 'RV', + 'AF', + 'gnomad.matches', + 'POPMAX_AF' + ); + +categorical.features <- c( + 'CHROM', + 'SVTYPE', + 'CT' + ); + +# Extract features and format +continuous.features <- continuous.features[continuous.features %in% names(features.dt)]; +categorical.features <- categorical.features[categorical.features %in% names(features.dt)]; +all.features <- c(continuous.features, categorical.features, 'ID'); + +features.dt <- features.dt[, ..all.features]; +features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features]; +features.dt[, (continuous.features) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = continuous.features]; +features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features]; +features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features]; +names(features.dt) <- make.names(names(features.dt)); + +# Remove rows with NA +features.dt.rows <- nrow(features.dt); +features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ]; +cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n'); + 
+################################################################################################### +# Save features.dt for input to RF +################################################################################################### +saveRDS(features.dt, output.rds); diff --git a/module/scripts/extract-vcf-features.R b/module/scripts/extract-VCF-features.R similarity index 87% rename from module/scripts/extract-vcf-features.R rename to module/scripts/extract-VCF-features.R index 5aa3d75..641d653 100644 --- a/module/scripts/extract-vcf-features.R +++ b/module/scripts/extract-VCF-features.R @@ -1,10 +1,8 @@ #!/usr/bin/env Rscript -# extract-vcf-features.R +# extract-VCF-features.R #################################################################################################### # -# Extract features from vcf -# Extract Funcotator annotations if present -# Annotate with RepeatMasker regions if intersect file is provided +# Extract VCF features and save as Rds for input to predict-variant-stability.R # #################################################################################################### @@ -21,11 +19,11 @@ suppressPackageStartupMessages({ ################################################################################################### # Define command line arguments parser <- ArgumentParser(); -parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 vcf lifted to GRCh38 for feature extraction'); -parser$add_argument('--input-dir', type = 'character', help = 'Directory with vcf subsets'); -parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model'); -parser$add_argument('--variant-caller', type = 'character', help = ''); -parser$add_argument('--ncore', type = 'integer', help = 'Number of cores to use for parallelizing features extraction', default = 1); +parser$add_argument('--input-vcf', type = 'character', help = 'Input VCF for feature extraction, mutually exclusive with --input-dir'); 
+parser$add_argument('--input-dir', type = 'character', help = 'Directory with VCF subsets for parallelization, mutually exclusive with --input-vcf'); +parser$add_argument('--output-rds', type = 'character', help = 'Rds output for input to RF model'); +parser$add_argument('--variant-caller', type = 'character', help = 'One of {HaplotypeCaller, Mutect2, Strelka2, SomaticSniper, Muse2, Delly2}'); +parser$add_argument('--ncore', type = 'integer', help = 'Number of cores to use for processing VCF subsets in --input-dir', default = 1); args <- parser$parse_args(); # Save command line arguments @@ -33,13 +31,6 @@ for (arg in names(args)) { assign(gsub('_', '.', arg), args[[arg]]); } -# Set parameters for interactive runs -if (interactive()) { - variant.caller <- 'Strelka2'; - input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSNP/stableLift/validate_TCGA-SARC_WXS/TCGA-SARC_WXS_HaplotypeCaller_LiftOver-GRCh38_annotated_exome.vcf.gz'; - vcf.subset <- input.vcf; - } - if (!is.null(input.dir)) { vcf.subsets <- list.files(input.dir, full.names = TRUE, pattern = '(\\.vcf.gz|\\.vcf)$'); output.path <- output.rds; @@ -116,6 +107,21 @@ features.dt.subsets <- foreach(vcf.subset = vcf.subsets) %dopar% { info[input.vcf@fix[, 'REF'] == 'G', REFCOUNTS := apply(extract.gt(input.vcf, element = 'GU')[input.vcf@fix[, 'REF'] == 'G', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; info[input.vcf@fix[, 'ALT'] == 'G', ALTCOUNTS := apply(extract.gt(input.vcf, element = 'GU')[input.vcf@fix[, 'ALT'] == 'G', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))]; info[, AF := ALTCOUNTS / (REFCOUNTS + ALTCOUNTS)]; + } else if (variant.caller == 'SomaticSniper') { + # Calculate VAF from allelic depths + info$AF <- apply(extract.gt(input.vcf, element = 'DP4'), 1, function(x) mean( + sapply(strsplit(x, ','), function(y) { + y <- as.numeric(y); + return((y[3] + y[4]) / sum(y)); + }), + 
na.rm = TRUE + )); + info$AMQ <- apply(extract.gt(input.vcf, element = 'AMQ'), 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[2])), na.rm = TRUE)); + info$BQ <- apply(extract.gt(input.vcf, element = 'BQ'), 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[2])), na.rm = TRUE)); + info$GQ <- apply(extract.gt(input.vcf, element = 'GQ', as.numeric = TRUE), 1, mean, na.rm = TRUE); + info$MQ <- apply(extract.gt(input.vcf, element = 'MQ', as.numeric = TRUE), 1, mean, na.rm = TRUE); + info$SSC <- apply(extract.gt(input.vcf, element = 'SSC', as.numeric = TRUE), 1, mean, na.rm = TRUE); + info$VAQ <- apply(extract.gt(input.vcf, element = 'VAQ', as.numeric = TRUE), 1, mean, na.rm = TRUE); } # Get funcotation fields @@ -230,6 +236,14 @@ if (variant.caller == 'Strelka2') continuous.features <- c(continuous.features, if (variant.caller == 'Muse2') continuous.features <- c(continuous.features, 'Variant Base Quality (BQ)' = 'BQ' ); +if (variant.caller == 'SomaticSniper') continuous.features <- c(continuous.features, + 'Variant Mapping Quality (AMQ)' = 'AMQ', + 'Base Quality (BQ)' = 'BQ', + 'Genotype Quality (GQ)' = 'GQ', + 'Mapping Quality (MQ)' = 'MQ', + 'Somatic Score (SSC)' = 'SSC', + 'Variant Allele Quality (VAQ)' = 'VAQ' + ); categorical.features <- c( 'Chromosome (CHR)' = 'CHROM', diff --git a/module/scripts/extract-vcf-features-SV.R b/module/scripts/extract-vcf-features-SV.R deleted file mode 100644 index 9927a7e..0000000 --- a/module/scripts/extract-vcf-features-SV.R +++ /dev/null @@ -1,181 +0,0 @@ -#!/usr/bin/env Rscript -# extract-vcf-features-SV.R -#################################################################################################### -# -# Extract features from vcf -# Intersect and annotate with gnomAD-SV vcf -# -#################################################################################################### - -suppressPackageStartupMessages({ - library(vcfR); - library(data.table); - library(argparse); - 
library(GenomicRanges); - }); - -################################################################################################### -# Input -################################################################################################### -# Define command line arguments -parser <- ArgumentParser(); -parser$add_argument('--variant-caller', type = 'character', help = ''); -parser$add_argument('--input-vcf', type = 'character', help = 'Delly2 vcf'); -parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model'); -parser$add_argument('--gnomad-rds', type = 'character', help = 'gnomAD Rds file'); -args <- parser$parse_args(); - -# Save command line arguments -for (arg in names(args)) { - assign(gsub('_', '.', arg), args[[arg]]); - } - -# Set parameters for interactive runs -if (interactive()) { - variant.caller <- 'Delly2'; - input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz'; - gnomad.rds <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds'; - } - -################################################################################################### -# Functions -################################################################################################### -vcf.info.to.dt <- function(vcf.info) { - # Split each string by semicolon and convert to a list of key-value pairs - vcf.info <- strsplit(vcf.info, ';'); - vcf.info <- lapply(vcf.info, function(x) { - x <- strsplit(x, '='); - as.list(stats::setNames(sapply(x, `[`, 2), sapply(x, `[`, 1))); - }) - - # Combine the list of key-value pairs into a data table - rbindlist(vcf.info, fill = TRUE); - } - -calculate.VAF <- function(GT.row) { - total <- sum(GT.row %in% c('0/0', '0/1', '1/1'), na.rm = TRUE) * 2; - alt <- sum(GT.row == '0/1', na.rm = TRUE) + sum(GT.row == '1/1', na.rm = TRUE) * 
2; - return(alt / total); - } - -get.overlap <- function(start1, end1, start2, end2) { - max.length <- pmax((end1 - start1), (end2 - start2)); - overlap.length <- pmin(end1, end2) - pmax(start1, start2); - return(overlap.length / max.length); - } - -find.SV.match <- function(this.ID, input, reference, overlap, offset) { - # Match SV type and CHR - this.variant <- input[ID == this.ID]; - reference <- reference[SVTYPE == this.variant$SVTYPE & CHROM == this.variant$CHROM]; - - if (this.variant$SVTYPE == 'BND') { - # reference[, OFFSET := abs(POS - this.variant$POS) + abs(POS2 - this.variant$POS2)]; - reference[, OFFSET := abs(POS - this.variant$POS)]; - reference <- reference[OFFSET < offset & CHR2 == this.variant$CHR2][order(OFFSET)]; - } else { - reference[, OVERLAP := get.overlap(POS, END, this.variant$POS, this.variant$END)]; - reference <- reference[OVERLAP > overlap][order(OVERLAP, decreasing = TRUE)]; - } - - return(list(gnomad.match.ID = reference[1, ID], gnomad.matches = nrow(reference))); - } - -################################################################################################### -# Load files -################################################################################################### -input.vcf <- read.vcfR(input.vcf); -features.dt.gnomad <- readRDS(gnomad.rds); - -################################################################################################### -# Data preprocessing -################################################################################################### -# Convert variant information into dt -input.info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']); -input.fix <- as.data.table(input.vcf@fix); -features.dt <- cbind(input.fix[, -c('INFO')], input.info); - -# Format columns -features.dt[, CONSENSUS := NULL]; -numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'RDRATIO', 'SVLEN', 'POS2'); -features.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = 
numeric.columns]; - -# Extract and aggregate per sample GT fields -gt.fields <- c('GQ', 'RC', 'RDCN', 'DR', 'DV', 'RR', 'RV'); -for (field in gt.fields) { - features.dt[, (field) := apply(extract.gt(input.vcf, element = ..field, as.numeric = TRUE), 1, mean, na.rm = TRUE)]; - } -features.dt[, COHORT_AF := apply(extract.gt(input.vcf, element = 'GT'), 1, calculate.VAF)]; -features.dt[!SVTYPE %in% c('BND', 'INS'), SVLEN := END - POS + 1]; -features.dt[, CIPOS := as.numeric(sapply(CIPOS, function(x) unlist(strsplit(x, ','))[2]))]; - -################################################################################################### -# Intersect variants with gnomAD SVs -################################################################################################### -start.time <- Sys.time(); - -# features.dt <- features.dt[1:100]; -features.dt[, c('gnomad.match.ID', 'gnomad.matches') := rbindlist(lapply(ID, find.SV.match, input = features.dt, reference = features.dt.gnomad, overlap = 0.8, offset = 500))]; - -gnomad.features <- c('ID', 'AF', 'POPMAX_AF', 'NCR'); -features.dt <- merge(features.dt, features.dt.gnomad[, ..gnomad.features], all.x = TRUE, by.x = 'gnomad.match.ID', by.y = 'ID'); - -cat(format(Sys.time() - start.time, nsmall = 2), '\n'); - -################################################################################################### -# Format features for RF -################################################################################################### -continuous.features <- c( - 'POS', - 'QUAL', - 'END', - 'PE', - 'MAPQ', - 'CIPOS', - 'SRMAPQ', - 'HOMLEN', - 'SR', - 'SRQ', - 'CE', - 'RDRATIO', - 'SVLEN', - 'GQ', - 'RC', - 'RDCN', - 'DR', - 'DV', - 'RR', - 'RV', - 'gnomad.matches', - 'AF', - 'POPMAX_AF', - 'NCR' - ); - -categorical.features <- c( - 'CHROM', - 'SVTYPE', - 'CT' - ); - -# Extract features and format -continuous.features <- continuous.features[continuous.features %in% names(features.dt)]; -categorical.features <- 
categorical.features[categorical.features %in% names(features.dt)]; -all.features <- c(continuous.features, categorical.features, 'ID'); - -features.dt <- features.dt[, ..all.features]; -features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features]; -features.dt[, (continuous.features) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = continuous.features]; -features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features]; -features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features]; -names(features.dt) <- make.names(names(features.dt)); - -# Remove rows with NA -features.dt.rows <- nrow(features.dt); -features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ]; -cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n'); - -################################################################################################### -# Save features.dt for input to RF -################################################################################################### -saveRDS(features.dt, output.rds); diff --git a/module/scripts/liftover-Delly2-vcf.R b/module/scripts/liftover-Delly2-vcf.R deleted file mode 100644 index d473091..0000000 --- a/module/scripts/liftover-Delly2-vcf.R +++ /dev/null @@ -1,157 +0,0 @@ -#!/usr/bin/env Rscript -# liftover-Delly2-vcf.R -################################################################################################### -# -# -# -################################################################################################### - -suppressPackageStartupMessages({ - library(vcfR); - library(data.table); - library(argparse); - library(rtracklayer); - }); - -################################################################################################### -# Input -################################################################################################### 
-# Define command line arguments -parser <- ArgumentParser(); -parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 Delly2 vcf'); -parser$add_argument('--header-contigs', type = 'character', help = 'Directory with vcf subsets'); -parser$add_argument('--chain-file', type = 'character', help = 'hg19ToHg38.over.chain file'); -parser$add_argument('--output', type = 'character', help = 'Where to write lifted vcf'); -args <- parser$parse_args(); - -# Save command line arguments -for (arg in names(args)) { - assign(gsub('_', '.', arg), args[[arg]]); - } - -# Set parameters for interactive runs -if (interactive()) { - # input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_regenotype-gSV_delly_bcftools-merge_delly-filter-germline.vcf.gz'; - input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_call-sSV_delly_bcftools-merge_somatic-only.vcf.gz'; - header.contigs <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt'; - chain.file <- '/hot/resource/genomics/liftover_chain_files/hg19ToHg38.over.chain'; - output <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz'; - } - -################################################################################################### -# Functions -################################################################################################### -vcf.fix.to.dt <- function(vcf.fix) { - vcf.fix <- as.data.table(vcf.fix); - vcf.info <- vcf.info.to.dt(vcf.fix$INFO); - cbind(vcf.fix[, -'INFO'], vcf.info); - } - -# vcfR::getINFO() to data.table -vcf.info.to.dt <- function(vcf.info) { - vcf.info <- lapply(vcf.info, function(x) vcf.info.string.to.list(x)); - feature.names <- 
unique(unlist(lapply(vcf.info, names))); - vcf.info <- do.call(mapply, c(FUN = list, lapply(vcf.info, `[`, feature.names))); - setNames(as.data.table(vcf.info), feature.names); - } - -# Split vcf info field to list -vcf.info.string.to.list <- function(vcf.info, keep.columns = NULL) { - list.out <- strsplit(vcf.info, split = ';'); - list.out <- lapply(list.out, function(x) strsplit(x, split = '=')); - labels <- sapply(list.out[[1]], function(x) x[[1]]); - values <- sapply(list.out[[1]], function(x) if (length(x) == 2) x[[2]] else x[[1]]); - names(values) <- labels; - if (is.null(keep.columns)) return(values); - values <- values[labels %in% keep.columns]; - return(values); - } - -################################################################################################### -# Load files -################################################################################################### -input.vcf.path <- input.vcf; -input.vcf <- read.vcfR(input.vcf); -header.contigs <- scan(header.contigs, character()); -liftover.chain <- import.chain(chain.file); - -################################################################################################### -# Data preprocessing -################################################################################################### -if (any(duplicated(input.vcf@fix[, 'ID']))) input.vcf@fix[, 'ID'] <- paste0(substr(input.vcf@fix[, 'ID'], 1, 3), sprintf('%08d', seq_len(nrow(input.vcf@fix)))); -# if (any(duplicated(input.vcf@fix[, 'ID']))) fix.dt[, ID := paste0(substr(ID, 1, 3), sprintf('%08d', seq_len(nrow(fix.dt))))]; -fix.dt <- as.data.table(input.vcf@fix); -gt.dt <- as.data.table(input.vcf@gt); - -fix.dt <- vcf.fix.to.dt(fix.dt); -fix.dt[, CHROM := paste0('chr', CHROM)]; -fix.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, paste0('chr', CHR2))]; - -fix.dt <- fix.dt[, -c('CONSENSUS')]; -numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'SVLEN', 'POS2'); -character.columns <- 
names(fix.dt)[!names(fix.dt) %in% numeric.columns]; -fix.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns]; -fix.dt[, (character.columns) := lapply(.SD, as.character), .SDcols = character.columns]; - -################################################################################################### -# Liftover -################################################################################################### -# Create GRanges object -granges.37 <- makeGRangesFromDataFrame( - df = fix.dt, - seqnames.field = 'CHROM', - start.field = 'POS', - end.field = 'END', - keep.extra.columns = TRUE - ); - -# Liftover using chain file -granges.38 <- unlist(liftOver(granges.37, liftover.chain)); -granges.38.dt <- as.data.table(granges.38); - -# Create GRanges object using CHROM, CHR2, and POS2 from fix.dt -granges.37.BND <- makeGRangesFromDataFrame( - df = fix.dt[SVTYPE == 'BND', ], - seqnames.field = 'CHR2', - start.field = 'POS2', - end.field = 'POS2', - keep.extra.columns = TRUE - ); -granges.38.BND <- as.data.table(unlist(liftOver(granges.37.BND, liftover.chain))); - -# Remove multiple mappings -granges.38.dt <- granges.38.dt[!duplicated(ID)]; -granges.38.BND <- granges.38.BND[!duplicated(ID)]; -common <- intersect(granges.38.dt$ID, granges.38.BND$ID); - -granges.38.dt[ID %in% common, c('CHR2', 'POS2') := granges.38.BND[ID %in% common, .(seqnames, start)]]; - -pass.liftover <- as.data.table(input.vcf@fix)$ID %in% granges.38.dt$ID; -fix.lifted <- as.data.table(input.vcf@fix)[pass.liftover]; -gt.dt <- gt.dt[pass.liftover]; -for (i in seq_len(nrow(fix.lifted))) { - this.ID <- fix.lifted[i, ID]; - this.INFO <- vcf.info.string.to.list(fix.lifted[i, INFO]); - this.INFO[['END']] <- granges.38.dt[i, end]; - if (this.INFO[['SVTYPE']] == 'BND') { - this.INFO[['CHR2']] <- granges.38.dt[i, CHR2]; - this.INFO[['POS2']] <- granges.38.dt[i, POS2]; - } - this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '=')); - this.INFO <- 
paste(this.INFO, collapse = ';'); - this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO); - this.INFO <- gsub('PRECISE=PRECISE', 'PRECISE', this.INFO); - this.INFO <- gsub('SOMATIC=SOMATIC', 'SOMATIC', this.INFO); - fix.lifted[i, c('CHROM', 'POS', 'INFO') := granges.38.dt[ID == ..this.ID, .(seqnames, start, ..this.INFO)]]; - } - -################################################################################################### -# Write output vcf -################################################################################################### -output.vcf <- input.vcf; -output.vcf@fix <- as.matrix(fix.lifted); -output.vcf@gt <- as.matrix(gt.dt); -output.vcf@meta <- output.vcf@meta[!grepl('^##(contig|reference)', output.vcf@meta)]; -output.vcf@meta <- c(output.vcf@meta, header.contigs); - -write.vcf(output.vcf, output); diff --git a/module/scripts/predict-liftover-stability.R b/module/scripts/predict-variant-stability.R similarity index 52% rename from module/scripts/predict-liftover-stability.R rename to module/scripts/predict-variant-stability.R index d8f5dd2..c8b271d 100644 --- a/module/scripts/predict-liftover-stability.R +++ b/module/scripts/predict-variant-stability.R @@ -2,13 +2,11 @@ -# predict-liftover-stability.R +# predict-variant-stability.R #################################################################################################### # -# Apply random forest model to predict variant LiftOver stability -# Validate results and plot model performance with discordance file +# Apply random forest model to predict variant stability # #################################################################################################### suppressPackageStartupMessages({ - library(caret); library(ranger); library(argparse); library(ROCR); @@ -20,12 +18,12 @@ suppressPackageStartupMessages({ ################################################################################################### # Define command line arguments parser <- ArgumentParser(); 
-parser$add_argument('--variant-caller', type = 'character'); -parser$add_argument('--features-dt', type = 'character'); -parser$add_argument('--rf-model', type = 'character'); -parser$add_argument('--specificity', type = 'numeric', help = 'Target specificity, overrides `--threshold`'); -parser$add_argument('--threshold', type = 'numeric', help = 'Stability score threshold', default = 0.5); -parser$add_argument('--output-tsv', type = 'character', help = 'TSV output file'); +parser$add_argument('--variant-caller', type = 'character', help = 'One of {HaplotypeCaller, Mutect2, Strelka2, SomaticSniper, Muse2, Delly2}'); +parser$add_argument('--features-dt', type = 'character', help = 'Processed Rds file with variant info and annotations'); +parser$add_argument('--rf-model', type = 'character', help = 'Pre-trained random forest model Rds file'); +parser$add_argument('--specificity', type = 'numeric', help = 'Target specificity based on whole genome validation set, overrides `--threshold`'); +parser$add_argument('--threshold', type = 'numeric', help = 'Stability score threshold, default based on maximizing F1-score in whole genome validation set'); +parser$add_argument('--output-tsv', type = 'character', help = 'Output TSV with predicted Stability Scores'); args <- parser$parse_args(); # Save command line arguments @@ -33,27 +31,6 @@ for (arg in names(args)) { assign(gsub('_', '.', arg), args[[arg]]); } -#################################################################################################### -# Functions -#################################################################################################### - -# Sort datatable by chr then position -sort.genomic.dt <- function(x, chr = 'CHROM', pos = 'POS') { - setDT(x); - x[, eval(chr) := gsub('chr', '', get(chr))]; - x[, eval(chr) := gsub('X', '23', get(chr))]; - x[, eval(chr) := gsub('Y', '24', get(chr))]; - x[, eval(chr) := as.numeric(get(chr))]; - - setorderv(x, c(chr, pos), c(1, 1)); - - x[, eval(chr) := 
gsub('23', 'X', get(chr))]; - x[, eval(chr) := gsub('24', 'Y', get(chr))]; - x[, eval(chr) := paste0('chr', get(chr))]; - - return(x); - } - ################################################################################################### # Load data ################################################################################################### @@ -88,51 +65,39 @@ print(dim(features.dt)); ################################################################################################### # Apply random forest model ################################################################################################### -cat('\nPredicting liftover stability with', basename(rf.model.path), '\n'); +cat('\nPredicting variant stability with', basename(rf.model.path), '\n'); stability <- predict(rf.model, data = features.dt); - -# if (!is.null(specificity) && is.numeric(specificity)) { -# cat('Target specificity =', specificity, '\n'); -# operating.index <- max(which(unlist(rf.model$performance@x.values) < 1 - specificity)); -# sensitivity <- unlist(rf.model$performance@y.values)[operating.index]; -# cat('Projected sensitivity =', round(sensitivity, 3), '\n'); -# threshold <- 1 - unlist(rf.model$performance@alpha.values)[operating.index]; -# cat('Stability score threshold =', round(threshold, 3), '\n'); -# } else if (!is.null(threshold) && is.numeric(threshold)) { -# cat('Target threshold =', threshold, '\n'); -# operating.index <- min(which(unlist(rf.model$performance@alpha.values) <= 1 - threshold)); -# specificity <- 1 - unlist(rf.model$performance@x.values)[operating.index]; -# sensitivity <- unlist(rf.model$performance@y.values)[operating.index]; -# cat('Projected specificity =', round(specificity, 3), '\n'); -# cat('Projected sensitivity =', round(sensitivity, 3), '\n'); -# } else { -# performance.acc <- performance(prediction$train, measure = 'f'); #F1-score -# index <- which.max(unlist(performance.acc@y.values)); -# cutoff <- 
unlist(performance.acc@x.values)[index]; -# metric <- unlist(performance.acc@y.values)[index]; -# specificity <- 1 - unlist(performance$train@x.values)[index]; -# sensitivity <- unlist(performance$train@y.values)[index]; -# cat(sprintf('Projected F[0.5]-score = %.3f\n', metric)); -# cat(sprintf('Projected sensitivity = %.3f\n', sensitivity)); -# cat(sprintf('Projected specificity = %.3f\n', specificity)); -# } - -performance.f <- performance(rf.model$prediction, measure = 'f'); -index <- which.max(unlist(performance.f@y.values)); -threshold <- unlist(performance.f@x.values)[index]; - performance <- performance(rf.model$prediction, 'sens', 'spec'); -sensitivity <- unlist(performance@y.values)[index]; -specificity <- unlist(performance@x.values)[index]; - -# Convert to stability units -threshold.stability <- 1 - threshold; -cat(sprintf('Threshold = %.3f\n', threshold.stability)); -cat(sprintf('Training sensitivity = %.3f\n', sensitivity)); -cat(sprintf('Training specificity = %.3f\n', specificity)); +if (!is.null(specificity) && is.numeric(specificity)) { + cat('Target specificity =', specificity, '\n'); + operating.index <- max(which(unlist(performance@x.values) >= specificity)); # `>=` so a target of exactly 1 still selects an operating point + threshold <- 1 - unlist(performance@alpha.values)[operating.index]; + sensitivity <- unlist(performance@y.values)[operating.index]; + + cat(sprintf('Threshold = %.3f\n', threshold)); + cat(sprintf('Projected sensitivity = %.3f\n', sensitivity)); +} else if (!is.null(threshold) && is.numeric(threshold)) { + cat('Target threshold =', threshold, '\n'); + operating.index <- min(which(unlist(performance@alpha.values) <= 1 - threshold)); + specificity <- unlist(performance@x.values)[operating.index]; + sensitivity <- unlist(performance@y.values)[operating.index]; + + cat(sprintf('Projected sensitivity = %.3f\n', sensitivity)); + cat(sprintf('Projected specificity = %.3f\n', specificity)); +} else { + performance.f <- performance(rf.model$prediction, measure = 'f'); + operating.index <- 
which.max(unlist(performance.f@y.values)); + threshold <- 1 - unlist(performance.f@x.values)[operating.index]; + sensitivity <- unlist(performance@y.values)[operating.index]; + specificity <- unlist(performance@x.values)[operating.index]; + + cat(sprintf('Default threshold = %.3f\n', threshold)); + cat(sprintf('Projected sensitivity = %.3f\n', sensitivity)); + cat(sprintf('Projected specificity = %.3f\n', specificity)); + } -stability.classification <- ifelse(stability$predictions[, 1] < threshold.stability, 1, 0); +stability.classification <- ifelse(stability$predictions[, 1] < threshold, 1, 0); cat(sprintf('Proportion predicted unstable = %.3f\n\n', mean(stability.classification))); stability.classification <- as.factor(stability.classification); @@ -145,5 +110,6 @@ annotation.dt <- data.table( STABILITY_SCORE = format(round(stability$predictions[, 1], 4), nsmall = 4), STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE') ); -sort.genomic.dt(annotation.dt); +setorder(annotation.dt, CHROM, POS); + fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = FALSE); diff --git a/module/snv_workflow.nf b/module/snv_workflow.nf index f298b7e..d5f896b 100644 --- a/module/snv_workflow.nf +++ b/module/snv_workflow.nf @@ -64,7 +64,7 @@ process extract_VCF_features_StableLift { script: """ - Rscript "${moduleDir}/scripts/extract-vcf-features.R" \ + Rscript "${moduleDir}/scripts/extract-VCF-features.R" \ --input-vcf "${vcf}" \ --variant-caller ${params.variant_caller} \ --output-rds "features.Rds" diff --git a/module/sv_workflow.nf b/module/sv_workflow.nf index bdf2621..432da05 100644 --- a/module/sv_workflow.nf +++ b/module/sv_workflow.nf @@ -1,35 +1,39 @@ - -process liftover_SV_StableLift{ +process liftover_annotate_SV_StableLift { container params.docker_image_stablelift publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", - pattern: "liftover.vcf.gz", + pattern: "annotations.{vcf.gz,Rds}", mode: "copy", 
enabled: params.save_intermediate_files, - saveAs: { "LiftOver-${sample_id}.vcf.gz" } + saveAs: { filename -> filename.replace("annotations", "LiftOver-${sample_id}") } input: - tuple val(sample_id), - path(vcf, stageAs: 'inputs/*'), - path(index, stageAs: 'inputs/*') + tuple val(sample_id), path(vcf, stageAs: 'inputs/*') + val(source_grch_label) path (header_contigs) path (chain_file) + path (gnomad_rds) output: - tuple val(sample_id), path('liftover.vcf.gz'), emit: liftover_vcf + tuple val(sample_id), path('annotations.vcf.gz'), emit: liftover_vcf + tuple val(sample_id), path('annotations.Rds'), emit: r_annotations script: """ - Rscript "${moduleDir}/scripts/liftover-Delly2-vcf.R" \ + Rscript "${moduleDir}/scripts/extract-VCF-features-SV.R" \ --input-vcf "${vcf}" \ - --header-contigs "${header_contigs}" \ + --source-build "${source_grch_label}" \ --chain-file "${chain_file}" \ - --output "liftover.vcf.gz" + --header-contigs "${header_contigs}" \ + --gnomad-rds "${gnomad_rds}" \ + --output-vcf "annotations.vcf.gz" \ + --output-rds "annotations.Rds" """ stub: """ - touch "liftover.vcf.gz" + touch "annotations.Rds" + touch "annotations.vcf.gz" """ } @@ -62,66 +66,33 @@ process run_sort_BCFtools { """ } -process annotate_gnomAD_StableLift { - container params.docker_image_stablelift - - publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}", - pattern: "annotations.Rds", - mode: "copy", - enabled: params.save_intermediate_files, - saveAs: { "LiftOver-${sample_id}-${variant_caller}.Rds" } - - input: - tuple val(sample_id), path(vcf, stageAs: 'inputs/*') - path (gnomad_rds) - val (variant_caller) - - output: - tuple val(sample_id), path('annotations.Rds'), emit: r_annotations - - script: - """ - Rscript "${moduleDir}/scripts/extract-vcf-features-SV.R" \ - --variant-caller "${variant_caller}" \ - --input-vcf "${vcf}" \ - --output-rds "annotations.Rds" \ - --gnomad-rds ${gnomad_rds} - """ - - stub: - """ - touch "annotations.Rds" - """ -} - workflow workflow_extract_sv_annotations { take: vcf_with_sample_id + 
src_sequence header_contigs gnomad_rds chain_file - variant_caller main: - // Step 1: Liftover - liftover_SV_StableLift( - vcf_with_sample_id, + liftover_annotate_SV_StableLift( + // We don't need the index file + vcf_with_sample_id.map{ [it[0], it[1]] }, + + // We only need the source build ID (hg19/hg38), translated to its GRCh label + src_sequence.map{ ["hg19": "GRCh37", "hg38": "GRCh38"][it[0]] }, + header_contigs, - chain_file - ) - run_sort_BCFtools( - liftover_SV_StableLift.out.liftover_vcf + chain_file, + gnomad_rds ) - // Step 2: Extract features - annotate_gnomAD_StableLift( - run_sort_BCFtools.out.sorted_vcf, - gnomad_rds, - variant_caller + run_sort_BCFtools( + liftover_annotate_SV_StableLift.out.liftover_vcf ) emit: liftover_vcf = run_sort_BCFtools.out.sorted_vcf - r_annotations = annotate_gnomAD_StableLift.out.r_annotations + r_annotations = liftover_annotate_SV_StableLift.out.r_annotations }