diff --git a/CHANGELOG.md b/CHANGELOG.md
index 11aa2a1..1fe8ca2 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,6 +15,8 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm
 - Add workflow for SV caller (Delly2)
 - Add pipeline diagram
 - Add reverse liftover (GRCh38 -> GRCh37) for SNV branch
+- Add reverse liftover (GRCh38 -> GRCh37) for SV branch
+- Add optional `target_threshold` and `target_specificity` parameters
 
 ### Changed
 
diff --git a/README.md b/README.md
index 67d27eb..e52acd3 100644
--- a/README.md
+++ b/README.md
@@ -51,10 +51,11 @@ If you are using the UCLA Azure cluster, please use the [submission script](http
 - For SNVs, convert variant coordinates using the `BCFtools` LiftOver plugin with UCSC chain files.
 - For SVs, convert variant breakpoint coordinates using custom R script with UCSC chain files and `rtracklayer` and `GenomicRanges` R packages.
 
-### 2. Variant annotation
+### 2. Variant annotation*
 
 - For SNVs, add dbSNP, GENCODE, and HGNC annotations using GATK's Funcotator. Add trinucleotide context and RepeatMasker intervals with `bedtools`.
 - For SVs, annotate variants with population allele frequency from the gnomAD-SV v4 database.
+- *Variant annotation occurs prior to LiftOver when converting from GRCh38 -> GRCh37
 
 ### 3. Predict variant stability
 
@@ -98,6 +99,8 @@ input:
 
 | Optional Parameter          | Type                                                                                      | Default                      | Description                                                                                                                                                                                                                                                                                                                                                                           |
 | --------------------------- | ----------------------------------------------------------------------------------------- | ---------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `target_threshold`          | numeric                                                                                   | `""`             | Target Stability Score threshold for variant filtering: [0, 1] |
+| `target_specificity`          | numeric                                                                                   | `""`             | Target specificity based on whole genome validation set for variant filtering: [0, 1] |
 | `work_dir`                  | path                                                                                      | `/scratch/$SLURM_JOB_ID`     | Path of working directory for Nextflow. When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With `ucla_cds`, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. |
 | `save_intermediate_files`   | boolean                                                                                   | false                        | If set, save output files from intermediate pipeline processes.                                                                                                                                                                                                                                                                                                                       |
 | `min_cpus`                  | int                                                                                       | 1                            | Minimum number of CPUs that can be assigned to each process.                                                                                                                                                                                                                                                                                                                          |
@@ -117,13 +120,13 @@ The docker images in the following table are generally defined like `docker_imag
 
 * Change `params.docker_container_registry`. This will affect all of the images (except for GATK).
 * Change `params.<tool>_version`. This will pull a different version of the same image from the registry.
-* Change `params.docker_image_<tool>`. This will explicitly set the image to use, ignoring `docker_container_registry` and `<tool>_version`, and thus requires that the docker tag be explicitly set (e.g. `broadinstitute/gatk:4.2.4.1`).
+* Change `params.docker_image_<tool>`. This will explicitly set the image to use, ignoring `docker_container_registry` and `<tool>_version`, and thus requires that the docker tag be explicitly set (e.g. `broadinstitute/gatk:4.4.0.0`).
 
 | Tool Parameter           | Version Parameter    | Default                                                      | Notes                                                               |
 | ------------------------ | -------------------- | ------------------------------------------------------------ | ------------------------------------------------------------------- |
 | `docker_image_bcftools`  | `bcftools_version`   | `ghcr.io/uclahs-cds/bcftools-score:1.20_score-1.20-20240505` | This image must have both BCFtools and the score plugins available. |
 | `docker_image_bedtools`  | `bedtools_version`   | `ghcr.io/uclahs-cds/bedtools:2.31.0`                         |                                                                     |
-| `docker_image_gatk`      | `gatk_version`       | `broadinstitute/gatk:4.2.4.1`                                |                                                                     |
+| `docker_image_gatk`      | `gatk_version`       | `broadinstitute/gatk:4.4.0.0`                                |                                                                     |
 | `docker_image_pipeval`   | `pipeval_version`    | `ghcr.io/uclahs-cds/pipeval:5.0.0-rc.3`                      |                                                                     |
 | `docker_image_samtools`  | `samtools_version`   | `ghcr.io/uclahs-cds/samtools:1.20`                           |                                                                     |
 | `doker_image_stablelift` | `stablelift_version` | `ghcr.io/uclahs-cds/stablelift:FIXME`                        | This image is built and maintained via this repository.             |
@@ -191,7 +194,7 @@ Please see list of [Contributors](https://github.com/uclahs-cds/pipeline-StableL
 
 pipeline-StableLift is licensed under the GNU General Public License version 2. See the file LICENSE for the terms of the GNU GPL license.
 
-StableLift is a machine learning approach designed to predict variant stability across reference genome builds, supplementing LiftOver coordinate conversion and increasing portability of variant calls.
+StableLift is a machine learning approach designed to predict variant stability across reference genome builds, supplementing LiftOver coordinate conversion to increase the portability of variant calls.
 
 Copyright (C) 2024 University of California Los Angeles ("Boutros Lab") All rights reserved.
 
diff --git a/config/custom_schema_types.config b/config/custom_schema_types.config
index bcf24fc..a38d8d9 100644
--- a/config/custom_schema_types.config
+++ b/config/custom_schema_types.config
@@ -43,7 +43,29 @@ custom_schema_types {
         }
     }
 
+    /**
+    * Validate that options[name] is a Number within the inclusive range [properties.min, properties.max]; throws Exception otherwise.
+    */
+    ranged_number = { Map options, String name, Map properties ->
+        if (!(properties.containsKey('min') && properties['min'] in Number)) { // the schema entry itself must declare a numeric `min`
+            throw new Exception('`min` parameter misconfigured - must be a Number.')
+        }
+
+        if (!(properties.containsKey('max') && properties['max'] in Number)) { // the schema entry itself must declare a numeric `max`
+            throw new Exception('`max` parameter misconfigured - must be a Number.')
+        }
+
+        if (!(options[name] in Number)) { // reject non-numeric values before attempting the range comparison
+            throw new Exception("${name} must be a Number, not ${options[name].getClass()}")
+        }
+
+        if (options[name] < properties.min || properties.max < options[name]) { // both bounds are inclusive
+            throw new Exception("${name}=${options[name]} is not in range [${properties.min}, ${properties.max}]")
+        }
+    }
+
     types = [
-        'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source
+        'FuncotatorDataSource': custom_schema_types.check_funcotator_data_source,
+        'RangedNumber': custom_schema_types.ranged_number
     ]
 }
diff --git a/config/schema.yaml b/config/schema.yaml
index 76b8a2f..8e21f40 100644
--- a/config/schema.yaml
+++ b/config/schema.yaml
@@ -99,3 +99,23 @@ input:
       mode: 'r'
       required: true
       help: 'Input dataset supplied by input yaml'
+
+target_threshold:
+  type: 'RangedNumber'
+  required: false
+  min: 0
+  max: 1
+  help: >-
+    Optional parameter specifying target Stability Score threshold for variant
+    filtering. Default behavior without `target_threshold` or
+    `target_specificity` specified uses threshold maximizing F1-score in whole
+    genome validation set.
+
+target_specificity:
+  type: 'RangedNumber'
+  required: false
+  min: 0
+  max: 1
+  help: >-
+    Optional parameter specifying target specificity for variant filtering
+    based on whole genome validation set. Overrides `target_threshold`.
diff --git a/config/template.config b/config/template.config
index e30e086..486b3e6 100644
--- a/config/template.config
+++ b/config/template.config
@@ -5,17 +5,28 @@ includeConfig "${projectDir}/config/default.config"
 includeConfig "${projectDir}/config/methods.config"
 includeConfig "${projectDir}/nextflow.config"
 
-
 // Inputs/parameters of the pipeline
 params {
     // input/output locations
-    output_dir = 'where/to/save/outputs/'
+    output_dir = "where/to/save/outputs/"
 
-    // Choices: ["Mutect2", "HaplotypeCaller"]
+    // Choices: ["HaplotypeCaller", "Mutect2", "Strelka2", "SomaticSniper", "Muse2", "Delly2"]
     variant_caller = "Mutect2"
 
+    // Path to pre-trained random forest model
     rf_model = ""
 
+    // Optional parameter specifying target Stability Score threshold for
+    // variant filtering. Default behavior without `target_threshold` or
+    // `target_specificity` specified uses threshold maximizing F1-score in
+    // whole genome validation set. Must be in the range [0.0, 1.0].
+    // target_threshold = 0.5
+
+    // Optional parameter specifying target specificity for variant filtering
+    // based on whole genome validation set. Overrides `target_threshold`. Must
+    // be in the range [0.0, 1.0].
+    // target_specificity = 0.5
+
     // Reference files
     funcotator_data {
         data_source = "/hot/ref/tool-specific-input/Funcotator/somatic/funcotator_dataSources.v1.7.20200521s"
diff --git a/docs/pipeline.mmd b/docs/pipeline.mmd
index 0bc792b..9db31c5 100644
--- a/docs/pipeline.mmd
+++ b/docs/pipeline.mmd
@@ -64,7 +64,7 @@ flowchart TD
         --> bcftools_annotate2["`bcftools annotate*Trinucleotide*`"]:::bcftools
     end
 
-    blocknote["`**Note:** Annotation is performed before Liftover when lifting backward`"]
+    blocknote["`**Note:** Annotation is performed prior to LiftOver when converting from GRCh38 -> GRCh37`"]
 
     bcftools_liftover ---> gatk_func
     bcftools_annotate2 --> r_extract_snv[extract-VCF-features.R]:::R
@@ -79,7 +79,7 @@ flowchart TD
   joinpaths ---> r_predict_stability
 
   subgraph Predict Stability ["`&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;**Predict Stability**`"]
-    r_predict_stability[predict-liftover-stability.R]:::R
+    r_predict_stability[predict-variant-stability.R]:::R
     --> bcftools_annotate3["`bcftools annotate*Stability*`"]:::bcftools
 
     rf_model([rf_model]):::input .-> r_predict_stability
diff --git a/main.nf b/main.nf
index abac9ca..0526d8f 100644
--- a/main.nf
+++ b/main.nf
@@ -42,6 +42,8 @@ log.info """\
         chain_file: ${params.chain_file}
         repeat_bed: ${params.repeat_bed}
 
+        header_contigs: ${params.getOrDefault('header_contigs', null)}
+
         funcotator_data:
             data_source:       ${params.funcotator_data.data_source}
             src_reference_id:  ${params.funcotator_data.src_reference_id}
@@ -149,10 +151,10 @@ workflow {
         // Take the SV branch
         workflow_extract_sv_annotations(
             validated_vcf_tuple,
+            input_ch_src_sequence,
             Channel.value(params.header_contigs),
             Channel.value(params.gnomad_rds),
-            Channel.value(params.chain_file),
-            Channel.value(params.variant_caller)
+            Channel.value(params.chain_file)
         )
 
         workflow_extract_sv_annotations.out.liftover_vcf.set { liftover_vcf }
diff --git a/module/predict_stability.nf b/module/predict_stability.nf
index 1ae6987..5ab6dac 100644
--- a/module/predict_stability.nf
+++ b/module/predict_stability.nf
@@ -18,12 +18,17 @@ process predict_stability_StableLift {
     tuple val(sample_id), path("stability.tsv"), emit: stability_tsv
 
     script:
+    spec_arg = (params.getOrDefault('target_specificity', null) != null) ? "--specificity \"${params.get('target_specificity')}\"" : ""
+    thresh_arg = (params.getOrDefault('target_threshold', null) != null) ? "--threshold \"${params.get('target_threshold')}\"" : ""
+
     """
-    Rscript "${moduleDir}/scripts/predict-liftover-stability.R" \
+    Rscript "${moduleDir}/scripts/predict-variant-stability.R" \
+        --variant-caller "${variant_caller}" \
         --features-dt "${features_rds}" \
         --rf-model "${rf_model}" \
-        --variant-caller "${variant_caller}" \
-        --output-tsv "stability.tsv"
+        --output-tsv "stability.tsv" \
+        ${spec_arg} \
+        ${thresh_arg}
     """
 
     stub:
@@ -43,7 +48,6 @@ process run_apply_stability_annotations {
     input:
     tuple val(sample_id),
         path(annotated_vcf, stageAs: 'inputs/*'),
-        // FIXME Should there be an annotated_vcf_tbi?
         path(stability_tsv, stageAs: 'inputs/*'),
         path(stability_tsv_tbi, stageAs: 'inputs/*')
 
diff --git a/module/scripts/extract-VCF-features-SV.R b/module/scripts/extract-VCF-features-SV.R
new file mode 100644
index 0000000..264ce4e
--- /dev/null
+++ b/module/scripts/extract-VCF-features-SV.R
@@ -0,0 +1,269 @@
+#!/usr/bin/env Rscript
+# extract-VCF-features-SV.R
+####################################################################################################
+#
+# LiftOver Delly2 structural variants and annotate with gnomAD population allele frequency
+# Extract VCF features and save as Rds for input to predict-variant-stability.R
+#
+####################################################################################################
+
+suppressPackageStartupMessages({
+    library(vcfR);
+    library(data.table);
+    library(argparse);
+    library(rtracklayer);
+    library(GenomicRanges);
+    });
+
+###################################################################################################
+# Input
+###################################################################################################
+# Define command line arguments
+parser <- ArgumentParser();
+parser$add_argument('--input-vcf', type = 'character', help = 'Input Delly2 VCF');
+parser$add_argument('--source-build', type = 'character', help = 'One of {GRCh37, GRCh38}');
+parser$add_argument('--chain-file', type = 'character', help = 'Chain file for coordinate conversion');
+parser$add_argument('--header-contigs', type = 'character', help = 'Resource file with VCF header for target build');
+parser$add_argument('--gnomad-rds', type = 'character', help = 'gnomAD-SV v4 resource file');
+parser$add_argument('--output-vcf', type = 'character', help = 'VCF output');
+parser$add_argument('--output-rds', type = 'character', help = 'Rds output for input to RF model');
+args <- parser$parse_args();
+
+# Save command line arguments
+for (arg in names(args)) {
+    assign(gsub('_', '.', arg), args[[arg]]);
+    }
+
+###################################################################################################
+# Functions
+###################################################################################################
+# Convert a vector of VCF INFO strings (e.g. the INFO column of vcfR's @fix matrix) into a data.table with one column per INFO key
+vcf.info.to.dt <- function(vcf.info) {
+    vcf.info <- lapply(vcf.info, function(x) vcf.info.string.to.list(x)); # one named character vector per record
+    feature.names <- unique(unlist(lapply(vcf.info, names))); # union of INFO keys seen across all records
+    vcf.info <- do.call(mapply, c(FUN = list, lapply(vcf.info, `[`, feature.names))); # align every record to the same key order (absent keys index to NA), then regroup values by key
+    setNames(as.data.table(vcf.info), feature.names); # last expression is the return value
+    }
+
+# Split one VCF INFO string into a named character vector of values keyed by INFO field; optionally keep only `keep.columns`
+vcf.info.string.to.list <- function(vcf.info, keep.columns = NULL) {
+    list.out <- strsplit(vcf.info, split = ';'); # break the INFO string into its ';'-separated entries
+    list.out <- lapply(list.out, function(x) strsplit(x, split = '=')); # break each entry into key and (optional) value
+    labels <- sapply(list.out[[1]], function(x) x[[1]]); # only the first input string is parsed (list.out[[1]])
+    values <- sapply(list.out[[1]], function(x) if (length(x) == 2) x[[2]] else x[[1]]); # entries that do not split into exactly two parts (e.g. flags) use the key as the value
+    names(values) <- labels;
+    if (is.null(keep.columns)) return(values); # no filter requested: return every field
+    values <- values[labels %in% keep.columns];
+    return(values);
+    }
+
+calculate.VAF <- function(GT.row) {
+    called.alleles <- 2 * sum(GT.row %in% c('0/0', '0/1', '1/1'), na.rm = TRUE); # two alleles per called unphased diploid genotype
+    alt.alleles <- sum(GT.row == '0/1', na.rm = TRUE) + 2 * sum(GT.row == '1/1', na.rm = TRUE); # het adds one ALT allele, hom-alt adds two
+    return(alt.alleles / called.alleles);
+    }
+
+get.overlap <- function(start1, end1, start2, end2) {
+    shared.length <- pmin(end1, end2) - pmax(start1, start2); # negative when the two intervals do not intersect
+    longest.length <- pmax(end1 - start1, end2 - start2); # normalise by the longer of the two intervals
+    return(shared.length / longest.length);
+    }
+
+find.SV.match <- function(this.ID, input, reference, overlap, offset) {
+    # Look up the query variant by ID, then keep only reference SVs with the same SVTYPE and CHROM
+    this.variant <- input[ID == this.ID];
+    reference <- reference[SVTYPE == this.variant$SVTYPE & CHROM == this.variant$CHROM];
+
+    if (this.variant$SVTYPE == 'BND') {
+        reference <- reference[CHR2 == this.variant$CHR2]; # BND: the partner chromosome must match as well
+        reference[, OFFSET := abs(POS - this.variant$POS)]; # absolute distance between POS values
+        reference <- reference[OFFSET < offset][order(OFFSET)]; # keep candidates closer than `offset`, nearest first
+    } else {
+        reference[, OVERLAP := get.overlap(POS, END, this.variant$POS, this.variant$END)]; # overlap length divided by the longer interval's length
+        reference <- reference[OVERLAP > overlap][order(OVERLAP, decreasing = TRUE)]; # keep candidates above `overlap`, best first
+        }
+
+    return(list(gnomad.match.ID = reference[1, ID], gnomad.matches = nrow(reference))); # ID of the top-ranked candidate and the number of candidates kept
+    }
+
+annotate.gnomad.features <- function(features.dt, features.dt.gnomad) {
+    gnomad.features <- c('ID', 'AF', 'POPMAX_AF'); # reference columns carried over onto matched variants
+    features.dt[, c('gnomad.match.ID', 'gnomad.matches') := rbindlist(lapply(ID, find.SV.match, input = features.dt, reference = features.dt.gnomad, overlap = 0.8, offset = 500))]; # adds both columns to the caller's table by reference
+    return(merge(features.dt, features.dt.gnomad[, ..gnomad.features], all.x = TRUE, by.x = 'gnomad.match.ID', by.y = 'ID')); # left join; returned explicitly rather than as the invisible value of a trailing assignment
+    }
+
+###################################################################################################
+# Load files
+###################################################################################################
+input.vcf <- read.vcfR(input.vcf);
+header.contigs <- scan(header.contigs, character());
+liftover.chain <- import.chain(chain.file);
+features.dt.gnomad <- readRDS(gnomad.rds);
+
+###################################################################################################
+# Data preprocessing
+###################################################################################################
+# Convert variant information into dt
+if (any(duplicated(input.vcf@fix[, 'ID']))) input.vcf@fix[, 'ID'] <- paste0(substr(input.vcf@fix[, 'ID'], 1, 3), sprintf('%08d', seq_len(nrow(input.vcf@fix))));
+input.info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']);
+input.fix <- as.data.table(input.vcf@fix);
+features.dt <- cbind(input.fix[, -c('INFO')], input.info);
+
+# Format columns
+features.dt[, CONSENSUS := NULL];
+numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'RDRATIO', 'SVLEN', 'POS2');
+character.columns <- names(features.dt)[!names(features.dt) %in% numeric.columns];
+features.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns];
+features.dt[, (character.columns) := lapply(.SD, as.character), .SDcols = character.columns];
+
+# Extract and aggregate per sample GT fields
+gt.fields <- c('GQ', 'RC', 'RDCN', 'DR', 'DV', 'RR', 'RV');
+for (field in gt.fields) {
+    features.dt[, (field) := apply(extract.gt(input.vcf, element = ..field, as.numeric = TRUE), 1, mean, na.rm = TRUE)];
+    }
+features.dt[, COHORT_AF := apply(extract.gt(input.vcf, element = 'GT'), 1, calculate.VAF)];
+features.dt[, CIPOS := as.numeric(sapply(CIPOS, function(x) unlist(strsplit(x, ','))[2]))];
+
+if (source.build == 'GRCh37') {
+    features.dt[, CHROM := paste0('chr', CHROM)];
+    features.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, paste0('chr', CHR2))];
+} else if (source.build == 'GRCh38') {
+    # Annotate with gnomAD before LiftOver if source build == GRCh38
+    features.dt <- annotate.gnomad.features(features.dt, features.dt.gnomad);
+    }
+
+###################################################################################################
+# LiftOver variants by breakpoint
+###################################################################################################
+# Create GRanges object
+grange.source <- makeGRangesFromDataFrame(
+    df = features.dt,
+    seqnames.field = 'CHROM',
+    start.field = 'POS',
+    end.field = 'END',
+    keep.extra.columns = TRUE
+    );
+
+# LiftOver using chain file
+grange.target <- unlist(liftOver(grange.source, liftover.chain));
+grange.target.dt <- as.data.table(grange.target);
+
+# Create GRanges object using CHROM, CHR2, and POS2 from features.dt
+grange.source.BND <- makeGRangesFromDataFrame(
+    df = features.dt[SVTYPE == 'BND', ],
+    seqnames.field = 'CHR2',
+    start.field = 'POS2',
+    end.field = 'POS2',
+    keep.extra.columns = TRUE
+    );
+grange.target.BND <- as.data.table(unlist(liftOver(grange.source.BND, liftover.chain)));
+
+# Remove multiple mappings
+grange.target.dt <- grange.target.dt[!duplicated(ID)];
+grange.target.BND <- grange.target.BND[!duplicated(ID)];
+common <- intersect(grange.target.dt$ID, grange.target.BND$ID);
+
+grange.target.dt[ID %in% common, c('CHR2', 'POS2') := grange.target.BND[ID %in% common, .(seqnames, start)]];
+
+if (source.build == 'GRCh38') {
+    grange.target.dt[, seqnames := sub('chr', '', seqnames)];
+    grange.target.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, sub('chr', '', CHR2))];
+    }
+
+###################################################################################################
+# Write output VCF
+###################################################################################################
+pass.liftover <- as.data.table(input.vcf@fix)$ID %in% grange.target.dt$ID;
+input.fix <- as.data.table(input.vcf@fix)[pass.liftover];
+input.gt <- as.data.table(input.vcf@gt)[pass.liftover];
+grange.target.dt <- grange.target.dt[match(input.fix$ID, grange.target.dt$ID)];
+
+for (i in seq_len(nrow(input.fix))) {
+    this.ID <- input.fix[i, ID];
+    this.INFO <- vcf.info.string.to.list(input.fix[i, INFO]);
+    this.INFO[['END']] <- grange.target.dt[i, end];
+    if (this.INFO[['SVTYPE']] == 'BND') {
+        this.INFO[['CHR2']] <- grange.target.dt[i, CHR2];
+        this.INFO[['POS2']] <- grange.target.dt[i, POS2];
+        }
+    this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '='));
+    this.INFO <- paste(this.INFO, collapse = ';');
+    this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO);
+    this.INFO <- gsub('PRECISE=PRECISE', 'PRECISE', this.INFO);
+    this.INFO <- gsub('SOMATIC=SOMATIC', 'SOMATIC', this.INFO);
+    input.fix[i, c('CHROM', 'POS', 'INFO') := grange.target.dt[i, .(seqnames, start, ..this.INFO)]];
+    }
+
+lifted.vcf <- input.vcf;
+lifted.vcf@fix <- as.matrix(input.fix);
+lifted.vcf@gt <- as.matrix(input.gt);
+lifted.vcf@meta <- lifted.vcf@meta[!grepl('^##(contig|reference)', lifted.vcf@meta)];
+lifted.vcf@meta <- c(lifted.vcf@meta, header.contigs);
+
+write.vcf(lifted.vcf, output.vcf);
+
+###################################################################################################
+# Format features for RF
+###################################################################################################
+features.dt <- features.dt[ID %in% grange.target.dt$ID];
+features.dt <- features.dt[match(input.fix$ID, features.dt$ID)];
+features.dt[, c('CHROM', 'POS', 'END', 'CHR2', 'POS2') := grange.target.dt[, .(seqnames, start, end, CHR2, POS2)]];
+features.dt[!SVTYPE %in% c('BND', 'INS'), SVLEN := END - POS + 1];
+
+if (source.build == 'GRCh37') {
+    features.dt <- annotate.gnomad.features(features.dt, features.dt.gnomad);
+    }
+
+continuous.features <- c(
+    'POS',
+    'QUAL',
+    'END',
+    'PE',
+    'MAPQ',
+    'CIPOS',
+    'SRMAPQ',
+    'HOMLEN',
+    'SR',
+    'SRQ',
+    'CE',
+    'RDRATIO',
+    'SVLEN',
+    'GQ',
+    'RC',
+    'RDCN',
+    'DR',
+    'DV',
+    'RR',
+    'RV',
+    'AF',
+    'gnomad.matches',
+    'POPMAX_AF'
+    );
+
+categorical.features <- c(
+    'CHROM',
+    'SVTYPE',
+    'CT'
+    );
+
+# Extract features and format
+continuous.features <- continuous.features[continuous.features %in% names(features.dt)];
+categorical.features <- categorical.features[categorical.features %in% names(features.dt)];
+all.features <- c(continuous.features, categorical.features, 'ID');
+
+features.dt <- features.dt[, ..all.features];
+features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features];
+features.dt[, (continuous.features) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = continuous.features];
+features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features];
+features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features];
+names(features.dt) <- make.names(names(features.dt));
+
+# Remove rows with NA
+features.dt.rows <- nrow(features.dt);
+features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ];
+cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n');
+
+###################################################################################################
+# Save features.dt for input to RF
+###################################################################################################
+saveRDS(features.dt, output.rds);
diff --git a/module/scripts/extract-vcf-features.R b/module/scripts/extract-VCF-features.R
similarity index 87%
rename from module/scripts/extract-vcf-features.R
rename to module/scripts/extract-VCF-features.R
index 5aa3d75..641d653 100644
--- a/module/scripts/extract-vcf-features.R
+++ b/module/scripts/extract-VCF-features.R
@@ -1,10 +1,8 @@
 #!/usr/bin/env Rscript
-# extract-vcf-features.R
+# extract-VCF-features.R
 ####################################################################################################
 #
-# Extract features from vcf
-# Extract Funcotator annotations if present
-# Annotate with RepeatMasker regions if intersect file is provided
+# Extract VCF features and save as Rds for input to predict-variant-stability.R
 #
 ####################################################################################################
 
@@ -21,11 +19,11 @@ suppressPackageStartupMessages({
 ###################################################################################################
 # Define command line arguments
 parser <- ArgumentParser();
-parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 vcf lifted to GRCh38 for feature extraction');
-parser$add_argument('--input-dir', type = 'character', help = 'Directory with vcf subsets');
-parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model');
-parser$add_argument('--variant-caller', type = 'character', help = '');
-parser$add_argument('--ncore', type = 'integer', help = 'Number of cores to use for parallelizing features extraction', default = 1);
+parser$add_argument('--input-vcf', type = 'character', help = 'Input VCF for feature extraction, mutually exclusive with --input-dir');
+parser$add_argument('--input-dir', type = 'character', help = 'Directory with VCF subsets for parallelization, mutually exclusive with --input-vcf');
+parser$add_argument('--output-rds', type = 'character', help = 'Rds output for input to RF model');
+parser$add_argument('--variant-caller', type = 'character', help = 'One of {HaplotypeCaller, Mutect2, Strelka2, SomaticSniper, Muse2, Delly2}');
+parser$add_argument('--ncore', type = 'integer', help = 'Number of cores to use for processing VCF subsets in --input-dir', default = 1);
 args <- parser$parse_args();
 
 # Save command line arguments
@@ -33,13 +31,6 @@ for (arg in names(args)) {
     assign(gsub('_', '.', arg), args[[arg]]);
     }
 
-# Set parameters for interactive runs
-if (interactive()) {
-    variant.caller <- 'Strelka2';
-    input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSNP/stableLift/validate_TCGA-SARC_WXS/TCGA-SARC_WXS_HaplotypeCaller_LiftOver-GRCh38_annotated_exome.vcf.gz';
-    vcf.subset <- input.vcf;
-    }
-
 if (!is.null(input.dir)) {
     vcf.subsets <- list.files(input.dir, full.names = TRUE, pattern = '(\\.vcf.gz|\\.vcf)$');
     output.path <- output.rds;
@@ -116,6 +107,21 @@ features.dt.subsets <- foreach(vcf.subset = vcf.subsets) %dopar% {
         info[input.vcf@fix[, 'REF'] == 'G', REFCOUNTS := apply(extract.gt(input.vcf, element = 'GU')[input.vcf@fix[, 'REF'] == 'G', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))];
         info[input.vcf@fix[, 'ALT'] == 'G', ALTCOUNTS := apply(extract.gt(input.vcf, element = 'GU')[input.vcf@fix[, 'ALT'] == 'G', ], 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[1])), na.rm = TRUE))];
         info[, AF := ALTCOUNTS / (REFCOUNTS + ALTCOUNTS)];
+    } else if (variant.caller == 'SomaticSniper') {
+        # Calculate VAF from allelic depths
+        info$AF <- apply(extract.gt(input.vcf, element = 'DP4'), 1, function(x) mean(
+            sapply(strsplit(x, ','), function(y) {
+                y <- as.numeric(y);
+                return((y[3] + y[4]) / sum(y));
+                }),
+            na.rm = TRUE
+            ));
+        info$AMQ <- apply(extract.gt(input.vcf, element = 'AMQ'), 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[2])), na.rm = TRUE));
+        info$BQ <- apply(extract.gt(input.vcf, element = 'BQ'), 1, function(x) mean(sapply(strsplit(x, ','), function(y) as.numeric(y[2])), na.rm = TRUE));
+        info$GQ <- apply(extract.gt(input.vcf, element = 'GQ', as.numeric = TRUE), 1, mean, na.rm = TRUE);
+        info$MQ <- apply(extract.gt(input.vcf, element = 'MQ', as.numeric = TRUE), 1, mean, na.rm = TRUE); # read the MQ FORMAT field (was mistakenly 'GQ', duplicating info$GQ)
+        info$SSC <- apply(extract.gt(input.vcf, element = 'SSC', as.numeric = TRUE), 1, mean, na.rm = TRUE);
+        info$VAQ <- apply(extract.gt(input.vcf, element = 'VAQ', as.numeric = TRUE), 1, mean, na.rm = TRUE);
         }
 
     # Get funcotation fields
@@ -230,6 +236,14 @@ if (variant.caller == 'Strelka2') continuous.features <- c(continuous.features,
 if (variant.caller == 'Muse2') continuous.features <- c(continuous.features,
     'Variant Base Quality (BQ)' = 'BQ'
     );
+if (variant.caller == 'SomaticSniper') continuous.features <- c(continuous.features,
+    'Variant Mapping Quality (AMQ)' = 'AMQ',
+    'Base Quality (BQ)' = 'BQ',
+    'Genotype Quality (GQ)' = 'GQ',
+    'Mapping Quality (MQ)' = 'MQ',
+    'Somatic Score (SSC)' = 'SSC',
+    'Variant Allele Quality (VAQ)' = 'VAQ'
+    );
 
 categorical.features <- c(
     'Chromosome (CHR)' = 'CHROM',
diff --git a/module/scripts/extract-vcf-features-SV.R b/module/scripts/extract-vcf-features-SV.R
deleted file mode 100644
index 9927a7e..0000000
--- a/module/scripts/extract-vcf-features-SV.R
+++ /dev/null
@@ -1,181 +0,0 @@
-#!/usr/bin/env Rscript
-# extract-vcf-features-SV.R
-####################################################################################################
-#
-# Extract features from vcf
-# Intersect and annotate with gnomAD-SV vcf
-#
-####################################################################################################
-
-suppressPackageStartupMessages({
-    library(vcfR);
-    library(data.table);
-    library(argparse);
-    library(GenomicRanges);
-    });
-
-###################################################################################################
-# Input
-###################################################################################################
-# Define command line arguments
-parser <- ArgumentParser();
-parser$add_argument('--variant-caller', type = 'character', help = '');
-parser$add_argument('--input-vcf', type = 'character', help = 'Delly2 vcf');
-parser$add_argument('--output-rds', type = 'character', help = 'Rds output for use in RF model');
-parser$add_argument('--gnomad-rds', type = 'character', help = 'gnomAD Rds file');
-args <- parser$parse_args();
-
-# Save command line arguments
-for (arg in names(args)) {
-    assign(gsub('_', '.', arg), args[[arg]]);
-    }
-
-# Set parameters for interactive runs
-if (interactive()) {
-    variant.caller <- 'Delly2';
-    input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz';
-    gnomad.rds <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/data/gnomad.v4.0.sv.Rds';
-    }
-
-###################################################################################################
-# Functions
-###################################################################################################
-vcf.info.to.dt <- function(vcf.info) {
-    # Split each string by semicolon and convert to a list of key-value pairs
-    vcf.info <- strsplit(vcf.info, ';');
-    vcf.info <- lapply(vcf.info, function(x) {
-        x <- strsplit(x, '=');
-        as.list(stats::setNames(sapply(x, `[`, 2), sapply(x, `[`, 1)));
-        })
-
-    # Combine the list of key-value pairs into a data table
-    rbindlist(vcf.info, fill = TRUE);
-    }
-
-calculate.VAF <- function(GT.row) {
-    total <- sum(GT.row %in% c('0/0', '0/1', '1/1'), na.rm = TRUE) * 2;
-    alt <- sum(GT.row == '0/1', na.rm = TRUE) + sum(GT.row == '1/1', na.rm = TRUE) * 2;
-    return(alt / total);
-    }
-
-get.overlap <- function(start1, end1, start2, end2) {
-    max.length <- pmax((end1 - start1), (end2 - start2));
-    overlap.length <- pmin(end1, end2) - pmax(start1, start2);
-    return(overlap.length / max.length);
-    }
-
-find.SV.match <- function(this.ID, input, reference, overlap, offset) {
-    # Match SV type and CHR
-    this.variant <- input[ID == this.ID];
-    reference <- reference[SVTYPE == this.variant$SVTYPE & CHROM == this.variant$CHROM];
-
-    if (this.variant$SVTYPE == 'BND') {
-        # reference[, OFFSET := abs(POS - this.variant$POS) + abs(POS2 - this.variant$POS2)];
-        reference[, OFFSET := abs(POS - this.variant$POS)];
-        reference <- reference[OFFSET < offset & CHR2 == this.variant$CHR2][order(OFFSET)];
-    } else {
-        reference[, OVERLAP := get.overlap(POS, END, this.variant$POS, this.variant$END)];
-        reference <- reference[OVERLAP > overlap][order(OVERLAP, decreasing = TRUE)];
-        }
-
-    return(list(gnomad.match.ID = reference[1, ID], gnomad.matches = nrow(reference)));
-    }
-
-###################################################################################################
-# Load files
-###################################################################################################
-input.vcf <- read.vcfR(input.vcf);
-features.dt.gnomad <- readRDS(gnomad.rds);
-
-###################################################################################################
-# Data preprocessing
-###################################################################################################
-# Convert variant information into dt
-input.info <- vcf.info.to.dt(input.vcf@fix[, 'INFO']);
-input.fix <- as.data.table(input.vcf@fix);
-features.dt <- cbind(input.fix[, -c('INFO')], input.info);
-
-# Format columns
-features.dt[, CONSENSUS := NULL];
-numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'RDRATIO', 'SVLEN', 'POS2');
-features.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns];
-
-# Extract and aggregate per sample GT fields
-gt.fields <- c('GQ', 'RC', 'RDCN', 'DR', 'DV', 'RR', 'RV');
-for (field in gt.fields) {
-    features.dt[, (field) := apply(extract.gt(input.vcf, element = ..field, as.numeric = TRUE), 1, mean, na.rm = TRUE)];
-    }
-features.dt[, COHORT_AF := apply(extract.gt(input.vcf, element = 'GT'), 1, calculate.VAF)];
-features.dt[!SVTYPE %in% c('BND', 'INS'), SVLEN := END - POS + 1];
-features.dt[, CIPOS := as.numeric(sapply(CIPOS, function(x) unlist(strsplit(x, ','))[2]))];
-
-###################################################################################################
-# Intersect variants with gnomAD SVs
-###################################################################################################
-start.time <- Sys.time();
-
-# features.dt <- features.dt[1:100];
-features.dt[, c('gnomad.match.ID', 'gnomad.matches') := rbindlist(lapply(ID, find.SV.match, input = features.dt, reference = features.dt.gnomad, overlap = 0.8, offset = 500))];
-
-gnomad.features <- c('ID', 'AF', 'POPMAX_AF', 'NCR');
-features.dt <- merge(features.dt, features.dt.gnomad[, ..gnomad.features], all.x = TRUE, by.x = 'gnomad.match.ID', by.y = 'ID');
-
-cat(format(Sys.time() - start.time, nsmall = 2), '\n');
-
-###################################################################################################
-# Format features for RF
-###################################################################################################
-continuous.features <- c(
-    'POS',
-    'QUAL',
-    'END',
-    'PE',
-    'MAPQ',
-    'CIPOS',
-    'SRMAPQ',
-    'HOMLEN',
-    'SR',
-    'SRQ',
-    'CE',
-    'RDRATIO',
-    'SVLEN',
-    'GQ',
-    'RC',
-    'RDCN',
-    'DR',
-    'DV',
-    'RR',
-    'RV',
-    'gnomad.matches',
-    'AF',
-    'POPMAX_AF',
-    'NCR'
-    );
-
-categorical.features <- c(
-    'CHROM',
-    'SVTYPE',
-    'CT'
-    );
-
-# Extract features and format
-continuous.features <- continuous.features[continuous.features %in% names(features.dt)];
-categorical.features <- categorical.features[categorical.features %in% names(features.dt)];
-all.features <- c(continuous.features, categorical.features, 'ID');
-
-features.dt <- features.dt[, ..all.features];
-features.dt[, (continuous.features) := lapply(.SD, as.numeric), .SDcols = continuous.features];
-features.dt[, (continuous.features) := lapply(.SD, function(x) ifelse(is.na(x), 0, x)), .SDcols = continuous.features];
-features.dt[, (categorical.features) := lapply(.SD, function(x) ifelse(is.na(x), '', x)), .SDcols = categorical.features];
-features.dt[, (categorical.features) := lapply(.SD, as.factor), .SDcols = categorical.features];
-names(features.dt) <- make.names(names(features.dt));
-
-# Remove rows with NA
-features.dt.rows <- nrow(features.dt);
-features.dt <- features.dt[apply(features.dt, 1, function(x) !any(is.na(x))), ];
-cat('Removed', features.dt.rows - nrow(features.dt), 'rows with missing data\n');
-
-###################################################################################################
-# Save features.dt for input to RF
-###################################################################################################
-saveRDS(features.dt, output.rds);
diff --git a/module/scripts/liftover-Delly2-vcf.R b/module/scripts/liftover-Delly2-vcf.R
deleted file mode 100644
index d473091..0000000
--- a/module/scripts/liftover-Delly2-vcf.R
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env Rscript
-# liftover-Delly2-vcf.R
-###################################################################################################
-#
-#
-#
-###################################################################################################
-
-suppressPackageStartupMessages({
-    library(vcfR);
-    library(data.table);
-    library(argparse);
-    library(rtracklayer);
-    });
-
-###################################################################################################
-# Input
-###################################################################################################
-# Define command line arguments
-parser <- ArgumentParser();
-parser$add_argument('--input-vcf', type = 'character', help = 'GRCh37 Delly2 vcf');
-parser$add_argument('--header-contigs', type = 'character', help = 'Directory with vcf subsets');
-parser$add_argument('--chain-file', type = 'character', help = 'hg19ToHg38.over.chain file');
-parser$add_argument('--output', type = 'character', help = 'Where to write lifted vcf');
-args <- parser$parse_args();
-
-# Save command line arguments
-for (arg in names(args)) {
-    assign(gsub('_', '.', arg), args[[arg]]);
-    }
-
-# Set parameters for interactive runs
-if (interactive()) {
-    # input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/gSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_regenotype-gSV_delly_bcftools-merge_delly-filter-germline.vcf.gz';
-    input.vcf <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/bcftools-merge/CPCG-40QC_GRCh37/CPCG-40QC_GRCh37_call-sSV_delly_bcftools-merge_somatic-only.vcf.gz';
-    header.contigs <- '/hot/code/nkwang/GitHub/uclahs-cds/project-method-AlgorithmEvaluation-BNCH-000142-GRCh37v38/report/manuscript/publish/GRCh38-vcf-header-contigs.txt';
-    chain.file <- '/hot/resource/genomics/liftover_chain_files/hg19ToHg38.over.chain';
-    output <- '/hot/project/method/AlgorithmEvaluation/BNCH-000142-GRCh37v38/sSV/stableLift/train_CPCG-40QC_Delly2/CPCG-40QC_Delly2_LiftOver-GRCh38.vcf.gz';
-    }
-
-###################################################################################################
-# Functions
-###################################################################################################
-vcf.fix.to.dt <- function(vcf.fix) {
-    vcf.fix <- as.data.table(vcf.fix);
-    vcf.info <- vcf.info.to.dt(vcf.fix$INFO);
-    cbind(vcf.fix[, -'INFO'], vcf.info);
-    }
-
-# vcfR::getINFO() to data.table
-vcf.info.to.dt <- function(vcf.info) {
-    vcf.info <- lapply(vcf.info, function(x) vcf.info.string.to.list(x));
-    feature.names <- unique(unlist(lapply(vcf.info, names)));
-    vcf.info <- do.call(mapply, c(FUN = list, lapply(vcf.info, `[`, feature.names)));
-    setNames(as.data.table(vcf.info), feature.names);
-    }
-
-# Split vcf info field to list
-vcf.info.string.to.list <- function(vcf.info, keep.columns = NULL) {
-    list.out <- strsplit(vcf.info, split = ';');
-    list.out <- lapply(list.out, function(x) strsplit(x, split = '='));
-    labels <- sapply(list.out[[1]], function(x) x[[1]]);
-    values <- sapply(list.out[[1]], function(x) if (length(x) == 2) x[[2]] else x[[1]]);
-    names(values) <- labels;
-    if (is.null(keep.columns)) return(values);
-    values <- values[labels %in% keep.columns];
-    return(values);
-    }
-
-###################################################################################################
-# Load files
-###################################################################################################
-input.vcf.path <- input.vcf;
-input.vcf <- read.vcfR(input.vcf);
-header.contigs <- scan(header.contigs, character());
-liftover.chain <- import.chain(chain.file);
-
-###################################################################################################
-# Data preprocessing
-###################################################################################################
-if (any(duplicated(input.vcf@fix[, 'ID']))) input.vcf@fix[, 'ID'] <- paste0(substr(input.vcf@fix[, 'ID'], 1, 3), sprintf('%08d', seq_len(nrow(input.vcf@fix))));
-# if (any(duplicated(input.vcf@fix[, 'ID']))) fix.dt[, ID := paste0(substr(ID, 1, 3), sprintf('%08d', seq_len(nrow(fix.dt))))];
-fix.dt <- as.data.table(input.vcf@fix);
-gt.dt <- as.data.table(input.vcf@gt);
-
-fix.dt <- vcf.fix.to.dt(fix.dt);
-fix.dt[, CHROM := paste0('chr', CHROM)];
-fix.dt[, CHR2 := ifelse(is.na(CHR2), CHR2, paste0('chr', CHR2))];
-
-fix.dt <- fix.dt[, -c('CONSENSUS')];
-numeric.columns <- c('POS', 'QUAL', 'END', 'PE', 'MAPQ', 'SRMAPQ', 'INSLEN', 'HOMLEN', 'SR', 'SRQ', 'CE', 'SVLEN', 'POS2');
-character.columns <- names(fix.dt)[!names(fix.dt) %in% numeric.columns];
-fix.dt[, (numeric.columns) := lapply(.SD, as.numeric), .SDcols = numeric.columns];
-fix.dt[, (character.columns) := lapply(.SD, as.character), .SDcols = character.columns];
-
-###################################################################################################
-# Liftover
-###################################################################################################
-# Create GRanges object
-granges.37 <- makeGRangesFromDataFrame(
-    df = fix.dt,
-    seqnames.field = 'CHROM',
-    start.field = 'POS',
-    end.field = 'END',
-    keep.extra.columns = TRUE
-    );
-
-# Liftover using chain file
-granges.38 <- unlist(liftOver(granges.37, liftover.chain));
-granges.38.dt <- as.data.table(granges.38);
-
-# Create GRanges object using CHROM, CHR2, and POS2 from fix.dt
-granges.37.BND <- makeGRangesFromDataFrame(
-    df = fix.dt[SVTYPE == 'BND', ],
-    seqnames.field = 'CHR2',
-    start.field = 'POS2',
-    end.field = 'POS2',
-    keep.extra.columns = TRUE
-    );
-granges.38.BND <- as.data.table(unlist(liftOver(granges.37.BND, liftover.chain)));
-
-# Remove multiple mappings
-granges.38.dt <- granges.38.dt[!duplicated(ID)];
-granges.38.BND <- granges.38.BND[!duplicated(ID)];
-common <- intersect(granges.38.dt$ID, granges.38.BND$ID);
-
-granges.38.dt[ID %in% common, c('CHR2', 'POS2') := granges.38.BND[ID %in% common, .(seqnames, start)]];
-
-pass.liftover <- as.data.table(input.vcf@fix)$ID %in% granges.38.dt$ID;
-fix.lifted <- as.data.table(input.vcf@fix)[pass.liftover];
-gt.dt <- gt.dt[pass.liftover];
-for (i in seq_len(nrow(fix.lifted))) {
-    this.ID <- fix.lifted[i, ID];
-    this.INFO <- vcf.info.string.to.list(fix.lifted[i, INFO]);
-    this.INFO[['END']] <- granges.38.dt[i, end];
-    if (this.INFO[['SVTYPE']] == 'BND') {
-        this.INFO[['CHR2']] <- granges.38.dt[i, CHR2];
-        this.INFO[['POS2']] <- granges.38.dt[i, POS2];
-        }
-    this.INFO <- lapply(names(this.INFO), function(x) paste(x, this.INFO[[x]], sep = '='));
-    this.INFO <- paste(this.INFO, collapse = ';');
-    this.INFO <- gsub('IMPRECISE=IMPRECISE', 'IMPRECISE', this.INFO);
-    this.INFO <- gsub('PRECISE=PRECISE', 'PRECISE', this.INFO);
-    this.INFO <- gsub('SOMATIC=SOMATIC', 'SOMATIC', this.INFO);
-    fix.lifted[i, c('CHROM', 'POS', 'INFO') := granges.38.dt[ID == ..this.ID, .(seqnames, start, ..this.INFO)]];
-    }
-
-###################################################################################################
-# Write output vcf
-###################################################################################################
-output.vcf <- input.vcf;
-output.vcf@fix <- as.matrix(fix.lifted);
-output.vcf@gt <- as.matrix(gt.dt);
-output.vcf@meta <- output.vcf@meta[!grepl('^##(contig|reference)', output.vcf@meta)];
-output.vcf@meta <- c(output.vcf@meta, header.contigs);
-
-write.vcf(output.vcf, output);
diff --git a/module/scripts/predict-liftover-stability.R b/module/scripts/predict-variant-stability.R
similarity index 52%
rename from module/scripts/predict-liftover-stability.R
rename to module/scripts/predict-variant-stability.R
index d8f5dd2..c8b271d 100644
--- a/module/scripts/predict-liftover-stability.R
+++ b/module/scripts/predict-variant-stability.R
@@ -2,13 +2,11 @@
-# predict-liftover-stability.R
+# predict-variant-stability.R
 ####################################################################################################
 #
-# Apply random forest model to predict variant LiftOver stability
-# Validate results and plot model performance with discordance file
+# Apply random forest model to predict variant stability
 #
 ####################################################################################################
 
 suppressPackageStartupMessages({
-    library(caret);
     library(ranger);
     library(argparse);
     library(ROCR);
@@ -20,12 +18,12 @@ suppressPackageStartupMessages({
 ###################################################################################################
 # Define command line arguments
 parser <- ArgumentParser();
-parser$add_argument('--variant-caller', type = 'character');
-parser$add_argument('--features-dt', type = 'character');
-parser$add_argument('--rf-model', type = 'character');
-parser$add_argument('--specificity', type = 'numeric', help = 'Target specificity, overrides `--threshold`');
-parser$add_argument('--threshold', type = 'numeric', help = 'Stability score threshold', default = 0.5);
-parser$add_argument('--output-tsv', type = 'character', help = 'TSV output file');
+parser$add_argument('--variant-caller', type = 'character', help = 'One of {HaplotypeCaller, Mutect2, Strelka2, SomaticSniper, Muse2, Delly2}');
+parser$add_argument('--features-dt', type = 'character', help = 'Processed Rds file with variant info and annotations');
+parser$add_argument('--rf-model', type = 'character', help = 'Pre-trained random forest model Rds file');
+parser$add_argument('--specificity', type = 'numeric', help = 'Target specificity based on whole genome validation set, overrides `--threshold`');
+parser$add_argument('--threshold', type = 'numeric', help = 'Stability score threshold, default based on maximizing F1-score in whole genome validation set');
+parser$add_argument('--output-tsv', type = 'character', help = 'Output TSV with predicted Stability Scores');
 args <- parser$parse_args();
 
 # Save command line arguments
@@ -33,27 +31,6 @@ for (arg in names(args)) {
     assign(gsub('_', '.', arg), args[[arg]]);
     }
 
-####################################################################################################
-# Functions
-####################################################################################################
-
-# Sort datatable by chr then position
-sort.genomic.dt <- function(x, chr = 'CHROM', pos = 'POS') {
-    setDT(x);
-    x[, eval(chr) := gsub('chr', '', get(chr))];
-    x[, eval(chr) := gsub('X', '23', get(chr))];
-    x[, eval(chr) := gsub('Y', '24', get(chr))];
-    x[, eval(chr) := as.numeric(get(chr))];
-
-    setorderv(x, c(chr, pos), c(1, 1));
-
-    x[, eval(chr) := gsub('23', 'X', get(chr))];
-    x[, eval(chr) := gsub('24', 'Y', get(chr))];
-    x[, eval(chr) := paste0('chr', get(chr))];
-
-    return(x);
-    }
-
 ###################################################################################################
 # Load data
 ###################################################################################################
@@ -88,51 +65,39 @@ print(dim(features.dt));
 ###################################################################################################
 # Apply random forest model
 ###################################################################################################
-cat('\nPredicting liftover stability with', basename(rf.model.path), '\n');
+cat('\nPredicting variant stability with', basename(rf.model.path), '\n');
 stability <- predict(rf.model, data = features.dt);
-
-# if (!is.null(specificity) && is.numeric(specificity)) {
-#     cat('Target specificity =', specificity, '\n');
-#     operating.index <- max(which(unlist(rf.model$performance@x.values) < 1 - specificity));
-#     sensitivity <- unlist(rf.model$performance@y.values)[operating.index];
-#     cat('Projected sensitivity =', round(sensitivity, 3), '\n');
-#     threshold <- 1 - unlist(rf.model$performance@alpha.values)[operating.index];
-#     cat('Stability score threshold =', round(threshold, 3), '\n');
-# } else if (!is.null(threshold) && is.numeric(threshold)) {
-#     cat('Target threshold =', threshold, '\n');
-#     operating.index <- min(which(unlist(rf.model$performance@alpha.values) <= 1 - threshold));
-#     specificity <- 1 - unlist(rf.model$performance@x.values)[operating.index];
-#     sensitivity <- unlist(rf.model$performance@y.values)[operating.index];
-#     cat('Projected specificity =', round(specificity, 3), '\n');
-#     cat('Projected sensitivity =', round(sensitivity, 3), '\n');
-# } else {
-#     performance.acc <- performance(prediction$train, measure = 'f'); #F1-score
-#     index <- which.max(unlist(performance.acc@y.values));
-#     cutoff <- unlist(performance.acc@x.values)[index];
-#     metric <- unlist(performance.acc@y.values)[index];
-#     specificity <- 1 - unlist(performance$train@x.values)[index];
-#     sensitivity <- unlist(performance$train@y.values)[index];
-#     cat(sprintf('Projected F[0.5]-score = %.3f\n', metric));
-#     cat(sprintf('Projected sensitivity = %.3f\n', sensitivity));
-#     cat(sprintf('Projected specificity = %.3f\n', specificity));
-#     }
-
-performance.f <- performance(rf.model$prediction, measure = 'f');
-index <- which.max(unlist(performance.f@y.values));
-threshold <- unlist(performance.f@x.values)[index];
-
 performance <- performance(rf.model$prediction, 'sens', 'spec');
 
-sensitivity <- unlist(performance@y.values)[index];
-specificity <- unlist(performance@x.values)[index];
-
-# Convert to stability units
-threshold.stability <- 1 - threshold;
-cat(sprintf('Threshold = %.3f\n', threshold.stability));
-cat(sprintf('Training sensitivity = %.3f\n', sensitivity));
-cat(sprintf('Training specificity = %.3f\n', specificity));
+if (!is.null(specificity) && is.numeric(specificity)) {
+    cat('Target specificity =', specificity, '\n');
+    operating.index <- max(which(unlist(performance@x.values) > specificity));
+    threshold <- 1 - unlist(performance@alpha.values)[operating.index];
+    sensitivity <- unlist(performance@y.values)[operating.index];
+
+    cat(sprintf('Threshold = %.3f\n', threshold));
+    cat(sprintf('Projected sensitivity = %.3f\n', sensitivity));
+} else if (!is.null(threshold) && is.numeric(threshold)) {
+    cat('Target threshold =', threshold, '\n');
+    operating.index <- min(which(unlist(performance@alpha.values) <= 1 - threshold));
+    specificity <- unlist(performance@x.values)[operating.index];
+    sensitivity <- unlist(performance@y.values)[operating.index];
+
+    cat(sprintf('Projected sensitivity = %.3f\n', sensitivity));
+    cat(sprintf('Projected specificity = %.3f\n', specificity));
+} else {
+    performance.f <- performance(rf.model$prediction, measure = 'f');
+    operating.index <- which.max(unlist(performance.f@y.values));
+    threshold <- 1 - unlist(performance.f@x.values)[operating.index];
+    sensitivity <- unlist(performance@y.values)[operating.index];
+    specificity <- unlist(performance@x.values)[operating.index];
+
+    cat(sprintf('Default threshold = %.3f\n', threshold));
+    cat(sprintf('Projected sensitivity = %.3f\n', sensitivity));
+    cat(sprintf('Projected specificity = %.3f\n', specificity));
+    }
 
-stability.classification <- ifelse(stability$predictions[, 1] < threshold.stability, 1, 0);
+stability.classification <- ifelse(stability$predictions[, 1] < threshold, 1, 0);
 cat(sprintf('Proportion predicted unstable = %.3f\n\n', mean(stability.classification)));
 stability.classification <- as.factor(stability.classification);
 
@@ -145,5 +110,6 @@ annotation.dt <- data.table(
     STABILITY_SCORE = format(round(stability$predictions[, 1], 4), nsmall = 4),
     STABILITY = ifelse(stability.classification == '1', 'UNSTABLE', 'STABLE')
     );
-sort.genomic.dt(annotation.dt);
+setorder(annotation.dt, CHROM, POS);
+
 fwrite(annotation.dt, file = output.tsv, sep = '\t', col.names = FALSE);
diff --git a/module/snv_workflow.nf b/module/snv_workflow.nf
index f298b7e..d5f896b 100644
--- a/module/snv_workflow.nf
+++ b/module/snv_workflow.nf
@@ -64,7 +64,7 @@ process extract_VCF_features_StableLift {
 
     script:
     """
-    Rscript "${moduleDir}/scripts/extract-vcf-features.R" \
+    Rscript "${moduleDir}/scripts/extract-VCF-features.R" \
         --input-vcf "${vcf}" \
         --variant-caller ${params.variant_caller} \
         --output-rds "features.Rds"
diff --git a/module/sv_workflow.nf b/module/sv_workflow.nf
index bdf2621..432da05 100644
--- a/module/sv_workflow.nf
+++ b/module/sv_workflow.nf
@@ -1,35 +1,39 @@
-
-process liftover_SV_StableLift{
+process liftover_annotate_SV_StableLift {
     container params.docker_image_stablelift
 
     publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
-        pattern: "liftover.vcf.gz",
+        pattern: "annotations.{vcf.gz,Rds}", // must match the file names declared under `output:`
         mode: "copy",
         enabled: params.save_intermediate_files,
-        saveAs: { "LiftOver-${sample_id}.vcf.gz" }
+        saveAs: { it.replace("annotations", "LiftOver-${sample_id}") } // keep each file's own extension
 
     input:
-        tuple val(sample_id),
-            path(vcf, stageAs: 'inputs/*'),
-            path(index, stageAs: 'inputs/*')
+        tuple val(sample_id), path(vcf, stageAs: 'inputs/*')
+        val(source_grch_label)
         path (header_contigs)
         path (chain_file)
+        path (gnomad_rds)
 
     output:
-        tuple val(sample_id), path('liftover.vcf.gz'), emit: liftover_vcf
+        tuple val(sample_id), path('annotations.vcf.gz'), emit: liftover_vcf
+        tuple val(sample_id), path('annotations.Rds'), emit: r_annotations
 
     script:
         """
-        Rscript "${moduleDir}/scripts/liftover-Delly2-vcf.R" \
+        Rscript "${moduleDir}/scripts/extract-VCF-features-SV.R" \
             --input-vcf "${vcf}" \
-            --header-contigs "${header_contigs}" \
+            --source-build "${source_grch_label}" \
             --chain-file "${chain_file}" \
-            --output "liftover.vcf.gz"
+            --header-contigs "${header_contigs}" \
+            --gnomad-rds "${gnomad_rds}" \
+            --output-vcf "annotations.vcf.gz" \
+            --output-rds "annotations.Rds"
         """
 
     stub:
     """
-    touch "liftover.vcf.gz"
+    touch "annotations.Rds"
+    touch "annotations.vcf.gz"
     """
 }
 
@@ -62,66 +66,33 @@ process run_sort_BCFtools {
         """
 }
 
-process annotate_gnomAD_StableLift {
-    container params.docker_image_stablelift
-
-    publishDir path: "${params.output_dir_base}/intermediate/${task.process.replace(':', '/')}",
-        pattern: "annotations.Rds",
-        mode: "copy",
-        enabled: params.save_intermediate_files,
-        saveAs: { "LiftOver-${sample_id}-${variant_caller}.Rds" }
-
-    input:
-        tuple val(sample_id), path(vcf, stageAs: 'inputs/*')
-        path (gnomad_rds)
-        val (variant_caller)
-
-    output:
-        tuple val(sample_id), path('annotations.Rds'), emit: r_annotations
-
-    script:
-        """
-        Rscript "${moduleDir}/scripts/extract-vcf-features-SV.R" \
-            --variant-caller "${variant_caller}" \
-            --input-vcf "${vcf}" \
-            --output-rds "annotations.Rds" \
-            --gnomad-rds ${gnomad_rds}
-        """
-
-    stub:
-    """
-    touch "annotations.Rds"
-    """
-}
-
 workflow workflow_extract_sv_annotations {
     take:
     vcf_with_sample_id
+    src_sequence
     header_contigs
     gnomad_rds
     chain_file
-    variant_caller
 
     main:
 
-    // Step 1: Liftover
-    liftover_SV_StableLift(
-        vcf_with_sample_id,
+    liftover_annotate_SV_StableLift(
+        // Keep only the first two tuple elements; the index file is not needed
+        vcf_with_sample_id.map{ [it[0], it[1]] },
+
+        // Translate the source build key (hg19/hg38, first element) into its GRCh label
+        src_sequence.map{ ["hg19": "GRCh37", "hg38": "GRCh38"][it[0]] },
+
         header_contigs,
-        chain_file
-    )
-    run_sort_BCFtools(
-        liftover_SV_StableLift.out.liftover_vcf
+        chain_file,
+        gnomad_rds
     )
 
-    // Step 2: Extract features
-    annotate_gnomAD_StableLift(
-        run_sort_BCFtools.out.sorted_vcf,
-        gnomad_rds,
-        variant_caller
+    run_sort_BCFtools(
+        liftover_annotate_SV_StableLift.out.liftover_vcf
     )
 
     emit:
     liftover_vcf = run_sort_BCFtools.out.sorted_vcf
-    r_annotations = annotate_gnomAD_StableLift.out.r_annotations
+    r_annotations = liftover_annotate_SV_StableLift.out.r_annotations
 }