diff --git a/CHANGELOG.md b/CHANGELOG.md index 3221173..08526b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] ### Added +- Add XY filtration - NFTest test case --- @@ -152,7 +153,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Update reheadering to use -c option - Modularize workflows for different modes (single vs. paired, WGS vs targeted) - Update GATK to 4.2.4.0 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q) -- Update Picard to 2.26.8 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q) +- Update Picard to 2.26.8 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q) --- diff --git a/README.md b/README.md index 5a3b68c..80976fb 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,10 @@ Take the output from Step 6 as input, and apply the model in Step 5 to recalibra ### 8. Filter gSNP – Filter out ambiguous variants Use customized Perl script to filter out ambiguous variants. -### 9. Generate sha512 checksum +### 9. Adjust chrX and chrY genotypes based on sample sex from recalibrated VCF +Apply XY filtration workflow to recalibrated VCF as described [here](docs/xy_filtration_workflow.md). + +### 10. Generate sha512 checksum Generate sha512 checksum for VCFs and GVCFs. --- @@ -115,6 +118,8 @@ For normal-only or tumor-only samples, exclude the fields for the other state. 
|:----------------|:---------|:-----|:------------| | `dataset_id` | Yes | string | Dataset ID | | `blcds_registered_dataset` | Yes | boolean | Set to true when using BLCDS folder structure; use false for now | +| `genome_build` | Yes | string | Genome build, GRCh37 or GRCh38 | +| `sample_sex` | Yes | string | Sample Sex, XY or XX | | `output_dir` | Yes | string | Need to set if `blcds_registered_dataset = false` | | `save_intermediate_files` | Yes | boolean | Set to false to disable publishing of intermediate files; true otherwise; disabling option will delete intermediate files to allow for processing of large BAMs | | `cache_intermediate_pipeline_steps` | No | boolean | Set to true to enable process caching from Nextflow; defaults to false | @@ -126,6 +131,7 @@ For normal-only or tumor-only samples, exclude the fields for the other state. | `bundle_hapmap_3p3_vcf_gz` | Yes | path | Absolute path to HapMap 3.3 file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/hapmap_3.3.hg38.vcf.gz` | | `bundle_omni_1000g_2p5_vcf_gz` | Yes | path | Absolute path to 1000 genomes OMNI 2.5 file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/1000G_omni2.5.hg38.vcf.gz` | | `bundle_phase1_1000g_snps_high_conf_vcf_gz` | Yes | path | Absolute path to 1000 genomes phase 1 high-confidence file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz` | +| `par_bed` | Yes | path | Absolute path to Pseudo-autosomal Region (PAR) BED | | `work_dir` | optional | path | Path of working directory for Nextflow. When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With ucla_cds, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. | | `docker_container_registry` | optional | string | Registry containing tool Docker images. 
Default: `ghcr.io/uclahs-cds` | | `base_resource_update` | optional | namespace | Namespace of parameters to update base resource allocations in the pipeline. Usage and structure are detailed in `template.config` and below. | @@ -199,6 +205,10 @@ base_resource_update { | `___indel.vcf.gz` | Filtered INDELs with non-germline and ambiguous variants removed | | `___indel.vcf.gz.tbi` | Filtered germline INDELs index | | `___indel.vcf.gz.sha512` | Filtered germline INDELs sha512 checksum | +| `_____filtered.vcf.bgz` | chrX/Y filtered SNP and INDEL recalibrated variants | +| `_____filtered.vcf.bgz.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants checksum | +| `_____filtered.vcf.bgz.tbi` | chrX/Y filtered SNP and INDEL recalibrated variants index | +| `_____filtered.vcf.bgz.tbi.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants index checksum | | `report.html`, `timeline.html` and `trace.txt` | Nextflow report, timeline and trace files | | `*.command.*` | Process specific logging files created by nextflow | diff --git a/config/F16.config b/config/F16.config index 1fb00e0..cf0a3a5 100644 --- a/config/F16.config +++ b/config/F16.config @@ -111,4 +111,14 @@ process { } } } + withName: filter_XY_Hail { + cpus = 1 + memory = 2.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } } diff --git a/config/F32.config b/config/F32.config index 1fb00e0..61fae3c 100644 --- a/config/F32.config +++ b/config/F32.config @@ -111,4 +111,14 @@ process { } } } + withName: filter_XY_Hail { + cpus = 2 + memory = 4.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } } diff --git a/config/F72.config b/config/F72.config index b16f3db..e61ab5b 100644 --- a/config/F72.config +++ b/config/F72.config @@ -111,4 +111,14 @@ process { } } } + withName: filter_XY_Hail { + cpus = 2 + memory = 6.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } } diff --git a/config/M64.config 
b/config/M64.config index b16f3db..3619d63 100644 --- a/config/M64.config +++ b/config/M64.config @@ -111,4 +111,14 @@ process { } } } + withName: filter_XY_Hail { + cpus = 4 + memory = 10.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } } diff --git a/config/default.config b/config/default.config index 806d508..af66b8e 100644 --- a/config/default.config +++ b/config/default.config @@ -20,10 +20,12 @@ params { picard_version = "2.26.10" pipeval_version = "4.0.0-rc.2" gatkfilter_version = "v1.0.0" + hail_version = "0.2.133" docker_image_gatk = "broadinstitute/gatk:${params.gatk_version}" docker_image_picard = "${-> params.docker_container_registry}/picard:${params.picard_version}" docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}" docker_image_gatkfilter = "${-> params.docker_container_registry}/gatk:${params.gatkfilter_version}" + docker_image_hail = "${-> params.docker_container_registry}/hail:${params.hail_version}" emit_all_confident_sites = false } @@ -36,7 +38,7 @@ process { cache = true executor = 'local' - + // Other directives or options that should apply for every process // total amount of resources avaible to the pipeline diff --git a/config/schema.yaml b/config/schema.yaml index 01f7040..c13ba18 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -3,10 +3,26 @@ patient_id: type: 'String' required: true help: 'Patient ID' +sample_sex: + type: 'String' + required: true + help: 'Sample Sex' + choices: + - "XY" + - "XX" dataset_id: type: 'String' required: true help: 'Dataset ID' +genome_build: + type: 'String' + required: true + help: 'Genome build, GRCh37 or GRCh38' + default: + - "GRCh38" + choices: + - "GRCh37" + - "GRCh38" output_dir: type: 'Path' mode: 'w' @@ -62,6 +78,11 @@ bundle_phase1_1000g_snps_high_conf_vcf_gz: mode: 'r' required: true help: 'Absolute path to high-confidence 1000g SNPs VCF' +par_bed: + type: 'Path' + mode: 'r' + required: true + help: 
'Absolute path to Pseudo-autosomal Region (PAR) BED' base_resource_update: type: 'ResourceUpdateNamespace' required: false diff --git a/config/template.config b/config/template.config index c24d6dc..83e1f88 100644 --- a/config/template.config +++ b/config/template.config @@ -11,6 +11,11 @@ params { dataset_id = '' blcds_registered_dataset = false // if you want the output to be registered + genome_build = "GRCh38" + + // Input patient sex + sample_sex = '' // 'XY' or 'XX' + output_dir = '/path/to/output/directory' // Set to false to disable the publish rule and delete intermediate files as they're no longer needed @@ -43,6 +48,9 @@ params { bundle_omni_1000g_2p5_vcf_gz = "/hot/resource/tool-specific-input/GATK/GRCh38/1000G_omni2.5.hg38.vcf.gz" bundle_phase1_1000g_snps_high_conf_vcf_gz = "/hot/resource/tool-specific-input/GATK/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz" + // Specify BED file path for Pseudoautosomal Region (PAR) + par_bed = "" + // Base resource allocation updater // See README for adding parameters to update the base resource allocations } diff --git a/docs/xy_filtration_workflow.md b/docs/xy_filtration_workflow.md new file mode 100644 index 0000000..c053933 --- /dev/null +++ b/docs/xy_filtration_workflow.md @@ -0,0 +1,26 @@ +# Filter XY calls from a germline VCF file + +## Steps: +1. Extract autosomes and chrX/Y variants from input VCF +2. Filter chrX/Y variants +3. 
Merge autosomal and filtered chrX/Y variants + +## chrX/Y Filter Criteria: +- Extract chrX/Y calls +- Extract chrX/Y calls overlapping with Pseudo-Autosomal Regions (PARs) +- For non-PAR chrX/Y calls + - if `sample_sex` is `XY`: + - Filter out heterozygous `GT` calls in chrX and chrY + - Transform homozygous `GT=1/1` to hemizygous `GT=1` + - if `sample_sex` is `XX`: + - Filter out `chrY` calls + +## Pseudo-Autosomal Regions (PARs) +### GRCh38 +| CHROM | START | END | PAR | REGION | REFERENCE | +|---|---|---|---|---|---| +| chrX | 10001 | 2781479 | PAR1 | Xp22 | ENSEMBL | +| chrX | 91434839 | 91438584 | PAR3/XTR | Xq21.3 | PMID:23708688 | +| chrX | 155701383 | 156030895 | PAR2 | Xq28 | ENSEMBL | +| chrY | 10001 | 10300000 | PAR1+PAR3/XTR | Yp11 | ENSEMBL +PMID:23708688 | +| chrY | 56887903 | 57217415 | PAR2 | Yq12 | ENSEMBL | diff --git a/main.nf b/main.nf index 33e0937..c07e58e 100644 --- a/main.nf +++ b/main.nf @@ -68,6 +68,7 @@ include { } from './module/merge-vcf.nf' include { recalibrate_variants } from './module/workflow-recalibrate-variants.nf' include { filter_gSNP_GATK } from './module/filter-gsnp.nf' +include { filter_XY_Hail } from './module/filter-xy.nf' include { calculate_sha512 } from './module/checksum.nf' // Returns the index file for the given bam or vcf @@ -248,6 +249,20 @@ workflow { recalibrate_variants.out.output_ch_recalibrated_variants ) + filter_xy_ch = recalibrate_variants.out.output_ch_recalibrated_variants + .map { it -> [it[0], it[1], it[2]] } + + script_dir_ch = Channel.fromPath( + "$projectDir/script", + checkIfExists: true + ) + .collect() + + filter_XY_Hail( + filter_xy_ch, + params.par_bed, + script_dir_ch + ) /** * Calculate checksums for output files */ @@ -255,6 +270,7 @@ workflow { .mix(run_MergeVcfs_Picard_GVCF.out.merged_vcf) 
.mix(recalibrate_variants.out.output_ch_recalibrated_variants) .map{ [it[1], it[2]] } + .mix(filter_XY_Hail.out.xy_filtered_vqsr) .mix(filter_gSNP_GATK.out.germline_filtered) .flatten() .set{ input_ch_calculate_checksum } diff --git a/module/filter-xy.nf b/module/filter-xy.nf new file mode 100644 index 0000000..b57bd22 --- /dev/null +++ b/module/filter-xy.nf @@ -0,0 +1,69 @@ +include { generate_standard_filename; sanitize_string } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' + +/* + Nextflow module for filtering chrX and chrY variant calls based on sample sex + + input: + sample_id: identifier for sample + recalibrated_vcf: path to VCF to filter + recalibrated_vcf_tbi: path to index of VCF to filter + par_bed: path to Pseudo-Autosomal Region (PAR) BED file + script_dir: path to directory containing filter_xy_call.py + + params: + params.output_dir_base: string(path) + params.log_output_dir: string(path) + params.docker_image_hail: string + params.hail_version: string + params.gatk_version: string + params.dataset_id: string + params.sample_sex: string + params.genome_build: string + params.par_bed: string(path) +*/ + +process filter_XY_Hail { + container params.docker_image_hail + + publishDir path: "${params.output_dir_base}/output", + mode: "copy", + pattern: '*.vcf.bgz*' + + publishDir path: "${params.log_output_dir}/process-log", + pattern: ".command.*", + mode: "copy", + saveAs: { + "${task.process.replace(':', '/')}-${sample_id}/log${file(it).getName()}" + } + + input: + tuple val(sample_id), path(recalibrated_vcf), path(recalibrated_vcf_tbi) + path(par_bed) + path(script_dir) + + output: + path(".command.*") + tuple path("${output_filename}_XY_filtered.vcf.bgz"), path("${output_filename}_XY_filtered.vcf.bgz.tbi"), emit: xy_filtered_vqsr + + script: + output_filename = generate_standard_filename( + "Hail-${params.hail_version}", + params.dataset_id, + sample_id, + [additional_tools:["GATK-${params.gatk_version}"]] + ) + """ + set -euo pipefail + + zgrep "##source=" ${recalibrated_vcf} > ./vcf_source.txt + + python ${script_dir}/filter_xy_call.py \ + --sample_name ${output_filename} \ + --input_vcf ${recalibrated_vcf} \ + --vcf_source_file 
./vcf_source.txt \
+        --sample_sex ${params.sample_sex} \
+        --par_bed ${par_bed} \
+        --genome_build ${params.genome_build} \
+        --output_dir .
+    """
+}
diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py
new file mode 100755
index 0000000..dcb2871
--- /dev/null
+++ b/script/filter_xy_call.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python3
+"""
+Filter XY calls from a germline VCF file
+
+Steps:
+- Extract autosomes and chrX/Y variants from input VCF
+- Filter chrX/Y variants
+- Merge autosomal and filtered chrX/Y variants
+
+Filter criteria:
+- Extract XY calls
+- Extract XY calls overlapping with Pseudo-Autosomal Regions (PARs)
+- For non-PAR
+    - Male sample:
+        - Filter out heterozygous GT calls in chrX and chrY
+        - Transform homozygous GT=1/1 to hemizygous GT=1
+    - Female sample: Filter out chrY calls
+
+Output:
+- <output_dir>/<sample_name>_XY_filtered.vcf.bgz plus its tabix index; `XY`
+  names the chrX/Y filtration step and does not vary with --sample_sex
+
+Dependencies:
+- Python 3
+- HAIL python library (pip install hail)
+
+Note:
+- Do not export VCF to a path that is being read from in the same pipeline,\
+based on HAIL recommendation
+"""
+
+import argparse
+import os
+import sys
+import tempfile
+import hail as hl
+
+parser = argparse.ArgumentParser()
+parser.add_argument(
+    '--sample_name',
+    dest='sample_name',
+    help = 'Sample name',
+    required=True
+    )
+parser.add_argument(
+    '--input_vcf',
+    dest='input_vcf',
+    help = 'Input single sample VCF file path',
+    required=True
+    )
+parser.add_argument(
+    '--vcf_source_file',
+    dest='vcf_source_file',
+    help = 'A TXT file containing variant caller source details (eg. ##source=HaplotypeCaller)',
+    required=True
+    )
+parser.add_argument(
+    '--sample_sex',
+    dest='sample_sex',
+    help = 'Sample sex, XY or XX',
+    choices=['XY', 'XX'],
+    required=True
+    )
+parser.add_argument(
+    '--par_bed',
+    dest='par_bed',
+    help = 'Input BED file path for Pseudo-Autosomal Regions (PAR)',
+    required=True
+    )
+parser.add_argument(
+    '--genome_build',
+    dest='genome_build',
+    help = 'Genome build of input VCF, GRCh37 or GRCh38',
+    required=True
+    )
+parser.add_argument(
+    '--output_dir',
+    dest='output_dir',
+    help = 'Output path where filtered XY variant VCF will be written',
+    required=True
+    )
+
+args = parser.parse_args()
+
+sample_name = args.sample_name
+sample_sex = args.sample_sex
+vcf_file = args.input_vcf
+vcf_source_file = args.vcf_source_file
+par_bed = args.par_bed
+genome_build = args.genome_build
+output_dir = args.output_dir
+
+#Extract VCF file header
+vcf_header = hl.get_vcf_metadata(vcf_file)
+
+#Add script system command to VCF source
+SCRIPT_COMMAND = ' '.join(sys.argv)
+
+with open(vcf_source_file, 'r', encoding='utf-8') as vcf_source:
+    vcf_source_content = vcf_source.read()
+
+#Keep the appended entry on its own header line even when the source file
+#does not end with a newline
+if vcf_source_content and not vcf_source_content.endswith('\n'):
+    vcf_source_content += '\n'
+
+script_command_entry = f'##XYFiltration=<CommandLine={SCRIPT_COMMAND}>'
+vcf_source = vcf_source_content + script_command_entry
+temp_file_path = os.path.join(tempfile.gettempdir(), 'temp_file.txt')
+with open(temp_file_path, 'w', encoding='utf-8') as temp_file:
+    temp_file.write(vcf_source)
+
+#Import PAR BED file
+par = hl.import_bed(
+    path = par_bed,
+    reference_genome = genome_build,
+    skip_invalid_intervals = True
+    )
+
+#Import VCF file into a hail MatrixTable
+vcf_matrix = hl.import_vcf(
+    path = vcf_file,
+    reference_genome = genome_build,
+    force_bgz = True
+    )
+
+#Filter XY calls
+##Extract XY calls
+X_contig = vcf_matrix.locus.contig.startswith('chrX') | vcf_matrix.locus.contig.startswith('X')
+Y_contig = vcf_matrix.locus.contig.startswith('chrY') | vcf_matrix.locus.contig.startswith('Y')
+extract_condition = (X_contig) | (Y_contig)
+vcf_XY = vcf_matrix.filter_rows(extract_condition)
+print(f'chrX/Y variants before {sample_sex} filtration:', vcf_XY.count())
+
+##Extract autosomes
+vcf_autosomes = vcf_matrix.filter_rows(~extract_condition)
+
+##Extract PAR and non-PAR regions
+par_variants = vcf_XY.filter_rows(hl.is_defined(par[vcf_XY.locus]))
+non_par_variants = vcf_XY.filter_rows(hl.is_missing(par[vcf_XY.locus]))
+
+if sample_sex == 'XY':
+    #If MALE (XY), keep only non-PAR chrX/chrY rows whose calls are diploid
+    #homozygous-variant; heterozygous calls are dropped
+    non_par_filtered_variants = non_par_variants.filter_rows(
+        hl.agg.all(
+            non_par_variants.GT.is_diploid() & non_par_variants.GT.is_hom_var()
+            )
+        )
+    #Rewrite each kept call as haploid using its first allele (GT=1/1 -> GT=1)
+    non_par_filtered_variants = non_par_filtered_variants.annotate_entries(
+        GT = hl.call(non_par_filtered_variants.GT[0])
+        )
+
+elif sample_sex == 'XX':
+    #If Female (XX), remove non-PAR chrY calls
+    non_par_filtered_variants = non_par_variants.filter_rows(
+        non_par_variants.locus.contig.startswith('chrX') | \
+        non_par_variants.locus.contig.startswith('X')
+        )
+
+#Combine PAR and filtered non-PAR regions
+par_non_par = [par_variants, non_par_filtered_variants]
+filterXY = hl.MatrixTable.union_rows(*par_non_par)
+print(f'chrX/Y variant counts after {sample_sex} filtration:', filterXY.count())
+
+#Combine filtered X/Y + autosomal variants
+autosomes_XYfiltered = [vcf_autosomes, filterXY]
+output_vcf = hl.MatrixTable.union_rows(*autosomes_XYfiltered)
+
+#Export MatrixTable to VCF
+#The file name is fixed to `_XY_filtered` for both sexes because the calling
+#Nextflow process declares <sample_name>_XY_filtered.vcf.bgz as its output
+OUTPUT_FILE = f'{output_dir}/{sample_name}_XY_filtered.vcf.bgz'
+
+hl.export_vcf(
+    dataset = output_vcf,
+    output = OUTPUT_FILE,
+    tabix = True,
+    metadata = vcf_header,
+    append_to_header = temp_file_path
+    )