From 73e1e03d3aaff46672c494543c0fbfbf70da7cc4 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 25 Nov 2024 16:27:57 -0800 Subject: [PATCH 01/67] add patient_sex to template --- config/template.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config/template.config b/config/template.config index c24d6dc..7abbe19 100644 --- a/config/template.config +++ b/config/template.config @@ -11,6 +11,9 @@ params { dataset_id = '' blcds_registered_dataset = false // if you want the output to be registered + // Input patient sex if known - male or female. Leave empty if not known. + patient_sex = '' + output_dir = '/path/to/output/directory' // Set to false to disable the publish rule and delete intermediate files as they're no longer needed From 7cd06a6719fa820e9bf3f193be9892a0ce7325df Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 25 Nov 2024 16:37:15 -0800 Subject: [PATCH 02/67] add XY filter script from project-method-AlgorithmEvaluation-BNCH-000122-GIABGermlineVariant --- script/workflow_filter_xy_call_allvar.py | 137 +++++++++++++++++++++++ 1 file changed, 137 insertions(+) create mode 100755 script/workflow_filter_xy_call_allvar.py diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py new file mode 100755 index 0000000..a9f8b80 --- /dev/null +++ b/script/workflow_filter_xy_call_allvar.py @@ -0,0 +1,137 @@ +#!/usr/bin/env python3 +""" +Filter XY calls from call-gSNP single sample VCF file + +Filter criteria: +- Extract XY calls +- Extract XY calls overlapping with Pseudo-Autosomal Regions (PARs) +- For non-PAR + - Male sample: Filter out chrX calls where VAF < 80% + - Female sample: Filter out chrY calls + +Dependencies: +- Python 3 +- HAIL python library (pip install hail) + +Note: +- Do not export VCF to a path that is being read from in the same pipeline,\ +based on HAIL recommendation +""" + +import os +import argparse +import hail as hl + +script_dir = os.getcwd() + +parser = argparse.ArgumentParser() 
+parser.add_argument( + '--input_vcf', + dest='input_vcf', + help = 'Input single sample VCF file path', + required=True + ) +parser.add_argument( + '--par_bed', + dest='par_bed', + help = 'Input BED file path for Pseudo-Autosomal Regions (PAR)', + required=True + ) +parser.add_argument( + '--output_dir', + dest='output_dir', + help = 'Output path where filtered XY variant VCF will be written', + required=True + ) + +args = parser.parse_args() + +vcf_file = args.input_vcf +par_bed_file = args.par_bed +output_dir = args.output_dir + +#Import PAR BED file +par = hl.import_bed( + path = args.par_bed, + reference_genome = 'GRCh38', + skip_invalid_intervals = True + ) + +#Extract VCF file header +vcf_header = hl.get_vcf_metadata(vcf_file) +vcf_source = script_dir + '/call-gSNP_caller_source_VCF_header.txt' + +#Import VCF file into a hail MatrixTable +vcf_matrix = hl.import_vcf( + path = vcf_file, + reference_genome = 'GRCh38', + force_bgz = True + ) + +###Get Sample ID +sample_name = vcf_matrix.s.collect()[0] #list has one sample ID + +#Filter XY calls +##Extract XY calls +X_contig = vcf_matrix.locus.contig.startswith('chrX') +Y_contig = vcf_matrix.locus.contig.startswith('chrY') +extract_condition = (X_contig) | (Y_contig) +vcf_XY = vcf_matrix.filter_rows(extract_condition) +print('variants in chrX/Y:', vcf_XY.count()) + +##Remove calls with DP=0 +depth_field = sample_name + '.DP' +depth = vcf_XY.make_table() +zero_depth = depth.filter(depth[depth_field] == 0) +zero_depth_count = zero_depth.count() +print('variants with zero depth:', zero_depth_count) +zero_depth_contig = zero_depth.locus.contig.collect() +zero_depth_pos = zero_depth.locus.position.collect() + +for allele in range(zero_depth_count): + zero_depth_match = (vcf_XY.locus.contig == zero_depth_contig[allele]) & (vcf_XY.locus.position == zero_depth_pos[allele]) + vcf_XY = vcf_XY.filter_rows( + ~(zero_depth_match) + ) + +print('variants after filtering zero depth:', vcf_XY.count()) + +##Extract PAR and non-PAR 
regions +par_filtered = vcf_XY.filter_rows(hl.is_defined(par[vcf_XY.locus])) +non_par_filtered = vcf_XY.filter_rows(hl.is_missing(par[vcf_XY.locus])) + +###For non-PAR regions, extract VQSR PASS calls. (note: Hail parses PASS as an empty set {}) +#non_par_filtered = non_par_filtered.filter_rows( +# hl.len(non_par_filtered.filters) == 0 +# ) + +##Predict SEX of the sample +#imputed_sex = hl.impute_sex(vcf_file.GT) +#temp place holder for SEX +SEX = 'XY' + +if SEX == 'XY': + #If MALE (XY), remove non-PAR chrX calls with AF=0.5 + filter_non_par_call = non_par_filtered.filter_rows( + non_par_filtered.info.AF[0] != 0.5 + ) +elif SEX == 'XX': + #If Female (XX), remove non-PAR chrY calls + filter_non_par_call = non_par_filtered.filter_rows( + non_par_filtered.locus.contig.startswith('chrX') + ) + +#Combine PAR and filtered non-PAR regions +par_non_par = [par_filtered, filter_non_par_call] +filterXY = hl.MatrixTable.union_rows(*par_non_par) + +#Export MatrixTable to VCF +output_file = output_dir + '/' + sample_name + '_filterXY.vcf.bgz' + +hl.export_vcf( + dataset = filterXY, + output = output_file, + tabix = True, + metadata = vcf_header, + append_to_header = vcf_source + ) From cc5ba7094f86ac10f2c8dbc23a0d56c26249c06b Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 6 Dec 2024 19:28:14 -0800 Subject: [PATCH 03/67] add par_bed parameter to template.config --- config/template.config | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config/template.config b/config/template.config index 7abbe19..dabb35a 100644 --- a/config/template.config +++ b/config/template.config @@ -46,6 +46,9 @@ params { bundle_omni_1000g_2p5_vcf_gz = "/hot/resource/tool-specific-input/GATK/GRCh38/1000G_omni2.5.hg38.vcf.gz" bundle_phase1_1000g_snps_high_conf_vcf_gz = "/hot/resource/tool-specific-input/GATK/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz" + // Specify BED file path for Pseudoautosomal Region (PAR) + par_bed = "" + // Base resource allocation updater // See README for 
adding parameters to update the base resource allocations } From 13fb8448063ccb7bbaebc271bdac56c53b066575 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 6 Dec 2024 19:37:49 -0800 Subject: [PATCH 04/67] add genome build to template --- config/template.config | 1 + 1 file changed, 1 insertion(+) diff --git a/config/template.config b/config/template.config index dabb35a..01d7214 100644 --- a/config/template.config +++ b/config/template.config @@ -47,6 +47,7 @@ params { bundle_phase1_1000g_snps_high_conf_vcf_gz = "/hot/resource/tool-specific-input/GATK/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz" // Specify BED file path for Pseudoautosomal Region (PAR) + genome_build = "GRCh38" par_bed = "" // Base resource allocation updater From 43fd1015556216cf58fd10b239b6f9c27cb826be Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 18 Dec 2024 17:19:05 -0800 Subject: [PATCH 05/67] add user input sample id --- script/workflow_filter_xy_call_allvar.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index a9f8b80..6fe3bc2 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -1,4 +1,10 @@ #!/usr/bin/env python3 +Task + - add sample name input + - make script work if VCF has multiple samples +Note + - script works on both single sample and multi sample vcf + - filtration can be done using hom GT, not necessarily AF """ Filter XY calls from call-gSNP single sample VCF file @@ -25,6 +31,12 @@ script_dir = os.getcwd() parser = argparse.ArgumentParser() +parser.add_argument( + '--sample_name', + dest='sample_name', + help = 'Sample name', + required=True + ) parser.add_argument( '--input_vcf', dest='input_vcf', @@ -46,6 +58,7 @@ args = parser.parse_args() +sample = args.sample_name vcf_file = args.input_vcf par_bed_file = args.par_bed output_dir = args.output_dir From cfb41983c1e605fac76c81162e673814696f1144 Mon Sep 
17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 18 Dec 2024 22:31:28 -0800 Subject: [PATCH 06/67] add sample sex; remove redundant code; exremove het calls for XY --- script/workflow_filter_xy_call_allvar.py | 74 +++++++++--------------- 1 file changed, 27 insertions(+), 47 deletions(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index 6fe3bc2..e7e7067 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -1,10 +1,4 @@ #!/usr/bin/env python3 -Task - - add sample name input - - make script work if VCF has multiple samples -Note - - script works on both single sample and multi sample vcf - - filtration can be done using hom GT, not necessarily AF """ Filter XY calls from call-gSNP single sample VCF file @@ -12,7 +6,7 @@ - Extract XY calls - Extract XY calls overlapping with Pseudo-Autosomal Regions (PARs) - For non-PAR - - Male sample: Filter out chrX calls where VAF < 80% + - Male sample: Filter out heterozygous GT calls in chrX - Female sample: Filter out chrY calls Dependencies: @@ -43,6 +37,12 @@ help = 'Input single sample VCF file path', required=True ) +parser.add_argument( + '--sample_sex', + dest='sample_sex', + help = 'Sample sex, XY or XX', + required=True + ) parser.add_argument( '--par_bed', dest='par_bed', @@ -58,7 +58,8 @@ args = parser.parse_args() -sample = args.sample_name +sample_name = args.sample_name +sample_sex = args.sample_sex vcf_file = args.input_vcf par_bed_file = args.par_bed output_dir = args.output_dir @@ -81,9 +82,6 @@ force_bgz = True ) -###Get Sample ID -sample_name = vcf_matrix.s.collect()[0] #list has one sample ID - #Filter XY calls ##Extract XY calls X_contig = vcf_matrix.locus.contig.startswith('chrX') @@ -93,49 +91,31 @@ print('variants in chrX/Y:', vcf_XY.count()) ##Remove calls with DP=0 -depth_field = sample_name + '.DP' -depth = vcf_XY.make_table() -zero_depth = depth.filter(depth[depth_field] == 0) -zero_depth_count = 
zero_depth.count() -print('variants with zero depth:', zero_depth_count) -zero_depth_contig = zero_depth.locus.contig.collect() -zero_depth_pos = zero_depth.locus.position.collect() - -for allele in range(zero_depth_count): - zero_depth_match = (vcf_XY.locus.contig == zero_depth_contig[allele]) & (vcf_XY.locus.position == zero_depth_pos[allele]) - vcf_XY = vcf_XY.filter_rows( - ~(zero_depth_match) - ) - -print('variants after filtering zero depth:', vcf_XY.count()) +#vcf_XY = vcf_XY.filter_rows(hl.agg.all(vcf_XY.DP != 0)) ##Extract PAR and non-PAR regions -par_filtered = vcf_XY.filter_rows(hl.is_defined(par[vcf_XY.locus])) -non_par_filtered = vcf_XY.filter_rows(hl.is_missing(par[vcf_XY.locus])) - -###For non-PAR regions, extract VQSR PASS calls. (note: Hail parses PASS as an empty set {}) -#non_par_filtered = non_par_filtered.filter_rows( -# hl.len(non_par_filtered.filters) == 0 -# ) - -##Predict SEX of the sample -#imputed_sex = hl.impute_sex(vcf_file.GT) -#temp place holder for SEX -SEX = 'XY' - -if SEX == 'XY': - #If MALE (XY), remove non-PAR chrX calls with AF=0.5 - filter_non_par_call = non_par_filtered.filter_rows( - non_par_filtered.info.AF[0] != 0.5 +par_variants = vcf_XY.filter_rows(hl.is_defined(par[vcf_XY.locus])) +non_par_variants = vcf_XY.filter_rows(hl.is_missing(par[vcf_XY.locus])) + +if sample_sex == 'XY': + #If MALE (XY), remove heterozygous non-PAR chrX calls + non_par_filtered_variants = non_par_variants.filter_rows( + hl.agg.all( + non_par_variants.GT.is_diploid() & non_par_variants.GT.is_hom_var() + ) ) -elif SEX == 'XX': + non_par_filtered_variants = non_par_filtered_variants.annotate_entries( + GT = hl.call(non_par_filtered.GT[0]) + ) + +elif sample_sex == 'XX': #If Female (XX), remove non-PAR chrY calls - filter_non_par_call = non_par_filtered.filter_rows( - non_par_filtered.locus.contig.startswith('chrX') + non_par_filtered_variants = non_par_variants.filter_rows( + non_par_variants.locus.contig.startswith('chrX') | 
non_par_variants.locus.contig.startswith('X') ) #Combine PAR and filtered non-PAR regions -par_non_par = [par_filtered, filter_non_par_call] +par_non_par = [par_variants, non_par_filtered_variants] filterXY = hl.MatrixTable.union_rows(*par_non_par) #Export MatrixTable to VCF From 6d0ed9eb78277741e63dea810303be5921c6ac43 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 18 Dec 2024 22:43:54 -0800 Subject: [PATCH 07/67] fix variables --- script/workflow_filter_xy_call_allvar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index e7e7067..8d94fba 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -105,7 +105,7 @@ ) ) non_par_filtered_variants = non_par_filtered_variants.annotate_entries( - GT = hl.call(non_par_filtered.GT[0]) + GT = hl.call(non_par_filtered_variants.GT[0]) ) elif sample_sex == 'XX': From 7abfcf78186b4bc0a5f0369d19d820312208c359 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 19 Dec 2024 14:15:50 -0800 Subject: [PATCH 08/67] extract autosomes --- script/workflow_filter_xy_call_allvar.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index 8d94fba..a05bdd8 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -84,11 +84,13 @@ #Filter XY calls ##Extract XY calls -X_contig = vcf_matrix.locus.contig.startswith('chrX') -Y_contig = vcf_matrix.locus.contig.startswith('chrY') +X_contig = vcf_matrix.locus.contig.startswith('chrX') | vcf_matrix.locus.contig.startswith('X') +Y_contig = vcf_matrix.locus.contig.startswith('chrY') | vcf_matrix.locus.contig.startswith('Y') extract_condition = (X_contig) | (Y_contig) vcf_XY = vcf_matrix.filter_rows(extract_condition) print('variants in chrX/Y:', vcf_XY.count()) +##Extract autosomes 
+vcf_autosomes = vcf_matrix.filter_rows(~extract_condition) ##Remove calls with DP=0 #vcf_XY = vcf_XY.filter_rows(hl.agg.all(vcf_XY.DP != 0)) @@ -118,6 +120,7 @@ par_non_par = [par_variants, non_par_filtered_variants] filterXY = hl.MatrixTable.union_rows(*par_non_par) + #Export MatrixTable to VCF output_file = output_dir + '/' + sample_name + '_filterXY.vcf.bgz' From 9417465b3c8e61992cf3d789803448a3df16aa63 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 19 Dec 2024 14:33:35 -0800 Subject: [PATCH 09/67] merge autosomes and XY filtered calls --- script/workflow_filter_xy_call_allvar.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index a05bdd8..1ff45f7 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -88,7 +88,7 @@ Y_contig = vcf_matrix.locus.contig.startswith('chrY') | vcf_matrix.locus.contig.startswith('Y') extract_condition = (X_contig) | (Y_contig) vcf_XY = vcf_matrix.filter_rows(extract_condition) -print('variants in chrX/Y:', vcf_XY.count()) +print('chrX/Y variants before XY filtration:', vcf_XY.count()) ##Extract autosomes vcf_autosomes = vcf_matrix.filter_rows(~extract_condition) @@ -119,7 +119,11 @@ #Combine PAR and filtered non-PAR regions par_non_par = [par_variants, non_par_filtered_variants] filterXY = hl.MatrixTable.union_rows(*par_non_par) +print('chrX/Y variant counts after XY filtration:', filterXY.count()) +#Combine filtered X/Y + autosomal variants +autosomes_XYfiltered = [vcf_autosomes, filterXY] +output_vcf = hl.MatrixTable.union_rows(*autosomes_XYfiltered) #Export MatrixTable to VCF output_file = output_dir + '/' + sample_name + '_filterXY.vcf.bgz' From f8417d5628b1e40e1de4eb62d57dc4f982e7afee Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 19 Dec 2024 14:46:57 -0800 Subject: [PATCH 10/67] change vcf header extraction location --- 
script/workflow_filter_xy_call_allvar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index 1ff45f7..8d58112 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -73,7 +73,7 @@ #Extract VCF file header vcf_header = hl.get_vcf_metadata(vcf_file) -vcf_source = script_dir + '/call-gSNP_caller_source_VCF_header.txt' +vcf_source = output_dir + '/call-gSNP_caller_source_VCF_header.txt' #Import VCF file into a hail MatrixTable vcf_matrix = hl.import_vcf( From 055ab3e0c8025649ace85954936981385cb8c967 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 19 Dec 2024 14:49:10 -0800 Subject: [PATCH 11/67] Add workflow steps to script note --- script/workflow_filter_xy_call_allvar.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index 8d58112..22045ec 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -2,6 +2,11 @@ """ Filter XY calls from call-gSNP single sample VCF file +Steps: +- Extract autosomes and chrX/Y variants from input VCF +- Filter chrX/Y variants +- Merge autosomal and filtered chrX/Y variants + Filter criteria: - Extract XY calls - Extract XY calls overlapping with Pseudo-Autosomal Regions (PARs) @@ -92,9 +97,6 @@ ##Extract autosomes vcf_autosomes = vcf_matrix.filter_rows(~extract_condition) -##Remove calls with DP=0 -#vcf_XY = vcf_XY.filter_rows(hl.agg.all(vcf_XY.DP != 0)) - ##Extract PAR and non-PAR regions par_variants = vcf_XY.filter_rows(hl.is_defined(par[vcf_XY.locus])) non_par_variants = vcf_XY.filter_rows(hl.is_missing(par[vcf_XY.locus])) From 6c313374f34e5af6478a8deff87f0a799bd05ebc Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 19 Dec 2024 14:50:37 -0800 Subject: [PATCH 12/67] clean up script --- 
script/workflow_filter_xy_call_allvar.py | 1 + 1 file changed, 1 insertion(+) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index 22045ec..793955e 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -94,6 +94,7 @@ extract_condition = (X_contig) | (Y_contig) vcf_XY = vcf_matrix.filter_rows(extract_condition) print('chrX/Y variants before XY filtration:', vcf_XY.count()) + ##Extract autosomes vcf_autosomes = vcf_matrix.filter_rows(~extract_condition) From 08213c9aa218bfd9d19f5e2d1dd13fb2e2a145bc Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 10:36:31 -0800 Subject: [PATCH 13/67] add skeleton code for vcf header temp file --- script/workflow_filter_xy_call_allvar.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index 793955e..fb21e25 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -78,6 +78,10 @@ #Extract VCF file header vcf_header = hl.get_vcf_metadata(vcf_file) + +with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as temp_file: + temp_file.write(content) + temp_file_path = output_dir vcf_source = output_dir + '/call-gSNP_caller_source_VCF_header.txt' #Import VCF file into a hail MatrixTable From 4be2064744ac0a3dc2f261c224a9987d307fe7fa Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 16:10:14 -0800 Subject: [PATCH 14/67] write VCF source to temp file and parameterize it --- script/workflow_filter_xy_call_allvar.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index fb21e25..092614c 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -26,6 +26,7 @@ import os import argparse import hail as hl +import 
tempfile script_dir = os.getcwd() @@ -80,9 +81,8 @@ vcf_header = hl.get_vcf_metadata(vcf_file) with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as temp_file: - temp_file.write(content) - temp_file_path = output_dir -vcf_source = output_dir + '/call-gSNP_caller_source_VCF_header.txt' + temp_file.write("##source=HaplotypeCaller") + vcf_source = temp_file.name #Import VCF file into a hail MatrixTable vcf_matrix = hl.import_vcf( From ac2d36c5b5ec2f52f4545eb26431db56bc8f0bd3 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 17:30:15 -0800 Subject: [PATCH 15/67] add arg for variant caller --- script/workflow_filter_xy_call_allvar.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index 092614c..1a578de 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -43,6 +43,13 @@ help = 'Input single sample VCF file path', required=True ) +parser.add_argument( + '--variant_caller', + dest='variant_caller', + default = 'HaplotypeCaller', + help = 'Name of the variant caller to set source in the output VCF header', + required=True + ) parser.add_argument( '--sample_sex', dest='sample_sex', From 4ff95752d797ed1e432176ee79a6bd52af447153 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 20:03:52 -0800 Subject: [PATCH 16/67] set variant caller source --- script/workflow_filter_xy_call_allvar.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index 1a578de..e0d4e01 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -74,6 +74,7 @@ sample_name = args.sample_name sample_sex = args.sample_sex vcf_file = args.input_vcf +variant_caller = args.variant_caller par_bed_file = args.par_bed output_dir = args.output_dir @@ -87,9 +88,10 @@ 
#Extract VCF file header vcf_header = hl.get_vcf_metadata(vcf_file) +variant_caller_source = "##source=" + variant_caller with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as temp_file: - temp_file.write("##source=HaplotypeCaller") - vcf_source = temp_file.name + temp_file.write(variant_caller_source) + vcf_source_file = temp_file.name #Import VCF file into a hail MatrixTable vcf_matrix = hl.import_vcf( @@ -147,5 +149,5 @@ output = output_file, tabix = True, metadata = vcf_header, - append_to_header = vcf_source + append_to_header = vcf_source_file ) From 7bdd8cc2fa0798a98378da1ca2fd819bd5b9d2fe Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 20:12:06 -0800 Subject: [PATCH 17/67] improve documentation --- script/workflow_filter_xy_call_allvar.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/workflow_filter_xy_call_allvar.py index e0d4e01..2fa8416 100755 --- a/script/workflow_filter_xy_call_allvar.py +++ b/script/workflow_filter_xy_call_allvar.py @@ -11,7 +11,9 @@ - Extract XY calls - Extract XY calls overlapping with Pseudo-Autosomal Regions (PARs) - For non-PAR - - Male sample: Filter out heterozygous GT calls in chrX + - Male sample: + - Filter out heterozygous GT calls in chrX and chrY + - Transform homozygous GT=1/1 to hemizygous GT=1 - Female sample: Filter out chrY calls Dependencies: From 1eadfbbdc7410cd8f717c7701f6c3ab8cc55f839 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 20:15:26 -0800 Subject: [PATCH 18/67] add hail v0.2.133 and docker image to default config --- config/default.config | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/config/default.config b/config/default.config index 806d508..c07dafa 100644 --- a/config/default.config +++ b/config/default.config @@ -20,10 +20,12 @@ params { picard_version = "2.26.10" pipeval_version = "4.0.0-rc.2" gatkfilter_version = "v1.0.0" + hail_version = 
"v0.2.133" docker_image_gatk = "broadinstitute/gatk:${params.gatk_version}" docker_image_picard = "${-> params.docker_container_registry}/picard:${params.picard_version}" docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}" docker_image_gatkfilter = "${-> params.docker_container_registry}/gatk:${params.gatkfilter_version}" + docker_image_hail = "${-> params.docker_container_registry}/hail:${params.hail_version}" emit_all_confident_sites = false } @@ -36,7 +38,7 @@ process { cache = true executor = 'local' - + // Other directives or options that should apply for every process // total amount of resources avaible to the pipeline From 24cb06d3f5a899203ed67245ffa2bc408a29bb05 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 20:38:00 -0800 Subject: [PATCH 19/67] add filter-xy NF script --- module/filter-xy.nf | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 module/filter-xy.nf diff --git a/module/filter-xy.nf b/module/filter-xy.nf new file mode 100644 index 0000000..b4a699b --- /dev/null +++ b/module/filter-xy.nf @@ -0,0 +1,17 @@ +include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' + +/* + Nextflow module for filtering chrX and chrY variant calls based on sample sex + + input: + sample_id: identifier for sample + sample_vcf: path to VCF to filter + sample_vcf_tbi: path to index of VCF to filter + + params: + params.output_dir_base: string(path) + params.log_output_dir: string(path) + params.docker_image_hail: string + params.sample_sex: string + params.par_bed: string(path) +*/ \ No newline at end of file From 27b39da95dfbe8c9132802e0034408f99f33903b Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 20:47:08 -0800 Subject: [PATCH 20/67] add NF skeleton --- module/filter-xy.nf | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/module/filter-xy.nf 
b/module/filter-xy.nf index b4a699b..415b42d 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -14,4 +14,20 @@ include { generate_standard_filename } from '../external/pipeline-Nextflow-modul params.docker_image_hail: string params.sample_sex: string params.par_bed: string(path) -*/ \ No newline at end of file +*/ + +process filter_XY { + container params.docker_image_hail + publishDir path: + publishDir path: + + input: + + output: + + script: + """ + set -euo pipefail + + """ +} \ No newline at end of file From 73c7f1e3fa9fea75891e18bbfc4b25211aebb5c7 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 21:00:46 -0800 Subject: [PATCH 21/67] add script dir var in main --- main.nf | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/main.nf b/main.nf index 33e0937..01a6073 100644 --- a/main.nf +++ b/main.nf @@ -104,6 +104,12 @@ workflow { } .set{ input_ch_collected_files } + script_dir_ch = Channel.fromPath( + "$projectDir/script", + checkIfExists: true + ) + .collect() + /** * Input validation */ From 570677b169de550ebed4d0f94150a57d08a0d4b2 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 21:01:16 -0800 Subject: [PATCH 22/67] add xy filtration command --- module/filter-xy.nf | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 415b42d..13270ef 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -1,4 +1,4 @@ -include { generate_standard_filename } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' +include { generate_standard_filename; sanitize_string } from '../external/pipeline-Nextflow-module/modules/common/generate_standardized_filename/main.nf' /* Nextflow module for filtering chrX and chrY variant calls based on sample sex @@ -28,6 +28,12 @@ process filter_XY { script: """ set -euo pipefail - + python ${script_dir}/filter_xy_call.py \ + --sample_name id + --input_vcf vcf + 
--variant_caller 'HaplotypeCaller' + --sample_sex XX + --par_bed params.par_bed + --output_dir . """ } \ No newline at end of file From d24757740aa79a16fa4a787f43f3640cfb420c44 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 20 Dec 2024 21:05:21 -0800 Subject: [PATCH 23/67] rename XY script --- script/{workflow_filter_xy_call_allvar.py => filter_xy_call.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename script/{workflow_filter_xy_call_allvar.py => filter_xy_call.py} (100%) diff --git a/script/workflow_filter_xy_call_allvar.py b/script/filter_xy_call.py similarity index 100% rename from script/workflow_filter_xy_call_allvar.py rename to script/filter_xy_call.py From 71c294b406430b665bb747629fcd427be13bf527 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 27 Dec 2024 18:34:48 -0800 Subject: [PATCH 24/67] change arg variant_caller to vcf_source_file --- script/filter_xy_call.py | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py index 2fa8416..e6b7d2d 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -46,10 +46,9 @@ required=True ) parser.add_argument( - '--variant_caller', - dest='variant_caller', - default = 'HaplotypeCaller', - help = 'Name of the variant caller to set source in the output VCF header', + '--vcf_source_file', + dest='vcf_source_file', + help = 'A TXT file containing variant caller source details (eg. 
##source=HaplotypeCaller)', required=True ) parser.add_argument( @@ -76,7 +75,7 @@ sample_name = args.sample_name sample_sex = args.sample_sex vcf_file = args.input_vcf -variant_caller = args.variant_caller +vcf_source_file = args.vcf_source_file par_bed_file = args.par_bed output_dir = args.output_dir @@ -90,11 +89,6 @@ #Extract VCF file header vcf_header = hl.get_vcf_metadata(vcf_file) -variant_caller_source = "##source=" + variant_caller -with tempfile.NamedTemporaryFile(mode='w', delete=False, suffix='.txt') as temp_file: - temp_file.write(variant_caller_source) - vcf_source_file = temp_file.name - #Import VCF file into a hail MatrixTable vcf_matrix = hl.import_vcf( path = vcf_file, From 6cb1d9c2867f084b337cdd263ae211878309b7bc Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Fri, 27 Dec 2024 18:35:42 -0800 Subject: [PATCH 25/67] add VCF source extraction code to script section in nextflow module --- module/filter-xy.nf | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 13270ef..0e4a640 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -22,18 +22,24 @@ process filter_XY { publishDir path: input: + val sample_id + val sample_sex + tuple path(recalibrated_vcf), path(recalibrated_vcf_tbi) output: script: """ set -euo pipefail + + zgrep "##source=" ${recalibrated_vcf} > ./vcf_source.txt + python ${script_dir}/filter_xy_call.py \ - --sample_name id - --input_vcf vcf - --variant_caller 'HaplotypeCaller' - --sample_sex XX - --par_bed params.par_bed + --sample_name ${sample_id} \ + --input_vcf ${recalibrated_vcf} \ + --vcf_source ./vcf_source.txt \ + --sample_sex ${params.sample_sex} \ + --par_bed ${params.par_bed} \ --output_dir . 
""" } \ No newline at end of file From 8b8948d9fc32028692ae0bfa2970bb21a4101a82 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Sat, 28 Dec 2024 10:53:16 -0800 Subject: [PATCH 26/67] revert script output to bgz --- script/filter_xy_call.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py index e6b7d2d..9790548 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -138,7 +138,7 @@ output_vcf = hl.MatrixTable.union_rows(*autosomes_XYfiltered) #Export MatrixTable to VCF -output_file = output_dir + '/' + sample_name + '_filterXY.vcf.bgz' +output_file = output_dir + '/' + sample_name + '_XY_filtered.vcf.bgz' hl.export_vcf( dataset = filterXY, From 6e22838373cd2ba11aaff484bd164f817ec2a40b Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Sat, 28 Dec 2024 10:53:55 -0800 Subject: [PATCH 27/67] set publishDir --- module/filter-xy.nf | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 0e4a640..7e14523 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -18,24 +18,43 @@ include { generate_standard_filename; sanitize_string } from '../external/pipeli process filter_XY { container params.docker_image_hail - publishDir path: - publishDir path: + + publishDir path: "${params.output_dir_base}/output", + mode: "copy", + pattern: '*.vcf.bgz*', + saveAs: { + "${output_filename}_${sanitize_string(file(it).getName().replace("${sample_id}_", ""))}" + } + + publishDir path: "${params.log_output_dir}/process-log", + pattern: ".command.*", + mode: "copy", + saveAs: { + "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${sample_id}-${interval_id}/log${file(it).getName()}" + } input: - val sample_id - val sample_sex - tuple path(recalibrated_vcf), path(recalibrated_vcf_tbi) + tuple val(sample_id), path(recalibrated_vcf), path(recalibrated_vcf_tbi) output: + path(".command.*") + 
path("${output_filename}_XY_filtered.vcf.bgz") + path("${output_filename}_XY_filtered.vcf.bgz.tbi") script: + output_filename = generate_standard_filename( + "Hail-${params.hail_version}", + params.dataset_id, + sample_id, + [additional_tools:["GATK-${params.gatk_version}"]] + ) """ set -euo pipefail zgrep "##source=" ${recalibrated_vcf} > ./vcf_source.txt python ${script_dir}/filter_xy_call.py \ - --sample_name ${sample_id} \ + --sample_name ${output_filename} \ --input_vcf ${recalibrated_vcf} \ --vcf_source ./vcf_source.txt \ --sample_sex ${params.sample_sex} \ From f41aab6a0ba682123dedd15684886d30f27a318f Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Sat, 28 Dec 2024 10:54:26 -0800 Subject: [PATCH 28/67] add channel for xy filter and call process --- main.nf | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/main.nf b/main.nf index 01a6073..43bbaa5 100644 --- a/main.nf +++ b/main.nf @@ -254,6 +254,14 @@ workflow { recalibrate_variants.out.output_ch_recalibrated_variants ) + filter_xy_ch = input_ch_merge_gvcfs.map { it[-1] } //sample id + .mix(recalibrate_variants.out.output_ch_recalibrated_variants + .map{ [it[1], it[2]] } //VQSR VCF and index + ) + + filter_XY( + filter_xy_ch + ) /** * Calculate checksums for output files */ From 28698a2d9b0992a80977577219eb2aa436017d96 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Sat, 28 Dec 2024 11:06:24 -0800 Subject: [PATCH 29/67] include filter XY module in main --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index 43bbaa5..ba7f0c0 100644 --- a/main.nf +++ b/main.nf @@ -68,6 +68,7 @@ include { } from './module/merge-vcf.nf' include { recalibrate_variants } from './module/workflow-recalibrate-variants.nf' include { filter_gSNP_GATK } from './module/filter-gsnp.nf' +include { filter_XY } from './module/filter-xy.nf' include { calculate_sha512 } from './module/checksum.nf' // Returns the index file for the given bam or vcf From e572c843f36f2a08211d1c3029f111f7b716b233 Mon Sep 17 
00:00:00 2001 From: Faizal-Eeman Date: Mon, 30 Dec 2024 10:38:12 -0800 Subject: [PATCH 30/67] simplify xy filter channel --- main.nf | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/main.nf b/main.nf index ba7f0c0..052b5af 100644 --- a/main.nf +++ b/main.nf @@ -255,10 +255,8 @@ workflow { recalibrate_variants.out.output_ch_recalibrated_variants ) - filter_xy_ch = input_ch_merge_gvcfs.map { it[-1] } //sample id - .mix(recalibrate_variants.out.output_ch_recalibrated_variants - .map{ [it[1], it[2]] } //VQSR VCF and index - ) + filter_xy_ch = recalibrate_variants.out.output_ch_recalibrated_variants + .map { it -> [it[0], it[1], it[2]] } filter_XY( filter_xy_ch From f43e8f9533543bf728c3ca96b39e301496ae693d Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 30 Dec 2024 10:39:09 -0800 Subject: [PATCH 31/67] add script dir ch --- main.nf | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/main.nf b/main.nf index 052b5af..b9e9932 100644 --- a/main.nf +++ b/main.nf @@ -258,8 +258,15 @@ workflow { filter_xy_ch = recalibrate_variants.out.output_ch_recalibrated_variants .map { it -> [it[0], it[1], it[2]] } + script_dir_ch = Channel.fromPath( + "$projectDir/script", + checkIfExists: true + ) + .collect() + filter_XY( - filter_xy_ch + filter_xy_ch, + script_dir_ch ) /** * Calculate checksums for output files From 08e738708447cdacde83ea52a04a6690ff8c743a Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 30 Dec 2024 10:41:01 -0800 Subject: [PATCH 32/67] add script dir input to NF module --- module/filter-xy.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 7e14523..b58a241 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -35,6 +35,7 @@ process filter_XY { input: tuple val(sample_id), path(recalibrated_vcf), path(recalibrated_vcf_tbi) + path(script_dir) output: path(".command.*") From c0af8975d305444f238b030ff3a7230b3d597a54 Mon Sep 17 00:00:00 2001 From: 
Faizal-Eeman Date: Mon, 30 Dec 2024 11:27:04 -0800 Subject: [PATCH 33/67] fix docker tag --- config/default.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/default.config b/config/default.config index c07dafa..af66b8e 100644 --- a/config/default.config +++ b/config/default.config @@ -20,7 +20,7 @@ params { picard_version = "2.26.10" pipeval_version = "4.0.0-rc.2" gatkfilter_version = "v1.0.0" - hail_version = "v0.2.133" + hail_version = "0.2.133" docker_image_gatk = "broadinstitute/gatk:${params.gatk_version}" docker_image_picard = "${-> params.docker_container_registry}/picard:${params.picard_version}" docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}" From 4a0687ed3abc38b32695f3d1c2e4e8f9ce5e2b85 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 6 Jan 2025 17:38:32 -0800 Subject: [PATCH 34/67] update script command to take vcf file source --- module/filter-xy.nf | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index b58a241..51ec738 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -54,12 +54,14 @@ process filter_XY { zgrep "##source=" ${recalibrated_vcf} > ./vcf_source.txt + cat ./vcf_source.txt + python ${script_dir}/filter_xy_call.py \ --sample_name ${output_filename} \ --input_vcf ${recalibrated_vcf} \ - --vcf_source ./vcf_source.txt \ + --vcf_source_file ./vcf_source.txt \ --sample_sex ${params.sample_sex} \ --par_bed ${params.par_bed} \ --output_dir . 
""" -} \ No newline at end of file +} From 0e552e6c45a183891b6df6253bdfe85cd2420e51 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 6 Jan 2025 17:39:24 -0800 Subject: [PATCH 35/67] update sample sex parameter in template --- config/template.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/template.config b/config/template.config index 01d7214..b802e0b 100644 --- a/config/template.config +++ b/config/template.config @@ -12,7 +12,7 @@ params { blcds_registered_dataset = false // if you want the output to be registered // Input patient sex if known - male or female. Leave empty if not known. - patient_sex = '' + sample_sex = '' output_dir = '/path/to/output/directory' From 6d78944634d95c716d5e9aec44678509442effb3 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 6 Jan 2025 17:44:48 -0800 Subject: [PATCH 36/67] update template config --- config/template.config | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/config/template.config b/config/template.config index b802e0b..8165f55 100644 --- a/config/template.config +++ b/config/template.config @@ -11,6 +11,8 @@ params { dataset_id = '' blcds_registered_dataset = false // if you want the output to be registered + genome_build = "GRCh38" + // Input patient sex if known - male or female. Leave empty if not known. 
sample_sex = '' @@ -47,7 +49,6 @@ params { bundle_phase1_1000g_snps_high_conf_vcf_gz = "/hot/resource/tool-specific-input/GATK/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz" // Specify BED file path for Pseudoautosomal Region (PAR) - genome_build = "GRCh38" par_bed = "" // Base resource allocation updater From a99e3bb5832f96e7e0f95bc43f65f65fa7ebbaf8 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 6 Jan 2025 17:45:44 -0800 Subject: [PATCH 37/67] add parameters to schema --- config/schema.yaml | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/config/schema.yaml b/config/schema.yaml index 01f7040..1a8009e 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -3,10 +3,18 @@ patient_id: type: 'String' required: true help: 'Patient ID' +sample_sex: + type: 'String' + required: true + help: 'Sample Sex' dataset_id: type: 'String' required: true help: 'Dataset ID' +genome_build: + type: 'String' + required: true + help: 'Genome build, GRCh37 or GRCh38' output_dir: type: 'Path' mode: 'w' @@ -62,6 +70,11 @@ bundle_phase1_1000g_snps_high_conf_vcf_gz: mode: 'r' required: true help: 'Absolute path to high-confidence 1000g SNPs VCF' +par_bed: + type: 'Path' + mode: 'r' + required: true + help: 'Absolute path to Pseudo-autosomal Region (PAR) BED' base_resource_update: type: 'ResourceUpdateNamespace' required: false From a4150b5e200183e8f16d25c082579623e40f611a Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 6 Jan 2025 17:47:50 -0800 Subject: [PATCH 38/67] add genome build arg to script command --- module/filter-xy.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 51ec738..4b73742 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -62,6 +62,7 @@ process filter_XY { --vcf_source_file ./vcf_source.txt \ --sample_sex ${params.sample_sex} \ --par_bed ${params.par_bed} \ + --genome_build ${params.genome_build} \ --output_dir . 
""" } From 9bf2c4dd9f273bcc4dffafea641c0de31bdca331 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Mon, 6 Jan 2025 17:53:09 -0800 Subject: [PATCH 39/67] parameterize genome build in script --- script/filter_xy_call.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py index 9790548..3dc1f7a 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -63,6 +63,12 @@ help = 'Input BED file path for Pseudo-Autosomal Regions (PAR)', required=True ) +parser.add_argument( + '--genome_build', + dest='genome_build', + help = 'Genome build of input VCF, GRCh37 or GRCh38', + required=True + ) parser.add_argument( '--output_dir', dest='output_dir', @@ -76,13 +82,14 @@ sample_sex = args.sample_sex vcf_file = args.input_vcf vcf_source_file = args.vcf_source_file -par_bed_file = args.par_bed +par_bed = args.par_bed +genome_build = args.genome_build output_dir = args.output_dir #Import PAR BED file par = hl.import_bed( - path = args.par_bed, - reference_genome = 'GRCh38', + path = par_bed, + reference_genome = genome_build, skip_invalid_intervals = True ) @@ -92,7 +99,7 @@ #Import VCF file into a hail MatrixTable vcf_matrix = hl.import_vcf( path = vcf_file, - reference_genome = 'GRCh38', + reference_genome = genome_build, force_bgz = True ) From 5408c83ac36a58c95ba38dc0734270b32862b5c5 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Tue, 7 Jan 2025 11:09:34 -0800 Subject: [PATCH 40/67] temporarily add hail dev tag --- config/default.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/default.config b/config/default.config index af66b8e..adb33af 100644 --- a/config/default.config +++ b/config/default.config @@ -20,7 +20,7 @@ params { picard_version = "2.26.10" pipeval_version = "4.0.0-rc.2" gatkfilter_version = "v1.0.0" - hail_version = "0.2.133" + hail_version = "branch-mmootor-fix-spark-permission" docker_image_gatk = 
"broadinstitute/gatk:${params.gatk_version}" docker_image_picard = "${-> params.docker_container_registry}/picard:${params.picard_version}" docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}" From b3123f0eecd0fa130861feace6882c4c6cd7dde6 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Tue, 7 Jan 2025 11:10:01 -0800 Subject: [PATCH 41/67] add params.par_bed as input --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index b9e9932..c25bf8a 100644 --- a/main.nf +++ b/main.nf @@ -266,6 +266,7 @@ workflow { filter_XY( filter_xy_ch, + params.par_bed, script_dir_ch ) /** From d026262dce0e01938158000d7483736f76fe2b4e Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Tue, 7 Jan 2025 11:10:33 -0800 Subject: [PATCH 42/67] add par_bed as process input --- module/filter-xy.nf | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 4b73742..383999b 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -35,6 +35,7 @@ process filter_XY { input: tuple val(sample_id), path(recalibrated_vcf), path(recalibrated_vcf_tbi) + path(par_bed) path(script_dir) output: @@ -61,7 +62,7 @@ process filter_XY { --input_vcf ${recalibrated_vcf} \ --vcf_source_file ./vcf_source.txt \ --sample_sex ${params.sample_sex} \ - --par_bed ${params.par_bed} \ + --par_bed ${par_bed} \ --genome_build ${params.genome_build} \ --output_dir . 
""" From 3e0047f929f0f83b72ee87f04576184cd3ef65d9 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Tue, 7 Jan 2025 12:49:50 -0800 Subject: [PATCH 43/67] fix output vcf dataset at export in script --- script/filter_xy_call.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py index 3dc1f7a..d9f7e7e 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -148,7 +148,7 @@ output_file = output_dir + '/' + sample_name + '_XY_filtered.vcf.bgz' hl.export_vcf( - dataset = filterXY, + dataset = output_vcf, output = output_file, tabix = True, metadata = vcf_header, From 365c331339440728f18ac456a3c1b7b520ff7f89 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Tue, 7 Jan 2025 16:21:02 -0800 Subject: [PATCH 44/67] fix pylint --- script/filter_xy_call.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py index d9f7e7e..8fab666 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -28,7 +28,6 @@ import os import argparse import hail as hl -import tempfile script_dir = os.getcwd() @@ -132,7 +131,8 @@ elif sample_sex == 'XX': #If Female (XX), remove non-PAR chrY calls non_par_filtered_variants = non_par_variants.filter_rows( - non_par_variants.locus.contig.startswith('chrX') | non_par_variants.locus.contig.startswith('X') + non_par_variants.locus.contig.startswith('chrX') | \ + non_par_variants.locus.contig.startswith('X') ) #Combine PAR and filtered non-PAR regions From f70361b5c00c0c372c00760ab5204c3ae55c1e41 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 10:54:52 -0800 Subject: [PATCH 45/67] update docker hail version --- config/default.config | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/config/default.config b/config/default.config index adb33af..af66b8e 100644 --- a/config/default.config +++ b/config/default.config @@ -20,7 +20,7 @@ params { picard_version = 
"2.26.10" pipeval_version = "4.0.0-rc.2" gatkfilter_version = "v1.0.0" - hail_version = "branch-mmootor-fix-spark-permission" + hail_version = "0.2.133" docker_image_gatk = "broadinstitute/gatk:${params.gatk_version}" docker_image_picard = "${-> params.docker_container_registry}/picard:${params.picard_version}" docker_image_pipeval = "${-> params.docker_container_registry}/pipeval:${params.pipeval_version}" From 7928c5e6636f9d20872d760927383bed1b8dab9d Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 10:55:27 -0800 Subject: [PATCH 46/67] remove cat command --- module/filter-xy.nf | 2 -- 1 file changed, 2 deletions(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 383999b..37b5efc 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -55,8 +55,6 @@ process filter_XY { zgrep "##source=" ${recalibrated_vcf} > ./vcf_source.txt - cat ./vcf_source.txt - python ${script_dir}/filter_xy_call.py \ --sample_name ${output_filename} \ --input_vcf ${recalibrated_vcf} \ From 7ab2a6e80c09a6c6d6e568f540c7763bd5ca574b Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:00:03 -0800 Subject: [PATCH 47/67] fix log output dir --- module/filter-xy.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 37b5efc..ef94620 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -30,7 +30,7 @@ process filter_XY { pattern: ".command.*", mode: "copy", saveAs: { - "${task.process.replace(':', '/')}/${task.process.split(':')[-1]}-${sample_id}-${interval_id}/log${file(it).getName()}" + "${task.process.replace(':', '/')}-${sample_id}/log${file(it).getName()}" } input: From 29de6e0affd8269235bd3add00ffe2d78d86c4c4 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:01:05 -0800 Subject: [PATCH 48/67] standardize process name with tool name at the end --- module/filter-xy.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/filter-xy.nf 
b/module/filter-xy.nf index ef94620..7f737b0 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -16,7 +16,7 @@ include { generate_standard_filename; sanitize_string } from '../external/pipeli params.par_bed: string(path) */ -process filter_XY { +process filter_XY_Hail { container params.docker_image_hail publishDir path: "${params.output_dir_base}/output", From f779ca5a2ffb9d143e533573274b23960c026f92 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:03:05 -0800 Subject: [PATCH 49/67] standardize process name with tool name at the end --- main.nf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index c25bf8a..a19a123 100644 --- a/main.nf +++ b/main.nf @@ -68,7 +68,7 @@ include { } from './module/merge-vcf.nf' include { recalibrate_variants } from './module/workflow-recalibrate-variants.nf' include { filter_gSNP_GATK } from './module/filter-gsnp.nf' -include { filter_XY } from './module/filter-xy.nf' +include { filter_XY_hail } from './module/filter-xy.nf' include { calculate_sha512 } from './module/checksum.nf' // Returns the index file for the given bam or vcf @@ -264,7 +264,7 @@ workflow { ) .collect() - filter_XY( + filter_XY_Hail( filter_xy_ch, params.par_bed, script_dir_ch From c67cc5c2ddec3072d2443cf0a93d7ce8a44d7fbf Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:13:31 -0800 Subject: [PATCH 50/67] update sample_sex comment in template --- config/template.config | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config/template.config b/config/template.config index 8165f55..83e1f88 100644 --- a/config/template.config +++ b/config/template.config @@ -13,8 +13,8 @@ params { genome_build = "GRCh38" - // Input patient sex if known - male or female. Leave empty if not known. 
- sample_sex = '' + // Input patient sex + sample_sex = '' // 'XY' or 'XX' output_dir = '/path/to/output/directory' From 96768ff8430e108fb503121e70dc1a0a24007bc2 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:29:08 -0800 Subject: [PATCH 51/67] add default and choices for genome_build --- config/schema.yaml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/config/schema.yaml b/config/schema.yaml index 1a8009e..b3b5b34 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -15,6 +15,11 @@ genome_build: type: 'String' required: true help: 'Genome build, GRCh37 or GRCh38' + default: + - "GRCh38" + choice: + - "GRCh37" + - "GRCh38" output_dir: type: 'Path' mode: 'w' From b2cf9e81f70fa36b6aa220192e3c62be0c2de08d Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:31:11 -0800 Subject: [PATCH 52/67] add choices for sample_sex in schema --- config/schema.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/config/schema.yaml b/config/schema.yaml index b3b5b34..c13ba18 100644 --- a/config/schema.yaml +++ b/config/schema.yaml @@ -7,6 +7,9 @@ sample_sex: type: 'String' required: true help: 'Sample Sex' + choices: + - "XY" + - "XX" dataset_id: type: 'String' required: true From 1e5196034c60296e0afbe46f3c84a822b3c7ec1e Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:47:46 -0800 Subject: [PATCH 53/67] add resource allocation to filter_XY_Hail --- config/F16.config | 10 ++++++++++ config/F32.config | 10 ++++++++++ config/F72.config | 10 ++++++++++ config/M64.config | 10 ++++++++++ 4 files changed, 40 insertions(+) diff --git a/config/F16.config b/config/F16.config index 1fb00e0..cf0a3a5 100644 --- a/config/F16.config +++ b/config/F16.config @@ -111,4 +111,14 @@ process { } } } + withName: filter_XY_Hail { + cpus = 1 + memory = 2.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } } diff --git a/config/F32.config b/config/F32.config index 1fb00e0..61fae3c 100644 --- 
a/config/F32.config +++ b/config/F32.config @@ -111,4 +111,14 @@ process { } } } + withName: filter_XY_Hail { + cpus = 2 + memory = 4.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } } diff --git a/config/F72.config b/config/F72.config index b16f3db..e61ab5b 100644 --- a/config/F72.config +++ b/config/F72.config @@ -111,4 +111,14 @@ process { } } } + withName: filter_XY_Hail { + cpus = 2 + memory = 6.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } } diff --git a/config/M64.config b/config/M64.config index b16f3db..3619d63 100644 --- a/config/M64.config +++ b/config/M64.config @@ -111,4 +111,14 @@ process { } } } + withName: filter_XY_Hail { + cpus = 4 + memory = 10.GB + retry_strategy { + memory { + strategy = 'exponential' + operand = 2 + } + } + } } From c51d3c3ccc9c403d926b4aac99d51888595ee7c0 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:54:59 -0800 Subject: [PATCH 54/67] emit xy filtered output --- module/filter-xy.nf | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index 7f737b0..bdf1be9 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -40,8 +40,7 @@ process filter_XY_Hail { output: path(".command.*") - path("${output_filename}_XY_filtered.vcf.bgz") - path("${output_filename}_XY_filtered.vcf.bgz.tbi") + tuple path("${output_filename}_XY_filtered.vcf.bgz"), path("${output_filename}_XY_filtered.vcf.bgz.tbi"), emit: xy_filtered_vqsr script: output_filename = generate_standard_filename( From a69f46c08f9bc18689d7936a37d12dcfa3557e84 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Wed, 8 Jan 2025 11:55:23 -0800 Subject: [PATCH 55/67] generate checksum for xy filtered vqsr VCF --- main.nf | 1 + 1 file changed, 1 insertion(+) diff --git a/main.nf b/main.nf index a19a123..df2aa6c 100644 --- a/main.nf +++ b/main.nf @@ -276,6 +276,7 @@ workflow { .mix(run_MergeVcfs_Picard_GVCF.out.merged_vcf) 
.mix(recalibrate_variants.out.output_ch_recalibrated_variants) .map{ [it[1], it[2]] } + .mix(filter_XY_Hail.out.xy_filtered_vqsr) .mix(filter_gSNP_GATK.out.germline_filtered) .flatten() .set{ input_ch_calculate_checksum } From bd9e78125bb01eb9add7d8c86f2a1af340f3efc2 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 10:32:44 -0800 Subject: [PATCH 56/67] add system command to VCF header --- script/filter_xy_call.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py index 8fab666..96fe044 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -28,6 +28,8 @@ import os import argparse import hail as hl +import sys +import tempfile script_dir = os.getcwd() @@ -85,6 +87,21 @@ genome_build = args.genome_build output_dir = args.output_dir +#Extract VCF file header +vcf_header = hl.get_vcf_metadata(vcf_file) + +#Add script system command to VCF source +script_command = ' '.join(sys.argv) + +with open(vcf_source_file, 'r') as vcf_source: + vcf_source_content = vcf_source.read() + +script_command_entry = f'##XYFiltration=' +vcf_source = vcf_source_content + script_command_entry +temp_file_path = os.path.join(tempfile.gettempdir(), 'temp_file.txt') +with open(temp_file_path, 'w') as temp_file: + temp_file.write(vcf_source) + #Import PAR BED file par = hl.import_bed( path = par_bed, @@ -92,9 +109,6 @@ skip_invalid_intervals = True ) -#Extract VCF file header -vcf_header = hl.get_vcf_metadata(vcf_file) - #Import VCF file into a hail MatrixTable vcf_matrix = hl.import_vcf( path = vcf_file, @@ -152,5 +166,5 @@ output = output_file, tabix = True, metadata = vcf_header, - append_to_header = vcf_source_file + append_to_header = temp_file_path ) From 002ffc38c6363c4e16a669e1e0340132ab931edc Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 10:49:41 -0800 Subject: [PATCH 57/67] Add XY filtration step to README --- README.md | 5 ++++- 1 file 
changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5a3b68c..ffb4837 100644 --- a/README.md +++ b/README.md @@ -78,7 +78,10 @@ Take the output from Step 6 as input, and apply the model in Step 5 to recalibra ### 8. Filter gSNP – Filter out ambiguous variants Use customized Perl script to filter out ambiguous variants. -### 9. Generate sha512 checksum +### 9. Adjust chrX and chrY genotypes based on sample sex from recalibrated VCF +Apply XY filtration workflow to recalibrated VCF as described [here](docs/xy_filtration_workflow.md). + +### 10. Generate sha512 checksum Generate sha512 checksum for VCFs and GVCFs. --- From f7835303ff3bec8ac777738930a65152bf14d7b0 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 11:01:13 -0800 Subject: [PATCH 58/67] fix process name --- main.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.nf b/main.nf index df2aa6c..c07e58e 100644 --- a/main.nf +++ b/main.nf @@ -68,7 +68,7 @@ include { } from './module/merge-vcf.nf' include { recalibrate_variants } from './module/workflow-recalibrate-variants.nf' include { filter_gSNP_GATK } from './module/filter-gsnp.nf' -include { filter_XY_hail } from './module/filter-xy.nf' +include { filter_XY_Hail } from './module/filter-xy.nf' include { calculate_sha512 } from './module/checksum.nf' // Returns the index file for the given bam or vcf From f96c632f2642ae21aae9057f145151914794da33 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 11:14:55 -0800 Subject: [PATCH 59/67] add XY filteration params to README --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index ffb4837..95ea822 100644 --- a/README.md +++ b/README.md @@ -118,6 +118,8 @@ For normal-only or tumor-only samples, exclude the fields for the other state. 
|:----------------|:---------|:-----|:------------| | `dataset_id` | Yes | string | Dataset ID | | `blcds_registered_dataset` | Yes | boolean | Set to true when using BLCDS folder structure; use false for now | +| `genome_build` | Yes | string | Genome build, GRCh37 or GRCh38 | +| `sample_sex` | Yes | string | Sample Sex, XY or XX | | `output_dir` | Yes | string | Need to set if `blcds_registered_dataset = false` | | `save_intermediate_files` | Yes | boolean | Set to false to disable publishing of intermediate files; true otherwise; disabling option will delete intermediate files to allow for processing of large BAMs | | `cache_intermediate_pipeline_steps` | No | boolean | Set to true to enable process caching from Nextflow; defaults to false | @@ -129,6 +131,7 @@ For normal-only or tumor-only samples, exclude the fields for the other state. | `bundle_hapmap_3p3_vcf_gz` | Yes | path | Absolute path to HapMap 3.3 file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/hapmap_3.3.hg38.vcf.gz` | | `bundle_omni_1000g_2p5_vcf_gz` | Yes | path | Absolute path to 1000 genomes OMNI 2.5 file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/1000G_omni2.5.hg38.vcf.gz` | | `bundle_phase1_1000g_snps_high_conf_vcf_gz` | Yes | path | Absolute path to 1000 genomes phase 1 high-confidence file, e.g., `/hot/resource/tool-specific-input/GATK/GRCh38/1000G_phase1.snps.high_confidence.hg38.vcf.gz` | +| `par_bed` | Yes | path | Absolute path to Pseudo-autosomal Region (PAR) BED | | `work_dir` | optional | path | Path of working directory for Nextflow. When included in the sample config file, Nextflow intermediate files and logs will be saved to this directory. With ucla_cds, the default is `/scratch` and should only be changed for testing/development. Changing this directory to `/hot` or `/tmp` can lead to high server latency and potential disk space limitations, respectively. | | `docker_container_registry` | optional | string | Registry containing tool Docker images. 
Default: `ghcr.io/uclahs-cds` | | `base_resource_update` | optional | namespace | Namespace of parameters to update base resource allocations in the pipeline. Usage and structure are detailed in `template.config` and below. | From 9b6e036a2e83264fb8ef332dddfa5ed743a1941d Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 11:16:39 -0800 Subject: [PATCH 60/67] Update script description --- script/filter_xy_call.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py index 96fe044..0e9e686 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -1,6 +1,6 @@ #!/usr/bin/env python3 """ -Filter XY calls from call-gSNP single sample VCF file +Filter XY calls from a germline VCF file Steps: - Extract autosomes and chrX/Y variants from input VCF From 6e571bcab2b2a03fe8ae7576bf525e72722368f1 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 11:22:26 -0800 Subject: [PATCH 61/67] add xy_filtration_workflow.md --- docs/xy_filtration_workflow.md | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) create mode 100644 docs/xy_filtration_workflow.md diff --git a/docs/xy_filtration_workflow.md b/docs/xy_filtration_workflow.md new file mode 100644 index 0000000..05f9b9d --- /dev/null +++ b/docs/xy_filtration_workflow.md @@ -0,0 +1,16 @@ +# Filter XY calls from a germline VCF file + +## Steps: +1. Extract autosomes and chrX/Y variants from input VCF +2. Filter chrX/Y variants +3. 
Merge autosomal and filtered chrX/Y variants + +## chrX/Y Filter Criteria: +- Extract chrX/Y calls +- Extract chrX/Y calls overlapping with Pseudo-Autosomal Regions (PARs) +- For non-PAR chrX/Y calls + - if `sample_sex` is `XY`: + - Filter out heterozygous `GT` calls in chrX and chrY + - Transform homozygous `GT=1/1` to hemizygous `GT=1` + - if `sample_sex` is `XX`: + - Filter out `chrY` calls \ No newline at end of file From d9a1eae8566434a10ab4937a645f0fe358842807 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 11:29:51 -0800 Subject: [PATCH 62/67] add GRCh38 PAR to README --- docs/xy_filtration_workflow.md | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/docs/xy_filtration_workflow.md b/docs/xy_filtration_workflow.md index 05f9b9d..c053933 100644 --- a/docs/xy_filtration_workflow.md +++ b/docs/xy_filtration_workflow.md @@ -13,4 +13,14 @@ - Filter out heterozygous `GT` calls in chrX and chrY - Transform homozygous `GT=1/1` to hemizygous `GT=1` - if `sample_sex` is `XX`: - - Filter out `chrY` calls \ No newline at end of file + - Filter out `chrY` calls + +## Pseudo-Autosomal Regions (PARs) +### GRCh38 +| CHROM | START | END | PAR | REGION | REFERENCE | +|---|---|---|---|---|---| +| chrX | 10001 | 2781479 | PAR1 | Xp22 | ENSEMBL | +| chrX | 91434839 | 91438584 | PAR3/XTR | Xq21.3 | PMID:23708688 | +| chrX | 155701383 | 156030895 | PAR2 | Xq28 | ENSEMBL | +| chrY | 10001 | 10300000 | PAR1+PAR3/XTR | Yp11 | ENSEMBL +PMID:23708688 | +| chrY | 56887903 | 57217415 | PAR2 | Yq12 | ENSEMBL | From bd568b659054bee22854e4b52dda3831b9cfd30b Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 14:08:58 -0800 Subject: [PATCH 63/67] fix pylint --- script/filter_xy_call.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/script/filter_xy_call.py b/script/filter_xy_call.py index 0e9e686..69b3ef3 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -25,11 +25,11 @@ based 
on HAIL recommendation """ -import os import argparse -import hail as hl +import os import sys import tempfile +import hail as hl script_dir = os.getcwd() @@ -91,15 +91,15 @@ vcf_header = hl.get_vcf_metadata(vcf_file) #Add script system command to VCF source -script_command = ' '.join(sys.argv) +SCRIPT_COMMAND = ' '.join(sys.argv) -with open(vcf_source_file, 'r') as vcf_source: +with open(vcf_source_file, 'r', encoding='utf-8') as vcf_source: vcf_source_content = vcf_source.read() -script_command_entry = f'##XYFiltration=' +script_command_entry = f'##XYFiltration=' vcf_source = vcf_source_content + script_command_entry temp_file_path = os.path.join(tempfile.gettempdir(), 'temp_file.txt') -with open(temp_file_path, 'w') as temp_file: +with open(temp_file_path, 'w', encoding='utf-8') as temp_file: temp_file.write(vcf_source) #Import PAR BED file From 8f8326d7206c002e90d7bc09a1c8da27bf2e2176 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 14:09:30 -0800 Subject: [PATCH 64/67] fix publishDir rules --- module/filter-xy.nf | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/module/filter-xy.nf b/module/filter-xy.nf index bdf1be9..b57bd22 100644 --- a/module/filter-xy.nf +++ b/module/filter-xy.nf @@ -21,10 +21,7 @@ process filter_XY_Hail { publishDir path: "${params.output_dir_base}/output", mode: "copy", - pattern: '*.vcf.bgz*', - saveAs: { - "${output_filename}_${sanitize_string(file(it).getName().replace("${sample_id}_", ""))}" - } + pattern: '*.vcf.bgz*' publishDir path: "${params.log_output_dir}/process-log", pattern: ".command.*", From 9b06490a62be4c3f41daa65718c891bf0a866986 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 14:19:47 -0800 Subject: [PATCH 65/67] Update outputs in README --- README.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/README.md b/README.md index 95ea822..c37bceb 100644 --- a/README.md +++ b/README.md @@ -205,6 +205,10 @@ base_resource_update { | `___indel.vcf.gz` | Filtered 
INDELs with non-germline and ambiguous variants removed | | `___indel.vcf.gz.tbi` | Filtered germline INDELs index | | `___indel.vcf.gz.sha512` | Filtered germline INDELs sha512 checksum | +| `____XY_filtered.vcf.bgz` | chrX/Y filtered SNP and INDEL recalibrated variants | +| `____XY_filtered.vcf.bgz.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants checksum | +| `____XY_filtered.vcf.bgz.tbi` | chrX/Y filtered SNP and INDEL recalibrated variants index | +| `____XY_filtered.vcf.bgz.tbi.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants index checksum | | `report.html`, `timeline.html` and `trace.txt` | Nextflow report, timeline and trace files | | `*.command.*` | Process specific logging files created by nextflow | From ca0e4f43e630e9fd4fa76f0626250ef5c1de8768 Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 14:41:47 -0800 Subject: [PATCH 66/67] Update CHANGELOG --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3221173..08526b2 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm ## [Unreleased] ### Added +- Add XY filtration - NFTest test case --- @@ -152,7 +153,7 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm - Update reheadering to use -c option - Modularize workflows for different modes (single vs.
paired, WGS vs targeted) - Update GATK to 4.2.4.0 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q) -- Update Picard to 2.26.8 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q) +- Update Picard to 2.26.8 to address Log4j critical vulnerability (https://github.com/advisories/GHSA-jfh8-c2jp-5v3q) --- From e8d1d5d142b09c3674ff6c329c3d0242bb7456af Mon Sep 17 00:00:00 2001 From: Faizal-Eeman Date: Thu, 9 Jan 2025 17:25:48 -0800 Subject: [PATCH 67/67] update output filename --- README.md | 8 ++++---- script/filter_xy_call.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index c37bceb..80976fb 100644 --- a/README.md +++ b/README.md @@ -205,10 +205,10 @@ base_resource_update { | `___indel.vcf.gz` | Filtered INDELs with non-germline and ambiguous variants removed | | `___indel.vcf.gz.tbi` | Filtered germline INDELs index | | `___indel.vcf.gz.sha512` | Filtered germline INDELs sha512 checksum | -| `____XY_filtered.vcf.bgz` | chrX/Y filtered SNP and INDEL recalibrated variants | -| `____XY_filtered.vcf.bgz.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants checksum | -| `____XY_filtered.vcf.bgz.tbi` | chrX/Y filtered SNP and INDEL recalibrated variants index | -| `____XY_filtered.vcf.bgz.tbi.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants index checksum | +| `_____filtered.vcf.bgz` | chrX/Y filtered SNP and INDEL recalibrated variants | +| `_____filtered.vcf.bgz.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants checksum | +| `_____filtered.vcf.bgz.tbi` | chrX/Y filtered SNP and INDEL recalibrated variants index | +| `_____filtered.vcf.bgz.tbi.sha512` | chrX/Y filtered SNP and INDEL recalibrated variants index checksum | | `report.html`, `timeline.html` and `trace.txt` | Nextflow report, timeline and trace files | | `*.command.*` | Process specific logging files created by nextflow | diff --git 
a/script/filter_xy_call.py b/script/filter_xy_call.py index 69b3ef3..dcb2871 100755 --- a/script/filter_xy_call.py +++ b/script/filter_xy_call.py @@ -122,7 +122,7 @@ Y_contig = vcf_matrix.locus.contig.startswith('chrY') | vcf_matrix.locus.contig.startswith('Y') extract_condition = (X_contig) | (Y_contig) vcf_XY = vcf_matrix.filter_rows(extract_condition) -print('chrX/Y variants before XY filtration:', vcf_XY.count()) +print(f'chrX/Y variants before {sample_sex} filtration:', vcf_XY.count()) ##Extract autosomes vcf_autosomes = vcf_matrix.filter_rows(~extract_condition) @@ -152,18 +152,18 @@ #Combine PAR and filtered non-PAR regions par_non_par = [par_variants, non_par_filtered_variants] filterXY = hl.MatrixTable.union_rows(*par_non_par) -print('chrX/Y variant counts after XY filtration:', filterXY.count()) +print(f'chrX/Y variant counts after {sample_sex} filtration:', filterXY.count()) #Combine filtered X/Y + autosomal variants autosomes_XYfiltered = [vcf_autosomes, filterXY] output_vcf = hl.MatrixTable.union_rows(*autosomes_XYfiltered) #Export MatrixTable to VCF -output_file = output_dir + '/' + sample_name + '_XY_filtered.vcf.bgz' +OUTPUT_FILE = f'{output_dir}/{sample_name}_{sample_sex}_filtered.vcf.bgz' hl.export_vcf( dataset = output_vcf, - output = output_file, + output = OUTPUT_FILE, tabix = True, metadata = vcf_header, append_to_header = temp_file_path