From 7cbb260276d30912b00fb703d3835fafdcd34b79 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Wed, 26 May 2021 12:08:45 +0200 Subject: [PATCH 01/19] Allow novel reference allele as long as they are not changing the reference length which is where most false positives happened --- variant_remapping_tools/reads_to_remapped_variants.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/variant_remapping_tools/reads_to_remapped_variants.py b/variant_remapping_tools/reads_to_remapped_variants.py index e90361b..058ea9e 100755 --- a/variant_remapping_tools/reads_to_remapped_variants.py +++ b/variant_remapping_tools/reads_to_remapped_variants.py @@ -63,7 +63,8 @@ def calculate_new_variant_definition(left_read, right_read, ref_fasta, original_ new_alts.append(old_ref_conv) operations['rac'] = old_ref_conv + '-' + new_ref operations['nra'] = None - failure_reason = 'Novel Reference Allele' + if len(old_ref_conv) != len(new_ref): + failure_reason = 'Novel Reference Allele length change' # 3. 
Correct zero-length reference sequence if len(new_ref) == 0: From 7e881e4eb3ed741a54223b9fe3f5cf87bcb73e20 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Wed, 26 May 2021 20:05:17 +0200 Subject: [PATCH 02/19] Fix test --- .../tests/test_reads_to_remapped_variants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py index 4696bb2..c8f0c6f 100644 --- a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py +++ b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py @@ -145,7 +145,7 @@ def test_calculate_new_variant_definition(self): vcf_rec = ['chr1', '48', '.', 'T', 'A'] with patch('variant_remapping_tools.reads_to_remapped_variants.fetch_bases', return_value='C'): assert calculate_new_variant_definition(left_read, right_read, fasta, vcf_rec) == \ - (48, 'C', ['A', 'T'], {'st': '+', 'rac': 'T-C', 'nra': None}, 'Novel Reference Allele') + (48, 'C', ['A', 'T'], {'st': '+', 'rac': 'T-C', 'nra': None}, None) # Forward strand alignment for Deletion left_read = self.mk_read(reference_name='chr2', reference_start=1, reference_end=47, is_reverse=False) From 4f0734da0dca5b2e1bab62ae979ed539948aa60a Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Wed, 26 May 2021 22:17:41 +0200 Subject: [PATCH 03/19] swap two columns around in the intermediate bed file because the strand column (5) does not support space but the name column (4) does Also Add new genotypes change --- variant_remapping_tools/reads_to_remapped_variants.py | 4 +++- variant_to_realignment.nf | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/variant_remapping_tools/reads_to_remapped_variants.py b/variant_remapping_tools/reads_to_remapped_variants.py index 058ea9e..375aae5 100755 --- a/variant_remapping_tools/reads_to_remapped_variants.py +++ b/variant_remapping_tools/reads_to_remapped_variants.py @@ -99,7 +99,9 @@ 
def update_vcf_record(reference_name, varpos, new_ref, new_alts, operations, ori genotype_str_list = original_vcf_rec[genotype_i].split(':') if genotype_str_list[gt_index] == '1/1': genotype_str_list[gt_index] = '0/0' - original_vcf_rec[genotype_i] = ':'.join(genotype_str_list) + elif 'nra' in operations and genotype_str_list[gt_index] == '0/1': + genotype_str_list[gt_index] = '1/2' + original_vcf_rec[genotype_i] = ':'.join(genotype_str_list) def fetch_bases(fasta, contig, start, length): diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 6319d7f..177278a 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -25,8 +25,8 @@ process convertVCFToBed { # - add all VCF fields separated by 2 characters pipe and caret (|^) to avoid impacting existing formatting of # the VCF line. The sub is to protect the % character that would be interpreted by printf otherwise. awk -F '\\t' '{ if (!/^#/){ \ - printf $1"\\t"$2-1"\\t"$2"\\t"$4"\\t"$1; \ - for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); printf "|^"$i }; print ""}; \ + printf $1"\\t"$2-1"\\t"$2"\\t"$1; \ + for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); printf "|^"$i }; print "\\t"$4}; \ }' source.vcf \ > variants.bed ''' @@ -56,7 +56,7 @@ process flankingRegionBed { | bedtools slop -g genome.chrom.sizes -l $flankingseq -r 0 > flanking_r1.bed # Adjust the start position of the flank to be one base downstream of the end of variant (\$4 is the reference allele) - awk 'BEGIN{OFS="\\t"}{ \$2=\$2+length(\$4); \$3=\$3+length(\$4); print \$0}' variants.bed \ + awk 'BEGIN{OFS="\\t"}{ \$2=\$2+length(\$5); \$3=\$3+length(\$5); print \$0}' variants.bed \ | bedtools slop -g genome.chrom.sizes -l 0 -r $flankingseq > flanking_r2.bed """ } @@ -107,7 +107,7 @@ process extractVariantInfoToFastaHeader { awk '{print ">" NR }' flanking_r1.bed > position.txt # Store position of the variant in the file - cut -f 5 flanking_r1.bed > vcf_fields.txt + cut -f 4 flanking_r1.bed > vcf_fields.txt # Paste the names, 
variant bases, then fasta sequences into a new file # A space will be inserted between the position and the vcf fields From d505405464ced38f62c197ad51f3adde7181cd24 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Fri, 28 May 2021 09:03:00 +0200 Subject: [PATCH 04/19] Clarify some of the processes's definition --- main.nf | 16 ++++++++-------- variant_to_realignment.nf | 7 ++++--- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/main.nf b/main.nf index 621195b..71f6037 100755 --- a/main.nf +++ b/main.nf @@ -85,13 +85,13 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long /* - * Create the header for the output VCF + * This process convert the original Header to the remapped header and concatenate it with the remapped VCF records  */ -process convertAndAddHeaderToVCF { +process generateRemappedVCF { input: - path "variants_remapped_sorted.vcf" path "vcf_header.txt" + path "variants_remapped_sorted.vcf" output: path "variants_remapped_sorted_with_header.vcf", emit: final_vcf_with_header @@ -119,9 +119,9 @@ process convertAndAddHeaderToVCF { } /* - * Add header to unmapped variant VCF records + * This process adds the original header to unmapped variant VCF records and output the results  */ -process mergeOriginalHeaderAndVCFAndOutput { +process generateUnmappedVCF { publishDir outfile_dir, overwrite: true, @@ -235,9 +235,9 @@ workflow finalise { summary main: - convertAndAddHeaderToVCF(variants_remapped, vcf_header) - mergeOriginalHeaderAndVCFAndOutput(vcf_header, variants_unmapped) - sortVCF(convertAndAddHeaderToVCF.out.final_vcf_with_header) + generateUnmappedVCF(vcf_header, variants_unmapped) + generateRemappedVCF(vcf_header, variants_remapped) + sortVCF(generateRemappedVCF.out.final_vcf_with_header) normaliseAnOutput(sortVCF.out.variants_remapped_sorted_gz, genome) outputStats(summary) } diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 177278a..573dd6e 100755 --- 
a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -7,7 +7,8 @@ nextflow.enable.dsl=2 /* - * Convert the VCF file to BED format. + * Convert the VCF file to BED format storing the VCF line in the "name" column and the reference allele in the + * "strand" column.  */ process convertVCFToBed { @@ -33,7 +34,7 @@ process convertVCFToBed { } /* - * Based on variants BED, generate the flanking regions BED files. + * Based on variants BED, generate the BED file for each flank.  */ process flankingRegionBed { @@ -100,7 +101,7 @@ process extractVariantInfoToFastaHeader { path "variant_read1.out.fa", emit: variant_read1_with_info path "variant_read2.out.fa", emit: variant_read2_with_info - // Disable the string interpolation using single quotes + // Disable Nextflow string interpolation using single quotes // https://www.nextflow.io/docs/latest/script.html#string-interpolation ''' # Store variant position in the file to have a unique name From f3fc4ce6f6b53850828b6cf6725d19c77cf39cc9 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Mon, 31 May 2021 22:39:43 +0200 Subject: [PATCH 05/19] Ensure the reference base is upper case hack to allow space in the INFO field --- tests/test_pipeline.sh | 4 ++-- .../reads_to_remapped_variants.py | 5 +++-- .../tests/test_reads_to_remapped_variants.py | 19 ++++++++++++++++++- variant_to_realignment.nf | 9 +++++---- 4 files changed, 28 insertions(+), 9 deletions(-) diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index c31472d..4e6ef58 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -12,7 +12,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf" ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 -chr1 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANANANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN GT:GQ 1/1:0 +chr1 48 . 
C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN GT:GQ 1/1:0 chr1 98 . C CG 50 PASS . GT:GQ 1/1:0 chr1 1078 . G A 50 PASS . GT 1/1 chr1 1818 . AAC A 50 PASS . GT:GQ 1/1:0 @@ -34,7 +34,7 @@ ls ${SCRIPT_DIR}/resources/remap.vcf \ # Build the expected VCF cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf" #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 -chr2 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANANANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN;st=+ GT:GQ 1/1:0 +chr2 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN;st=+ GT:GQ 1/1:0 chr2 98 . C CG 50 PASS st=+ GT:GQ 1/1:0 chr2 1078 . A G 50 PASS st=+;rac=G-A GT 0/0 chr2 1818 . AAC A 50 PASS st=+ GT:GQ 1/1:0 diff --git a/variant_remapping_tools/reads_to_remapped_variants.py b/variant_remapping_tools/reads_to_remapped_variants.py index 375aae5..a581e9a 100755 --- a/variant_remapping_tools/reads_to_remapped_variants.py +++ b/variant_remapping_tools/reads_to_remapped_variants.py @@ -25,7 +25,8 @@ def calculate_new_variant_definition(left_read, right_read, ref_fasta, original_ operations = {} # Define new ref and new pos new_ref = fetch_bases(ref_fasta, left_read.reference_name, left_read.reference_end + 1, - right_read.reference_start - left_read.reference_end) + right_read.reference_start - left_read.reference_end).upper() + new_pos = left_read.reference_end + 1 # 1. 
Handle reference strand change @@ -69,7 +70,7 @@ def calculate_new_variant_definition(left_read, right_read, ref_fasta, original_ # 3. Correct zero-length reference sequence if len(new_ref) == 0: new_pos -= 1 - new_ref = fetch_bases(ref_fasta, left_read.reference_name, new_pos, 1) + new_ref = fetch_bases(ref_fasta, left_read.reference_name, new_pos, 1).upper() new_alts = [new_ref + alt for alt in new_alts] operations['zlr'] = None diff --git a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py index c8f0c6f..457f6b3 100644 --- a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py +++ b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py @@ -7,7 +7,8 @@ from variant_remapping_tools.reads_to_remapped_variants import fetch_bases, process_bam_file, \ - calculate_new_variant_definition, order_reads, link_supplementary, pass_aligned_filtering, group_reads + calculate_new_variant_definition, order_reads, link_supplementary, pass_aligned_filtering, group_reads, \ + update_vcf_record class TestProcess(TestCase): @@ -167,6 +168,22 @@ def test_calculate_new_variant_definition(self): assert calculate_new_variant_definition(left_read, right_read, fasta, vcf_rec) == \ (48, 'TTG', ['G'], {'st': '-'}, None) + def test_update_vcf_record(self): + # Allele swap no genotype change + original_vcf_rec = ['chr1', '10', '.', 'A', 'T', '50', '.', '.', 'GT', '0/1'] + update_vcf_record('1', 11, 'T', 'A', {'rac': 'A-T'}, original_vcf_rec) + assert original_vcf_rec == ['1', '11', '.', 'T', 'A', '50', '.', 'rac=A-T', 'GT', '0/1'] + + # Allele swap with genotype change + original_vcf_rec = ['chr1', '10', '.', 'A', 'T', '50', '.', '.', 'GT', '1/1'] + update_vcf_record('1', 11, 'T', 'A', {'rac': 'A-T'}, original_vcf_rec) + assert original_vcf_rec == ['1', '11', '.', 'T', 'A', '50', '.', 'rac=A-T', 'GT', '0/0'] + + # Novel reference allele with genotype change + original_vcf_rec = ['chr1', 
'10', '.', 'A', 'T', '50', '.', '.', 'GT', '0/1'] + update_vcf_record('1', 11, 'C', ['A', 'T'], {'rac': 'A-C', 'nra': None}, original_vcf_rec) + assert original_vcf_rec == ['1', '11', '.', 'C', 'A,T', '50', '.', 'rac=A-C;nra', 'GT', '1/2'] + @staticmethod def get_test_resource(resource_name): diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 573dd6e..8e15d11 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -24,10 +24,11 @@ process convertVCFToBed { # - Switch to 0 based coordinates system # - Add the reference allele so it can be used in flankingRegionBed to adjust the position of the right flank # - add all VCF fields separated by 2 characters pipe and caret (|^) to avoid impacting existing formatting of - # the VCF line. The sub is to protect the % character that would be interpreted by printf otherwise. + # the VCF line. The sub replacing percent is to protect the % character that would be interpreted by printf + # otherwise. the sub replacing space is to prevent bedtools from using them as a field separator awk -F '\\t' '{ if (!/^#/){ \ printf $1"\\t"$2-1"\\t"$2"\\t"$1; \ - for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); printf "|^"$i }; print "\\t"$4}; \ + for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); sub(/ /, "£€", $i); printf "|^"$i }; print "\\t"$4}; \ }' source.vcf \ > variants.bed ''' @@ -56,7 +57,7 @@ process flankingRegionBed { awk 'BEGIN{OFS="\\t"}{\$2=\$2-1; \$3=\$3-1; print \$0}' variants.bed \ | bedtools slop -g genome.chrom.sizes -l $flankingseq -r 0 > flanking_r1.bed - # Adjust the start position of the flank to be one base downstream of the end of variant (\$4 is the reference allele) + # Adjust the start position of the flank to be one base downstream of the end of variant (\$5 is the reference allele) awk 'BEGIN{OFS="\\t"}{ \$2=\$2+length(\$5); \$3=\$3+length(\$5); print \$0}' variants.bed \ | bedtools slop -g genome.chrom.sizes -l 0 -r $flankingseq > flanking_r2.bed """ @@ -108,7 +109,7 @@ process 
extractVariantInfoToFastaHeader { awk '{print ">" NR }' flanking_r1.bed > position.txt # Store position of the variant in the file - cut -f 4 flanking_r1.bed > vcf_fields.txt + cut -f 4 flanking_r1.bed | sed 's/£€/ /g' > vcf_fields.txt # Paste the names, variant bases, then fasta sequences into a new file # A space will be inserted between the position and the vcf fields From e539f7d805512ba525a4cecedb5f00bb46aef202 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Tue, 1 Jun 2021 08:42:15 +0200 Subject: [PATCH 06/19] add suport for multiple whitespace --- tests/test_pipeline.sh | 4 ++-- variant_to_realignment.nf | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index 4e6ef58..50c28e5 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -12,7 +12,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf" ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 -chr1 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN GT:GQ 1/1:0 +chr1 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN GT:GQ 1/1:0 chr1 98 . C CG 50 PASS . GT:GQ 1/1:0 chr1 1078 . G A 50 PASS . GT 1/1 chr1 1818 . AAC A 50 PASS . GT:GQ 1/1:0 @@ -34,7 +34,7 @@ ls ${SCRIPT_DIR}/resources/remap.vcf \ # Build the expected VCF cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf" #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 -chr2 48 . 
C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN;st=+ GT:GQ 1/1:0 +chr2 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN;st=+ GT:GQ 1/1:0 chr2 98 . C CG 50 PASS st=+ GT:GQ 1/1:0 chr2 1078 . A G 50 PASS st=+;rac=G-A GT 0/0 chr2 1818 . AAC A 50 PASS st=+ GT:GQ 1/1:0 diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 8e15d11..a63c309 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -28,7 +28,7 @@ process convertVCFToBed { # otherwise. the sub replacing space is to prevent bedtools from using them as a field separator awk -F '\\t' '{ if (!/^#/){ \ printf $1"\\t"$2-1"\\t"$2"\\t"$1; \ - for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); sub(/ /, "£€", $i); printf "|^"$i }; print "\\t"$4}; \ + for (i=2; i<=NF; i++){ gsub(/%/, "%%", $i); gsub(/ /, "£€", $i); printf "|^"$i }; print "\\t"$4}; \ }' source.vcf \ > variants.bed ''' From 0720ecfdcdaa2adf21a391e466ef63bc29de7d34 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Tue, 1 Jun 2021 21:03:28 +0100 Subject: [PATCH 07/19] Add filtering of variants that are on the first or on the last position of a contig as no flanks can be generated --- main.nf | 26 +++++++++++++++++++++++++- tests/test_pipeline.sh | 5 ++++- 2 files changed, 29 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 71f6037..876df1a 100755 --- a/main.nf +++ b/main.nf @@ -64,6 +64,29 @@ process uncompressInputVCF { """ } + +/* + * filter VCF file to remove variant too close the edges of chromosome because we can't get flanking regions + */ +process filterInputVCF { + + input: + path "source.vcf" + path 
"genome_fai" + + output: + path "filtered.vcf", emit: filtered_vcf_file + + script: + """ + awk '{ print \$1"\\t1\\t"\$2;}' genome_fai > edge_region.bed + bgzip source.vcf + bcftools index source.vcf.gz + bcftools filter --regions-file edge_region.bed -o filtered.vcf source.vcf.gz + """ +} + + /*  * Store the original VCF header for later use  */ @@ -250,9 +273,10 @@ workflow { prepare_old_genome(params.oldgenome) prepare_new_genome(params.newgenome) uncompressInputVCF(params.vcffile) + filterInputVCF(uncompressInputVCF.out.vcf_file, prepare_old_genome.out.genome_fai) storeVCFHeader(uncompressInputVCF.out.vcf_file) process_split_reads( - uncompressInputVCF.out.vcf_file, + filterInputVCF.out.filtered_vcf_file, params.oldgenome, prepare_old_genome.out.genome_fai, prepare_old_genome.out.genome_chrom_sizes, diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index 50c28e5..4436e72 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -12,6 +12,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf" ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +chr1 1 . C T 50 PASS . GT:GQ 1/1:0 chr1 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN GT:GQ 1/1:0 chr1 98 . C CG 50 PASS . GT:GQ 1/1:0 chr1 1078 . G A 50 PASS . 
GT 1/1 @@ -48,6 +49,8 @@ diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR} # Clean up after the test rm -rf work .nextflow* \ ${SCRIPT_DIR}/resources/source.vcf \ - ${SCRIPT_DIR}/resources/*remap.vcf* \ + ${SCRIPT_DIR}/resources/remap.vcf \ + ${SCRIPT_DIR}/resources/remap_counts.yml \ + ${SCRIPT_DIR}/resources/remap_unmapped.vcf \ ${SCRIPT_DIR}/resources/new_genome.fa.* \ ${SCRIPT_DIR}/resources/genome.fa.fai From 88c7630e1b78a586650aa51fe7ac8e34b53a1e71 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Wed, 2 Jun 2021 14:04:29 +0100 Subject: [PATCH 08/19] Support for unsorted input file and add test --- main.nf | 92 +++++++++++++++++++++++++++++------------- tests/test_pipeline.sh | 4 +- 2 files changed, 66 insertions(+), 30 deletions(-) diff --git a/main.nf b/main.nf index 876df1a..970d195 100755 --- a/main.nf +++ b/main.nf @@ -76,13 +76,20 @@ process filterInputVCF { output: path "filtered.vcf", emit: filtered_vcf_file + path "kept.vcf", emit: kept_vcf_file + path "count.yml", emit: count_yml script: """ - awk '{ print \$1"\\t1\\t"\$2;}' genome_fai > edge_region.bed - bgzip source.vcf - bcftools index source.vcf.gz - bcftools filter --regions-file edge_region.bed -o filtered.vcf source.vcf.gz + awk '{ print \$1"\\t1\\t"\$2-1;}' genome_fai > center_regions.bed + awk '{ print \$1"\\t0\\t1"; print \$1"\\t"\$2-1"\\t"\$2;}' genome_fai > edge_regions.bed + # Bcftools sort needs the contigs to be set in the header or an index neither of which we can't garantee here. 
+ # that's why we're using unix sort + awk '\$1 ~ /^#/ {print \$0;next} {print \$0 | "sort -k1,1 -k2,2n"}' source.vcf | tee >(wc -l > all_count.txt) | bgzip -c > source_sorted.vcf.gz + bcftools index source_sorted.vcf.gz + bcftools filter --regions-file center_regions.bed -o kept.vcf source_sorted.vcf.gz + bcftools filter --regions-file edge_regions.bed source_sorted.vcf.gz | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt + cat <(cat all_count.txt | awk '{print "all: "\$1}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml """ } @@ -221,6 +228,23 @@ process outputStats { """ } +/* + * Concatenate the unmapped variants + */ +process combineUnmappedVCF { + input: + path "variants1.vcf" + path "variants2.vcf" + + output: + path "merge.vcf", emit: merge_vcf + + """ + cat variants1.vcf variants2.vcf > merge.vcf + """ +} + + process combineVCF { input: path "variants1.vcf" @@ -236,6 +260,7 @@ process combineVCF { process combineYaml { input: + path "initial_yml" path "round1.yml" path "round2.yml" path "round3.yml" @@ -244,7 +269,7 @@ process combineYaml { path "merge.yml", emit: merge_yml """ - cat round1.yml round2.yml round3.yml > merge.yml + cat initial_yml round1.yml round2.yml round3.yml > merge.yml """ } @@ -266,9 +291,33 @@ workflow finalise { } +//process_with_bowtie +workflow process_with_bowtie { + main: + prepare_old_genome(params.oldgenome) + prepare_new_genome_bowtie(params.newgenome) + uncompressInputVCF(params.vcffile) + storeVCFHeader(uncompressInputVCF.out.vcf_file) + process_split_reads_with_bowtie( + uncompressInputVCF.out.vcf_file, + params.oldgenome, + prepare_old_genome.out.genome_fai, + prepare_old_genome.out.genome_chrom_sizes, + params.newgenome, + prepare_new_genome_bowtie.out.genome_fai, + prepare_new_genome_bowtie.out.bowtie_indexes + ) + finalise(process_split_reads_with_bowtie.out.variants_remapped, storeVCFHeader.out.vcf_header, + params.newgenome, process_split_reads_with_bowtie.out.summary_yml) +} + + + 
+ + // process_with_minimap // Workflow without a name is the default workflow that gets executed when the file is run through nextflow -workflow { +workflow { main: prepare_old_genome(params.oldgenome) prepare_new_genome(params.newgenome) @@ -276,7 +325,7 @@ workflow { filterInputVCF(uncompressInputVCF.out.vcf_file, prepare_old_genome.out.genome_fai) storeVCFHeader(uncompressInputVCF.out.vcf_file) process_split_reads( - filterInputVCF.out.filtered_vcf_file, + filterInputVCF.out.kept_vcf_file, params.oldgenome, prepare_old_genome.out.genome_fai, prepare_old_genome.out.genome_chrom_sizes, @@ -299,39 +348,24 @@ workflow { params.newgenome, prepare_new_genome.out.genome_fai ) + combineUnmappedVCF( + filterInputVCF.out.filtered_vcf_file, + process_split_reads_long.out.variants_unmapped, + ) combineVCF( process_split_reads.out.variants_remapped, process_split_reads_mid.out.variants_remapped, process_split_reads_long.out.variants_remapped ) combineYaml( + filterInputVCF.out.count_yml, process_split_reads.out.summary_yml, process_split_reads_mid.out.summary_yml, - process_split_reads_long.out.summary_yml, + process_split_reads_long.out.summary_yml ) finalise( - combineVCF.out.merge_vcf, process_split_reads_long.out.variants_unmapped, storeVCFHeader.out.vcf_header, + combineVCF.out.merge_vcf, combineUnmappedVCF.out.merge_vcf, storeVCFHeader.out.vcf_header, params.newgenome, combineYaml.out.merge_yml ) } - -//process_with_bowtie -workflow process_with_bowtie { - main: - prepare_old_genome(params.oldgenome) - prepare_new_genome_bowtie(params.newgenome) - uncompressInputVCF(params.vcffile) - storeVCFHeader(uncompressInputVCF.out.vcf_file) - process_split_reads_with_bowtie( - uncompressInputVCF.out.vcf_file, - params.oldgenome, - prepare_old_genome.out.genome_fai, - prepare_old_genome.out.genome_chrom_sizes, - params.newgenome, - prepare_new_genome_bowtie.out.genome_fai, - prepare_new_genome_bowtie.out.bowtie_indexes - ) - 
finalise(process_split_reads_with_bowtie.out.variants_remapped, storeVCFHeader.out.vcf_header, - params.newgenome, process_split_reads_with_bowtie.out.summary_yml) -} diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index 4436e72..91d22a8 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -16,9 +16,10 @@ chr1 1 . C T 50 PASS . GT:GQ 1/1:0 chr1 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN GT:GQ 1/1:0 chr1 98 . C CG 50 PASS . GT:GQ 1/1:0 chr1 1078 . G A 50 PASS . GT 1/1 -chr1 1818 . AAC A 50 PASS . GT:GQ 1/1:0 chr1 2030 . A TCC 50 PASS . GT:GQ 1/1:0 +chr1 1818 . AAC A 50 PASS . GT:GQ 1/1:0 chr1 3510 . T C 50 PASS . GT:GQ 1/1:0 +chr1 3709 . C T 50 PASS . GT:GQ 1/1:0 EOT nextflow run ${SOURCE_DIR}/main.nf \ @@ -43,6 +44,7 @@ chr2 2030 . A TCC 50 PASS st=+ GT:GQ 1/1:0 chr2 3510 . T C 50 PASS st=+ GT:GQ 1/1:0 EOT + # Compare vs the expected VCF diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap.vcf") From 60362ff69f1205b92f8a2ba227c199724cd92ff4 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Thu, 3 Jun 2021 14:15:49 +0100 Subject: [PATCH 09/19] Filter out the variants close to the edge using targets rather than regions --- main.nf | 10 +++------- tests/test_pipeline.sh | 19 ++++++++++++++++--- 2 files changed, 19 insertions(+), 10 deletions(-) diff --git a/main.nf b/main.nf index 970d195..85c79b0 100755 --- a/main.nf +++ b/main.nf @@ -83,13 +83,9 @@ process filterInputVCF { """ awk '{ print \$1"\\t1\\t"\$2-1;}' genome_fai > center_regions.bed awk '{ print \$1"\\t0\\t1"; print \$1"\\t"\$2-1"\\t"\$2;}' genome_fai > edge_regions.bed - # Bcftools sort needs the contigs to be set in the header or an index neither of which we can't garantee here. 
- # that's why we're using unix sort - awk '\$1 ~ /^#/ {print \$0;next} {print \$0 | "sort -k1,1 -k2,2n"}' source.vcf | tee >(wc -l > all_count.txt) | bgzip -c > source_sorted.vcf.gz - bcftools index source_sorted.vcf.gz - bcftools filter --regions-file center_regions.bed -o kept.vcf source_sorted.vcf.gz - bcftools filter --regions-file edge_regions.bed source_sorted.vcf.gz | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt - cat <(cat all_count.txt | awk '{print "all: "\$1}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml + bcftools filter --targets-file center_regions.bed source.vcf | tee kept.vcf | grep -cv '^#' > all_count.txt + bcftools filter --targets-file edge_regions.bed source.vcf | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt + cat <(cat *_count.txt | awk '{sum += \$1} END{print "all: "sum}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml """ } diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index 91d22a8..431f6ae 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -2,6 +2,15 @@ set -Eeuo pipefail +function asserteq() { + if [[ ! "$1" -eq "$2" ]] + then + echo "Assertion Error: $1 not equal to $2" + exit 1 + fi + +} + SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" SOURCE_DIR=$(dirname $SCRIPT_DIR) @@ -12,14 +21,15 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf" ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 -chr1 1 . C T 50 PASS . GT:GQ 1/1:0 +chr1 1 . CG TG 50 PASS . GT:GQ 1/1:0 chr1 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN GT:GQ 1/1:0 chr1 98 . C CG 50 PASS . GT:GQ 1/1:0 chr1 1078 . G A 50 PASS . GT 1/1 chr1 2030 . A TCC 50 PASS . GT:GQ 1/1:0 chr1 1818 . AAC A 50 PASS . 
GT:GQ 1/1:0 chr1 3510 . T C 50 PASS . GT:GQ 1/1:0 -chr1 3709 . C T 50 PASS . GT:GQ 1/1:0 +chr1 3709 . CA TA 50 PASS . GT:GQ 1/1:0 +chr1 3710 . T A 50 PASS . GT:GQ 1/1:0 EOT nextflow run ${SOURCE_DIR}/main.nf \ @@ -44,10 +54,13 @@ chr2 2030 . A TCC 50 PASS st=+ GT:GQ 1/1:0 chr2 3510 . T C 50 PASS st=+ GT:GQ 1/1:0 EOT - # Compare vs the expected VCF diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap.vcf") +asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'all:' | cut -d ' ' -f 2` 9 +asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'filtered:' | cut -d ' ' -f 2` 2 + + # Clean up after the test rm -rf work .nextflow* \ ${SCRIPT_DIR}/resources/source.vcf \ From d8666ab07a3a666281c6227bbf0ac59460e2edfb Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Thu, 3 Jun 2021 14:30:56 +0100 Subject: [PATCH 10/19] increase memory for VariantInfoToFastaHeader that failed occasionally --- variant_to_realignment.nf | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index a63c309..9b738b9 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -92,6 +92,8 @@ process flankingRegionFasta {  */ process extractVariantInfoToFastaHeader { + memory 8GB + input: path "flanking_r1.bed" path "flanking_r2.bed" @@ -108,7 +110,7 @@ process extractVariantInfoToFastaHeader { # Store variant position in the file to have a unique name awk '{print ">" NR }' flanking_r1.bed > position.txt - # Store position of the variant in the file + # Store position of the variant in the file and replace '£€' with the original whitespace from convertVCFToBed cut -f 4 flanking_r1.bed | sed 's/£€/ /g' > vcf_fields.txt # Paste the names, variant bases, then fasta sequences into a new file From 5037c83cad55accf11236d24d150471ffbaa8b83 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Thu, 3 Jun 2021 16:25:05 +0100 Subject: [PATCH 11/19] Don't use grep -c 
because it exits with status 1 when nothing is found Fix memory syntax --- main.nf | 2 +- variant_to_realignment.nf | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/main.nf b/main.nf index 85c79b0..eb981f7 100755 --- a/main.nf +++ b/main.nf @@ -83,7 +83,7 @@ process filterInputVCF { """ awk '{ print \$1"\\t1\\t"\$2-1;}' genome_fai > center_regions.bed awk '{ print \$1"\\t0\\t1"; print \$1"\\t"\$2-1"\\t"\$2;}' genome_fai > edge_regions.bed - bcftools filter --targets-file center_regions.bed source.vcf | tee kept.vcf | grep -cv '^#' > all_count.txt + bcftools filter --targets-file center_regions.bed source.vcf | tee kept.vcf | grep -v '^#' | wc -l > all_count.txt bcftools filter --targets-file edge_regions.bed source.vcf | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt cat <(cat *_count.txt | awk '{sum += \$1} END{print "all: "sum}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml """ diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 9b738b9..afa42cd 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -92,7 +92,7 @@ process flankingRegionFasta {  */ process extractVariantInfoToFastaHeader { - memory 8GB + memory '8GB' input: path "flanking_r1.bed" From 7cef6cb184950f3c8a15bf5d0d073fb7bcb9e835 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Thu, 3 Jun 2021 23:47:41 +0100 Subject: [PATCH 12/19] Use scatter gather to parallelize the alignments --- .../reads_to_remapped_variants.py | 47 ++++++++++--------- variant_to_realignment.nf | 42 ++++++++++++----- 2 files changed, 53 insertions(+), 36 deletions(-) diff --git a/variant_remapping_tools/reads_to_remapped_variants.py b/variant_remapping_tools/reads_to_remapped_variants.py index a581e9a..5cfef47 100755 --- a/variant_remapping_tools/reads_to_remapped_variants.py +++ b/variant_remapping_tools/reads_to_remapped_variants.py @@ -241,36 +241,37 @@ def link_supplementary(primary_group, supplementary_group): return 
dict(primary_to_supplementary) -def process_bam_file(bam_file_path, output_file, out_failed_file, new_genome, +def process_bam_file(bam_file_paths, output_file, out_failed_file, new_genome, filter_align_with_secondary, flank_length, summary_file): counter = Counter() fasta = pysam.FastaFile(new_genome) with open(output_file, 'w') as outfile, open(out_failed_file, 'w') as out_failed: - for primary_group, supplementary_group, secondary_group in group_reads(bam_file_path): - counter['total'] += 1 - primary_to_supplementary = link_supplementary(primary_group, supplementary_group) - # Retrieve the full VCF record from the bam vr tag - original_vcf_rec = primary_group[0].get_tag('vr').split('|^') - if pass_basic_filtering(primary_group, secondary_group, primary_to_supplementary, counter, filter_align_with_secondary): - left_read, right_read = order_reads(primary_group, primary_to_supplementary) - if pass_aligned_filtering(left_read, right_read, counter): - varpos, new_ref, new_alts, ops, failure_reason = \ - calculate_new_variant_definition(left_read, right_read, fasta, original_vcf_rec) - if not failure_reason: - counter['Remapped'] += 1 - update_vcf_record(left_read.reference_name, varpos, new_ref, new_alts, ops, original_vcf_rec) - output_alignment(original_vcf_rec, outfile) + for bam_file_path in bam_file_paths: + for primary_group, supplementary_group, secondary_group in group_reads(bam_file_path): + counter['total'] += 1 + primary_to_supplementary = link_supplementary(primary_group, supplementary_group) + # Retrieve the full VCF record from the bam vr tag + original_vcf_rec = primary_group[0].get_tag('vr').split('|^') + if pass_basic_filtering(primary_group, secondary_group, primary_to_supplementary, counter, filter_align_with_secondary): + left_read, right_read = order_reads(primary_group, primary_to_supplementary) + if pass_aligned_filtering(left_read, right_read, counter): + varpos, new_ref, new_alts, ops, failure_reason = \ + 
calculate_new_variant_definition(left_read, right_read, fasta, original_vcf_rec) + if not failure_reason: + counter['Remapped'] += 1 + update_vcf_record(left_read.reference_name, varpos, new_ref, new_alts, ops, original_vcf_rec) + output_alignment(original_vcf_rec, outfile) + else: + # Currently the alignment is not precise enough to ensure that the allele change for INDEL and + # novel reference allele are correct. So we skip them. + # TODO: add realignment confirmation see #14 and EVA-2417 + counter[failure_reason] += 1 + output_alignment(original_vcf_rec, out_failed) else: - # Currently the alignment is not precise enough to ensure that the allele change for INDEL and - # novel reference allele are correct. So we skip them. - # TODO: add realignment confirmation see #14 and EVA-2417 - counter[failure_reason] += 1 output_alignment(original_vcf_rec, out_failed) else: output_alignment(original_vcf_rec, out_failed) - else: - output_alignment(original_vcf_rec, out_failed) with open(summary_file, 'w') as open_summary: yaml.safe_dump({f'Flank_{flank_length}': dict(counter)}, open_summary) @@ -281,7 +282,7 @@ def main(): 'separate file.') parser = argparse.ArgumentParser(description=description, formatter_class=RawTextHelpFormatter) - parser.add_argument('-i', '--bam', type=str, required=True, + parser.add_argument('-i', '--bams', type=str, required=True, nargs='+', help='Input BAM file with remapped flanking regions') parser.add_argument('-o', '--outfile', type=str, required=True, help='Output VCF file with remapped variants') @@ -297,7 +298,7 @@ def main(): args = parser.parse_args() process_bam_file( - bam_file_path=args.bam, + bam_file_paths=args.bams, output_file=args.outfile, out_failed_file=args.out_failed_file, new_genome=args.newgenome, diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index afa42cd..89c6976 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -135,8 +135,8 @@ process alignWithMinimap { maxRetries 3 input: 
- path "variant_read1.fa" - path "variant_read2.fa" + // reads contains a list of 2 files (first and second read) + path(reads) // indexing is done on the fly so get the genome directly path "genome.fa" val flanklength @@ -157,7 +157,7 @@ process alignWithMinimap { # the awk script will convert this comment in valid SAM tag minimap2 -k21 -w11 --sr --frag=yes -A2 -B5 -O6,16 --end-bonus 20 -E2,1 -r50 -p.5 -z 800,200\ -f1000,5000 -n2 -m20 -s40 -g200 -2K50m --heap-sort=yes --secondary=yes -N 2 -y \ - -a genome.fa variant_read1.fa variant_read2.fa | \ + -a genome.fa ${reads[0]} ${reads[1]} | \ awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \ samtools view -bS - > reads_aligned.bam """ @@ -165,7 +165,7 @@ process alignWithMinimap { """ minimap2 -k19 -w19 -A2 -B5 -O6,16 --end-bonus 20 -E3,1 -s200 -z200 -N50 --min-occ-floor=100 \ --secondary=yes -N 2 -y \ - -a genome.fa variant_read1.fa variant_read2.fa | \ + -a genome.fa ${reads[0]} ${reads[1]} | \ awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \ samtools view -bS - > reads_aligned.bam """ @@ -221,7 +221,7 @@ process alignWithBowtie { process readsToRemappedVariants { input: - path "reads_aligned.bam" + path "reads.*.bam" path "genome.fa" val flank_length val filter_align_with_secondary @@ -235,14 +235,14 @@ process readsToRemappedVariants { if (filter_align_with_secondary) """ # Ensure that we will use the reads_to_remapped_variants.py from this repo - ${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads_aligned.bam \ + ${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads*.bam \ -o variants_remapped.vcf --newgenome genome.fa --out_failed_file variants_unmapped.vcf \ --flank_length $flank_length --summary summary.yml --filter_align_with_secondary """ else """ # Ensure that we will use the reads_to_remapped_variants.py from this repo - ${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads_aligned.bam \ + 
${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads*.bam \ -o variants_remapped.vcf --newgenome genome.fa --out_failed_file variants_unmapped.vcf \ --flank_length $flank_length --summary summary.yml """ @@ -259,6 +259,7 @@ workflow process_split_reads_generic { new_genome_fa_fai flank_length filter_align_with_secondary + chunck_size main: convertVCFToBed(source_vcf) @@ -271,15 +272,24 @@ workflow process_split_reads_generic { flankingRegionBed.out.flanking_r1_bed, flankingRegionBed.out.flanking_r2_bed, flankingRegionFasta.out.variants_read1, flankingRegionFasta.out.variants_read2 ) + + // This will split the fasta file into chunks + // mix creates a single channel with both file + // toList create a single entry channel with the list of two file + // splitFasta split the two files in chunks + split_reads = extractVariantInfoToFastaHeader.out.variant_read1_with_info + .mix(extractVariantInfoToFastaHeader.out.variant_read2_with_info) + .toList() + .splitFasta(by: chunck_size, file: true, elem: [0,1]) + alignWithMinimap( - extractVariantInfoToFastaHeader.out.variant_read1_with_info, - extractVariantInfoToFastaHeader.out.variant_read2_with_info, + split_reads, new_genome_fa, flank_length ) sortByName(alignWithMinimap.out.reads_aligned_bam) readsToRemappedVariants( - sortByName.out.reads_aligned_sorted_bam, new_genome_fa, + sortByName.out.reads_aligned_sorted_bam.collect(), new_genome_fa, flank_length, filter_align_with_secondary ) @@ -300,9 +310,11 @@ workflow process_split_reads { main: flank_length = 50 filter_align_with_secondary = true + chunck_size = 10000000 process_split_reads_generic( source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes, - new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary + new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary, + chunck_size ) emit: variants_remapped = process_split_reads_generic.out.variants_remapped @@ -323,9 +335,11 @@ workflow 
process_split_reads_mid { main: flank_length = 2000 filter_align_with_secondary = true + chunck_size = 1000000 process_split_reads_generic( source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes, - new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary + new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary, + chunck_size ) emit: variants_remapped = process_split_reads_generic.out.variants_remapped @@ -346,9 +360,11 @@ workflow process_split_reads_long { main: flank_length = 50000 filter_align_with_secondary = false + chunck_size = 100000 process_split_reads_generic( source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes, - new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary + new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary, + chunck_size ) emit: variants_remapped = process_split_reads_generic.out.variants_remapped From af34f9c410722082ccb1f355f45583b5604f0fe4 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Fri, 4 Jun 2021 10:40:20 +0100 Subject: [PATCH 13/19] Only split the fasta files if they are not empty; otherwise leave them as they are --- .github/workflows/variant_remapping.yml | 3 +- tests/test_pipeline_empty.sh | 56 +++++++++++++++++++++++++ variant_to_realignment.nf | 13 +++--- 3 files changed, 66 insertions(+), 6 deletions(-) create mode 100755 tests/test_pipeline_empty.sh diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml index ec6c8cc..15bfbc2 100644 --- a/.github/workflows/variant_remapping.yml +++ b/.github/workflows/variant_remapping.yml @@ -32,8 +32,9 @@ jobs: $CONDA/bin/conda run pip install -q -r requirements.txt - name: Test nextflow workflow - run: + run: | $CONDA/bin/conda run tests/test_pipeline.sh + $CONDA/bin/conda run test_pipeline_empty.sh - name: Test with pytest run: diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh new file mode 100755 index 
0000000..edd27c0 --- /dev/null +++ b/tests/test_pipeline_empty.sh @@ -0,0 +1,56 @@ +#!/bin/bash + +set -Eeuo pipefail + +function asserteq() { + if [[ ! "$1" -eq "$2" ]] + then + echo "Assertion Error: $1 not equal to $2" + exit 1 + fi + +} + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" +SOURCE_DIR=$(dirname $SCRIPT_DIR) + +# Build the Source VCF +cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf" +##fileformat=VCFv4.3 +##INFO= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +EOT + +nextflow run ${SOURCE_DIR}/main.nf \ +--oldgenome ${SCRIPT_DIR}/resources/genome.fa \ +--newgenome ${SCRIPT_DIR}/resources/new_genome.fa \ +--vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \ +--outfile ${SCRIPT_DIR}/resources/remap_empty.vcf + +# Check the presence of the output file +ls ${SCRIPT_DIR}/resources/remap_empty.vcf \ + ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \ + ${SCRIPT_DIR}/resources/remap_empty_counts.yml + +# Build the expected VCF +cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf" +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001 +EOT + +# Compare vs the expected VCF +diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap_empty.vcf") + +asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'all:' | cut -d ' ' -f 2` 0 +asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'filtered:' | cut -d ' ' -f 2` 0 + + +# Clean up after the test +rm -rf work .nextflow* \ + ${SCRIPT_DIR}/resources/source_empty.vcf \ + ${SCRIPT_DIR}/resources/remap_empty.vcf \ + ${SCRIPT_DIR}/resources/remap_empty_counts.yml \ + ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \ + ${SCRIPT_DIR}/resources/new_genome.fa.* \ + ${SCRIPT_DIR}/resources/genome.fa.fai diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 89c6976..46e7e24 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -273,15 +273,18 @@ 
workflow process_split_reads_generic { flankingRegionFasta.out.variants_read1, flankingRegionFasta.out.variants_read2 ) - // This will split the fasta file into chunks // mix creates a single channel with both file // toList create a single entry channel with the list of two file - // splitFasta split the two files in chunks - split_reads = extractVariantInfoToFastaHeader.out.variant_read1_with_info - .mix(extractVariantInfoToFastaHeader.out.variant_read2_with_info) + split_reads = extractVariantInfoToFastaHeader.out.variant_read2_with_info + .mix(extractVariantInfoToFastaHeader.out.variant_read1_with_info) .toList() - .splitFasta(by: chunck_size, file: true, elem: [0,1]) + // splitFasta split the two files in chunks only if the input fasta is not empty + extractVariantInfoToFastaHeader.out.variant_read1_with_info.subscribe { + if (it.size() > 0) { + split_reads = split_reads.splitFasta(by: chunck_size, file: true, elem: [0,1]) + } + } alignWithMinimap( split_reads, new_genome_fa, From 80a37e1fe77c650e4e87d6ae40d843a39eaaa674 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Fri, 4 Jun 2021 10:42:01 +0100 Subject: [PATCH 14/19] Remove files after successful test --- tests/test_pipeline.sh | 1 + tests/test_pipeline_empty.sh | 1 + 2 files changed, 2 insertions(+) diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index 431f6ae..5d2c62f 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -64,6 +64,7 @@ asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'filtered:' | cut # Clean up after the test rm -rf work .nextflow* \ ${SCRIPT_DIR}/resources/source.vcf \ + ${SCRIPT_DIR}/resources/expected_remap.vcf \ ${SCRIPT_DIR}/resources/remap.vcf \ ${SCRIPT_DIR}/resources/remap_counts.yml \ ${SCRIPT_DIR}/resources/remap_unmapped.vcf \ diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh index edd27c0..b95231c 100755 --- a/tests/test_pipeline_empty.sh +++ b/tests/test_pipeline_empty.sh @@ -49,6 +49,7 @@ asserteq `cat 
${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'filtered:' # Clean up after the test rm -rf work .nextflow* \ ${SCRIPT_DIR}/resources/source_empty.vcf \ + ${SCRIPT_DIR}/resources/expected_remap.vcf \ ${SCRIPT_DIR}/resources/remap_empty.vcf \ ${SCRIPT_DIR}/resources/remap_empty_counts.yml \ ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \ From d52cd4b64eec66b02e8167565f275a169f73bb8b Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Fri, 4 Jun 2021 11:32:19 +0100 Subject: [PATCH 15/19] Add config and limit memory required --- tests/resources/config.yml | 5 +++++ tests/test_pipeline.sh | 1 + tests/test_pipeline_empty.sh | 1 + variant_to_realignment.nf | 2 +- 4 files changed, 8 insertions(+), 1 deletion(-) create mode 100644 tests/resources/config.yml diff --git a/tests/resources/config.yml b/tests/resources/config.yml new file mode 100644 index 0000000..4162a2c --- /dev/null +++ b/tests/resources/config.yml @@ -0,0 +1,5 @@ +executor { + $local { + memory = '6 GB' + } +} \ No newline at end of file diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh index 5d2c62f..c41ae5b 100755 --- a/tests/test_pipeline.sh +++ b/tests/test_pipeline.sh @@ -33,6 +33,7 @@ chr1 3710 . T A 50 PASS . 
GT:GQ 1/1:0 EOT nextflow run ${SOURCE_DIR}/main.nf \ +-config ${SCRIPT_DIR}/resources/config.yml \ --oldgenome ${SCRIPT_DIR}/resources/genome.fa \ --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \ --vcffile ${SCRIPT_DIR}/resources/source.vcf \ diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh index b95231c..a40b4f4 100755 --- a/tests/test_pipeline_empty.sh +++ b/tests/test_pipeline_empty.sh @@ -24,6 +24,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf" EOT nextflow run ${SOURCE_DIR}/main.nf \ +-config ${SCRIPT_DIR}/resources/config.yml \ --oldgenome ${SCRIPT_DIR}/resources/genome.fa \ --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \ --vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \ diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 46e7e24..27e00f7 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -92,7 +92,7 @@ process flankingRegionFasta {  */ process extractVariantInfoToFastaHeader { - memory '8GB' + memory '6GB' input: path "flanking_r1.bed" From 1d41b875df04e66c686b918a80e9735dd35ae3ae Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Fri, 4 Jun 2021 11:41:41 +0100 Subject: [PATCH 16/19] Fix path to test --- .github/workflows/variant_remapping.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml index 15bfbc2..1b5da00 100644 --- a/.github/workflows/variant_remapping.yml +++ b/.github/workflows/variant_remapping.yml @@ -34,7 +34,7 @@ jobs: - name: Test nextflow workflow run: | $CONDA/bin/conda run tests/test_pipeline.sh - $CONDA/bin/conda run test_pipeline_empty.sh + $CONDA/bin/conda run tests/test_pipeline_empty.sh - name: Test with pytest run: From 918b21c704e6e77d0f37c1f52880c517b8ece6c8 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Fri, 4 Jun 2021 11:53:16 +0100 Subject: [PATCH 17/19] fix test --- .../tests/test_reads_to_remapped_variants.py | 2 +- 1 file changed, 
1 insertion(+), 1 deletion(-) diff --git a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py index 457f6b3..a0de9ba 100644 --- a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py +++ b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py @@ -59,7 +59,7 @@ def test_process_bam_file(self): output_file = '/tmp/remapped.vcf' summary_file = '/tmp/summary.yml' out_failed_file = '/tmp/unmapped.vcf' - process_bam_file(bamfile, output_file, out_failed_file, fasta_path, True, 50, summary_file) + process_bam_file([bamfile], output_file, out_failed_file, fasta_path, True, 50, summary_file) expected = [ 'chr2 98 . C CG 50 PASS st=+ GT:GQ 1/1:0\n', From f2ecdb721161cd2cddea7faaf518f7da0e85b249 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Mon, 7 Jun 2021 22:12:58 +0100 Subject: [PATCH 18/19] Use interleaved fasta file (for flank 1 and flank2) to make it easier to split the resulting file in chunk reduce chunk size --- main.nf | 2 +- variant_to_realignment.nf | 75 ++++++++++++++++++++++++--------------- 2 files changed, 48 insertions(+), 29 deletions(-) diff --git a/main.nf b/main.nf index eb981f7..c233a06 100755 --- a/main.nf +++ b/main.nf @@ -179,7 +179,7 @@ process sortVCF { """ bgzip variants_remapped.vcf - bcftools sort -o variants_remapped_sorted.vcf.gz -Oz variants_remapped.vcf.gz + bcftools sort -T . 
-o variants_remapped_sorted.vcf.gz -Oz variants_remapped.vcf.gz """ } diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index 27e00f7..af402b1 100755 --- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -101,8 +101,7 @@ process extractVariantInfoToFastaHeader { path "variants_read2.fa" output: - path "variant_read1.out.fa", emit: variant_read1_with_info - path "variant_read2.out.fa", emit: variant_read2_with_info + path "interleaved.fa", emit: interleaved_fasta // Disable Nextflow string interpolation using single quotes // https://www.nextflow.io/docs/latest/script.html#string-interpolation @@ -118,10 +117,35 @@ process extractVariantInfoToFastaHeader { # Then a newline is inserted between the vcf fields and the sequence # The vcf fields are regarded as comment to the fasta entry. paste -d ' \\n' position.txt vcf_fields.txt <(grep -v '^>' variants_read1.fa) > variant_read1.out.fa - paste -d ' \\n' position.txt vcf_fields.txt <(grep -v '^>' variants_read2.fa) > variant_read2.out.fa + paste -d '\\n' position.txt <(grep -v '^>' variants_read2.fa) > variant_read2.out.fa + + paste variant_read1.out.fa variant_read2.out.fa | paste - - | awk -F "\\t" 'BEGIN {OFS="\\n"} {print $1,$3,$2,$4}' > interleaved.fa ''' } +/* + * Split fasta entries into multiple chunks + */ +process split_fasta { + + input: + path interleaved_fasta + val chunk_size + + output: + path("read_chunk-*"), emit: read_split + + script: + if (interleaved_fasta.size() > 0) + """ + split -a 5 -d -l ${chunk_size * 4} ${interleaved_fasta} read_chunk- + """ + else + """ + ln -s ${interleaved_fasta} read_chunk-00001 + """ +} + /*  * Align sequence with minimap2 */ @@ -135,8 +159,8 @@ process alignWithMinimap { maxRetries 3 input: - // reads contains a list of 2 files (first and second read) - path(reads) + // reads contains paired interleaved (first and second read in the same file) + each path(reads) // indexing is done on the fly so get the genome directly path "genome.fa" val 
flanklength @@ -157,7 +181,7 @@ process alignWithMinimap { # the awk script will convert this comment in valid SAM tag minimap2 -k21 -w11 --sr --frag=yes -A2 -B5 -O6,16 --end-bonus 20 -E2,1 -r50 -p.5 -z 800,200\ -f1000,5000 -n2 -m20 -s40 -g200 -2K50m --heap-sort=yes --secondary=yes -N 2 -y \ - -a genome.fa ${reads[0]} ${reads[1]} | \ + -a genome.fa ${reads} | \ awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \ samtools view -bS - > reads_aligned.bam """ @@ -165,7 +189,7 @@ process alignWithMinimap { """ minimap2 -k19 -w19 -A2 -B5 -O6,16 --end-bonus 20 -E3,1 -s200 -z200 -N50 --min-occ-floor=100 \ --secondary=yes -N 2 -y \ - -a genome.fa ${reads[0]} ${reads[1]} | \ + -a genome.fa ${reads} | \ awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \ samtools view -bS - > reads_aligned.bam """ @@ -221,7 +245,7 @@ process alignWithBowtie { process readsToRemappedVariants { input: - path "reads.*.bam" + path "reads*.bam" path "genome.fa" val flank_length val filter_align_with_secondary @@ -259,7 +283,7 @@ workflow process_split_reads_generic { new_genome_fa_fai flank_length filter_align_with_secondary - chunck_size + chunk_size main: convertVCFToBed(source_vcf) @@ -273,24 +297,19 @@ workflow process_split_reads_generic { flankingRegionFasta.out.variants_read1, flankingRegionFasta.out.variants_read2 ) - // mix creates a single channel with both file - // toList create a single entry channel with the list of two file - split_reads = extractVariantInfoToFastaHeader.out.variant_read2_with_info - .mix(extractVariantInfoToFastaHeader.out.variant_read1_with_info) - .toList() - - // splitFasta split the two files in chunks only if the input fasta is not empty - extractVariantInfoToFastaHeader.out.variant_read1_with_info.subscribe { - if (it.size() > 0) { - split_reads = split_reads.splitFasta(by: chunck_size, file: true, elem: [0,1]) - } - } + split_fasta( + extractVariantInfoToFastaHeader.out.interleaved_fasta, + chunk_size + ) + + 
alignWithMinimap( - split_reads, + split_fasta.out.read_split, new_genome_fa, flank_length ) sortByName(alignWithMinimap.out.reads_aligned_bam) + // Collect all the bam files in the next step readsToRemappedVariants( sortByName.out.reads_aligned_sorted_bam.collect(), new_genome_fa, flank_length, filter_align_with_secondary @@ -313,11 +332,11 @@ workflow process_split_reads { main: flank_length = 50 filter_align_with_secondary = true - chunck_size = 10000000 + chunck_size = 5000000 process_split_reads_generic( source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes, new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary, - chunck_size + chunk_size ) emit: variants_remapped = process_split_reads_generic.out.variants_remapped @@ -338,11 +357,11 @@ workflow process_split_reads_mid { main: flank_length = 2000 filter_align_with_secondary = true - chunck_size = 1000000 + chunk_size = 500000 process_split_reads_generic( source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes, new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary, - chunck_size + chunk_size ) emit: variants_remapped = process_split_reads_generic.out.variants_remapped @@ -363,11 +382,11 @@ workflow process_split_reads_long { main: flank_length = 50000 filter_align_with_secondary = false - chunck_size = 100000 + chunk_size = 50000 process_split_reads_generic( source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes, new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary, - chunck_size + chunk_size ) emit: variants_remapped = process_split_reads_generic.out.variants_remapped From cb63b41331be637fa64ff18cdcd5fd4eb27cdff0 Mon Sep 17 00:00:00 2001 From: Timothee Cezard Date: Mon, 7 Jun 2021 22:22:22 +0100 Subject: [PATCH 19/19] fix typo --- variant_to_realignment.nf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf index af402b1..b0a49cf 100755 
--- a/variant_to_realignment.nf +++ b/variant_to_realignment.nf @@ -332,7 +332,7 @@ workflow process_split_reads { main: flank_length = 50 filter_align_with_secondary = true - chunck_size = 5000000 + chunk_size = 5000000 process_split_reads_generic( source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes, new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,