From 7cbb260276d30912b00fb703d3835fafdcd34b79 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Wed, 26 May 2021 12:08:45 +0200
Subject: [PATCH 01/19] Allow novel reference alleles as long as they do not
 change the reference length, which is where most false positives happened

---
 variant_remapping_tools/reads_to_remapped_variants.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/variant_remapping_tools/reads_to_remapped_variants.py b/variant_remapping_tools/reads_to_remapped_variants.py
index e90361b..058ea9e 100755
--- a/variant_remapping_tools/reads_to_remapped_variants.py
+++ b/variant_remapping_tools/reads_to_remapped_variants.py
@@ -63,7 +63,8 @@ def calculate_new_variant_definition(left_read, right_read, ref_fasta, original_
         new_alts.append(old_ref_conv)
         operations['rac'] = old_ref_conv + '-' + new_ref
         operations['nra'] = None
-        failure_reason = 'Novel Reference Allele'
+        if len(old_ref_conv) != len(new_ref):
+            failure_reason = 'Novel Reference Allele length change'
 
     # 3. Correct zero-length reference sequence
     if len(new_ref) == 0:

From 7e881e4eb3ed741a54223b9fe3f5cf87bcb73e20 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Wed, 26 May 2021 20:05:17 +0200
Subject: [PATCH 02/19] Fix test

---
 .../tests/test_reads_to_remapped_variants.py                    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
index 4696bb2..c8f0c6f 100644
--- a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
+++ b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
@@ -145,7 +145,7 @@ def test_calculate_new_variant_definition(self):
         vcf_rec = ['chr1', '48', '.', 'T', 'A']
         with patch('variant_remapping_tools.reads_to_remapped_variants.fetch_bases', return_value='C'):
             assert calculate_new_variant_definition(left_read, right_read, fasta, vcf_rec) == \
-                   (48, 'C', ['A', 'T'], {'st': '+', 'rac': 'T-C', 'nra': None}, 'Novel Reference Allele')
+                   (48, 'C', ['A', 'T'], {'st': '+', 'rac': 'T-C', 'nra': None}, None)
 
         # Forward strand alignment for Deletion
         left_read = self.mk_read(reference_name='chr2', reference_start=1, reference_end=47, is_reverse=False)

From 4f0734da0dca5b2e1bab62ae979ed539948aa60a Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Wed, 26 May 2021 22:17:41 +0200
Subject: [PATCH 03/19] Swap two columns around in the intermediate bed file
 because the strand column (5) does not support spaces but the name column (4)
 does. Also add a new genotype change

---
 variant_remapping_tools/reads_to_remapped_variants.py | 4 +++-
 variant_to_realignment.nf                             | 8 ++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/variant_remapping_tools/reads_to_remapped_variants.py b/variant_remapping_tools/reads_to_remapped_variants.py
index 058ea9e..375aae5 100755
--- a/variant_remapping_tools/reads_to_remapped_variants.py
+++ b/variant_remapping_tools/reads_to_remapped_variants.py
@@ -99,7 +99,9 @@ def update_vcf_record(reference_name, varpos, new_ref, new_alts, operations, ori
             genotype_str_list = original_vcf_rec[genotype_i].split(':')
             if genotype_str_list[gt_index] == '1/1':
                 genotype_str_list[gt_index] = '0/0'
-                original_vcf_rec[genotype_i] = ':'.join(genotype_str_list)
+            elif 'nra' in operations and genotype_str_list[gt_index] == '0/1':
+                genotype_str_list[gt_index] = '1/2'
+            original_vcf_rec[genotype_i] = ':'.join(genotype_str_list)
 
 
 def fetch_bases(fasta, contig, start, length):
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index 6319d7f..177278a 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -25,8 +25,8 @@ process convertVCFToBed {
     #  - add all VCF fields separated by 2 characters pipe and caret (|^) to avoid impacting existing formatting of
     #    the VCF line. The sub is to protect the % character that would be interpreted by printf otherwise.
     awk -F '\\t' '{ if (!/^#/){ \
-                    printf $1"\\t"$2-1"\\t"$2"\\t"$4"\\t"$1; \
-                    for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); printf "|^"$i }; print ""}; \
+                    printf $1"\\t"$2-1"\\t"$2"\\t"$1; \
+                    for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); printf "|^"$i }; print "\\t"$4}; \
                   }' source.vcf \
                   > variants.bed
     '''
@@ -56,7 +56,7 @@ process flankingRegionBed {
         | bedtools slop  -g genome.chrom.sizes -l $flankingseq -r 0  > flanking_r1.bed
 
     # Adjust the start position of the flank to be one base downstream of the end of variant (\$4 is the reference allele)
-    awk 'BEGIN{OFS="\\t"}{ \$2=\$2+length(\$4); \$3=\$3+length(\$4); print \$0}' variants.bed \
+    awk 'BEGIN{OFS="\\t"}{ \$2=\$2+length(\$5); \$3=\$3+length(\$5); print \$0}' variants.bed \
         | bedtools slop  -g genome.chrom.sizes -l 0 -r $flankingseq  > flanking_r2.bed
     """
 }
@@ -107,7 +107,7 @@ process extractVariantInfoToFastaHeader {
     awk '{print ">" NR }' flanking_r1.bed > position.txt
 
     # Store position of the variant in the file
-    cut -f 5 flanking_r1.bed > vcf_fields.txt
+    cut -f 4 flanking_r1.bed > vcf_fields.txt
 
     # Paste the names, variant bases, then fasta sequences into a new file
     # A space will be inserted between the position and the vcf fields

From d505405464ced38f62c197ad51f3adde7181cd24 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Fri, 28 May 2021 09:03:00 +0200
Subject: [PATCH 04/19] Clarify some of the processes' definitions

---
 main.nf                   | 16 ++++++++--------
 variant_to_realignment.nf |  7 ++++---
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/main.nf b/main.nf
index 621195b..71f6037 100755
--- a/main.nf
+++ b/main.nf
@@ -85,13 +85,13 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long
 
 
 /*
- * Create the header for the output VCF
+ * This process convert the original Header to the remapped header and concatenate it with the remapped VCF records
  */
-process convertAndAddHeaderToVCF {
+process generateRemappedVCF {
 
     input:
-        path "variants_remapped_sorted.vcf"
         path "vcf_header.txt"
+        path "variants_remapped_sorted.vcf"
 
     output:
         path "variants_remapped_sorted_with_header.vcf", emit: final_vcf_with_header
@@ -119,9 +119,9 @@ process convertAndAddHeaderToVCF {
 }
 
 /*
- * Add header to unmapped variant VCF records
+ * This process adds the original header to unmapped variant VCF records and output the results
  */
-process mergeOriginalHeaderAndVCFAndOutput {
+process generateUnmappedVCF {
 
     publishDir outfile_dir,
         overwrite: true,
@@ -235,9 +235,9 @@ workflow finalise {
         summary
 
     main:
-        convertAndAddHeaderToVCF(variants_remapped, vcf_header)
-        mergeOriginalHeaderAndVCFAndOutput(vcf_header, variants_unmapped)
-        sortVCF(convertAndAddHeaderToVCF.out.final_vcf_with_header)
+        generateUnmappedVCF(vcf_header, variants_unmapped)
+        generateRemappedVCF(vcf_header, variants_remapped)
+        sortVCF(generateRemappedVCF.out.final_vcf_with_header)
         normaliseAnOutput(sortVCF.out.variants_remapped_sorted_gz, genome)
         outputStats(summary)
 }
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index 177278a..573dd6e 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -7,7 +7,8 @@ nextflow.enable.dsl=2
 
 
 /*
- * Convert the VCF file to BED format.
+ * Convert the VCF file to BED format storing the VCF line in the "name" column and the reference allele in the
+ * "strand" column.
  */
 process convertVCFToBed {
 
@@ -33,7 +34,7 @@ process convertVCFToBed {
 }
 
 /*
- * Based on variants BED, generate the flanking regions BED files.
+ * Based on variants BED, generate the BED file for each flank.
  */
 process flankingRegionBed {
 
@@ -100,7 +101,7 @@ process extractVariantInfoToFastaHeader {
         path "variant_read1.out.fa", emit: variant_read1_with_info
         path "variant_read2.out.fa", emit: variant_read2_with_info
 
-    // Disable the string interpolation using single quotes
+    // Disable Nextflow string interpolation using single quotes
     // https://www.nextflow.io/docs/latest/script.html#string-interpolation
     '''
     # Store variant position in the file to have a unique name

From f3fc4ce6f6b53850828b6cf6725d19c77cf39cc9 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Mon, 31 May 2021 22:39:43 +0200
Subject: [PATCH 05/19] Ensure the reference base is upper case. Hack to allow
 space in the INFO field

---
 tests/test_pipeline.sh                        |  4 ++--
 .../reads_to_remapped_variants.py             |  5 +++--
 .../tests/test_reads_to_remapped_variants.py  | 19 ++++++++++++++++++-
 variant_to_realignment.nf                     |  9 +++++----
 4 files changed, 28 insertions(+), 9 deletions(-)

diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
index c31472d..4e6ef58 100755
--- a/tests/test_pipeline.sh
+++ b/tests/test_pipeline.sh
@@ -12,7 +12,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf"
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
 #CHROM	POS	ID	REF	 ALT	QUAL 	FILTER	INFO	FORMAT	HG001
-chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANANANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
+chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
 chr1	98	.	C	CG	50	PASS	.	GT:GQ	1/1:0
 chr1	1078	.	G	A	50	PASS	.	GT	1/1
 chr1	1818	.	AAC	A	50	PASS	.	GT:GQ	1/1:0
@@ -34,7 +34,7 @@ ls ${SCRIPT_DIR}/resources/remap.vcf \
 # Build the expected VCF
 cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf"
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG001
-chr2	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANANANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN;st=+	GT:GQ	1/1:0
+chr2	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN;st=+	GT:GQ	1/1:0
 chr2	98	.	C	CG	50	PASS	st=+	GT:GQ	1/1:0
 chr2	1078	.	A	G	50	PASS	st=+;rac=G-A	GT	0/0
 chr2	1818	.	AAC	A	50	PASS	st=+	GT:GQ	1/1:0
diff --git a/variant_remapping_tools/reads_to_remapped_variants.py b/variant_remapping_tools/reads_to_remapped_variants.py
index 375aae5..a581e9a 100755
--- a/variant_remapping_tools/reads_to_remapped_variants.py
+++ b/variant_remapping_tools/reads_to_remapped_variants.py
@@ -25,7 +25,8 @@ def calculate_new_variant_definition(left_read, right_read, ref_fasta, original_
     operations = {}
     # Define new ref and new pos
     new_ref = fetch_bases(ref_fasta, left_read.reference_name, left_read.reference_end + 1,
-                          right_read.reference_start - left_read.reference_end)
+                          right_read.reference_start - left_read.reference_end).upper()
+
     new_pos = left_read.reference_end + 1
 
     # 1. Handle reference strand change
@@ -69,7 +70,7 @@ def calculate_new_variant_definition(left_read, right_read, ref_fasta, original_
     # 3. Correct zero-length reference sequence
     if len(new_ref) == 0:
         new_pos -= 1
-        new_ref = fetch_bases(ref_fasta, left_read.reference_name, new_pos, 1)
+        new_ref = fetch_bases(ref_fasta, left_read.reference_name, new_pos, 1).upper()
         new_alts = [new_ref + alt for alt in new_alts]
         operations['zlr'] = None
 
diff --git a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
index c8f0c6f..457f6b3 100644
--- a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
+++ b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
@@ -7,7 +7,8 @@
 
 
 from variant_remapping_tools.reads_to_remapped_variants import fetch_bases, process_bam_file, \
-    calculate_new_variant_definition, order_reads, link_supplementary, pass_aligned_filtering, group_reads
+    calculate_new_variant_definition, order_reads, link_supplementary, pass_aligned_filtering, group_reads, \
+    update_vcf_record
 
 
 class TestProcess(TestCase):
@@ -167,6 +168,22 @@ def test_calculate_new_variant_definition(self):
             assert calculate_new_variant_definition(left_read, right_read, fasta, vcf_rec) == \
                    (48, 'TTG', ['G'], {'st': '-'}, None)
 
+    def test_update_vcf_record(self):
+        # Allele swap no genotype change
+        original_vcf_rec = ['chr1', '10', '.', 'A', 'T', '50', '.', '.', 'GT', '0/1']
+        update_vcf_record('1', 11, 'T', 'A', {'rac': 'A-T'}, original_vcf_rec)
+        assert original_vcf_rec == ['1', '11', '.', 'T', 'A', '50', '.', 'rac=A-T', 'GT', '0/1']
+
+        # Allele swap with genotype change
+        original_vcf_rec = ['chr1', '10', '.', 'A', 'T', '50', '.', '.', 'GT', '1/1']
+        update_vcf_record('1', 11, 'T', 'A', {'rac': 'A-T'}, original_vcf_rec)
+        assert original_vcf_rec == ['1', '11', '.', 'T', 'A', '50', '.', 'rac=A-T', 'GT', '0/0']
+
+        # Novel reference allele with genotype change
+        original_vcf_rec = ['chr1', '10', '.', 'A', 'T', '50', '.', '.', 'GT', '0/1']
+        update_vcf_record('1', 11, 'C', ['A', 'T'], {'rac': 'A-C', 'nra': None}, original_vcf_rec)
+        assert original_vcf_rec == ['1', '11', '.', 'C', 'A,T', '50', '.', 'rac=A-C;nra', 'GT', '1/2']
+
 
     @staticmethod
     def get_test_resource(resource_name):
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index 573dd6e..8e15d11 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -24,10 +24,11 @@ process convertVCFToBed {
     #  - Switch to 0 based coordinates system
     #  - Add the reference allele so it can be used in flankingRegionBed to adjust the position of the right flank
     #  - add all VCF fields separated by 2 characters pipe and caret (|^) to avoid impacting existing formatting of
-    #    the VCF line. The sub is to protect the % character that would be interpreted by printf otherwise.
+    #    the VCF line. The sub replacing percent is to protect the % character that would be interpreted by printf
+    #    otherwise. the sub replacing space is to prevent bedtools from using them as a field separator
     awk -F '\\t' '{ if (!/^#/){ \
                     printf $1"\\t"$2-1"\\t"$2"\\t"$1; \
-                    for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); printf "|^"$i }; print "\\t"$4}; \
+                    for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); sub(/ /, "£€", $i); printf "|^"$i }; print "\\t"$4}; \
                   }' source.vcf \
                   > variants.bed
     '''
@@ -56,7 +57,7 @@ process flankingRegionBed {
     awk 'BEGIN{OFS="\\t"}{\$2=\$2-1; \$3=\$3-1; print \$0}' variants.bed \
         | bedtools slop  -g genome.chrom.sizes -l $flankingseq -r 0  > flanking_r1.bed
 
-    # Adjust the start position of the flank to be one base downstream of the end of variant (\$4 is the reference allele)
+    # Adjust the start position of the flank to be one base downstream of the end of variant (\$5 is the reference allele)
     awk 'BEGIN{OFS="\\t"}{ \$2=\$2+length(\$5); \$3=\$3+length(\$5); print \$0}' variants.bed \
         | bedtools slop  -g genome.chrom.sizes -l 0 -r $flankingseq  > flanking_r2.bed
     """
@@ -108,7 +109,7 @@ process extractVariantInfoToFastaHeader {
     awk '{print ">" NR }' flanking_r1.bed > position.txt
 
     # Store position of the variant in the file
-    cut -f 4 flanking_r1.bed > vcf_fields.txt
+    cut -f 4 flanking_r1.bed | sed 's/£€/ /g' > vcf_fields.txt
 
     # Paste the names, variant bases, then fasta sequences into a new file
     # A space will be inserted between the position and the vcf fields

From e539f7d805512ba525a4cecedb5f00bb46aef202 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Tue, 1 Jun 2021 08:42:15 +0200
Subject: [PATCH 06/19] add support for multiple whitespace

---
 tests/test_pipeline.sh    | 4 ++--
 variant_to_realignment.nf | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
index 4e6ef58..50c28e5 100755
--- a/tests/test_pipeline.sh
+++ b/tests/test_pipeline.sh
@@ -12,7 +12,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf"
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
 #CHROM	POS	ID	REF	 ALT	QUAL 	FILTER	INFO	FORMAT	HG001
-chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
+chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
 chr1	98	.	C	CG	50	PASS	.	GT:GQ	1/1:0
 chr1	1078	.	G	A	50	PASS	.	GT	1/1
 chr1	1818	.	AAC	A	50	PASS	.	GT:GQ	1/1:0
@@ -34,7 +34,7 @@ ls ${SCRIPT_DIR}/resources/remap.vcf \
 # Build the expected VCF
 cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf"
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG001
-chr2	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN;st=+	GT:GQ	1/1:0
+chr2	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN;st=+	GT:GQ	1/1:0
 chr2	98	.	C	CG	50	PASS	st=+	GT:GQ	1/1:0
 chr2	1078	.	A	G	50	PASS	st=+;rac=G-A	GT	0/0
 chr2	1818	.	AAC	A	50	PASS	st=+	GT:GQ	1/1:0
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index 8e15d11..a63c309 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -28,7 +28,7 @@ process convertVCFToBed {
     #    otherwise. the sub replacing space is to prevent bedtools from using them as a field separator
     awk -F '\\t' '{ if (!/^#/){ \
                     printf $1"\\t"$2-1"\\t"$2"\\t"$1; \
-                    for (i=2; i<=NF; i++){ sub(/%/, "%%", $i); sub(/ /, "£€", $i); printf "|^"$i }; print "\\t"$4}; \
+                    for (i=2; i<=NF; i++){ gsub(/%/, "%%", $i); gsub(/ /, "£€", $i); printf "|^"$i }; print "\\t"$4}; \
                   }' source.vcf \
                   > variants.bed
     '''

From 0720ecfdcdaa2adf21a391e466ef63bc29de7d34 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Tue, 1 Jun 2021 21:03:28 +0100
Subject: [PATCH 07/19] Add filtering of variants that are on the first or on
 the last position of a contig as no flanks can be generated

---
 main.nf                | 26 +++++++++++++++++++++++++-
 tests/test_pipeline.sh |  5 ++++-
 2 files changed, 29 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 71f6037..876df1a 100755
--- a/main.nf
+++ b/main.nf
@@ -64,6 +64,29 @@ process uncompressInputVCF {
             """
 }
 
+
+/*
+ * filter VCF file to remove variant too close the edges of chromosome because we can't get flanking regions
+ */
+process filterInputVCF {
+
+    input:
+        path "source.vcf"
+        path "genome_fai"
+
+    output:
+        path "filtered.vcf", emit: filtered_vcf_file
+
+    script:
+    """
+    awk '{ print \$1"\\t1\\t"\$2;}' genome_fai > edge_region.bed
+    bgzip source.vcf
+    bcftools index source.vcf.gz
+    bcftools filter --regions-file edge_region.bed  -o filtered.vcf source.vcf.gz
+    """
+}
+
+
 /*
  * Store the original VCF header for later use
  */
@@ -250,9 +273,10 @@ workflow {
         prepare_old_genome(params.oldgenome)
         prepare_new_genome(params.newgenome)
         uncompressInputVCF(params.vcffile)
+        filterInputVCF(uncompressInputVCF.out.vcf_file, prepare_old_genome.out.genome_fai)
         storeVCFHeader(uncompressInputVCF.out.vcf_file)
         process_split_reads(
-            uncompressInputVCF.out.vcf_file,
+            filterInputVCF.out.filtered_vcf_file,
             params.oldgenome,
             prepare_old_genome.out.genome_fai,
             prepare_old_genome.out.genome_chrom_sizes,
diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
index 50c28e5..4436e72 100755
--- a/tests/test_pipeline.sh
+++ b/tests/test_pipeline.sh
@@ -12,6 +12,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf"
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
 #CHROM	POS	ID	REF	 ALT	QUAL 	FILTER	INFO	FORMAT	HG001
+chr1	1	.	C	T	50	PASS	.	GT:GQ	1/1:0
 chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
 chr1	98	.	C	CG	50	PASS	.	GT:GQ	1/1:0
 chr1	1078	.	G	A	50	PASS	.	GT	1/1
@@ -48,6 +49,8 @@ diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}
 # Clean up after the test
 rm -rf work .nextflow* \
        ${SCRIPT_DIR}/resources/source.vcf \
-       ${SCRIPT_DIR}/resources/*remap.vcf* \
+       ${SCRIPT_DIR}/resources/remap.vcf \
+       ${SCRIPT_DIR}/resources/remap_counts.yml \
+       ${SCRIPT_DIR}/resources/remap_unmapped.vcf \
        ${SCRIPT_DIR}/resources/new_genome.fa.* \
        ${SCRIPT_DIR}/resources/genome.fa.fai

From 88c7630e1b78a586650aa51fe7ac8e34b53a1e71 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Wed, 2 Jun 2021 14:04:29 +0100
Subject: [PATCH 08/19] Support for unsorted input file and add test

---
 main.nf                | 92 +++++++++++++++++++++++++++++-------------
 tests/test_pipeline.sh |  4 +-
 2 files changed, 66 insertions(+), 30 deletions(-)

diff --git a/main.nf b/main.nf
index 876df1a..970d195 100755
--- a/main.nf
+++ b/main.nf
@@ -76,13 +76,20 @@ process filterInputVCF {
 
     output:
         path "filtered.vcf", emit: filtered_vcf_file
+        path "kept.vcf", emit: kept_vcf_file
+        path "count.yml", emit: count_yml
 
     script:
     """
-    awk '{ print \$1"\\t1\\t"\$2;}' genome_fai > edge_region.bed
-    bgzip source.vcf
-    bcftools index source.vcf.gz
-    bcftools filter --regions-file edge_region.bed  -o filtered.vcf source.vcf.gz
+    awk '{ print \$1"\\t1\\t"\$2-1;}' genome_fai > center_regions.bed
+    awk '{ print \$1"\\t0\\t1"; print \$1"\\t"\$2-1"\\t"\$2;}' genome_fai > edge_regions.bed
+    # Bcftools sort needs the contigs to be set in the header or an index neither of which we can't garantee here.
+    # that's why we're using unix sort
+    awk '\$1 ~ /^#/ {print \$0;next} {print \$0 | "sort -k1,1 -k2,2n"}' source.vcf | tee >(wc -l > all_count.txt) | bgzip -c >  source_sorted.vcf.gz
+    bcftools index source_sorted.vcf.gz
+    bcftools filter --regions-file center_regions.bed  -o kept.vcf source_sorted.vcf.gz
+    bcftools filter --regions-file edge_regions.bed  source_sorted.vcf.gz | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt
+    cat <(cat all_count.txt | awk '{print "all: "\$1}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml
     """
 }
 
@@ -221,6 +228,23 @@ process outputStats {
     """
 }
 
+/*
+ * Concatenate the unmapped variants
+ */
+process combineUnmappedVCF {
+    input:
+        path "variants1.vcf"
+        path "variants2.vcf"
+
+    output:
+        path "merge.vcf", emit: merge_vcf
+
+    """
+    cat variants1.vcf variants2.vcf > merge.vcf
+    """
+}
+
+
 process combineVCF {
     input:
         path "variants1.vcf"
@@ -236,6 +260,7 @@ process combineVCF {
 
 process combineYaml {
     input:
+        path "initial_yml"
         path "round1.yml"
         path "round2.yml"
         path "round3.yml"
@@ -244,7 +269,7 @@ process combineYaml {
         path "merge.yml", emit: merge_yml
 
     """
-    cat round1.yml round2.yml round3.yml > merge.yml
+    cat initial_yml round1.yml round2.yml round3.yml > merge.yml
     """
 }
 
@@ -266,9 +291,33 @@ workflow finalise {
 }
 
 
+//process_with_bowtie
+workflow process_with_bowtie {
+    main:
+        prepare_old_genome(params.oldgenome)
+        prepare_new_genome_bowtie(params.newgenome)
+        uncompressInputVCF(params.vcffile)
+        storeVCFHeader(uncompressInputVCF.out.vcf_file)
+        process_split_reads_with_bowtie(
+            uncompressInputVCF.out.vcf_file,
+            params.oldgenome,
+            prepare_old_genome.out.genome_fai,
+            prepare_old_genome.out.genome_chrom_sizes,
+            params.newgenome,
+            prepare_new_genome_bowtie.out.genome_fai,
+            prepare_new_genome_bowtie.out.bowtie_indexes
+        )
+        finalise(process_split_reads_with_bowtie.out.variants_remapped, storeVCFHeader.out.vcf_header,
+                 params.newgenome, process_split_reads_with_bowtie.out.summary_yml)
+}
+
+
+
+
+
 // process_with_minimap
 // Workflow without a name is the default workflow that gets executed when the file is run through nextflow
-workflow {
+workflow  {
     main:
         prepare_old_genome(params.oldgenome)
         prepare_new_genome(params.newgenome)
@@ -276,7 +325,7 @@ workflow {
         filterInputVCF(uncompressInputVCF.out.vcf_file, prepare_old_genome.out.genome_fai)
         storeVCFHeader(uncompressInputVCF.out.vcf_file)
         process_split_reads(
-            filterInputVCF.out.filtered_vcf_file,
+            filterInputVCF.out.kept_vcf_file,
             params.oldgenome,
             prepare_old_genome.out.genome_fai,
             prepare_old_genome.out.genome_chrom_sizes,
@@ -299,39 +348,24 @@ workflow {
             params.newgenome,
             prepare_new_genome.out.genome_fai
         )
+        combineUnmappedVCF(
+            filterInputVCF.out.filtered_vcf_file,
+            process_split_reads_long.out.variants_unmapped,
+        )
         combineVCF(
             process_split_reads.out.variants_remapped,
             process_split_reads_mid.out.variants_remapped,
             process_split_reads_long.out.variants_remapped
         )
         combineYaml(
+            filterInputVCF.out.count_yml,
             process_split_reads.out.summary_yml,
             process_split_reads_mid.out.summary_yml,
-            process_split_reads_long.out.summary_yml,
+            process_split_reads_long.out.summary_yml
         )
 
         finalise(
-            combineVCF.out.merge_vcf, process_split_reads_long.out.variants_unmapped, storeVCFHeader.out.vcf_header,
+            combineVCF.out.merge_vcf, combineUnmappedVCF.out.merge_vcf, storeVCFHeader.out.vcf_header,
             params.newgenome, combineYaml.out.merge_yml
         )
 }
-
-//process_with_bowtie
-workflow process_with_bowtie {
-    main:
-        prepare_old_genome(params.oldgenome)
-        prepare_new_genome_bowtie(params.newgenome)
-        uncompressInputVCF(params.vcffile)
-        storeVCFHeader(uncompressInputVCF.out.vcf_file)
-        process_split_reads_with_bowtie(
-            uncompressInputVCF.out.vcf_file,
-            params.oldgenome,
-            prepare_old_genome.out.genome_fai,
-            prepare_old_genome.out.genome_chrom_sizes,
-            params.newgenome,
-            prepare_new_genome_bowtie.out.genome_fai,
-            prepare_new_genome_bowtie.out.bowtie_indexes
-        )
-        finalise(process_split_reads_with_bowtie.out.variants_remapped, storeVCFHeader.out.vcf_header,
-                 params.newgenome, process_split_reads_with_bowtie.out.summary_yml)
-}
diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
index 4436e72..91d22a8 100755
--- a/tests/test_pipeline.sh
+++ b/tests/test_pipeline.sh
@@ -16,9 +16,10 @@ chr1	1	.	C	T	50	PASS	.	GT:GQ	1/1:0
 chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
 chr1	98	.	C	CG	50	PASS	.	GT:GQ	1/1:0
 chr1	1078	.	G	A	50	PASS	.	GT	1/1
-chr1	1818	.	AAC	A	50	PASS	.	GT:GQ	1/1:0
 chr1	2030	.	A	TCC	50	PASS	.	GT:GQ	1/1:0
+chr1	1818	.	AAC	A	50	PASS	.	GT:GQ	1/1:0
 chr1	3510	.	T	C	50	PASS	.	GT:GQ	1/1:0
+chr1	3709	.	C	T	50	PASS	.	GT:GQ	1/1:0
 EOT
 
 nextflow run ${SOURCE_DIR}/main.nf \
@@ -43,6 +44,7 @@ chr2	2030	.	A	TCC	50	PASS	st=+	GT:GQ	1/1:0
 chr2	3510	.	T	C	50	PASS	st=+	GT:GQ	1/1:0
 EOT
 
+
 # Compare vs the expected VCF
 diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap.vcf")
 

From 60362ff69f1205b92f8a2ba227c199724cd92ff4 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Thu, 3 Jun 2021 14:15:49 +0100
Subject: [PATCH 09/19] Filter out the variants close to the edge using targets
 rather than regions

---
 main.nf                | 10 +++-------
 tests/test_pipeline.sh | 19 ++++++++++++++++---
 2 files changed, 19 insertions(+), 10 deletions(-)

diff --git a/main.nf b/main.nf
index 970d195..85c79b0 100755
--- a/main.nf
+++ b/main.nf
@@ -83,13 +83,9 @@ process filterInputVCF {
     """
     awk '{ print \$1"\\t1\\t"\$2-1;}' genome_fai > center_regions.bed
     awk '{ print \$1"\\t0\\t1"; print \$1"\\t"\$2-1"\\t"\$2;}' genome_fai > edge_regions.bed
-    # Bcftools sort needs the contigs to be set in the header or an index neither of which we can't garantee here.
-    # that's why we're using unix sort
-    awk '\$1 ~ /^#/ {print \$0;next} {print \$0 | "sort -k1,1 -k2,2n"}' source.vcf | tee >(wc -l > all_count.txt) | bgzip -c >  source_sorted.vcf.gz
-    bcftools index source_sorted.vcf.gz
-    bcftools filter --regions-file center_regions.bed  -o kept.vcf source_sorted.vcf.gz
-    bcftools filter --regions-file edge_regions.bed  source_sorted.vcf.gz | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt
-    cat <(cat all_count.txt | awk '{print "all: "\$1}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml
+    bcftools filter --targets-file center_regions.bed source.vcf | tee kept.vcf |  grep -cv '^#' > all_count.txt
+    bcftools filter --targets-file edge_regions.bed  source.vcf | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt
+    cat <(cat *_count.txt | awk '{sum += \$1} END{print "all: "sum}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml
     """
 }
 
diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
index 91d22a8..431f6ae 100755
--- a/tests/test_pipeline.sh
+++ b/tests/test_pipeline.sh
@@ -2,6 +2,15 @@
 
 set -Eeuo pipefail
 
+function asserteq() {
+  if [[ ! "$1" -eq "$2" ]]
+  then
+    echo "Assertion Error: $1 not equal to $2"
+    exit 1
+  fi
+
+}
+
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 SOURCE_DIR=$(dirname $SCRIPT_DIR)
 
@@ -12,14 +21,15 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf"
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
 #CHROM	POS	ID	REF	 ALT	QUAL 	FILTER	INFO	FORMAT	HG001
-chr1	1	.	C	T	50	PASS	.	GT:GQ	1/1:0
+chr1	1	.	CG	TG	50	PASS	.	GT:GQ	1/1:0
 chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
 chr1	98	.	C	CG	50	PASS	.	GT:GQ	1/1:0
 chr1	1078	.	G	A	50	PASS	.	GT	1/1
 chr1	2030	.	A	TCC	50	PASS	.	GT:GQ	1/1:0
 chr1	1818	.	AAC	A	50	PASS	.	GT:GQ	1/1:0
 chr1	3510	.	T	C	50	PASS	.	GT:GQ	1/1:0
-chr1	3709	.	C	T	50	PASS	.	GT:GQ	1/1:0
+chr1	3709	.	CA	TA	50	PASS	.	GT:GQ	1/1:0
+chr1	3710	.	T	A	50	PASS	.	GT:GQ	1/1:0
 EOT
 
 nextflow run ${SOURCE_DIR}/main.nf \
@@ -44,10 +54,13 @@ chr2	2030	.	A	TCC	50	PASS	st=+	GT:GQ	1/1:0
 chr2	3510	.	T	C	50	PASS	st=+	GT:GQ	1/1:0
 EOT
 
-
 # Compare vs the expected VCF
 diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap.vcf")
 
+asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'all:' | cut -d ' ' -f 2`  9
+asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'filtered:' | cut -d ' ' -f 2`  2
+
+
 # Clean up after the test
 rm -rf work .nextflow* \
        ${SCRIPT_DIR}/resources/source.vcf \

From d8666ab07a3a666281c6227bbf0ac59460e2edfb Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Thu, 3 Jun 2021 14:30:56 +0100
Subject: [PATCH 10/19] Increase memory for extractVariantInfoToFastaHeader,
 which failed occasionally

---
 variant_to_realignment.nf | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index a63c309..9b738b9 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -92,6 +92,8 @@ process flankingRegionFasta {
  */
 process extractVariantInfoToFastaHeader {
 
+    memory 8GB
+
     input:  
         path "flanking_r1.bed"
         path "flanking_r2.bed"
@@ -108,7 +110,7 @@ process extractVariantInfoToFastaHeader {
     # Store variant position in the file to have a unique name
     awk '{print ">" NR }' flanking_r1.bed > position.txt
 
-    # Store position of the variant in the file
+    # Store position of the variant in the file and replace '£€' with the original whitespace from convertVCFToBed
     cut -f 4 flanking_r1.bed | sed 's/£€/ /g' > vcf_fields.txt
 
     # Paste the names, variant bases, then fasta sequences into a new file

From 5037c83cad55accf11236d24d150471ffbaa8b83 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Thu, 3 Jun 2021 16:25:05 +0100
Subject: [PATCH 11/19] Don't use grep -c because it exits with status 1 when
 nothing is found; fix memory syntax

---
 main.nf                   | 2 +-
 variant_to_realignment.nf | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/main.nf b/main.nf
index 85c79b0..eb981f7 100755
--- a/main.nf
+++ b/main.nf
@@ -83,7 +83,7 @@ process filterInputVCF {
     """
     awk '{ print \$1"\\t1\\t"\$2-1;}' genome_fai > center_regions.bed
     awk '{ print \$1"\\t0\\t1"; print \$1"\\t"\$2-1"\\t"\$2;}' genome_fai > edge_regions.bed
-    bcftools filter --targets-file center_regions.bed source.vcf | tee kept.vcf |  grep -cv '^#' > all_count.txt
+    bcftools filter --targets-file center_regions.bed source.vcf | tee kept.vcf |  grep -v '^#' | wc -l > all_count.txt
     bcftools filter --targets-file edge_regions.bed  source.vcf | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt
     cat <(cat *_count.txt | awk '{sum += \$1} END{print "all: "sum}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml
     """
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index 9b738b9..afa42cd 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -92,7 +92,7 @@ process flankingRegionFasta {
  */
 process extractVariantInfoToFastaHeader {
 
-    memory 8GB
+    memory '8GB'
 
     input:  
         path "flanking_r1.bed"

From 7cef6cb184950f3c8a15bf5d0d073fb7bcb9e835 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Thu, 3 Jun 2021 23:47:41 +0100
Subject: [PATCH 12/19] Use scatter gather to parallelize the alignments

---
 .../reads_to_remapped_variants.py             | 47 ++++++++++---------
 variant_to_realignment.nf                     | 42 ++++++++++++-----
 2 files changed, 53 insertions(+), 36 deletions(-)

diff --git a/variant_remapping_tools/reads_to_remapped_variants.py b/variant_remapping_tools/reads_to_remapped_variants.py
index a581e9a..5cfef47 100755
--- a/variant_remapping_tools/reads_to_remapped_variants.py
+++ b/variant_remapping_tools/reads_to_remapped_variants.py
@@ -241,36 +241,37 @@ def link_supplementary(primary_group, supplementary_group):
     return dict(primary_to_supplementary)
 
 
-def process_bam_file(bam_file_path, output_file, out_failed_file, new_genome,
+def process_bam_file(bam_file_paths, output_file, out_failed_file, new_genome,
                      filter_align_with_secondary, flank_length, summary_file):
     counter = Counter()
     fasta = pysam.FastaFile(new_genome)
 
     with open(output_file, 'w') as outfile, open(out_failed_file, 'w') as out_failed:
-        for primary_group, supplementary_group, secondary_group in group_reads(bam_file_path):
-            counter['total'] += 1
-            primary_to_supplementary = link_supplementary(primary_group, supplementary_group)
-            # Retrieve the full VCF record from the bam vr tag
-            original_vcf_rec = primary_group[0].get_tag('vr').split('|^')
-            if pass_basic_filtering(primary_group, secondary_group, primary_to_supplementary, counter, filter_align_with_secondary):
-                left_read, right_read = order_reads(primary_group, primary_to_supplementary)
-                if pass_aligned_filtering(left_read, right_read, counter):
-                    varpos, new_ref, new_alts, ops, failure_reason = \
-                        calculate_new_variant_definition(left_read, right_read, fasta, original_vcf_rec)
-                    if not failure_reason:
-                        counter['Remapped'] += 1
-                        update_vcf_record(left_read.reference_name, varpos, new_ref, new_alts, ops, original_vcf_rec)
-                        output_alignment(original_vcf_rec, outfile)
+        for bam_file_path in bam_file_paths:
+            for primary_group, supplementary_group, secondary_group in group_reads(bam_file_path):
+                counter['total'] += 1
+                primary_to_supplementary = link_supplementary(primary_group, supplementary_group)
+                # Retrieve the full VCF record from the bam vr tag
+                original_vcf_rec = primary_group[0].get_tag('vr').split('|^')
+                if pass_basic_filtering(primary_group, secondary_group, primary_to_supplementary, counter, filter_align_with_secondary):
+                    left_read, right_read = order_reads(primary_group, primary_to_supplementary)
+                    if pass_aligned_filtering(left_read, right_read, counter):
+                        varpos, new_ref, new_alts, ops, failure_reason = \
+                            calculate_new_variant_definition(left_read, right_read, fasta, original_vcf_rec)
+                        if not failure_reason:
+                            counter['Remapped'] += 1
+                            update_vcf_record(left_read.reference_name, varpos, new_ref, new_alts, ops, original_vcf_rec)
+                            output_alignment(original_vcf_rec, outfile)
+                        else:
+                            # Currently the alignment is not precise enough to ensure that the allele change for INDEL and
+                            # novel reference allele are correct. So we skip them.
+                            # TODO: add realignment confirmation see #14 and EVA-2417
+                            counter[failure_reason] += 1
+                            output_alignment(original_vcf_rec, out_failed)
                     else:
-                        # Currently the alignment is not precise enough to ensure that the allele change for INDEL and
-                        # novel reference allele are correct. So we skip them.
-                        # TODO: add realignment confirmation see #14 and EVA-2417
-                        counter[failure_reason] += 1
                         output_alignment(original_vcf_rec, out_failed)
                 else:
                     output_alignment(original_vcf_rec, out_failed)
-            else:
-                output_alignment(original_vcf_rec, out_failed)
     with open(summary_file, 'w') as open_summary:
         yaml.safe_dump({f'Flank_{flank_length}': dict(counter)}, open_summary)
 
@@ -281,7 +282,7 @@ def main():
                    'separate file.')
 
     parser = argparse.ArgumentParser(description=description, formatter_class=RawTextHelpFormatter)
-    parser.add_argument('-i', '--bam', type=str, required=True,
+    parser.add_argument('-i', '--bams', type=str, required=True, nargs='+',
                         help='Input BAM file with remapped flanking regions')
     parser.add_argument('-o', '--outfile', type=str, required=True,
                         help='Output VCF file with remapped variants')
@@ -297,7 +298,7 @@ def main():
     args = parser.parse_args()
 
     process_bam_file(
-        bam_file_path=args.bam,
+        bam_file_paths=args.bams,
         output_file=args.outfile,
         out_failed_file=args.out_failed_file,
         new_genome=args.newgenome,
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index afa42cd..89c6976 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -135,8 +135,8 @@ process alignWithMinimap {
     maxRetries 3
 
     input:
-        path "variant_read1.fa"
-        path "variant_read2.fa"
+        // reads contains a list of 2 files (first and second read)
+        path(reads)
         // indexing is done on the fly so get the genome directly
         path "genome.fa"
         val flanklength
@@ -157,7 +157,7 @@ process alignWithMinimap {
         # the awk script will convert this comment in valid SAM tag
         minimap2 -k21 -w11 --sr --frag=yes -A2 -B5 -O6,16 --end-bonus 20 -E2,1 -r50 -p.5 -z 800,200\
                  -f1000,5000 -n2 -m20 -s40 -g200 -2K50m --heap-sort=yes --secondary=yes -N 2 -y \
-                 -a genome.fa variant_read1.fa variant_read2.fa | \
+                 -a genome.fa ${reads[0]} ${reads[1]} | \
                  awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \
                  samtools view -bS - > reads_aligned.bam
         """
@@ -165,7 +165,7 @@ process alignWithMinimap {
         """
         minimap2 -k19 -w19 -A2 -B5 -O6,16 --end-bonus 20 -E3,1 -s200 -z200 -N50 --min-occ-floor=100 \
                  --secondary=yes -N 2 -y \
-                 -a genome.fa variant_read1.fa variant_read2.fa | \
+                 -a genome.fa ${reads[0]} ${reads[1]} | \
                  awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \
                  samtools view -bS - > reads_aligned.bam
         """
@@ -221,7 +221,7 @@ process alignWithBowtie {
 process readsToRemappedVariants {
 
     input:
-        path "reads_aligned.bam"
+        path "reads.*.bam"
         path "genome.fa"
         val flank_length
         val filter_align_with_secondary
@@ -235,14 +235,14 @@ process readsToRemappedVariants {
         if (filter_align_with_secondary)
             """
             # Ensure that we will use the reads_to_remapped_variants.py from this repo
-            ${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads_aligned.bam \
+            ${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads*.bam \
                 -o variants_remapped.vcf  --newgenome genome.fa --out_failed_file variants_unmapped.vcf \
                 --flank_length $flank_length --summary summary.yml --filter_align_with_secondary
             """
         else
             """
             # Ensure that we will use the reads_to_remapped_variants.py from this repo
-            ${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads_aligned.bam \
+            ${baseDir}/variant_remapping_tools/reads_to_remapped_variants.py -i reads*.bam \
                 -o variants_remapped.vcf  --newgenome genome.fa --out_failed_file variants_unmapped.vcf \
                 --flank_length $flank_length --summary summary.yml
            """
@@ -259,6 +259,7 @@ workflow process_split_reads_generic {
         new_genome_fa_fai
         flank_length
         filter_align_with_secondary
+        chunck_size
 
     main:
         convertVCFToBed(source_vcf)
@@ -271,15 +272,24 @@ workflow process_split_reads_generic {
             flankingRegionBed.out.flanking_r1_bed, flankingRegionBed.out.flanking_r2_bed,
             flankingRegionFasta.out.variants_read1, flankingRegionFasta.out.variants_read2
         )
+
+        // This will split the fasta file into chunks
+        // mix creates a single channel with both file
+        // toList create a single entry channel with the list of two file
+        // splitFasta split the two files in chunks
+        split_reads = extractVariantInfoToFastaHeader.out.variant_read1_with_info
+          .mix(extractVariantInfoToFastaHeader.out.variant_read2_with_info)
+          .toList()
+          .splitFasta(by: chunck_size, file: true, elem: [0,1])
+
         alignWithMinimap(
-            extractVariantInfoToFastaHeader.out.variant_read1_with_info,
-            extractVariantInfoToFastaHeader.out.variant_read2_with_info,
+            split_reads,
             new_genome_fa,
             flank_length
         )
         sortByName(alignWithMinimap.out.reads_aligned_bam)
         readsToRemappedVariants(
-            sortByName.out.reads_aligned_sorted_bam, new_genome_fa,
+            sortByName.out.reads_aligned_sorted_bam.collect(), new_genome_fa,
             flank_length, filter_align_with_secondary
         )
 
@@ -300,9 +310,11 @@ workflow process_split_reads {
     main:
         flank_length = 50
         filter_align_with_secondary = true
+        chunck_size = 10000000
         process_split_reads_generic(
             source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
-            new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary
+            new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
+            chunck_size
         )
     emit:
         variants_remapped = process_split_reads_generic.out.variants_remapped
@@ -323,9 +335,11 @@ workflow process_split_reads_mid {
     main:
         flank_length = 2000
         filter_align_with_secondary = true
+        chunck_size = 1000000
         process_split_reads_generic(
             source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
-            new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary
+            new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
+            chunck_size
         )
     emit:
         variants_remapped = process_split_reads_generic.out.variants_remapped
@@ -346,9 +360,11 @@ workflow process_split_reads_long {
     main:
         flank_length = 50000
         filter_align_with_secondary = false
+        chunck_size = 100000
         process_split_reads_generic(
             source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
-            new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary
+            new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
+            chunck_size
         )
     emit:
         variants_remapped = process_split_reads_generic.out.variants_remapped

From af34f9c410722082ccb1f355f45583b5604f0fe4 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Fri, 4 Jun 2021 10:40:20 +0100
Subject: [PATCH 13/19] Only split the fasta files if they are not empty,
 otherwise leave them as they are

---
 .github/workflows/variant_remapping.yml |  3 +-
 tests/test_pipeline_empty.sh            | 56 +++++++++++++++++++++++++
 variant_to_realignment.nf               | 13 +++---
 3 files changed, 66 insertions(+), 6 deletions(-)
 create mode 100755 tests/test_pipeline_empty.sh

diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml
index ec6c8cc..15bfbc2 100644
--- a/.github/workflows/variant_remapping.yml
+++ b/.github/workflows/variant_remapping.yml
@@ -32,8 +32,9 @@ jobs:
         $CONDA/bin/conda run pip install -q -r requirements.txt
 
     - name: Test nextflow workflow
-      run:
+      run: |
         $CONDA/bin/conda run tests/test_pipeline.sh
+        $CONDA/bin/conda run test_pipeline_empty.sh
 
     - name: Test with pytest
       run:
diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh
new file mode 100755
index 0000000..edd27c0
--- /dev/null
+++ b/tests/test_pipeline_empty.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+
+set -Eeuo pipefail
+
+function asserteq() {
+  if [[ ! "$1" -eq "$2" ]]
+  then
+    echo "Assertion Error: $1 not equal to $2"
+    exit 1
+  fi
+
+}
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+SOURCE_DIR=$(dirname $SCRIPT_DIR)
+
+# Build the Source VCF
+cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf"
+##fileformat=VCFv4.3
+##INFO=<ID=COMMENT,Number=1,Type=String,Description="Comment">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+#CHROM	POS	ID	REF	 ALT	QUAL 	FILTER	INFO	FORMAT	HG001
+EOT
+
+nextflow run ${SOURCE_DIR}/main.nf \
+--oldgenome ${SCRIPT_DIR}/resources/genome.fa \
+--newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
+--vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \
+--outfile ${SCRIPT_DIR}/resources/remap_empty.vcf
+
+# Check the presence of the output file
+ls ${SCRIPT_DIR}/resources/remap_empty.vcf \
+   ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \
+   ${SCRIPT_DIR}/resources/remap_empty_counts.yml
+
+# Build the expected VCF
+cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf"
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG001
+EOT
+
+# Compare vs the expected VCF
+diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap_empty.vcf")
+
+asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'all:' | cut -d ' ' -f 2`  0
+asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'filtered:' | cut -d ' ' -f 2`  0
+
+
+# Clean up after the test
+rm -rf work .nextflow* \
+       ${SCRIPT_DIR}/resources/source_empty.vcf \
+       ${SCRIPT_DIR}/resources/remap_empty.vcf \
+       ${SCRIPT_DIR}/resources/remap_empty_counts.yml \
+       ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \
+       ${SCRIPT_DIR}/resources/new_genome.fa.* \
+       ${SCRIPT_DIR}/resources/genome.fa.fai
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index 89c6976..46e7e24 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -273,15 +273,18 @@ workflow process_split_reads_generic {
             flankingRegionFasta.out.variants_read1, flankingRegionFasta.out.variants_read2
         )
 
-        // This will split the fasta file into chunks
         // mix creates a single channel with both file
         // toList create a single entry channel with the list of two file
-        // splitFasta split the two files in chunks
-        split_reads = extractVariantInfoToFastaHeader.out.variant_read1_with_info
-          .mix(extractVariantInfoToFastaHeader.out.variant_read2_with_info)
+        split_reads = extractVariantInfoToFastaHeader.out.variant_read2_with_info
+          .mix(extractVariantInfoToFastaHeader.out.variant_read1_with_info)
           .toList()
-          .splitFasta(by: chunck_size, file: true, elem: [0,1])
 
+        // splitFasta split the two files in chunks only if the input fasta is not empty
+        extractVariantInfoToFastaHeader.out.variant_read1_with_info.subscribe {
+            if (it.size() > 0) {
+                split_reads = split_reads.splitFasta(by: chunck_size, file: true, elem: [0,1])
+            }
+        }
         alignWithMinimap(
             split_reads,
             new_genome_fa,

From 80a37e1fe77c650e4e87d6ae40d843a39eaaa674 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Fri, 4 Jun 2021 10:42:01 +0100
Subject: [PATCH 14/19] Remove files after successful test

---
 tests/test_pipeline.sh       | 1 +
 tests/test_pipeline_empty.sh | 1 +
 2 files changed, 2 insertions(+)

diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
index 431f6ae..5d2c62f 100755
--- a/tests/test_pipeline.sh
+++ b/tests/test_pipeline.sh
@@ -64,6 +64,7 @@ asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'filtered:' | cut
 # Clean up after the test
 rm -rf work .nextflow* \
        ${SCRIPT_DIR}/resources/source.vcf \
+       ${SCRIPT_DIR}/resources/expected_remap.vcf \
        ${SCRIPT_DIR}/resources/remap.vcf \
        ${SCRIPT_DIR}/resources/remap_counts.yml \
        ${SCRIPT_DIR}/resources/remap_unmapped.vcf \
diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh
index edd27c0..b95231c 100755
--- a/tests/test_pipeline_empty.sh
+++ b/tests/test_pipeline_empty.sh
@@ -49,6 +49,7 @@ asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'filtered:'
 # Clean up after the test
 rm -rf work .nextflow* \
        ${SCRIPT_DIR}/resources/source_empty.vcf \
+       ${SCRIPT_DIR}/resources/expected_remap.vcf \
        ${SCRIPT_DIR}/resources/remap_empty.vcf \
        ${SCRIPT_DIR}/resources/remap_empty_counts.yml \
        ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \

From d52cd4b64eec66b02e8167565f275a169f73bb8b Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Fri, 4 Jun 2021 11:32:19 +0100
Subject: [PATCH 15/19] Add config and limit memory required

---
 tests/resources/config.yml   | 5 +++++
 tests/test_pipeline.sh       | 1 +
 tests/test_pipeline_empty.sh | 1 +
 variant_to_realignment.nf    | 2 +-
 4 files changed, 8 insertions(+), 1 deletion(-)
 create mode 100644 tests/resources/config.yml

diff --git a/tests/resources/config.yml b/tests/resources/config.yml
new file mode 100644
index 0000000..4162a2c
--- /dev/null
+++ b/tests/resources/config.yml
@@ -0,0 +1,5 @@
+executor {
+  $local {
+      memory = '6 GB'
+  }
+}
\ No newline at end of file
diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
index 5d2c62f..c41ae5b 100755
--- a/tests/test_pipeline.sh
+++ b/tests/test_pipeline.sh
@@ -33,6 +33,7 @@ chr1	3710	.	T	A	50	PASS	.	GT:GQ	1/1:0
 EOT
 
 nextflow run ${SOURCE_DIR}/main.nf \
+-config ${SCRIPT_DIR}/resources/config.yml \
 --oldgenome ${SCRIPT_DIR}/resources/genome.fa \
 --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
 --vcffile ${SCRIPT_DIR}/resources/source.vcf \
diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh
index b95231c..a40b4f4 100755
--- a/tests/test_pipeline_empty.sh
+++ b/tests/test_pipeline_empty.sh
@@ -24,6 +24,7 @@ cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf"
 EOT
 
 nextflow run ${SOURCE_DIR}/main.nf \
+-config ${SCRIPT_DIR}/resources/config.yml \
 --oldgenome ${SCRIPT_DIR}/resources/genome.fa \
 --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
 --vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index 46e7e24..27e00f7 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -92,7 +92,7 @@ process flankingRegionFasta {
  */
 process extractVariantInfoToFastaHeader {
 
-    memory '8GB'
+    memory '6GB'
 
     input:  
         path "flanking_r1.bed"

From 1d41b875df04e66c686b918a80e9735dd35ae3ae Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Fri, 4 Jun 2021 11:41:41 +0100
Subject: [PATCH 16/19] Fix path to test

---
 .github/workflows/variant_remapping.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml
index 15bfbc2..1b5da00 100644
--- a/.github/workflows/variant_remapping.yml
+++ b/.github/workflows/variant_remapping.yml
@@ -34,7 +34,7 @@ jobs:
     - name: Test nextflow workflow
       run: |
         $CONDA/bin/conda run tests/test_pipeline.sh
-        $CONDA/bin/conda run test_pipeline_empty.sh
+        $CONDA/bin/conda run tests/test_pipeline_empty.sh
 
     - name: Test with pytest
       run:

From 918b21c704e6e77d0f37c1f52880c517b8ece6c8 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Fri, 4 Jun 2021 11:53:16 +0100
Subject: [PATCH 17/19] fix test

---
 .../tests/test_reads_to_remapped_variants.py                    | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
index 457f6b3..a0de9ba 100644
--- a/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
+++ b/variant_remapping_tools/tests/test_reads_to_remapped_variants.py
@@ -59,7 +59,7 @@ def test_process_bam_file(self):
         output_file = '/tmp/remapped.vcf'
         summary_file = '/tmp/summary.yml'
         out_failed_file = '/tmp/unmapped.vcf'
-        process_bam_file(bamfile, output_file, out_failed_file, fasta_path, True, 50, summary_file)
+        process_bam_file([bamfile], output_file, out_failed_file, fasta_path, True, 50, summary_file)
 
         expected = [
             'chr2	98	.	C	CG	50	PASS	st=+	GT:GQ	1/1:0\n',

From f2ecdb721161cd2cddea7faaf518f7da0e85b249 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Mon, 7 Jun 2021 22:12:58 +0100
Subject: [PATCH 18/19] Use interleaved fasta file (for flank 1 and flank 2) to
 make it easier to split the resulting file into chunks; reduce chunk size

---
 main.nf                   |  2 +-
 variant_to_realignment.nf | 75 ++++++++++++++++++++++++---------------
 2 files changed, 48 insertions(+), 29 deletions(-)

diff --git a/main.nf b/main.nf
index eb981f7..c233a06 100755
--- a/main.nf
+++ b/main.nf
@@ -179,7 +179,7 @@ process sortVCF {
 
     """
     bgzip variants_remapped.vcf
-    bcftools sort -o variants_remapped_sorted.vcf.gz -Oz variants_remapped.vcf.gz
+    bcftools sort -T . -o variants_remapped_sorted.vcf.gz -Oz variants_remapped.vcf.gz
     """
 }
 
diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index 27e00f7..af402b1 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -101,8 +101,7 @@ process extractVariantInfoToFastaHeader {
         path "variants_read2.fa"
 
     output:
-        path "variant_read1.out.fa", emit: variant_read1_with_info
-        path "variant_read2.out.fa", emit: variant_read2_with_info
+        path "interleaved.fa", emit: interleaved_fasta
 
     // Disable Nextflow string interpolation using single quotes
     // https://www.nextflow.io/docs/latest/script.html#string-interpolation
@@ -118,10 +117,35 @@ process extractVariantInfoToFastaHeader {
     # Then a newline is inserted between the vcf fields and the sequence
     # The vcf fields are regarded as comment to the fasta entry.
     paste -d ' \\n' position.txt vcf_fields.txt <(grep -v '^>' variants_read1.fa) > variant_read1.out.fa
-    paste -d ' \\n' position.txt vcf_fields.txt <(grep -v '^>' variants_read2.fa) > variant_read2.out.fa
+    paste -d '\\n' position.txt <(grep -v '^>' variants_read2.fa) > variant_read2.out.fa
+
+    paste variant_read1.out.fa variant_read2.out.fa | paste - - | awk -F "\\t" 'BEGIN {OFS="\\n"} {print $1,$3,$2,$4}' > interleaved.fa
     '''
 }
 
+/*
+ * Split fasta entries into multiple chunks
+ */
+process split_fasta {
+
+    input:
+        path interleaved_fasta
+        val chunk_size
+
+    output:
+        path("read_chunk-*"), emit: read_split
+
+    script:
+    if (interleaved_fasta.size() > 0)
+        """
+        split -a 5 -d -l ${chunk_size * 4} ${interleaved_fasta} read_chunk-
+        """
+    else
+        """
+        ln -s ${interleaved_fasta} read_chunk-00001
+        """
+}
+
 /*
  * Align sequence with minimap2
  */
@@ -135,8 +159,8 @@ process alignWithMinimap {
     maxRetries 3
 
     input:
-        // reads contains a list of 2 files (first and second read)
-        path(reads)
+        // reads contains paired interleaved (first and second read in the same file)
+        each path(reads)
         // indexing is done on the fly so get the genome directly
         path "genome.fa"
         val flanklength
@@ -157,7 +181,7 @@ process alignWithMinimap {
         # the awk script will convert this comment in valid SAM tag
         minimap2 -k21 -w11 --sr --frag=yes -A2 -B5 -O6,16 --end-bonus 20 -E2,1 -r50 -p.5 -z 800,200\
                  -f1000,5000 -n2 -m20 -s40 -g200 -2K50m --heap-sort=yes --secondary=yes -N 2 -y \
-                 -a genome.fa ${reads[0]} ${reads[1]} | \
+                 -a genome.fa ${reads} | \
                  awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \
                  samtools view -bS - > reads_aligned.bam
         """
@@ -165,7 +189,7 @@ process alignWithMinimap {
         """
         minimap2 -k19 -w19 -A2 -B5 -O6,16 --end-bonus 20 -E3,1 -s200 -z200 -N50 --min-occ-floor=100 \
                  --secondary=yes -N 2 -y \
-                 -a genome.fa ${reads[0]} ${reads[1]} | \
+                 -a genome.fa ${reads} | \
                  awk -F '\\t' 'BEGIN{OFS="\\t"}{if(!/^@/){\$NF="vr:Z:"\$NF}; print \$0;}' | \
                  samtools view -bS - > reads_aligned.bam
         """
@@ -221,7 +245,7 @@ process alignWithBowtie {
 process readsToRemappedVariants {
 
     input:
-        path "reads.*.bam"
+        path "reads*.bam"
         path "genome.fa"
         val flank_length
         val filter_align_with_secondary
@@ -259,7 +283,7 @@ workflow process_split_reads_generic {
         new_genome_fa_fai
         flank_length
         filter_align_with_secondary
-        chunck_size
+        chunk_size
 
     main:
         convertVCFToBed(source_vcf)
@@ -273,24 +297,19 @@ workflow process_split_reads_generic {
             flankingRegionFasta.out.variants_read1, flankingRegionFasta.out.variants_read2
         )
 
-        // mix creates a single channel with both file
-        // toList create a single entry channel with the list of two file
-        split_reads = extractVariantInfoToFastaHeader.out.variant_read2_with_info
-          .mix(extractVariantInfoToFastaHeader.out.variant_read1_with_info)
-          .toList()
-
-        // splitFasta split the two files in chunks only if the input fasta is not empty
-        extractVariantInfoToFastaHeader.out.variant_read1_with_info.subscribe {
-            if (it.size() > 0) {
-                split_reads = split_reads.splitFasta(by: chunck_size, file: true, elem: [0,1])
-            }
-        }
+        split_fasta(
+            extractVariantInfoToFastaHeader.out.interleaved_fasta,
+            chunk_size
+        )
+
+
         alignWithMinimap(
-            split_reads,
+            split_fasta.out.read_split,
             new_genome_fa,
             flank_length
         )
         sortByName(alignWithMinimap.out.reads_aligned_bam)
+        // Collect all the bam files in the next step
         readsToRemappedVariants(
             sortByName.out.reads_aligned_sorted_bam.collect(), new_genome_fa,
             flank_length, filter_align_with_secondary
@@ -313,11 +332,11 @@ workflow process_split_reads {
     main:
         flank_length = 50
         filter_align_with_secondary = true
-        chunck_size = 10000000
+        chunck_size = 5000000
         process_split_reads_generic(
             source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
             new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
-            chunck_size
+            chunk_size
         )
     emit:
         variants_remapped = process_split_reads_generic.out.variants_remapped
@@ -338,11 +357,11 @@ workflow process_split_reads_mid {
     main:
         flank_length = 2000
         filter_align_with_secondary = true
-        chunck_size = 1000000
+        chunk_size = 500000
         process_split_reads_generic(
             source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
             new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
-            chunck_size
+            chunk_size
         )
     emit:
         variants_remapped = process_split_reads_generic.out.variants_remapped
@@ -363,11 +382,11 @@ workflow process_split_reads_long {
     main:
         flank_length = 50000
         filter_align_with_secondary = false
-        chunck_size = 100000
+        chunk_size = 50000
         process_split_reads_generic(
             source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
             new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,
-            chunck_size
+            chunk_size
         )
     emit:
         variants_remapped = process_split_reads_generic.out.variants_remapped

From cb63b41331be637fa64ff18cdcd5fd4eb27cdff0 Mon Sep 17 00:00:00 2001
From: Timothee Cezard <tcezard@ebi.ac.uk>
Date: Mon, 7 Jun 2021 22:22:22 +0100
Subject: [PATCH 19/19] Fix typo: rename chunck_size to chunk_size in process_split_reads

---
 variant_to_realignment.nf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/variant_to_realignment.nf b/variant_to_realignment.nf
index af402b1..b0a49cf 100755
--- a/variant_to_realignment.nf
+++ b/variant_to_realignment.nf
@@ -332,7 +332,7 @@ workflow process_split_reads {
     main:
         flank_length = 50
         filter_align_with_secondary = true
-        chunck_size = 5000000
+        chunk_size = 5000000
         process_split_reads_generic(
             source_vcf, old_genome_fa, old_genome_fa_fai, old_genome_chrom_sizes,
             new_genome_fa, new_genome_fa_fai, flank_length, filter_align_with_secondary,