Merge pull request #29 from tcezard/EVA2451_allow_novel_ref_allele

EVA-2451 - allow novel ref allele
EBIvariation · Jun 14, 2021 · ab8fe5f · ab8fe5f
2 parents e3a6754 + cb63b41
commit ab8fe5f
Show file tree

Hide file tree

Showing 8 changed files with 297 additions and 95 deletions.
diff --git a/.github/workflows/variant_remapping.yml b/.github/workflows/variant_remapping.yml
@@ -32,8 +32,9 @@ jobs:
         $CONDA/bin/conda run pip install -q -r requirements.txt
 
     - name: Test nextflow workflow
-      run:
+      run: |
         $CONDA/bin/conda run tests/test_pipeline.sh
+        $CONDA/bin/conda run tests/test_pipeline_empty.sh
 
     - name: Test with pytest
       run:

diff --git a/main.nf b/main.nf
@@ -64,6 +64,32 @@ process uncompressInputVCF {
             """
 }
 
+
+/*
+ * Filter the VCF file to remove variants too close to the edges of a chromosome, because we can't get flanking regions for them
+ */
+process filterInputVCF {
+
+    input:
+        path "source.vcf"
+        path "genome_fai"
+
+    output:
+        path "filtered.vcf", emit: filtered_vcf_file
+        path "kept.vcf", emit: kept_vcf_file
+        path "count.yml", emit: count_yml
+
+    script:
+    """
+    awk '{ print \$1"\\t1\\t"\$2-1;}' genome_fai > center_regions.bed
+    awk '{ print \$1"\\t0\\t1"; print \$1"\\t"\$2-1"\\t"\$2;}' genome_fai > edge_regions.bed
+    bcftools filter --targets-file center_regions.bed source.vcf | tee kept.vcf |  grep -v '^#' | wc -l > all_count.txt
+    bcftools filter --targets-file edge_regions.bed  source.vcf | grep -v '^#' | tee filtered.vcf | wc -l > filtered_count.txt
+    cat <(cat *_count.txt | awk '{sum += \$1} END{print "all: "sum}') <(cat filtered_count.txt | awk '{print "filtered: "\$1}') > count.yml
+    """
+}
+
+
 /*
  * Store the original VCF header for later use
  */
@@ -85,13 +111,13 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long
 
 
 /*
- * Create the header for the output VCF
+ * This process converts the original header to the remapped header and concatenates it with the remapped VCF records
  */
-process convertAndAddHeaderToVCF {
+process generateRemappedVCF {
 
     input:
-        path "variants_remapped_sorted.vcf"
         path "vcf_header.txt"
+        path "variants_remapped_sorted.vcf"
 
     output:
         path "variants_remapped_sorted_with_header.vcf", emit: final_vcf_with_header
@@ -119,9 +145,9 @@ process convertAndAddHeaderToVCF {
 }
 
 /*
- * Add header to unmapped variant VCF records
+ * This process adds the original header to the unmapped variant VCF records and outputs the results
  */
-process mergeOriginalHeaderAndVCFAndOutput {
+process generateUnmappedVCF {
 
     publishDir outfile_dir,
         overwrite: true,
@@ -153,7 +179,7 @@ process sortVCF {
 
     """
     bgzip variants_remapped.vcf
-    bcftools sort -o variants_remapped_sorted.vcf.gz -Oz variants_remapped.vcf.gz
+    bcftools sort -T . -o variants_remapped_sorted.vcf.gz -Oz variants_remapped.vcf.gz
     """
 }
 
@@ -198,6 +224,23 @@ process outputStats {
     """
 }
 
+/*
+ * Concatenate the unmapped variants
+ */
+process combineUnmappedVCF {
+    input:
+        path "variants1.vcf"
+        path "variants2.vcf"
+
+    output:
+        path "merge.vcf", emit: merge_vcf
+
+    """
+    cat variants1.vcf variants2.vcf > merge.vcf
+    """
+}
+
+
 process combineVCF {
     input:
         path "variants1.vcf"
@@ -213,6 +256,7 @@ process combineVCF {
 
 process combineYaml {
     input:
+        path "initial_yml"
         path "round1.yml"
         path "round2.yml"
         path "round3.yml"
@@ -221,7 +265,7 @@ process combineYaml {
         path "merge.yml", emit: merge_yml
 
     """
-    cat round1.yml round2.yml round3.yml > merge.yml
+    cat initial_yml round1.yml round2.yml round3.yml > merge.yml
     """
 }
 
@@ -235,24 +279,49 @@ workflow finalise {
         summary
 
     main:
-        convertAndAddHeaderToVCF(variants_remapped, vcf_header)
-        mergeOriginalHeaderAndVCFAndOutput(vcf_header, variants_unmapped)
-        sortVCF(convertAndAddHeaderToVCF.out.final_vcf_with_header)
+        generateUnmappedVCF(vcf_header, variants_unmapped)
+        generateRemappedVCF(vcf_header, variants_remapped)
+        sortVCF(generateRemappedVCF.out.final_vcf_with_header)
         normaliseAnOutput(sortVCF.out.variants_remapped_sorted_gz, genome)
         outputStats(summary)
 }
 
 
+//process_with_bowtie
+workflow process_with_bowtie {
+    main:
+        prepare_old_genome(params.oldgenome)
+        prepare_new_genome_bowtie(params.newgenome)
+        uncompressInputVCF(params.vcffile)
+        storeVCFHeader(uncompressInputVCF.out.vcf_file)
+        process_split_reads_with_bowtie(
+            uncompressInputVCF.out.vcf_file,
+            params.oldgenome,
+            prepare_old_genome.out.genome_fai,
+            prepare_old_genome.out.genome_chrom_sizes,
+            params.newgenome,
+            prepare_new_genome_bowtie.out.genome_fai,
+            prepare_new_genome_bowtie.out.bowtie_indexes
+        )
+        finalise(process_split_reads_with_bowtie.out.variants_remapped, storeVCFHeader.out.vcf_header,
+                 params.newgenome, process_split_reads_with_bowtie.out.summary_yml)
+}
+
+
+
+
+
 // process_with_minimap
 // Workflow without a name is the default workflow that gets executed when the file is run through nextflow
-workflow {
+workflow  {
     main:
         prepare_old_genome(params.oldgenome)
         prepare_new_genome(params.newgenome)
         uncompressInputVCF(params.vcffile)
+        filterInputVCF(uncompressInputVCF.out.vcf_file, prepare_old_genome.out.genome_fai)
         storeVCFHeader(uncompressInputVCF.out.vcf_file)
         process_split_reads(
-            uncompressInputVCF.out.vcf_file,
+            filterInputVCF.out.kept_vcf_file,
             params.oldgenome,
             prepare_old_genome.out.genome_fai,
             prepare_old_genome.out.genome_chrom_sizes,
@@ -275,39 +344,24 @@ workflow {
             params.newgenome,
             prepare_new_genome.out.genome_fai
         )
+        combineUnmappedVCF(
+            filterInputVCF.out.filtered_vcf_file,
+            process_split_reads_long.out.variants_unmapped,
+        )
         combineVCF(
             process_split_reads.out.variants_remapped,
             process_split_reads_mid.out.variants_remapped,
             process_split_reads_long.out.variants_remapped
         )
         combineYaml(
+            filterInputVCF.out.count_yml,
             process_split_reads.out.summary_yml,
             process_split_reads_mid.out.summary_yml,
-            process_split_reads_long.out.summary_yml,
+            process_split_reads_long.out.summary_yml
         )
 
         finalise(
-            combineVCF.out.merge_vcf, process_split_reads_long.out.variants_unmapped, storeVCFHeader.out.vcf_header,
+            combineVCF.out.merge_vcf, combineUnmappedVCF.out.merge_vcf, storeVCFHeader.out.vcf_header,
             params.newgenome, combineYaml.out.merge_yml
         )
 }
-
-//process_with_bowtie
-workflow process_with_bowtie {
-    main:
-        prepare_old_genome(params.oldgenome)
-        prepare_new_genome_bowtie(params.newgenome)
-        uncompressInputVCF(params.vcffile)
-        storeVCFHeader(uncompressInputVCF.out.vcf_file)
-        process_split_reads_with_bowtie(
-            uncompressInputVCF.out.vcf_file,
-            params.oldgenome,
-            prepare_old_genome.out.genome_fai,
-            prepare_old_genome.out.genome_chrom_sizes,
-            params.newgenome,
-            prepare_new_genome_bowtie.out.genome_fai,
-            prepare_new_genome_bowtie.out.bowtie_indexes
-        )
-        finalise(process_split_reads_with_bowtie.out.variants_remapped, storeVCFHeader.out.vcf_header,
-                 params.newgenome, process_split_reads_with_bowtie.out.summary_yml)
-}
diff --git a/tests/resources/config.yml b/tests/resources/config.yml
@@ -0,0 +1,5 @@
+executor {
+  $local {
+      memory = '6 GB'
+  }
+}
diff --git a/tests/test_pipeline.sh b/tests/test_pipeline.sh
@@ -2,6 +2,15 @@
 
 set -Eeuo pipefail
 
+function asserteq() {
+  if [[ ! "$1" -eq "$2" ]]
+  then
+    echo "Assertion Error: $1 not equal to $2"
+    exit 1
+  fi
+
+}
+
 SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
 SOURCE_DIR=$(dirname $SCRIPT_DIR)
 
@@ -12,15 +21,19 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf"
 ##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
 ##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
 #CHROM	POS	ID	REF	 ALT	QUAL 	FILTER	INFO	FORMAT	HG001
-chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANANANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
+chr1	1	.	CG	TG	50	PASS	.	GT:GQ	1/1:0
+chr1	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN	GT:GQ	1/1:0
 chr1	98	.	C	CG	50	PASS	.	GT:GQ	1/1:0
 chr1	1078	.	G	A	50	PASS	.	GT	1/1
-chr1	1818	.	AAC	A	50	PASS	.	GT:GQ	1/1:0
 chr1	2030	.	A	TCC	50	PASS	.	GT:GQ	1/1:0
+chr1	1818	.	AAC	A	50	PASS	.	GT:GQ	1/1:0
 chr1	3510	.	T	C	50	PASS	.	GT:GQ	1/1:0
+chr1	3709	.	CA	TA	50	PASS	.	GT:GQ	1/1:0
+chr1	3710	.	T	A	50	PASS	.	GT:GQ	1/1:0
 EOT
 
 nextflow run ${SOURCE_DIR}/main.nf \
+-config ${SCRIPT_DIR}/resources/config.yml \
 --oldgenome ${SCRIPT_DIR}/resources/genome.fa \
 --newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
 --vcffile ${SCRIPT_DIR}/resources/source.vcf \
@@ -34,7 +47,7 @@ ls ${SCRIPT_DIR}/resources/remap.vcf \
 # Build the expected VCF
 cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf"
 #CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG001
-chr2	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANANANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN;st=+	GT:GQ	1/1:0
+chr2	48	.	C	A,T	50	PASS	COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN;st=+	GT:GQ	1/1:0
 chr2	98	.	C	CG	50	PASS	st=+	GT:GQ	1/1:0
 chr2	1078	.	A	G	50	PASS	st=+;rac=G-A	GT	0/0
 chr2	1818	.	AAC	A	50	PASS	st=+	GT:GQ	1/1:0
@@ -45,9 +58,16 @@ EOT
 # Compare vs the expected VCF
 diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap.vcf")
 
+asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'all:' | cut -d ' ' -f 2`  9
+asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'filtered:' | cut -d ' ' -f 2`  2
+
+
 # Clean up after the test
 rm -rf work .nextflow* \
        ${SCRIPT_DIR}/resources/source.vcf \
-       ${SCRIPT_DIR}/resources/*remap.vcf* \
+       ${SCRIPT_DIR}/resources/expected_remap.vcf \
+       ${SCRIPT_DIR}/resources/remap.vcf \
+       ${SCRIPT_DIR}/resources/remap_counts.yml \
+       ${SCRIPT_DIR}/resources/remap_unmapped.vcf \
        ${SCRIPT_DIR}/resources/new_genome.fa.* \
        ${SCRIPT_DIR}/resources/genome.fa.fai
diff --git a/tests/test_pipeline_empty.sh b/tests/test_pipeline_empty.sh
@@ -0,0 +1,58 @@
+#!/bin/bash
+
+set -Eeuo pipefail
+
+function asserteq() {
+  if [[ ! "$1" -eq "$2" ]]
+  then
+    echo "Assertion Error: $1 not equal to $2"
+    exit 1
+  fi
+
+}
+
+SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
+SOURCE_DIR=$(dirname $SCRIPT_DIR)
+
+# Build the Source VCF
+cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf"
+##fileformat=VCFv4.3
+##INFO=<ID=COMMENT,Number=1,Type=String,Description="Comment">
+##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
+##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
+#CHROM	POS	ID	REF	 ALT	QUAL 	FILTER	INFO	FORMAT	HG001
+EOT
+
+nextflow run ${SOURCE_DIR}/main.nf \
+-config ${SCRIPT_DIR}/resources/config.yml \
+--oldgenome ${SCRIPT_DIR}/resources/genome.fa \
+--newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
+--vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \
+--outfile ${SCRIPT_DIR}/resources/remap_empty.vcf
+
+# Check the presence of the output files
+ls ${SCRIPT_DIR}/resources/remap_empty.vcf \
+   ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \
+   ${SCRIPT_DIR}/resources/remap_empty_counts.yml
+
+# Build the expected VCF
+cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf"
+#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	HG001
+EOT
+
+# Compare vs the expected VCF
+diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap_empty.vcf")
+
+asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'all:' | cut -d ' ' -f 2`  0
+asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'filtered:' | cut -d ' ' -f 2`  0
+
+
+# Clean up after the test
+rm -rf work .nextflow* \
+       ${SCRIPT_DIR}/resources/source_empty.vcf \
+       ${SCRIPT_DIR}/resources/expected_remap.vcf \
+       ${SCRIPT_DIR}/resources/remap_empty.vcf \
+       ${SCRIPT_DIR}/resources/remap_empty_counts.yml \
+       ${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \
+       ${SCRIPT_DIR}/resources/new_genome.fa.* \
+       ${SCRIPT_DIR}/resources/genome.fa.fai