Skip to content

Commit

Permalink
Merge pull request #29 from tcezard/EVA2451_allow_novel_ref_allele
Browse files Browse the repository at this point in the history
EVA-2451 - allow novel ref allele
  • Loading branch information
tcezard authored Jun 14, 2021
2 parents e3a6754 + cb63b41 commit ab8fe5f
Show file tree
Hide file tree
Showing 8 changed files with 297 additions and 95 deletions.
3 changes: 2 additions & 1 deletion .github/workflows/variant_remapping.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ jobs:
$CONDA/bin/conda run pip install -q -r requirements.txt
- name: Test nextflow workflow
run:
run: |
$CONDA/bin/conda run tests/test_pipeline.sh
$CONDA/bin/conda run tests/test_pipeline_empty.sh
- name: Test with pytest
run:
Expand Down
122 changes: 88 additions & 34 deletions main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,32 @@ process uncompressInputVCF {
"""
}


/*
 * Filter the VCF file to remove variants too close to the edges of a chromosome, because flanking regions cannot be extracted for them
 */
process filterInputVCF {

    // Splits the input VCF into variants that can be remapped (kept.vcf) and
    // variants located on the first or last base of a sequence (filtered.vcf),
    // and records how many of each were seen in count.yml.
    //
    // Inputs are staged under fixed names so the script can refer to them literally:
    //   source.vcf - the VCF to split
    //   genome_fai - genome index; the script reads column 1 as the sequence
    //                name and column 2 as the sequence length
    input:
    path "source.vcf"
    path "genome_fai"

    // kept.vcf     - header plus the records inside the "center" regions
    // filtered.vcf - records on the edge positions, written WITHOUT header lines
    // count.yml    - two lines: "all: <kept + filtered>" and "filtered: <filtered>"
    output:
    path "filtered.vcf", emit: filtered_vcf_file
    path "kept.vcf", emit: kept_vcf_file
    path "count.yml", emit: count_yml

    // Notes on the script:
    //  * pipefail makes a failing bcftools abort the task instead of silently
    //    producing empty outputs and zero counts.
    //  * Header lines are dropped with awk rather than `grep -v`, because grep
    //    exits with status 1 when nothing is left (e.g. no variant on an edge),
    //    which would be reported as a failure once pipefail is active.
    //  * The BED files are 0-based, half-open: center is [1, length-1) and the
    //    edges are [0, 1) and [length-1, length).
    //  * The total is computed from the two explicitly named count files rather
    //    than from a glob, so stray files cannot change the result.
    script:
    """
    set -o pipefail
    awk '{ print \$1"\\t1\\t"\$2-1;}' genome_fai > center_regions.bed
    awk '{ print \$1"\\t0\\t1"; print \$1"\\t"\$2-1"\\t"\$2;}' genome_fai > edge_regions.bed
    bcftools filter --targets-file center_regions.bed source.vcf | tee kept.vcf | awk '!/^#/' | wc -l > kept_count.txt
    bcftools filter --targets-file edge_regions.bed source.vcf | awk '!/^#/' | tee filtered.vcf | wc -l > filtered_count.txt
    awk '{ sum += \$1 } END { print "all: " (sum + 0) }' kept_count.txt filtered_count.txt > count.yml
    awk '{ print "filtered: " \$1 }' filtered_count.txt >> count.yml
    """
}


/*
 * Store the original VCF header for later use
 */
Expand All @@ -85,13 +111,13 @@ include { process_split_reads; process_split_reads_mid; process_split_reads_long


/*
 * Create the header for the output VCF
 * This process converts the original header to the remapped header and concatenates it with the remapped VCF records
 */
process convertAndAddHeaderToVCF {
process generateRemappedVCF {

input:
path "variants_remapped_sorted.vcf"
path "vcf_header.txt"
path "variants_remapped_sorted.vcf"

output:
path "variants_remapped_sorted_with_header.vcf", emit: final_vcf_with_header
Expand Down Expand Up @@ -119,9 +145,9 @@ process convertAndAddHeaderToVCF {
}

/*
 * Add header to unmapped variant VCF records
 * This process adds the original header to the unmapped variant VCF records and outputs the results
 */
process mergeOriginalHeaderAndVCFAndOutput {
process generateUnmappedVCF {

publishDir outfile_dir,
overwrite: true,
Expand Down Expand Up @@ -153,7 +179,7 @@ process sortVCF {

"""
bgzip variants_remapped.vcf
bcftools sort -o variants_remapped_sorted.vcf.gz -Oz variants_remapped.vcf.gz
bcftools sort -T . -o variants_remapped_sorted.vcf.gz -Oz variants_remapped.vcf.gz
"""
}

Expand Down Expand Up @@ -198,6 +224,23 @@ process outputStats {
"""
}

/*
 * Concatenate the unmapped variants
 */
process combineUnmappedVCF {

    // The two files are staged under fixed names; their order on the command
    // line below determines the order of the records in the merged file.
    input:
    path "variants1.vcf"
    path "variants2.vcf"

    // Plain concatenation of both inputs, exposed as `merge_vcf`.
    output:
    path "merge.vcf", emit: merge_vcf

    script:
    """
    cat variants1.vcf variants2.vcf > merge.vcf
    """
}


process combineVCF {
input:
path "variants1.vcf"
Expand All @@ -213,6 +256,7 @@ process combineVCF {

process combineYaml {
input:
path "initial_yml"
path "round1.yml"
path "round2.yml"
path "round3.yml"
Expand All @@ -221,7 +265,7 @@ process combineYaml {
path "merge.yml", emit: merge_yml

"""
cat round1.yml round2.yml round3.yml > merge.yml
cat initial_yml round1.yml round2.yml round3.yml > merge.yml
"""
}

Expand All @@ -235,24 +279,49 @@ workflow finalise {
summary

main:
convertAndAddHeaderToVCF(variants_remapped, vcf_header)
mergeOriginalHeaderAndVCFAndOutput(vcf_header, variants_unmapped)
sortVCF(convertAndAddHeaderToVCF.out.final_vcf_with_header)
generateUnmappedVCF(vcf_header, variants_unmapped)
generateRemappedVCF(vcf_header, variants_remapped)
sortVCF(generateRemappedVCF.out.final_vcf_with_header)
normaliseAnOutput(sortVCF.out.variants_remapped_sorted_gz, genome)
outputStats(summary)
}


// process_with_bowtie
// Named (non-default) workflow that runs the remapping through the
// process_split_reads_with_bowtie sub-workflow. It has to be selected explicitly,
// since the unnamed workflow in this file is the one executed by default.
workflow process_with_bowtie {
main:
// Prepare both genomes; the outputs used below are genome_fai and genome_chrom_sizes
// for the old genome, and genome_fai and bowtie_indexes for the new genome.
prepare_old_genome(params.oldgenome)
prepare_new_genome_bowtie(params.newgenome)
// The header is taken from the uncompressed input VCF and passed on to finalise below.
uncompressInputVCF(params.vcffile)
storeVCFHeader(uncompressInputVCF.out.vcf_file)
// NOTE(review): unlike the default workflow, the VCF is passed here without going
// through filterInputVCF first - confirm this is intended.
process_split_reads_with_bowtie(
uncompressInputVCF.out.vcf_file,
params.oldgenome,
prepare_old_genome.out.genome_fai,
prepare_old_genome.out.genome_chrom_sizes,
params.newgenome,
prepare_new_genome_bowtie.out.genome_fai,
prepare_new_genome_bowtie.out.bowtie_indexes
)
// NOTE(review): finalise is called with four arguments here, while the default
// workflow passes five (it also supplies the unmapped variants, which finalise
// forwards to generateUnmappedVCF) - verify this call against finalise's take: list.
finalise(process_split_reads_with_bowtie.out.variants_remapped, storeVCFHeader.out.vcf_header,
params.newgenome, process_split_reads_with_bowtie.out.summary_yml)
}





// process_with_minimap
// Workflow without a name is the default workflow that gets executed when the file is run through nextflow
workflow {
workflow {
main:
prepare_old_genome(params.oldgenome)
prepare_new_genome(params.newgenome)
uncompressInputVCF(params.vcffile)
filterInputVCF(uncompressInputVCF.out.vcf_file, prepare_old_genome.out.genome_fai)
storeVCFHeader(uncompressInputVCF.out.vcf_file)
process_split_reads(
uncompressInputVCF.out.vcf_file,
filterInputVCF.out.kept_vcf_file,
params.oldgenome,
prepare_old_genome.out.genome_fai,
prepare_old_genome.out.genome_chrom_sizes,
Expand All @@ -275,39 +344,24 @@ workflow {
params.newgenome,
prepare_new_genome.out.genome_fai
)
combineUnmappedVCF(
filterInputVCF.out.filtered_vcf_file,
process_split_reads_long.out.variants_unmapped,
)
combineVCF(
process_split_reads.out.variants_remapped,
process_split_reads_mid.out.variants_remapped,
process_split_reads_long.out.variants_remapped
)
combineYaml(
filterInputVCF.out.count_yml,
process_split_reads.out.summary_yml,
process_split_reads_mid.out.summary_yml,
process_split_reads_long.out.summary_yml,
process_split_reads_long.out.summary_yml
)

finalise(
combineVCF.out.merge_vcf, process_split_reads_long.out.variants_unmapped, storeVCFHeader.out.vcf_header,
combineVCF.out.merge_vcf, combineUnmappedVCF.out.merge_vcf, storeVCFHeader.out.vcf_header,
params.newgenome, combineYaml.out.merge_yml
)
}

//process_with_bowtie
workflow process_with_bowtie {
main:
prepare_old_genome(params.oldgenome)
prepare_new_genome_bowtie(params.newgenome)
uncompressInputVCF(params.vcffile)
storeVCFHeader(uncompressInputVCF.out.vcf_file)
process_split_reads_with_bowtie(
uncompressInputVCF.out.vcf_file,
params.oldgenome,
prepare_old_genome.out.genome_fai,
prepare_old_genome.out.genome_chrom_sizes,
params.newgenome,
prepare_new_genome_bowtie.out.genome_fai,
prepare_new_genome_bowtie.out.bowtie_indexes
)
finalise(process_split_reads_with_bowtie.out.variants_remapped, storeVCFHeader.out.vcf_header,
params.newgenome, process_split_reads_with_bowtie.out.summary_yml)
}
5 changes: 5 additions & 0 deletions tests/resources/config.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
// Nextflow configuration (config syntax, despite the .yml extension); the test
// scripts pass this file to `nextflow run` through the -config option.
executor {
// Memory setting applied to the local executor during the tests.
$local {
memory = '6 GB'
}
}
28 changes: 24 additions & 4 deletions tests/test_pipeline.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,15 @@

set -Eeuo pipefail

# Assert that two integer values are equal; abort the test script otherwise.
#   $1 - actual value (typically extracted from a pipeline output file)
#   $2 - expected value
# Exits with status 1 and a message on stderr when the values differ or when
# either of them is not an integer.
function asserteq() {
    local actual="${1-}"
    local expected="${2-}"
    local integer_re='^-?[0-9]+$'

    # An arithmetic comparison evaluates an empty string as 0, so a value that
    # could not be extracted would satisfy "expected 0". Reject anything that
    # is not a plain integer before comparing.
    if [[ ! "$actual" =~ $integer_re || ! "$expected" =~ $integer_re ]]
    then
        echo "Assertion Error: '$actual' and '$expected' must both be integers" >&2
        exit 1
    fi

    if [[ "$actual" -ne "$expected" ]]
    then
        echo "Assertion Error: $actual not equal to $expected" >&2
        exit 1
    fi
}

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
SOURCE_DIR=$(dirname $SCRIPT_DIR)

Expand All @@ -12,15 +21,19 @@ cat << EOT > "${SCRIPT_DIR}/resources/source.vcf"
##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001
chr1 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANANANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN GT:GQ 1/1:0
chr1 1 . CG TG 50 PASS . GT:GQ 1/1:0
chr1 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN GT:GQ 1/1:0
chr1 98 . C CG 50 PASS . GT:GQ 1/1:0
chr1 1078 . G A 50 PASS . GT 1/1
chr1 1818 . AAC A 50 PASS . GT:GQ 1/1:0
chr1 2030 . A TCC 50 PASS . GT:GQ 1/1:0
chr1 1818 . AAC A 50 PASS . GT:GQ 1/1:0
chr1 3510 . T C 50 PASS . GT:GQ 1/1:0
chr1 3709 . CA TA 50 PASS . GT:GQ 1/1:0
chr1 3710 . T A 50 PASS . GT:GQ 1/1:0
EOT

nextflow run ${SOURCE_DIR}/main.nf \
-config ${SCRIPT_DIR}/resources/config.yml \
--oldgenome ${SCRIPT_DIR}/resources/genome.fa \
--newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
--vcffile ${SCRIPT_DIR}/resources/source.vcf \
Expand All @@ -34,7 +47,7 @@ ls ${SCRIPT_DIR}/resources/remap.vcf \
# Build the expected VCF
cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf"
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001
chr2 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANANANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANABATMAN;st=+ GT:GQ 1/1:0
chr2 48 . C A,T 50 PASS COMMENT=NANANANANANANANANANANANANAN%ANANANANANANANANANANANANANANANANAN|ANANANANANANA&NANAN ANA\$NANANANANANANANANANANANANANANANANAN^ANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANANA NANANANANANANANANANANANANABATMAN;st=+ GT:GQ 1/1:0
chr2 98 . C CG 50 PASS st=+ GT:GQ 1/1:0
chr2 1078 . A G 50 PASS st=+;rac=G-A GT 0/0
chr2 1818 . AAC A 50 PASS st=+ GT:GQ 1/1:0
Expand All @@ -45,9 +58,16 @@ EOT
# Compare vs the expected VCF
diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap.vcf")

asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'all:' | cut -d ' ' -f 2` 9
asserteq `cat ${SCRIPT_DIR}/resources/remap_counts.yml | grep 'filtered:' | cut -d ' ' -f 2` 2


# Clean up after the test
rm -rf work .nextflow* \
${SCRIPT_DIR}/resources/source.vcf \
${SCRIPT_DIR}/resources/*remap.vcf* \
${SCRIPT_DIR}/resources/expected_remap.vcf \
${SCRIPT_DIR}/resources/remap.vcf \
${SCRIPT_DIR}/resources/remap_counts.yml \
${SCRIPT_DIR}/resources/remap_unmapped.vcf \
${SCRIPT_DIR}/resources/new_genome.fa.* \
${SCRIPT_DIR}/resources/genome.fa.fai
58 changes: 58 additions & 0 deletions tests/test_pipeline_empty.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/bin/bash

set -Eeuo pipefail

# Assert that two integer values are equal; abort the test script otherwise.
#   $1 - actual value (typically extracted from a pipeline output file)
#   $2 - expected value
# Exits with status 1 and a message on stderr when the values differ or when
# either of them is not an integer.
function asserteq() {
    local actual="${1-}"
    local expected="${2-}"
    local integer_re='^-?[0-9]+$'

    # This script expects counts of 0; an arithmetic comparison evaluates an
    # empty string as 0, so a value that could not be extracted would pass.
    # Reject anything that is not a plain integer before comparing.
    if [[ ! "$actual" =~ $integer_re || ! "$expected" =~ $integer_re ]]
    then
        echo "Assertion Error: '$actual' and '$expected' must both be integers" >&2
        exit 1
    fi

    if [[ "$actual" -ne "$expected" ]]
    then
        echo "Assertion Error: $actual not equal to $expected" >&2
        exit 1
    fi
}

SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )"
SOURCE_DIR=$(dirname $SCRIPT_DIR)

# Build the Source VCF
cat << EOT > "${SCRIPT_DIR}/resources/source_empty.vcf"
##fileformat=VCFv4.3
##INFO=<ID=COMMENT,Number=1,Type=String,Description="Comment">
##FORMAT=<ID=GT,Number=1,Type=String,Description="Consensus Genotype across all datasets with called genotype">
##FORMAT=<ID=GQ,Number=1,Type=Integer,Description="Genotype Quality">
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001
EOT

nextflow run ${SOURCE_DIR}/main.nf \
-config ${SCRIPT_DIR}/resources/config.yml \
--oldgenome ${SCRIPT_DIR}/resources/genome.fa \
--newgenome ${SCRIPT_DIR}/resources/new_genome.fa \
--vcffile ${SCRIPT_DIR}/resources/source_empty.vcf \
--outfile ${SCRIPT_DIR}/resources/remap_empty.vcf

# Check the presence of the output file
ls ${SCRIPT_DIR}/resources/remap_empty.vcf \
${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \
${SCRIPT_DIR}/resources/remap_empty_counts.yml

# Build the expected VCF
cat << EOT > "${SCRIPT_DIR}/resources/expected_remap.vcf"
#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT HG001
EOT

# Compare vs the expected VCF
diff "${SCRIPT_DIR}/resources/expected_remap.vcf" <(grep -v '^##' "${SCRIPT_DIR}/resources/remap_empty.vcf")

asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'all:' | cut -d ' ' -f 2` 0
asserteq `cat ${SCRIPT_DIR}/resources/remap_empty_counts.yml | grep 'filtered:' | cut -d ' ' -f 2` 0


# Clean up after the test
rm -rf work .nextflow* \
${SCRIPT_DIR}/resources/source_empty.vcf \
${SCRIPT_DIR}/resources/expected_remap.vcf \
${SCRIPT_DIR}/resources/remap_empty.vcf \
${SCRIPT_DIR}/resources/remap_empty_counts.yml \
${SCRIPT_DIR}/resources/remap_empty_unmapped.vcf \
${SCRIPT_DIR}/resources/new_genome.fa.* \
${SCRIPT_DIR}/resources/genome.fa.fai
Loading

0 comments on commit ab8fe5f

Please sign in to comment.