theiagen · kapsakcj · Dec 11, 2024 · Dec 12, 2024 · Jan 17, 2025 · Jan 21, 2025
@@ -4,41 +4,45 @@ task genoflu {
   input {
     File assembly_fasta
     String samplename
-
+    Float? min_percent_identity # genoflu default is 98
     # excel file to cross-reference BLAST findings; probably useful if novel
     #  genotypes are not in the default file used by genoflu.py
     File? cross_reference
-
     Int cpu = 1
     Int disk_size = 25
-    String docker = "us-docker.pkg.dev/general-theiagen/staphb/genoflu:1.05"
+    String docker = "us-docker.pkg.dev/general-theiagen/staphb/genoflu:1.06"
     Int memory = 2
   }
   command <<<
-
+    set -euo pipefail
+
     cp ~{assembly_fasta} .
 
-    genoflu.py -v | sed -e 's/genoflu.py:\ version\ //' > VERSION
+    echo "DEBUG: capturing genoflu version..."
+    genoflu.py -v | sed -e 's/genoflu.py:\ version\ //' | tee VERSION
 
+    echo "DEBUG: running genoflu.py..."
     genoflu.py \
       --fasta ~{assembly_fasta} \
       --sample_name ~{samplename} \
-      ~{"--cross_reference" + cross_reference} > genoflu.output.txt
+      ~{"--pident_threshold " + min_percent_identity} \
+      ~{"--cross_reference " + cross_reference} > genoflu.output.txt
 
     GENOTYPE=$(grep "~{samplename} Genotype" genoflu.output.txt | cut -d ">" -f2 | cut -d " " -f2 | cut -d ":" -f1)
     ALL_SEGMENTS=$(grep "~{samplename} Genotype" genoflu.output.txt | cut -d ">" -f2 | cut -d " " -f3-)
 
     # If genotype unable to be assigned ("Not"), then parse out the expected text
     if [[ "$GENOTYPE" == "Not" ]]; then
-      grep "~{samplename} Genotype" genoflu.output.txt | cut -d ">" -f2- | cut -d " " -f2- | cut -d ":" -f1 > GENOTYPE
-      grep "~{samplename} Genotype" genoflu.output.txt | cut -d ">" -f2- | cut -d " " -f4- | cut -d ":" -f1 > ALL_SEGMENTS
+      echo "DEBUG: parsing out genotype and all segments..."
+      grep "~{samplename} Genotype" genoflu.output.txt | cut -d ">" -f2- | cut -d " " -f2- | cut -d ":" -f1 | tee GENOTYPE
+      grep "~{samplename} Genotype" genoflu.output.txt | cut -d ">" -f2- | cut -d " " -f4- | cut -d ":" -f1 | tee ALL_SEGMENTS
     else
-      echo "$GENOTYPE" > GENOTYPE
-      echo "$ALL_SEGMENTS" > ALL_SEGMENTS
+      echo "DEBUG: saving genotype and all segments..."
+      echo "$GENOTYPE" | tee GENOTYPE
+      echo "$ALL_SEGMENTS" | tee ALL_SEGMENTS
     fi
-
 
-    mv ~{samplename}_*_stats.tsv ~{samplename}_stats.tsv
+    mv -v ~{samplename}_*_stats.tsv ~{samplename}_stats.tsv
   >>>
   output {
     String genoflu_version = read_string("VERSION")

@@ -1,101 +1,46 @@
 version 1.0
 
-task nextclade {
-  meta {
-    description: "Nextclade classification of one sample. Leaving optional inputs unspecified will use SARS-CoV-2 defaults."
-  }
-  input {
-    File genome_fasta
-    File? root_sequence
-    File? auspice_reference_tree_json
-    File? qc_config_json
-    File? gene_annotations_gff
-    File? pcr_primers_csv
-    File? virus_properties
-    String docker = "us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:2.14.0"
-    String dataset_name
-    String dataset_reference
-    String dataset_tag
-    Int disk_size = 50
-    Int memory = 4
-    Int cpu = 2
-  }
-  String basename = basename(genome_fasta, ".fasta")
-  command <<<
-    NEXTCLADE_VERSION="$(nextclade --version)"
-    echo $NEXTCLADE_VERSION > NEXTCLADE_VERSION
-
-    nextclade dataset get --name="~{dataset_name}" --reference="~{dataset_reference}" --tag="~{dataset_tag}" -o nextclade_dataset_dir --verbose
-    set -e
-    nextclade run \
-        --input-dataset=nextclade_dataset_dir/ \
-        ~{"--input-root-seq " + root_sequence} \
-        ~{"--input-tree " + auspice_reference_tree_json} \
-        ~{"--input-qc-config " + qc_config_json} \
-        ~{"--input-gene-map " + gene_annotations_gff} \
-        ~{"--input-pcr-primers " + pcr_primers_csv} \
-        ~{"--input-virus-properties " + virus_properties}  \
-        --output-json "~{basename}".nextclade.json \
-        --output-tsv  "~{basename}".nextclade.tsv \
-        --output-tree "~{basename}".nextclade.auspice.json \
-        --output-all=. \
-        "~{genome_fasta}"
-  >>>
-  runtime {
-    docker: "~{docker}"
-    memory: "~{memory} GB"
-    cpu: cpu
-    disks:  "local-disk " + disk_size + " SSD"
-    disk: disk_size + " GB" # TES
-    dx_instance_type: "mem1_ssd1_v2_x2"
-    maxRetries: 3 
-  }
-  output {
-    String nextclade_version = read_string("NEXTCLADE_VERSION")
-    File nextclade_json = "~{basename}.nextclade.json"
-    File auspice_json = "~{basename}.nextclade.auspice.json"
-    File nextclade_tsv = "~{basename}.nextclade.tsv"
-    String nextclade_docker = docker
-    String nextclade_dataset_tag = "~{dataset_tag}"
-  }
-}
-
 task nextclade_v3 {
   meta {
     description: "Nextclade classification of one sample. Leaving optional inputs unspecified will use SARS-CoV-2 defaults."
   }
   input {
     File genome_fasta
+    File? custom_input_dataset
     File? auspice_reference_tree_json
     File? gene_annotations_gff
     File? nextclade_pathogen_json
     File? input_ref
-    String docker = "us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:3.3.1" 
-    String dataset_name
+    String docker = "us-docker.pkg.dev/general-theiagen/nextstrain/nextclade:3.9.1" 
+    String? dataset_name
     String verbosity = "warn" # other options are: "off" "error" "info" "debug" and "trace"
-    String dataset_tag
+    String? dataset_tag
     Int disk_size = 50
     Int memory = 4
     Int cpu = 2
   }
   String basename = basename(genome_fasta, ".fasta")
   command <<<
+    # exit script/task upon error
+    set -euo pipefail
+
     # track version & print to log
     nextclade --version | tee NEXTCLADE_VERSION
 
     # --reference no longer used in v3. consolidated into --name and --tag
-    nextclade dataset get \
-      --name="~{dataset_name}" \
-      --tag="~{dataset_tag}" \
-      -o nextclade_dataset_dir \
-      --verbosity ~{verbosity}
-
-    # exit script/task upon error
-    set -e
+    # if a custom input dataset is not provided, then download the dataset using the dataset name and tag
+    # [ ! -s FILE ] is true when FILE does not exist or is empty, i.e. no usable custom dataset was provided
+    if [ ! -s "~{custom_input_dataset}" ]; then
+      nextclade dataset get \
+        --name="~{dataset_name}" \
+        --tag="~{dataset_tag}" \
+        -o nextclade_dataset_dir \
+        --verbosity ~{verbosity}
+    fi
 
     # not necessary to include `--jobs <jobs>` in v3. Nextclade will use all available CPU threads by default. It's fast so I don't think we will need to change unless we see errors
     nextclade run \
-      --input-dataset nextclade_dataset_dir/ \
+      --input-dataset ~{default="nextclade_dataset_dir/" custom_input_dataset} \
       ~{"--input-ref " + input_ref} \
       ~{"--input-tree " + auspice_reference_tree_json} \
       ~{"--input-pathogen-json " + nextclade_pathogen_json} \
@@ -128,7 +73,7 @@ task nextclade_v3 {
 
 task nextclade_output_parser {
   meta {
-    description: "Python and bash codeblocks for parsing the output files from Nextclade."
+    description: "Python codeblocks for parsing the output files from Nextclade."
   }
   input {
     File nextclade_tsv
@@ -139,70 +84,52 @@ task nextclade_output_parser {
     String? organism
   }
   command <<<
-    # Set WDL input variable to input.tsv file
-    cat "~{nextclade_tsv}" > input.tsv
-    touch TAMIFLU_AASUBS
-
-    # Parse outputs using python3
     python3 <<CODE
     import csv
-    import codecs
 
-    with codecs.open("./input.tsv",'r') as tsv_file:
+    with open("~{nextclade_tsv}", 'r') as tsv_file:
       tsv_reader = csv.reader(tsv_file, delimiter="\t")
       tsv_data = list(tsv_reader)
 
       if len(tsv_data) == 1:
         tsv_data.append(['NA']*len(tsv_data[0]))
+
       tsv_dict = dict(zip(tsv_data[0], tsv_data[1]))
 
-      # combine 'clade_nextstrain' and 'clade_who' column if sars-cov-2, if false then parse 'clade' column
+      # Write one column of tsv_dict to a file for output; a column that is
+      # missing from the TSV, or present but empty, is written as 'NA'.
+      def write_field_to_file(output_file_name, item_to_parse):
+        # dict.get() replaces a bare `except:` that also hid unrelated errors
+        item = tsv_dict.get(item_to_parse, 'NA')
+        if item == '':
+          item = 'NA'
+        # open the output only after the value is resolved, then write it
+        with open(output_file_name, 'wt') as output_file:
+          output_file.write(item)
+
+      # combine 'clade_nextstrain' and 'clade_who' column if sars-cov-2
+      # this one is slightly more complicated so the function doesn't apply
       if ("~{organism}" == "sars-cov-2"):
-        with codecs.open("NEXTCLADE_CLADE", 'wt') as Nextclade_Clade:
+        with open("NEXTCLADE_CLADE", 'wt') as nextclade_clade:
           nc_clade = tsv_dict['clade_nextstrain']
           who_clade = tsv_dict['clade_who']
           if (nc_clade != who_clade) and (nc_clade != '') and (who_clade != ''):
             nc_clade = nc_clade + " (" + who_clade + ")"
           if nc_clade == '':
             nc_clade = 'NA'
-          Nextclade_Clade.write(nc_clade)
+          nextclade_clade.write(nc_clade)
       else:
-        with codecs.open("NEXTCLADE_CLADE", 'wt') as Nextclade_Clade:
-          nc_clade = tsv_dict['clade']
-          if nc_clade == '':
-            nc_clade = 'NA'
-          Nextclade_Clade.write(nc_clade)
+        write_field_to_file("NEXTCLADE_CLADE", 'clade')
+
+      write_field_to_file('NEXTCLADE_AASUBS', 'aaSubstitutions')
+      write_field_to_file('NEXTCLADE_AADELS', 'aaDeletions')
 
-      with codecs.open("NEXTCLADE_AASUBS", 'wt') as Nextclade_AA_Subs:
-        nc_aa_subs = tsv_dict['aaSubstitutions']
-        if nc_aa_subs == '':
-          nc_aa_subs = 'NA'
-        Nextclade_AA_Subs.write(nc_aa_subs)
+      write_field_to_file('NEXTCLADE_LINEAGE', 'lineage')
+      if 'Nextclade_pango' in tsv_dict:
+        write_field_to_file('NEXTCLADE_LINEAGE', 'Nextclade_pango')
 
-      with codecs.open("NEXTCLADE_AADELS", 'wt') as Nextclade_AA_Dels:
-        nc_aa_dels = tsv_dict['aaDeletions']
-        if nc_aa_dels == '':
-          nc_aa_dels = 'NA'
-        Nextclade_AA_Dels.write(nc_aa_dels)
+      write_field_to_file('NEXTCLADE_QC', 'qc.overallStatus')
 
-      with codecs.open("NEXTCLADE_LINEAGE", 'wt') as Nextclade_Lineage:
-        if 'lineage' in tsv_dict:
-          nc_lineage = tsv_dict['lineage']
-          if nc_lineage is None:
-            nc_lineage = ""
-        elif 'Nextclade_pango' in tsv_dict:
-          nc_lineage = tsv_dict['Nextclade_pango']
-          if nc_lineage is None:
-            nc_lineage = ""
-        else:
-          nc_lineage = ""
-        Nextclade_Lineage.write(nc_lineage)
-
-      with codecs.open("NEXTCLADE_QC", 'wt') as Nextclade_QC:
-        nc_qc = tsv_dict['qc.overallStatus']
-        if nc_qc == '':
-          nc_qc = 'NA'
-        Nextclade_QC.write(nc_qc)
     CODE
   >>>
   runtime {

@@ -40,7 +40,7 @@ workflow theiacov_illumina_pe {
     Int trim_quality_min_score = 30
     Int trim_window_size = 4
     # assembly parameters
-    Int min_depth = 100  # the minimum depth to use for consensus and variant calling
+    Int? min_depth # minimum depth to use for consensus and variant calling; default is 100 for non-flu (default value set below in call block for ivar consensus subwf), flu default is 30 for illumina (default set below in flu_track call block)
     Float consensus_min_freq = 0.6 # minimum frequency for a variant to be called as SNP in consensus genome
     Float variant_min_freq = 0.6 # minimum frequency for a variant to be reported in ivar outputs
     # nextclade inputs
@@ -138,7 +138,7 @@ workflow theiacov_illumina_pe {
             reference_genome = organism_parameters.reference,
             primer_bed = organism_parameters.primer_bed,
             reference_gff = organism_parameters.reference_gff,
-            min_depth = min_depth,
+            min_depth = select_first([min_depth, 100]),
             consensus_min_freq = consensus_min_freq,
             variant_min_freq = variant_min_freq,
             trim_primers = trim_primers
@@ -152,7 +152,8 @@ workflow theiacov_illumina_pe {
             read2 = read_QC_trim.read2_clean,
             samplename = samplename,
             standardized_organism = organism_parameters.standardized_organism,
-            seq_method = seq_method
+            seq_method = seq_method,
+            irma_min_consensus_support = select_first([min_depth, 30])
         }
       }
       if (defined(ivar_consensus.assembly_fasta) || defined(flu_track.irma_assembly_fasta)) {
@@ -328,7 +329,8 @@ workflow theiacov_illumina_pe {
     String? ivar_version_consensus = ivar_consensus.ivar_version_consensus
     String? samtools_version_consensus = ivar_consensus.samtools_version_consensus
     # Read Alignment - consensus assembly qc outputs
-    Int consensus_n_variant_min_depth = min_depth
+    # this is the minimum depth used for consensus and variant calling in EITHER iVar or IRMA
+    Int consensus_n_variant_min_depth = select_first([min_depth, flu_track.irma_minimum_consensus_support, 100])
     File? consensus_stats = ivar_consensus.consensus_stats
     File? consensus_flagstat = ivar_consensus.consensus_flagstat
     String meanbaseq_trim = select_first([ivar_consensus.meanbaseq_trim, ""])
@@ -367,6 +369,14 @@ workflow theiacov_illumina_pe {
     String? nextclade_clade = nextclade_output_parser.nextclade_clade
     String? nextclade_lineage = nextclade_output_parser.nextclade_lineage
     String? nextclade_qc = nextclade_output_parser.nextclade_qc
+    # Nextclade outputs for flu H5N1
+    File? nextclade_json_flu_h5n1 = flu_track.nextclade_json_flu_h5n1
+    File? auspice_json_flu_h5n1 = flu_track.auspice_json_flu_h5n1
+    File? nextclade_tsv_flu_h5n1 = flu_track.nextclade_tsv_flu_h5n1
+    String? nextclade_aa_subs_flu_h5n1 = flu_track.nextclade_aa_subs_flu_h5n1
+    String? nextclade_aa_dels_flu_h5n1 = flu_track.nextclade_aa_dels_flu_h5n1
+    String? nextclade_clade_flu_h5n1 = flu_track.nextclade_clade_flu_h5n1
+    String? nextclade_qc_flu_h5n1 = flu_track.nextclade_qc_flu_h5n1
     # Nextclade outputs for flu HA
     File? nextclade_json_flu_ha = flu_track.nextclade_json_flu_ha
     File? auspice_json_flu_ha = flu_track.auspice_json_flu_ha
@@ -400,6 +410,8 @@ workflow theiacov_illumina_pe {
     String? irma_type = flu_track.irma_type
     String? irma_subtype = flu_track.irma_subtype
     String? irma_subtype_notes = flu_track.irma_subtype_notes
+    File? irma_assembly_fasta_concatenated = flu_track.irma_assembly_fasta_concatenated
+    File? irma_assembly_fasta_concatenated_padded = flu_track.irma_assembly_fasta_concatenated_padded
     File? irma_ha_segment_fasta = flu_track.irma_ha_segment_fasta
     File? irma_na_segment_fasta = flu_track.irma_na_segment_fasta
     File? irma_pa_segment_fasta = flu_track.irma_pa_segment_fasta

@@ -57,6 +57,9 @@ workflow theiacov_ont {
     String? pangolin_docker_image
     # qc check parameters
     File? qc_check_table
+    ## flu specific inputs
+    # default set to 50 for ONT data in call block below, following CDC MIRA standards
+    Int? irma_min_consensus_support
   }
   call set_organism_defaults.organism_parameters {
     input:
@@ -147,7 +150,8 @@ workflow theiacov_ont {
             read1 = read_qc_trim.read1_clean,
             samplename = samplename,
             standardized_organism = organism_parameters.standardized_organism,
-            seq_method = seq_method
+            seq_method = seq_method,
+            irma_min_consensus_support = select_first([irma_min_consensus_support, 50])
         }
       }
       # nanoplot for basic QC metrics
@@ -351,6 +355,14 @@ workflow theiacov_ont {
     String? nextclade_clade = nextclade_output_parser.nextclade_clade
     String? nextclade_lineage = nextclade_output_parser.nextclade_lineage
     String? nextclade_qc = nextclade_output_parser.nextclade_qc
+    # Nextclade outputs for flu H5N1
+    File? nextclade_json_flu_h5n1 = flu_track.nextclade_json_flu_h5n1
+    File? auspice_json_flu_h5n1 = flu_track.auspice_json_flu_h5n1
+    File? nextclade_tsv_flu_h5n1 = flu_track.nextclade_tsv_flu_h5n1
+    String? nextclade_aa_subs_flu_h5n1 = flu_track.nextclade_aa_subs_flu_h5n1
+    String? nextclade_aa_dels_flu_h5n1 = flu_track.nextclade_aa_dels_flu_h5n1
+    String? nextclade_clade_flu_h5n1 = flu_track.nextclade_clade_flu_h5n1
+    String? nextclade_qc_flu_h5n1 = flu_track.nextclade_qc_flu_h5n1
     # Nextclade outputs for flu HA
     File? nextclade_json_flu_ha = flu_track.nextclade_json_flu_ha
     File? auspice_json_flu_ha = flu_track.auspice_json_flu_ha
@@ -381,9 +393,12 @@ workflow theiacov_ont {
     # Flu IRMA Outputs
     String? irma_version = flu_track.irma_version
     String? irma_docker = flu_track.irma_docker
+    Int? irma_min_consensus_support_threshold = flu_track.irma_minimum_consensus_support
     String? irma_type = flu_track.irma_type
     String? irma_subtype = flu_track.irma_subtype
     String? irma_subtype_notes = flu_track.irma_subtype_notes
+    File? irma_assembly_fasta_concatenated = flu_track.irma_assembly_fasta_concatenated
+    File? irma_assembly_fasta_concatenated_padded = flu_track.irma_assembly_fasta_concatenated_padded
     File? irma_ha_segment_fasta = flu_track.irma_ha_segment_fasta
     File? irma_na_segment_fasta = flu_track.irma_na_segment_fasta
     File? irma_pa_segment_fasta = flu_track.irma_pa_segment_fasta