diff --git a/.dockstore.yml b/.dockstore.yml index bda39fc51..12a5ed2a2 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -50,6 +50,11 @@ workflows: primaryDescriptorPath: /pipes/WDL/workflows/augur_from_msa.wdl testParameterFiles: - empty.json + - name: augur_from_msa_with_subsampler + subclass: WDL + primaryDescriptorPath: /pipes/WDL/workflows/augur_from_msa_with_subsampler.wdl + testParameterFiles: + - empty.json - name: bams_multiqc subclass: WDL primaryDescriptorPath: /pipes/WDL/workflows/bams_multiqc.wdl @@ -324,6 +329,11 @@ workflows: primaryDescriptorPath: /pipes/WDL/workflows/scaffold_and_refine.wdl testParameterFiles: - empty.json + - name: subsample_by_casecounts + subclass: WDL + primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_casecounts.wdl + testParameterFiles: + - empty.json - name: subsample_by_metadata subclass: WDL primaryDescriptorPath: /pipes/WDL/workflows/subsample_by_metadata.wdl @@ -358,4 +368,4 @@ workflows: subclass: WDL primaryDescriptorPath: /pipes/WDL/workflows/bam_to_qiime.wdl testParameterFiles: - - empty.json \ No newline at end of file + - empty.json diff --git a/pipes/WDL/tasks/tasks_interhost.wdl b/pipes/WDL/tasks/tasks_interhost.wdl index 14598f19c..e19f29557 100644 --- a/pipes/WDL/tasks/tasks_interhost.wdl +++ b/pipes/WDL/tasks/tasks_interhost.wdl @@ -1,5 +1,156 @@ version 1.0 +task subsample_by_cases { + meta { + description: "Run subsampler to get downsampled dataset and metadata proportional to epidemiological case counts." + } + input { + File metadata + File case_data + + String id_column + String geo_column + String date_column = "date" + String unit = "week" + + File? keep_file + File? remove_file + File? filter_file + Float baseline = 0.0001 + Int? seed_num + String? start_date + String? end_date + + String docker = "quay.io/broadinstitute/subsampler" + Int machine_mem_gb = 30 + } + command <<< + set -e -o pipefail + mkdir -p data outputs + + # decompress if compressed + echo "staging and decompressing input data files" + if [[ ~{metadata} == *.gz ]]; then + cat "~{metadata}" | pigz -d > data/metadata.tsv + elif [[ ~{metadata} == *.zst ]]; then + cat "~{metadata}" | zstd -d > data/metadata.tsv + else + ln -s "~{metadata}" data/metadata.tsv + fi + if [[ ~{case_data} == *.gz ]]; then + cat "~{case_data}" | pigz -d > data/case_data.tsv + elif [[ ~{case_data} == *.zst ]]; then + cat "~{case_data}" | zstd -d > data/case_data.tsv + else + ln -s "~{case_data}" data/case_data.tsv + fi + + ## replicate snakemake DAG manually + # rule genome_matrix + # Generate matrix of genome counts per day, for each element in column ~{geo_column} + echo "getting genome matrix" + python3 /opt/subsampler/scripts/get_genome_matrix.py \ + --metadata data/metadata.tsv \ + --index-column ~{geo_column} \ + --date-column ~{date_column} \ + ~{"--start-date " + start_date} \ + ~{"--end-date " + end_date} \ + --output outputs/genome_matrix_days.tsv + date;uptime;free + + # rule unit_conversion + # Generate matrix of genome and case counts per epiweek + echo "converting matricies to epiweeks" + python3 /opt/subsampler/scripts/aggregator.py \ + --input outputs/genome_matrix_days.tsv \ + --unit ~{unit} \ + --format integer \ + --output outputs/matrix_genomes_unit.tsv + python3 /opt/subsampler/scripts/aggregator.py \ + --input data/case_data.tsv \ + --unit ~{unit} \ + --format integer \ + ~{"--start-date " + start_date} \ + ~{"--end-date " + end_date} \ + --output outputs/matrix_cases_unit.tsv + date;uptime;free + + # rule correct_bias + # Correct under- and oversampling 
genome counts based on epidemiological data + echo "create bias-correction matrix" + python3 /opt/subsampler/scripts/correct_bias.py \ + --genome-matrix outputs/matrix_genomes_unit.tsv \ + --case-matrix outputs/matrix_cases_unit.tsv \ + --index-column code \ + ~{"--baseline " + baseline} \ + --output1 outputs/weekly_sampling_proportions.tsv \ + --output2 outputs/weekly_sampling_bias.tsv \ + --output3 outputs/matrix_genomes_unit_corrected.tsv + date;uptime;free + + # rule subsample + # Sample genomes and metadata according to the corrected genome matrix + echo "subsample data according to bias-correction" + # subsampler_timeseries says --keep is optional but actually fails if you don't specify one + cp /dev/null data/keep.txt + ~{"cp " + keep_file + " data/keep.txt"} + python3 /opt/subsampler/scripts/subsampler_timeseries.py \ + --metadata data/metadata.tsv \ + --genome-matrix outputs/matrix_genomes_unit_corrected.tsv \ + --index-column ~{id_column} \ + --geo-column ~{geo_column} \ + --date-column ~{date_column} \ + --time-unit ~{unit} \ + --keep data/keep.txt \ + ~{"--remove " + remove_file} \ + ~{"--filter-file " + filter_file} \ + ~{"--seed " + seed_num} \ + ~{"--start-date " + start_date} \ + ~{"--end-date " + end_date} \ + --weekasdate no \ + --sampled-sequences outputs/selected_sequences.txt \ + --sampled-metadata outputs/selected_metadata.tsv \ + --report outputs/sampling_stats.txt + echo '# Sampling proportion: ~{baseline}' | cat - outputs/sampling_stats.txt > temp && mv temp outputs/sampling_stats.txt + date;uptime;free + + # copy outputs from container's temp dir to host-accessible working dir for delocalization + echo "wrap up" + mv outputs/* . + # get counts + cat selected_sequences.txt | wc -l | tee NUM_OUT + # get hardware utilization + set +o pipefail + cat /proc/uptime | cut -f 1 -d ' ' > UPTIME_SEC + cat /proc/loadavg > CPU_LOAD + { cat /sys/fs/cgroup/memory/memory.max_usage_in_bytes || echo 0; } > MEM_BYTES + + >>> + runtime { + docker: docker + memory: machine_mem_gb + " GB" + cpu: 2 + disks: "local-disk 200 HDD" + disk: "200 GB" + dx_instance_type: "mem3_ssd1_v2_x4" + } + output { + File genome_matrix_days = "genome_matrix_days.tsv" + File matrix_genomes_unit = "matrix_genomes_unit.tsv" + File matrix_cases_unit = "matrix_cases_unit.tsv" + File weekly_sampling_proportions = "weekly_sampling_proportions.tsv" + File weekly_sampling_bias = "weekly_sampling_bias.tsv" + File matrix_genomes_unit_corrected = "matrix_genomes_unit_corrected.tsv" + File selected_sequences = "selected_sequences.txt" + File selected_metadata = "selected_metadata.tsv" + File sampling_stats = "sampling_stats.txt" + Int num_selected = read_int("NUM_OUT") + Int max_ram_gb = ceil(read_float("MEM_BYTES")/1000000000) + Int runtime_sec = ceil(read_float("UPTIME_SEC")) + String cpu_load = read_string("CPU_LOAD") + } +} + task multi_align_mafft_ref { input { File reference_fasta diff --git a/pipes/WDL/tasks/tasks_nextstrain.wdl b/pipes/WDL/tasks/tasks_nextstrain.wdl index 8fb09ecd8..5053c77e6 100644 --- a/pipes/WDL/tasks/tasks_nextstrain.wdl +++ b/pipes/WDL/tasks/tasks_nextstrain.wdl @@ -450,7 +450,7 @@ task nextstrain_build_subsample { File? 
drop_list Int machine_mem_gb = 50 - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" String nextstrain_ncov_repo_commit = "30435fb9ec8de2f045167fb90adfec12f123e80a" Int disk_size = 750 } @@ -594,7 +594,7 @@ task nextstrain_build_subsample { task nextstrain_ncov_defaults { input { String nextstrain_ncov_repo_commit = "30435fb9ec8de2f045167fb90adfec12f123e80a" - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 50 } command { @@ -632,7 +632,7 @@ task nextstrain_deduplicate_sequences { Boolean error_on_seq_diff = false String nextstrain_ncov_repo_commit = "30435fb9ec8de2f045167fb90adfec12f123e80a" - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 750 } @@ -686,7 +686,7 @@ task nextstrain_ncov_sanitize_gisaid_data { String? prefix_to_strip String nextstrain_ncov_repo_commit = "30435fb9ec8de2f045167fb90adfec12f123e80a" - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 750 } @@ -762,7 +762,7 @@ task filter_subsample_sequences { Array[String]? exclude_where Array[String]? include_where - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 750 } parameter_meta { @@ -846,14 +846,14 @@ task filter_sequences_to_list { File sequences Array[File]? keep_list - String out_fname = sub(sub(basename(sequences), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") - String docker = "nextstrain/base:build-20211012T204409Z" + String out_fname = sub(sub(basename(sequences, ".zst"), ".vcf", ".filtered.vcf"), ".fasta$", ".filtered.fasta") + String docker = "quay.io/broadinstitute/viral-core:2.1.33" # "nextstrain/base:build-20211012T204409Z" Int disk_size = 750 } parameter_meta { sequences: { description: "Set of sequences (unaligned fasta or aligned fasta -- one sequence per genome) or variants (vcf format) to subsample using augur filter.", - patterns: ["*.fasta", "*.fa", "*.vcf", "*.vcf.gz"] + patterns: ["*.fasta", "*.fa", "*.fasta.zst", "*.vcf", "*.vcf.gz"] } keep_list: { description: "List of strain ids.", @@ -876,13 +876,14 @@ task filter_sequences_to_list { echo filtering fasta file python3 < VERSION - AUGUR_RECURSION_LIMIT=10000 augur tree --alignment "~{msa_or_vcf}" \ + AUGUR_RECURSION_LIMIT=100000 augur tree --alignment "~{msa_or_vcf}" \ --output "~{out_basename}_~{method}.nwk" \ --method "~{method}" \ --substitution-model ~{default="GTR" substitution_model} \ @@ -1344,7 +1345,7 @@ task refine_augur_tree { String? divergence_units = "mutations" File? vcf_reference - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 750 Int machine_mem_gb = 75 } @@ -1358,7 +1359,7 @@ task refine_augur_tree { command <<< set -e augur version > VERSION - AUGUR_RECURSION_LIMIT=10000 augur refine \ + AUGUR_RECURSION_LIMIT=100000 augur refine \ --tree "~{raw_tree}" \ --alignment "~{msa_or_vcf}" \ --metadata "~{metadata}" \ @@ -1418,14 +1419,14 @@ task ancestral_traits { Float? 
sampling_bias_correction Int machine_mem_gb = 32 - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 750 } String out_basename = basename(tree, '.nwk') command <<< set -e augur version > VERSION - AUGUR_RECURSION_LIMIT=10000 augur traits \ + AUGUR_RECURSION_LIMIT=100000 augur traits \ --tree "~{tree}" \ --metadata "~{metadata}" \ --columns ~{sep=" " columns} \ @@ -1471,7 +1472,7 @@ task ancestral_tree { File? vcf_reference File? output_vcf - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 300 } parameter_meta { @@ -1484,7 +1485,7 @@ task ancestral_tree { command <<< set -e augur version > VERSION - AUGUR_RECURSION_LIMIT=10000 augur ancestral \ + AUGUR_RECURSION_LIMIT=100000 augur ancestral \ --tree "~{tree}" \ --alignment "~{msa_or_vcf}" \ --output-node-data "~{out_basename}_nt_muts.json" \ @@ -1532,14 +1533,14 @@ task translate_augur_tree { File? vcf_reference_output File? vcf_reference - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 300 } String out_basename = basename(tree, '.nwk') command <<< set -e augur version > VERSION - AUGUR_RECURSION_LIMIT=10000 augur translate --tree "~{tree}" \ + AUGUR_RECURSION_LIMIT=500000 augur translate --tree "~{tree}" \ --ancestral-sequences "~{nt_muts}" \ --reference-sequence "~{genbank_gb}" \ ~{"--vcf-reference-output " + vcf_reference_output} \ @@ -1589,14 +1590,14 @@ task tip_frequencies { Boolean include_internal_nodes = false Int machine_mem_gb = 64 - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" String out_basename = basename(tree, '.nwk') Int disk_size = 200 } command <<< set -e augur version > VERSION - AUGUR_RECURSION_LIMIT=10000 augur frequencies \ + AUGUR_RECURSION_LIMIT=100000 augur frequencies \ --method "~{method}" \ --tree "~{tree}" \ --metadata "~{metadata}" \ @@ -1645,20 +1646,20 @@ task assign_clades_to_nodes { input { File tree_nwk File nt_muts_json - File aa_muts_json + File? aa_muts_json File ref_fasta File clades_tsv - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 300 } String out_basename = basename(basename(tree_nwk, ".nwk"), "_timetree") command <<< set -e augur version > VERSION - AUGUR_RECURSION_LIMIT=10000 augur clades \ + AUGUR_RECURSION_LIMIT=100000 augur clades \ --tree "~{tree_nwk}" \ - --mutations "~{nt_muts_json}" "~{aa_muts_json}" \ + --mutations "~{nt_muts_json}" ~{'"' + aa_muts_json + '"'} \ --reference "~{ref_fasta}" \ --clades "~{clades_tsv}" \ --output-node-data "~{out_basename}_clades.json" @@ -1694,14 +1695,14 @@ task augur_import_beast { String? 
tip_date_delimiter Int machine_mem_gb = 3 - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 150 } String tree_basename = basename(beast_mcc_tree, ".tree") command <<< set -e augur version > VERSION - AUGUR_RECURSION_LIMIT=10000 augur import beast \ + AUGUR_RECURSION_LIMIT=100000 augur import beast \ --mcc "~{beast_mcc_tree}" \ --output-tree "~{tree_basename}.nwk" \ --output-node-data "~{tree_basename}.json" \ @@ -1755,7 +1756,7 @@ task export_auspice_json { String out_basename = basename(basename(tree, ".nwk"), "_timetree") Int machine_mem_gb = 64 - String docker = "nextstrain/base:build-20211012T204409Z" + String docker = "nextstrain/base:build-20230905T192825Z" Int disk_size = 300 } @@ -1803,7 +1804,7 @@ task export_auspice_json { echo --auspice-config >> exportargs echo "~{auspice_config}" >> exportargs - (export AUGUR_RECURSION_LIMIT=15000; cat exportargs | grep . | tr '\n' '\0' | xargs -0 -t augur export v2 \ + (export AUGUR_RECURSION_LIMIT=100000; cat exportargs | grep . | tr '\n' '\0' | xargs -0 -t augur export v2 \ ~{"--metadata " + sample_metadata} \ ~{"--lat-longs " + lat_longs_tsv} \ ~{"--colors " + colors_tsv} \ diff --git a/pipes/WDL/workflows/augur_from_msa_with_subsampler.wdl b/pipes/WDL/workflows/augur_from_msa_with_subsampler.wdl new file mode 100644 index 000000000..e10fda486 --- /dev/null +++ b/pipes/WDL/workflows/augur_from_msa_with_subsampler.wdl @@ -0,0 +1,171 @@ +version 1.0 + +import "../tasks/tasks_interhost.wdl" as interhost +import "../tasks/tasks_nextstrain.wdl" as nextstrain +import "../tasks/tasks_reports.wdl" as reports +import "../tasks/tasks_utils.wdl" as utils + +workflow augur_from_msa_with_subsampler { + meta { + description: "Build trees, and convert to json representation suitable for Nextstrain visualization. See https://nextstrain.org/docs/getting-started/ and https://nextstrain-augur.readthedocs.io/en/stable/" + author: "Broad Viral Genomics" + email: "viral-ngs@broadinstitute.org" + allowNestedInputs: true + } + + input { + File aligned_msa_fasta + Array[File]+ sample_metadata + File? ref_fasta + File? genbank_gb + File auspice_config + File? clades_tsv + Array[String]? ancestral_traits_to_infer + File? mask_bed + } + + parameter_meta { + aligned_msa_fasta: { + description: "Multiple sequence alignment (aligned fasta).", + patterns: ["*.fasta", "*.fa", "*.fasta.gz", "*.fa.gz", "*.fasta.zst", "*.fa.zst"] + } + sample_metadata: { + description: "Metadata in tab-separated text format. See https://nextstrain-augur.readthedocs.io/en/stable/faq/metadata.html for details. At least one tab file must be provided--if multiple are provided, they will be joined via a full left outer join using the 'strain' column as the join ID.", + patterns: ["*.txt", "*.tsv", "*.txt.gz", "*.txt.zst", "*.tsv.gz", "*.tsv.zst"] + } + ref_fasta: { + description: "A reference assembly (not included in assembly_fastas) to align assembly_fastas against. Typically from NCBI RefSeq or similar.", + patterns: ["*.fasta", "*.fa"] + } + genbank_gb: { + description: "A 'genbank' formatted gene annotation file that is used to calculate coding consequences of observed mutations. Must correspond to the same coordinate space as ref_fasta. 
Typically downloaded from the same NCBI accession number as ref_fasta.", + patterns: ["*.gb", "*.gbf"] + } + ancestral_traits_to_infer: { + description: "A list of metadata traits to use for ancestral node inference (see https://nextstrain-augur.readthedocs.io/en/stable/usage/cli/traits.html). Multiple traits may be specified; must correspond exactly to column headers in metadata file. Omitting these values will skip ancestral trait inference, and ancestral nodes will not have estimated values for metadata." + } + auspice_config: { + description: "A file specifying options to customize the auspice export; see: https://nextstrain.github.io/auspice/customise-client/introduction", + patterns: ["*.json", "*.txt"] + } + clades_tsv: { + description: "A TSV file containing clade mutation positions in four columns: [clade gene site alt]; see: https://nextstrain.org/docs/tutorials/defining-clades", + patterns: ["*.tsv", "*.txt"] + } + mask_bed: { + description: "Optional list of sites to mask when building trees.", + patterns: ["*.bed"] + } + } + + # merge tsvs if necessary + if(length(sample_metadata)>1) { + call utils.tsv_join { + input: + input_tsvs = sample_metadata, + id_col = 'strain', + out_basename = "metadata-merged", + out_suffix = ".txt.zst" + } + } + + # subsample and filter genomic data based on epi case data + call interhost.subsample_by_cases { + input: + metadata = select_first(flatten([[tsv_join.out_tsv], sample_metadata])) + } + call nextstrain.filter_sequences_to_list { + input: + sequences = aligned_msa_fasta, + keep_list = [subsample_by_cases.selected_sequences] + } + + # standard augur pipeline + if(defined(mask_bed)) { + call nextstrain.augur_mask_sites { + input: + sequences = filter_sequences_to_list.filtered_fasta, + mask_bed = mask_bed + } + } + File masked_sequences = select_first([augur_mask_sites.masked_sequences, filter_sequences_to_list.filtered_fasta]) + call nextstrain.draft_augur_tree { + input: + msa_or_vcf = masked_sequences + } + call nextstrain.refine_augur_tree { + input: + raw_tree = draft_augur_tree.aligned_tree, + msa_or_vcf = masked_sequences, + metadata = subsample_by_cases.selected_metadata + } + if(defined(ancestral_traits_to_infer) && length(select_first([ancestral_traits_to_infer,[]]))>0) { + call nextstrain.ancestral_traits { + input: + tree = refine_augur_tree.tree_refined, + metadata = subsample_by_cases.selected_metadata, + columns = select_first([ancestral_traits_to_infer,[]]) + } + } + call nextstrain.tip_frequencies { + input: + tree = refine_augur_tree.tree_refined, + metadata = subsample_by_cases.selected_metadata + } + call nextstrain.ancestral_tree { + input: + tree = refine_augur_tree.tree_refined, + msa_or_vcf = masked_sequences + } + if(defined(genbank_gb)) { + call nextstrain.translate_augur_tree { + input: + tree = refine_augur_tree.tree_refined, + nt_muts = ancestral_tree.nt_muts_json, + genbank_gb = select_first([genbank_gb]) + } + } + if(defined(clades_tsv) && defined(ref_fasta)) { + call nextstrain.assign_clades_to_nodes { + input: + tree_nwk = refine_augur_tree.tree_refined, + nt_muts_json = ancestral_tree.nt_muts_json, + aa_muts_json = translate_augur_tree.aa_muts_json, + ref_fasta = select_first([ref_fasta]), + clades_tsv = select_first([clades_tsv]) + } + } + call nextstrain.export_auspice_json { + input: + tree = refine_augur_tree.tree_refined, + sample_metadata = subsample_by_cases.selected_metadata, + node_data_jsons = select_all([ + refine_augur_tree.branch_lengths, + ancestral_traits.node_data_json, + 
ancestral_tree.nt_muts_json, + translate_augur_tree.aa_muts_json, + assign_clades_to_nodes.node_clade_data_json]), + auspice_config = auspice_config + } + + output { + File selected_metadata = subsample_by_cases.selected_metadata + File sampling_stats_file = subsample_by_cases.sampling_stats + + File masked_subsampled_msa = masked_sequences + + File ml_tree = draft_augur_tree.aligned_tree + File time_tree = refine_augur_tree.tree_refined + + Array[File] node_data_jsons = select_all([ + refine_augur_tree.branch_lengths, + ancestral_traits.node_data_json, + ancestral_tree.nt_muts_json, + translate_augur_tree.aa_muts_json, + assign_clades_to_nodes.node_clade_data_json]) + + File auspice_input_json = export_auspice_json.virus_json + File tip_frequencies_json = tip_frequencies.node_data_json + File root_sequence_json = export_auspice_json.root_sequence_json + } +} \ No newline at end of file diff --git a/pipes/WDL/workflows/sarscov2_nextstrain.wdl b/pipes/WDL/workflows/sarscov2_nextstrain.wdl index 7e35e18ee..b0cbabc98 100644 --- a/pipes/WDL/workflows/sarscov2_nextstrain.wdl +++ b/pipes/WDL/workflows/sarscov2_nextstrain.wdl @@ -15,7 +15,7 @@ workflow sarscov2_nextstrain { } input { - Array[File]+ assembly_fastas=["gs://nextstrain-data/files/ncov/open/sequences.fasta.xz"] + Array[File]+ assembly_fastas=["gs://nextstrain-data/files/ncov/open/sequences.fasta.zst"] Array[File]+ sample_metadata_tsvs=["gs://nextstrain-data/files/ncov/open/metadata.tsv.gz"] File? ref_fasta Int min_unambig_genome = 27000 @@ -57,7 +57,7 @@ workflow sarscov2_nextstrain { call utils.zcat { input: infiles = assembly_fastas, - output_name = "all_samples_combined_assembly.fasta" + output_name = "all_samples_combined_assembly.fasta.zst" } call nextstrain.nextstrain_deduplicate_sequences as dedup_seqs { @@ -215,4 +215,4 @@ workflow sarscov2_nextstrain { File root_sequence_json = export_auspice_json.root_sequence_json File auspice_input_json = export_auspice_json.virus_json } -} \ No newline at end of file +} diff --git a/pipes/WDL/workflows/sarscov2_nextstrain_aligned_input.wdl b/pipes/WDL/workflows/sarscov2_nextstrain_aligned_input.wdl index e6f822985..52b0da5fe 100644 --- a/pipes/WDL/workflows/sarscov2_nextstrain_aligned_input.wdl +++ b/pipes/WDL/workflows/sarscov2_nextstrain_aligned_input.wdl @@ -57,7 +57,7 @@ workflow sarscov2_nextstrain_aligned_input { call utils.zcat { input: infiles = aligned_sequences_fasta, - output_name = "all_samples_combined_assembly.fasta.xz" + output_name = "all_samples_combined_assembly.fasta.zst" } #### merge metadata, compute derived cols @@ -200,4 +200,4 @@ workflow sarscov2_nextstrain_aligned_input { File root_sequence_json = export_auspice_json.root_sequence_json File auspice_input_json = export_auspice_json.virus_json } -} \ No newline at end of file +} diff --git a/pipes/WDL/workflows/subsample_by_casecounts.wdl b/pipes/WDL/workflows/subsample_by_casecounts.wdl new file mode 100644 index 000000000..75ee2b397 --- /dev/null +++ b/pipes/WDL/workflows/subsample_by_casecounts.wdl @@ -0,0 +1,20 @@ +version 1.0 + +import "../tasks/tasks_interhost.wdl" as interhost + +workflow subsampler_only { + + call interhost.subsample_by_cases + + output { + File genome_matrix_days_file = subsample_by_cases.genome_matrix_days + File matrix_genomes_unit_file = subsample_by_cases.matrix_genomes_unit + File matrix_cases_unit_file = subsample_by_cases.matrix_cases_unit + File weekly_sampling_proportions_file = subsample_by_cases.weekly_sampling_proportions + File weekly_sampling_bias_file = 
subsample_by_cases.weekly_sampling_bias + File matrix_genomes_unit_corrected_file = subsample_by_cases.matrix_genomes_unit_corrected + File selected_sequences_file = subsample_by_cases.selected_sequences + File selected_metadata_file = subsample_by_cases.selected_metadata + File sampling_stats_file = subsample_by_cases.sampling_stats + } +} diff --git a/requirements-modules.txt b/requirements-modules.txt index b20c89a98..4d112e34f 100644 --- a/requirements-modules.txt +++ b/requirements-modules.txt @@ -5,7 +5,7 @@ broadinstitute/viral-phylo=2.1.20.2 broadinstitute/py3-bio=0.1.2 broadinstitute/beast-beagle-cuda=1.10.5pre broadinstitute/ncbi-tools=2.10.7.10 -nextstrain/base=build-20211012T204409Z +nextstrain/base=build-20230905T192825Z andersenlabapps/ivar=1.3.1 quay.io/staphb/pangolin=4.3.1-pdata-1.22 nextstrain/nextclade=2.12.0
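
For anyone trying out the new subsample_by_casecounts.wdl entry point, below is a minimal sketch of a Cromwell/Terra-style inputs JSON. Note that the workflow block inside that file is named subsampler_only, so call-level inputs are addressed under that prefix. This example is illustrative only and not part of the patch: the bucket paths and column values are placeholders (assumptions), id_column and geo_column must match the headers of your own metadata and case-count tables, and the remaining task inputs (date_column, unit, baseline, keep_file, remove_file, filter_file, seed_num, start_date, end_date) either have defaults or are optional in tasks_interhost.wdl.

{
  "subsampler_only.subsample_by_cases.metadata": "gs://YOUR-BUCKET/metadata.tsv.gz",
  "subsampler_only.subsample_by_cases.case_data": "gs://YOUR-BUCKET/case_counts.tsv",
  "subsampler_only.subsample_by_cases.id_column": "strain",
  "subsampler_only.subsample_by_cases.geo_column": "code"
}

One caveat worth noting: the correct_bias step in the task invokes correct_bias.py with a hardcoded --index-column code, so in practice the geography column in the case-count table (and likely the value passed as geo_column) needs to be named "code" for the matrices to line up — that is an inference from the command as written, not something verified here.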