diff --git a/workflows/amr/run.wdl b/workflows/amr/run.wdl index 1a15b8562..e3ed389c0 100644 --- a/workflows/amr/run.wdl +++ b/workflows/amr/run.wdl @@ -32,8 +32,8 @@ workflow amr { input: non_host_reads = select_all( [ - host_filter_stage.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_stage.gsnap_filter_out_gsnap_filter_2_fa + host_filter_stage.subsampled_out_subsampled_1_fa, + host_filter_stage.subsampled_out_subsampled_2_fa ] ), min_contig_length = min_contig_length, @@ -45,8 +45,8 @@ workflow amr { non_host_reads = select_first([non_host_reads, select_all( [ - host_filter_stage.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_stage.gsnap_filter_out_gsnap_filter_2_fa + host_filter_stage.subsampled_out_subsampled_1_fa, + host_filter_stage.subsampled_out_subsampled_2_fa ] )]), card_json = card_json, @@ -102,8 +102,8 @@ workflow amr { non_host_reads, select_all( [ - host_filter_stage.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_stage.gsnap_filter_out_gsnap_filter_2_fa + host_filter_stage.subsampled_out_subsampled_1_fa, + host_filter_stage.subsampled_out_subsampled_2_fa ] ) ]), diff --git a/workflows/legacy-host-filter/Dockerfile b/workflows/legacy-host-filter/Dockerfile new file mode 100644 index 000000000..0475a5a12 --- /dev/null +++ b/workflows/legacy-host-filter/Dockerfile @@ -0,0 +1,141 @@ +# syntax=docker/dockerfile:1.4 +FROM ubuntu:18.04 +ARG DEBIAN_FRONTEND=noninteractive +ARG MINIWDL_VERSION=1.1.5 + +LABEL maintainer="CZ ID Team " + +RUN sed -i s/archive.ubuntu.com/us-west-2.ec2.archive.ubuntu.com/ /etc/apt/sources.list; \ + echo 'APT::Install-Recommends "false";' > /etc/apt/apt.conf.d/98czid; \ + echo 'APT::Install-Suggests "false";' > /etc/apt/apt.conf.d/99czid + +RUN apt-get -q update && apt-get -q install -y \ + jq \ + moreutils \ + pigz \ + pixz \ + aria2 \ + httpie \ + curl \ + wget \ + zip \ + unzip \ + zlib1g-dev \ + pkg-config \ + apt-utils \ + libbz2-dev \ + liblzma-dev \ + software-properties-common \ + libarchive-tools \ + liblz4-tool \ + lbzip2 \ + docker.io \ + python3-dev \ + python3-pip \ + python3-setuptools \ + python3-wheel \ + python3-requests \ + python3-yaml \ + python3-dateutil \ + python3-psutil \ + python3-cutadapt \ + python3-scipy \ + samtools \ + fastx-toolkit \ + seqtk \ + bedtools \ + dh-autoreconf \ + nasm \ + build-essential + +# The following packages pull in python2.7 +RUN apt-get -q install -y \ + bowtie2 \ + spades \ + ncbi-blast+ + +RUN pip3 install boto3==1.23.10 marisa-trie==0.7.7 pytest +RUN pip3 install miniwdl==${MINIWDL_VERSION} miniwdl-s3parcp==0.0.5 miniwdl-s3upload==0.0.4 +RUN pip3 install https://github.com/chanzuckerberg/miniwdl-plugins/archive/f0465b0.zip#subdirectory=sfn-wdl +RUN pip3 install https://github.com/chanzuckerberg/s3mi/archive/v0.8.0.tar.gz + +ADD https://raw.githubusercontent.com/chanzuckerberg/miniwdl/v${MINIWDL_VERSION}/examples/clean_download_cache.sh /usr/local/bin +RUN chmod +x /usr/local/bin/clean_download_cache.sh + +# docker.io is the largest package at 250MB+ / half of all package disk space usage. 
+# The docker daemons never run inside the container - removing them saves 150MB+ +RUN rm -f /usr/bin/dockerd /usr/bin/containerd* + +RUN cd /usr/bin; curl -O https://amazon-ecr-credential-helper-releases.s3.amazonaws.com/0.4.0/linux-amd64/docker-credential-ecr-login +RUN chmod +x /usr/bin/docker-credential-ecr-login +RUN mkdir -p /root/.docker +RUN jq -n '.credsStore="ecr-login"' > /root/.docker/config.json + +RUN curl -L -o /usr/bin/czid-dedup https://github.com/chanzuckerberg/czid-dedup/releases/download/v0.1.2/czid-dedup-Linux; chmod +x /usr/bin/czid-dedup + +# Note: bsdtar is available in libarchive-tools +# Note: python3-scipy pulls in gcc (fixed in Ubuntu 19.10) +# TODO: kSNP3 (separate phylotree image?) + +# Note: the NonHostAlignment stage uses a different version of gmap custom to CZ ID, installed here: +# https://github.com/chanzuckerberg/czid/blob/master/workflows/docker/gsnap/Dockerfile#L16-L20 +# TODO: migrate both to https://packages.ubuntu.com/focal/gmap (updates to gmap require revalidation) +RUN apt-get -q install -y gmap + +# FIXME: replace trimmomatic with cutadapt (trimmomatic pulls in too many deps) +RUN apt-get -q install -y trimmomatic +RUN ln -sf /usr/share/java/trimmomatic-0.36.jar /usr/local/bin/trimmomatic-0.38.jar + +# FIXME: replace PriceSeqFilter with cutadapt quality/N-fraction cutoff +RUN curl -s https://idseq-prod-pipeline-public-assets-us-west-2.s3-us-west-2.amazonaws.com/PriceSource140408/PriceSeqFilter > /usr/bin/PriceSeqFilter +RUN chmod +x /usr/bin/PriceSeqFilter + +RUN curl -Ls https://github.com/chanzuckerberg/s3parcp/releases/download/v0.2.0-alpha/s3parcp_0.2.0-alpha_Linux_x86_64.tar.gz | tar -C /usr/bin -xz s3parcp + +# FIXME: check if use of pandas, pysam is necessary +RUN pip3 install pysam==0.14.1 pandas==1.1.5 + +# Picard for average fragment size https://github.com/broadinstitute/picard +# r-base is a dependency of collecting input size metrics https://github.com/bioconda/bioconda-recipes/pull/16398 +RUN apt-get install -y r-base +RUN curl -L -o /usr/local/bin/picard.jar https://github.com/broadinstitute/picard/releases/download/2.21.2/picard.jar +# Create a single executable so we can use SingleCommand +RUN printf '#!/bin/bash\njava -jar /usr/local/bin/picard.jar "$@"\n' > /usr/local/bin/picard +RUN chmod +x /usr/local/bin/picard + +# install STAR, the package rna-star does not include STARlong +RUN curl -L https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz | tar xz +RUN mv STAR-2.5.3a/bin/Linux_x86_64_static/* /usr/local/bin +RUN rm -rf STAR-2.5.3a + + +RUN apt-get -y update && apt-get install -y build-essential libz-dev git python3-pip cmake + +# Host filtering (2022 version) dependencies +# fastp (libdeflate libisal (dh-autoreconf nasm)) +# hisat2 +# bowtie2 [already installed] +# kallisto + python gtfparse +WORKDIR /tmp +RUN wget -nv -O - https://github.com/intel/isa-l/archive/refs/tags/v2.30.0.tar.gz | tar zx +RUN cd isa-l-* && ./autogen.sh && ./configure && make -j8 && make install +RUN wget -nv -O - https://github.com/ebiggers/libdeflate/archive/refs/tags/v1.12.tar.gz | tar zx +RUN cd libdeflate-* && make -j8 && make install +RUN ldconfig +RUN git clone https://github.com/mlin/fastp.git && git -C fastp checkout 37edd60 +RUN cd fastp && make -j8 && ./fastp test && cp fastp /usr/local/bin +WORKDIR / +RUN wget -nv -O /tmp/HISAT2.zip https://czid-public-references.s3.us-west-2.amazonaws.com/test/hisat2/hisat2.zip \ + && unzip /tmp/HISAT2.zip && rm /tmp/HISAT2.zip +RUN curl -L 
https://github.com/pachterlab/kallisto/releases/download/v0.46.1/kallisto_linux-v0.46.1.tar.gz | tar xz -C / +RUN pip3 install gtfparse==1.2.1 + +# Uninstall build only dependencies +RUN apt-get purge -y g++ libperl4-corelibs-perl make + +COPY --from=lib idseq-dag /tmp/idseq-dag +RUN pip3 install /tmp/idseq-dag && rm -rf /tmp/idseq-dag + +COPY --from=lib idseq_utils /tmp/idseq_utils +RUN pip3 install /tmp/idseq_utils && rm -rf /tmp/idseq_utils + diff --git a/workflows/legacy-host-filter/legacy-host-filter.wdl b/workflows/legacy-host-filter/legacy-host-filter.wdl new file mode 100644 index 000000000..04c1b2c40 --- /dev/null +++ b/workflows/legacy-host-filter/legacy-host-filter.wdl @@ -0,0 +1,675 @@ +version 1.0 + +task RunValidateInput { + input { + String docker_image_id + String s3_wd_uri + Array[File] fastqs + Int max_input_fragments + String file_ext + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_validate_input \ + --step-class PipelineStepRunValidateInput \ + --step-name validate_input_out \ + --input-files '[["~{sep='","' fastqs}"]]' \ + --output-files '["validate_input_summary.json", ~{if length(fastqs) == 2 then '"valid_input1.fastq", "valid_input2.fastq"' else '"valid_input1.fastq"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"truncate_fragments_to": ~{max_input_fragments}, "file_ext": "~{file_ext}"}' + >>> + output { + String step_description_md = read_string("validate_input_out.description.md") + File validate_input_summary_json = "validate_input_summary.json" + File valid_input1_fastq = "valid_input1.fastq" + File? valid_input2_fastq = "valid_input2.fastq" + File? output_read_count = "validate_input_out.count" + File? input_read_count = "fastqs.count" + } + runtime { + docker: docker_image_id + } +} + +task RunStar { + input { + String docker_image_id + String s3_wd_uri + File validate_input_summary_json + Array[File] valid_input_fastq + File star_genome + String nucleotide_type + String host_genome + String genome_dir = "STAR_genome/part-0/" + } + command<<< + # TODO(Ryan): remove when status upload is not dependent on idseq-dag see: https://app.shortcut.com/idseq/story/163323 + # this comment is for the miniwdl plugin uploader to parse: + # --step-name star_out + set -euxo pipefail + + python3 < star_out_version.txt + rm "~{genome_dir}"/SAindex # the star genome is pretty big (1.5G) + rm "~{genome_dir}"/Genome + >>> + output { + String step_description_md = read_string("star_out.description.md") + File unmapped1_fastq = "unmapped1.fastq" + File output_log_file = "Log.final.out" + File? unmapped2_fastq = "unmapped2.fastq" + File? aligned_file = "Aligned.out.bam" + File? output_read_count = "star_out.count" + File? output_gene_file = "reads_per_gene.star.tab" + File? output_metrics_file = "picard_insert_metrics.txt" + File? output_histogram_file = "insert_size_histogram.pdf" + File? 
version = "star_out_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunTrimmomatic { + input { + String docker_image_id + String s3_wd_uri + Array[File] unmapped_fastq + File adapter_fasta + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_trimmomatic \ + --step-class PipelineStepRunTrimmomatic \ + --step-name trimmomatic_out \ + --input-files '[["~{sep='","' unmapped_fastq}"]]' \ + --output-files '[~{if length(unmapped_fastq) == 2 then '"trimmomatic1.fastq", "trimmomatic2.fastq"' else '"trimmomatic1.fastq"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"adapter_fasta": "~{adapter_fasta}"}' \ + --additional-attributes '{}' + java -jar /usr/local/bin/trimmomatic-0.38.jar -version > trimmomatic_version.txt + + >>> + output { + String step_description_md = read_string("trimmomatic_out.description.md") + File trimmomatic1_fastq = "trimmomatic1.fastq" + File? trimmomatic2_fastq = "trimmomatic2.fastq" + File? output_read_count = "trimmomatic_out.count" + File? version = "trimmomatic_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunPriceSeq { + input { + String docker_image_id + String s3_wd_uri + Array[File] trimmomatic_fastq + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_priceseq \ + --step-class PipelineStepRunPriceSeq \ + --step-name priceseq_out \ + --input-files '[["~{sep='","' trimmomatic_fastq}"]]' \ + --output-files '[~{if length(trimmomatic_fastq) == 2 then '"priceseq1.fa", "priceseq2.fa"' else '"priceseq1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{}' + PriceSeqFilter 2> /dev/null | head -n1 > priceseq_version.txt + >>> + output { + String step_description_md = read_string("priceseq_out.description.md") + File priceseq1_fa = "priceseq1.fa" + File? priceseq2_fa = "priceseq2.fa" + File? output_read_count = "priceseq_out.count" + File? version = "priceseq_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunCZIDDedup { + input { + String docker_image_id + String s3_wd_uri + Array[File] priceseq_fa + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_czid_dedup \ + --step-class PipelineStepRunCZIDDedup \ + --step-name czid_dedup_out \ + --input-files '[["~{sep='","' priceseq_fa}"]]' \ + --output-files '[~{if length(priceseq_fa) == 2 then '"dedup1.fa", "dedup2.fa"' else '"dedup1.fa"'}, "clusters.csv", "duplicate_cluster_sizes.tsv"]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{}' + czid-dedup --version > czid_dedup_version.txt + >>> + output { + String step_description_md = read_string("czid_dedup_out.description.md") + File dedup1_fa = "dedup1.fa" + File? dedup2_fa = "dedup2.fa" + File duplicate_clusters_csv = "clusters.csv" + File duplicate_cluster_sizes_tsv = "duplicate_cluster_sizes.tsv" + File? output_read_count = "czid_dedup_out.count" + File? 
version = "czid_dedup_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunLZW { + input { + String docker_image_id + String s3_wd_uri + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_lzw \ + --step-class PipelineStepRunLZW \ + --step-name lzw_out \ + --input-files '[["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"lzw1.fa", "lzw2.fa"' else '"lzw1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"thresholds": [0.45, 0.42], "threshold_readlength": 150}' + >>> + output { + String step_description_md = read_string("lzw_out.description.md") + File lzw1_fa = "lzw1.fa" + File? lzw2_fa = "lzw2.fa" + File? output_read_count = "lzw_out.count" + } + runtime { + docker: docker_image_id + } +} + +task RunBowtie2_bowtie2_out { + input { + String docker_image_id + String s3_wd_uri + Array[File] lzw_fa + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + File bowtie2_genome + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_bowtie2 \ + --step-class PipelineStepRunBowtie2 \ + --step-name bowtie2_out \ + --input-files '[["~{sep='","' lzw_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(lzw_fa) == 2 then '"bowtie2_1.fa", "bowtie2_2.fa", "bowtie2_merged.fa"' else '"bowtie2_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"bowtie2_genome": "~{bowtie2_genome}"}' \ + --additional-attributes '{"output_sam_file": "bowtie2.sam"}' + bowtie2 --version > bowtie2_version.txt + >>> + output { + String step_description_md = read_string("bowtie2_out.description.md") + File bowtie2_1_fa = "bowtie2_1.fa" + File? bowtie2_2_fa = "bowtie2_2.fa" + File? bowtie2_merged_fa = "bowtie2_merged.fa" + File? output_read_count = "bowtie2_out.count" + File? version = "bowtie2_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunSubsample { + input { + String docker_image_id + String s3_wd_uri + Array[File] bowtie2_fa + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + Int max_subsample_fragments + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_subsample \ + --step-class PipelineStepRunSubsample \ + --step-name subsampled_out \ + --input-files '[["~{sep='","' bowtie2_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"subsampled_1.fa", "subsampled_2.fa", "subsampled_merged.fa"' else '"subsampled_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"max_fragments": ~{max_subsample_fragments}}' + >>> + output { + String step_description_md = read_string("subsampled_out.description.md") + File subsampled_1_fa = "subsampled_1.fa" + File? subsampled_2_fa = "subsampled_2.fa" + File? subsampled_merged_fa = "subsampled_merged.fa" + File? 
output_read_count = "subsampled_out.count" + } + runtime { + docker: docker_image_id + } +} + +task RunStarDownstream { + input { + String docker_image_id + String s3_wd_uri + Array[File] subsampled_fa + File validate_input_summary_json + Array[File] valid_input_fastq + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + File human_star_genome + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_star_downstream \ + --step-class PipelineStepRunStarDownstream \ + --step-name star_human_out \ + --input-files '[["~{sep='","' subsampled_fa}"], ["~{validate_input_summary_json}", "~{sep='","' valid_input_fastq}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"unmapped_human_1.fa", "unmapped_human_2.fa"' else '"unmapped_human_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"star_genome": "~{human_star_genome}"}' \ + --additional-attributes '{}' + STAR --version > star_human_version.txt + >>> + output { + String step_description_md = read_string("star_human_out.description.md") + File unmapped_human_1_fa = "unmapped_human_1.fa" + File? unmapped_human_2_fa = "unmapped_human_2.fa" + File? output_read_count = "star_human_out.count" + File? version = "star_human_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunBowtie2_bowtie2_human_out { + input { + String docker_image_id + String s3_wd_uri + Array[File] unmapped_human_fa + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + File human_bowtie2_genome + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_bowtie2 \ + --step-class PipelineStepRunBowtie2 \ + --step-name bowtie2_human_out \ + --input-files '[["~{sep='","' unmapped_human_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"bowtie2_human_1.fa", "bowtie2_human_2.fa", "bowtie2_human_merged.fa"' else '"bowtie2_human_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"bowtie2_genome": "~{human_bowtie2_genome}"}' \ + --additional-attributes '{"output_sam_file": "bowtie2_human.sam"}' + bowtie2 --version > bowtie2_human_version.txt + >>> + output { + String step_description_md = read_string("bowtie2_human_out.description.md") + File bowtie2_human_1_fa = "bowtie2_human_1.fa" + File? bowtie2_human_2_fa = "bowtie2_human_2.fa" + File? bowtie2_human_merged_fa = "bowtie2_human_merged.fa" + File? output_read_count = "bowtie2_human_out.count" + File? 
version = "bowtie2_human_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunGsnapFilter { + input { + String docker_image_id + String s3_wd_uri + Array[File] subsampled_fa + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + File gsnap_genome + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_gsnap_filter \ + --step-class PipelineStepRunGsnapFilter \ + --step-name gsnap_filter_out \ + --input-files '[["~{sep='","' subsampled_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"gsnap_filter_1.fa", "gsnap_filter_2.fa", "gsnap_filter_merged.fa"' else '"gsnap_filter_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"gsnap_genome": "~{gsnap_genome}"}' \ + --additional-attributes '{"output_sam_file": "gsnap_filter.sam"}' + gsnap --version > gsnap_filter_version.txt + >>> + output { + String step_description_md = read_string("gsnap_filter_out.description.md") + File gsnap_filter_1_fa = "gsnap_filter_1.fa" + File? gsnap_filter_2_fa = "gsnap_filter_2.fa" + File? gsnap_filter_merged_fa = "gsnap_filter_merged.fa" + File? output_read_count = "gsnap_filter_out.count" + File? version = "gsnap_filter_version.txt" + } + runtime { + docker: docker_image_id + } +} + + +workflow czid_host_filter { + input { + String docker_image_id + String s3_wd_uri + File fastqs_0 + File? fastqs_1 + String file_ext + String nucleotide_type + String host_genome + File adapter_fasta + File star_genome + File bowtie2_genome + File gsnap_genome = "s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/hg38_pantro5_k16.tar" + String human_star_genome + String human_bowtie2_genome + Int max_input_fragments + Int max_subsample_fragments + } + + call RunValidateInput { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + fastqs = select_all([fastqs_0, fastqs_1]), + file_ext = file_ext, + max_input_fragments = max_input_fragments + } + + call RunStar { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + validate_input_summary_json = RunValidateInput.validate_input_summary_json, + valid_input_fastq = select_all([RunValidateInput.valid_input1_fastq, RunValidateInput.valid_input2_fastq]), + star_genome = star_genome, + nucleotide_type = nucleotide_type, + host_genome = host_genome + } + + call RunTrimmomatic { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + unmapped_fastq = select_all([RunStar.unmapped1_fastq, RunStar.unmapped2_fastq]), + adapter_fasta = adapter_fasta + } + + call RunPriceSeq { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + trimmomatic_fastq = select_all([RunTrimmomatic.trimmomatic1_fastq, RunTrimmomatic.trimmomatic2_fastq]) + } + + call RunCZIDDedup { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + priceseq_fa = select_all([RunPriceSeq.priceseq1_fa, RunPriceSeq.priceseq2_fa]) + } + + call RunLZW { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv + } + + call RunBowtie2_bowtie2_out { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + lzw_fa = 
select_all([RunLZW.lzw1_fa, RunLZW.lzw2_fa]), + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + bowtie2_genome = bowtie2_genome + } + + call RunSubsample { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + bowtie2_fa = select_all([RunBowtie2_bowtie2_out.bowtie2_1_fa, RunBowtie2_bowtie2_out.bowtie2_2_fa, RunBowtie2_bowtie2_out.bowtie2_merged_fa]), + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + max_subsample_fragments = max_subsample_fragments + } + + if (host_genome != "human") { + call RunStarDownstream { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + subsampled_fa = select_all([RunSubsample.subsampled_1_fa, RunSubsample.subsampled_2_fa, RunSubsample.subsampled_merged_fa]), + validate_input_summary_json = RunValidateInput.validate_input_summary_json, + valid_input_fastq = select_all([RunValidateInput.valid_input1_fastq, RunValidateInput.valid_input2_fastq]), + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + human_star_genome = human_star_genome + } + + call RunBowtie2_bowtie2_human_out { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + unmapped_human_fa = select_all([RunStarDownstream.unmapped_human_1_fa, RunStarDownstream.unmapped_human_2_fa]), + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + human_bowtie2_genome = human_bowtie2_genome + } + } + + Array[File] gsnap_filter_input = if (host_genome == "human") + then select_all([RunSubsample.subsampled_1_fa, RunSubsample.subsampled_2_fa, RunSubsample.subsampled_merged_fa]) + else select_all([RunBowtie2_bowtie2_human_out.bowtie2_human_1_fa, RunBowtie2_bowtie2_human_out.bowtie2_human_2_fa, RunBowtie2_bowtie2_human_out.bowtie2_human_merged_fa]) + + call RunGsnapFilter { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + subsampled_fa = gsnap_filter_input, + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + gsnap_genome = gsnap_genome + } + + output { + File validate_input_out_validate_input_summary_json = RunValidateInput.validate_input_summary_json + File? validate_input_out_count = RunValidateInput.output_read_count + File star_out_unmapped1_fastq = RunStar.unmapped1_fastq + File? star_out_unmapped2_fastq = RunStar.unmapped2_fastq + File? star_out_log_file = RunStar.output_log_file + File? star_out_count = RunStar.output_read_count + File? star_version = RunStar.version + File trimmomatic_out_trimmomatic1_fastq = RunTrimmomatic.trimmomatic1_fastq + File? trimmomatic_out_trimmomatic2_fastq = RunTrimmomatic.trimmomatic2_fastq + File? trimmomatic_out_count = RunTrimmomatic.output_read_count + File? trimmomatic_version = RunTrimmomatic.version + File priceseq_out_priceseq1_fa = RunPriceSeq.priceseq1_fa + File? 
priceseq_out_priceseq2_fa = RunPriceSeq.priceseq2_fa + File? priceseq_out_count = RunPriceSeq.output_read_count + File? priceseq_version = RunPriceSeq.version + File czid_dedup_out_dedup1_fa = RunCZIDDedup.dedup1_fa + File? czid_dedup_out_dedup2_fa = RunCZIDDedup.dedup2_fa + File czid_dedup_out_duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv + File czid_dedup_out_duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv + File? czid_dedup_out_count = RunCZIDDedup.output_read_count + File? czid_dedup_version = RunCZIDDedup.version + File lzw_out_lzw1_fa = RunLZW.lzw1_fa + File? lzw_out_lzw2_fa = RunLZW.lzw2_fa + File? lzw_out_count = RunLZW.output_read_count + File bowtie2_out_bowtie2_1_fa = RunBowtie2_bowtie2_out.bowtie2_1_fa + File? bowtie2_out_bowtie2_2_fa = RunBowtie2_bowtie2_out.bowtie2_2_fa + File? bowtie2_out_bowtie2_merged_fa = RunBowtie2_bowtie2_out.bowtie2_merged_fa + File? bowtie2_out_count = RunBowtie2_bowtie2_out.output_read_count + File? bowtie2_version = RunBowtie2_bowtie2_out.version + File subsampled_out_subsampled_1_fa = RunSubsample.subsampled_1_fa + File? subsampled_out_subsampled_2_fa = RunSubsample.subsampled_2_fa + File? subsampled_out_subsampled_merged_fa = RunSubsample.subsampled_merged_fa + File? subsampled_out_count = RunSubsample.output_read_count + File? star_human_out_unmapped_human_1_fa = RunStarDownstream.unmapped_human_1_fa + File? star_human_out_unmapped_human_2_fa = RunStarDownstream.unmapped_human_2_fa + File? star_human_out_count = RunStarDownstream.output_read_count + File? star_human_version = RunStarDownstream.version + File? bowtie2_human_out_bowtie2_human_1_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_1_fa + File? bowtie2_human_out_bowtie2_human_2_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_2_fa + File? bowtie2_human_out_bowtie2_human_merged_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_merged_fa + File? bowtie2_human_out_count = RunBowtie2_bowtie2_human_out.output_read_count + File? bowtie2_human_version = RunBowtie2_bowtie2_human_out.version + File gsnap_filter_out_gsnap_filter_1_fa = RunGsnapFilter.gsnap_filter_1_fa + File? gsnap_filter_out_gsnap_filter_2_fa = RunGsnapFilter.gsnap_filter_2_fa + File? gsnap_filter_out_gsnap_filter_merged_fa = RunGsnapFilter.gsnap_filter_merged_fa + File? gsnap_filter_out_count = RunGsnapFilter.output_read_count + File? gsnap_filter_version = RunGsnapFilter.version + File? input_read_count = RunValidateInput.input_read_count + File? output_gene_file = RunStar.output_gene_file + File? output_metrics_file = RunStar.output_metrics_file + File? 
output_histogram_file = RunStar.output_histogram_file + } +} diff --git a/workflows/legacy-host-filter/stage_io_map.json b/workflows/legacy-host-filter/stage_io_map.json new file mode 100644 index 000000000..565be202a --- /dev/null +++ b/workflows/legacy-host-filter/stage_io_map.json @@ -0,0 +1,38 @@ +{ + "NonHostAlignment":{ + "host_filter_out_gsnap_filter_1_fa":"gsnap_filter_out_gsnap_filter_1_fa", + "host_filter_out_gsnap_filter_2_fa":"gsnap_filter_out_gsnap_filter_2_fa", + "host_filter_out_gsnap_filter_merged_fa":"gsnap_filter_out_gsnap_filter_merged_fa", + "duplicate_cluster_sizes_tsv":"czid_dedup_out_duplicate_cluster_sizes_tsv", + "czid_dedup_out_duplicate_clusters_csv":"czid_dedup_out_duplicate_clusters_csv" + }, + "Postprocess":{ + "host_filter_out_gsnap_filter_1_fa":"gsnap_filter_out_gsnap_filter_1_fa", + "host_filter_out_gsnap_filter_2_fa":"gsnap_filter_out_gsnap_filter_2_fa", + "host_filter_out_gsnap_filter_merged_fa":"gsnap_filter_out_gsnap_filter_merged_fa", + "gsnap_out_gsnap_m8":"gsnap_out_gsnap_m8", + "gsnap_out_gsnap_deduped_m8":"gsnap_out_gsnap_deduped_m8", + "gsnap_out_gsnap_hitsummary_tab":"gsnap_out_gsnap_hitsummary_tab", + "gsnap_out_gsnap_counts_with_dcr_json":"gsnap_out_gsnap_counts_with_dcr_json", + "rapsearch2_out_rapsearch2_m8":"rapsearch2_out_rapsearch2_m8", + "rapsearch2_out_rapsearch2_deduped_m8":"rapsearch2_out_rapsearch2_deduped_m8", + "rapsearch2_out_rapsearch2_hitsummary_tab":"rapsearch2_out_rapsearch2_hitsummary_tab", + "rapsearch2_out_rapsearch2_counts_with_dcr_json":"rapsearch2_out_rapsearch2_counts_with_dcr_json", + "duplicate_cluster_sizes_tsv":"czid_dedup_out_duplicate_cluster_sizes_tsv", + "czid_dedup_out_duplicate_clusters_csv":"czid_dedup_out_duplicate_clusters_csv" + }, + "Experimental":{ + "taxid_fasta_in_annotated_merged_fa":"annotated_out_annotated_merged_fa", + "taxid_fasta_in_gsnap_hitsummary_tab":"gsnap_out_gsnap_hitsummary_tab", + "taxid_fasta_in_rapsearch2_hitsummary_tab":"rapsearch2_out_rapsearch2_hitsummary_tab", + "gsnap_m8_gsnap_deduped_m8":"gsnap_out_gsnap_deduped_m8", + "refined_gsnap_in_gsnap_reassigned_m8":"refined_gsnap_out_assembly_gsnap_reassigned_m8", + "refined_gsnap_in_gsnap_hitsummary2_tab":"refined_gsnap_out_assembly_gsnap_hitsummary2_tab", + "refined_gsnap_in_gsnap_blast_top_m8":"refined_gsnap_out_assembly_gsnap_blast_top_m8", + "contig_in_contig_coverage_json":"coverage_out_assembly_contig_coverage_json", + "contig_in_contig_stats_json":"assembly_out_assembly_contig_stats_json", + "contig_in_contigs_fasta":"assembly_out_assembly_contigs_fasta", + "nonhost_fasta_refined_taxid_annot_fasta":"refined_taxid_fasta_out_assembly_refined_taxid_annot_fasta", + "duplicate_clusters_csv":"czid_dedup_out_duplicate_clusters_csv" + } +} diff --git a/workflows/legacy-host-filter/test/__init__.py b/workflows/legacy-host-filter/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/workflows/legacy-host-filter/test/duplicate_cluster_sizes.tsv b/workflows/legacy-host-filter/test/duplicate_cluster_sizes.tsv new file mode 100644 index 000000000..51ba71020 --- /dev/null +++ b/workflows/legacy-host-filter/test/duplicate_cluster_sizes.tsv @@ -0,0 +1,22 @@ +1 NC_007795.1_62__benchmark_lineage_93061_1280_1279_90964__s0000001323 +1 NC_007795.1_62__benchmark_lineage_93061_1280_1279_90964__s0000004573 +1 NC_007795.1_64__benchmark_lineage_93061_1280_1279_90964__s0000000966 +1 NC_007795.1_65__benchmark_lineage_93061_1280_1279_90964__s0000001325 +1 NC_007795.1_66__benchmark_lineage_93061_1280_1279_90964__s0000001061 +1 
NC_007795.1_68__benchmark_lineage_93061_1280_1279_90964__s0000001151 +1 NC_007795.1_8__benchmark_lineage_93061_1280_1279_90964__s0000002124 +1 NC_016845.1_16__benchmark_lineage_1125630_573_570_543__s0000001766 +1 NC_016845.1_19__benchmark_lineage_1125630_573_570_543__s0000002195 +1 NC_016845.1_29__benchmark_lineage_1125630_573_570_543__s0000004269 +1 NC_016845.1_36__benchmark_lineage_1125630_573_570_543__s0000002278 +1 NC_016845.1_38__benchmark_lineage_1125630_573_570_543__s0000000459 +1 NC_016845.1_40__benchmark_lineage_1125630_573_570_543__s0000002827 +1 NC_016845.1_44__benchmark_lineage_1125630_573_570_543__s0000001495 +1 NC_016845.1_46__benchmark_lineage_1125630_573_570_543__s0000000467 +1 NC_016845.1_4__benchmark_lineage_1125630_573_570_543__s0000003258 +1 NC_016845.1_53__benchmark_lineage_1125630_573_570_543__s0000001392 +1 NC_016845.1_54__benchmark_lineage_1125630_573_570_543__s0000001251 +1 NC_016845.1_57__benchmark_lineage_1125630_573_570_543__s0000002297 +1 NC_016845.1_60__benchmark_lineage_1125630_573_570_543__s0000003310 +1 NC_016845.1_65__benchmark_lineage_1125630_573_570_543__s0000002305 +1 NC_016845.1_65__benchmark_lineage_1125630_573_570_543__s0000003893 diff --git a/workflows/short-read-mngs/test/host_filter/star_inputs/valid_input1.fastq b/workflows/legacy-host-filter/test/host_filter/star_inputs/valid_input1.fastq similarity index 100% rename from workflows/short-read-mngs/test/host_filter/star_inputs/valid_input1.fastq rename to workflows/legacy-host-filter/test/host_filter/star_inputs/valid_input1.fastq diff --git a/workflows/short-read-mngs/test/host_filter/star_inputs/valid_input2.fastq b/workflows/legacy-host-filter/test/host_filter/star_inputs/valid_input2.fastq similarity index 100% rename from workflows/short-read-mngs/test/host_filter/star_inputs/valid_input2.fastq rename to workflows/legacy-host-filter/test/host_filter/star_inputs/valid_input2.fastq diff --git a/workflows/short-read-mngs/test/host_filter/star_inputs/validate_input_summary.json b/workflows/legacy-host-filter/test/host_filter/star_inputs/validate_input_summary.json similarity index 100% rename from workflows/short-read-mngs/test/host_filter/star_inputs/validate_input_summary.json rename to workflows/legacy-host-filter/test/host_filter/star_inputs/validate_input_summary.json diff --git a/workflows/short-read-mngs/test/host_filter/star_inputs/validate_input_summary_long.json b/workflows/legacy-host-filter/test/host_filter/star_inputs/validate_input_summary_long.json similarity index 100% rename from workflows/short-read-mngs/test/host_filter/star_inputs/validate_input_summary_long.json rename to workflows/legacy-host-filter/test/host_filter/star_inputs/validate_input_summary_long.json diff --git a/workflows/legacy-host-filter/test/host_filter/test_RunValidateInput_invalid.fastq b/workflows/legacy-host-filter/test/host_filter/test_RunValidateInput_invalid.fastq new file mode 100644 index 000000000..ac92b98f9 --- /dev/null +++ b/workflows/legacy-host-filter/test/host_filter/test_RunValidateInput_invalid.fastq @@ -0,0 +1,8 @@ +@NB501961:14:HM7TLBGX2:1:11102:3233:17234 1:N:0:GATCACCA+GATCACCA +CATTCGGCTGGGTTTCGTCACCCTGCGGGAAGATGCGGGTCCAGGCGATAGAGGTGCGGAAGCAT +TTGAAGCCCATCTCGGCGATCAGTTTGATGTCTTCTTTGTAGCGACCGTAGAAGTCGACGGCTTC +GTGGTTCGGGTAGTATTTTN ++ +AAAAAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEE 500 is >1 + the validation input has been modified, but there are no actual long reads + """ + + args = self.star_args + ["nucleotide_type=DNA", "host_genome=human"] + args[1] = args[1].replace(".json", "_long.json") + 
res = self.run_miniwdl(args, task="RunStar") + with open(res["outputs"]["RunStar.output_read_count"]) as f: + count = json.load(f) + self.assertEqual(count["star_out"], 100) diff --git a/workflows/legacy-host-filter/test/windows1.fastq.gz b/workflows/legacy-host-filter/test/windows1.fastq.gz new file mode 100644 index 000000000..75fd11e53 Binary files /dev/null and b/workflows/legacy-host-filter/test/windows1.fastq.gz differ diff --git a/workflows/short-read-mngs/Dockerfile b/workflows/short-read-mngs/Dockerfile index 70af1512d..12bb1ebbd 100644 --- a/workflows/short-read-mngs/Dockerfile +++ b/workflows/short-read-mngs/Dockerfile @@ -33,8 +33,7 @@ RUN sed -i s/archive.ubuntu.com/us-west-2.ec2.archive.ubuntu.com/ /etc/apt/sourc echo 'APT::Install-Recommends "false";' > /etc/apt/apt.conf.d/98czid; \ echo 'APT::Install-Suggests "false";' > /etc/apt/apt.conf.d/99czid -RUN apt-get -q update -RUN apt-get -q install -y \ +RUN apt-get -q update && apt-get -q install -y \ jq \ moreutils \ pigz \ @@ -69,6 +68,8 @@ RUN apt-get -q install -y \ fastx-toolkit \ seqtk \ bedtools \ + dh-autoreconf \ + nasm \ build-essential # The following packages pull in python2.7 @@ -116,7 +117,7 @@ RUN chmod +x /usr/bin/PriceSeqFilter RUN curl -Ls https://github.com/chanzuckerberg/s3parcp/releases/download/v0.2.0-alpha/s3parcp_0.2.0-alpha_Linux_x86_64.tar.gz | tar -C /usr/bin -xz s3parcp # FIXME: check if use of pandas, pysam is necessary -RUN apt-get -q install -y python3-pysam python3-pandas +RUN pip3 install pysam==0.14.1 pandas==1.1.5 # Workaround for srst2 refusing to work with upstream bowtie2 and samtools # FIXME: replace srst2 with a more appropriate tool @@ -160,9 +161,6 @@ RUN curl -L https://idseq-rapsearch2.s3-us-west-2.amazonaws.com/RAPSearch2.24_6 RUN sed -i -e 's|^INC.*|INC := -I /usr/include/boost|' -e 's|^LIB.*|LIB :=|' Makefile RUN make ENV PATH="${PATH}:/rapsearch2/Src/" -# Uninstall build only dependencies -RUN apt-get purge -y g++ libperl4-corelibs-perl make -WORKDIR / RUN apt-get -y update && apt-get install -y build-essential libz-dev git python3-pip cmake @@ -183,6 +181,28 @@ RUN mv diamond /usr/local/bin RUN curl -Ls https://github.com/chanzuckerberg/s3parcp/releases/download/v0.2.0-alpha/s3parcp_0.2.0-alpha_Linux_x86_64.tar.gz | tar -C /usr/bin -xz s3parcp +# Host filtering (2022 version) dependencies +# fastp (libdeflate libisal (dh-autoreconf nasm)) +# hisat2 +# bowtie2 [already installed] +# kallisto + python gtfparse +WORKDIR /tmp +RUN wget -nv -O - https://github.com/intel/isa-l/archive/refs/tags/v2.30.0.tar.gz | tar zx +RUN cd isa-l-* && ./autogen.sh && ./configure && make -j8 && make install +RUN wget -nv -O - https://github.com/ebiggers/libdeflate/archive/refs/tags/v1.12.tar.gz | tar zx +RUN cd libdeflate-* && make -j8 && make install +RUN ldconfig +RUN git clone https://github.com/mlin/fastp.git && git -C fastp checkout 37edd60 +RUN cd fastp && make -j8 && ./fastp test && cp fastp /usr/local/bin +WORKDIR / +RUN wget -nv -O /tmp/HISAT2.zip https://czid-public-references.s3.us-west-2.amazonaws.com/test/hisat2/hisat2.zip \ + && unzip /tmp/HISAT2.zip && rm /tmp/HISAT2.zip +RUN curl -L https://github.com/pachterlab/kallisto/releases/download/v0.46.1/kallisto_linux-v0.46.1.tar.gz | tar xz -C / +RUN pip3 install gtfparse==1.2.1 + +# Uninstall build only dependencies +RUN apt-get purge -y g++ libperl4-corelibs-perl make + COPY --from=lib idseq-dag /tmp/idseq-dag RUN pip3 install /tmp/idseq-dag && rm -rf /tmp/idseq-dag diff --git a/workflows/short-read-mngs/auto_benchmark/README.md 
b/workflows/short-read-mngs/auto_benchmark/README.md index 2e7ae243a..1994405d8 100644 --- a/workflows/short-read-mngs/auto_benchmark/README.md +++ b/workflows/short-read-mngs/auto_benchmark/README.md @@ -31,7 +31,7 @@ Then run desired test scenarios **either (1A)** locally **or (1B)** by submittin Prepare by building the czid-short-read-mngs docker image and enabling the miniwdl download cache: ```bash -docker build czid-workflows/short-read-mngs --tag czid-short-read-mngs +docker build czid-workflows/workflows/short-read-mngs --tag czid-short-read-mngs export MINIWDL__DOWNLOAD_CACHE__PUT=true export MINIWDL__DOWNLOAD_CACHE__GET=true export MINIWDL__DOWNLOAD_CACHE__DIR=/tmp/miniwdl_download_cache @@ -40,7 +40,7 @@ export MINIWDL__DOWNLOAD_CACHE__DIR=/tmp/miniwdl_download_cache The following invocation runs two small synthetic samples using the viral reference databases (roughly 6GB download): ```bash -czid-workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ \ --docker-image-id czid-short-read-mngs --settings default --verbose \ idseq_bench_3 idseq_bench_5 ``` @@ -48,7 +48,7 @@ czid-workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ This would run those and two other samples on the full-size databases, which takes a few hours: ```bash -czid-workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ \ --docker-image-id czid-short-read-mngs --settings default --databases full \ idseq_bench_3 idseq_bench_5 atcc_staggered atcc_even ``` @@ -60,7 +60,7 @@ The available benchmark samples are listed in [benchmarks.yml](benchmarks.yml). `run_dev.py` submits requested samples to the idseq-dev SFN-WDL backend, using a given [released version vA.B.C](https://github.com/chanzuckerberg/czid-workflows/releases) of the WDL code (not necessarily the checked-out revision!) and the full-size reference databases. The invoking shell session must be pre-configured with an appropriate AWS profile for control of the idseq-dev infrastructure. 
```bash -czid-workflows/short-read-mngs/auto_benchmark/run_dev.py --workflow-version vA.B.C \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/run_dev.py --workflow-version vA.B.C \ idseq_bench_3 idseq_bench_5 atcc_staggered atcc_even ``` @@ -80,7 +80,7 @@ taxadb create -i taxadb --dbname taxadb.sqlite Harvesting local run folders generated by `run_local.py` (printed at the end of its standard output): ```bash -czid-workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite \ idseq_bench_3=my_benchmarks/idseq_bench_3/ idseq_bench_5=my_benchmarks/idseq_bench_5/ \ > my_benchmarks.json ``` @@ -88,7 +88,7 @@ czid-workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite or S3 folders from`run_dev.py`: ```bash -czid-workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite \ idseq_bench_3=s3://idseq-samples-development/auto_benchmark/YYYYMMDD_HHmmss_default_latest/idseq_bench_3/results/short-read-mngs-A/ \ idseq_bench_5=s3://idseq-samples-development/auto_benchmark/YYYYMMDD_HHmmss_default_latest/idseq_bench_5/results/short-read-mngs-A/ \ > my_benchmarks.json @@ -101,10 +101,10 @@ Finally, run the Jupyter notebook to compare the generated results with the refe ```bash docker run -v $(pwd):/mnt \ --env HARVEST_DATA=/mnt/my_benchmarks.json \ - --env REF_LIB=/mnt/czid-workflows/short-read-mngs/auto_benchmark/ref_libs/default_viral \ + --env REF_LIB=/mnt/czid-workflows/workflows/short-read-mngs/auto_benchmark/ref_libs/default_viral \ --env "RUN_NAME=default_viral_vA.B.C" \ jupyter/scipy-notebook:latest jupyter nbconvert --execute --to html --no-input --output-dir /mnt \ - /mnt/czid-workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb + /mnt/czid-workflows/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb ``` Then find `idseq-short-read-mngs-benchmarks.html` in your working directory! (Note: when using the viral databases, the precision-recall curves compared to the truth sets are very poor, correctly so because the simulated datasets include non-viral species.) @@ -116,7 +116,7 @@ Change "viral" to "full" if you used the full-sized databases. Strike `--no-inpu Suppose the results in `my_benchmarks.json` differ from the reference library in an expected way due to pipeline code changes. You can update the reference values like so: ```bash -jq .idseq_bench_3 my_benchmarks.json > czid-workflows/short-read-mngs/auto_benchmark/ref_libs/default_viral/idseq_bench_3.json +jq .idseq_bench_3 my_benchmarks.json > czid-workflows/workflows/short-read-mngs/auto_benchmark/ref_libs/default_viral/idseq_bench_3.json ``` Rerun the notebook to verify it now reports identical results, and check into git. 
@@ -126,7 +126,7 @@ Rerun the notebook to verify it now reports identical results, and check into gi You can edit the notebook by opening it in a local Jupyter server started like so: ```bash -docker run -v $(pwd)/czid-workflows/short-read-mngs/auto_benchmark:/home/jovyan \ +docker run -v $(pwd)/czid-workflows/workflows/short-read-mngs/auto_benchmark:/home/jovyan \ -p 8888:8888 jupyter/scipy-notebook:latest ``` diff --git a/workflows/short-read-mngs/auto_benchmark/benchmarks.yml b/workflows/short-read-mngs/auto_benchmark/benchmarks.yml index 12679b7f2..24663e5ee 100644 --- a/workflows/short-read-mngs/auto_benchmark/benchmarks.yml +++ b/workflows/short-read-mngs/auto_benchmark/benchmarks.yml @@ -17,11 +17,11 @@ settings: databases: viral: - host_filter.star_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar - host_filter.bowtie2_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/bowtie2_genome.tar - host_filter.gsnap_genome: s3://czid-public-references/test/gsnap/ERCC_gsnap2017-11-15_k16.tar - host_filter.human_star_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar - host_filter.human_bowtie2_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/bowtie2_genome.tar + host_filter.bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar + host_filter.hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar + host_filter.kallisto_idx: s3://public-test-bucket-idseq/host_filter/human/2022/kallisto_idx/GRCh38_ERCC.kallisto.idx + host_filter.human_bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar + host_filter.human_hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar minimap2_local_db_path: s3://czid-public-references/test/viral-alignment-indexes/viral_nt diamond_local_db_path: s3://czid-public-references/test/viral-alignment-indexes/viral_nr diamond_args: mid-sensitive diff --git a/workflows/short-read-mngs/auto_benchmark/harvest.py b/workflows/short-read-mngs/auto_benchmark/harvest.py index c3b175a0e..ebd6f7d3c 100755 --- a/workflows/short-read-mngs/auto_benchmark/harvest.py +++ b/workflows/short-read-mngs/auto_benchmark/harvest.py @@ -90,7 +90,7 @@ def harvest_sample(sample, outputs_json, taxadb): # collect read counts at various pipeline steps ans["paired"] = ( - outputs_json["czid_short_read_mngs.host_filter.star_out_unmapped2_fastq"] + outputs_json["czid_short_read_mngs.host_filter.fastp_out_fastp2_fastq"] is not None ) ans["input_reads"] = read_output_jsonfile(outputs_json, "host_filter.input_read_count")[ @@ -98,14 +98,11 @@ def harvest_sample(sample, outputs_json, taxadb): ] for step in [ "validate_input", - "star", - "trimmomatic", - "priceseq", + "fastp", + "bowtie2_host_filtered", + "hisat2_host_filtered", "czid_dedup", - "lzw", - "bowtie2", "subsampled", - "gsnap_filter", ]: ans[step + "_reads"] = read_output_jsonfile( outputs_json, "host_filter." 
+ step + "_out_count" diff --git a/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb b/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb index a46e0fbbd..14d3fd37f 100644 --- a/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb +++ b/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb @@ -202,9 +202,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "def taxa_dataframe(sample_data, sample_reads, db):\n", @@ -218,7 +216,7 @@ "\n", "def joined_taxa_dataframe(sample_data):\n", " # figure rPM denominator\n", - " sample_reads = sample_data[\"counts\"][\"gsnap_filter_reads\"]\n", + " sample_reads = sample_data[\"counts\"][\"subsampled_reads\"]\n", " if sample_data[\"counts\"][\"paired\"]:\n", " sample_reads *= 2\n", "\n", @@ -287,9 +285,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "for sample, joined in taxa_tables.items():\n", @@ -338,14 +334,23 @@ "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" }, "kernelspec": { - "display_name": "Python 3.9.1 64-bit", + "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/workflows/short-read-mngs/host_filter.wdl b/workflows/short-read-mngs/host_filter.wdl index 04c1b2c40..6268c7739 100644 --- a/workflows/short-read-mngs/host_filter.wdl +++ b/workflows/short-read-mngs/host_filter.wdl @@ -1,675 +1,887 @@ version 1.0 -task RunValidateInput { +# CZ ID short-read-mngs pipeline stage 1 (2022 version): +# - input validation & QC +# - host & human filtering +# - deduplication +# - subsampling +workflow czid_host_filter { input { + File fastqs_0 + File? fastqs_1 + String nucleotide_type = "DNA" + + File adapter_fasta + + String host_genome + File bowtie2_index_tar + File hisat2_index_tar + File kallisto_idx + File? gtf_gz # Ensembl GTF for host species + + File human_bowtie2_index_tar + File human_hisat2_index_tar + + Int max_input_fragments + Int max_subsample_fragments + + Int cpu = 16 String docker_image_id + + # legacy idseq-dag inputs: + String file_ext = "fastq" String s3_wd_uri - Array[File] fastqs - Int max_input_fragments - String file_ext } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_validate_input \ - --step-class PipelineStepRunValidateInput \ - --step-name validate_input_out \ - --input-files '[["~{sep='","' fastqs}"]]' \ - --output-files '["validate_input_summary.json", ~{if length(fastqs) == 2 then '"valid_input1.fastq", "valid_input2.fastq"' else '"valid_input1.fastq"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{"truncate_fragments_to": ~{max_input_fragments}, "file_ext": "~{file_ext}"}' - >>> - output { - String step_description_md = read_string("validate_input_out.description.md") - File validate_input_summary_json = "validate_input_summary.json" - File valid_input1_fastq = "valid_input1.fastq" - File? valid_input2_fastq = "valid_input2.fastq" - File? 
output_read_count = "validate_input_out.count" - File? input_read_count = "fastqs.count" + + # Validate input reads (and truncate if very large) + call RunValidateInput { + input: + reads1_fastq = fastqs_0, + reads2_fastq = fastqs_1, + file_ext = file_ext, + max_input_fragments = max_input_fragments, + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri } - runtime { - docker: docker_image_id + + # Adapter trimming and QC filtering + call fastp_qc { + input: + valid_input1_fastq = RunValidateInput.valid_input1_fastq, + valid_input2_fastq = RunValidateInput.valid_input2_fastq, + adapter_fasta = adapter_fasta, + docker_image_id = docker_image_id, + cpu = cpu + } + + # Quantify host transcripts and ERCC + # NOTE: we run kallisto even if nucleotide_type == "DNA" in order to get ERCC read counts. + # The transcript & gene abundances are ~meaningless in that case, of course. This isn't a big + # wasted cost because kallisto is so fast. + call kallisto { + input: + fastp1_fastq = fastp_qc.fastp1_fastq, + fastp2_fastq = fastp_qc.fastp2_fastq, + kallisto_idx = kallisto_idx, + gtf_gz = gtf_gz, + docker_image_id = docker_image_id, + cpu = cpu } -} -task RunStar { - input { - String docker_image_id - String s3_wd_uri - File validate_input_summary_json - Array[File] valid_input_fastq - File star_genome - String nucleotide_type - String host_genome - String genome_dir = "STAR_genome/part-0/" + # Filter out host reads. + # Two stages: bowtie2 --very-sensitive-local, followed by splice-aware HISAT2. + call bowtie2_filter { + input: + fastp1_fastq = fastp_qc.fastp1_fastq, + fastp2_fastq = fastp_qc.fastp2_fastq, + index_tar = bowtie2_index_tar, + docker_image_id = docker_image_id, + cpu = cpu } - command<<< - # TODO(Ryan): remove when status upload is not dependent on idseq-dag see: https://app.shortcut.com/idseq/story/163323 - # this comment is for the miniwdl plugin uploader to parse: - # --step-name star_out - set -euxo pipefail - python3 < star_out_version.txt - rm "~{genome_dir}"/SAindex # the star genome is pretty big (1.5G) - rm "~{genome_dir}"/Genome - >>> - output { - String step_description_md = read_string("star_out.description.md") - File unmapped1_fastq = "unmapped1.fastq" - File output_log_file = "Log.final.out" - File? unmapped2_fastq = "unmapped2.fastq" - File? aligned_file = "Aligned.out.bam" - File? output_read_count = "star_out.count" - File? output_gene_file = "reads_per_gene.star.tab" - File? output_metrics_file = "picard_insert_metrics.txt" - File? output_histogram_file = "insert_size_histogram.pdf" - File? version = "star_out_version.txt" + # Deduplicate filtered reads using custom czid-dedup tool. + # It retains one exemplar [pair] from each duplicate cluster, and produces mapping from exemplar + # read name to cluster size. + call RunCZIDDedup { + input: + hisat2_filtered1_fastq = hisat2_filtered1_fastq, + hisat2_filtered2_fastq = hisat2_filtered2_fastq, + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri } - runtime { - docker: docker_image_id + + # Subsample remaining reads. + call RunSubsample { + input: + dedup1_fastq = RunCZIDDedup.dedup1_fastq, + dedup2_fastq = RunCZIDDedup.dedup2_fastq, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + max_subsample_fragments = max_subsample_fragments, + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri + } + + output { + File input_read_count = RunValidateInput.reads_in_count + File validate_input_out_valid_input1_fastq = RunValidateInput.valid_input1_fastq + File? 
validate_input_out_valid_input2_fastq = RunValidateInput.valid_input2_fastq + File validate_input_out_validate_input_summary_json = RunValidateInput.validate_input_summary_json + File validate_input_out_count = RunValidateInput.reads_out_count + + File fastp_out_fastp1_fastq = fastp_qc.fastp1_fastq + File? fastp_out_fastp2_fastq = fastp_qc.fastp2_fastq + File fastp_out_count = fastp_qc.reads_out_count + File fastp_html = fastp_qc.fastp_html + File fastp_json = fastp_qc.fastp_json + + File kallisto_transcript_abundance_tsv = kallisto.transcript_abundance_tsv + File kallisto_ERCC_counts_tsv = kallisto.ERCC_counts_tsv + File? kallisto_gene_abundance_tsv = kallisto.gene_abundance_tsv + + File bowtie2_host_filtered1_fastq = bowtie2_filter.bowtie2_host_filtered1_fastq + File? bowtie2_host_filtered2_fastq = bowtie2_filter.bowtie2_host_filtered2_fastq + File bowtie2_host_filtered_out_count = bowtie2_filter.reads_out_count + File bowtie2_host_filtered_bam = bowtie2_filter.bam + File hisat2_host_filtered1_fastq = hisat2_filter.hisat2_host_filtered1_fastq + File? hisat2_host_filtered2_fastq = hisat2_filter.hisat2_host_filtered2_fastq + File hisat2_host_filtered_out_count = hisat2_filter.reads_out_count + + File? insert_size_metrics = collect_insert_size_metrics.insert_size_metrics + File? insert_size_histogram = collect_insert_size_metrics.insert_size_histogram + + File? bowtie2_human_filtered1_fastq = bowtie2_human_filter.bowtie2_human_filtered1_fastq + File? bowtie2_human_filtered2_fastq = bowtie2_human_filter.bowtie2_human_filtered2_fastq + File? bowtie2_human_filtered_out_count = bowtie2_human_filter.reads_out_count + File? hisat2_human_filtered1_fastq = hisat2_human_filter.hisat2_human_filtered1_fastq + File? hisat2_human_filtered2_fastq = hisat2_human_filter.hisat2_human_filtered2_fastq + File? hisat2_human_filtered_out_count = hisat2_human_filter.reads_out_count + + File czid_dedup_out_dedup1_fastq = RunCZIDDedup.dedup1_fastq + File? czid_dedup_out_dedup2_fastq = RunCZIDDedup.dedup2_fastq + File czid_dedup_out_duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv + File czid_dedup_out_duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv + File czid_dedup_out_count = RunCZIDDedup.reads_out_count + + File subsampled_out_subsampled_1_fa = RunSubsample.subsampled_1_fa + File? subsampled_out_subsampled_2_fa = RunSubsample.subsampled_2_fa + File? subsampled_out_subsampled_merged_fa = RunSubsample.subsampled_merged_fa + File subsampled_out_count = RunSubsample.reads_out_count } } -task RunTrimmomatic { +task RunValidateInput { input { + File reads1_fastq + File? 
reads2_fastq + String file_ext + + Int max_input_fragments + String docker_image_id String s3_wd_uri - Array[File] unmapped_fastq - File adapter_fasta } + Boolean paired = defined(reads2_fastq) command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_trimmomatic \ - --step-class PipelineStepRunTrimmomatic \ - --step-name trimmomatic_out \ - --input-files '[["~{sep='","' unmapped_fastq}"]]' \ - --output-files '[~{if length(unmapped_fastq) == 2 then '"trimmomatic1.fastq", "trimmomatic2.fastq"' else '"trimmomatic1.fastq"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"adapter_fasta": "~{adapter_fasta}"}' \ - --additional-attributes '{}' - java -jar /usr/local/bin/trimmomatic-0.38.jar -version > trimmomatic_version.txt - + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_validate_input \ + --step-class PipelineStepRunValidateInput \ + --step-name validate_input_out \ + --input-files '[["~{sep='","' select_all([reads1_fastq, reads2_fastq])}"]]' \ + --output-files '["validate_input_summary.json", ~{if paired then '"valid_input1.fastq", "valid_input2.fastq"' else '"valid_input1.fastq"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"truncate_fragments_to": ~{max_input_fragments}, "file_ext": "~{file_ext}"}' >>> output { - String step_description_md = read_string("trimmomatic_out.description.md") - File trimmomatic1_fastq = "trimmomatic1.fastq" - File? trimmomatic2_fastq = "trimmomatic2.fastq" - File? output_read_count = "trimmomatic_out.count" - File? version = "trimmomatic_version.txt" + String step_description_md = read_string("validate_input_out.description.md") + File validate_input_summary_json = "validate_input_summary.json" + File valid_input1_fastq = "valid_input1.fastq" + File? valid_input2_fastq = "valid_input2.fastq" + File reads_out_count = "validate_input_out.count" + File reads_in_count = "fastqs.count" } runtime { docker: docker_image_id + cpu: 4 + memory: "8G" } } -task RunPriceSeq { +task fastp_qc { + # fastp all-in-one for + # - adapter trimming + # - quality filtering + # - complexity filtering input { + File valid_input1_fastq + File? 
valid_input2_fastq + File adapter_fasta + + # These default QC thresholds are loosely based on the pre-2022 pipeline using PriceSeq & LZW + String fastp_options = "--dont_eval_duplication --length_required 35" + + " --qualified_quality_phred 17 --unqualified_percent_limit 15 --n_base_limit 15" + + " --sdust_complexity_filter --complexity_threshold 60" + String docker_image_id - String s3_wd_uri - Array[File] trimmomatic_fastq + Int cpu = 16 } + Boolean paired = defined(valid_input2_fastq) + String fastp_invocation = "fastp" + + " -i ${valid_input1_fastq} ${'-I ' + valid_input2_fastq}" + + " -o fastp1.fastq ${if (paired) then '-O fastp2.fastq' else ''}" + + " -w ${cpu} ${fastp_options}" + + " --adapter_fasta ${adapter_fasta} ${if (paired) then '--detect_adapter_for_pe' else ''}" + command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_priceseq \ - --step-class PipelineStepRunPriceSeq \ - --step-name priceseq_out \ - --input-files '[["~{sep='","' trimmomatic_fastq}"]]' \ - --output-files '[~{if length(trimmomatic_fastq) == 2 then '"priceseq1.fa", "priceseq2.fa"' else '"priceseq1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{}' - PriceSeqFilter 2> /dev/null | head -n1 > priceseq_version.txt + set -euxo pipefail + ~{fastp_invocation} + count="$(jq .read1_after_filtering.total_reads fastp.json)" + if [ '~{paired}' == 'true' ]; then + count=$((2 * count)) + fi + jq --null-input --arg count "$count" '{"fastp_out":$count}' > fastp_out.count + # TODO: extract insert size metrics from JSON, also render histogram? + + python3 - << 'EOF' + import textwrap + with open("fastp.description.md", "w") as outfile: + print(textwrap.dedent(""" + **fastp read trimming & filtering** + + Processes the reads using [fastp](https://github.com/OpenGene/fastp): + + 1. Trim adapters + 2. Quality score filter + 3. Non-called base (N) filter + 4. Length filter + 5. Complexity filter ([custom feature](https://github.com/mlin/fastp/tree/mlin/sdust) + using the [SDUST algorithm](https://pubmed.ncbi.nlm.nih.gov/16796549/)) + + fastp is run on the FASTQ file(s) from input validation: + ``` + ~{fastp_invocation} + ``` + + fastp documentation can be found [here](https://github.com/OpenGene/fastp) + """).strip(), file=outfile) + EOF >>> output { - String step_description_md = read_string("priceseq_out.description.md") - File priceseq1_fa = "priceseq1.fa" - File? priceseq2_fa = "priceseq2.fa" - File? output_read_count = "priceseq_out.count" - File? version = "priceseq_version.txt" + String step_description_md = read_string("fastp.description.md") + File fastp1_fastq = "fastp1.fastq" + File? fastp2_fastq = "fastp2.fastq" + File fastp_html = "fastp.html" + File fastp_json = "fastp.json" + File reads_out_count = "fastp_out.count" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu}G" } } -task RunCZIDDedup { +task kallisto { input { + File fastp1_fastq + File? fastp2_fastq + File kallisto_idx + File? 
gtf_gz + String kallisto_options = "" + String docker_image_id - String s3_wd_uri - Array[File] priceseq_fa - } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_czid_dedup \ - --step-class PipelineStepRunCZIDDedup \ - --step-name czid_dedup_out \ - --input-files '[["~{sep='","' priceseq_fa}"]]' \ - --output-files '[~{if length(priceseq_fa) == 2 then '"dedup1.fa", "dedup2.fa"' else '"dedup1.fa"'}, "clusters.csv", "duplicate_cluster_sizes.tsv"]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{}' - czid-dedup --version > czid_dedup_version.txt + Int cpu = 16 + } + Boolean paired = defined(fastp2_fastq) + # TODO: input fragment length parameters for non-paired-end (l = average, s = std dev) + String kallisto_invocation = "/kallisto/kallisto quant" + + " -i '${kallisto_idx}' -o $(pwd) --plaintext ${if (paired) then '' else '--single -l 200 -s 20'} ${kallisto_options} -t ${cpu}" + + " '~{fastp1_fastq}'" + if (defined(fastp2_fastq)) then " '~{fastp2_fastq}'" else "" + + command <<< + set -euxo pipefail + + # NOTE: kallisto exit code will be 1 if no reads pseudoalign, which we don't necessarily + # consider an error. Therefore decide success based on existence of run_info.json and + # abundance.tsv + ~{kallisto_invocation} || true + >&2 jq . run_info.json + + mv abundance.tsv reads_per_transcript.kallisto.tsv + + # extract ERCC counts + echo -e "target_id\test_counts" > ERCC_counts.tsv + grep ERCC- reads_per_transcript.kallisto.tsv | cut -f1,4 >> ERCC_counts.tsv + + # If we've been provided the GTF, then roll up the transcript abundance estimates by gene. + if [[ -n '~{gtf_gz}' ]]; then + python3 - reads_per_transcript.kallisto.tsv '~{gtf_gz}' << 'EOF' + # Given kallisto output tsv based on index of Ensembl transcripts FASTA, and matching + # Ensembl GTF, report the total est_counts and tpm for each gene (sum over all transcripts + # of each gene). + import sys + import pandas as pd + import gtfparse + + kallisto_df = pd.read_csv(sys.argv[1], sep="\t") + + gtf_df = gtfparse.read_gtf(sys.argv[2]) + tx_df = gtf_df[gtf_df["feature"] == "transcript"][ + ["transcript_id", "transcript_version", "gene_id"] + ] + # kallisto target_id is a versioned transcript ID e.g. "ENST00000390446.3", while the GTF + # breaks out: transcript_id "ENST00000390446"; transcript_version "3"; + # synthesize a column with the versioned transcript ID for merging. + tx_df = tx_df.assign( + transcript_id_version=tx_df["transcript_id"] + "." + tx_df["transcript_version"] + ) + + merged_df = pd.merge( + kallisto_df[["target_id", "est_counts", "tpm"]], + tx_df[["transcript_id_version", "gene_id"]], + left_on="target_id", + right_on="transcript_id_version", + ) + + gene_abundance = merged_df.groupby("gene_id").sum(numeric_only=True) + gene_abundance.to_csv("reads_per_gene.kallisto.tsv", sep="\t") + EOF + fi + + python3 - << 'EOF' + import textwrap + with open("kallisto.description.md", "w") as outfile: + print(textwrap.dedent(""" + **kallisto RNA quantification** + + Quantifies host transcripts using [kallisto](https://pachterlab.github.io/kallisto/about). + The host transcript sequences are sourced from Ensembl, along with + [ERCC control sequences](https://www.nist.gov/programs-projects/external-rna-controls-consortium). + Not all CZ ID host species have transcripts indexed; for those without, kallisto is run using ERCC + sequences only. 
+ + kallisto is run on the fastp-filtered FASTQ(s): + + ``` + ~{kallisto_invocation} + ``` + + kallisto documentation can be found [here](https://pachterlab.github.io/kallisto/manual), including + details of the `transcript_abundance.tsv` output format. + """).strip(), file=outfile) + EOF >>> + output { - String step_description_md = read_string("czid_dedup_out.description.md") - File dedup1_fa = "dedup1.fa" - File? dedup2_fa = "dedup2.fa" - File duplicate_clusters_csv = "clusters.csv" - File duplicate_cluster_sizes_tsv = "duplicate_cluster_sizes.tsv" - File? output_read_count = "czid_dedup_out.count" - File? version = "czid_dedup_version.txt" + String step_description_md = read_string("kallisto.description.md") + File transcript_abundance_tsv = "reads_per_transcript.kallisto.tsv" + File ERCC_counts_tsv = "ERCC_counts.tsv" + File? gene_abundance_tsv = "reads_per_gene.kallisto.tsv" } + runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu}G" } } -task RunLZW { +task bowtie2_filter { + # Remove reads [pairs] with bowtie2 hits to the given index input { + File fastp1_fastq + File? fastp2_fastq + + # GENOME_NAME.bowtie2.tar should contain GENOME_NAME/GENOME_NAME.*.bt* + File index_tar + String bowtie2_options = "--very-sensitive-local" + String docker_image_id - String s3_wd_uri - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv + Int cpu = 16 } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_lzw \ - --step-class PipelineStepRunLZW \ - --step-name lzw_out \ - --input-files '[["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"lzw1.fa", "lzw2.fa"' else '"lzw1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{"thresholds": [0.45, 0.42], "threshold_readlength": 150}' + + Boolean paired = defined(fastp2_fastq) + String genome_name = basename(index_tar, ".bowtie2.tar") + String bowtie2_invocation = + "bowtie2 -x '/tmp/${genome_name}/${genome_name}' ${bowtie2_options} -p ${cpu}" + + (if (paired) then " -1 '${fastp1_fastq}' -2 '${fastp2_fastq}'" else " -U '${fastp1_fastq}'") + + " -q -S '/tmp/bowtie2.sam'" + + command <<< + set -euxo pipefail + + tar xf '~{index_tar}' -C /tmp + + ~{bowtie2_invocation} + + # generate sort & compressed BAM file for archival + samtools sort -n -o "bowtie2_host.bam" -@ 4 -T /tmp "/tmp/bowtie2.sam" & samtools_pid=$! + + # Extract reads [pairs] that did NOT map to the index + if [[ '~{paired}' == 'true' ]]; then + # 1 (read paired) + # 4 (read unmapped) + # + 8 (mate unmapped) + # ---- + # 13 + samtools fastq -f 13 -1 'bowtie2_host_filtered1.fastq' -2 'bowtie2_host_filtered2.fastq' -0 /dev/null -s /dev/null /tmp/bowtie2.sam + count="$(cat bowtie2_host_filtered{1,2}.fastq | wc -l)" + else + samtools fastq -f 4 /tmp/bowtie2.sam > 'bowtie2_host_filtered1.fastq' + count="$(cat bowtie2_host_filtered1.fastq | wc -l)" + fi + + + count=$((count / 4)) + jq --null-input --arg count "$count" '{"bowtie2_host_filtered_out":$count}' > 'bowtie2_host_filtered_out.count' + + python3 - << 'EOF' + import textwrap + with open("bowtie2.description.md", "w") as outfile: + print(textwrap.dedent(""" + **bowtie2 host filtering** + + Filters out reads matching the host genome using + [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml). 
Runs + `bowtie2 ~{bowtie2_options}` using a precomputed index, then uses + [samtools](http://www.htslib.org/) to keep reads *not* mapping to the host genome. + + Bowtie2 is run on the fastp-filtered FASTQ(s): + + ``` + ~{bowtie2_invocation} + ``` + + Then, non-mapping reads are selected using `samtools fastq -f ~{if (paired) then 13 else 4}`. + + Bowtie2 documentation can be found [here](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) + """).strip(), file=outfile) + EOF + + wait $samtools_pid >>> + output { - String step_description_md = read_string("lzw_out.description.md") - File lzw1_fa = "lzw1.fa" - File? lzw2_fa = "lzw2.fa" - File? output_read_count = "lzw_out.count" + String step_description_md = read_string("bowtie2.description.md") + File bowtie2_host_filtered1_fastq = "bowtie2_host_filtered1.fastq" + File? bowtie2_host_filtered2_fastq = "bowtie2_host_filtered2.fastq" + File reads_out_count = "bowtie2_host_filtered_out.count" + File bam = "bowtie2_host.bam" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu*2}G" } } -task RunBowtie2_bowtie2_out { +task hisat2_filter { + # Remove reads [pairs] with HISAT2 hits to the given index input { + File bowtie2_host_filtered1_fastq + File? bowtie2_host_filtered2_fastq + + # GENOME_NAME.hisat2.tar should contain GENOME_NAME/GENOME_NAME.*.ht2 + File index_tar + String hisat2_options = "" + String docker_image_id - String s3_wd_uri - Array[File] lzw_fa - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - File bowtie2_genome + Int cpu = 10 } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_bowtie2 \ - --step-class PipelineStepRunBowtie2 \ - --step-name bowtie2_out \ - --input-files '[["~{sep='","' lzw_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(lzw_fa) == 2 then '"bowtie2_1.fa", "bowtie2_2.fa", "bowtie2_merged.fa"' else '"bowtie2_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"bowtie2_genome": "~{bowtie2_genome}"}' \ - --additional-attributes '{"output_sam_file": "bowtie2.sam"}' - bowtie2 --version > bowtie2_version.txt + + Boolean paired = defined(bowtie2_host_filtered2_fastq) + String genome_name = basename(index_tar, ".hisat2.tar") + String hisat2_invocation = + "/hisat2/hisat2 -x '/tmp/${genome_name}/${genome_name}' ${hisat2_options} -p ${cpu}" + + (if (paired) then " -1 '${bowtie2_host_filtered1_fastq}' -2 '${bowtie2_host_filtered2_fastq}'" else " -U '${bowtie2_host_filtered1_fastq}'") + + " -q -S /tmp/hisat2.sam" + + command <<< + set -euxo pipefail + + tar xf '~{index_tar}' -C /tmp + + ~{hisat2_invocation} + + # Extract reads [pairs] that did NOT map to the index + if [[ '~{paired}' == 'true' ]]; then + # 1 (read paired) + # 4 (read unmapped) + # + 8 (mate unmapped) + # ---- + # 13 + samtools fastq -f 13 -1 'hisat2_host_filtered1.fastq' -2 'hisat2_host_filtered2.fastq' -0 /dev/null -s /dev/null /tmp/hisat2.sam + count="$(cat hisat2_host_filtered{1,2}.fastq | wc -l)" + else + samtools fastq -f 4 /tmp/hisat2.sam > 'hisat2_host_filtered1.fastq' + count="$(cat hisat2_host_filtered1.fastq | wc -l)" + fi + + count=$((count / 4)) + jq --null-input --arg count "$count" '{"hisat2_host_filtered_out":$count}' > 'hisat2_host_filtered_out.count' + + python3 - << 'EOF' + import textwrap + with open("hisat2.description.md", "w") as outfile: + print(textwrap.dedent(""" + **HISAT2 host filtering** + + Filters out reads 
matching the host genome using + [HISAT2](http://daehwankimlab.github.io/hisat2/). Runs `hisat2` using a precomputed index, + then uses [samtools](http://www.htslib.org/) to keep reads *not* mapping to the + host genome. + + HISAT2 complements Bowtie2 with a different algorithm that also models potential RNA splice + junctions (if CZ ID indexes transcript models for the host). + + HISAT2 is run on the bowtie2-filtered FASTQ(s): + + ``` + ~{hisat2_invocation} + ``` + + Then, non-mapping reads are selected using `samtools fastq -f ~{if (paired) then 13 else 4}`. + + HISAT2 documentation can be found [here](http://daehwankimlab.github.io/hisat2/) + """).strip(), file=outfile) + EOF >>> + output { - String step_description_md = read_string("bowtie2_out.description.md") - File bowtie2_1_fa = "bowtie2_1.fa" - File? bowtie2_2_fa = "bowtie2_2.fa" - File? bowtie2_merged_fa = "bowtie2_merged.fa" - File? output_read_count = "bowtie2_out.count" - File? version = "bowtie2_version.txt" + String step_description_md = read_string("hisat2.description.md") + File hisat2_host_filtered1_fastq = "hisat2_host_filtered1.fastq" + File? hisat2_host_filtered2_fastq = "hisat2_host_filtered2.fastq" + File reads_out_count = "hisat2_host_filtered_out.count" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu*4}G" } } -task RunSubsample { +################################################################################################### +### NOTE: bowtie2_human_filter and hisat2_human_filter are roughly copy/paste of the _host_filter +### tasks above. We'd much prefer to consolidate them, but the webapp pipeline visualization +### isn't yet able to handle WDL tasks used multiple times with dynamic output filenames. +################################################################################################### + +task bowtie2_human_filter { + # Remove reads [pairs] with bowtie2 hits to the given index input { + File hisat2_host_filtered1_fastq + File? 
hisat2_host_filtered2_fastq + + # GENOME_NAME.bowtie2.tar should contain GENOME_NAME/GENOME_NAME.*.bt* + File index_tar + String bowtie2_options = "--very-sensitive-local" + String docker_image_id - String s3_wd_uri - Array[File] bowtie2_fa - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - Int max_subsample_fragments - } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_subsample \ - --step-class PipelineStepRunSubsample \ - --step-name subsampled_out \ - --input-files '[["~{sep='","' bowtie2_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"subsampled_1.fa", "subsampled_2.fa", "subsampled_merged.fa"' else '"subsampled_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{"max_fragments": ~{max_subsample_fragments}}' + Int cpu = 16 + } + + Boolean paired = defined(hisat2_host_filtered2_fastq) + String genome_name = basename(index_tar, ".bowtie2.tar") + String bowtie2_invocation = + "bowtie2 -x '/tmp/${genome_name}/${genome_name}' ${bowtie2_options} -p ${cpu}" + + (if (paired) then " -1 '${hisat2_host_filtered1_fastq}' -2 '${hisat2_host_filtered2_fastq}'" else " -U '${hisat2_host_filtered1_fastq}'") + + " -q -S '/tmp/bowtie2.sam'" + + command <<< + set -euxo pipefail + + tar xf '~{index_tar}' -C /tmp + + ~{bowtie2_invocation} + + # generate sort & compressed BAM file for archival + samtools sort -n -o "bowtie2_human.bam" -@ 4 -T /tmp "/tmp/bowtie2.sam" & samtools_pid=$! + + # Extract reads [pairs] that did NOT map to the index + if [[ '~{paired}' == 'true' ]]; then + # 1 (read paired) + # 4 (read unmapped) + # + 8 (mate unmapped) + # ---- + # 13 + samtools fastq -f 13 -1 'bowtie2_human_filtered1.fastq' -2 'bowtie2_human_filtered2.fastq' -0 /dev/null -s /dev/null /tmp/bowtie2.sam + count="$(cat bowtie2_human_filtered{1,2}.fastq | wc -l)" + else + samtools fastq -f 4 /tmp/bowtie2.sam > 'bowtie2_human_filtered1.fastq' + count="$(cat bowtie2_human_filtered1.fastq | wc -l)" + fi + + count=$((count / 4)) + jq --null-input --arg count "$count" '{"bowtie2_human_filtered_out":$count}' > 'bowtie2_human_filtered_out.count' + + python3 - << 'EOF' + import textwrap + with open("bowtie2.description.md", "w") as outfile: + print(textwrap.dedent(""" + **bowtie2 human filtering** + + Filters out reads matching the human genome using + [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml). This is similar to the + host filtering task, but CZ ID also filters non-human samples against human genome indexes + to alleviate any potential data privacy concerns. + """).strip(), file=outfile) + EOF + + wait $samtools_pid >>> + output { - String step_description_md = read_string("subsampled_out.description.md") - File subsampled_1_fa = "subsampled_1.fa" - File? subsampled_2_fa = "subsampled_2.fa" - File? subsampled_merged_fa = "subsampled_merged.fa" - File? output_read_count = "subsampled_out.count" + String step_description_md = read_string("bowtie2.description.md") + File bowtie2_human_filtered1_fastq = "bowtie2_human_filtered1.fastq" + File? 
bowtie2_human_filtered2_fastq = "bowtie2_human_filtered2.fastq" + File reads_out_count = "bowtie2_human_filtered_out.count" + File bam = "bowtie2_human.bam" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu*2}G" } } -task RunStarDownstream { +task hisat2_human_filter { + # Remove reads [pairs] with HISAT2 hits to the given index input { + File bowtie2_human_filtered1_fastq + File? bowtie2_human_filtered2_fastq + + # GENOME_NAME.hisat2.tar should contain GENOME_NAME/GENOME_NAME.*.ht2 + File index_tar + String hisat2_options = "" + String docker_image_id - String s3_wd_uri - Array[File] subsampled_fa - File validate_input_summary_json - Array[File] valid_input_fastq - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - File human_star_genome - } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_star_downstream \ - --step-class PipelineStepRunStarDownstream \ - --step-name star_human_out \ - --input-files '[["~{sep='","' subsampled_fa}"], ["~{validate_input_summary_json}", "~{sep='","' valid_input_fastq}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"unmapped_human_1.fa", "unmapped_human_2.fa"' else '"unmapped_human_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"star_genome": "~{human_star_genome}"}' \ - --additional-attributes '{}' - STAR --version > star_human_version.txt + Int cpu = 16 + } + + Boolean paired = defined(bowtie2_human_filtered2_fastq) + String genome_name = basename(index_tar, ".hisat2.tar") + String hisat2_invocation = + "/hisat2/hisat2 -x '/tmp/${genome_name}/${genome_name}' ${hisat2_options} -p ${cpu}" + + (if (paired) then " -1 '${bowtie2_human_filtered1_fastq}' -2 '${bowtie2_human_filtered2_fastq}'" else " -U '${bowtie2_human_filtered1_fastq}'") + + " -q -S /tmp/hisat2.sam" + + command <<< + set -euxo pipefail + + tar xf '~{index_tar}' -C /tmp + + ~{hisat2_invocation} + + # Extract reads [pairs] that did NOT map to the index + if [[ '~{paired}' == 'true' ]]; then + # 1 (read paired) + # 4 (read unmapped) + # + 8 (mate unmapped) + # ---- + # 13 + samtools fastq -f 13 -1 'hisat2_human_filtered1.fastq' -2 'hisat2_human_filtered2.fastq' -0 /dev/null -s /dev/null /tmp/hisat2.sam + count="$(cat hisat2_human_filtered{1,2}.fastq | wc -l)" + else + samtools fastq -f 4 /tmp/hisat2.sam > 'hisat2_human_filtered1.fastq' + count="$(cat hisat2_human_filtered1.fastq | wc -l)" + fi + + count=$((count / 4)) + jq --null-input --arg count "$count" '{"hisat2_human_filtered_out":$count}' > 'hisat2_human_filtered_out.count' + + python3 - << 'EOF' + import textwrap + with open("hisat2.description.md", "w") as outfile: + print(textwrap.dedent(""" + **HISAT2 human filtering** + + Filters out reads matching the human genome using + [HISAT2](http://daehwankimlab.github.io/hisat2/). This is similar to the host filtering + task, but CZ ID also filters non-human samples against human genome indexes to alleviate any + potential data privacy concerns. + """).strip(), file=outfile) + EOF >>> + output { - String step_description_md = read_string("star_human_out.description.md") - File unmapped_human_1_fa = "unmapped_human_1.fa" - File? unmapped_human_2_fa = "unmapped_human_2.fa" - File? output_read_count = "star_human_out.count" - File? 
version = "star_human_version.txt" + String step_description_md = read_string("hisat2.description.md") + File hisat2_human_filtered1_fastq = "hisat2_human_filtered1.fastq" + File? hisat2_human_filtered2_fastq = "hisat2_human_filtered2.fastq" + File reads_out_count = "hisat2_human_filtered_out.count" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu*2}G" } } -task RunBowtie2_bowtie2_human_out { +task collect_insert_size_metrics { input { + File bam String docker_image_id - String s3_wd_uri - Array[File] unmapped_human_fa - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - File human_bowtie2_genome } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_bowtie2 \ - --step-class PipelineStepRunBowtie2 \ - --step-name bowtie2_human_out \ - --input-files '[["~{sep='","' unmapped_human_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"bowtie2_human_1.fa", "bowtie2_human_2.fa", "bowtie2_human_merged.fa"' else '"bowtie2_human_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"bowtie2_genome": "~{human_bowtie2_genome}"}' \ - --additional-attributes '{"output_sam_file": "bowtie2_human.sam"}' - bowtie2 --version > bowtie2_human_version.txt + + command <<< + picard CollectInsertSizeMetrics 'I=~{bam}' O=picard_insert_metrics.txt H=insert_size_histogram.pdf + python3 - << 'EOF' + import textwrap + with open("collect_insert_size_metrics.description.md", "w") as outfile: + print(textwrap.dedent(""" + **Picard CollectInsertSizeMetrics** + + This step computes insert size metrics for Paired End samples. These metrics are computed by + the Broad Institute's Picard toolkit. + + Picard is run on the output BAM file obtained from running Bowtie2 on the host genome: + + ``` + picard CollectInsertSizeMetrics 'I=~{bam}' O=picard_insert_metrics.txt H=insert_size_histogram.pdf + ``` + + Picard documentation can be found [here](https://gatk.broadinstitute.org/hc/en-us/articles/360037055772-CollectInsertSizeMetrics-Picard-) + """).strip(), file=outfile) + EOF >>> + output { - String step_description_md = read_string("bowtie2_human_out.description.md") - File bowtie2_human_1_fa = "bowtie2_human_1.fa" - File? bowtie2_human_2_fa = "bowtie2_human_2.fa" - File? bowtie2_human_merged_fa = "bowtie2_human_merged.fa" - File? output_read_count = "bowtie2_human_out.count" - File? version = "bowtie2_human_version.txt" + String step_description_md = read_string("collect_insert_size_metrics.description.md") + # If no reads mapped to the host, then picard exits "successfully" without creating these files. + File? insert_size_metrics = "picard_insert_metrics.txt" + File? insert_size_histogram = "insert_size_histogram.pdf" } + runtime { docker: docker_image_id + cpu: 1 + memory: "8G" } } -task RunGsnapFilter { +task RunCZIDDedup { input { + File hisat2_filtered1_fastq + File? 
hisat2_filtered2_fastq String docker_image_id String s3_wd_uri - Array[File] subsampled_fa - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - File gsnap_genome } + Boolean paired = defined(hisat2_filtered2_fastq) command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_gsnap_filter \ - --step-class PipelineStepRunGsnapFilter \ - --step-name gsnap_filter_out \ - --input-files '[["~{sep='","' subsampled_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"gsnap_filter_1.fa", "gsnap_filter_2.fa", "gsnap_filter_merged.fa"' else '"gsnap_filter_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"gsnap_genome": "~{gsnap_genome}"}' \ - --additional-attributes '{"output_sam_file": "gsnap_filter.sam"}' - gsnap --version > gsnap_filter_version.txt + set -euxo pipefail + + >&2 idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_czid_dedup \ + --step-class PipelineStepRunCZIDDedup \ + --step-name czid_dedup_out \ + --input-files '[["~{sep='","' select_all([hisat2_filtered1_fastq, hisat2_filtered2_fastq])}"]]' \ + --output-files '[~{if paired then '"dedup1.fastq","dedup2.fastq"' else '"dedup1.fastq"'}, "clusters.csv", "duplicate_cluster_sizes.tsv"]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{}' >>> output { - String step_description_md = read_string("gsnap_filter_out.description.md") - File gsnap_filter_1_fa = "gsnap_filter_1.fa" - File? gsnap_filter_2_fa = "gsnap_filter_2.fa" - File? gsnap_filter_merged_fa = "gsnap_filter_merged.fa" - File? output_read_count = "gsnap_filter_out.count" - File? version = "gsnap_filter_version.txt" + String step_description_md = read_string("czid_dedup_out.description.md") + File dedup1_fastq = "dedup1.fastq" + File? dedup2_fastq = "dedup2.fastq" + File duplicate_clusters_csv = "clusters.csv" + File duplicate_cluster_sizes_tsv = "duplicate_cluster_sizes.tsv" + File reads_out_count = "czid_dedup_out.count" } runtime { docker: docker_image_id + cpu: 4 + memory: "16G" } } - -workflow czid_host_filter { +task RunSubsample { input { - String docker_image_id - String s3_wd_uri - File fastqs_0 - File? fastqs_1 - String file_ext - String nucleotide_type - String host_genome - File adapter_fasta - File star_genome - File bowtie2_genome - File gsnap_genome = "s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/hg38_pantro5_k16.tar" - String human_star_genome - String human_bowtie2_genome - Int max_input_fragments + File dedup1_fastq + File? 
dedup2_fastq + File duplicate_cluster_sizes_tsv Int max_subsample_fragments - } - - call RunValidateInput { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - fastqs = select_all([fastqs_0, fastqs_1]), - file_ext = file_ext, - max_input_fragments = max_input_fragments - } - - call RunStar { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - validate_input_summary_json = RunValidateInput.validate_input_summary_json, - valid_input_fastq = select_all([RunValidateInput.valid_input1_fastq, RunValidateInput.valid_input2_fastq]), - star_genome = star_genome, - nucleotide_type = nucleotide_type, - host_genome = host_genome - } - - call RunTrimmomatic { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - unmapped_fastq = select_all([RunStar.unmapped1_fastq, RunStar.unmapped2_fastq]), - adapter_fasta = adapter_fasta - } - - call RunPriceSeq { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - trimmomatic_fastq = select_all([RunTrimmomatic.trimmomatic1_fastq, RunTrimmomatic.trimmomatic2_fastq]) - } - call RunCZIDDedup { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - priceseq_fa = select_all([RunPriceSeq.priceseq1_fa, RunPriceSeq.priceseq2_fa]) - } - - call RunLZW { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv - } - - call RunBowtie2_bowtie2_out { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - lzw_fa = select_all([RunLZW.lzw1_fa, RunLZW.lzw2_fa]), - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - bowtie2_genome = bowtie2_genome - } - - call RunSubsample { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - bowtie2_fa = select_all([RunBowtie2_bowtie2_out.bowtie2_1_fa, RunBowtie2_bowtie2_out.bowtie2_2_fa, RunBowtie2_bowtie2_out.bowtie2_merged_fa]), - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - max_subsample_fragments = max_subsample_fragments - } - - if (host_genome != "human") { - call RunStarDownstream { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - subsampled_fa = select_all([RunSubsample.subsampled_1_fa, RunSubsample.subsampled_2_fa, RunSubsample.subsampled_merged_fa]), - validate_input_summary_json = RunValidateInput.validate_input_summary_json, - valid_input_fastq = select_all([RunValidateInput.valid_input1_fastq, RunValidateInput.valid_input2_fastq]), - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - human_star_genome = human_star_genome - } - - call RunBowtie2_bowtie2_human_out { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - unmapped_human_fa = select_all([RunStarDownstream.unmapped_human_1_fa, RunStarDownstream.unmapped_human_2_fa]), - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = 
RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - human_bowtie2_genome = human_bowtie2_genome - } - } - - Array[File] gsnap_filter_input = if (host_genome == "human") - then select_all([RunSubsample.subsampled_1_fa, RunSubsample.subsampled_2_fa, RunSubsample.subsampled_merged_fa]) - else select_all([RunBowtie2_bowtie2_human_out.bowtie2_human_1_fa, RunBowtie2_bowtie2_human_out.bowtie2_human_2_fa, RunBowtie2_bowtie2_human_out.bowtie2_human_merged_fa]) - - call RunGsnapFilter { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - subsampled_fa = gsnap_filter_input, - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - gsnap_genome = gsnap_genome + String docker_image_id + String s3_wd_uri } + Boolean paired = defined(dedup2_fastq) + command<<< + set -euxo pipefail + TMPDIR="${TMPDIR:-/tmp}" + + # Convert FASTQs to FASTAs: the idseq-dag subsampling tool inputs and outputs FASTAs, and + # downstream pipeline stages consume the FASTAs. + seqtk seq -a '~{dedup1_fastq}' > "$TMPDIR/reads1.fasta" & pid=$! + fastas="\"$TMPDIR/reads1.fasta\"" + if [[ '~{paired}' == 'true' ]]; then + seqtk seq -a '~{dedup2_fastq}' > "$TMPDIR/reads2.fasta" + wait $pid + # also generate merged FASTA. `seqtk mergepe` interleaves the reads but doesn't append /1 /2 to + # the names, so we add an awk kludge to do that. + seqtk mergepe "$TMPDIR/reads1.fasta" "$TMPDIR/reads2.fasta" | awk ' + BEGIN { + name = ""; + } + /^>.*/ { + if ($0 != name) { + name = $0; + printf("%s/1\n", $0); + } else { + printf("%s/2\n", $0); + } + } + ! /^>.*/ { print; } + ' > "$TMPDIR/reads_merged.fasta" + fastas="\"$TMPDIR/reads1.fasta\",\"$TMPDIR/reads2.fasta\",\"$TMPDIR/reads_merged.fasta\"" + else + wait $pid + fi + # subsample FASTAs + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_subsample \ + --step-class PipelineStepRunSubsample \ + --step-name subsampled_out \ + --input-files '[['"$fastas"'], ["~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if paired then '"subsampled_1.fa", "subsampled_2.fa", "subsampled_merged.fa"' else '"subsampled_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"max_fragments": ~{max_subsample_fragments}}' + >>> output { - File validate_input_out_validate_input_summary_json = RunValidateInput.validate_input_summary_json - File? validate_input_out_count = RunValidateInput.output_read_count - File star_out_unmapped1_fastq = RunStar.unmapped1_fastq - File? star_out_unmapped2_fastq = RunStar.unmapped2_fastq - File? star_out_log_file = RunStar.output_log_file - File? star_out_count = RunStar.output_read_count - File? star_version = RunStar.version - File trimmomatic_out_trimmomatic1_fastq = RunTrimmomatic.trimmomatic1_fastq - File? trimmomatic_out_trimmomatic2_fastq = RunTrimmomatic.trimmomatic2_fastq - File? trimmomatic_out_count = RunTrimmomatic.output_read_count - File? trimmomatic_version = RunTrimmomatic.version - File priceseq_out_priceseq1_fa = RunPriceSeq.priceseq1_fa - File? priceseq_out_priceseq2_fa = RunPriceSeq.priceseq2_fa - File? priceseq_out_count = RunPriceSeq.output_read_count - File? priceseq_version = RunPriceSeq.version - File czid_dedup_out_dedup1_fa = RunCZIDDedup.dedup1_fa - File? 
czid_dedup_out_dedup2_fa = RunCZIDDedup.dedup2_fa - File czid_dedup_out_duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv - File czid_dedup_out_duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv - File? czid_dedup_out_count = RunCZIDDedup.output_read_count - File? czid_dedup_version = RunCZIDDedup.version - File lzw_out_lzw1_fa = RunLZW.lzw1_fa - File? lzw_out_lzw2_fa = RunLZW.lzw2_fa - File? lzw_out_count = RunLZW.output_read_count - File bowtie2_out_bowtie2_1_fa = RunBowtie2_bowtie2_out.bowtie2_1_fa - File? bowtie2_out_bowtie2_2_fa = RunBowtie2_bowtie2_out.bowtie2_2_fa - File? bowtie2_out_bowtie2_merged_fa = RunBowtie2_bowtie2_out.bowtie2_merged_fa - File? bowtie2_out_count = RunBowtie2_bowtie2_out.output_read_count - File? bowtie2_version = RunBowtie2_bowtie2_out.version - File subsampled_out_subsampled_1_fa = RunSubsample.subsampled_1_fa - File? subsampled_out_subsampled_2_fa = RunSubsample.subsampled_2_fa - File? subsampled_out_subsampled_merged_fa = RunSubsample.subsampled_merged_fa - File? subsampled_out_count = RunSubsample.output_read_count - File? star_human_out_unmapped_human_1_fa = RunStarDownstream.unmapped_human_1_fa - File? star_human_out_unmapped_human_2_fa = RunStarDownstream.unmapped_human_2_fa - File? star_human_out_count = RunStarDownstream.output_read_count - File? star_human_version = RunStarDownstream.version - File? bowtie2_human_out_bowtie2_human_1_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_1_fa - File? bowtie2_human_out_bowtie2_human_2_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_2_fa - File? bowtie2_human_out_bowtie2_human_merged_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_merged_fa - File? bowtie2_human_out_count = RunBowtie2_bowtie2_human_out.output_read_count - File? bowtie2_human_version = RunBowtie2_bowtie2_human_out.version - File gsnap_filter_out_gsnap_filter_1_fa = RunGsnapFilter.gsnap_filter_1_fa - File? gsnap_filter_out_gsnap_filter_2_fa = RunGsnapFilter.gsnap_filter_2_fa - File? gsnap_filter_out_gsnap_filter_merged_fa = RunGsnapFilter.gsnap_filter_merged_fa - File? gsnap_filter_out_count = RunGsnapFilter.output_read_count - File? gsnap_filter_version = RunGsnapFilter.version - File? input_read_count = RunValidateInput.input_read_count - File? output_gene_file = RunStar.output_gene_file - File? output_metrics_file = RunStar.output_metrics_file - File? output_histogram_file = RunStar.output_histogram_file + String step_description_md = read_string("subsampled_out.description.md") + File subsampled_1_fa = "subsampled_1.fa" + File? subsampled_2_fa = "subsampled_2.fa" + File? 
subsampled_merged_fa = "subsampled_merged.fa" + File reads_out_count = "subsampled_out.count" + } + runtime { + docker: docker_image_id + cpu: 4 + memory: "8G" } } diff --git a/workflows/short-read-mngs/host_filter_defaults.yml b/workflows/short-read-mngs/host_filter_defaults.yml index a9d7458a2..eb1df7c0a 100644 --- a/workflows/short-read-mngs/host_filter_defaults.yml +++ b/workflows/short-read-mngs/host_filter_defaults.yml @@ -1,10 +1,10 @@ nucleotide_type: DNA host_genome: human -star_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/human_STAR_genome.tar # human host genome -bowtie2_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/human_bowtie2_genome.tar # human host genome -gsnap_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/hg38_pantro5_k16.tar -human_star_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/human_STAR_genome.tar -human_bowtie2_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/human_bowtie2_genome.tar +bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar +hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar +kallisto_idx: s3://public-test-bucket-idseq/host_filter/human/2022/kallisto_idx/GRCh38_ERCC.kallisto.idx +human_bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar +human_hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar adapter_fasta: s3://czid-public-references/adapter_sequences/illumina_TruSeq3-PE-2_NexteraPE-PE.fasta max_input_fragments: 75000000 max_subsample_fragments: 1000000 diff --git a/workflows/short-read-mngs/local_driver.wdl b/workflows/short-read-mngs/local_driver.wdl index 4e9062dcc..c84f89e55 100644 --- a/workflows/short-read-mngs/local_driver.wdl +++ b/workflows/short-read-mngs/local_driver.wdl @@ -29,9 +29,9 @@ workflow czid_short_read_mngs { } call stage2.czid_non_host_alignment as non_host_alignment { input: - host_filter_out_gsnap_filter_1_fa = host_filter.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_out_gsnap_filter_2_fa = host_filter.gsnap_filter_out_gsnap_filter_2_fa, - host_filter_out_gsnap_filter_merged_fa = host_filter.gsnap_filter_out_gsnap_filter_merged_fa, + host_filter_out_gsnap_filter_1_fa = host_filter.subsampled_out_subsampled_1_fa, + host_filter_out_gsnap_filter_2_fa = host_filter.subsampled_out_subsampled_2_fa, + host_filter_out_gsnap_filter_merged_fa = host_filter.subsampled_out_subsampled_merged_fa, duplicate_cluster_sizes_tsv = host_filter.czid_dedup_out_duplicate_cluster_sizes_tsv, czid_dedup_out_duplicate_clusters_csv = host_filter.czid_dedup_out_duplicate_clusters_csv, minimap2_local_db_path = minimap2_local_db_path, @@ -43,9 +43,9 @@ workflow czid_short_read_mngs { } call stage3.czid_postprocess as postprocess { input: - host_filter_out_gsnap_filter_1_fa = host_filter.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_out_gsnap_filter_2_fa = host_filter.gsnap_filter_out_gsnap_filter_2_fa, - host_filter_out_gsnap_filter_merged_fa = host_filter.gsnap_filter_out_gsnap_filter_merged_fa, + 
+      host_filter_out_gsnap_filter_1_fa = host_filter.subsampled_out_subsampled_1_fa,
+      host_filter_out_gsnap_filter_2_fa = host_filter.subsampled_out_subsampled_2_fa,
+      host_filter_out_gsnap_filter_merged_fa = host_filter.subsampled_out_subsampled_merged_fa,
       duplicate_cluster_sizes_tsv = host_filter.czid_dedup_out_duplicate_cluster_sizes_tsv,
       czid_dedup_out_duplicate_clusters_csv = host_filter.czid_dedup_out_duplicate_clusters_csv,
       gsnap_out_gsnap_m8 = non_host_alignment.gsnap_out_gsnap_m8,
diff --git a/workflows/short-read-mngs/stage_io_map.json b/workflows/short-read-mngs/stage_io_map.json
index 565be202a..6df58eebb 100644
--- a/workflows/short-read-mngs/stage_io_map.json
+++ b/workflows/short-read-mngs/stage_io_map.json
@@ -1,15 +1,15 @@
 {
   "NonHostAlignment":{
-    "host_filter_out_gsnap_filter_1_fa":"gsnap_filter_out_gsnap_filter_1_fa",
-    "host_filter_out_gsnap_filter_2_fa":"gsnap_filter_out_gsnap_filter_2_fa",
-    "host_filter_out_gsnap_filter_merged_fa":"gsnap_filter_out_gsnap_filter_merged_fa",
+    "host_filter_out_gsnap_filter_1_fa":"subsampled_out_subsampled_1_fa",
+    "host_filter_out_gsnap_filter_2_fa":"subsampled_out_subsampled_2_fa",
+    "host_filter_out_gsnap_filter_merged_fa":"subsampled_out_subsampled_merged_fa",
     "duplicate_cluster_sizes_tsv":"czid_dedup_out_duplicate_cluster_sizes_tsv",
     "czid_dedup_out_duplicate_clusters_csv":"czid_dedup_out_duplicate_clusters_csv"
   },
   "Postprocess":{
-    "host_filter_out_gsnap_filter_1_fa":"gsnap_filter_out_gsnap_filter_1_fa",
-    "host_filter_out_gsnap_filter_2_fa":"gsnap_filter_out_gsnap_filter_2_fa",
-    "host_filter_out_gsnap_filter_merged_fa":"gsnap_filter_out_gsnap_filter_merged_fa",
+    "host_filter_out_gsnap_filter_1_fa":"subsampled_out_subsampled_1_fa",
+    "host_filter_out_gsnap_filter_2_fa":"subsampled_out_subsampled_2_fa",
+    "host_filter_out_gsnap_filter_merged_fa":"subsampled_out_subsampled_merged_fa",
     "gsnap_out_gsnap_m8":"gsnap_out_gsnap_m8",
     "gsnap_out_gsnap_deduped_m8":"gsnap_out_gsnap_deduped_m8",
     "gsnap_out_gsnap_hitsummary_tab":"gsnap_out_gsnap_hitsummary_tab",
diff --git a/workflows/short-read-mngs/test/host_filter/test_RunCZIDDedup.py b/workflows/short-read-mngs/test/host_filter/test_RunCZIDDedup.py
index 7d48420da..1672cab78 100644
--- a/workflows/short-read-mngs/test/host_filter/test_RunCZIDDedup.py
+++ b/workflows/short-read-mngs/test/host_filter/test_RunCZIDDedup.py
@@ -15,7 +15,7 @@ def test_RunCZIDDedup_safe_csv(util, short_read_mngs_bench3_viral_outputs):
     with NamedTemporaryFile(prefix=os.path.dirname(__file__), mode="w") as input_file:
         quote_count = 10
         special_char_rows = 0
-        for line in open(inputs["priceseq_fa"][0]):
+        for line in open(inputs["hisat2_filtered1_fastq"]):
             if line[0] == ">" or line[0] == "@":
                 if special_char_rows < quote_count:
                     input_file.write(f"{line[0]}={line[1:]}")
@@ -28,7 +28,8 @@
         input_file.seek(0)
         assert special_char_rows == quote_count
-        inputs["priceseq_fa"] = [input_file.name]
+        inputs["hisat2_filtered1_fastq"] = input_file.name
+        inputs["hisat2_filtered2_fastq"] = None
         outp = util.miniwdl_run(
             util.repo_dir() / "workflows/short-read-mngs/host_filter.wdl",
diff --git a/workflows/short-read-mngs/test/host_filter/test_RunStar.py b/workflows/short-read-mngs/test/host_filter/test_RunStar.py
deleted file mode 100644
index f668be27f..000000000
--- a/workflows/short-read-mngs/test/host_filter/test_RunStar.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import os
-import json
-
-
-def test_RunStar_outputs_logfile(util, short_read_mngs_bench3_viral_outputs):
-    # load the task's inputs from the end-to-end workflow test
-    inputs, _ = util.miniwdl_inputs_outputs(
-        os.path.join(
-            short_read_mngs_bench3_viral_outputs["dir"], "call-host_filter/call-RunStar"
-        )
-    )
-
-    # run the task with the manipulated inputs, expecting an error exit status
-    outp = util.miniwdl_run(
-        util.repo_dir() / "workflows/short-read-mngs/host_filter.wdl",
-        "--task",
-        "RunStar",
-        "-i",
-        json.dumps(inputs),
-    )
-
-    # verify Log.final.out is emitted
-    logfile = outp["outputs"]["RunStar.output_log_file"]
-    assert os.path.exists(logfile)
diff --git a/workflows/short-read-mngs/test/host_filter/test_RunValidateInput.py b/workflows/short-read-mngs/test/host_filter/test_RunValidateInput.py
index 2ad78a23e..99322fe23 100644
--- a/workflows/short-read-mngs/test/host_filter/test_RunValidateInput.py
+++ b/workflows/short-read-mngs/test/host_filter/test_RunValidateInput.py
@@ -10,9 +10,8 @@ def test_RunValidateInput_invalid(util, short_read_mngs_bench3_viral_outputs):
         )
     )
     # override fastqs to invalid test article
-    inputs["fastqs"] = [
-        os.path.join(os.path.dirname(__file__), "test_RunValidateInput_invalid.fastq")
-    ]
+    inputs["reads1_fastq"] = os.path.join(os.path.dirname(__file__), "test_RunValidateInput_invalid.fastq")
+    del inputs["reads2_fastq"]
     # run the task with the manipulated inputs, expecting an error exit status
     outp = util.miniwdl_run(
diff --git a/workflows/short-read-mngs/test/local_test_viral.yml b/workflows/short-read-mngs/test/local_test_viral.yml
index 6772dfe45..5241faf82 100644
--- a/workflows/short-read-mngs/test/local_test_viral.yml
+++ b/workflows/short-read-mngs/test/local_test_viral.yml
@@ -9,12 +9,12 @@
 host_filter.file_ext: fastq
 host_filter.nucleotide_type: DNA
 host_filter.host_genome: human
-host_filter.star_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar
-host_filter.bowtie2_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/bowtie2_genome.tar
-host_filter.gsnap_genome: s3://czid-public-references/test/gsnap/ERCC_gsnap2017-11-15_k16.tar
-host_filter.human_star_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar
-host_filter.human_bowtie2_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/bowtie2_genome.tar
 host_filter.adapter_fasta: https://raw.githubusercontent.com/broadinstitute/viral-pipelines/master/test/input/clipDb.fasta
+host_filter.bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar
+host_filter.hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar
+host_filter.kallisto_idx: s3://public-test-bucket-idseq/host_filter/human/2022/kallisto_idx/GRCh38_ERCC.kallisto.idx
+host_filter.human_bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar
+host_filter.human_hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar
 host_filter.max_input_fragments: 9000
 host_filter.max_subsample_fragments: 9000
 non_host_alignment.accession2taxid_db: s3://czid-public-references/mini-database/alignment_indexes/2020-08-20-viral/viral_accessions2taxid.marisa
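The renamed task inputs exercised by these tests (for example `reads1_fastq`/`reads2_fastq` replacing the old `fastqs` array, and the `bowtie2_index_tar`/`hisat2_index_tar`/`kallisto_idx` keys above) can be sanity-checked without running the pipeline. Here is a minimal sketch, not part of this diff, assuming miniwdl's Python API (`WDL.load`, `required_inputs`) and that it is run from the repository root:

```python
# Sketch: list each task in the revised host_filter.wdl and the inputs it requires,
# e.g. to confirm RunValidateInput now takes reads1_fastq (with reads2_fastq optional)
# instead of a fastqs array. Requires miniwdl (pip3 install miniwdl).
import WDL

doc = WDL.load("workflows/short-read-mngs/host_filter.wdl")
for task in doc.tasks:
    required = sorted(binding.name for binding in task.required_inputs)
    print(f"{task.name}: {', '.join(required)}")
```

The printed names are illustrative only; the authoritative input lists are the WDL task declarations earlier in this diff.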
diff --git a/workflows/short-read-mngs/test/test_short_read_mngs.py b/workflows/short-read-mngs/test/test_short_read_mngs.py
index 6bff6d68c..103356970 100644
--- a/workflows/short-read-mngs/test/test_short_read_mngs.py
+++ b/workflows/short-read-mngs/test/test_short_read_mngs.py
@@ -31,7 +31,7 @@ def test_bench3_viral(short_read_mngs_bench3_viral_outputs):
         taxon_counts = json.load(infile)["pipeline_output"]["taxon_counts_attributes"]
     taxa = set(entry["tax_id"] for entry in taxon_counts)
-    assert len(taxa) == 177
+    assert abs(len(taxa) - 184) < 16
     for filename in outp["outputs"]:
         if filename.endswith(".fasta"):
diff --git a/workflows/short-read-mngs/test/test_wdl.py b/workflows/short-read-mngs/test/test_wdl.py
index a905f3cb5..a2fdcb134 100644
--- a/workflows/short-read-mngs/test/test_wdl.py
+++ b/workflows/short-read-mngs/test/test_wdl.py
@@ -19,7 +19,7 @@ def setUpClass(self):
     def testValidateWindows(self):
         fastqs_0 = os.path.join(os.path.dirname(__file__), "windows1.fastq.gz")
-        args = self.rv_args + [f"fastqs={fastqs_0}"]
+        args = self.rv_args + [f"reads1_fastq={fastqs_0}"]
         res = self.run_miniwdl(args, task="RunValidateInput")
         with open(res["outputs"]["RunValidateInput.valid_input1_fastq"]) as f:
             hash = hashlib.md5(f.read().encode("utf-8")).hexdigest()
@@ -27,7 +27,7 @@ def testInvalidInput(self):
         fastqs_0 = os.path.join(os.path.dirname(__file__), "host_filter", "test_RunValidateInput_invalid_char.fastq")
-        args = self.rv_args + [f"fastqs={fastqs_0}"]
+        args = self.rv_args + [f"reads1_fastq={fastqs_0}"]
         with self.assertRaises(CalledProcessError) as ecm:
             self.run_miniwdl(args, task="RunValidateInput")
@@ -38,94 +38,6 @@
         self.assertEqual(cause, "PARSE ERROR: not an ascii file. Line 4 contains non-ascii characters.")
-class TestSTAR(WDLTestCase):
-    """Tests the RunSTAR function
-    the inputs are minimal, with only 100 reads
-    should only add ~1 min to testing time
-    """
-
-    wdl = os.path.join(os.path.dirname(__file__), "..", "host_filter.wdl")
-    with open(os.path.join(os.path.dirname(__file__), "local_test.yml")) as fh:
-        common_inputs = yaml.safe_load(fh)
-    star_args = None
-
-    @classmethod
-    def setUpClass(self):
-        fastqs_0 = os.path.join(
-            os.path.dirname(__file__),
-            "host_filter",
-            "star_inputs",
-            "valid_input1.fastq",
-        )
-        fastqs_1 = os.path.join(
-            os.path.dirname(__file__),
-            "host_filter",
-            "star_inputs",
-            "valid_input2.fastq",
-        )
-        summary_json = os.path.join(
-            os.path.dirname(__file__),
-            "host_filter",
-            "star_inputs",
-            "validate_input_summary.json",
-        )
-        args = [
-            "s3_wd_uri=''",
-            f"validate_input_summary_json={summary_json}",
-            f"valid_input_fastq={fastqs_0}",
-            f"valid_input_fastq={fastqs_1}",
-            "star_genome=s3://czid-public-references/host_filter/ercc"
-            "/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar",
-        ]
-        self.star_args = args
-
-    def test_star(self):
-        """test the basic star parameters"""
-        args = self.star_args + ["nucleotide_type=DNA", "host_genome=human"]
-        res = self.run_miniwdl(args, task="RunStar")
-        with open(res["outputs"]["RunStar.output_read_count"]) as f:
-            count = json.load(f)
-
-        self.assertEqual(count["star_out"], 100)
-        with open(res["outputs"]["RunStar.unmapped1_fastq"]) as f:
-            hash = hashlib.md5(f.read().encode("utf-8")).hexdigest()
-            self.assertEqual(hash, "c4d71e1b9b01734f7c3d300a7eac327a")
-        with open(res["outputs"]["RunStar.unmapped2_fastq"]) as f:
-            hash = hashlib.md5(f.read().encode("utf-8")).hexdigest()
-            self.assertEqual(hash, "6b46fe79bf089c8b3f6377fab34b9744")
"6b46fe79bf089c8b3f6377fab34b9744") - - def test_star_rna(self): - """test the nucleotide_type of RNA works, should run STAR with TranscriptomeSAM""" - args = self.star_args + ["nucleotide_type=RNA", "host_genome=human"] - res = self.run_miniwdl(args, task="RunStar") - with open(res["outputs"]["RunStar.output_read_count"]) as f: - count = json.load(f) - self.assertEqual(count["star_out"], 100) - self.assertIn("TranscriptomeSAM", res["outputs"]["RunStar.step_description_md"]) - - def test_star_nonhuman(self): - """test that there is no output BAM file if the host is non-human""" - args = self.star_args + ["nucleotide_type=DNA", "host_genome=pig"] - res = self.run_miniwdl(args, task="RunStar") - - with open(res["outputs"]["RunStar.output_read_count"]) as f: - count = json.load(f) - self.assertEqual(count["star_out"], 100) - self.assertIsNone(res["outputs"]["RunStar.aligned_file"]) - - def test_starlong(self): - """tests that STARLong runs if # of reads with length > 500 is >1 - the validation input has been modified, but there are no actual long reads - """ - - args = self.star_args + ["nucleotide_type=DNA", "host_genome=human"] - args[1] = args[1].replace(".json", "_long.json") - res = self.run_miniwdl(args, task="RunStar") - with open(res["outputs"]["RunStar.output_read_count"]) as f: - count = json.load(f) - self.assertEqual(count["star_out"], 100) - - class TestAlign(WDLTestCase): wdl = os.path.join(os.path.dirname(__file__), "..", "non_host_alignment.wdl") with open(os.path.join(os.path.dirname(__file__), "local_test.yml")) as fh: