diff --git a/workflows/amr/run.wdl b/workflows/amr/run.wdl index 1a15b8562..e3ed389c0 100644 --- a/workflows/amr/run.wdl +++ b/workflows/amr/run.wdl @@ -32,8 +32,8 @@ workflow amr { input: non_host_reads = select_all( [ - host_filter_stage.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_stage.gsnap_filter_out_gsnap_filter_2_fa + host_filter_stage.subsampled_out_subsampled_1_fa, + host_filter_stage.subsampled_out_subsampled_2_fa ] ), min_contig_length = min_contig_length, @@ -45,8 +45,8 @@ workflow amr { non_host_reads = select_first([non_host_reads, select_all( [ - host_filter_stage.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_stage.gsnap_filter_out_gsnap_filter_2_fa + host_filter_stage.subsampled_out_subsampled_1_fa, + host_filter_stage.subsampled_out_subsampled_2_fa ] )]), card_json = card_json, @@ -102,8 +102,8 @@ workflow amr { non_host_reads, select_all( [ - host_filter_stage.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_stage.gsnap_filter_out_gsnap_filter_2_fa + host_filter_stage.subsampled_out_subsampled_1_fa, + host_filter_stage.subsampled_out_subsampled_2_fa ] ) ]), diff --git a/workflows/legacy-host-filter/Dockerfile b/workflows/legacy-host-filter/Dockerfile new file mode 100644 index 000000000..0475a5a12 --- /dev/null +++ b/workflows/legacy-host-filter/Dockerfile @@ -0,0 +1,141 @@ +# syntax=docker/dockerfile:1.4 +FROM ubuntu:18.04 +ARG DEBIAN_FRONTEND=noninteractive +ARG MINIWDL_VERSION=1.1.5 + +LABEL maintainer="CZ ID Team " + +RUN sed -i s/archive.ubuntu.com/us-west-2.ec2.archive.ubuntu.com/ /etc/apt/sources.list; \ + echo 'APT::Install-Recommends "false";' > /etc/apt/apt.conf.d/98czid; \ + echo 'APT::Install-Suggests "false";' > /etc/apt/apt.conf.d/99czid + +RUN apt-get -q update && apt-get -q install -y \ + jq \ + moreutils \ + pigz \ + pixz \ + aria2 \ + httpie \ + curl \ + wget \ + zip \ + unzip \ + zlib1g-dev \ + pkg-config \ + apt-utils \ + libbz2-dev \ + liblzma-dev \ + software-properties-common \ + libarchive-tools \ + liblz4-tool \ + lbzip2 \ + docker.io \ + python3-dev \ + python3-pip \ + python3-setuptools \ + python3-wheel \ + python3-requests \ + python3-yaml \ + python3-dateutil \ + python3-psutil \ + python3-cutadapt \ + python3-scipy \ + samtools \ + fastx-toolkit \ + seqtk \ + bedtools \ + dh-autoreconf \ + nasm \ + build-essential + +# The following packages pull in python2.7 +RUN apt-get -q install -y \ + bowtie2 \ + spades \ + ncbi-blast+ + +RUN pip3 install boto3==1.23.10 marisa-trie==0.7.7 pytest +RUN pip3 install miniwdl==${MINIWDL_VERSION} miniwdl-s3parcp==0.0.5 miniwdl-s3upload==0.0.4 +RUN pip3 install https://github.com/chanzuckerberg/miniwdl-plugins/archive/f0465b0.zip#subdirectory=sfn-wdl +RUN pip3 install https://github.com/chanzuckerberg/s3mi/archive/v0.8.0.tar.gz + +ADD https://raw.githubusercontent.com/chanzuckerberg/miniwdl/v${MINIWDL_VERSION}/examples/clean_download_cache.sh /usr/local/bin +RUN chmod +x /usr/local/bin/clean_download_cache.sh + +# docker.io is the largest package at 250MB+ / half of all package disk space usage. 
+# The docker daemons never run inside the container - removing them saves 150MB+ +RUN rm -f /usr/bin/dockerd /usr/bin/containerd* + +RUN cd /usr/bin; curl -O https://amazon-ecr-credential-helper-releases.s3.amazonaws.com/0.4.0/linux-amd64/docker-credential-ecr-login +RUN chmod +x /usr/bin/docker-credential-ecr-login +RUN mkdir -p /root/.docker +RUN jq -n '.credsStore="ecr-login"' > /root/.docker/config.json + +RUN curl -L -o /usr/bin/czid-dedup https://github.com/chanzuckerberg/czid-dedup/releases/download/v0.1.2/czid-dedup-Linux; chmod +x /usr/bin/czid-dedup + +# Note: bsdtar is available in libarchive-tools +# Note: python3-scipy pulls in gcc (fixed in Ubuntu 19.10) +# TODO: kSNP3 (separate phylotree image?) + +# Note: the NonHostAlignment stage uses a different version of gmap custom to CZ ID, installed here: +# https://github.com/chanzuckerberg/czid/blob/master/workflows/docker/gsnap/Dockerfile#L16-L20 +# TODO: migrate both to https://packages.ubuntu.com/focal/gmap (updates to gmap require revalidation) +RUN apt-get -q install -y gmap + +# FIXME: replace trimmomatic with cutadapt (trimmomatic pulls in too many deps) +RUN apt-get -q install -y trimmomatic +RUN ln -sf /usr/share/java/trimmomatic-0.36.jar /usr/local/bin/trimmomatic-0.38.jar + +# FIXME: replace PriceSeqFilter with cutadapt quality/N-fraction cutoff +RUN curl -s https://idseq-prod-pipeline-public-assets-us-west-2.s3-us-west-2.amazonaws.com/PriceSource140408/PriceSeqFilter > /usr/bin/PriceSeqFilter +RUN chmod +x /usr/bin/PriceSeqFilter + +RUN curl -Ls https://github.com/chanzuckerberg/s3parcp/releases/download/v0.2.0-alpha/s3parcp_0.2.0-alpha_Linux_x86_64.tar.gz | tar -C /usr/bin -xz s3parcp + +# FIXME: check if use of pandas, pysam is necessary +RUN pip3 install pysam==0.14.1 pandas==1.1.5 + +# Picard for average fragment size https://github.com/broadinstitute/picard +# r-base is a dependency of collecting input size metrics https://github.com/bioconda/bioconda-recipes/pull/16398 +RUN apt-get install -y r-base +RUN curl -L -o /usr/local/bin/picard.jar https://github.com/broadinstitute/picard/releases/download/2.21.2/picard.jar +# Create a single executable so we can use SingleCommand +RUN printf '#!/bin/bash\njava -jar /usr/local/bin/picard.jar "$@"\n' > /usr/local/bin/picard +RUN chmod +x /usr/local/bin/picard + +# install STAR, the package rna-star does not include STARlong +RUN curl -L https://github.com/alexdobin/STAR/archive/2.5.3a.tar.gz | tar xz +RUN mv STAR-2.5.3a/bin/Linux_x86_64_static/* /usr/local/bin +RUN rm -rf STAR-2.5.3a + + +RUN apt-get -y update && apt-get install -y build-essential libz-dev git python3-pip cmake + +# Host filtering (2022 version) dependencies +# fastp (libdeflate libisal (dh-autoreconf nasm)) +# hisat2 +# bowtie2 [already installed] +# kallisto + python gtfparse +WORKDIR /tmp +RUN wget -nv -O - https://github.com/intel/isa-l/archive/refs/tags/v2.30.0.tar.gz | tar zx +RUN cd isa-l-* && ./autogen.sh && ./configure && make -j8 && make install +RUN wget -nv -O - https://github.com/ebiggers/libdeflate/archive/refs/tags/v1.12.tar.gz | tar zx +RUN cd libdeflate-* && make -j8 && make install +RUN ldconfig +RUN git clone https://github.com/mlin/fastp.git && git -C fastp checkout 37edd60 +RUN cd fastp && make -j8 && ./fastp test && cp fastp /usr/local/bin +WORKDIR / +RUN wget -nv -O /tmp/HISAT2.zip https://czid-public-references.s3.us-west-2.amazonaws.com/test/hisat2/hisat2.zip \ + && unzip /tmp/HISAT2.zip && rm /tmp/HISAT2.zip +RUN curl -L 
https://github.com/pachterlab/kallisto/releases/download/v0.46.1/kallisto_linux-v0.46.1.tar.gz | tar xz -C / +RUN pip3 install gtfparse==1.2.1 + +# Uninstall build only dependencies +RUN apt-get purge -y g++ libperl4-corelibs-perl make + +COPY --from=lib idseq-dag /tmp/idseq-dag +RUN pip3 install /tmp/idseq-dag && rm -rf /tmp/idseq-dag + +COPY --from=lib idseq_utils /tmp/idseq_utils +RUN pip3 install /tmp/idseq_utils && rm -rf /tmp/idseq_utils + diff --git a/workflows/legacy-host-filter/legacy-host-filter.wdl b/workflows/legacy-host-filter/legacy-host-filter.wdl new file mode 100644 index 000000000..04c1b2c40 --- /dev/null +++ b/workflows/legacy-host-filter/legacy-host-filter.wdl @@ -0,0 +1,675 @@ +version 1.0 + +task RunValidateInput { + input { + String docker_image_id + String s3_wd_uri + Array[File] fastqs + Int max_input_fragments + String file_ext + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_validate_input \ + --step-class PipelineStepRunValidateInput \ + --step-name validate_input_out \ + --input-files '[["~{sep='","' fastqs}"]]' \ + --output-files '["validate_input_summary.json", ~{if length(fastqs) == 2 then '"valid_input1.fastq", "valid_input2.fastq"' else '"valid_input1.fastq"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"truncate_fragments_to": ~{max_input_fragments}, "file_ext": "~{file_ext}"}' + >>> + output { + String step_description_md = read_string("validate_input_out.description.md") + File validate_input_summary_json = "validate_input_summary.json" + File valid_input1_fastq = "valid_input1.fastq" + File? valid_input2_fastq = "valid_input2.fastq" + File? output_read_count = "validate_input_out.count" + File? input_read_count = "fastqs.count" + } + runtime { + docker: docker_image_id + } +} + +task RunStar { + input { + String docker_image_id + String s3_wd_uri + File validate_input_summary_json + Array[File] valid_input_fastq + File star_genome + String nucleotide_type + String host_genome + String genome_dir = "STAR_genome/part-0/" + } + command<<< + # TODO(Ryan): remove when status upload is not dependent on idseq-dag see: https://app.shortcut.com/idseq/story/163323 + # this comment is for the miniwdl plugin uploader to parse: + # --step-name star_out + set -euxo pipefail + + python3 < star_out_version.txt + rm "~{genome_dir}"/SAindex # the star genome is pretty big (1.5G) + rm "~{genome_dir}"/Genome + >>> + output { + String step_description_md = read_string("star_out.description.md") + File unmapped1_fastq = "unmapped1.fastq" + File output_log_file = "Log.final.out" + File? unmapped2_fastq = "unmapped2.fastq" + File? aligned_file = "Aligned.out.bam" + File? output_read_count = "star_out.count" + File? output_gene_file = "reads_per_gene.star.tab" + File? output_metrics_file = "picard_insert_metrics.txt" + File? output_histogram_file = "insert_size_histogram.pdf" + File? 
version = "star_out_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunTrimmomatic { + input { + String docker_image_id + String s3_wd_uri + Array[File] unmapped_fastq + File adapter_fasta + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_trimmomatic \ + --step-class PipelineStepRunTrimmomatic \ + --step-name trimmomatic_out \ + --input-files '[["~{sep='","' unmapped_fastq}"]]' \ + --output-files '[~{if length(unmapped_fastq) == 2 then '"trimmomatic1.fastq", "trimmomatic2.fastq"' else '"trimmomatic1.fastq"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"adapter_fasta": "~{adapter_fasta}"}' \ + --additional-attributes '{}' + java -jar /usr/local/bin/trimmomatic-0.38.jar -version > trimmomatic_version.txt + + >>> + output { + String step_description_md = read_string("trimmomatic_out.description.md") + File trimmomatic1_fastq = "trimmomatic1.fastq" + File? trimmomatic2_fastq = "trimmomatic2.fastq" + File? output_read_count = "trimmomatic_out.count" + File? version = "trimmomatic_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunPriceSeq { + input { + String docker_image_id + String s3_wd_uri + Array[File] trimmomatic_fastq + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_priceseq \ + --step-class PipelineStepRunPriceSeq \ + --step-name priceseq_out \ + --input-files '[["~{sep='","' trimmomatic_fastq}"]]' \ + --output-files '[~{if length(trimmomatic_fastq) == 2 then '"priceseq1.fa", "priceseq2.fa"' else '"priceseq1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{}' + PriceSeqFilter 2> /dev/null | head -n1 > priceseq_version.txt + >>> + output { + String step_description_md = read_string("priceseq_out.description.md") + File priceseq1_fa = "priceseq1.fa" + File? priceseq2_fa = "priceseq2.fa" + File? output_read_count = "priceseq_out.count" + File? version = "priceseq_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunCZIDDedup { + input { + String docker_image_id + String s3_wd_uri + Array[File] priceseq_fa + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_czid_dedup \ + --step-class PipelineStepRunCZIDDedup \ + --step-name czid_dedup_out \ + --input-files '[["~{sep='","' priceseq_fa}"]]' \ + --output-files '[~{if length(priceseq_fa) == 2 then '"dedup1.fa", "dedup2.fa"' else '"dedup1.fa"'}, "clusters.csv", "duplicate_cluster_sizes.tsv"]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{}' + czid-dedup --version > czid_dedup_version.txt + >>> + output { + String step_description_md = read_string("czid_dedup_out.description.md") + File dedup1_fa = "dedup1.fa" + File? dedup2_fa = "dedup2.fa" + File duplicate_clusters_csv = "clusters.csv" + File duplicate_cluster_sizes_tsv = "duplicate_cluster_sizes.tsv" + File? output_read_count = "czid_dedup_out.count" + File? 
version = "czid_dedup_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunLZW { + input { + String docker_image_id + String s3_wd_uri + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_lzw \ + --step-class PipelineStepRunLZW \ + --step-name lzw_out \ + --input-files '[["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"lzw1.fa", "lzw2.fa"' else '"lzw1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"thresholds": [0.45, 0.42], "threshold_readlength": 150}' + >>> + output { + String step_description_md = read_string("lzw_out.description.md") + File lzw1_fa = "lzw1.fa" + File? lzw2_fa = "lzw2.fa" + File? output_read_count = "lzw_out.count" + } + runtime { + docker: docker_image_id + } +} + +task RunBowtie2_bowtie2_out { + input { + String docker_image_id + String s3_wd_uri + Array[File] lzw_fa + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + File bowtie2_genome + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_bowtie2 \ + --step-class PipelineStepRunBowtie2 \ + --step-name bowtie2_out \ + --input-files '[["~{sep='","' lzw_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(lzw_fa) == 2 then '"bowtie2_1.fa", "bowtie2_2.fa", "bowtie2_merged.fa"' else '"bowtie2_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"bowtie2_genome": "~{bowtie2_genome}"}' \ + --additional-attributes '{"output_sam_file": "bowtie2.sam"}' + bowtie2 --version > bowtie2_version.txt + >>> + output { + String step_description_md = read_string("bowtie2_out.description.md") + File bowtie2_1_fa = "bowtie2_1.fa" + File? bowtie2_2_fa = "bowtie2_2.fa" + File? bowtie2_merged_fa = "bowtie2_merged.fa" + File? output_read_count = "bowtie2_out.count" + File? version = "bowtie2_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunSubsample { + input { + String docker_image_id + String s3_wd_uri + Array[File] bowtie2_fa + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + Int max_subsample_fragments + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_subsample \ + --step-class PipelineStepRunSubsample \ + --step-name subsampled_out \ + --input-files '[["~{sep='","' bowtie2_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"subsampled_1.fa", "subsampled_2.fa", "subsampled_merged.fa"' else '"subsampled_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"max_fragments": ~{max_subsample_fragments}}' + >>> + output { + String step_description_md = read_string("subsampled_out.description.md") + File subsampled_1_fa = "subsampled_1.fa" + File? subsampled_2_fa = "subsampled_2.fa" + File? subsampled_merged_fa = "subsampled_merged.fa" + File? 
output_read_count = "subsampled_out.count" + } + runtime { + docker: docker_image_id + } +} + +task RunStarDownstream { + input { + String docker_image_id + String s3_wd_uri + Array[File] subsampled_fa + File validate_input_summary_json + Array[File] valid_input_fastq + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + File human_star_genome + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_star_downstream \ + --step-class PipelineStepRunStarDownstream \ + --step-name star_human_out \ + --input-files '[["~{sep='","' subsampled_fa}"], ["~{validate_input_summary_json}", "~{sep='","' valid_input_fastq}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"unmapped_human_1.fa", "unmapped_human_2.fa"' else '"unmapped_human_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"star_genome": "~{human_star_genome}"}' \ + --additional-attributes '{}' + STAR --version > star_human_version.txt + >>> + output { + String step_description_md = read_string("star_human_out.description.md") + File unmapped_human_1_fa = "unmapped_human_1.fa" + File? unmapped_human_2_fa = "unmapped_human_2.fa" + File? output_read_count = "star_human_out.count" + File? version = "star_human_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunBowtie2_bowtie2_human_out { + input { + String docker_image_id + String s3_wd_uri + Array[File] unmapped_human_fa + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + File human_bowtie2_genome + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_bowtie2 \ + --step-class PipelineStepRunBowtie2 \ + --step-name bowtie2_human_out \ + --input-files '[["~{sep='","' unmapped_human_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"bowtie2_human_1.fa", "bowtie2_human_2.fa", "bowtie2_human_merged.fa"' else '"bowtie2_human_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"bowtie2_genome": "~{human_bowtie2_genome}"}' \ + --additional-attributes '{"output_sam_file": "bowtie2_human.sam"}' + bowtie2 --version > bowtie2_human_version.txt + >>> + output { + String step_description_md = read_string("bowtie2_human_out.description.md") + File bowtie2_human_1_fa = "bowtie2_human_1.fa" + File? bowtie2_human_2_fa = "bowtie2_human_2.fa" + File? bowtie2_human_merged_fa = "bowtie2_human_merged.fa" + File? output_read_count = "bowtie2_human_out.count" + File? 
version = "bowtie2_human_version.txt" + } + runtime { + docker: docker_image_id + } +} + +task RunGsnapFilter { + input { + String docker_image_id + String s3_wd_uri + Array[File] subsampled_fa + Array[File] dedup_fa + File duplicate_clusters_csv + File duplicate_cluster_sizes_tsv + File gsnap_genome + } + command<<< + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_gsnap_filter \ + --step-class PipelineStepRunGsnapFilter \ + --step-name gsnap_filter_out \ + --input-files '[["~{sep='","' subsampled_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if length(dedup_fa) == 2 then '"gsnap_filter_1.fa", "gsnap_filter_2.fa", "gsnap_filter_merged.fa"' else '"gsnap_filter_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{"gsnap_genome": "~{gsnap_genome}"}' \ + --additional-attributes '{"output_sam_file": "gsnap_filter.sam"}' + gsnap --version > gsnap_filter_version.txt + >>> + output { + String step_description_md = read_string("gsnap_filter_out.description.md") + File gsnap_filter_1_fa = "gsnap_filter_1.fa" + File? gsnap_filter_2_fa = "gsnap_filter_2.fa" + File? gsnap_filter_merged_fa = "gsnap_filter_merged.fa" + File? output_read_count = "gsnap_filter_out.count" + File? version = "gsnap_filter_version.txt" + } + runtime { + docker: docker_image_id + } +} + + +workflow czid_host_filter { + input { + String docker_image_id + String s3_wd_uri + File fastqs_0 + File? fastqs_1 + String file_ext + String nucleotide_type + String host_genome + File adapter_fasta + File star_genome + File bowtie2_genome + File gsnap_genome = "s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/hg38_pantro5_k16.tar" + String human_star_genome + String human_bowtie2_genome + Int max_input_fragments + Int max_subsample_fragments + } + + call RunValidateInput { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + fastqs = select_all([fastqs_0, fastqs_1]), + file_ext = file_ext, + max_input_fragments = max_input_fragments + } + + call RunStar { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + validate_input_summary_json = RunValidateInput.validate_input_summary_json, + valid_input_fastq = select_all([RunValidateInput.valid_input1_fastq, RunValidateInput.valid_input2_fastq]), + star_genome = star_genome, + nucleotide_type = nucleotide_type, + host_genome = host_genome + } + + call RunTrimmomatic { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + unmapped_fastq = select_all([RunStar.unmapped1_fastq, RunStar.unmapped2_fastq]), + adapter_fasta = adapter_fasta + } + + call RunPriceSeq { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + trimmomatic_fastq = select_all([RunTrimmomatic.trimmomatic1_fastq, RunTrimmomatic.trimmomatic2_fastq]) + } + + call RunCZIDDedup { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + priceseq_fa = select_all([RunPriceSeq.priceseq1_fa, RunPriceSeq.priceseq2_fa]) + } + + call RunLZW { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv + } + + call RunBowtie2_bowtie2_out { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + lzw_fa = 
select_all([RunLZW.lzw1_fa, RunLZW.lzw2_fa]), + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + bowtie2_genome = bowtie2_genome + } + + call RunSubsample { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + bowtie2_fa = select_all([RunBowtie2_bowtie2_out.bowtie2_1_fa, RunBowtie2_bowtie2_out.bowtie2_2_fa, RunBowtie2_bowtie2_out.bowtie2_merged_fa]), + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + max_subsample_fragments = max_subsample_fragments + } + + if (host_genome != "human") { + call RunStarDownstream { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + subsampled_fa = select_all([RunSubsample.subsampled_1_fa, RunSubsample.subsampled_2_fa, RunSubsample.subsampled_merged_fa]), + validate_input_summary_json = RunValidateInput.validate_input_summary_json, + valid_input_fastq = select_all([RunValidateInput.valid_input1_fastq, RunValidateInput.valid_input2_fastq]), + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + human_star_genome = human_star_genome + } + + call RunBowtie2_bowtie2_human_out { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + unmapped_human_fa = select_all([RunStarDownstream.unmapped_human_1_fa, RunStarDownstream.unmapped_human_2_fa]), + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + human_bowtie2_genome = human_bowtie2_genome + } + } + + Array[File] gsnap_filter_input = if (host_genome == "human") + then select_all([RunSubsample.subsampled_1_fa, RunSubsample.subsampled_2_fa, RunSubsample.subsampled_merged_fa]) + else select_all([RunBowtie2_bowtie2_human_out.bowtie2_human_1_fa, RunBowtie2_bowtie2_human_out.bowtie2_human_2_fa, RunBowtie2_bowtie2_human_out.bowtie2_human_merged_fa]) + + call RunGsnapFilter { + input: + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri, + subsampled_fa = gsnap_filter_input, + dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), + duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + gsnap_genome = gsnap_genome + } + + output { + File validate_input_out_validate_input_summary_json = RunValidateInput.validate_input_summary_json + File? validate_input_out_count = RunValidateInput.output_read_count + File star_out_unmapped1_fastq = RunStar.unmapped1_fastq + File? star_out_unmapped2_fastq = RunStar.unmapped2_fastq + File? star_out_log_file = RunStar.output_log_file + File? star_out_count = RunStar.output_read_count + File? star_version = RunStar.version + File trimmomatic_out_trimmomatic1_fastq = RunTrimmomatic.trimmomatic1_fastq + File? trimmomatic_out_trimmomatic2_fastq = RunTrimmomatic.trimmomatic2_fastq + File? trimmomatic_out_count = RunTrimmomatic.output_read_count + File? trimmomatic_version = RunTrimmomatic.version + File priceseq_out_priceseq1_fa = RunPriceSeq.priceseq1_fa + File? 
priceseq_out_priceseq2_fa = RunPriceSeq.priceseq2_fa + File? priceseq_out_count = RunPriceSeq.output_read_count + File? priceseq_version = RunPriceSeq.version + File czid_dedup_out_dedup1_fa = RunCZIDDedup.dedup1_fa + File? czid_dedup_out_dedup2_fa = RunCZIDDedup.dedup2_fa + File czid_dedup_out_duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv + File czid_dedup_out_duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv + File? czid_dedup_out_count = RunCZIDDedup.output_read_count + File? czid_dedup_version = RunCZIDDedup.version + File lzw_out_lzw1_fa = RunLZW.lzw1_fa + File? lzw_out_lzw2_fa = RunLZW.lzw2_fa + File? lzw_out_count = RunLZW.output_read_count + File bowtie2_out_bowtie2_1_fa = RunBowtie2_bowtie2_out.bowtie2_1_fa + File? bowtie2_out_bowtie2_2_fa = RunBowtie2_bowtie2_out.bowtie2_2_fa + File? bowtie2_out_bowtie2_merged_fa = RunBowtie2_bowtie2_out.bowtie2_merged_fa + File? bowtie2_out_count = RunBowtie2_bowtie2_out.output_read_count + File? bowtie2_version = RunBowtie2_bowtie2_out.version + File subsampled_out_subsampled_1_fa = RunSubsample.subsampled_1_fa + File? subsampled_out_subsampled_2_fa = RunSubsample.subsampled_2_fa + File? subsampled_out_subsampled_merged_fa = RunSubsample.subsampled_merged_fa + File? subsampled_out_count = RunSubsample.output_read_count + File? star_human_out_unmapped_human_1_fa = RunStarDownstream.unmapped_human_1_fa + File? star_human_out_unmapped_human_2_fa = RunStarDownstream.unmapped_human_2_fa + File? star_human_out_count = RunStarDownstream.output_read_count + File? star_human_version = RunStarDownstream.version + File? bowtie2_human_out_bowtie2_human_1_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_1_fa + File? bowtie2_human_out_bowtie2_human_2_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_2_fa + File? bowtie2_human_out_bowtie2_human_merged_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_merged_fa + File? bowtie2_human_out_count = RunBowtie2_bowtie2_human_out.output_read_count + File? bowtie2_human_version = RunBowtie2_bowtie2_human_out.version + File gsnap_filter_out_gsnap_filter_1_fa = RunGsnapFilter.gsnap_filter_1_fa + File? gsnap_filter_out_gsnap_filter_2_fa = RunGsnapFilter.gsnap_filter_2_fa + File? gsnap_filter_out_gsnap_filter_merged_fa = RunGsnapFilter.gsnap_filter_merged_fa + File? gsnap_filter_out_count = RunGsnapFilter.output_read_count + File? gsnap_filter_version = RunGsnapFilter.version + File? input_read_count = RunValidateInput.input_read_count + File? output_gene_file = RunStar.output_gene_file + File? output_metrics_file = RunStar.output_metrics_file + File? 
output_histogram_file = RunStar.output_histogram_file + } +} diff --git a/workflows/legacy-host-filter/stage_io_map.json b/workflows/legacy-host-filter/stage_io_map.json new file mode 100644 index 000000000..565be202a --- /dev/null +++ b/workflows/legacy-host-filter/stage_io_map.json @@ -0,0 +1,38 @@ +{ + "NonHostAlignment":{ + "host_filter_out_gsnap_filter_1_fa":"gsnap_filter_out_gsnap_filter_1_fa", + "host_filter_out_gsnap_filter_2_fa":"gsnap_filter_out_gsnap_filter_2_fa", + "host_filter_out_gsnap_filter_merged_fa":"gsnap_filter_out_gsnap_filter_merged_fa", + "duplicate_cluster_sizes_tsv":"czid_dedup_out_duplicate_cluster_sizes_tsv", + "czid_dedup_out_duplicate_clusters_csv":"czid_dedup_out_duplicate_clusters_csv" + }, + "Postprocess":{ + "host_filter_out_gsnap_filter_1_fa":"gsnap_filter_out_gsnap_filter_1_fa", + "host_filter_out_gsnap_filter_2_fa":"gsnap_filter_out_gsnap_filter_2_fa", + "host_filter_out_gsnap_filter_merged_fa":"gsnap_filter_out_gsnap_filter_merged_fa", + "gsnap_out_gsnap_m8":"gsnap_out_gsnap_m8", + "gsnap_out_gsnap_deduped_m8":"gsnap_out_gsnap_deduped_m8", + "gsnap_out_gsnap_hitsummary_tab":"gsnap_out_gsnap_hitsummary_tab", + "gsnap_out_gsnap_counts_with_dcr_json":"gsnap_out_gsnap_counts_with_dcr_json", + "rapsearch2_out_rapsearch2_m8":"rapsearch2_out_rapsearch2_m8", + "rapsearch2_out_rapsearch2_deduped_m8":"rapsearch2_out_rapsearch2_deduped_m8", + "rapsearch2_out_rapsearch2_hitsummary_tab":"rapsearch2_out_rapsearch2_hitsummary_tab", + "rapsearch2_out_rapsearch2_counts_with_dcr_json":"rapsearch2_out_rapsearch2_counts_with_dcr_json", + "duplicate_cluster_sizes_tsv":"czid_dedup_out_duplicate_cluster_sizes_tsv", + "czid_dedup_out_duplicate_clusters_csv":"czid_dedup_out_duplicate_clusters_csv" + }, + "Experimental":{ + "taxid_fasta_in_annotated_merged_fa":"annotated_out_annotated_merged_fa", + "taxid_fasta_in_gsnap_hitsummary_tab":"gsnap_out_gsnap_hitsummary_tab", + "taxid_fasta_in_rapsearch2_hitsummary_tab":"rapsearch2_out_rapsearch2_hitsummary_tab", + "gsnap_m8_gsnap_deduped_m8":"gsnap_out_gsnap_deduped_m8", + "refined_gsnap_in_gsnap_reassigned_m8":"refined_gsnap_out_assembly_gsnap_reassigned_m8", + "refined_gsnap_in_gsnap_hitsummary2_tab":"refined_gsnap_out_assembly_gsnap_hitsummary2_tab", + "refined_gsnap_in_gsnap_blast_top_m8":"refined_gsnap_out_assembly_gsnap_blast_top_m8", + "contig_in_contig_coverage_json":"coverage_out_assembly_contig_coverage_json", + "contig_in_contig_stats_json":"assembly_out_assembly_contig_stats_json", + "contig_in_contigs_fasta":"assembly_out_assembly_contigs_fasta", + "nonhost_fasta_refined_taxid_annot_fasta":"refined_taxid_fasta_out_assembly_refined_taxid_annot_fasta", + "duplicate_clusters_csv":"czid_dedup_out_duplicate_clusters_csv" + } +} diff --git a/workflows/legacy-host-filter/test/__init__.py b/workflows/legacy-host-filter/test/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/workflows/legacy-host-filter/test/duplicate_cluster_sizes.tsv b/workflows/legacy-host-filter/test/duplicate_cluster_sizes.tsv new file mode 100644 index 000000000..51ba71020 --- /dev/null +++ b/workflows/legacy-host-filter/test/duplicate_cluster_sizes.tsv @@ -0,0 +1,22 @@ +1 NC_007795.1_62__benchmark_lineage_93061_1280_1279_90964__s0000001323 +1 NC_007795.1_62__benchmark_lineage_93061_1280_1279_90964__s0000004573 +1 NC_007795.1_64__benchmark_lineage_93061_1280_1279_90964__s0000000966 +1 NC_007795.1_65__benchmark_lineage_93061_1280_1279_90964__s0000001325 +1 NC_007795.1_66__benchmark_lineage_93061_1280_1279_90964__s0000001061 +1 
NC_007795.1_68__benchmark_lineage_93061_1280_1279_90964__s0000001151 +1 NC_007795.1_8__benchmark_lineage_93061_1280_1279_90964__s0000002124 +1 NC_016845.1_16__benchmark_lineage_1125630_573_570_543__s0000001766 +1 NC_016845.1_19__benchmark_lineage_1125630_573_570_543__s0000002195 +1 NC_016845.1_29__benchmark_lineage_1125630_573_570_543__s0000004269 +1 NC_016845.1_36__benchmark_lineage_1125630_573_570_543__s0000002278 +1 NC_016845.1_38__benchmark_lineage_1125630_573_570_543__s0000000459 +1 NC_016845.1_40__benchmark_lineage_1125630_573_570_543__s0000002827 +1 NC_016845.1_44__benchmark_lineage_1125630_573_570_543__s0000001495 +1 NC_016845.1_46__benchmark_lineage_1125630_573_570_543__s0000000467 +1 NC_016845.1_4__benchmark_lineage_1125630_573_570_543__s0000003258 +1 NC_016845.1_53__benchmark_lineage_1125630_573_570_543__s0000001392 +1 NC_016845.1_54__benchmark_lineage_1125630_573_570_543__s0000001251 +1 NC_016845.1_57__benchmark_lineage_1125630_573_570_543__s0000002297 +1 NC_016845.1_60__benchmark_lineage_1125630_573_570_543__s0000003310 +1 NC_016845.1_65__benchmark_lineage_1125630_573_570_543__s0000002305 +1 NC_016845.1_65__benchmark_lineage_1125630_573_570_543__s0000003893 diff --git a/workflows/short-read-mngs/test/host_filter/star_inputs/valid_input1.fastq b/workflows/legacy-host-filter/test/host_filter/star_inputs/valid_input1.fastq similarity index 100% rename from workflows/short-read-mngs/test/host_filter/star_inputs/valid_input1.fastq rename to workflows/legacy-host-filter/test/host_filter/star_inputs/valid_input1.fastq diff --git a/workflows/short-read-mngs/test/host_filter/star_inputs/valid_input2.fastq b/workflows/legacy-host-filter/test/host_filter/star_inputs/valid_input2.fastq similarity index 100% rename from workflows/short-read-mngs/test/host_filter/star_inputs/valid_input2.fastq rename to workflows/legacy-host-filter/test/host_filter/star_inputs/valid_input2.fastq diff --git a/workflows/short-read-mngs/test/host_filter/star_inputs/validate_input_summary.json b/workflows/legacy-host-filter/test/host_filter/star_inputs/validate_input_summary.json similarity index 100% rename from workflows/short-read-mngs/test/host_filter/star_inputs/validate_input_summary.json rename to workflows/legacy-host-filter/test/host_filter/star_inputs/validate_input_summary.json diff --git a/workflows/short-read-mngs/test/host_filter/star_inputs/validate_input_summary_long.json b/workflows/legacy-host-filter/test/host_filter/star_inputs/validate_input_summary_long.json similarity index 100% rename from workflows/short-read-mngs/test/host_filter/star_inputs/validate_input_summary_long.json rename to workflows/legacy-host-filter/test/host_filter/star_inputs/validate_input_summary_long.json diff --git a/workflows/legacy-host-filter/test/host_filter/test_RunValidateInput_invalid.fastq b/workflows/legacy-host-filter/test/host_filter/test_RunValidateInput_invalid.fastq new file mode 100644 index 000000000..ac92b98f9 --- /dev/null +++ b/workflows/legacy-host-filter/test/host_filter/test_RunValidateInput_invalid.fastq @@ -0,0 +1,8 @@ +@NB501961:14:HM7TLBGX2:1:11102:3233:17234 1:N:0:GATCACCA+GATCACCA +CATTCGGCTGGGTTTCGTCACCCTGCGGGAAGATGCGGGTCCAGGCGATAGAGGTGCGGAAGCAT +TTGAAGCCCATCTCGGCGATCAGTTTGATGTCTTCTTTGTAGCGACCGTAGAAGTCGACGGCTTC +GTGGTTCGGGTAGTATTTTN ++ +AAAAAEEEEEEEEEE/EEEEEEEEEEEEEEEEEEEEEE 500 is >1 + the validation input has been modified, but there are no actual long reads + """ + + args = self.star_args + ["nucleotide_type=DNA", "host_genome=human"] + args[1] = args[1].replace(".json", "_long.json") + 
res = self.run_miniwdl(args, task="RunStar") + with open(res["outputs"]["RunStar.output_read_count"]) as f: + count = json.load(f) + self.assertEqual(count["star_out"], 100) diff --git a/workflows/legacy-host-filter/test/windows1.fastq.gz b/workflows/legacy-host-filter/test/windows1.fastq.gz new file mode 100644 index 000000000..75fd11e53 Binary files /dev/null and b/workflows/legacy-host-filter/test/windows1.fastq.gz differ diff --git a/workflows/short-read-mngs/Dockerfile b/workflows/short-read-mngs/Dockerfile index 70af1512d..12bb1ebbd 100644 --- a/workflows/short-read-mngs/Dockerfile +++ b/workflows/short-read-mngs/Dockerfile @@ -33,8 +33,7 @@ RUN sed -i s/archive.ubuntu.com/us-west-2.ec2.archive.ubuntu.com/ /etc/apt/sourc echo 'APT::Install-Recommends "false";' > /etc/apt/apt.conf.d/98czid; \ echo 'APT::Install-Suggests "false";' > /etc/apt/apt.conf.d/99czid -RUN apt-get -q update -RUN apt-get -q install -y \ +RUN apt-get -q update && apt-get -q install -y \ jq \ moreutils \ pigz \ @@ -69,6 +68,8 @@ RUN apt-get -q install -y \ fastx-toolkit \ seqtk \ bedtools \ + dh-autoreconf \ + nasm \ build-essential # The following packages pull in python2.7 @@ -116,7 +117,7 @@ RUN chmod +x /usr/bin/PriceSeqFilter RUN curl -Ls https://github.com/chanzuckerberg/s3parcp/releases/download/v0.2.0-alpha/s3parcp_0.2.0-alpha_Linux_x86_64.tar.gz | tar -C /usr/bin -xz s3parcp # FIXME: check if use of pandas, pysam is necessary -RUN apt-get -q install -y python3-pysam python3-pandas +RUN pip3 install pysam==0.14.1 pandas==1.1.5 # Workaround for srst2 refusing to work with upstream bowtie2 and samtools # FIXME: replace srst2 with a more appropriate tool @@ -160,9 +161,6 @@ RUN curl -L https://idseq-rapsearch2.s3-us-west-2.amazonaws.com/RAPSearch2.24_6 RUN sed -i -e 's|^INC.*|INC := -I /usr/include/boost|' -e 's|^LIB.*|LIB :=|' Makefile RUN make ENV PATH="${PATH}:/rapsearch2/Src/" -# Uninstall build only dependencies -RUN apt-get purge -y g++ libperl4-corelibs-perl make -WORKDIR / RUN apt-get -y update && apt-get install -y build-essential libz-dev git python3-pip cmake @@ -183,6 +181,28 @@ RUN mv diamond /usr/local/bin RUN curl -Ls https://github.com/chanzuckerberg/s3parcp/releases/download/v0.2.0-alpha/s3parcp_0.2.0-alpha_Linux_x86_64.tar.gz | tar -C /usr/bin -xz s3parcp +# Host filtering (2022 version) dependencies +# fastp (libdeflate libisal (dh-autoreconf nasm)) +# hisat2 +# bowtie2 [already installed] +# kallisto + python gtfparse +WORKDIR /tmp +RUN wget -nv -O - https://github.com/intel/isa-l/archive/refs/tags/v2.30.0.tar.gz | tar zx +RUN cd isa-l-* && ./autogen.sh && ./configure && make -j8 && make install +RUN wget -nv -O - https://github.com/ebiggers/libdeflate/archive/refs/tags/v1.12.tar.gz | tar zx +RUN cd libdeflate-* && make -j8 && make install +RUN ldconfig +RUN git clone https://github.com/mlin/fastp.git && git -C fastp checkout 37edd60 +RUN cd fastp && make -j8 && ./fastp test && cp fastp /usr/local/bin +WORKDIR / +RUN wget -nv -O /tmp/HISAT2.zip https://czid-public-references.s3.us-west-2.amazonaws.com/test/hisat2/hisat2.zip \ + && unzip /tmp/HISAT2.zip && rm /tmp/HISAT2.zip +RUN curl -L https://github.com/pachterlab/kallisto/releases/download/v0.46.1/kallisto_linux-v0.46.1.tar.gz | tar xz -C / +RUN pip3 install gtfparse==1.2.1 + +# Uninstall build only dependencies +RUN apt-get purge -y g++ libperl4-corelibs-perl make + COPY --from=lib idseq-dag /tmp/idseq-dag RUN pip3 install /tmp/idseq-dag && rm -rf /tmp/idseq-dag diff --git a/workflows/short-read-mngs/auto_benchmark/README.md 
b/workflows/short-read-mngs/auto_benchmark/README.md index 2e7ae243a..1994405d8 100644 --- a/workflows/short-read-mngs/auto_benchmark/README.md +++ b/workflows/short-read-mngs/auto_benchmark/README.md @@ -31,7 +31,7 @@ Then run desired test scenarios **either (1A)** locally **or (1B)** by submittin Prepare by building the czid-short-read-mngs docker image and enabling the miniwdl download cache: ```bash -docker build czid-workflows/short-read-mngs --tag czid-short-read-mngs +docker build czid-workflows/workflows/short-read-mngs --tag czid-short-read-mngs export MINIWDL__DOWNLOAD_CACHE__PUT=true export MINIWDL__DOWNLOAD_CACHE__GET=true export MINIWDL__DOWNLOAD_CACHE__DIR=/tmp/miniwdl_download_cache @@ -40,7 +40,7 @@ export MINIWDL__DOWNLOAD_CACHE__DIR=/tmp/miniwdl_download_cache The following invocation runs two small synthetic samples using the viral reference databases (roughly 6GB download): ```bash -czid-workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ \ --docker-image-id czid-short-read-mngs --settings default --verbose \ idseq_bench_3 idseq_bench_5 ``` @@ -48,7 +48,7 @@ czid-workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ This would run those and two other samples on the full-size databases, which takes a few hours: ```bash -czid-workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/run_local.py --dir my_benchmarks/ \ --docker-image-id czid-short-read-mngs --settings default --databases full \ idseq_bench_3 idseq_bench_5 atcc_staggered atcc_even ``` @@ -60,7 +60,7 @@ The available benchmark samples are listed in [benchmarks.yml](benchmarks.yml). `run_dev.py` submits requested samples to the idseq-dev SFN-WDL backend, using a given [released version vA.B.C](https://github.com/chanzuckerberg/czid-workflows/releases) of the WDL code (not necessarily the checked-out revision!) and the full-size reference databases. The invoking shell session must be pre-configured with an appropriate AWS profile for control of the idseq-dev infrastructure. 
```bash -czid-workflows/short-read-mngs/auto_benchmark/run_dev.py --workflow-version vA.B.C \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/run_dev.py --workflow-version vA.B.C \ idseq_bench_3 idseq_bench_5 atcc_staggered atcc_even ``` @@ -80,7 +80,7 @@ taxadb create -i taxadb --dbname taxadb.sqlite Harvesting local run folders generated by `run_local.py` (printed at the end of its standard output): ```bash -czid-workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite \ idseq_bench_3=my_benchmarks/idseq_bench_3/ idseq_bench_5=my_benchmarks/idseq_bench_5/ \ > my_benchmarks.json ``` @@ -88,7 +88,7 @@ czid-workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite or S3 folders from`run_dev.py`: ```bash -czid-workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite \ +czid-workflows/workflows/short-read-mngs/auto_benchmark/harvest.py --taxadb taxadb.sqlite \ idseq_bench_3=s3://idseq-samples-development/auto_benchmark/YYYYMMDD_HHmmss_default_latest/idseq_bench_3/results/short-read-mngs-A/ \ idseq_bench_5=s3://idseq-samples-development/auto_benchmark/YYYYMMDD_HHmmss_default_latest/idseq_bench_5/results/short-read-mngs-A/ \ > my_benchmarks.json @@ -101,10 +101,10 @@ Finally, run the Jupyter notebook to compare the generated results with the refe ```bash docker run -v $(pwd):/mnt \ --env HARVEST_DATA=/mnt/my_benchmarks.json \ - --env REF_LIB=/mnt/czid-workflows/short-read-mngs/auto_benchmark/ref_libs/default_viral \ + --env REF_LIB=/mnt/czid-workflows/workflows/short-read-mngs/auto_benchmark/ref_libs/default_viral \ --env "RUN_NAME=default_viral_vA.B.C" \ jupyter/scipy-notebook:latest jupyter nbconvert --execute --to html --no-input --output-dir /mnt \ - /mnt/czid-workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb + /mnt/czid-workflows/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb ``` Then find `idseq-short-read-mngs-benchmarks.html` in your working directory! (Note: when using the viral databases, the precision-recall curves compared to the truth sets are very poor, correctly so because the simulated datasets include non-viral species.) @@ -116,7 +116,7 @@ Change "viral" to "full" if you used the full-sized databases. Strike `--no-inpu Suppose the results in `my_benchmarks.json` differ from the reference library in an expected way due to pipeline code changes. You can update the reference values like so: ```bash -jq .idseq_bench_3 my_benchmarks.json > czid-workflows/short-read-mngs/auto_benchmark/ref_libs/default_viral/idseq_bench_3.json +jq .idseq_bench_3 my_benchmarks.json > czid-workflows/workflows/short-read-mngs/auto_benchmark/ref_libs/default_viral/idseq_bench_3.json ``` Rerun the notebook to verify it now reports identical results, and check into git. 
@@ -126,7 +126,7 @@ Rerun the notebook to verify it now reports identical results, and check into gi You can edit the notebook by opening it in a local Jupyter server started like so: ```bash -docker run -v $(pwd)/czid-workflows/short-read-mngs/auto_benchmark:/home/jovyan \ +docker run -v $(pwd)/czid-workflows/workflows/short-read-mngs/auto_benchmark:/home/jovyan \ -p 8888:8888 jupyter/scipy-notebook:latest ``` diff --git a/workflows/short-read-mngs/auto_benchmark/benchmarks.yml b/workflows/short-read-mngs/auto_benchmark/benchmarks.yml index 12679b7f2..24663e5ee 100644 --- a/workflows/short-read-mngs/auto_benchmark/benchmarks.yml +++ b/workflows/short-read-mngs/auto_benchmark/benchmarks.yml @@ -17,11 +17,11 @@ settings: databases: viral: - host_filter.star_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar - host_filter.bowtie2_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/bowtie2_genome.tar - host_filter.gsnap_genome: s3://czid-public-references/test/gsnap/ERCC_gsnap2017-11-15_k16.tar - host_filter.human_star_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar - host_filter.human_bowtie2_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/bowtie2_genome.tar + host_filter.bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar + host_filter.hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar + host_filter.kallisto_idx: s3://public-test-bucket-idseq/host_filter/human/2022/kallisto_idx/GRCh38_ERCC.kallisto.idx + host_filter.human_bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar + host_filter.human_hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar minimap2_local_db_path: s3://czid-public-references/test/viral-alignment-indexes/viral_nt diamond_local_db_path: s3://czid-public-references/test/viral-alignment-indexes/viral_nr diamond_args: mid-sensitive diff --git a/workflows/short-read-mngs/auto_benchmark/harvest.py b/workflows/short-read-mngs/auto_benchmark/harvest.py index c3b175a0e..ebd6f7d3c 100755 --- a/workflows/short-read-mngs/auto_benchmark/harvest.py +++ b/workflows/short-read-mngs/auto_benchmark/harvest.py @@ -90,7 +90,7 @@ def harvest_sample(sample, outputs_json, taxadb): # collect read counts at various pipeline steps ans["paired"] = ( - outputs_json["czid_short_read_mngs.host_filter.star_out_unmapped2_fastq"] + outputs_json["czid_short_read_mngs.host_filter.fastp_out_fastp2_fastq"] is not None ) ans["input_reads"] = read_output_jsonfile(outputs_json, "host_filter.input_read_count")[ @@ -98,14 +98,11 @@ def harvest_sample(sample, outputs_json, taxadb): ] for step in [ "validate_input", - "star", - "trimmomatic", - "priceseq", + "fastp", + "bowtie2_host_filtered", + "hisat2_host_filtered", "czid_dedup", - "lzw", - "bowtie2", "subsampled", - "gsnap_filter", ]: ans[step + "_reads"] = read_output_jsonfile( outputs_json, "host_filter." 
+ step + "_out_count" diff --git a/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb b/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb index a46e0fbbd..14d3fd37f 100644 --- a/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb +++ b/workflows/short-read-mngs/auto_benchmark/short-read-mngs-benchmarks.ipynb @@ -202,9 +202,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "def taxa_dataframe(sample_data, sample_reads, db):\n", @@ -218,7 +216,7 @@ "\n", "def joined_taxa_dataframe(sample_data):\n", " # figure rPM denominator\n", - " sample_reads = sample_data[\"counts\"][\"gsnap_filter_reads\"]\n", + " sample_reads = sample_data[\"counts\"][\"subsampled_reads\"]\n", " if sample_data[\"counts\"][\"paired\"]:\n", " sample_reads *= 2\n", "\n", @@ -287,9 +285,7 @@ { "cell_type": "code", "execution_count": null, - "metadata": { - "scrolled": false - }, + "metadata": {}, "outputs": [], "source": [ "for sample, joined in taxa_tables.items():\n", @@ -338,14 +334,23 @@ "hash": "aee8b7b246df8f9039afb4144a1f6fd8d2ca17a180786b69acc140d282b71a49" }, "kernelspec": { - "display_name": "Python 3.9.1 64-bit", + "display_name": "Python 3 (ipykernel)", + "language": "python", "name": "python3" }, "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", "name": "python", - "version": "" + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/workflows/short-read-mngs/host_filter.wdl b/workflows/short-read-mngs/host_filter.wdl index 04c1b2c40..6268c7739 100644 --- a/workflows/short-read-mngs/host_filter.wdl +++ b/workflows/short-read-mngs/host_filter.wdl @@ -1,675 +1,887 @@ version 1.0 -task RunValidateInput { +# CZ ID short-read-mngs pipeline stage 1 (2022 version): +# - input validation & QC +# - host & human filtering +# - deduplication +# - subsampling +workflow czid_host_filter { input { + File fastqs_0 + File? fastqs_1 + String nucleotide_type = "DNA" + + File adapter_fasta + + String host_genome + File bowtie2_index_tar + File hisat2_index_tar + File kallisto_idx + File? gtf_gz # Ensembl GTF for host species + + File human_bowtie2_index_tar + File human_hisat2_index_tar + + Int max_input_fragments + Int max_subsample_fragments + + Int cpu = 16 String docker_image_id + + # legacy idseq-dag inputs: + String file_ext = "fastq" String s3_wd_uri - Array[File] fastqs - Int max_input_fragments - String file_ext } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_validate_input \ - --step-class PipelineStepRunValidateInput \ - --step-name validate_input_out \ - --input-files '[["~{sep='","' fastqs}"]]' \ - --output-files '["validate_input_summary.json", ~{if length(fastqs) == 2 then '"valid_input1.fastq", "valid_input2.fastq"' else '"valid_input1.fastq"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{"truncate_fragments_to": ~{max_input_fragments}, "file_ext": "~{file_ext}"}' - >>> - output { - String step_description_md = read_string("validate_input_out.description.md") - File validate_input_summary_json = "validate_input_summary.json" - File valid_input1_fastq = "valid_input1.fastq" - File? valid_input2_fastq = "valid_input2.fastq" - File? 
output_read_count = "validate_input_out.count" - File? input_read_count = "fastqs.count" + + # Validate input reads (and truncate if very large) + call RunValidateInput { + input: + reads1_fastq = fastqs_0, + reads2_fastq = fastqs_1, + file_ext = file_ext, + max_input_fragments = max_input_fragments, + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri } - runtime { - docker: docker_image_id + + # Adapter trimming and QC filtering + call fastp_qc { + input: + valid_input1_fastq = RunValidateInput.valid_input1_fastq, + valid_input2_fastq = RunValidateInput.valid_input2_fastq, + adapter_fasta = adapter_fasta, + docker_image_id = docker_image_id, + cpu = cpu + } + + # Quantify host transcripts and ERCC + # NOTE: we run kallisto even if nucleotide_type == "DNA" in order to get ERCC read counts. + # The transcript & gene abundances are ~meaningless in that case, of course. This isn't a big + # wasted cost because kallisto is so fast. + call kallisto { + input: + fastp1_fastq = fastp_qc.fastp1_fastq, + fastp2_fastq = fastp_qc.fastp2_fastq, + kallisto_idx = kallisto_idx, + gtf_gz = gtf_gz, + docker_image_id = docker_image_id, + cpu = cpu } -} -task RunStar { - input { - String docker_image_id - String s3_wd_uri - File validate_input_summary_json - Array[File] valid_input_fastq - File star_genome - String nucleotide_type - String host_genome - String genome_dir = "STAR_genome/part-0/" + # Filter out host reads. + # Two stages: bowtie2 --very-sensitive-local, followed by splice-aware HISAT2. + call bowtie2_filter { + input: + fastp1_fastq = fastp_qc.fastp1_fastq, + fastp2_fastq = fastp_qc.fastp2_fastq, + index_tar = bowtie2_index_tar, + docker_image_id = docker_image_id, + cpu = cpu } - command<<< - # TODO(Ryan): remove when status upload is not dependent on idseq-dag see: https://app.shortcut.com/idseq/story/163323 - # this comment is for the miniwdl plugin uploader to parse: - # --step-name star_out - set -euxo pipefail - python3 < star_out_version.txt - rm "~{genome_dir}"/SAindex # the star genome is pretty big (1.5G) - rm "~{genome_dir}"/Genome - >>> - output { - String step_description_md = read_string("star_out.description.md") - File unmapped1_fastq = "unmapped1.fastq" - File output_log_file = "Log.final.out" - File? unmapped2_fastq = "unmapped2.fastq" - File? aligned_file = "Aligned.out.bam" - File? output_read_count = "star_out.count" - File? output_gene_file = "reads_per_gene.star.tab" - File? output_metrics_file = "picard_insert_metrics.txt" - File? output_histogram_file = "insert_size_histogram.pdf" - File? version = "star_out_version.txt" + # Deduplicate filtered reads using custom czid-dedup tool. + # It retains one exemplar [pair] from each duplicate cluster, and produces mapping from exemplar + # read name to cluster size. + call RunCZIDDedup { + input: + hisat2_filtered1_fastq = hisat2_filtered1_fastq, + hisat2_filtered2_fastq = hisat2_filtered2_fastq, + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri } - runtime { - docker: docker_image_id + + # Subsample remaining reads. + call RunSubsample { + input: + dedup1_fastq = RunCZIDDedup.dedup1_fastq, + dedup2_fastq = RunCZIDDedup.dedup2_fastq, + duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, + max_subsample_fragments = max_subsample_fragments, + docker_image_id = docker_image_id, + s3_wd_uri = s3_wd_uri + } + + output { + File input_read_count = RunValidateInput.reads_in_count + File validate_input_out_valid_input1_fastq = RunValidateInput.valid_input1_fastq + File? 
validate_input_out_valid_input2_fastq = RunValidateInput.valid_input2_fastq + File validate_input_out_validate_input_summary_json = RunValidateInput.validate_input_summary_json + File validate_input_out_count = RunValidateInput.reads_out_count + + File fastp_out_fastp1_fastq = fastp_qc.fastp1_fastq + File? fastp_out_fastp2_fastq = fastp_qc.fastp2_fastq + File fastp_out_count = fastp_qc.reads_out_count + File fastp_html = fastp_qc.fastp_html + File fastp_json = fastp_qc.fastp_json + + File kallisto_transcript_abundance_tsv = kallisto.transcript_abundance_tsv + File kallisto_ERCC_counts_tsv = kallisto.ERCC_counts_tsv + File? kallisto_gene_abundance_tsv = kallisto.gene_abundance_tsv + + File bowtie2_host_filtered1_fastq = bowtie2_filter.bowtie2_host_filtered1_fastq + File? bowtie2_host_filtered2_fastq = bowtie2_filter.bowtie2_host_filtered2_fastq + File bowtie2_host_filtered_out_count = bowtie2_filter.reads_out_count + File bowtie2_host_filtered_bam = bowtie2_filter.bam + File hisat2_host_filtered1_fastq = hisat2_filter.hisat2_host_filtered1_fastq + File? hisat2_host_filtered2_fastq = hisat2_filter.hisat2_host_filtered2_fastq + File hisat2_host_filtered_out_count = hisat2_filter.reads_out_count + + File? insert_size_metrics = collect_insert_size_metrics.insert_size_metrics + File? insert_size_histogram = collect_insert_size_metrics.insert_size_histogram + + File? bowtie2_human_filtered1_fastq = bowtie2_human_filter.bowtie2_human_filtered1_fastq + File? bowtie2_human_filtered2_fastq = bowtie2_human_filter.bowtie2_human_filtered2_fastq + File? bowtie2_human_filtered_out_count = bowtie2_human_filter.reads_out_count + File? hisat2_human_filtered1_fastq = hisat2_human_filter.hisat2_human_filtered1_fastq + File? hisat2_human_filtered2_fastq = hisat2_human_filter.hisat2_human_filtered2_fastq + File? hisat2_human_filtered_out_count = hisat2_human_filter.reads_out_count + + File czid_dedup_out_dedup1_fastq = RunCZIDDedup.dedup1_fastq + File? czid_dedup_out_dedup2_fastq = RunCZIDDedup.dedup2_fastq + File czid_dedup_out_duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv + File czid_dedup_out_duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv + File czid_dedup_out_count = RunCZIDDedup.reads_out_count + + File subsampled_out_subsampled_1_fa = RunSubsample.subsampled_1_fa + File? subsampled_out_subsampled_2_fa = RunSubsample.subsampled_2_fa + File? subsampled_out_subsampled_merged_fa = RunSubsample.subsampled_merged_fa + File subsampled_out_count = RunSubsample.reads_out_count } } -task RunTrimmomatic { +task RunValidateInput { input { + File reads1_fastq + File? 
reads2_fastq + String file_ext + + Int max_input_fragments + String docker_image_id String s3_wd_uri - Array[File] unmapped_fastq - File adapter_fasta } + Boolean paired = defined(reads2_fastq) command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_trimmomatic \ - --step-class PipelineStepRunTrimmomatic \ - --step-name trimmomatic_out \ - --input-files '[["~{sep='","' unmapped_fastq}"]]' \ - --output-files '[~{if length(unmapped_fastq) == 2 then '"trimmomatic1.fastq", "trimmomatic2.fastq"' else '"trimmomatic1.fastq"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"adapter_fasta": "~{adapter_fasta}"}' \ - --additional-attributes '{}' - java -jar /usr/local/bin/trimmomatic-0.38.jar -version > trimmomatic_version.txt - + set -euxo pipefail + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_validate_input \ + --step-class PipelineStepRunValidateInput \ + --step-name validate_input_out \ + --input-files '[["~{sep='","' select_all([reads1_fastq, reads2_fastq])}"]]' \ + --output-files '["validate_input_summary.json", ~{if paired then '"valid_input1.fastq", "valid_input2.fastq"' else '"valid_input1.fastq"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"truncate_fragments_to": ~{max_input_fragments}, "file_ext": "~{file_ext}"}' >>> output { - String step_description_md = read_string("trimmomatic_out.description.md") - File trimmomatic1_fastq = "trimmomatic1.fastq" - File? trimmomatic2_fastq = "trimmomatic2.fastq" - File? output_read_count = "trimmomatic_out.count" - File? version = "trimmomatic_version.txt" + String step_description_md = read_string("validate_input_out.description.md") + File validate_input_summary_json = "validate_input_summary.json" + File valid_input1_fastq = "valid_input1.fastq" + File? valid_input2_fastq = "valid_input2.fastq" + File reads_out_count = "validate_input_out.count" + File reads_in_count = "fastqs.count" } runtime { docker: docker_image_id + cpu: 4 + memory: "8G" } } -task RunPriceSeq { +task fastp_qc { + # fastp all-in-one for + # - adapter trimming + # - quality filtering + # - complexity filtering input { + File valid_input1_fastq + File? 
valid_input2_fastq + File adapter_fasta + + # These default QC thresholds are loosely based on the pre-2022 pipeline using PriceSeq & LZW + String fastp_options = "--dont_eval_duplication --length_required 35" + + " --qualified_quality_phred 17 --unqualified_percent_limit 15 --n_base_limit 15" + + " --sdust_complexity_filter --complexity_threshold 60" + String docker_image_id - String s3_wd_uri - Array[File] trimmomatic_fastq + Int cpu = 16 } + Boolean paired = defined(valid_input2_fastq) + String fastp_invocation = "fastp" + + " -i ${valid_input1_fastq} ${'-I ' + valid_input2_fastq}" + + " -o fastp1.fastq ${if (paired) then '-O fastp2.fastq' else ''}" + + " -w ${cpu} ${fastp_options}" + + " --adapter_fasta ${adapter_fasta} ${if (paired) then '--detect_adapter_for_pe' else ''}" + command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_priceseq \ - --step-class PipelineStepRunPriceSeq \ - --step-name priceseq_out \ - --input-files '[["~{sep='","' trimmomatic_fastq}"]]' \ - --output-files '[~{if length(trimmomatic_fastq) == 2 then '"priceseq1.fa", "priceseq2.fa"' else '"priceseq1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{}' - PriceSeqFilter 2> /dev/null | head -n1 > priceseq_version.txt + set -euxo pipefail + ~{fastp_invocation} + count="$(jq .read1_after_filtering.total_reads fastp.json)" + if [ '~{paired}' == 'true' ]; then + count=$((2 * count)) + fi + jq --null-input --arg count "$count" '{"fastp_out":$count}' > fastp_out.count + # TODO: extract insert size metrics from JSON, also render histogram? + + python3 - << 'EOF' + import textwrap + with open("fastp.description.md", "w") as outfile: + print(textwrap.dedent(""" + **fastp read trimming & filtering** + + Processes the reads using [fastp](https://github.com/OpenGene/fastp): + + 1. Trim adapters + 2. Quality score filter + 3. Non-called base (N) filter + 4. Length filter + 5. Complexity filter ([custom feature](https://github.com/mlin/fastp/tree/mlin/sdust) + using the [SDUST algorithm](https://pubmed.ncbi.nlm.nih.gov/16796549/)) + + fastp is run on the FASTQ file(s) from input validation: + ``` + ~{fastp_invocation} + ``` + + fastp documentation can be found [here](https://github.com/OpenGene/fastp) + """).strip(), file=outfile) + EOF >>> output { - String step_description_md = read_string("priceseq_out.description.md") - File priceseq1_fa = "priceseq1.fa" - File? priceseq2_fa = "priceseq2.fa" - File? output_read_count = "priceseq_out.count" - File? version = "priceseq_version.txt" + String step_description_md = read_string("fastp.description.md") + File fastp1_fastq = "fastp1.fastq" + File? fastp2_fastq = "fastp2.fastq" + File fastp_html = "fastp.html" + File fastp_json = "fastp.json" + File reads_out_count = "fastp_out.count" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu}G" } } -task RunCZIDDedup { +task kallisto { input { + File fastp1_fastq + File? fastp2_fastq + File kallisto_idx + File? 
gtf_gz + String kallisto_options = "" + String docker_image_id - String s3_wd_uri - Array[File] priceseq_fa - } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_czid_dedup \ - --step-class PipelineStepRunCZIDDedup \ - --step-name czid_dedup_out \ - --input-files '[["~{sep='","' priceseq_fa}"]]' \ - --output-files '[~{if length(priceseq_fa) == 2 then '"dedup1.fa", "dedup2.fa"' else '"dedup1.fa"'}, "clusters.csv", "duplicate_cluster_sizes.tsv"]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{}' - czid-dedup --version > czid_dedup_version.txt + Int cpu = 16 + } + Boolean paired = defined(fastp2_fastq) + # TODO: input fragment length parameters for non-paired-end (l = average, s = std dev) + String kallisto_invocation = "/kallisto/kallisto quant" + + " -i '${kallisto_idx}' -o $(pwd) --plaintext ${if (paired) then '' else '--single -l 200 -s 20'} ${kallisto_options} -t ${cpu}" + + " '~{fastp1_fastq}'" + if (defined(fastp2_fastq)) then " '~{fastp2_fastq}'" else "" + + command <<< + set -euxo pipefail + + # NOTE: kallisto exit code will be 1 if no reads pseudoalign, which we don't necessarily + # consider an error. Therefore decide success based on existence of run_info.json and + # abundance.tsv + ~{kallisto_invocation} || true + >&2 jq . run_info.json + + mv abundance.tsv reads_per_transcript.kallisto.tsv + + # extract ERCC counts + echo -e "target_id\test_counts" > ERCC_counts.tsv + grep ERCC- reads_per_transcript.kallisto.tsv | cut -f1,4 >> ERCC_counts.tsv + + # If we've been provided the GTF, then roll up the transcript abundance estimates by gene. + if [[ -n '~{gtf_gz}' ]]; then + python3 - reads_per_transcript.kallisto.tsv '~{gtf_gz}' << 'EOF' + # Given kallisto output tsv based on index of Ensembl transcripts FASTA, and matching + # Ensembl GTF, report the total est_counts and tpm for each gene (sum over all transcripts + # of each gene). + import sys + import pandas as pd + import gtfparse + + kallisto_df = pd.read_csv(sys.argv[1], sep="\t") + + gtf_df = gtfparse.read_gtf(sys.argv[2]) + tx_df = gtf_df[gtf_df["feature"] == "transcript"][ + ["transcript_id", "transcript_version", "gene_id"] + ] + # kallisto target_id is a versioned transcript ID e.g. "ENST00000390446.3", while the GTF + # breaks out: transcript_id "ENST00000390446"; transcript_version "3"; + # synthesize a column with the versioned transcript ID for merging. + tx_df = tx_df.assign( + transcript_id_version=tx_df["transcript_id"] + "." + tx_df["transcript_version"] + ) + + merged_df = pd.merge( + kallisto_df[["target_id", "est_counts", "tpm"]], + tx_df[["transcript_id_version", "gene_id"]], + left_on="target_id", + right_on="transcript_id_version", + ) + + gene_abundance = merged_df.groupby("gene_id").sum(numeric_only=True) + gene_abundance.to_csv("reads_per_gene.kallisto.tsv", sep="\t") + EOF + fi + + python3 - << 'EOF' + import textwrap + with open("kallisto.description.md", "w") as outfile: + print(textwrap.dedent(""" + **kallisto RNA quantification** + + Quantifies host transcripts using [kallisto](https://pachterlab.github.io/kallisto/about). + The host transcript sequences are sourced from Ensembl, along with + [ERCC control sequences](https://www.nist.gov/programs-projects/external-rna-controls-consortium). + Not all CZ ID host species have transcripts indexed; for those without, kallisto is run using ERCC + sequences only. 
+ + kallisto is run on the fastp-filtered FASTQ(s): + + ``` + ~{kallisto_invocation} + ``` + + kallisto documentation can be found [here](https://pachterlab.github.io/kallisto/manual), including + details of the `transcript_abundance.tsv` output format. + """).strip(), file=outfile) + EOF >>> + output { - String step_description_md = read_string("czid_dedup_out.description.md") - File dedup1_fa = "dedup1.fa" - File? dedup2_fa = "dedup2.fa" - File duplicate_clusters_csv = "clusters.csv" - File duplicate_cluster_sizes_tsv = "duplicate_cluster_sizes.tsv" - File? output_read_count = "czid_dedup_out.count" - File? version = "czid_dedup_version.txt" + String step_description_md = read_string("kallisto.description.md") + File transcript_abundance_tsv = "reads_per_transcript.kallisto.tsv" + File ERCC_counts_tsv = "ERCC_counts.tsv" + File? gene_abundance_tsv = "reads_per_gene.kallisto.tsv" } + runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu}G" } } -task RunLZW { +task bowtie2_filter { + # Remove reads [pairs] with bowtie2 hits to the given index input { + File fastp1_fastq + File? fastp2_fastq + + # GENOME_NAME.bowtie2.tar should contain GENOME_NAME/GENOME_NAME.*.bt* + File index_tar + String bowtie2_options = "--very-sensitive-local" + String docker_image_id - String s3_wd_uri - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv + Int cpu = 16 } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_lzw \ - --step-class PipelineStepRunLZW \ - --step-name lzw_out \ - --input-files '[["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"lzw1.fa", "lzw2.fa"' else '"lzw1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{"thresholds": [0.45, 0.42], "threshold_readlength": 150}' + + Boolean paired = defined(fastp2_fastq) + String genome_name = basename(index_tar, ".bowtie2.tar") + String bowtie2_invocation = + "bowtie2 -x '/tmp/${genome_name}/${genome_name}' ${bowtie2_options} -p ${cpu}" + + (if (paired) then " -1 '${fastp1_fastq}' -2 '${fastp2_fastq}'" else " -U '${fastp1_fastq}'") + + " -q -S '/tmp/bowtie2.sam'" + + command <<< + set -euxo pipefail + + tar xf '~{index_tar}' -C /tmp + + ~{bowtie2_invocation} + + # generate sort & compressed BAM file for archival + samtools sort -n -o "bowtie2_host.bam" -@ 4 -T /tmp "/tmp/bowtie2.sam" & samtools_pid=$! + + # Extract reads [pairs] that did NOT map to the index + if [[ '~{paired}' == 'true' ]]; then + # 1 (read paired) + # 4 (read unmapped) + # + 8 (mate unmapped) + # ---- + # 13 + samtools fastq -f 13 -1 'bowtie2_host_filtered1.fastq' -2 'bowtie2_host_filtered2.fastq' -0 /dev/null -s /dev/null /tmp/bowtie2.sam + count="$(cat bowtie2_host_filtered{1,2}.fastq | wc -l)" + else + samtools fastq -f 4 /tmp/bowtie2.sam > 'bowtie2_host_filtered1.fastq' + count="$(cat bowtie2_host_filtered1.fastq | wc -l)" + fi + + + count=$((count / 4)) + jq --null-input --arg count "$count" '{"bowtie2_host_filtered_out":$count}' > 'bowtie2_host_filtered_out.count' + + python3 - << 'EOF' + import textwrap + with open("bowtie2.description.md", "w") as outfile: + print(textwrap.dedent(""" + **bowtie2 host filtering** + + Filters out reads matching the host genome using + [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml). 
Runs + `bowtie2 ~{bowtie2_options}` using a precomputed index, then uses + [samtools](http://www.htslib.org/) to keep reads *not* mapping to the host genome. + + Bowtie2 is run on the fastp-filtered FASTQ(s): + + ``` + ~{bowtie2_invocation} + ``` + + Then, non-mapping reads are selected using `samtools fastq -f ~{if (paired) then 13 else 4}`. + + Bowtie2 documentation can be found [here](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml) + """).strip(), file=outfile) + EOF + + wait $samtools_pid >>> + output { - String step_description_md = read_string("lzw_out.description.md") - File lzw1_fa = "lzw1.fa" - File? lzw2_fa = "lzw2.fa" - File? output_read_count = "lzw_out.count" + String step_description_md = read_string("bowtie2.description.md") + File bowtie2_host_filtered1_fastq = "bowtie2_host_filtered1.fastq" + File? bowtie2_host_filtered2_fastq = "bowtie2_host_filtered2.fastq" + File reads_out_count = "bowtie2_host_filtered_out.count" + File bam = "bowtie2_host.bam" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu*2}G" } } -task RunBowtie2_bowtie2_out { +task hisat2_filter { + # Remove reads [pairs] with HISAT2 hits to the given index input { + File bowtie2_host_filtered1_fastq + File? bowtie2_host_filtered2_fastq + + # GENOME_NAME.hisat2.tar should contain GENOME_NAME/GENOME_NAME.*.ht2 + File index_tar + String hisat2_options = "" + String docker_image_id - String s3_wd_uri - Array[File] lzw_fa - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - File bowtie2_genome + Int cpu = 10 } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_bowtie2 \ - --step-class PipelineStepRunBowtie2 \ - --step-name bowtie2_out \ - --input-files '[["~{sep='","' lzw_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(lzw_fa) == 2 then '"bowtie2_1.fa", "bowtie2_2.fa", "bowtie2_merged.fa"' else '"bowtie2_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"bowtie2_genome": "~{bowtie2_genome}"}' \ - --additional-attributes '{"output_sam_file": "bowtie2.sam"}' - bowtie2 --version > bowtie2_version.txt + + Boolean paired = defined(bowtie2_host_filtered2_fastq) + String genome_name = basename(index_tar, ".hisat2.tar") + String hisat2_invocation = + "/hisat2/hisat2 -x '/tmp/${genome_name}/${genome_name}' ${hisat2_options} -p ${cpu}" + + (if (paired) then " -1 '${bowtie2_host_filtered1_fastq}' -2 '${bowtie2_host_filtered2_fastq}'" else " -U '${bowtie2_host_filtered1_fastq}'") + + " -q -S /tmp/hisat2.sam" + + command <<< + set -euxo pipefail + + tar xf '~{index_tar}' -C /tmp + + ~{hisat2_invocation} + + # Extract reads [pairs] that did NOT map to the index + if [[ '~{paired}' == 'true' ]]; then + # 1 (read paired) + # 4 (read unmapped) + # + 8 (mate unmapped) + # ---- + # 13 + samtools fastq -f 13 -1 'hisat2_host_filtered1.fastq' -2 'hisat2_host_filtered2.fastq' -0 /dev/null -s /dev/null /tmp/hisat2.sam + count="$(cat hisat2_host_filtered{1,2}.fastq | wc -l)" + else + samtools fastq -f 4 /tmp/hisat2.sam > 'hisat2_host_filtered1.fastq' + count="$(cat hisat2_host_filtered1.fastq | wc -l)" + fi + + count=$((count / 4)) + jq --null-input --arg count "$count" '{"hisat2_host_filtered_out":$count}' > 'hisat2_host_filtered_out.count' + + python3 - << 'EOF' + import textwrap + with open("hisat2.description.md", "w") as outfile: + print(textwrap.dedent(""" + **HISAT2 host filtering** + + Filters out reads 
matching the host genome using + [HISAT2](http://daehwankimlab.github.io/hisat2/). Runs `hisat2` using a precomputed index, + then uses [samtools](http://www.htslib.org/) to keep reads *not* mapping to the + host genome. + + HISAT2 complements Bowtie2 with a different algorithm that also models potential RNA splice + junctions (if CZ ID indexes transcript models for the host). + + HISAT2 is run on the bowtie2-filtered FASTQ(s): + + ``` + ~{hisat2_invocation} + ``` + + Then, non-mapping reads are selected using `samtools fastq -f ~{if (paired) then 13 else 4}`. + + HISAT2 documentation can be found [here](http://daehwankimlab.github.io/hisat2/) + """).strip(), file=outfile) + EOF >>> + output { - String step_description_md = read_string("bowtie2_out.description.md") - File bowtie2_1_fa = "bowtie2_1.fa" - File? bowtie2_2_fa = "bowtie2_2.fa" - File? bowtie2_merged_fa = "bowtie2_merged.fa" - File? output_read_count = "bowtie2_out.count" - File? version = "bowtie2_version.txt" + String step_description_md = read_string("hisat2.description.md") + File hisat2_host_filtered1_fastq = "hisat2_host_filtered1.fastq" + File? hisat2_host_filtered2_fastq = "hisat2_host_filtered2.fastq" + File reads_out_count = "hisat2_host_filtered_out.count" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu*4}G" } } -task RunSubsample { +################################################################################################### +### NOTE: bowtie2_human_filter and hisat2_human_filter are roughly copy/paste of the _host_filter +### tasks above. We'd much prefer to consolidate them, but the webapp pipeline visualization +### isn't yet able to handle WDL tasks used multiple times with dynamic output filenames. +################################################################################################### + +task bowtie2_human_filter { + # Remove reads [pairs] with bowtie2 hits to the given index input { + File hisat2_host_filtered1_fastq + File? 
hisat2_host_filtered2_fastq + + # GENOME_NAME.bowtie2.tar should contain GENOME_NAME/GENOME_NAME.*.bt* + File index_tar + String bowtie2_options = "--very-sensitive-local" + String docker_image_id - String s3_wd_uri - Array[File] bowtie2_fa - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - Int max_subsample_fragments - } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_subsample \ - --step-class PipelineStepRunSubsample \ - --step-name subsampled_out \ - --input-files '[["~{sep='","' bowtie2_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"subsampled_1.fa", "subsampled_2.fa", "subsampled_merged.fa"' else '"subsampled_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{}' \ - --additional-attributes '{"max_fragments": ~{max_subsample_fragments}}' + Int cpu = 16 + } + + Boolean paired = defined(hisat2_host_filtered2_fastq) + String genome_name = basename(index_tar, ".bowtie2.tar") + String bowtie2_invocation = + "bowtie2 -x '/tmp/${genome_name}/${genome_name}' ${bowtie2_options} -p ${cpu}" + + (if (paired) then " -1 '${hisat2_host_filtered1_fastq}' -2 '${hisat2_host_filtered2_fastq}'" else " -U '${hisat2_host_filtered1_fastq}'") + + " -q -S '/tmp/bowtie2.sam'" + + command <<< + set -euxo pipefail + + tar xf '~{index_tar}' -C /tmp + + ~{bowtie2_invocation} + + # generate sort & compressed BAM file for archival + samtools sort -n -o "bowtie2_human.bam" -@ 4 -T /tmp "/tmp/bowtie2.sam" & samtools_pid=$! + + # Extract reads [pairs] that did NOT map to the index + if [[ '~{paired}' == 'true' ]]; then + # 1 (read paired) + # 4 (read unmapped) + # + 8 (mate unmapped) + # ---- + # 13 + samtools fastq -f 13 -1 'bowtie2_human_filtered1.fastq' -2 'bowtie2_human_filtered2.fastq' -0 /dev/null -s /dev/null /tmp/bowtie2.sam + count="$(cat bowtie2_human_filtered{1,2}.fastq | wc -l)" + else + samtools fastq -f 4 /tmp/bowtie2.sam > 'bowtie2_human_filtered1.fastq' + count="$(cat bowtie2_human_filtered1.fastq | wc -l)" + fi + + count=$((count / 4)) + jq --null-input --arg count "$count" '{"bowtie2_human_filtered_out":$count}' > 'bowtie2_human_filtered_out.count' + + python3 - << 'EOF' + import textwrap + with open("bowtie2.description.md", "w") as outfile: + print(textwrap.dedent(""" + **bowtie2 human filtering** + + Filters out reads matching the human genome using + [Bowtie2](https://bowtie-bio.sourceforge.net/bowtie2/index.shtml). This is similar to the + host filtering task, but CZ ID also filters non-human samples against human genome indexes + to alleviate any potential data privacy concerns. + """).strip(), file=outfile) + EOF + + wait $samtools_pid >>> + output { - String step_description_md = read_string("subsampled_out.description.md") - File subsampled_1_fa = "subsampled_1.fa" - File? subsampled_2_fa = "subsampled_2.fa" - File? subsampled_merged_fa = "subsampled_merged.fa" - File? output_read_count = "subsampled_out.count" + String step_description_md = read_string("bowtie2.description.md") + File bowtie2_human_filtered1_fastq = "bowtie2_human_filtered1.fastq" + File? 
bowtie2_human_filtered2_fastq = "bowtie2_human_filtered2.fastq" + File reads_out_count = "bowtie2_human_filtered_out.count" + File bam = "bowtie2_human.bam" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu*2}G" } } -task RunStarDownstream { +task hisat2_human_filter { + # Remove reads [pairs] with HISAT2 hits to the given index input { + File bowtie2_human_filtered1_fastq + File? bowtie2_human_filtered2_fastq + + # GENOME_NAME.hisat2.tar should contain GENOME_NAME/GENOME_NAME.*.ht2 + File index_tar + String hisat2_options = "" + String docker_image_id - String s3_wd_uri - Array[File] subsampled_fa - File validate_input_summary_json - Array[File] valid_input_fastq - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - File human_star_genome - } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_star_downstream \ - --step-class PipelineStepRunStarDownstream \ - --step-name star_human_out \ - --input-files '[["~{sep='","' subsampled_fa}"], ["~{validate_input_summary_json}", "~{sep='","' valid_input_fastq}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"unmapped_human_1.fa", "unmapped_human_2.fa"' else '"unmapped_human_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"star_genome": "~{human_star_genome}"}' \ - --additional-attributes '{}' - STAR --version > star_human_version.txt + Int cpu = 16 + } + + Boolean paired = defined(bowtie2_human_filtered2_fastq) + String genome_name = basename(index_tar, ".hisat2.tar") + String hisat2_invocation = + "/hisat2/hisat2 -x '/tmp/${genome_name}/${genome_name}' ${hisat2_options} -p ${cpu}" + + (if (paired) then " -1 '${bowtie2_human_filtered1_fastq}' -2 '${bowtie2_human_filtered2_fastq}'" else " -U '${bowtie2_human_filtered1_fastq}'") + + " -q -S /tmp/hisat2.sam" + + command <<< + set -euxo pipefail + + tar xf '~{index_tar}' -C /tmp + + ~{hisat2_invocation} + + # Extract reads [pairs] that did NOT map to the index + if [[ '~{paired}' == 'true' ]]; then + # 1 (read paired) + # 4 (read unmapped) + # + 8 (mate unmapped) + # ---- + # 13 + samtools fastq -f 13 -1 'hisat2_human_filtered1.fastq' -2 'hisat2_human_filtered2.fastq' -0 /dev/null -s /dev/null /tmp/hisat2.sam + count="$(cat hisat2_human_filtered{1,2}.fastq | wc -l)" + else + samtools fastq -f 4 /tmp/hisat2.sam > 'hisat2_human_filtered1.fastq' + count="$(cat hisat2_human_filtered1.fastq | wc -l)" + fi + + count=$((count / 4)) + jq --null-input --arg count "$count" '{"hisat2_human_filtered_out":$count}' > 'hisat2_human_filtered_out.count' + + python3 - << 'EOF' + import textwrap + with open("hisat2.description.md", "w") as outfile: + print(textwrap.dedent(""" + **HISAT2 human filtering** + + Filters out reads matching the human genome using + [HISAT2](http://daehwankimlab.github.io/hisat2/). This is similar to the host filtering + task, but CZ ID also filters non-human samples against human genome indexes to alleviate any + potential data privacy concerns. + """).strip(), file=outfile) + EOF >>> + output { - String step_description_md = read_string("star_human_out.description.md") - File unmapped_human_1_fa = "unmapped_human_1.fa" - File? unmapped_human_2_fa = "unmapped_human_2.fa" - File? output_read_count = "star_human_out.count" - File? 
version = "star_human_version.txt" + String step_description_md = read_string("hisat2.description.md") + File hisat2_human_filtered1_fastq = "hisat2_human_filtered1.fastq" + File? hisat2_human_filtered2_fastq = "hisat2_human_filtered2.fastq" + File reads_out_count = "hisat2_human_filtered_out.count" } runtime { docker: docker_image_id + cpu: cpu + memory: "~{cpu*2}G" } } -task RunBowtie2_bowtie2_human_out { +task collect_insert_size_metrics { input { + File bam String docker_image_id - String s3_wd_uri - Array[File] unmapped_human_fa - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - File human_bowtie2_genome } - command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_bowtie2 \ - --step-class PipelineStepRunBowtie2 \ - --step-name bowtie2_human_out \ - --input-files '[["~{sep='","' unmapped_human_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"bowtie2_human_1.fa", "bowtie2_human_2.fa", "bowtie2_human_merged.fa"' else '"bowtie2_human_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"bowtie2_genome": "~{human_bowtie2_genome}"}' \ - --additional-attributes '{"output_sam_file": "bowtie2_human.sam"}' - bowtie2 --version > bowtie2_human_version.txt + + command <<< + picard CollectInsertSizeMetrics 'I=~{bam}' O=picard_insert_metrics.txt H=insert_size_histogram.pdf + python3 - << 'EOF' + import textwrap + with open("collect_insert_size_metrics.description.md", "w") as outfile: + print(textwrap.dedent(""" + **Picard CollectInsertSizeMetrics** + + This step computes insert size metrics for Paired End samples. These metrics are computed by + the Broad Institute's Picard toolkit. + + Picard is run on the output BAM file obtained from running Bowtie2 on the host genome: + + ``` + picard CollectInsertSizeMetrics 'I=~{bam}' O=picard_insert_metrics.txt H=insert_size_histogram.pdf + ``` + + Picard documentation can be found [here](https://gatk.broadinstitute.org/hc/en-us/articles/360037055772-CollectInsertSizeMetrics-Picard-) + """).strip(), file=outfile) + EOF >>> + output { - String step_description_md = read_string("bowtie2_human_out.description.md") - File bowtie2_human_1_fa = "bowtie2_human_1.fa" - File? bowtie2_human_2_fa = "bowtie2_human_2.fa" - File? bowtie2_human_merged_fa = "bowtie2_human_merged.fa" - File? output_read_count = "bowtie2_human_out.count" - File? version = "bowtie2_human_version.txt" + String step_description_md = read_string("collect_insert_size_metrics.description.md") + # If no reads mapped to the host, then picard exits "successfully" without creating these files. + File? insert_size_metrics = "picard_insert_metrics.txt" + File? insert_size_histogram = "insert_size_histogram.pdf" } + runtime { docker: docker_image_id + cpu: 1 + memory: "8G" } } -task RunGsnapFilter { +task RunCZIDDedup { input { + File hisat2_filtered1_fastq + File? 
hisat2_filtered2_fastq String docker_image_id String s3_wd_uri - Array[File] subsampled_fa - Array[File] dedup_fa - File duplicate_clusters_csv - File duplicate_cluster_sizes_tsv - File gsnap_genome } + Boolean paired = defined(hisat2_filtered2_fastq) command<<< - set -euxo pipefail - idseq-dag-run-step --workflow-name host_filter \ - --step-module idseq_dag.steps.run_gsnap_filter \ - --step-class PipelineStepRunGsnapFilter \ - --step-name gsnap_filter_out \ - --input-files '[["~{sep='","' subsampled_fa}"], ["~{sep='","' dedup_fa}", "~{duplicate_clusters_csv}", "~{duplicate_cluster_sizes_tsv}"]]' \ - --output-files '[~{if length(dedup_fa) == 2 then '"gsnap_filter_1.fa", "gsnap_filter_2.fa", "gsnap_filter_merged.fa"' else '"gsnap_filter_1.fa"'}]' \ - --output-dir-s3 '~{s3_wd_uri}' \ - --additional-files '{"gsnap_genome": "~{gsnap_genome}"}' \ - --additional-attributes '{"output_sam_file": "gsnap_filter.sam"}' - gsnap --version > gsnap_filter_version.txt + set -euxo pipefail + + >&2 idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_czid_dedup \ + --step-class PipelineStepRunCZIDDedup \ + --step-name czid_dedup_out \ + --input-files '[["~{sep='","' select_all([hisat2_filtered1_fastq, hisat2_filtered2_fastq])}"]]' \ + --output-files '[~{if paired then '"dedup1.fastq","dedup2.fastq"' else '"dedup1.fastq"'}, "clusters.csv", "duplicate_cluster_sizes.tsv"]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{}' >>> output { - String step_description_md = read_string("gsnap_filter_out.description.md") - File gsnap_filter_1_fa = "gsnap_filter_1.fa" - File? gsnap_filter_2_fa = "gsnap_filter_2.fa" - File? gsnap_filter_merged_fa = "gsnap_filter_merged.fa" - File? output_read_count = "gsnap_filter_out.count" - File? version = "gsnap_filter_version.txt" + String step_description_md = read_string("czid_dedup_out.description.md") + File dedup1_fastq = "dedup1.fastq" + File? dedup2_fastq = "dedup2.fastq" + File duplicate_clusters_csv = "clusters.csv" + File duplicate_cluster_sizes_tsv = "duplicate_cluster_sizes.tsv" + File reads_out_count = "czid_dedup_out.count" } runtime { docker: docker_image_id + cpu: 4 + memory: "16G" } } - -workflow czid_host_filter { +task RunSubsample { input { - String docker_image_id - String s3_wd_uri - File fastqs_0 - File? fastqs_1 - String file_ext - String nucleotide_type - String host_genome - File adapter_fasta - File star_genome - File bowtie2_genome - File gsnap_genome = "s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/hg38_pantro5_k16.tar" - String human_star_genome - String human_bowtie2_genome - Int max_input_fragments + File dedup1_fastq + File? 
dedup2_fastq + File duplicate_cluster_sizes_tsv Int max_subsample_fragments - } - - call RunValidateInput { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - fastqs = select_all([fastqs_0, fastqs_1]), - file_ext = file_ext, - max_input_fragments = max_input_fragments - } - - call RunStar { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - validate_input_summary_json = RunValidateInput.validate_input_summary_json, - valid_input_fastq = select_all([RunValidateInput.valid_input1_fastq, RunValidateInput.valid_input2_fastq]), - star_genome = star_genome, - nucleotide_type = nucleotide_type, - host_genome = host_genome - } - - call RunTrimmomatic { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - unmapped_fastq = select_all([RunStar.unmapped1_fastq, RunStar.unmapped2_fastq]), - adapter_fasta = adapter_fasta - } - - call RunPriceSeq { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - trimmomatic_fastq = select_all([RunTrimmomatic.trimmomatic1_fastq, RunTrimmomatic.trimmomatic2_fastq]) - } - call RunCZIDDedup { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - priceseq_fa = select_all([RunPriceSeq.priceseq1_fa, RunPriceSeq.priceseq2_fa]) - } - - call RunLZW { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv - } - - call RunBowtie2_bowtie2_out { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - lzw_fa = select_all([RunLZW.lzw1_fa, RunLZW.lzw2_fa]), - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - bowtie2_genome = bowtie2_genome - } - - call RunSubsample { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - bowtie2_fa = select_all([RunBowtie2_bowtie2_out.bowtie2_1_fa, RunBowtie2_bowtie2_out.bowtie2_2_fa, RunBowtie2_bowtie2_out.bowtie2_merged_fa]), - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - max_subsample_fragments = max_subsample_fragments - } - - if (host_genome != "human") { - call RunStarDownstream { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - subsampled_fa = select_all([RunSubsample.subsampled_1_fa, RunSubsample.subsampled_2_fa, RunSubsample.subsampled_merged_fa]), - validate_input_summary_json = RunValidateInput.validate_input_summary_json, - valid_input_fastq = select_all([RunValidateInput.valid_input1_fastq, RunValidateInput.valid_input2_fastq]), - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - human_star_genome = human_star_genome - } - - call RunBowtie2_bowtie2_human_out { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - unmapped_human_fa = select_all([RunStarDownstream.unmapped_human_1_fa, RunStarDownstream.unmapped_human_2_fa]), - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = 
RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - human_bowtie2_genome = human_bowtie2_genome - } - } - - Array[File] gsnap_filter_input = if (host_genome == "human") - then select_all([RunSubsample.subsampled_1_fa, RunSubsample.subsampled_2_fa, RunSubsample.subsampled_merged_fa]) - else select_all([RunBowtie2_bowtie2_human_out.bowtie2_human_1_fa, RunBowtie2_bowtie2_human_out.bowtie2_human_2_fa, RunBowtie2_bowtie2_human_out.bowtie2_human_merged_fa]) - - call RunGsnapFilter { - input: - docker_image_id = docker_image_id, - s3_wd_uri = s3_wd_uri, - subsampled_fa = gsnap_filter_input, - dedup_fa = select_all([RunCZIDDedup.dedup1_fa, RunCZIDDedup.dedup2_fa]), - duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv, - duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv, - gsnap_genome = gsnap_genome + String docker_image_id + String s3_wd_uri } + Boolean paired = defined(dedup2_fastq) + command<<< + set -euxo pipefail + TMPDIR="${TMPDIR:-/tmp}" + + # Convert FASTQs to FASTAs: the idseq-dag subsampling tool inputs and outputs FASTAs, and + # downstream pipeline stages consume the FASTAs. + seqtk seq -a '~{dedup1_fastq}' > "$TMPDIR/reads1.fasta" & pid=$! + fastas="\"$TMPDIR/reads1.fasta\"" + if [[ '~{paired}' == 'true' ]]; then + seqtk seq -a '~{dedup2_fastq}' > "$TMPDIR/reads2.fasta" + wait $pid + # also generate merged FASTA. `seqtk mergepe` interleaves the reads but doesn't append /1 /2 to + # the names, so we add an awk kludge to do that. + seqtk mergepe "$TMPDIR/reads1.fasta" "$TMPDIR/reads2.fasta" | awk ' + BEGIN { + name = ""; + } + /^>.*/ { + if ($0 != name) { + name = $0; + printf("%s/1\n", $0); + } else { + printf("%s/2\n", $0); + } + } + ! /^>.*/ { print; } + ' > "$TMPDIR/reads_merged.fasta" + fastas="\"$TMPDIR/reads1.fasta\",\"$TMPDIR/reads2.fasta\",\"$TMPDIR/reads_merged.fasta\"" + else + wait $pid + fi + # subsample FASTAs + idseq-dag-run-step --workflow-name host_filter \ + --step-module idseq_dag.steps.run_subsample \ + --step-class PipelineStepRunSubsample \ + --step-name subsampled_out \ + --input-files '[['"$fastas"'], ["~{duplicate_cluster_sizes_tsv}"]]' \ + --output-files '[~{if paired then '"subsampled_1.fa", "subsampled_2.fa", "subsampled_merged.fa"' else '"subsampled_1.fa"'}]' \ + --output-dir-s3 '~{s3_wd_uri}' \ + --additional-files '{}' \ + --additional-attributes '{"max_fragments": ~{max_subsample_fragments}}' + >>> output { - File validate_input_out_validate_input_summary_json = RunValidateInput.validate_input_summary_json - File? validate_input_out_count = RunValidateInput.output_read_count - File star_out_unmapped1_fastq = RunStar.unmapped1_fastq - File? star_out_unmapped2_fastq = RunStar.unmapped2_fastq - File? star_out_log_file = RunStar.output_log_file - File? star_out_count = RunStar.output_read_count - File? star_version = RunStar.version - File trimmomatic_out_trimmomatic1_fastq = RunTrimmomatic.trimmomatic1_fastq - File? trimmomatic_out_trimmomatic2_fastq = RunTrimmomatic.trimmomatic2_fastq - File? trimmomatic_out_count = RunTrimmomatic.output_read_count - File? trimmomatic_version = RunTrimmomatic.version - File priceseq_out_priceseq1_fa = RunPriceSeq.priceseq1_fa - File? priceseq_out_priceseq2_fa = RunPriceSeq.priceseq2_fa - File? priceseq_out_count = RunPriceSeq.output_read_count - File? priceseq_version = RunPriceSeq.version - File czid_dedup_out_dedup1_fa = RunCZIDDedup.dedup1_fa - File? 
czid_dedup_out_dedup2_fa = RunCZIDDedup.dedup2_fa - File czid_dedup_out_duplicate_clusters_csv = RunCZIDDedup.duplicate_clusters_csv - File czid_dedup_out_duplicate_cluster_sizes_tsv = RunCZIDDedup.duplicate_cluster_sizes_tsv - File? czid_dedup_out_count = RunCZIDDedup.output_read_count - File? czid_dedup_version = RunCZIDDedup.version - File lzw_out_lzw1_fa = RunLZW.lzw1_fa - File? lzw_out_lzw2_fa = RunLZW.lzw2_fa - File? lzw_out_count = RunLZW.output_read_count - File bowtie2_out_bowtie2_1_fa = RunBowtie2_bowtie2_out.bowtie2_1_fa - File? bowtie2_out_bowtie2_2_fa = RunBowtie2_bowtie2_out.bowtie2_2_fa - File? bowtie2_out_bowtie2_merged_fa = RunBowtie2_bowtie2_out.bowtie2_merged_fa - File? bowtie2_out_count = RunBowtie2_bowtie2_out.output_read_count - File? bowtie2_version = RunBowtie2_bowtie2_out.version - File subsampled_out_subsampled_1_fa = RunSubsample.subsampled_1_fa - File? subsampled_out_subsampled_2_fa = RunSubsample.subsampled_2_fa - File? subsampled_out_subsampled_merged_fa = RunSubsample.subsampled_merged_fa - File? subsampled_out_count = RunSubsample.output_read_count - File? star_human_out_unmapped_human_1_fa = RunStarDownstream.unmapped_human_1_fa - File? star_human_out_unmapped_human_2_fa = RunStarDownstream.unmapped_human_2_fa - File? star_human_out_count = RunStarDownstream.output_read_count - File? star_human_version = RunStarDownstream.version - File? bowtie2_human_out_bowtie2_human_1_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_1_fa - File? bowtie2_human_out_bowtie2_human_2_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_2_fa - File? bowtie2_human_out_bowtie2_human_merged_fa = RunBowtie2_bowtie2_human_out.bowtie2_human_merged_fa - File? bowtie2_human_out_count = RunBowtie2_bowtie2_human_out.output_read_count - File? bowtie2_human_version = RunBowtie2_bowtie2_human_out.version - File gsnap_filter_out_gsnap_filter_1_fa = RunGsnapFilter.gsnap_filter_1_fa - File? gsnap_filter_out_gsnap_filter_2_fa = RunGsnapFilter.gsnap_filter_2_fa - File? gsnap_filter_out_gsnap_filter_merged_fa = RunGsnapFilter.gsnap_filter_merged_fa - File? gsnap_filter_out_count = RunGsnapFilter.output_read_count - File? gsnap_filter_version = RunGsnapFilter.version - File? input_read_count = RunValidateInput.input_read_count - File? output_gene_file = RunStar.output_gene_file - File? output_metrics_file = RunStar.output_metrics_file - File? output_histogram_file = RunStar.output_histogram_file + String step_description_md = read_string("subsampled_out.description.md") + File subsampled_1_fa = "subsampled_1.fa" + File? subsampled_2_fa = "subsampled_2.fa" + File? 
subsampled_merged_fa = "subsampled_merged.fa" + File reads_out_count = "subsampled_out.count" + } + runtime { + docker: docker_image_id + cpu: 4 + memory: "8G" } } diff --git a/workflows/short-read-mngs/host_filter_defaults.yml b/workflows/short-read-mngs/host_filter_defaults.yml index a9d7458a2..eb1df7c0a 100644 --- a/workflows/short-read-mngs/host_filter_defaults.yml +++ b/workflows/short-read-mngs/host_filter_defaults.yml @@ -1,10 +1,10 @@ nucleotide_type: DNA host_genome: human -star_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/human_STAR_genome.tar # human host genome -bowtie2_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/human_bowtie2_genome.tar # human host genome -gsnap_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/hg38_pantro5_k16.tar -human_star_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/human_STAR_genome.tar -human_bowtie2_genome: s3://czid-public-references/host_filter/human/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/human_bowtie2_genome.tar +bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar +hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar +kallisto_idx: s3://public-test-bucket-idseq/host_filter/human/2022/kallisto_idx/GRCh38_ERCC.kallisto.idx +human_bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar +human_hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar adapter_fasta: s3://czid-public-references/adapter_sequences/illumina_TruSeq3-PE-2_NexteraPE-PE.fasta max_input_fragments: 75000000 max_subsample_fragments: 1000000 diff --git a/workflows/short-read-mngs/local_driver.wdl b/workflows/short-read-mngs/local_driver.wdl index 4e9062dcc..c84f89e55 100644 --- a/workflows/short-read-mngs/local_driver.wdl +++ b/workflows/short-read-mngs/local_driver.wdl @@ -29,9 +29,9 @@ workflow czid_short_read_mngs { } call stage2.czid_non_host_alignment as non_host_alignment { input: - host_filter_out_gsnap_filter_1_fa = host_filter.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_out_gsnap_filter_2_fa = host_filter.gsnap_filter_out_gsnap_filter_2_fa, - host_filter_out_gsnap_filter_merged_fa = host_filter.gsnap_filter_out_gsnap_filter_merged_fa, + host_filter_out_gsnap_filter_1_fa = host_filter.subsampled_out_subsampled_1_fa, + host_filter_out_gsnap_filter_2_fa = host_filter.subsampled_out_subsampled_2_fa, + host_filter_out_gsnap_filter_merged_fa = host_filter.subsampled_out_subsampled_merged_fa, duplicate_cluster_sizes_tsv = host_filter.czid_dedup_out_duplicate_cluster_sizes_tsv, czid_dedup_out_duplicate_clusters_csv = host_filter.czid_dedup_out_duplicate_clusters_csv, minimap2_local_db_path = minimap2_local_db_path, @@ -43,9 +43,9 @@ workflow czid_short_read_mngs { } call stage3.czid_postprocess as postprocess { input: - host_filter_out_gsnap_filter_1_fa = host_filter.gsnap_filter_out_gsnap_filter_1_fa, - host_filter_out_gsnap_filter_2_fa = host_filter.gsnap_filter_out_gsnap_filter_2_fa, - host_filter_out_gsnap_filter_merged_fa = host_filter.gsnap_filter_out_gsnap_filter_merged_fa, + 
+      host_filter_out_gsnap_filter_1_fa = host_filter.subsampled_out_subsampled_1_fa,
+      host_filter_out_gsnap_filter_2_fa = host_filter.subsampled_out_subsampled_2_fa,
+      host_filter_out_gsnap_filter_merged_fa = host_filter.subsampled_out_subsampled_merged_fa,
       duplicate_cluster_sizes_tsv = host_filter.czid_dedup_out_duplicate_cluster_sizes_tsv,
       czid_dedup_out_duplicate_clusters_csv = host_filter.czid_dedup_out_duplicate_clusters_csv,
       gsnap_out_gsnap_m8 = non_host_alignment.gsnap_out_gsnap_m8,
diff --git a/workflows/short-read-mngs/stage_io_map.json b/workflows/short-read-mngs/stage_io_map.json
index 565be202a..6df58eebb 100644
--- a/workflows/short-read-mngs/stage_io_map.json
+++ b/workflows/short-read-mngs/stage_io_map.json
@@ -1,15 +1,15 @@
 {
   "NonHostAlignment":{
-    "host_filter_out_gsnap_filter_1_fa":"gsnap_filter_out_gsnap_filter_1_fa",
-    "host_filter_out_gsnap_filter_2_fa":"gsnap_filter_out_gsnap_filter_2_fa",
-    "host_filter_out_gsnap_filter_merged_fa":"gsnap_filter_out_gsnap_filter_merged_fa",
+    "host_filter_out_gsnap_filter_1_fa":"subsampled_out_subsampled_1_fa",
+    "host_filter_out_gsnap_filter_2_fa":"subsampled_out_subsampled_2_fa",
+    "host_filter_out_gsnap_filter_merged_fa":"subsampled_out_subsampled_merged_fa",
     "duplicate_cluster_sizes_tsv":"czid_dedup_out_duplicate_cluster_sizes_tsv",
     "czid_dedup_out_duplicate_clusters_csv":"czid_dedup_out_duplicate_clusters_csv"
   },
   "Postprocess":{
-    "host_filter_out_gsnap_filter_1_fa":"gsnap_filter_out_gsnap_filter_1_fa",
-    "host_filter_out_gsnap_filter_2_fa":"gsnap_filter_out_gsnap_filter_2_fa",
-    "host_filter_out_gsnap_filter_merged_fa":"gsnap_filter_out_gsnap_filter_merged_fa",
+    "host_filter_out_gsnap_filter_1_fa":"subsampled_out_subsampled_1_fa",
+    "host_filter_out_gsnap_filter_2_fa":"subsampled_out_subsampled_2_fa",
+    "host_filter_out_gsnap_filter_merged_fa":"subsampled_out_subsampled_merged_fa",
     "gsnap_out_gsnap_m8":"gsnap_out_gsnap_m8",
     "gsnap_out_gsnap_deduped_m8":"gsnap_out_gsnap_deduped_m8",
     "gsnap_out_gsnap_hitsummary_tab":"gsnap_out_gsnap_hitsummary_tab",
diff --git a/workflows/short-read-mngs/test/host_filter/test_RunCZIDDedup.py b/workflows/short-read-mngs/test/host_filter/test_RunCZIDDedup.py
index 7d48420da..1672cab78 100644
--- a/workflows/short-read-mngs/test/host_filter/test_RunCZIDDedup.py
+++ b/workflows/short-read-mngs/test/host_filter/test_RunCZIDDedup.py
@@ -15,7 +15,7 @@ def test_RunCZIDDedup_safe_csv(util, short_read_mngs_bench3_viral_outputs):
     with NamedTemporaryFile(prefix=os.path.dirname(__file__), mode="w") as input_file:
         quote_count = 10
         special_char_rows = 0
-        for line in open(inputs["priceseq_fa"][0]):
+        for line in open(inputs["hisat2_filtered1_fastq"]):
             if line[0] == ">" or line[0] == "@":
                 if special_char_rows < quote_count:
                     input_file.write(f"{line[0]}={line[1:]}")
@@ -28,7 +28,8 @@
         input_file.seek(0)
         assert special_char_rows == quote_count
-        inputs["priceseq_fa"] = [input_file.name]
+        inputs["hisat2_filtered1_fastq"] = input_file.name
+        inputs["hisat2_filtered2_fastq"] = None
         outp = util.miniwdl_run(
             util.repo_dir() / "workflows/short-read-mngs/host_filter.wdl",
diff --git a/workflows/short-read-mngs/test/host_filter/test_RunStar.py b/workflows/short-read-mngs/test/host_filter/test_RunStar.py
deleted file mode 100644
index f668be27f..000000000
--- a/workflows/short-read-mngs/test/host_filter/test_RunStar.py
+++ /dev/null
@@ -1,24 +0,0 @@
-import os
-import json
-
-
-def test_RunStar_outputs_logfile(util, short_read_mngs_bench3_viral_outputs):
-    # load the task's inputs from the end-to-end workflow test
-    inputs, _ = util.miniwdl_inputs_outputs(
-        os.path.join(
-            short_read_mngs_bench3_viral_outputs["dir"], "call-host_filter/call-RunStar"
-        )
-    )
-
-    # run the task with the manipulated inputs, expecting an error exit status
-    outp = util.miniwdl_run(
-        util.repo_dir() / "workflows/short-read-mngs/host_filter.wdl",
-        "--task",
-        "RunStar",
-        "-i",
-        json.dumps(inputs),
-    )
-
-    # verify Log.final.out is emitted
-    logfile = outp["outputs"]["RunStar.output_log_file"]
-    assert os.path.exists(logfile)
diff --git a/workflows/short-read-mngs/test/host_filter/test_RunValidateInput.py b/workflows/short-read-mngs/test/host_filter/test_RunValidateInput.py
index 2ad78a23e..99322fe23 100644
--- a/workflows/short-read-mngs/test/host_filter/test_RunValidateInput.py
+++ b/workflows/short-read-mngs/test/host_filter/test_RunValidateInput.py
@@ -10,9 +10,8 @@ def test_RunValidateInput_invalid(util, short_read_mngs_bench3_viral_outputs):
         )
     )
     # override fastqs to invalid test article
-    inputs["fastqs"] = [
-        os.path.join(os.path.dirname(__file__), "test_RunValidateInput_invalid.fastq")
-    ]
+    inputs["reads1_fastq"] = os.path.join(os.path.dirname(__file__), "test_RunValidateInput_invalid.fastq")
+    del inputs["reads2_fastq"]
     # run the task with the manipulated inputs, expecting an error exit status
     outp = util.miniwdl_run(
diff --git a/workflows/short-read-mngs/test/local_test_viral.yml b/workflows/short-read-mngs/test/local_test_viral.yml
index 6772dfe45..5241faf82 100644
--- a/workflows/short-read-mngs/test/local_test_viral.yml
+++ b/workflows/short-read-mngs/test/local_test_viral.yml
@@ -9,12 +9,12 @@
 host_filter.file_ext: fastq
 host_filter.nucleotide_type: DNA
 host_filter.host_genome: human
-host_filter.star_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar
-host_filter.bowtie2_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/bowtie2_genome.tar
-host_filter.gsnap_genome: s3://czid-public-references/test/gsnap/ERCC_gsnap2017-11-15_k16.tar
-host_filter.human_star_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar
-host_filter.human_bowtie2_genome: s3://czid-public-references/host_filter/ercc/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/bowtie2_genome.tar
 host_filter.adapter_fasta: https://raw.githubusercontent.com/broadinstitute/viral-pipelines/master/test/input/clipDb.fasta
+host_filter.bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar
+host_filter.hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar
+host_filter.kallisto_idx: s3://public-test-bucket-idseq/host_filter/human/2022/kallisto_idx/GRCh38_ERCC.kallisto.idx
+host_filter.human_bowtie2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/bowtie2_index_tar/GRCh38_ERCC.bowtie2.tar
+host_filter.human_hisat2_index_tar: s3://public-test-bucket-idseq/host_filter/human/2022/hisat2_index_tar/GRCh38_ERCC.hisat2.tar
 host_filter.max_input_fragments: 9000
 host_filter.max_subsample_fragments: 9000
 non_host_alignment.accession2taxid_db: s3://czid-public-references/mini-database/alignment_indexes/2020-08-20-viral/viral_accessions2taxid.marisa
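The renamed task inputs exercised by these tests (for example `reads1_fastq`/`reads2_fastq` replacing the old `fastqs` array, and the `bowtie2_index_tar`/`hisat2_index_tar`/`kallisto_idx` keys above) can be sanity-checked without running the pipeline. Here is a minimal sketch, not part of this diff, assuming miniwdl's Python API (`WDL.load`, `required_inputs`) and that it is run from the repository root:

```python
# Sketch: list each task in the revised host_filter.wdl and the inputs it requires,
# e.g. to confirm RunValidateInput now takes reads1_fastq (with reads2_fastq optional)
# instead of a fastqs array. Requires miniwdl (pip3 install miniwdl).
import WDL

doc = WDL.load("workflows/short-read-mngs/host_filter.wdl")
for task in doc.tasks:
    required = sorted(binding.name for binding in task.required_inputs)
    print(f"{task.name}: {', '.join(required)}")
```

The printed names are illustrative only; the authoritative input lists are the WDL task declarations earlier in this diff.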
diff --git a/workflows/short-read-mngs/test/test_short_read_mngs.py b/workflows/short-read-mngs/test/test_short_read_mngs.py
index 6bff6d68c..103356970 100644
--- a/workflows/short-read-mngs/test/test_short_read_mngs.py
+++ b/workflows/short-read-mngs/test/test_short_read_mngs.py
@@ -31,7 +31,7 @@ def test_bench3_viral(short_read_mngs_bench3_viral_outputs):
         taxon_counts = json.load(infile)["pipeline_output"]["taxon_counts_attributes"]
     taxa = set(entry["tax_id"] for entry in taxon_counts)
-    assert len(taxa) == 177
+    assert abs(len(taxa) - 184) < 16
     for filename in outp["outputs"]:
         if filename.endswith(".fasta"):
diff --git a/workflows/short-read-mngs/test/test_wdl.py b/workflows/short-read-mngs/test/test_wdl.py
index a905f3cb5..a2fdcb134 100644
--- a/workflows/short-read-mngs/test/test_wdl.py
+++ b/workflows/short-read-mngs/test/test_wdl.py
@@ -19,7 +19,7 @@ def setUpClass(self):
     def testValidateWindows(self):
         fastqs_0 = os.path.join(os.path.dirname(__file__), "windows1.fastq.gz")
-        args = self.rv_args + [f"fastqs={fastqs_0}"]
+        args = self.rv_args + [f"reads1_fastq={fastqs_0}"]
         res = self.run_miniwdl(args, task="RunValidateInput")
         with open(res["outputs"]["RunValidateInput.valid_input1_fastq"]) as f:
             hash = hashlib.md5(f.read().encode("utf-8")).hexdigest()
@@ -27,7 +27,7 @@ def testInvalidInput(self):
         fastqs_0 = os.path.join(os.path.dirname(__file__), "host_filter", "test_RunValidateInput_invalid_char.fastq")
-        args = self.rv_args + [f"fastqs={fastqs_0}"]
+        args = self.rv_args + [f"reads1_fastq={fastqs_0}"]
         with self.assertRaises(CalledProcessError) as ecm:
             self.run_miniwdl(args, task="RunValidateInput")
@@ -38,94 +38,6 @@
         self.assertEqual(cause, "PARSE ERROR: not an ascii file. Line 4 contains non-ascii characters.")
-class TestSTAR(WDLTestCase):
-    """Tests the RunSTAR function
-    the inputs are minimal, with only 100 reads
-    should only add ~1 min to testing time
-    """
-
-    wdl = os.path.join(os.path.dirname(__file__), "..", "host_filter.wdl")
-    with open(os.path.join(os.path.dirname(__file__), "local_test.yml")) as fh:
-        common_inputs = yaml.safe_load(fh)
-    star_args = None
-
-    @classmethod
-    def setUpClass(self):
-        fastqs_0 = os.path.join(
-            os.path.dirname(__file__),
-            "host_filter",
-            "star_inputs",
-            "valid_input1.fastq",
-        )
-        fastqs_1 = os.path.join(
-            os.path.dirname(__file__),
-            "host_filter",
-            "star_inputs",
-            "valid_input2.fastq",
-        )
-        summary_json = os.path.join(
-            os.path.dirname(__file__),
-            "host_filter",
-            "star_inputs",
-            "validate_input_summary.json",
-        )
-        args = [
-            "s3_wd_uri=''",
-            f"validate_input_summary_json={summary_json}",
-            f"valid_input_fastq={fastqs_0}",
-            f"valid_input_fastq={fastqs_1}",
-            "star_genome=s3://czid-public-references/host_filter/ercc"
-            "/2018-02-15-utc-1518652800-unixtime__2018-02-15-utc-1518652800-unixtime/STAR_genome.tar",
-        ]
-        self.star_args = args
-
-    def test_star(self):
-        """test the basic star parameters"""
-        args = self.star_args + ["nucleotide_type=DNA", "host_genome=human"]
-        res = self.run_miniwdl(args, task="RunStar")
-        with open(res["outputs"]["RunStar.output_read_count"]) as f:
-            count = json.load(f)
-
-        self.assertEqual(count["star_out"], 100)
-        with open(res["outputs"]["RunStar.unmapped1_fastq"]) as f:
-            hash = hashlib.md5(f.read().encode("utf-8")).hexdigest()
-            self.assertEqual(hash, "c4d71e1b9b01734f7c3d300a7eac327a")
-        with open(res["outputs"]["RunStar.unmapped2_fastq"]) as f:
-            hash = hashlib.md5(f.read().encode("utf-8")).hexdigest()
-            self.assertEqual(hash, "6b46fe79bf089c8b3f6377fab34b9744")
"6b46fe79bf089c8b3f6377fab34b9744") - - def test_star_rna(self): - """test the nucleotide_type of RNA works, should run STAR with TranscriptomeSAM""" - args = self.star_args + ["nucleotide_type=RNA", "host_genome=human"] - res = self.run_miniwdl(args, task="RunStar") - with open(res["outputs"]["RunStar.output_read_count"]) as f: - count = json.load(f) - self.assertEqual(count["star_out"], 100) - self.assertIn("TranscriptomeSAM", res["outputs"]["RunStar.step_description_md"]) - - def test_star_nonhuman(self): - """test that there is no output BAM file if the host is non-human""" - args = self.star_args + ["nucleotide_type=DNA", "host_genome=pig"] - res = self.run_miniwdl(args, task="RunStar") - - with open(res["outputs"]["RunStar.output_read_count"]) as f: - count = json.load(f) - self.assertEqual(count["star_out"], 100) - self.assertIsNone(res["outputs"]["RunStar.aligned_file"]) - - def test_starlong(self): - """tests that STARLong runs if # of reads with length > 500 is >1 - the validation input has been modified, but there are no actual long reads - """ - - args = self.star_args + ["nucleotide_type=DNA", "host_genome=human"] - args[1] = args[1].replace(".json", "_long.json") - res = self.run_miniwdl(args, task="RunStar") - with open(res["outputs"]["RunStar.output_read_count"]) as f: - count = json.load(f) - self.assertEqual(count["star_out"], 100) - - class TestAlign(WDLTestCase): wdl = os.path.join(os.path.dirname(__file__), "..", "non_host_alignment.wdl") with open(os.path.join(os.path.dirname(__file__), "local_test.yml")) as fh: