chanzuckerberg · jgadling · Aug 16, 2024 · Aug 16, 2024 · Aug 16, 2024 · Aug 16, 2024
diff --git a/src/backend/Dockerfile.lineage_qc b/src/backend/Dockerfile.lineage_qc
@@ -19,7 +19,7 @@ RUN update-ca-certificates
 
 # install nextclade, check it installed correctly
 RUN apt-get --yes install curl
-RUN cd /usr/local/bin && curl -fsSL "https://github.com/nextstrain/nextclade/releases/download/2.14.0/nextclade-x86_64-unknown-linux-gnu" -o "nextclade" && chmod +x nextclade
+RUN cd /usr/local/bin && curl -fsSL "https://github.com/nextstrain/nextclade/releases/download/3.8.2/nextclade-x86_64-unknown-linux-gnu" -o "nextclade" && chmod +x nextclade
 RUN nextclade --version
 
 # Poetry: install app

diff --git a/src/backend/aspen/workflows/nextclade/prep_samples.py b/src/backend/aspen/workflows/nextclade/prep_samples.py
@@ -15,6 +15,7 @@
 if you're tweaking it to support multiple tools. Especially look closely at
 the function `get_sample_ids_to_refresh` in here.
 """
+
 import io
 import json
 import subprocess
@@ -127,6 +128,13 @@ def cli(
         # generalized case and we'll need to figure out how to handle that,
         # but right now the workflow is hardcoded to always expecting dataset.
         nextclade_dataset_name = target_pathogen.nextclade_dataset_name
+        # Nextclade 3.8.2 uses new dataset names vs the 2.x names stored in the db.
+        new_nextclade_dataset_names = {
+            "SARS-CoV": "nextstrain/sars-cov-2/wuhan-hu-1/orfs",
+            "hMPXV": "nextstrain/mpox/all-clades",
+        }
+        if nextclade_dataset_name in new_nextclade_dataset_names:
+            nextclade_dataset_name = new_nextclade_dataset_names[nextclade_dataset_name]
         if not nextclade_dataset_name:
             print("No nextclade_dataset_name for this pathogen in the DB.")
             if run_type == RunType.REFRESH_STALE:

diff --git a/src/backend/aspen/workflows/nextclade/run_nextclade.sh b/src/backend/aspen/workflows/nextclade/run_nextclade.sh
@@ -20,7 +20,7 @@ shopt -s inherit_errexit  # no silent breaking
 # This is where we will store Nextclade's dataset for the target pathogen.
 NEXTCLADE_DATASET_DIR=nextclade_dataset_bundle
 # Inside the dataset, Nextclade uses this file to tag the dataset.
-NEXTCLADE_TAG_FILENAME=tag.json
+NEXTCLADE_TAG_FILENAME=pathogen.json
 
 # Certain bits of info need to be passed around during the workflow.
 # Using JSON file as an easy way to pass them around to various processes.