Skip to content

Commit

Permalink
feat(ci, ingest, prepro): Allow custom nextclade datasets (#3075)
Browse files Browse the repository at this point in the history
* Add option to pass custom nextclade datasets

* Clean up configs
  • Loading branch information
anna-parker authored Nov 6, 2024
1 parent 17cfd80 commit 47a27dd
Show file tree
Hide file tree
Showing 7 changed files with 37 additions and 16 deletions.
20 changes: 16 additions & 4 deletions ingest/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,20 @@ FILTER_FASTA_HEADERS = config.get("filter_fasta_headers", None)
APPROVE_TIMEOUT_MIN = config.get("approve_timeout_min") # time in minutes
CHECK_ENA_DEPOSITION = config.get("check_ena_deposition", False)

# Per-segment nextclade dataset lookup tables, keyed by segment name.
# They are only populated for segmented organisms; the `align` rule reads
# them through `dataset_server_map[w.segment]` / `dataset_name_map[w.segment]`.
dataset_server_map = {}
dataset_name_map = {}

if SEGMENTED:
    # Optional per-segment overrides. `or {}` also covers a key that is
    # present in the config but empty/null.
    server_overrides = config.get("nextclade_dataset_server_map") or {}
    name_overrides = config.get("nextclade_dataset_name_map") or {}

    default_server = config.get("nextclade_dataset_server")
    # Fall back to "" when no base dataset name is configured, so that the
    # string concatenation below cannot raise a TypeError on None while the
    # workflow is being parsed.
    default_name_prefix = config.get("nextclade_dataset_name") or ""

    for segment in config["nucleotide_sequences"]:
        # An explicit override for the segment wins; otherwise use the
        # shared server and the "<base name>/<segment>" naming scheme.
        dataset_server_map[segment] = server_overrides.get(segment, default_server)
        dataset_name_map[segment] = name_overrides.get(
            segment, default_name_prefix + "/" + segment
        )

if os.uname().sysname == "Darwin":
# Don't use conda-forge unzip on macOS
# Due to https://github.com/conda-forge/unzip-feedstock/issues/16
Expand Down Expand Up @@ -197,10 +211,8 @@ rule align:
output:
results="results/nextclade_{segment}.tsv",
params:
dataset_server=config.get("nextclade_dataset_server"),
dataset_name=lambda w: config.get("nextclade_dataset_name", "")
+ "/"
+ w.segment,
dataset_server=lambda w: dataset_server_map[w.segment],
dataset_name=lambda w: dataset_name_map[w.segment],
shell:
"""
nextclade run \
Expand Down
2 changes: 2 additions & 0 deletions kubernetes/loculus/templates/ingest-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
{{- range $key, $values := (.Values.organisms | default .Values.defaultOrganisms) }}
{{- if $values.ingest }}
{{- $metadata := (include "loculus.patchMetadataSchema" $values.schema | fromYaml).metadata }}
{{- $nucleotideSequencesList := (include "loculus.patchMetadataSchema" $values.schema | fromYaml).nucleotideSequences | default (list "main")}}
---
apiVersion: v1
kind: ConfigMap
Expand All @@ -13,6 +14,7 @@ metadata:
data:
config.yaml: |
{{- $values.ingest.configFile | toYaml | nindent 4 }}
nucleotide_sequences: {{- $nucleotideSequencesList | toYaml | nindent 4 }}
verify_loculus_version_is: {{$dockerTag}}
check_ena_deposition: {{ not $.Values.disableEnaSubmission }}
organism: {{ $key }}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
{{- range $organism, $organismConfig := (.Values.organisms | default .Values.defaultOrganisms) }}
{{- $metadata := ($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).metadata }}
{{- $nucleotideSequences := (($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).nucleotideSequences | default "" ) }}
{{- $nucleotideSequencesList := ($organismConfig.schema | include "loculus.patchMetadataSchema" | fromYaml).nucleotideSequences | default (list "main")}}
{{- range $processingIndex, $processingConfig := $organismConfig.preprocessing }}
{{- if $processingConfig.configFile }}
---
Expand All @@ -12,6 +13,7 @@ data:
preprocessing-config.yaml: |
organism: {{ $organism }}
{{- $processingConfig.configFile | toYaml | nindent 4 }}
nucleotideSequences: {{- $nucleotideSequencesList | toYaml | nindent 4 }}
processing_spec:
{{- $args := dict "metadata" $metadata "nucleotideSequences" $nucleotideSequences }}
{{- include "loculus.preprocessingSpecs" $args | nindent 6 }}
Expand Down
5 changes: 0 additions & 5 deletions kubernetes/loculus/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1410,17 +1410,12 @@ defaultOrganisms:
log_level: DEBUG
nextclade_dataset_name: nextstrain/cchfv/linked
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
nucleotideSequences: [L, M, S]
genes: [RdRp, GPC, NP]
ingest:
<<: *ingest
configFile:
<<: *ingestConfigFile
taxon_id: 3052518
nucleotide_sequences:
- L
- M
- S
nextclade_dataset_server: https://raw.githubusercontent.com/nextstrain/nextclade_data/cornelius-cchfv/data_output
nextclade_dataset_name: nextstrain/cchfv/linked
enaDeposition:
Expand Down
2 changes: 1 addition & 1 deletion preprocessing/nextclade/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ When deployed on kubernetes the preprocessing pipeline reads in config files whi
and use this in the pipeline as follows:

```sh
prepro --config-file=../../temp/preprocessing-config.{organism}.yaml --keep-tmp-dir
prepro --config-file=../../website/tests/config/preprocessing-config.{organism}.yaml --keep-tmp-dir
```

Additionally, the `--keep-tmp-dir` flag is useful for debugging issues. The results of the nextclade run will be stored in the temp directory, along with a file called `submission_requests.json`, which contains a log of the full submit requests that are sent to the backend.
Expand Down
2 changes: 2 additions & 0 deletions preprocessing/nextclade/src/loculus_preprocessing/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,10 @@ class Config:
keycloak_password: str = "preprocessing_pipeline"
keycloak_token_path: str = "realms/loculus/protocol/openid-connect/token"
nextclade_dataset_name: str | None = None
nextclade_dataset_name_map: dict[str, str] | None = None
nextclade_dataset_tag: str | None = None
nextclade_dataset_server: str = "https://data.clades.nextstrain.org/v3"
nextclade_dataset_server_map: dict[str, str] | None = None
config_file: str | None = None
log_level: str = "DEBUG"
genes: list[str] = dataclasses.field(default_factory=list)
Expand Down
20 changes: 14 additions & 6 deletions preprocessing/nextclade/src/loculus_preprocessing/prepro.py
Original file line number Diff line number Diff line change
Expand Up @@ -710,18 +710,26 @@ def process_all(

def download_nextclade_dataset(dataset_dir: str, config: Config) -> None:
for segment in config.nucleotideSequences:
nextclade_dataset_name = (
config.nextclade_dataset_name
if segment == "main"
else config.nextclade_dataset_name + "/" + segment
)
if config.nextclade_dataset_name_map and segment in config.nextclade_dataset_name_map:
nextclade_dataset_name = config.nextclade_dataset_name_map[segment]
else:
nextclade_dataset_name = (
config.nextclade_dataset_name
if segment == "main"
else config.nextclade_dataset_name + "/" + segment
)

nextclade_dataset_server = config.nextclade_dataset_server
if config.nextclade_dataset_server_map and segment in config.nextclade_dataset_server_map:
nextclade_dataset_server = config.nextclade_dataset_server_map[segment]

dataset_dir_seg = dataset_dir if segment == "main" else dataset_dir + "/" + segment
dataset_download_command = [
"nextclade3",
"dataset",
"get",
f"--name={nextclade_dataset_name}",
f"--server={config.nextclade_dataset_server}",
f"--server={nextclade_dataset_server}",
f"--output-dir={dataset_dir_seg}",
]

Expand Down

0 comments on commit 47a27dd

Please sign in to comment.