Skip to content

Commit

Permalink
Specify dm parameters as regular dicts
Browse files Browse the repository at this point in the history
Conditionals as dicts, repeats are lists.
  • Loading branch information
mvdbeek committed Feb 2, 2024
1 parent f8bc985 commit 67b6078
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 36 deletions.
9 changes: 8 additions & 1 deletion src/ephemeris/_config_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,10 +42,17 @@ class RepositoryInstallTargets(BaseModel):
tools: List[RepositoryInstallTarget]


class DictOrValue(BaseModel):
    """Custom-root model describing a (possibly nested) parameter value.

    The root is either a single scalar (``str``, ``int``, ``float`` or
    ``bool``) or a mapping from string keys to scalars or further
    ``DictOrValue`` nodes, which lets mappings nest to any depth.
    """

    # NOTE(review): list values are not members of this union -- confirm
    # whether list-shaped parameters are expected to validate here.
    __root__: Union[
        Dict[str, Union[str, int, float, bool, "DictOrValue"]],
        str,
        int,
        float,
        bool,
    ]


# The annotation above refers to "DictOrValue" by name; resolve that forward
# reference now that the class object exists.
DictOrValue.update_forward_refs()


class DataManager(BaseModel, extra=Extra.forbid):
tags: List[str]
tool_id: str
parameters: Optional[List[Dict[str, str]]] = None
parameters: Optional[DictOrValue] = None


class DataManagers(BaseModel, extra=Extra.forbid):
Expand Down
60 changes: 29 additions & 31 deletions src/ephemeris/_idc_split_data_manager_genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
by genomes.yml that have already been executed and appear in the target
installed data table configuration.
"""
import json
import logging
import os
import re
Expand Down Expand Up @@ -38,6 +39,7 @@
from ._config_models import (
DataManager,
DataManagers,
DictOrValue,
read_data_managers,
)
from .common_parser import get_common_args
Expand Down Expand Up @@ -96,8 +98,7 @@ def tool_id_for(indexer: str, data_managers: DataManagers, mode: str) -> str:
class RunDataManager(BaseModel):
id: str
items: Optional[List[Any]] = None
params: Optional[List[Any]] = None
data_table_reload: Optional[List[str]] = None
params: Optional[DictOrValue] = None


class RunDataManagers(BaseModel):
Expand Down Expand Up @@ -172,36 +173,34 @@ def walk_over_incomplete_runs(split_options: SplitOptions):
if do_fetch and not split_options.is_build_complete(build_id, fetch_indexer):
log.info(f"Fetching: {build_id}")
fetch_tool_id = tool_id_for(fetch_indexer, data_managers, split_options.tool_id_mode)
fetch_params = []
fetch_params.append({"dbkey_source|dbkey_source_selector": "new"})
fetch_params.append({"dbkey_source|dbkey": genome["id"]})
description = genome.get("description")
fetch_params = {
"dbkey_source": {"dbkey_source_selector": "new", "dbkey": genome["id"]},
"sequence_id": genome["id"],
"sequence_name": description,
}
source = genome.get("source")
if source == "ucsc":
if not description:
description = ucsc_description_for_build(genome["id"])
fetch_params.append({"reference_source|reference_source_selector": "ucsc"})
fetch_params.append({"reference_source|requested_dbkey": genome["id"]})
fetch_params.append({"sequence_name": description})
fetch_params["sequence_name"] = ucsc_description_for_build(genome["id"])
fetch_params["reference_source"] = {
"reference_source_selector": "ucsc",
"requested_dbkey": genome["id"],
}
elif re.match("^[A-Z_]+[0-9.]+", source):
fetch_params.append({"reference_source|reference_source_selector": "ncbi"})
fetch_params.append({"reference_source|requested_identifier": source})
fetch_params.append({"sequence_name": genome["description"]})
fetch_params.append({"sequence.id": genome["id"]})
fetch_params["reference_source"] = {
"reference_source_selector": "ncbi",
"requested_identifier": source,
}
elif re.match("^http", source):
fetch_params.append({"reference_source|reference_source_selector": "url"})
fetch_params.append({"reference_source|user_url": source})
fetch_params.append({"sequence_name": genome["description"]})
fetch_params.append({"sequence.id": genome["id"]})
fetch_params["reference_source"] = {"reference_source_selector": "url", "user_url": source}

if description:
fetch_params.append({"dbkey_source|dbkey_name": description})
fetch_params["dbkey_source"]["dbkey_name"] = description

fetch_run_data_manager = RunDataManager(
id=fetch_tool_id,
params=fetch_params,
# Not needed according to Marius
# data_table_reload=["all_fasta", "__dbkeys__"],
)
yield (build_id, fetch_indexer, fetch_run_data_manager)
else:
Expand All @@ -223,18 +222,17 @@ def walk_over_incomplete_runs(split_options: SplitOptions):

tool_id = tool_id_for(indexer, data_managers, split_options.tool_id_mode)
data_manager = data_managers.__root__[indexer]
params = data_manager.parameters
params = {}
if data_manager.parameters:
params = json.loads(data_manager.parameters.json()) or {}
genome_params = genome.pop("parameters", None) or {}
params.update(genome_params)
if params is None:
params = [
{"all_fasta_source": "{{ item.id }}"},
{"sequence_name": "{{ item.name }}"},
{"sequence_id": "{{ item.id }}"},
]
# why is this not pulled from the data managers conf? -nate
if re.search("bwa", tool_id):
params.append({"index_algorithm": "bwtsw"})
if re.search("color_space", tool_id):
continue
params = {
"all_fasta_source": "{{ item.id }}",
"sequence_name": "{{ item.name }}",
"sequence_id": "{{ item.id }}",
}

item = deepcopy(genome)
item.pop("indexers", None)
Expand Down
45 changes: 41 additions & 4 deletions tests/test_split_genomes.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,13 +49,36 @@
- genome
"""

# Data manager configuration whose ``parameters`` entry is a nested mapping:
# a ``conditional`` section holding ``param_a`` and ``param_b``.
# The tool_id is a plain (unquoted) YAML scalar, so it must not carry a
# trailing quote character.
DATA_MANAGER_YAML_WITH_PARAMS = """
the_data_manager:
  tool_id: toolshed.g2.bx.psu.edu/repos/iuc/the_data_manager/the_data_manager/0.0.1
  parameters:
    conditional:
      param_a: a
      param_b: b
  tags:
  - dm_tag
"""

# Genomes file with a single entry ("cat") that requests the
# ``the_data_manager`` indexer and carries its own ``parameters`` mapping
# (a ``conditional`` section holding ``param_c``).
GENOMES_WITH_PARAMS = """
genomes:
  - dbkey: cat
    description: fluffy
    id: cat
    indexers:
      - the_data_manager
    parameters:
      conditional:
        param_c: c
"""


def setup_mock_idc_dir(directory: Path):
def setup_mock_idc_dir(directory: Path, genomes=MERGED_YAML_STR, data_managers=DATA_MANAGER_YAML_STR):
merged = directory / "genomes.yml"
merged.write_text(MERGED_YAML_STR)
merged.write_text(genomes)

data_managers = directory / "data_managers.yml"
data_managers.write_text(DATA_MANAGER_YAML_STR)
data_managers_path = directory / "data_managers.yml"
data_managers_path.write_text(data_managers)


def read_and_validate_run_data_manager_yaml(path):
Expand Down Expand Up @@ -98,6 +121,20 @@ def test_split_genomes(tmp_path: Path):
assert data_manager.items[0]["dbkey"] == "hg19_rCRS_pUC18_phiX174"


def test_split_genomes_with_params(tmp_path):
    """Split a genome that defines its own ``parameters`` and check the emitted run file."""
    setup_mock_idc_dir(tmp_path, GENOMES_WITH_PARAMS, DATA_MANAGER_YAML_WITH_PARAMS)
    split_genomes(split_options_for(tmp_path))

    # Run file for genome "cat" / indexer "the_data_manager" under split/.
    run_yaml = tmp_path / "split" / "cat" / "the_data_manager" / "run_data_managers.yaml"
    run = read_and_validate_run_data_manager_yaml(run_yaml)

    data_managers = run.data_managers
    assert len(data_managers) == 1
    # genome config overwrites data manager config
    assert data_managers[0].params.json() == '{"conditional": {"param_c": "c"}}'


def test_split_genomes_short_ids(tmp_path: Path):
setup_mock_idc_dir(tmp_path)
split_path = tmp_path / "split"
Expand Down

0 comments on commit 67b6078

Please sign in to comment.