bugfix/minor_updates #115

Merged 35 commits on Dec 9, 2024
Commits
32fa376
change model config object to path
kels271828 Nov 22, 2024
a1f53c0
add pandas.DataFrame support for dump
kels271828 Nov 25, 2024
3b6fc97
comment out some things that need to be fixed
kels271828 Nov 25, 2024
59c06ec
add support for directory input vs. path
kels271828 Nov 25, 2024
51f814c
rename stage vars for more clarity, remove option to run collect via …
kels271828 Nov 25, 2024
cdc1d69
change stages list to set, update docstring
kels271828 Nov 25, 2024
5db45ec
remove stage_name arg, add stages arg, remove error message
kels271828 Nov 25, 2024
a10e9da
update docstrings, add support for input directories
kels271828 Nov 26, 2024
fd781ac
add error message about collect method
kels271828 Dec 2, 2024
50db1ce
update rover and spxmod stages for polars
kels271828 Dec 2, 2024
6def890
remove config from other pars, remove unused import
kels271828 Dec 2, 2024
d51d0c3
pass stages to model.evaluate
kels271828 Dec 2, 2024
8972c56
add description of attributes that are created automatically or must …
kels271828 Dec 3, 2024
ff7edad
add name= to pipeline and stage repr func
kels271828 Dec 5, 2024
a0cfbb4
add name= to Stage.repr
kels271828 Dec 5, 2024
c979ccd
add docstring for Input.check_missing
kels271828 Dec 5, 2024
a87bacf
add check_exists function to Input class
kels271828 Dec 5, 2024
f037657
replace check_upstream_output_exists with stage.input.check_exists
kels271828 Dec 5, 2024
3c8e713
Add stages arg to pipeline.get_execution_order
kels271828 Dec 5, 2024
a9f0e38
add check that input exists to stage evaluate methods
kels271828 Dec 5, 2024
542f91b
add mkdir arg to DataIO
kels271828 Dec 5, 2024
282a606
add absolute paths to input/output in set_dataif
kels271828 Dec 5, 2024
df21b01
sort data subsets
kels271828 Dec 5, 2024
321df47
bugfix: fix typo and add logic when name is not in the upstream stages
zhengp0 Dec 5, 2024
ee6c26f
sort import
zhengp0 Dec 5, 2024
48fbc16
bugfix: iherit global config when load_stage
zhengp0 Dec 5, 2024
e4250f1
revert+bugfix: revert back to inherit config in the function, and gi…
zhengp0 Dec 6, 2024
fcd40a5
formatting: ruff + slurm cluster is messed up
zhengp0 Dec 6, 2024
ba04096
reorganize imports
kels271828 Dec 6, 2024
476391c
remove preprocessing stage
kels271828 Dec 6, 2024
8981a31
add arguments to get_stage_subset
kels271828 Dec 6, 2024
0e9cc37
remove PreprocessingConfig from test helpers
kels271828 Dec 6, 2024
ca29d90
fix expected error message
kels271828 Dec 6, 2024
dad8af2
fixing some linting and mypy errors
kels271828 Dec 6, 2024
a81b0c5
remove more preprocessing, order imports, add todo
kels271828 Dec 6, 2024
59 changes: 9 additions & 50 deletions examples/example_pipeline.json
@@ -21,28 +21,6 @@
"sex_id"
],
"stages": {
"preprocessing": {
"name": "preprocessing",
"config": {
"id_columns": [
"year_id",
"sex_id",
"age_group_id",
"location_id"
],
"model_type": "binomial",
"observation_column": "obs",
"prediction_column": "pred",
"weight_column": "weights",
"test_column": "test",
"holdout_columns": [],
"coef_bounds": {}
},
"input": {
"data": "/path/to/data.parquet"
},
"type": "PreprocessingStage"
},
"covariate_selection": {
"name": "covariate_selection",
"config": {
@@ -80,10 +58,7 @@
],
"crossby": [],
"input": {
"data": {
"stage": "preprocessing",
"path": "/path/to/experiment/directory/preprocessing/data.parquet"
}
"data": "/path/to/data.parquet"
},
"type": "RoverStage"
},
@@ -179,10 +154,7 @@
],
"crossby": [],
"input": {
"data": {
"stage": "preprocessing",
"path": "/path/to/experiment/directory/preprocessing/data.parquet"
},
"data": "/path/to/data.parquet",
"offset": {
"stage": "global_model",
"path": "/path/to/experiment/directory/global_model/predictions.parquet"
@@ -233,10 +205,7 @@
],
"crossby": [],
"input": {
"data": {
"stage": "preprocessing",
"path": "/path/to/experiment/directory/preprocessing/data.parquet"
},
"data": "/path/to/data.parquet",
"offset": {
"stage": "location_model",
"path": "/path/to/experiment/directory/location_model/predictions.parquet"
@@ -274,10 +243,7 @@
],
"module": "/path/to/custom_stage.py",
"input": {
"observations": {
"stage": "preprocessing",
"path": "/path/to/experiment/directory/preprocessing/data.parquet"
},
"observations": "/path/to/data.parquet",
"predictions": {
"stage": "smoothing",
"path": "/path/to/experiment/directory/smoothing/predictions.parquet"
@@ -287,25 +253,18 @@
}
},
"dependencies": {
"preprocessing": [],
"covariate_selection": [
"preprocessing"
],
"covariate_selection": [],
"global_model": [
"covariate_selection",
"preprocessing"
"covariate_selection"
],
"location_model": [
"global_model",
"preprocessing"
"global_model"
],
"smoothing": [
"location_model",
"preprocessing"
"location_model"
],
"custom_stage": [
"smoothing",
"preprocessing"
"smoothing"
]
}
}
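
The net effect of this file's diff is that stage inputs may now reference raw data with a plain path string instead of routing everything through a preprocessing stage; only outputs of upstream stages keep the nested form, and preprocessing disappears from every stage's dependency list. A short sketch of the two input shapes, written as Python dicts that mirror the JSON above (values copied from the example; this is illustrative, not the full onemod input schema):

# Sketch of the two input shapes in the updated example_pipeline.json.
# Values are copied from the example above; not the full input schema.

# Raw data is now a plain path string:
rover_input = {"data": "/path/to/data.parquet"}

# Upstream stage outputs keep the {"stage": ..., "path": ...} form:
spxmod_input = {
    "data": "/path/to/data.parquet",
    "offset": {
        "stage": "global_model",
        "path": "/path/to/experiment/directory/global_model/predictions.parquet",
    },
}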
27 changes: 7 additions & 20 deletions examples/pipeline_example.py
@@ -4,14 +4,13 @@
from custom_stage import CustomStage

from onemod import Pipeline
from onemod.stage import KregStage, PreprocessingStage, RoverStage, SpxmodStage
from onemod.stage import KregStage, RoverStage, SpxmodStage


def create_pipeline(directory: str, data: str):
# Create stages
# Stage-specific validation specifications go here.
# Stage classes may also implement default validation specifications.
preprocessing = PreprocessingStage(name="preprocessing", config={})
covariate_selection = RoverStage(
name="covariate_selection",
config={"cov_exploring": ["cov1", "cov2", "cov3"]},
@@ -68,7 +67,6 @@ def create_pipeline(directory: str, data: str):
# Add stages
example_pipeline.add_stages(
[
preprocessing,
covariate_selection,
global_model,
location_model,
@@ -78,24 +76,13 @@
)

# Define dependencies
preprocessing(data=example_pipeline.data)
covariate_selection(data=preprocessing.output["data"])
covariate_selection(data=data)
global_model(
data=preprocessing.output["data"],
selected_covs=covariate_selection.output["selected_covs"],
)
location_model(
data=preprocessing.output["data"],
offset=global_model.output["predictions"],
)
smoothing(
data=preprocessing.output["data"],
offset=location_model.output["predictions"],
)
custom_stage(
observations=preprocessing.output["data"],
predictions=smoothing.output["predictions"],
data=data, selected_covs=covariate_selection.output["selected_covs"]
)
location_model(data=data, offset=global_model.output["predictions"])
smoothing(data=data, offset=location_model.output["predictions"])
custom_stage(observations=data, predictions=smoothing.output["predictions"])

# Serialize pipeline
example_pipeline.to_json()
@@ -111,7 +98,7 @@ def create_pipeline(directory: str, data: str):

# Fit specific stages
example_pipeline.evaluate(
method="fit", stages=["preprocessing", "covariate_selection"]
method="fit", stages=["covariate_selection", "global_model"]
)

# Predict for specific locations
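With the preprocessing stage removed, the example wires the raw data path straight into each stage and the script reads much shorter. A hedged usage sketch of the updated entry point; create_pipeline is defined in examples/pipeline_example.py above, and both paths are placeholders:

# Hedged usage sketch; create_pipeline comes from the example above and the
# two paths are placeholders, not real locations.
from pipeline_example import create_pipeline

if __name__ == "__main__":
    create_pipeline(
        directory="/path/to/experiment/directory",
        data="/path/to/data.parquet",
    )
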
67 changes: 38 additions & 29 deletions src/onemod/backend/jobmon_backend.py
@@ -31,7 +31,7 @@

import sys
from pathlib import Path
from typing import Any, Literal, cast
from typing import Literal

import yaml
from jobmon.client.api import Tool
@@ -96,7 +96,7 @@ def get_tasks(
name=f"{stage.name}_{method}",
upstream_tasks=upstream_tasks,
max_attempts=1,
**cast(dict[str, Any], task_args),
**task_args,
)
]

@@ -141,8 +141,8 @@ def get_command_template(
"{python}"
f" {Path(__file__).parents[1] / 'main.py'}"
" --config {config}"
f" --stage_name {stage_name}"
f" --method {method}"
f" --stages {stage_name}"
)

for node_arg in node_args:
@@ -167,21 +167,38 @@
def get_upstream_tasks(
stage: Stage,
method: Literal["run", "fit", "predict"],
stages: dict[str, Stage],
stage_dict: dict[str, Stage],
task_dict: dict[str, list[Task]],
specified_stages: set[str] | None = None,
stages: set[str] | None = None,
) -> list[Task]:
"""Get upstream stage tasks."""
"""Get upstream stage tasks.

Parameters
----------
stage : Stage
Current stage.
method : str
Name of method to evaluate.
stage_dict : dict[str, Stage]
Dictionary of all upstream pipeline stages.
task_dict : dict[str, list[Task]]
Dictionary of all tasks being evaluated.
stages : set[str] or None, optional
Name of all pipeline stages being evaluated.

Returns
-------
list of Task
Upstream tasks for current stage.

"""
upstream_tasks = []

for upstream_name in stage.dependencies:
if (
specified_stages is not None
and upstream_name not in specified_stages
):
if stages is not None and upstream_name not in stages:
continue

upstream = stages[upstream_name]
upstream = stage_dict[upstream_name]
if method not in upstream.skip:
if (
isinstance(upstream, ModelStage)
@@ -200,8 +217,8 @@ def evaluate_with_jobmon(
cluster: str,
resources: Path | str,
python: Path | str | None = None,
method: Literal["run", "fit", "predict", "collect"] = "run",
stages: list[str] | None = None,
method: Literal["run", "fit", "predict"] = "run",
stages: set[str] | None = None,
) -> None:
"""Evaluate pipeline or stage method with Jobmon.

@@ -219,7 +236,8 @@
method : str, optional
Name of method to evalaute. Default is 'run'.
stages : set of str or None, optional
Set of stage names to evaluate. Default is None.
Names of stages to evaluate if `model` is a pipeline instance.
If None, evaluate entire pipeline. Default is None.

TODO: Optional stage-specific Python environments
TODO: User-defined max_attempts
@@ -231,34 +249,25 @@

# Set config
if isinstance(model, Stage):
model_config = model.dataif.load(key="config")
config_path = model.dataif.get_path("config")
elif isinstance(model, Pipeline):
model_config = model.config
config_path = model.directory / f"{model.name}.json"

task_args: dict[str, str] = {
"python": str(python or sys.executable),
"config": str(model_config),
"config": str(config_path),
}

# Create tasks
if isinstance(model, Pipeline):
tasks = []
task_dict: dict[str, list[Task]] = {}

if stages is None:
stages = model.get_execution_order()

for stage_name in stages:
for stage_name in model.get_execution_order(stages):
stage = model.stages[stage_name]
if (
method not in stage.skip and method != "collect"
): # TODO: handle collect
if method not in stage.skip:
upstream_tasks = get_upstream_tasks(
stage,
method,
model.stages,
task_dict,
specified_stages=set(stages),
stage, method, model.stages, task_dict, stages
)
task_dict[stage_name] = get_tasks(
tool, resources, stage, method, task_args, upstream_tasks
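The filtering rule in get_upstream_tasks is the heart of the new stages argument: a dependency contributes upstream tasks only when it is itself part of the run. A minimal sketch of that rule, using plain strings as stand-ins for onemod Stage and Jobmon Task objects:

# Minimal sketch of the upstream-task filtering rule in get_upstream_tasks,
# with plain strings standing in for Stage and Task objects.
def upstream_task_names(
    dependencies: list[str],
    task_dict: dict[str, list[str]],
    stages: set[str] | None = None,
) -> list[str]:
    upstream_tasks: list[str] = []
    for upstream_name in dependencies:
        # Skip dependencies that are not being evaluated in this run.
        if stages is not None and upstream_name not in stages:
            continue
        upstream_tasks.extend(task_dict.get(upstream_name, []))
    return upstream_tasks

# global_model depends on covariate_selection, which is part of the run,
# so its fit task becomes an upstream task.
print(upstream_task_names(
    ["covariate_selection"],
    {"covariate_selection": ["covariate_selection_fit"]},
    stages={"covariate_selection", "global_model"},
))  # ['covariate_selection_fit']
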
12 changes: 5 additions & 7 deletions src/onemod/backend/local_backend.py
@@ -13,7 +13,7 @@
def evaluate_local(
model: Pipeline | Stage,
method: Literal["run", "fit", "predict"] = "run",
stages: list[str] | None = None,
stages: set[str] | None = None,
**kwargs,
) -> None:
"""Evaluate pipeline or stage method locally.
@@ -24,22 +24,20 @@ def evaluate_local(
Pipeline or stage instance.
method : str, optional
Name of method to evaluate. Default is 'run'.
stages : set of str or None, optional
Names of stages to evaluate if `model` is a pipeline instance.
If None, evaluate entire pipeline. Default is None.

Other Parameters
----------------
subset_id : int, optional
Submodel data subset ID. Only used for model stages.
param_id : int, optional
Submodel parameter set ID. Only used for model stages.
stages : list of str or None, optional
List of stage names to evaluate. Default is None.

"""
if isinstance(model, Pipeline):
if stages is None:
stages = model.get_execution_order()

for stage_name in stages:
for stage_name in model.get_execution_order(stages):
stage = model.stages[stage_name]
if method not in stage.skip:
_evaluate_stage(stage, method)
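The local backend exposes the same interface: the pipeline resolves its own execution order from the requested stages, and stages whose skip set contains the method are passed over. A minimal sketch of that loop, with a plain dict of skip sets standing in for onemod Stage objects (the skip sets in the example call are hypothetical):

# Minimal sketch of the pipeline loop in evaluate_local; the skip sets are
# hypothetical stand-ins for the Stage.skip attribute.
def run_stages(
    order: list[str],
    skip_sets: dict[str, set[str]],
    method: str = "run",
) -> list[str]:
    evaluated: list[str] = []
    for stage_name in order:
        if method in skip_sets.get(stage_name, set()):
            continue  # stage skips this method
        evaluated.append(stage_name)
    return evaluated

# If covariate_selection skipped "predict", only the model stages would run it.
print(run_stages(
    ["covariate_selection", "global_model", "location_model"],
    {"covariate_selection": {"predict"}},
    method="predict",
))  # ['global_model', 'location_model']
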
2 changes: 0 additions & 2 deletions src/onemod/config/__init__.py
@@ -1,13 +1,11 @@
from onemod.config.base import Config, ModelConfig, PipelineConfig, StageConfig
from onemod.config.data_config import PreprocessingConfig
from onemod.config.model_config import KregConfig, RoverConfig, SpxmodConfig

__all__ = [
"Config",
"PipelineConfig",
"StageConfig",
"ModelConfig",
"PreprocessingConfig",
"RoverConfig",
"SpxmodConfig",
"KregConfig",
3 changes: 0 additions & 3 deletions src/onemod/config/data_config/__init__.py
@@ -1,3 +0,0 @@
from onemod.config.data_config.preprocessing_config import PreprocessingConfig

__all__ = ["PreprocessingConfig"]
9 changes: 0 additions & 9 deletions src/onemod/config/data_config/preprocessing_config.py

This file was deleted.
