Skip to content

Commit

Permalink
Merge pull request #15 from ihmeuw-msca/cat_dev_wip
Browse files Browse the repository at this point in the history
Settings + signal model for categorical pipeline
  • Loading branch information
n-gilbertson authored Sep 4, 2024
2 parents 3801e1d + 3358f39 commit d0df991
Show file tree
Hide file tree
Showing 10 changed files with 1,042 additions and 2 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,7 @@ result/
# vscode
.vscode/

# MacOS DS Store files
.DS_Store

results/
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.2
hooks:
- id: ruff
args: [ --fix ]
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.10.0
hooks:
- id: mypy
files: ^src
20 changes: 20 additions & 0 deletions data/categorical/settings.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
default:
seed: 0
fit_signal_model:
cat_cov_model:
ref_cat:
prior_order: []
signal_model:
inlier_pct: 0.9
select_bias_covs:
cov_finder:
pre_selected_covs: []
complete_summary:
draws:
num_draws: 1000
quantiles: [0.025, 0.05, 0.5, 0.95, 0.975]
score:
normalize_to_tmrel: false
figure:
show_ref: true
cat_order: []
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ license = { file = "LICENSE" }
authors = [
{ name = "IHME Math Sciences", email = "[email protected]" },
]
dependencies = ["numpy", "scipy", "pandas", "matplotlib", "mrtool==0.1.4", "pplkit"]
dependencies = ["numpy", "scipy", "pandas", "matplotlib", "pplkit"]

[project.optional-dependencies]
test = ["pytest"]
Expand All @@ -24,6 +24,7 @@ github = "https://github.com/ihmeuw-msca/bopforge"
[project.scripts]
continuous_pipeline = "bopforge.continuous_pipeline.__main__:main"
dichotomous_pipeline = "bopforge.dichotomous_pipeline.__main__:main"
categorical_pipeline = "bopforge.categorical_pipeline.__main__:main"

[tool.sphinx]
project = "modrover"
Expand Down
8 changes: 8 additions & 0 deletions ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
line-length = 80
src = ["src"]

[format]
docstring-code-format = true

[lint.pydocstyle]
convention = "numpy"
20 changes: 20 additions & 0 deletions sphinx/api_reference/categorical_pipeline.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Categorical Pipeline
====================

Actions
-------

.. autofunction:: bopforge.categorical_pipeline.__main__.pre_processing

.. autofunction:: bopforge.categorical_pipeline.__main__.fit_signal_model

.. autofunction:: bopforge.categorical_pipeline.__main__.select_bias_covs

.. autofunction:: bopforge.categorical_pipeline.__main__.fit_linear_model


Functions
---------

.. automodule:: bopforge.categorical_pipeline.functions
:members:
Empty file.
265 changes: 265 additions & 0 deletions src/bopforge/categorical_pipeline/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
import os
import shutil
import warnings
from argparse import ArgumentParser
from pathlib import Path

import bopforge.categorical_pipeline.functions as functions
import numpy as np
from bopforge.utils import fill_dict, ParseKwargs
from pplkit.data.interface import DataInterface

warnings.filterwarnings("ignore")

def pre_processing(result_folder: Path) -> None:
    """Prepare the raw data for the signal model fit.

    Drops bias covariates that take a single value across the entire
    dataset (they carry no information for covariate selection) and
    removes any such covariates from the pre-selected list in the
    settings. The cleaned data and the updated settings are written
    back to the result folder.

    Parameters
    ----------
    result_folder
        Folder containing ``raw-{name}.csv`` and ``settings.yaml``;
        ``{name}.csv`` and the updated ``settings.yaml`` are written
        back here.

    """
    dataif = DataInterface(result=result_folder)
    name = dataif.result.name

    # load data
    df = dataif.load_result(f"raw-{name}.csv")
    all_settings = dataif.load_result("settings.yaml")
    settings = all_settings["select_bias_covs"]["cov_finder"]

    # bias covariates with a single distinct value are uninformative;
    # nunique(dropna=False) matches len(unique()) (NaN counts as a value)
    all_covs = [col for col in df.columns if col.startswith("cov_")]
    covs_to_remove = [
        col for col in all_covs if df[col].nunique(dropna=False) == 1
    ]

    # remove from dataframe
    df.drop(columns=covs_to_remove, inplace=True)

    # keep only pre-selected covariates that exist in the data and
    # were not dropped above
    pre_selected_covs = (
        set(settings["pre_selected_covs"]) & set(all_covs)
    ) - set(covs_to_remove)

    settings["pre_selected_covs"] = list(pre_selected_covs)
    all_settings["select_bias_covs"]["cov_finder"] = settings

    # save results
    dataif.dump_result(df, f"{name}.csv")
    dataif.dump_result(all_settings, "settings.yaml")


def fit_signal_model(result_folder: Path) -> None:
    """Fit signal model. This step involves trimming, but does not use a mixed
    effect model. The goal is to get the strength of prior for the covariate
    selection step and identify all the outliers. A summary file will be
    generated to store the results of the signal model.

    Parameters
    ----------
    result_folder
        Folder with the pair's raw data and settings; all outputs
        (data with added columns, fitted model pickle, summary, figure)
        are written back here.

    """
    # pre-processing drops uninformative bias covariates and writes
    # `{name}.csv`, which is loaded below
    pre_processing(result_folder)
    dataif = DataInterface(result=result_folder)
    name = dataif.result.name

    # load data
    df = dataif.load_result(f"{name}.csv")

    # load settings
    all_settings = dataif.load_result("settings.yaml")
    settings = all_settings["fit_signal_model"]

    signal_model = functions.get_signal_model(settings, df)
    signal_model.fit_model(outer_step_size=200, outer_max_iter=100)

    df = functions.add_cols(df, signal_model)
    df_coef = functions.get_coefs(all_settings, signal_model)

    summary = functions.get_signal_model_summary(name, df, df_coef)

    fig = functions.plot_signal_model(
        name,
        summary,
        df,
        df_coef,
        signal_model,
        show_ref=all_settings["figure"]["show_ref"],
    )

    # save results
    dataif.dump_result(df, f"{name}.csv")
    dataif.dump_result(signal_model, "signal_model.pkl")
    dataif.dump_result(summary, "summary.yaml")
    fig.savefig(dataif.result / "signal_model.pdf", bbox_inches="tight")


# def select_bias_covs(result_folder: Path) -> None:
# """Select the bias covariates. In this step, we first fit a linear model to
# get the prior strength of the bias-covariates. And then we use `CovFinder`
# to select important bias-covariates. A summary of the result will be
# generated and store in file `cov_finder_result.yaml`.

# Parameters
# ----------
# dataif
# Data interface in charge of file reading and writing.

# """
# dataif = DataInterface(result=result_folder)
# name = dataif.result.name

# df = dataif.load_result(f"{name}.csv")
# df = df[df.is_outlier == 0].copy()

# all_settings = dataif.load_result("settings.yaml")
# settings = all_settings["select_bias_covs"]

# cov_finder_linear_model = dataif.load_result("signal_model.pkl")

# cov_finder = functions.get_cov_finder(settings, cov_finder_linear_model)
# cov_finder.select_covs(verbose=True)

# cov_finder_result = functions.get_cov_finder_result(
# cov_finder_linear_model, cov_finder
# )

# dataif.dump_result(cov_finder_result, "cov_finder_result.yaml")
# dataif.dump_result(cov_finder, "cov_finder.pkl")


# def fit_linear_model(result_folder: Path) -> None:
# """Fit the final linear mixed effect model for the process. We will fit the
# linear model using selected bias covariates in this step. And we will create
# draws and quantiles for the effects. A single panels figure will be plotted
# to show the fit and all the important result information is documented in
# the `summary.yaml` file.

# Parameters
# ----------
# dataif
# Data interface in charge of file reading and writing.

# """
# dataif = DataInterface(result=result_folder)
# name = dataif.result.name

# df = dataif.load_result(f"{name}.csv")
# df_train = df[df.is_outlier == 0].copy()

# cov_finder_result = dataif.load_result("cov_finder_result.yaml")
# all_settings = dataif.load_result("settings.yaml")
# settings = all_settings["complete_summary"]
# summary = dataif.load_result("summary.yaml")

# linear_model = functions.get_linear_model(df_train, cov_finder_result)
# linear_model.fit_model()

# summary = functions.get_linear_model_summary(summary, df, linear_model)

# df_inner_draws, df_outer_draws = functions.get_draws(settings, summary)

# df_inner_quantiles, df_outer_quantiles = functions.get_quantiles(settings, summary)

# fig = functions.plot_linear_model(summary, df)

# dataif.dump_result(linear_model, "linear_model.pkl")
# dataif.dump_result(summary, "summary.yaml")
# dataif.dump_result(df_inner_draws, "inner_draws.csv")
# dataif.dump_result(df_outer_draws, "outer_draws.csv")
# dataif.dump_result(df_inner_quantiles, "inner_quantiles.csv")
# dataif.dump_result(df_outer_quantiles, "outer_quantiles.csv")
# fig.savefig(dataif.result / "linear_model.pdf", bbox_inches="tight")


def run(
    i_dir: str,
    o_dir: str,
    pairs: list[str],
    actions: list[str],
    metadata: dict,
) -> None:
    """Run the requested pipeline actions for each risk-outcome pair.

    Parameters
    ----------
    i_dir
        Input data folder containing ``settings.yaml`` and one
        ``{pair}.csv`` per pair.
    o_dir
        Output folder; one sub-folder per pair is created.
    pairs
        Pairs to fit; falsy means all pairs found in the settings.
    actions
        Actions to run, in order; falsy means all actions.
    metadata
        User-defined metadata stored in each pair's settings.

    Raises
    ------
    FileNotFoundError
        If the input folder or a pair's data file is missing.
    ValueError
        If an unknown action name is requested.
    NotImplementedError
        If a requested action has no implementation in this module.

    """
    i_dir, o_dir = Path(i_dir), Path(o_dir)
    # check the input and output folders
    if not i_dir.exists():
        raise FileNotFoundError("input data folder not found")

    o_dir.mkdir(parents=True, exist_ok=True)

    dataif = DataInterface(i_dir=i_dir, o_dir=o_dir)
    settings = dataif.load_i_dir("settings.yaml")

    # check pairs
    all_pairs = [pair for pair in settings.keys() if pair != "default"]
    pairs = pairs or all_pairs
    for pair in pairs:
        data_path = dataif.get_fpath(f"{pair}.csv", key="i_dir")
        if not data_path.exists():
            raise FileNotFoundError(f"Missing data file {data_path}")

    # check actions
    # TODO: might be good to use enum here
    all_actions = ["fit_signal_model", "select_bias_covs", "fit_linear_model"]
    actions = actions or all_actions
    invalid_actions = set(actions) - set(all_actions)
    if len(invalid_actions) != 0:
        raise ValueError(f"{list(invalid_actions)} are invalid actions")

    # fit each pair
    for pair in pairs:
        pair_o_dir = o_dir / pair
        pair_o_dir.mkdir(parents=True, exist_ok=True)

        # keep an untouched copy of the raw data next to the results
        shutil.copy(i_dir / f"{pair}.csv", pair_o_dir / f"raw-{pair}.csv")

        # per-pair settings fall back to the defaults for missing keys
        if pair not in settings:
            pair_settings = settings["default"]
        else:
            pair_settings = fill_dict(settings[pair], settings["default"])
        pair_settings["metadata"] = metadata
        dataif.dump_o_dir(pair_settings, pair, "settings.yaml")

        np.random.seed(pair_settings["seed"])
        for action in actions:
            # look the action up explicitly: some advertised actions may
            # not be implemented yet, and a bare globals()[...] access
            # would raise an opaque KeyError
            action_fn = globals().get(action)
            if action_fn is None:
                raise NotImplementedError(
                    f"action '{action}' is not implemented yet"
                )
            action_fn(pair_o_dir)


def main(args=None) -> None:
    """Command line interface of the categorical pipeline.

    Parses the command line (or the given ``args`` list) and dispatches
    to :func:`run`.
    """
    parser = ArgumentParser(description="Categorical burden of proof pipeline.")
    parser.add_argument(
        "-i",
        "--input",
        required=True,
        type=os.path.abspath,
        help="Input data folder",
    )
    parser.add_argument(
        "-o",
        "--output",
        required=True,
        type=os.path.abspath,
        help="Output result folder",
    )
    parser.add_argument(
        "-p",
        "--pairs",
        nargs="+",
        default=None,
        required=False,
        help="Included pairs, default all pairs",
    )
    parser.add_argument(
        "-a",
        "--actions",
        nargs="+",
        default=None,
        choices=["fit_signal_model", "select_bias_covs", "fit_linear_model"],
        help="Included actions, default all actions",
    )
    parser.add_argument(
        "-m",
        "--metadata",
        action=ParseKwargs,
        nargs="*",
        default={},
        required=False,
        help="User defined metadata",
    )
    parsed = parser.parse_args(args)

    run(parsed.input, parsed.output, parsed.pairs, parsed.actions, parsed.metadata)


# script entry point (also reachable via the `categorical_pipeline`
# console script declared in pyproject.toml)
if __name__ == "__main__":
    main()
Loading

0 comments on commit d0df991

Please sign in to comment.