Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Settings + signal model for categorical pipeline #15

Merged
merged 6 commits into from
Sep 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -189,4 +189,7 @@ result/
# vscode
.vscode/

# MacOS DS Store files
.DS_Store

results/
17 changes: 17 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
repos:
- repo: https://github.com/astral-sh/ruff-pre-commit
rev: v0.4.2
hooks:
- id: ruff
args: [ --fix ]
- id: ruff-format
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.6.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- repo: https://github.com/pre-commit/mirrors-mypy
rev: v1.10.0
hooks:
- id: mypy
files: ^src
20 changes: 20 additions & 0 deletions data/categorical/settings.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
default:
seed: 0
fit_signal_model:
cat_cov_model:
ref_cat:
prior_order: []
signal_model:
inlier_pct: 0.9
select_bias_covs:
cov_finder:
pre_selected_covs: []
complete_summary:
draws:
num_draws: 1000
quantiles: [0.025, 0.05, 0.5, 0.95, 0.975]
score:
normalize_to_tmrel: false
figure:
show_ref: true
cat_order: []
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ license = { file = "LICENSE" }
authors = [
{ name = "IHME Math Sciences", email = "[email protected]" },
]
dependencies = ["numpy", "scipy", "pandas", "matplotlib", "mrtool==0.1.4", "pplkit"]
dependencies = ["numpy", "scipy", "pandas", "matplotlib", "pplkit"]

[project.optional-dependencies]
test = ["pytest"]
Expand All @@ -24,6 +24,7 @@ github = "https://github.com/ihmeuw-msca/bopforge"
[project.scripts]
continuous_pipeline = "bopforge.continuous_pipeline.__main__:main"
dichotomous_pipeline = "bopforge.dichotomous_pipeline.__main__:main"
categorical_pipeline = "bopforge.categorical_pipeline.__main__:main"

[tool.sphinx]
project = "bopforge"
Expand Down
8 changes: 8 additions & 0 deletions ruff.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
line-length = 80
src = ["src"]

[format]
docstring-code-format = true

[lint.pydocstyle]
convention = "numpy"
20 changes: 20 additions & 0 deletions sphinx/api_reference/categorical_pipeline.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Categorical Pipeline
====================

Actions
-------

.. autofunction:: bopforge.categorical_pipeline.__main__.pre_processing

.. autofunction:: bopforge.categorical_pipeline.__main__.fit_signal_model

.. autofunction:: bopforge.categorical_pipeline.__main__.select_bias_covs

.. autofunction:: bopforge.categorical_pipeline.__main__.fit_linear_model


Functions
---------

.. automodule:: bopforge.categorical_pipeline.functions
:members:
Empty file.
265 changes: 265 additions & 0 deletions src/bopforge/categorical_pipeline/__main__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,265 @@
import os
import shutil
import warnings
from argparse import ArgumentParser
from pathlib import Path

import bopforge.categorical_pipeline.functions as functions
import numpy as np
from bopforge.utils import fill_dict, ParseKwargs
from pplkit.data.interface import DataInterface

warnings.filterwarnings("ignore")

def pre_processing(result_folder: Path) -> None:
    """Pre-process the raw data before fitting the signal model.

    Drops bias covariates that take a single value across all rows (they
    carry no information for covariate selection) and removes them from the
    pre-selected covariates in the settings. The cleaned data and the
    updated settings are written back into the result folder.

    Parameters
    ----------
    result_folder
        Result folder containing ``raw-{name}.csv`` and ``settings.yaml``;
        the cleaned ``{name}.csv`` and updated settings are written here.

    """
    dataif = DataInterface(result=result_folder)
    name = dataif.result.name

    # load data
    df = dataif.load_result(f"raw-{name}.csv")
    all_settings = dataif.load_result("settings.yaml")
    settings = all_settings["select_bias_covs"]["cov_finder"]

    # constant bias covariates provide no signal; mark them for removal
    # (dropna=False so an all-NaN column also counts as constant, matching
    # the behavior of len(unique()) == 1)
    all_covs = [col for col in df.columns if col.startswith("cov_")]
    covs_to_remove = [
        col for col in all_covs if df[col].nunique(dropna=False) == 1
    ]

    # remove from dataframe
    df.drop(columns=covs_to_remove, inplace=True)

    # keep only pre-selected covariates that exist in the data and survived
    # the constant-column removal
    pre_selected_covs = (
        set(settings["pre_selected_covs"]) & set(all_covs)
    ) - set(covs_to_remove)

    settings["pre_selected_covs"] = list(pre_selected_covs)
    all_settings["select_bias_covs"]["cov_finder"] = settings

    # save results
    dataif.dump_result(df, f"{name}.csv")
    dataif.dump_result(all_settings, "settings.yaml")


def fit_signal_model(result_folder: Path) -> None:
    """Fit signal model. This step involves trimming, but does not use a
    mixed effect model. The goal is to get the strength of prior for the
    covariate selection step and to identify all the outliers. A summary
    file will be generated to store the results of the signal model.

    Parameters
    ----------
    result_folder
        Result folder for this risk-outcome pair; must contain the raw data
        and ``settings.yaml``. The model pickle, summary, and figure are
        written here.

    """
    # clean the raw data and update the settings in place first
    pre_processing(result_folder)
    dataif = DataInterface(result=result_folder)
    name = dataif.result.name

    # load data
    df = dataif.load_result(f"{name}.csv")

    # load settings
    all_settings = dataif.load_result("settings.yaml")
    settings = all_settings["fit_signal_model"]

    # fit the signal model with trimming to identify outliers
    signal_model = functions.get_signal_model(settings, df)
    signal_model.fit_model(outer_step_size=200, outer_max_iter=100)

    # annotate the data with model outputs and extract coefficients
    df = functions.add_cols(df, signal_model)
    df_coef = functions.get_coefs(all_settings, signal_model)

    summary = functions.get_signal_model_summary(name, df, df_coef)

    fig = functions.plot_signal_model(
        name,
        summary,
        df,
        df_coef,
        signal_model,
        show_ref=all_settings["figure"]["show_ref"],
    )

    # save results
    dataif.dump_result(df, f"{name}.csv")
    dataif.dump_result(signal_model, "signal_model.pkl")
    dataif.dump_result(summary, "summary.yaml")
    fig.savefig(dataif.result / "signal_model.pdf", bbox_inches="tight")


# def select_bias_covs(result_folder: Path) -> None:
# """Select the bias covariates. In this step, we first fit a linear model to
# get the prior strength of the bias-covariates. And then we use `CovFinder`
# to select important bias-covariates. A summary of the result will be
# generated and store in file `cov_finder_result.yaml`.

# Parameters
# ----------
# dataif
# Data interface in charge of file reading and writing.

# """
# dataif = DataInterface(result=result_folder)
# name = dataif.result.name

# df = dataif.load_result(f"{name}.csv")
# df = df[df.is_outlier == 0].copy()

# all_settings = dataif.load_result("settings.yaml")
# settings = all_settings["select_bias_covs"]

# cov_finder_linear_model = dataif.load_result("signal_model.pkl")

# cov_finder = functions.get_cov_finder(settings, cov_finder_linear_model)
# cov_finder.select_covs(verbose=True)

# cov_finder_result = functions.get_cov_finder_result(
# cov_finder_linear_model, cov_finder
# )

# dataif.dump_result(cov_finder_result, "cov_finder_result.yaml")
# dataif.dump_result(cov_finder, "cov_finder.pkl")


# def fit_linear_model(result_folder: Path) -> None:
# """Fit the final linear mixed effect model for the process. We will fit the
# linear model using selected bias covariates in this step. And we will create
# draws and quantiles for the effects. A single panels figure will be plotted
# to show the fit and all the important result information is documented in
# the `summary.yaml` file.

# Parameters
# ----------
# dataif
# Data interface in charge of file reading and writing.

# """
# dataif = DataInterface(result=result_folder)
# name = dataif.result.name

# df = dataif.load_result(f"{name}.csv")
# df_train = df[df.is_outlier == 0].copy()

# cov_finder_result = dataif.load_result("cov_finder_result.yaml")
# all_settings = dataif.load_result("settings.yaml")
# settings = all_settings["complete_summary"]
# summary = dataif.load_result("summary.yaml")

# linear_model = functions.get_linear_model(df_train, cov_finder_result)
# linear_model.fit_model()

# summary = functions.get_linear_model_summary(summary, df, linear_model)

# df_inner_draws, df_outer_draws = functions.get_draws(settings, summary)

# df_inner_quantiles, df_outer_quantiles = functions.get_quantiles(settings, summary)

# fig = functions.plot_linear_model(summary, df)

# dataif.dump_result(linear_model, "linear_model.pkl")
# dataif.dump_result(summary, "summary.yaml")
# dataif.dump_result(df_inner_draws, "inner_draws.csv")
# dataif.dump_result(df_outer_draws, "outer_draws.csv")
# dataif.dump_result(df_inner_quantiles, "inner_quantiles.csv")
# dataif.dump_result(df_outer_quantiles, "outer_quantiles.csv")
# fig.savefig(dataif.result / "linear_model.pdf", bbox_inches="tight")


def run(
    i_dir: str,
    o_dir: str,
    pairs: list[str],
    actions: list[str],
    metadata: dict,
) -> None:
    """Run the selected pipeline actions for each risk-outcome pair.

    Validates the input/output folders, the requested pairs, and the
    requested actions, then runs each action on each pair in order.

    Parameters
    ----------
    i_dir
        Input data folder; must contain ``settings.yaml`` and one
        ``{pair}.csv`` file per pair.
    o_dir
        Output result folder; a sub-folder is created per pair.
    pairs
        Pairs to process; a falsy value means all pairs in the settings.
    actions
        Actions to run, in order; a falsy value means all actions.
    metadata
        User-provided metadata stored in each pair's settings file.

    Raises
    ------
    FileNotFoundError
        If the input folder or a pair's data file does not exist.
    ValueError
        If an unknown action is requested.
    NotImplementedError
        If a known action has no implementation in this module yet.

    """
    i_dir, o_dir = Path(i_dir), Path(o_dir)
    # check the input and output folders
    if not i_dir.exists():
        raise FileNotFoundError("input data folder not found")

    o_dir.mkdir(parents=True, exist_ok=True)

    dataif = DataInterface(i_dir=i_dir, o_dir=o_dir)
    settings = dataif.load_i_dir("settings.yaml")

    # check pairs
    all_pairs = [pair for pair in settings.keys() if pair != "default"]
    pairs = pairs or all_pairs
    for pair in pairs:
        data_path = dataif.get_fpath(f"{pair}.csv", key="i_dir")
        if not data_path.exists():
            raise FileNotFoundError(f"Missing data file {data_path}")

    # check actions
    # TODO: might be good to use enum here
    all_actions = ["fit_signal_model", "select_bias_covs", "fit_linear_model"]
    actions = actions or all_actions
    invalid_actions = set(actions) - set(all_actions)
    if len(invalid_actions) != 0:
        raise ValueError(f"{list(invalid_actions)} are invalid actions")

    # fit each pair
    for pair in pairs:
        pair_o_dir = o_dir / pair
        pair_o_dir.mkdir(parents=True, exist_ok=True)

        # keep an untouched copy of the raw data next to the results
        shutil.copy(i_dir / f"{pair}.csv", pair_o_dir / f"raw-{pair}.csv")

        # pair-specific settings override the defaults
        if pair not in settings:
            pair_settings = settings["default"]
        else:
            pair_settings = fill_dict(settings[pair], settings["default"])
        pair_settings["metadata"] = metadata
        dataif.dump_o_dir(pair_settings, pair, "settings.yaml")

        # seed once per pair so each pair's results are reproducible
        np.random.seed(pair_settings["seed"])
        for action in actions:
            action_fn = globals().get(action)
            if action_fn is None:
                # select_bias_covs / fit_linear_model are currently
                # commented out; fail with a clear message instead of a
                # bare KeyError from the globals() lookup
                raise NotImplementedError(
                    f"action '{action}' is not implemented yet"
                )
            action_fn(pair_o_dir)


def main(args=None) -> None:
    """Command line entry point for the categorical burden-of-proof pipeline.

    Parses the command line arguments and dispatches to :func:`run`.

    Parameters
    ----------
    args
        Argument list to parse; ``None`` (the default) parses
        ``sys.argv[1:]``.

    """
    parser = ArgumentParser(description="Categorical burden of proof pipeline.")
    # paths are normalized to absolute at parse time
    parser.add_argument(
        "-i", "--input", type=os.path.abspath, required=True, help="Input data folder"
    )
    parser.add_argument(
        "-o",
        "--output",
        type=os.path.abspath,
        required=True,
        help="Output result folder",
    )
    parser.add_argument(
        "-p",
        "--pairs",
        required=False,
        default=None,
        nargs="+",
        help="Included pairs, default all pairs",
    )
    parser.add_argument(
        "-a",
        "--actions",
        choices=["fit_signal_model", "select_bias_covs", "fit_linear_model"],
        default=None,
        nargs="+",
        help="Included actions, default all actions",
    )
    # key=value pairs collected into a dict by ParseKwargs
    parser.add_argument(
        "-m",
        "--metadata",
        nargs="*",
        required=False,
        default={},
        action=ParseKwargs,
        help="User defined metadata",
    )
    args = parser.parse_args(args)

    run(args.input, args.output, args.pairs, args.actions, args.metadata)


# allow running as a script or via `python -m bopforge.categorical_pipeline`
if __name__ == "__main__":
    main()
Loading
Loading