Skip to content

Commit

Permalink
Merge pull request #15 from big-o/develop
Browse files Browse the repository at this point in the history
v0.0.3
  • Loading branch information
big-o authored Aug 13, 2022
2 parents dc0123d + 02c1dd2 commit 6b84937
Show file tree
Hide file tree
Showing 10 changed files with 428 additions and 171 deletions.
1 change: 0 additions & 1 deletion appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,6 @@ install: |
for f in $(find . -maxdepth 1 -name 'requirements*.txt'); do
pip install -r ${f}
done
pip install pandas # Needed for some estimator checks.
pip install .
test_script:
Expand Down
14 changes: 9 additions & 5 deletions doc/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -13,24 +13,28 @@ cases including complex pre-processing, model stacking and benchmarking.
from skdag import DAGBuilder
dag = (
DAGBuilder()
DAGBuilder(infer_dataframe=True)
.add_step("impute", SimpleImputer())
.add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
.add_step(
"vitals",
"passthrough",
deps={"impute": ["age", "sex", "bmi", "bp"]},
)
.add_step(
"blood",
PCA(n_components=2, random_state=0),
deps={"impute": slice(4, 10)}
deps={"impute": ["s1", "s2", "s3", "s4", "s5", "s6"]},
)
.add_step(
"rf",
RandomForestRegressor(max_depth=5, random_state=0),
deps=["blood", "vitals"]
deps=["blood", "vitals"],
)
.add_step("svm", SVR(C=0.7), deps=["blood", "vitals"])
.add_step(
"knn",
KNeighborsRegressor(n_neighbors=5),
deps=["blood", "vitals"]
deps=["blood", "vitals"],
)
.add_step("meta", LinearRegression(), deps=["rf", "svm", "knn"])
.make_dag(n_jobs=2, verbose=True)
Expand Down
43 changes: 23 additions & 20 deletions doc/quick_start.rst
Original file line number Diff line number Diff line change
Expand Up @@ -50,31 +50,34 @@ For more complex DAGs, it is recommended to use a :class:`skdag.dag.DAGBuilder`,
which allows you to define the graph by specifying the dependencies of each new
estimator:

>>> from skdag import DAGBuilder
>>> dag = (
... DAGBuilder()
... .add_step("impute", SimpleImputer())
... .add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
... .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": slice(4, 10)})
... .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
... .make_dag()
... )
>>> dag.draw()
o impute
|\
o o blood,vitals
|/
o lr
<BLANKLINE>
.. code-block:: python
>>> from skdag import DAGBuilder
>>> dag = (
... DAGBuilder(infer_dataframe=True)
... .add_step("impute", SimpleImputer())
... .add_step("vitals", "passthrough", deps={"impute": ["age", "sex", "bmi", "bp"]})
... .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": slice(4, 10)})
... .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
... .make_dag()
... )
>>> dag.draw()
o impute
|\
o o blood,vitals
|/
o lr
<BLANKLINE>
.. image:: _static/img/dag2.png

In the above examples we pass the first four columns directly to a regressor, but
the remaining columns have dimensionality reduction applied first before being
passed to the same regressor as extra input columns. Note that we can define our graph
edges in two different ways: as a dict (if we need to select only certain columns from
the source node) or as a simple list (if we want to simply grab all columns from all
input nodes).
passed to the same regressor as extra input columns.

In this DAG, the ``deps`` option controls not only which estimators feed in to
other estimators, but also which columns are used (and ignored) by each step. For more detail
on how to control this behaviour, see the `User Guide <user_guide.html>`_.

The DAG may now be used as an estimator in its own right:

Expand Down
29 changes: 23 additions & 6 deletions doc/user_guide.rst
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,8 @@ scikit-learn :class:`~sklearn.pipeline.Pipeline`. These DAGs may be created from
... ("impute", SimpleImputer()),
... ("pca", PCA()),
... ("lr", LogisticRegression())
... ]
... ],
... infer_dataframe=True,
... )
You may view a diagram of the DAG with the :meth:`~skdag.dag.DAG.show` method. In a
Expand All @@ -44,18 +45,25 @@ ASCII text:
.. image:: _static/img/dag1.png

Note that we also provided an extra option, ``infer_dataframe``. This is entirely
optional, but if set the DAG will ensure that dataframe inputs have column and index
information preserved (or inferred), and the output of the pipeline will also be a
dataframe. This is useful if you wish to filter down the inputs for one particular step
to only include certain columns; something we shall see in action later.

For more complex DAGs, it is recommended to use a :class:`skdag.dag.DAGBuilder`,
which allows you to define the graph by specifying the dependencies of each new
estimator:

.. code-block:: python
>>> from skdag import DAGBuilder
>>> from sklearn.compose import make_column_selector
>>> dag = (
... DAGBuilder()
... DAGBuilder(infer_dataframe=True)
... .add_step("impute", SimpleImputer())
... .add_step("vitals", "passthrough", deps={"impute": slice(0, 4)})
... .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": slice(4, 10)})
... .add_step("vitals", "passthrough", deps={"impute": ["age", "sex", "bmi", "bp"]})
... .add_step("blood", PCA(n_components=2, random_state=0), deps={"impute": make_column_selector("s[0-9]+")})
... .add_step("lr", LogisticRegression(random_state=0), deps=["blood", "vitals"])
... .make_dag()
... )
Expand All @@ -73,7 +81,16 @@ the remaining columns have dimensionality reduction applied first before being
passed to the same regressor. Note that we can define our graph edges in two
different ways: as a dict (if we need to select only certain columns from the source
node) or as a simple list (if we want to simply grab all columns from all input
nodes).
nodes). Columns may be specified as any kind of iterable (list, slice etc.) or a column
selector function that conforms to :meth:`sklearn.compose.make_column_selector`.

If you wish to specify string column names for dependencies, ensure you provide the
``infer_dataframe=True`` option when you create a dag. This will ensure that all
estimator outputs are coerced into dataframes. Where possible column names will be
inferred, otherwise the column names will just be the name of the estimator step with an
appended index number. If you do not specify ``infer_dataframe=True``, the dag will
leave the outputs unmodified, which in most cases will mean numpy arrays that only
support numeric column indices.

The DAG may now be used as an estimator in its own right:

Expand Down Expand Up @@ -189,7 +206,7 @@ as a dictionary of step name to column indices instead:
>>> from sklearn.ensemble import RandomForestClassifier
>>> from sklearn.svm import SVC
>>> clf_stack = (
... DAGBuilder()
... DAGBuilder(infer_dataframe=True)
... .add_step("pass", "passthrough")
... .add_step("rf", RandomForestClassifier(), deps=["pass"])
... .add_step("svr", SVC(), deps=["pass"])
Expand Down
1 change: 1 addition & 0 deletions requirements_test.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
pandas
pytest
pytest-cov
2 changes: 1 addition & 1 deletion skdag/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.0.2"
__version__ = "0.0.3"
76 changes: 73 additions & 3 deletions skdag/dag/_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,15 @@ class DAGBuilder:
that reference each step by name. Note that steps must be defined before they are
used as dependencies.
Parameters
----------
infer_dataframe : bool, default = False
If True, assume ``dataframe_columns="infer"`` whenever :meth:`.add_step` is
called with ``dataframe_columns`` left as ``None``. This effectively makes the
resulting DAG always try to coerce output into pandas DataFrames wherever
possible.
See Also
--------
:class:`skdag.DAG` : The estimator DAG created by this utility.
Expand Down Expand Up @@ -43,10 +52,66 @@ class DAGBuilder:
o lr
"""

def __init__(self):
def __init__(self, infer_dataframe=False):
self.graph = nx.DiGraph()
self.infer_dataframe = infer_dataframe

def from_pipeline(self, steps, **kwargs):
    """
    Construct a DAG from a simple linear sequence of steps. The resulting
    DAG will be equivalent to a :class:`~sklearn.pipeline.Pipeline`.

    Parameters
    ----------
    steps : sequence of (str, estimator), or Pipeline
        An ordered sequence of pipeline steps. A step is simply a pair of
        ``(name, estimator)``, just like a scikit-learn Pipeline. A
        :class:`~sklearn.pipeline.Pipeline` instance may also be passed, in
        which case its ``steps``, ``memory`` and ``verbose`` settings are
        used.
    kwargs : kwargs
        Any other hyperparameters that are accepted by
        :class:`~skdag.dag.DAG`'s constructor. Explicit kwargs take
        precedence over values inherited from a passed-in Pipeline.

    Returns
    -------
    self : DAGBuilder
        This builder, to allow call chaining.

    Notes
    -----
    If this builder was created with ``infer_dataframe=True``, every step
    added here requests dataframe column inference.
    """
    # Accept a fitted/unfitted Pipeline object as well as a plain list of
    # (name, estimator) pairs.
    if hasattr(steps, "steps"):
        pipe = steps
        steps = pipe.steps
        if hasattr(pipe, "get_params"):
            # Inherit only the DAG-relevant params; explicit kwargs win.
            inherited = {
                key: val
                for key, val in pipe.get_params().items()
                if key in ("memory", "verbose")
            }
            kwargs = {**inherited, **kwargs}
    # NOTE(review): kwargs is assembled above but not consumed in this
    # visible body — presumably forwarded elsewhere; confirm against the
    # full method/file.

    dfcols = "infer" if self.infer_dataframe else None

    # Chain each step onto its predecessor, exactly like a Pipeline.
    prev_name = None
    for name, estimator in steps:
        self._validate_name(name)
        deps = {}
        if prev_name is not None:
            deps[prev_name] = None
        self._validate_deps(deps)

        step = DAGStep(name, estimator, deps, dataframe_columns=dfcols)
        self.graph.add_node(name, step=step)
        if deps:
            self.graph.add_edge(prev_name, name)
        prev_name = name

    self._validate_graph()

    return self

def add_step(self, name, est, deps=None, dataframe_columns=None):
self._validate_name(name)
if isinstance(deps, Sequence):
deps = {dep: None for dep in deps}
Expand All @@ -56,7 +121,12 @@ def add_step(self, name, est, deps=None):
else:
deps = {}

step = DAGStep(name, est, deps=deps)
if dataframe_columns is None and self.infer_dataframe:
dfcols = "infer"
else:
dfcols = dataframe_columns

step = DAGStep(name, est, deps=deps, dataframe_columns=dfcols)
self.graph.add_node(name, step=step)

for dep in deps:
Expand Down
Loading

0 comments on commit 6b84937

Please sign in to comment.