diff --git a/Makefile b/Makefile index a771b8261..c97980c94 100644 --- a/Makefile +++ b/Makefile @@ -30,3 +30,8 @@ clean: rm -rf dist rm -rf scikit_lego.egg-info rm -rf .ipynb_checkpoints + +dist: clean + python setup.py sdist + python setup.py bdist_wheel --universal + twine upload dist/* \ No newline at end of file diff --git a/doc/_static/css/custom.css b/doc/_static/css/custom.css index a297bb320..fd3a43e83 100644 --- a/doc/_static/css/custom.css +++ b/doc/_static/css/custom.css @@ -1,5 +1,5 @@ .wy-nav-side{ - background-color: #f2f2f2; + background-color: white; color: black; } diff --git a/doc/_static/grouped-chick-1.png b/doc/_static/grouped-chick-1.png new file mode 100644 index 000000000..88026828e Binary files /dev/null and b/doc/_static/grouped-chick-1.png differ diff --git a/doc/_static/grouped-chick-2.png b/doc/_static/grouped-chick-2.png new file mode 100644 index 000000000..11a2d1d6e Binary files /dev/null and b/doc/_static/grouped-chick-2.png differ diff --git a/doc/_static/grouped-chick-3.png b/doc/_static/grouped-chick-3.png new file mode 100644 index 000000000..d698cd90f Binary files /dev/null and b/doc/_static/grouped-chick-3.png differ diff --git a/doc/_static/grouped-df.png b/doc/_static/grouped-df.png new file mode 100644 index 000000000..cb2fdc274 Binary files /dev/null and b/doc/_static/grouped-df.png differ diff --git a/doc/_static/grouped-np.png b/doc/_static/grouped-np.png new file mode 100644 index 000000000..16ae6a744 Binary files /dev/null and b/doc/_static/grouped-np.png differ diff --git a/doc/conf.py b/doc/conf.py index 5ea428896..273c6360a 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -19,7 +19,7 @@ # -- Project information ----------------------------------------------------- project = 'scikit-lego' -copyright = '2019, Vincent and Others[tm]' +copyright = '2019, Vincent, Matthijs and ManyOthers[tm]' author = 'Vincent and Others[tm]' # The short X.Y version diff --git a/doc/index.rst b/doc/index.rst index c55ea0e57..c37d7a515 
100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -62,5 +62,6 @@ Usage install contribution mixture-methods + meta preprocessing api/modules diff --git a/doc/meta.rst b/doc/meta.rst new file mode 100644 index 000000000..3a2e31522 --- /dev/null +++ b/doc/meta.rst @@ -0,0 +1,167 @@ +Meta Models +=========== + +Certain models in scikit-lego are "meta". Meta models are +models that depend on other estimators that go in and these +models will add features to the input model. One way of thinking +of a meta model is to consider it to be a way to "decorate" a +model. + +This part of the documentation will highlight a few of them. + +Grouped Estimation +------------------ + +A kind introduction to "meta"-models is the `GroupedEstimator`. + +To help explain what it can do we'll consider three methods to predict +the chicken weight. The chicken data has 578 rows and 4 columns +from an experiment on the effect of diet on early growth of chicks. +The body weights of the chicks were measured at birth and every second +day thereafter until day 20. They were also measured on day 21. +There were four groups of chicks on different protein diets. + +Setup +***** + +Let's first load a bunch of things to do this. + +.. 
code-block:: python + + import numpy as np + import pandas as pd + import matplotlib.pylab as plt + from sklearn.linear_model import LinearRegression + from sklearn.pipeline import Pipeline, FeatureUnion + from sklearn.preprocessing import OneHotEncoder, StandardScaler + from sklearn.metrics import mean_absolute_error, mean_squared_error + + from sklego.datasets import load_chicken + from sklego.preprocessing import ColumnSelector + + df = load_chicken(give_pandas=True) + + def plot_model(model): + df = load_chicken(give_pandas=True) + model.fit(df[['diet', 'time']], df['weight']) + metric_df = df[['diet', 'time', 'weight']].assign(pred=lambda d: model.predict(d[['diet', 'time']])) + metric = mean_absolute_error(metric_df['weight'], metric_df['pred']) + plt.scatter(df['time'], df['weight']) + for i in [1, 2, 3, 4]: + pltr = metric_df[['time', 'diet', 'pred']].drop_duplicates().loc[lambda d: d['diet'] == i] + plt.plot(pltr['time'], pltr['pred'], color='.rbgy'[i]) + plt.title(f"linear model per group, MAE: {np.round(metric, 2)}"); + +Model 1: Linear Regression with Dummies +*************************************** + +First we start with a baseline. + +.. code-block:: python + + feature_pipeline = Pipeline([ + ("datagrab", FeatureUnion([ + ("discrete", Pipeline([ + ("grab", ColumnSelector("diet")), + ("encode", OneHotEncoder(categories="auto", sparse=False)) + ])), + ("continous", Pipeline([ + ("grab", ColumnSelector("time")), + ("standardize", StandardScaler()) + ])) + ])) + ]) + + pipe = Pipeline([ + ("transform", feature_pipeline), + ("model", LinearRegression()) + ]) + + plot_model(pipe) + +This code results in a model that is plotted below. + +.. image:: _static/grouped-chick-1.png + :align: center + +Because the model is linear the dummy variable causes the intercept +to change but leaves the gradient untouched. This might not be what +we want from a model. So let's see how the grouped model can address +this. 
+ +Model 2: Linear Regression in GroupedEstimation +*********************************************** + +The goal of the grouped estimator is to allow us to split up our data. +The image below demonstrates what will happen. + +.. image:: _static/grouped-df.png + +We train 5 models in total because the model will also train a +fallback automatically (you can turn this off via `use_fallback=False`). +The idea behind the fallback is that we can predict something if +a group at prediction time was not seen during training. + +Each model will accept features that are in `X` that are not +part of the grouping variables. In this case each group will +model based on the `time` since `weight` is what we're trying +to predict. + +Applying this model to the dataframe is easy. + +.. code-block:: python + + from sklego.meta import GroupedEstimator + mod = GroupedEstimator(LinearRegression(), groups=["diet"]) + plot_model(mod) + + +And the results will look a bit better. + +.. image:: _static/grouped-chick-2.png + :align: center + +Model 3: Dummy Regression in GroupedEstimation +********************************************** + +We could go a step further and train a DummyRegressor_ per diet +per timestep. The code below works similarly to the previous example +but one difference is that the grouped model does not receive a +dataframe but a numpy array. + +.. image:: _static/grouped-np.png + +Note that we're also grouping over more than one column here. +The code that does this is listed below. + +.. code-block:: python + + from sklearn.dummy import DummyRegressor + + feature_pipeline = Pipeline([ + ("datagrab", FeatureUnion([ + ("discrete", Pipeline([ + ("grab", ColumnSelector("diet")), + ])), + ("continous", Pipeline([ + ("grab", ColumnSelector("time")), + ])) + ])) + ]) + + pipe = Pipeline([ + ("transform", feature_pipeline), + ("model", GroupedEstimator(DummyRegressor(strategy="mean"), groups=[0, 1])) + ]) + + plot_model(pipe) + +Again, we show the predictions: + +.. 
image:: _static/grouped-chick-3.png + :align: center + +Note that these predictions seem to yield the lowest error but take it +with a grain of salt since these errors are only based on the train set. + +.. _DummyRegressor: https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html \ No newline at end of file