Add DataFrameMapper feature metadata and y-value support. #54

Closed
wants to merge 5 commits into from
333 changes: 333 additions & 0 deletions examples/01 Flowers and Forests - A Simple Pipeline.ipynb
@@ -0,0 +1,333 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"import sklearn.datasets\n",
"import pandas"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn_pandas import DataFrameMapper, make_dataframe_pipeline\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.cross_validation import cross_val_score"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Gather a Tidy Dataframe\n",
"----"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>sepal length (cm)</th>\n",
" <th>sepal width (cm)</th>\n",
" <th>petal length (cm)</th>\n",
" <th>petal width (cm)</th>\n",
" <th>class</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>5.1</td>\n",
" <td>3.5</td>\n",
" <td>1.4</td>\n",
" <td>0.2</td>\n",
" <td>setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>4.9</td>\n",
" <td>3.0</td>\n",
" <td>1.4</td>\n",
" <td>0.2</td>\n",
" <td>setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>4.7</td>\n",
" <td>3.2</td>\n",
" <td>1.3</td>\n",
" <td>0.2</td>\n",
" <td>setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.6</td>\n",
" <td>3.1</td>\n",
" <td>1.5</td>\n",
" <td>0.2</td>\n",
" <td>setosa</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>5.0</td>\n",
" <td>3.6</td>\n",
" <td>1.4</td>\n",
" <td>0.2</td>\n",
" <td>setosa</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \\\n",
"0 5.1 3.5 1.4 0.2 \n",
"1 4.9 3.0 1.4 0.2 \n",
"2 4.7 3.2 1.3 0.2 \n",
"3 4.6 3.1 1.5 0.2 \n",
"4 5.0 3.6 1.4 0.2 \n",
"\n",
" class \n",
"0 setosa \n",
"1 setosa \n",
"2 setosa \n",
"3 setosa \n",
"4 setosa "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"iris_data = sklearn.datasets.load_iris()\n",
"iris = pandas.DataFrame(data = iris_data[\"data\"], columns=iris_data[\"feature_names\"])\n",
"iris[\"class\"] = iris_data[\"target_names\"][iris_data[\"target\"]]\n",
"\n",
"iris.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Assemble a Simple Learning Pipeline\n",
"--------\n",
"\n",
"A DataFramePipeline begins with a DataFrameMapper, which specifies how features **`X`** and targets **`y`** are extracted from an input frame. It ends with an estimator object.\n",
"\n",
"In this case, extract each available feature without transformation and specify the class label as the target."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"forest_pipeline = make_dataframe_pipeline([\n",
" DataFrameMapper(iris_data[\"feature_names\"], \"class\"),\n",
" RandomForestClassifier(n_estimators=200)\n",
" ])\n",
"\n",
"logistic_pipeline = make_dataframe_pipeline([\n",
" DataFrameMapper(iris_data[\"feature_names\"], \"class\"),\n",
" LogisticRegression()\n",
" ])"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Cross Validate\n",
"-----\n",
"\n",
"Cross validation requires the target **`y`** to perform train-test splits. Use the pipeline's DataFrameMapper to extract the target feature array from input data. "
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>forest</th>\n",
" <th>logistic</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>count</th>\n",
" <td>5.000000</td>\n",
" <td>5.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>mean</th>\n",
" <td>0.960000</td>\n",
" <td>0.960000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>std</th>\n",
" <td>0.027889</td>\n",
" <td>0.043461</td>\n",
" </tr>\n",
" <tr>\n",
" <th>min</th>\n",
" <td>0.933333</td>\n",
" <td>0.900000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>25%</th>\n",
" <td>0.933333</td>\n",
" <td>0.933333</td>\n",
" </tr>\n",
" <tr>\n",
" <th>50%</th>\n",
" <td>0.966667</td>\n",
" <td>0.966667</td>\n",
" </tr>\n",
" <tr>\n",
" <th>75%</th>\n",
" <td>0.966667</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" <tr>\n",
" <th>max</th>\n",
" <td>1.000000</td>\n",
" <td>1.000000</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" forest logistic\n",
"count 5.000000 5.000000\n",
"mean 0.960000 0.960000\n",
"std 0.027889 0.043461\n",
"min 0.933333 0.900000\n",
"25% 0.933333 0.933333\n",
"50% 0.966667 0.966667\n",
"75% 0.966667 1.000000\n",
"max 1.000000 1.000000"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cross_val_result = pandas.DataFrame.from_dict({\n",
" \"forest\" : cross_val_score(\n",
" estimator = forest_pipeline,\n",
" X = iris, y = forest_pipeline._dataframe_mapper.extract_y(iris),\n",
" cv = 5, scoring=\"accuracy\"),\n",
" \"logistic\" : cross_val_score(\n",
" estimator = logistic_pipeline,\n",
" X = iris, y = logistic_pipeline._dataframe_mapper.extract_y(iris),\n",
" cv = 5, scoring=\"accuracy\")\n",
" })\n",
"\n",
"cross_val_result.describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Extract Feature Metadata\n",
"----\n",
"\n",
"The DataFrameMapper may be used to associate estimator metadata with feature source information. In this case, the `feature_importances_` vector is associated with the source column name."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"sepal length (cm) 0.111118\n",
"sepal width (cm) 0.028009\n",
"petal length (cm) 0.455807\n",
"petal width (cm) 0.405066\n",
"Name: feature_importances, dtype: float64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"forest_pipeline.fit(iris)\n",
"pandas.Series(\n",
" data = forest_pipeline._final_estimator.feature_importances_,\n",
" index = forest_pipeline._dataframe_mapper.X_columns_,\n",
" name=\"feature_importances\"\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.11"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
1 change: 1 addition & 0 deletions sklearn_pandas/__init__.py
@@ -2,3 +2,4 @@

from .dataframe_mapper import DataFrameMapper # NOQA
from .cross_validation import cross_val_score, GridSearchCV, RandomizedSearchCV # NOQA
from .dataframe_pipeline import DataFramePipeline, make_dataframe_pipeline
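
The `dataframe_pipeline.py` module that this import pulls from is not shown in this excerpt of the diff. As a rough sketch only, the following is one minimal implementation consistent with every call the notebook makes (`make_dataframe_pipeline([...])`, `pipeline._dataframe_mapper`, `pipeline._final_estimator`, `mapper.extract_y(frame)`, `mapper.X_columns_`); the actual PR code may differ in naming and structure.

```python
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline


class DataFrameMapper(BaseEstimator, TransformerMixin):
    """Extract feature columns X, and optionally a target column y,
    from an input DataFrame."""

    def __init__(self, X_columns, y_column=None):
        self.X_columns = X_columns
        self.y_column = y_column

    def fit(self, X, y=None):
        # Record the source column names; the notebook reads this as
        # ``mapper.X_columns_`` to label feature_importances_.
        self.X_columns_ = list(self.X_columns)
        return self

    def transform(self, X):
        # Select the feature columns and hand a plain array to the estimator.
        return X[list(self.X_columns)].values

    def extract_y(self, frame):
        # Pull the target array out of the frame, e.g. for cross_val_score.
        return frame[self.y_column].values


def make_dataframe_pipeline(steps):
    """Wrap a [mapper, ..., estimator] list in an sklearn Pipeline and
    expose the mapper as the ``_dataframe_mapper`` attribute the
    notebook relies on (``_final_estimator`` is a standard Pipeline
    property)."""
    pipeline = Pipeline([(type(s).__name__.lower(), s) for s in steps])
    pipeline._dataframe_mapper = steps[0]
    return pipeline
```

Under this sketch, `pipeline.fit(frame, mapper.extract_y(frame))` works because the mapper's `transform` narrows the frame to the feature array before the final estimator sees it.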