From c1ac0a876363e0dff3c752b3e5ddbdb735395e5b Mon Sep 17 00:00:00 2001 From: eccabay Date: Wed, 25 Oct 2023 10:31:55 -0400 Subject: [PATCH] Add multiseries section to user guide --- docs/source/user_guide/timeseries.ipynb | 224 +++++++++++++++++++++++- evalml/demos/diabetes.py | 2 +- evalml/demos/weather.py | 2 +- 3 files changed, 224 insertions(+), 4 deletions(-) diff --git a/docs/source/user_guide/timeseries.ipynb b/docs/source/user_guide/timeseries.ipynb index cec0f391b8..ce2144f56c 100644 --- a/docs/source/user_guide/timeseries.ipynb +++ b/docs/source/user_guide/timeseries.ipynb @@ -1195,11 +1195,231 @@ "pipeline = automl.best_pipeline\n", "pipeline.graph()" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Multiseries time series problems\n", + "The above documentation has focused on single series time series data. Now, we take a look at multiseries time series forecasting.\n", + "\n", + "## What is multiseries?\n", + "Multiseries time series refers to data where we have multiple time series that we're trying to forecast simultaneously. For example, if we are a retailer who sells multiple products at our stores, we may have a single dataset that contains sales data for those multiple products. In this case, we would like to forecast sales for all products without splitting them off into separate datasets.\n", + "\n", + "There are two forms of multiseries forecasting - independent and dependent. Independent forecasting assumes that the separate series we're modeling are independent from each other, that is, the value of one series at a given point in time is unrelated to the value of a different series at any point in time. In our sales example, product A sales and product B sales would not impact each other at all. Dependent forecasting is the opposite, where it is assumed that all series have an impact on the others in the dataset.\n", + "\n", + "At the moment, EvalML only supports independent multiseries time series forecasting." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Preparing the data\n", + "\n", + "For this example, we will generate a synthetic dataset with just two example series. Many real-world scenarios will have more." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "\n", + "time_index = list(pd.date_range(start=\"1/1/2018\", periods=50)) * 2\n", + "series_id = pd.Series([1] * 50 + [2] * 50, dtype=\"str\")\n", + "\n", + "series_1_target = pd.Series(10 * np.sin(np.arange(50)) + np.random.rand(50))\n", + "series_2_target = pd.Series(range(50) + np.random.rand(50))\n", + "target = pd.Series(\n", + " pd.concat([series_1_target, series_2_target]), name=\"target\"\n", + ").reset_index(drop=True)\n", + "\n", + "data = (\n", + " pd.DataFrame(\n", + " {\n", + " \"date\": time_index,\n", + " \"series_id\": series_id,\n", + " \"target\": target,\n", + " },\n", + " )\n", + " .sort_values(\"date\")\n", + " .reset_index(drop=True)\n", + ")\n", + "X = data.drop([\"target\"], axis=1)\n", + "y = data[\"target\"]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This data has two unique series with their own target patterns, combined into a single dataset. Notice that both targets are combined into a single column, with each series' targets being delineated by the `series_id`. These values could be in essentially any order, as long as the date, series id, and target values for each observation are in the same row. Note that the series id column can be categorical; it does not need to be numeric as it is in this example. Also note that while this example does not contain any other non-target features, the `VARMAXRegressor` can handle them."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "data.head(6)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In EvalML, we refer to this general data format as being \"stacked\" data, where all the target values for the separate series are stacked into a single column. If we removed the series id column and broke the target values into separate columns, one for each series, that would be referred to as \"unstacked\" data. EvalML provides utility functions to convert between these formats: `unstack_multiseries`, `stack_data`, and `stack_X`. The stacking functions are differentiated by their return type, where `stack_data` will stack the input into a single column while `stack_X` will stack multiple series into their respective columns within a DataFrame." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from evalml.pipelines.utils import unstack_multiseries\n", + "\n", + "X_unstacked, y_unstacked = unstack_multiseries(\n", + " X, y, series_id=\"series_id\", time_index=\"date\", target_name=\"target\"\n", + ")\n", + "y_unstacked.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We provide a separate utility function to split your data into training and holdout sets for multiseries data." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from evalml.preprocessing import split_multiseries_data\n", + "\n", + "X_train, X_holdout, y_train, y_holdout = split_multiseries_data(\n", + " X, y, series_id=\"series_id\", time_index=\"date\"\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Running AutoMLSearch\n", + "\n", + "Running AutoMLSearch for multiseries time series forecasting is similar to running it for any single series time series problem. 
The data should be in its \"stacked\" format, and the problem configuration requires an additional parameter of `series_id`.\n", + "\n", + "Right now, the only estimator that supports multiseries time series regression forecasting outside of the baseline model is the `VARMAXRegressor` (more information can be found [here](https://www.statsmodels.org/stable/generated/statsmodels.tsa.statespace.varmax.VARMAX.html)). The VARMAX regressor is trained during the first batch twice - once on its own and once running STL decomposition beforehand, as with single series instances." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "problem_configuration = {\n", + " \"time_index\": \"date\",\n", + " \"forecast_horizon\": 5,\n", + " \"max_delay\": 10,\n", + " \"gap\": 0,\n", + " \"series_id\": \"series_id\",\n", + "}\n", + "\n", + "automl = AutoMLSearch(\n", + " X_train,\n", + " y_train,\n", + " problem_type=\"multiseries time series regression\",\n", + " problem_configuration=problem_configuration,\n", + ")\n", + "automl.search()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "automl.rankings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Visualizing the data and results\n", + "\n", + "As with any other time series problem, EvalML provides utility functions to visualize the data.\n", + "\n", + "### Decomposition\n", + "Plotting the decomposition of our multiseries data displays each series' decomposition in order. The decomposition object, unlike AutoMLSearch, requires unstacked data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "decomposer = STLDecomposer(time_index=\"date\", series_id=\"series_id\")\n", + "decomposer.fit(X_unstacked, y_unstacked)\n", + "\n", + "decomposer.plot_decomposition(X_unstacked, y_unstacked)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prediction vs actual\n", + "\n", + "Similarly, we can visualize the performance of our models by examining the prediction compared to the actual values over time. If we'd like to see the results of a single series, rather than all of them, we can use the argument `single_series`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pipeline = automl.best_pipeline\n", + "fig = graph_prediction_vs_actual_over_time(\n", + " pipeline, X_holdout, y_holdout, X_train, y_train, dates=X_holdout[\"date\"]\n", + ")\n", + "fig" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "fig = graph_prediction_vs_actual_over_time(\n", + " pipeline,\n", + " X_holdout,\n", + " y_holdout,\n", + " X_train,\n", + " y_train,\n", + " dates=X_holdout[\"date\"],\n", + " single_series=\"2\",\n", + ")\n", + "fig" + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -1213,7 +1433,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.6" + "version": "3.10.13" } }, "nbformat": 4, diff --git a/evalml/demos/diabetes.py b/evalml/demos/diabetes.py index 69fb17efcf..dc97685492 100644 --- a/evalml/demos/diabetes.py +++ b/evalml/demos/diabetes.py @@ -8,7 +8,7 @@ def load_diabetes(): - """Load diabetes dataset. Used for regression problem. + """Load diabetes dataset. Used for regression problems. 
Returns: (pd.Dataframe, pd.Series): X and y diff --git a/evalml/demos/weather.py b/evalml/demos/weather.py index 188d04d1e7..fcac835471 100644 --- a/evalml/demos/weather.py +++ b/evalml/demos/weather.py @@ -7,7 +7,7 @@ def load_weather(): - """Load the Australian daily-min-termperatures weather dataset. + """Load the Australian daily-min-temperatures weather dataset. Returns: (pd.Dataframe, pd.Series): X and y