diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a2f0b14
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.ipynb_checkpoints
+__pycache__
+
+notebooks/kdd_cup_98/tmp/
+notebooks/kaggle_acquire_valued_shoppers_challenge/tmp/
diff --git a/README.md b/README.md
index 28e880d..3bb863f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,77 @@
+# Updating project to work with TensorFlow 2.18
+
+This repo is a fork of the [Customer Lifetime Value](https://github.com/google/lifetime_value) project from Google. A pull request for these changes has been submitted. The changes in this repo enable the use of TensorFlow 2.18.
+
+The code has been tested on the Ubuntu 24 operating system running Python 3.12 with an NVIDIA RTX A5000 graphics card.
+
+## List of changes
+
+- updated the package name from sklearn to scikit-learn in setup.py
+
+- updated the notebooks in the notebooks folder to:
+
+  - use a `DATA_FOLDER` variable for the location of input and output files.
+
+  - replace `%%shell` blocks with `%%bash`, since the Colab-specific `%%shell` magic is not supported in standard Jupyter.
+
+  - add an extra dimension to y_train and y_eval in the *fit* call to
+make them two-dimensional arrays. Without this, the ZILN loss (ltv.zero_inflated_lognormal_loss)
+fails because it checks that the target variable is two-dimensional. This fix, together with the next two, is sketched after this list.
+
+  - remove the quote characters around the company value in calls to the pandas *query*
+function in the notebooks of the *kaggle_acquire_valued_shoppers_challenge* folder. Quoting may
+have worked in previous versions of pandas, but it now silently returns an empty dataframe when
+a string is compared against a numeric column.
+
+  - replace references to LinearModel with a Sequential linear model, as this class is no longer supported.
+
+  - move the numeric input field in kdd_cup_98/regression.ipynb to the last position.
+Because of this input's shape, (21,), and the presence of other features, TensorFlow
+throws an error during the call to the *fit* method if this input is not last.
+
+- added an environment.yml file recording the packages of the conda environment used to build this project, including the NVIDIA libraries.
+
+- added a requirements.txt file.
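+
+The following self-contained sketch illustrates the pandas *query* fix, the Sequential replacement for LinearModel, and the extra target dimension. The toy data, feature count and mse loss here are stand-ins; the notebooks use the ZILN loss with three output units.
+
+```
+import numpy as np
+import pandas as pd
+import tensorflow as tf
+
+# pandas query: quoting a value compared against a numeric column
+# silently matches nothing.
+df = pd.DataFrame({'company': [104900040], 'purchaseamount': [9.99]})
+assert df.query("company=='104900040'").empty    # string vs numeric: no rows
+assert len(df.query("company==104900040")) == 1  # unquoted value matches
+
+# A Sequential model with a single Dense layer replaces
+# tf.keras.experimental.LinearModel.
+x_train = np.random.rand(100, 16).astype('float32')  # toy data, 16 features
+y_train = np.random.rand(100).astype('float32')
+model = tf.keras.Sequential([
+    tf.keras.layers.Input(shape=(x_train.shape[1],)),
+    tf.keras.layers.Dense(1, activation=None),
+])
+model.compile(loss='mse', optimizer='adam')
+
+# The ZILN loss checks that targets are two-dimensional, hence the
+# added trailing axis in the notebooks' fit calls.
+model.fit(x=x_train, y=y_train[:, np.newaxis], epochs=1, verbose=0)
+```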
+
+## TLDR
+
+### kaggle acquire valued shoppers challenge
+
+There are three notebooks in the folder *notebooks/kaggle_acquire_valued_shoppers_challenge*:
+
+- **preprocess_data.ipynb** contains the code for processing the raw transaction file
+into company-specific feature files. This code is repeated in the other two
+notebooks. The preprocessing involves:
+
+  - Filtering records. Only transactions with positive purchase amounts are selected; this excludes all returns, which have negative values, so neither the label nor the calibration value reflects returns.
+
+  - Generating the calibration value. This is the total purchase amount summed over the first day of shopping.
+
+  - Generating calibration attributes. The 'chain', 'dept', 'category', 'brand' and
+  'productmeasure' values are taken from the most expensive transaction on the first day of shopping; all other transactions are ignored. Any null values for these attributes are replaced by UNKNOWN.
+
+  - Generating the label/holdout value. This is the total amount purchased by a customer in the year following the first day of shopping. The full preprocessing is sketched after this list.
+
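+The sketch below condenses the preprocess function shared by these notebooks:
+
+```
+import numpy as np
+import pandas as pd
+
+def preprocess(df):
+  # Keep only positive purchases; returns (negative amounts) are dropped.
+  df = df.query('purchaseamount>0')
+  df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')
+  df['start_date'] = df.groupby('id')['date'].transform('min')
+
+  # Calibration value: total spend on the first day of shopping.
+  calibration_value = (
+      df.query('date==start_date').groupby('id')['purchaseamount'].sum()
+      .rename('calibration_value').reset_index())
+
+  # Holdout value/label: total spend in the following year.
+  mask = ((df['date'] > df['start_date']) &
+          (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))
+  holdout_value = (
+      df[mask].groupby('id')['purchaseamount'].sum()
+      .rename('holdout_value').reset_index())
+
+  # Calibration attributes: from the most expensive first-day transaction.
+  attrs = (
+      df.query('date==start_date')
+      .sort_values('purchaseamount', ascending=False).groupby('id')
+      [['chain', 'dept', 'category', 'brand', 'productmeasure']]
+      .first().reset_index())
+
+  customer = (calibration_value.merge(attrs, how='left', on='id')
+              .merge(holdout_value, how='left', on='id'))
+  customer['holdout_value'] = customer['holdout_value'].fillna(0.)
+  customer[attrs.columns[1:]] = customer[attrs.columns[1:]].fillna('UNKNOWN')
+  return customer
+```
+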
+The zero_inflated_lognormal_loss function, used by both the regression and classification notebooks, requires three inputs, which are generated as the three output nodes of these models, as sketched below.
+
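+For orientation, here is a sketch of how the three outputs are consumed, assuming the library's standard ZILN parameterization (purchase logit, lognormal loc, lognormal scale):
+
+```
+import tensorflow as tf
+
+def ziln_pred(logits):
+  # logits[..., :1]: purchase-probability logit
+  # logits[..., 1:2]: lognormal loc; logits[..., 2:]: scale (via softplus)
+  positive_prob = tf.sigmoid(logits[..., :1])
+  loc = logits[..., 1:2]
+  scale = tf.math.softplus(logits[..., 2:])
+  # Expected LTV: P(purchase) * mean of the lognormal distribution.
+  return positive_prob * tf.exp(loc + 0.5 * tf.square(scale))
+```
+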
+- **regression.ipynb**
+
+To predict using a regression model, call the initial *predict* function, then pass all three output node values to the *zero_inflated_lognormal_pred* function, e.g.
+
+```
+logits = model.predict(x=x_eval, batch_size=1024)
+y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()
+```
+
+- **classification.ipynb**
+
+To predict using a classification model, call the initial *predict* function, then apply the sigmoid function to the first of the three output node values, e.g.
+
+```
+logits = model.predict(x=x_eval, batch_size=1024)
+y_pred = K.sigmoid(logits[..., :1]).numpy().flatten()
+```
+
# Lifetime Value
Accurate predictions of customers’ lifetime value (LTV) given their attributes
@@ -32,20 +106,20 @@ A Deep Probabilistic Model for Customer Lifetime Value Prediction.
The easiest way is probably using pip:
```
-pip install -q git+https://github.com/google/lifetime_value
+pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value
```
If you are using a machine without admin rights, you can do:
```
-pip install -q git+https://github.com/google/lifetime_value --user
+pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value --user
```
If you are using [Google Colab](https://colab.research.google.com/), just add
"!" to the beginning:
```
-!pip install -q git+https://github.com/google/lifetime_value
+!pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value
```
The package works for Python 3 only.
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..6890362
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,184 @@
+name: clv-google
+channels:
+ - defaults
+dependencies:
+ - _libgcc_mutex=0.1=main
+ - _openmp_mutex=5.1=1_gnu
+ - bzip2=1.0.8=h5eee18b_6
+ - ca-certificates=2024.9.24=h06a4308_0
+ - expat=2.6.3=h6a678d5_0
+ - ld_impl_linux-64=2.40=h12ee557_0
+ - libffi=3.4.4=h6a678d5_1
+ - libgcc-ng=11.2.0=h1234567_1
+ - libgomp=11.2.0=h1234567_1
+ - libstdcxx-ng=11.2.0=h1234567_1
+ - libuuid=1.41.5=h5eee18b_0
+ - ncurses=6.4=h6a678d5_0
+ - openssl=3.0.15=h5eee18b_0
+ - pip=24.2=py312h06a4308_0
+ - python=3.12.7=h5148396_0
+ - readline=8.2=h5eee18b_0
+ - setuptools=75.1.0=py312h06a4308_0
+ - sqlite=3.45.3=h5eee18b_0
+ - tk=8.6.14=h39e8969_0
+ - wheel=0.44.0=py312h06a4308_0
+ - xz=5.4.6=h5eee18b_1
+ - zlib=1.2.13=h5eee18b_1
+ - pip:
+ - absl-py==2.1.0
+ - anyio==4.6.2.post1
+ - argon2-cffi==23.1.0
+ - argon2-cffi-bindings==21.2.0
+ - arrow==1.3.0
+ - asttokens==2.4.1
+ - astunparse==1.6.3
+ - async-lru==2.0.4
+ - attrs==24.2.0
+ - babel==2.16.0
+ - beautifulsoup4==4.12.3
+ - bleach==6.2.0
+ - certifi==2024.8.30
+ - cffi==1.17.1
+ - charset-normalizer==3.4.0
+ - cloudpickle==3.1.0
+ - comm==0.2.2
+ - contourpy==1.3.1
+ - cycler==0.12.1
+ - debugpy==1.8.8
+ - decorator==5.1.1
+ - defusedxml==0.7.1
+ - dm-tree==0.1.8
+ - executing==2.1.0
+ - fastjsonschema==2.20.0
+ - flatbuffers==24.3.25
+ - fonttools==4.55.0
+ - fqdn==1.5.1
+ - gast==0.6.0
+ - google-pasta==0.2.0
+ - grpcio==1.68.0
+ - h11==0.14.0
+ - h5py==3.12.1
+ - httpcore==1.0.7
+ - httpx==0.27.2
+ - idna==3.10
+ - ipykernel==6.29.5
+ - ipython==8.29.0
+ - ipywidgets==8.1.5
+ - isoduration==20.11.0
+ - jedi==0.19.2
+ - jinja2==3.1.4
+ - joblib==1.4.2
+ - json5==0.9.28
+ - jsonpointer==3.0.0
+ - jsonschema==4.23.0
+ - jsonschema-specifications==2024.10.1
+ - jupyter==1.1.1
+ - jupyter-client==8.6.3
+ - jupyter-console==6.6.3
+ - jupyter-core==5.7.2
+ - jupyter-events==0.10.0
+ - jupyter-lsp==2.2.5
+ - jupyter-server==2.14.2
+ - jupyter-server-terminals==0.5.3
+ - jupyterlab==4.2.6
+ - jupyterlab-pygments==0.3.0
+ - jupyterlab-server==2.27.3
+ - jupyterlab-widgets==3.0.13
+ - kaggle==1.6.17
+ - keras==3.6.0
+ - kiwisolver==1.4.7
+ - libclang==18.1.1
+ - lifetime-value==0.1
+ - markdown==3.7
+ - markdown-it-py==3.0.0
+ - markupsafe==3.0.2
+ - matplotlib==3.9.2
+ - matplotlib-inline==0.1.7
+ - mdurl==0.1.2
+ - mistune==3.0.2
+ - ml-dtypes==0.4.1
+ - namex==0.0.8
+ - nbclient==0.10.0
+ - nbconvert==7.16.4
+ - nbformat==5.10.4
+ - nest-asyncio==1.6.0
+ - notebook==7.2.2
+ - notebook-shim==0.2.4
+ - numpy==2.0.2
+ - nvidia-cublas-cu12==12.5.3.2
+ - nvidia-cuda-cupti-cu12==12.5.82
+ - nvidia-cuda-nvcc-cu12==12.5.82
+ - nvidia-cuda-nvrtc-cu12==12.5.82
+ - nvidia-cuda-runtime-cu12==12.5.82
+ - nvidia-cudnn-cu12==9.3.0.75
+ - nvidia-cufft-cu12==11.2.3.61
+ - nvidia-curand-cu12==10.3.6.82
+ - nvidia-cusolver-cu12==11.6.3.83
+ - nvidia-cusparse-cu12==12.5.1.3
+ - nvidia-nccl-cu12==2.21.5
+ - nvidia-nvjitlink-cu12==12.5.82
+ - opt-einsum==3.4.0
+ - optree==0.13.1
+ - overrides==7.7.0
+ - packaging==24.2
+ - pandas==2.2.3
+ - pandocfilters==1.5.1
+ - parso==0.8.4
+ - pexpect==4.9.0
+ - pillow==11.0.0
+ - platformdirs==4.3.6
+ - prometheus-client==0.21.0
+ - prompt-toolkit==3.0.48
+ - protobuf==5.28.3
+ - psutil==6.1.0
+ - ptyprocess==0.7.0
+ - pure-eval==0.2.3
+ - pycparser==2.22
+ - pydot==3.0.2
+ - pygments==2.18.0
+ - pyparsing==3.2.0
+ - python-dateutil==2.9.0.post0
+ - python-json-logger==2.0.7
+ - python-slugify==8.0.4
+ - pytz==2024.2
+ - pyyaml==6.0.2
+ - pyzmq==26.2.0
+ - referencing==0.35.1
+ - requests==2.32.3
+ - rfc3339-validator==0.1.4
+ - rfc3986-validator==0.1.1
+ - rich==13.9.4
+ - rpds-py==0.21.0
+ - scikit-learn==1.5.2
+ - scipy==1.14.1
+ - seaborn==0.13.2
+ - send2trash==1.8.3
+ - six==1.16.0
+ - sniffio==1.3.1
+ - soupsieve==2.6
+ - stack-data==0.6.3
+ - tensorboard==2.18.0
+ - tensorboard-data-server==0.7.2
+ - tensorflow==2.18.0
+ - tensorflow-probability==0.25.0
+ - termcolor==2.5.0
+ - terminado==0.18.1
+ - text-unidecode==1.3
+ - tf-keras==2.18.0
+ - threadpoolctl==3.5.0
+ - tinycss2==1.4.0
+ - tornado==6.4.1
+ - tqdm==4.67.0
+ - traitlets==5.14.3
+ - types-python-dateutil==2.9.0.20241003
+ - typing-extensions==4.12.2
+ - tzdata==2024.2
+ - uri-template==1.3.0
+ - urllib3==2.2.3
+ - wcwidth==0.2.13
+ - webcolors==24.11.1
+ - webencodings==0.5.1
+ - websocket-client==1.8.0
+ - werkzeug==3.1.3
+ - widgetsnbextension==4.0.13
+ - wrapt==1.16.0
diff --git a/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb b/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb
index 8982e00..0fd68c2 100644
--- a/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb
+++ b/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb
@@ -1,811 +1,838 @@
{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "znEK1MNRXqWY"
- },
- "outputs": [],
- "source": [
- "#@title Copyright 2019 The Lifetime Value Authors.\n",
- "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
- "# you may not use this file except in compliance with the License.\n",
- "# You may obtain a copy of the License at\n",
- "#\n",
- "# https://www.apache.org/licenses/LICENSE-2.0\n",
- "#\n",
- "# Unless required by applicable law or agreed to in writing, software\n",
- "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
- "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
- "# See the License for the specific language governing permissions and\n",
- "# limitations under the License.\n",
- "# ============================================================================"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "3oNqWmn530N-"
- },
- "source": [
- "# Churn Prediction for Kaggle Acquire Valued Customer Challenge"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "XKydJ9qF4KVm"
- },
- "source": [
- "\u003ctable align=\"left\"\u003e\n",
- " \u003ctd\u003e\n",
- " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
- " \u003c/td\u003e\n",
- " \u003ctd\u003e\n",
- " \u003ca target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
- " \u003c/td\u003e\n",
- "\u003c/table\u003e"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "KObdQwyXH2mC"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "import tqdm\n",
- "from sklearn import metrics\n",
- "from sklearn import model_selection\n",
- "from sklearn import preprocessing\n",
- "import tensorflow as tf\n",
- "from tensorflow import keras\n",
- "from tensorflow.keras import backend as K\n",
- "import tensorflow_probability as tfp\n",
- "from typing import Sequence\n",
- "\n",
- "# install and import ltv\n",
- "!pip install -q git+https://github.com/google/lifetime_value\n",
- "import lifetime_value as ltv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "K41RmAfNXtu_"
- },
- "outputs": [],
- "source": [
- "tfd = tfp.distributions\n",
- "pd.options.mode.chained_assignment = None # default='warn'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "RzTaK6fFXMWT"
- },
- "source": [
- "## Global variables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "VQVhF3fhNEr2"
- },
- "outputs": [],
- "source": [
- "COMPANY = '104900040' # @param { isTemplate: true, type: 'string'}\n",
- "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['bce', 'ziln']\n",
- "MODEL = 'linear' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n",
- "LEARNING_RATE = 0.0002 # @param { isTemplate: true}\n",
- "EPOCHS = 400 # @param {type: 'integer'}\n",
- "OUTPUT_CSV_FOLDER = '/tmp/lifetime-value/result' # @param { isTemplate: true, type: 'string'}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "g7dg8TwYbxnl"
- },
- "outputs": [],
- "source": [
- "CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']\n",
- "NUMERIC_FEATURES = ['log_calibration_value']\n",
- "\n",
- "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "I_nbvZjMuj_z"
- },
- "source": [
- "## Data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "SFi0JMPu138h"
- },
- "source": [
- "### Download data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "krXMbrkVNtdN"
- },
- "source": [
- "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n",
- "```\n",
- "%%shell\n",
- "mkdir ~/.kaggle\n",
- "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} \u003e ~/.kaggle/kaggle.json\n",
- "pip install kaggle\n",
- "```"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "0gf4ipd-14x0"
- },
- "outputs": [],
- "source": [
- "%%shell\n",
- "if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]\n",
- "then\n",
- " echo \"File already exists, no need to download.\"\n",
- "else\n",
- " rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " mkdir -p /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " cd /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " kaggle competitions download -c acquire-valued-shoppers-challenge\n",
- " echo \"Unzip file. This may take 10 min.\"\n",
- " gunzip transactions.csv.gz\n",
- "fi"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "V4zoAS25uj_7"
- },
- "source": [
- "### Load transaction csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "5tIMvE3dW1Ky"
- },
- "outputs": [],
- "source": [
- "def load_transaction_data(company):\n",
- " all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'\n",
- " one_company_data_filename = (\n",
- " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'\n",
- " .format(COMPANY))\n",
- " if os.path.isfile(one_company_data_filename):\n",
- " df = pd.read_csv(one_company_data_filename)\n",
- " else:\n",
- " data_list = []\n",
- " chunksize = 10**6\n",
- " # 350 iterations\n",
- " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n",
- " data_list.append(chunk.query(\"company=='{}'\".format(company)))\n",
- " df = pd.concat(data_list, axis=0)\n",
- " df.to_csv(one_company_data_filename, index=None)\n",
- " return df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "9ra4bfwCVwKn"
- },
- "source": [
- "### Preprocess data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "PlJl5g9Delmi"
- },
- "outputs": [],
- "source": [
- "def preprocess(df):\n",
- " df = df.query('purchaseamount\u003e0')\n",
- " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n",
- " df['start_date'] = df.groupby('id')['date'].transform('min')\n",
- "\n",
- " # Compute calibration values\n",
- " calibration_value = (\n",
- " df.query('date==start_date').groupby('id')\n",
- " ['purchaseamount'].sum().reset_index())\n",
- " calibration_value.columns = ['id', 'calibration_value']\n",
- "\n",
- " # Compute holdout values\n",
- " one_year_holdout_window_mask = (\n",
- " (df['date'] \u003e df['start_date']) \u0026\n",
- " (df['date'] \u003c= df['start_date'] + np.timedelta64(365, 'D')))\n",
- " holdout_value = (\n",
- " df[one_year_holdout_window_mask].groupby('id')\n",
- " ['purchaseamount'].sum().reset_index())\n",
- " holdout_value.columns = ['id', 'holdout_value']\n",
- "\n",
- " # Compute calibration attributes\n",
- " calibration_attributes = (\n",
- " df.query('date==start_date').sort_values(\n",
- " 'purchaseamount', ascending=False).groupby('id')[[\n",
- " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n",
- " ]].first().reset_index())\n",
- "\n",
- " # Merge dataframes\n",
- " customer_level_data = (\n",
- " calibration_value.merge(calibration_attributes, how='left',\n",
- " on='id').merge(\n",
- " holdout_value, how='left', on='id'))\n",
- " customer_level_data['holdout_value'] = (\n",
- " customer_level_data['holdout_value'].fillna(0.))\n",
- " customer_level_data[CATEGORICAL_FEATURES] = (\n",
- " customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))\n",
- "\n",
- " # Specify data types\n",
- " customer_level_data['log_calibration_value'] = (\n",
- " np.log(customer_level_data['calibration_value']).astype('float32'))\n",
- " customer_level_data['chain'] = (\n",
- " customer_level_data['chain'].astype('category'))\n",
- " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n",
- " customer_level_data['brand'] = (\n",
- " customer_level_data['brand'].astype('category'))\n",
- " customer_level_data['category'] = (\n",
- " customer_level_data['category'].astype('category'))\n",
- " customer_level_data['label'] = (\n",
- " customer_level_data['holdout_value'].astype('float32'))\n",
- " return customer_level_data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "fP3q6uuMoXhA"
- },
- "source": [
- "### Load customer-level csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "X8B4zV1xoeMX"
- },
- "outputs": [],
- "source": [
- "def load_customer_level_csv(company):\n",
- " customer_level_data_file = (\n",
- " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv'\n",
- " .format(company))\n",
- " if os.path.isfile(customer_level_data_file):\n",
- " customer_level_data = pd.read_csv(customer_level_data_file)\n",
- " else:\n",
- " customer_level_data = preprocess(load_transaction_data(company))\n",
- " for cat_col in CATEGORICAL_FEATURES:\n",
- " customer_level_data[cat_col] = (\n",
- " customer_level_data[cat_col].astype('category'))\n",
- " for num_col in [\n",
- " 'log_calibration_value', 'calibration_value', 'holdout_value'\n",
- " ]:\n",
- " customer_level_data[num_col] = (\n",
- " customer_level_data[num_col].astype('float32'))\n",
- "\n",
- " return customer_level_data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DWwMxpIEukAE"
- },
- "outputs": [],
- "source": [
- "# Processes data. 350 iteration in total. May take 10min.\n",
- "customer_level_data = load_customer_level_csv(COMPANY)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "09tqgvANtsil"
- },
- "source": [
- "We observe a mixture of zero and lognormal distribution of holdout value."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "BtF0z3VbmGev"
- },
- "outputs": [],
- "source": [
- "customer_level_data.label.apply(np.log1p).hist(bins=50)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "wLpgjEuofbdy"
- },
- "outputs": [],
- "source": [
- "customer_level_data.head().T"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "slIDJAaTcQeK"
- },
- "source": [
- "## Data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "i4kN0uk4kZ68"
- },
- "source": [
- "### Make train/eval"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "JjP5v6NiQfCX"
- },
- "outputs": [],
- "source": [
- "def linear_split(df):\n",
- " # get_dummies preserves numeric features.\n",
- " x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values\n",
- " y = df['label'].values\n",
- "\n",
- " x_train, x_eval, y_train, y_eval = model_selection.train_test_split(\n",
- " x, y, test_size=0.2, random_state=123)\n",
- "\n",
- " return x_train, x_eval, y_train, y_eval"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "KtWXwORJjaP4"
- },
- "outputs": [],
- "source": [
- "def dnn_split(df):\n",
- " for key in CATEGORICAL_FEATURES:\n",
- " encoder = preprocessing.LabelEncoder()\n",
- " df[key] = encoder.fit_transform(df[key])\n",
- "\n",
- " df_train, df_eval = model_selection.train_test_split(\n",
- " df, test_size=0.2, random_state=123)\n",
- "\n",
- " def feature_dict(df):\n",
- " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n",
- " features['numeric'] = df[NUMERIC_FEATURES].values\n",
- " return features\n",
- "\n",
- " x_train, y_train = feature_dict(df_train), df_train['label'].values\n",
- " x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values\n",
- "\n",
- " return x_train, x_eval, y_train, y_eval"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "lqbShWBzR4NE"
- },
- "source": [
- "## Model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Yh4Y4a89ooP3"
- },
- "outputs": [],
- "source": [
- "def linear_model(output_units):\n",
- " return tf.keras.experimental.LinearModel(output_units)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "W8yo3HLtrAE_"
- },
- "outputs": [],
- "source": [
- "def embedding_dim(x):\n",
- " return int(x**.25) + 1\n",
- "\n",
- "\n",
- "def embedding_layer(vocab_size):\n",
- " return tf.keras.Sequential([\n",
- " tf.keras.layers.Embedding(\n",
- " input_dim=vocab_size,\n",
- " output_dim=embedding_dim(vocab_size),\n",
- " input_length=1),\n",
- " tf.keras.layers.Flatten(),\n",
- " ])\n",
- "\n",
- "\n",
- "def dnn_model(output_units, df):\n",
- " numeric_input = tf.keras.layers.Input(\n",
- " shape=(len(NUMERIC_FEATURES),), name='numeric')\n",
- "\n",
- " embedding_inputs = [\n",
- " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n",
- " for key in CATEGORICAL_FEATURES\n",
- " ]\n",
- "\n",
- " embedding_outputs = [\n",
- " embedding_layer(vocab_size=df[key].nunique())(input)\n",
- " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n",
- " ]\n",
- "\n",
- " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n",
- " deep_model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(64, activation='relu'),\n",
- " tf.keras.layers.Dense(32, activation='relu'),\n",
- " tf.keras.layers.Dense(output_units),\n",
- " ])\n",
- " return tf.keras.Model(\n",
- " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "U8l-KzZ12fbK"
- },
- "source": [
- "### Train"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "45BHY6q7rQmI"
- },
- "outputs": [],
- "source": [
- "if LOSS == 'bce':\n",
- " loss = keras.losses.BinaryCrossentropy(from_logits=True)\n",
- " output_units = 1\n",
- "\n",
- "if LOSS == 'ziln':\n",
- " loss = ltv.zero_inflated_lognormal_loss\n",
- " output_units = 3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "7Jeou8bGrhll"
- },
- "outputs": [],
- "source": [
- "if MODEL == 'linear':\n",
- " x_train, x_eval, y_train, y_eval = linear_split(customer_level_data)\n",
- " model = linear_model(output_units)\n",
- "\n",
- "if MODEL == 'dnn':\n",
- " x_train, x_eval, y_train, y_eval = dnn_split(customer_level_data)\n",
- " model = dnn_model(output_units, customer_level_data)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "uF2IdTpAwiZV"
- },
- "outputs": [],
- "source": [
- "if LOSS == 'bce':\n",
- " y_train = (y_train \u003e 0).astype('float32')\n",
- " y_eval = (y_eval \u003e 0).astype('float32')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "_GQ-RlIAfT62"
- },
- "outputs": [],
- "source": [
- "model.compile(loss=loss, optimizer=keras.optimizers.Adam(lr=LEARNING_RATE))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "chEIOzq6rlJx"
- },
- "outputs": [],
- "source": [
- "callbacks = [\n",
- " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n",
- " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "-BjnHV7MWhK1"
- },
- "outputs": [],
- "source": [
- "history = model.fit(\n",
- " x=x_train,\n",
- " y=y_train,\n",
- " batch_size=1024,\n",
- " epochs=EPOCHS,\n",
- " verbose=2,\n",
- " callbacks=callbacks,\n",
- " validation_data=(x_eval, y_eval)).history"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "mAJGs5SebDeN"
- },
- "outputs": [],
- "source": [
- "pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "bHaiutmy2aYm"
- },
- "source": [
- "### Eval"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "l6E_5gYAYQMw"
- },
- "outputs": [],
- "source": [
- "logits = model.predict(x=x_eval, batch_size=1024)\n",
- "y_pred = K.sigmoid(logits[..., :1]).numpy().flatten()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ICTDpg4rxdlj"
- },
- "outputs": [],
- "source": [
- "y_true = (y_eval \u003e 0).astype('float32')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "POeY1gdKTwfx"
- },
- "outputs": [],
- "source": [
- "def classification_report(y_true: Sequence[int],\n",
- " y_pred: Sequence[float]) -\u003e pd.DataFrame:\n",
- " \"\"\"Report individual level classification metrics.\n",
- "\n",
- " Arguments:\n",
- " y_true: true binary labels.\n",
- " y_pred: predicted binary labels.\n",
- "\n",
- " Returns:\n",
- " out: dataframe with classification metrics as columns.\n",
- " \"\"\"\n",
- " out = pd.DataFrame(index=[0])\n",
- "\n",
- " out['AUC'] = metrics.roc_auc_score(y_true, y_pred)\n",
- " out['PR_AUC'] = metrics.average_precision_score(y_true, y_pred)\n",
- " out['precision'] = metrics.precision_score(y_true, 1 * (y_pred \u003e .5))\n",
- " out['recall'] = metrics.recall_score(y_true, 1 * (y_pred \u003e .5))\n",
- " out['f1'] = metrics.f1_score(y_true, 1 * (y_pred \u003e .5))\n",
- " return out"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "vGcWU2vFaeT1"
- },
- "outputs": [],
- "source": [
- "classification = classification_report(y_true, y_pred)\n",
- "classification"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-i_AbqhXcurk"
- },
- "source": [
- "### All metrics together"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Umqg1-0Bc1HS"
- },
- "outputs": [],
- "source": [
- "df_metrics = pd.DataFrame(\n",
- " {\n",
- " 'company': COMPANY,\n",
- " 'model': MODEL,\n",
- " 'loss': LOSS,\n",
- " 'label_mean': y_true.mean(),\n",
- " 'pred_mean': y_pred.mean(),\n",
- " 'AUC': classification.loc[0, 'AUC'],\n",
- " 'PR_AUC': classification.loc[0, 'PR_AUC'],\n",
- " 'precision': classification.loc[0, 'precision'],\n",
- " 'recall': classification.loc[0, 'recall'],\n",
- " 'f1': classification.loc[0, 'f1']\n",
- " },\n",
- " index=[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "1LV1Hs3xcxnd"
- },
- "outputs": [],
- "source": [
- "df_metrics[[\n",
- " 'company',\n",
- " 'model',\n",
- " 'loss',\n",
- " 'label_mean',\n",
- " 'pred_mean',\n",
- " 'AUC',\n",
- " 'PR_AUC',\n",
- " 'precision',\n",
- " 'recall',\n",
- " 'f1',\n",
- "]]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "UVy6lYn4mSrj"
- },
- "source": [
- "## Save"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "mtkQ4mqUEFsb"
- },
- "outputs": [],
- "source": [
- "output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "3qmLzJqOEFsm"
- },
- "outputs": [],
- "source": [
- "if not os.path.isdir(output_path):\n",
- " os.makedirs(output_path)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "61B5Zc_UEFsr"
- },
- "outputs": [],
- "source": [
- "output_file = os.path.join(output_path,\n",
- " '{}_classification_{}.csv'.format(MODEL, LOSS))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "gqglbXfwEFsv"
- },
- "outputs": [],
- "source": [
- "df_metrics.to_csv(output_file, index=False)"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "collapsed_sections": [],
- "last_runtime": {
- "build_target": "",
- "kind": "local"
- },
- "name": "classification.ipynb",
- "provenance": [],
- "toc_visible": true
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "znEK1MNRXqWY"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Copyright 2019 The Lifetime Value Authors.\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License.\n",
+ "# ============================================================================"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "3oNqWmn530N-"
+ },
+ "source": [
+ "# Churn Prediction for Kaggle Acquire Valued Customer Challenge"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "XKydJ9qF4KVm"
+ },
+ "source": [
+ "
"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KObdQwyXH2mC"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import tqdm\n",
+ "from sklearn import metrics\n",
+ "from sklearn import model_selection\n",
+ "from sklearn import preprocessing\n",
+ "import tensorflow as tf\n",
+ "from tensorflow import keras\n",
+ "from tensorflow.keras import backend as K\n",
+ "import tensorflow_probability as tfp\n",
+ "from typing import Sequence\n",
+ "\n",
+ "# install and import ltv\n",
+ "# !pip install -q git+https://github.com/google/lifetime_value\n",
+ "!pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value\n",
+ "import lifetime_value as ltv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "K41RmAfNXtu_"
+ },
+ "outputs": [],
+ "source": [
+ "tfd = tfp.distributions\n",
+ "pd.options.mode.chained_assignment = None # default='warn'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RzTaK6fFXMWT"
+ },
+ "source": [
+ "## Global variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VQVhF3fhNEr2"
+ },
+ "outputs": [],
+ "source": [
+ "COMPANY = '104900040' # @param { isTemplate: true, type: 'string'}\n",
+ "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['bce', 'ziln']\n",
+ "# LOSS = 'bce' # @param { isTemplate: true, type: 'string'} ['bce', 'ziln']\n",
+ "# MODEL = 'linear' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n",
+ "MODEL = 'dnn' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n",
+ "LEARNING_RATE = 0.0002 # @param { isTemplate: true}\n",
+ "EPOCHS = 400 # @param {type: 'integer'}\n",
+ "DATA_FOLDER = './tmp/acquire-valued-shoppers-challenge' # @param { isTemplate: true, type: 'string'}\n",
+ "OUTPUT_CSV_FOLDER = f'{DATA_FOLDER}/result' # @param { isTemplate: true, type: 'string'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "g7dg8TwYbxnl"
+ },
+ "outputs": [],
+ "source": [
+ "CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']\n",
+ "NUMERIC_FEATURES = ['log_calibration_value']\n",
+ "\n",
+ "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "I_nbvZjMuj_z"
+ },
+ "source": [
+ "## Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SFi0JMPu138h"
+ },
+ "source": [
+ "### Download data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "krXMbrkVNtdN"
+ },
+ "source": [
+ "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n",
+ "```\n",
+ "%%shell\n",
+ "mkdir ~/.kaggle\n",
+ "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} > ~/.kaggle/kaggle.json\n",
+ "pip install kaggle\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set it DATA_FOLDER as an environment variable\n",
+ "%env DATA_FOLDER=$DATA_FOLDER"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "0gf4ipd-14x0"
+ },
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "if [ -e $DATA_FOLDER/transactions.csv ]\n",
+ "then\n",
+ " echo \"File already exists, no need to download.\"\n",
+ "else\n",
+ " rm -rf $DATA_FOLDER\n",
+ " mkdir -p $DATA_FOLDER\n",
+ " cd $DATA_FOLDER\n",
+ " kaggle competitions download -c acquire-valued-shoppers-challenge\n",
+ " echo \"Unzip file. This may take 10 min.\"\n",
+ " unzip acquire-valued-shoppers-challenge.zip transactions.csv.gz\n",
+ " gunzip transactions.csv.gz\n",
+ "fi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "V4zoAS25uj_7"
+ },
+ "source": [
+ "### Load transaction csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "5tIMvE3dW1Ky"
+ },
+ "outputs": [],
+ "source": [
+ "def load_transaction_data(company):\n",
+ " all_data_filename = f'{DATA_FOLDER}/transactions.csv'\n",
+ " one_company_data_filename = f'{DATA_FOLDER}/transactions_company_{company}.csv'\n",
+ " if os.path.isfile(one_company_data_filename):\n",
+ " df = pd.read_csv(one_company_data_filename)\n",
+ " else:\n",
+ " data_list = []\n",
+ " chunksize = 10**6\n",
+ " # 350 iterations\n",
+ " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n",
+ " data_list.append(chunk.query(\"company=={}\".format(company)))\n",
+ " df = pd.concat(data_list, axis=0)\n",
+ " df.to_csv(one_company_data_filename, index=None)\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9ra4bfwCVwKn"
+ },
+ "source": [
+ "### Preprocess data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PlJl5g9Delmi"
+ },
+ "outputs": [],
+ "source": [
+ "def preprocess(df):\n",
+ " df = df.query('purchaseamount>0')\n",
+ " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n",
+ " df['start_date'] = df.groupby('id')['date'].transform('min')\n",
+ "\n",
+ " # Compute calibration values\n",
+ " calibration_value = (\n",
+ " df.query('date==start_date').groupby('id')\n",
+ " ['purchaseamount'].sum().reset_index())\n",
+ " calibration_value.columns = ['id', 'calibration_value']\n",
+ "\n",
+ " # Compute holdout values\n",
+ " one_year_holdout_window_mask = (\n",
+ " (df['date'] > df['start_date']) &\n",
+ " (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))\n",
+ " holdout_value = (\n",
+ " df[one_year_holdout_window_mask].groupby('id')\n",
+ " ['purchaseamount'].sum().reset_index())\n",
+ " holdout_value.columns = ['id', 'holdout_value']\n",
+ "\n",
+ " # Compute calibration attributes\n",
+ " calibration_attributes = (\n",
+ " df.query('date==start_date').sort_values(\n",
+ " 'purchaseamount', ascending=False).groupby('id')[[\n",
+ " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n",
+ " ]].first().reset_index())\n",
+ "\n",
+ " # Merge dataframes\n",
+ " customer_level_data = (\n",
+ " calibration_value.merge(calibration_attributes, how='left',\n",
+ " on='id').merge(\n",
+ " holdout_value, how='left', on='id'))\n",
+ " customer_level_data['holdout_value'] = (\n",
+ " customer_level_data['holdout_value'].fillna(0.))\n",
+ " customer_level_data[CATEGORICAL_FEATURES] = (\n",
+ " customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))\n",
+ "\n",
+ " # Specify data types\n",
+ " customer_level_data['log_calibration_value'] = (\n",
+ " np.log(customer_level_data['calibration_value']).astype('float32'))\n",
+ " customer_level_data['chain'] = (\n",
+ " customer_level_data['chain'].astype('category'))\n",
+ " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n",
+ " customer_level_data['brand'] = (\n",
+ " customer_level_data['brand'].astype('category'))\n",
+ " customer_level_data['category'] = (\n",
+ " customer_level_data['category'].astype('category'))\n",
+ " customer_level_data['label'] = (\n",
+ " customer_level_data['holdout_value'].astype('float32'))\n",
+ " return customer_level_data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fP3q6uuMoXhA"
+ },
+ "source": [
+ "### Load customer-level csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "X8B4zV1xoeMX"
+ },
+ "outputs": [],
+ "source": [
+ "def load_customer_level_csv(company):\n",
+ " customer_level_data_file = f'{DATA_FOLDER}/customer_level_data_company_{company}.csv'\n",
+ " if os.path.isfile(customer_level_data_file):\n",
+ " customer_level_data = pd.read_csv(customer_level_data_file)\n",
+ " else:\n",
+ " customer_level_data = preprocess(load_transaction_data(company))\n",
+ " for cat_col in CATEGORICAL_FEATURES:\n",
+ " customer_level_data[cat_col] = (\n",
+ " customer_level_data[cat_col].astype('category'))\n",
+ " for num_col in [\n",
+ " 'log_calibration_value', 'calibration_value', 'holdout_value'\n",
+ " ]:\n",
+ " customer_level_data[num_col] = (\n",
+ " customer_level_data[num_col].astype('float32'))\n",
+ "\n",
+ " return customer_level_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DWwMxpIEukAE"
+ },
+ "outputs": [],
+ "source": [
+ "# Processes data. 350 iteration in total. May take 10min.\n",
+ "customer_level_data = load_customer_level_csv(COMPANY)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "09tqgvANtsil"
+ },
+ "source": [
+ "We observe a mixture of zero and lognormal distribution of holdout value."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BtF0z3VbmGev"
+ },
+ "outputs": [],
+ "source": [
+ "customer_level_data.label.apply(np.log1p).hist(bins=50)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "wLpgjEuofbdy"
+ },
+ "outputs": [],
+ "source": [
+ "customer_level_data.head().T"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "slIDJAaTcQeK"
+ },
+ "source": [
+ "## Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i4kN0uk4kZ68"
+ },
+ "source": [
+ "### Make train/eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "JjP5v6NiQfCX"
+ },
+ "outputs": [],
+ "source": [
+ "def linear_split(df):\n",
+ " # get_dummies preserves numeric features.\n",
+ " x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values\n",
+ " y = df['label'].values\n",
+ "\n",
+ " x_train, x_eval, y_train, y_eval = model_selection.train_test_split(\n",
+ " x, y, test_size=0.2, random_state=123)\n",
+ "\n",
+ " return x_train, x_eval, y_train, y_eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KtWXwORJjaP4"
+ },
+ "outputs": [],
+ "source": [
+ "def dnn_split(df):\n",
+ " for key in CATEGORICAL_FEATURES:\n",
+ " encoder = preprocessing.LabelEncoder()\n",
+ " df[key] = encoder.fit_transform(df[key])\n",
+ "\n",
+ " df_train, df_eval = model_selection.train_test_split(\n",
+ " df, test_size=0.2, random_state=123)\n",
+ "\n",
+ " def feature_dict(df):\n",
+ " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n",
+ " features['numeric'] = df[NUMERIC_FEATURES].values\n",
+ " return features\n",
+ "\n",
+ " x_train, y_train = feature_dict(df_train), df_train['label'].values\n",
+ " x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values\n",
+ "\n",
+ " return x_train, x_eval, y_train, y_eval"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lqbShWBzR4NE"
+ },
+ "source": [
+ "## Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Yh4Y4a89ooP3"
+ },
+ "outputs": [],
+ "source": [
+ "def linear_model(output_units, input_dim):\n",
+ " return tf.keras.Sequential([\n",
+ " tf.keras.layers.Input(shape=(input_dim,)),\n",
+ " tf.keras.layers.Dense(output_units, activation=None)\n",
+ " ])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "W8yo3HLtrAE_"
+ },
+ "outputs": [],
+ "source": [
+ "def embedding_dim(x):\n",
+ " return int(x**.25) + 1\n",
+ "\n",
+ "\n",
+ "def embedding_layer(vocab_size):\n",
+ " return tf.keras.Sequential([\n",
+ " tf.keras.layers.Embedding(\n",
+ " input_dim=vocab_size,\n",
+ " output_dim=embedding_dim(vocab_size)\n",
+ " ),\n",
+ " tf.keras.layers.Flatten(),\n",
+ " ])\n",
+ "\n",
+ "\n",
+ "def dnn_model(output_units, df):\n",
+ " numeric_input = tf.keras.layers.Input(\n",
+ " shape=(len(NUMERIC_FEATURES),), name='numeric')\n",
+ "\n",
+ " embedding_inputs = [\n",
+ " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n",
+ " for key in CATEGORICAL_FEATURES\n",
+ " ]\n",
+ "\n",
+ " embedding_outputs = [\n",
+ " embedding_layer(vocab_size=df[key].nunique())(input)\n",
+ " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n",
+ " ]\n",
+ "\n",
+ " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n",
+ " deep_model = tf.keras.Sequential([\n",
+ " tf.keras.layers.Dense(64, activation='relu'),\n",
+ " tf.keras.layers.Dense(32, activation='relu'),\n",
+ " tf.keras.layers.Dense(output_units),\n",
+ " ])\n",
+ " return tf.keras.Model(\n",
+ " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "U8l-KzZ12fbK"
+ },
+ "source": [
+ "### Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "45BHY6q7rQmI"
+ },
+ "outputs": [],
+ "source": [
+ "if LOSS == 'bce':\n",
+ " loss = keras.losses.BinaryCrossentropy(from_logits=True)\n",
+ " output_units = 1\n",
+ "\n",
+ "if LOSS == 'ziln':\n",
+ " loss = ltv.zero_inflated_lognormal_loss\n",
+ " output_units = 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "7Jeou8bGrhll"
+ },
+ "outputs": [],
+ "source": [
+ "if MODEL == 'linear':\n",
+ " x_train, x_eval, y_train, y_eval = linear_split(customer_level_data)\n",
+ " model = linear_model(output_units, x_train.shape[1])\n",
+ "\n",
+ "if MODEL == 'dnn':\n",
+ " x_train, x_eval, y_train, y_eval = dnn_split(customer_level_data)\n",
+ " model = dnn_model(output_units, customer_level_data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "uF2IdTpAwiZV"
+ },
+ "outputs": [],
+ "source": [
+ "if LOSS == 'bce':\n",
+ " y_train = (y_train > 0).astype('float32')\n",
+ " y_eval = (y_eval > 0).astype('float32')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_GQ-RlIAfT62"
+ },
+ "outputs": [],
+ "source": [
+ "model.compile(loss=loss, optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "chEIOzq6rlJx"
+ },
+ "outputs": [],
+ "source": [
+ "callbacks = [\n",
+ " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n",
+ " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "-BjnHV7MWhK1"
+ },
+ "outputs": [],
+ "source": [
+ "history = model.fit(\n",
+ " x=x_train,\n",
+ " y=y_train[:, np.newaxis],\n",
+ " batch_size=1024,\n",
+ " epochs=EPOCHS,\n",
+ " verbose=2,\n",
+ " callbacks=callbacks,\n",
+ " validation_data=(x_eval, y_eval[:, np.newaxis])).history"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mAJGs5SebDeN"
+ },
+ "outputs": [],
+ "source": [
+ "pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bHaiutmy2aYm"
+ },
+ "source": [
+ "### Eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "l6E_5gYAYQMw"
+ },
+ "outputs": [],
+ "source": [
+ "logits = model.predict(x=x_eval, batch_size=1024)\n",
+ "y_pred = K.sigmoid(logits[..., :1]).numpy().flatten()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ICTDpg4rxdlj"
+ },
+ "outputs": [],
+ "source": [
+ "y_true = (y_eval > 0).astype('float32')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "POeY1gdKTwfx"
+ },
+ "outputs": [],
+ "source": [
+ "def classification_report(y_true: Sequence[int],\n",
+ " y_pred: Sequence[float]) -> pd.DataFrame:\n",
+ " \"\"\"Report individual level classification metrics.\n",
+ "\n",
+ " Arguments:\n",
+ " y_true: true binary labels.\n",
+ " y_pred: predicted binary labels.\n",
+ "\n",
+ " Returns:\n",
+ " out: dataframe with classification metrics as columns.\n",
+ " \"\"\"\n",
+ " out = pd.DataFrame(index=[0])\n",
+ "\n",
+ " out['AUC'] = metrics.roc_auc_score(y_true, y_pred)\n",
+ " out['PR_AUC'] = metrics.average_precision_score(y_true, y_pred)\n",
+ " out['precision'] = metrics.precision_score(y_true, 1 * (y_pred > .5))\n",
+ " out['recall'] = metrics.recall_score(y_true, 1 * (y_pred > .5))\n",
+ " out['f1'] = metrics.f1_score(y_true, 1 * (y_pred > .5))\n",
+ " return out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "vGcWU2vFaeT1"
+ },
+ "outputs": [],
+ "source": [
+ "classification = classification_report(y_true, y_pred)\n",
+ "classification"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-i_AbqhXcurk"
+ },
+ "source": [
+ "### All metrics together"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Umqg1-0Bc1HS"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics = pd.DataFrame(\n",
+ " {\n",
+ " 'company': COMPANY,\n",
+ " 'model': MODEL,\n",
+ " 'loss': LOSS,\n",
+ " 'label_mean': y_true.mean(),\n",
+ " 'pred_mean': y_pred.mean(),\n",
+ " 'AUC': classification.loc[0, 'AUC'],\n",
+ " 'PR_AUC': classification.loc[0, 'PR_AUC'],\n",
+ " 'precision': classification.loc[0, 'precision'],\n",
+ " 'recall': classification.loc[0, 'recall'],\n",
+ " 'f1': classification.loc[0, 'f1']\n",
+ " },\n",
+ " index=[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "1LV1Hs3xcxnd"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics[[\n",
+ " 'company',\n",
+ " 'model',\n",
+ " 'loss',\n",
+ " 'label_mean',\n",
+ " 'pred_mean',\n",
+ " 'AUC',\n",
+ " 'PR_AUC',\n",
+ " 'precision',\n",
+ " 'recall',\n",
+ " 'f1',\n",
+ "]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UVy6lYn4mSrj"
+ },
+ "source": [
+ "## Save"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mtkQ4mqUEFsb"
+ },
+ "outputs": [],
+ "source": [
+ "output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3qmLzJqOEFsm"
+ },
+ "outputs": [],
+ "source": [
+ "if not os.path.isdir(output_path):\n",
+ " os.makedirs(output_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "61B5Zc_UEFsr"
+ },
+ "outputs": [],
+ "source": [
+ "output_file = os.path.join(output_path,\n",
+ " '{}_classification_{}.csv'.format(MODEL, LOSS))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gqglbXfwEFsv"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics.to_csv(output_file, index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "last_runtime": {
+ "build_target": "",
+ "kind": "local"
+ },
+ "name": "classification.ipynb",
+ "provenance": [],
+ "toc_visible": true
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb b/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb
index 8e489ad..8af0d3a 100644
--- a/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb
+++ b/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb
@@ -1,319 +1,500 @@
{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "s2VGz60wbOiq"
- },
- "outputs": [],
- "source": [
- "#@title Copyright 2019 The Lifetime Value Authors.\n",
- "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
- "# you may not use this file except in compliance with the License.\n",
- "# You may obtain a copy of the License at\n",
- "#\n",
- "# https://www.apache.org/licenses/LICENSE-2.0\n",
- "#\n",
- "# Unless required by applicable law or agreed to in writing, software\n",
- "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
- "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
- "# See the License for the specific language governing permissions and\n",
- "# limitations under the License.\n",
- "# ============================================================================"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "eBoqlan65Q9T"
- },
- "source": [
- "\u003ctable align=\"left\"\u003e\n",
- " \u003ctd\u003e\n",
- " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
- " \u003c/td\u003e\n",
- " \u003ctd\u003e\n",
- " \u003ca target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
- " \u003c/td\u003e\n",
- "\u003c/table\u003e"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "KObdQwyXH2mC"
- },
- "outputs": [],
- "source": [
- "from __future__ import absolute_import\n",
- "from __future__ import division\n",
- "from __future__ import print_function\n",
- "\n",
- "\n",
- "import os\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "import tqdm\n",
- "import multiprocessing"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "K41RmAfNXtu_"
- },
- "outputs": [],
- "source": [
- "pd.options.mode.chained_assignment = None # default='warn'"
- ]
- },
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "s2VGz60wbOiq"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Copyright 2019 The Lifetime Value Authors.\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License.\n",
+ "# ============================================================================"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "eBoqlan65Q9T"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "KObdQwyXH2mC"
+ },
+ "outputs": [],
+ "source": [
+ "from __future__ import absolute_import\n",
+ "from __future__ import division\n",
+ "from __future__ import print_function\n",
+ "\n",
+ "\n",
+ "import os\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "import tqdm\n",
+ "import multiprocessing"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "K41RmAfNXtu_"
+ },
+ "outputs": [],
+ "source": [
+ "pd.options.mode.chained_assignment = None # default='warn'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DoN-PRvNuIti"
+ },
+ "source": [
+ "## Global variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "id": "3GGpDbxd3S5L"
+ },
+ "outputs": [],
+ "source": [
+ "COMPANYS = [\n",
+ " '10000', '101200010', '101410010', '101600010', '102100020', '102700020',\n",
+ " '102840020', '103000030', '103338333', '103400030', '103600030',\n",
+ " '103700030', '103800030', '104300040', '104400040', '104470040',\n",
+ " '104900040', '105100050', '105150050', '107800070'\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "DATA_FOLDER = './tmp/acquire-valued-shoppers-challenge' # @param { isTemplate: true, type: 'string'}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RzTaK6fFXMWT"
+ },
+ "source": [
+ "## Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SFi0JMPu138h"
+ },
+ "source": [
+ "### Download data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "krXMbrkVNtdN"
+ },
+ "source": [
+ "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n",
+ "```\n",
+ "%%shell\n",
+ "mkdir ~/.kaggle\n",
+ "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} > ~/.kaggle/kaggle.json\n",
+ "pip install kaggle\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {},
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "DoN-PRvNuIti"
- },
- "source": [
- "## Global variables"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "env: DATA_FOLDER=./tmp/acquire-valued-shoppers-challenge\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Set it DATA_FOLDER as an environment variable\n",
+ "%env DATA_FOLDER=$DATA_FOLDER"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "0gf4ipd-14x0"
+ },
+ "outputs": [
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "3GGpDbxd3S5L"
- },
- "outputs": [],
- "source": [
- "COMPANYS = [\n",
- " '10000', '101200010', '101410010', '101600010', '102100020', '102700020',\n",
- " '102840020', '103000030', '103338333', '103400030', '103600030',\n",
- " '103700030', '103800030', '104300040', '104400040', '104470040',\n",
- " '104900040', '105100050', '105150050', '107800070'\n",
- "]"
- ]
- },
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "File already exists, no need to download.\n"
+ ]
+ }
+ ],
+ "source": [
+ "%%bash\n",
+ "if [ -e $DATA_FOLDER/transactions.csv ]\n",
+ "then\n",
+ " echo \"File already exists, no need to download.\"\n",
+ "else\n",
+ " rm -rf $DATA_FOLDER\n",
+ " mkdir -p $DATA_FOLDER\n",
+ " cd $DATA_FOLDER\n",
+ " kaggle competitions download -c acquire-valued-shoppers-challenge\n",
+ " echo \"Unzip file. This may take 10 min.\"\n",
+ " unzip acquire-valued-shoppers-challenge.zip transactions.csv.gz\n",
+ " gunzip transactions.csv.gz\n",
+ "fi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IT53azGsa2a2"
+ },
+ "source": [
+ "### Load csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "id": "5tIMvE3dW1Ky"
+ },
+ "outputs": [],
+ "source": [
+ "def load_data(company):\n",
+ " all_data_filename = f'{DATA_FOLDER}/transactions.csv'\n",
+ " one_company_data_filename = f'{DATA_FOLDER}/transactions_company_{company}.csv'\n",
+ " if os.path.isfile(one_company_data_filename):\n",
+ " df = pd.read_csv(one_company_data_filename)\n",
+ " else:\n",
+ " data_list = []\n",
+ " chunksize = 10**6\n",
+ " # 350 iterations\n",
+ " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n",
+ " data_list.append(chunk.query(\"company=={}\".format(company)))\n",
+ " df = pd.concat(data_list, axis=0)\n",
+ " df.to_csv(one_company_data_filename, index=None)\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9ra4bfwCVwKn"
+ },
+ "source": [
+ "### Preprocess data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "PlJl5g9Delmi"
+ },
+ "outputs": [],
+ "source": [
+ "def preprocess(df):\n",
+ " df = df.query('purchaseamount>0')\n",
+ " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n",
+ " df['start_date'] = df.groupby('id')['date'].transform('min')\n",
+ "\n",
+ " # Compute calibration values\n",
+ " calibration_value = (\n",
+ " df.query('date==start_date').groupby('id')\n",
+ " ['purchaseamount'].sum().reset_index())\n",
+ " calibration_value.columns = ['id', 'calibration_value']\n",
+ "\n",
+ " # Compute holdout values\n",
+ " one_year_holdout_window_mask = (\n",
+ " (df['date'] > df['start_date']) &\n",
+ " (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))\n",
+ " holdout_value = (\n",
+ " df[one_year_holdout_window_mask].groupby('id')\n",
+ " ['purchaseamount'].sum().reset_index())\n",
+ " holdout_value.columns = ['id', 'holdout_value']\n",
+ "\n",
+ " # Compute calibration attributes\n",
+ " calibration_attributes = (\n",
+ " df.query('date==start_date').sort_values(\n",
+ " 'purchaseamount', ascending=False).groupby('id')[[\n",
+ " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n",
+ " ]].first().reset_index())\n",
+ "\n",
+ " # Merge dataframes\n",
+ " customer_level_data = (\n",
+ " calibration_value.merge(calibration_attributes, how='left',\n",
+ " on='id').merge(\n",
+ " holdout_value, how='left', on='id'))\n",
+ " customer_level_data['holdout_value'] = (\n",
+ " customer_level_data['holdout_value'].fillna(0.))\n",
+ " categorical_features = ([\n",
+ " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n",
+ " ])\n",
+ " customer_level_data[categorical_features] = (\n",
+ " customer_level_data[categorical_features].fillna('UNKNOWN'))\n",
+ "\n",
+ " # Specify data types\n",
+ " customer_level_data['log_calibration_value'] = (\n",
+ " np.log(customer_level_data['calibration_value']).astype('float32'))\n",
+ " customer_level_data['chain'] = (\n",
+ " customer_level_data['chain'].astype('category'))\n",
+ " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n",
+ " customer_level_data['brand'] = (\n",
+ " customer_level_data['brand'].astype('category'))\n",
+ " customer_level_data['category'] = (\n",
+ " customer_level_data['category'].astype('category'))\n",
+ " customer_level_data['label'] = (\n",
+ " customer_level_data['holdout_value'].astype('float32'))\n",
+ " return customer_level_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "id": "Bx80J6Ztferj"
+ },
+ "outputs": [],
+ "source": [
+ "def process(company):\n",
+ " print(\"Process company {}\".format(company))\n",
+ " transaction_level_data = load_data(company)\n",
+ " customer_level_data = preprocess(transaction_level_data)\n",
+ " customer_level_data_file = f\"{DATA_FOLDER}/customer_level_data_company_{company}.csv\"\n",
+ " customer_level_data.to_csv(customer_level_data_file, index=None)\n",
+ " print(\"Done company {}\".format(company))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Q05sKVnxi8mV"
+ },
+ "source": [
+ "This step may take a while to finish -- 10min-1hr depending on number of core in\n",
+ "the computer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {
+ "id": "88dVPdt5QWpu"
+ },
+ "outputs": [
{
- "cell_type": "markdown",
- "metadata": {
- "id": "RzTaK6fFXMWT"
- },
- "source": [
- "## Data"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Process company 102100020Process company 101200010Process company 102840020Process company 102700020Process company 10000Process company 103338333Process company 101410010Process company 101600010Process company 103700030Process company 103600030Process company 104300040Process company 103800030\n",
+ "Process company 103000030Process company 104400040Process company 104470040\n",
+ "\n",
+ "Process company 103400030\n",
+ "Process company 107800070\n",
+ "Process company 105150050\n",
+ "\n",
+ "\n",
+ "Process company 105100050\n",
+ "\n",
+ "\n",
+ "Process company 104900040\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n",
+ "\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "id": "SFi0JMPu138h"
- },
- "source": [
- "### Download data"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "350it [05:29, 1.06it/s]\n",
+ "350it [05:31, 1.06it/s]\n",
+ "350it [05:31, 1.06it/s]\n",
+ "350it [05:33, 1.05it/s]\n",
+ "350it [05:36, 1.04it/s]\n",
+ "350it [05:36, 1.04it/s]\n",
+ "350it [05:37, 1.04it/s]\n",
+ "350it [05:39, 1.03it/s]\n",
+ "350it [05:39, 1.03it/s]\n",
+ "350it [05:41, 1.03it/s]\n",
+ "350it [05:41, 1.02it/s]\n",
+ "350it [05:41, 1.02it/s]\n",
+ "350it [05:42, 1.02it/s]\n",
+ "345it [05:43, 1.12it/s]"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "id": "krXMbrkVNtdN"
- },
- "source": [
- "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n",
- "```\n",
- "%%shell\n",
- "mkdir ~/.kaggle\n",
- "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} \u003e ~/.kaggle/kaggle.json\n",
- "pip install kaggle\n",
- "```"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done company 103600030\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "0gf4ipd-14x0"
- },
- "outputs": [],
- "source": [
- "%%shell\n",
- "if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]\n",
- "then\n",
- " echo \"File already exists, no need to download.\"\n",
- "else\n",
- " rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " mkdir -p /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " cd /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " kaggle competitions download -c acquire-valued-shoppers-challenge\n",
- " echo \"Unzip file. This may take 10 min.\"\n",
- " gunzip transactions.csv.gz\n",
- "fi"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "350it [05:44, 1.01it/s]\n",
+ "345it [05:44, 1.21it/s]"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "id": "IT53azGsa2a2"
- },
- "source": [
- "### Load csv"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done company 103000030\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "5tIMvE3dW1Ky"
- },
- "outputs": [],
- "source": [
- "def load_data(company):\n",
- " all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'\n",
- " one_company_data_filename = (\n",
- " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'\n",
- " .format(company))\n",
- " if os.path.isfile(one_company_data_filename):\n",
- " df = pd.read_csv(one_company_data_filename)\n",
- " else:\n",
- " data_list = []\n",
- " chunksize = 10**6\n",
- " # 350 iterations\n",
- " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n",
- " data_list.append(chunk.query(\"company=='{}'\".format(company)))\n",
- " df = pd.concat(data_list, axis=0)\n",
- " df.to_csv(one_company_data_filename, index=None)\n",
- " return df"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "350it [05:45, 1.01it/s]\n",
+ "350it [05:46, 1.01it/s]\n",
+ "350it [05:47, 1.01it/s]\n",
+ "350it [05:47, 1.01it/s]\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "id": "9ra4bfwCVwKn"
- },
- "source": [
- "### Preprocess data"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done company 104470040\n",
+ "Done company 105150050\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "PlJl5g9Delmi"
- },
- "outputs": [],
- "source": [
- "def preprocess(df):\n",
- " df = df.query('purchaseamount\u003e0')\n",
- " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n",
- " df['start_date'] = df.groupby('id')['date'].transform('min')\n",
- "\n",
- " # Compute calibration values\n",
- " calibration_value = (\n",
- " df.query('date==start_date').groupby('id')\n",
- " ['purchaseamount'].sum().reset_index())\n",
- " calibration_value.columns = ['id', 'calibration_value']\n",
- "\n",
- " # Compute holdout values\n",
- " one_year_holdout_window_mask = (\n",
- " (df['date'] \u003e df['start_date']) \u0026\n",
- " (df['date'] \u003c= df['start_date'] + np.timedelta64(365, 'D')))\n",
- " holdout_value = (\n",
- " df[one_year_holdout_window_mask].groupby('id')\n",
- " ['purchaseamount'].sum().reset_index())\n",
- " holdout_value.columns = ['id', 'holdout_value']\n",
- "\n",
- " # Compute calibration attributes\n",
- " calibration_attributes = (\n",
- " df.query('date==start_date').sort_values(\n",
- " 'purchaseamount', ascending=False).groupby('id')[[\n",
- " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n",
- " ]].first().reset_index())\n",
- "\n",
- " # Merge dataframes\n",
- " customer_level_data = (\n",
- " calibration_value.merge(calibration_attributes, how='left',\n",
- " on='id').merge(\n",
- " holdout_value, how='left', on='id'))\n",
- " customer_level_data['holdout_value'] = (\n",
- " customer_level_data['holdout_value'].fillna(0.))\n",
- " categorical_features = ([\n",
- " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n",
- " ])\n",
- " customer_level_data[categorical_features] = (\n",
- " customer_level_data[categorical_features].fillna('UNKNOWN'))\n",
- "\n",
- " # Specify data types\n",
- " customer_level_data['log_calibration_value'] = (\n",
- " np.log(customer_level_data['calibration_value']).astype('float32'))\n",
- " customer_level_data['chain'] = (\n",
- " customer_level_data['chain'].astype('category'))\n",
- " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n",
- " customer_level_data['brand'] = (\n",
- " customer_level_data['brand'].astype('category'))\n",
- " customer_level_data['category'] = (\n",
- " customer_level_data['category'].astype('category'))\n",
- " customer_level_data['label'] = (\n",
- " customer_level_data['holdout_value'].astype('float32'))\n",
- " return customer_level_data"
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "350it [05:48, 1.00it/s]\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Bx80J6Ztferj"
- },
- "outputs": [],
- "source": [
- "def process(company):\n",
- " print(\"Process company {}\".format(company))\n",
- " transaction_level_data = load_data(company)\n",
- " customer_level_data = preprocess(transaction_level_data)\n",
- " customer_level_data_file = (\n",
- " \"/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv\"\n",
- " .format(company))\n",
- " customer_level_data.to_csv(customer_level_data_file, index=None)\n",
- " print(\"Done company {}\".format(company))"
- ]
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done company 107800070\n"
+ ]
},
{
- "cell_type": "markdown",
- "metadata": {
- "id": "Q05sKVnxi8mV"
- },
- "source": [
- "This step may take a while to finish -- 10min-1hr depending on number of core in\n",
- "the computer."
- ]
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "350it [05:51, 1.00s/it]\n"
+ ]
},
{
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "88dVPdt5QWpu"
- },
- "outputs": [],
- "source": [
- "p = multiprocessing.Pool(multiprocessing.cpu_count())\n",
- "_ = p.map(process, COMPANYS)"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "collapsed_sections": [],
- "last_runtime": {
- "build_target": "",
- "kind": "local"
- },
- "name": "preprocess_data.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 2",
- "name": "python2"
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Done company 102700020\n",
+ "Done company 101200010\n",
+ "Done company 101410010\n",
+ "Done company 104900040\n",
+ "Done company 104300040\n",
+ "Done company 105100050\n",
+ "Done company 103400030\n",
+ "Done company 103800030\n",
+ "Done company 101600010\n",
+ "Done company 102100020\n",
+ "Done company 104400040\n",
+ "Done company 103338333\n",
+ "Done company 102840020\n",
+ "Done company 103700030\n",
+ "Done company 10000\n"
+ ]
}
+ ],
+ "source": [
+ "p = multiprocessing.Pool(multiprocessing.cpu_count())\n",
+ "_ = p.map(process, COMPANYS)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": []
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "last_runtime": {
+ "build_target": "",
+ "kind": "local"
+ },
+ "name": "preprocess_data.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
},
- "nbformat": 4,
- "nbformat_minor": 0
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb b/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb
index c4d00d5..3441e29 100644
--- a/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb
+++ b/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb
@@ -1,899 +1,935 @@
{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "5RoRxBv3bRjy"
- },
- "outputs": [],
- "source": [
- "#@title Copyright 2019 The Lifetime Value Authors.\n",
- "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
- "# you may not use this file except in compliance with the License.\n",
- "# You may obtain a copy of the License at\n",
- "#\n",
- "# https://www.apache.org/licenses/LICENSE-2.0\n",
- "#\n",
- "# Unless required by applicable law or agreed to in writing, software\n",
- "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
- "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
- "# See the License for the specific language governing permissions and\n",
- "# limitations under the License.\n",
- "# ============================================================================"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2tkQUXmWhqRY"
- },
- "source": [
- "# Lifetime Value prediction for Kaggle Acquire Valued Customer Challenge"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "Pw8bm9nV6YJ5"
- },
- "source": [
- "\u003ctable align=\"left\"\u003e\n",
- " \u003ctd\u003e\n",
- " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
- " \u003c/td\u003e\n",
- " \u003ctd\u003e\n",
- " \u003ca target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
- " \u003c/td\u003e\n",
- "\u003c/table\u003e"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "KObdQwyXH2mC"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "from scipy import stats\n",
- "import seaborn as sns\n",
- "from sklearn import model_selection\n",
- "from sklearn import preprocessing\n",
- "import tensorflow as tf\n",
- "from tensorflow import keras\n",
- "from tensorflow.keras import backend as K\n",
- "import tensorflow_probability as tfp\n",
- "import tqdm\n",
- "from typing import Sequence\n",
- "\n",
- "# install and import ltv\n",
- "!pip install -q git+https://github.com/google/lifetime_value\n",
- "import lifetime_value as ltv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "K41RmAfNXtu_"
- },
- "outputs": [],
- "source": [
- "tfd = tfp.distributions\n",
- "%config InlineBackend.figure_format='retina'\n",
- "sns.set_style('whitegrid')\n",
- "pd.options.mode.chained_assignment = None # default='warn'"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "DoN-PRvNuIti"
- },
- "source": [
- "## Global variables"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "3GGpDbxd3S5L"
- },
- "outputs": [],
- "source": [
- "COMPANY = '103600030' # @param { isTemplate: true, type: 'string'}\n",
- "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n",
- "MODEL = 'dnn' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n",
- "LEARNING_RATE = 0.0002 # @param { isTemplate: true}\n",
- "EPOCHS = 400 # @param { isTemplate: true, type: 'integer'}\n",
- "OUTPUT_CSV_FOLDER = '/tmp/lifetime-value/kaggle_acquire_valued_shoppers_challenge/result' # @param { isTemplate: true, type: 'string'}"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "UK9Y5NoMtm3X"
- },
- "outputs": [],
- "source": [
- "CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']\n",
- "NUMERIC_FEATURES = ['log_calibration_value']\n",
- "\n",
- "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "RzTaK6fFXMWT"
- },
- "source": [
- "## Data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "SFi0JMPu138h"
- },
- "source": [
- "### Download data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "krXMbrkVNtdN"
- },
- "source": [
- "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n",
- "```\n",
- "%%shell\n",
- "mkdir ~/.kaggle\n",
- "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} \u003e ~/.kaggle/kaggle.json\n",
- "pip install kaggle\n",
- "```"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "0gf4ipd-14x0"
- },
- "outputs": [],
- "source": [
- "%%shell\n",
- "if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]\n",
- "then\n",
- " echo \"File already exists, no need to download.\"\n",
- "else\n",
- " rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " mkdir -p /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " cd /tmp/lifetime-value/acquire-valued-shoppers-challenge\n",
- " kaggle competitions download -c acquire-valued-shoppers-challenge\n",
- " echo \"Unzip file. This may take 10 min.\"\n",
- " gunzip transactions.csv.gz\n",
- "fi"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "IT53azGsa2a2"
- },
- "source": [
- "### Load transaction csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "5tIMvE3dW1Ky"
- },
- "outputs": [],
- "source": [
- "def load_transaction_data(company):\n",
- " all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'\n",
- " one_company_data_filename = (\n",
- " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'\n",
- " .format(COMPANY))\n",
- " if os.path.isfile(one_company_data_filename):\n",
- " df = pd.read_csv(one_company_data_filename)\n",
- " else:\n",
- " data_list = []\n",
- " chunksize = 10**6\n",
- " # 350 iterations\n",
- " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n",
- " data_list.append(chunk.query(\"company=='{}'\".format(company)))\n",
- " df = pd.concat(data_list, axis=0)\n",
- " df.to_csv(one_company_data_filename, index=None)\n",
- " return df"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "9ra4bfwCVwKn"
- },
- "source": [
- "### Preprocess data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "PlJl5g9Delmi"
- },
- "outputs": [],
- "source": [
- "def preprocess(df):\n",
- " df = df.query('purchaseamount\u003e0')\n",
- " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n",
- " df['start_date'] = df.groupby('id')['date'].transform('min')\n",
- "\n",
- " # Compute calibration values\n",
- " calibration_value = (\n",
- " df.query('date==start_date').groupby('id')\n",
- " ['purchaseamount'].sum().reset_index())\n",
- " calibration_value.columns = ['id', 'calibration_value']\n",
- "\n",
- " # Compute holdout values\n",
- " one_year_holdout_window_mask = (\n",
- " (df['date'] \u003e df['start_date']) \u0026\n",
- " (df['date'] \u003c= df['start_date'] + np.timedelta64(365, 'D')))\n",
- " holdout_value = (\n",
- " df[one_year_holdout_window_mask].groupby('id')\n",
- " ['purchaseamount'].sum().reset_index())\n",
- " holdout_value.columns = ['id', 'holdout_value']\n",
- "\n",
- " # Compute calibration attributes\n",
- " calibration_attributes = (\n",
- " df.query('date==start_date').sort_values(\n",
- " 'purchaseamount', ascending=False).groupby('id')[[\n",
- " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n",
- " ]].first().reset_index())\n",
- "\n",
- " # Merge dataframes\n",
- " customer_level_data = (\n",
- " calibration_value.merge(calibration_attributes, how='left',\n",
- " on='id').merge(\n",
- " holdout_value, how='left', on='id'))\n",
- " customer_level_data['holdout_value'] = (\n",
- " customer_level_data['holdout_value'].fillna(0.))\n",
- " customer_level_data[CATEGORICAL_FEATURES] = (\n",
- " customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))\n",
- "\n",
- " # Specify data types\n",
- " customer_level_data['log_calibration_value'] = (\n",
- " np.log(customer_level_data['calibration_value']).astype('float32'))\n",
- " customer_level_data['chain'] = (\n",
- " customer_level_data['chain'].astype('category'))\n",
- " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n",
- " customer_level_data['brand'] = (\n",
- " customer_level_data['brand'].astype('category'))\n",
- " customer_level_data['category'] = (\n",
- " customer_level_data['category'].astype('category'))\n",
- " customer_level_data['label'] = (\n",
- " customer_level_data['holdout_value'].astype('float32'))\n",
- " return customer_level_data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "fP3q6uuMoXhA"
- },
- "source": [
- "### Load customer-level csv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "X8B4zV1xoeMX"
- },
- "outputs": [],
- "source": [
- "def load_customer_level_csv(company):\n",
- " customer_level_data_file = (\n",
- " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv'\n",
- " .format(company))\n",
- " if os.path.isfile(customer_level_data_file):\n",
- " customer_level_data = pd.read_csv(customer_level_data_file)\n",
- " else:\n",
- " customer_level_data = preprocess(load_transaction_data(company))\n",
- " for cat_col in CATEGORICAL_FEATURES:\n",
- " customer_level_data[cat_col] = (\n",
- " customer_level_data[cat_col].astype('category'))\n",
- " for num_col in [\n",
- " 'log_calibration_value', 'calibration_value', 'holdout_value'\n",
- " ]:\n",
- " customer_level_data[num_col] = (\n",
- " customer_level_data[num_col].astype('float32'))\n",
- "\n",
- " return customer_level_data"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "88dVPdt5QWpu"
- },
- "outputs": [],
- "source": [
- "# Processes data. 350 iteration in total. May take 10min.\n",
- "customer_level_data = load_customer_level_csv(COMPANY)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "09tqgvANtsil"
- },
- "source": [
- "We observe a mixture of zero and lognormal distribution of holdout value."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "BtF0z3VbmGev"
- },
- "outputs": [],
- "source": [
- "customer_level_data.label.apply(np.log1p).hist(bins=50)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "i4kN0uk4kZ68"
- },
- "source": [
- "### Make train/eval"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "nc0MLKx2yD72"
- },
- "outputs": [],
- "source": [
- "def linear_split(df):\n",
- " # get_dummies preserves numeric features.\n",
- " x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values\n",
- " y = df['label'].values\n",
- " y0 = df['calibration_value'].values\n",
- "\n",
- " x_train, x_eval, y_train, y_eval, y0_train, y0_eval = (\n",
- " model_selection.train_test_split(\n",
- " x, y, y0, test_size=0.2, random_state=123))\n",
- "\n",
- " return x_train, x_eval, y_train, y_eval, y0_eval"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "eAGbXp9ax042"
- },
- "outputs": [],
- "source": [
- "def dnn_split(df):\n",
- " for key in CATEGORICAL_FEATURES:\n",
- " encoder = preprocessing.LabelEncoder()\n",
- " df[key] = encoder.fit_transform(df[key])\n",
- "\n",
- " y0 = df['calibration_value'].values\n",
- " df_train, df_eval, y0_train, y0_eval = model_selection.train_test_split(\n",
- " df, y0, test_size=0.2, random_state=123)\n",
- "\n",
- " def feature_dict(df):\n",
- " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n",
- " features['numeric'] = df[NUMERIC_FEATURES].values\n",
- " return features\n",
- "\n",
- " x_train, y_train = feature_dict(df_train), df_train['label'].values\n",
- " x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values\n",
- "\n",
- " return x_train, x_eval, y_train, y_eval, y0_eval"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "lqbShWBzR4NE"
- },
- "source": [
- "## Model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "RAOttr0W0yTM"
- },
- "outputs": [],
- "source": [
- "def linear_model(output_units):\n",
- " return tf.keras.experimental.LinearModel(output_units)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Q7huREFbR7Dl"
- },
- "outputs": [],
- "source": [
- "def embedding_dim(x):\n",
- " return int(x**.25) + 1\n",
- "\n",
- "\n",
- "def embedding_layer(vocab_size):\n",
- " return tf.keras.Sequential([\n",
- " tf.keras.layers.Embedding(\n",
- " input_dim=vocab_size,\n",
- " output_dim=embedding_dim(vocab_size),\n",
- " input_length=1),\n",
- " tf.keras.layers.Flatten(),\n",
- " ])\n",
- "\n",
- "\n",
- "def dnn_model(output_units, df):\n",
- " numeric_input = tf.keras.layers.Input(\n",
- " shape=(len(NUMERIC_FEATURES),), name='numeric')\n",
- "\n",
- " embedding_inputs = [\n",
- " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n",
- " for key in CATEGORICAL_FEATURES\n",
- " ]\n",
- "\n",
- " embedding_outputs = [\n",
- " embedding_layer(vocab_size=df[key].nunique())(input)\n",
- " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n",
- " ]\n",
- "\n",
- " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n",
- " deep_model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(64, activation='relu'),\n",
- " tf.keras.layers.Dense(32, activation='relu'),\n",
- " tf.keras.layers.Dense(output_units),\n",
- " ])\n",
- " return tf.keras.Model(\n",
- " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "U8l-KzZ12fbK"
- },
- "source": [
- "### Train"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "L3HzXsj61uy3"
- },
- "outputs": [],
- "source": [
- "if LOSS == 'mse':\n",
- " loss = keras.losses.MeanSquaredError()\n",
- " output_units = 1\n",
- "\n",
- "if LOSS == 'ziln':\n",
- " loss = ltv.zero_inflated_lognormal_loss\n",
- " output_units = 3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "0pNM4q5m19Dv"
- },
- "outputs": [],
- "source": [
- "if MODEL == 'linear':\n",
- " x_train, x_eval, y_train, y_eval, y0_eval = linear_split(customer_level_data)\n",
- " model = linear_model(output_units)\n",
- "\n",
- "if MODEL == 'dnn':\n",
- " x_train, x_eval, y_train, y_eval, y0_eval = dnn_split(customer_level_data)\n",
- " model = dnn_model(output_units, customer_level_data)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Un-yJPHp31gp"
- },
- "outputs": [],
- "source": [
- "model.compile(loss=loss, optimizer=keras.optimizers.Adam(lr=LEARNING_RATE))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "_GQ-RlIAfT62"
- },
- "outputs": [],
- "source": [
- "callbacks = [\n",
- " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n",
- " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "-BjnHV7MWhK1"
- },
- "outputs": [],
- "source": [
- "history = model.fit(\n",
- " x=x_train,\n",
- " y=y_train,\n",
- " batch_size=1024,\n",
- " epochs=EPOCHS,\n",
- " verbose=2,\n",
- " callbacks=callbacks,\n",
- " validation_data=(x_eval, y_eval)).history"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "mAJGs5SebDeN"
- },
- "outputs": [],
- "source": [
- "pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "bHaiutmy2aYm"
- },
- "source": [
- "### Eval"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "l6E_5gYAYQMw"
- },
- "outputs": [],
- "source": [
- "if LOSS == 'mse':\n",
- " y_pred = model.predict(x=x_eval, batch_size=1024).flatten()\n",
- "\n",
- "if LOSS == 'ziln':\n",
- " logits = model.predict(x=x_eval, batch_size=1024)\n",
- " y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Mm28qKSGXNyr"
- },
- "outputs": [],
- "source": [
- "df_pred = pd.DataFrame({\n",
- " 'y_true': y_eval,\n",
- " 'y_pred': y_pred,\n",
- "})\n",
- "df_pred.head(10)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "zROhsEWxnA5u"
- },
- "source": [
- "### Gini Coefficient"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "gRsJ7y-632h_"
- },
- "outputs": [],
- "source": [
- "gain = pd.DataFrame({\n",
- " 'lorenz': ltv.cumulative_true(y_eval, y_eval),\n",
- " 'baseline': ltv.cumulative_true(y_eval, y0_eval),\n",
- " 'model': ltv.cumulative_true(y_eval, y_pred),\n",
- "})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "yg-ndbve4AL_"
- },
- "outputs": [],
- "source": [
- "num_customers = np.float32(gain.shape[0])\n",
- "gain['cumulative_customer'] = (np.arange(num_customers) + 1.) / num_customers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "WEoAvuCj4OVy"
- },
- "outputs": [],
- "source": [
- "ax = gain[[\n",
- " 'cumulative_customer',\n",
- " 'lorenz',\n",
- " 'baseline',\n",
- " 'model',\n",
- "]].plot(\n",
- " x='cumulative_customer', figsize=(8, 5), legend=True)\n",
- "\n",
- "ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='upper left')\n",
- "\n",
- "ax.set_xlabel('Cumulative Fraction of Customers')\n",
- "ax.set_xticks(np.arange(0, 1.1, 0.1))\n",
- "ax.set_xlim((0, 1.))\n",
- "\n",
- "ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')\n",
- "ax.set_yticks(np.arange(0, 1.1, 0.1))\n",
- "ax.set_ylim((0, 1.05))\n",
- "ax.set_title('Gain Chart')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "kzPqaiNO4iWC"
- },
- "outputs": [],
- "source": [
- "gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])\n",
- "gini"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "S84RitIa9PBu"
- },
- "source": [
- "### Calibration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "X7sKbsEf6RvF"
- },
- "outputs": [],
- "source": [
- "df_decile = ltv.decile_stats(y_eval, y_pred)\n",
- "df_decile"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DHdLqUqdL4hf"
- },
- "outputs": [],
- "source": [
- "ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)\n",
- "\n",
- "ax.set_title('Decile Chart')\n",
- "ax.set_xlabel('Prediction bucket')\n",
- "ax.set_ylabel('Average bucket value')\n",
- "ax.legend(['Label', 'Prediction'], loc='upper left')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "nK6DQ89xU-d4"
- },
- "source": [
- "### Rank Correlation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "I9qWGyY3WePz"
- },
- "outputs": [],
- "source": [
- "def spearmanr(x1: Sequence[float], x2: Sequence[float]) -\u003e float:\n",
- " \"\"\"Calculates spearmanr rank correlation coefficient.\n",
- "\n",
- " See https://docs.scipy.org/doc/scipy/reference/stats.html.\n",
- "\n",
- " Args:\n",
- " x1: 1D array_like.\n",
- " x2: 1D array_like.\n",
- "\n",
- " Returns:\n",
- " correlation: float.\n",
- " \"\"\"\n",
- " return stats.spearmanr(x1, x2, nan_policy='raise')[0]\n",
- "\n",
- "\n",
- "spearman_corr = spearmanr(y_eval, y_pred)\n",
- "spearman_corr"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-i_AbqhXcurk"
- },
- "source": [
- "### All metrics together"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Umqg1-0Bc1HS"
- },
- "outputs": [],
- "source": [
- "df_metrics = pd.DataFrame(\n",
- " {\n",
- " 'company': COMPANY,\n",
- " 'model': MODEL,\n",
- " 'loss': LOSS,\n",
- " 'label_mean': y_eval.mean(),\n",
- " 'pred_mean': y_pred.mean(),\n",
- " 'label_positive': np.mean(y_eval \u003e 0),\n",
- " 'decile_mape': df_decile['decile_mape'].mean(),\n",
- " 'baseline_gini': gini['normalized'][1],\n",
- " 'gini': gini['normalized'][2],\n",
- " 'spearman_corr': spearman_corr,\n",
- " },\n",
- " index=[0])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "1LV1Hs3xcxnd"
- },
- "outputs": [],
- "source": [
- "df_metrics[[\n",
- " 'company',\n",
- " 'model',\n",
- " 'loss',\n",
- " 'label_mean',\n",
- " 'pred_mean',\n",
- " 'label_positive',\n",
- " 'decile_mape',\n",
- " 'baseline_gini',\n",
- " 'gini',\n",
- " 'spearman_corr',\n",
- "]]"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "UVy6lYn4mSrj"
- },
- "source": [
- "## Save"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "mtkQ4mqUEFsb"
- },
- "outputs": [],
- "source": [
- "output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "3qmLzJqOEFsm"
- },
- "outputs": [],
- "source": [
- "if not os.path.isdir(output_path):\n",
- " os.makedirs(output_path)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "61B5Zc_UEFsr"
- },
- "outputs": [],
- "source": [
- "output_file = os.path.join(output_path,\n",
- " '{}_regression_{}.csv'.format(MODEL, LOSS))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "gqglbXfwEFsv"
- },
- "outputs": [],
- "source": [
- "df_metrics.to_csv(output_file, index=False)"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "collapsed_sections": [],
- "last_runtime": {
- "build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook",
- "kind": "private"
- },
- "name": "regression.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "5RoRxBv3bRjy"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Copyright 2019 The Lifetime Value Authors.\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License.\n",
+ "# ============================================================================"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2tkQUXmWhqRY"
+ },
+ "source": [
+ "# Lifetime Value prediction for Kaggle Acquire Valued Customer Challenge"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "Pw8bm9nV6YJ5"
+ },
+ "source": [
+ ""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KObdQwyXH2mC"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from scipy import stats\n",
+ "import seaborn as sns\n",
+ "from sklearn import model_selection\n",
+ "from sklearn import preprocessing\n",
+ "import tensorflow as tf\n",
+ "from tensorflow import keras\n",
+ "from tensorflow.keras import backend as K\n",
+ "import tensorflow_probability as tfp\n",
+ "import tqdm\n",
+ "from typing import Sequence\n",
+ "\n",
+ "# install and import ltv\n",
+ "#!pip install -q git+https://github.com/google/lifetime_value\n",
+ "!pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value\n",
+ "import lifetime_value as ltv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "K41RmAfNXtu_"
+ },
+ "outputs": [],
+ "source": [
+ "tfd = tfp.distributions\n",
+ "%config InlineBackend.figure_format='retina'\n",
+ "sns.set_style('whitegrid')\n",
+ "pd.options.mode.chained_assignment = None # default='warn'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "DoN-PRvNuIti"
+ },
+ "source": [
+ "## Global variables"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3GGpDbxd3S5L"
+ },
+ "outputs": [],
+ "source": [
+ "COMPANY = '103600030' # @param { isTemplate: true, type: 'string'}\n",
+ "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n",
+ "#LOSS = 'mse' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n",
+ "MODEL = 'dnn' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n",
+ "# MODEL = 'linear' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n",
+ "LEARNING_RATE = 0.0002 # @param { isTemplate: true}\n",
+ "EPOCHS = 400 # @param { isTemplate: true, type: 'integer'}\n",
+ "DATA_FOLDER = './tmp/acquire-valued-shoppers-challenge' # @param { isTemplate: true, type: 'string'}\n",
+ "OUTPUT_CSV_FOLDER = f'{DATA_FOLDER}/result' # @param { isTemplate: true, type: 'string'}"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "UK9Y5NoMtm3X"
+ },
+ "outputs": [],
+ "source": [
+ "CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']\n",
+ "NUMERIC_FEATURES = ['log_calibration_value']\n",
+ "\n",
+ "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "RzTaK6fFXMWT"
+ },
+ "source": [
+ "## Data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SFi0JMPu138h"
+ },
+ "source": [
+ "### Download data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "krXMbrkVNtdN"
+ },
+ "source": [
+ "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n",
+ "```\n",
+ "%%shell\n",
+ "mkdir ~/.kaggle\n",
+ "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} > ~/.kaggle/kaggle.json\n",
+ "pip install kaggle\n",
+ "```"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set it DATA_FOLDER as an environment variable\n",
+ "%env DATA_FOLDER=$DATA_FOLDER"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "0gf4ipd-14x0"
+ },
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "if [ -e $DATA_FOLDER/transactions.csv ]\n",
+ "then\n",
+ " echo \"File already exists, no need to download.\"\n",
+ "else\n",
+ " rm -rf $DATA_FOLDER\n",
+ " mkdir -p $DATA_FOLDER\n",
+ " cd $DATA_FOLDER\n",
+ " kaggle competitions download -c acquire-valued-shoppers-challenge\n",
+ " echo \"Unzip file. This may take 10 min.\"\n",
+ " unzip acquire-valued-shoppers-challenge.zip transactions.csv.gz\n",
+ " gunzip transactions.csv.gz\n",
+ "fi"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "IT53azGsa2a2"
+ },
+ "source": [
+ "### Load transaction csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "5tIMvE3dW1Ky"
+ },
+ "outputs": [],
+ "source": [
+ "def load_transaction_data(company):\n",
+ " all_data_filename = f'{DATA_FOLDER}/transactions.csv'\n",
+ " one_company_data_filename = f'{DATA_FOLDER}/transactions_company_{company}.csv'\n",
+ " if os.path.isfile(one_company_data_filename):\n",
+ " df = pd.read_csv(one_company_data_filename)\n",
+ " else:\n",
+ " data_list = []\n",
+ " chunksize = 10**6\n",
+ " # 350 iterations\n",
+ " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n",
+ " data_list.append(chunk.query(\"company=={}\".format(company)))\n",
+ " df = pd.concat(data_list, axis=0)\n",
+ " df.to_csv(one_company_data_filename, index=None)\n",
+ " return df"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "9ra4bfwCVwKn"
+ },
+ "source": [
+ "### Preprocess data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "PlJl5g9Delmi"
+ },
+ "outputs": [],
+ "source": [
+ "def preprocess(df):\n",
+ " df = df.query('purchaseamount>0')\n",
+ " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n",
+ " df['start_date'] = df.groupby('id')['date'].transform('min')\n",
+ "\n",
+ " # Compute calibration values\n",
+ " calibration_value = (\n",
+ " df.query('date==start_date').groupby('id')\n",
+ " ['purchaseamount'].sum().reset_index())\n",
+ " calibration_value.columns = ['id', 'calibration_value']\n",
+ "\n",
+ " # Compute holdout values\n",
+ " one_year_holdout_window_mask = (\n",
+ " (df['date'] > df['start_date']) &\n",
+ " (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))\n",
+ " holdout_value = (\n",
+ " df[one_year_holdout_window_mask].groupby('id')\n",
+ " ['purchaseamount'].sum().reset_index())\n",
+ " holdout_value.columns = ['id', 'holdout_value']\n",
+ "\n",
+ " # Compute calibration attributes\n",
+ " calibration_attributes = (\n",
+ " df.query('date==start_date').sort_values(\n",
+ " 'purchaseamount', ascending=False).groupby('id')[[\n",
+ " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n",
+ " ]].first().reset_index())\n",
+ "\n",
+ " # Merge dataframes\n",
+ " customer_level_data = (\n",
+ " calibration_value.merge(calibration_attributes, how='left',\n",
+ " on='id').merge(\n",
+ " holdout_value, how='left', on='id'))\n",
+ " customer_level_data['holdout_value'] = (\n",
+ " customer_level_data['holdout_value'].fillna(0.))\n",
+ " customer_level_data[CATEGORICAL_FEATURES] = (\n",
+ " customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))\n",
+ "\n",
+ " # Specify data types\n",
+ " customer_level_data['log_calibration_value'] = (\n",
+ " np.log(customer_level_data['calibration_value']).astype('float32'))\n",
+ " customer_level_data['chain'] = (\n",
+ " customer_level_data['chain'].astype('category'))\n",
+ " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n",
+ " customer_level_data['brand'] = (\n",
+ " customer_level_data['brand'].astype('category'))\n",
+ " customer_level_data['category'] = (\n",
+ " customer_level_data['category'].astype('category'))\n",
+ " customer_level_data['label'] = (\n",
+ " customer_level_data['holdout_value'].astype('float32'))\n",
+ " return customer_level_data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "fP3q6uuMoXhA"
+ },
+ "source": [
+ "### Load customer-level csv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "X8B4zV1xoeMX"
+ },
+ "outputs": [],
+ "source": [
+ "def load_customer_level_csv(company):\n",
+ " customer_level_data_file = f'{DATA_FOLDER}/customer_level_data_company_{company}.csv'\n",
+ " if os.path.isfile(customer_level_data_file):\n",
+ " customer_level_data = pd.read_csv(customer_level_data_file)\n",
+ " else:\n",
+ " customer_level_data = preprocess(load_transaction_data(company))\n",
+ " for cat_col in CATEGORICAL_FEATURES:\n",
+ " customer_level_data[cat_col] = (\n",
+ " customer_level_data[cat_col].astype('category'))\n",
+ " for num_col in [\n",
+ " 'log_calibration_value', 'calibration_value', 'holdout_value'\n",
+ " ]:\n",
+ " customer_level_data[num_col] = (\n",
+ " customer_level_data[num_col].astype('float32'))\n",
+ "\n",
+ " return customer_level_data"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "88dVPdt5QWpu"
+ },
+ "outputs": [],
+ "source": [
+ "# Processes data. 350 iteration in total. May take 10min.\n",
+ "customer_level_data = load_customer_level_csv(COMPANY)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "customer_level_data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "09tqgvANtsil"
+ },
+ "source": [
+ "We observe a mixture of zero and lognormal distribution of holdout value."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BtF0z3VbmGev"
+ },
+ "outputs": [],
+ "source": [
+ "customer_level_data.label.apply(np.log1p).hist(bins=50)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "i4kN0uk4kZ68"
+ },
+ "source": [
+ "### Make train/eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "nc0MLKx2yD72"
+ },
+ "outputs": [],
+ "source": [
+ "def linear_split(df):\n",
+ " # get_dummies preserves numeric features.\n",
+ " x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values\n",
+ " y = df['label'].values\n",
+ " y0 = df['calibration_value'].values\n",
+ "\n",
+ " x_train, x_eval, y_train, y_eval, y0_train, y0_eval = (\n",
+ " model_selection.train_test_split(\n",
+ " x, y, y0, test_size=0.2, random_state=123))\n",
+ "\n",
+ " return x_train, x_eval, y_train, y_eval, y0_eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "eAGbXp9ax042"
+ },
+ "outputs": [],
+ "source": [
+ "def dnn_split(df):\n",
+ " for key in CATEGORICAL_FEATURES:\n",
+ " encoder = preprocessing.LabelEncoder()\n",
+ " df[key] = encoder.fit_transform(df[key])\n",
+ "\n",
+ " y0 = df['calibration_value'].values\n",
+ " df_train, df_eval, y0_train, y0_eval = model_selection.train_test_split(\n",
+ " df, y0, test_size=0.2, random_state=123)\n",
+ "\n",
+ " def feature_dict(df):\n",
+ " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n",
+ " features['numeric'] = df[NUMERIC_FEATURES].values\n",
+ " return features\n",
+ "\n",
+ " x_train, y_train = feature_dict(df_train), df_train['label'].values\n",
+ " x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values\n",
+ "\n",
+ " return x_train, x_eval, y_train, y_eval, y0_eval"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lqbShWBzR4NE"
+ },
+ "source": [
+ "## Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def linear_model(output_units, input_dim):\n",
+ " return tf.keras.Sequential([\n",
+ " tf.keras.layers.Input(shape=(input_dim,)),\n",
+ " tf.keras.layers.Dense(output_units, activation=None)\n",
+ " ])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Q7huREFbR7Dl"
+ },
+ "outputs": [],
+ "source": [
+ "def embedding_dim(x):\n",
+ " return int(x**.25) + 1\n",
+ "\n",
+ "\n",
+ "def embedding_layer(vocab_size):\n",
+ " return tf.keras.Sequential([\n",
+ " tf.keras.layers.Embedding(\n",
+ " input_dim=vocab_size,\n",
+ " output_dim=embedding_dim(vocab_size)\n",
+ " ),\n",
+ " tf.keras.layers.Flatten(),\n",
+ " ])\n",
+ "\n",
+ "\n",
+ "def dnn_model(output_units, df):\n",
+ " numeric_input = tf.keras.layers.Input(\n",
+ " shape=(len(NUMERIC_FEATURES),), name='numeric')\n",
+ "\n",
+ " embedding_inputs = [\n",
+ " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n",
+ " for key in CATEGORICAL_FEATURES\n",
+ " ]\n",
+ "\n",
+ " embedding_outputs = [\n",
+ " embedding_layer(vocab_size=df[key].nunique())(input)\n",
+ " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n",
+ " ]\n",
+ "\n",
+ " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n",
+ " deep_model = tf.keras.Sequential([\n",
+ " tf.keras.layers.Dense(64, activation='relu'),\n",
+ " tf.keras.layers.Dense(32, activation='relu'),\n",
+ " tf.keras.layers.Dense(output_units),\n",
+ " ])\n",
+ " return tf.keras.Model(\n",
+ " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "U8l-KzZ12fbK"
+ },
+ "source": [
+ "### Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "L3HzXsj61uy3"
+ },
+ "outputs": [],
+ "source": [
+ "if LOSS == 'mse':\n",
+ " loss = keras.losses.MeanSquaredError()\n",
+ " output_units = 1\n",
+ "\n",
+ "if LOSS == 'ziln':\n",
+ " loss = ltv.zero_inflated_lognormal_loss\n",
+ " output_units = 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "0pNM4q5m19Dv"
+ },
+ "outputs": [],
+ "source": [
+ "if MODEL == 'linear':\n",
+ " x_train, x_eval, y_train, y_eval, y0_eval = linear_split(customer_level_data)\n",
+ " model = linear_model(output_units, x_train.shape[1])\n",
+ "\n",
+ "if MODEL == 'dnn':\n",
+ " x_train, x_eval, y_train, y_eval, y0_eval = dnn_split(customer_level_data)\n",
+ " model = dnn_model(output_units, customer_level_data)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Un-yJPHp31gp"
+ },
+ "outputs": [],
+ "source": [
+ "model.compile(loss=loss, optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_GQ-RlIAfT62"
+ },
+ "outputs": [],
+ "source": [
+ "callbacks = [\n",
+ " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n",
+ " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "-BjnHV7MWhK1"
+ },
+ "outputs": [],
+ "source": [
+ "# y array needs to have a two dimensional shape to work with ziln loss function\n",
+ "# so we use [:, np.newaxis] to make the data two-dimensional for the fit function call\n",
+ "history = model.fit(\n",
+ " x=x_train,\n",
+ " y=y_train[:, np.newaxis],\n",
+ " batch_size=1024,\n",
+ " epochs=EPOCHS,\n",
+ " verbose=2,\n",
+ " callbacks=callbacks,\n",
+ " validation_data=(x_eval, y_eval[:, np.newaxis])).history"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mAJGs5SebDeN"
+ },
+ "outputs": [],
+ "source": [
+ "pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "bHaiutmy2aYm"
+ },
+ "source": [
+ "### Eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "l6E_5gYAYQMw"
+ },
+ "outputs": [],
+ "source": [
+ "if LOSS == 'mse':\n",
+ " y_pred = model.predict(x=x_eval, batch_size=1024).flatten()\n",
+ "\n",
+ "if LOSS == 'ziln':\n",
+ " logits = model.predict(x=x_eval, batch_size=1024)\n",
+ " y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Mm28qKSGXNyr"
+ },
+ "outputs": [],
+ "source": [
+ "df_pred = pd.DataFrame({\n",
+ " 'y_true': y_eval,\n",
+ " 'y_pred': y_pred,\n",
+ "})\n",
+ "df_pred.head(10)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zROhsEWxnA5u"
+ },
+ "source": [
+ "### Gini Coefficient"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gRsJ7y-632h_"
+ },
+ "outputs": [],
+ "source": [
+ "gain = pd.DataFrame({\n",
+ " 'lorenz': ltv.cumulative_true(y_eval, y_eval),\n",
+ " 'baseline': ltv.cumulative_true(y_eval, y0_eval),\n",
+ " 'model': ltv.cumulative_true(y_eval, y_pred),\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yg-ndbve4AL_"
+ },
+ "outputs": [],
+ "source": [
+ "num_customers = np.float32(gain.shape[0])\n",
+ "gain['cumulative_customer'] = (np.arange(num_customers) + 1.) / num_customers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "WEoAvuCj4OVy"
+ },
+ "outputs": [],
+ "source": [
+ "ax = gain[[\n",
+ " 'cumulative_customer',\n",
+ " 'lorenz',\n",
+ " 'baseline',\n",
+ " 'model',\n",
+ "]].plot(\n",
+ " x='cumulative_customer', figsize=(8, 5), legend=True)\n",
+ "\n",
+ "ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='upper left')\n",
+ "\n",
+ "ax.set_xlabel('Cumulative Fraction of Customers')\n",
+ "ax.set_xticks(np.arange(0, 1.1, 0.1))\n",
+ "ax.set_xlim((0, 1.))\n",
+ "\n",
+ "ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')\n",
+ "ax.set_yticks(np.arange(0, 1.1, 0.1))\n",
+ "ax.set_ylim((0, 1.05))\n",
+ "ax.set_title('Gain Chart')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "kzPqaiNO4iWC"
+ },
+ "outputs": [],
+ "source": [
+ "gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])\n",
+ "gini"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "S84RitIa9PBu"
+ },
+ "source": [
+ "### Calibration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "X7sKbsEf6RvF"
+ },
+ "outputs": [],
+ "source": [
+ "df_decile = ltv.decile_stats(y_eval, y_pred)\n",
+ "df_decile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DHdLqUqdL4hf"
+ },
+ "outputs": [],
+ "source": [
+ "ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)\n",
+ "\n",
+ "ax.set_title('Decile Chart')\n",
+ "ax.set_xlabel('Prediction bucket')\n",
+ "ax.set_ylabel('Average bucket value')\n",
+ "ax.legend(['Label', 'Prediction'], loc='upper left')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nK6DQ89xU-d4"
+ },
+ "source": [
+ "### Rank Correlation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "I9qWGyY3WePz"
+ },
+ "outputs": [],
+ "source": [
+ "def spearmanr(x1: Sequence[float], x2: Sequence[float]) -> float:\n",
+ " \"\"\"Calculates spearmanr rank correlation coefficient.\n",
+ "\n",
+ " See https://docs.scipy.org/doc/scipy/reference/stats.html.\n",
+ "\n",
+ " Args:\n",
+ " x1: 1D array_like.\n",
+ " x2: 1D array_like.\n",
+ "\n",
+ " Returns:\n",
+ " correlation: float.\n",
+ " \"\"\"\n",
+ " return stats.spearmanr(x1, x2, nan_policy='raise')[0]\n",
+ "\n",
+ "\n",
+ "spearman_corr = spearmanr(y_eval, y_pred)\n",
+ "spearman_corr"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-i_AbqhXcurk"
+ },
+ "source": [
+ "### All metrics together"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Umqg1-0Bc1HS"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics = pd.DataFrame(\n",
+ " {\n",
+ " 'company': COMPANY,\n",
+ " 'model': MODEL,\n",
+ " 'loss': LOSS,\n",
+ " 'label_mean': y_eval.mean(),\n",
+ " 'pred_mean': y_pred.mean(),\n",
+ " 'label_positive': np.mean(y_eval > 0),\n",
+ " 'decile_mape': df_decile['decile_mape'].mean(),\n",
+ " 'baseline_gini': gini['normalized'][1],\n",
+ " 'gini': gini['normalized'][2],\n",
+ " 'spearman_corr': spearman_corr,\n",
+ " },\n",
+ " index=[0])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "1LV1Hs3xcxnd"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics[[\n",
+ " 'company',\n",
+ " 'model',\n",
+ " 'loss',\n",
+ " 'label_mean',\n",
+ " 'pred_mean',\n",
+ " 'label_positive',\n",
+ " 'decile_mape',\n",
+ " 'baseline_gini',\n",
+ " 'gini',\n",
+ " 'spearman_corr',\n",
+ "]]"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "UVy6lYn4mSrj"
+ },
+ "source": [
+ "## Save"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "mtkQ4mqUEFsb"
+ },
+ "outputs": [],
+ "source": [
+ "output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "3qmLzJqOEFsm"
+ },
+ "outputs": [],
+ "source": [
+ "if not os.path.isdir(output_path):\n",
+ " os.makedirs(output_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "61B5Zc_UEFsr"
+ },
+ "outputs": [],
+ "source": [
+ "output_file = os.path.join(output_path,\n",
+ " '{}_regression_{}.csv'.format(MODEL, LOSS))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gqglbXfwEFsv"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics.to_csv(output_file, index=False)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "last_runtime": {
+ "build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook",
+ "kind": "private"
+ },
+ "name": "regression.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/notebooks/kdd_cup_98/regression.ipynb b/notebooks/kdd_cup_98/regression.ipynb
index 1a935a7..31bd01c 100644
--- a/notebooks/kdd_cup_98/regression.ipynb
+++ b/notebooks/kdd_cup_98/regression.ipynb
@@ -1,1050 +1,1110 @@
{
- "cells": [
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "gE76T8J7IsGC"
- },
- "outputs": [],
- "source": [
- "#@title Copyright 2019 The Lifetime Value Authors.\n",
- "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
- "# you may not use this file except in compliance with the License.\n",
- "# You may obtain a copy of the License at\n",
- "#\n",
- "# https://www.apache.org/licenses/LICENSE-2.0\n",
- "#\n",
- "# Unless required by applicable law or agreed to in writing, software\n",
- "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
- "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
- "# See the License for the specific language governing permissions and\n",
- "# limitations under the License.\n",
- "# ============================================================================"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "sswTFWDv7HZd"
- },
- "source": [
- "# KDD Cup 98 LTV Prediction"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "PSr1mSJP7O1J"
- },
- "source": [
- "\u003ctable align=\"left\"\u003e\n",
- " \u003ctd\u003e\n",
- " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kdd_cup_98/regression.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n",
- " \u003c/td\u003e\n",
- " \u003ctd\u003e\n",
- " \u003ca target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kdd_cup_98/regression.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n",
- " \u003c/td\u003e\n",
- "\u003c/table\u003e"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "pBXE3Dz3NI4A"
- },
- "outputs": [],
- "source": [
- "import os\n",
- "\n",
- "import numpy as np\n",
- "import pandas as pd\n",
- "from scipy import stats\n",
- "import matplotlib.pyplot as plt\n",
- "import seaborn as sns\n",
- "import tensorflow as tf\n",
- "import tensorflow_probability as tfp\n",
- "from typing import Sequence\n",
- "\n",
- "# install and import ltv\n",
- "!pip install -q git+https://github.com/google/lifetime_value\n",
- "import lifetime_value as ltv"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Bq0Ah16lBmgV"
- },
- "outputs": [],
- "source": [
- "tfd = tfp.distributions\n",
- "%config InlineBackend.figure_format='retina'\n",
- "sns.set_style('whitegrid')"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "2qN319qZK3IG"
- },
- "source": [
- "## Configs"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "hNy_ybw_K19n"
- },
- "outputs": [],
- "source": [
- "MODEL = 'dnn'\n",
- "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n",
- "LEARNING_RATE = 0.001 # @param { isTemplate: true}\n",
- "VERSION = 0 # @param { isTemplate: true, type: 'integer'}\n",
- "OUTPUT_CSV_FOLDER = '/tmp/lifetime-value/kdd_cup_98/result' # @param { isTemplate: true, type: 'string'}"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "mDSR921CCEcL"
- },
- "source": [
- "## Load data"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "lHxp4rOGI02Q"
- },
- "source": [
- "Download kdd_cup_98 data to /tmp/lifetime-value/kdd_cup_98"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Dg3qtgJyJpdi"
- },
- "outputs": [],
- "source": [
- "%%shell\n",
- "mkdir -p /tmp/lifetime-value/kdd_cup_98\n",
- "wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98lrn.zip -P /tmp/lifetime-value/kdd_cup_98/\n",
- "wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98val.zip -P /tmp/lifetime-value/kdd_cup_98/\n",
- "wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/valtargt.txt -P /tmp/lifetime-value/kdd_cup_98/\n",
- "cd /tmp/lifetime-value/kdd_cup_98/\n",
- "unzip cup98lrn.zip\n",
- "unzip cup98val.zip"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "a_LnLmQQRlYF"
- },
- "outputs": [],
- "source": [
- "df_train = pd.read_csv('/tmp/lifetime-value/kdd_cup_98/cup98LRN.txt')\n",
- "num_train = df_train.shape[0]\n",
- "df_eval = pd.read_csv('/tmp/lifetime-value/kdd_cup_98/cup98VAL.txt')\n",
- "df_eval_target = pd.read_csv('/tmp/lifetime-value/kdd_cup_98/valtargt.txt')\n",
- "df_eval = df_eval.merge(df_eval_target, on='CONTROLN')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ggQmy9wiP5M6"
- },
- "outputs": [],
- "source": [
- "df = pd.concat([df_train, df_eval], axis=0, sort=True)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "0rgxHpIyjaMH"
- },
- "source": [
- "## Label distribution"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Xmpu_d3YjcFC"
- },
- "outputs": [],
- "source": [
- "y = df['TARGET_D'][:num_train]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "yMr2EDRyK5Sb"
- },
- "outputs": [],
- "source": [
- "def plot_hist_log_scale(y):\n",
- " max_val = y.max() + 1. \n",
- " ax = pd.Series(y).hist(\n",
- " figsize=(8, 5), bins = 10 ** np.linspace(0., np.log10(max_val), 20))\n",
- "\n",
- " plt.xlabel('Donation ($)')\n",
- " plt.ylabel('Count')\n",
- " # plt.title('Histogram of LTV')\n",
- " plt.xticks(rotation='horizontal')\n",
- " plt.legend(loc='upper left')\n",
- " ax.set_xscale('log')\n",
- " ax.grid(False)\n",
- " # Hide the right and top spines\n",
- " ax.spines['right'].set_visible(False)\n",
- " ax.spines['top'].set_visible(False)\n",
- " # Only show ticks on the left and bottom spines\n",
- " ax.yaxis.set_ticks_position('left')\n",
- " ax.xaxis.set_ticks_position('bottom')\n",
- " plt.show()\n",
- "\n",
- " fig = ax.get_figure()\n",
- " output_file = tf.io.gfile.GFile(\n",
- " '/tmp/lifetime-value/kdd_cup_98/histogram_kdd98_log_scale.pdf',\n",
- " 'wb')\n",
- " fig.savefig(output_file, bbox_inches='tight', format='pdf')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "KbwCzGkBOWhH"
- },
- "outputs": [],
- "source": [
- "plot_hist_log_scale(y[y\u003e0])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "1XXMLbnlCdlN"
- },
- "source": [
- "## Preprocess features"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "L1sBf_RSU3pR"
- },
- "source": [
- "### Vocab"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "xB_ddsd_U_4e"
- },
- "outputs": [],
- "source": [
- "VOCAB_FEATURES = [\n",
- " 'ODATEDW', # date of donor's first gift (YYMM)\n",
- " 'OSOURCE', # donor acquisition mailing list\n",
- " 'TCODE', # donor title code\n",
- " 'STATE',\n",
- " 'ZIP',\n",
- " 'DOMAIN', # urbanicity level and socio-economic status of the neighborhood\n",
- " 'CLUSTER', # socio-economic status\n",
- " 'GENDER',\n",
- " 'MAXADATE', # date of the most recent promotion received\n",
- " 'MINRDATE',\n",
- " 'LASTDATE',\n",
- " 'FISTDATE',\n",
- " 'RFA_2A',\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "f2oPZGVLRSPe"
- },
- "outputs": [],
- "source": [
- "df['ODATEDW'] = df['ODATEDW'].astype('str')\n",
- "df['TCODE'] = df['TCODE'].apply(\n",
- " lambda x: '{:03d}'.format(x // 1000 if x \u003e 1000 else x))\n",
- "df['ZIP'] = df['ZIP'].str.slice(0, 5)\n",
- "df['MAXADATE'] = df['MAXADATE'].astype('str')\n",
- "df['MINRDATE'] = df['MINRDATE'].astype('str')\n",
- "df['LASTDATE'] = df['LASTDATE'].astype('str')\n",
- "df['FISTDATE'] = df['FISTDATE'].astype('str')"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "isL9Ofv9JLAP"
- },
- "outputs": [],
- "source": [
- "def label_encoding(y, frequency_threshold=100):\n",
- " value_counts = pd.value_counts(y)\n",
- " categories = value_counts[\n",
- " value_counts \u003e= frequency_threshold].index.to_numpy()\n",
- " # 0 indicates the unknown category.\n",
- " return pd.Categorical(y, categories=categories).codes + 1"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "BgXGO5D0OdJP"
- },
- "outputs": [],
- "source": [
- "for key in VOCAB_FEATURES:\n",
- " df[key] = label_encoding(df[key])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "kZkmnJ93Zrjw"
- },
- "source": [
- "### Indicator"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "tGBpMfaGhCD0"
- },
- "outputs": [],
- "source": [
- "MAIL_ORDER_RESPONSES = [\n",
- " 'MBCRAFT',\n",
- " 'MBGARDEN',\n",
- " 'MBBOOKS',\n",
- " 'MBCOLECT',\n",
- " 'MAGFAML',\n",
- " 'MAGFEM',\n",
- " 'MAGMALE',\n",
- " 'PUBGARDN',\n",
- " 'PUBCULIN',\n",
- " 'PUBHLTH',\n",
- " 'PUBDOITY',\n",
- " 'PUBNEWFN',\n",
- " 'PUBPHOTO',\n",
- " 'PUBOPP',\n",
- " 'RFA_2F',\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "4V-DeOZFZhjB"
- },
- "outputs": [],
- "source": [
- "INDICATOR_FEATURES = [\n",
- " 'AGE', # age decile, 0 indicates unknown\n",
- " 'NUMCHLD',\n",
- " 'INCOME',\n",
- " 'WEALTH1',\n",
- " 'HIT',\n",
- "] + MAIL_ORDER_RESPONSES"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "U9y5qA1vZ0kz"
- },
- "outputs": [],
- "source": [
- "df['AGE'] = pd.qcut(df['AGE'].values, 10).codes + 1\n",
- "df['NUMCHLD'] = df['NUMCHLD'].apply(lambda x: 0 if np.isnan(x) else int(x))\n",
- "df['INCOME'] = df['INCOME'].apply(lambda x: 0 if np.isnan(x) else int(x))\n",
- "df['WEALTH1'] = df['WEALTH1'].apply(lambda x: 0 if np.isnan(x) else int(x) + 1)\n",
- "df['HIT'] = pd.qcut(df['HIT'].values, q=50, duplicates='drop').codes\n",
- "\n",
- "for col in MAIL_ORDER_RESPONSES:\n",
- " df[col] = pd.qcut(df[col].values, q=20, duplicates='drop').codes + 1"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "8DOO_2a-U6gr"
- },
- "source": [
- "### Numeric"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "rqVteSLDiLVr"
- },
- "outputs": [],
- "source": [
- "NUMERIC_FEATURES = [\n",
- " # binary\n",
- " 'MAILCODE', # bad address\n",
- " 'NOEXCH', # do not exchange\n",
- " 'RECINHSE', # donor has given to PVA's in house program\n",
- " 'RECP3', # donor has given to PVA's P3 program\n",
- " 'RECPGVG', # planned giving record\n",
- " 'RECSWEEP', # sweepstakes record\n",
- " 'HOMEOWNR', # home owner\n",
- " 'CHILD03',\n",
- " 'CHILD07',\n",
- " 'CHILD12',\n",
- " 'CHILD18',\n",
- "\n",
- " # continuous\n",
- " 'CARDPROM',\n",
- " 'NUMPROM',\n",
- " 'CARDPM12',\n",
- " 'NUMPRM12',\n",
- " 'RAMNTALL',\n",
- " 'NGIFTALL',\n",
- " 'MINRAMNT',\n",
- " 'MAXRAMNT',\n",
- " 'LASTGIFT',\n",
- " 'AVGGIFT',\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "xMRP05Ztic0A"
- },
- "outputs": [],
- "source": [
- "df['MAILCODE'] = (df['MAILCODE'] == 'B').astype('float32')\n",
- "df['PVASTATE'] = df['PVASTATE'].isin(['P', 'E']).astype('float32')\n",
- "df['NOEXCH'] = df['NOEXCH'].isin(['X', '1']).astype('float32')\n",
- "df['RECINHSE'] = (df['RECINHSE'] == 'X').astype('float32')\n",
- "df['RECP3'] = (df['RECP3'] == 'X').astype('float32')\n",
- "df['RECPGVG'] = (df['RECPGVG'] == 'X').astype('float32')\n",
- "df['RECSWEEP'] = (df['RECSWEEP'] == 'X').astype('float32')\n",
- "df['HOMEOWNR'] = (df['HOMEOWNR'] == 'H').astype('float32')\n",
- "df['CHILD03'] = df['CHILD03'].isin(['M', 'F', 'B']).astype('float32')\n",
- "df['CHILD07'] = df['CHILD07'].isin(['M', 'F', 'B']).astype('float32')\n",
- "df['CHILD12'] = df['CHILD12'].isin(['M', 'F', 'B']).astype('float32')\n",
- "df['CHILD18'] = df['CHILD18'].isin(['M', 'F', 'B']).astype('float32')\n",
- "\n",
- "df['CARDPROM'] = df['CARDPROM'] / 100\n",
- "df['NUMPROM'] = df['NUMPROM'] / 100\n",
- "df['CARDPM12'] = df['CARDPM12'] / 100\n",
- "df['NUMPRM12'] = df['NUMPRM12'] / 100\n",
- "df['RAMNTALL'] = np.log1p(df['RAMNTALL'])\n",
- "df['NGIFTALL'] = np.log1p(df['NGIFTALL'])\n",
- "df['MINRAMNT'] = np.log1p(df['MINRAMNT'])\n",
- "df['MAXRAMNT'] = np.log1p(df['MAXRAMNT'])\n",
- "df['LASTGIFT'] = np.log1p(df['LASTGIFT'])\n",
- "df['AVGGIFT'] = np.log1p(df['AVGGIFT'])"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "GoLg1PvWuCT_"
- },
- "source": [
- "### All"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "lSnNgjBCuJdb"
- },
- "outputs": [],
- "source": [
- "CATEGORICAL_FEATURES = VOCAB_FEATURES + INDICATOR_FEATURES\n",
- "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "8HJBvvCxRPg3"
- },
- "source": [
- "## Train/eval split"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "N7BXLB1eHovl"
- },
- "outputs": [],
- "source": [
- "def dnn_split(df):\n",
- " df_train = df.iloc[:num_train]\n",
- " df_eval = df.iloc[num_train:]\n",
- "\n",
- " def feature_dict(df):\n",
- " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n",
- " features['numeric'] = df[NUMERIC_FEATURES].astype('float32').values\n",
- " return features\n",
- "\n",
- " x_train, y_train = feature_dict(df_train), df_train['TARGET_D'].astype(\n",
- " 'float32').values\n",
- " x_eval, y_eval = feature_dict(df_eval), df_eval['TARGET_D'].astype(\n",
- " 'float32').values\n",
- "\n",
- " return x_train, x_eval, y_train, y_eval"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "4yw6fekBtX7X"
- },
- "source": [
- "## Model"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "_rIuO0XYtZH2"
- },
- "outputs": [],
- "source": [
- "def embedding_dim(x):\n",
- " return int(x**.25) + 1\n",
- "\n",
- "\n",
- "def embedding_layer(vocab_size):\n",
- " return tf.keras.Sequential([\n",
- " tf.keras.layers.Embedding(\n",
- " input_dim=vocab_size,\n",
- " output_dim=embedding_dim(vocab_size),\n",
- " input_length=1),\n",
- " tf.keras.layers.Flatten(),\n",
- " ])\n",
- "\n",
- "\n",
- "def dnn_model(output_units):\n",
- " numeric_input = tf.keras.layers.Input(\n",
- " shape=(len(NUMERIC_FEATURES),), name='numeric')\n",
- "\n",
- " embedding_inputs = [\n",
- " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n",
- " for key in CATEGORICAL_FEATURES\n",
- " ]\n",
- "\n",
- " embedding_outputs = [\n",
- " embedding_layer(vocab_size=df[key].max() + 1)(input)\n",
- " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n",
- " ]\n",
- "\n",
- " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n",
- " deep_model = tf.keras.Sequential([\n",
- " tf.keras.layers.Dense(128, activation='relu'),\n",
- " tf.keras.layers.Dense(128, activation='relu'),\n",
- " tf.keras.layers.Dense(64, activation='relu'),\n",
- " tf.keras.layers.Dense(64, activation='relu'),\n",
- " tf.keras.layers.Dense(units=output_units),\n",
- " ])\n",
- " return tf.keras.Model(\n",
- " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "G5h7X6botcHl"
- },
- "source": [
- "## Loss"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "iJ9gpkC6tgP0"
- },
- "outputs": [],
- "source": [
- "if LOSS == 'mse':\n",
- " loss = tf.keras.losses.MeanSquaredError()\n",
- " output_units = 1\n",
- "\n",
- "if LOSS == 'ziln':\n",
- " loss = ltv.zero_inflated_lognormal_loss\n",
- " output_units = 3"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "_afFfIritjCM"
- },
- "outputs": [],
- "source": [
- "x_train, x_eval, y_train, y_eval = dnn_split(df)\n",
- "model = dnn_model(output_units)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Qj3kI7pyVwzO"
- },
- "outputs": [],
- "source": [
- "model.compile(optimizer=tf.keras.optimizers.Nadam(lr=LEARNING_RATE), loss=loss)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "KZSYxgWdwiXC"
- },
- "source": [
- "## Train"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Nwj9h5ysQDLp"
- },
- "outputs": [],
- "source": [
- "callbacks = [\n",
- " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n",
- " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n",
- "]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Vb5Tnld6hsfx"
- },
- "outputs": [],
- "source": [
- "history = model.fit(\n",
- " x=x_train,\n",
- " y=y_train,\n",
- " batch_size=2048,\n",
- " epochs=200,\n",
- " verbose=2,\n",
- " callbacks=callbacks,\n",
- " validation_data=(x_eval, y_eval)).history"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "J1sLSUdgvfa6"
- },
- "outputs": [],
- "source": [
- "pd.DataFrame(history)[['loss', 'val_loss']].plot();"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "jRKuZBqhvhT9"
- },
- "source": [
- "## Eval"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "q9_zNMd3vjNk"
- },
- "outputs": [],
- "source": [
- "if LOSS == 'mse':\n",
- " y_pred = model.predict(x=x_eval, batch_size=1024).flatten()\n",
- "\n",
- "if LOSS == 'ziln':\n",
- " logits = model.predict(x=x_eval, batch_size=1024)\n",
- " y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "SkfkUMUvUu_E"
- },
- "source": [
- "### Total Profit"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "AwfWAp8WQuns"
- },
- "outputs": [],
- "source": [
- "unit_costs = [0.4, 0.5, 0.6, 0.68, 0.7, 0.8, 0.9, 1.0]"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "zqi91dfCUxpx"
- },
- "outputs": [],
- "source": [
- "num_mailed = [np.sum(y_pred \u003e v) for v in unit_costs]\n",
- "num_mailed"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "ZgFjZUcuhScv"
- },
- "outputs": [],
- "source": [
- "baseline_total_profit = np.sum(y_eval - 0.68)\n",
- "baseline_total_profit"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "VwsFnin5U-R9"
- },
- "outputs": [],
- "source": [
- "total_profits = [np.sum(y_eval[y_pred \u003e v] - v) for v in unit_costs]\n",
- "total_profits"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "zROhsEWxnA5u"
- },
- "source": [
- "### Gini Coefficient"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "gRsJ7y-632h_"
- },
- "outputs": [],
- "source": [
- "gain = pd.DataFrame({\n",
- " 'lorenz': ltv.cumulative_true(y_eval, y_eval),\n",
- " 'baseline': ltv.cumulative_true(y_eval, x_eval['numeric'][:, 19]),\n",
- " 'model': ltv.cumulative_true(y_eval, y_pred),\n",
- "})"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "yg-ndbve4AL_"
- },
- "outputs": [],
- "source": [
- "num_customers = np.float32(gain.shape[0])\n",
- "gain['cumulative_customer'] = (np.arange(num_customers) + 1.) / num_customers"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "WEoAvuCj4OVy"
- },
- "outputs": [],
- "source": [
- "ax = gain[[\n",
- " 'cumulative_customer',\n",
- " 'lorenz',\n",
- " 'baseline',\n",
- " 'model',\n",
- "]].plot(\n",
- " x='cumulative_customer', figsize=(8, 5), legend=True)\n",
- "\n",
- "ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='lower right')\n",
- "\n",
- "ax.set_xlabel('Cumulative Fraction of Customers')\n",
- "ax.set_xticks(np.arange(0, 1.1, 0.1))\n",
- "ax.set_xlim((0, 1.))\n",
- "\n",
- "ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')\n",
- "ax.set_yticks(np.arange(0, 1.1, 0.1))\n",
- "ax.set_ylim((0, 1.05))\n",
- "ax.set_title('Gain Chart');"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "kzPqaiNO4iWC"
- },
- "outputs": [],
- "source": [
- "gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])\n",
- "gini"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "S84RitIa9PBu"
- },
- "source": [
- "### Calibration"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "X7sKbsEf6RvF"
- },
- "outputs": [],
- "source": [
- "df_decile = ltv.decile_stats(y_eval, y_pred)\n",
- "df_decile"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "DHdLqUqdL4hf"
- },
- "outputs": [],
- "source": [
- "ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)\n",
- "\n",
- "ax.set_title('Decile Chart')\n",
- "ax.set_xlabel('Prediction bucket')\n",
- "ax.set_ylabel('Average bucket value')\n",
- "ax.legend(['Label', 'Prediction'], loc='upper left');"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "nK6DQ89xU-d4"
- },
- "source": [
- "### Rank Correlation"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "I9qWGyY3WePz"
- },
- "outputs": [],
- "source": [
- "def spearmanr(\n",
- " x1: Sequence[float],\n",
- " x2: Sequence[float]) -\u003e float:\n",
- " \"\"\"Calculates spearmanr rank correlation coefficient.\n",
- "\n",
- " See https://docs.scipy.org/doc/scipy/reference/stats.html.\n",
- "\n",
- " Args:\n",
- " x1: 1D array_like.\n",
- " x2: 1D array_like.\n",
- "\n",
- " Returns:\n",
- " correlation: float.\n",
- " \"\"\"\n",
- " return stats.spearmanr(x1, x2, nan_policy='raise')[0]\n",
- "\n",
- "\n",
- "spearman_corr = spearmanr(y_eval, y_pred)\n",
- "spearman_corr"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "-i_AbqhXcurk"
- },
- "source": [
- "### All metrics together"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "Umqg1-0Bc1HS"
- },
- "outputs": [],
- "source": [
- "df_metrics = pd.DataFrame({\n",
- " 'model': MODEL,\n",
- " 'loss_function': LOSS,\n",
- " 'train_loss': history['loss'][-1],\n",
- " 'eval_loss': history['val_loss'][-1],\n",
- " 'label_positive': np.mean(y_eval \u003e 0),\n",
- " 'label_mean': y_eval.mean(),\n",
- " 'pred_mean': y_pred.mean(),\n",
- " 'decile_mape': df_decile['decile_mape'].mean(),\n",
- " 'baseline_gini': gini['normalized'][1],\n",
- " 'gini': gini['normalized'][2],\n",
- " 'spearman_corr': spearman_corr,\n",
- "}, index=[VERSION])"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "C_cM2Mc2SB3W"
- },
- "outputs": [],
- "source": [
- "for unit_cost, total_profit in zip(unit_costs, total_profits):\n",
- " df_metrics['total_profit_{:02d}'.format(int(unit_cost * 100))] = total_profit"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "iyMvsOtbRrXZ"
- },
- "outputs": [],
- "source": [
- "df_metrics.T"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "8uHtLKk1x0IE"
- },
- "source": [
- "## Save"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "L-fMkqWIm6X6"
- },
- "outputs": [],
- "source": [
- "output_path = OUTPUT_CSV_FOLDER"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "jpJJAbWEm94h"
- },
- "outputs": [],
- "source": [
- "if not os.path.isdir(output_path):\n",
- " os.makedirs(output_path)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "y4LcrTLOm_4B"
- },
- "outputs": [],
- "source": [
- "output_file = os.path.join(output_path, '{}_regression_{}_{}.csv'.format(MODEL, LOSS, VERSION))"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "4WOF7a-dnENp"
- },
- "outputs": [],
- "source": [
- "df_metrics.to_csv(output_file, index=False)"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "collapsed_sections": [],
- "last_runtime": {
- "build_target": "",
- "kind": "local"
- },
- "name": "regression.ipynb",
- "provenance": []
- },
- "kernelspec": {
- "display_name": "Python 3",
- "name": "python3"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gE76T8J7IsGC"
+ },
+ "outputs": [],
+ "source": [
+ "#@title Copyright 2019 The Lifetime Value Authors.\n",
+ "# Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+ "# you may not use this file except in compliance with the License.\n",
+ "# You may obtain a copy of the License at\n",
+ "#\n",
+ "# https://www.apache.org/licenses/LICENSE-2.0\n",
+ "#\n",
+ "# Unless required by applicable law or agreed to in writing, software\n",
+ "# distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+ "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+ "# See the License for the specific language governing permissions and\n",
+ "# limitations under the License.\n",
+ "# ============================================================================"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "sswTFWDv7HZd"
+ },
+ "source": [
+ "# KDD Cup 98 LTV Prediction"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "pBXE3Dz3NI4A"
+ },
+ "outputs": [],
+ "source": [
+ "import os\n",
+ "\n",
+ "import numpy as np\n",
+ "import pandas as pd\n",
+ "from scipy import stats\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import tensorflow as tf\n",
+ "import tensorflow_probability as tfp\n",
+ "from typing import Sequence\n",
+ "\n",
+ "# install and import ltv\n",
+ "# !pip install -q git+https://github.com/google/lifetime_value\n",
+ "!pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value\n",
+ "import lifetime_value as ltv"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Bq0Ah16lBmgV"
+ },
+ "outputs": [],
+ "source": [
+ "tfd = tfp.distributions\n",
+ "%config InlineBackend.figure_format='retina'\n",
+ "sns.set_style('whitegrid')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "2qN319qZK3IG"
+ },
+ "source": [
+ "## Configs"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "hNy_ybw_K19n"
+ },
+ "outputs": [],
+ "source": [
+ "MODEL = 'dnn'\n",
+ "# LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n",
+ "LOSS = 'mse' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n",
+ "LEARNING_RATE = 0.001 # @param { isTemplate: true}\n",
+ "VERSION = 0 # @param { isTemplate: true, type: 'integer'}\n",
+ "DATA_FOLDER = './tmp/kdd_cup_98/' # @param { isTemplate: true, type: 'string'}\n",
+ "OUTPUT_CSV_FOLDER = f'{DATA_FOLDER}result' # @param { isTemplate: true, type: 'string'}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "mDSR921CCEcL"
+ },
+ "source": [
+ "## Load data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "lHxp4rOGI02Q"
+ },
+ "source": [
+ "Download kdd_cup_98 data to /tmp/lifetime-value/kdd_cup_98"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Set it DATA_FOLDER as an environment variable\n",
+ "%env DATA_FOLDER=$DATA_FOLDER"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Dg3qtgJyJpdi"
+ },
+ "outputs": [],
+ "source": [
+ "%%bash\n",
+ "mkdir -p $DATA_FOLDER\n",
+ "wget -N https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98lrn.zip -P $DATA_FOLDER\n",
+ "wget -N https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98val.zip -P $DATA_FOLDER\n",
+ "wget -N https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/valtargt.txt -P $DATA_FOLDER\n",
+ "cd $DATA_FOLDER\n",
+ "unzip -n cup98lrn.zip\n",
+ "unzip -n cup98val.zip"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "a_LnLmQQRlYF"
+ },
+ "outputs": [],
+ "source": [
+ "df_train = pd.read_csv(f'{DATA_FOLDER}cup98LRN.txt')\n",
+ "num_train = df_train.shape[0]\n",
+ "df_eval = pd.read_csv(f'{DATA_FOLDER}cup98VAL.txt')\n",
+ "df_eval_target = pd.read_csv(f'{DATA_FOLDER}valtargt.txt')\n",
+ "df_eval = df_eval.merge(df_eval_target, on='CONTROLN')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ggQmy9wiP5M6"
+ },
+ "outputs": [],
+ "source": [
+ "df = pd.concat([df_train, df_eval], axis=0, sort=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "0rgxHpIyjaMH"
+ },
+ "source": [
+ "## Label distribution"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Xmpu_d3YjcFC"
+ },
+ "outputs": [],
+ "source": [
+ "y = df['TARGET_D'][:num_train]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yMr2EDRyK5Sb"
+ },
+ "outputs": [],
+ "source": [
+ "def plot_hist_log_scale(y):\n",
+ " max_val = y.max() + 1. \n",
+ " ax = pd.Series(y).hist(\n",
+ " figsize=(8, 5), bins = 10 ** np.linspace(0., np.log10(max_val), 20))\n",
+ "\n",
+ " plt.xlabel('Donation ($)')\n",
+ " plt.ylabel('Count')\n",
+ " # plt.title('Histogram of LTV')\n",
+ " plt.xticks(rotation='horizontal')\n",
+ " plt.legend(loc='upper left')\n",
+ " ax.set_xscale('log')\n",
+ " ax.grid(False)\n",
+ " # Hide the right and top spines\n",
+ " ax.spines['right'].set_visible(False)\n",
+ " ax.spines['top'].set_visible(False)\n",
+ " # Only show ticks on the left and bottom spines\n",
+ " ax.yaxis.set_ticks_position('left')\n",
+ " ax.xaxis.set_ticks_position('bottom')\n",
+ " plt.show()\n",
+ "\n",
+ " fig = ax.get_figure()\n",
+ " output_file = tf.io.gfile.GFile(\n",
+ " f'{DATA_FOLDER}histogram_kdd98_log_scale.pdf',\n",
+ " 'wb')\n",
+ " fig.savefig(output_file, bbox_inches='tight', format='pdf')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "KbwCzGkBOWhH"
+ },
+ "outputs": [],
+ "source": [
+ "plot_hist_log_scale(y[y>0])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "1XXMLbnlCdlN"
+ },
+ "source": [
+ "## Preprocess features"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "L1sBf_RSU3pR"
+ },
+ "source": [
+ "### Vocab"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "xB_ddsd_U_4e"
+ },
+ "outputs": [],
+ "source": [
+ "VOCAB_FEATURES = [\n",
+ " 'ODATEDW', # date of donor's first gift (YYMM)\n",
+ " 'OSOURCE', # donor acquisition mailing list\n",
+ " 'TCODE', # donor title code\n",
+ " 'STATE',\n",
+ " 'ZIP',\n",
+ " 'DOMAIN', # urbanicity level and socio-economic status of the neighborhood\n",
+ " 'CLUSTER', # socio-economic status\n",
+ " 'GENDER',\n",
+ " 'MAXADATE', # date of the most recent promotion received\n",
+ " 'MINRDATE',\n",
+ " 'LASTDATE',\n",
+ " 'FISTDATE',\n",
+ " 'RFA_2A',\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "f2oPZGVLRSPe"
+ },
+ "outputs": [],
+ "source": [
+ "df['ODATEDW'] = df['ODATEDW'].astype('str')\n",
+ "df['TCODE'] = df['TCODE'].apply(\n",
+ " lambda x: '{:03d}'.format(x // 1000 if x > 1000 else x))\n",
+ "df['ZIP'] = df['ZIP'].str.slice(0, 5)\n",
+ "df['MAXADATE'] = df['MAXADATE'].astype('str')\n",
+ "df['MINRDATE'] = df['MINRDATE'].astype('str')\n",
+ "df['LASTDATE'] = df['LASTDATE'].astype('str')\n",
+ "df['FISTDATE'] = df['FISTDATE'].astype('str')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "isL9Ofv9JLAP"
+ },
+ "outputs": [],
+ "source": [
+ "def label_encoding(y, frequency_threshold=100):\n",
+ " #value_counts = pd.value_counts(y) # raises FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. Use pd.Series(obj).value_counts() instead.\n",
+ " value_counts = pd.Series(y).value_counts()\n",
+ " categories = value_counts[\n",
+ " value_counts >= frequency_threshold].index.to_numpy()\n",
+ " # 0 indicates the unknown category.\n",
+ " return pd.Categorical(y, categories=categories).codes + 1"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "BgXGO5D0OdJP"
+ },
+ "outputs": [],
+ "source": [
+ "for key in VOCAB_FEATURES:\n",
+ " df[key] = label_encoding(df[key])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "kZkmnJ93Zrjw"
+ },
+ "source": [
+ "### Indicator"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "tGBpMfaGhCD0"
+ },
+ "outputs": [],
+ "source": [
+ "MAIL_ORDER_RESPONSES = [\n",
+ " 'MBCRAFT',\n",
+ " 'MBGARDEN',\n",
+ " 'MBBOOKS',\n",
+ " 'MBCOLECT',\n",
+ " 'MAGFAML',\n",
+ " 'MAGFEM',\n",
+ " 'MAGMALE',\n",
+ " 'PUBGARDN',\n",
+ " 'PUBCULIN',\n",
+ " 'PUBHLTH',\n",
+ " 'PUBDOITY',\n",
+ " 'PUBNEWFN',\n",
+ " 'PUBPHOTO',\n",
+ " 'PUBOPP',\n",
+ " 'RFA_2F',\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "4V-DeOZFZhjB"
+ },
+ "outputs": [],
+ "source": [
+ "INDICATOR_FEATURES = [\n",
+ " 'AGE', # age decile, 0 indicates unknown\n",
+ " 'NUMCHLD',\n",
+ " 'INCOME',\n",
+ " 'WEALTH1',\n",
+ " 'HIT',\n",
+ "] + MAIL_ORDER_RESPONSES"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "U9y5qA1vZ0kz"
+ },
+ "outputs": [],
+ "source": [
+ "df['AGE'] = pd.qcut(df['AGE'].values, 10).codes + 1\n",
+ "df['NUMCHLD'] = df['NUMCHLD'].apply(lambda x: 0 if np.isnan(x) else int(x))\n",
+ "df['INCOME'] = df['INCOME'].apply(lambda x: 0 if np.isnan(x) else int(x))\n",
+ "df['WEALTH1'] = df['WEALTH1'].apply(lambda x: 0 if np.isnan(x) else int(x) + 1)\n",
+ "df['HIT'] = pd.qcut(df['HIT'].values, q=50, duplicates='drop').codes\n",
+ "\n",
+ "for col in MAIL_ORDER_RESPONSES:\n",
+ " df[col] = pd.qcut(df[col].values, q=20, duplicates='drop').codes + 1"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8DOO_2a-U6gr"
+ },
+ "source": [
+ "### Numeric"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "rqVteSLDiLVr"
+ },
+ "outputs": [],
+ "source": [
+ "NUMERIC_FEATURES = [\n",
+ " # binary\n",
+ " 'MAILCODE', # bad address\n",
+ " 'NOEXCH', # do not exchange\n",
+ " 'RECINHSE', # donor has given to PVA's in house program\n",
+ " 'RECP3', # donor has given to PVA's P3 program\n",
+ " 'RECPGVG', # planned giving record\n",
+ " 'RECSWEEP', # sweepstakes record\n",
+ " 'HOMEOWNR', # home owner\n",
+ " 'CHILD03',\n",
+ " 'CHILD07',\n",
+ " 'CHILD12',\n",
+ " 'CHILD18',\n",
+ "\n",
+ " # continuous\n",
+ " 'CARDPROM',\n",
+ " 'NUMPROM',\n",
+ " 'CARDPM12',\n",
+ " 'NUMPRM12',\n",
+ " 'RAMNTALL',\n",
+ " 'NGIFTALL',\n",
+ " 'MINRAMNT',\n",
+ " 'MAXRAMNT',\n",
+ " 'LASTGIFT',\n",
+ " 'AVGGIFT',\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "xMRP05Ztic0A"
+ },
+ "outputs": [],
+ "source": [
+ "df['MAILCODE'] = (df['MAILCODE'] == 'B').astype('float32')\n",
+ "df['PVASTATE'] = df['PVASTATE'].isin(['P', 'E']).astype('float32')\n",
+ "df['NOEXCH'] = df['NOEXCH'].isin(['X', '1']).astype('float32')\n",
+ "df['RECINHSE'] = (df['RECINHSE'] == 'X').astype('float32')\n",
+ "df['RECP3'] = (df['RECP3'] == 'X').astype('float32')\n",
+ "df['RECPGVG'] = (df['RECPGVG'] == 'X').astype('float32')\n",
+ "df['RECSWEEP'] = (df['RECSWEEP'] == 'X').astype('float32')\n",
+ "df['HOMEOWNR'] = (df['HOMEOWNR'] == 'H').astype('float32')\n",
+ "df['CHILD03'] = df['CHILD03'].isin(['M', 'F', 'B']).astype('float32')\n",
+ "df['CHILD07'] = df['CHILD07'].isin(['M', 'F', 'B']).astype('float32')\n",
+ "df['CHILD12'] = df['CHILD12'].isin(['M', 'F', 'B']).astype('float32')\n",
+ "df['CHILD18'] = df['CHILD18'].isin(['M', 'F', 'B']).astype('float32')\n",
+ "\n",
+ "df['CARDPROM'] = df['CARDPROM'] / 100\n",
+ "df['NUMPROM'] = df['NUMPROM'] / 100\n",
+ "df['CARDPM12'] = df['CARDPM12'] / 100\n",
+ "df['NUMPRM12'] = df['NUMPRM12'] / 100\n",
+ "df['RAMNTALL'] = np.log1p(df['RAMNTALL'])\n",
+ "df['NGIFTALL'] = np.log1p(df['NGIFTALL'])\n",
+ "df['MINRAMNT'] = np.log1p(df['MINRAMNT'])\n",
+ "df['MAXRAMNT'] = np.log1p(df['MAXRAMNT'])\n",
+ "df['LASTGIFT'] = np.log1p(df['LASTGIFT'])\n",
+ "df['AVGGIFT'] = np.log1p(df['AVGGIFT'])"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "GoLg1PvWuCT_"
+ },
+ "source": [
+ "### All"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "lSnNgjBCuJdb"
+ },
+ "outputs": [],
+ "source": [
+ "CATEGORICAL_FEATURES = VOCAB_FEATURES + INDICATOR_FEATURES\n",
+ "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8HJBvvCxRPg3"
+ },
+ "source": [
+ "## Train/eval split"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "N7BXLB1eHovl"
+ },
+ "outputs": [],
+ "source": [
+ "def dnn_split(df):\n",
+ " df_train = df.iloc[:num_train]\n",
+ " df_eval = df.iloc[num_train:]\n",
+ "\n",
+ " def feature_dict(df):\n",
+ " features = dict()\n",
+ " \n",
+ " for k, v in dict(df[CATEGORICAL_FEATURES]).items():\n",
+ " features[k] = v.values \n",
+ "\n",
+ " features['numeric'] = df[NUMERIC_FEATURES].astype('float32').values \n",
+ "\n",
+ " return features\n",
+ "\n",
+ " x_train, y_train = feature_dict(df_train), df_train['TARGET_D'].astype(\n",
+ " 'float32').values\n",
+ " x_eval, y_eval = feature_dict(df_eval), df_eval['TARGET_D'].astype(\n",
+ " 'float32').values\n",
+ "\n",
+ " return x_train, x_eval, y_train, y_eval"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "4yw6fekBtX7X"
+ },
+ "source": [
+ "## Model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_rIuO0XYtZH2"
+ },
+ "outputs": [],
+ "source": [
+ "def embedding_dim(x):\n",
+ " return int(x**.25) + 1\n",
+ "\n",
+ "\n",
+ "def embedding_layer(vocab_size):\n",
+ " return tf.keras.Sequential([\n",
+ " tf.keras.layers.Embedding(\n",
+ " input_dim=vocab_size,\n",
+ " output_dim=embedding_dim(vocab_size)),\n",
+ " tf.keras.layers.Flatten(),\n",
+ " ])\n",
+ "\n",
+ "# NOTE: The call to the fit method fails if the numeric, multi-dimensional, feature is not the last parameter\n",
+ "def dnn_model(output_units):\n",
+ " numeric_input = tf.keras.layers.Input(\n",
+ " shape=(len(NUMERIC_FEATURES),), name='numeric')\n",
+ " numeric_inputs = [numeric_input] \n",
+ "\n",
+ " embedding_inputs = [\n",
+ " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n",
+ " for key in CATEGORICAL_FEATURES\n",
+ " ]\n",
+ "\n",
+ " embedding_outputs = [\n",
+ " embedding_layer(vocab_size=df[key].max() + 1)(input)\n",
+ " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n",
+ " ]\n",
+ "\n",
+ " deep_input = tf.keras.layers.concatenate(embedding_outputs + numeric_inputs)\n",
+ " \n",
+ " deep_model = tf.keras.Sequential([\n",
+ " tf.keras.layers.Dense(128, activation='relu'),\n",
+ " tf.keras.layers.Dense(128, activation='relu'),\n",
+ " tf.keras.layers.Dense(64, activation='relu'),\n",
+ " tf.keras.layers.Dense(64, activation='relu'),\n",
+ " tf.keras.layers.Dense(units=output_units),\n",
+ " ])\n",
+ " return tf.keras.Model(\n",
+ " inputs=embedding_inputs + numeric_inputs, outputs=deep_model(deep_input))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "G5h7X6botcHl"
+ },
+ "source": [
+ "## Loss"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "iJ9gpkC6tgP0"
+ },
+ "outputs": [],
+ "source": [
+ "if LOSS == 'mse':\n",
+ " loss = tf.keras.losses.MeanSquaredError()\n",
+ " output_units = 1\n",
+ "\n",
+ "if LOSS == 'ziln':\n",
+ " loss = ltv.zero_inflated_lognormal_loss\n",
+ " output_units = 3"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "_afFfIritjCM"
+ },
+ "outputs": [],
+ "source": [
+ "x_train, x_eval, y_train, y_eval = dnn_split(df)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "model = dnn_model(output_units)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Qj3kI7pyVwzO"
+ },
+ "outputs": [],
+ "source": [
+ "model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=LEARNING_RATE), loss=loss)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "KZSYxgWdwiXC"
+ },
+ "source": [
+ "## Train"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Nwj9h5ysQDLp"
+ },
+ "outputs": [],
+ "source": [
+ "callbacks = [\n",
+ " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n",
+ " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n",
+ "]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Vb5Tnld6hsfx"
+ },
+ "outputs": [],
+ "source": [
+ "%%time\n",
+ "# y array needs to have a two dimensional shape to work with ziln loss function\n",
+ "# so we use [:, np.newaxis] to make the data two-dimensional for the fit function call\n",
+ "history = model.fit(\n",
+ " x=x_train,\n",
+ " y=y_train[:, np.newaxis],\n",
+ " batch_size=2048,\n",
+ " epochs=200,\n",
+ " verbose=2,\n",
+ " callbacks=callbacks,\n",
+ " validation_data=(x_eval, y_eval[:, np.newaxis])).history"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "J1sLSUdgvfa6"
+ },
+ "outputs": [],
+ "source": [
+ "pd.DataFrame(history)[['loss', 'val_loss']].plot();"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "jRKuZBqhvhT9"
+ },
+ "source": [
+ "## Eval"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "q9_zNMd3vjNk"
+ },
+ "outputs": [],
+ "source": [
+ "if LOSS == 'mse':\n",
+ " y_pred = model.predict(x=x_eval, batch_size=1024).flatten()\n",
+ "\n",
+ "if LOSS == 'ziln':\n",
+ " logits = model.predict(x=x_eval, batch_size=1024)\n",
+ " y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "SkfkUMUvUu_E"
+ },
+ "source": [
+ "### Total Profit"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "AwfWAp8WQuns"
+ },
+ "outputs": [],
+ "source": [
+ "unit_costs = [0.4, 0.5, 0.6, 0.68, 0.7, 0.8, 0.9, 1.0]"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "zqi91dfCUxpx"
+ },
+ "outputs": [],
+ "source": [
+ "num_mailed = [np.sum(y_pred > v) for v in unit_costs]\n",
+ "num_mailed"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "ZgFjZUcuhScv"
+ },
+ "outputs": [],
+ "source": [
+ "baseline_total_profit = np.sum(y_eval - 0.68)\n",
+ "baseline_total_profit"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "VwsFnin5U-R9"
+ },
+ "outputs": [],
+ "source": [
+ "total_profits = [np.sum(y_eval[y_pred > v] - v) for v in unit_costs]\n",
+ "total_profits"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "zROhsEWxnA5u"
+ },
+ "source": [
+ "### Gini Coefficient"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "gRsJ7y-632h_"
+ },
+ "outputs": [],
+ "source": [
+ "gain = pd.DataFrame({\n",
+ " 'lorenz': ltv.cumulative_true(y_eval, y_eval),\n",
+ " 'baseline': ltv.cumulative_true(y_eval, x_eval['numeric'][:, 19]),\n",
+ " 'model': ltv.cumulative_true(y_eval, y_pred),\n",
+ "})"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "yg-ndbve4AL_"
+ },
+ "outputs": [],
+ "source": [
+ "num_customers = np.float32(gain.shape[0])\n",
+ "gain['cumulative_customer'] = (np.arange(num_customers) + 1.) / num_customers"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "WEoAvuCj4OVy"
+ },
+ "outputs": [],
+ "source": [
+ "ax = gain[[\n",
+ " 'cumulative_customer',\n",
+ " 'lorenz',\n",
+ " 'baseline',\n",
+ " 'model',\n",
+ "]].plot(\n",
+ " x='cumulative_customer', figsize=(8, 5), legend=True)\n",
+ "\n",
+ "ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='lower right')\n",
+ "\n",
+ "ax.set_xlabel('Cumulative Fraction of Customers')\n",
+ "ax.set_xticks(np.arange(0, 1.1, 0.1))\n",
+ "ax.set_xlim((0, 1.))\n",
+ "\n",
+ "ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')\n",
+ "ax.set_yticks(np.arange(0, 1.1, 0.1))\n",
+ "ax.set_ylim((0, 1.05))\n",
+ "ax.set_title('Gain Chart');"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "kzPqaiNO4iWC"
+ },
+ "outputs": [],
+ "source": [
+ "gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])\n",
+ "gini"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "S84RitIa9PBu"
+ },
+ "source": [
+ "### Calibration"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "X7sKbsEf6RvF"
+ },
+ "outputs": [],
+ "source": [
+ "df_decile = ltv.decile_stats(y_eval, y_pred)\n",
+ "df_decile"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "DHdLqUqdL4hf"
+ },
+ "outputs": [],
+ "source": [
+ "ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)\n",
+ "\n",
+ "ax.set_title('Decile Chart')\n",
+ "ax.set_xlabel('Prediction bucket')\n",
+ "ax.set_ylabel('Average bucket value')\n",
+ "ax.legend(['Label', 'Prediction'], loc='upper left');"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "nK6DQ89xU-d4"
+ },
+ "source": [
+ "### Rank Correlation"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "I9qWGyY3WePz"
+ },
+ "outputs": [],
+ "source": [
+ "def spearmanr(\n",
+ " x1: Sequence[float],\n",
+ " x2: Sequence[float]) -> float:\n",
+ " \"\"\"Calculates spearmanr rank correlation coefficient.\n",
+ "\n",
+ " See https://docs.scipy.org/doc/scipy/reference/stats.html.\n",
+ "\n",
+ " Args:\n",
+ " x1: 1D array_like.\n",
+ " x2: 1D array_like.\n",
+ "\n",
+ " Returns:\n",
+ " correlation: float.\n",
+ " \"\"\"\n",
+ " return stats.spearmanr(x1, x2, nan_policy='raise')[0]\n",
+ "\n",
+ "\n",
+ "spearman_corr = spearmanr(y_eval, y_pred)\n",
+ "spearman_corr"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "-i_AbqhXcurk"
+ },
+ "source": [
+ "### All metrics together"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "Umqg1-0Bc1HS"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics = pd.DataFrame({\n",
+ " 'model': MODEL,\n",
+ " 'loss_function': LOSS,\n",
+ " 'train_loss': history['loss'][-1],\n",
+ " 'eval_loss': history['val_loss'][-1],\n",
+ " 'label_positive': np.mean(y_eval > 0),\n",
+ " 'label_mean': y_eval.mean(),\n",
+ " 'pred_mean': y_pred.mean(),\n",
+ " 'decile_mape': df_decile['decile_mape'].mean(),\n",
+ " 'baseline_gini': gini['normalized'][1],\n",
+ " 'gini': gini['normalized'][2],\n",
+ " 'spearman_corr': spearman_corr,\n",
+ "}, index=[VERSION])"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "C_cM2Mc2SB3W"
+ },
+ "outputs": [],
+ "source": [
+ "for unit_cost, total_profit in zip(unit_costs, total_profits):\n",
+ " df_metrics['total_profit_{:02d}'.format(int(unit_cost * 100))] = total_profit"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "iyMvsOtbRrXZ"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics.T"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "id": "8uHtLKk1x0IE"
+ },
+ "source": [
+ "## Save"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "L-fMkqWIm6X6"
+ },
+ "outputs": [],
+ "source": [
+ "output_path = OUTPUT_CSV_FOLDER"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "jpJJAbWEm94h"
+ },
+ "outputs": [],
+ "source": [
+ "if not os.path.isdir(output_path):\n",
+ " os.makedirs(output_path)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "y4LcrTLOm_4B"
+ },
+ "outputs": [],
+ "source": [
+ "output_file = os.path.join(output_path, '{}_regression_{}_{}.csv'.format(MODEL, LOSS, VERSION))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "id": "4WOF7a-dnENp"
+ },
+ "outputs": [],
+ "source": [
+ "df_metrics.to_csv(output_file, index=False)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df_metrics"
+ ]
+   }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "last_runtime": {
+ "build_target": "",
+ "kind": "local"
+ },
+ "name": "regression.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.7"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
}
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..ec4905e
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,159 @@
+absl-py==2.1.0
+anyio==4.6.2.post1
+argon2-cffi==23.1.0
+argon2-cffi-bindings==21.2.0
+arrow==1.3.0
+asttokens==2.4.1
+astunparse==1.6.3
+async-lru==2.0.4
+attrs==24.2.0
+babel==2.16.0
+beautifulsoup4==4.12.3
+bleach==6.2.0
+certifi==2024.8.30
+cffi==1.17.1
+charset-normalizer==3.4.0
+cloudpickle==3.1.0
+comm==0.2.2
+contourpy==1.3.1
+cycler==0.12.1
+debugpy==1.8.8
+decorator==5.1.1
+defusedxml==0.7.1
+dm-tree==0.1.8
+executing==2.1.0
+fastjsonschema==2.20.0
+flatbuffers==24.3.25
+fonttools==4.55.0
+fqdn==1.5.1
+gast==0.6.0
+google-pasta==0.2.0
+grpcio==1.68.0
+h11==0.14.0
+h5py==3.12.1
+httpcore==1.0.7
+httpx==0.27.2
+idna==3.10
+ipykernel==6.29.5
+ipython==8.29.0
+ipywidgets==8.1.5
+isoduration==20.11.0
+jedi==0.19.2
+Jinja2==3.1.4
+joblib==1.4.2
+json5==0.9.28
+jsonpointer==3.0.0
+jsonschema==4.23.0
+jsonschema-specifications==2024.10.1
+jupyter==1.1.1
+jupyter-console==6.6.3
+jupyter-events==0.10.0
+jupyter-lsp==2.2.5
+jupyter_client==8.6.3
+jupyter_core==5.7.2
+jupyter_server==2.14.2
+jupyter_server_terminals==0.5.3
+jupyterlab==4.2.6
+jupyterlab_pygments==0.3.0
+jupyterlab_server==2.27.3
+jupyterlab_widgets==3.0.13
+kaggle==1.6.17
+keras==3.6.0
+kiwisolver==1.4.7
+libclang==18.1.1
+lifetime_value @ git+https://github.com/seyedrezamirkhani/lifetime_value@5a4feaa2e64856d4b5e8ae355d61d330b653e1c7
+Markdown==3.7
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+matplotlib==3.9.2
+matplotlib-inline==0.1.7
+mdurl==0.1.2
+mistune==3.0.2
+ml-dtypes==0.4.1
+namex==0.0.8
+nbclient==0.10.0
+nbconvert==7.16.4
+nbformat==5.10.4
+nest-asyncio==1.6.0
+notebook==7.2.2
+notebook_shim==0.2.4
+numpy==2.0.2
+nvidia-cublas-cu12==12.5.3.2
+nvidia-cuda-cupti-cu12==12.5.82
+nvidia-cuda-nvcc-cu12==12.5.82
+nvidia-cuda-nvrtc-cu12==12.5.82
+nvidia-cuda-runtime-cu12==12.5.82
+nvidia-cudnn-cu12==9.3.0.75
+nvidia-cufft-cu12==11.2.3.61
+nvidia-curand-cu12==10.3.6.82
+nvidia-cusolver-cu12==11.6.3.83
+nvidia-cusparse-cu12==12.5.1.3
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.5.82
+opt_einsum==3.4.0
+optree==0.13.1
+overrides==7.7.0
+packaging==24.2
+pandas==2.2.3
+pandocfilters==1.5.1
+parso==0.8.4
+pexpect==4.9.0
+pillow==11.0.0
+platformdirs==4.3.6
+prometheus_client==0.21.0
+prompt_toolkit==3.0.48
+protobuf==5.28.3
+psutil==6.1.0
+ptyprocess==0.7.0
+pure_eval==0.2.3
+pycparser==2.22
+pydot==3.0.2
+Pygments==2.18.0
+pyparsing==3.2.0
+python-dateutil==2.9.0.post0
+python-json-logger==2.0.7
+python-slugify==8.0.4
+pytz==2024.2
+PyYAML==6.0.2
+pyzmq==26.2.0
+referencing==0.35.1
+requests==2.32.3
+rfc3339-validator==0.1.4
+rfc3986-validator==0.1.1
+rich==13.9.4
+rpds-py==0.21.0
+scikit-learn==1.5.2
+scipy==1.14.1
+seaborn==0.13.2
+Send2Trash==1.8.3
+setuptools==75.1.0
+six==1.16.0
+sniffio==1.3.1
+soupsieve==2.6
+stack-data==0.6.3
+tensorboard==2.18.0
+tensorboard-data-server==0.7.2
+tensorflow==2.18.0
+tensorflow-probability==0.25.0
+termcolor==2.5.0
+terminado==0.18.1
+text-unidecode==1.3
+tf_keras==2.18.0
+threadpoolctl==3.5.0
+tinycss2==1.4.0
+tornado==6.4.1
+tqdm==4.67.0
+traitlets==5.14.3
+types-python-dateutil==2.9.0.20241003
+typing_extensions==4.12.2
+tzdata==2024.2
+uri-template==1.3.0
+urllib3==2.2.3
+wcwidth==0.2.13
+webcolors==24.11.1
+webencodings==0.5.1
+websocket-client==1.8.0
+Werkzeug==3.1.3
+wheel==0.44.0
+widgetsnbextension==4.0.13
+wrapt==1.16.0
diff --git a/setup.py b/setup.py
index 6d3da28..903ce81 100644
--- a/setup.py
+++ b/setup.py
@@ -29,7 +29,7 @@
install_requires=[
'numpy >= 1.11.1',
'pandas',
- 'sklearn',
+ 'scikit-learn',
'tensorflow',
'tensorflow-probability',
'tqdm',