diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a2f0b14
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.ipynb_checkpoints
+__pycache__
+
+notebooks/kdd_cup_98/tmp/
+notebooks/kaggle_acquire_valued_shoppers_challenge/tmp/
diff --git a/README.md b/README.md
index 28e880d..3bb863f 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,77 @@
+# Updating the project to work with TensorFlow 2.18
+
+This repo is a fork of Google's [Customer Lifetime Value](https://github.com/google/lifetime_value) project. A pull request for these changes has been created. The changes in this repo enable the use of TensorFlow 2.18.
+
+The code has been tested on the Ubuntu 24 operating system, running Python 3.12 with an NVIDIA RTX A5000 graphics card.
+
+## List of changes
+
+- updated the package name from sklearn to scikit-learn in setup.py.
+
+- updated the notebooks in the notebooks folder to:
+
+  - use a DATA_FOLDER variable for the location of input and output files.
+
+  - replace `%%shell`/`%%script` blocks with `%%bash`, since those magics are no longer supported.
+
+  - add an extra dimension to y_train and y_eval in the call to *fit*, making them two-dimensional arrays. Without this, the ziln loss (ltv.zero_inflated_lognormal_loss) fails, because it checks that the target variable is two-dimensional (a minimal sketch appears in the TLDR section below).
+
+  - remove quote characters around the company variable in calls to the pandas *query* function in notebooks of the *kaggle_acquire_valued_shoppers_challenge* folder. Quoting may have worked in previous versions of pandas, but it silently returns an empty dataframe if a string is used as a query value against a numeric column (a short example appears in the TLDR section below).
+
+  - replace references to LinearModel with a Sequential linear model, as the LinearModel class is no longer supported.
+
+  - move the numeric input field in kdd_cup_98/regression.ipynb to the last parameter. Due to the shape of this parameter (21,) and the presence of other features, TensorFlow throws an error during the call to the *fit* method if this parameter is not last.
+
+- added environment.yml to record the packages of the conda environment used to build this project. This includes NVIDIA libraries.
+
+- added a requirements.txt file.
+
+## TLDR
+
+### kaggle acquire valued shoppers challenge
+
+There are three notebooks in the folder *notebooks/kaggle_acquire_valued_shoppers_challenge*:
+
+- **preprocess_data.ipynb** contains the code for processing the raw transaction file into company-specific feature files. This code is repeated in the other two notebooks. The preprocessing involves:
+
+  - Filtering records. Only transactions with positive values are selected; this excludes all returns, which have negative values, so neither the label nor the calibration value reflects returns.
+
+  - Generating the calibration value: the total purchase amount for the first day of shopping.
+
+  - Generating the calibration attributes: the 'chain', 'dept', 'category', 'brand' and 'productmeasure' values of the most expensive transaction on the first day of shopping. All other transactions are ignored, and any null values for these attributes are replaced by UNKNOWN.
+
+  - Generating the label/holdout value: the total amount purchased by a customer in the year following the first day of shopping.
+
+The zero_inflated_lognormal_loss function, used by both the regression and classification notebooks, requires three inputs, which are generated as the three output nodes of these models.
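+
+As a minimal, self-contained sketch of how the three output nodes and the two-dimensional targets fit together (synthetic data and an illustrative one-feature linear model, not the notebooks' exact code):
+
+```
+import numpy as np
+import tensorflow as tf
+import lifetime_value as ltv
+
+x_train = np.random.rand(256, 1).astype('float32')  # one numeric feature
+y_train = np.random.rand(256).astype('float32')     # non-negative LTV labels
+
+# Three output units: a classification logit for a nonzero purchase, plus the
+# loc and scale parameters of the lognormal distribution.
+model = tf.keras.Sequential([
+    tf.keras.layers.Input(shape=(1,)),
+    tf.keras.layers.Dense(3, activation=None),
+])
+model.compile(loss=ltv.zero_inflated_lognormal_loss,
+              optimizer=tf.keras.optimizers.Adam(learning_rate=0.0002))
+
+# The loss checks that the targets are two-dimensional, hence the extra axis.
+model.fit(x=x_train, y=y_train[:, np.newaxis], batch_size=64, epochs=1)
+```
+
+The regression and classification snippets below then turn the three output node values into predictions.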
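+
+The pandas *query* change from the list above can be seen with a small example (hypothetical values; `company` is read as a numeric column):
+
+```
+import pandas as pd
+
+df = pd.DataFrame({'company': [104900040, 10000],
+                   'purchaseamount': [9.99, 5.00]})
+
+df.query("company=='104900040'")  # silently returns an empty dataframe:
+                                  # a string compared against a numeric column
+df.query("company==104900040")    # returns the matching row
+```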
+
+- **regression.ipynb**
+
+To predict with the regression model, call the model's *predict* function, then pass all three output node values to the *zero_inflated_lognormal_pred* function, e.g.
+
+```
+logits = model.predict(x=x_eval, batch_size=1024)
+y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()
+```
+
+- **classification.ipynb**
+
+To predict with the classification model, call the model's *predict* function, then apply the sigmoid function to the first of the three output node values, e.g.
+
+```
+logits = model.predict(x=x_eval, batch_size=1024)
+y_pred = K.sigmoid(logits[..., :1]).numpy().flatten()
+```
+
 # Lifetime Value
 
 Accurate predictions of customers’ lifetime value (LTV) given their attributes
@@ -32,20 +106,20 @@ A Deep Probabilistic Model for Customer Lifetime Value Prediction.
 
 The easiest way is probably using pip:
 
 ```
-pip install -q git+https://github.com/google/lifetime_value
+pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value
 ```
 
 If you are using a machine without admin rights, you can do:
 
 ```
-pip install -q git+https://github.com/google/lifetime_value --user
+pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value --user
 ```
 
 If you are using [Google Colab](https://colab.research.google.com/), just add "!" to the beginning:
 
 ```
-!pip install -q git+https://github.com/google/lifetime_value
+!pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value
 ```
 
 Package works for python 3 only.
diff --git a/environment.yml b/environment.yml
new file mode 100644
index 0000000..6890362
--- /dev/null
+++ b/environment.yml
@@ -0,0 +1,184 @@
+name: clv-google
+channels:
+  - defaults
+dependencies:
+  - _libgcc_mutex=0.1=main
+  - _openmp_mutex=5.1=1_gnu
+  - bzip2=1.0.8=h5eee18b_6
+  - ca-certificates=2024.9.24=h06a4308_0
+  - expat=2.6.3=h6a678d5_0
+  - ld_impl_linux-64=2.40=h12ee557_0
+  - libffi=3.4.4=h6a678d5_1
+  - libgcc-ng=11.2.0=h1234567_1
+  - libgomp=11.2.0=h1234567_1
+  - libstdcxx-ng=11.2.0=h1234567_1
+  - libuuid=1.41.5=h5eee18b_0
+  - ncurses=6.4=h6a678d5_0
+  - openssl=3.0.15=h5eee18b_0
+  - pip=24.2=py312h06a4308_0
+  - python=3.12.7=h5148396_0
+  - readline=8.2=h5eee18b_0
+  - setuptools=75.1.0=py312h06a4308_0
+  - sqlite=3.45.3=h5eee18b_0
+  - tk=8.6.14=h39e8969_0
+  - wheel=0.44.0=py312h06a4308_0
+  - xz=5.4.6=h5eee18b_1
+  - zlib=1.2.13=h5eee18b_1
+  - pip:
+    - absl-py==2.1.0
+    - anyio==4.6.2.post1
+    - argon2-cffi==23.1.0
+    - argon2-cffi-bindings==21.2.0
+    - arrow==1.3.0
+    - asttokens==2.4.1
+    - astunparse==1.6.3
+    - async-lru==2.0.4
+    - attrs==24.2.0
+    - babel==2.16.0
+    - beautifulsoup4==4.12.3
+    - bleach==6.2.0
+    - certifi==2024.8.30
+    - cffi==1.17.1
+    - charset-normalizer==3.4.0
+    - cloudpickle==3.1.0
+    - comm==0.2.2
+    - contourpy==1.3.1
+    - cycler==0.12.1
+    - debugpy==1.8.8
+    - decorator==5.1.1
+    - defusedxml==0.7.1
+    - dm-tree==0.1.8
+    - executing==2.1.0
+    - fastjsonschema==2.20.0
+    - flatbuffers==24.3.25
+    - fonttools==4.55.0
+    - fqdn==1.5.1
+    - gast==0.6.0
+    - google-pasta==0.2.0
+    - grpcio==1.68.0
+    - h11==0.14.0
+    - h5py==3.12.1
+    - httpcore==1.0.7
+    - httpx==0.27.2
+    - idna==3.10
+    - ipykernel==6.29.5
+    - ipython==8.29.0
+    - ipywidgets==8.1.5
+    - isoduration==20.11.0
+    - jedi==0.19.2
+    - jinja2==3.1.4
+    - joblib==1.4.2
+    - json5==0.9.28
+    - jsonpointer==3.0.0
+    - jsonschema==4.23.0
+    - jsonschema-specifications==2024.10.1
+    - jupyter==1.1.1
+    - jupyter-client==8.6.3
+    - jupyter-console==6.6.3
+    - jupyter-core==5.7.2
+    - jupyter-events==0.10.0
+    - jupyter-lsp==2.2.5
+ - jupyter-server==2.14.2 + - jupyter-server-terminals==0.5.3 + - jupyterlab==4.2.6 + - jupyterlab-pygments==0.3.0 + - jupyterlab-server==2.27.3 + - jupyterlab-widgets==3.0.13 + - kaggle==1.6.17 + - keras==3.6.0 + - kiwisolver==1.4.7 + - libclang==18.1.1 + - lifetime-value==0.1 + - markdown==3.7 + - markdown-it-py==3.0.0 + - markupsafe==3.0.2 + - matplotlib==3.9.2 + - matplotlib-inline==0.1.7 + - mdurl==0.1.2 + - mistune==3.0.2 + - ml-dtypes==0.4.1 + - namex==0.0.8 + - nbclient==0.10.0 + - nbconvert==7.16.4 + - nbformat==5.10.4 + - nest-asyncio==1.6.0 + - notebook==7.2.2 + - notebook-shim==0.2.4 + - numpy==2.0.2 + - nvidia-cublas-cu12==12.5.3.2 + - nvidia-cuda-cupti-cu12==12.5.82 + - nvidia-cuda-nvcc-cu12==12.5.82 + - nvidia-cuda-nvrtc-cu12==12.5.82 + - nvidia-cuda-runtime-cu12==12.5.82 + - nvidia-cudnn-cu12==9.3.0.75 + - nvidia-cufft-cu12==11.2.3.61 + - nvidia-curand-cu12==10.3.6.82 + - nvidia-cusolver-cu12==11.6.3.83 + - nvidia-cusparse-cu12==12.5.1.3 + - nvidia-nccl-cu12==2.21.5 + - nvidia-nvjitlink-cu12==12.5.82 + - opt-einsum==3.4.0 + - optree==0.13.1 + - overrides==7.7.0 + - packaging==24.2 + - pandas==2.2.3 + - pandocfilters==1.5.1 + - parso==0.8.4 + - pexpect==4.9.0 + - pillow==11.0.0 + - platformdirs==4.3.6 + - prometheus-client==0.21.0 + - prompt-toolkit==3.0.48 + - protobuf==5.28.3 + - psutil==6.1.0 + - ptyprocess==0.7.0 + - pure-eval==0.2.3 + - pycparser==2.22 + - pydot==3.0.2 + - pygments==2.18.0 + - pyparsing==3.2.0 + - python-dateutil==2.9.0.post0 + - python-json-logger==2.0.7 + - python-slugify==8.0.4 + - pytz==2024.2 + - pyyaml==6.0.2 + - pyzmq==26.2.0 + - referencing==0.35.1 + - requests==2.32.3 + - rfc3339-validator==0.1.4 + - rfc3986-validator==0.1.1 + - rich==13.9.4 + - rpds-py==0.21.0 + - scikit-learn==1.5.2 + - scipy==1.14.1 + - seaborn==0.13.2 + - send2trash==1.8.3 + - six==1.16.0 + - sniffio==1.3.1 + - soupsieve==2.6 + - stack-data==0.6.3 + - tensorboard==2.18.0 + - tensorboard-data-server==0.7.2 + - tensorflow==2.18.0 + - tensorflow-probability==0.25.0 + - termcolor==2.5.0 + - terminado==0.18.1 + - text-unidecode==1.3 + - tf-keras==2.18.0 + - threadpoolctl==3.5.0 + - tinycss2==1.4.0 + - tornado==6.4.1 + - tqdm==4.67.0 + - traitlets==5.14.3 + - types-python-dateutil==2.9.0.20241003 + - typing-extensions==4.12.2 + - tzdata==2024.2 + - uri-template==1.3.0 + - urllib3==2.2.3 + - wcwidth==0.2.13 + - webcolors==24.11.1 + - webencodings==0.5.1 + - websocket-client==1.8.0 + - werkzeug==3.1.3 + - widgetsnbextension==4.0.13 + - wrapt==1.16.0 diff --git a/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb b/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb index 8982e00..0fd68c2 100644 --- a/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb +++ b/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb @@ -1,811 +1,838 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "znEK1MNRXqWY" - }, - "outputs": [], - "source": [ - "#@title Copyright 2019 The Lifetime Value Authors.\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either 
express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License.\n", - "# ============================================================================" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "3oNqWmn530N-" - }, - "source": [ - "# Churn Prediction for Kaggle Acquire Valued Customer Challenge" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "XKydJ9qF4KVm" - }, - "source": [ - "\u003ctable align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/classification.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KObdQwyXH2mC" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "import tqdm\n", - "from sklearn import metrics\n", - "from sklearn import model_selection\n", - "from sklearn import preprocessing\n", - "import tensorflow as tf\n", - "from tensorflow import keras\n", - "from tensorflow.keras import backend as K\n", - "import tensorflow_probability as tfp\n", - "from typing import Sequence\n", - "\n", - "# install and import ltv\n", - "!pip install -q git+https://github.com/google/lifetime_value\n", - "import lifetime_value as ltv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "K41RmAfNXtu_" - }, - "outputs": [], - "source": [ - "tfd = tfp.distributions\n", - "pd.options.mode.chained_assignment = None # default='warn'" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RzTaK6fFXMWT" - }, - "source": [ - "## Global variables" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VQVhF3fhNEr2" - }, - "outputs": [], - "source": [ - "COMPANY = '104900040' # @param { isTemplate: true, type: 'string'}\n", - "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['bce', 'ziln']\n", - "MODEL = 'linear' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n", - "LEARNING_RATE = 0.0002 # @param { isTemplate: true}\n", - "EPOCHS = 400 # @param {type: 'integer'}\n", - "OUTPUT_CSV_FOLDER = '/tmp/lifetime-value/result' # @param { isTemplate: true, type: 'string'}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "g7dg8TwYbxnl" - }, - "outputs": [], - "source": [ - "CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']\n", - "NUMERIC_FEATURES = ['log_calibration_value']\n", - "\n", - "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "I_nbvZjMuj_z" - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SFi0JMPu138h" - }, - "source": [ - "### Download data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "krXMbrkVNtdN" - }, - "source": [ - "Setup 
kaggle API correctly following https://www.kaggle.com/docs/api\n", - "```\n", - "%%shell\n", - "mkdir ~/.kaggle\n", - "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} \u003e ~/.kaggle/kaggle.json\n", - "pip install kaggle\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0gf4ipd-14x0" - }, - "outputs": [], - "source": [ - "%%shell\n", - "if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]\n", - "then\n", - " echo \"File already exists, no need to download.\"\n", - "else\n", - " rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " mkdir -p /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " cd /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " kaggle competitions download -c acquire-valued-shoppers-challenge\n", - " echo \"Unzip file. This may take 10 min.\"\n", - " gunzip transactions.csv.gz\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "V4zoAS25uj_7" - }, - "source": [ - "### Load transaction csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5tIMvE3dW1Ky" - }, - "outputs": [], - "source": [ - "def load_transaction_data(company):\n", - " all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'\n", - " one_company_data_filename = (\n", - " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'\n", - " .format(COMPANY))\n", - " if os.path.isfile(one_company_data_filename):\n", - " df = pd.read_csv(one_company_data_filename)\n", - " else:\n", - " data_list = []\n", - " chunksize = 10**6\n", - " # 350 iterations\n", - " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n", - " data_list.append(chunk.query(\"company=='{}'\".format(company)))\n", - " df = pd.concat(data_list, axis=0)\n", - " df.to_csv(one_company_data_filename, index=None)\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9ra4bfwCVwKn" - }, - "source": [ - "### Preprocess data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PlJl5g9Delmi" - }, - "outputs": [], - "source": [ - "def preprocess(df):\n", - " df = df.query('purchaseamount\u003e0')\n", - " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n", - " df['start_date'] = df.groupby('id')['date'].transform('min')\n", - "\n", - " # Compute calibration values\n", - " calibration_value = (\n", - " df.query('date==start_date').groupby('id')\n", - " ['purchaseamount'].sum().reset_index())\n", - " calibration_value.columns = ['id', 'calibration_value']\n", - "\n", - " # Compute holdout values\n", - " one_year_holdout_window_mask = (\n", - " (df['date'] \u003e df['start_date']) \u0026\n", - " (df['date'] \u003c= df['start_date'] + np.timedelta64(365, 'D')))\n", - " holdout_value = (\n", - " df[one_year_holdout_window_mask].groupby('id')\n", - " ['purchaseamount'].sum().reset_index())\n", - " holdout_value.columns = ['id', 'holdout_value']\n", - "\n", - " # Compute calibration attributes\n", - " calibration_attributes = (\n", - " df.query('date==start_date').sort_values(\n", - " 'purchaseamount', ascending=False).groupby('id')[[\n", - " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n", - " ]].first().reset_index())\n", - "\n", - " # Merge dataframes\n", - " customer_level_data = (\n", - " calibration_value.merge(calibration_attributes, how='left',\n", - " 
on='id').merge(\n", - " holdout_value, how='left', on='id'))\n", - " customer_level_data['holdout_value'] = (\n", - " customer_level_data['holdout_value'].fillna(0.))\n", - " customer_level_data[CATEGORICAL_FEATURES] = (\n", - " customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))\n", - "\n", - " # Specify data types\n", - " customer_level_data['log_calibration_value'] = (\n", - " np.log(customer_level_data['calibration_value']).astype('float32'))\n", - " customer_level_data['chain'] = (\n", - " customer_level_data['chain'].astype('category'))\n", - " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n", - " customer_level_data['brand'] = (\n", - " customer_level_data['brand'].astype('category'))\n", - " customer_level_data['category'] = (\n", - " customer_level_data['category'].astype('category'))\n", - " customer_level_data['label'] = (\n", - " customer_level_data['holdout_value'].astype('float32'))\n", - " return customer_level_data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fP3q6uuMoXhA" - }, - "source": [ - "### Load customer-level csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "X8B4zV1xoeMX" - }, - "outputs": [], - "source": [ - "def load_customer_level_csv(company):\n", - " customer_level_data_file = (\n", - " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv'\n", - " .format(company))\n", - " if os.path.isfile(customer_level_data_file):\n", - " customer_level_data = pd.read_csv(customer_level_data_file)\n", - " else:\n", - " customer_level_data = preprocess(load_transaction_data(company))\n", - " for cat_col in CATEGORICAL_FEATURES:\n", - " customer_level_data[cat_col] = (\n", - " customer_level_data[cat_col].astype('category'))\n", - " for num_col in [\n", - " 'log_calibration_value', 'calibration_value', 'holdout_value'\n", - " ]:\n", - " customer_level_data[num_col] = (\n", - " customer_level_data[num_col].astype('float32'))\n", - "\n", - " return customer_level_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DWwMxpIEukAE" - }, - "outputs": [], - "source": [ - "# Processes data. 350 iteration in total. May take 10min.\n", - "customer_level_data = load_customer_level_csv(COMPANY)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "09tqgvANtsil" - }, - "source": [ - "We observe a mixture of zero and lognormal distribution of holdout value." 
- ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BtF0z3VbmGev" - }, - "outputs": [], - "source": [ - "customer_level_data.label.apply(np.log1p).hist(bins=50)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "wLpgjEuofbdy" - }, - "outputs": [], - "source": [ - "customer_level_data.head().T" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "slIDJAaTcQeK" - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i4kN0uk4kZ68" - }, - "source": [ - "### Make train/eval" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "JjP5v6NiQfCX" - }, - "outputs": [], - "source": [ - "def linear_split(df):\n", - " # get_dummies preserves numeric features.\n", - " x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values\n", - " y = df['label'].values\n", - "\n", - " x_train, x_eval, y_train, y_eval = model_selection.train_test_split(\n", - " x, y, test_size=0.2, random_state=123)\n", - "\n", - " return x_train, x_eval, y_train, y_eval" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KtWXwORJjaP4" - }, - "outputs": [], - "source": [ - "def dnn_split(df):\n", - " for key in CATEGORICAL_FEATURES:\n", - " encoder = preprocessing.LabelEncoder()\n", - " df[key] = encoder.fit_transform(df[key])\n", - "\n", - " df_train, df_eval = model_selection.train_test_split(\n", - " df, test_size=0.2, random_state=123)\n", - "\n", - " def feature_dict(df):\n", - " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n", - " features['numeric'] = df[NUMERIC_FEATURES].values\n", - " return features\n", - "\n", - " x_train, y_train = feature_dict(df_train), df_train['label'].values\n", - " x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values\n", - "\n", - " return x_train, x_eval, y_train, y_eval" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lqbShWBzR4NE" - }, - "source": [ - "## Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Yh4Y4a89ooP3" - }, - "outputs": [], - "source": [ - "def linear_model(output_units):\n", - " return tf.keras.experimental.LinearModel(output_units)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "W8yo3HLtrAE_" - }, - "outputs": [], - "source": [ - "def embedding_dim(x):\n", - " return int(x**.25) + 1\n", - "\n", - "\n", - "def embedding_layer(vocab_size):\n", - " return tf.keras.Sequential([\n", - " tf.keras.layers.Embedding(\n", - " input_dim=vocab_size,\n", - " output_dim=embedding_dim(vocab_size),\n", - " input_length=1),\n", - " tf.keras.layers.Flatten(),\n", - " ])\n", - "\n", - "\n", - "def dnn_model(output_units, df):\n", - " numeric_input = tf.keras.layers.Input(\n", - " shape=(len(NUMERIC_FEATURES),), name='numeric')\n", - "\n", - " embedding_inputs = [\n", - " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n", - " for key in CATEGORICAL_FEATURES\n", - " ]\n", - "\n", - " embedding_outputs = [\n", - " embedding_layer(vocab_size=df[key].nunique())(input)\n", - " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n", - " ]\n", - "\n", - " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n", - " deep_model = tf.keras.Sequential([\n", - " tf.keras.layers.Dense(64, activation='relu'),\n", - " tf.keras.layers.Dense(32, activation='relu'),\n", - " tf.keras.layers.Dense(output_units),\n", 
- " ])\n", - " return tf.keras.Model(\n", - " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U8l-KzZ12fbK" - }, - "source": [ - "### Train" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "45BHY6q7rQmI" - }, - "outputs": [], - "source": [ - "if LOSS == 'bce':\n", - " loss = keras.losses.BinaryCrossentropy(from_logits=True)\n", - " output_units = 1\n", - "\n", - "if LOSS == 'ziln':\n", - " loss = ltv.zero_inflated_lognormal_loss\n", - " output_units = 3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "7Jeou8bGrhll" - }, - "outputs": [], - "source": [ - "if MODEL == 'linear':\n", - " x_train, x_eval, y_train, y_eval = linear_split(customer_level_data)\n", - " model = linear_model(output_units)\n", - "\n", - "if MODEL == 'dnn':\n", - " x_train, x_eval, y_train, y_eval = dnn_split(customer_level_data)\n", - " model = dnn_model(output_units, customer_level_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "uF2IdTpAwiZV" - }, - "outputs": [], - "source": [ - "if LOSS == 'bce':\n", - " y_train = (y_train \u003e 0).astype('float32')\n", - " y_eval = (y_eval \u003e 0).astype('float32')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_GQ-RlIAfT62" - }, - "outputs": [], - "source": [ - "model.compile(loss=loss, optimizer=keras.optimizers.Adam(lr=LEARNING_RATE))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "chEIOzq6rlJx" - }, - "outputs": [], - "source": [ - "callbacks = [\n", - " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n", - " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-BjnHV7MWhK1" - }, - "outputs": [], - "source": [ - "history = model.fit(\n", - " x=x_train,\n", - " y=y_train,\n", - " batch_size=1024,\n", - " epochs=EPOCHS,\n", - " verbose=2,\n", - " callbacks=callbacks,\n", - " validation_data=(x_eval, y_eval)).history" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mAJGs5SebDeN" - }, - "outputs": [], - "source": [ - "pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bHaiutmy2aYm" - }, - "source": [ - "### Eval" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l6E_5gYAYQMw" - }, - "outputs": [], - "source": [ - "logits = model.predict(x=x_eval, batch_size=1024)\n", - "y_pred = K.sigmoid(logits[..., :1]).numpy().flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ICTDpg4rxdlj" - }, - "outputs": [], - "source": [ - "y_true = (y_eval \u003e 0).astype('float32')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "POeY1gdKTwfx" - }, - "outputs": [], - "source": [ - "def classification_report(y_true: Sequence[int],\n", - " y_pred: Sequence[float]) -\u003e pd.DataFrame:\n", - " \"\"\"Report individual level classification metrics.\n", - "\n", - " Arguments:\n", - " y_true: true binary labels.\n", - " y_pred: predicted binary labels.\n", - "\n", - " Returns:\n", - " out: dataframe with classification metrics as columns.\n", - " \"\"\"\n", - " out = pd.DataFrame(index=[0])\n", - "\n", - " out['AUC'] = metrics.roc_auc_score(y_true, 
y_pred)\n", - " out['PR_AUC'] = metrics.average_precision_score(y_true, y_pred)\n", - " out['precision'] = metrics.precision_score(y_true, 1 * (y_pred \u003e .5))\n", - " out['recall'] = metrics.recall_score(y_true, 1 * (y_pred \u003e .5))\n", - " out['f1'] = metrics.f1_score(y_true, 1 * (y_pred \u003e .5))\n", - " return out" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "vGcWU2vFaeT1" - }, - "outputs": [], - "source": [ - "classification = classification_report(y_true, y_pred)\n", - "classification" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-i_AbqhXcurk" - }, - "source": [ - "### All metrics together" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Umqg1-0Bc1HS" - }, - "outputs": [], - "source": [ - "df_metrics = pd.DataFrame(\n", - " {\n", - " 'company': COMPANY,\n", - " 'model': MODEL,\n", - " 'loss': LOSS,\n", - " 'label_mean': y_true.mean(),\n", - " 'pred_mean': y_pred.mean(),\n", - " 'AUC': classification.loc[0, 'AUC'],\n", - " 'PR_AUC': classification.loc[0, 'PR_AUC'],\n", - " 'precision': classification.loc[0, 'precision'],\n", - " 'recall': classification.loc[0, 'recall'],\n", - " 'f1': classification.loc[0, 'f1']\n", - " },\n", - " index=[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1LV1Hs3xcxnd" - }, - "outputs": [], - "source": [ - "df_metrics[[\n", - " 'company',\n", - " 'model',\n", - " 'loss',\n", - " 'label_mean',\n", - " 'pred_mean',\n", - " 'AUC',\n", - " 'PR_AUC',\n", - " 'precision',\n", - " 'recall',\n", - " 'f1',\n", - "]]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UVy6lYn4mSrj" - }, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mtkQ4mqUEFsb" - }, - "outputs": [], - "source": [ - "output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3qmLzJqOEFsm" - }, - "outputs": [], - "source": [ - "if not os.path.isdir(output_path):\n", - " os.makedirs(output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "61B5Zc_UEFsr" - }, - "outputs": [], - "source": [ - "output_file = os.path.join(output_path,\n", - " '{}_classification_{}.csv'.format(MODEL, LOSS))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gqglbXfwEFsv" - }, - "outputs": [], - "source": [ - "df_metrics.to_csv(output_file, index=False)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "last_runtime": { - "build_target": "", - "kind": "local" - }, - "name": "classification.ipynb", - "provenance": [], - "toc_visible": true - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "znEK1MNRXqWY" + }, + "outputs": [], + "source": [ + "#@title Copyright 2019 The Lifetime Value Authors.\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ============================================================================" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "3oNqWmn530N-" + }, + "source": [ + "# Churn Prediction for Kaggle Acquire Valued Customer Challenge" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "XKydJ9qF4KVm" + }, + "source": [ + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KObdQwyXH2mC" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "import tqdm\n", + "from sklearn import metrics\n", + "from sklearn import model_selection\n", + "from sklearn import preprocessing\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import backend as K\n", + "import tensorflow_probability as tfp\n", + "from typing import Sequence\n", + "\n", + "# install and import ltv\n", + "# !pip install -q git+https://github.com/google/lifetime_value\n", + "!pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value\n", + "import lifetime_value as ltv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K41RmAfNXtu_" + }, + "outputs": [], + "source": [ + "tfd = tfp.distributions\n", + "pd.options.mode.chained_assignment = None # default='warn'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RzTaK6fFXMWT" + }, + "source": [ + "## Global variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VQVhF3fhNEr2" + }, + "outputs": [], + "source": [ + "COMPANY = '104900040' # @param { isTemplate: true, type: 'string'}\n", + "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['bce', 'ziln']\n", + "# LOSS = 'bce' # @param { isTemplate: true, type: 'string'} ['bce', 'ziln']\n", + "# MODEL = 'linear' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n", + "MODEL = 'dnn' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n", + "LEARNING_RATE = 0.0002 # @param { isTemplate: true}\n", + "EPOCHS = 400 # @param {type: 'integer'}\n", + "DATA_FOLDER = './tmp/acquire-valued-shoppers-challenge' # @param { isTemplate: true, type: 'string'}\n", + "OUTPUT_CSV_FOLDER = f'{DATA_FOLDER}/result' # @param { isTemplate: true, type: 'string'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "g7dg8TwYbxnl" + }, + "outputs": [], + "source": [ + "CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']\n", + "NUMERIC_FEATURES = ['log_calibration_value']\n", + "\n", + "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "I_nbvZjMuj_z" + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SFi0JMPu138h" + }, + "source": [ + "### Download data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "krXMbrkVNtdN" + }, + "source": [ + "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n", + "```\n", + "%%shell\n", + "mkdir ~/.kaggle\n", + "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} > ~/.kaggle/kaggle.json\n", + "pip install kaggle\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set it DATA_FOLDER as an environment variable\n", + "%env DATA_FOLDER=$DATA_FOLDER" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0gf4ipd-14x0" + }, + "outputs": [], + "source": [ + "%%bash\n", + "if [ -e $DATA_FOLDER/transactions.csv ]\n", + "then\n", + " echo \"File already exists, no need to download.\"\n", + "else\n", + " rm -rf $DATA_FOLDER\n", + " mkdir -p $DATA_FOLDER\n", + " cd $DATA_FOLDER\n", + " kaggle competitions download -c 
acquire-valued-shoppers-challenge\n", + " echo \"Unzip file. This may take 10 min.\"\n", + " unzip acquire-valued-shoppers-challenge.zip transactions.csv.gz\n", + " gunzip transactions.csv.gz\n", + "fi" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "V4zoAS25uj_7" + }, + "source": [ + "### Load transaction csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5tIMvE3dW1Ky" + }, + "outputs": [], + "source": [ + "def load_transaction_data(company):\n", + " all_data_filename = f'{DATA_FOLDER}/transactions.csv'\n", + " one_company_data_filename = f'{DATA_FOLDER}/transactions_company_{company}.csv'\n", + " if os.path.isfile(one_company_data_filename):\n", + " df = pd.read_csv(one_company_data_filename)\n", + " else:\n", + " data_list = []\n", + " chunksize = 10**6\n", + " # 350 iterations\n", + " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n", + " data_list.append(chunk.query(\"company=={}\".format(company)))\n", + " df = pd.concat(data_list, axis=0)\n", + " df.to_csv(one_company_data_filename, index=None)\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ra4bfwCVwKn" + }, + "source": [ + "### Preprocess data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PlJl5g9Delmi" + }, + "outputs": [], + "source": [ + "def preprocess(df):\n", + " df = df.query('purchaseamount>0')\n", + " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n", + " df['start_date'] = df.groupby('id')['date'].transform('min')\n", + "\n", + " # Compute calibration values\n", + " calibration_value = (\n", + " df.query('date==start_date').groupby('id')\n", + " ['purchaseamount'].sum().reset_index())\n", + " calibration_value.columns = ['id', 'calibration_value']\n", + "\n", + " # Compute holdout values\n", + " one_year_holdout_window_mask = (\n", + " (df['date'] > df['start_date']) &\n", + " (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))\n", + " holdout_value = (\n", + " df[one_year_holdout_window_mask].groupby('id')\n", + " ['purchaseamount'].sum().reset_index())\n", + " holdout_value.columns = ['id', 'holdout_value']\n", + "\n", + " # Compute calibration attributes\n", + " calibration_attributes = (\n", + " df.query('date==start_date').sort_values(\n", + " 'purchaseamount', ascending=False).groupby('id')[[\n", + " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n", + " ]].first().reset_index())\n", + "\n", + " # Merge dataframes\n", + " customer_level_data = (\n", + " calibration_value.merge(calibration_attributes, how='left',\n", + " on='id').merge(\n", + " holdout_value, how='left', on='id'))\n", + " customer_level_data['holdout_value'] = (\n", + " customer_level_data['holdout_value'].fillna(0.))\n", + " customer_level_data[CATEGORICAL_FEATURES] = (\n", + " customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))\n", + "\n", + " # Specify data types\n", + " customer_level_data['log_calibration_value'] = (\n", + " np.log(customer_level_data['calibration_value']).astype('float32'))\n", + " customer_level_data['chain'] = (\n", + " customer_level_data['chain'].astype('category'))\n", + " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n", + " customer_level_data['brand'] = (\n", + " customer_level_data['brand'].astype('category'))\n", + " customer_level_data['category'] = (\n", + " customer_level_data['category'].astype('category'))\n", + " customer_level_data['label'] = (\n", + " 
customer_level_data['holdout_value'].astype('float32'))\n", + " return customer_level_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fP3q6uuMoXhA" + }, + "source": [ + "### Load customer-level csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X8B4zV1xoeMX" + }, + "outputs": [], + "source": [ + "def load_customer_level_csv(company):\n", + " customer_level_data_file = f'{DATA_FOLDER}/customer_level_data_company_{company}.csv'\n", + " if os.path.isfile(customer_level_data_file):\n", + " customer_level_data = pd.read_csv(customer_level_data_file)\n", + " else:\n", + " customer_level_data = preprocess(load_transaction_data(company))\n", + " for cat_col in CATEGORICAL_FEATURES:\n", + " customer_level_data[cat_col] = (\n", + " customer_level_data[cat_col].astype('category'))\n", + " for num_col in [\n", + " 'log_calibration_value', 'calibration_value', 'holdout_value'\n", + " ]:\n", + " customer_level_data[num_col] = (\n", + " customer_level_data[num_col].astype('float32'))\n", + "\n", + " return customer_level_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DWwMxpIEukAE" + }, + "outputs": [], + "source": [ + "# Processes data. 350 iteration in total. May take 10min.\n", + "customer_level_data = load_customer_level_csv(COMPANY)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "09tqgvANtsil" + }, + "source": [ + "We observe a mixture of zero and lognormal distribution of holdout value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BtF0z3VbmGev" + }, + "outputs": [], + "source": [ + "customer_level_data.label.apply(np.log1p).hist(bins=50)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "wLpgjEuofbdy" + }, + "outputs": [], + "source": [ + "customer_level_data.head().T" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "slIDJAaTcQeK" + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i4kN0uk4kZ68" + }, + "source": [ + "### Make train/eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JjP5v6NiQfCX" + }, + "outputs": [], + "source": [ + "def linear_split(df):\n", + " # get_dummies preserves numeric features.\n", + " x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values\n", + " y = df['label'].values\n", + "\n", + " x_train, x_eval, y_train, y_eval = model_selection.train_test_split(\n", + " x, y, test_size=0.2, random_state=123)\n", + "\n", + " return x_train, x_eval, y_train, y_eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KtWXwORJjaP4" + }, + "outputs": [], + "source": [ + "def dnn_split(df):\n", + " for key in CATEGORICAL_FEATURES:\n", + " encoder = preprocessing.LabelEncoder()\n", + " df[key] = encoder.fit_transform(df[key])\n", + "\n", + " df_train, df_eval = model_selection.train_test_split(\n", + " df, test_size=0.2, random_state=123)\n", + "\n", + " def feature_dict(df):\n", + " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n", + " features['numeric'] = df[NUMERIC_FEATURES].values\n", + " return features\n", + "\n", + " x_train, y_train = feature_dict(df_train), df_train['label'].values\n", + " x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values\n", + "\n", + " return x_train, x_eval, y_train, y_eval" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": 
"lqbShWBzR4NE" + }, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Yh4Y4a89ooP3" + }, + "outputs": [], + "source": [ + "def linear_model(output_units, input_dim):\n", + " return tf.keras.Sequential([\n", + " tf.keras.layers.Input(shape=(input_dim,)),\n", + " tf.keras.layers.Dense(output_units, activation=None)\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "W8yo3HLtrAE_" + }, + "outputs": [], + "source": [ + "def embedding_dim(x):\n", + " return int(x**.25) + 1\n", + "\n", + "\n", + "def embedding_layer(vocab_size):\n", + " return tf.keras.Sequential([\n", + " tf.keras.layers.Embedding(\n", + " input_dim=vocab_size,\n", + " output_dim=embedding_dim(vocab_size)\n", + " ),\n", + " tf.keras.layers.Flatten(),\n", + " ])\n", + "\n", + "\n", + "def dnn_model(output_units, df):\n", + " numeric_input = tf.keras.layers.Input(\n", + " shape=(len(NUMERIC_FEATURES),), name='numeric')\n", + "\n", + " embedding_inputs = [\n", + " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n", + " for key in CATEGORICAL_FEATURES\n", + " ]\n", + "\n", + " embedding_outputs = [\n", + " embedding_layer(vocab_size=df[key].nunique())(input)\n", + " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n", + " ]\n", + "\n", + " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n", + " deep_model = tf.keras.Sequential([\n", + " tf.keras.layers.Dense(64, activation='relu'),\n", + " tf.keras.layers.Dense(32, activation='relu'),\n", + " tf.keras.layers.Dense(output_units),\n", + " ])\n", + " return tf.keras.Model(\n", + " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U8l-KzZ12fbK" + }, + "source": [ + "### Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "45BHY6q7rQmI" + }, + "outputs": [], + "source": [ + "if LOSS == 'bce':\n", + " loss = keras.losses.BinaryCrossentropy(from_logits=True)\n", + " output_units = 1\n", + "\n", + "if LOSS == 'ziln':\n", + " loss = ltv.zero_inflated_lognormal_loss\n", + " output_units = 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "7Jeou8bGrhll" + }, + "outputs": [], + "source": [ + "if MODEL == 'linear':\n", + " x_train, x_eval, y_train, y_eval = linear_split(customer_level_data)\n", + " model = linear_model(output_units, x_train.shape[1])\n", + "\n", + "if MODEL == 'dnn':\n", + " x_train, x_eval, y_train, y_eval = dnn_split(customer_level_data)\n", + " model = dnn_model(output_units, customer_level_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uF2IdTpAwiZV" + }, + "outputs": [], + "source": [ + "if LOSS == 'bce':\n", + " y_train = (y_train > 0).astype('float32')\n", + " y_eval = (y_eval > 0).astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_GQ-RlIAfT62" + }, + "outputs": [], + "source": [ + "model.compile(loss=loss, optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "chEIOzq6rlJx" + }, + "outputs": [], + "source": [ + "callbacks = [\n", + " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n", + " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n", + "]" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": { + "id": "-BjnHV7MWhK1" + }, + "outputs": [], + "source": [ + "history = model.fit(\n", + " x=x_train,\n", + " y=y_train[:, np.newaxis],\n", + " batch_size=1024,\n", + " epochs=EPOCHS,\n", + " verbose=2,\n", + " callbacks=callbacks,\n", + " validation_data=(x_eval, y_eval[:, np.newaxis])).history" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mAJGs5SebDeN" + }, + "outputs": [], + "source": [ + "pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bHaiutmy2aYm" + }, + "source": [ + "### Eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "l6E_5gYAYQMw" + }, + "outputs": [], + "source": [ + "logits = model.predict(x=x_eval, batch_size=1024)\n", + "y_pred = K.sigmoid(logits[..., :1]).numpy().flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ICTDpg4rxdlj" + }, + "outputs": [], + "source": [ + "y_true = (y_eval > 0).astype('float32')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "POeY1gdKTwfx" + }, + "outputs": [], + "source": [ + "def classification_report(y_true: Sequence[int],\n", + " y_pred: Sequence[float]) -> pd.DataFrame:\n", + " \"\"\"Report individual level classification metrics.\n", + "\n", + " Arguments:\n", + " y_true: true binary labels.\n", + " y_pred: predicted binary labels.\n", + "\n", + " Returns:\n", + " out: dataframe with classification metrics as columns.\n", + " \"\"\"\n", + " out = pd.DataFrame(index=[0])\n", + "\n", + " out['AUC'] = metrics.roc_auc_score(y_true, y_pred)\n", + " out['PR_AUC'] = metrics.average_precision_score(y_true, y_pred)\n", + " out['precision'] = metrics.precision_score(y_true, 1 * (y_pred > .5))\n", + " out['recall'] = metrics.recall_score(y_true, 1 * (y_pred > .5))\n", + " out['f1'] = metrics.f1_score(y_true, 1 * (y_pred > .5))\n", + " return out" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vGcWU2vFaeT1" + }, + "outputs": [], + "source": [ + "classification = classification_report(y_true, y_pred)\n", + "classification" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-i_AbqhXcurk" + }, + "source": [ + "### All metrics together" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Umqg1-0Bc1HS" + }, + "outputs": [], + "source": [ + "df_metrics = pd.DataFrame(\n", + " {\n", + " 'company': COMPANY,\n", + " 'model': MODEL,\n", + " 'loss': LOSS,\n", + " 'label_mean': y_true.mean(),\n", + " 'pred_mean': y_pred.mean(),\n", + " 'AUC': classification.loc[0, 'AUC'],\n", + " 'PR_AUC': classification.loc[0, 'PR_AUC'],\n", + " 'precision': classification.loc[0, 'precision'],\n", + " 'recall': classification.loc[0, 'recall'],\n", + " 'f1': classification.loc[0, 'f1']\n", + " },\n", + " index=[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1LV1Hs3xcxnd" + }, + "outputs": [], + "source": [ + "df_metrics[[\n", + " 'company',\n", + " 'model',\n", + " 'loss',\n", + " 'label_mean',\n", + " 'pred_mean',\n", + " 'AUC',\n", + " 'PR_AUC',\n", + " 'precision',\n", + " 'recall',\n", + " 'f1',\n", + "]]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "UVy6lYn4mSrj" + }, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mtkQ4mqUEFsb" + }, + "outputs": [], + "source": [ + 
"output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3qmLzJqOEFsm" + }, + "outputs": [], + "source": [ + "if not os.path.isdir(output_path):\n", + " os.makedirs(output_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "61B5Zc_UEFsr" + }, + "outputs": [], + "source": [ + "output_file = os.path.join(output_path,\n", + " '{}_classification_{}.csv'.format(MODEL, LOSS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gqglbXfwEFsv" + }, + "outputs": [], + "source": [ + "df_metrics.to_csv(output_file, index=False)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "last_runtime": { + "build_target": "", + "kind": "local" + }, + "name": "classification.ipynb", + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb b/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb index 8e489ad..8af0d3a 100644 --- a/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb +++ b/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb @@ -1,319 +1,500 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "s2VGz60wbOiq" - }, - "outputs": [], - "source": [ - "#@title Copyright 2019 The Lifetime Value Authors.\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License.\n", - "# ============================================================================" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "eBoqlan65Q9T" - }, - "source": [ - "\u003ctable align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/preprocess_data.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KObdQwyXH2mC" - }, 
- "outputs": [], - "source": [ - "from __future__ import absolute_import\n", - "from __future__ import division\n", - "from __future__ import print_function\n", - "\n", - "\n", - "import os\n", - "import numpy as np\n", - "import pandas as pd\n", - "import tqdm\n", - "import multiprocessing" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "K41RmAfNXtu_" - }, - "outputs": [], - "source": [ - "pd.options.mode.chained_assignment = None # default='warn'" - ] - }, + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "id": "s2VGz60wbOiq" + }, + "outputs": [], + "source": [ + "#@title Copyright 2019 The Lifetime Value Authors.\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ============================================================================" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "eBoqlan65Q9T" + }, + "source": [ + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "id": "KObdQwyXH2mC" + }, + "outputs": [], + "source": [ + "from __future__ import absolute_import\n", + "from __future__ import division\n", + "from __future__ import print_function\n", + "\n", + "\n", + "import os\n", + "import numpy as np\n", + "import pandas as pd\n", + "import tqdm\n", + "import multiprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "id": "K41RmAfNXtu_" + }, + "outputs": [], + "source": [ + "pd.options.mode.chained_assignment = None # default='warn'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DoN-PRvNuIti" + }, + "source": [ + "## Global variables" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "id": "3GGpDbxd3S5L" + }, + "outputs": [], + "source": [ + "COMPANYS = [\n", + " '10000', '101200010', '101410010', '101600010', '102100020', '102700020',\n", + " '102840020', '103000030', '103338333', '103400030', '103600030',\n", + " '103700030', '103800030', '104300040', '104400040', '104470040',\n", + " '104900040', '105100050', '105150050', '107800070'\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "DATA_FOLDER = './tmp/acquire-valued-shoppers-challenge' # @param { isTemplate: true, type: 'string'}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RzTaK6fFXMWT" + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SFi0JMPu138h" + }, + "source": [ + "### Download data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "krXMbrkVNtdN" + }, + "source": [ + "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n", + "```\n", + "%%shell\n", + "mkdir ~/.kaggle\n", + "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} > ~/.kaggle/kaggle.json\n", + "pip install kaggle\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "DoN-PRvNuIti" - }, - "source": [ - "## Global variables" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "env: DATA_FOLDER=./tmp/acquire-valued-shoppers-challenge\n" + ] + } + ], + "source": [ + "# Set it DATA_FOLDER as an environment variable\n", + "%env DATA_FOLDER=$DATA_FOLDER" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "id": "0gf4ipd-14x0" + }, + "outputs": [ { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3GGpDbxd3S5L" - }, - "outputs": [], - "source": [ - "COMPANYS = [\n", - " '10000', '101200010', '101410010', '101600010', '102100020', '102700020',\n", - " '102840020', '103000030', '103338333', '103400030', '103600030',\n", - " '103700030', '103800030', '104300040', '104400040', '104470040',\n", - " '104900040', '105100050', '105150050', '107800070'\n", - "]" - ] - }, + "name": "stdout", + "output_type": "stream", + "text": [ + "File already exists, no need to download.\n" + ] + } + ], + "source": [ + "%%bash\n", + "if [ -e $DATA_FOLDER/transactions.csv ]\n", + "then\n", + " echo \"File already exists, no need to download.\"\n", + "else\n", + " rm -rf $DATA_FOLDER\n", + " mkdir -p $DATA_FOLDER\n", + " cd $DATA_FOLDER\n", + " kaggle competitions download -c acquire-valued-shoppers-challenge\n", + " echo \"Unzip file. 
This may take 10 min.\"\n", + " unzip acquire-valued-shoppers-challenge.zip transactions.csv.gz\n", + " gunzip transactions.csv.gz\n", + "fi" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IT53azGsa2a2" + }, + "source": [ + "### Load csv" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "id": "5tIMvE3dW1Ky" + }, + "outputs": [], + "source": [ + "def load_data(company):\n", + " all_data_filename = f'{DATA_FOLDER}/transactions.csv'\n", + " one_company_data_filename = f'{DATA_FOLDER}/transactions_company_{company}.csv'\n", + " if os.path.isfile(one_company_data_filename):\n", + " df = pd.read_csv(one_company_data_filename)\n", + " else:\n", + " data_list = []\n", + " chunksize = 10**6\n", + " # 350 iterations\n", + " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n", + " data_list.append(chunk.query(\"company=={}\".format(company)))\n", + " df = pd.concat(data_list, axis=0)\n", + " df.to_csv(one_company_data_filename, index=None)\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ra4bfwCVwKn" + }, + "source": [ + "### Preprocess data" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "id": "PlJl5g9Delmi" + }, + "outputs": [], + "source": [ + "def preprocess(df):\n", + " df = df.query('purchaseamount>0')\n", + " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n", + " df['start_date'] = df.groupby('id')['date'].transform('min')\n", + "\n", + " # Compute calibration values\n", + " calibration_value = (\n", + " df.query('date==start_date').groupby('id')\n", + " ['purchaseamount'].sum().reset_index())\n", + " calibration_value.columns = ['id', 'calibration_value']\n", + "\n", + " # Compute holdout values\n", + " one_year_holdout_window_mask = (\n", + " (df['date'] > df['start_date']) &\n", + " (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))\n", + " holdout_value = (\n", + " df[one_year_holdout_window_mask].groupby('id')\n", + " ['purchaseamount'].sum().reset_index())\n", + " holdout_value.columns = ['id', 'holdout_value']\n", + "\n", + " # Compute calibration attributes\n", + " calibration_attributes = (\n", + " df.query('date==start_date').sort_values(\n", + " 'purchaseamount', ascending=False).groupby('id')[[\n", + " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n", + " ]].first().reset_index())\n", + "\n", + " # Merge dataframes\n", + " customer_level_data = (\n", + " calibration_value.merge(calibration_attributes, how='left',\n", + " on='id').merge(\n", + " holdout_value, how='left', on='id'))\n", + " customer_level_data['holdout_value'] = (\n", + " customer_level_data['holdout_value'].fillna(0.))\n", + " categorical_features = ([\n", + " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n", + " ])\n", + " customer_level_data[categorical_features] = (\n", + " customer_level_data[categorical_features].fillna('UNKNOWN'))\n", + "\n", + " # Specify data types\n", + " customer_level_data['log_calibration_value'] = (\n", + " np.log(customer_level_data['calibration_value']).astype('float32'))\n", + " customer_level_data['chain'] = (\n", + " customer_level_data['chain'].astype('category'))\n", + " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n", + " customer_level_data['brand'] = (\n", + " customer_level_data['brand'].astype('category'))\n", + " customer_level_data['category'] = (\n", + " customer_level_data['category'].astype('category'))\n", + " customer_level_data['label'] = (\n", + " 
customer_level_data['holdout_value'].astype('float32'))\n", + " return customer_level_data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "id": "Bx80J6Ztferj" + }, + "outputs": [], + "source": [ + "def process(company):\n", + " print(\"Process company {}\".format(company))\n", + " transaction_level_data = load_data(company)\n", + " customer_level_data = preprocess(transaction_level_data)\n", + " customer_level_data_file = f\"{DATA_FOLDER}/customer_level_data_company_{company}.csv\"\n", + " customer_level_data.to_csv(customer_level_data_file, index=None)\n", + " print(\"Done company {}\".format(company))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Q05sKVnxi8mV" + }, + "source": [ + "This step may take a while to finish -- 10min-1hr depending on number of core in\n", + "the computer." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "id": "88dVPdt5QWpu" + }, + "outputs": [ { - "cell_type": "markdown", - "metadata": { - "id": "RzTaK6fFXMWT" - }, - "source": [ - "## Data" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Process company 102100020Process company 101200010Process company 102840020Process company 102700020Process company 10000Process company 103338333Process company 101410010Process company 101600010Process company 103700030Process company 103600030Process company 104300040Process company 103800030\n", + "Process company 103000030Process company 104400040Process company 104470040\n", + "\n", + "Process company 103400030\n", + "Process company 107800070\n", + "Process company 105150050\n", + "\n", + "\n", + "Process company 105100050\n", + "\n", + "\n", + "Process company 104900040\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "SFi0JMPu138h" - }, - "source": [ - "### Download data" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "350it [05:29, 1.06it/s]\n", + "350it [05:31, 1.06it/s]\n", + "350it [05:31, 1.06it/s]\n", + "350it [05:33, 1.05it/s]\n", + "350it [05:36, 1.04it/s]\n", + "350it [05:36, 1.04it/s]\n", + "350it [05:37, 1.04it/s]\n", + "350it [05:39, 1.03it/s]\n", + "350it [05:39, 1.03it/s]\n", + "350it [05:41, 1.03it/s]\n", + "350it [05:41, 1.02it/s]\n", + "350it [05:41, 1.02it/s]\n", + "350it [05:42, 1.02it/s]\n", + "345it [05:43, 1.12it/s]" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "krXMbrkVNtdN" - }, - "source": [ - "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n", - "```\n", - "%%shell\n", - "mkdir ~/.kaggle\n", - "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} \u003e ~/.kaggle/kaggle.json\n", - "pip install kaggle\n", - "```" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Done company 103600030\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0gf4ipd-14x0" - }, - "outputs": [], - "source": [ - "%%shell\n", - "if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]\n", - "then\n", - " echo \"File already exists, no need to download.\"\n", - "else\n", - " rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " mkdir -p /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " cd /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " kaggle competitions download -c acquire-valued-shoppers-challenge\n", - " echo \"Unzip file. 
This may take 10 min.\"\n", - " gunzip transactions.csv.gz\n", - "fi" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "350it [05:44, 1.01it/s]\n", + "345it [05:44, 1.21it/s]" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "IT53azGsa2a2" - }, - "source": [ - "### Load csv" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Done company 103000030\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5tIMvE3dW1Ky" - }, - "outputs": [], - "source": [ - "def load_data(company):\n", - " all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'\n", - " one_company_data_filename = (\n", - " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'\n", - " .format(company))\n", - " if os.path.isfile(one_company_data_filename):\n", - " df = pd.read_csv(one_company_data_filename)\n", - " else:\n", - " data_list = []\n", - " chunksize = 10**6\n", - " # 350 iterations\n", - " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n", - " data_list.append(chunk.query(\"company=='{}'\".format(company)))\n", - " df = pd.concat(data_list, axis=0)\n", - " df.to_csv(one_company_data_filename, index=None)\n", - " return df" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "350it [05:45, 1.01it/s]\n", + "350it [05:46, 1.01it/s]\n", + "350it [05:47, 1.01it/s]\n", + "350it [05:47, 1.01it/s]\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "9ra4bfwCVwKn" - }, - "source": [ - "### Preprocess data" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Done company 104470040\n", + "Done company 105150050\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PlJl5g9Delmi" - }, - "outputs": [], - "source": [ - "def preprocess(df):\n", - " df = df.query('purchaseamount\u003e0')\n", - " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n", - " df['start_date'] = df.groupby('id')['date'].transform('min')\n", - "\n", - " # Compute calibration values\n", - " calibration_value = (\n", - " df.query('date==start_date').groupby('id')\n", - " ['purchaseamount'].sum().reset_index())\n", - " calibration_value.columns = ['id', 'calibration_value']\n", - "\n", - " # Compute holdout values\n", - " one_year_holdout_window_mask = (\n", - " (df['date'] \u003e df['start_date']) \u0026\n", - " (df['date'] \u003c= df['start_date'] + np.timedelta64(365, 'D')))\n", - " holdout_value = (\n", - " df[one_year_holdout_window_mask].groupby('id')\n", - " ['purchaseamount'].sum().reset_index())\n", - " holdout_value.columns = ['id', 'holdout_value']\n", - "\n", - " # Compute calibration attributes\n", - " calibration_attributes = (\n", - " df.query('date==start_date').sort_values(\n", - " 'purchaseamount', ascending=False).groupby('id')[[\n", - " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n", - " ]].first().reset_index())\n", - "\n", - " # Merge dataframes\n", - " customer_level_data = (\n", - " calibration_value.merge(calibration_attributes, how='left',\n", - " on='id').merge(\n", - " holdout_value, how='left', on='id'))\n", - " customer_level_data['holdout_value'] = (\n", - " customer_level_data['holdout_value'].fillna(0.))\n", - " categorical_features = ([\n", - " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n", - " ])\n", - " customer_level_data[categorical_features] = (\n", - " customer_level_data[categorical_features].fillna('UNKNOWN'))\n", - "\n", - " # Specify data types\n", - 
" customer_level_data['log_calibration_value'] = (\n", - " np.log(customer_level_data['calibration_value']).astype('float32'))\n", - " customer_level_data['chain'] = (\n", - " customer_level_data['chain'].astype('category'))\n", - " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n", - " customer_level_data['brand'] = (\n", - " customer_level_data['brand'].astype('category'))\n", - " customer_level_data['category'] = (\n", - " customer_level_data['category'].astype('category'))\n", - " customer_level_data['label'] = (\n", - " customer_level_data['holdout_value'].astype('float32'))\n", - " return customer_level_data" - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "350it [05:48, 1.00it/s]\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Bx80J6Ztferj" - }, - "outputs": [], - "source": [ - "def process(company):\n", - " print(\"Process company {}\".format(company))\n", - " transaction_level_data = load_data(company)\n", - " customer_level_data = preprocess(transaction_level_data)\n", - " customer_level_data_file = (\n", - " \"/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv\"\n", - " .format(company))\n", - " customer_level_data.to_csv(customer_level_data_file, index=None)\n", - " print(\"Done company {}\".format(company))" - ] + "name": "stdout", + "output_type": "stream", + "text": [ + "Done company 107800070\n" + ] }, { - "cell_type": "markdown", - "metadata": { - "id": "Q05sKVnxi8mV" - }, - "source": [ - "This step may take a while to finish -- 10min-1hr depending on number of core in\n", - "the computer." - ] + "name": "stderr", + "output_type": "stream", + "text": [ + "350it [05:51, 1.00s/it]\n" + ] }, { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "88dVPdt5QWpu" - }, - "outputs": [], - "source": [ - "p = multiprocessing.Pool(multiprocessing.cpu_count())\n", - "_ = p.map(process, COMPANYS)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "last_runtime": { - "build_target": "", - "kind": "local" - }, - "name": "preprocess_data.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 2", - "name": "python2" + "name": "stdout", + "output_type": "stream", + "text": [ + "Done company 102700020\n", + "Done company 101200010\n", + "Done company 101410010\n", + "Done company 104900040\n", + "Done company 104300040\n", + "Done company 105100050\n", + "Done company 103400030\n", + "Done company 103800030\n", + "Done company 101600010\n", + "Done company 102100020\n", + "Done company 104400040\n", + "Done company 103338333\n", + "Done company 102840020\n", + "Done company 103700030\n", + "Done company 10000\n" + ] } + ], + "source": [ + "p = multiprocessing.Pool(multiprocessing.cpu_count())\n", + "_ = p.map(process, COMPANYS)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "last_runtime": { + "build_target": "", + "kind": "local" + }, + "name": "preprocess_data.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" }, - "nbformat": 4, - "nbformat_minor": 0 + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + 
"version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb b/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb index c4d00d5..3441e29 100644 --- a/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb +++ b/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb @@ -1,899 +1,935 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5RoRxBv3bRjy" - }, - "outputs": [], - "source": [ - "#@title Copyright 2019 The Lifetime Value Authors.\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License.\n", - "# ============================================================================" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2tkQUXmWhqRY" - }, - "source": [ - "# Lifetime Value prediction for Kaggle Acquire Valued Customer Challenge" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "Pw8bm9nV6YJ5" - }, - "source": [ - "\u003ctable align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KObdQwyXH2mC" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "from scipy import stats\n", - "import seaborn as sns\n", - "from sklearn import model_selection\n", - "from sklearn import preprocessing\n", - "import tensorflow as tf\n", - "from tensorflow import keras\n", - "from tensorflow.keras import backend as K\n", - "import tensorflow_probability as tfp\n", - "import tqdm\n", - "from typing import Sequence\n", - "\n", - "# install and import ltv\n", - "!pip install -q git+https://github.com/google/lifetime_value\n", - "import lifetime_value as ltv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "K41RmAfNXtu_" - }, - "outputs": [], - "source": [ - "tfd = tfp.distributions\n", - "%config InlineBackend.figure_format='retina'\n", - "sns.set_style('whitegrid')\n", - "pd.options.mode.chained_assignment = None # default='warn'" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "DoN-PRvNuIti" - }, - "source": [ - "## Global variables" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": { - "id": "3GGpDbxd3S5L" - }, - "outputs": [], - "source": [ - "COMPANY = '103600030' # @param { isTemplate: true, type: 'string'}\n", - "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n", - "MODEL = 'dnn' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n", - "LEARNING_RATE = 0.0002 # @param { isTemplate: true}\n", - "EPOCHS = 400 # @param { isTemplate: true, type: 'integer'}\n", - "OUTPUT_CSV_FOLDER = '/tmp/lifetime-value/kaggle_acquire_valued_shoppers_challenge/result' # @param { isTemplate: true, type: 'string'}" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "UK9Y5NoMtm3X" - }, - "outputs": [], - "source": [ - "CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']\n", - "NUMERIC_FEATURES = ['log_calibration_value']\n", - "\n", - "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "RzTaK6fFXMWT" - }, - "source": [ - "## Data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SFi0JMPu138h" - }, - "source": [ - "### Download data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "krXMbrkVNtdN" - }, - "source": [ - "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n", - "```\n", - "%%shell\n", - "mkdir ~/.kaggle\n", - "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} \u003e ~/.kaggle/kaggle.json\n", - "pip install kaggle\n", - "```" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0gf4ipd-14x0" - }, - "outputs": [], - "source": [ - "%%shell\n", - "if [ -e /tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv ]\n", - "then\n", - " echo \"File already exists, no need to download.\"\n", - "else\n", - " rm -rf /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " mkdir -p /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " cd /tmp/lifetime-value/acquire-valued-shoppers-challenge\n", - " kaggle competitions download -c acquire-valued-shoppers-challenge\n", - " echo \"Unzip file. 
This may take 10 min.\"\n", - " gunzip transactions.csv.gz\n", - "fi" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "IT53azGsa2a2" - }, - "source": [ - "### Load transaction csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "5tIMvE3dW1Ky" - }, - "outputs": [], - "source": [ - "def load_transaction_data(company):\n", - " all_data_filename = '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions.csv'\n", - " one_company_data_filename = (\n", - " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/transactions_company_{}.csv'\n", - " .format(COMPANY))\n", - " if os.path.isfile(one_company_data_filename):\n", - " df = pd.read_csv(one_company_data_filename)\n", - " else:\n", - " data_list = []\n", - " chunksize = 10**6\n", - " # 350 iterations\n", - " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n", - " data_list.append(chunk.query(\"company=='{}'\".format(company)))\n", - " df = pd.concat(data_list, axis=0)\n", - " df.to_csv(one_company_data_filename, index=None)\n", - " return df" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "9ra4bfwCVwKn" - }, - "source": [ - "### Preprocess data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "PlJl5g9Delmi" - }, - "outputs": [], - "source": [ - "def preprocess(df):\n", - " df = df.query('purchaseamount\u003e0')\n", - " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n", - " df['start_date'] = df.groupby('id')['date'].transform('min')\n", - "\n", - " # Compute calibration values\n", - " calibration_value = (\n", - " df.query('date==start_date').groupby('id')\n", - " ['purchaseamount'].sum().reset_index())\n", - " calibration_value.columns = ['id', 'calibration_value']\n", - "\n", - " # Compute holdout values\n", - " one_year_holdout_window_mask = (\n", - " (df['date'] \u003e df['start_date']) \u0026\n", - " (df['date'] \u003c= df['start_date'] + np.timedelta64(365, 'D')))\n", - " holdout_value = (\n", - " df[one_year_holdout_window_mask].groupby('id')\n", - " ['purchaseamount'].sum().reset_index())\n", - " holdout_value.columns = ['id', 'holdout_value']\n", - "\n", - " # Compute calibration attributes\n", - " calibration_attributes = (\n", - " df.query('date==start_date').sort_values(\n", - " 'purchaseamount', ascending=False).groupby('id')[[\n", - " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n", - " ]].first().reset_index())\n", - "\n", - " # Merge dataframes\n", - " customer_level_data = (\n", - " calibration_value.merge(calibration_attributes, how='left',\n", - " on='id').merge(\n", - " holdout_value, how='left', on='id'))\n", - " customer_level_data['holdout_value'] = (\n", - " customer_level_data['holdout_value'].fillna(0.))\n", - " customer_level_data[CATEGORICAL_FEATURES] = (\n", - " customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))\n", - "\n", - " # Specify data types\n", - " customer_level_data['log_calibration_value'] = (\n", - " np.log(customer_level_data['calibration_value']).astype('float32'))\n", - " customer_level_data['chain'] = (\n", - " customer_level_data['chain'].astype('category'))\n", - " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n", - " customer_level_data['brand'] = (\n", - " customer_level_data['brand'].astype('category'))\n", - " customer_level_data['category'] = (\n", - " customer_level_data['category'].astype('category'))\n", - " customer_level_data['label'] = (\n", - " 
customer_level_data['holdout_value'].astype('float32'))\n", - " return customer_level_data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "fP3q6uuMoXhA" - }, - "source": [ - "### Load customer-level csv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "X8B4zV1xoeMX" - }, - "outputs": [], - "source": [ - "def load_customer_level_csv(company):\n", - " customer_level_data_file = (\n", - " '/tmp/lifetime-value/acquire-valued-shoppers-challenge/customer_level_data_company_{}.csv'\n", - " .format(company))\n", - " if os.path.isfile(customer_level_data_file):\n", - " customer_level_data = pd.read_csv(customer_level_data_file)\n", - " else:\n", - " customer_level_data = preprocess(load_transaction_data(company))\n", - " for cat_col in CATEGORICAL_FEATURES:\n", - " customer_level_data[cat_col] = (\n", - " customer_level_data[cat_col].astype('category'))\n", - " for num_col in [\n", - " 'log_calibration_value', 'calibration_value', 'holdout_value'\n", - " ]:\n", - " customer_level_data[num_col] = (\n", - " customer_level_data[num_col].astype('float32'))\n", - "\n", - " return customer_level_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "88dVPdt5QWpu" - }, - "outputs": [], - "source": [ - "# Processes data. 350 iteration in total. May take 10min.\n", - "customer_level_data = load_customer_level_csv(COMPANY)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "09tqgvANtsil" - }, - "source": [ - "We observe a mixture of zero and lognormal distribution of holdout value." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BtF0z3VbmGev" - }, - "outputs": [], - "source": [ - "customer_level_data.label.apply(np.log1p).hist(bins=50)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "i4kN0uk4kZ68" - }, - "source": [ - "### Make train/eval" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "nc0MLKx2yD72" - }, - "outputs": [], - "source": [ - "def linear_split(df):\n", - " # get_dummies preserves numeric features.\n", - " x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values\n", - " y = df['label'].values\n", - " y0 = df['calibration_value'].values\n", - "\n", - " x_train, x_eval, y_train, y_eval, y0_train, y0_eval = (\n", - " model_selection.train_test_split(\n", - " x, y, y0, test_size=0.2, random_state=123))\n", - "\n", - " return x_train, x_eval, y_train, y_eval, y0_eval" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "eAGbXp9ax042" - }, - "outputs": [], - "source": [ - "def dnn_split(df):\n", - " for key in CATEGORICAL_FEATURES:\n", - " encoder = preprocessing.LabelEncoder()\n", - " df[key] = encoder.fit_transform(df[key])\n", - "\n", - " y0 = df['calibration_value'].values\n", - " df_train, df_eval, y0_train, y0_eval = model_selection.train_test_split(\n", - " df, y0, test_size=0.2, random_state=123)\n", - "\n", - " def feature_dict(df):\n", - " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n", - " features['numeric'] = df[NUMERIC_FEATURES].values\n", - " return features\n", - "\n", - " x_train, y_train = feature_dict(df_train), df_train['label'].values\n", - " x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values\n", - "\n", - " return x_train, x_eval, y_train, y_eval, y0_eval" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lqbShWBzR4NE" - }, - "source": [ - "## Model" - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "RAOttr0W0yTM" - }, - "outputs": [], - "source": [ - "def linear_model(output_units):\n", - " return tf.keras.experimental.LinearModel(output_units)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Q7huREFbR7Dl" - }, - "outputs": [], - "source": [ - "def embedding_dim(x):\n", - " return int(x**.25) + 1\n", - "\n", - "\n", - "def embedding_layer(vocab_size):\n", - " return tf.keras.Sequential([\n", - " tf.keras.layers.Embedding(\n", - " input_dim=vocab_size,\n", - " output_dim=embedding_dim(vocab_size),\n", - " input_length=1),\n", - " tf.keras.layers.Flatten(),\n", - " ])\n", - "\n", - "\n", - "def dnn_model(output_units, df):\n", - " numeric_input = tf.keras.layers.Input(\n", - " shape=(len(NUMERIC_FEATURES),), name='numeric')\n", - "\n", - " embedding_inputs = [\n", - " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n", - " for key in CATEGORICAL_FEATURES\n", - " ]\n", - "\n", - " embedding_outputs = [\n", - " embedding_layer(vocab_size=df[key].nunique())(input)\n", - " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n", - " ]\n", - "\n", - " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n", - " deep_model = tf.keras.Sequential([\n", - " tf.keras.layers.Dense(64, activation='relu'),\n", - " tf.keras.layers.Dense(32, activation='relu'),\n", - " tf.keras.layers.Dense(output_units),\n", - " ])\n", - " return tf.keras.Model(\n", - " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "U8l-KzZ12fbK" - }, - "source": [ - "### Train" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "L3HzXsj61uy3" - }, - "outputs": [], - "source": [ - "if LOSS == 'mse':\n", - " loss = keras.losses.MeanSquaredError()\n", - " output_units = 1\n", - "\n", - "if LOSS == 'ziln':\n", - " loss = ltv.zero_inflated_lognormal_loss\n", - " output_units = 3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "0pNM4q5m19Dv" - }, - "outputs": [], - "source": [ - "if MODEL == 'linear':\n", - " x_train, x_eval, y_train, y_eval, y0_eval = linear_split(customer_level_data)\n", - " model = linear_model(output_units)\n", - "\n", - "if MODEL == 'dnn':\n", - " x_train, x_eval, y_train, y_eval, y0_eval = dnn_split(customer_level_data)\n", - " model = dnn_model(output_units, customer_level_data)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Un-yJPHp31gp" - }, - "outputs": [], - "source": [ - "model.compile(loss=loss, optimizer=keras.optimizers.Adam(lr=LEARNING_RATE))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_GQ-RlIAfT62" - }, - "outputs": [], - "source": [ - "callbacks = [\n", - " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n", - " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "-BjnHV7MWhK1" - }, - "outputs": [], - "source": [ - "history = model.fit(\n", - " x=x_train,\n", - " y=y_train,\n", - " batch_size=1024,\n", - " epochs=EPOCHS,\n", - " verbose=2,\n", - " callbacks=callbacks,\n", - " validation_data=(x_eval, y_eval)).history" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mAJGs5SebDeN" - }, - "outputs": [], - "source": [ - 
"pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "bHaiutmy2aYm" - }, - "source": [ - "### Eval" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "l6E_5gYAYQMw" - }, - "outputs": [], - "source": [ - "if LOSS == 'mse':\n", - " y_pred = model.predict(x=x_eval, batch_size=1024).flatten()\n", - "\n", - "if LOSS == 'ziln':\n", - " logits = model.predict(x=x_eval, batch_size=1024)\n", - " y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Mm28qKSGXNyr" - }, - "outputs": [], - "source": [ - "df_pred = pd.DataFrame({\n", - " 'y_true': y_eval,\n", - " 'y_pred': y_pred,\n", - "})\n", - "df_pred.head(10)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zROhsEWxnA5u" - }, - "source": [ - "### Gini Coefficient" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gRsJ7y-632h_" - }, - "outputs": [], - "source": [ - "gain = pd.DataFrame({\n", - " 'lorenz': ltv.cumulative_true(y_eval, y_eval),\n", - " 'baseline': ltv.cumulative_true(y_eval, y0_eval),\n", - " 'model': ltv.cumulative_true(y_eval, y_pred),\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yg-ndbve4AL_" - }, - "outputs": [], - "source": [ - "num_customers = np.float32(gain.shape[0])\n", - "gain['cumulative_customer'] = (np.arange(num_customers) + 1.) / num_customers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WEoAvuCj4OVy" - }, - "outputs": [], - "source": [ - "ax = gain[[\n", - " 'cumulative_customer',\n", - " 'lorenz',\n", - " 'baseline',\n", - " 'model',\n", - "]].plot(\n", - " x='cumulative_customer', figsize=(8, 5), legend=True)\n", - "\n", - "ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='upper left')\n", - "\n", - "ax.set_xlabel('Cumulative Fraction of Customers')\n", - "ax.set_xticks(np.arange(0, 1.1, 0.1))\n", - "ax.set_xlim((0, 1.))\n", - "\n", - "ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')\n", - "ax.set_yticks(np.arange(0, 1.1, 0.1))\n", - "ax.set_ylim((0, 1.05))\n", - "ax.set_title('Gain Chart')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kzPqaiNO4iWC" - }, - "outputs": [], - "source": [ - "gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])\n", - "gini" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S84RitIa9PBu" - }, - "source": [ - "### Calibration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "X7sKbsEf6RvF" - }, - "outputs": [], - "source": [ - "df_decile = ltv.decile_stats(y_eval, y_pred)\n", - "df_decile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DHdLqUqdL4hf" - }, - "outputs": [], - "source": [ - "ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)\n", - "\n", - "ax.set_title('Decile Chart')\n", - "ax.set_xlabel('Prediction bucket')\n", - "ax.set_ylabel('Average bucket value')\n", - "ax.legend(['Label', 'Prediction'], loc='upper left')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nK6DQ89xU-d4" - }, - "source": [ - "### Rank Correlation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "I9qWGyY3WePz" - }, - "outputs": [], - "source": [ - "def spearmanr(x1: Sequence[float], x2: Sequence[float]) -\u003e float:\n", - " 
\"\"\"Calculates spearmanr rank correlation coefficient.\n", - "\n", - " See https://docs.scipy.org/doc/scipy/reference/stats.html.\n", - "\n", - " Args:\n", - " x1: 1D array_like.\n", - " x2: 1D array_like.\n", - "\n", - " Returns:\n", - " correlation: float.\n", - " \"\"\"\n", - " return stats.spearmanr(x1, x2, nan_policy='raise')[0]\n", - "\n", - "\n", - "spearman_corr = spearmanr(y_eval, y_pred)\n", - "spearman_corr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-i_AbqhXcurk" - }, - "source": [ - "### All metrics together" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Umqg1-0Bc1HS" - }, - "outputs": [], - "source": [ - "df_metrics = pd.DataFrame(\n", - " {\n", - " 'company': COMPANY,\n", - " 'model': MODEL,\n", - " 'loss': LOSS,\n", - " 'label_mean': y_eval.mean(),\n", - " 'pred_mean': y_pred.mean(),\n", - " 'label_positive': np.mean(y_eval \u003e 0),\n", - " 'decile_mape': df_decile['decile_mape'].mean(),\n", - " 'baseline_gini': gini['normalized'][1],\n", - " 'gini': gini['normalized'][2],\n", - " 'spearman_corr': spearman_corr,\n", - " },\n", - " index=[0])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "1LV1Hs3xcxnd" - }, - "outputs": [], - "source": [ - "df_metrics[[\n", - " 'company',\n", - " 'model',\n", - " 'loss',\n", - " 'label_mean',\n", - " 'pred_mean',\n", - " 'label_positive',\n", - " 'decile_mape',\n", - " 'baseline_gini',\n", - " 'gini',\n", - " 'spearman_corr',\n", - "]]" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "UVy6lYn4mSrj" - }, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "mtkQ4mqUEFsb" - }, - "outputs": [], - "source": [ - "output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "3qmLzJqOEFsm" - }, - "outputs": [], - "source": [ - "if not os.path.isdir(output_path):\n", - " os.makedirs(output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "61B5Zc_UEFsr" - }, - "outputs": [], - "source": [ - "output_file = os.path.join(output_path,\n", - " '{}_regression_{}.csv'.format(MODEL, LOSS))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gqglbXfwEFsv" - }, - "outputs": [], - "source": [ - "df_metrics.to_csv(output_file, index=False)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "last_runtime": { - "build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook", - "kind": "private" - }, - "name": "regression.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5RoRxBv3bRjy" + }, + "outputs": [], + "source": [ + "#@title Copyright 2019 The Lifetime Value Authors.\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the 
specific language governing permissions and\n",
+    "# limitations under the License.\n",
+    "# ============================================================================"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "2tkQUXmWhqRY"
+   },
+   "source": [
+    "# Lifetime Value prediction for Kaggle Acquire Valued Customer Challenge"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "Pw8bm9nV6YJ5"
+   },
+   "source": [
+    "<table align=\"left\">\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb\"><img src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" />Run in Google Colab</a>\n",
+    "  </td>\n",
+    "  <td>\n",
+    "    <a target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kaggle_acquire_valued_shoppers_challenge/regression.ipynb\"><img src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" />View source on GitHub</a>\n",
+    "  </td>\n",
+    "</table>
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KObdQwyXH2mC" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from scipy import stats\n", + "import seaborn as sns\n", + "from sklearn import model_selection\n", + "from sklearn import preprocessing\n", + "import tensorflow as tf\n", + "from tensorflow import keras\n", + "from tensorflow.keras import backend as K\n", + "import tensorflow_probability as tfp\n", + "import tqdm\n", + "from typing import Sequence\n", + "\n", + "# install and import ltv\n", + "#!pip install -q git+https://github.com/google/lifetime_value\n", + "!pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value\n", + "import lifetime_value as ltv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "K41RmAfNXtu_" + }, + "outputs": [], + "source": [ + "tfd = tfp.distributions\n", + "%config InlineBackend.figure_format='retina'\n", + "sns.set_style('whitegrid')\n", + "pd.options.mode.chained_assignment = None # default='warn'" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "DoN-PRvNuIti" + }, + "source": [ + "## Global variables" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3GGpDbxd3S5L" + }, + "outputs": [], + "source": [ + "COMPANY = '103600030' # @param { isTemplate: true, type: 'string'}\n", + "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n", + "#LOSS = 'mse' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n", + "MODEL = 'dnn' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n", + "# MODEL = 'linear' # @param { isTemplate: true, type: 'string'} ['linear', 'dnn']\n", + "LEARNING_RATE = 0.0002 # @param { isTemplate: true}\n", + "EPOCHS = 400 # @param { isTemplate: true, type: 'integer'}\n", + "DATA_FOLDER = './tmp/acquire-valued-shoppers-challenge' # @param { isTemplate: true, type: 'string'}\n", + "OUTPUT_CSV_FOLDER = f'{DATA_FOLDER}/result' # @param { isTemplate: true, type: 'string'}" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UK9Y5NoMtm3X" + }, + "outputs": [], + "source": [ + "CATEGORICAL_FEATURES = ['chain', 'dept', 'category', 'brand', 'productmeasure']\n", + "NUMERIC_FEATURES = ['log_calibration_value']\n", + "\n", + "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "RzTaK6fFXMWT" + }, + "source": [ + "## Data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SFi0JMPu138h" + }, + "source": [ + "### Download data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "krXMbrkVNtdN" + }, + "source": [ + "Setup kaggle API correctly following https://www.kaggle.com/docs/api\n", + "```\n", + "%%shell\n", + "mkdir ~/.kaggle\n", + "echo \\{\\\"username\\\":\\\"{your kaggle username}\\\",\\\"key\\\":\\\"{your kaggle api key}\\\"\\} > ~/.kaggle/kaggle.json\n", + "pip install kaggle\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set it DATA_FOLDER as an environment variable\n", + "%env DATA_FOLDER=$DATA_FOLDER" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0gf4ipd-14x0" + }, + "outputs": [], + "source": [ + "%%bash\n", + "if [ -e $DATA_FOLDER/transactions.csv ]\n", + "then\n", + " echo \"File already exists, no need to download.\"\n", + 
"else\n", + " rm -rf $DATA_FOLDER\n", + " mkdir -p $DATA_FOLDER\n", + " cd $DATA_FOLDER\n", + " kaggle competitions download -c acquire-valued-shoppers-challenge\n", + " echo \"Unzip file. This may take 10 min.\"\n", + " unzip acquire-valued-shoppers-challenge.zip transactions.csv.gz\n", + " gunzip transactions.csv.gz\n", + "fi" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IT53azGsa2a2" + }, + "source": [ + "### Load transaction csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "5tIMvE3dW1Ky" + }, + "outputs": [], + "source": [ + "def load_transaction_data(company):\n", + " all_data_filename = f'{DATA_FOLDER}/transactions.csv'\n", + " one_company_data_filename = f'{DATA_FOLDER}/transactions_company_{company}.csv'\n", + " if os.path.isfile(one_company_data_filename):\n", + " df = pd.read_csv(one_company_data_filename)\n", + " else:\n", + " data_list = []\n", + " chunksize = 10**6\n", + " # 350 iterations\n", + " for chunk in tqdm.tqdm(pd.read_csv(all_data_filename, chunksize=chunksize)):\n", + " data_list.append(chunk.query(\"company=={}\".format(company)))\n", + " df = pd.concat(data_list, axis=0)\n", + " df.to_csv(one_company_data_filename, index=None)\n", + " return df" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "9ra4bfwCVwKn" + }, + "source": [ + "### Preprocess data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "PlJl5g9Delmi" + }, + "outputs": [], + "source": [ + "def preprocess(df):\n", + " df = df.query('purchaseamount>0')\n", + " df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d')\n", + " df['start_date'] = df.groupby('id')['date'].transform('min')\n", + "\n", + " # Compute calibration values\n", + " calibration_value = (\n", + " df.query('date==start_date').groupby('id')\n", + " ['purchaseamount'].sum().reset_index())\n", + " calibration_value.columns = ['id', 'calibration_value']\n", + "\n", + " # Compute holdout values\n", + " one_year_holdout_window_mask = (\n", + " (df['date'] > df['start_date']) &\n", + " (df['date'] <= df['start_date'] + np.timedelta64(365, 'D')))\n", + " holdout_value = (\n", + " df[one_year_holdout_window_mask].groupby('id')\n", + " ['purchaseamount'].sum().reset_index())\n", + " holdout_value.columns = ['id', 'holdout_value']\n", + "\n", + " # Compute calibration attributes\n", + " calibration_attributes = (\n", + " df.query('date==start_date').sort_values(\n", + " 'purchaseamount', ascending=False).groupby('id')[[\n", + " 'chain', 'dept', 'category', 'brand', 'productmeasure'\n", + " ]].first().reset_index())\n", + "\n", + " # Merge dataframes\n", + " customer_level_data = (\n", + " calibration_value.merge(calibration_attributes, how='left',\n", + " on='id').merge(\n", + " holdout_value, how='left', on='id'))\n", + " customer_level_data['holdout_value'] = (\n", + " customer_level_data['holdout_value'].fillna(0.))\n", + " customer_level_data[CATEGORICAL_FEATURES] = (\n", + " customer_level_data[CATEGORICAL_FEATURES].fillna('UNKNOWN'))\n", + "\n", + " # Specify data types\n", + " customer_level_data['log_calibration_value'] = (\n", + " np.log(customer_level_data['calibration_value']).astype('float32'))\n", + " customer_level_data['chain'] = (\n", + " customer_level_data['chain'].astype('category'))\n", + " customer_level_data['dept'] = (customer_level_data['dept'].astype('category'))\n", + " customer_level_data['brand'] = (\n", + " customer_level_data['brand'].astype('category'))\n", + " customer_level_data['category'] = (\n", 
+ " customer_level_data['category'].astype('category'))\n", + " customer_level_data['label'] = (\n", + " customer_level_data['holdout_value'].astype('float32'))\n", + " return customer_level_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "fP3q6uuMoXhA" + }, + "source": [ + "### Load customer-level csv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X8B4zV1xoeMX" + }, + "outputs": [], + "source": [ + "def load_customer_level_csv(company):\n", + " customer_level_data_file = f'{DATA_FOLDER}/customer_level_data_company_{company}.csv'\n", + " if os.path.isfile(customer_level_data_file):\n", + " customer_level_data = pd.read_csv(customer_level_data_file)\n", + " else:\n", + " customer_level_data = preprocess(load_transaction_data(company))\n", + " for cat_col in CATEGORICAL_FEATURES:\n", + " customer_level_data[cat_col] = (\n", + " customer_level_data[cat_col].astype('category'))\n", + " for num_col in [\n", + " 'log_calibration_value', 'calibration_value', 'holdout_value'\n", + " ]:\n", + " customer_level_data[num_col] = (\n", + " customer_level_data[num_col].astype('float32'))\n", + "\n", + " return customer_level_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "88dVPdt5QWpu" + }, + "outputs": [], + "source": [ + "# Processes data. 350 iteration in total. May take 10min.\n", + "customer_level_data = load_customer_level_csv(COMPANY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "customer_level_data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "09tqgvANtsil" + }, + "source": [ + "We observe a mixture of zero and lognormal distribution of holdout value." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BtF0z3VbmGev" + }, + "outputs": [], + "source": [ + "customer_level_data.label.apply(np.log1p).hist(bins=50)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "i4kN0uk4kZ68" + }, + "source": [ + "### Make train/eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "nc0MLKx2yD72" + }, + "outputs": [], + "source": [ + "def linear_split(df):\n", + " # get_dummies preserves numeric features.\n", + " x = pd.get_dummies(df[ALL_FEATURES], drop_first=True).astype('float32').values\n", + " y = df['label'].values\n", + " y0 = df['calibration_value'].values\n", + "\n", + " x_train, x_eval, y_train, y_eval, y0_train, y0_eval = (\n", + " model_selection.train_test_split(\n", + " x, y, y0, test_size=0.2, random_state=123))\n", + "\n", + " return x_train, x_eval, y_train, y_eval, y0_eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "eAGbXp9ax042" + }, + "outputs": [], + "source": [ + "def dnn_split(df):\n", + " for key in CATEGORICAL_FEATURES:\n", + " encoder = preprocessing.LabelEncoder()\n", + " df[key] = encoder.fit_transform(df[key])\n", + "\n", + " y0 = df['calibration_value'].values\n", + " df_train, df_eval, y0_train, y0_eval = model_selection.train_test_split(\n", + " df, y0, test_size=0.2, random_state=123)\n", + "\n", + " def feature_dict(df):\n", + " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n", + " features['numeric'] = df[NUMERIC_FEATURES].values\n", + " return features\n", + "\n", + " x_train, y_train = feature_dict(df_train), df_train['label'].values\n", + " x_eval, y_eval = feature_dict(df_eval), df_eval['label'].values\n", + "\n", + " 
return x_train, x_eval, y_train, y_eval, y0_eval" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lqbShWBzR4NE" + }, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "def linear_model(output_units, input_dim):\n", + " return tf.keras.Sequential([\n", + " tf.keras.layers.Input(shape=(input_dim,)),\n", + " tf.keras.layers.Dense(output_units, activation=None)\n", + " ])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Q7huREFbR7Dl" + }, + "outputs": [], + "source": [ + "def embedding_dim(x):\n", + " return int(x**.25) + 1\n", + "\n", + "\n", + "def embedding_layer(vocab_size):\n", + " return tf.keras.Sequential([\n", + " tf.keras.layers.Embedding(\n", + " input_dim=vocab_size,\n", + " output_dim=embedding_dim(vocab_size)\n", + " ),\n", + " tf.keras.layers.Flatten(),\n", + " ])\n", + "\n", + "\n", + "def dnn_model(output_units, df):\n", + " numeric_input = tf.keras.layers.Input(\n", + " shape=(len(NUMERIC_FEATURES),), name='numeric')\n", + "\n", + " embedding_inputs = [\n", + " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n", + " for key in CATEGORICAL_FEATURES\n", + " ]\n", + "\n", + " embedding_outputs = [\n", + " embedding_layer(vocab_size=df[key].nunique())(input)\n", + " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n", + " ]\n", + "\n", + " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n", + " deep_model = tf.keras.Sequential([\n", + " tf.keras.layers.Dense(64, activation='relu'),\n", + " tf.keras.layers.Dense(32, activation='relu'),\n", + " tf.keras.layers.Dense(output_units),\n", + " ])\n", + " return tf.keras.Model(\n", + " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "U8l-KzZ12fbK" + }, + "source": [ + "### Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L3HzXsj61uy3" + }, + "outputs": [], + "source": [ + "if LOSS == 'mse':\n", + " loss = keras.losses.MeanSquaredError()\n", + " output_units = 1\n", + "\n", + "if LOSS == 'ziln':\n", + " loss = ltv.zero_inflated_lognormal_loss\n", + " output_units = 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "0pNM4q5m19Dv" + }, + "outputs": [], + "source": [ + "if MODEL == 'linear':\n", + " x_train, x_eval, y_train, y_eval, y0_eval = linear_split(customer_level_data)\n", + " model = linear_model(output_units, x_train.shape[1])\n", + "\n", + "if MODEL == 'dnn':\n", + " x_train, x_eval, y_train, y_eval, y0_eval = dnn_split(customer_level_data)\n", + " model = dnn_model(output_units, customer_level_data)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Un-yJPHp31gp" + }, + "outputs": [], + "source": [ + "model.compile(loss=loss, optimizer=keras.optimizers.Adam(learning_rate=LEARNING_RATE))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_GQ-RlIAfT62" + }, + "outputs": [], + "source": [ + "callbacks = [\n", + " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n", + " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-BjnHV7MWhK1" + }, + "outputs": [], + "source": [ + "# y array needs to have a two dimensional shape to work with ziln loss 
function\n", + "# so we use [:, np.newaxis] to make the data two-dimensional for the fit function call\n", + "history = model.fit(\n", + " x=x_train,\n", + " y=y_train[:, np.newaxis],\n", + " batch_size=1024,\n", + " epochs=EPOCHS,\n", + " verbose=2,\n", + " callbacks=callbacks,\n", + " validation_data=(x_eval, y_eval[:, np.newaxis])).history" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mAJGs5SebDeN" + }, + "outputs": [], + "source": [ + "pd.DataFrame(history)[['loss', 'val_loss']][2:].plot()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "bHaiutmy2aYm" + }, + "source": [ + "### Eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "l6E_5gYAYQMw" + }, + "outputs": [], + "source": [ + "if LOSS == 'mse':\n", + " y_pred = model.predict(x=x_eval, batch_size=1024).flatten()\n", + "\n", + "if LOSS == 'ziln':\n", + " logits = model.predict(x=x_eval, batch_size=1024)\n", + " y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Mm28qKSGXNyr" + }, + "outputs": [], + "source": [ + "df_pred = pd.DataFrame({\n", + " 'y_true': y_eval,\n", + " 'y_pred': y_pred,\n", + "})\n", + "df_pred.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zROhsEWxnA5u" + }, + "source": [ + "### Gini Coefficient" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gRsJ7y-632h_" + }, + "outputs": [], + "source": [ + "gain = pd.DataFrame({\n", + " 'lorenz': ltv.cumulative_true(y_eval, y_eval),\n", + " 'baseline': ltv.cumulative_true(y_eval, y0_eval),\n", + " 'model': ltv.cumulative_true(y_eval, y_pred),\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yg-ndbve4AL_" + }, + "outputs": [], + "source": [ + "num_customers = np.float32(gain.shape[0])\n", + "gain['cumulative_customer'] = (np.arange(num_customers) + 1.) 
/ num_customers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WEoAvuCj4OVy" + }, + "outputs": [], + "source": [ + "ax = gain[[\n", + " 'cumulative_customer',\n", + " 'lorenz',\n", + " 'baseline',\n", + " 'model',\n", + "]].plot(\n", + " x='cumulative_customer', figsize=(8, 5), legend=True)\n", + "\n", + "ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='upper left')\n", + "\n", + "ax.set_xlabel('Cumulative Fraction of Customers')\n", + "ax.set_xticks(np.arange(0, 1.1, 0.1))\n", + "ax.set_xlim((0, 1.))\n", + "\n", + "ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')\n", + "ax.set_yticks(np.arange(0, 1.1, 0.1))\n", + "ax.set_ylim((0, 1.05))\n", + "ax.set_title('Gain Chart')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kzPqaiNO4iWC" + }, + "outputs": [], + "source": [ + "gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])\n", + "gini" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S84RitIa9PBu" + }, + "source": [ + "### Calibration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X7sKbsEf6RvF" + }, + "outputs": [], + "source": [ + "df_decile = ltv.decile_stats(y_eval, y_pred)\n", + "df_decile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DHdLqUqdL4hf" + }, + "outputs": [], + "source": [ + "ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)\n", + "\n", + "ax.set_title('Decile Chart')\n", + "ax.set_xlabel('Prediction bucket')\n", + "ax.set_ylabel('Average bucket value')\n", + "ax.legend(['Label', 'Prediction'], loc='upper left')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nK6DQ89xU-d4" + }, + "source": [ + "### Rank Correlation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "I9qWGyY3WePz" + }, + "outputs": [], + "source": [ + "def spearmanr(x1: Sequence[float], x2: Sequence[float]) -> float:\n", + " \"\"\"Calculates spearmanr rank correlation coefficient.\n", + "\n", + " See https://docs.scipy.org/doc/scipy/reference/stats.html.\n", + "\n", + " Args:\n", + " x1: 1D array_like.\n", + " x2: 1D array_like.\n", + "\n", + " Returns:\n", + " correlation: float.\n", + " \"\"\"\n", + " return stats.spearmanr(x1, x2, nan_policy='raise')[0]\n", + "\n", + "\n", + "spearman_corr = spearmanr(y_eval, y_pred)\n", + "spearman_corr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-i_AbqhXcurk" + }, + "source": [ + "### All metrics together" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Umqg1-0Bc1HS" + }, + "outputs": [], + "source": [ + "df_metrics = pd.DataFrame(\n", + " {\n", + " 'company': COMPANY,\n", + " 'model': MODEL,\n", + " 'loss': LOSS,\n", + " 'label_mean': y_eval.mean(),\n", + " 'pred_mean': y_pred.mean(),\n", + " 'label_positive': np.mean(y_eval > 0),\n", + " 'decile_mape': df_decile['decile_mape'].mean(),\n", + " 'baseline_gini': gini['normalized'][1],\n", + " 'gini': gini['normalized'][2],\n", + " 'spearman_corr': spearman_corr,\n", + " },\n", + " index=[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "1LV1Hs3xcxnd" + }, + "outputs": [], + "source": [ + "df_metrics[[\n", + " 'company',\n", + " 'model',\n", + " 'loss',\n", + " 'label_mean',\n", + " 'pred_mean',\n", + " 'label_positive',\n", + " 'decile_mape',\n", + " 'baseline_gini',\n", + " 'gini',\n", + " 'spearman_corr',\n", + "]]" + ] + }, + { + "cell_type": 
"markdown", + "metadata": { + "id": "UVy6lYn4mSrj" + }, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "mtkQ4mqUEFsb" + }, + "outputs": [], + "source": [ + "output_path = os.path.join(OUTPUT_CSV_FOLDER, COMPANY)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3qmLzJqOEFsm" + }, + "outputs": [], + "source": [ + "if not os.path.isdir(output_path):\n", + " os.makedirs(output_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "61B5Zc_UEFsr" + }, + "outputs": [], + "source": [ + "output_file = os.path.join(output_path,\n", + " '{}_regression_{}.csv'.format(MODEL, LOSS))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gqglbXfwEFsv" + }, + "outputs": [], + "source": [ + "df_metrics.to_csv(output_file, index=False)" + ] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "last_runtime": { + "build_target": "//learning/deepmind/public/tools/ml_python:ml_notebook", + "kind": "private" + }, + "name": "regression.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/notebooks/kdd_cup_98/regression.ipynb b/notebooks/kdd_cup_98/regression.ipynb index 1a935a7..31bd01c 100644 --- a/notebooks/kdd_cup_98/regression.ipynb +++ b/notebooks/kdd_cup_98/regression.ipynb @@ -1,1050 +1,1110 @@ { - "cells": [ - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gE76T8J7IsGC" - }, - "outputs": [], - "source": [ - "#@title Copyright 2019 The Lifetime Value Authors.\n", - "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", - "# you may not use this file except in compliance with the License.\n", - "# You may obtain a copy of the License at\n", - "#\n", - "# https://www.apache.org/licenses/LICENSE-2.0\n", - "#\n", - "# Unless required by applicable law or agreed to in writing, software\n", - "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", - "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", - "# See the License for the specific language governing permissions and\n", - "# limitations under the License.\n", - "# ============================================================================" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "sswTFWDv7HZd" - }, - "source": [ - "# KDD Cup 98 LTV Prediction" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "PSr1mSJP7O1J" - }, - "source": [ - "\u003ctable align=\"left\"\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://colab.research.google.com/github/google/lifetime_value/blob/master/notebooks/kdd_cup_98/regression.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/colab_logo_32px.png\" /\u003eRun in Google Colab\u003c/a\u003e\n", - " \u003c/td\u003e\n", - " \u003ctd\u003e\n", - " \u003ca target=\"_blank\" href=\"https://github.com/google/lifetime_value/blob/master/notebooks/kdd_cup_98/regression.ipynb\"\u003e\u003cimg src=\"https://www.tensorflow.org/images/GitHub-Mark-32px.png\" /\u003eView source on 
GitHub\u003c/a\u003e\n", - " \u003c/td\u003e\n", - "\u003c/table\u003e" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "pBXE3Dz3NI4A" - }, - "outputs": [], - "source": [ - "import os\n", - "\n", - "import numpy as np\n", - "import pandas as pd\n", - "from scipy import stats\n", - "import matplotlib.pyplot as plt\n", - "import seaborn as sns\n", - "import tensorflow as tf\n", - "import tensorflow_probability as tfp\n", - "from typing import Sequence\n", - "\n", - "# install and import ltv\n", - "!pip install -q git+https://github.com/google/lifetime_value\n", - "import lifetime_value as ltv" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Bq0Ah16lBmgV" - }, - "outputs": [], - "source": [ - "tfd = tfp.distributions\n", - "%config InlineBackend.figure_format='retina'\n", - "sns.set_style('whitegrid')" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "2qN319qZK3IG" - }, - "source": [ - "## Configs" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "hNy_ybw_K19n" - }, - "outputs": [], - "source": [ - "MODEL = 'dnn'\n", - "LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n", - "LEARNING_RATE = 0.001 # @param { isTemplate: true}\n", - "VERSION = 0 # @param { isTemplate: true, type: 'integer'}\n", - "OUTPUT_CSV_FOLDER = '/tmp/lifetime-value/kdd_cup_98/result' # @param { isTemplate: true, type: 'string'}" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "mDSR921CCEcL" - }, - "source": [ - "## Load data" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "lHxp4rOGI02Q" - }, - "source": [ - "Download kdd_cup_98 data to /tmp/lifetime-value/kdd_cup_98" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Dg3qtgJyJpdi" - }, - "outputs": [], - "source": [ - "%%shell\n", - "mkdir -p /tmp/lifetime-value/kdd_cup_98\n", - "wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98lrn.zip -P /tmp/lifetime-value/kdd_cup_98/\n", - "wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98val.zip -P /tmp/lifetime-value/kdd_cup_98/\n", - "wget https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/valtargt.txt -P /tmp/lifetime-value/kdd_cup_98/\n", - "cd /tmp/lifetime-value/kdd_cup_98/\n", - "unzip cup98lrn.zip\n", - "unzip cup98val.zip" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "a_LnLmQQRlYF" - }, - "outputs": [], - "source": [ - "df_train = pd.read_csv('/tmp/lifetime-value/kdd_cup_98/cup98LRN.txt')\n", - "num_train = df_train.shape[0]\n", - "df_eval = pd.read_csv('/tmp/lifetime-value/kdd_cup_98/cup98VAL.txt')\n", - "df_eval_target = pd.read_csv('/tmp/lifetime-value/kdd_cup_98/valtargt.txt')\n", - "df_eval = df_eval.merge(df_eval_target, on='CONTROLN')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ggQmy9wiP5M6" - }, - "outputs": [], - "source": [ - "df = pd.concat([df_train, df_eval], axis=0, sort=True)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "0rgxHpIyjaMH" - }, - "source": [ - "## Label distribution" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Xmpu_d3YjcFC" - }, - "outputs": [], - "source": [ - "y = df['TARGET_D'][:num_train]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yMr2EDRyK5Sb" - }, - "outputs": [], - "source": [ - "def plot_hist_log_scale(y):\n", - " max_val = 
y.max() + 1. \n", - " ax = pd.Series(y).hist(\n", - " figsize=(8, 5), bins = 10 ** np.linspace(0., np.log10(max_val), 20))\n", - "\n", - " plt.xlabel('Donation ($)')\n", - " plt.ylabel('Count')\n", - " # plt.title('Histogram of LTV')\n", - " plt.xticks(rotation='horizontal')\n", - " plt.legend(loc='upper left')\n", - " ax.set_xscale('log')\n", - " ax.grid(False)\n", - " # Hide the right and top spines\n", - " ax.spines['right'].set_visible(False)\n", - " ax.spines['top'].set_visible(False)\n", - " # Only show ticks on the left and bottom spines\n", - " ax.yaxis.set_ticks_position('left')\n", - " ax.xaxis.set_ticks_position('bottom')\n", - " plt.show()\n", - "\n", - " fig = ax.get_figure()\n", - " output_file = tf.io.gfile.GFile(\n", - " '/tmp/lifetime-value/kdd_cup_98/histogram_kdd98_log_scale.pdf',\n", - " 'wb')\n", - " fig.savefig(output_file, bbox_inches='tight', format='pdf')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "KbwCzGkBOWhH" - }, - "outputs": [], - "source": [ - "plot_hist_log_scale(y[y\u003e0])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "1XXMLbnlCdlN" - }, - "source": [ - "## Preprocess features" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "L1sBf_RSU3pR" - }, - "source": [ - "### Vocab" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xB_ddsd_U_4e" - }, - "outputs": [], - "source": [ - "VOCAB_FEATURES = [\n", - " 'ODATEDW', # date of donor's first gift (YYMM)\n", - " 'OSOURCE', # donor acquisition mailing list\n", - " 'TCODE', # donor title code\n", - " 'STATE',\n", - " 'ZIP',\n", - " 'DOMAIN', # urbanicity level and socio-economic status of the neighborhood\n", - " 'CLUSTER', # socio-economic status\n", - " 'GENDER',\n", - " 'MAXADATE', # date of the most recent promotion received\n", - " 'MINRDATE',\n", - " 'LASTDATE',\n", - " 'FISTDATE',\n", - " 'RFA_2A',\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "f2oPZGVLRSPe" - }, - "outputs": [], - "source": [ - "df['ODATEDW'] = df['ODATEDW'].astype('str')\n", - "df['TCODE'] = df['TCODE'].apply(\n", - " lambda x: '{:03d}'.format(x // 1000 if x \u003e 1000 else x))\n", - "df['ZIP'] = df['ZIP'].str.slice(0, 5)\n", - "df['MAXADATE'] = df['MAXADATE'].astype('str')\n", - "df['MINRDATE'] = df['MINRDATE'].astype('str')\n", - "df['LASTDATE'] = df['LASTDATE'].astype('str')\n", - "df['FISTDATE'] = df['FISTDATE'].astype('str')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "isL9Ofv9JLAP" - }, - "outputs": [], - "source": [ - "def label_encoding(y, frequency_threshold=100):\n", - " value_counts = pd.value_counts(y)\n", - " categories = value_counts[\n", - " value_counts \u003e= frequency_threshold].index.to_numpy()\n", - " # 0 indicates the unknown category.\n", - " return pd.Categorical(y, categories=categories).codes + 1" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "BgXGO5D0OdJP" - }, - "outputs": [], - "source": [ - "for key in VOCAB_FEATURES:\n", - " df[key] = label_encoding(df[key])" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "kZkmnJ93Zrjw" - }, - "source": [ - "### Indicator" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "tGBpMfaGhCD0" - }, - "outputs": [], - "source": [ - "MAIL_ORDER_RESPONSES = [\n", - " 'MBCRAFT',\n", - " 'MBGARDEN',\n", - " 'MBBOOKS',\n", - " 'MBCOLECT',\n", - " 'MAGFAML',\n", - " 'MAGFEM',\n", - " 
'MAGMALE',\n", - " 'PUBGARDN',\n", - " 'PUBCULIN',\n", - " 'PUBHLTH',\n", - " 'PUBDOITY',\n", - " 'PUBNEWFN',\n", - " 'PUBPHOTO',\n", - " 'PUBOPP',\n", - " 'RFA_2F',\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4V-DeOZFZhjB" - }, - "outputs": [], - "source": [ - "INDICATOR_FEATURES = [\n", - " 'AGE', # age decile, 0 indicates unknown\n", - " 'NUMCHLD',\n", - " 'INCOME',\n", - " 'WEALTH1',\n", - " 'HIT',\n", - "] + MAIL_ORDER_RESPONSES" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "U9y5qA1vZ0kz" - }, - "outputs": [], - "source": [ - "df['AGE'] = pd.qcut(df['AGE'].values, 10).codes + 1\n", - "df['NUMCHLD'] = df['NUMCHLD'].apply(lambda x: 0 if np.isnan(x) else int(x))\n", - "df['INCOME'] = df['INCOME'].apply(lambda x: 0 if np.isnan(x) else int(x))\n", - "df['WEALTH1'] = df['WEALTH1'].apply(lambda x: 0 if np.isnan(x) else int(x) + 1)\n", - "df['HIT'] = pd.qcut(df['HIT'].values, q=50, duplicates='drop').codes\n", - "\n", - "for col in MAIL_ORDER_RESPONSES:\n", - " df[col] = pd.qcut(df[col].values, q=20, duplicates='drop').codes + 1" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8DOO_2a-U6gr" - }, - "source": [ - "### Numeric" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "rqVteSLDiLVr" - }, - "outputs": [], - "source": [ - "NUMERIC_FEATURES = [\n", - " # binary\n", - " 'MAILCODE', # bad address\n", - " 'NOEXCH', # do not exchange\n", - " 'RECINHSE', # donor has given to PVA's in house program\n", - " 'RECP3', # donor has given to PVA's P3 program\n", - " 'RECPGVG', # planned giving record\n", - " 'RECSWEEP', # sweepstakes record\n", - " 'HOMEOWNR', # home owner\n", - " 'CHILD03',\n", - " 'CHILD07',\n", - " 'CHILD12',\n", - " 'CHILD18',\n", - "\n", - " # continuous\n", - " 'CARDPROM',\n", - " 'NUMPROM',\n", - " 'CARDPM12',\n", - " 'NUMPRM12',\n", - " 'RAMNTALL',\n", - " 'NGIFTALL',\n", - " 'MINRAMNT',\n", - " 'MAXRAMNT',\n", - " 'LASTGIFT',\n", - " 'AVGGIFT',\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "xMRP05Ztic0A" - }, - "outputs": [], - "source": [ - "df['MAILCODE'] = (df['MAILCODE'] == 'B').astype('float32')\n", - "df['PVASTATE'] = df['PVASTATE'].isin(['P', 'E']).astype('float32')\n", - "df['NOEXCH'] = df['NOEXCH'].isin(['X', '1']).astype('float32')\n", - "df['RECINHSE'] = (df['RECINHSE'] == 'X').astype('float32')\n", - "df['RECP3'] = (df['RECP3'] == 'X').astype('float32')\n", - "df['RECPGVG'] = (df['RECPGVG'] == 'X').astype('float32')\n", - "df['RECSWEEP'] = (df['RECSWEEP'] == 'X').astype('float32')\n", - "df['HOMEOWNR'] = (df['HOMEOWNR'] == 'H').astype('float32')\n", - "df['CHILD03'] = df['CHILD03'].isin(['M', 'F', 'B']).astype('float32')\n", - "df['CHILD07'] = df['CHILD07'].isin(['M', 'F', 'B']).astype('float32')\n", - "df['CHILD12'] = df['CHILD12'].isin(['M', 'F', 'B']).astype('float32')\n", - "df['CHILD18'] = df['CHILD18'].isin(['M', 'F', 'B']).astype('float32')\n", - "\n", - "df['CARDPROM'] = df['CARDPROM'] / 100\n", - "df['NUMPROM'] = df['NUMPROM'] / 100\n", - "df['CARDPM12'] = df['CARDPM12'] / 100\n", - "df['NUMPRM12'] = df['NUMPRM12'] / 100\n", - "df['RAMNTALL'] = np.log1p(df['RAMNTALL'])\n", - "df['NGIFTALL'] = np.log1p(df['NGIFTALL'])\n", - "df['MINRAMNT'] = np.log1p(df['MINRAMNT'])\n", - "df['MAXRAMNT'] = np.log1p(df['MAXRAMNT'])\n", - "df['LASTGIFT'] = np.log1p(df['LASTGIFT'])\n", - "df['AVGGIFT'] = np.log1p(df['AVGGIFT'])" - ] - }, - { - "cell_type": "markdown", - 
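These indicator transforms are carried over unchanged into the updated notebook further below; they use `pd.qcut(...).codes + 1` to turn skewed counts into small integer bucket ids, reserving 0 for the unknown bucket. A toy illustration of the pattern, and of why `duplicates='drop'` is needed for zero-heavy columns such as HIT:

```
import numpy as np
import pandas as pd

x = np.array([0, 0, 0, 0, 1, 2, 3, 10])  # zero-heavy, like HIT
# Without duplicates='drop', pd.qcut raises a ValueError here because
# several quantile edges collapse onto 0.
codes = pd.qcut(x, q=4, duplicates='drop').codes + 1
print(list(codes))  # small consecutive bucket ids, e.g. [1, 1, 1, 1, 2, 2, 3, 3]
```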
"metadata": { - "id": "GoLg1PvWuCT_" - }, - "source": [ - "### All" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "lSnNgjBCuJdb" - }, - "outputs": [], - "source": [ - "CATEGORICAL_FEATURES = VOCAB_FEATURES + INDICATOR_FEATURES\n", - "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8HJBvvCxRPg3" - }, - "source": [ - "## Train/eval split" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "N7BXLB1eHovl" - }, - "outputs": [], - "source": [ - "def dnn_split(df):\n", - " df_train = df.iloc[:num_train]\n", - " df_eval = df.iloc[num_train:]\n", - "\n", - " def feature_dict(df):\n", - " features = {k: v.values for k, v in dict(df[CATEGORICAL_FEATURES]).items()}\n", - " features['numeric'] = df[NUMERIC_FEATURES].astype('float32').values\n", - " return features\n", - "\n", - " x_train, y_train = feature_dict(df_train), df_train['TARGET_D'].astype(\n", - " 'float32').values\n", - " x_eval, y_eval = feature_dict(df_eval), df_eval['TARGET_D'].astype(\n", - " 'float32').values\n", - "\n", - " return x_train, x_eval, y_train, y_eval" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "4yw6fekBtX7X" - }, - "source": [ - "## Model" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_rIuO0XYtZH2" - }, - "outputs": [], - "source": [ - "def embedding_dim(x):\n", - " return int(x**.25) + 1\n", - "\n", - "\n", - "def embedding_layer(vocab_size):\n", - " return tf.keras.Sequential([\n", - " tf.keras.layers.Embedding(\n", - " input_dim=vocab_size,\n", - " output_dim=embedding_dim(vocab_size),\n", - " input_length=1),\n", - " tf.keras.layers.Flatten(),\n", - " ])\n", - "\n", - "\n", - "def dnn_model(output_units):\n", - " numeric_input = tf.keras.layers.Input(\n", - " shape=(len(NUMERIC_FEATURES),), name='numeric')\n", - "\n", - " embedding_inputs = [\n", - " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n", - " for key in CATEGORICAL_FEATURES\n", - " ]\n", - "\n", - " embedding_outputs = [\n", - " embedding_layer(vocab_size=df[key].max() + 1)(input)\n", - " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n", - " ]\n", - "\n", - " deep_input = tf.keras.layers.concatenate([numeric_input] + embedding_outputs)\n", - " deep_model = tf.keras.Sequential([\n", - " tf.keras.layers.Dense(128, activation='relu'),\n", - " tf.keras.layers.Dense(128, activation='relu'),\n", - " tf.keras.layers.Dense(64, activation='relu'),\n", - " tf.keras.layers.Dense(64, activation='relu'),\n", - " tf.keras.layers.Dense(units=output_units),\n", - " ])\n", - " return tf.keras.Model(\n", - " inputs=[numeric_input] + embedding_inputs, outputs=deep_model(deep_input))" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "G5h7X6botcHl" - }, - "source": [ - "## Loss" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iJ9gpkC6tgP0" - }, - "outputs": [], - "source": [ - "if LOSS == 'mse':\n", - " loss = tf.keras.losses.MeanSquaredError()\n", - " output_units = 1\n", - "\n", - "if LOSS == 'ziln':\n", - " loss = ltv.zero_inflated_lognormal_loss\n", - " output_units = 3" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "_afFfIritjCM" - }, - "outputs": [], - "source": [ - "x_train, x_eval, y_train, y_eval = dnn_split(df)\n", - "model = dnn_model(output_units)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": 
"Qj3kI7pyVwzO" - }, - "outputs": [], - "source": [ - "model.compile(optimizer=tf.keras.optimizers.Nadam(lr=LEARNING_RATE), loss=loss)" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "KZSYxgWdwiXC" - }, - "source": [ - "## Train" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Nwj9h5ysQDLp" - }, - "outputs": [], - "source": [ - "callbacks = [\n", - " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n", - " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n", - "]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Vb5Tnld6hsfx" - }, - "outputs": [], - "source": [ - "history = model.fit(\n", - " x=x_train,\n", - " y=y_train,\n", - " batch_size=2048,\n", - " epochs=200,\n", - " verbose=2,\n", - " callbacks=callbacks,\n", - " validation_data=(x_eval, y_eval)).history" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "J1sLSUdgvfa6" - }, - "outputs": [], - "source": [ - "pd.DataFrame(history)[['loss', 'val_loss']].plot();" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "jRKuZBqhvhT9" - }, - "source": [ - "## Eval" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "q9_zNMd3vjNk" - }, - "outputs": [], - "source": [ - "if LOSS == 'mse':\n", - " y_pred = model.predict(x=x_eval, batch_size=1024).flatten()\n", - "\n", - "if LOSS == 'ziln':\n", - " logits = model.predict(x=x_eval, batch_size=1024)\n", - " y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "SkfkUMUvUu_E" - }, - "source": [ - "### Total Profit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "AwfWAp8WQuns" - }, - "outputs": [], - "source": [ - "unit_costs = [0.4, 0.5, 0.6, 0.68, 0.7, 0.8, 0.9, 1.0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "zqi91dfCUxpx" - }, - "outputs": [], - "source": [ - "num_mailed = [np.sum(y_pred \u003e v) for v in unit_costs]\n", - "num_mailed" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "ZgFjZUcuhScv" - }, - "outputs": [], - "source": [ - "baseline_total_profit = np.sum(y_eval - 0.68)\n", - "baseline_total_profit" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "VwsFnin5U-R9" - }, - "outputs": [], - "source": [ - "total_profits = [np.sum(y_eval[y_pred \u003e v] - v) for v in unit_costs]\n", - "total_profits" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "zROhsEWxnA5u" - }, - "source": [ - "### Gini Coefficient" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "gRsJ7y-632h_" - }, - "outputs": [], - "source": [ - "gain = pd.DataFrame({\n", - " 'lorenz': ltv.cumulative_true(y_eval, y_eval),\n", - " 'baseline': ltv.cumulative_true(y_eval, x_eval['numeric'][:, 19]),\n", - " 'model': ltv.cumulative_true(y_eval, y_pred),\n", - "})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "yg-ndbve4AL_" - }, - "outputs": [], - "source": [ - "num_customers = np.float32(gain.shape[0])\n", - "gain['cumulative_customer'] = (np.arange(num_customers) + 1.) 
/ num_customers" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "WEoAvuCj4OVy" - }, - "outputs": [], - "source": [ - "ax = gain[[\n", - " 'cumulative_customer',\n", - " 'lorenz',\n", - " 'baseline',\n", - " 'model',\n", - "]].plot(\n", - " x='cumulative_customer', figsize=(8, 5), legend=True)\n", - "\n", - "ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='lower right')\n", - "\n", - "ax.set_xlabel('Cumulative Fraction of Customers')\n", - "ax.set_xticks(np.arange(0, 1.1, 0.1))\n", - "ax.set_xlim((0, 1.))\n", - "\n", - "ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')\n", - "ax.set_yticks(np.arange(0, 1.1, 0.1))\n", - "ax.set_ylim((0, 1.05))\n", - "ax.set_title('Gain Chart');" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "kzPqaiNO4iWC" - }, - "outputs": [], - "source": [ - "gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])\n", - "gini" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "S84RitIa9PBu" - }, - "source": [ - "### Calibration" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "X7sKbsEf6RvF" - }, - "outputs": [], - "source": [ - "df_decile = ltv.decile_stats(y_eval, y_pred)\n", - "df_decile" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "DHdLqUqdL4hf" - }, - "outputs": [], - "source": [ - "ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)\n", - "\n", - "ax.set_title('Decile Chart')\n", - "ax.set_xlabel('Prediction bucket')\n", - "ax.set_ylabel('Average bucket value')\n", - "ax.legend(['Label', 'Prediction'], loc='upper left');" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "nK6DQ89xU-d4" - }, - "source": [ - "### Rank Correlation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "I9qWGyY3WePz" - }, - "outputs": [], - "source": [ - "def spearmanr(\n", - " x1: Sequence[float],\n", - " x2: Sequence[float]) -\u003e float:\n", - " \"\"\"Calculates spearmanr rank correlation coefficient.\n", - "\n", - " See https://docs.scipy.org/doc/scipy/reference/stats.html.\n", - "\n", - " Args:\n", - " x1: 1D array_like.\n", - " x2: 1D array_like.\n", - "\n", - " Returns:\n", - " correlation: float.\n", - " \"\"\"\n", - " return stats.spearmanr(x1, x2, nan_policy='raise')[0]\n", - "\n", - "\n", - "spearman_corr = spearmanr(y_eval, y_pred)\n", - "spearman_corr" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "-i_AbqhXcurk" - }, - "source": [ - "### All metrics together" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "Umqg1-0Bc1HS" - }, - "outputs": [], - "source": [ - "df_metrics = pd.DataFrame({\n", - " 'model': MODEL,\n", - " 'loss_function': LOSS,\n", - " 'train_loss': history['loss'][-1],\n", - " 'eval_loss': history['val_loss'][-1],\n", - " 'label_positive': np.mean(y_eval \u003e 0),\n", - " 'label_mean': y_eval.mean(),\n", - " 'pred_mean': y_pred.mean(),\n", - " 'decile_mape': df_decile['decile_mape'].mean(),\n", - " 'baseline_gini': gini['normalized'][1],\n", - " 'gini': gini['normalized'][2],\n", - " 'spearman_corr': spearman_corr,\n", - "}, index=[VERSION])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "C_cM2Mc2SB3W" - }, - "outputs": [], - "source": [ - "for unit_cost, total_profit in zip(unit_costs, total_profits):\n", - " df_metrics['total_profit_{:02d}'.format(int(unit_cost * 100))] = total_profit" - ] - }, - { - 
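Both the old and the updated kdd notebooks build the gain chart the same way: customers are sorted by descending prediction, the captured share of the total label is accumulated, and the Gini compares the area under that curve to the diagonal. An illustrative sketch of what `ltv.cumulative_true` and `ltv.gini_from_gain` compute (an illustration, not the library's exact code):

```
import numpy as np

def cumulative_true_sketch(y_true, y_pred):
    """Cumulative share of total label, customers sorted by descending prediction."""
    order = np.argsort(-np.asarray(y_pred))
    return np.cumsum(np.asarray(y_true)[order]) / np.sum(y_true)

def gini_sketch(cum):
    """Twice the average gap between a gain curve and the diagonal."""
    n = len(cum)
    diagonal = (np.arange(n) + 1.0) / n
    return 2.0 * np.mean(cum - diagonal)

y_true = np.array([0.0, 0.0, 10.0, 90.0])
y_pred = np.array([1.0, 2.0, 30.0, 50.0])
model_gini = gini_sketch(cumulative_true_sketch(y_true, y_pred))
lorenz_gini = gini_sketch(cumulative_true_sketch(y_true, y_true))
print(model_gini / lorenz_gini)  # normalized Gini; 1.0 here because the ranking is perfect
```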
"cell_type": "code", - "execution_count": null, - "metadata": { - "id": "iyMvsOtbRrXZ" - }, - "outputs": [], - "source": [ - "df_metrics.T" - ] - }, - { - "cell_type": "markdown", - "metadata": { - "id": "8uHtLKk1x0IE" - }, - "source": [ - "## Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "L-fMkqWIm6X6" - }, - "outputs": [], - "source": [ - "output_path = OUTPUT_CSV_FOLDER" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "jpJJAbWEm94h" - }, - "outputs": [], - "source": [ - "if not os.path.isdir(output_path):\n", - " os.makedirs(output_path)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "y4LcrTLOm_4B" - }, - "outputs": [], - "source": [ - "output_file = os.path.join(output_path, '{}_regression_{}_{}.csv'.format(MODEL, LOSS, VERSION))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "4WOF7a-dnENp" - }, - "outputs": [], - "source": [ - "df_metrics.to_csv(output_file, index=False)" - ] - } - ], - "metadata": { - "colab": { - "collapsed_sections": [], - "last_runtime": { - "build_target": "", - "kind": "local" - }, - "name": "regression.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "name": "python3" - } - }, - "nbformat": 4, - "nbformat_minor": 0 + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gE76T8J7IsGC" + }, + "outputs": [], + "source": [ + "#@title Copyright 2019 The Lifetime Value Authors.\n", + "# Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "# you may not use this file except in compliance with the License.\n", + "# You may obtain a copy of the License at\n", + "#\n", + "# https://www.apache.org/licenses/LICENSE-2.0\n", + "#\n", + "# Unless required by applicable law or agreed to in writing, software\n", + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "# See the License for the specific language governing permissions and\n", + "# limitations under the License.\n", + "# ============================================================================" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sswTFWDv7HZd" + }, + "source": [ + "# KDD Cup 98 LTV Prediction" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "PSr1mSJP7O1J" + }, + "source": [ + "\n", + " \n", + " \n", + "
\n", + " Run in Google Colab\n", + " \n", + " View source on GitHub\n", + "
" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "pBXE3Dz3NI4A" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "import pandas as pd\n", + "from scipy import stats\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import tensorflow as tf\n", + "import tensorflow_probability as tfp\n", + "from typing import Sequence\n", + "\n", + "# install and import ltv\n", + "# !pip install -q git+https://github.com/google/lifetime_value\n", + "!pip install -q git+https://github.com/seyedrezamirkhani/lifetime_value\n", + "import lifetime_value as ltv" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Bq0Ah16lBmgV" + }, + "outputs": [], + "source": [ + "tfd = tfp.distributions\n", + "%config InlineBackend.figure_format='retina'\n", + "sns.set_style('whitegrid')" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "2qN319qZK3IG" + }, + "source": [ + "## Configs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "hNy_ybw_K19n" + }, + "outputs": [], + "source": [ + "MODEL = 'dnn'\n", + "# LOSS = 'ziln' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n", + "LOSS = 'mse' # @param { isTemplate: true, type: 'string'} ['mse', 'ziln']\n", + "LEARNING_RATE = 0.001 # @param { isTemplate: true}\n", + "VERSION = 0 # @param { isTemplate: true, type: 'integer'}\n", + "DATA_FOLDER = './tmp/kdd_cup_98/' # @param { isTemplate: true, type: 'string'}\n", + "OUTPUT_CSV_FOLDER = f'{DATA_FOLDER}result' # @param { isTemplate: true, type: 'string'}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mDSR921CCEcL" + }, + "source": [ + "## Load data" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "lHxp4rOGI02Q" + }, + "source": [ + "Download kdd_cup_98 data to /tmp/lifetime-value/kdd_cup_98" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Set it DATA_FOLDER as an environment variable\n", + "%env DATA_FOLDER=$DATA_FOLDER" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Dg3qtgJyJpdi" + }, + "outputs": [], + "source": [ + "%%bash\n", + "mkdir -p $DATA_FOLDER\n", + "wget -N https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98lrn.zip -P $DATA_FOLDER\n", + "wget -N https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/cup98val.zip -P $DATA_FOLDER\n", + "wget -N https://kdd.ics.uci.edu/databases/kddcup98/epsilon_mirror/valtargt.txt -P $DATA_FOLDER\n", + "cd $DATA_FOLDER\n", + "unzip -n cup98lrn.zip\n", + "unzip -n cup98val.zip" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "a_LnLmQQRlYF" + }, + "outputs": [], + "source": [ + "df_train = pd.read_csv(f'{DATA_FOLDER}cup98LRN.txt')\n", + "num_train = df_train.shape[0]\n", + "df_eval = pd.read_csv(f'{DATA_FOLDER}cup98VAL.txt')\n", + "df_eval_target = pd.read_csv(f'{DATA_FOLDER}valtargt.txt')\n", + "df_eval = df_eval.merge(df_eval_target, on='CONTROLN')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ggQmy9wiP5M6" + }, + "outputs": [], + "source": [ + "df = pd.concat([df_train, df_eval], axis=0, sort=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "0rgxHpIyjaMH" + }, + "source": [ + "## Label distribution" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Xmpu_d3YjcFC" + }, + "outputs": [], + 
"source": [ + "y = df['TARGET_D'][:num_train]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yMr2EDRyK5Sb" + }, + "outputs": [], + "source": [ + "def plot_hist_log_scale(y):\n", + " max_val = y.max() + 1. \n", + " ax = pd.Series(y).hist(\n", + " figsize=(8, 5), bins = 10 ** np.linspace(0., np.log10(max_val), 20))\n", + "\n", + " plt.xlabel('Donation ($)')\n", + " plt.ylabel('Count')\n", + " # plt.title('Histogram of LTV')\n", + " plt.xticks(rotation='horizontal')\n", + " plt.legend(loc='upper left')\n", + " ax.set_xscale('log')\n", + " ax.grid(False)\n", + " # Hide the right and top spines\n", + " ax.spines['right'].set_visible(False)\n", + " ax.spines['top'].set_visible(False)\n", + " # Only show ticks on the left and bottom spines\n", + " ax.yaxis.set_ticks_position('left')\n", + " ax.xaxis.set_ticks_position('bottom')\n", + " plt.show()\n", + "\n", + " fig = ax.get_figure()\n", + " output_file = tf.io.gfile.GFile(\n", + " f'{DATA_FOLDER}histogram_kdd98_log_scale.pdf',\n", + " 'wb')\n", + " fig.savefig(output_file, bbox_inches='tight', format='pdf')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "KbwCzGkBOWhH" + }, + "outputs": [], + "source": [ + "plot_hist_log_scale(y[y>0])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "1XXMLbnlCdlN" + }, + "source": [ + "## Preprocess features" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "L1sBf_RSU3pR" + }, + "source": [ + "### Vocab" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xB_ddsd_U_4e" + }, + "outputs": [], + "source": [ + "VOCAB_FEATURES = [\n", + " 'ODATEDW', # date of donor's first gift (YYMM)\n", + " 'OSOURCE', # donor acquisition mailing list\n", + " 'TCODE', # donor title code\n", + " 'STATE',\n", + " 'ZIP',\n", + " 'DOMAIN', # urbanicity level and socio-economic status of the neighborhood\n", + " 'CLUSTER', # socio-economic status\n", + " 'GENDER',\n", + " 'MAXADATE', # date of the most recent promotion received\n", + " 'MINRDATE',\n", + " 'LASTDATE',\n", + " 'FISTDATE',\n", + " 'RFA_2A',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "f2oPZGVLRSPe" + }, + "outputs": [], + "source": [ + "df['ODATEDW'] = df['ODATEDW'].astype('str')\n", + "df['TCODE'] = df['TCODE'].apply(\n", + " lambda x: '{:03d}'.format(x // 1000 if x > 1000 else x))\n", + "df['ZIP'] = df['ZIP'].str.slice(0, 5)\n", + "df['MAXADATE'] = df['MAXADATE'].astype('str')\n", + "df['MINRDATE'] = df['MINRDATE'].astype('str')\n", + "df['LASTDATE'] = df['LASTDATE'].astype('str')\n", + "df['FISTDATE'] = df['FISTDATE'].astype('str')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "isL9Ofv9JLAP" + }, + "outputs": [], + "source": [ + "def label_encoding(y, frequency_threshold=100):\n", + " #value_counts = pd.value_counts(y) # raises FutureWarning: pandas.value_counts is deprecated and will be removed in a future version. 
Use pd.Series(obj).value_counts() instead.\n", + " value_counts = pd.Series(y).value_counts()\n", + " categories = value_counts[\n", + " value_counts >= frequency_threshold].index.to_numpy()\n", + " # 0 indicates the unknown category.\n", + " return pd.Categorical(y, categories=categories).codes + 1" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "BgXGO5D0OdJP" + }, + "outputs": [], + "source": [ + "for key in VOCAB_FEATURES:\n", + " df[key] = label_encoding(df[key])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kZkmnJ93Zrjw" + }, + "source": [ + "### Indicator" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tGBpMfaGhCD0" + }, + "outputs": [], + "source": [ + "MAIL_ORDER_RESPONSES = [\n", + " 'MBCRAFT',\n", + " 'MBGARDEN',\n", + " 'MBBOOKS',\n", + " 'MBCOLECT',\n", + " 'MAGFAML',\n", + " 'MAGFEM',\n", + " 'MAGMALE',\n", + " 'PUBGARDN',\n", + " 'PUBCULIN',\n", + " 'PUBHLTH',\n", + " 'PUBDOITY',\n", + " 'PUBNEWFN',\n", + " 'PUBPHOTO',\n", + " 'PUBOPP',\n", + " 'RFA_2F',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4V-DeOZFZhjB" + }, + "outputs": [], + "source": [ + "INDICATOR_FEATURES = [\n", + " 'AGE', # age decile, 0 indicates unknown\n", + " 'NUMCHLD',\n", + " 'INCOME',\n", + " 'WEALTH1',\n", + " 'HIT',\n", + "] + MAIL_ORDER_RESPONSES" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "U9y5qA1vZ0kz" + }, + "outputs": [], + "source": [ + "df['AGE'] = pd.qcut(df['AGE'].values, 10).codes + 1\n", + "df['NUMCHLD'] = df['NUMCHLD'].apply(lambda x: 0 if np.isnan(x) else int(x))\n", + "df['INCOME'] = df['INCOME'].apply(lambda x: 0 if np.isnan(x) else int(x))\n", + "df['WEALTH1'] = df['WEALTH1'].apply(lambda x: 0 if np.isnan(x) else int(x) + 1)\n", + "df['HIT'] = pd.qcut(df['HIT'].values, q=50, duplicates='drop').codes\n", + "\n", + "for col in MAIL_ORDER_RESPONSES:\n", + " df[col] = pd.qcut(df[col].values, q=20, duplicates='drop').codes + 1" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8DOO_2a-U6gr" + }, + "source": [ + "### Numeric" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "rqVteSLDiLVr" + }, + "outputs": [], + "source": [ + "NUMERIC_FEATURES = [\n", + " # binary\n", + " 'MAILCODE', # bad address\n", + " 'NOEXCH', # do not exchange\n", + " 'RECINHSE', # donor has given to PVA's in house program\n", + " 'RECP3', # donor has given to PVA's P3 program\n", + " 'RECPGVG', # planned giving record\n", + " 'RECSWEEP', # sweepstakes record\n", + " 'HOMEOWNR', # home owner\n", + " 'CHILD03',\n", + " 'CHILD07',\n", + " 'CHILD12',\n", + " 'CHILD18',\n", + "\n", + " # continuous\n", + " 'CARDPROM',\n", + " 'NUMPROM',\n", + " 'CARDPM12',\n", + " 'NUMPRM12',\n", + " 'RAMNTALL',\n", + " 'NGIFTALL',\n", + " 'MINRAMNT',\n", + " 'MAXRAMNT',\n", + " 'LASTGIFT',\n", + " 'AVGGIFT',\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "xMRP05Ztic0A" + }, + "outputs": [], + "source": [ + "df['MAILCODE'] = (df['MAILCODE'] == 'B').astype('float32')\n", + "df['PVASTATE'] = df['PVASTATE'].isin(['P', 'E']).astype('float32')\n", + "df['NOEXCH'] = df['NOEXCH'].isin(['X', '1']).astype('float32')\n", + "df['RECINHSE'] = (df['RECINHSE'] == 'X').astype('float32')\n", + "df['RECP3'] = (df['RECP3'] == 'X').astype('float32')\n", + "df['RECPGVG'] = (df['RECPGVG'] == 'X').astype('float32')\n", + "df['RECSWEEP'] = (df['RECSWEEP'] == 
'X').astype('float32')\n", + "df['HOMEOWNR'] = (df['HOMEOWNR'] == 'H').astype('float32')\n", + "df['CHILD03'] = df['CHILD03'].isin(['M', 'F', 'B']).astype('float32')\n", + "df['CHILD07'] = df['CHILD07'].isin(['M', 'F', 'B']).astype('float32')\n", + "df['CHILD12'] = df['CHILD12'].isin(['M', 'F', 'B']).astype('float32')\n", + "df['CHILD18'] = df['CHILD18'].isin(['M', 'F', 'B']).astype('float32')\n", + "\n", + "df['CARDPROM'] = df['CARDPROM'] / 100\n", + "df['NUMPROM'] = df['NUMPROM'] / 100\n", + "df['CARDPM12'] = df['CARDPM12'] / 100\n", + "df['NUMPRM12'] = df['NUMPRM12'] / 100\n", + "df['RAMNTALL'] = np.log1p(df['RAMNTALL'])\n", + "df['NGIFTALL'] = np.log1p(df['NGIFTALL'])\n", + "df['MINRAMNT'] = np.log1p(df['MINRAMNT'])\n", + "df['MAXRAMNT'] = np.log1p(df['MAXRAMNT'])\n", + "df['LASTGIFT'] = np.log1p(df['LASTGIFT'])\n", + "df['AVGGIFT'] = np.log1p(df['AVGGIFT'])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "GoLg1PvWuCT_" + }, + "source": [ + "### All" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "lSnNgjBCuJdb" + }, + "outputs": [], + "source": [ + "CATEGORICAL_FEATURES = VOCAB_FEATURES + INDICATOR_FEATURES\n", + "ALL_FEATURES = CATEGORICAL_FEATURES + NUMERIC_FEATURES" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8HJBvvCxRPg3" + }, + "source": [ + "## Train/eval split" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "N7BXLB1eHovl" + }, + "outputs": [], + "source": [ + "def dnn_split(df):\n", + " df_train = df.iloc[:num_train]\n", + " df_eval = df.iloc[num_train:]\n", + "\n", + " def feature_dict(df):\n", + " features = dict()\n", + " \n", + " for k, v in dict(df[CATEGORICAL_FEATURES]).items():\n", + " features[k] = v.values \n", + "\n", + " features['numeric'] = df[NUMERIC_FEATURES].astype('float32').values \n", + "\n", + " return features\n", + "\n", + " x_train, y_train = feature_dict(df_train), df_train['TARGET_D'].astype(\n", + " 'float32').values\n", + " x_eval, y_eval = feature_dict(df_eval), df_eval['TARGET_D'].astype(\n", + " 'float32').values\n", + "\n", + " return x_train, x_eval, y_train, y_eval" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4yw6fekBtX7X" + }, + "source": [ + "## Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_rIuO0XYtZH2" + }, + "outputs": [], + "source": [ + "def embedding_dim(x):\n", + " return int(x**.25) + 1\n", + "\n", + "\n", + "def embedding_layer(vocab_size):\n", + " return tf.keras.Sequential([\n", + " tf.keras.layers.Embedding(\n", + " input_dim=vocab_size,\n", + " output_dim=embedding_dim(vocab_size)),\n", + " tf.keras.layers.Flatten(),\n", + " ])\n", + "\n", + "# NOTE: The call to the fit method fails if the numeric, multi-dimensional, feature is not the last parameter\n", + "def dnn_model(output_units):\n", + " numeric_input = tf.keras.layers.Input(\n", + " shape=(len(NUMERIC_FEATURES),), name='numeric')\n", + " numeric_inputs = [numeric_input] \n", + "\n", + " embedding_inputs = [\n", + " tf.keras.layers.Input(shape=(1,), name=key, dtype=np.int64)\n", + " for key in CATEGORICAL_FEATURES\n", + " ]\n", + "\n", + " embedding_outputs = [\n", + " embedding_layer(vocab_size=df[key].max() + 1)(input)\n", + " for key, input in zip(CATEGORICAL_FEATURES, embedding_inputs)\n", + " ]\n", + "\n", + " deep_input = tf.keras.layers.concatenate(embedding_outputs + numeric_inputs)\n", + " \n", + " deep_model = tf.keras.Sequential([\n", + " tf.keras.layers.Dense(128, 
activation='relu'),\n", + " tf.keras.layers.Dense(128, activation='relu'),\n", + " tf.keras.layers.Dense(64, activation='relu'),\n", + " tf.keras.layers.Dense(64, activation='relu'),\n", + " tf.keras.layers.Dense(units=output_units),\n", + " ])\n", + " return tf.keras.Model(\n", + " inputs=embedding_inputs + numeric_inputs, outputs=deep_model(deep_input))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "G5h7X6botcHl" + }, + "source": [ + "## Loss" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iJ9gpkC6tgP0" + }, + "outputs": [], + "source": [ + "if LOSS == 'mse':\n", + " loss = tf.keras.losses.MeanSquaredError()\n", + " output_units = 1\n", + "\n", + "if LOSS == 'ziln':\n", + " loss = ltv.zero_inflated_lognormal_loss\n", + " output_units = 3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "_afFfIritjCM" + }, + "outputs": [], + "source": [ + "x_train, x_eval, y_train, y_eval = dnn_split(df)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "model = dnn_model(output_units)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Qj3kI7pyVwzO" + }, + "outputs": [], + "source": [ + "model.compile(optimizer=tf.keras.optimizers.Nadam(learning_rate=LEARNING_RATE), loss=loss)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KZSYxgWdwiXC" + }, + "source": [ + "## Train" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Nwj9h5ysQDLp" + }, + "outputs": [], + "source": [ + "callbacks = [\n", + " tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', min_lr=1e-6),\n", + " tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=10),\n", + "]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Vb5Tnld6hsfx" + }, + "outputs": [], + "source": [ + "%%time\n", + "# y array needs to have a two dimensional shape to work with ziln loss function\n", + "# so we use [:, np.newaxis] to make the data two-dimensional for the fit function call\n", + "history = model.fit(\n", + " x=x_train,\n", + " y=y_train[:, np.newaxis],\n", + " batch_size=2048,\n", + " epochs=200,\n", + " verbose=2,\n", + " callbacks=callbacks,\n", + " validation_data=(x_eval, y_eval[:, np.newaxis])).history" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "J1sLSUdgvfa6" + }, + "outputs": [], + "source": [ + "pd.DataFrame(history)[['loss', 'val_loss']].plot();" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "jRKuZBqhvhT9" + }, + "source": [ + "## Eval" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "q9_zNMd3vjNk" + }, + "outputs": [], + "source": [ + "if LOSS == 'mse':\n", + " y_pred = model.predict(x=x_eval, batch_size=1024).flatten()\n", + "\n", + "if LOSS == 'ziln':\n", + " logits = model.predict(x=x_eval, batch_size=1024)\n", + " y_pred = ltv.zero_inflated_lognormal_pred(logits).numpy().flatten()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "SkfkUMUvUu_E" + }, + "source": [ + "### Total Profit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "AwfWAp8WQuns" + }, + "outputs": [], + "source": [ + "unit_costs = [0.4, 0.5, 0.6, 0.68, 0.7, 0.8, 0.9, 1.0]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "zqi91dfCUxpx" + }, + "outputs": [], + "source": [ + "num_mailed = 
[np.sum(y_pred > v) for v in unit_costs]\n", + "num_mailed" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZgFjZUcuhScv" + }, + "outputs": [], + "source": [ + "baseline_total_profit = np.sum(y_eval - 0.68)\n", + "baseline_total_profit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "VwsFnin5U-R9" + }, + "outputs": [], + "source": [ + "total_profits = [np.sum(y_eval[y_pred > v] - v) for v in unit_costs]\n", + "total_profits" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "zROhsEWxnA5u" + }, + "source": [ + "### Gini Coefficient" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "gRsJ7y-632h_" + }, + "outputs": [], + "source": [ + "gain = pd.DataFrame({\n", + " 'lorenz': ltv.cumulative_true(y_eval, y_eval),\n", + " 'baseline': ltv.cumulative_true(y_eval, x_eval['numeric'][:, 19]),\n", + " 'model': ltv.cumulative_true(y_eval, y_pred),\n", + "})" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "yg-ndbve4AL_" + }, + "outputs": [], + "source": [ + "num_customers = np.float32(gain.shape[0])\n", + "gain['cumulative_customer'] = (np.arange(num_customers) + 1.) / num_customers" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WEoAvuCj4OVy" + }, + "outputs": [], + "source": [ + "ax = gain[[\n", + " 'cumulative_customer',\n", + " 'lorenz',\n", + " 'baseline',\n", + " 'model',\n", + "]].plot(\n", + " x='cumulative_customer', figsize=(8, 5), legend=True)\n", + "\n", + "ax.legend(['Groundtruth', 'Baseline', 'Model'], loc='lower right')\n", + "\n", + "ax.set_xlabel('Cumulative Fraction of Customers')\n", + "ax.set_xticks(np.arange(0, 1.1, 0.1))\n", + "ax.set_xlim((0, 1.))\n", + "\n", + "ax.set_ylabel('Cumulative Fraction of Total Lifetime Value')\n", + "ax.set_yticks(np.arange(0, 1.1, 0.1))\n", + "ax.set_ylim((0, 1.05))\n", + "ax.set_title('Gain Chart');" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "kzPqaiNO4iWC" + }, + "outputs": [], + "source": [ + "gini = ltv.gini_from_gain(gain[['lorenz', 'baseline', 'model']])\n", + "gini" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "S84RitIa9PBu" + }, + "source": [ + "### Calibration" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "X7sKbsEf6RvF" + }, + "outputs": [], + "source": [ + "df_decile = ltv.decile_stats(y_eval, y_pred)\n", + "df_decile" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "DHdLqUqdL4hf" + }, + "outputs": [], + "source": [ + "ax = df_decile[['label_mean', 'pred_mean']].plot.bar(rot=0)\n", + "\n", + "ax.set_title('Decile Chart')\n", + "ax.set_xlabel('Prediction bucket')\n", + "ax.set_ylabel('Average bucket value')\n", + "ax.legend(['Label', 'Prediction'], loc='upper left');" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "nK6DQ89xU-d4" + }, + "source": [ + "### Rank Correlation" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "I9qWGyY3WePz" + }, + "outputs": [], + "source": [ + "def spearmanr(\n", + " x1: Sequence[float],\n", + " x2: Sequence[float]) -> float:\n", + " \"\"\"Calculates spearmanr rank correlation coefficient.\n", + "\n", + " See https://docs.scipy.org/doc/scipy/reference/stats.html.\n", + "\n", + " Args:\n", + " x1: 1D array_like.\n", + " x2: 1D array_like.\n", + "\n", + " Returns:\n", + " correlation: float.\n", + " \"\"\"\n", + 
" return stats.spearmanr(x1, x2, nan_policy='raise')[0]\n", + "\n", + "\n", + "spearman_corr = spearmanr(y_eval, y_pred)\n", + "spearman_corr" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-i_AbqhXcurk" + }, + "source": [ + "### All metrics together" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Umqg1-0Bc1HS" + }, + "outputs": [], + "source": [ + "df_metrics = pd.DataFrame({\n", + " 'model': MODEL,\n", + " 'loss_function': LOSS,\n", + " 'train_loss': history['loss'][-1],\n", + " 'eval_loss': history['val_loss'][-1],\n", + " 'label_positive': np.mean(y_eval > 0),\n", + " 'label_mean': y_eval.mean(),\n", + " 'pred_mean': y_pred.mean(),\n", + " 'decile_mape': df_decile['decile_mape'].mean(),\n", + " 'baseline_gini': gini['normalized'][1],\n", + " 'gini': gini['normalized'][2],\n", + " 'spearman_corr': spearman_corr,\n", + "}, index=[VERSION])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "C_cM2Mc2SB3W" + }, + "outputs": [], + "source": [ + "for unit_cost, total_profit in zip(unit_costs, total_profits):\n", + " df_metrics['total_profit_{:02d}'.format(int(unit_cost * 100))] = total_profit" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "iyMvsOtbRrXZ" + }, + "outputs": [], + "source": [ + "df_metrics.T" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "8uHtLKk1x0IE" + }, + "source": [ + "## Save" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "L-fMkqWIm6X6" + }, + "outputs": [], + "source": [ + "output_path = OUTPUT_CSV_FOLDER" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "jpJJAbWEm94h" + }, + "outputs": [], + "source": [ + "if not os.path.isdir(output_path):\n", + " os.makedirs(output_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "y4LcrTLOm_4B" + }, + "outputs": [], + "source": [ + "output_file = os.path.join(output_path, '{}_regression_{}_{}.csv'.format(MODEL, LOSS, VERSION))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "4WOF7a-dnENp" + }, + "outputs": [], + "source": [ + "df_metrics.to_csv(output_file, index=False)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df_metrics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "colab": { + "collapsed_sections": [], + "last_runtime": { + "build_target": "", + "kind": "local" + }, + "name": "regression.ipynb", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 4 } diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ec4905e --- /dev/null +++ b/requirements.txt @@ -0,0 +1,159 @@ +absl-py==2.1.0 +anyio==4.6.2.post1 +argon2-cffi==23.1.0 +argon2-cffi-bindings==21.2.0 +arrow==1.3.0 +asttokens==2.4.1 +astunparse==1.6.3 +async-lru==2.0.4 +attrs==24.2.0 +babel==2.16.0 +beautifulsoup4==4.12.3 +bleach==6.2.0 +certifi==2024.8.30 +cffi==1.17.1 +charset-normalizer==3.4.0 
+cloudpickle==3.1.0 +comm==0.2.2 +contourpy==1.3.1 +cycler==0.12.1 +debugpy==1.8.8 +decorator==5.1.1 +defusedxml==0.7.1 +dm-tree==0.1.8 +executing==2.1.0 +fastjsonschema==2.20.0 +flatbuffers==24.3.25 +fonttools==4.55.0 +fqdn==1.5.1 +gast==0.6.0 +google-pasta==0.2.0 +grpcio==1.68.0 +h11==0.14.0 +h5py==3.12.1 +httpcore==1.0.7 +httpx==0.27.2 +idna==3.10 +ipykernel==6.29.5 +ipython==8.29.0 +ipywidgets==8.1.5 +isoduration==20.11.0 +jedi==0.19.2 +Jinja2==3.1.4 +joblib==1.4.2 +json5==0.9.28 +jsonpointer==3.0.0 +jsonschema==4.23.0 +jsonschema-specifications==2024.10.1 +jupyter==1.1.1 +jupyter-console==6.6.3 +jupyter-events==0.10.0 +jupyter-lsp==2.2.5 +jupyter_client==8.6.3 +jupyter_core==5.7.2 +jupyter_server==2.14.2 +jupyter_server_terminals==0.5.3 +jupyterlab==4.2.6 +jupyterlab_pygments==0.3.0 +jupyterlab_server==2.27.3 +jupyterlab_widgets==3.0.13 +kaggle==1.6.17 +keras==3.6.0 +kiwisolver==1.4.7 +libclang==18.1.1 +lifetime_value @ git+https://github.com/seyedrezamirkhani/lifetime_value@5a4feaa2e64856d4b5e8ae355d61d330b653e1c7 +Markdown==3.7 +markdown-it-py==3.0.0 +MarkupSafe==3.0.2 +matplotlib==3.9.2 +matplotlib-inline==0.1.7 +mdurl==0.1.2 +mistune==3.0.2 +ml-dtypes==0.4.1 +namex==0.0.8 +nbclient==0.10.0 +nbconvert==7.16.4 +nbformat==5.10.4 +nest-asyncio==1.6.0 +notebook==7.2.2 +notebook_shim==0.2.4 +numpy==2.0.2 +nvidia-cublas-cu12==12.5.3.2 +nvidia-cuda-cupti-cu12==12.5.82 +nvidia-cuda-nvcc-cu12==12.5.82 +nvidia-cuda-nvrtc-cu12==12.5.82 +nvidia-cuda-runtime-cu12==12.5.82 +nvidia-cudnn-cu12==9.3.0.75 +nvidia-cufft-cu12==11.2.3.61 +nvidia-curand-cu12==10.3.6.82 +nvidia-cusolver-cu12==11.6.3.83 +nvidia-cusparse-cu12==12.5.1.3 +nvidia-nccl-cu12==2.21.5 +nvidia-nvjitlink-cu12==12.5.82 +opt_einsum==3.4.0 +optree==0.13.1 +overrides==7.7.0 +packaging==24.2 +pandas==2.2.3 +pandocfilters==1.5.1 +parso==0.8.4 +pexpect==4.9.0 +pillow==11.0.0 +platformdirs==4.3.6 +prometheus_client==0.21.0 +prompt_toolkit==3.0.48 +protobuf==5.28.3 +psutil==6.1.0 +ptyprocess==0.7.0 +pure_eval==0.2.3 +pycparser==2.22 +pydot==3.0.2 +Pygments==2.18.0 +pyparsing==3.2.0 +python-dateutil==2.9.0.post0 +python-json-logger==2.0.7 +python-slugify==8.0.4 +pytz==2024.2 +PyYAML==6.0.2 +pyzmq==26.2.0 +referencing==0.35.1 +requests==2.32.3 +rfc3339-validator==0.1.4 +rfc3986-validator==0.1.1 +rich==13.9.4 +rpds-py==0.21.0 +scikit-learn==1.5.2 +scipy==1.14.1 +seaborn==0.13.2 +Send2Trash==1.8.3 +setuptools==75.1.0 +six==1.16.0 +sniffio==1.3.1 +soupsieve==2.6 +stack-data==0.6.3 +tensorboard==2.18.0 +tensorboard-data-server==0.7.2 +tensorflow==2.18.0 +tensorflow-probability==0.25.0 +termcolor==2.5.0 +terminado==0.18.1 +text-unidecode==1.3 +tf_keras==2.18.0 +threadpoolctl==3.5.0 +tinycss2==1.4.0 +tornado==6.4.1 +tqdm==4.67.0 +traitlets==5.14.3 +types-python-dateutil==2.9.0.20241003 +typing_extensions==4.12.2 +tzdata==2024.2 +uri-template==1.3.0 +urllib3==2.2.3 +wcwidth==0.2.13 +webcolors==24.11.1 +webencodings==0.5.1 +websocket-client==1.8.0 +Werkzeug==3.1.3 +wheel==0.44.0 +widgetsnbextension==4.0.13 +wrapt==1.16.0 diff --git a/setup.py b/setup.py index 6d3da28..903ce81 100644 --- a/setup.py +++ b/setup.py @@ -29,7 +29,7 @@ install_requires=[ 'numpy >= 1.11.1', 'pandas', - 'sklearn', + 'scikit-learn', 'tensorflow', 'tensorflow-probability', 'tqdm',
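With requirements.txt and setup.py aligned on the TensorFlow 2.18 stack, a short smoke test can confirm the environment before running either notebook. A minimal sketch, assuming the packages above are installed; `zero_inflated_lognormal_loss` is called with the Keras-style `(y_true, y_pred)` ordering used in the notebooks' `compile` calls:

```
import numpy as np
import tensorflow as tf
import lifetime_value as ltv

print(tf.__version__)                          # expected: 2.18.0
print(tf.config.list_physical_devices('GPU'))  # empty list on CPU-only machines

# The ziln loss expects 2-D targets and three logits per example,
# which is why the notebooks reshape y with [:, np.newaxis].
y_true = np.array([[0.0], [12.5]], dtype='float32')
logits = tf.zeros([2, 3])
print(ltv.zero_inflated_lognormal_loss(y_true, logits))
```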