Merge pull request #120 from VikramsDataScience/churn-propensity-model
Restructured codebase to begin development of a local pipeline
VikramsDataScience authored Aug 23, 2024
2 parents 0e86759 + a5568d6 commit aef8424
Showing 17 changed files with 192 additions and 184 deletions.
11 changes: 11 additions & 0 deletions ECommerce_Churn_Propensity_Model/__init__.py
@@ -0,0 +1,11 @@
from .src.config import Config
from pathlib import Path

# Model version
__version__ = '0.01'

# Initialise global variables from config module
config = Config()
data_path = Path(config.data_path)
churn_app_path = Path(config.churn_app_models)
seed = config.seed
6 changes: 6 additions & 0 deletions ECommerce_Churn_Propensity_Model/main.py
@@ -0,0 +1,6 @@
import argparse

parser = argparse.ArgumentParser(description='Churn propensity pipeline')
parser.add_argument('--module', type=str, choices=['eda', 'preprocessing', 'model_training'])
args = parser.parse_args()
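The parsed --module flag isn't consumed anywhere in this hunk. A minimal sketch of how it could dispatch to the pipeline stages (the importlib call and the run() entry point are assumptions for illustration, not code from this commit):

import argparse
import importlib

parser = argparse.ArgumentParser(description='Churn propensity pipeline')
parser.add_argument('--module', type=str, choices=['eda', 'preprocessing', 'model_training'])
args = parser.parse_args()

# Hypothetical dispatch: import the selected stage's sub-package and call an
# assumed run() entry point (not defined in this commit)
if args.module is not None:
    stage = importlib.import_module(f'ECommerce_Churn_Propensity_Model.src.{args.module}')
    stage.run()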

171 changes: 0 additions & 171 deletions ECommerce_Churn_Propensity_Model/src/__init__.py

This file was deleted.

@@ -3,8 +3,8 @@
from ydata_profiling import ProfileReport
from phik import phik_matrix, significance_matrix

# Load variables from __init__.py
from . import read_impute_data, doanes_formula, Config
# Load variables from __init__.py
from . import Config, read_impute_data, doanes_formula

# Load the file paths and global variables from the Config file
config = Config()
@@ -24,7 +24,6 @@
df[float_columns] = df[float_columns].astype(int)
print('\nRECASTED DATA FRAME WITHOUT NaN VALUES:\n', df)

########## Phi K Correlation calculation and report generation ##########
# Apply Doane's Formula to calculate and store bin sizes for the skewed data in a Dictionary structure as preparation for Phi K Correlation
for col in skewed_interval_columns:
skewed_bin_len = doanes_formula(df[col], nan_count=0)
@@ -35,6 +34,7 @@

print('RESULTS OF DOANE\'S CALCULATION OF BIN LENGTHS FOR DECLARED INTERVAL VARIABLES:\n', interval_bins)

########## GENERATE PHI_K CORRELATION MATRIX ##########
# If the following Matrices don't exist, generate and store them as CSVs
if not exists(Path(data_path) / 'phi_k_matrix.csv') or not exists(Path(data_path) / 'significance_matrix.csv'):

@@ -44,11 +44,11 @@
noise_correction=True).to_csv(Path(data_path) / 'phi_k_matrix.csv')

# Please note that calculating a Significance Matrix can be a little slow!
significance_matrix(df,
bins=interval_bins,
interval_cols=interval_bins,
significance_method='hybrid' # Hybrid method between calculating G-Test Statistic (asymptotic) and Monte Carlo simulations is default and recommended by the authors
).to_csv(Path(data_path) / 'significance_matrix.csv')
# significance_matrix(df,
# bins=interval_bins,
# interval_cols=interval_bins,
# significance_method='hybrid' # Hybrid method between calculating G-Test Statistic (asymptotic) and Monte Carlo simulations is default and recommended by the authors
# ).to_csv(Path(data_path) / 'significance_matrix.csv')

########## Y-Data Profiling ##########
# If the EDA profiling report doesn't exist, generate report as an HTML document
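As a side note, once phi_k_matrix.csv has been written, the feature-to-target correlations can be ranked with a couple of lines of pandas. A small sketch (not part of this commit) that reuses data_path from the Config above and assumes the matrix contains a 'Churn' column:

import pandas as pd
from pathlib import Path

# Load the saved Phi K matrix and rank features by their correlation with 'Churn'
phik_df = pd.read_csv(Path(data_path) / 'phi_k_matrix.csv', index_col=0)
print(phik_df['Churn'].sort_values(ascending=False))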
72 changes: 72 additions & 0 deletions ECommerce_Churn_Propensity_Model/src/eda/__init__.py
@@ -0,0 +1,72 @@
from ..config import Config
from missforest.missforest import MissForest
from scipy.stats import skew
import numpy as np
import pandas as pd
import os
import sys
import contextlib

# Initialise global variables from config module
config = Config()

#################### ADDITIONAL REQUIRED FUNCTIONS ####################
def doanes_formula(data, nan_count) -> int:
"""
To prepare for the correct binning (FOR SKEWED DATA COLUMNS ONLY) of intervals prior to calculating the
Phi K Correlation, I've opted to use Doane's Formula to determine the bin sizes of the intervals.
Since I couldn't find a Python library for the formula, I've written this implementation of Doane's Formula.
Please refer to the 'README.md' file (in the EDA folder) for a mathematical explanation of the formula
and the data justifications behind selecting Doane's Formula for calculating bin sizes.
This function returns the bin length as a truncated integer (not rounded!). I elected for numeric truncation
over rounding because rounding up with np.ceil() introduced substantial errors (for instance, 18.1 would be
rounded up to 19), so I've opted to truncate and cast to an integer rather than carry those rounding
errors into the calculation of the interval's bin length.
N.B.: Since Doane's Formula relies on the number of observations in the DF, if there are NaNs in the input DF,
please calculate the number of NaNs and deduct that value from 'n' (i.e. use the 'nan_count' arg of this function).
If there are no NaNs, please set 'nan_count=0'.
"""
n = len(data) - nan_count
g1 = skew(data) # Sample skewness. 'nan_count' is only deducted from 'n' above, not from the skewness
sigma_g1 = np.sqrt((6*(n - 2)) / ((n + 1)*(n + 3)))
k = 1 + np.log2(n) + np.log2(1 + abs(g1) / sigma_g1)
return int(np.trunc(k))

def read_impute_data(df_path, float_cols, categorical_cols, output_path, sheet_name=None) -> pd.DataFrame:
"""
Read in an Excel/CSV file, define columns for casting & interval definitions, and perform imputation
with MissForest.
IMPORTANT NOTE: When passing the 'df_path' arg, only pass a 'Path' object from the 'pathlib' library.
- 'sheet_name' (OPTIONAL): Only required when reading Excel files to indicate which Excel sheet to read
into the DataFrame.
"""
missforest_imputer = MissForest()
if df_path.suffix == '.xlsx':
df = pd.read_excel(df_path, sheet_name)
elif df_path.suffix == '.csv':
df = pd.read_csv(df_path)
else:
raise ValueError(f'Unsupported file type: {df_path.suffix}. Only .xlsx and .csv are supported.')

# Cast float_columns as integers, impute NaN values using MissForest
with suppress_stdout():
df = missforest_imputer.fit_transform(x=df,
categorical=categorical_cols)
df[float_cols] = df[float_cols].astype(int)

# Save to storage for downstream PreProcessing module
df.to_csv(output_path)
return df

@contextlib.contextmanager
def suppress_stdout():
"""
For any library that contains (undesirably) verbose output, use this boilerplate function to suppress
that output in the CLI.
"""
with open(os.devnull, 'w') as devnull:
old_stdout = sys.stdout
sys.stdout = devnull
try:
yield
finally:
sys.stdout = old_stdout
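For reference, the bin count computed by doanes_formula() above is Doane's Formula:

k = 1 + \log_2(n) + \log_2\!\left(1 + \frac{|g_1|}{\sigma_{g_1}}\right),
\qquad
\sigma_{g_1} = \sqrt{\frac{6\,(n - 2)}{(n + 1)\,(n + 3)}}

where n is the number of non-NaN observations and g_1 is the sample skewness; the code truncates k to an integer rather than rounding it.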
59 changes: 59 additions & 0 deletions ECommerce_Churn_Propensity_Model/src/model_training/__init__.py
@@ -0,0 +1,59 @@
from ..config import Config
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from pathlib import Path

# Initialise global variables from config module
config = Config()
data_path = config.data_path
seed = config.seed

insample_scores = pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1-Score'])
outofsample_scores = pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1-Score'])
df = pd.read_csv(Path(data_path) / 'PreProcessed_ECommerce_Dataset.csv')

# Define target variable (y) and features (X)
# The EDA exposed high correlation with the 'CashbackAmount' feature, so it's removed from X (along with the ID/target columns and other low-signal features)
X = df.drop(['CustomerID', 'Churn', 'CashbackAmount', 'CityTier', 'WarehouseToHome', 'HourSpendOnApp', 'NumberOfDeviceRegistered',
'NumberOfAddress', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount', 'DaySinceLastOrder', 'Tenure_(12, 24)', 'Tenure_(48, 60)',
'Tenure_(60, 72)', 'PreferredLoginDevice_Computer', 'PreferredLoginDevice_Mobile Phone', 'PreferredLoginDevice_Phone', 'PreferredPaymentMode_CC',
'PreferredPaymentMode_Credit Card', 'PreferredPaymentMode_Debit Card', 'PreferredPaymentMode_E wallet', 'Gender_Female',
'Gender_Male', 'PreferedOrderCat_Mobile', 'PreferedOrderCat_Mobile Phone', 'PreferedOrderCat_Others', 'MaritalStatus_Divorced',
'MaritalStatus_Married', 'PreferredPaymentMode_Cash on Delivery'], axis=1)
y = df['Churn']

# Define DMatrix and Hyper Parameters for XGBoost
d_matrix = xgb.DMatrix(data=X,
label=y,
enable_categorical=True)
params = {
'objective':'binary:logistic',
'max_depth': 9,
# 'alpha': 10, # L1 Regularization on the leaf weights (a larger value means greater regularization)
'lambda': 10, # L2 Regularization on the leaf weights (a larger value means greater regularization). L2 is smoother than L1 and tends to better prevent overfitting
'learning_rate': 0.4,
'n_estimators':100,
}

# Perform Train/Test split with Stratification, since the class labels (y) are imbalanced in favour of those who didn't churn (i.e. ~83% didn't churn)
X_train, X_test, y_train, y_test = train_test_split(X,
y,
train_size=0.80,
stratify=y,
shuffle=True,
random_state=seed)

# Define/load Hyper Parameters
models_config = {
'logistic_regression': LogisticRegression(class_weight='balanced', # In addition to Stratification, this performs class balancing on the model at the fit() stage
solver='liblinear', # The solver selects the optimisation algorithm (and hence the loss-minimisation strategy) used to fit the model
random_state=seed),
'RFClassifier': RandomForestClassifier(class_weight=None, # For RFClassifier, setting class_weight='balanced' harms the F1-Score. Leave class_weight=None (default)
n_estimators=200, # ~200 trees improve the F1-Score; beyond 200 is past the point of optimality and reduces accuracy
max_depth=None, # Leave max_depth=None so the classifier can grow the trees until all leaves are pure (lowest Gini Impurity) without stopping criterion causing premature terminations
random_state=seed),
'XGBoost': xgb.XGBClassifier(**params)
}
@@ -5,10 +5,9 @@
import pickle

# Load variables from __init__.py
from . import models_config, insample_scores, outofsample_scores, X, X_train, X_test, y_train, y_test, Config
from . import models_config, insample_scores, outofsample_scores, X, X_train, X_test, y_train, y_test, Config, data_path

config = Config()
data_path = Path(config.data_path)
churn_app_path = Path(config.churn_app_models)

for model in models_config:
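The body of that loop is collapsed in this view. A minimal sketch of what fitting and scoring each model could look like, filling the insample_scores/outofsample_scores frames defined in model_training/__init__.py (the loop body below is an assumption, not the repository's code):

from sklearn.metrics import precision_recall_fscore_support

for name, clf in models_config.items():
    clf.fit(X_train, y_train)

    # Score the stratified split both in-sample and out-of-sample
    for scores_df, features, labels in [(insample_scores, X_train, y_train),
                                        (outofsample_scores, X_test, y_test)]:
        preds = clf.predict(features)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='binary')
        scores_df.loc[len(scores_df)] = [name, precision, recall, f1]

    # Persist each fitted model for the churn app (churn_app_path comes from the Config above)
    with open(churn_app_path / f'{name}.pkl', 'wb') as f:
        pickle.dump(clf, f)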
