Merge pull request #120 from VikramsDataScience/churn-propensity-model
Restructured codebase to begin development of a local pipeline
Showing 17 changed files with 192 additions and 184 deletions.
@@ -0,0 +1,11 @@
from .src.config import Config
from pathlib import Path

# Model version
__version__ = '0.01'

# Initialise global variables from config module
config = Config()
data_path = Path(config.data_path)
churn_app_path = Path(config.churn_app_models)
seed = config.seed
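The Config class itself appears in this diff only as a compiled .pyc, so its definition isn't visible. As a minimal sketch, assuming only the three attributes consumed above (the default values here are placeholders, not the repo's):

# Hypothetical sketch of src/config.py -- the real file is not shown in this
# diff (only its compiled .pyc). Attribute names are taken from the usage above.
from dataclasses import dataclass

@dataclass
class Config:
    data_path: str = 'data'                # root folder for the datasets (assumed default)
    churn_app_models: str = 'churn_app'    # where serialised models for the churn app live (assumed default)
    seed: int = 314                        # global random seed (assumed value)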
@@ -0,0 +1,6 @@
import argparse

parser = argparse.ArgumentParser(description='Churn propensity pipeline')
parser.add_argument('--module', type=str, choices=['eda', 'preprocessing', 'model_training'])
args = parser.parse_args()
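The hunk ends at argument parsing, so the code that consumes args.module isn't shown. One plausible continuation, with module paths inferred from the repo's folder layout rather than taken from the diff:

# Hypothetical dispatch -- the routing that consumes 'args.module' is not part
# of this diff; module paths are inferred from the repo's folder layout.
import runpy

if args.module is not None:
    runpy.run_module(f'ECommerce_Churn_Propensity_Model.src.{args.module}',
                     run_name='__main__')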
One file was deleted in this commit; its diff is not shown.
Binary files added:
- ECommerce_Churn_Propensity_Model/src/__pycache__/__init__.cpython-311.pyc (+8.63 KB)
- ECommerce_Churn_Propensity_Model/src/__pycache__/config.cpython-311.pyc (+1.45 KB)
- ECommerce_Churn_Propensity_Model/src/__pycache__/model_training.cpython-311.pyc (+3.97 KB)
- ECommerce_Churn_Propensity_Model/src/__pycache__/preprocessing.cpython-311.pyc (+921 Bytes)
@@ -0,0 +1,72 @@
from ..config import Config
from missforest.missforest import MissForest
from scipy.stats import skew
import numpy as np
import pandas as pd
import os
import sys
import contextlib

# Initialise global variables from config module
config = Config()

#################### ADDITIONAL REQUIRED FUNCTIONS ####################
def doanes_formula(data, nan_count) -> int:
    """
    To aid in the preparation for the correct binning (FOR SKEWED DATA COLUMNS ONLY) of intervals prior to the
    calculation of the Phi K correlation, I've opted to use Doane's Formula to determine the bin sizes of the
    intervals. Since I couldn't find a Python library for the formula, I've written this implementation of it.
    Please refer to the 'README.md' file (in the EDA folder) for a mathematical explanation of the formula
    and the data justifications behind selecting Doane's Formula for calculating bin sizes.
    This function returns the bin length as a truncated integer (not rounded!). I elected for numeric truncation
    over rounding, since numpy's rounding with np.ceil() led to substantial rounding errors (for instance,
    18.1 would be rounded up to 19). So I've opted to truncate and cast to integer rather than carry these
    rounding errors into the calculation of the interval's bin length.
    N.B.: Since Doane's Formula relies on the number of observations in the DF, if there are NaNs in the input DF,
    please calculate the number of NaNs and pass it through the 'nan_count' arg, so it can be deducted from 'n'.
    If there are no NaNs, please set 'nan_count=0'.
    """
    n = len(data) - nan_count
    g1 = skew(data)  # sample skewness; 'nan_count' is only deducted from 'n', per the docstring
    sigma_g1 = np.sqrt((6*(n - 2)) / ((n + 1)*(n + 3)))
    k = 1 + np.log2(n) + np.log2(1 + abs(g1) / sigma_g1)
    return int(np.trunc(k))

def read_impute_data(df_path, float_cols, categorical_cols, output_path, sheet_name=None) -> pd.DataFrame:
    """
    Read in an Excel/CSV file, define columns for casting & interval definitions, and perform imputation
    with MissForest.
    IMPORTANT NOTE: Only pass an instance of the 'Path' class (from the 'pathlib' library) to the
    'df_path' arg.
    - 'sheet_name' (OPTIONAL): Only required when reading Excel files, to indicate which Excel sheet to read
      into the DataFrame.
    """
    missforest_imputer = MissForest()
    if df_path.suffix == '.xlsx':
        df = pd.read_excel(df_path, sheet_name)
    elif df_path.suffix == '.csv':
        df = pd.read_csv(df_path)
    else:
        raise ValueError(f'Unsupported file type: {df_path.suffix}')

    # Impute NaN values using MissForest, then cast float_cols as integers
    with suppress_stdout():
        df = missforest_imputer.fit_transform(x=df,
                                              categorical=categorical_cols)
    df[float_cols] = df[float_cols].astype(int)

    # Save to storage for the downstream PreProcessing module
    df.to_csv(output_path)
    return df

@contextlib.contextmanager
def suppress_stdout():
    """
    For any library that produces undesirably verbose output, use this boilerplate function to suppress
    that output in the CLI.
    """
    with open(os.devnull, 'w') as devnull:
        old_stdout = sys.stdout
        sys.stdout = devnull
        try:
            yield
        finally:
            sys.stdout = old_stdout
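For context, a hypothetical call sequence for these helpers; the file names, sheet name, and column lists below are illustrative only, not taken from the repo:

# Hypothetical usage of the helpers above -- paths and columns are illustrative.
import pandas as pd
from pathlib import Path

df = read_impute_data(df_path=Path('data/ECommerce_Dataset.xlsx'),
                      float_cols=['Tenure', 'WarehouseToHome'],
                      categorical_cols=['PreferredLoginDevice', 'MaritalStatus'],
                      output_path=Path('data/Imputed_ECommerce_Dataset.csv'),
                      sheet_name='E Comm')

# Bin a skewed column with Doane's Formula; NaNs were imputed above, so nan_count=0
n_bins = doanes_formula(df['Tenure'], nan_count=0)
tenure_intervals = pd.cut(df['Tenure'], bins=n_bins)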
ECommerce_Churn_Propensity_Model/src/model_training/__init__.py (59 additions, 0 deletions)
@@ -0,0 +1,59 @@
from ..config import Config
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from pathlib import Path

# Initialise global variables from config module
config = Config()
data_path = config.data_path
seed = config.seed

insample_scores = pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1-Score'])
outofsample_scores = pd.DataFrame(columns=['Model', 'Precision', 'Recall', 'F1-Score'])
df = pd.read_csv(Path(data_path) / 'PreProcessed_ECommerce_Dataset.csv')

# Define target variable (y) and features (X)
# The EDA exposed high correlation with the 'CashbackAmount' feature, so it's removed from X
X = df.drop(['CustomerID', 'Churn', 'CashbackAmount', 'CityTier', 'WarehouseToHome', 'HourSpendOnApp', 'NumberOfDeviceRegistered',
             'NumberOfAddress', 'OrderAmountHikeFromlastYear', 'CouponUsed', 'OrderCount', 'DaySinceLastOrder', 'Tenure_(12, 24)', 'Tenure_(48, 60)',
             'Tenure_(60, 72)', 'PreferredLoginDevice_Computer', 'PreferredLoginDevice_Mobile Phone', 'PreferredLoginDevice_Phone', 'PreferredPaymentMode_CC',
             'PreferredPaymentMode_Credit Card', 'PreferredPaymentMode_Debit Card', 'PreferredPaymentMode_E wallet', 'Gender_Female',
             'Gender_Male', 'PreferedOrderCat_Mobile', 'PreferedOrderCat_Mobile Phone', 'PreferedOrderCat_Others', 'MaritalStatus_Divorced',
             'MaritalStatus_Married', 'PreferredPaymentMode_Cash on Delivery'], axis=1)
y = df['Churn']

# Define DMatrix and hyperparameters for XGBoost
d_matrix = xgb.DMatrix(data=X,
                       label=y,
                       enable_categorical=True)
params = {
    'objective': 'binary:logistic',
    'max_depth': 9,
    # 'alpha': 10,   # L1 regularisation on the leaf nodes (a larger value means greater regularisation)
    'lambda': 10,    # L2 regularisation on the leaf nodes (a larger value means greater regularisation). L2 is smoother than L1 and tends to better prevent overfitting
    'learning_rate': 0.4,
    'n_estimators': 100,
}

# Perform the Train/Test split with stratification, since the class labels (y) are imbalanced
# in favour of those who didn't churn (~83% didn't churn)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    train_size=0.80,
                                                    stratify=y,
                                                    shuffle=True,
                                                    random_state=seed)

# Define the models and their hyperparameters
models_config = {
    'logistic_regression': LogisticRegression(class_weight='balanced', # In addition to stratification, this performs class balancing on the model at the fit() stage
                                              solver='liblinear',      # The solver selects the Gradient Descent optimisation routine (i.e. the choice of loss function)
                                              random_state=seed),
    'RFClassifier': RandomForestClassifier(class_weight=None,  # For RFClassifier, setting class_weight='balanced' harms the F1-Score. Leave class_weight=None (default)
                                           n_estimators=200,   # ~200 trees improves the F1-Score. Beyond 200 is past the point of optimality and will reduce accuracy
                                           max_depth=None,     # Leave max_depth=None so the classifier can grow the trees until all leaves are pure (lowest Gini impurity) without a stopping criterion causing premature termination
                                           random_state=seed),
    'XGBoost': xgb.XGBClassifier(**params)
}
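The hunk ends at models_config, so the loop that fits these models and fills insample_scores/outofsample_scores isn't shown here. A minimal sketch of what that loop could look like, assuming scikit-learn's precision_recall_fscore_support; the flow is an assumption, not the repo's code:

# Hypothetical training/evaluation loop -- not part of this diff.
from sklearn.metrics import precision_recall_fscore_support

for name, model in models_config.items():
    model.fit(X_train, y_train)

    # Score in-sample vs out-of-sample to check for overfitting
    for scores_df, features, labels in [(insample_scores, X_train, y_train),
                                        (outofsample_scores, X_test, y_test)]:
        preds = model.predict(features)
        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds,
                                                                   average='binary')
        scores_df.loc[len(scores_df)] = [name, precision, recall, f1]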