-
Notifications
You must be signed in to change notification settings - Fork 10
/
survival_analysis.py
100 lines (57 loc) · 2.88 KB
/
survival_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
import pandas as pd
import numpy as np
import sys, getopt
import cPickle as pickle
from lifelines.estimation import KaplanMeierFitter
def get_churn_data(df, min_date, max_date, time_to_churn):
    """Label each row with a survival duration and a churn indicator.

    A row is considered churned when the gap between its last interaction
    and the most recent interaction in the whole dataset exceeds
    ``time_to_churn`` days.  Non-churned (censored) rows are treated as
    alive through the end of the observation window.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; gains two new columns, ``duration`` and ``churn``.
    min_date, max_date : str
        Column names holding each row's first and last interaction
        timestamps (must already be datetime64).
    time_to_churn : float
        Inactivity threshold in days beyond which a row counts as churned.

    Returns
    -------
    pandas.DataFrame
        The same frame with ``duration`` (lifetime in days) and
        ``churn`` (1 = churned, 0 = censored) columns added.
    """
    max_date_overall = df[max_date].max()
    # Days of inactivity relative to the newest event anywhere in the data,
    # computed with vectorized timedelta arithmetic instead of per-row
    # nanosecond division.
    days_since_last = (max_date_overall - df[max_date]).dt.total_seconds() / 86400.0
    churn = np.where(days_since_last > time_to_churn, 1, 0)
    # Copy before editing: the original code aliased the source column and
    # silently mutated df[max_date] in place.
    last_date_of_interaction = df[max_date].copy()
    # Censored rows (<= threshold, the exact complement of the churn test;
    # the original used a strict < here, leaving boundary rows stale).
    last_date_of_interaction[days_since_last <= time_to_churn] = max_date_overall
    df['duration'] = (last_date_of_interaction - df[min_date]).dt.total_seconds() / 86400.0
    df['churn'] = churn
    return df
def kmf_calculation(df, bucket):
    """Fit a Kaplan-Meier survival curve for one usage bucket.

    Selects the rows of ``df`` whose ``use_buckets`` value equals
    ``bucket`` and fits their ``duration`` / ``churn`` columns with a
    ``KaplanMeierFitter`` labelled by the bucket.
    """
    in_bucket = df.use_buckets == bucket
    durations = df['duration'][in_bucket]
    observed = df['churn'][in_bucket]
    fitter = KaplanMeierFitter()
    fitter.fit(durations, event_observed=observed, label=bucket)
    return fitter
def main(inputfile, outputfile, buckets, time_to_churn):
    """Build per-bucket Kaplan-Meier survival curves from a usage CSV.

    Reads the feature CSV, labels churn with :func:`get_churn_data`,
    buckets users by ``use_count``, fits one survival curve per bucket,
    and pickles ``(survival functions, bucket labels, bucket counts,
    daily margins)`` to ``outputfile``.  Also writes the filtered
    feature matrix to ``./data/surv_feature_matrix.csv``.

    Parameters
    ----------
    inputfile : str
        Path to the input CSV (needs first/last use dates, use_count,
        std_freq, mean_freq, total_order_value columns).
    outputfile : str
        Path the pickled results are written to.
    buckets : int or sequence
        Passed straight to ``pd.cut`` to bin ``use_count``.
    time_to_churn : float
        Inactivity threshold in days (see get_churn_data).
    """
    df = pd.read_csv(inputfile).dropna()
    df['first_use_date'] = pd.to_datetime(df['first_use_date'])
    df['last_use_date'] = pd.to_datetime(df['last_use_date'])
    all_churn_data = get_churn_data(df, "first_use_date", "last_use_date", time_to_churn)
    # Keep rows with a positive lifetime and sane frequency statistics.
    # .copy() prevents SettingWithCopy problems on the assignment below.
    keep = ((all_churn_data.duration > 0)
            & (all_churn_data.std_freq >= 0)
            & (all_churn_data.mean_freq > 0))
    data = all_churn_data[keep].copy()
    # Cut ONCE: the original cut use_buckets twice, which errors on modern
    # pandas (re-cutting a Categorical), and used the long-removed
    # DataFrame.sort / .levels APIs.
    data['use_buckets'] = pd.cut(data['use_count'], buckets)
    data = data.sort_values(by='use_buckets')
    unique_buckets = list(data['use_buckets'].cat.categories)
    kmf_values = []
    counts_in_bucket = []
    daily_margin = []
    for bucket in unique_buckets:
        in_bucket = data[data.use_buckets == bucket]
        counts_in_bucket.append(in_bucket.use_count.count())
        avg_duration = in_bucket.duration.mean()
        avg_total_spent = in_bucket.total_order_value.mean()
        # Average revenue per day of lifetime for this bucket.
        daily_margin.append(avg_total_spent / float(avg_duration))
        kmf_values.append(kmf_calculation(data, bucket).survival_function_)
    # Context manager so the pickle file is closed even on error
    # (the original leaked the handle).
    with open(outputfile, 'wb') as fh:
        pickle.dump((kmf_values, unique_buckets, counts_in_bucket, daily_margin), fh)
    data.to_csv('./data/surv_feature_matrix.csv')
if __name__ == '__main__':
    # Parse command-line options with the already-imported getopt module.
    # The original passed four names (inputfile, outputfile, buckets,
    # time_to_churn) that were never defined, so the script always died
    # with a NameError before doing any work.
    #   -i <input csv>   (required)
    #   -o <output pickle> (required)
    #   -b <number of use_count buckets>  (default 10)
    #   -t <days of inactivity counted as churn>  (default 30)
    opts, _args = getopt.getopt(sys.argv[1:], "i:o:b:t:")
    options = dict(opts)
    main(options['-i'],
         options['-o'],
         int(options.get('-b', 10)),
         float(options.get('-t', 30)))