generated from streamlit/blank-app-template
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstreamlit_app.py
214 lines (176 loc) · 8.54 KB
/
streamlit_app.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
# Use the full browser width so the Sankey diagram has room to render.
st.set_page_config(layout="wide")

# Read the CSV file from the public S3 URL.
data = pd.read_csv("https://behaviorally-testing.s3.amazonaws.com/sankey_relevant_session_v2.csv")

st.title("Behaviorally Sankey beh_v105")
st.write("Sankey Diagram of Participant Journeys")

# App selection dropdown: the option values are the package names stored in
# the 'apppackagename' column, the displayed text is the friendly app name.
app_names = {
    'com.ss.android.ugc.trill': 'TikTok',
    'com.shopee.id': 'Shopee',
}
selected_app = st.selectbox("Select App", options=list(app_names), format_func=lambda x: app_names[x])

# Everything below depends on an app being chosen, so end this script run
# early instead of continuing with undefined selections.
if not selected_app:
    st.stop()

# Copy the filtered rows so the column assignment below writes to an
# independent frame rather than to a slice of `data`.
app_data = data[data['apppackagename'] == selected_app].copy()
app_data['participantId'] = app_data['participantId'].astype('str')


def _participant_sort_key(participant_id):
    """Sort purely numeric ids by numeric value, ahead of any other ids."""
    if participant_id.isdecimal():
        return (0, int(participant_id), participant_id)
    return (1, 0, participant_id)


# Order the participant ids numerically ("2" before "10") and put the "All"
# option at the beginning of the dropdown.
participant_ids = ['All'] + sorted(app_data['participantId'].unique(), key=_participant_sort_key)

# Filter dropdown for participantId.
selected_participant = st.selectbox("Select Participant", participant_ids)

# Third filter for journey type; "Purchase" is the default.
journey_types = ["Purchase", "Non-Purchase"]
selected_journey_type = st.selectbox("Select Journey Type", journey_types, index=0)
# Function to process the journeys until purchases
def journeys_until_first_purchase(pages):
    """Return the journey of a session up to and including its first purchase.

    Consecutive repeats of the same page are collapsed into one step. The
    journey ends at the first 'Purchase' page, wherever it occurs (including
    the very first position); if there is no purchase, the whole de-duplicated
    sequence is the journey.

    Args:
        pages: Ordered list of page-type labels for one session.

    Returns:
        An empty list when ``pages`` is empty, otherwise a list containing a
        single journey (itself a list of page-type labels).
    """
    if not pages:
        return []

    journey = []
    for page in pages:
        if page == 'Purchase':
            journey.append(page)
            break  # Stop after the first purchase
        # Skip a page that merely repeats the step before it.
        if not journey or page != journey[-1]:
            journey.append(page)
    return [journey]
def get_sankey_format_data(df):
    """Reshape per-session journeys into one row per session with Step columns.

    Args:
        df: DataFrame with 'participantId', 'session' and a 'pagetype' column
            whose cells are lists of page-type labels. Other columns are
            ignored.

    Returns:
        DataFrame with columns 'participantId', 'session', 'Step0', 'Step1',
        ... where 'Step<N>' holds the N-th page of that session's journey
        (missing for journeys shorter than N + 1 steps). An empty input
        yields an empty frame with only the two id columns.
    """
    id_columns = ['participantId', 'session']
    if df.empty:
        return pd.DataFrame(columns=id_columns)

    # One row per journey step: explode the 'pagetype' lists.
    exploded = df.explode('pagetype')
    # Number the steps by their order within each participant/session.
    exploded['Step'] = exploded.groupby(id_columns).cumcount()
    # Spread the steps across columns, one row per participant/session.
    wide = exploded.pivot(index=id_columns, columns='Step', values='pagetype')
    wide.columns = [f'Step{step}' for step in wide.columns]
    # Flatten the index so the id columns come first, followed by the steps.
    return wide.reset_index()
def get_first_and_last_five_journeys(list_items, keep=5):
    """Shorten a long journey to its first and last ``keep`` steps.

    Args:
        list_items: List of journey steps.
        keep: Number of steps retained at each end (default 5).

    Returns:
        ``list_items`` unchanged when it has at most ``2 * keep`` elements,
        otherwise a new list made of the first ``keep`` and the last ``keep``
        elements.

    Raises:
        ValueError: If ``keep`` is negative.
    """
    if keep < 0:
        raise ValueError("keep must be non-negative")
    total = len(list_items)
    if total > 2 * keep:
        # Slice from an explicit start index: ``list_items[-0:]`` would
        # return the whole list when keep is 0.
        return list_items[:keep] + list_items[total - keep:]
    return list_items
def get_journeys_until_first_purchase(df, app_package_name, participant_id, journey_type=None):
    """Build the Step-per-column journey table for one app and participant.

    Args:
        df: Raw event DataFrame with 'apppackagename', 'participantId',
            'session', 'eventtime' and a '|'-separated 'pagetype' column.
        app_package_name: Value of 'apppackagename' to keep.
        participant_id: A participant id, or 'All' to keep every participant.
        journey_type: "Purchase" to keep journeys ending in a purchase; any
            other value keeps the non-purchase journeys. When None, the
            module-level ``selected_journey_type`` is used.

    Returns:
        The result of ``get_sankey_format_data`` for the selected journeys:
        one row per participant/session with 'Step<N>' columns.
    """
    if journey_type is None:
        journey_type = selected_journey_type

    row_filter = df['apppackagename'] == app_package_name
    if participant_id != 'All':
        # Compare as strings: the id handed in may be text while the column
        # keeps whatever dtype the CSV was parsed as (presumably numeric —
        # confirm against the data), and a mixed comparison never matches.
        row_filter = row_filter & (df['participantId'].astype(str) == str(participant_id))
    # Copy so the column assignments below do not write into a slice of `df`.
    app_df = df[row_filter].copy()

    # One row per page type, in the order they appear in the '|'-joined cell.
    app_df['pagetype'] = app_df['pagetype'].str.split('|')
    app_df = app_df.explode('pagetype', ignore_index=True)
    # Rows without a page type carry no step information.
    app_df = app_df.dropna(subset=['pagetype'])
    app_df = app_df[app_df['pagetype'] != 'Viewedrecommendedproduct'].copy()
    app_df['pagetype'] = app_df['pagetype'].replace('Cart', 'Cart Journey')
    app_df['eventtime'] = pd.to_datetime(app_df['eventtime'])
    app_df = app_df.sort_values(by=['participantId', 'eventtime', 'session'])

    # Collect each session's ordered pages, then cut them at the first purchase.
    aggregated_data = app_df.groupby(['participantId', 'session']).agg({'pagetype': list}).reset_index()
    aggregated_data['pagetype'] = aggregated_data['pagetype'].apply(journeys_until_first_purchase)
    aggregated_data = aggregated_data.explode('pagetype')
    aggregated_data = aggregated_data[aggregated_data['pagetype'].notna()].copy()

    # Journeys that never reach a purchase get an explicit terminal step.
    aggregated_data['pagetype'] = aggregated_data['pagetype'].apply(
        lambda x: x + ['Non-Purchase'] if x[-1] != 'Purchase' else x
    )
    # Drop journeys that consist of a single step.
    aggregated_data['pagetype_length'] = aggregated_data['pagetype'].apply(len)
    aggregated_data = aggregated_data[aggregated_data['pagetype_length'] > 1].copy()

    aggregated_data['pagetype'] = aggregated_data['pagetype'].apply(get_first_and_last_five_journeys)
    aggregated_data['journey_type'] = aggregated_data['pagetype'].apply(
        lambda x: 'Non Purchase Journeys' if x[-1] != 'Purchase' else 'Purchase Journeys'
    )
    aggregated_data['pagetype_length'] = aggregated_data['pagetype'].apply(len)

    # Filter data based on the requested journey type.
    wanted = 'Purchase Journeys' if journey_type == "Purchase" else 'Non Purchase Journeys'
    filtered_df = aggregated_data[aggregated_data['journey_type'] == wanted]
    return get_sankey_format_data(filtered_df)
# Journey table (one row per session, one column per step) for the current
# app / participant / journey-type selection.
purchase_paths_df = get_journeys_until_first_purchase(data, selected_app, selected_participant)

# Node colour assigned to each page type.
event_colors = {
    "Home": "#d02f80",
    "Search": "#d98c26",
    "Review": "#abd629",
    "Category": "#68d22d",
    "Product": "#2bd4bd",
    "Cart Journey": "#229cdd",
    "Checkout": "#229ddd",
    "Purchase": "#964db2",
    "Videolive": "#9a7965",
    "Videononlive": "#9a7345",
    "Voucher": "#6e918b",
    "History": "#edda12",
    "Brandshop": "#64739b",
    "Me": "#63d6d6",
    "Non-Purchase": "#63d8d6",
    "Shopeemall": "#23d8d6",
    "Allproductsandservices": "#62d8d6",
}

# Parallel lists describing the Sankey links: source node index, target node
# index, link weight and link colour.
source, target, value, link_colors = [], [], [], []

# Node bookkeeping: labels in creation order, label -> index lookup, and the
# colour of each node (parallel to node_labels).
node_labels, node_colors = [], []
node_indices = {}
# Helper function to get the index of a node label
def get_node_index(label):
    """Return the Sankey node index for ``label``, registering it if new.

    A new label is appended to ``node_labels``, recorded in ``node_indices``
    and given a colour in ``node_colors``. The colour is looked up in
    ``event_colors`` by page type, i.e. the text after the first underscore
    of a "Step<N>_<page type>" label, or the whole label when it contains no
    underscore. Page types missing from ``event_colors`` are drawn grey
    instead of raising KeyError.

    Args:
        label: Node label, e.g. "Step0_Home" or "Purchase".

    Returns:
        The integer position of the label in ``node_labels``.
    """
    if label not in node_indices:
        node_indices[label] = len(node_labels)
        node_labels.append(label)
        # Split only once so a page type that itself contains "_" stays whole.
        page_type = label.split("_", 1)[1] if "_" in label else label
        node_colors.append(event_colors.get(page_type, "grey"))
    return node_indices[label]
# Walk every journey and count how often each source -> target transition
# occurs, so identical transitions become one weighted link instead of many
# separate links of weight 1.
link_counts = {}
for _, row in purchase_paths_df.iterrows():
    # The first two columns are the participant and session ids; the rest are
    # the Step columns, which are empty past the end of shorter journeys.
    steps = row.iloc[2:].dropna().tolist()
    for i, (current_step, next_step) in enumerate(zip(steps, steps[1:])):
        # All journeys end at one shared "Purchase" or "Non-Purchase" node;
        # every other node is specific to its step position.
        if next_step in ('Purchase', 'Non-Purchase'):
            next_step_label = next_step
        else:
            next_step_label = f"Step{i + 1}_{next_step}"
        source_index = get_node_index(f"Step{i}_{current_step}")
        target_index = get_node_index(next_step_label)
        link_key = (source_index, target_index)
        link_counts[link_key] = link_counts.get(link_key, 0) + 1

# Emit one link per distinct transition, weighted by its number of journeys.
for (source_index, target_index), transition_count in link_counts.items():
    source.append(source_index)
    target.append(target_index)
    value.append(transition_count)
    link_colors.append("lightgrey")  # Transition color
# Node captions show only the page type: drop the internal "Step<N>_" prefix.
# Split once so a page type that itself contains "_" is not truncated.
display_labels = [label.split("_", 1)[1] if "_" in label else label for label in node_labels]

# Create the Sankey diagram (font settings are applied on the layout, not on the nodes).
fig = go.Figure(go.Sankey(
    node=dict(
        pad=20,  # Increased padding for clearer separation
        thickness=20,
        line=dict(color="black", width=0.5),  # Optional node border
        label=display_labels,
        color=node_colors,  # Background colors for nodes
    ),
    link=dict(
        source=source,
        target=target,
        value=value,
        color=link_colors,
    ),
))

# Update layout to control font settings globally and set the size of the figure.
# The title names the journey type actually selected in the dropdown.
fig.update_layout(
    title_text=f"E-commerce {selected_journey_type} Journeys (First 5 steps, Last 5 steps only)",
    font_size=14,
    font_color="red",
    width=1600,  # Adjust width for a better display
    height=900,  # Adjust height for a better display
)

# Display the Sankey diagram in Streamlit with full width.
st.plotly_chart(fig, use_container_width=True)