# train_test.py
import pm4py
from pm4py.objects.conversion.log import converter as log_converter
from pm4py.objects.log.exporter.xes import exporter as xes_exporter
from pm4py.algo.discovery.alpha import algorithm as alpha_miner
from pm4py.algo.conformance.tokenreplay import algorithm as token_replay
import numpy as np


def train_test_split(data_dir, dataset, trace_case_id="case:concept:name", test_size=.3,
                     split_by_unique_traces: bool = False,
                     split_by_time: bool = True):
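    """
    Load the two event logs of `dataset` (Lm_final.xes and Lp_final.xes) and
    split each of them into a train and a test part, either by unique trace
    variants or chronologically by the first event of each case.

    Returns (lm_train, lp_train, lm_test, lp_test) as pm4py event log objects.
    """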
    # Read the .xes log files
    log_lm = pm4py.read_xes(f"{data_dir}/{dataset}/Lm_final.xes")
    log_lp = pm4py.read_xes(f"{data_dir}/{dataset}/Lp_final.xes")
    # Convert them to pandas dataframes
    df_lm = pm4py.convert_to_dataframe(log_lm)
    df_lp = pm4py.convert_to_dataframe(log_lp)
    # Free the RAM held by the original log objects
    del log_lm, log_lp
    # Collect unique traces from Lm & Lp and do the split
    if split_by_unique_traces:
        # Get the traces from the event log (pandas df)
        lm_traces = df_lm[[trace_case_id, "concept:name"]].groupby(trace_case_id).agg(','.join)['concept:name']
        lp_traces = df_lp[[trace_case_id, "concept:name"]].groupby(trace_case_id).agg(','.join)['concept:name']
        # Get unique traces
        lm_traces_unique = lm_traces.reset_index().groupby("concept:name").agg(','.join)
        lp_traces_unique = lp_traces.reset_index().groupby("concept:name").agg(','.join)
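        # Note: each row of *_traces_unique is one trace variant whose value is the
        # comma-joined list of all case ids exhibiting that variant, so sampling a
        # variant below pulls in every case that shows the same behavior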
        # Compute the number of samples in the train and test set
        n_traces_lm = lm_traces.nunique()
        n_traces_lp = lp_traces.nunique()
        n_traces_lm_train = int((1. - test_size) * n_traces_lm)  # Sample class-wise - be aware of highly imbalanced data!
        n_traces_lp_train = int((1. - test_size) * n_traces_lp)
        # Sample from the unique traces and add the non-unique duplicates, too (required for discovering a process model)
        lm_ids_train = ",".join(lm_traces_unique.sample(n=n_traces_lm_train, random_state=1)[trace_case_id]).split(",")
        lm_ids_test = list(set(df_lm[trace_case_id].unique()) - set(lm_ids_train))
        lp_ids_train = ",".join(lp_traces_unique.sample(n=n_traces_lp_train, random_state=1)[trace_case_id]).split(",")
        lp_ids_test = list(set(df_lp[trace_case_id].unique()) - set(lp_ids_train))
        # Split the traces into train and test dataframes by case id
        df_lm_train = df_lm.loc[df_lm[trace_case_id].isin(lm_ids_train)].reset_index(drop=True)
        df_lm_test = df_lm.loc[df_lm[trace_case_id].isin(lm_ids_test)].reset_index(drop=True)
        df_lp_train = df_lp.loc[df_lp[trace_case_id].isin(lp_ids_train)].reset_index(drop=True)
        df_lp_test = df_lp.loc[df_lp[trace_case_id].isin(lp_ids_test)].reset_index(drop=True)
    # Sort traces by time and take the first (1 - test_size) fraction as train, the rest as test (uniqueness is not considered!)
    elif split_by_time:
        # Sort the data by time and keep only the first event (w.r.t. time) of each trace;
        # the resulting case ids are ordered chronologically by trace start time
        lm_ids = df_lm.loc[~df_lm.sort_values('time:timestamp').duplicated(trace_case_id, keep='first')].sort_values('time:timestamp')[trace_case_id].values
        lp_ids = df_lp.loc[~df_lp.sort_values('time:timestamp').duplicated(trace_case_id, keep='first')].sort_values('time:timestamp')[trace_case_id].values
        # Compute the number of samples in the train and test set
        n_traces_lm = lm_ids.shape[0]
        n_traces_lp = lp_ids.shape[0]
        n_traces_lm_train = int((1. - test_size) * n_traces_lm)  # Sample class-wise - be aware of highly imbalanced data!
        n_traces_lp_train = int((1. - test_size) * n_traces_lp)
        # Collect the ids for the train & test sets
        lm_ids_train = lm_ids[:n_traces_lm_train]
        lm_ids_test = lm_ids[n_traces_lm_train:]
        lp_ids_train = lp_ids[:n_traces_lp_train]
        lp_ids_test = lp_ids[n_traces_lp_train:]
        # Construct the train & test sets
        df_lm_train = df_lm.loc[df_lm[trace_case_id].isin(lm_ids_train)].reset_index(drop=True)
        df_lm_test = df_lm.loc[df_lm[trace_case_id].isin(lm_ids_test)].reset_index(drop=True)
        df_lp_train = df_lp.loc[df_lp[trace_case_id].isin(lp_ids_train)].reset_index(drop=True)
        df_lp_test = df_lp.loc[df_lp[trace_case_id].isin(lp_ids_test)].reset_index(drop=True)
    # Convert the dataframes back to event log objects
    parameters = {log_converter.Variants.TO_EVENT_LOG.value.Parameters.CASE_ID_KEY: trace_case_id}
    lm_train = log_converter.apply(df_lm_train, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)
    lm_test = log_converter.apply(df_lm_test, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)
    lp_train = log_converter.apply(df_lp_train, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)
    lp_test = log_converter.apply(df_lp_test, parameters=parameters, variant=log_converter.Variants.TO_EVENT_LOG)
    return lm_train, lp_train, lm_test, lp_test


def export_event_log_to_xes(event_log_data, file_out_name):
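    """Export a pm4py event log to a .xes file at `file_out_name`."""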
    xes_exporter.apply(event_log_data, file_out_name)


def evaluate_pm_model(pm, lp_test, lm_test, threshold=.6):  # TODO: A suitable threshold depends on the data set
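    """
    Evaluate the discovered process model `pm` (Petri net with initial and
    final marking) as a binary classifier via token replay: a trace is
    classified as fitting iff its trace fitness reaches `threshold`.

    Returns the fraction of Lp test traces accepted and the fraction of Lm
    test traces rejected.
    """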
    # Token replay (TODO: Other conformance checking methods are possible)
    results_lp = token_replay.apply(lp_test, *pm)
    results_lm = token_replay.apply(lm_test, *pm)
    # Extract the fitness values
    fitness_lp = np.array([r["trace_fitness"] for r in results_lp])
    fitness_lm = np.array([r["trace_fitness"] for r in results_lm])
    # Perform a simple threshold-based classification:
    # fraction of Lp traces accepted and fraction of Lm traces rejected
    acceptance_rate_lp = np.sum(fitness_lp >= threshold) / (1. * len(fitness_lp))
    rejection_rate_lm = np.sum(fitness_lm <= threshold) / (1. * len(fitness_lm))
    return acceptance_rate_lp, rejection_rate_lm


if __name__ == "__main__":
    # Load and split the data
    lm_train, lp_train, lm_test, lp_test = train_test_split("../data", "Traffic")
    # "Discover"/estimate a process model (Petri net) on the positive training log
    pm = alpha_miner.apply(lp_train)
    # Evaluate the process model on the test sets
    print(evaluate_pm_model(pm, lp_test, lm_test))  # TODO: The estimated Petri net does not perform well as a classifier (in particular on BPIC18 it shows pretty bad performance - maybe a bad local optimum?!)
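    # A hedged alternative sketch: the weak performance above might stem from the
    # alpha miner itself, so one could swap in another discovery algorithm. The
    # call below assumes pm4py's simplified interface, which also returns a Petri
    # net with initial and final markings and therefore plugs straight into
    # evaluate_pm_model(); it is an untested suggestion, not part of the original
    # pipeline.
    # pm_alt = pm4py.discover_petri_net_inductive(lp_train)
    # print(evaluate_pm_model(pm_alt, lp_test, lm_test))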