#!/usr/bin/env python
##########################################################################################
# Some of the code is adapted from:
# https://github.com/sbustreamspot/sbustreamspot-train/blob/master/create_seed_clusters.py
##########################################################################################
import argparse
import os
import random

import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.model_selection import ShuffleSplit

from helper.profile import BestClusterGroup, Model, test_single_graph

# This global dictionary is populated with useful debugging
# information if -v is set on the command line.
DEBUG_INFO = None
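
# When -v is set, model_graphs records, for each training file, the cluster
# members of its model, and test_graphs records, for each test file, a
# per-graph dictionary of test information, all keyed by file path.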

def save_model(model, model_name, fh):
    """Save a @model, named @model_name (its training file name), to the file handle @fh."""
    fh.write("model: {}\n".format(model_name))
    num_cluster = len(model.medoids)
    fh.write("cluster: {}\n".format(num_cluster))
    for medoid in model.medoids:
        fh.write("medoid: ")
        for elem in medoid:
            fh.write("{} ".format(int(float(elem))))
        fh.write("\n")
    fh.write("mean: ")
    for mean in model.mean_thresholds:
        fh.write("{} ".format(float(mean)))
    fh.write("\n")
    fh.write("max: ")
    for max_threshold in model.max_thresholds:  # Avoid shadowing the built-in max().
        fh.write("{} ".format(float(max_threshold)))
    fh.write("\n")
    fh.write("std: ")
    for std in model.stds:
        fh.write("{} ".format(float(std)))
    fh.write("\n")
    fh.write("evolution: ")
    for evol in model.evolution:
        fh.write("{} ".format(evol))
    fh.write("\n")

def load_sketches(fh):
    """Load the sketches in the file behind handle @fh into memory as a numpy array."""
    sketches = list()
    for line in fh:
        sketch = [int(elem) for elem in line.strip().split()]
        sketches.append(sketch)
    return np.array(sketches)
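
# Each line of a sketch file is expected to hold one sketch vector as
# whitespace-separated integers, one graph snapshot per line, e.g.
# (hypothetical values):
#   1031 77 4096 12 ...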

def pairwise_distance(arr, method='hamming'):
    """Wrapper that computes the pairwise distance between every two
    elements of @arr. The metric (@method) defaults to Hamming.
    squareform turns the result into a matrix for easy indexing."""
    return squareform(pdist(arr, metric=method))
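
# For N sketches, pdist returns the N * (N - 1) / 2 condensed distances and
# squareform expands them into a symmetric N x N matrix with a zero diagonal,
# so dists[i][j] is the normalized Hamming distance between sketches i and j,
# i.e., the fraction of positions in which they differ.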

def model_graphs(train_files, model_file, max_cluster_num=6, num_trials=20, max_iterations=1000):
    """Read the sketch vectors in @train_files and build one submodel per file.
    Returns a dictionary that maps each training file name to its model."""
    # A dictionary of models, one from each file in @train_files.
    models = dict()
    if model_file:
        savefile = open(model_file, 'a+')
    else:
        print("\33[5;30;42m[INFO]\033[0m Model is not saved; use --save-model to save the model")
    for train_file in train_files:
        with open(train_file, 'r') as f:
            sketches = load_sketches(f)
            # @dists contains the pairwise Hamming distance between any two sketches in @sketches.
            try:
                dists = pairwise_distance(sketches)
            except Exception as e:
                print("\33[101m[ERROR]\033[0m Exception occurred while modeling from file {}: {}".format(train_file, e))
                raise RuntimeError("Model building failed: {}".format(e))

            # Define a @distance function for the optimizer to use.
            def distance(x, y):
                return dists[x][y]

            best_cluster_group = BestClusterGroup()
            best_cluster_group.optimize(arrs=sketches, distance=distance, max_cluster_num=max_cluster_num, num_trials=num_trials, max_iterations=max_iterations)
            # With the best medoids, we can compute some statistics for the model.
            model = Model(train_file)
            model.construct(sketches, dists, best_cluster_group)
            print("\x1b[6;30;42m[SUCCESS]\x1b[0m Model from {} is done...".format(train_file))
            # Save some model information in DEBUG_INFO if -v is turned on.
            if isinstance(DEBUG_INFO, dict):
                DEBUG_INFO[train_file] = model.get_members()
            # Save the model and proceed to the next one; the with statement
            # closes the training file for us.
            if model_file:
                print("\x1b[6;30;42m[STATUS]\x1b[0m Saving the model {} to {}...".format(train_file, model_file))
                save_model(model, train_file, savefile)
            models[train_file] = model
    if model_file:
        savefile.close()
    return models
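
# A minimal usage sketch (hypothetical paths): one submodel is built per
# training file and returned keyed by that file's name:
#   models = model_graphs(["train/benign-0.txt", "train/benign-1.txt"], model_file=None)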

def test_graphs(test_files, models, metric, num_stds):
    """Test all sketch vectors in @test_files against the @models
    built by model_graphs."""
    total_graphs_tested = 0.0
    tp = 0.0  # true positive (intrusion and alarmed)
    tn = 0.0  # true negative (not intrusion and not alarmed)
    fp = 0.0  # false positive (not intrusion but alarmed)
    fn = 0.0  # false negative (intrusion but not alarmed)
    printout = ""
    for test_file in test_files:
        with open(test_file, 'r') as f:
            # If DEBUG_INFO exists, we track debugging information for each
            # test graph. The per-graph information is stored in a dictionary.
            test_info = None
            if isinstance(DEBUG_INFO, dict):
                test_info = dict()
            sketches = load_sketches(f)
            abnormal, max_abnormal_point, num_fitted_model = test_single_graph(sketches, models, metric, num_stds, test_info)
            if isinstance(DEBUG_INFO, dict):
                DEBUG_INFO[test_file] = test_info
        total_graphs_tested += 1
        if not abnormal:  # The graph is considered normal.
            printout += "{} is NORMAL fitting {}/{} models\n".format(test_file, num_fitted_model, len(models))
            if "attack" not in test_file:  # NOTE: a file name containing "attack" marks the graph as ground-truth malicious.
                tn = tn + 1
            else:
                fn = fn + 1
        else:
            printout += "{} is ABNORMAL at {}\n".format(test_file, max_abnormal_point)
            if "attack" in test_file:
                tp = tp + 1
            else:
                fp = fp + 1
    if (tp + fp) == 0:
        precision = None
    else:
        precision = tp / (tp + fp)
    if (tp + fn) == 0:
        recall = None
    else:
        recall = tp / (tp + fn)
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    # Compare against None explicitly: a legitimate 0.0 precision or recall
    # must not be mistaken for a missing value.
    if precision is None or recall is None or (precision + recall) == 0:
        f_measure = None
    else:
        f_measure = 2 * (precision * recall) / (precision + recall)
    return precision, recall, accuracy, f_measure, printout
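
# The standard definitions used above, with a small worked example: with
# tp = 8, fp = 2, fn = 1, tn = 9 (20 graphs total), we get
#   precision = tp / (tp + fp) = 0.8
#   recall    = tp / (tp + fn) = 8 / 9 ~ 0.889
#   accuracy  = (tp + tn) / 20 = 0.85
#   F-1       = 2 * P * R / (P + R) ~ 0.842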

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-t', '--train-dir', help='absolute path to the directory that contains all training sketches', required=True)
    parser.add_argument('-u', '--test-dir', help='absolute path to the directory that contains all test sketches', required=True)
    parser.add_argument('-m', '--metric', choices=['mean', 'max', 'both'], default='both',
                        help='threshold metric: use the mean and/or the max of the cluster distances between cluster members and the medoid')
    # Round the choices so that user-supplied values such as 0.3 compare
    # equal to the floats generated by np.arange.
    parser.add_argument('-n', '--num-stds', choices=[round(x, 1) for x in np.arange(0, 5.0, 0.1)], type=float,
                        help='the number of standard deviations above the threshold to tolerate')
    parser.add_argument('-s', '--save-model', help='use this flag to save the model', action='store_true')
    parser.add_argument('-S', '--seed', help='seed for the random number generator', type=int, default=98765432)
    parser.add_argument('-p', '--model-path', help='file path to save the model', default='model.txt')
    parser.add_argument('-c', '--cross-validation', help='number of cross-validation rounds to perform (use 0 to turn off cross validation)', type=int, default=5)
    parser.add_argument('-v', '--verbose', help='produce debugging information', action='store_true')
    args = parser.parse_args()

    SEED = args.seed
    random.seed(SEED)
    np.random.seed(SEED)
    print("\33[5;30;42m[INFO]\033[0m Random number seed: {}".format(SEED))
    if args.verbose:
        print("\33[5;30;42m[INFO]\033[0m Debugging information will be collected. You can access it programmatically")
        DEBUG_INFO = dict()

    # The training file names within the @train_dir directory.
    # We read every file in the directory but do no error-checking;
    # you must make sure every file in the train_dir directory holds
    # valid graph sketches.
    train = os.listdir(args.train_dir)
    # Turn the file names into full paths.
    train_files = [os.path.join(args.train_dir, f) for f in train]
    # The test file names within the test_dir directory.
    # Again, we perform no error-checking here.
    test = os.listdir(args.test_dir)
    test_files = [os.path.join(args.test_dir, f) for f in test]

    # Determine the metric(s) to use.
    if args.metric == 'both':
        metric_config = ['mean', 'max']
    else:
        metric_config = [args.metric]
    # Determine the number of standard deviations to use.
    if args.num_stds is None:  # If this argument is not given, we explore the possible configurations.
        std_config = np.arange(0, 5.0, 0.1)
    else:
        std_config = [args.num_stds]

    # Train (all training graphs) #
    model_save_path = None
    if args.save_model:
        model_save_path = args.model_path
    models = model_graphs(train_files, model_save_path)

    # Perform cross validation, unless it is turned off.
    if args.cross_validation == 0:
        print("\33[5;30;42m[INFO]\033[0m Cross validation is turned off; use --cross-validation to enable it")
        # Model (all training files)
        submodels = list()
        for _, model in models.items():
            submodels.append(model)
        for tm in metric_config:
            for ns in std_config:
                precision, recall, accuracy, f_measure, printout = test_graphs(test_files, submodels, tm, ns)
                print("Metric: {}\tSTD: {}".format(tm, ns))
                print("Accuracy: {}\tPrecision: {}\tRecall: {}\tF-1: {}".format(accuracy, precision, recall, f_measure))
                print("{}".format(printout))
    else:
        kf = ShuffleSplit(n_splits=args.cross_validation, test_size=0.2, random_state=0)
        print("\x1b[6;30;42m[STATUS]\x1b[0m Performing {} rounds of cross validation".format(args.cross_validation))
        cv = 0  # Counter of the cross-validation rounds.
        for train_idx, validate_idx in kf.split(train_files):
            training_files = list()  # Training files whose submodels we use in this round.
            for tidx in train_idx:
                training_files.append(train_files[tidx])
            # Validation graphs are used as additional test graphs. We build a
            # fresh list per round so that validation files do not accumulate
            # across rounds.
            round_test_files = list(test_files)
            for vidx in validate_idx:
                round_test_files.append(train_files[vidx])
            # Model (only graphs in training_files)
            submodels = list()
            for tf in training_files:
                submodels.append(models[tf])
            print("\x1b[6;30;42m[STATUS] Test {}/{}\x1b[0m:".format(cv + 1, args.cross_validation))
            for tm in metric_config:
                for ns in std_config:
                    precision, recall, accuracy, f_measure, printout = test_graphs(round_test_files, submodels, tm, ns)
                    print("Metric: {}\tSTD: {}".format(tm, ns))
                    print("Accuracy: {}\tPrecision: {}\tRecall: {}\tF-1: {}".format(accuracy, precision, recall, f_measure))
                    print("{}".format(printout))
            cv += 1

    # Debug printout for Visicorn.
    if args.verbose:
        for tf in train_files:
            print(tf)
            print(DEBUG_INFO[tf])
        for tf in test_files:
            print(tf)
            print(DEBUG_INFO[tf])

    print("\x1b[6;30;42m[SUCCESS]\x1b[0m Unicorn is finished")