-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathrun.py
executable file
·340 lines (273 loc) · 11.4 KB
/
run.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
# LIBRARIES REQUIRED
import os, errno, csv
from collections import Counter
from datetime import datetime, timedelta
from glob import glob
# FUNCTION DEFINITIONS
def calculate_7Grams(data,dictionary):
    '''
    Count every 7-gram (window of 7 consecutive items) found in `data`.

    Each window is turned into a tuple and its counter in `dictionary` is
    incremented in place. The last window visited is returned so the caller
    can derive the trailing 5-grams and 3-grams from it. When `data` holds
    fewer than 7 items there is no window and an empty tuple is returned.
    '''
    last_window = ()
    window_count = len(data) - 6
    for start in range(window_count):
        last_window = tuple(data[start:start + 7])
        dictionary[last_window] = dictionary.get(last_window, 0) + 1
    return last_window
def calcualte_Grams(dict7, dict5, dict3):
    '''
    Fold 7-gram counts into 5-gram counts, then 5-gram counts into 3-gram counts.

    For every 7-gram in `dict7` its count is added to the entry of its first
    five items in `dict5`. Afterwards, for every 5-gram in `dict5` (including
    entries that were already present before this call) its count is added to
    the entry of its first three items in `dict3`. Both target dictionaries
    are updated in place; nothing is returned.
    '''
    # 7-grams -> 5-grams (prefix of length 5)
    for seven_gram, occurrences in dict7.items():
        prefix5 = seven_gram[:5]
        dict5[prefix5] = dict5.get(prefix5, 0) + occurrences
    # 5-grams -> 3-grams (prefix of length 3)
    for five_gram, occurrences in dict5.items():
        prefix3 = five_gram[:3]
        dict3[prefix3] = dict3.get(prefix3, 0) + occurrences
def calculateResidual(grams7, dict5, dict3):
    '''
    Count the trailing 5-grams and 3-grams that prefix-folding cannot reach.

    Taking the first five items of every 7-gram yields every 5-gram of a
    sequence except the last two, and taking the first three items of every
    5-gram yields every 3-gram except the last two. This function adds those
    four missing n-grams, derived from `grams7`, the last 7-gram of the
    sequence:

      * 5-grams: grams7[1:6] and grams7[2:7]
      * 3-grams: items [1:4] and [2:5] of the LAST 5-gram, grams7[2:7]

    `dict5` and `dict3` are updated in place; nothing is returned.

    If `grams7` has fewer than 7 items (the sequence was too short to contain
    any 7-gram) nothing is counted, so no empty-tuple keys are inserted.
    '''
    if len(grams7) < 7:
        return
    last5 = grams7[2:7]
    second_last5 = grams7[1:6]
    for gram in (last5, second_last5):
        dict5[gram] = dict5.get(gram, 0) + 1
    # The trailing 3-grams must come from the last 5-gram. Slicing the
    # second-to-last one instead would count one 3-gram twice and miss the
    # final 3-gram of the sequence.
    for gram in (last5[2:5], last5[1:4]):
        dict3[gram] = dict3.get(gram, 0) + 1
def getFeatures(dict, path):
    '''
    Select the most frequent 30% of the entries of a counter and save them.

    `dict` must provide `most_common` (the callers pass collections.Counter
    objects). The number of entries kept is int(0.3 * len(dict)). The selected
    (n-gram, count) pairs are written to `path` through writeFile and are also
    returned as a list.
    '''
    keep = int(0.3*len(dict))
    top_entries = dict.most_common(keep)
    # persist the selection before handing it back to the caller
    writeFile(path, top_entries)
    return top_entries
def getData(file):
    '''
    Read `file` as text and return its whitespace-separated tokens as a list.

    If opening or reading fails with errno EISDIR the error is ignored and an
    empty list is returned; any other IOError is re-raised.
    '''
    try:
        with open(file) as handle:
            contents = handle.read()
    except IOError as err:
        if err.errno != errno.EISDIR:
            raise
        # nothing could be read, so there are no tokens
        contents = ""
    return contents.split()
def remFile(file):
    '''
    Delete `file` if it exists and is a regular file; otherwise do nothing.
    '''
    if not os.path.isfile(file):
        return
    os.remove(file)
def writeFile(file,data):
    '''
    Write every item of `data` to `file`, one str(item) per line.

    An existing regular file at that path is removed first (via remFile), so
    the result contains only the items passed in this call.
    '''
    remFile(file)
    # the context manager closes the handle even if a write raises
    with open(file,'a') as filewrite:
        for tmp in data:
            filewrite.write(str(tmp)+"\n")
def validateGeneral_Auxillary(Features, Dictionary, Filewrite):
    '''
    Write the count of each feature n-gram to `Filewrite` as "<count>,".

    `Features` is an iterable whose items carry the n-gram at index 0.
    For every item the count stored for that n-gram in `Dictionary` is
    written, or 0 when the n-gram is not a key of `Dictionary`. No newline is
    written; the caller terminates the row.
    '''
    for entry in Features:
        ngram = entry[0]
        occurrences = Dictionary.get(ngram, 0)
        Filewrite.write(str(occurrences) + ',')
def _writeFeatureRows(prefix, files):
    '''
    Write one row of feature counts per input file to <prefix>7.txt,
    <prefix>5.txt and <prefix>3.txt.

    For every path in `files` the 7/5/3-grams of that file are computed, and
    the count of each n-gram listed in the module-level Features7 / Features5 /
    Features3 is written as a comma-separated row to the matching output file.
    Existing output files are removed first.
    '''
    path7 = prefix + "7.txt"
    path5 = prefix + "5.txt"
    path3 = prefix + "3.txt"
    # start from empty output files
    remFile(path7)
    remFile(path5)
    remFile(path3)
    # the context managers close all three handles even if a file fails midway
    with open(path7,'a') as filewrite7, open(path5,'a') as filewrite5, open(path3,'a') as filewrite3:
        for file in files:
            # n-gram counts of the current file only
            dict7 = {}
            dict5 = {}
            dict3 = {}
            data = getData(file)
            # the 7-grams are counted directly; the last one is kept for the residuals
            grams7 = calculate_7Grams(data, dict7)
            # trailing 5/3-grams that prefix-folding does not produce
            calculateResidual(grams7, dict5, dict3)
            # remaining 5-grams from the 7-grams and 3-grams from the 5-grams
            calcualte_Grams(dict7, dict5, dict3)
            # one row per input file, one column per feature
            validateGeneral_Auxillary(Features7, dict7, filewrite7)
            validateGeneral_Auxillary(Features5, dict5, filewrite5)
            validateGeneral_Auxillary(Features3, dict3, filewrite3)
            filewrite3.write('\n')
            filewrite5.write('\n')
            filewrite7.write('\n')


def validateGenerate():
    '''
    It will generate the frequency counts for all the files of the Validation Dataset for all the 3/5/7-tupples

    Output goes to VALIDATE/{7,5,3}.txt for the files matching
    ADFA-LD/Validation_Data_Master/*.txt, and to VALIDATE/<Attack>{7,5,3}.txt
    for folders 8 to 10 of every attack in AttackList. The module-level
    AttackList, Features7, Features5 and Features3 must be defined before this
    function is called. This function does not create the VALIDATE directory.
    '''
    print("\nFinding the frequency of features in the Validation Dataset.")
    _writeFeatureRows("VALIDATE/", glob('ADFA-LD/Validation_Data_Master/*.txt'))
    # For the Validation 30% of the Attack Dataset
    for Attack in AttackList:
        files = []
        # For each folder(8-10) of the Attack
        for foldernum in range(8,11):
            files.extend(glob('ADFA-LD/Attack_Data_Master/' + Attack + '_' + str(foldernum) + '/*.txt'))
        _writeFeatureRows("VALIDATE/" + Attack, files)
def displayTimeMessage():
    '''
    Print the seconds elapsed since the previous checkpoint, then a separator line.

    The module-level `a` holds the previous checkpoint and must be set before
    the first call. `b` is set to the current time, and `a` is then moved
    forward to `b` so the next call measures from this point.
    '''
    global a,b
    b = datetime.now()
    elapsed_seconds = (b-a).total_seconds()
    print("\n\tTime Taken = " + str(elapsed_seconds) + " seconds")
    print("############################################################################################################################")
    a = b
# GLOBAL VARIABLES
# AttackList is the list of the different attacks
AttackList = ["Adduser" , "Hydra_FTP" , "Hydra_SSH" , "Java_Meterpreter" , "Meterpreter" , "Web_Shell"]

# 2-D array of counters for storing the grams of the corresponding attacks
# grams_k[i][j] holds the k-grams of the i'th attack's training folder <Attack>_(j+1)
# j varies from 0 to 6 (folders 1 to 7), i indexes AttackList
grams_7 = [[Counter() for x in range(7)] for y in range(len(AttackList))]
grams_5 = [[Counter() for x in range(7)] for y in range(len(AttackList))]
grams_3 = [[Counter() for x in range(7)] for y in range(len(AttackList))]

print("\nCalculating all the 3/5/7 Grams for all the 6 Attacks.")
InitialTime = datetime.now()
# `a` is the checkpoint that displayTimeMessage() measures from
a = InitialTime

# For each attack in the AttackList
for AttackIterator,Attack in enumerate(AttackList):
    # For each folder(1-7) of the Attack
    for foldernum in range(0,7):
        # path of the folder
        path = 'ADFA-LD/Attack_Data_Master/' + Attack + '_' + str(foldernum+1) + '/*.txt'
        # all the files in this folder
        files = glob(path)
        # for each file in this folder
        for file in files:
            # the following dictionaries will store the corresponding n-grams of the current file
            dict7 = {}
            dict5 = {}
            dict3 = {}
            # reading the data of the file and storing in the variable "attackdata"
            attackdata = getData(file)
            # finding and storing the 7 grams of 'attackdata' in the dictionary 'dict7'
            grams7 = calculate_7Grams(attackdata, dict7)
            # the last (residual) 5/3 grams have to be calculated separately
            calculateResidual(grams7, dict5, dict3)
            # the rest of the 5-grams from the 7-grams and of the 3-grams from the 5-grams
            calcualte_Grams(dict7, dict5, dict3)
            # Add the counts of this file to the counters of its folder.
            # update() adds in place; `counter + Counter(d)` would rebuild the
            # whole accumulated counter for every single file.
            grams_7[AttackIterator][foldernum].update(dict7)
            grams_5[AttackIterator][foldernum].update(dict5)
            grams_3[AttackIterator][foldernum].update(dict3)
displayTimeMessage()

# Calculating the 3/5/7 grams for the normal files
print("\n\nCalculating all the 3/5/7 Grams for Normal ")
# every file of the normal training data
path = 'ADFA-LD/Training_Data_Master/*.txt'
files = glob(path)
# the 3/5/7 grams for the normal usage are accumulated over all files in these counters
dict7_normal = Counter()
dict5_normal = Counter()
dict3_normal = Counter()
for file in files: # for each normal file data
    # reading the data of the file and storing in the variable "normalfile"
    normalfile = getData(file)
    # calculate the 7 grams of 'normalfile' and store in the counter 'dict7_normal'
    grams7 = calculate_7Grams(normalfile, dict7_normal)
    # the last 5/3 grams of each file have to be calculated separately
    calculateResidual(grams7, dict5_normal, dict3_normal)
# Fold the accumulated 7-grams into 5-grams and 3-grams exactly once, after
# all files were read: the counters span every file, so folding inside the
# loop would add the counts of earlier files again for each later file.
calcualte_Grams(dict7_normal, dict5_normal, dict3_normal)
displayTimeMessage()

print("\n\nCalculating the features for all 3/5/7 Grams for Normal (i.e. Top 30%) and writing to files.")
Final7_normal = getFeatures(dict7_normal, 'TRAINING/NORMAL/top30%7tupple.txt')
Final5_normal = getFeatures(dict5_normal, 'TRAINING/NORMAL/top30%5tupple.txt')
Final3_normal = getFeatures(dict3_normal, 'TRAINING/NORMAL/top30%3tupple.txt')
displayTimeMessage()

print("\n\nCalculating the features(i.e. Top 30%) for all 3/5/7 Grams for all 6 Attacks and writing to files.")
# Final_k[i] is the top 30% list of (k-gram, count) pairs of the i'th attack
Final7 = []
Final5 = []
Final3 = []
for i in range(len(AttackList)):
    # first adding the results of the different folders of this attack
    total7 = Counter()
    total5 = Counter()
    total3 = Counter()
    for j in range(0,7): # 7 folders of each attack
        total7.update(grams_7[i][j])
        total5.update(grams_5[i][j])
        total3.update(grams_3[i][j])
    Final7.append(getFeatures(total7, 'TRAINING/ATTACK-' + str(i+1) + '/top30%_7tupple.txt'))
    Final5.append(getFeatures(total5, 'TRAINING/ATTACK-' + str(i+1) + '/top30%_5tupple.txt'))
    Final3.append(getFeatures(total3, 'TRAINING/ATTACK-' + str(i+1) + '/top30%_3tupple.txt'))

# The feature sets are the union of the top lists of all attacks and of the
# normal data, added in that order.
# NOTE(review): the members are (n-gram, count) pairs, so an n-gram that is in
# several top lists with different counts is kept once per distinct count and
# produces several identical columns in the validation output - confirm
# whether that is intended.
Features7 = set()
Features5 = set()
Features3 = set()
for top7 in Final7 + [Final7_normal]:
    Features7.update(top7)
for top5 in Final5 + [Final5_normal]:
    Features5.update(top5)
for top3 in Final3 + [Final3_normal]:
    Features3.update(top3)
print("Total 3 - Features = ", len(Features3))
print("Total 5 - Features = ", len(Features5))
print("Total 7 - Features = ", len(Features7))
displayTimeMessage()

# Generating validation data
validateGenerate()
displayTimeMessage()

FinalTime = datetime.now()
print("\n############################################################################################################################")
print("\nTOTAL TIME TAKEN BY THE SCRIPT : " + str((FinalTime-InitialTime).total_seconds()))