-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathformatter.py
462 lines (387 loc) · 16 KB
/
formatter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
"""
Tool to make extraction of raw binary data easier
Copyright (C) 2016 Liam Brandt <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
"""
This tool is intended to make extraction of data from raw binary files
easier. Any text file contained in ./formats can be used to describe the
format of a binary file, and pointers to each point of data will be
stored in a dictionary. To get the data from a binary file using this
reference dictionary, call get_raw(bin_list, bin_file) where bin_list
is the 2 item list found in the reference dictionary for the piece of
data you want to get. bin_file is the binary file you are extracting the
data from.
Problems:
* Cannot do nested IF statements
* Cannot handle nested parentheses in get_dynamic_number()
* INDEX needs to be made respective to each CHUNK, not just stored in VARS
* Fix PEMDAS in get_dynamic_number()
"""
import struct
import time
import os
import traceback
from collections import OrderedDict
#runtime switches read by the helpers in this module
SETTINGS = {
    #when True, trace(), trace_error() and chunk_trace() print to the console
    "trace": False,
    #when True, unpack() checks for end of file before every read
    "safe_debug": True,
}
#chunk name -> index of the chunk currently being read in its CHUNK loop
INDICES = {}
#flat dictionary of variables previously defined
#(variable name -> bin_list, plus "INDEX" for the current chunk index)
VARS = {}
#MARKER name -> format file offset to seek to on GOTO
MARKERS = {}
#set to True when reading hits the end of the binary file; interpret_chunk()
#stops as soon as it sees this flag, get_formatted_data() resets it
ABORT = False
def trace(text):
    """Write text to the console, but only while SETTINGS["trace"] is enabled."""
    if not SETTINGS["trace"]:
        return
    print(text)
def trace_error():
    """Dump the traceback of the exception being handled when tracing is enabled."""
    if not SETTINGS["trace"]:
        return
    traceback.print_exc()
def shorten_vowels(text, length):
    """
    Return text squeezed into a bracketed label that is exactly length wide.

    The text is lower-cased, then the letters y, u, o, i, e, a are stripped
    one letter at a time (in that order) until the text fits. Text that is
    still too long is cut off, text that is too short is padded with spaces.
    """
    shortened = text.lower()
    for letter in "yuoiea":
        #stop dropping letters as soon as the text fits
        if len(shortened) <= length:
            break
        shortened = shortened.replace(letter, "")
    #the slice truncates overlong text, ljust pads text that is too short
    fitted = shortened[:length].ljust(length)
    return "[" + fitted + "]"
def chunk_trace(text, layer):
    """Trace text prefixed with a fixed-width label built from the chunk layer name."""
    if not SETTINGS["trace"]:
        #skip building the label when nothing would be printed
        return
    label = shorten_vowels(layer, 12)
    trace(label + " " + text)
def unpack(bin_file, data_type, length_arg=0, endianness="@"):
    """
    Use struct.unpack() to load a single value from the binary file.

    Keyword arguments:
    length_arg -- the length of the data in bytes, used only for strings
    endianness -- struct byte order prefix, applied to integers, shorts
                  and floats

    data_type is a one character struct format code: "i"/"I" (4 bytes),
    "h"/"H" (2 bytes), "f" (4 bytes), "s" (length_arg bytes, decoded as
    UTF-8), "c" (1 byte, returned as bytes) or "b"/"B" (1 byte).

    Returns the unpacked value. Returns the list ["EOF"] and sets ABORT when
    SETTINGS["safe_debug"] is on and the read position is at or past the end
    of the file. Returns None for an unknown data_type.
    """
    global ABORT
    if SETTINGS["safe_debug"]:
        current_offset = bin_file.tell()
        bin_file.seek(0, 2)
        end_offset = bin_file.tell()
        #always restore the read position before deciding what to do
        bin_file.seek(current_offset)
        #">=" also catches positions beyond the end of the file, which a
        #SEEK or SEEKREL in the format can produce
        if current_offset >= end_offset:
            #trying to read bytes that dont exist, we are at end of file!
            ABORT = True
            return ["EOF"]
    #integer or unsigned integer
    if data_type in ("i", "I"):
        return int(struct.unpack(endianness + data_type, bin_file.read(4))[0])
    #short or unsigned short
    if data_type in ("h", "H"):
        return int(struct.unpack(endianness + data_type, bin_file.read(2))[0])
    #float
    if data_type == "f":
        return float(struct.unpack(endianness + data_type, bin_file.read(4))[0])
    #string
    if data_type == "s":
        raw_string = struct.unpack(str(length_arg) + data_type, bin_file.read(length_arg))[0]
        return raw_string.decode("UTF-8")
    #char
    if data_type == "c":
        return struct.unpack(data_type, bin_file.read(1))[0]
    #byte or unsigned byte
    if data_type in ("b", "B"):
        return int(struct.unpack(data_type, bin_file.read(1))[0])
    trace("UNKNOWN UNPACK DATA TYPE: " + str(data_type))
    return None
def _find_top_level_operator(expression, operators):
    """Return the index of the rightmost binary operator outside parentheses, or -1."""
    depth = 0
    found = -1
    #last non-space character seen, used to tell "a-b" from a unary "-b"
    previous = ""
    for position, char in enumerate(expression):
        if char == "(":
            depth += 1
        elif char == ")":
            depth -= 1
            if depth < 0:
                raise ValueError("unbalanced parentheses in expression: " + expression)
        elif depth == 0 and char in operators:
            #a sign at the start or right after another operator is unary
            if previous and previous not in "+-*/":
                found = position
        if not char.isspace():
            previous = char
    if depth != 0:
        raise ValueError("unbalanced parentheses in expression: " + expression)
    return found


def get_dynamic_number(var, chunk, bin_file):
    """
    Get and return an integer based on a math expression and or variables.

    If var is just an integer in string form, return the integer. Otherwise
    evaluate var as a mathematical expression made of +, -, *, / and
    parentheses (which may be nested). Multiplication and division bind
    tighter than addition and subtraction, and operators of equal precedence
    are applied left to right. Every intermediate result is truncated to an
    integer.

    Names that are not numbers are resolved as follows:
    "INDEX"       -- the value stored under "INDEX" in chunk
    "INDEX:name"  -- the current index of the chunk called name (INDICES)
    anything else -- a variable defined previously in the format file; its
                     bin_list is looked up in chunk and read with get_raw()

    Raises ValueError for an empty expression or unbalanced parentheses,
    KeyError for an unknown variable and ZeroDivisionError for a division
    by zero.
    """
    try:
        return int(var)
    except (TypeError, ValueError):
        #not a plain integer, evaluate it as an expression below
        pass
    expression = str(var).strip()
    if not expression:
        raise ValueError("cannot evaluate an empty expression")
    #split on the lowest precedence operator first; taking the rightmost
    #occurrence makes "10-2-3" evaluate as "(10-2)-3"
    for operators in ("+-", "*/"):
        position = _find_top_level_operator(expression, operators)
        if position == -1:
            continue
        left = get_dynamic_number(expression[:position], chunk, bin_file)
        right = get_dynamic_number(expression[position + 1:], chunk, bin_file)
        operator = expression[position]
        if operator == "+":
            number = left + right
        elif operator == "-":
            number = left - right
        elif operator == "*":
            number = left * right
        else:
            number = left / right
        return int(number)
    #no operator outside parentheses is left, so unwrap one pair
    if expression[0] == "(" and expression[-1] == ")":
        return get_dynamic_number(expression[1:-1], chunk, bin_file)
    #unary sign in front of a variable or a parenthesized expression
    if expression[0] == "-":
        return -get_dynamic_number(expression[1:], chunk, bin_file)
    if expression[0] == "+":
        return get_dynamic_number(expression[1:], chunk, bin_file)
    #all math symbols have been evaluated, now evaluate variable values
    if expression == "INDEX":
        number = chunk[expression]
        trace("FOUND 'INDEX' of " + str(number))
    elif "INDEX:" in expression:
        number = INDICES[expression.split(":")[1]]
        trace("FOUND 'INDEX:' of " + str(number))
    else:
        number = get_raw(chunk[expression], bin_file)
    return int(number)
def _skip_chunk_format(format_file):
    """Advance format_file past the END line that closes the CHUNK just entered."""
    depth = 0
    while True:
        raw_line = format_file.readline()
        if not raw_line:
            #end of the format file, there is nothing left to skip
            return
        words = raw_line.split()
        if not words:
            continue
        if words[0] == "CHUNK":
            depth += 1
        elif words[0] == "END":
            if depth == 0:
                return
            depth -= 1


def interpret_chunk(format_file, bin_file, layer):
    """
    Interpret binary data from bin_file using the format from format_file.

    Read lines from format_file, starting at its current position. For each
    line, identify what the line is telling interpret_chunk() to do, and do
    it. This could mean anything from carrying out an IF statement, to
    jumping to an offset, to creating a new chunk inside of this chunk by
    calling interpret_chunk() recursively. Any other line defines a piece of
    data: "name form [length]" stores the tuple (form, offset) under name in
    both the returned chunk and VARS.

    layer is the name of this chunk, used only for trace output.

    Returns (chunk, flags). chunk is an OrderedDict of the data found in
    this chunk. flags["return"] is True when the chunk stopped because of a
    RETURN line, the ABORT flag, an unreadable value or an exception, and
    False when it stopped at an END line or at the end of the format file.
    """
    chunk_trace("<<<NEW CHUNK<<<", layer)
    nested_ifs = 0
    #IF depth at which a false condition started the skipping of lines,
    #None while lines are being interpreted normally
    skip_from_depth = None
    chunk = OrderedDict()
    flags = {
        "return": False,
    }
    try:
        #read lines in this chunk
        while True:
            if ABORT:
                chunk_trace(">>>END CHUNK by error: ABORT>>>", layer)
                flags["return"] = True
                return chunk, flags
            raw_line = format_file.readline()
            if not raw_line:
                #readline() only returns "" at the end of the file; without
                #this check a format with a missing END would loop forever
                chunk_trace(">>>END CHUNK by end of format file>>>", layer)
                return chunk, flags
            line = raw_line.lstrip()
            line_list = line.split()
            #ignore comments and blank lines
            if line.startswith("#") or not line_list:
                chunk_trace("COMMENT/BLANK", layer)
                continue
            keyword = line_list[0]
            #IF statement
            if keyword == "IF":
                chunk_trace("IF", layer)
                nested_ifs += 1
                if skip_from_depth is not None:
                    #inside a skipped block: count the IF so that its ENDIF
                    #is matched, but do not evaluate it
                    continue
                if line_list[1].startswith("INDEX") and "/" in line_list[1]:
                    #"INDEX/name" compares the current index of chunk name
                    left = INDICES[line_list[1].split("/")[1]]
                else:
                    left = get_dynamic_number(line_list[1], VARS, bin_file)
                boolean = (left == get_dynamic_number(line_list[2], VARS, bin_file))
                chunk_trace("If resolved to " + str(boolean), layer)
                if not boolean:
                    skip_from_depth = nested_ifs
                continue
            #Exit IF statement
            if keyword == "ENDIF":
                chunk_trace("ENDIF", layer)
                nested_ifs -= 1
                #stop skipping once the IF that started the skip is closed
                if skip_from_depth is not None and nested_ifs < skip_from_depth:
                    skip_from_depth = None
                continue
            if skip_from_depth is not None:
                chunk_trace("WAIT FOR ENDIF, nested_ifs: " + str(nested_ifs), layer)
                continue
            #skip over pattern definitions
            if line.startswith("@"):
                chunk_trace("PATTERN DEFINITION", layer)
                continue
            #exit, the chunk is done being read
            if keyword == "END":
                chunk_trace("END", layer)
                chunk_trace(">>>END CHUNK>>>", layer)
                return chunk, flags
            if keyword == "RETURN":
                chunk_trace("RETURN", layer)
                chunk_trace(">>>END CHUNK by return>>>", layer)
                flags["return"] = True
                return chunk, flags
            #GOTO a MARKER
            if keyword == "GOTO":
                chunk_trace("GOTO", layer)
                format_file.seek(MARKERS[line_list[1]])
                continue
            #MARKER to jump to from GOTO
            if keyword == "MARKER":
                chunk_trace("MARKER", layer)
                continue
            #KILL for debugging
            if keyword == "KILL":
                chunk_trace("KILL", layer)
                time.sleep(1000)
                #no continue: the line then falls through to the data
                #handling at the bottom of the loop, as it always has
            #CHUNK
            if keyword == "CHUNK":
                chunk_trace("CHUNK", layer)
                chunk_name = line_list[1]
                chunk[chunk_name] = []
                #reference offset for the beginning of the chunk format
                format_reference_offset = format_file.tell()
                #a failure here ends this chunk through the handler below
                #instead of continuing with a stale or undefined count
                num_chunks = get_dynamic_number(line_list[2], VARS, bin_file)
                trace("num_chunks: " + str(num_chunks))
                #skip to the end of this format chunk if there are no chunks to be read
                if num_chunks <= 0:
                    _skip_chunk_format(format_file)
                for chunk_index in range(num_chunks):
                    #Set the index with this chunk name to be the index we are on in the loop.
                    #This is used for IF statements that need to know what the chunk INDEX is.
                    INDICES[chunk_name] = chunk_index
                    trace("INDICIES --- " + str(INDICES))
                    VARS["INDEX"] = chunk_index
                    #go back to beginning of chunk format instructions for every chunk we read
                    format_file.seek(format_reference_offset)
                    trace("seeking to: " + str(format_reference_offset))
                    chunk_trace("***" + chunk_name + "*** INDEX: " + str(INDICES[chunk_name]) + "/" + str(num_chunks), layer)
                    new_chunk, new_flags = interpret_chunk(format_file, bin_file, chunk_name)
                    #add new child chunks to this parent chunk
                    chunk[chunk_name].append(new_chunk)
                    if new_flags["return"]:
                        trace("chunk RETURNed, chunk_index: " + str(chunk_index))
                        break
                    trace("chunk ENDed, chunk_index: " + str(chunk_index))
                trace("end of all chunks: ")
                continue
            #SKIP bytes
            if keyword == "SKIP":
                chunk_trace("SKIP", layer)
                bin_file.read(get_dynamic_number(line_list[1], VARS, bin_file))
                continue
            #SEEK offset
            if keyword == "SEEK":
                chunk_trace("SEEK", layer)
                bin_file.seek(get_dynamic_number(line_list[1], VARS, bin_file))
                continue
            #SEEKREL offset
            if keyword == "SEEKREL":
                chunk_trace("SEEKREL", layer)
                bin_file.seek(get_dynamic_number(line_list[1], VARS, bin_file), 1)
                continue
            #normal line
            chunk_trace("UNPACK DATA", layer)
            if len(line_list) > 2:
                form = line_list[1] + str(get_dynamic_number(line_list[2], VARS, bin_file))
            else:
                form = line_list[1]
            bin_list = (form, bin_file.tell())
            chunk[keyword] = bin_list
            VARS[keyword] = bin_list
            #advance in bin_file for correct offset to be stored in bin_list
            data = get_raw(bin_list, bin_file, False)
            if data == "ERROR":
                chunk_trace(">>>END CHUNK by error: data is 'ERROR'>>>", layer)
                flags["return"] = True
                return chunk, flags
    except Exception:
        #Exception instead of a bare except, so that KeyboardInterrupt and
        #SystemExit still stop the program
        trace_error()
        chunk_trace(">>>END CHUNK by error: exception in interpret_chunk()>>>", layer)
        flags["return"] = True
        return chunk, flags
def get_raw(bin_list, bin_file, return_to_pos=True):
    """
    Unpack and return binary data from bin_file using bin_list.

    Keyword arguments:
    return_to_pos -- whether or not to return to the offset of bin_file
                     before the get_raw() was called

    bin_list is a 2 item sequence (form, offset). form is a one character
    data type for unpack(), optionally preceded by one of the byte order
    characters "=", "<" or ">" and optionally followed by a length, for
    example "<i" or "s16". offset is the position in bin_file to read from.

    Returns the unpacked value, or the string "ERROR" when the value could
    not be read (an exception was raised or the end of the file was hit).
    In that case the module level ABORT flag is set as well.
    """
    #without this declaration the assignments below would only create a
    #local variable and the module level flag would never be set
    global ABORT
    form = bin_list[0]
    offset = bin_list[1]
    old_offset = bin_file.tell()
    bin_file.seek(offset)
    #strip endianness from form
    endianness = "@"
    if form[0] in ("=", "<", ">"):
        endianness = form[0]
        form = form[1:]
    try:
        if len(form) > 1:
            length_arg = int(form[1:])
            raw = unpack(bin_file, form[:1], length_arg, endianness=endianness)
        else:
            raw = unpack(bin_file, form[:1], endianness=endianness)
    except Exception:
        trace("get_raw() returned ERROR!")
        trace_error()
        raw = "ERROR"
        ABORT = True
    #hit end of file
    if isinstance(raw, list) and raw and raw[0] == "EOF":
        raw = "ERROR"
        ABORT = True
    if return_to_pos:
        bin_file.seek(old_offset)
    return raw
def get_formatted_data(bin_file, format_name, pattern_name):
    """
    Return a structured dictionary of data about values contained in bin_file.

    Open the format file ./formats/<format_name> and find the pattern to
    start from, which is the line whose first word is "@" + pattern_name.
    Return a dictionary with values that are tuples that represent what type
    of data is stored at a certain offset in bin_file, and keys that are
    descriptions of that data.

    Resets the module level ABORT flag and rebuilds MARKERS before
    interpreting. The format file is closed again before returning.

    Raises ValueError when the format file has no pattern named
    pattern_name.
    """
    global MARKERS
    global ABORT
    ABORT = False
    #the with block closes the format file even if interpreting fails
    with open("./formats/" + format_name, "r") as format_file:
        format_file.seek(0, os.SEEK_END)
        format_file_size = format_file.tell()
        format_file.seek(0)
        #Find all of the MARKER statements in the format file, so they can be
        # jumped to later.
        MARKERS = {}
        while True:
            raw_line = format_file.readline()
            #the last line of the file is never registered as a MARKER,
            #there would be nothing after it to jump to
            if not raw_line or format_file.tell() == format_file_size:
                break
            words = raw_line.split()
            if words and words[0] == "MARKER":
                MARKERS[words[1]] = format_file.tell()
        trace(MARKERS)
        #jump to the specific pattern given
        format_file.seek(0)
        pattern_header = "@" + pattern_name
        while True:
            raw_line = format_file.readline()
            if not raw_line:
                #end of file reached; searching on would never terminate
                raise ValueError(
                    "pattern '" + pattern_name + "' not found in format '" + format_name + "'"
                )
            words = raw_line.split()
            if words and words[0] == pattern_header:
                break
        data, flags = interpret_chunk(format_file, bin_file, "GLOBAL")
    return data