-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbulkextractor.py
136 lines (110 loc) · 4.96 KB
/
bulkextractor.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
#
import volatility.timefmt as timefmt
import volatility.obj as obj
import volatility.utils as utils
import volatility.commands as commands
import volatility.win32.tasks as tasks
import os
import os.path
import re
import HTMLParser
import lxml.html
import time
import datetime
import json
from hashlib import sha1
import tempfile
import binascii
#pylint: disable-msg=C0111
class BulkExtractor(commands.Command):
    """Retrieve specific artifacts from a memory image.

    The plugin selects the process whose PID matches the ``PID`` option,
    dumps its readable memory pages as ASCII text, runs regular
    expressions over that text and writes the matches (URLs and e-mail
    addresses) into a freshly created, timestamped ``bulk_extractor*``
    subdirectory of the current working directory.

    NOTE(review): the file imports the Python 2 ``HTMLParser`` module, so
    this code is written to stay compatible with Python 2.
    """

    # Pattern used by extract_urls(): http/https URLs.
    _URL_REGEX = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
    # Pattern used by extract_emails(). It has no capture groups, so
    # re.findall() returns the full matched address as a string.
    _EMAIL_REGEX = r'[\w\-][\w\-\.]+@[\w\-][\w\-\.]+[a-zA-Z]{1,4}'

    def __init__(self, config, *args, **kwargs):
        """Initialise the command and register the ``PID`` option."""
        commands.Command.__init__(self, config, *args, **kwargs)
        # NOTE(review): the help text mentions a comma-separated list, but
        # the option is declared with type 'int', i.e. a single PID.
        config.add_option('PID', short_option = 'p', default = None,
                          help = 'Operate on these Process IDs (comma-separated) rather than all browser processes',
                          action = 'store', type = 'int')

    def calculate(self):
        """Yield every process whose PID equals the configured ``PID``.

        This is a generator: the address space is loaded and the banner
        is printed only once the first item is requested. When no PID was
        given (``None``) no process compares equal and nothing is yielded.
        """
        # Load the address space
        addr_space = utils.load_as(self._config)
        print("Bulk Exractor Starting")
        print("Note: data is extracted using regex on a dirty dump of memory and may miss a minor percentage of edge cases.")
        for proc in tasks.pslist(addr_space):
            if self._config.PID == proc.UniqueProcessId:
                yield proc

    def render_text(self, outfd, data):
        """Extract artifacts from the first selected process and save them.

        Parameters:
            outfd: writable file-like object used for status messages.
            data: iterable of processes as produced by calculate(); only
                the first one is processed.

        Raises:
            OSError: if the output directory cannot be created and does
                not already exist.
        """
        # The original called next(data) unguarded, which raised
        # StopIteration when no process matched the requested PID.
        proc = next(iter(data), None)
        if proc is None:
            outfd.write("No process found matching PID {0}\n".format(self._config.PID))
            return

        # Get the memory dump of the process.
        proc_data = self.get_process_data(proc, outfd)

        # Timestamped directory name so repeated runs do not collide.
        time_st = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
        subdirectory = "bulk_extractor" + time_st
        try:
            os.mkdir(subdirectory)
        except OSError:
            # An already existing directory is fine; anything else (e.g.
            # permissions) would make the writes below fail, so surface it.
            if not os.path.isdir(subdirectory):
                raise

        # Write all URLs to a file.
        urls = self.extract_urls(proc_data)
        self.write(subdirectory, "urls.txt", urls)

        # Write all e-mail addresses to a file.
        emails = self.extract_emails(proc_data)
        self.write(subdirectory, "emails.txt", emails)

        # TODO: also write "json.txt" and "IPv4.txt" once extract_json()
        # and extract_ip_addys() return a list of matches; they are
        # currently stubs that return None.

    def get_process_data(self, proc, outfd=None):
        """Return the readable memory of *proc* as one ASCII string.

        Each available page is read, non-ASCII bytes are dropped and NUL
        characters are removed before the page is appended to the result.

        Parameters:
            proc: process object; must provide ``UniqueProcessId``,
                ``ImageFileName``, ``obj_offset`` and
                ``get_process_address_space()``.
            outfd: optional file-like object that receives the
                "Memory Not Accessible" diagnostics in verbose mode. When
                omitted the diagnostics are printed instead.

        Returns:
            The concatenated, cleaned page contents; an empty string when
            the address space reports no pages.
        """
        pid = proc.UniqueProcessId
        print('found browser pid: {0}, {1}'.format(pid,proc.ImageFileName))
        print('Getting process memory dump')

        # Get the process's memory.
        proc_space = proc.get_process_address_space()
        pages = proc_space.get_available_pages()
        if not pages:
            # Previously this path left the result variable unbound.
            return ''

        # Pages are spooled through a temporary file, as in the original
        # implementation; the context manager guarantees the file is
        # closed even if a read fails part-way through.
        with tempfile.TemporaryFile() as spool:
            for page in pages:
                chunk = proc_space.read(page[0], page[1])
                if chunk is None:
                    if self._config.verbose:
                        message = "Memory Not Accessible: Virtual Address: 0x{0:x} File Offset: 0x{1:x} Size: 0x{2:x}\n".format(page[0], proc.obj_offset, page[1])
                        # The original referenced an undefined `outfd`
                        # here, raising NameError instead of reporting.
                        if outfd is not None:
                            outfd.write(message)
                        else:
                            print(message.rstrip("\n"))
                    continue
                text = chunk.decode('ascii','ignore')
                spool.write(text.replace('\x00',''))
            spool.seek(0)
            return spool.read()

    def write(self, subdirectory, filename, data_list):
        """Write *data_list* to ``subdirectory/filename``, one item per line.

        Parameters:
            subdirectory: existing directory to write into.
            filename: name of the file to create (overwritten if present).
            data_list: iterable of strings; joined with newlines.
        """
        path = os.path.join(subdirectory, filename)
        # `with` closes the file; the original had `f.close` without the
        # call parentheses, so the handle was never explicitly closed.
        with open(path, 'w') as out:
            out.write("\n".join(data_list))

    def extract_urls(self, proc_data_string):
        """Return a list of all http/https URLs found in *proc_data_string*."""
        print('Extracting URLs')
        return re.findall(self._URL_REGEX, proc_data_string)

    def extract_emails(self, proc_data_string):
        """Return a list of all e-mail-like strings in *proc_data_string*."""
        print('Extracting emails')
        return re.findall(self._EMAIL_REGEX, proc_data_string)

    def extract_json(self, proc_data_string):
        """Placeholder for JSON extraction; prints a banner, returns None."""
        print("Extracting Json")

    def extract_ip_addys(self, proc_data_string):
        """Placeholder for IPv4 extraction; prints a banner, returns None."""
        print("Extracting IPv4 addresses")