#
# Copyright (C) 2016 University of Southern California.
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License,
# version 2, as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
#
from collections import namedtuple

from bs4 import BeautifulSoup

import critter_settings
from critter_settings import PAGE_LOAD_TIMEOUT, BROWSING_SESSION_TIMEOUT

# A single page id is assigned to a parent page and to its children, recursively.
page_id = 0

# Image file types to link with the parent page.
image_formats = ('.bmp', '.gif', '.jpeg', '.jfif', '.jpg', '.png', '.ppm',
                 '.pgm', '.pbm', '.pnm', '.tiff')


class BuildSession:
    def __init__(self):
        self.sql = critter_settings.init()
        # Initialise every row's session/page links to 0 before rebuilding them.
        self.sql.update_query("""UPDATE ParsedHTTP SET browsing_session_id = %s, page_id = %s""",
                              [(0, 0)])

    def process_table(self):
        """
        Process the packet table, linking rows together with browsing-session and page ids.
        :return: None
        """
        global page_id  # this method advances the module-level page counter
        httptuple = namedtuple('httptuple',
                               ['id', 'timestamp', 'page_id', 'tcp_id', 'browsing_session_id', 'type', 'host', 'url',
                                'referer', 'content_type', 'payload', 'hrefs', 'iframes', 'images'])
        treetuple = namedtuple('tree', ['tcp_id', 'browsing_id', 'host', 'url', 'no_children'])
        timestamp_marker = 0
        while True:
            # Fetch the next unprocessed html row; note that LIKE "html" (no
            # wildcards) only matches rows whose content_type is exactly "html".
            res = self.sql.request_query("""SELECT no, timestmp, page_id, tcp_session_id, browsing_session_id, http_type, host,
                url, referer, content_type, payload, hrefs, iframes, images FROM ParsedHTTP WHERE content_type LIKE %s AND timestmp > %s
                ORDER BY timestmp, tcp_session_id LIMIT 1""", ("html", timestamp_marker))
            if res is None or len(res) == 0:
                return
            row = res[0]
            ht = httptuple(*row)
            timestamp_marker = ht.timestamp
            if ht.browsing_session_id == 0:
                browsing_id = ht.id
            else:
                browsing_id = ht.browsing_session_id
            absolute_url = "http://" + ht.host + ht.url
            href_set = set(ht.hrefs.split(" "))
            iframe_set = set(ht.iframes.split(" "))
            image_set = set(ht.images.split(" "))
            # Candidate children: rows whose referer is this page, or Google search
            # results fetched within the same browsing-session window.
            res = self.sql.request_query(
                """SELECT tcp_session_id, browsing_session_id, host, url, no_children FROM ParsedHTTP WHERE (referer = %s OR referer = %s) AND timestmp BETWEEN %s AND %s""",
                (absolute_url, "https://www.google.com/", float(ht.timestamp),
                 float(ht.timestamp) + BROWSING_SESSION_TIMEOUT))
            to_update = [(browsing_id, ht.tcp_id)]
            # Link html/text parents and their a href/iframe children with a browsing session id.
            for row in res:
                tree = treetuple(*row)
                abs_url = get_abs_url(tree.host, tree.url)
                if tree.browsing_id != 0:
                    # If this page's children already belong to another parent, this
                    # page is itself a child of that earlier parent, so re-link it.
                    if (browsing_id, ht.tcp_id) in to_update:
                        to_update.remove((browsing_id, ht.tcp_id))
                    to_update.append((tree.browsing_id, ht.tcp_id))
                if (abs_url in href_set or abs_url in iframe_set or abs_url in image_set
                        or abs_url.endswith(image_formats)):
                    to_update.append((browsing_id, tree.tcp_id))
            self.sql.update_query("""UPDATE ParsedHTTP SET browsing_session_id = %s WHERE tcp_session_id = %s""",
                                  to_update)
            # Link html/text parents and their iframe/img children with a page id.
            res = self.sql.request_query(
                """SELECT no, page_id, tcp_session_id, host, url FROM ParsedHTTP WHERE referer = %s AND timestmp BETWEEN %s AND %s""",
                (absolute_url, float(ht.timestamp), float(ht.timestamp) + PAGE_LOAD_TIMEOUT))
            if len(res) == 0:
                continue  # No children within the page-load window; try the next row.
            if ht.page_id == 0:
                page_id += 1
                to_update_page_ob = self.build_page(page_id, res, iframe_set, image_set)
                to_update_page_ob.append((page_id, ht.id))
            else:
                to_update_page_ob = self.build_page(ht.page_id, res, iframe_set, image_set)
            self.sql.update_query("""UPDATE ParsedHTTP SET page_id = %s WHERE no = %s""", to_update_page_ob)
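
    # Illustrative shape of the update batches above (hypothetical ids): a parent
    # on tcp_session_id 5 whose iframe was fetched on tcp_session_id 9 produces
    # to_update = [(browsing_id, 5), (browsing_id, 9)] for the browsing-session
    # pass, and to_update_page_ob = [(page_id, no), ...] for the page-id pass.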

    def build_page(self, parent_page_id, fetched_rows, iframe_set, image_set):
        """
        Link pages together based on their iframes and images.
        :param parent_page_id: the page id of the root page
        :param fetched_rows: list of (no, page_id, tcp_session_id, host, url) tuples
        :param iframe_set: set of iframe URLs
        :param image_set: set of image URLs
        :return: list of (page_id, no) pairs to update
        """
        to_update = []
        for no, row_page_id, tcp_id, host, url in fetched_rows:
            abs_url = get_abs_url(host, url)
            if abs_url in iframe_set or abs_url in image_set or abs_url.endswith(image_formats):
                # A non-zero page_id means the iframe/image already belongs to an
                # earlier parent, so leave that row untouched.
                if row_page_id == 0:
                    to_update.append((parent_page_id, no))
        return to_update
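
    # Illustrative call (hypothetical row): with fetched_rows = [(12, 0, 7,
    # "www.example.com", "/ad.html")] and "http://www.example.com/ad.html" in
    # iframe_set, build_page(3, fetched_rows, iframe_set, image_set) returns
    # [(3, 12)], stamping row no 12 with page_id 3.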


class HttpObject:
    """Plain container for one parsed HTTP request/response row."""

    def __init__(self, id, timestamp, tcp_id, browsing_session_id, type, host, url, referer, content_type):
        self.id = id
        self.timestamp = timestamp
        self.tcp_id = tcp_id
        self.browsing_session_id = browsing_session_id
        self.type = type
        self.host = host
        self.url = url
        self.referer = referer
        self.content_type = content_type
        self.href_dict = {}


class HttpUtil:
    """
    Functions in this class parse HTTP request/response payloads using the BeautifulSoup module.
    """

    def __init__(self, payload, host):
        self.soup = BeautifulSoup(payload, "html.parser")
        self.host = host
        self.hrefs = self.parse_href()
        self.iframes = self.parse_iframe()
        self.cnt = 0  # updated by parse_img() below
        self.images = self.parse_img()

    def parse_href(self):
        """
        Parse the <a href> links out of the BeautifulSoup object.
        :return: list of absolute href URLs
        """
        children = []
        for a in self.soup.find_all('a'):
            url = str(a.get('href'))
            # Skip anchors, javascript scrolling links, and secure (https) links,
            # as well as <a> tags that have no href attribute at all.
            if (url in ("None", "") or url.startswith("#") or url.startswith("https")
                    or url.startswith("javascript:void(0)")):
                continue
            children.append(get_abs_url(self.host, url))
        return children

    def parse_iframe(self):
        """
        Parse the iframe sources out of the BeautifulSoup object.
        :return: list of absolute iframe URLs
        """
        iframes = []
        for iframe in self.soup.find_all('iframe'):
            url = str(iframe.get('src'))
            if url.startswith("https"):  # Skip secure links
                continue
            if url not in ("None", ""):
                iframes.append(get_abs_url(self.host, url))
        return iframes

    def parse_img(self):
        """
        Parse the image sources out of the BeautifulSoup object.
        :return: list of absolute image URLs
        """
        images = []
        cnt = 0
        for image in self.soup.find_all('img'):
            url = str(image.get('src'))
            if url.startswith("https"):  # Skip secure links
                continue
            if url not in ("None", ""):
                images.append(get_abs_url(self.host, url))
            cnt += 1  # count every <img> tag seen, linked or not
        self.cnt = cnt
        return images

    def count_img(self):
        """
        Count the <img> tags in the BeautifulSoup object.
        :return: count of images
        """
        return len(self.soup.find_all('img'))

    def is_parent(self):
        """
        Check whether the payload is a full HTML document (i.e. a parent page).
        :return: True if an <html> tag is present
        """
        return self.soup.find('html') is not None
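
# Illustrative HttpUtil usage (hypothetical payload):
#   util = HttpUtil("<html><a href='/a.html'>a</a><img src='/b.png'></html>", "www.example.com")
#   util.hrefs       -> ["http://www.example.com/a.html"]
#   util.images      -> ["http://www.example.com/b.png"]
#   util.is_parent() -> True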


def get_abs_url(host, url):
    """
    Utility function to build an absolute URL from a generic URL.
    :param host: domain name
    :param url: path, protocol-relative URL, or absolute URL
    :return: absolute URL of the form http://www.example.com/...
    """
    if url.startswith("http"):
        res = url
    elif url.startswith("//"):  # Protocol-relative URLs are forced to http
        res = "http:" + url
    elif not url.startswith("/"):
        res = "http://" + host + "/" + url
    else:
        res = "http://" + host + url
    # Normalise "...com?query" to "...com/?query" so URLs compare consistently.
    return res.replace("com?", "com/?", 1)
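
# Illustrative results (hypothetical hosts/paths):
#   get_abs_url("www.example.com", "/index.html")          -> "http://www.example.com/index.html"
#   get_abs_url("www.example.com", "img/logo.png")         -> "http://www.example.com/img/logo.png"
#   get_abs_url("www.example.com", "//cdn.example.com/x")  -> "http://cdn.example.com/x"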


def buildsession_worker():
    """Entry point for a worker that links browsing sessions in ParsedHTTP."""
    session = BuildSession()
    session.process_table()
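

if __name__ == "__main__":
    # Minimal entry-point sketch (assumes critter_settings is configured for the
    # target database); the module may equally be driven by an external caller.
    buildsession_worker()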