-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfetcher.py
465 lines (329 loc) · 16.5 KB
/
fetcher.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
"""
Fetcher: Webpage Retrieval and Caching System
2021 Eiza Stanford ("Bash Sudo" / "Charky Barky")
Main Script
"""
# >>> STANDARD LIBRARY IMPORTS:
# Datetime
import datetime
from datetime import datetime as datetime_object
from datetime import timedelta as timedelta_object
# Other
import os
import random
import string
import time
# >>> THIRD-PARTY IMPORTS:
# Termcolor
from termcolor import colored
# Fake Useragent
from fake_useragent import UserAgent
# Requests
import requests
# ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
# ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
# --- On-disk locations -------------------------------------------------------
# The tab-separated database file, and the folder holding the cached HTML files.
cacheDatabaseFileName = 'fetcher_db.txt'
cacheFolderName = 'fetcher_cache'

# --- In-memory view of the database file -------------------------------------
# Raw text of the database file, the same text split into lines, and the URL
# (first tab-separated field) of each of those lines. All three are filled in by
# DatabaseFileRead().
cacheDatabaseRead = None
cacheDatabaseReadLines = None
cacheDatabaseReadLinesUrl = None
# Working copy of "cacheDatabaseReadLines"; this is the list that DatabaseFileWrite()
# actually writes back to the database file.
cacheDatabaseModifiedLines = None

# --- Permissions used when creating the folder and files ---------------------
folderMode = 0o755
fileMode = 0o644

# --- Cache file name generation ----------------------------------------------
# Alphabet and length used by GenerateFileName().
fileNameGenerateChars = string.ascii_letters + string.digits
fileNameGenerateLength = 32

# --- Cache bookkeeping -------------------------------------------------------
# Maps a URL to its cache object.
cacheReferenceDict = {}
# Expiration interval (in minutes) used when a caller does not supply one.
cacheExpirationIntervalMinDefault = 1

# --- HTTP ---------------------------------------------------------------------
# A randomized user agent, to avoid 403 Forbidden errors (among other things).
# It is picked once here, at module load, and reused for every request.
requestUserAgentRandomized = UserAgent().random

# --- Debug output -------------------------------------------------------------
# Master switch for DebugPrint(): nothing is printed while this is False.
verboseOutput = False
# ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
# ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
# === === === === === === === === === === === === === === === === === === === === === === === === === ===
# >>> >>> >>> WARNING: THIS CLASS IS NOT DESIGNED TO BE USED TO EASILY AND SEAMLESSLY MAKE CACHE OBJECTS <<< <<< <<<
# COMPLETE "ENOUGH" FOR NOW
class Cache_Item_Object:
    """One cached webpage: its URL, its HTML file on disk, and its database line.

    Attributes:
        url: The page URL; also the first field of the object's database line.
        fileName: Path of the file holding the cached HTML.
        creationDatetimeObject: datetime of the most recent fetch.
        expirationDatetimeObject: datetime after which the cache counts as expired.
        expirationIntervalObject: timedelta added to the creation time on each refresh.
        databaseLineNumber: Zero-based index of this object's line in the database line lists.
        html: The cached page text, or None if nothing has been loaded yet.
    """

    def __init__(self, url, fileName, creationDatetimeObject, expirationDatetimeObject, expirationIntervalMin=None):
        """Set up the object, make sure its cache file exists, and locate its database line.

        Args:
            url: The page URL.
            fileName: Path of the HTML cache file (a falsy value skips all file handling).
            creationDatetimeObject: datetime the cache content was created.
            expirationDatetimeObject: datetime the cache content expires.
            expirationIntervalMin: Expiration interval in minutes. When falsy, the interval
                is derived as (expiration datetime - creation datetime).
        """
        # Attributes taken directly from the parameters.
        self.url = url
        self.fileName = fileName
        self.creationDatetimeObject = creationDatetimeObject
        self.expirationDatetimeObject = expirationDatetimeObject
        self.databaseLineNumber = None
        self.html = None
        # >>> EXPIRATION INTERVAL ATTRIBUTE
        if expirationIntervalMin:
            # Given a number of minutes: build the timedelta from it.
            self.expirationIntervalObject = timedelta_object(minutes=expirationIntervalMin)
        else:
            # Not given: use the gap between the creation and expiration datetimes.
            self.expirationIntervalObject = self.expirationDatetimeObject - self.creationDatetimeObject
        # Immediately create a placeholder file; if one already exists, load its content instead.
        if self.fileName:
            newlyCreated = self.FileCreatePlaceholder()
            if not newlyCreated:
                DebugPrint('HTML cache file already exists; updating object HTML content to cache file', preface='(OBJECT) CACHE %s' % self.url)
                self.FileRead()
        # Prepare this object to modify the database file.
        self.DatabasePrepare()

    def DatabasePrepare(self):
        """Find (or reserve) the database line that belongs to this object's URL.

        Sets self.databaseLineNumber. If the URL is not in the database yet, an empty
        line is appended to the modified-lines buffer and its index is used.
        """
        global cacheDatabaseModifiedLines
        # Compare against the parsed URL column rather than the raw file text: a substring
        # test on the whole file also matches a URL that is merely a prefix of a stored URL,
        # and the .index() lookup below would then raise ValueError.
        if self.url in cacheDatabaseReadLinesUrl:
            # Find the line number with the URL.
            self.databaseLineNumber = cacheDatabaseReadLinesUrl.index(self.url)
        else:
            # No line for this URL yet: append a blank one and point at exactly that entry.
            cacheDatabaseModifiedLines.append('')
            self.databaseLineNumber = len(cacheDatabaseModifiedLines) - 1
        DebugPrint('line number (NOT LIST NUMBER) now %d' % (self.databaseLineNumber + 1), preface='(OBJECT) CACHE %s' % self.url)

    def DatabaseUpdate(self):
        """Rewrite this object's database line and persist the database file."""
        global cacheDatabaseModifiedLines
        # For readability, assign string ISO-formatted timestamps to variables.
        creationDatetimeIso = self.creationDatetimeObject.isoformat()
        expirationDatetimeIso = self.expirationDatetimeObject.isoformat()
        # Line layout: url <TAB> file name <TAB> creation ISO time <TAB> expiration ISO time.
        cacheDatabaseModifiedLines[self.databaseLineNumber] = '%s\t%s\t%s\t%s' % (self.url, self.fileName, creationDatetimeIso, expirationDatetimeIso)
        # Write the buffer out and read it straight back so the read-side lists stay in sync.
        DatabaseFileWriteReadCycle()

    def FileCreatePlaceholder(self):
        """Create an empty cache file.

        Returns:
            True if the file was newly created, False if it already existed.
        """
        try:
            # Mode 'x' fails with FileExistsError instead of truncating an existing file.
            with open(self.fileName, 'x'):
                pass
            os.chmod(self.fileName, fileMode)
        except FileExistsError:
            return False
        return True

    def FileRead(self):
        """Load the cache file's content into self.html."""
        # The encoding is pinned so the result does not depend on the platform's locale.
        with open(self.fileName, 'r', encoding='utf-8') as fileItself:
            self.html = fileItself.read()

    def FileWrite(self):
        """Write self.html to the cache file, replacing its previous content."""
        # The encoding is pinned: with a non-UTF-8 locale default, pages containing
        # characters outside that code page would fail to encode.
        with open(self.fileName, 'w', encoding='utf-8') as fileItself:
            fileItself.write(self.html)

    def CacheContentUpdate(self):
        """Fetch the page again, then refresh the timestamps, the cache file and the database line."""
        # Actually make the request for the website and update the object's html.
        # NOTE(review): no timeout is passed and the response status is not checked, so a
        # stalled server can block this call and error pages are cached like any other body.
        pageRequest = requests.get(self.url, headers={'User-Agent': requestUserAgentRandomized})
        self.html = pageRequest.text
        # Creation time is "now"; expiration = creation + interval.
        self.creationDatetimeObject = datetime_object.now()
        self.expirationDatetimeObject = self.creationDatetimeObject + self.expirationIntervalObject
        # Update the file with the new cache content.
        self.FileWrite()
        # Update the database with the new creation, expiration times, etc.
        self.DatabaseUpdate()

    def ExpirationCheck(self, autoUpdateFile=True, autoUpdateExpirationInterval=None):
        """Report whether the cache has expired, optionally refreshing it.

        Args:
            autoUpdateFile: When True and the cache has expired, re-fetch the page.
            autoUpdateExpirationInterval: If truthy, the new expiration interval in minutes.
                It is applied whether or not the cache has expired.

        Returns:
            True if the cache had expired at the time of the check, otherwise False.
        """
        # Whether the current time has passed the expiration time.
        expired = (datetime_object.now() > self.expirationDatetimeObject)
        # Deliberately outside the "expired" branch, so the interval can be changed
        # even if the cache has not expired yet.
        if autoUpdateExpirationInterval:
            self.ExpirationIntervalChange(autoUpdateExpirationInterval)
        # Update the cache (and file).
        if expired:
            DebugPrint('cache object expired!', important=True, preface='(OBJECT) CACHE %s' % self.url)
            if autoUpdateFile:
                self.CacheContentUpdate()
        return expired

    def ExpirationIntervalChange(self, expirationIntervalMin):
        """Replace the expiration interval with the given number of minutes.

        Only the interval is changed here; the expiration datetime is recomputed the
        next time CacheContentUpdate() runs.
        """
        DebugPrint('changed expiration interval to %s' % str(expirationIntervalMin), preface='(OBJECT) CACHE %s' % self.url)
        self.expirationIntervalObject = timedelta_object(minutes=expirationIntervalMin)
# ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
# ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
# COMPLETE "ENOUGH" FOR NOW
def DebugPrint(string, preface=None, important=False):
    """Print a colored debug message when verbose output is enabled.

    Args:
        string: The message text.
        preface: Optional label printed in bold before the message, followed by ': '.
        important: Selects the yellow/red palette instead of the blue/cyan one.
    """
    # Every debug message is suppressed unless the global switch is on.
    if not verboseOutput:
        return
    # (message color, preface color) for the two importance levels.
    messageColor, prefaceColor = ('yellow', 'red') if important else ('blue', 'cyan')
    output = colored(string, messageColor)
    if preface:
        prefaceText = colored('%s: ' % preface, prefaceColor, attrs=['bold'])
        output = '%s%s' % (prefaceText, output)
    # The extra '\n' argument leaves a blank line after each message.
    print(output, '\n')
# COMPLETE
def GenerateFileName():
    """Return a random name of fileNameGenerateLength characters drawn from fileNameGenerateChars."""
    # SystemRandom draws from the operating system's randomness source.
    secureRandom = random.SystemRandom()
    pickedChars = [secureRandom.choice(fileNameGenerateChars) for _ in range(fileNameGenerateLength)]
    return ''.join(pickedChars)
# COMPLETED "ENOUGH" FOR NOW
def GenerateCacheObject(url, expirationIntervalMin=None):
    """Build a brand-new cache object for a URL, with a freshly generated file name.

    Args:
        url: The page URL.
        expirationIntervalMin: Expiration interval in minutes; a falsy value selects
            cacheExpirationIntervalMinDefault.

    Returns:
        The new Cache_Item_Object.
    """
    # Fall back to the global default when no (or a falsy) interval was given.
    intervalMinutes = expirationIntervalMin if expirationIntervalMin else cacheExpirationIntervalMinDefault
    # Path of the cache file: cache folder + generated name + ".html".
    cachePath = '%s/%s.html' % (cacheFolderName, GenerateFileName())
    # Creation is "now"; expiration = creation + interval.
    createdAt = datetime_object.now()
    expiresAt = createdAt + timedelta_object(minutes=intervalMinutes)
    return Cache_Item_Object(url, cachePath, createdAt, expiresAt, intervalMinutes)
# COMPLETED "ENOUGH" FOR NOW
# === === === === === === === === === === === === === === === === === === === === === === === === === ===
# >>> >>> >>> THIS FUNCTION WILL BE HEAVILY USED BY OTHER SCRIPTS THAT IMPORT THE FETCHER SCRIPT <<< <<< <<<
# >>> >>> >>> IN OTHER WORDS, THIS FUNCTION IS ALMOST THE WHOLE POINT/PURPOSE OF THE FETCHER SCRIPT <<< <<< <<<
def WebpageFetch(url, expirationIntervalMin=None, forceUpdateCache=False):
    """Return the HTML for a URL, served from the cache whenever possible.

    Args:
        url: The page to fetch.
        expirationIntervalMin: Optional expiration interval in minutes. For a URL that is
            already cached, a truthy value replaces the object's interval; the expiration
            datetime itself only moves the next time the content is refreshed.
        forceUpdateCache: When True, an already-cached URL is re-requested regardless
            of whether its cache has expired.

    Returns:
        The page HTML held by the cache object.
    """
    global cacheReferenceDict
    alreadyCached = url in cacheReferenceDict
    logPreface = '(WebpageFetch) CACHE %s' % url

    # Unknown URL: create the object, register it, and fetch its content right away.
    if not alreadyCached:
        DebugPrint('url is NOT FOUND in the list of cache objects: creating new cache object', preface=logPreface, important=True)
        freshObject = GenerateCacheObject(url, expirationIntervalMin=expirationIntervalMin)
        cacheReferenceDict[url] = freshObject
        freshObject.CacheContentUpdate()
        return freshObject.html

    # Known URL: reuse the existing cache object.
    DebugPrint('url is in the list of cache objects (object already exists)', preface=logPreface)
    existingObject = cacheReferenceDict[url]

    # Normal path: serve the cached content, refreshing it first only if it has expired.
    if not forceUpdateCache:
        DebugPrint('returning cached html content (returning new html if expired)', preface=logPreface)
        existingObject.ExpirationCheck(autoUpdateFile=True, autoUpdateExpirationInterval=expirationIntervalMin)
        return existingObject.html

    # Forced path: apply the new interval (if any), then re-request unconditionally.
    DebugPrint('force update cache requested: returning html from cache object, but with request from website just now', preface=logPreface, important=True)
    if expirationIntervalMin:
        existingObject.ExpirationIntervalChange(expirationIntervalMin)
    existingObject.CacheContentUpdate()
    return existingObject.html
# THIS FUNCTION SHOULD ONLY BE USED IN THE VERY BEGINNING OF RUNNING THIS SCRIPT, SPECIFICALLY BEFORE DatabaseFileRead(init=False) DUE TO EXISTING CACHE OBJECTS IN THE REFERENCE
def DatabaseCleanup():
    """Drop expired entries from the database and delete their cache files.

    Reads the database first if no lines are loaded yet. Each line is expected to hold
    four tab-separated fields (url, file name, creation ISO time, expiration ISO time).
    Lines with fewer fields (for example blank lines) are skipped and left in place.
    After the pass, the database file is rewritten and re-read, and every object in
    cacheReferenceDict re-locates its database line.

    Raises:
        ValueError: If an expiration field is not a valid ISO-format datetime.
    """
    if not cacheDatabaseReadLines:
        DatabaseFileRead()
    # Iterate over the read-lines list while deleting from the modified-lines list, so the
    # list being looped over is never the one being shrunk.
    for line in cacheDatabaseReadLines:
        lineSplit = line.split('\t')
        # Skip blank or short lines instead of failing with IndexError on the fields below.
        if len(lineSplit) < 4:
            continue
        url = lineSplit[0]
        fileName = lineSplit[1]
        expirationDatetimeObject = datetime_object.fromisoformat(lineSplit[3])
        # Expired means the current date-time exceeds the entry's expiration date-time.
        if datetime_object.now() > expirationDatetimeObject:
            DebugPrint('removed expired cache file in database (filename="%s")' % fileName, important=True, preface='(DatabaseCleanup) CACHE %s' % url)
            # Remove the corresponding HTML file. A file that is already gone must not
            # abort the cleanup of the remaining entries.
            try:
                os.remove(fileName)
            except FileNotFoundError:
                DebugPrint('cache file was already missing (filename="%s")' % fileName, important=True, preface='(DatabaseCleanup) CACHE %s' % url)
            # Remove the line by value: earlier deletions shift positions in the
            # modified-lines list, so indexes taken from the read-lines list would not match.
            cacheDatabaseModifiedLines.remove(line)
    # At the very end, write the changes and read from the file.
    DatabaseFileWriteReadCycle()
    # Tell the existing cache objects to adjust to the shifted lines in the database file.
    for cacheObject in list(cacheReferenceDict.values()):
        cacheObject.DatabasePrepare()
# COMPLETED "ENOUGH" FOR NOW
def DatabaseFileWrite():
    """Overwrite the database file with the modified-lines buffer, one entry per line."""
    with open(cacheDatabaseFileName, 'w') as databaseFile:
        # Lines are joined with '\n'; no newline is written after the last one.
        serializedLines = '\n'.join(cacheDatabaseModifiedLines)
        databaseFile.write(serializedLines)
# COMPLETED "ENOUGH" FOR NOW
def DatabaseFileRead(init=False):
    """Reload the database file into the module-level read/modified line lists.

    Args:
        init: When True, also create a Cache_Item_Object for every line that contains
            a tab and store it in cacheReferenceDict under its URL.
    """
    global cacheDatabaseRead
    global cacheDatabaseReadLines
    global cacheDatabaseReadLinesUrl
    global cacheDatabaseModifiedLines
    global cacheReferenceDict

    with open(cacheDatabaseFileName, 'r') as databaseFile:
        cacheDatabaseRead = databaseFile.read()

    # An empty file must give truly empty lists, not [''] (which split() would produce).
    cacheDatabaseReadLines = cacheDatabaseRead.split('\n') if cacheDatabaseRead else []
    # The URL is the first tab-separated field of each line.
    cacheDatabaseReadLinesUrl = [entry.split('\t')[0] for entry in cacheDatabaseReadLines]
    # The modified-lines buffer starts out as an independent copy of what was read.
    cacheDatabaseModifiedLines = list(cacheDatabaseReadLines)

    # Cache objects are only built on the first-time (init) read.
    if not init:
        return
    for entry in cacheDatabaseReadLines:
        # Lines without any tab (such as blank lines) carry no entry.
        if '\t' not in entry:
            continue
        fields = entry.split('\t')
        # Field order: url, file name, creation ISO time, expiration ISO time.
        url = fields[0]
        fileName = fields[1]
        createdAt = datetime_object.fromisoformat(fields[2])
        expiresAt = datetime_object.fromisoformat(fields[3])
        DebugPrint('found cache file in database (filename="%s")' % fileName, preface='(DatabaseFileRead) CACHE %s' % url)
        # No interval is passed, so the object derives it from the two datetimes.
        cacheReferenceDict[url] = Cache_Item_Object(url, fileName, createdAt, expiresAt)
# COMPLETED "ENOUGH" FOR NOW
def DatabaseFileWriteReadCycle():
    """Persist the modified-lines buffer, then reload every list from the file."""
    # Write first, so that the read which follows picks up the changes just made.
    DatabaseFileWrite()
    DatabaseFileRead()
# COMPLETED "ENOUGH" FOR NOW
def InitFilesNeededCreate():
    """Create the cache folder and the database file, unless they already exist."""
    # Cache folder.
    try:
        os.mkdir(cacheFolderName, mode=folderMode)
    except FileExistsError:
        DebugPrint('folder "%s" already exists!' % cacheFolderName, important=True)

    # Database file: mode 'x' refuses to touch a file that is already there.
    try:
        open(cacheDatabaseFileName, 'x').close()
    except FileExistsError:
        DebugPrint('file "%s" already exists!' % cacheDatabaseFileName, important=True)
    else:
        # Permissions are only applied to a file that was just created.
        os.chmod(cacheDatabaseFileName, fileMode)
# COMPLETED "ENOUGH" FOR NOW
# === === === === === === === === === === === === === === === === === === === === === === === === === ===
# >>> >>> >>> THIS FUNCTION SHOULD BE CALLED IMMEDIATELY WHEN THIS SCRIPT IS IMPORTED OR RAN! <<< <<< <<<
def InitEverything():
    """Run the full start-up sequence for the fetcher."""
    # 1. Make sure the cache folder and the database file exist.
    InitFilesNeededCreate()
    # 2. Drop expired entries before any cache objects are created.
    DatabaseCleanup()
    # 3. Load the remaining entries and build their cache objects.
    DatabaseFileRead(init=True)
# ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
# ===== ===== ===== ===== ===== ===== ===== ===== ===== =====
def TestProgram():
    """Endless manual test: fetch three example URLs every 4 seconds with a 12-second expiry."""
    bannerLines = (
        '===== ===== ===== ===== ===== ===== ===== ===== ===== =====',
        'THIS TEST FUNCTION WILL DO THE FOLLOWING:',
        '* USE WebpageFetch TO REQUEST 3 EXAMPLE URLS',
        '* HAVE EXPIRATION INTERVALS OF 12 SECONDS',
        '* LOOP EVERY 4 SECONDS TO REPEAT THE PROCESS',
        '* THE EXPECTATION IS THAT EVERY 3 LOOPS (GIVE OR TAKE), THE CACHE WILL EXPIRE AND UPDATE',
        '===== ===== ===== ===== ===== ===== ===== ===== ===== =====',
    )
    for bannerLine in bannerLines:
        DebugPrint(bannerLine, important=True)

    # The interval is expressed in minutes: 0.2 min = 12 seconds.
    intervalMinutes = 0.2
    exampleUrls = [
        'https://www.cnn.com',
        'https://en.wikipedia.org',
        'https://ikea.com/'
    ]
    # Pause before the first round of requests.
    time.sleep(6)
    while True:
        for exampleUrl in exampleUrls:
            WebpageFetch(exampleUrl, intervalMinutes)
        DebugPrint('\n\n(END, TIME: %s)\n\n' % datetime_object.isoformat(datetime_object.now()), important=True)
        time.sleep(4)
        DebugPrint('\n\nLOOPED!\n\n', important=True)
if __name__ != '__main__':
    # Imported by another script: just initialize the cache.
    InitEverything()
else:
    # Run directly: switch on debug output, initialize, then start the test loop.
    verboseOutput = True
    InitEverything()
    TestProgram()