-
Notifications
You must be signed in to change notification settings - Fork 48
/
subDownloader.py
76 lines (64 loc) · 2.38 KB
/
subDownloader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import json
import time
import traceback
from datetime import datetime, timezone

import requests
# Name of the subreddit to download from (without the "r/" prefix).
subreddit = "redditdev"
# Pushshift search URL template. The first {} receives the object type and the
# second {} the subreddit; the "before" epoch is appended to the end by the caller.
url = "https://api.pushshift.io/reddit/{}/search?limit=1000&sort=desc&subreddit={}&before="
# Starting point for the backwards walk. This must be timezone-aware: calling
# .timestamp() on the naive value returned by datetime.utcnow() treats it as
# local time, which shifts the starting epoch by the machine's UTC offset.
start_time = datetime.now(timezone.utc)
_RECORD_SEPARATOR = "\n-------------------------------\n"


def _ascii_only(text):
    """Return ``text`` with every non-ASCII character removed."""
    return text.encode(encoding='ascii', errors='ignore').decode()


def _format_comment(comment):
    """Build one comment record: a ``score : YYYY-MM-DD`` line, the body, a separator."""
    # fromtimestamp() without a tz argument renders the date in local time.
    created = datetime.fromtimestamp(comment['created_utc']).strftime("%Y-%m-%d")
    return f"{comment['score']} : {created}\n{_ascii_only(comment['body'])}{_RECORD_SEPARATOR}"


def _format_submission(submission):
    """Build one self-post record: the ASCII-only selftext followed by a separator."""
    return _ascii_only(submission['selftext']) + _RECORD_SEPARATOR


def downloadFromUrl(filename, object_type, max_retries=10):
    """Page through the search endpoint and write each object's text to a file.

    Request URLs are built from the module-level ``url`` template and
    ``subreddit``, with the ``before`` cursor starting at ``start_time``.
    After every page the cursor becomes one second before the
    ``created_utc`` of the last object in that page. Paging stops when a
    response has no ``data`` key or its ``data`` list is empty.

    Args:
        filename: Path of the output file; it is truncated if it exists.
        object_type: ``'comment'`` writes score, date and body for every
            comment. ``'submission'`` writes the selftext of self posts
            only. Any other value writes nothing.
        max_retries: How many consecutive responses that are not valid
            JSON are tolerated before the download stops. Without this
            bound a permanently failing endpoint is retried forever.

    Progress is printed after each page. The printed count covers every
    object fetched, including those that were skipped or failed to format.
    """
    print(f"Saving {object_type}s to {filename}")

    count = 0
    consecutive_failures = 0
    previous_epoch = int(start_time.timestamp())
    # The context manager closes the file even if a request or lookup raises.
    with open(filename, 'w') as handle:
        while True:
            new_url = url.format(object_type, subreddit) + str(previous_epoch)
            response = requests.get(new_url, headers={'User-Agent': "Post downloader by /u/Watchful1"})
            time.sleep(1)  # pushshift has a rate limit, if we send requests too fast it will start returning error messages
            try:
                json_data = response.json()
            except ValueError:
                # ValueError is the common base of the stdlib, simplejson and
                # requests JSON decode errors, so this works whichever one
                # the installed requests version raises.
                consecutive_failures += 1
                if consecutive_failures >= max_retries:
                    print(f"Giving up after {consecutive_failures} undecodable responses from {new_url}")
                    break
                time.sleep(1)
                continue
            consecutive_failures = 0

            if 'data' not in json_data:
                break
            items = json_data['data']
            if not items:
                break

            for item in items:
                previous_epoch = item['created_utc'] - 1
                count += 1
                if object_type == 'comment':
                    try:
                        # The record is formatted in full before writing, so a
                        # malformed comment never leaves a partial entry behind.
                        handle.write(_format_comment(item))
                    except Exception:
                        # Best effort per record: report it and keep going.
                        print(f"Couldn't print comment: https://www.reddit.com{item.get('permalink', '')}")
                        print(traceback.format_exc())
                elif object_type == 'submission':
                    # Only self posts that actually carry text are written.
                    if not item.get('is_self') or 'selftext' not in item:
                        continue
                    try:
                        handle.write(_format_submission(item))
                    except Exception:
                        print(f"Couldn't print post: {item.get('url', '')}")
                        print(traceback.format_exc())

            print("Saved {} {}s through {}".format(count, object_type, datetime.fromtimestamp(previous_epoch).strftime("%Y-%m-%d")))

    print(f"Saved {count} {object_type}s")
# Script entry point: write the text of the subreddit's self posts to posts.txt.
# Uncomment the second call to also write comments to comments.txt.
downloadFromUrl(filename="posts.txt", object_type="submission")
#downloadFromUrl("comments.txt", "comment")