-
Notifications
You must be signed in to change notification settings - Fork 86
/
dump_github_users.py
executable file
·98 lines (79 loc) · 2.68 KB
/
dump_github_users.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# vim: ai ts=4 sts=4 et sw=4 nu
'''
Dumps the whole GitHub users database.
Takes a great while, as the script must pause for an hour after every 6,000 users.
With 3,900,000 users, that would take about 28 days...
When authenticated, it only pauses after every 500,000 users, so about 8h in total!
The output file can be copied at any time, as it is closed after each write.
Its content is valid JSON as long as you close the list (])
'''
import os
import json
import logging
import time
import requests
from requests.auth import HTTPBasicAuth
# Module-wide logger; DEBUG level so every request shows up on the console.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.DEBUG)

# Personal access token read from the environment (None when not exported).
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN')

# Paging / output state shared by the dump loop.
last_id = None
output = 'all_users.json_txt'

# Durations, in seconds.
one_minute = 60
one_hour = one_minute * 60

# Pause as soon as the API reports this many (or fewer) requests remaining.
min_remaining_tostop = 30

# Request counter and last rate-limit values reported by the API.
reqs = 0
reqs_limit = None
reqs_remaining = None

# NOTE: the token could alternatively be sent as an
# 'Authorization: token <GITHUB_TOKEN>' header instead of basic auth.
headers = {}

# Only build credentials when a token is available: passing a missing token
# to HTTPBasicAuth would send bogus credentials, whereas auth=None lets
# requests perform a plain anonymous call.
TOKEN_AUTH = HTTPBasicAuth(GITHUB_TOKEN, "x-oauth-basic") if GITHUB_TOKEN else None
def pause(duration):
    ''' Sleep for `duration` seconds, logging progress at regular intervals.

        The wait is split into 10 equal naps; the requested and the elapsed
        time (both in minutes) are logged before each nap so that long
        pauses still show signs of life.

        duration: total time to sleep, in seconds. '''
    interval = 10
    # float division so the naps add up to `duration` on Python 2 as well
    tick = float(duration) / interval
    # range() instead of the Python 2-only xrange(): 10 items cost nothing
    for i in range(interval):
        # lazy %-args: the message is only built if the record is emitted
        logger.info(u"Pause (%dmn) Elapsed: %dmn",
                    duration / one_minute, tick * i / one_minute)
        time.sleep(tick)
def serialize_users(users):
    ''' JSON-encode each user of `users` on its own line, comma-separated.

        Returns the inside of a JSON array (no enclosing brackets), or an
        empty string when `users` is empty. Keys are sorted so the output
        is stable across runs and Python versions. '''
    return ",\n".join(json.dumps(user, sort_keys=True) for user in users)


def main():
    ''' Page through https://api.github.com/users and dump every user
        to the `output` file as one big JSON list.

        The file is re-opened and closed for every page, so it can be copied
        while the script runs; only the closing bracket is missing until the
        last (empty) page is reached.

        Failed requests and unreadable payloads are logged and retried after
        a pause, with the same `since` value. When the API reports
        `min_remaining_tostop` requests (or fewer) left, the script sleeps
        for an hour before going on. '''
    last_id = None
    reqs_limit = None
    reqs_remaining = None
    # whether a page was already written: drives the "," separator so that
    # failed requests and the final empty page never produce a stray comma
    wrote_users = False

    with open(output, 'w') as f:
        f.write("[\n")

    while True:
        params = {'since': last_id} if last_id is not None else None

        logger.info(u"Requesting 100 users from %s -- %s/%s",
                    last_id, reqs_limit, reqs_remaining)
        try:
            # timeout: without one, a stalled connection blocks forever
            req = requests.get('https://api.github.com/users',
                               params=params,
                               headers=headers,
                               auth=TOKEN_AUTH,
                               timeout=one_minute)
        except requests.RequestException as exp:
            # a network hiccup must not kill a job that runs for hours
            logger.error(u"Request failed (%s). Pausing 1mn.", exp)
            pause(one_minute)
            continue

        if req.status_code != requests.codes.ok:
            logger.error(u"Received status code %d. Pausing 1h.",
                         req.status_code)
            pause(one_hour)
            continue

        try:
            users = json.loads(req.text)
        except ValueError:
            users = None
        if not isinstance(users, list):
            logger.error(u"Received an unexpected payload. Pausing 1mn.")
            pause(one_minute)
            continue

        # an empty page means every user has been fetched
        if not users:
            break

        with open(output, 'a') as f:
            if wrote_users:
                f.write(",\n")
            f.write(serialize_users(users))
        wrote_users = True
        # next page starts after the highest id seen so far
        last_id = users[-1].get('id')

        reqs_limit = int(req.headers.get('X-RateLimit-Limit', 0))
        reqs_remaining = int(req.headers.get('X-RateLimit-Remaining', 0))

        if reqs_remaining <= min_remaining_tostop:
            logger.info("Reached %d requests over %d. Pausing one hour.",
                        reqs_limit - reqs_remaining, reqs_limit)
            pause(one_hour)

    with open(output, 'a') as f:
        f.write("\n]\n")


if __name__ == '__main__':
    main()