forked from ssc-oscar/gather
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathghUpdatedRepos.py
138 lines (122 loc) · 4.15 KB
/
ghUpdatedRepos.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
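'''
Script to scrape GitHub repos using the GraphQL API
Obtains all public, non-fork, non-archived, non-mirror repos that were pushed to
between a begin date and an end date (both YYYY-MM-DD)
Queries that date range in 10 minute windows and inserts every returned repo
into a MongoDB collection

Usage: echo "<token> <begin> <end>" | python ghUpdatedRepos.py <dbName> <collName>
'''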
import requests
import json
import pymongo
from datetime import datetime, timedelta
import time
import sys
# Read the GitHub API token and the begin/end dates (YYYY-MM-DD) from a single
# whitespace-separated line on stdin, e.g. "<token> 2020-01-01 2020-01-02".
# split() without an argument tolerates repeated spaces and tabs between fields.
token, begin, end = sys.stdin.readline().split()
try:
    # Parse each date once; the parsed values are the bounds of the scrape:
    # `interval` is the moving start of the current window, `end_time` the stop.
    interval = datetime.strptime(begin, '%Y-%m-%d')
    end_time = datetime.strptime(end, '%Y-%m-%d')
except ValueError as err:
    # Either date may be the malformed one, so the message names both.
    raise ValueError(
        "Incorrect begin or end date format, should be YYYY-MM-DD") from err

# DB info: default MongoClient(); database and collection names come from the
# first and second command-line arguments.
client = pymongo.MongoClient()
dbName = sys.argv[1]
collName = sys.argv[2]
db = client[dbName]
coll = db[collName]

url = 'https://api.github.com/graphql'
headers = {'Authorization': 'token ' + token}

# Begin date as a midnight timestamp string (kept for backward compatibility).
start = begin + 'T00:00:00Z'

# Running count of repos gathered so far (updated by gatherData).
total = 0
# Rate-limit calls left; starting value only, it is overwritten from each
# response's rateLimit.remaining in the driver loop.
remaining = 5000
# query that specifies which repos and what content to extract
# This is a template: the two %s placeholders in the `pushed:` qualifier are
# filled with the start and end timestamps of one time window via
# `query % (fromStr, toStr)` in the driver loop.
# - rateLimit: asks for the cost of the call, the calls remaining and the
#   reset time, which the driver loop uses to decide when to pause.
# - search: public, non-archived, non-fork, non-mirror repositories, at most
#   100 per request (`first: 100`); pageInfo supplies the cursor for paging.
# - nodes: the per-repository fields that end up stored in the database.
query = '''{
rateLimit {
cost
remaining
resetAt
}
search(query: "is:public archived:false fork:false mirror:false pushed:%s..%s", type: REPOSITORY, first: 100) {
repositoryCount
pageInfo {
hasNextPage
endCursor
startCursor
}
nodes {
... on Repository {
nameWithOwner
updatedAt
createdAt
pushedAt
id
forkCount
description
}
}
}
}'''
# Request payload sent as JSON; its 'query' entry is replaced with the
# filled-in query text before every request in the driver loop.
jsonS = { 'query': query }
# wait for reset if we exhaust our number of calls
def wait(reset):
    """Sleep until the rate-limit reset time plus a 30 second safety margin.

    Args:
        reset: Reset timestamp formatted as "%Y-%m-%dT%H:%M:%SZ" (the value
            read from rateLimit.resetAt). The trailing "Z" means UTC, so the
            comparison is done against the current UTC time rather than the
            machine's local time.

    Returns:
        None. Returns immediately if the reset time (plus margin) has
        already passed instead of handing a negative value to time.sleep,
        which would raise ValueError.
    """
    # Local import: the file's top-level import only brings in
    # datetime and timedelta.
    from datetime import timezone

    then = datetime.strptime(reset, "%Y-%m-%dT%H:%M:%SZ").replace(
        tzinfo=timezone.utc)
    now = datetime.now(timezone.utc)
    delay = (then - now).total_seconds() + 30
    if delay > 0:
        time.sleep(delay)
# helper function to insert the repos of one response into mongo db
def gatherData(res):
    """Store the repositories of one search response and print progress.

    Args:
        res: Decoded GraphQL response; the repositories are read from
            res['data']['search']['nodes'].

    Side effects:
        Inserts the repositories into the module-level `coll`, adds their
        number to the module-level `total`, and prints a progress line that
        also reports the module-level `remaining` call budget.

    Raises:
        KeyError / TypeError: if `res` lacks the expected structure.
    """
    global total
    # Skip null/empty nodes: they carry no repository data and a null entry
    # cannot be inserted as a document.
    repos = [node for node in res['data']['search']['nodes'] if node]
    if repos:
        # insert_many replaces the removed Collection.insert and stores the
        # whole page in one round-trip; it rejects an empty list, hence the
        # guard above.
        coll.insert_many(repos)
    total += len(repos)
    output = "Got {} repos. Total count is {}. Have {} calls remaining."
    print(output.format(len(repos), total, remaining))
# driver loop that iterates through repos in 10 minute intervals
# iterates from the begin date until the end date is reached
# NOTE(review): the `pushed:a..b` range is presumably inclusive on both ends,
# so a repo pushed exactly on a window boundary may be stored twice - confirm.
step = timedelta(minutes=10)
while interval < end_time:
    fromStr = interval.strftime("%Y-%m-%dT%H:%M:%SZ")
    toStr = (interval + step).strftime("%Y-%m-%dT%H:%M:%SZ")
    nextQuery = query % (fromStr, toStr)
    jsonS['query'] = nextQuery
    if token == '':
        print("Please provide your Github API token in the script. Exiting.")
        sys.exit()
    r = requests.post(url=url, json=jsonS, headers=headers)
    if r.ok:
        try:
            res = json.loads(r.content)
            print("did it come here? {}".format(res['data']['search']['pageInfo']))
            remaining = res['data']['rateLimit']['remaining']
            reset = res['data']['rateLimit']['resetAt']
            # keep a small reserve of calls; pause until the limit resets
            if remaining < 11:
                wait(reset)
            repos = res['data']['search']['repositoryCount']
            hasNextPage = res['data']['search']['pageInfo']['hasNextPage']
            gatherData(res)
            # The cursor argument is spliced in right after "type: REPOSITORY";
            # the position is the same for every page of this window.
            index = nextQuery.find("REPOSITORY") + len("REPOSITORY")
            # check if we got more than 100 results and need to paginate
            while repos > 100 and hasNextPage:
                endCursor = res['data']['search']['pageInfo']['endCursor']
                print("Have to paginate, using cursor {}".format(endCursor))
                pageQuery = nextQuery[:index] + ',after:"{}"'.format(endCursor) + nextQuery[index:]
                jsonS['query'] = pageQuery
                r = requests.post(url=url, json=jsonS, headers=headers)
                if not r.ok:
                    # Without this exit the loop state never changes and the
                    # same page would be requested forever with no delay.
                    print("Page request failed with HTTP status {}; skipping the rest of {}..{}".format(
                        r.status_code, fromStr, toStr))
                    break
                res = json.loads(r.text)
                try:
                    remaining = res['data']['rateLimit']['remaining']
                    reset = res['data']['rateLimit']['resetAt']
                    if remaining < 11:
                        wait(reset)
                    repos = res['data']['search']['repositoryCount']
                    hasNextPage = res['data']['search']['pageInfo']['hasNextPage']
                    gatherData(res)
                except Exception as e:
                    # best effort: report the problem and try the next page
                    print(e)
        except Exception as e:
            # best effort: report the problem and move on to the next window
            print(e)
    else:
        # make skipped windows visible instead of dropping them silently
        print("Request failed with HTTP status {} for {}..{}".format(
            r.status_code, fromStr, toStr))
    interval += step