-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathcommon_functions.py
339 lines (267 loc) · 9.89 KB
/
common_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
# Copyright (C) 2019-2022 Dawn M. Foster
# Licensed under GNU General Public License (GPL), version 3 or later: http://www.gnu.org/licenses/gpl.txt
def read_cncf_affiliations():
    """
    Download the contents of the CNCF json file and create an affiliation dictionary
    indexed by GitHub username to make finding affiliations faster for later functions.
    Includes only current affiliation and excludes robot accounts.

    Returns
    -------
    affil_dict : dict
        Maps lower-cased GitHub username to current company affiliation.
    """
    import json
    from common_functions import download_file

    filename = download_file('https://github.com/cncf/devstats/blob/master/github_users.json?raw=true')
    affil_file = json.load(filename)
    affil_dict = {}
    for item in affil_file:
        # Force username to lower case for consistent affiliation checks
        username = item['login'].lower()
        # Some entries have no 'affiliation' key; skip just those rather
        # than masking unrelated errors with a bare except.
        try:
            affiliation = item['affiliation']
        except KeyError:
            continue
        # Exclude robot accounts entirely.
        if '(Robots)' in affiliation:
            continue
        if ',' in affiliation:
            # The value is a comma-separated history; keep only the
            # current (last) affiliation.
            affil_dict[username] = affiliation.rsplit(',', 1)[1].strip()
        else:
            affil_dict[username] = affiliation
    return affil_dict
def get_affil(affil_dict, username, api_token):
    """Get the company affiliation for a username starting with the
    GitHub API, since that's likely the most up to date source. The CNCF gitdm
    data is used as a secondary source of this data.

    Parameters
    ----------
    affil_dict : dict
        generated by the read_cncf_affiliations function
    username : str
    api_token : str
        string containing a GitHub API token

    Returns
    -------
    affil : str
        Company name with commas removed, or 'NotFound'.
    """
    from github import Github

    g = Github(api_token)
    affil = 'NotFound'
    if username in affil_dict:
        # If affiliation is listed on GH, use that instead as more
        # likely up to date; fall back to the CNCF data otherwise.
        try:
            affil = g.get_user(username).company
            if affil is None:
                affil = affil_dict[username]
        except Exception:
            # API lookup failed (rate limit, unknown user, network);
            # keep the default so the CNCF/NotFound path applies.
            pass
        if affil == '?':
            affil = 'NotFound'
    if affil == 'NotFound':
        try:
            affil = g.get_user(username).company
        except Exception:
            affil = 'NotFound'
        if affil is None:
            affil = 'NotFound'
    # remove any commas from the company name so it is safe in csv output
    affil = affil.replace(",", "")
    return affil
def download_file(url):
    """Download *url* and return the open response object for other
    functions to read from.

    NOTE: Make sure you pass in a raw yaml file, not html.
    Example: sig_file = download_file('https://raw.githubusercontent.com/kubernetes/community/master/sigs.yaml')
    """
    import urllib.request

    response = urllib.request.urlopen(url)
    return response
def read_sig_yaml(sig_file):
    """Parse an already-downloaded sigs.yaml file object and return the data."""
    import yaml

    return yaml.safe_load(sig_file)
def process_sig_yaml():
    """Download and parse the Kubernetes community sigs.yaml file."""
    sigs_url = 'https://raw.githubusercontent.com/kubernetes/community/master/sigs.yaml'
    return read_sig_yaml(download_file(sigs_url))
def write_affil_line_istio (username, team, affil_dict, api_token, csv_file):
    """Used to write istio data to the CSV file

    Parameters
    ----------
    username : str
    team : str
    affil_dict : dict
        generated by the read_cncf_affiliations function
    api_token : str
        string containing a GitHub API token
    csv_file : csv
    """
    affil = get_affil(affil_dict, username, api_token)
    # Normalize both None and empty string to the NotFound marker.
    if not affil:
        affil = 'NotFound'
    line = ",".join([affil, username, team]) + "\n"
    csv_file.write(line)
def write_affil_line(username, role, sig_name, subproject, owners_url, csv_file, affil_dict):
    """Write a single line to the csv file with data about owners, including
    SIG/WG, subproject (if applicable), affiliation, and owners url.

    Parameters
    ----------
    username : str
    role : str
    sig_name : str
    subproject : str
    owners_url : str
    csv_file : file object
    affil_dict : dict
        generated by the read_cncf_affiliations function
    """
    # Make sure username is lower case before checking affiliation
    username = username.lower()
    # Only print real users to the csv file. Need to filter out aliases
    # (approver/reviewer groups, SIG teams, admin/bot teams, etc.)
    ban = ['approve', 'review', 'maintain', 'provider', 'leads', 'sig-', 'admins',
           'release', 'licensing', 'github-admin-team', 'test-infra-oncall',
           'managers', 'owners', 'committee', 'steering']
    if any(b in username for b in ban):
        return
    affil = affil_dict.get(username, 'NotFound')
    if affil == '?':
        affil = 'NotFound'
    line = ",".join([affil, username, role, sig_name, subproject, owners_url]) + "\n"
    csv_file.write(line)
def read_owners_file(owners_url, sig_name, subproject, csv_file, affil_dict):
    """Download and parse an OWNERS file and write one csv line per
    approver / reviewer found in it.

    Parameters
    ----------
    owners_url : str
    sig_name : str
        'NA' if not yet known; may be derived from labels below.
    subproject : str
        'NA' if not yet known; may be derived from labels below.
    csv_file : file object
    affil_dict : dict
        generated by the read_cncf_affiliations function
    """
    import yaml

    # Download contents of owners files and load them. Print error message
    # for files that 404, and return early so 'owners' is never undefined.
    try:
        owners_file = download_file(owners_url)
        owners = yaml.safe_load(owners_file)
    except Exception:
        print("Cannot get", sig_name, owners_url)
        return
    # An empty or non-mapping OWNERS file has nothing to process.
    if not isinstance(owners, dict):
        return
    # Derive sig_name / subproject from labels when not supplied.
    # Wrapped with 'try' since not every owners file has well-formed labels.
    try:
        for label in owners['labels']:
            label_spl = label.split('/')
            if subproject == 'NA' and label_spl[0] == 'area':
                subproject = label_spl[1]
            elif sig_name == 'NA' and label_spl[0] == 'sig':
                sig_name = 'sig-' + label_spl[1]
    except (KeyError, IndexError, TypeError):
        pass
    # Not every owners file has approvers and reviewers; the key may also
    # be present but empty (None), hence the 'or []'.
    for username in owners.get('approvers') or []:
        write_affil_line(username, 'approver', sig_name, subproject, owners_url, csv_file, affil_dict)
    for username in owners.get('reviewers') or []:
        write_affil_line(username, 'reviewer', sig_name, subproject, owners_url, csv_file, affil_dict)
def files_done(owners_file_csv):
    """
    Reads the output csv file generated by reading the initial list of
    OWNERS files and is used to avoid re-reading files again when
    an additional list of OWNERS files is provided.

    Parameters
    ----------
    owners_file_csv : file object

    Returns
    -------
    files_doneDF : dataframe
        Contains the contents of the csv file as a dataframe
    """
    import pandas as pd

    return pd.read_csv(owners_file_csv)
def read_key(file_name):
    """Retrieves a GitHub API key from a file.

    Reads the first line of a file containing the GitHub API key.
    Usage: key = read_key('gh_key')

    Parameters
    ----------
    file_name : str

    Returns
    -------
    key : str
    """
    from os.path import dirname, join

    # Resolve the key file relative to this module, not the cwd.
    key_path = join(dirname(__file__), "./" + file_name)
    with open(key_path, 'r') as key_file:
        # remove newline & trailing whitespace
        return key_file.readline().rstrip()
def run_search_query(query, g, branch_name, owners_rows):
    """Runs the query against the GitHub search API, appends the results
    to owners_rows list and returns the list with results.

    Parameters
    ----------
    query : str
        String formatted as a search query.
    g : Github object
    branch_name : str
        Default branch name from the API to use to build the URL
    owners_rows: list

    Returns
    -------
    owners_rows : list
    """
    import time

    # Run the search query to get all of the owners files
    results = g.search_code(query=query)
    print("Total number found", results.totalCount)
    base = 'https://raw.githubusercontent.com/'
    # Format each result as a raw full-path URL.
    # Sleep in the loop to avoid secondary rate limit exception
    for owners in results:
        owners_rows.append(base + owners.repository.full_name + '/' + branch_name + '/' + owners.path)
        time.sleep(5)
    # Add an extra sleep before returning to give it more time to
    # avoid the rate limit exception
    time.sleep(60)
    return owners_rows
def expand_name_df(df, old_col, new_col):
    """Takes a dataframe df with an API JSON object with nested elements in old_col,
    extracts the name, and saves it in a new dataframe column called new_col

    Parameters
    ----------
    df : dataframe
    old_col : str
    new_col : str

    Returns
    -------
    df : dataframe
    """
    import pandas as pd

    def _name_of(nested_obj):
        """Return the 'name' field of a nested API object, or a marker
        string when the object is missing (NaN/None in the dataframe)."""
        return 'Likely Missing' if pd.isnull(nested_obj) else nested_obj['name']

    df[new_col] = df[old_col].apply(_name_of)
    return df
def create_file(pre_string):
    """Creates an output file in an "output" directory with today's date
    as part of the filename and prints the file_path to the terminal to
    make it easier to open the output file.

    Parameters
    ----------
    pre_string : str
        This is the string that will preface today's date in the filename

    Returns
    -------
    file : file object
    file_path : str
        This is the full path to the file name for the output.
    """
    from datetime import datetime
    from os.path import dirname, join

    date_str = datetime.today().strftime('%Y-%m-%d')
    # Build the path relative to this module so output lands next to the code.
    rel_name = "./output/" + pre_string + "_" + date_str + ".csv"
    file_path = join(dirname(__file__), rel_name)
    out_file = open(file_path, 'w', newline='')
    print("Output file:\n", file_path, sep="")
    return out_file, file_path