#!/usr/bin/env python3
# Copyright (C) 2022 Dawn M. Foster
# Licensed under GNU General Public License (GPL), version 3 or later: http://www.gnu.org/licenses/gpl.txt
"""
This script is designed to find the path to OWNERS files within Kubernetes
organizations, but it is generic enough to find any files with a specific
filename within a specified GitHub org.

Note: The GitHub search API is a little flaky, so it's likely that some
files will be missing from the results.

As input, this script requires that you have a GitHub API token in a file
called 'gh_key' in this directory.

Parameters
----------
org_name : str
    The GitHub organization to be searched
file_name : str
    The filename to search for, like OWNERS or CODEOWNERS
"""
from datetime import datetime
from os.path import dirname, join
from github import Github
from common_functions import read_key, run_search_query, expand_name_df
# Read the API token from the gh_key file and create the client object
# used for the GitHub search API calls.
api_token = read_key('gh_key')
g = Github(api_token)
def read_args():
    """Reads the org name and filename to be used in the search from the
    command line, prompting interactively if either argument is missing.

    Parameters
    ----------
    None

    Returns
    -------
    org_name : str
        The GitHub organization to be searched
    file_name : str
        The filename to search for, like OWNERS or CODEOWNERS
    """
    import sys

    # Read org name and filename from the command line, falling back
    # to interactive prompts if either argument is missing.
    try:
        org_name = str(sys.argv[1])
        file_name = str(sys.argv[2])
    except IndexError:
        print("Please enter the org name and filename to search when prompted.")
        org_name = input("Enter a GitHub org name (like kubernetes): ")
        file_name = input("Enter a file name (like OWNERS): ")

    return org_name, file_name
org_name, file_name = read_args()
def make_repo_query(after_cursor=None):
    """Creates the query string for the GraphQL API call, using after_cursor
    to handle multiple pages of results.

    Parameters
    ----------
    after_cursor : str

    Returns
    -------
    query : str
    """
    return """query RepoQuery($org_name: String!) {
        organization(login: $org_name) {
            repositories(first: 100, after: AFTER) {
                pageInfo {
                    hasNextPage
                    endCursor
                }
                nodes {
                    name
                    defaultBranchRef {
                        name
                    }
                }
            }
        }
    }""".replace(
        "AFTER", '"{}"'.format(after_cursor) if after_cursor else "null"
    )
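
# With after_cursor left as None, the query above requests the first page
# ("after: null"); each subsequent call substitutes the endCursor returned
# with the previous page, which is how the cursor pagination advances.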
def get_repo_list(api_token, org_name):
    """Uses the make_repo_query function to run the GraphQL query and
    returns the results as a dataframe containing the default branch
    and repo name for every repo within the org. Note: the default
    branch is needed to build the URL to the file.

    Parameters
    ----------
    api_token : str
    org_name : str

    Returns
    -------
    repo_info_df : dataframe
    """
    import requests
    import json
    import pandas as pd

    # Set up the variables needed for the API call
    url = 'https://api.github.com/graphql'
    headers = {'Authorization': 'token %s' % api_token}

    repo_info_df = pd.DataFrame()

    # Initialize the variables needed to page through the results,
    # and while there are more pages, query a new page of results.
    has_next_page = True
    after_cursor = None

    while has_next_page:
        query = make_repo_query(after_cursor)

        # Pass the variables into the query and run it against the GraphQL
        # API, which returns a JSON document.
        variables = {"org_name": org_name}
        r = requests.post(url=url, json={'query': query, 'variables': variables}, headers=headers)
        json_data = json.loads(r.text)

        # Convert the JSON results to a temporary dataframe that is appended
        # to the main dataframe holding all of the results to be returned.
        df_temp = pd.DataFrame(json_data['data']['organization']['repositories']['nodes'])
        repo_info_df = pd.concat([repo_info_df, df_temp])

        # Update the variables that check for and handle results with
        # multiple pages.
        has_next_page = json_data["data"]["organization"]["repositories"]["pageInfo"]["hasNextPage"]
        after_cursor = json_data["data"]["organization"]["repositories"]["pageInfo"]["endCursor"]

    return repo_info_df
# Run the function that gets the repos from the GraphQL API, then flatten
# the nested defaultBranchRef field into a plain defaultBranch column so
# the branch name can be passed to the search query below.
repo_info_df = get_repo_list(api_token, org_name)
repo_info_df = expand_name_df(repo_info_df, 'defaultBranchRef', 'defaultBranch')
owners_rows = []

# Iterate through the repos and run a search API query that finds the
# matching files (e.g. OWNERS) within each repo.
for _, row in repo_info_df.iterrows():
    repo_name = row['name']
    branch_name = row['defaultBranch']

    query = "filename:" + file_name + " repo:" + org_name + "/" + repo_name
    print(query)

    owners_rows = run_search_query(query, g, branch_name, owners_rows)
# Prepare the output file, named for the org, the filename searched,
# and today's date, and write the collected rows to it as csv.
try:
    today = datetime.today().strftime('%Y-%m-%d')
    output_filename = "./output/" + org_name + "_" + file_name + "_" + today + ".csv"
    current_dir = dirname(__file__)
    file_path = join(current_dir, output_filename)

    with open(file_path, 'w') as file:
        file.writelines("%s\n" % item for item in owners_rows)
except OSError:
    print('Could not write to the csv file. This may be because the output directory '
          'is missing or because you do not have permission to write to it. Exiting.')
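
# If the write succeeds, the csv lands in the output directory next to this
# script, named like (illustrative date): output/kubernetes_OWNERS_2022-03-01.csv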