From 8683d3350db0846f3666bba7c763a7447aa83055 Mon Sep 17 00:00:00 2001 From: Adnan Khan Date: Sat, 23 Dec 2023 16:08:14 -0500 Subject: [PATCH] Release 1.6 (#58) Co-authored-by: Mason Davis <31484153+mas0nd@users.noreply.github.com> Co-authored-by: Remy --- README.md | 50 +++-- gato/attack/attack.py | 13 +- gato/cli/cli.py | 52 ++++- gato/enumerate/enumerate.py | 15 +- gato/enumerate/recommender.py | 7 +- gato/enumerate/repository.py | 63 +++++- gato/github/__init__.py | 1 + gato/github/api.py | 272 ++++++++++++++++++++++-- gato/github/gql_queries.py | 54 +++++ gato/github/search.py | 2 +- gato/models/runner.py | 7 +- gato/search/search.py | 82 ++++++- gato/workflow_parser/workflow_parser.py | 78 ++++++- pyproject.toml | 2 +- test/test_cases.json | 2 +- unit_test/test_api.py | 98 ++++++++- unit_test/test_enumerate.py | 10 +- unit_test/test_repo_enumerate.py | 4 +- 18 files changed, 723 insertions(+), 89 deletions(-) create mode 100644 gato/github/gql_queries.py diff --git a/README.md b/README.md index a2d5be7..d1ab5fe 100644 --- a/README.md +++ b/README.md @@ -8,28 +8,43 @@ Gato, or GitHub Attack Toolkit, is an enumeration and attack tool that allows both -blue teamers and offensive security practitioners to evaluate the blast radius -of a compromised personal access token within a GitHub organization. +blue teamers and offensive security practitioners to identify and exploit +pipeline vulnerabilities within a GitHub organization's public and private +repositories. -The tool also allows searching for and thoroughly enumerating public -repositories that utilize self-hosted runners. GitHub recommends that -self-hosted runners only be utilized for private repositories, however, there -are thousands of organizations that utilize self-hosted runners. +The tool has post-exploitation features to leverage a compromised personal +access token in addition to enumeration features to identify poisoned pipeline +execution vulnerabilities against public repositories that use self-hosted GitHub Actions +runners. -## Version 1.5 Released +GitHub recommends that self-hosted runners only be utilized for private repositories, however, there are thousands of organizations that utilize self-hosted runners. Default configurations are often vulnerable, and Gato uses a mix of workflow file analysis and run-log analysis to identify potentially vulnerable repositories at scale. -Gato version 1.5 was released on June 27th, 2023! +## Version 1.6 -#### New Features +Gato version 1.6 improves the public repository enumeration feature set. -* Secrets Enumeration -* Secrets Exfiltration -* API-only Enumeration -* JSON Output -* Improved Code Search -* GitHub Enterprise Server Support -* PAT Validation Only Mode -* Quality of life and UX improvements +Previously, Gato's code search functionality by default only looked for +yaml files that explicitly had "self-hosted" in the name. Now, the +code search functionality supports a SourceGraph query. This query has a +lower false negative rate and is not limited by GitHub's code search limit. + +For example, the following query will identify public repositories that use +self-hosted runners: + +`gato search --sourcegraph --output-text public_repos.txt` + +This can be fed back into Gato's enumeration feature: + +`gato enumerate --repositories public_repos.txt --output-json enumeration_results.json` + +Additionally the release contains several improvements under the hood to speed up the enumeration process. This includes changes to limit redundant run-log downloads (which are the slowest part of Gato's enumeration process) and using the GraphQL API to download workflow files when enumerating an entire organization. Finally, Gato will use a heuristic to detect if an attached runner is non-ephemeral. Most poisoned pipeline execution attacks require a non-ephemeral runner in order to exploit. + +### New Features + +* SourceGraph Search Functionality +* Improved Public Repository Enumeration Speed +* Improved Workflow File Analysis +* Non-ephemeral self-hosted runner detection ## Who is it for? @@ -44,6 +59,7 @@ Gato version 1.5 was released on June 27th, 2023! * GitHub Classic PAT Privilege Enumeration * GitHub Code Search API-based enumeration +* SourceGraph Search enumeration * GitHub Action Run Log Parsing to identify Self-Hosted Runners * Bulk Repo Sparse Clone Features * GitHub Action Workflow Parsing diff --git a/gato/attack/attack.py b/gato/attack/attack.py index e830c2c..d22a502 100644 --- a/gato/attack/attack.py +++ b/gato/attack/attack.py @@ -181,28 +181,23 @@ def __execute_and_wait_workflow( """ workflow_id = None - branch_created = self.api.create_branch(target_repo, branch) - - if not branch_created: - Output.error("Failed to create branch!") - return False if self.author_email and self.author_name: - rev_hash = self.api.commit_file( + rev_hash = self.api.commit_workflow( target_repo, branch, - f".github/workflows/{yaml_name}.yml", yaml_contents.encode(), + f"{yaml_name}.yml", commit_author=self.author_name, commit_email=self.author_email, message=commit_message ) else: - rev_hash = self.api.commit_file( + rev_hash = self.api.commit_workflow( target_repo, branch, - f".github/workflows/{yaml_name}.yml", yaml_contents.encode(), + f"{yaml_name}.yml", message=commit_message ) diff --git a/gato/cli/cli.py b/gato/cli/cli.py index 849af48..ef476c2 100644 --- a/gato/cli/cli.py +++ b/gato/cli/cli.py @@ -276,19 +276,35 @@ def search(args, parser): http_proxy=args.http_proxy, github_url=args.api_url ) + if args.sourcegraph: + if args.query and args.target: + parser.error( + f"{Fore.RED}[-]{Style.RESET_ALL} You cannot select an organization " + "with a custom query!" + ) - if not (args.query or args.target): - parser.error( - f"{Fore.RED}[-]{Style.RESET_ALL} You must select an organization " - "or pass a custom query!." - ) - - if args.query: - gh_search_runner.use_search_api( - organization=args.target, query=args.query + results = gh_search_runner.use_sourcegraph_api( + organization=args.target, + query=args.query ) else: - gh_search_runner.use_search_api(organization=args.target) + if not (args.query or args.target): + parser.error( + f"{Fore.RED}[-]{Style.RESET_ALL} You must select an organization " + "or pass a custom query!." + ) + if args.query: + results = gh_search_runner.use_search_api( + organization=args.target, + query=args.query + ) + else: + results = gh_search_runner.use_search_api( + organization=args.target + ) + + if results: + gh_search_runner.present_results(results, args.output_text) def configure_parser_general(parser): @@ -563,3 +579,19 @@ def configure_parser_search(parser): metavar="QUERY", required=False ) + + parser.add_argument( + "--sourcegraph", "-sg", + help="Use Sourcegraph API to search for self-hosted runners.", + required=False, + action="store_true" + ) + + parser.add_argument( + "--output-text", "-oT", + help=( + "Save enumeration output to text file." + ), + metavar="TEXT_FILE", + type=StringType(256) + ) diff --git a/gato/enumerate/enumerate.py b/gato/enumerate/enumerate.py index c5ed134..3ac1254 100644 --- a/gato/enumerate/enumerate.py +++ b/gato/enumerate/enumerate.py @@ -1,6 +1,7 @@ import logging from gato.github import Api +from gato.github import GqlQueries from gato.models import Repository, Organization from gato.cli import Output from gato.enumerate.repository import RepositoryEnum @@ -173,12 +174,22 @@ def enumerate_organization(self, org: str): f"the {organization.name} organization!" ) + Output.info(f"Querying and caching workflow YAML files!") + wf_queries = GqlQueries.get_workflow_ymls(enum_list) + + for wf_query in wf_queries: + result = self.org_e.api.call_post('/graphql', wf_query) + # Sometimes we don't get a 200, fall back in this case. + if result.status_code == 200: + self.repo_e.construct_workflow_cache(result.json()['data']['nodes']) + else: + Output.warn("GraphQL query failed, will revert to REST workflow query for impacted repositories!") for repo in enum_list: - Output.tabbed( f"Enumerating: {Output.bright(repo.name)}!" ) - self.repo_e.enumerate_repository(repo) + + self.repo_e.enumerate_repository(repo, large_org_enum=len(enum_list) > 100) self.repo_e.enumerate_repository_secrets(repo) Recommender.print_repo_secrets( diff --git a/gato/enumerate/recommender.py b/gato/enumerate/recommender.py index c391961..9ecee80 100644 --- a/gato/enumerate/recommender.py +++ b/gato/enumerate/recommender.py @@ -140,7 +140,7 @@ def print_repo_runner_info(repository: Repository): Output.result( f"The repository contains a workflow: " f"{Output.bright(repository.sh_workflow_names[0])} that " - "executes on self-hosted runners!" + "might execute on self-hosted runners!" ) if repository.accessible_runners: @@ -157,6 +157,11 @@ def print_repo_runner_info(repository: Repository): f"{Output.bright(repository.accessible_runners[0].machine_name)}" ) + for runner in repository.accessible_runners: + if runner.non_ephemeral: + Output.owned("The repository contains a non-ephemeral self-hosted runner!") + break + if repository.runners: Output.result( f"The repository has {len(repository.runners)} repo-level" diff --git a/gato/enumerate/repository.py b/gato/enumerate/repository.py index 58e7a3c..dfc2885 100644 --- a/gato/enumerate/repository.py +++ b/gato/enumerate/repository.py @@ -21,6 +21,7 @@ def __init__(self, api: Api, skip_log: bool, output_yaml): api (Api): GitHub API wraper object. """ self.api = api + self.workflow_cache = {} self.skip_log = skip_log self.output_yaml = output_yaml @@ -40,11 +41,12 @@ def __perform_runlog_enumeration(self, repository: Repository): ) if wf_runs: - runner = Runner( - wf_runs[0]['runner_name'], wf_runs[0]['machine_name'] - ) + for wf_run in wf_runs: + runner = Runner( + wf_run['runner_name'], wf_run['machine_name'], non_ephemeral=wf_run['non_ephemeral'] + ) - repository.add_accessible_runner(runner) + repository.add_accessible_runner(runner) runner_detected = True return runner_detected @@ -60,12 +62,15 @@ def __perform_yml_enumeration(self, repository: Repository): list: List of workflows that execute on sh runner, empty otherwise. """ runner_wfs = [] - ymls = self.api.retrieve_workflow_ymls(repository.name) + + if repository.name in self.workflow_cache: + ymls = self.workflow_cache[repository.name] + else: + ymls = self.api.retrieve_workflow_ymls(repository.name) for (wf, yml) in ymls: try: parsed_yml = WorkflowParser(yml, repository.name, wf) - self_hosted_jobs = parsed_yml.self_hosted() if self_hosted_jobs: @@ -79,12 +84,13 @@ def __perform_yml_enumeration(self, repository: Repository): # At this point we only know the extension, so handle and # ignore malformed yml files. except Exception as parse_error: - print(parse_error) + + print(f"{wf}: {str(parse_error)}") logger.warning("Attmpted to parse invalid yaml!") return runner_wfs - def enumerate_repository(self, repository: Repository): + def enumerate_repository(self, repository: Repository, large_org_enum=False): """Enumerate a repository, and check everything relevant to self-hosted runner abuse that that the user has permissions to check. @@ -119,15 +125,25 @@ def enumerate_repository(self, repository: Repository): repository.set_runners(repo_runners) - if not self.skip_log and self.__perform_runlog_enumeration(repository): - runner_detected = True - workflows = self.__perform_yml_enumeration(repository) if len(workflows) > 0: repository.add_self_hosted_workflows(workflows) runner_detected = True + if not self.skip_log: + # If we are enumerating an organization, only enumerate runlogs if + # the workflow suggests a sh_runner. + if large_org_enum and runner_detected: + self.__perform_runlog_enumeration(repository) + + # If we are doing internal enum, get the logs, because coverage is + # more important here and it's ok if it takes time. + elif not repository.is_public() and self.__perform_runlog_enumeration(repository): + runner_detected = True + else: + runner_detected = self.__perform_runlog_enumeration(repository) + if runner_detected: # Only display permissions (beyond having none) if runner is # detected. @@ -158,3 +174,28 @@ def enumerate_repository_secrets( if org_secrets: repository.set_accessible_org_secrets(org_secrets) + + def construct_workflow_cache(self, yml_results): + """Creates a cache of workflow yml files retrieved from graphQL. Since + graphql and REST do not have parity, we still need to use rest for most + enumeration calls. This method saves off all yml files, so during org + level enumeration if we perform yml enumeration the cached file is used + instead of making github REST requests. + + Args: + yml_results (list): List of results from individual GraphQL queries + (100 nodes at a time). + """ + for result in yml_results: + owner = result['nameWithOwner'] + + self.workflow_cache[owner] = list() + + if not result['object']: + continue + + for yml_node in result['object']['entries']: + yml_name = yml_node['name'] + if yml_name.lower().endswith('yml') or yml_name.lower().endswith('yaml'): + contents = yml_node['object']['text'] + self.workflow_cache[owner].append((yml_name, contents)) diff --git a/gato/github/__init__.py b/gato/github/__init__.py index a0f8375..284d7c1 100644 --- a/gato/github/__init__.py +++ b/gato/github/__init__.py @@ -1,2 +1,3 @@ from .api import Api +from .gql_queries import GqlQueries from .search import Search diff --git a/gato/github/api.py b/gato/github/api.py index f8629b3..77807cc 100644 --- a/gato/github/api.py +++ b/gato/github/api.py @@ -8,7 +8,7 @@ import io from gato.cli import Output -from datetime import datetime, timezone +from datetime import datetime, timezone, timedelta logger = logging.getLogger(__name__) @@ -19,8 +19,9 @@ class Api(): rate limiting or network issues. """ - RUNNER_RE = re.compile(r'Runner name: \'([\w+-]+)\'') - MACHINE_RE = re.compile(r'Machine name: \'([\w+-]+)\'') + RUNNER_RE = re.compile(r'Runner name: \'([\w+-.]+)\'') + MACHINE_RE = re.compile(r'Machine name: \'([\w+-.]+)\'') + RUN_THRESHOLD = 90 def __init__(self, pat: str, version: str = "2022-11-28", http_proxy: str = None, socks_proxy: str = None, @@ -110,12 +111,29 @@ def __process_run_log(self, log_content: bytes, run_info: dict): Returns: dict: metadata about the run execution. """ - with zipfile.ZipFile(io.BytesIO(log_content)) as runres: + log_package = None + non_ephemeral = False + with zipfile.ZipFile(io.BytesIO(log_content)) as runres: for zipinfo in runres.infolist(): + # TODO use a lambda for this messy logic + if "checkout" in zipinfo.filename or "Checkout" in zipinfo.filename: + with runres.open(zipinfo) as run_setup: + content = run_setup.read().decode() + if "Cleaning the repository" in content: + non_ephemeral = True + + if log_package: + log_package['non_ephemeral'] = non_ephemeral + if "Set up job" in zipinfo.filename: with runres.open(zipinfo) as run_setup: content = run_setup.read().decode() + if "Image Release: https://github.com/actions/runner-images" in content: + # Larger runners will appear to be self-hosted, but + # they will have the image name. Skip if we see this. + continue + if "Runner name" in content or \ "Machine name" in content: @@ -132,9 +150,10 @@ def __process_run_log(self, log_content: bytes, run_info: dict): "runner_name": runner_name, "machine_name": hostname, "run_id": run_info["id"], - "run_attempt": run_info["run_attempt"] + "run_attempt": run_info["run_attempt"], + "non_ephemeral": non_ephemeral } - return log_package + return log_package def __get_full_runlog(self, log_content: bytes, run_name: str): """Gets the full text of the runlog from the zip file by matching the @@ -155,6 +174,24 @@ def __get_full_runlog(self, log_content: bytes, run_name: str): return content + @staticmethod + def __verify_result(response: requests.Response, expected_code: int): + """Verifies that the response matches the expected code. If it does not + match, then the response is logged and the program exits. + + Args: + response (requests.Response): Response object from a request. + expected_code (int): Expected status code from the request. + """ + if response.status_code != expected_code: + logger.warn( + f"Expected status code {expected_code}, but got " + f"{response.status_code}!" + ) + logger.debug(response.text) + return False + return True + def call_get(self, url: str, params: dict = None, strip_auth=False): """Internal method to wrap a GET request so that proxies and headers do not need to be repeated. @@ -212,8 +249,34 @@ def call_post(self, url: str, params: dict = None): return api_response + def call_patch(self, url: str, params: dict = None): + """Internal method to wrap a PATCH request so that proxies and headers + do not need to be updated in each method. + + Args: + url (str): URL path to make PATCH request to. + params (dict, optional): Parameters to send as part of the request. + Defaults to None. + Returns: + Response: Returns the requests response object. + """ + request_url = self.github_url + url + logger.debug(f'Making PATCH API request to {request_url}!') + + api_response = requests.patch(request_url, headers=self.headers, + proxies=self.proxies, json=params, + verify=self.verify_ssl) + logger.debug( + f'The PATCH request to {request_url} returned a ' + f'{api_response.status_code}!') + + self.__check_rate_limit(api_response.headers) + + return api_response + def call_put(self, url: str, params: dict = None): - """_summary_ + """Internal method to wrap a PUT request so that proxies and headers + do not need to be updated in each method. Args: url (stre): _description_ @@ -601,30 +664,59 @@ def retrieve_run_logs(self, repo_name: str, short_circuit: str = True): Returns: list: List of run logs for runs that ran on self-hosted runners. """ - runs = self.call_get(f'/repos/{repo_name}/actions/runs') + start_date = datetime.now() - timedelta(days = 60) + runs = self.call_get( + f'/repos/{repo_name}/actions/runs', params={ + "per_page": "30", + "status":"completed", + "exclude_pull_requests": "true", + "created":f">{start_date.isoformat()}" + } + ) - run_logs = [] + # This is a dictionary so we can de-duplicate runner IDs based on + # the machine_name:runner_name. + run_logs = {} + names = set() if runs.status_code == 200: logger.debug(f'Enumerating runs within {repo_name}') for run in runs.json()['workflow_runs']: + + # We are only interested in runs that actually executed. + if run['conclusion'] != 'success' and \ + run['conclusion'] != 'failure': + continue + + if short_circuit: + # If we are only looking for the presence of SH runners and + # not trying to determine ephmeral vs not from repeats, then + # we just need to look at each branch + wf combination once. + workflow_key = f"{run['head_branch']}:{run['path']}" + if workflow_key in names: + continue + names.add(workflow_key) + run_log = self.call_get( f'/repos/{repo_name}/actions/runs/{run["id"]}/' f'attempts/{run["run_attempt"]}/logs') - if run_log.status_code == 200: run_log = self.__process_run_log(run_log.content, run) if run_log: - run_logs.append(run_log) + key = f"{run_log['machine_name']}:{run_log['runner_name']}" + run_logs[key] = run_log + if short_circuit: - return run_logs + return run_logs.values() + elif run_log.status_code == 410: + break else: logger.debug( f"Call to retrieve run logs from {repo_name} run " f"{run['id']} attempt {run['run_attempt']} returned " f"{run_log.status_code}!") - return run_logs + return run_logs.values() def parse_workflow_runs(self, repo_name: str): """Returns the number of workflow runs associated with the repository. @@ -671,12 +763,12 @@ def get_recent_workflow(self, repo_name: str, sha: str, file_name: str): if data['total_count'] == 0: return 0 - + # find the id of our malicious workflow for workflow in data['workflow_runs']: if f'.github/workflows/{file_name}.yml' in workflow['path']: return workflow['id'] - + return 0 def get_workflow_status(self, repo_name: str, workflow_id: int): @@ -763,12 +855,14 @@ def create_branch(self, repo_name: str, branch_name: str): repo_name (str): Name of repository in Org/Repo format. branch_name (str): Name of branch to create. """ - - resp = self.call_get(f'/repos/{repo_name}/git/refs/heads') + resp = self.call_get(f'/repos/{repo_name}') + default_branch = resp.json()['default_branch'] + resp = self.call_get( + f'/repos/{repo_name}/git/ref/heads/{default_branch}' + ) json_resp = resp.json() - - sha = json_resp[0]['object']['sha'] + sha = json_resp['object']['sha'] branch_data = { "ref": f"refs/heads/{branch_name}", @@ -942,3 +1036,143 @@ def get_repo_org_secrets(self, repo_name: str): secrets = secrets_response['secrets'] return secrets + + def commit_workflow(self, repo_name: str, + target_branch: str, + workflow_contents: bytes, file_name: str, + commit_author: str = "Gato", + commit_email: str = "gato@gato.infosec", + message="Testing"): + """ + Commits a new workflow file to a specified repository. + + This function performs the following steps: + 1. Gets the latest commit SHA of the target branch. + 2. Gets the tree SHA of the latest commit of the new branch. + 3. Gets the tree of the .github/workflows directory. + 4. If the workflows tree exists, it gets the SHA of the workflows tree. + 5. Creates a new tree where all blobs in the .github/workflows tree are removed. + 6. Creates a new commit on the new branch with the new tree. + 7. Updates the new branch to point to the new commit. + + Args: + repo_name (str): The name of the repository. + target_branch (str): The name of the target branch. + workflow_contents (bytes): The content of the new workflow file. + file_name (str): The name of the new workflow file. + commit_author (str, optional): The author of the commit. Defaults to "Gato". + commit_email (str, optional): The email of the commit author. Defaults to "gato@gato.infosec". + message (str, optional): The commit message. Defaults to "Testing". + + Returns: + str: The SHA of the new commit if the commit was successful, None otherwise. + """ + # Step 1: Get latest commit SHA of target branch + r = self.call_get( + f'/repos/{repo_name}' + ) + if self.__verify_result(r, 200) is False: + return None + default_branch = r.json()['default_branch'] + + r = self.call_get( + f'/repos/{repo_name}/commits/{default_branch}' + ) + if self.__verify_result(r, 200) is False: + return None + latest_commit_sha = r.json()['sha'] + + # Step 2: Get tree SHA of latest commit of default + r = self.call_get( + f'/repos/{repo_name}/git/commits/{latest_commit_sha}' + ) + if self.__verify_result(r, 200) is False: + return None + tree_sha = r.json()['tree']['sha'] + + # Step 3: Get the tree of the .github/workflows directory + r = self.call_get( + f'/repos/{repo_name}/git/trees/{tree_sha}', + params={"recursive": "1"} + ) + if self.__verify_result(r, 200) is False: + return None + + base_sha = r.json()['sha'] + tree = r.json()['tree'] + + existing_files = (item for item in tree if '.github/workflows' in + item['path'] and item['type'] == 'blob') + + # Step 4: Create a new tree where all blobs in the .github/workflows + # tree are removed + new_workflow_file_content = base64.b64encode( + workflow_contents + ).decode() + + r = self.call_post(f'/repos/{repo_name}/git/blobs', params={ + "content": new_workflow_file_content, + "encoding": "base64" + }) + if self.__verify_result(r, 201) is False: + return None + + new_tree = [ + { + 'path': f'.github/workflows/{file_name}', + 'mode': '100644', + 'type': 'blob', + 'sha': r.json()['sha'] + } + ] + + # Delete everything else + for existing in existing_files: + # Don't delete the same file - this will happen if the workflow + # already exists (such as a test.yml file) + if existing['path'] == f'.github/workflows/{file_name}': + continue + + new_tree.append({ + 'path': existing['path'], + 'mode': existing['mode'], + 'type': existing['type'], + 'sha': None, + }) + + r = self.call_post( + f'/repos/{repo_name}/git/trees', params={ + 'base_tree': base_sha, + 'tree': new_tree + } + ) + if self.__verify_result(r, 201) is False: + return None + new_tree_sha = r.json()['sha'] + + # Step 5: Create new commit on new branch + r = self.call_post( + f'/repos/{repo_name}/git/commits', params={ + 'message': message, + 'tree': new_tree_sha, + 'parents': [latest_commit_sha], + 'author': { + 'name': commit_author, + 'email': commit_email + } + } + ) + new_commit_sha = r.json()['sha'] + + # Step 6: Update the new branch to point to the new commit + r = self.call_post( + f'/repos/{repo_name}/git/refs', + params={ + 'sha': new_commit_sha, + 'ref': f'refs/heads/{target_branch}' + } + ) + if self.__verify_result(r, 201) is False: + return None + + return new_commit_sha diff --git a/gato/github/gql_queries.py b/gato/github/gql_queries.py new file mode 100644 index 0000000..177c459 --- /dev/null +++ b/gato/github/gql_queries.py @@ -0,0 +1,54 @@ +from gato.models import Repository + +class GqlQueries(): + """Constructs graphql queries for use with the GitHub GraphQL api. + """ + + GET_YMLS = """ + query RepoFiles($node_ids: [ID!]!) { + nodes(ids: $node_ids) { + ... on Repository { + nameWithOwner + object(expression: "HEAD:.github/workflows/") { + ... on Tree { + entries { + name + type + mode + object { + ... on Blob { + byteSize + text + } + } + } + } + } + } + } + } + """ + + @staticmethod + def get_workflow_ymls(repos: list): + """Retrieve workflow yml files for each repository. + + Args: + repos (List[Repository]): List of repository objects + Returns: + (list): List of JSON post parameters for each graphQL query. + """ + queries = [] + + for i in range(0, (len(repos) // 100) + 1): + + top_len = len(repos) if len(repos) < (100 + i*100) else (100 + i*100) + query = { + "query": GqlQueries.GET_YMLS, + "variables": { + "node_ids": [repo.repo_data['node_id'] for repo in repos[0+100*i:top_len]] + } + } + + queries.append(query) + return queries \ No newline at end of file diff --git a/gato/github/search.py b/gato/github/search.py index c793c00..88adf6c 100644 --- a/gato/github/search.py +++ b/gato/github/search.py @@ -46,7 +46,7 @@ def search_enumeration( if custom_query: query['q'] = custom_query else: - query['q'] = f'self-hosted org:{organization} language:yaml path:.github/workflows', + query['q'] = f'self-hosted org:{organization} language:yaml path:.github/workflows' next_page = f"/search/code?q={query['q']}&sort={query['sort']}" \ f"&per_page={query['per_page']}&page={query['page']}" diff --git a/gato/models/runner.py b/gato/models/runner.py index 5739a12..96626f6 100644 --- a/gato/models/runner.py +++ b/gato/models/runner.py @@ -11,7 +11,8 @@ def __init__( machine_name=None, os=None, status=None, - labels=[]): + labels=[], + non_ephemeral=False): """Constructor for runner wrapper object. Args: @@ -27,6 +28,7 @@ def __init__( self.os = os self.status = status self.labels = labels + self.non_ephemeral = non_ephemeral def toJSON(self): """Converts the repository to a Gato JSON representation. @@ -37,7 +39,8 @@ def toJSON(self): else "Unknown", "os": self.os if self.os else "Unknown", "status": self.status if self.status else "Unknown", - "labels": [label for label in self.labels] + "labels": [label for label in self.labels], + "non_ephemeral": self.non_ephemeral } return representation diff --git a/gato/search/search.py b/gato/search/search.py index db071bf..355ccde 100644 --- a/gato/search/search.py +++ b/gato/search/search.py @@ -1,4 +1,6 @@ import logging +import requests +import json from gato.github import Search from gato.github import Api @@ -57,6 +59,62 @@ def __setup_user_info(self): return True + def use_sourcegraph_api( + self, + organization: str, + query=None, + output_text=None): + """ + This method is used to search for repositories in an organization using the Sourcegraph API. + It constructs a search query and sends a GET request to the Sourcegraph search API. + The results are streamed and added to a set. + + Args: + organization (str): The name of the organization to search in. + query (str, optional): A custom search query. If not provided, a default query is used. + + Returns: + set: A set of search results. + """ + repo_filter = f"repo:{organization}/ " if organization else "" + url = "https://sourcegraph.com/.api/search/stream" + headers = {"Content-Type": "application/json"} + params = { + "q": ( + "('self-hosted' OR " + "(/runs-on/ AND NOT " + "/(ubuntu-16.04|ubuntu-18.04|ubuntu-20.04|ubuntu-22.04|ubuntu-latest|" + "windows-2019|windows-2022|windows-latest|macos-11|macos-12|macos-13|" + "macos-12-xl|macos-13-xl|macos-latest|matrix.[a-zA-Z]\\s)/)) " + f"{repo_filter}" + "lang:YAML file:.github/workflows/ count:30000" + ) + } + if query: + Output.info( + f"Searching SourceGraph with the following query: {Output.bright(query)}" + ) + params["q"] = query + else: + Output.info( + f"Searching SourceGraph with the default Gato query: {Output.bright(params['q'])}" + ) + response = requests.get(url, headers=headers, params=params, stream=True) + results = set() + + if response.status_code == 200: + for line in response.iter_lines(): + if line and line.decode().startswith("data:"): + json_line = line.decode().replace("data:", "").strip() + event = json.loads(json_line) + for element in event: + if "repository" in element: + results.add( + element["repository"].replace("github.com/", "") + ) + + return results + def use_search_api(self, organization: str, query=None): """Utilize GitHub Code Search API to try and identify repositories using self-hosted runners. This is subject to a high false-positive @@ -95,10 +153,30 @@ def use_search_api(self, organization: str, query=None): organization, custom_query=query ) + return candidates + + def present_results(self, results, output_text=None): + """ + This method is used to present the results of the search. It first + prints the number of non-fork repositories that matched the criteria. + If an output_text file path is provided, it writes the results into + that file. Finally, it prints each result in a tabbed format. + + Args: + results (list): A list of non-fork repositories that matched the + criteria. + output_text (str, optional): The file path where the results + should be written. Defaults to None. + """ Output.result( - f"Identified {len(candidates)} non-fork repositories that matched " + f"Identified {len(results)} non-fork repositories that matched " "the criteria!" ) - for candidate in candidates: + if output_text: + with open(output_text, "w") as file_output: + for candidate in results: + file_output.write(f"{candidate}\n") + + for candidate in results: Output.tabbed(candidate) diff --git a/gato/workflow_parser/workflow_parser.py b/gato/workflow_parser/workflow_parser.py index 3e82ecd..f90d4ce 100644 --- a/gato/workflow_parser/workflow_parser.py +++ b/gato/workflow_parser/workflow_parser.py @@ -2,6 +2,7 @@ import yaml from pathlib import Path import os +import re logger = logging.getLogger(__name__) @@ -16,6 +17,30 @@ class WorkflowParser(): as the project grows in capability. """ + GITHUB_HOSTED_LABELS = [ + 'ubuntu-latest', + 'macos-latest', + 'macOS-latest', + 'windows-latest', + 'ubuntu-18.04', # deprecated, but we don't want false positives on older repos. + 'ubuntu-20.04', + 'ubuntu-22.04', + 'windows-2022', + 'windows-2019', + 'windows-2016', # deprecated, but we don't want false positives on older repos. + 'macOS-13', + 'macOS-12', + 'macOS-11', + 'macos-11', + 'macos-12', + 'macos-13', + 'macos-13-xl', + 'macos-12', + ] + + LARGER_RUNNER_REGEX_LIST = r'(windows|ubuntu)-(22.04|20.04|2019-2022)-(4|8|16|32|64)core-(16|32|64|128|256)gb' + MATRIX_KEY_EXTRACTION_REGEX = r'{{\s*matrix\.([\w-]+)\s*}}' + def __init__(self, workflow_yml: str, repo_name: str, workflow_name: str): """Initialize class with workflow file. @@ -54,13 +79,64 @@ def self_hosted(self): list: List of jobs within the workflow that utilize self-hosted runners. """ - sh_jobs = [] + if 'jobs' not in self.parsed_yml: + return sh_jobs + for jobname, job_details in self.parsed_yml['jobs'].items(): if 'runs-on' in job_details: runs_on = job_details['runs-on'] + # Clear cut if 'self-hosted' in runs_on: sh_jobs.append((jobname, job_details)) + elif 'matrix.' in runs_on: + # We need to check each OS in the matrix strategy. + # Extract the matrix key from the variable + matrix_match = re.search(self.MATRIX_KEY_EXTRACTION_REGEX, runs_on) + + if matrix_match: + matrix_key = matrix_match.group(1) + else: + continue + # Check if strategy exists in the yaml file + if 'strategy' in job_details and 'matrix' in job_details['strategy']: + matrix = job_details['strategy']['matrix'] + + # Use previously acquired key to retrieve list of OSes + if matrix_key in matrix: + os_list = matrix[matrix_key] + elif 'include' in matrix: + inclusions = matrix['include'] + os_list = [] + for inclusion in inclusions: + if matrix_key in inclusion: + os_list.append(inclusion[matrix_key]) + else: + continue + + # We only need ONE to be self hosted, others can be + # GitHub hosted + for key in os_list: + if type(key) == str: + if key not in self.GITHUB_HOSTED_LABELS and not re.match(self.LARGER_RUNNER_REGEX_LIST, key): + sh_jobs.append((jobname, job_details)) + break + pass + else: + if type(runs_on) == list: + for label in runs_on: + if label in self.GITHUB_HOSTED_LABELS: + break + if re.match(self.LARGER_RUNNER_REGEX_LIST, label): + break + else: + sh_jobs.append((jobname, job_details)) + elif type(runs_on) == str: + if runs_on in self.GITHUB_HOSTED_LABELS: + break + if re.match(self.LARGER_RUNNER_REGEX_LIST, runs_on): + break + sh_jobs.append((jobname, job_details)) return sh_jobs diff --git a/pyproject.toml b/pyproject.toml index 0378c85..7b29f9c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "praetorian-gato" -version = "1.5.1" +version = "1.6.0" description = "GitHub Actions Enumeration and Attack Framework" readme = "readme.md" authors = [ diff --git a/test/test_cases.json b/test/test_cases.json index 57d1820..7c5b5bb 100644 --- a/test/test_cases.json +++ b/test/test_cases.json @@ -119,7 +119,7 @@ "type": "stdout" }, { - "expect": "[+] The repository contains a workflow: main.yml that executes on self-hosted runners!", + "expect": "[+] The repository contains a workflow: main.yml that might execute on self-hosted runners!", "type": "stdout" } ], diff --git a/unit_test/test_api.py b/unit_test/test_api.py index 4589a37..8899700 100644 --- a/unit_test/test_api.py +++ b/unit_test/test_api.py @@ -456,7 +456,7 @@ def test_retrieve_run_logs(mock_get): mock_get.return_value.json.return_value = { "workflow_runs": [ - {"id": 123, "run_attempt": 1} + {"id": 123, "run_attempt": 1, "conclusion": "success", "head_branch": "dev", "path": ".github/workflows/build.yml@dev"} ] } @@ -469,14 +469,14 @@ def test_retrieve_run_logs(mock_get): logs = abstraction_layer.retrieve_run_logs("testOrg/testRepo") assert len(logs) == 1 - assert logs[0]['runner_name'] == 'ghrunner-test' + assert list(logs)[0]['runner_name'] == 'ghrunner-test' logs = abstraction_layer.retrieve_run_logs( "testOrg/testRepo", short_circuit=False ) assert len(logs) == 1 - assert logs[0]['runner_name'] == 'ghrunner-test' + assert list(logs)[0]['runner_name'] == 'ghrunner-test' @patch("gato.github.api.requests.get") @@ -670,7 +670,10 @@ def test_create_branch(mock_get, mock_post): test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" mock_get.return_value.status_code = 200 - mock_get.return_value.json.return_value = [ + mock_get.return_value.json.side_effect = [ + { + "default_branch": "dev" + }, { "ref": "refs/heads/dev", "node_id": "REF_AAAAAAAAAAAAAAAAAAAAAAAAAAA", @@ -698,7 +701,10 @@ def test_create_branch_fail(mock_get, mock_post): test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" mock_get.return_value.status_code = 200 - mock_get.return_value.json.return_value = [ + mock_get.return_value.json.side_effect = [ + { + "default_branch": "dev" + }, { "ref": "refs/heads/dev", "node_id": "REF_AAAAAAAAAAAAAAAAAAAAAAAAAAA", @@ -920,3 +926,85 @@ def test_handle_ratelimit(mock_time): api._Api__check_rate_limit(test_headers) mock_time.sleep.assert_called_once() + + +@patch('gato.github.api.requests.get') +@patch('gato.github.api.requests.post') +def test_commit_workflow(mock_call_post, mock_call_get): + # Arrange + test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + api = Api(test_pat, "2022-11-28") + mock_call_get.side_effect = [ + MagicMock(status_code=200, json=MagicMock(return_value={'default_branch': 'main'})), + MagicMock(status_code=200, json=MagicMock(return_value={'sha': '123'})), + MagicMock(status_code=200, json=MagicMock(return_value={'tree': {'sha': '456'}})), + MagicMock(status_code=200, json=MagicMock(return_value={'sha': '789', 'tree': []})) + ] + mock_call_post.side_effect = [ + MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'abc'})), + MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'def'})), + MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'ghi'})), + MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'jkl'})) + ] + + # Act + result = api.commit_workflow('test_repo', 'test_branch', b'test_content', 'test_file') + + # Assert + assert result == 'ghi' + assert mock_call_get.call_count == 4 + assert mock_call_post.call_count == 4 + + +@patch('gato.github.api.requests.get') +@patch('gato.github.api.requests.post') +def test_commit_workflow_failure(mock_call_post, mock_call_get): + # Arrange + test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + api = Api(test_pat, "2022-11-28") + mock_call_get.side_effect = [ + MagicMock(status_code=200, json=MagicMock(return_value={'default_branch': 'main'})), + MagicMock(status_code=200, json=MagicMock(return_value={'sha': '123'})), + MagicMock(status_code=200, json=MagicMock(return_value={'tree': {'sha': '456'}})), + MagicMock(status_code=200, json=MagicMock(return_value={'sha': '789', 'tree': []})) + ] + mock_call_post.side_effect = [ + MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'abc'})), + MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'def'})), + MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'ghi'})), + MagicMock(status_code=400, json=MagicMock(return_value={'sha': 'jkl'})) + ] + + # Act + result = api.commit_workflow('test_repo', 'test_branch', b'test_content', 'test_file') + + # Assert + assert result is None + assert mock_call_get.call_count == 4 + assert mock_call_post.call_count == 4 + + +@patch('gato.github.api.requests.get') +@patch('gato.github.api.requests.post') +def test_commit_workflow_failure2(mock_call_post, mock_call_get): + # Arrange + test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA" + api = Api(test_pat, "2022-11-28") + mock_call_get.side_effect = [ + MagicMock(status_code=200, json=MagicMock(return_value={'default_branch': 'main'})), + MagicMock(status_code=200, json=MagicMock(return_value={'sha': '123'})), + MagicMock(status_code=200, json=MagicMock(return_value={'tree': {'sha': '456'}})), + MagicMock(status_code=200, json=MagicMock(return_value={'sha': '789', 'tree': []})) + ] + mock_call_post.side_effect = [ + MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'abc'})), + MagicMock(status_code=404, json=MagicMock(return_value=None)), + ] + + # Act + result = api.commit_workflow('test_repo', 'test_branch', b'test_content', 'test_file') + + # Assert + assert result is None + assert mock_call_get.call_count == 4 + assert mock_call_post.call_count == 2 diff --git a/unit_test/test_enumerate.py b/unit_test/test_enumerate.py index d5601fb..22d346a 100644 --- a/unit_test/test_enumerate.py +++ b/unit_test/test_enumerate.py @@ -103,7 +103,7 @@ def test_enumerate_repo_admin(mock_api, capsys): } mock_api.return_value.retrieve_run_logs.return_value = [ - {"machine_name": "unittest1", "runner_name": "much_unit_such_test"} + {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False} ] repo_data = json.loads(json.dumps(TEST_REPO_DATA)) @@ -143,7 +143,7 @@ def test_enumerate_repo_admin_no_wf(mock_api, capsys): } mock_api.return_value.retrieve_run_logs.return_value = [ - {"machine_name": "unittest1", "runner_name": "much_unit_such_test"} + {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False} ] repo_data = json.loads(json.dumps(TEST_REPO_DATA)) @@ -183,7 +183,7 @@ def test_enumerate_repo_no_wf_no_admin(mock_api, capsys): } mock_api.return_value.retrieve_run_logs.return_value = [ - {"machine_name": "unittest1", "runner_name": "much_unit_such_test"} + {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False} ] repo_data = json.loads(json.dumps(TEST_REPO_DATA)) @@ -222,7 +222,7 @@ def test_enumerate_repo_no_wf_maintain(mock_api, capsys): } mock_api.return_value.retrieve_run_logs.return_value = [ - {"machine_name": "unittest1", "runner_name": "much_unit_such_test"} + {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False} ] repo_data = json.loads(json.dumps(TEST_REPO_DATA)) @@ -262,7 +262,7 @@ def test_enumerate_repo_only(mock_api, capsys): } mock_api.return_value.retrieve_run_logs.return_value = [ - {"machine_name": "unittest1", "runner_name": "much_unit_such_test"} + {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False} ] repo_data = json.loads(json.dumps(TEST_REPO_DATA)) diff --git a/unit_test/test_repo_enumerate.py b/unit_test/test_repo_enumerate.py index 3f049d4..3b2d23c 100644 --- a/unit_test/test_repo_enumerate.py +++ b/unit_test/test_repo_enumerate.py @@ -45,7 +45,7 @@ def test_enumerate_repo(): } mock_api.retrieve_run_logs.return_value = [ - {"machine_name": "unittest1", "runner_name": "much_unit_such_test"} + {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False} ] repo_data = json.loads(json.dumps(TEST_REPO_DATA)) @@ -75,7 +75,7 @@ def test_enumerate_repo_admin(): } mock_api.retrieve_run_logs.return_value = [ - {"machine_name": "unittest1", "runner_name": "much_unit_such_test"} + {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False} ] repo_data = json.loads(json.dumps(TEST_REPO_DATA))