From f5d965d5e82b80f0763f1e43c49864a8eb05530e Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Wed, 5 Feb 2025 11:54:22 +0100 Subject: [PATCH 1/4] Add new GitHub Action ecosystem --- README.md | 15 ++++++ guarddog/analyzer/metadata/__init__.py | 3 ++ .../metadata/github_action/__init__.py | 11 ++++ guarddog/analyzer/sourcecode/__init__.py | 3 ++ guarddog/ecosystems.py | 3 ++ guarddog/scanners/__init__.py | 3 ++ guarddog/scanners/github_action_scanner.py | 51 +++++++++++++++++++ tests/core/test_github_action_scanner.py | 22 ++++++++ 8 files changed, 111 insertions(+) create mode 100644 guarddog/analyzer/metadata/github_action/__init__.py create mode 100644 guarddog/scanners/github_action_scanner.py create mode 100644 tests/core/test_github_action_scanner.py diff --git a/README.md b/README.md index c575515b..d1cfb25e 100644 --- a/README.md +++ b/README.md @@ -155,6 +155,21 @@ Metadata heuristics: | typosquatting | Identify packages that are named closely to an highly popular package | +### GitHub Action + +Source code heuristics: + +| **Heuristic** | **Description** | +|:-------------:|:---------------:| +| npm-serialize-environment | Identify when a package serializes 'process.env' to exfiltrate environment variables | +| npm-obfuscation | Identify when a package uses a common obfuscation method often used by malware | +| npm-silent-process-execution | Identify when a package silently executes an executable | +| shady-links | Identify when a package contains an URL to a domain with a suspicious extension | +| npm-exec-base64 | Identify when a package dynamically executes code through 'eval' | +| npm-install-script | Identify when a package has a pre or post-install script automatically running commands | +| npm-steganography | Identify when a package retrieves hidden data from an image and executes it | +| npm-dll-hijacking | Identifies when a malicious package manipulates a trusted application into loading a malicious DLL | +| npm-exfiltrate-sensitive-data | 
Identify when a package reads and exfiltrates sensitive data from the local system | ## Custom Rules diff --git a/guarddog/analyzer/metadata/__init__.py b/guarddog/analyzer/metadata/__init__.py index 8cb94385..0853e650 100644 --- a/guarddog/analyzer/metadata/__init__.py +++ b/guarddog/analyzer/metadata/__init__.py @@ -2,6 +2,7 @@ from guarddog.analyzer.metadata.npm import NPM_METADATA_RULES from guarddog.analyzer.metadata.pypi import PYPI_METADATA_RULES from guarddog.analyzer.metadata.go import GO_METADATA_RULES +from guarddog.analyzer.metadata.github_action import GITHUB_ACTION_METADATA_RULES from guarddog.ecosystems import ECOSYSTEM @@ -13,3 +14,5 @@ def get_metadata_detectors(ecosystem: ECOSYSTEM) -> dict[str, Detector]: return NPM_METADATA_RULES case ECOSYSTEM.GO: return GO_METADATA_RULES + case ECOSYSTEM.GITHUB_ACTION: + return GITHUB_ACTION_METADATA_RULES diff --git a/guarddog/analyzer/metadata/github_action/__init__.py b/guarddog/analyzer/metadata/github_action/__init__.py new file mode 100644 index 00000000..ad3fccb0 --- /dev/null +++ b/guarddog/analyzer/metadata/github_action/__init__.py @@ -0,0 +1,11 @@ +from typing import Type + +from guarddog.analyzer.metadata import Detector + +GITHUB_ACTION_METADATA_RULES = {} + +classes: list[Type[Detector]] = [] + +for detectorClass in classes: + detectorInstance = detectorClass() # type: ignore + GITHUB_ACTION_METADATA_RULES[detectorInstance.get_name()] = detectorInstance diff --git a/guarddog/analyzer/sourcecode/__init__.py b/guarddog/analyzer/sourcecode/__init__.py index 08a06320..44a46673 100644 --- a/guarddog/analyzer/sourcecode/__init__.py +++ b/guarddog/analyzer/sourcecode/__init__.py @@ -51,6 +51,9 @@ def get_sourcecode_rules( ecosystem: The ecosystem to filter for if rules are ecosystem specific kind: The kind of rule to filter for """ + if ecosystem == ECOSYSTEM.GITHUB_ACTION: + ecosystem = ECOSYSTEM.NPM + for rule in SOURCECODE_RULES: if kind and not isinstance(rule, kind): continue diff --git 
a/guarddog/ecosystems.py b/guarddog/ecosystems.py index 8878a6de..fcd73c73 100644 --- a/guarddog/ecosystems.py +++ b/guarddog/ecosystems.py @@ -5,6 +5,7 @@ class ECOSYSTEM(Enum): PYPI = "pypi" NPM = "npm" GO = "go" + GITHUB_ACTION = "github-action" def get_friendly_name(ecosystem: ECOSYSTEM) -> str: @@ -15,5 +16,7 @@ def get_friendly_name(ecosystem: ECOSYSTEM) -> str: return "npm" case ECOSYSTEM.GO: return "go" + case ECOSYSTEM.GITHUB_ACTION: + return "GitHub Action" case _: return ecosystem.value diff --git a/guarddog/scanners/__init__.py b/guarddog/scanners/__init__.py index 03060050..515c0871 100644 --- a/guarddog/scanners/__init__.py +++ b/guarddog/scanners/__init__.py @@ -6,6 +6,7 @@ from .pypi_project_scanner import PypiRequirementsScanner from .go_package_scanner import GoModuleScanner from .go_project_scanner import GoDependenciesScanner +from .github_action_scanner import GithubActionScanner from .scanner import PackageScanner, ProjectScanner from ..ecosystems import ECOSYSTEM @@ -29,6 +30,8 @@ def get_package_scanner(ecosystem: ECOSYSTEM) -> Optional[PackageScanner]: return NPMPackageScanner() case ECOSYSTEM.GO: return GoModuleScanner() + case ECOSYSTEM.GITHUB_ACTION: + return GithubActionScanner() return None diff --git a/guarddog/scanners/github_action_scanner.py b/guarddog/scanners/github_action_scanner.py new file mode 100644 index 00000000..791e4835 --- /dev/null +++ b/guarddog/scanners/github_action_scanner.py @@ -0,0 +1,51 @@ +import logging +import os +import pathlib +import typing +from urllib.parse import urlparse + +from guarddog.analyzer.analyzer import Analyzer +from guarddog.ecosystems import ECOSYSTEM +from guarddog.scanners.scanner import PackageScanner + +log = logging.getLogger("guarddog") + + +class GithubActionScanner(PackageScanner): + def __init__(self) -> None: + super().__init__(Analyzer(ECOSYSTEM.GITHUB_ACTION)) + + def download_and_get_package_info(self, directory: str, package_name: str, version=None) -> typing.Tuple[dict, str]: 
+ repo = self._get_repo(package_name) + tarball_url = self._get_git_tarball_url(repo, version) + + log.debug(f"Downloading GitHub Action source from {tarball_url}") + + file_extension = pathlib.Path(tarball_url).suffix + if file_extension == "": + file_extension = ".zip" + + zippath = os.path.join(directory, package_name.replace("/", "-") + file_extension) + unzippedpath = zippath.removesuffix(file_extension) + self.download_compressed(tarball_url, zippath, unzippedpath) + + return {}, unzippedpath + + def _get_repo(self, url: str) -> str: + parsed_url = urlparse(url) + + if parsed_url.hostname and parsed_url.hostname != "github.com": + raise Exception(parsed_url) + + path = parsed_url.path.removesuffix(".git").strip("/") + + if path.count("/") != 1: + raise Exception("Invalid GitHub repo URL: " + url) + + return path + + def _get_git_tarball_url(self, repo: str, version=None) -> str: + if not version: + return f"https://api.github.com/repos/{repo}/zipball" + else: + return f"https://github.com/{repo}/archive/refs/tags/{version}.zip" diff --git a/tests/core/test_github_action_scanner.py b/tests/core/test_github_action_scanner.py new file mode 100644 index 00000000..0475e92f --- /dev/null +++ b/tests/core/test_github_action_scanner.py @@ -0,0 +1,22 @@ +import os.path +import tempfile + +import pytest + +from guarddog.scanners import GithubActionScanner + + +def test_download_and_get_github_action_by_url(): + scanner = GithubActionScanner() + with tempfile.TemporaryDirectory() as tmpdirname: + data, path = scanner.download_and_get_package_info(tmpdirname, "https://github.com/expressjs/express.git", "v5.0.0") + assert not data + assert os.path.exists(os.path.join(tmpdirname, "https:--github.com-expressjs-express.git", "express-5.0.0", "package.json")) + + +def test_download_and_get_github_action_by_name(): + scanner = GithubActionScanner() + with tempfile.TemporaryDirectory() as tmpdirname: + data, path = scanner.download_and_get_package_info(tmpdirname, 
"expressjs/express", "v5.0.0") + assert not data + assert os.path.exists(os.path.join(tmpdirname, "expressjs-express", "express-5.0.0", "package.json")) From ae7a8bba236b126845e7f01e51e1270942573194 Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Wed, 5 Feb 2025 12:05:39 +0100 Subject: [PATCH 2/4] Use actual action in tests --- tests/core/test_github_action_scanner.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/core/test_github_action_scanner.py b/tests/core/test_github_action_scanner.py index 0475e92f..52bb0f99 100644 --- a/tests/core/test_github_action_scanner.py +++ b/tests/core/test_github_action_scanner.py @@ -9,14 +9,14 @@ def test_download_and_get_github_action_by_url(): scanner = GithubActionScanner() with tempfile.TemporaryDirectory() as tmpdirname: - data, path = scanner.download_and_get_package_info(tmpdirname, "https://github.com/expressjs/express.git", "v5.0.0") + data, path = scanner.download_and_get_package_info(tmpdirname, "https://github.com/actions/checkout.git", "v4.2.2") assert not data - assert os.path.exists(os.path.join(tmpdirname, "https:--github.com-expressjs-express.git", "express-5.0.0", "package.json")) + assert os.path.exists(os.path.join(tmpdirname, "https:--github.com-actions-checkout.git", "checkout-4.2.2", "package.json")) def test_download_and_get_github_action_by_name(): scanner = GithubActionScanner() with tempfile.TemporaryDirectory() as tmpdirname: - data, path = scanner.download_and_get_package_info(tmpdirname, "expressjs/express", "v5.0.0") + data, path = scanner.download_and_get_package_info(tmpdirname, "actions/checkout", "v4.2.2") assert not data - assert os.path.exists(os.path.join(tmpdirname, "expressjs-express", "express-5.0.0", "package.json")) + assert os.path.exists(os.path.join(tmpdirname, "actions-checkout", "checkout-4.2.2", "package.json")) From 09fe976f01185b6ad4f37b93c7743bc5c37cb29b Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Wed, 5 Feb 2025 13:58:54 +0100 Subject: 
[PATCH 3/4] Use ValueError exceptions --- guarddog/scanners/github_action_scanner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/guarddog/scanners/github_action_scanner.py b/guarddog/scanners/github_action_scanner.py index 791e4835..bd4fa173 100644 --- a/guarddog/scanners/github_action_scanner.py +++ b/guarddog/scanners/github_action_scanner.py @@ -35,12 +35,12 @@ def _get_repo(self, url: str) -> str: parsed_url = urlparse(url) if parsed_url.hostname and parsed_url.hostname != "github.com": - raise Exception(parsed_url) + raise ValueError("Invalid GitHub repo URL: " + url) path = parsed_url.path.removesuffix(".git").strip("/") if path.count("/") != 1: - raise Exception("Invalid GitHub repo URL: " + url) + raise ValueError("Invalid GitHub repo name: " + path) return path From 2343238df501d4f655131cc9281ce44e0bd7c8bb Mon Sep 17 00:00:00 2001 From: Julien Doutre Date: Wed, 5 Feb 2025 14:21:20 +0100 Subject: [PATCH 4/4] Remove masking logic --- guarddog/analyzer/sourcecode/__init__.py | 45 ++++++++++++------------ 1 file changed, 22 insertions(+), 23 deletions(-) diff --git a/guarddog/analyzer/sourcecode/__init__.py b/guarddog/analyzer/sourcecode/__init__.py index 44a46673..0c47b630 100644 --- a/guarddog/analyzer/sourcecode/__init__.py +++ b/guarddog/analyzer/sourcecode/__init__.py @@ -51,9 +51,6 @@ def get_sourcecode_rules( ecosystem: The ecosystem to filter for if rules are ecosystem specific kind: The kind of rule to filter for """ - if ecosystem == ECOSYSTEM.GITHUB_ACTION: - ecosystem = ECOSYSTEM.NPM - for rule in SOURCECODE_RULES: if kind and not isinstance(rule, kind): continue @@ -74,34 +71,36 @@ def get_sourcecode_rules( data = yaml.load(fd, Loader=SafeLoader) for rule in data["rules"]: for lang in rule["languages"]: - ecosystem = None + ecosystems = set() match lang: case "python": - ecosystem = ECOSYSTEM.PYPI + ecosystems.add(ECOSYSTEM.PYPI) case "javascript" | "typescript" | "json": - ecosystem = ECOSYSTEM.NPM + 
ecosystems.add(ECOSYSTEM.NPM) + ecosystems.add(ECOSYSTEM.GITHUB_ACTION) case "go": - ecosystem = ECOSYSTEM.GO + ecosystems.add(ECOSYSTEM.GO) case _: continue - # avoids duplicates when multiple languages are supported by a rule - if not next( - filter( - lambda r: r.id == rule["id"], - get_sourcecode_rules(ecosystem, SempgrepRule), - ), - None, - ): - SOURCECODE_RULES.append( - SempgrepRule( - id=rule["id"], - ecosystem=ecosystem, - description=rule.get("metadata", {}).get("description", ""), - file=file_name, - rule_content=rule, + for ecosystem in ecosystems: + # avoids duplicates when multiple languages are supported by a rule + if not next( + filter( + lambda r: r.id == rule["id"], + get_sourcecode_rules(ecosystem, SempgrepRule), + ), + None, + ): + SOURCECODE_RULES.append( + SempgrepRule( + id=rule["id"], + ecosystem=ecosystem, + description=rule.get("metadata", {}).get("description", ""), + file=file_name, + rule_content=rule, + ) ) - ) yara_rule_file_names = list( filter(lambda x: x.endswith("yar"), os.listdir(current_dir))