From 3b09717b3d59ecb27c9b8687823fdd56e2e0fab9 Mon Sep 17 00:00:00 2001
From: Matheus Tosta <me@mtosta.dev>
Date: Thu, 3 Oct 2024 14:52:55 -0400
Subject: [PATCH] PENG-2369 implement the jobbergate agent snap (#628)

* PENG-2369 implement the jobbergate agent snap

* PENG-2369 improvements in the README, as well as an improvement in the configure hook for better readability

* PENG-2369 add apps to the snap for starting, stopping and restarting the agent daemon
---
 .github/workflows/publish_on_tag.yaml         | 22 +++++
 .github/workflows/test_on_push.yaml           | 18 ++++
 Makefile                                      |  3 +
 jobbergate-agent-snap/.gitignore              |  9 ++
 jobbergate-agent-snap/Makefile                | 34 +++++++
 jobbergate-agent-snap/README.md               | 77 +++++++++++++++
 jobbergate-agent-snap/hooks/bin/configure     | 82 ++++++++++++++++
 jobbergate-agent-snap/mypy.ini                |  3 +
 jobbergate-agent-snap/ruff.toml               | 11 +++
 jobbergate-agent-snap/snap/snapcraft.yaml     | 98 +++++++++++++++++++
 jobbergate-agent-snap/tox.ini                 | 24 +++++
 .../wrappers/commands/daemon.restart          |  6 ++
 .../wrappers/commands/daemon.start            | 26 +++++
 .../wrappers/commands/daemon.stop             |  5 +
 jobbergate-agent/jobbergate_agent/settings.py | 12 +--
 15 files changed, 424 insertions(+), 6 deletions(-)
 create mode 100644 jobbergate-agent-snap/.gitignore
 create mode 100644 jobbergate-agent-snap/Makefile
 create mode 100644 jobbergate-agent-snap/README.md
 create mode 100755 jobbergate-agent-snap/hooks/bin/configure
 create mode 100644 jobbergate-agent-snap/mypy.ini
 create mode 100644 jobbergate-agent-snap/ruff.toml
 create mode 100644 jobbergate-agent-snap/snap/snapcraft.yaml
 create mode 100644 jobbergate-agent-snap/tox.ini
 create mode 100755 jobbergate-agent-snap/wrappers/commands/daemon.restart
 create mode 100755 jobbergate-agent-snap/wrappers/commands/daemon.start
 create mode 100755 jobbergate-agent-snap/wrappers/commands/daemon.stop

diff --git a/.github/workflows/publish_on_tag.yaml b/.github/workflows/publish_on_tag.yaml
index 0109224bd..5a0017995 100644
--- a/.github/workflows/publish_on_tag.yaml
+++ b/.github/workflows/publish_on_tag.yaml
@@ -4,6 +4,7 @@ name: Publish on Tag
 #   - Is triggered when a new version is tagged;
 #   - Check if the version for each sub-project matches the tag;
 #   - Build and release the packages to PyPI.
+#   - Build the agent snap and upload to the snap store.
 
 on:
   push:
@@ -70,3 +71,24 @@ jobs:
           poetry config pypi-token.pypi ${{ secrets.OMNIVECTOR_PYPI_TOKEN }}
           poetry build
           poetry publish
+
+  snapstore:
+    runs-on: ubuntu-24.04
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+
+      - name: Set up Snapcraft
+        uses: snapcore/action-build@v1
+        id: snapcraft
+        with:
+          snapcraft-args: "--destructive-mode"
+          snapcraft-channel: "8.x/stable"
+          path: "./jobbergate-agent-snap"
+
+      - name: Release to Edge
+        run: |
+          snapcraft upload --release latest/edge,latest/candidate ${{ steps.snapcraft.outputs.snap }}
+        env:
+          SNAPCRAFT_STORE_CREDENTIALS: ${{ secrets.SNAPCRAFT_STORE_CREDENTIALS }}
diff --git a/.github/workflows/test_on_push.yaml b/.github/workflows/test_on_push.yaml
index a3539196f..5a58bdbe8 100644
--- a/.github/workflows/test_on_push.yaml
+++ b/.github/workflows/test_on_push.yaml
@@ -130,3 +130,21 @@ jobs:
           file: tests/coverage.xml
           disable_search: true
           fail_ci_if_error: true
+
+  agent-snap-tests:
+    name: "jobbergate-agent-snap tests"
+    runs-on: "ubuntu-22.04"
+    steps:
+      - uses: actions/checkout@v3
+      - uses: actions/setup-python@v4
+        with:
+          python-version: "3.12.3"
+          architecture: "x64"
+          cache: "pip"
+          cache-dependency-path: |
+            .github/workflows/test_on_push.yaml
+      - name: "run quality control checks"
+        working-directory: jobbergate-agent-snap
+        run: |
+          python -m pip install tox==4.20.0
+          make qa
diff --git a/Makefile b/Makefile
index 6055ada2b..579be14ab 100644
--- a/Makefile
+++ b/Makefile
@@ -3,11 +3,14 @@ SHELL:= /bin/bash
 qa:
 	$(MAKE) -C jobbergate-api qa
 	$(MAKE) -C jobbergate-cli qa
+	$(MAKE) -C jobbergate-agent-snap qa
 
 format:
 	$(MAKE) -C jobbergate-api format
 	$(MAKE) -C jobbergate-cli format
+	$(MAKE) -C jobbergate-agent-snap format
 
 clean:
 	$(MAKE) -C jobbergate-api clean
 	$(MAKE) -C jobbergate-cli clean
+	$(MAKE) -C jobbergate-agent-snap clean
diff --git a/jobbergate-agent-snap/.gitignore b/jobbergate-agent-snap/.gitignore
new file mode 100644
index 000000000..489604030
--- /dev/null
+++ b/jobbergate-agent-snap/.gitignore
@@ -0,0 +1,9 @@
+.venv
+.mypy_cache
+.ruff_cache
+__pycache__
+*.snap
+parts/
+prime/
+stage/
+.tox
\ No newline at end of file
diff --git a/jobbergate-agent-snap/Makefile b/jobbergate-agent-snap/Makefile
new file mode 100644
index 000000000..0e51d5bfb
--- /dev/null
+++ b/jobbergate-agent-snap/Makefile
@@ -0,0 +1,34 @@
+SHELL:=/bin/bash
+
+.PHONY: lint
+lint: ## Run the linters
+	tox -e lint
+
+.PHONY: type
+type: ## Run the type checker
+	tox -e type
+
+.PHONY: qa
+qa: lint type ## Run all quality assurance checks
+
+.PHONY: format
+format: ## Run the formatter
+	tox -e format
+
+.PHONY: clean
+clean: ## Clean all files/folders created by the project
+	@# Stray interpreter and editor files.
+	@find . -iname '*.pyc' -delete
+	@find . -iname '*.pyo' -delete
+	@find . -iname '*~' -delete
+	@find . -iname '*.swp' -delete
+	@# Match zip archives by extension; a bare '.zip' pattern only matches a file with that exact name.
+	@find . -type f -iname '*.zip' -delete
+	@find . -type f -name "*.snap" -delete
+	@# -delete cannot remove a non-empty directory, so prune each cache directory and rm it recursively.
+	@find . -type d -name '__pycache__' -prune -exec rm -rf {} +
+	@# rm -rf is quiet and succeeds when a path is absent, so no "|| true" guard is needed.
+	@rm -rf .mypy_cache .ruff_cache
+	@rm -rf .tox .venv
+	@rm -rf parts prime stage
+
diff --git a/jobbergate-agent-snap/README.md b/jobbergate-agent-snap/README.md
new file mode 100644
index 000000000..5dcbfaaee
--- /dev/null
+++ b/jobbergate-agent-snap/README.md
@@ -0,0 +1,77 @@
+# jobbergate-agent-snap
+The Jobbergate Agent bundled into a Snap.
+
+# Installation instructions
+
+For installing from the Snap Store, run:
+```bash
+sudo snap install jobbergate-agent
+```
+
+## Basic Usage
+
+This snap requires a few configuration values to be set before it can be used. The required values are:
+- oidc-client-id: The client ID of the OIDC application that the agent will use for authentication.
+
+- oidc-client-secret: The client secret of the OIDC application that the agent will use for authentication.
+
+The optional values are:
+- base-api-url: The URL of the Jobbergate API server where the agent will send its data. Setting/unsetting this value is more interesting when using the snap in a development environment; do not change it otherwise.
+
+- oidc-domain: The domain of the OIDC server that the agent will use for authentication. Setting/unsetting this value is more interesting when using the snap in a development environment; do not change it otherwise.
+
+- task-jobs-interval-seconds: The interval in seconds between execution of the agent's internal tasks. This controls the frequency that data is sent to the Jobbergate API. This is optional and defaults to 30 seconds.
+
+- task-self-update-interval-seconds: The interval in seconds at which the agent will check for updates to itself. This is optional and defaults to 30 seconds.
+
+- sbatch-path: The absolute path to the *sbatch* command on the host system. This is optional and defaults to /usr/bin/sbatch.
+
+- scontrol-path: The absolute path to the *scontrol* command on the host system. This is optional and defaults to /usr/bin/scontrol.
+
+- default-slurm-work-dir: The default working directory that the agent will use when submitting jobs to the SLURM cluster. This is optional and defaults to /tmp.
+
+- slurm-user-mapper: The user mapper that the agent will use to map the system user name to the SLURM user name. This is optional and defaults to none.
+
+- single-user-submitter: The system user name that the agent will use to submit jobs to the SLURM cluster if the *slurm-user-mapper* is not set. This is optional and defaults to *ubuntu*.
+
+- write-submission-files: A boolean value (true, false) that indicates whether the agent should write submission files to disk. This is optional and defaults to true.
+
+Any configuration can be set using the *snap* command line, e.g.:
+```bash
+sudo snap set jobbergate-agent oidc-client-id=foo
+sudo snap set jobbergate-agent oidc-client-secret=boo
+```
+
+# Development
+
+For development purposes, you can build the `jobbergate-agent` part prior to packing the snap. To do that, run:
+```bash
+snapcraft prime -v
+```
+
+Add the `--debug` flag for creating a shell in case there's any error after the build is complete.
+
+For building the snap end-to-end, run:
+```bash
+snapcraft -v --debug
+```
+
+Once the command completes successfully, a `.snap` file will be created in the directory. For installing this snap, run:
+```bash
+sudo snap install --dangerous jobbergate-agent_<snap version>_amd64.snap
+```
+
+Once the snap is installed, it is possible to check the status of the daemon and the logs:
+```bash
+systemctl status snap.jobbergate-agent.daemon  # check the daemon status
+sudo journalctl -u snap.jobbergate-agent.daemon --follow  # follow the agent logs
+```
+
+Sometimes it is important to clean the environment to delete cached files and dependencies. To do that, run:
+```bash
+sudo snapcraft clean
+```
+
+# Publish
+
+Every time a new tag is created, a new version of the snap will be published to the *latest/candidate* and *latest/edge* channels. The version follows the pattern `<snap version>-<git revision>`, e.g. `1.0.0-8418de0`.
diff --git a/jobbergate-agent-snap/hooks/bin/configure b/jobbergate-agent-snap/hooks/bin/configure
new file mode 100755
index 000000000..3fa51de7b
--- /dev/null
+++ b/jobbergate-agent-snap/hooks/bin/configure
@@ -0,0 +1,82 @@
+#!/usr/bin/env python3
+"""Snapcraft `configure` hook for the Jobbergate Agent snap."""
+
+import os
+import subprocess
+import sys
+from pathlib import Path
+from typing import Union
+
+SNAP_COMMON_PATH = "/var/snap/jobbergate-agent/common"
+SNAP_INSTANCE_NAME = os.environ["SNAP_INSTANCE_NAME"]  # KeyError at import time if the variable is unset
+DOTENV_PREFIX = "JOBBERGATE_AGENT_"  # prepended to every key written to the .env file
+DOTENV_FILE_LOCATION = Path(f"{SNAP_COMMON_PATH}/.env")
+AGENT_VARIABLES_MAP: dict[str, Union[str, int]] = {  # name -> default when no snap config is set
+    "BASE_API_URL": "https://apis.vantagehpc.io",
+    "OIDC_DOMAIN": "auth.vantagehpc.io/realms/vantage",
+    "OIDC_CLIENT_ID": "",  # empty default: left out of the .env file unless set via snap config
+    "OIDC_CLIENT_SECRET": "",  # empty default: left out of the .env file unless set via snap config
+    "TASK_JOBS_INTERVAL_SECONDS": 30,
+    "TASK_SELF_UPDATE_INTERVAL_SECONDS": 30,
+    "CACHE_DIR": f"{SNAP_COMMON_PATH}/.cache",
+    "SBATCH_PATH": "/usr/bin/sbatch",
+    "SCONTROL_PATH": "/usr/bin/scontrol",
+    "DEFAULT_SLURM_WORK_DIR": "/tmp",
+    "SLURM_USER_MAPPER": "",  # empty default: left out of the .env file unless set via snap config
+    "SINGLE_USER_SUBMITTER": "ubuntu",
+    "WRITE_SUBMISSION_FILES": "true",
+}
+
+
+def run_bash(bash_string: str) -> str:
+    """Run bash command and return output as string."""
+    return subprocess.check_output(bash_string.split()).decode().rstrip()
+
+
+def daemon_starter():
+    """Start the daemon."""
+    try:
+        run_bash(f"snapctl start {SNAP_INSTANCE_NAME}.daemon")
+    except Exception:
+        sys.exit(1)
+
+
+def daemon_stopper():
+    """Stop the daemon."""
+    try:
+        run_bash(f"snapctl stop {SNAP_INSTANCE_NAME}.daemon")
+    except Exception:
+        sys.exit(1)
+
+
+def snapctl_get(snap_config_value: str) -> Union[str, None]:
+    """Get snap config from snapctl.
+
+    Return python None if snapctl returns the empty string.
+    """
+    snapctl_out: Union[str, None]
+    snapctl_out = run_bash(f"snapctl get {snap_config_value}")
+
+    if snapctl_out == "":
+        snapctl_out = None
+
+    return snapctl_out
+
+
+def configure_dotenv_files():
+    """Configure the .env files based on the snap mode."""
+    env_file_content = ""
+    for env_var, env_value in AGENT_VARIABLES_MAP.items():
+        snapctl_value = snapctl_get(env_var.lower().replace("_", "-"))
+        if snapctl_value is not None:
+            env_value = snapctl_value
+        elif bool(env_value) is False:
+            continue
+        env_file_content += f"{DOTENV_PREFIX}{env_var}={env_value}\n"
+    DOTENV_FILE_LOCATION.write_text(env_file_content)
+
+
+if __name__ == "__main__":
+    daemon_stopper()  # stop the daemon before its configuration is rewritten
+    configure_dotenv_files()  # regenerate the .env file from the current snap config
+    daemon_starter()  # start the daemon again
diff --git a/jobbergate-agent-snap/mypy.ini b/jobbergate-agent-snap/mypy.ini
new file mode 100644
index 000000000..6642d474a
--- /dev/null
+++ b/jobbergate-agent-snap/mypy.ini
@@ -0,0 +1,3 @@
+[mypy]
+follow_imports = silent
+ignore_missing_imports = false
diff --git a/jobbergate-agent-snap/ruff.toml b/jobbergate-agent-snap/ruff.toml
new file mode 100644
index 000000000..125588609
--- /dev/null
+++ b/jobbergate-agent-snap/ruff.toml
@@ -0,0 +1,11 @@
+line-length = 110
+include = ["hooks/bin/configure"]
+extend-exclude = ["__pycache__"]
+
+[lint]
+select = ["E", "W", "F", "C", "N", "D", "I001", "I"]
+ignore = ["D213", "D211", "D203", "C408"]
+fixable = ["ALL"]
+
+[lint.mccabe]
+max-complexity = 10
diff --git a/jobbergate-agent-snap/snap/snapcraft.yaml b/jobbergate-agent-snap/snap/snapcraft.yaml
new file mode 100644
index 000000000..b0d9c229b
--- /dev/null
+++ b/jobbergate-agent-snap/snap/snapcraft.yaml
@@ -0,0 +1,98 @@
+name: jobbergate-agent
+base: core24
+version: '0.1.0'
+summary: The Jobbergate Agent snap
+adopt-info: metadata
+license: MIT
+description: |
+  The Jobbergate Agent Snap deploys the Jobbergate Agent Python package on your host system. This agent
+  is an essential component of the Jobbergate platform for managing and submitting jobs to HPC clusters.
+
+  This snap requires a few configuration values to be set before it can be used. These values are:
+  - base-api-url: The URL of the Jobbergate API server where the agent will send its data. Setting/unsetting this value is more interesting when using the snap in a development environment; do not change it otherwise.
+
+  - oidc-domain: The domain of the OIDC server that the agent will use for authentication. Setting/unsetting this value is more interesting when using the snap in a development environment; do not change it otherwise.
+
+  - oidc-client-id: The client ID of the OIDC application that the agent will use for authentication.
+
+  - oidc-client-secret: The client secret of the OIDC application that the agent will use for authentication.
+
+  - task-jobs-interval-seconds: The interval in seconds at which the agent will run its internal task jobs, hence sending data to the Jobbergate API server. This is optional and defaults to 30 seconds.
+
+  - task-self-update-interval-seconds: The interval in seconds at which the agent will check for updates to itself. This is optional and defaults to 30 seconds.
+
+  - sbatch-path: The absolute path to the *sbatch* command on the host system. This is optional and defaults to /usr/bin/sbatch.
+
+  - scontrol-path: The absolute path to the *scontrol* command on the host system. This is optional and defaults to /usr/bin/scontrol.
+
+  - default-slurm-work-dir: The default working directory that the agent will use when submitting jobs to the SLURM cluster. This is optional and defaults to /tmp.
+
+  - slurm-user-mapper: The user mapper that the agent will use to map the system user name to the SLURM user name. This is optional and defaults to none.
+
+  - single-user-submitter: The system user name that the agent will use to submit jobs to the SLURM cluster if the *slurm-user-mapper* is not set. This is optional and defaults to *ubuntu*.
+
+  - write-submission-files: A boolean value (true, false) that indicates whether the agent should write submission files to disk. This is optional and defaults to true.
+
+  For learning more about Jobbergate and how it can be used on Vantage, please visit https://docs.vantagehpc.io
+
+grade: stable
+confinement: strict
+
+parts:
+  jobbergate-agent:
+    source: .
+    plugin: python
+    python-packages:
+    - jobbergate-agent
+
+  hooks:
+    plugin: dump
+    source: hooks
+    organize:
+      bin/: snap/hooks/
+
+  metadata:
+    plugin: nil
+    override-pull: |
+      craftctl default
+      craftctl set version="$(craftctl get version)-$(git rev-parse --short HEAD)"
+
+  wrappers:
+    plugin: dump
+    source: wrappers/
+    source-type: local
+
+apps:
+  daemon:
+    command: bin/jg-run
+    daemon: simple
+    plugs:
+    - network
+    install-mode: disable
+
+  start:
+    command: commands/daemon.start
+    daemon: simple
+    plugs:
+      - system-observe
+    install-mode: disable
+
+  stop:
+    command: commands/daemon.stop
+    daemon: simple
+    plugs:
+      - system-observe
+    install-mode: disable
+
+  restart:
+    command: commands/daemon.restart
+    daemon: simple
+    plugs:
+      - system-observe
+    install-mode: disable
+
+hooks:
+  configure:
+    plugs:
+    - system-observe
+    - network
diff --git a/jobbergate-agent-snap/tox.ini b/jobbergate-agent-snap/tox.ini
new file mode 100644
index 000000000..8d79e0d09
--- /dev/null
+++ b/jobbergate-agent-snap/tox.ini
@@ -0,0 +1,24 @@
+[tox]
+requires =
+    tox>=4
+env_list = lint, type, py{310,311,312}
+
+[testenv:lint]
+description = run linters
+skip_install = true
+deps =
+    ruff==0.6.5
+commands = ruff check hooks
+
+[testenv:type]
+description = run type checks
+deps =
+    mypy==1.11.2
+commands =
+    mypy hooks/bin/configure --pretty
+
+[testenv:format]
+description = run formatters
+deps =
+    ruff==0.6.5
+commands = ruff format hooks
diff --git a/jobbergate-agent-snap/wrappers/commands/daemon.restart b/jobbergate-agent-snap/wrappers/commands/daemon.restart
new file mode 100755
index 000000000..24a69b85b
--- /dev/null
+++ b/jobbergate-agent-snap/wrappers/commands/daemon.restart
@@ -0,0 +1,6 @@
+#!/bin/sh
+set -eu
+
+snapctl restart $SNAP_NAME.daemon
+snapctl set-health okay
+exit 0
\ No newline at end of file
diff --git a/jobbergate-agent-snap/wrappers/commands/daemon.start b/jobbergate-agent-snap/wrappers/commands/daemon.start
new file mode 100755
index 000000000..cf04e9455
--- /dev/null
+++ b/jobbergate-agent-snap/wrappers/commands/daemon.start
@@ -0,0 +1,26 @@
+#!/bin/sh
+set -eu
+
+OIDC_CLIENT_ID=$(snapctl get oidc-client-id)
+OIDC_CLIENT_SECRET=$(snapctl get oidc-client-secret)
+
+# exit 1 if any of the required configuration is missing
+if [ -z "$OIDC_CLIENT_ID"]; then
+    snapctl set-health blocked "Missing the oidc-client-id configuration"
+    echo "Missing the oidc-client-id configuration"
+    exit 1
+fi
+if [ -z "$OIDC_CLIENT_SECRET" ]; then
+    snapctl set-health blocked "Missing the oidc-client-secret configuration"
+    echo "Missing the oidc-client-secret configuration"
+    exit 1
+fi
+
+STATUS=$(snap services $SNAP_NAME.daemon | awk 'FNR == 1 {next} {print $3}')
+
+# start the daemon if the STATUS is `inactive`, otherwise, just exit 0
+if [ "$STATUS" = "inactive" ]; then
+    snapctl start $SNAP_NAME.daemon
+fi
+snapctl set-health okay
+exit 0
\ No newline at end of file
diff --git a/jobbergate-agent-snap/wrappers/commands/daemon.stop b/jobbergate-agent-snap/wrappers/commands/daemon.stop
new file mode 100755
index 000000000..0ad14aa0f
--- /dev/null
+++ b/jobbergate-agent-snap/wrappers/commands/daemon.stop
@@ -0,0 +1,5 @@
+#!/bin/sh
+set -eu
+
+snapctl stop $SNAP_NAME.daemon
+exit 0
\ No newline at end of file
diff --git a/jobbergate-agent/jobbergate_agent/settings.py b/jobbergate-agent/jobbergate_agent/settings.py
index 0017fd69c..239c80ec5 100644
--- a/jobbergate-agent/jobbergate_agent/settings.py
+++ b/jobbergate-agent/jobbergate_agent/settings.py
@@ -10,17 +10,17 @@
 from jobbergate_agent.utils.logging import logger
 
 
-def _get_env_file() -> str | None:
+def _get_env_file() -> Path | None:
     """
     Determine if running in test mode and return the correct path to the .env file if not.
     """
     _test_mode = "pytest" in sys.modules
-    env_file: str | None
     if not _test_mode:
-        env_file = ".env"
-    else:
-        env_file = None
-    return env_file
+        default_dotenv_file_location = Path("/var/snap/jobbergate-agent/common/.env")
+        if default_dotenv_file_location.exists():
+            return default_dotenv_file_location
+        return Path(".env")
+    return None
 
 
 class Settings(BaseSettings):