From adf4d6f42fd9584b033fa2970fcdec6d8f28619a Mon Sep 17 00:00:00 2001
From: Adam Dyess
Date: Wed, 10 Jul 2024 22:20:41 -0500
Subject: [PATCH] Vault testing no longer requires mysql db, so long as vault
 is initialized properly

---
 jobs/integration/validation.py | 356 +++++++++++++++++----------------
 jobs/validate/vault-spec       |  23 +--
 2 files changed, 189 insertions(+), 190 deletions(-)

diff --git a/jobs/integration/validation.py b/jobs/integration/validation.py
index b0eaa21fc..d9461800b 100644
--- a/jobs/integration/validation.py
+++ b/jobs/integration/validation.py
@@ -12,7 +12,6 @@ import random
 import pytest
 import logging
-import click
 from base64 import b64encode
 
 from cilib.enums import K8S_STABLE_VERSION
@@ -55,15 +54,9 @@ from bs4 import BeautifulSoup as bs
 from bs4.element import ResultSet as bs_ResultSet
 from juju.application import Application
+from juju.model import Model
 from juju.unit import Unit
 
-# Quiet the noise
-logging.getLogger("asyncio").setLevel(logging.INFO)
-logging.getLogger("connector").setLevel(logging.INFO)
-logging.getLogger("websockets.client").setLevel(logging.INFO)
-logging.getLogger("websockets.protocol").setLevel(logging.INFO)
-# bump up juju debug
-logging.getLogger("juju").setLevel(logging.INFO)
 
 # validation logging
 log = logging.getLogger(__name__)
@@ -185,15 +178,13 @@ async def test_auth_file_propagation(model, tools):
     """
     # Get a leader and non-leader unit to test with
-    masters = model.applications["kubernetes-control-plane"]
-    if len(masters.units) < 2:
+    app = model.applications["kubernetes-control-plane"]
+    if len(app.units) < 2:
         pytest.skip("Test requires multiple control-plane units")
 
-    for master in masters.units:
-        if await master.is_leader_from_status():
-            leader = master
-        else:
-            follower = master
+    idx = await get_leader(app)
+    leader = app.units[idx]
+    follower = app.units[(idx + 1) % len(app.units)]
 
     # Change serviceaccount.key on the leader, and get its md5sum
     leader_md5 = await run_until_success(
@@ -254,7 +245,7 @@ async def test_snap_versions(model, tools):
         if "/" not in channel:
             message = "validate_snap_versions: skipping %s, channel=%s"
             message = message % (app_name, channel)
-            click.echo(message)
+            log.info(message)
             continue
         track, _ = channel.split("/", 1)
         if track == "latest":
@@ -345,7 +336,7 @@ async def test_microbot(model, tools, teardown_microbot):
             if resp.status_code == 200:
                 break
         except requests.exceptions.ConnectionError as e:
-            click.echo(
+            log.info(
                 f"Caught connection error attempting to hit {url}, "
                 f"retrying. Error follows: {e}"
             )
 
@@ -388,7 +379,7 @@ async def query_dashboard(url, config):
         "/proxy/#!/login"
     )
 
-    click.echo("Waiting for dashboard to stabilize...")
+    log.info("Waiting for dashboard to stabilize...")
 
     async def dashboard_present(url, config):
         resp = await query_dashboard(url, config)
@@ -469,7 +460,7 @@ async def test_network_policies(model, tools):
         "/snap/bin/kubectl --kubeconfig /root/.kube/config delete ns netpolicy",
         check=False,
     )
-    click.echo("Waiting for pods to finish terminating...")
+    log.info("Waiting for pods to finish terminating...")
 
     await retry_async_with_timeout(
         verify_deleted,
@@ -500,10 +491,10 @@ async def test_network_policies(model, tools):
         check=False,
     )
     if not cmd.code == 0:
-        click.echo("Failed to create netpolicy test!")
-        click.echo(cmd.results)
+        log.info("Failed to create netpolicy test!")
+        log.info(cmd.results)
     assert cmd.status == "completed" and cmd.code == 0
-    click.echo("Waiting for pods to show up...")
+    log.info("Waiting for pods to show up...")
     await retry_async_with_timeout(
         verify_ready,
         (unit, "po", ["bboxgood", "bboxbad"], "-n netpolicy"),
@@ -513,7 +504,7 @@ async def test_network_policies(model, tools):
     # Try to get to nginx from both busyboxes.
    # We expect no failures since we have not applied the policy yet.
     async def get_to_networkpolicy_service():
-        click.echo("Reaching out to nginx.netpolicy with no restrictions")
+        log.info("Reaching out to nginx.netpolicy with no restrictions")
         query_from_bad = "/snap/bin/kubectl --kubeconfig /root/.kube/config exec bboxbad -n netpolicy -- wget --timeout=30 nginx.netpolicy"
         query_from_good = "/snap/bin/kubectl --kubeconfig /root/.kube/config exec bboxgood -n netpolicy -- wget --timeout=30 nginx.netpolicy"
         cmd_good = await juju_run(unit, query_from_good, check=False)
@@ -546,7 +537,7 @@ async def get_to_networkpolicy_service():
         await asyncio.sleep(10)
 
     async def get_to_restricted_networkpolicy_service():
-        click.echo("Reaching out to nginx.netpolicy with restrictions")
+        log.info("Reaching out to nginx.netpolicy with restrictions")
         query_from_bad = (
             "/snap/bin/kubectl --kubeconfig /root/.kube/config exec bboxbad -n netpolicy -- "
             "wget --timeout=30 nginx.netpolicy -O foo.html"
         )
@@ -686,12 +677,12 @@ async def test_ipv6(model, tools):
 
 
 @pytest.mark.skip("Unskip when this can be speed up considerably")
-async def test_worker_master_removal(model, tools):
-    # Add a second master
-    masters = model.applications["kubernetes-control-plane"]
-    original_master_count = len(masters.units)
-    if original_master_count < 2:
-        await masters.add_unit(1)
+async def test_worker_control_plane_removal(model, tools):
+    # Add a second control-plane
+    control_plane = model.applications["kubernetes-control-plane"]
+    original_cp_count = len(control_plane.units)
+    if original_cp_count < 2:
+        await control_plane.add_unit(1)
         await disable_source_dest_check(tools.model_name)
 
     # Add a second worker
@@ -702,28 +693,28 @@ async def test_worker_master_removal(model, tools):
         await disable_source_dest_check(tools.model_name)
     await tools.juju_wait()
 
-    # Remove a worker to see how the masters handle it
+    # Remove a worker to see how the control plane handles it
     unit_count = len(workers.units)
     await workers.units[0].remove()
     await tools.juju_wait()
 
     while len(workers.units) == unit_count:
         await asyncio.sleep(15)
-        click.echo(
+        log.info(
             "Waiting for worker removal. (%d/%d)" % (len(workers.units), unit_count)
         )
 
-    # Remove the master leader
-    unit_count = len(masters.units)
-    for master in masters.units:
-        if await master.is_leader_from_status():
-            await master.remove()
+    # Remove the control-plane leader
+    unit_count = len(control_plane.units)
+    idx = await get_leader(control_plane)
+    await control_plane.units[idx].remove()
     await tools.juju_wait()
-    while len(masters.units) == unit_count:
+    while len(control_plane.units) == unit_count:
         await asyncio.sleep(15)
-        click.echo(
-            "Waiting for master removal. (%d/%d)" % (len(masters.units), unit_count)
+        log.info(
+            "Waiting for control-plane removal. (%d/%d)"
+            % (len(control_plane.units), unit_count)
         )
 
     # Try and restore the cluster state
@@ -731,10 +722,10 @@
     # would fail in a multi-control-plane situation
     while len(workers.units) < original_worker_count:
         await workers.add_unit(1)
-    while len(masters.units) < original_master_count:
-        await masters.add_unit(1)
+    while len(control_plane.units) < original_cp_count:
+        await control_plane.add_unit(1)
     await disable_source_dest_check(tools.model_name)
-    click.echo("Waiting for new master and worker.")
+    log.info("Waiting for new control-plane and worker.")
     await tools.juju_wait()
 
@@ -872,7 +863,7 @@ async def run_extra_args_test(app_name, new_config, expected_args):
                     break
                 await asyncio.sleep(5)
     except asyncio.CancelledError as e:
-        click.echo("Dumping locals:\n" + pformat(locals()))
+        log.info("Dumping locals:\n" + pformat(locals()))
         msg = f"While applying new_config to {app_name}, {service} has {args_per_unit}"
         raise AssertionError(msg) from e
 
@@ -891,7 +882,7 @@ async def run_extra_args_test(app_name, new_config, expected_args):
                     break
                 await asyncio.sleep(5)
     except asyncio.CancelledError as e:
-        click.echo("Dumping locals:\n" + pformat(locals()))
+        log.info("Dumping locals:\n" + pformat(locals()))
         msg = f"While restoring config to {app_name}, {service} has {new_args}"
         raise AssertionError(msg) from e
 
@@ -1221,7 +1212,7 @@ async def test_audit_default_config(model, tools):
         # Verify total log size is less than 1 GB
         raw = await run_until_success(unit, "du -bs /root/cdk/audit")
         size_in_bytes = int(raw.split()[0])
-        click.echo("Audit log size in bytes: %d" % size_in_bytes)
+        log.info("Audit log size in bytes: %d" % size_in_bytes)
         max_size_in_bytes = 1000 * 1000 * 1000 * 1.01  # 1 GB, plus some tolerance
         assert size_in_bytes <= max_size_in_bytes
     finally:
@@ -1611,40 +1602,90 @@ async def test_keystone(model, keystone_deployment):
     assert output.code == 0, output.stderr
 
 
-@pytest.mark.skip_arch(["aarch64"])
-@pytest.mark.on_model("validate-vault")
-async def test_encryption_at_rest(model, tools):
-    """Testing integrating vault secrets into cluster"""
-    control_plane_app = model.applications["kubernetes-control-plane"]
-    etcd_app = model.applications["etcd"]
-    vault_app = model.applications["vault"]
+async def get_leader(app: Application):
+    is_leader = await asyncio.gather(*(u.is_leader_from_status() for u in app.units))
+    for idx, flag in enumerate(is_leader):
+        if flag:
+            return idx
+
+
+async def retry_hook(unit):
+    # Until https://github.com/juju/python-libjuju/issues/484 is fixed, we
+    # have to do this manually.
+    from juju.client import client
 
-    async def ensure_vault_up():
+    app_facade = client.ApplicationFacade.from_connection(unit.connection)
+    await app_facade.ResolveUnitErrors(
+        all_=False, retry=True, tags={"entities": [{"tag": unit.tag}]}
+    )
+
+
+@pytest.mark.skip_arch(["aarch64"])
+@pytest.mark.on_model("validate-ck")
+class TestEncryptionAtRest:
+    vault_app: Application = None
+    control_plane: Application = None
+    etcd_app: Application = None
+
+    def _load(self, model: Model):
+        self.model = model
+        self.vault_app = model.applications["vault"]
+        self.control_plane = model.applications["kubernetes-control-plane"]
+        self.etcd_app = model.applications["etcd"]
+
+    async def ensure_vault_up(self):
         await asyncio.gather(
             *(
                 retry_async_with_timeout(vault_status, [unit])
-                for unit in vault_app.units
+                for unit in self.vault_app.units
             )
         )
 
-    async def init_vault():
-        # init vault
-        click.echo("Initializing Vault")
-        await ensure_vault_up()
+    async def force_update_status(self, app):
+        # force unit status to update
+        await asyncio.gather(
+            *(juju_run(unit, "hooks/update-status", check=False) for unit in app.units)
+        )
+        await self.model.wait_for_idle(apps=[app.name])
+
+    async def vault_ready_status(self):
+        statuses = [unit.workload_status_message for unit in self.vault_app.units]
+        log.info(statuses)
+        return set(statuses) == {
+            "Unit is ready (active: false, mlock: disabled)",
+            "Unit is ready (active: true, mlock: disabled)",
+        }
+
+    async def test_init_vault(self, model: Model):
+        """Test initializing vault and unsealing it"""
+        log.info("Waiting for Vault to settle")
+        self._load(model)
+        await self.model.wait_for_idle(apps=["vault"], timeout=30 * 60)
+        idx = await get_leader(self.vault_app)
+        leader = self.vault_app.units[idx]
+
+        if await self.vault_ready_status():
+            log.info("Vault is already initialized and unsealed")
+            return
+
+        log.info("Initializing Vault")
+        await self.ensure_vault_up()
         init_info = await vault(leader, "operator init -key-shares=5 -key-threshold=3")
-        click.echo(init_info)
+        log.info(init_info)
         # unseal vault leader (could also unseal follower, but it will be resealed later anyway)
         for key in init_info["unseal_keys_hex"][:3]:
             await vault(leader, "operator unseal " + key)
+        await self.force_update_status(self.vault_app)
         # authorize charm
-        click.echo("Authorizing charm")
+        log.info("Collect Authorization Token")
         root_token = init_info["root_token"]
         token_info = await vault(
             leader, "token create -ttl=10m", VAULT_TOKEN=root_token
         )
-        click.echo(token_info)
+        log.info(token_info)
         charm_token = token_info["auth"]["client_token"]
+        log.info("Authorizing charm")
         await juju_run_action(leader, "authorize-charm", token=charm_token)
         # At this point, Vault is up but in non-HA mode. If we weren't using the
         # auto-generate-root-ca-cert config, though, it would still be blocking
@@ -1656,10 +1697,12 @@ async def init_vault():
         # Since we are using the auto-generate-root-ca-cert config, though, we can
         # just go straight to waiting for etcd to settle.
-        click.echo("Waiting for etcd to settle")
-        await model.wait_for_idle(apps=["etcd"], timeout=30 * 60)
+        log.info("Waiting for etcd to settle")
+        await self.model.wait_for_idle(apps=["etcd"], timeout=30 * 60)
         for _ in range(3):
-            actual_status = {unit.workload_status_message for unit in etcd_app.units}
+            actual_status = {
+                unit.workload_status_message for unit in self.etcd_app.units
+            }
             expected_status = {"Healthy with 3 known peers"}
             if actual_status == expected_status:
                 break
@@ -1668,127 +1711,98 @@ async def init_vault():
             # feasible for the charm code to block, so it sometimes takes an
             # update-status hook or two before the unit status is accurate. We
             # can hurry that along a bit, however.
-            click.echo("Poking etcd to refresh status")
+            log.info("Poking etcd to refresh status")
             await asyncio.gather(
                 *(
                     juju_run(unit, "hooks/update-status", check=False)
-                    for unit in etcd_app.units
+                    for unit in self.etcd_app.units
                 )
             )
-        # Even once etcd is ready, Vault will remain in non-HA mode until the Vault
-        # service is restarted, which will re-seal the vault.
-        click.echo("Restarting Vault for HA")
-        await asyncio.gather(
-            *(
-                juju_run(unit, "systemctl restart vault", check=False)
-                for unit in vault_app.units
-            )
-        )
-        await ensure_vault_up()
-
-        click.echo("Unsealing Vault again in HA mode")
+        await self.ensure_vault_up()
+        await self.force_update_status(self.vault_app)
+        log.info("Unsealing Vault again in HA mode")
         for key in init_info["unseal_keys_hex"][:3]:
             await asyncio.gather(
-                *(vault(unit, "operator unseal " + key) for unit in vault_app.units)
+                *(
+                    vault(unit, "operator unseal " + key)
+                    for unit in self.vault_app.units
+                )
             )
-        # force unit status to update
-        await asyncio.gather(
-            *(
-                juju_run(unit, "hooks/update-status", check=False)
-                for unit in vault_app.units
-            )
-        )
-        assert await vault_ready_status()
-
-    async def vault_ready_status():
-        statuses = sorted(unit.workload_status_message for unit in vault_app.units)
-        click.echo(statuses)
-        return statuses == [
-            "Unit is ready (active: false, mlock: disabled)",
-            "Unit is ready (active: true, mlock: disabled)",
-        ]
-
-    click.echo("Waiting for Vault to settle")
-    await model.wait_for_idle(apps=["vault"], timeout=30 * 60)
-    if await vault_app.units[0].is_leader_from_status():
-        leader = vault_app.units[0]
-    else:
-        leader = vault_app.units[1]
-
-    if not await vault_ready_status():
-        await init_vault()
-
-    # NB: At this point, depending on the version of the Vault charm, its status
-    # might either be (a less than informative) "'etcd' incomplete" (cs:vault-44)
-    # or "Vault needs to be initialized" (cs:~openstack-charmers-next/vault).
+        await self.force_update_status(self.vault_app)
+        assert await self.vault_ready_status()
+
+    async def test_kubernetes_with_vault(self, model: Model):
+        # NB: At this point, depending on the version of the Vault charm, its status
+        # might either be (a less than informative) "'etcd' incomplete" (cs:vault-44)
+        # or "Vault needs to be initialized" (cs:~openstack-charmers-next/vault).
-
-    # Until https://github.com/juju-solutions/layer-vault-kv/pull/11 lands, the
-    # k8s-control-plane units can go into error due to trying to talk to Vault during
-    # the restart. Once Vault is back up, the errored hooks can just be retried.
-    await model.wait_for_idle(
-        apps=["kubernetes-control-plane"], raise_on_error=False, timeout=60 * 60
-    )
-
-    async def retry_hook(unit):
-        # Until https://github.com/juju/python-libjuju/issues/484 is fixed, we
-        # have to do this manually.
-        from juju.client import client
-
-        app_facade = client.ApplicationFacade.from_connection(unit.connection)
-        await app_facade.ResolveUnitErrors(
-            all_=False, retry=True, tags={"entities": [{"tag": unit.tag}]}
-        )
-
-    for _ in range(3):
-        errored_units = [
-            unit for unit in control_plane_app.units if unit.workload_status == "error"
-        ]
-        if not errored_units:
-            break
-        click.echo("Retrying failed k8s-control-plane hook for Vault restart")
-        await asyncio.gather(*(retry_hook(unit) for unit in errored_units))
-        await model.wait_for_idle(
-            apps=["kubernetes-control-plane"], raise_on_error=False
-        )
+
+        # Until https://github.com/juju-solutions/layer-vault-kv/pull/11 lands, the
+        # k8s-control-plane units can go into error due to trying to talk to Vault during
+        # the restart. Once Vault is back up, the errored hooks can just be retried.
+        self._load(model)
+        await self.model.wait_for_idle(
+            apps=[self.control_plane.name], raise_on_error=False, timeout=60 * 60
+        )
+        for _ in range(3):
+            blocked_unit = [
+                unit
+                for unit in self.control_plane.units
+                if unit.workload_status == "blocked"
+            ]
+            if not blocked_unit:
+                break
+            if any(
+                "refresh-secrets" in u.workload_status_message for u in blocked_unit
+            ):
+                idx = await get_leader(self.vault_app)
+                leader = self.vault_app.units[idx]
+                await juju_run_action(leader, "refresh-secrets")
+            await self.model.wait_for_idle(
+                apps=[self.control_plane.name], raise_on_error=False
+            )
 
-    # The cluster is probably mostly settled by this point, since the masters typically
-    # take the longest to go into quiescence. However, in case they got into an errored
-    # state, we need to give things another chance to settle out, while also checking
-    # for any other failed units.
-    click.echo("Waiting for cluster to settle")
-    await model.wait_for_idle(status="active", raise_on_blocked=True, timeout=60 * 60)
+        # The cluster is probably mostly settled by this point, since the controllers typically
+        # take the longest to go into quiescence. However, in case they got into an errored
+        # state, we need to give things another chance to settle out, while also checking
+        # for any other failed units.
+        log.info("Waiting for cluster to settle")
+        await self.model.wait_for_idle(
+            status="active", raise_on_blocked=True, timeout=60 * 60
+        )
 
-    click.echo("Creating secret")
-    await kubectl(
-        model,
-        "create secret generic test-secret --from-literal=username='secret-value'",
-    )
+        log.info("Creating secret")
+        await kubectl(
+            self.model,
+            "create secret generic test-secret --from-literal=username='secret-value'",
+        )
 
-    try:
-        click.echo("Verifying secret")
-        result = await kubectl(model, "get secret test-secret -o json")
-        secret_value = json.loads(result.stdout)["data"]["username"]
-        b64_value = b64encode(b"secret-value").decode("utf8")
-        assert secret_value == b64_value
-
-        click.echo("Verifying secret encryption")
-        etcd = model.applications["etcd"].units[0]
-        result = await juju_run(
-            etcd,
-            "ETCDCTL_API=3 /snap/bin/etcd.etcdctl "
-            "--endpoints https://127.0.0.1:2379 "
-            "--cacert=/var/snap/etcd/common/ca.crt "
-            "--cert=/var/snap/etcd/common/server.crt "
-            "--key=/var/snap/etcd/common/server.key "
-            "get /registry/secrets/default/test-secret | strings",
-        )
-        assert "enc:aescbc:v1" in result.output, "Should see encoded secret"
-        assert "secret-value" not in result.output, "Should not see plain-text secret"
-    finally:
-        click.echo("Deleting secret")
-        await kubectl(model, "delete secret test-secret")
+        try:
+            log.info("Verifying secret")
+            result = await kubectl(self.model, "get secret test-secret -o json")
+            secret_value = json.loads(result.stdout)["data"]["username"]
+            b64_value = b64encode(b"secret-value").decode("utf8")
+            assert secret_value == b64_value
+
+            log.info("Verifying secret encryption")
+            etcd = self.model.applications["etcd"].units[0]
+            result = await juju_run(
+                etcd,
+                "ETCDCTL_API=3 /snap/bin/etcd.etcdctl "
+                "--endpoints https://127.0.0.1:2379 "
+                "--cacert=/var/snap/etcd/common/ca.crt "
+                "--cert=/var/snap/etcd/common/server.crt "
+                "--key=/var/snap/etcd/common/server.key "
+                "get /registry/secrets/default/test-secret | strings",
            )
+            assert "enc:aescbc:v1" in result.output, "Should see encoded secret"
+            assert (
+                "secret-value" not in result.output
+            ), "Should not see plain-text secret"
+        finally:
+            log.info("Deleting secret")
+            await kubectl(self.model, "delete secret test-secret")
 
 
 @pytest.mark.clouds(["ec2", "vsphere", "gce"])
@@ -1982,7 +1996,7 @@ async def verify_sysctl(units, desired_values):
         lines = raw_output.splitlines()
         assert len(lines) == len(desired_results)
         if not lines == desired_results:
-            click.echo(f"retry...{lines} != {desired_results}")
+            log.info(f"retry...{lines} != {desired_results}")
             return False
     return True
 
diff --git a/jobs/validate/vault-spec b/jobs/validate/vault-spec
index 5ed04846c..a3dcbc986 100755
--- a/jobs/validate/vault-spec
+++ b/jobs/validate/vault-spec
@@ -41,35 +41,20 @@ applications:
     options:
       channel: $SNAP_VERSION
   easyrsa: null
-  mysql-innodb-cluster:
-    channel: 8.0/stable
-    charm: mysql-innodb-cluster
-    constraints: cores=2 mem=8G root-disk=64G
-    num_units: 3
-    options:
-      enable-binlogs: true
-      innodb-buffer-pool-size: 256M
-      max-connections: 2000
-      wait-timeout: 3600
   vault:
-    channel: 1.7/stable
+    channel: 1.8/stable
     charm: vault
-    num_units: 2
+    num_units: 3
+    series: jammy
     options:
       auto-generate-root-ca-cert: true
       disable-mlock: true
-  vault-mysql-router:
-    channel: 8.0/stable
-    charm: mysql-router
 relations:
   - ["vault:certificates", "etcd:certificates"]
   - ["vault:certificates", "kubeapi-load-balancer:certificates"]
   - ["vault:certificates", "kubernetes-control-plane:certificates"]
   - ["vault:certificates", "kubernetes-worker:certificates"]
-  - ["vault-mysql-router:db-router", "mysql-innodb-cluster:db-router"]
-  - ["vault-mysql-router:shared-db", "vault:shared-db"]
-  - ["vault:etcd", "etcd:db"]
-  - ["vault:secrets", "kubernetes-control-plane:vault-kv"]
+  - ["vault:secrets", "kubernetes-control-plane:vault-kv"]
 EOF
 }