From 47c310f3de175558fb3a64cb6c33be9c853e6857 Mon Sep 17 00:00:00 2001
From: Jacob Callahan
Date: Tue, 19 Nov 2024 16:21:37 -0500
Subject: [PATCH] Add initial attempts to automatically clean up dangling hosts

Now, when workflows fail, Broker will attempt to find a dangling host
and check it in if found.

The behavior is controlled by the new ANSIBLETOWER.dangling_behavior
setting: "checkin" (the default) releases the host, "store" adds it to
the inventory via update_inventory, and "prompt" asks interactively what
to do with each host.
---
 broker/providers/ansible_tower.py | 63 ++++++++++++++++++++++++++++---
 1 file changed, 58 insertions(+), 5 deletions(-)

diff --git a/broker/providers/ansible_tower.py b/broker/providers/ansible_tower.py
index 6684a2e..d3922b8 100644
--- a/broker/providers/ansible_tower.py
+++ b/broker/providers/ansible_tower.py
@@ -8,9 +8,10 @@
 import click
 from dynaconf import Validator
 from logzero import logger
+from rich.prompt import Prompt
 
 from broker import exceptions
-from broker.helpers import eval_filter, find_origin
+from broker.helpers import eval_filter, find_origin, update_inventory
 from broker.settings import settings
 
 try:
@@ -23,7 +24,7 @@
 
 
 def convert_pseudonamespaces(attr_dict):
-    """Recursively convert PsuedoNamespace objects into dictionaries."""
+    """Recursively convert PseudoNamespace objects into dictionaries."""
     out_dict = {}
     for key, value in attr_dict.items():
         if isinstance(value, awxkit.utils.PseudoNamespace):
@@ -121,6 +122,7 @@ class AnsibleTower(Provider):
             | Validator("ANSIBLETOWER.token", must_exist=True)
         ),
         Validator("ANSIBLETOWER.inventory", default=None),
+        Validator("ANSIBLETOWER.dangling_behavior", default="checkin"),
     ]
 
     _checkout_options = [
@@ -164,11 +166,12 @@ def __init__(self, **kwargs):
         self.uname = settings.ANSIBLETOWER.get("username")
         self.pword = settings.ANSIBLETOWER.get("password")
         self.token = settings.ANSIBLETOWER.get("token")
+        self.dangling_behavior = settings.ANSIBLETOWER.get("dangling_behavior")
         self._inventory = kwargs.get("tower_inventory") or settings.ANSIBLETOWER.inventory
         # Init the class itself
         config = kwargs.get("config")
         root = kwargs.get("root")
-        self._v2, self.username = get_awxkit_and_uname(
+        self._v2, self.uname = get_awxkit_and_uname(
             config=config,
             root=root,
             url=self.url,
@@ -374,6 +377,53 @@ def _get_failure_messages(self, workflow):
         else:
             return failure_messages
 
+    def _try_get_dangling_hosts(self, failed_workflow):
+        """Get one or more hosts that may have been left behind by a failed workflow."""
+        hosts = []
+        for node in failed_workflow.get_related("workflow_nodes").results:
+            if not (job_fields := node.summary_fields.get("job", {})) or job_fields.get(
+                "failed"
+            ):  # skip jobs with no summary fields and failed jobs
+                continue
+            if jobs := self._v2.jobs.get(id=job_fields["id"]).results:
+                if vm_name := jobs[0].artifacts.get("vm_name"):
+                    hosts.append(vm_name)
+        return list(set(hosts))
+
+    def handle_dangling_hosts(self, job):
+        """Attempt to check in dangling hosts associated with the given job."""
+        dangling_hosts = self._try_get_dangling_hosts(job)
+        if not dangling_hosts:
+            logger.debug("No dangling hosts found for the failed job.")
+            return
+        dangling_behavior = self.dangling_behavior
+        for dangling_host in dangling_hosts:
+            logger.info(f"Found dangling host: {dangling_host}")
+            if dangling_behavior == "prompt":
+                choice = Prompt.ask(
+                    "What would you like to do with this host? [c/s/cA/sA]\n"
+                    "Checkin (c), Store (s), Checkin All (cA), Store All (sA)",
+                    choices=["c", "s", "cA", "sA"],
+                )
+                if choice == "cA":
+                    dangling_behavior = "checkin"
+                elif choice == "sA":
+                    dangling_behavior = "store"
+            else:
+                choice = None
+            # handle checkins
+            if choice == "c" or dangling_behavior == "checkin":
+                try:
+                    self.release(dangling_host)
+                    logger.debug(f"Successfully checked in dangling host: {dangling_host}")
+                except exceptions.BrokerError:
+                    logger.warning(f"Failed to check in dangling host: {dangling_host}")
+            elif choice == "s" or dangling_behavior == "store":
+                logger.debug(f"Storing dangling host: {dangling_host}")
+                host = self._v2.hosts.get(name=dangling_host).results[0]
+                host = self._compile_host_info(host)
+                update_inventory(add=host)
+
     def _compile_host_info(self, host):
         try:
             host_facts = host.related.ansible_facts.get()
@@ -601,12 +651,15 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor
         logger.info(f"Waiting for job: \nAPI: {job_api_url}\nUI: {job_ui_url}")
         job.wait_until_completed(timeout=settings.ANSIBLETOWER.workflow_timeout)
         if job.status != "successful":
+            failure_message = self._get_failure_messages(job)
             message_data = {
                 f"{subject.capitalize()} Status": job.status,
-                "Reason(s)": self._get_failure_messages(job),
+                "Reason(s)": failure_message,
                 "URL": job_ui_url,
             }
             helpers.emit(message_data)
+            if "was automatically checked-in" not in failure_message:
+                self.handle_dangling_hosts(job)
             raise JobExecutionError(message_data=message_data["Reason(s)"])
         if strategy := kwargs.pop("artifacts", None):
             return self._merge_artifacts(job, strategy=strategy)
@@ -614,7 +667,7 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor
 
     def get_inventory(self, user=None):
         """Compile a list of hosts based on any inventory a user's name is mentioned."""
-        user = user or self.username
+        user = user or self.uname
         invs = [
             inv
             for inv in self._v2.inventory.get(page_size=200).results