From 6d65d28922cc8c148d01fc17cc74f1dbbf7687d8 Mon Sep 17 00:00:00 2001 From: Jacob Callahan Date: Tue, 19 Nov 2024 16:21:37 -0500 Subject: [PATCH] Add initial attempts to automatically clean up dangling hosts Now, when workflows fail, Broker will attempt to find a dangling host and check it in if found. --- broker/providers/ansible_tower.py | 36 ++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/broker/providers/ansible_tower.py b/broker/providers/ansible_tower.py index 6684a2e..f521823 100644 --- a/broker/providers/ansible_tower.py +++ b/broker/providers/ansible_tower.py @@ -168,7 +168,7 @@ def __init__(self, **kwargs): # Init the class itself config = kwargs.get("config") root = kwargs.get("root") - self._v2, self.username = get_awxkit_and_uname( + self._v2, self.uname = get_awxkit_and_uname( config=config, root=root, url=self.url, @@ -374,6 +374,33 @@ def _get_failure_messages(self, workflow): else: return failure_messages + def _try_get_dangling_hosts(self, failed_workflow): + """Get one or more hosts that may have been left behind by a failed workflow.""" + hosts = [] + for node in failed_workflow.get_related("workflow_nodes").results: + if not (job_fields := node.summary_fields.get("job", {})) or job_fields.get( + "failed" + ): # skip jobs with no summary fields and failed jobs + continue + if jobs := self._v2.jobs.get(id=job_fields["id"]).results: + if vm_name := jobs[0].artifacts.get("vm_name"): + hosts.append(vm_name) + return list(set(hosts)) + + def _try_checkin_dangling_host(self, job): + """Attempt to check in dangling hosts associated with the given job.""" + dangling_hosts = self._try_get_dangling_hosts(job) + if not dangling_hosts: + logger.debug("No dangling hosts found for the failed job.") + return + for dangling_host in dangling_hosts: + logger.info(f"Found dangling host: {dangling_host}. 
Attempting to check in.") + try: + self.release(dangling_host) + logger.debug(f"Successfully checked in dangling host: {dangling_host}") + except exceptions.BrokerError: + logger.warning(f"Failed to check in dangling host: {dangling_host}") + def _compile_host_info(self, host): try: host_facts = host.related.ansible_facts.get() @@ -601,12 +628,15 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor logger.info(f"Waiting for job: \nAPI: {job_api_url}\nUI: {job_ui_url}") job.wait_until_completed(timeout=settings.ANSIBLETOWER.workflow_timeout) if job.status != "successful": + failure_message = self._get_failure_messages(job) message_data = { f"{subject.capitalize()} Status": job.status, - "Reason(s)": self._get_failure_messages(job), + "Reason(s)": failure_message, "URL": job_ui_url, } helpers.emit(message_data) + if "was automatically checked-in" not in failure_message: + self._try_checkin_dangling_host(job) raise JobExecutionError(message_data=message_data["Reason(s)"]) if strategy := kwargs.pop("artifacts", None): return self._merge_artifacts(job, strategy=strategy) @@ -614,7 +644,7 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor def get_inventory(self, user=None): """Compile a list of hosts based on any inventory a user's name is mentioned.""" - user = user or self.username + user = user or self.uname invs = [ inv for inv in self._v2.inventory.get(page_size=200).results