Skip to content

Commit

Permalink
Add initial attempts to automatically clean up dangling hosts
Browse files Browse the repository at this point in the history
Now, when workflows fail, Broker will attempt to find a handgling host
and check it in if found.
  • Loading branch information
JacobCallahan committed Nov 26, 2024
1 parent f5b68dc commit 47c310f
Showing 1 changed file with 58 additions and 5 deletions.
63 changes: 58 additions & 5 deletions broker/providers/ansible_tower.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,10 @@
import click
from dynaconf import Validator
from logzero import logger
from rich.prompt import Prompt

from broker import exceptions
from broker.helpers import eval_filter, find_origin
from broker.helpers import eval_filter, find_origin, update_inventory
from broker.settings import settings

try:
Expand All @@ -23,7 +24,7 @@


def convert_pseudonamespaces(attr_dict):
"""Recursively convert PsuedoNamespace objects into dictionaries."""
"""Recursively convert PseudoNamespace objects into dictionaries."""
out_dict = {}
for key, value in attr_dict.items():
if isinstance(value, awxkit.utils.PseudoNamespace):
Expand Down Expand Up @@ -121,6 +122,7 @@ class AnsibleTower(Provider):
| Validator("ANSIBLETOWER.token", must_exist=True)
),
Validator("ANSIBLETOWER.inventory", default=None),
Validator("ANSIBLETOWER.dangling_behavior", default="checkin"),
]

_checkout_options = [
Expand Down Expand Up @@ -164,11 +166,12 @@ def __init__(self, **kwargs):
self.uname = settings.ANSIBLETOWER.get("username")
self.pword = settings.ANSIBLETOWER.get("password")
self.token = settings.ANSIBLETOWER.get("token")
self.dangling_behavior = settings.ANSIBLETOWER.get("dangling_behavior")
self._inventory = kwargs.get("tower_inventory") or settings.ANSIBLETOWER.inventory
# Init the class itself
config = kwargs.get("config")
root = kwargs.get("root")
self._v2, self.username = get_awxkit_and_uname(
self._v2, self.uname = get_awxkit_and_uname(
config=config,
root=root,
url=self.url,
Expand Down Expand Up @@ -374,6 +377,53 @@ def _get_failure_messages(self, workflow):
else:
return failure_messages

def _try_get_dangling_hosts(self, failed_workflow):
"""Get one or more hosts that may have been left behind by a failed workflow."""
hosts = []
for node in failed_workflow.get_related("workflow_nodes").results:
if not (job_fields := node.summary_fields.get("job", {})) or job_fields.get(
"failed"
): # skip jobs with no summary fields and failed jobs
continue
if jobs := self._v2.jobs.get(id=job_fields["id"]).results:
if vm_name := jobs[0].artifacts.get("vm_name"):
hosts.append(vm_name)
return list(set(hosts))

def handle_dangling_hosts(self, job):
"""Attempt to check in dangling hosts associated with the given job."""
dangling_hosts = self._try_get_dangling_hosts(job)
if not dangling_hosts:
logger.debug("No dangling hosts found for the failed job.")
return
dangling_behavior = self.dangling_behavior
for dangling_host in dangling_hosts:
logger.info(f"Found dangling host: {dangling_host}")
if dangling_behavior == "prompt":
choice = Prompt.ask(
"What would you like to do with this host? [c/s/cA/sA]\n",
"Checkin (c), Store (s), Checkin All (cA), Store All (sA)",
choices=["c", "s", "cA", "sA"],
)
if choice == "cA":
dangling_behavior = "checkin"
elif choice == "sA":
dangling_behavior = "store"
else:
choice = None
# handle checkins
if choice == "c" or dangling_behavior == "checkin":
try:
self.release(dangling_host)
logger.debug(f"Successfully checked in dangling host: {dangling_host}")
except exceptions.BrokerError:
logger.warning(f"Failed to check in dangling host: {dangling_host}")
elif choice == "s" or dangling_behavior == "store":
logger.debug(f"Storing dangling host: {dangling_host}")
host = self._v2.hosts.get(name=dangling_host).results[0]
host = self._compile_host_info(host)
update_inventory(add=host)

def _compile_host_info(self, host):
try:
host_facts = host.related.ansible_facts.get()
Expand Down Expand Up @@ -601,20 +651,23 @@ def execute(self, **kwargs): # noqa: PLR0912,PLR0915 - Possible TODO refactor
logger.info(f"Waiting for job: \nAPI: {job_api_url}\nUI: {job_ui_url}")
job.wait_until_completed(timeout=settings.ANSIBLETOWER.workflow_timeout)
if job.status != "successful":
failure_message = self._get_failure_messages(job)
message_data = {
f"{subject.capitalize()} Status": job.status,
"Reason(s)": self._get_failure_messages(job),
"Reason(s)": failure_message,
"URL": job_ui_url,
}
helpers.emit(message_data)
if "was automatically checked-in" not in failure_message:
self.handle_dangling_hosts(job)
raise JobExecutionError(message_data=message_data["Reason(s)"])
if strategy := kwargs.pop("artifacts", None):
return self._merge_artifacts(job, strategy=strategy)
return job

def get_inventory(self, user=None):
"""Compile a list of hosts based on any inventory a user's name is mentioned."""
user = user or self.username
user = user or self.uname
invs = [
inv
for inv in self._v2.inventory.get(page_size=200).results
Expand Down

0 comments on commit 47c310f

Please sign in to comment.