From aaaa511fe9975feb798c27a56f1c3189fce610c1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Wed, 15 Jan 2025 21:36:06 +0100
Subject: [PATCH 01/11] Fix and Containerize the creation of AWS sandboxes

- [X] Create a container image that has all the software needed for the creation of a new sandbox
- [X] Fix Locales in Containerfile.admin
- [X] Create a wrapper script for automation => Python
- [X] Automatically guess the next sandbox number from all different DB (prod and dev)
- [X] Add profiling callback to ansible creation playbook runs.
- [X] Enable Gold images by using the new HCC (console) APIs instead of Cypress + access.redhat.com
      There is a transition from access.redhat.com web page to HCC (console.redhat.com)
      Advantages:
        - much much faster
        - fewer dependencies in the images (roughly -500MB)
- [X] Status script `creation_status.py` -- list sandboxes being created or freshly created
- [X] Add a `--retry sandbox123` capability
- [ ] Add an Org Policy to protect anything that is required by HCC (role, ...)
- [ ] make slow task async in the playbook
- [X] New feature: provide the reservation name,  by default new sandboxes end up in a 'new' reservation
- [X] Sandboxes are created in an 'untested' reservation first. After the functional tests, if successful, we move the new sandboxes to the target reservation (default 'new')
- [ ] Create monitoring dashboard or at least scripts for the creation
- [ ] Add a test to ensure the Vault value is correct. Try to read one key with the passed vault secret. If it doesn't work, exit. That will prevent accidentally creating sandboxes with a vault different from the one currently in use for the 'target DB'
- [ ] allow to change the target OU
- [ ] document (upstream and confluence)
- [ ] Package everything for OpenShift: use OpenShift job to run the creation
---
 Containerfile.admin                           |  34 +-
 playbooks/create_range.yml                    |   4 +-
 playbooks/create_sandbox.py                   | 759 ++++++++++++++++++
 playbooks/creation_status.py                  |  94 +++
 .../roles/infra-aws-sandbox/defaults/main.yml |   6 +
 .../roles/infra-aws-sandbox/tasks/account.yml | 168 +++-
 .../roles/infra-aws-sandbox/tasks/assume.yml  |   3 +
 .../roles/infra-aws-sandbox/tasks/iam.yml     |  11 +-
 .../roles/infra-aws-sandbox/tasks/keypair.yml |  54 +-
 .../roles/infra-aws-sandbox/tasks/main.yml    |   2 +
 .../roles/infra-aws-sandbox/tasks/ou.yml      |  23 +-
 .../roles/infra-aws-sandbox/tasks/pool.yml    |   2 +-
 .../roles/infra-aws-sandbox/tasks/route53.yml |   6 +-
 .../roles/infra-aws-sandbox/tasks/user.yml    |   2 +
 .../infra-aws-sandbox/tasks/validate.yaml     |  20 +-
 requirements.txt                              |  26 +
 16 files changed, 1120 insertions(+), 94 deletions(-)
 create mode 100755 playbooks/create_sandbox.py
 create mode 100755 playbooks/creation_status.py
 create mode 100644 requirements.txt

diff --git a/Containerfile.admin b/Containerfile.admin
index 5512c4c2..d10c7c34 100644
--- a/Containerfile.admin
+++ b/Containerfile.admin
@@ -7,30 +7,49 @@ COPY ./ ./
 RUN make
 
 FROM registry.access.redhat.com/ubi8/ubi:latest AS deploy
+USER root
 RUN dnf install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-8-x86_64/pgdg-redhat-repo-latest.noarch.rpm \
     && dnf install -y \
     bash \
+    bzip2 \
     bind-utils \
     curl \
     findutils \
     gcc \
     git \
+    glibc-langpack-en \
     jq \
     nc \
     net-tools \
+    nodejs \
+    npm \
     openssl \
     postgresql \
-    python39 \
-    python39-pip \
+    python3.12 \
+    python3.12-pip \
     rsync \
     tar \
     unzip \
     vim \
     wget \
     && dnf clean all \
+    && sed -i 's/^LANG=.*/LANG="en_US.utf8"/' /etc/locale.conf \
     && VERSION=4.1.0 \
     && curl --silent --location https://github.com/Orange-OpenSource/hurl/releases/download/$VERSION/hurl-$VERSION-x86_64-unknown-linux-gnu.tar.gz \
-    | tar -xz -C /usr/local/bin --strip-components=1 --wildcards '*/hurl' '*/hurlfmt'
+    | tar -xz -C /usr/local/bin --strip-components=1 --wildcards '*/hurl' '*/hurlfmt' \
+    && cd /tmp \
+    && curl -s -L "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \
+    && unzip "awscliv2.zip" \
+    && ./aws/install && rm -rf awscliv2.zip aws
+
+# Python
+
+RUN alternatives --set python /usr/bin/python3.12 \
+    && alternatives --set python3 /usr/bin/python3.12 \
+    && alternatives --install /usr/bin/pip pip /usr/bin/pip3.12 1
+RUN pip install --no-cache-dir --upgrade pip
+COPY requirements.txt /tmp/requirements.txt
+RUN pip install --no-cache-dir -r /tmp/requirements.txt
 
 COPY --from=docker.io/migrate/migrate /usr/local/bin/migrate /usr/local/bin/migrate
 WORKDIR /sandbox/
@@ -38,11 +57,20 @@ USER ${USER_UID}
 COPY build/github_known_hosts /ssh/known_hosts
 env SSH_KNOWN_HOSTS /ssh/known_hosts
 COPY --from=builder /sandbox/build/sandbox-* ./
+COPY --from=builder /sandbox/build/sandbox-* /usr/bin/
 COPY --from=builder /sandbox/tools ./tools
 COPY --from=builder /sandbox/tests ./tests
 COPY --from=builder /sandbox/db ./db
+COPY --from=builder /sandbox/cloud-automation ./cloud-automation
+COPY --from=builder /sandbox/playbooks ./playbooks
+
+RUN cd cloud-automation && npm ci
+COPY conan/ansible.cfg /etc/ansible/ansible.cfg
+RUN rm -rf /tmp/* /root/.cache /root/*
+
 CMD ["/bin/bash"]
 
+ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8'
 
 ENV DESCRIPTION="Image for Admins to interact with the Sandbox API"
 LABEL name="rhpds/sandbox-admin" \
diff --git a/playbooks/create_range.yml b/playbooks/create_range.yml
index 343a6f7a..4cb4d14f 100755
--- a/playbooks/create_range.yml
+++ b/playbooks/create_range.yml
@@ -1,9 +1,9 @@
 #!/usr/bin/env ansible-playbook
 
 - hosts: localhost
-  gather_facts: no
+  gather_facts: false
   connection: local
-  run_once: yes
+  run_once: true
   tasks:
     - assert:
         msg: Please set account_num_start
diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
new file mode 100755
index 00000000..efe9a0b7
--- /dev/null
+++ b/playbooks/create_sandbox.py
@@ -0,0 +1,759 @@
+#!/usr/bin/env python3
+
+# First, grab the list of all sandboxes
+
+import subprocess
+import os
+import sys
+import boto3
+import argparse
+import atexit
+import structlog
+import logging
+import tempfile
+import random
+import string
+import requests
+import time
+from ansible_vault import Vault
+
+START_TIME = time.time()
+
+#structlog.configure(
+    #processors=[
+        #structlog.stdlib.filter_by_level,
+        #structlog.processors.TimeStamper(fmt="iso"),
+        #structlog.processors.JSONRenderer()],
+    #context_class=dict, logger_factory=structlog.stdlib.LoggerFactory())
+
+logger = structlog.get_logger()
+structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO))
+
+# args: --reservation reservation_name
+
+# parse the args
+
+parser = argparse.ArgumentParser(description='Create a new sandbox')
+parser.add_argument('--reservation', required=False, help='The reservation name', default='new')
+parser.add_argument('--target-db', required=False, help='The target database', default='dev')
+parser.add_argument('--log-level', required=False, help='The log level', default='info')
+parser.add_argument('--retry', required=False, help='Retry sandbox by passing its name', default=None)
+
+args = parser.parse_args()
+
+reservation = args.reservation
+logger = logger.bind(reservation=reservation)
+target_db = args.target_db
+log_level = args.log_level
+retry = args.retry
+
+if log_level == 'debug':
+    logger.info("Setting log level to DEBUG")
+    structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.DEBUG))
+
+logger.debug(f"Reservation: {reservation}")
+
+# Make sure all environment variables are set
+
+# Set default values for the environment variables
+os.environ.setdefault('ddns_key_name', 'mydynamickey')
+os.environ.setdefault('ddns_key_algorithm', 'hmac-sha512')
+os.environ.setdefault('ddns_ttl', '600')
+os.environ.setdefault('email_domain', 'opentlc.com')
+# set default to ~/.aws/credentials_create
+os.environ.setdefault('AWS_SHARED_CREDENTIALS_FILE', os.path.expanduser('~/.aws/credentials_create'))
+# Create directory if it doesn't exist, chmod 700
+logger.info(f"Creating directory {os.path.dirname(os.environ['AWS_SHARED_CREDENTIALS_FILE'])}")
+os.makedirs(os.path.dirname(os.environ['AWS_SHARED_CREDENTIALS_FILE']), exist_ok=True)
+os.chmod(os.path.dirname(os.environ['AWS_SHARED_CREDENTIALS_FILE']), 0o700)
+
+
+required_env_vars = [
+    'AWS_ACCESS_KEY_ID',
+    'AWS_SECRET_ACCESS_KEY',
+    'AWS_ACCESS_KEY_ID_DEV',
+    'AWS_SECRET_ACCESS_KEY_DEV',
+    'INFRA_VAULT_SECRET_DEV',
+    'INFRA_VAULT_SECRET_PROD',
+    'ddns_server',
+    'ddns_key_secret',
+    'RH_USERNAME',
+    'RH_PASSWORD',
+]
+
+# constants: Steps
+
+# step '0 - created in DB only'
+
+STAGE0 = '0 - created in DB only'
+STAGE1_STARTED = "1 - Account Creation Started"
+STAGE1_FAILED = "1 - Account Creation Failed"
+STAGE2_ACCOUNT_CREATED = "2 - Account Created"
+STAGE3_GOLD_IMAGE = "3 - Gold Image Enabled"
+STAGE4_VALIDATED = "4 - Account Validated and Ready"
+
+
+for env_var in required_env_vars:
+    if not os.environ.get(env_var):
+        logger.info(f"Environment variable {env_var} not set")
+        sys.exit(1)
+
+
+
+session_prod = boto3.Session(region_name='us-east-1')
+dynamodb_prod = session_prod.client('dynamodb')
+
+session_dev = boto3.Session(region_name='us-east-1',
+                            aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID_DEV'],
+                            aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY_DEV'])
+dynamodb_dev = session_dev.client('dynamodb')
+
+# Set the target database
+dynamodb_table = 'accounts-dev'
+dynamodb = dynamodb_dev
+
+if target_db == 'prod':
+    logger.info("Using PROD dynamoDB database")
+    dynamodb_table = 'accounts'
+    dynamodb = dynamodb_prod
+    logger = logger.bind(target_db='prod')
+    os.environ['INFRA_VAULT_SECRET'] = os.environ['INFRA_VAULT_SECRET_PROD']
+else:
+    logger.info("Using DEV dynamoDB database")
+    # bind context variable to the logger
+    logger = logger.bind(target_db='dev')
+    os.environ['INFRA_VAULT_SECRET'] = os.environ['INFRA_VAULT_SECRET_DEV']
+
+# Create a temporary file holding INFRA_VAULT_SECRET (tempfile creates it with mode 0600, owner-only)
+
+with tempfile.NamedTemporaryFile(mode='w', delete=False) as f:
+    f.write(os.environ['INFRA_VAULT_SECRET'])
+    INFRA_VAULT_SECRET_FILE = f.name
+    logger.info(f"Created temporary file {INFRA_VAULT_SECRET_FILE}")
+
+# Collect the names of all sandboxes in the dev table (paginated DynamoDB scan)
+
+response = dynamodb_dev.scan(
+    TableName='accounts-dev',
+    ConsistentRead=True,
+    ProjectionExpression='#n',
+    ExpressionAttributeNames={
+        '#n': 'name'
+    }
+)
+
+if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+    logger.error("Failed to get items from dynamodb")
+    sys.exit(1)
+
+data = response['Items']
+while 'LastEvaluatedKey' in response:
+    response = dynamodb_dev.scan(
+        TableName='accounts-dev',
+        ConsistentRead=True,
+        ProjectionExpression='#n',
+        ExpressionAttributeNames={'#n': 'name'},
+        ExclusiveStartKey=response['LastEvaluatedKey']
+    )
+    data.extend(response['Items'])
+
+if 'Items' in response:
+    sandboxes = [item['name']['S'] for item in data]
+    logger.info(f"Found {len(sandboxes)} sandboxes in dev")
+
+# Now run the command for the prod database
+
+response = dynamodb_prod.scan(
+    TableName='accounts',
+    ConsistentRead=True,
+    ProjectionExpression='#n',
+    ExpressionAttributeNames={'#n': 'name'}
+)
+
+if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+    logger.error("Failed to get items from dynamodb")
+    sys.exit(1)
+
+data = response['Items']
+
+while 'LastEvaluatedKey' in response:
+    response = dynamodb_prod.scan(
+        TableName='accounts',
+        ConsistentRead=True,
+        ProjectionExpression='#n',
+        ExpressionAttributeNames={'#n': 'name'},
+        ExclusiveStartKey=response['LastEvaluatedKey']
+    )
+    data.extend(response['Items'])
+
+if 'Items' in response:
+    sandboxes_prod = [item['name']['S'] for item in data]
+    logger.info(f"Found {len(sandboxes_prod)} sandboxes in prod")
+    sandboxes = sandboxes + sandboxes_prod
+
+# transform into a dictionary
+sandboxes_dict = {sandbox: True for sandbox in sandboxes}
+
+def set_(dynamodb, sandbox, key, value):
+    '''Set the key value pair in the DB'''
+    response = dynamodb.update_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': sandbox
+            }
+        },
+        UpdateExpression='SET #k = :val1',
+        ExpressionAttributeNames={
+            '#k': key
+        },
+        ExpressionAttributeValues={
+            ':val1': {
+                'S': value
+            }
+        }
+    )
+
+    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+        raise Exception(f"Failed to set {key} to {value}")
+
+def set_stage(dynamodb, sandbox, stage):
+    """Set the stage of the sandbox"""
+    response = dynamodb.update_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': sandbox
+            }
+        },
+        UpdateExpression='SET #s = :val1',
+        ExpressionAttributeNames={
+            '#s': 'stage'
+        },
+        ExpressionAttributeValues={
+            ':val1': {
+                'S': stage
+            }
+        }
+    )
+
+    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+        raise Exception(f"Failed to set the stage to {stage}")
+
+def get_stage(dynamodb, sandbox):
+    """Get the stage of the sandbox"""
+    response = dynamodb.get_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': sandbox
+            }
+        }
+    )
+
+    if 'Item' in response:
+        return response['Item'].get('stage', {}).get('S', '')
+    else:
+        return ''
+
+def get_sandbox(dynamodb, sandbox):
+    """Get the sandbox from the DB"""
+    response = dynamodb.get_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': sandbox
+            }
+        }
+    )
+
+    if 'Item' in response:
+        return response['Item']
+    else:
+        return {}
+
+
+def extract_sandbox_number(sandbox):
+    """Extract the number from the sandbox name, for example sandbox1234 returns 1234"""
+    return int(sandbox.split('sandbox')[1])
+
+
+def guess_next_sandbox(sandboxes, sandboxes_dict):
+    """Return (name, email) for the sandbox to create: `retry` if set, else the lowest free sandboxN"""
+    # Random email tag (sandbox1+RANDSTR@domain): AWS keeps the email of a closed account
+    # registered, so a reused account name needs a fresh email address.
+    random_email_tag = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(4))
+
+    if retry:
+        return retry, f"{retry}+{random_email_tag}@{os.environ['email_domain']}"
+    for i in range(1, len(sandboxes) + 1):
+        if not sandboxes_dict.get(f"sandbox{i}", False):
+            return f"sandbox{i}", f"sandbox{i}+{random_email_tag}@{os.environ['email_domain']}"
+
+    # sandbox1..sandboxN are all taken; scan order is arbitrary, so do not derive N+1 from sandboxes[-1]
+    s = f"sandbox{len(sandboxes) + 1}"
+    return s, f"{s}+{random_email_tag}@{os.environ['email_domain']}"
+
+
+def decrypt_vaulted_str(secret):
+    '''Decrypt the vaulted secret'''
+    return Vault(os.environ['INFRA_VAULT_SECRET']).load_raw(secret).decode('utf-8')
+
+new_sandbox, new_email = guess_next_sandbox(sandboxes, sandboxes_dict)
+logger.info(f"=> Create {new_sandbox}")
+
+
+# Lock the name of the sandbox in DB so another
+# concurrent process won't be able to create the same sandbox.
+sandbox_data = get_sandbox(dynamodb, new_sandbox)
+if sandbox_data:
+    stage = sandbox_data.get('stage', {}).get('S', '')
+    if not retry:
+        logger.info(f"Sandbox {new_sandbox} already exists")
+        sys.exit(1)
+
+    # NOTE(review): the check below only proceeds when 'available' is explicitly false; absent or true aborts -- confirm intent
+    if retry:
+        if sandbox_data.get('available', {}).get('BOOL', True) is False:
+            logger.info(f"Retry {new_sandbox}")
+        else:
+            logger.error(f"{new_sandbox} is not available")
+            sys.exit(1)
+
+        if not stage:
+            logger.error(f"Failed to get the stage for {new_sandbox}")
+            sys.exit(1)
+
+        creation_status = sandbox_data.get('creation_status', {}).get('S', '')
+
+        if not creation_status:
+            logger.error(f"Failed to get the creation_status for {new_sandbox}")
+            sys.exit(1)
+
+
+def lock_sandbox(dynamodb, sandbox):
+    '''Reserve `sandbox` in the DB (stage 0, reservation 'untested'); the put requires the name to be unused unless retrying'''
+    item = {
+        'name': {
+            'S': sandbox
+        },
+        'available': {
+            'BOOL': False
+        },
+        'to_cleanup': {
+            'BOOL': False
+        },
+        'reservation': {
+            'S': 'untested'
+        },
+        'comment': {
+            'S': 'Creating new sandbox'
+        },
+        'stage': {
+            'S': STAGE0
+        },
+        'creation_status': {
+            'S': 'in progress'
+        }
+    }
+
+    response = dynamodb.put_item(
+        TableName=dynamodb_table,
+        # On retry the condition is a tautology, so the existing item is overwritten
+        ConditionExpression='attribute_not_exists(#n)' if not retry else 'attribute_exists(#n) or attribute_not_exists(#n)',
+        ExpressionAttributeNames={
+            '#n': 'name'
+        },
+        Item=item
+    )
+
+    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+        logger.error("Failed to lock the sandbox name")
+        sys.exit(1)
+
+    logger.info(f"Locked {sandbox}")
+
+lock_sandbox(dynamodb, new_sandbox)
+
+def exit_handler(db, table, sandbox):
+    '''atexit hook: remove the vault secret file, then delete the DB entry if creation stopped at stage 0 / 1-failed, else flag it failed unless validated'''
+
+    # Delete INFRA_VAULT_SECRET_FILE
+
+    os.remove(INFRA_VAULT_SECRET_FILE)
+
+    # Entries that never got an AWS account (STAGE0 / STAGE1_FAILED) are removed from the DB
+    stage = get_stage(db, sandbox)
+    if stage in [ STAGE0, STAGE1_FAILED ]:
+        response = db.delete_item(
+            TableName=table,
+            Key={
+                'name': {
+                    'S': sandbox
+                }
+            }
+        )
+
+        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+            logger.error(f"Failed to delete {sandbox}")
+            sys.exit(1)
+        else:
+            logger.info(f"Deleted {sandbox}")
+    elif stage == STAGE4_VALIDATED:
+        pass
+    else:
+        # something went wrong
+        logger.error(f"Unexpected stage: {stage}, missing validation")
+        logger.info(f"You can retry the operation by running the command with --retry {sandbox}")
+        set_(db, sandbox, 'creation_status', 'failed')
+        sys.exit(1)
+
+atexit.register(exit_handler, dynamodb, dynamodb_table, new_sandbox)
+
+# Prepare the AWS profile for the ansible-playbook command
+# - dynamodb   profile to manage the dynamodb table
+# - pool-manager profile to manage the pool
+# Save the file to AWS_SHARED_CREDENTIALS_FILE
+if target_db == 'prod':
+    with open(os.environ['AWS_SHARED_CREDENTIALS_FILE'], 'w') as f:
+        f.write(
+            f'''
+[dynamodb]
+aws_access_key_id = {os.environ['AWS_ACCESS_KEY_ID']}
+aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']}
+[pool-manager]
+aws_access_key_id = {os.environ['AWS_ACCESS_KEY_ID']}
+aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']}
+'''
+        )
+else:
+    with open(os.environ['AWS_SHARED_CREDENTIALS_FILE'], 'w') as f:
+        f.write(
+            f'''
+[dynamodb]
+aws_access_key_id = {os.environ['AWS_ACCESS_KEY_ID_DEV']}
+aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY_DEV']}
+[pool-manager]
+aws_access_key_id = {os.environ['AWS_ACCESS_KEY_ID']}
+aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']}
+            ''')
+
+# Prepare args for the ansible-playbook command
+#./create_range.yml -e account_num_start=3001 -e account_count=10 -e ddns_key_name=... -e ddns_key_secret=... -e ddns_server=...
+
+local_path = os.path.dirname(os.path.realpath(__file__))
+playbook = os.path.join(local_path, '..', 'playbooks', 'create_range.yml')
+
+args = [
+    'ansible-playbook',
+    playbook,
+    '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}',
+    '-e', f'account_email={new_email}',
+    '-e', 'account_count=1',
+    '-e', f'ddns_key_name={os.environ["ddns_key_name"]}',
+    '-e', f'ddns_server={os.environ["ddns_server"]}',
+    '-e', f'ddns_ttl={os.environ["ddns_ttl"]}',
+    '-e', f'sandbox={new_sandbox}',
+    '-e', 'update_stage=true',
+    '-e', 'dynamodb_profile=dynamodb',
+    '-e', f'dynamodb_table={dynamodb_table}',
+    '-e', 'aws_master_profile=pool-manager',
+    # Listing all accounts in the organization is a costly operation
+    # it takes currently 47s to execute.
+    # Check the account only in certain scenario, like for a retry
+    '-e', f'check_account_list={True if retry else False}',
+    '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}',
+]
+
+
+# Run the command
+logger.info(f"Running {' '.join(args)}")
+# Add the ddns_key_secret to the args
+args = args + ['-e', f'ddns_key_secret={os.environ["ddns_key_secret"]}']
+try:
+    completed = subprocess.run(
+        args, check=True,
+        #capture_output=True,
+        timeout=1800,
+    )
+except subprocess.CalledProcessError as e:
+    # Sanitize the error message by removing the DDNS key secret
+    e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***')
+    logger.error(f"Failed to run the command: {e_sanitized}")
+    # stdout/stderr are None unless capture_output is enabled, so guard before decoding
+    logger.error((e.stdout or b'').decode(), stdout=True)
+    logger.error((e.stderr or b'').decode(), stderr=True)
+
+    # Set sandbox status to failed
+    response = dynamodb.update_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': new_sandbox
+            }
+        },
+        UpdateExpression='SET #s = :val1',
+        ExpressionAttributeNames={
+            '#s': 'creation_status'
+        },
+        ExpressionAttributeValues={
+            ':val1': {
+                'S': 'failed'
+            }
+        }
+    )
+
+    sys.exit(1)
+except subprocess.TimeoutExpired as e:
+    # Sanitize the error message by removing the DDNS key secret
+    e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***')
+    logger.error(f"Timeout: {e_sanitized}", sandbox=new_sandbox)
+    # Set sandbox status to failed
+    response = dynamodb.update_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': new_sandbox
+            }
+        },
+        UpdateExpression='SET #s = :val1',
+        ExpressionAttributeNames={
+            '#s': 'creation_status'
+        },
+        ExpressionAttributeValues={
+            ':val1': {
+                'S': 'failed'
+            }
+        }
+    )
+    sys.exit(1)
+
+logger.info(f"Created {new_sandbox}")
+
+# Get the account_id from the db
+
+sandbox_data = get_sandbox(dynamodb, new_sandbox)
+
+if sandbox_data:
+    account_id = sandbox_data.get('account_id', {}).get('S', '')
+    logger.info(f"Account ID: {account_id}")
+    logger = logger.bind(account_id=account_id)
+
+    # Write the account_id and the account name to cloud-automation/new_sandboxes.txt
+    with open('cloud-automation/new_sandboxes.txt', 'w') as f:
+        f.write(f"{new_sandbox} {account_id}\n")
+
+set_(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED)
+ACCOUNT_CREATED_TIME = time.time()
+logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}")
+
+# Use https://console.redhat.com/docs/api/sources/v3.1#operations-sources-bulkCreate
+
+sandbox_data = get_sandbox(dynamodb, new_sandbox)
+
+if not sandbox_data:
+    logger.error(f"Failed to get the sandbox data for {new_sandbox}")
+    sys.exit(1)
+
+if 'aws_secret_access_key' not in sandbox_data:
+    logger.error(f"Failed to get the aws_secret_access_key for {new_sandbox}")
+    sys.exit(1)
+
+plaintext_key = decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key', {}).get('S', '')).strip(' \t\n\r')
+access_key = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r')
+
+if not access_key or not plaintext_key:
+    logger.error(f"Failed to get the access key for {new_sandbox}")
+    sys.exit(1)
+
+# use requests and create the POST request
+
+baseurl = 'https://console.redhat.com/api/sources/v3.1'
+
+s = requests.Session()
+s.auth = (os.environ['RH_USERNAME'], os.environ['RH_PASSWORD'])
+
+# delete the source if it exists
+# First get the source_id
+max_retries = 20
+while True:
+    response = s.get(f"{baseurl}/sources?filter[name][eq]={new_sandbox}")
+    if response.status_code == 200:
+        break
+    logger.error(f"Failed to get the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
+    if max_retries == 0:
+        sys.exit(1)
+
+    logger.info(f"Retrying: {max_retries} retries left")
+    max_retries -= 1
+    time.sleep(5)
+
+result = response.json().get('data', [])
+if len(result) > 0:
+    source_id = response.json().get('data', [{}])[0].get('id', '')
+
+    if source_id:
+        response = s.delete(f"{baseurl}/sources/{source_id}")
+        if response.status_code not in [200, 201, 202]:
+            logger.error(f"Failed to delete the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
+
+        logger.info(f"Deleted the source {source_id} for {new_sandbox}")
+
+        # Wait for the deletion to complete
+        max_retries = 20
+        while max_retries > 0:
+            response = s.get(f"{baseurl}/sources/{source_id}")
+            if response.status_code == 404:
+                break
+            max_retries -= 1
+            logger.info(f"Waiting for the source to be deleted from HCC (console): {max_retries} retries left")
+            time.sleep(5)
+
+        if max_retries == 0:
+            logger.error(f"Failed to delete the source {source_id} for {new_sandbox}")
+            sys.exit(1)
+
+payload = {
+    "sources": [
+        {
+            "name": new_sandbox,
+            "source_type_name": "amazon",
+            "app_creation_workflow": "account_authorization"
+        }
+    ],
+    "authentications": [
+        {
+            "resource_type": "source",
+            "resource_name": new_sandbox,
+            "username": access_key,
+            "password": plaintext_key,
+            "authtype": "access_key_secret_key"
+        }
+    ],
+
+    "applications": [
+        {
+            "source_name": new_sandbox,
+            "application_type_name": "cloud-meter"
+        }
+    ]
+}
+response = s.post(f"{baseurl}/bulk_create", json=payload)
+
+if response.status_code not in [200, 201]:
+    logger.error(f"Failed to create the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
+    sys.exit(1)
+
+# Run the validation playbook operation
+
+local_path = os.path.dirname(os.path.realpath(__file__))
+playbook = os.path.join(local_path, '..', 'playbooks', 'validate.yml')
+
+args = [
+    'ansible-playbook',
+    playbook,
+    '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}',
+    '-e', f'account_num_end={extract_sandbox_number(new_sandbox)}',
+    '-e', f'sandbox={new_sandbox}',
+    '-e', 'dynamodb_profile=dynamodb',
+    '-e', f'dynamodb_table={dynamodb_table}',
+    '-e', 'aws_master_profile=pool-manager',
+    '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}',
+    '-e', 'operation=VALIDATE',
+]
+
+# Run the command
+logger.info(f"Running {' '.join(args)}")
+
+try:
+    completed = subprocess.run(
+        args, check=True,
+        #capture_output=True,
+        timeout=1800,
+    )
+
+except subprocess.CalledProcessError as e:
+    logger.error(f"Failed to run the command: {e}")
+    # stdout/stderr are None unless capture_output is enabled, so guard before decoding
+    logger.error((e.stdout or b'').decode(), stdout=True)
+    logger.error((e.stderr or b'').decode(), stderr=True)
+
+    # Set sandbox status to validation failed
+    response = dynamodb.update_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': new_sandbox
+            }
+        },
+        UpdateExpression='SET #s = :val1',
+        ExpressionAttributeNames={
+            '#s': 'creation_status'
+        },
+        ExpressionAttributeValues={
+            ':val1': {
+                'S': 'validation failed'
+            }
+        }
+    )
+
+    sys.exit(1)
+
+except subprocess.TimeoutExpired as e:
+    logger.error(f"Timeout: {e}")
+    # Set sandbox status to validation timed out
+    response = dynamodb.update_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': new_sandbox
+            }
+        },
+        UpdateExpression='SET #s = :val1',
+        ExpressionAttributeNames={
+            '#s': 'creation_status'
+        },
+        ExpressionAttributeValues={
+            ':val1': {
+                'S': 'validation timed out'
+            }
+        }
+    )
+    sys.exit(1)
+
+logger.info(f"Validation successful for {new_sandbox}")
+
+# Move the sandbox to the final reservation
+
+response = dynamodb.update_item(
+    TableName=dynamodb_table,
+    Key={
+        'name': {
+            'S': new_sandbox
+        }
+    },
+    UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3',
+    ExpressionAttributeNames={
+        '#r': 'reservation',
+        '#s': 'stage',
+        '#c': 'creation_status'
+    },
+    ExpressionAttributeValues={
+        ':val1': {
+            'S': reservation
+        },
+        ':val2': {
+            'S': STAGE4_VALIDATED
+        },
+        ':val3': {
+            'S': 'success'
+        }
+    }
+)
+
+if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+    logger.error("Failed to update the reservation")
+    sys.exit(1)
+
+logger.info(f"Moved {new_sandbox} to {reservation}")
+logger.info(f"Total duration: {round(time.time() - START_TIME)} seconds")
diff --git a/playbooks/creation_status.py b/playbooks/creation_status.py
new file mode 100755
index 00000000..c73fb7b3
--- /dev/null
+++ b/playbooks/creation_status.py
@@ -0,0 +1,94 @@
+#!/usr/bin/env python3
+
+
+import subprocess
+import os
+import sys
+import boto3
+import argparse
+import atexit
+import structlog
+import logging
+import tempfile
+import random
+import string
+
+logger = structlog.get_logger()
+structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO))
+
+session_prod = boto3.Session(region_name='us-east-1')
+dynamodb_prod = session_prod.client('dynamodb')
+
+session_dev = boto3.Session(region_name='us-east-1',
+                            aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID_DEV'],
+                            aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY_DEV'])
+dynamodb_dev = session_dev.client('dynamodb')
+
+
+def print_sandbox(item, db):
+    if 'stage' not in item:
+        return
+
+    logger.info(item['name']['S'],
+                creation_status=item.get('creation_status', {}).get('S', ''),
+                stage= item.get('stage', {}).get('S', ''),
+                reservation= item.get('reservation', {}).get('S', ''),
+                db=db)
+
+
+response = dynamodb_dev.scan(
+    TableName='accounts-dev',
+    ConsistentRead=True,
+    ProjectionExpression='#n, creation_status, stage, reservation',
+    ExpressionAttributeNames={
+        '#n': 'name'
+    }
+)
+
+if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+    logger.error("Failed to get items from dynamodb")
+    sys.exit(1)
+
+data = response['Items']
+while 'LastEvaluatedKey' in response:
+    response = dynamodb_dev.scan(
+        TableName='accounts-dev',
+        ConsistentRead=True,
+        ProjectionExpression='#n, creation_status, stage, reservation',
+        ExpressionAttributeNames={'#n': 'name'},
+        ExclusiveStartKey=response['LastEvaluatedKey']
+    )
+    data.extend(response['Items'])
+
+if 'Items' in response:
+    for item in data:
+        print_sandbox(item, 'dev')
+
+# Same scan against the prod table; print_sandbox needs 'stage', so it must be projected too
+
+response = dynamodb_prod.scan(
+    TableName='accounts',
+    ConsistentRead=True,
+    ProjectionExpression='#n, creation_status, stage, reservation',
+    ExpressionAttributeNames={'#n': 'name'}
+)
+
+if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+    logger.error("Failed to get items from dynamodb")
+    sys.exit(1)
+
+data = response['Items']
+
+while 'LastEvaluatedKey' in response:
+    response = dynamodb_prod.scan(
+        TableName='accounts',
+        ConsistentRead=True,
+        ProjectionExpression='#n, creation_status, stage, reservation',
+        ExpressionAttributeNames={'#n': 'name'},
+        ExclusiveStartKey=response['LastEvaluatedKey']
+    )
+    data.extend(response['Items'])
+
+if 'Items' in response:
+    for item in data:
+        print_sandbox(item, 'prod')
diff --git a/playbooks/roles/infra-aws-sandbox/defaults/main.yml b/playbooks/roles/infra-aws-sandbox/defaults/main.yml
index 5a741a1e..502a4f3f 100644
--- a/playbooks/roles/infra-aws-sandbox/defaults/main.yml
+++ b/playbooks/roles/infra-aws-sandbox/defaults/main.yml
@@ -8,6 +8,11 @@ account_user: student
 
 operation: CREATE
 
+# Listing all accounts in the organization is a costly operation:
+# it currently takes about 47s to execute.
+# Only check the account list in certain scenarios, such as a retry.
+check_account_list: false
+
 available_after_reset: true
 available_after_create: false
 alias_suffix: '-gpte'
@@ -168,5 +173,6 @@ dynamodb_region: us-east-1
 account_altready_exists: false
 
 force_create: false
+update_stage: false
 
 aws_cli: aws
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/account.yml b/playbooks/roles/infra-aws-sandbox/tasks/account.yml
index 41da4139..36dbb11b 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/account.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/account.yml
@@ -21,52 +21,92 @@
     account_id: "{{ _getaccountid.stdout }}"
     account_already_exists: true
 
-- when:
-    - all_accounts is not defined
-    - operation == 'CREATE' or account_id is not defined
-  block:
-    - name: List all accounts in the organization.
-      command: >-
-        {{ aws_cli }} --profile {{ aws_master_profile }}
-        organizations list-accounts
-        --query 'Accounts[].{Name: Name, Id: Id}'
-      register: _listaccounts
-      changed_when: false
-
-    - set_fact:
-        all_accounts: "{{_listaccounts.stdout|from_json}}"
-
-- when:
-    - operation == 'CREATE' or account_id is not defined
-    - account_name not in all_accounts|json_query('[].Name')
-  name: Create New account.
+- name: List all accounts in the organization.
   command: >-
     {{ aws_cli }} --profile {{ aws_master_profile }}
-    organizations create-account
-    --email {{ account_email }}
-    --account-name "{{ account_name }}"
-    --role-name "{{ aws_role_name }}"
-    --query 'CreateAccountStatus.[Id]'
-    --output text
-  register: _createaccount
+    organizations list-accounts
+    --query 'Accounts[].{Name: Name, Id: Id, Email: Email, Status: Status}'
+  register: _listaccounts
+  changed_when: false
+  when:
+    - check_account_list | bool
+    - all_accounts is not defined
+    - operation == 'CREATE' or account_id is not defined
 
 - when:
+    - check_account_list | bool
     - operation == 'CREATE' or account_id is not defined
-    - account_name in all_accounts|json_query('[].Name')
   block:
-    - name: Find and set the account_id (existing account)
-      set_fact:
-        account_id: >-
+    - set_fact:
+        all_active_accounts: >-
+          {{
+          _listaccounts.stdout
+          | from_json
+          | json_query('[?Status==`ACTIVE`].{Name: Name, Id: Id}') }}
+
+    - name: Fail if the account name exists multiple times
+      vars:
+        selected_accounts: >-
+          {{
+          all_active_accounts
+          | json_query('[?Name==`'~ account_name ~'`]')
+          | list
+          }}
+      fail:
+        msg: |-
+          Account name exists multiple times in the organization.
+          {{ selected_accounts }}
+      when: selected_accounts | length > 1
+
+    - name: Reuse the existing account if one exists
+      vars:
+        selected_accounts: >-
           {{
-          (
-          all_accounts
-          |selectattr('Name', 'equalto', account_name)
-          |first
-          )['Id']
+          all_active_accounts
+          | json_query('[?Name==`'~ account_name ~'`]')
+          | list
           }}
+      when: selected_accounts | length == 1
+      set_fact:
+        account_id: >-
+          {{ all_active_accounts
+          | json_query('[?Name==`' ~ account_name ~ '`].Id')
+          | first }}
 
-    - debug:
-        var: account_id
+- when: account_id is defined
+  debug:
+    msg: "Reusing existing account with id {{ account_id }}"
+
+- when:
+    - operation == 'CREATE'
+    - account_id is not defined
+  block:
+  - name: Save status of the sandbox
+    when: update_stage | bool  # coerce: a string value such as "false" would otherwise be truthy
+    vars:
+      step1: "1 - Account Creation Started"
+      _data:
+        name:
+          S: "{{ account_name }}"
+    command: >-
+      {{ aws_cli }} --profile {{ dynamodb_profile }}
+      --region {{ dynamodb_region }}
+      dynamodb update-item
+      --table-name {{ dynamodb_table }}
+      --key '{{ _data | to_json }}'
+      --update-expression 'SET stage = :val'
+      --expression-attribute-values '{":val": {"S": "{{ step1 }}"}}'
+
+  - name: Create New account.
+    command: >-
+      {{ aws_cli }} --profile {{ aws_master_profile }}
+      organizations create-account
+      --email {{ account_email }}
+      --account-name "{{ account_name }}"
+      --role-name "{{ aws_role_name }}"
+      --query 'CreateAccountStatus.[Id]'
+      --output text
+    register: _createaccount
 
 - when:
     - _createaccount is not skipped
@@ -85,9 +125,37 @@
       retries: 40
       changed_when: false
 
-    - fail:
-        msg: The creation of the account failed.
-      when: _describestatus.stdout == 'FAILED'
+    - when: _describestatus.stdout == 'FAILED'  # always abort on failure, even without stage tracking
+      block:
+        - name: Get details of the failed account creation
+          command: >-
+            {{ aws_cli }} --profile {{ aws_master_profile }}
+            organizations describe-create-account-status
+            --create-account-request-id {{ _createaccount.stdout }}
+            --output json
+          register: _describestatus2
+          changed_when: false
+
+        - name: Save failed status of the sandbox
+          when: update_stage | bool
+          vars:
+            _data:
+              name:
+                S: "{{ account_name }}"
+          command: >-
+            {{ aws_cli }} --profile {{ dynamodb_profile }}
+            --region {{ dynamodb_region }}
+            dynamodb update-item
+            --table-name {{ dynamodb_table }}
+            --key '{{ _data | to_json }}'
+            --update-expression 'SET stage = :val'
+            --expression-attribute-values '{":val": {"S": "1 - Account Creation Failed"}}'
+
+        - debug:
+            var: _describestatus2
+
+        - fail:
+            msg: The creation of the account failed.
 
     - name: Get the account ID
       command: >-
@@ -107,6 +175,26 @@
   fail:
     msg: Account Id not defined
 
+- name: Save status of the sandbox
+  when: update_stage | bool  # same gate as the "Started" and "Failed" stage writes
+  vars:
+    _data:
+      name:
+        S: "{{ account_name }}"
+    _expr:
+      ":val":
+        "S": "1 - Account Creation Succeeded"
+      ":accountid":
+        "S": "{{ account_id }}"
+  command: >-
+    {{ aws_cli }} --profile {{ dynamodb_profile }}
+    --region {{ dynamodb_region }}
+    dynamodb update-item
+    --table-name {{ dynamodb_table }}
+    --key '{{ _data | to_json }}'
+    --update-expression 'SET stage = :val, account_id = :accountid'
+    --expression-attribute-values '{{ _expr | to_json }}'
+
 - name: Add Account Id to the report
   lineinfile:
     path: "{{ output_dir }}/{{ account_name }}_report.txt"
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/assume.yml b/playbooks/roles/infra-aws-sandbox/tasks/assume.yml
index 1a9845d0..00f41438 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/assume.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/assume.yml
@@ -1,5 +1,8 @@
 ---
 - name: Get temporary token for the sandbox (Assume Role)
+  environment:
+    AWS_ACCESS_KEY_ID: ""
+    AWS_SECRET_ACCESS_KEY: ""
   sts_assume_role:
     profile: "{{ aws_master_profile }}"
     role_arn: "arn:aws:iam::{{ account_id }}:role/OrganizationAccountAccessRole"
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/iam.yml b/playbooks/roles/infra-aws-sandbox/tasks/iam.yml
index 999d83d7..0ca6fafe 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/iam.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/iam.yml
@@ -1,14 +1,23 @@
 ---
 - name: Create IAM role using Cloudformation
+  environment:
+    AWS_ACCESS_KEY_ID: ""
+    AWS_SECRET_ACCESS_KEY: ""
   cloudformation:
     profile: "{{ account_profile }}"
     template_body: "{{ lookup('file', 'CF-IAM.json') }}"
     region: "{{ aws_region }}"
     stack_name: roles
+  retries: 50
+  delay: 2
   register: r_cf
-  ignore_errors: yes
+  until: r_cf is succeeded
+  ignore_errors: true
 
 - when: r_cf is failed
+  environment:
+    AWS_ACCESS_KEY_ID: ""
+    AWS_SECRET_ACCESS_KEY: ""
   block:
     - name: Delete IAM role Cloudformation stack
       cloudformation:
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml b/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml
index ea821cbb..0ba94db6 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml
@@ -1,33 +1,23 @@
 ---
-# TODO: create a list of keys and loop over it instead of doing one by one
-- name: Import OPENTLC backdoor key
-  ec2_key:
-    name: opentlc_admin_backdoor
-    region: "{{ _region }}"
-    key_material: "{{ opentlc_admin_backdoor }}"
-    aws_access_key: "{{ assumed_role.sts_creds.access_key }}"
-    aws_secret_key: "{{ assumed_role.sts_creds.secret_key }}"
-    session_token: "{{ assumed_role.sts_creds.session_token }}"
-  loop: "{{ all_regions }}"
-  loop_control:
-    loop_var: _region
-  register: r_import
-  retries: 5
-  delay: "{{ 30|random(start=3, step=1) }}"
-  until: r_import is succeeded
-
-- name: Import OPENTLC ocpkey
-  ec2_key:
-    name: ocpkey
-    region: "{{ _region }}"
-    key_material: "{{ ocpkey }}"
-    aws_access_key: "{{ assumed_role.sts_creds.access_key }}"
-    aws_secret_key: "{{ assumed_role.sts_creds.secret_key }}"
-    session_token: "{{ assumed_role.sts_creds.session_token }}"
-  loop: "{{ all_regions }}"
-  loop_control:
-    loop_var: _region
-  register: r_import2
-  retries: 5
-  delay: "{{ 30|random(start=3, step=1) }}"
-  until: r_import2 is succeeded
+# TODO: make sure this is unused across the accounts and retire
+- environment:
+    AWS_ACCESS_KEY_ID: ""
+    AWS_SECRET_ACCESS_KEY: ""
+  block:
+  - name: Import OPENTLC backdoor key
+    # ap-southeast-4 region breaks the ec2_key module, ignore it
+    when: _region != "ap-southeast-4"
+    ec2_key:
+      name: opentlc_admin_backdoor
+      region: "{{ _region }}"
+      key_material: "{{ opentlc_admin_backdoor }}"
+      aws_access_key: "{{ assumed_role.sts_creds.access_key }}"
+      aws_secret_key: "{{ assumed_role.sts_creds.secret_key }}"
+      session_token: "{{ assumed_role.sts_creds.session_token }}"
+    loop: "{{ all_regions }}"
+    loop_control:
+      loop_var: _region
+    register: r_import
+    retries: 10
+    delay: "{{ 10|random(start=3, step=1) }}"
+    until: r_import is succeeded
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/main.yml b/playbooks/roles/infra-aws-sandbox/tasks/main.yml
index 80356498..ce62e22d 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/main.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/main.yml
@@ -1,5 +1,7 @@
 ---
 - import_tasks: pre_checks.yml
+- when: operation == 'VALIDATE' or operation == 'validate'
+  include_tasks: validate.yaml
 - import_tasks: account.yml
   tags: account
 - import_tasks: profile.yml
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/ou.yml b/playbooks/roles/infra-aws-sandbox/tasks/ou.yml
index f6240988..cee38eb6 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/ou.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/ou.yml
@@ -30,21 +30,20 @@
 
 - when: accounts_in_ou is not defined
   block:
-    - name: List the accounts in the destination OU
+    - name: Get the OU of the account
       command: >-
-        {{ aws_cli }} --profile {{ aws_master_profile }}
-        organizations list-accounts-for-parent
-        --parent-id {{ destouid }}
-        --query 'Accounts[].Id'
-        --output json
-      register: _listaccounts_in_ou
+         {{ aws_cli }} --profile {{ aws_master_profile }}
+         organizations list-parents --child-id {{ account_id }}
+      register: r_ou
       changed_when: false
 
-    - name: Save organization OU
-      set_fact:
-        accounts_in_ou: "{{ _listaccounts_in_ou.stdout | from_json | list }}"
-
-- when: account_id not in accounts_in_ou
+- when: >-
+    destouid not in
+    ( r_ou.stdout
+    | from_json
+    | json_query('Parents[].Id')
+    | default([], true)
+    )
   name: Move account to destination OU
   command: >-
     {{ aws_cli }} --profile {{ aws_master_profile }}
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/pool.yml b/playbooks/roles/infra-aws-sandbox/tasks/pool.yml
index ad417d4f..666ba656 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/pool.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/pool.yml
@@ -37,7 +37,7 @@
     --table-name {{ dynamodb_table }}
     --item '{{ _data | to_json }}'
   register: _putaccount
-  when: _getaccount.stdout == '' or force_create
+  when: _getaccount.stdout == '' or force_create or update_stage
 
 - debug:
     var: _putaccount
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/route53.yml b/playbooks/roles/infra-aws-sandbox/tasks/route53.yml
index b6c58c5f..6f5dd22c 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/route53.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/route53.yml
@@ -1,13 +1,15 @@
 ---
 - environment:
     AWS_PROFILE: "{{ account_profile }}"
+    AWS_ACCESS_KEY_ID: ""
+    AWS_SECRET_ACCESS_KEY: ""
   block:
     - name: Create the public zone
       route53_zone:
         zone: "{{ account_name }}{{subdomain_base}}."
       register: _route53zone
-      retries: 5
-      delay: "{{ 60|random(start=3, step=1) }}"
+      retries: 10
+      delay: "{{ 10|random(start=3, step=1) }}"
       until: _route53zone is succeeded
 
     - set_fact:
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/user.yml b/playbooks/roles/infra-aws-sandbox/tasks/user.yml
index 68ba4dbf..4c423f6e 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/user.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/user.yml
@@ -2,6 +2,8 @@
 # NOTE: Use 'command' module instead of ansible iam module because it doesn't work well with boto profiles.
 - environment:
     AWS_PROFILE: "{{ account_profile }}"
+    AWS_ACCESS_KEY_ID: ""
+    AWS_SECRET_ACCESS_KEY: ""
   block:
     - name: Check if user already exists
       command: >-
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml b/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml
index cd69a1b1..c8dbf34d 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml
@@ -33,6 +33,9 @@
   include_tasks: unvault.yml
 
 - name: Validate AWS access
+  environment:
+    AWS_ACCESS_KEY_ID: ""
+    AWS_SECRET_ACCESS_KEY: ""
   aws_caller_info:
     aws_access_key: "{{ sandbox_aws_access_key_id }}"
     aws_secret_key: "{{ sandbox_aws_secret_access_key }}"
@@ -67,12 +70,20 @@
 
   assert:
     that: ns_entries == ns_entries_route53
+  retries: 60
+  delay: 1
 
 - name: Ensure Red Hat GOLD AMI are accessible from within the sandbox
   ec2_ami_info:
     aws_access_key: "{{ sandbox_aws_access_key_id }}"
     aws_secret_key: "{{ sandbox_aws_secret_access_key }}"
-    region: us-east-1
+    region: >-
+      {{
+      ['us-east-1',
+      'us-east-2',
+      'us-west-1',
+      'us-west-2'
+      ] | shuffle | first }}
     # Red Hat official
     owner: 309956199498
     filters:
@@ -80,6 +91,10 @@
       name: RHEL-9.0*Access*
       is-public: "false"
   register: r_image
+  # Try for 15m
+  retries: 150
+  delay: 6
+  until: r_image is succeeded and r_image.images | length > 0
 
 - assert:
     that: >-
@@ -89,3 +104,6 @@
       or
       r_image.images[0].platform_details == 'Red Hat BYOL Linux'
       )
+
+- when: operation == 'VALIDATE' or operation == 'validate'
+  meta: end_play
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 00000000..a1b085ea
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,26 @@
+Jinja2
+MarkupSafe
+PyYAML
+ansible
+ansible_vault
+awscli
+boto3
+botocore
+cffi
+colorama
+cryptography
+distro
+dnspython
+docutils
+jmespath
+psutil
+pyasn1
+pycparser
+python-dateutil
+requests
+rsa
+s3transfer
+selinux
+six
+structlog
+urllib3

From 8a2364eb7bff7b1029a60870d69916c34e948fd9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Wed, 15 Jan 2025 21:56:47 +0100
Subject: [PATCH 02/11] Add argument to toggle playbook output easily

---
 playbooks/create_sandbox.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
index efe9a0b7..ff4ac53a 100755
--- a/playbooks/create_sandbox.py
+++ b/playbooks/create_sandbox.py
@@ -38,6 +38,7 @@
 parser.add_argument('--target-db', required=False, help='The target database', default='dev')
 parser.add_argument('--log-level', required=False, help='The log level', default='info')
 parser.add_argument('--retry', required=False, help='Retry sandbox by passing its name', default=None)
+parser.add_argument('--playbook-output', required=False, help='Print output of ansible-playbook commands?', action=argparse.BooleanOptionalAction, default=True)
 
 args = parser.parse_args()
 
@@ -46,6 +47,7 @@
 target_db = args.target_db
 log_level = args.log_level
 retry = args.retry
+playbook_output = args.playbook_output
 
 if log_level == 'debug':
     logger.info("Setting log level to DEBUG")
@@ -473,7 +475,7 @@ def exit_handler(db, table, sandbox):
 try:
     completed = subprocess.run(
         args, check=True,
-        #capture_output=True,
+        capture_output=(not playbook_output),
         timeout=1800,
     )
 except subprocess.CalledProcessError as e:
@@ -668,7 +670,7 @@ def exit_handler(db, table, sandbox):
 try:
     completed = subprocess.run(
         args, check=True,
-        #capture_output=True,
+        capture_output=(not playbook_output),
         timeout=1800,
     )
 

From b008ec7c10fab5b9c116d8ea11a4899e1e3e1fc4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Thu, 16 Jan 2025 15:39:44 +0100
Subject: [PATCH 03/11] Add sec, operation and billing info to account

---
 playbooks/create_sandbox.py                   |  4 +-
 .../roles/infra-aws-sandbox/tasks/account.yml | 69 +++++++++++++++++++
 2 files changed, 72 insertions(+), 1 deletion(-)

diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
index ff4ac53a..e705b2a9 100755
--- a/playbooks/create_sandbox.py
+++ b/playbooks/create_sandbox.py
@@ -302,7 +302,7 @@ def decrypt_vaulted_str(secret):
     return Vault(os.environ['INFRA_VAULT_SECRET']).load_raw(secret).decode('utf-8')
 
 new_sandbox, new_email = guess_next_sandbox(sandboxes, sandboxes_dict)
-logger.info(f"=> Create {new_sandbox}")
+logger = logger.bind(sandbox=new_sandbox)
 
 
 # Lock the name of the sandbox in DB so another
@@ -646,6 +646,8 @@ def exit_handler(db, table, sandbox):
     logger.error(f"Failed to create the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
     sys.exit(1)
 
+logger.info(f"Source create in HCC")
+
 # Run the validation playbook operation
 
 local_path = os.path.dirname(os.path.realpath(__file__))
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/account.yml b/playbooks/roles/infra-aws-sandbox/tasks/account.yml
index 36dbb11b..980c7a45 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/account.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/account.yml
@@ -200,3 +200,72 @@
     path: "{{ output_dir }}/{{ account_name }}_report.txt"
     line: "account_id: {{ account_id }}"
     create: true
+
+- name: Load environment variables for SECURITY contact
+  set_fact:
+    security_email: "{{ lookup('env', 'SECURITY_EMAIL') }}"
+    security_name: "{{ lookup('env', 'SECURITY_NAME') }}"
+    security_phone: "{{ lookup('env', 'SECURITY_PHONE') }}"
+    security_title: "{{ lookup('env', 'SECURITY_TITLE') }}"
+
+- name: Set SECURITY alternate contact
+  when:  # skipped unless all four SECURITY_* environment variables are non-empty
+    - security_email != ''
+    - security_name  != ''
+    - security_phone != ''
+    - security_title != ''
+  command: >-
+    {{ aws_cli }} --profile {{ aws_master_profile | quote }}
+    account put-alternate-contact
+    --account-id {{ account_id | quote }}
+    --alternate-contact-type=SECURITY
+    --email-address {{ security_email | quote }}
+    --name {{ security_name | quote }}
+    --phone-number {{ security_phone | quote }}
+    --title {{ security_title | quote }}
+
+- name: Load environment variables for OPERATIONS contact
+  set_fact:
+    operations_email: "{{ lookup('env', 'OPERATIONS_EMAIL') }}"
+    operations_name: "{{ lookup('env', 'OPERATIONS_NAME') }}"
+    operations_phone: "{{ lookup('env', 'OPERATIONS_PHONE') }}"
+    operations_title: "{{ lookup('env', 'OPERATIONS_TITLE') }}"
+
+- name: Set OPERATIONS alternate contact
+  when:  # skipped unless all four OPERATIONS_* environment variables are non-empty
+    - operations_email != ''
+    - operations_name  != ''
+    - operations_phone != ''
+    - operations_title != ''
+  command: >-
+    {{ aws_cli }} --profile {{ aws_master_profile | quote }}
+    account put-alternate-contact
+    --account-id {{ account_id | quote }}
+    --alternate-contact-type=OPERATIONS
+    --email-address {{ operations_email | quote }}
+    --name {{ operations_name | quote }}
+    --phone-number {{ operations_phone | quote }}
+    --title {{ operations_title | quote }}
+
+- name: Load environment variables for BILLING contact
+  set_fact:
+    billing_email: "{{ lookup('env', 'BILLING_EMAIL') }}"
+    billing_name: "{{ lookup('env', 'BILLING_NAME') }}"
+    billing_phone: "{{ lookup('env', 'BILLING_PHONE') }}"
+    billing_title: "{{ lookup('env', 'BILLING_TITLE') }}"
+
+- name: Set BILLING alternate contact
+  when:  # skipped unless all four BILLING_* environment variables are non-empty
+    - billing_email != ''
+    - billing_name  != ''
+    - billing_phone != ''
+    - billing_title != ''
+  command: >-
+    {{ aws_cli }} --profile {{ aws_master_profile | quote }}
+    account put-alternate-contact
+    --account-id {{ account_id | quote }}
+    --alternate-contact-type=BILLING
+    --email-address {{ billing_email | quote }}
+    --name {{ billing_name | quote }}
+    --phone-number {{ billing_phone | quote }}
+    --title {{ billing_title | quote }}

From 17a8a449ce91d0cea4d88b707f2c65b0e1e59248 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Thu, 16 Jan 2025 23:03:44 +0100
Subject: [PATCH 04/11] Add option to disable hcc and validation. Fix guessing

---
 playbooks/create_sandbox.py | 369 +++++++++++++++++++-----------------
 1 file changed, 191 insertions(+), 178 deletions(-)

diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
index e705b2a9..1d1a3981 100755
--- a/playbooks/create_sandbox.py
+++ b/playbooks/create_sandbox.py
@@ -39,6 +39,9 @@
 parser.add_argument('--log-level', required=False, help='The log level', default='info')
 parser.add_argument('--retry', required=False, help='Retry sandbox by passing its name', default=None)
 parser.add_argument('--playbook-output', required=False, help='Print output of ansible-playbook commands?', action=argparse.BooleanOptionalAction, default=True)
+parser.add_argument('--hcc', required=False, help='run the registration step for Gold images?', action=argparse.BooleanOptionalAction, default=True)
+parser.add_argument('--validation', required=False, help='run the validation playbook?', action=argparse.BooleanOptionalAction, default=True)
+parser.add_argument('--guess-strategy', required=False, help='How to guess the next number: smart, end', default='end')
 
 args = parser.parse_args()
 
@@ -48,6 +51,9 @@
 log_level = args.log_level
 retry = args.retry
 playbook_output = args.playbook_output
+hcc = args.hcc
+validation = args.validation
+guess_strategy = args.guess_strategy
 
 if log_level == 'debug':
     logger.info("Setting log level to DEBUG")
@@ -193,6 +199,12 @@
     logger.info(f"Found {len(sandboxes_prod)} sandboxes in prod")
     sandboxes = sandboxes + sandboxes_prod
 
+def extract_sandbox_number(sandbox):
+    """Extract the number from the sandbox name, for example sandbox1234 returns 1234"""
+    return int(sandbox.split('sandbox')[1])
+
+sandboxes.sort(key=extract_sandbox_number)
+
 # transform into a dictionary
 sandboxes_dict = {sandbox: True for sandbox in sandboxes}
 
@@ -275,10 +287,6 @@ def get_sandbox(dynamodb, sandbox):
         return {}
 
 
-def extract_sandbox_number(sandbox):
-    """Extract the number from the sandbox name, for example sandbox1234 returns 1234"""
-    return int(sandbox.split('sandbox')[1])
-
 
 def guess_next_sandbox(sandboxes, sandboxes_dict):
     """Find the first available sandbox name"""
@@ -289,10 +297,13 @@ def guess_next_sandbox(sandboxes, sandboxes_dict):
 
     if retry:
         return retry, f"{retry}+{random_email_tag}@{os.environ['email_domain']}"
-    for i in range(1, len(sandboxes) + 1):
-        if not sandboxes_dict.get(f"sandbox{i}", False):
-            return f"sandbox{i}", f"sandbox{i}+{random_email_tag}@{os.environ['email_domain']}"
 
+    if guess_strategy == 'smart':
+        for i in range(1, len(sandboxes) + 1):
+            if not sandboxes_dict.get(f"sandbox{i}", False):
+                return f"sandbox{i}", f"sandbox{i}+{random_email_tag}@{os.environ['email_domain']}"
+
+    logger.info(f"len(sanboxes) = {len(sandboxes)}")
     s = f"sandbox{extract_sandbox_number(sandboxes[-1]) + 1}"
     return s, f"{s}+{random_email_tag}@{os.environ['email_domain']}"
 
@@ -304,7 +315,6 @@ def decrypt_vaulted_str(secret):
 new_sandbox, new_email = guess_next_sandbox(sandboxes, sandboxes_dict)
 logger = logger.bind(sandbox=new_sandbox)
 
-
 # Lock the name of the sandbox in DB so another
 # concurrent process won't be able to create the same sandbox.
 sandbox_data = get_sandbox(dynamodb, new_sandbox)
@@ -549,163 +559,189 @@ def exit_handler(db, table, sandbox):
 ACCOUNT_CREATED_TIME = time.time()
 logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}")
 
-# Use https://console.redhat.com/docs/api/sources/v3.1#operations-sources-bulkCreate
 
-sandbox_data = get_sandbox(dynamodb, new_sandbox)
+if hcc:
+    # Use https://console.redhat.com/docs/api/sources/v3.1#operations-sources-bulkCreate
 
-if not sandbox_data:
-    logger.error(f"Failed to get the sandbox data for {new_sandbox}")
-    sys.exit(1)
+    sandbox_data = get_sandbox(dynamodb, new_sandbox)
 
-if 'aws_secret_access_key' not in sandbox_data:
-    logger.error(f"Failed to get the aws_secret_access_key for {new_sandbox}")
-    sys.exit(1)
-
-plaintext_key = decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key', {}).get('S', '')).strip(' \t\n\r')
-access_key = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r')
-
-if not access_key or not plaintext_key:
-    logger.error(f"Failed to get the access key for {new_sandbox}")
-    sys.exit(1)
-
-# use requests and create the POST request
-
-baseurl = 'https://console.redhat.com/api/sources/v3.1'
-
-s = requests.Session()
-s.auth = (os.environ['RH_USERNAME'], os.environ['RH_PASSWORD'])
+    if not sandbox_data:
+        logger.error(f"Failed to get the sandbox data for {new_sandbox}")
+        sys.exit(1)
 
-# delete the source if it exists
-# First get the source_id
-max_retries = 20
-while True:
-    response = s.get(f"{baseurl}/sources?filter[name][eq]={new_sandbox}")
-    if response.status_code == 200:
-        break
-    logger.error(f"Failed to get the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
-    if max_retries == 0:
+    if 'aws_secret_access_key' not in sandbox_data:
+        logger.error(f"Failed to get the aws_secret_access_key for {new_sandbox}")
         sys.exit(1)
 
-    logger.info(f"Retrying: {max_retries} retries left")
-    max_retries -= 1
-    time.sleep(5)
+    plaintext_key = decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key', {}).get('S', '')).strip(' \t\n\r')
+    access_key = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r')
 
-result = response.json().get('data', [])
-if len(result) > 0:
-    source_id = response.json().get('data', [{}])[0].get('id', '')
+    if not access_key or not plaintext_key:
+        logger.error(f"Failed to get the access key for {new_sandbox}")
+        sys.exit(1)
 
-    if source_id:
-        response = s.delete(f"{baseurl}/sources/{source_id}")
-        if response.status_code not in [200, 201, 202]:
-            logger.error(f"Failed to delete the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
+    # use requests and create the POST request
 
-        logger.info(f"Deleted the source {source_id} for {new_sandbox}")
+    baseurl = 'https://console.redhat.com/api/sources/v3.1'
 
-        # Wait for the deletion to complete
-        max_retries = 20
-        while max_retries > 0:
-            response = s.get(f"{baseurl}/sources/{source_id}")
-            if response.status_code == 404:
-                break
-            max_retries -= 1
-            logger.info(f"Waiting for the source to be deleted from HCC (console): {max_retries} retries left")
-            time.sleep(5)
+    s = requests.Session()
+    s.auth = (os.environ['RH_USERNAME'], os.environ['RH_PASSWORD'])
 
+    # delete the source if it exists
+    # First get the source_id
+    max_retries = 20
+    while True:
+        response = s.get(f"{baseurl}/sources?filter[name][eq]={new_sandbox}")
+        if response.status_code == 200:
+            break
+        logger.error(f"Failed to get the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
         if max_retries == 0:
-            logger.error(f"Failed to delete the source {source_id} for {new_sandbox}")
             sys.exit(1)
 
-payload = {
-    "sources": [
-        {
-            "name": new_sandbox,
-            "source_type_name": "amazon",
-            "app_creation_workflow": "account_authorization"
-        }
-    ],
-    "authentications": [
-        {
-            "resource_type": "source",
-            "resource_name": new_sandbox,
-            "username": access_key,
-            "password": plaintext_key,
-            "authtype": "access_key_secret_key"
-        }
-    ],
-
-    "applications": [
-        {
-            "source_name": new_sandbox,
-            "application_type_name": "cloud-meter"
-        }
-    ]
-}
-response = s.post(f"{baseurl}/bulk_create", json=payload)
+        logger.info(f"Retrying: {max_retries} retries left")
+        max_retries -= 1
+        time.sleep(5)
+
+    result = response.json().get('data', [])
+    if len(result) > 0:
+        source_id = response.json().get('data', [{}])[0].get('id', '')
+
+        if source_id:
+            response = s.delete(f"{baseurl}/sources/{source_id}")
+            if response.status_code not in [200, 201, 202]:
+                logger.error(f"Failed to delete the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
+
+            logger.info(f"Deleted the source {source_id} for {new_sandbox}")
+
+            # Wait for the deletion to complete
+            max_retries = 20
+            while max_retries > 0:
+                response = s.get(f"{baseurl}/sources/{source_id}")
+                if response.status_code == 404:
+                    break
+                max_retries -= 1
+                logger.info(f"Waiting for the source to be deleted from HCC (console): {max_retries} retries left")
+                time.sleep(5)
+
+            if max_retries == 0:
+                logger.error(f"Failed to delete the source {source_id} for {new_sandbox}")
+                sys.exit(1)
+
+    payload = {
+        "sources": [
+            {
+                "name": new_sandbox,
+                "source_type_name": "amazon",
+                "app_creation_workflow": "account_authorization"
+            }
+        ],
+        "authentications": [
+            {
+                "resource_type": "source",
+                "resource_name": new_sandbox,
+                "username": access_key,
+                "password": plaintext_key,
+                "authtype": "access_key_secret_key"
+            }
+        ],
 
-if response.status_code not in [200, 201]:
-    logger.error(f"Failed to create the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
-    sys.exit(1)
+        "applications": [
+            {
+                "source_name": new_sandbox,
+                "application_type_name": "cloud-meter"
+            }
+        ]
+    }
+    response = s.post(f"{baseurl}/bulk_create", json=payload)
 
-logger.info(f"Source create in HCC")
+    if response.status_code not in [200, 201]:
+        logger.error(f"Failed to create the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
+        sys.exit(1)
 
-# Run the validation playbook operation
+    logger.info(f"Source create in HCC")
+
+if validation:
+    # Run the validation playbook operation
+
+    local_path = os.path.dirname(os.path.realpath(__file__))
+    playbook = os.path.join(local_path, '..', 'playbooks', 'validate.yml')
+
+    args = [
+        'ansible-playbook',
+        playbook,
+        '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}',
+        '-e', f'account_num_end={extract_sandbox_number(new_sandbox)}',
+        '-e', f'sandbox={new_sandbox}',
+        '-e', 'dynamodb_profile=dynamodb',
+        '-e', f'dynamodb_table={dynamodb_table}',
+        '-e', 'aws_master_profile=pool-manager',
+        '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}',
+        '-e', 'operation=VALIDATE',
+    ]
 
-local_path = os.path.dirname(os.path.realpath(__file__))
-playbook = os.path.join(local_path, '..', 'playbooks', 'validate.yml')
+    # Run the command
+    logger.info(f"Running {' '.join(args)}")
 
-args = [
-    'ansible-playbook',
-    playbook,
-    '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}',
-    '-e', f'account_num_end={extract_sandbox_number(new_sandbox)}',
-    '-e', f'sandbox={new_sandbox}',
-    '-e', 'dynamodb_profile=dynamodb',
-    '-e', f'dynamodb_table={dynamodb_table}',
-    '-e', 'aws_master_profile=pool-manager',
-    '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}',
-    '-e', 'operation=VALIDATE',
-]
+    try:
+        completed = subprocess.run(
+            args, check=True,
+            capture_output=(not playbook_output),
+            timeout=1800,
+        )
 
-# Run the command
-logger.info(f"Running {' '.join(args)}")
+    except subprocess.CalledProcessError as e:
+        logger.error(f"Failed to run the command: {e}")
+        # print stdout and stderr
+        logger.error(e.stdout.decode(), stdout=True)
+        logger.error(e.stderr.decode(), stderr=True)
 
-try:
-    completed = subprocess.run(
-        args, check=True,
-        capture_output=(not playbook_output),
-        timeout=1800,
-    )
+        # Set sandbox status to validation failed
+        response = dynamodb.update_item(
+            TableName=dynamodb_table,
+            Key={
+                'name': {
+                    'S': new_sandbox
+                }
+            },
+            UpdateExpression='SET #s = :val1',
+            ExpressionAttributeNames={
+                '#s': 'creation_status'
+            },
+            ExpressionAttributeValues={
+                ':val1': {
+                    'S': 'validation failed'
+                }
+            }
+        )
 
-except subprocess.CalledProcessError as e:
-    logger.error(f"Failed to run the command: {e}")
-    # print stdout and stderr
-    logger.error(e.stdout.decode(), stdout=True)
-    logger.error(e.stderr.decode(), stderr=True)
+        sys.exit(1)
 
-    # Set sandbox status to validation failed
-    response = dynamodb.update_item(
-        TableName=dynamodb_table,
-        Key={
-            'name': {
-                'S': new_sandbox
-            }
-        },
-        UpdateExpression='SET #s = :val1',
-        ExpressionAttributeNames={
-            '#s': 'creation_status'
-        },
-        ExpressionAttributeValues={
-            ':val1': {
-                'S': 'validation failed'
+    except subprocess.TimeoutExpired as e:
+        logger.error(f"Timeout: {e}")
+        # Set sandbox status to validation failed
+        response = dynamodb.update_item(
+            TableName=dynamodb_table,
+            Key={
+                'name': {
+                    'S': new_sandbox
+                }
+            },
+            UpdateExpression='SET #s = :val1',
+            ExpressionAttributeNames={
+                '#s': 'creation_status'
+            },
+            ExpressionAttributeValues={
+                ':val1': {
+                    'S': 'validation timed out'
+                }
             }
-        }
-    )
+        )
+        sys.exit(1)
 
-    sys.exit(1)
+    logger.info(f"Validation successful for {new_sandbox}")
+
+    # Move the sandbox to the final reservation
 
-except subprocess.TimeoutExpired as e:
-    logger.error(f"Timeout: {e}")
-    # Set sandbox status to validation failed
     response = dynamodb.update_item(
         TableName=dynamodb_table,
         Key={
@@ -713,51 +749,28 @@ def exit_handler(db, table, sandbox):
                 'S': new_sandbox
             }
         },
-        UpdateExpression='SET #s = :val1',
+        UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3',
         ExpressionAttributeNames={
-            '#s': 'creation_status'
+            '#r': 'reservation',
+            '#s': 'stage',
+            '#c': 'creation_status'
         },
         ExpressionAttributeValues={
             ':val1': {
-                'S': 'validation timed out'
+                'S': reservation
+            },
+            ':val2': {
+                'S': STAGE4_VALIDATED
+            },
+            ':val3': {
+                'S': 'success'
             }
         }
     )
-    sys.exit(1)
 
-logger.info(f"Validation successful for {new_sandbox}")
-
-# Move the sandbox to the final reservation
-
-response = dynamodb.update_item(
-    TableName=dynamodb_table,
-    Key={
-        'name': {
-            'S': new_sandbox
-        }
-    },
-    UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3',
-    ExpressionAttributeNames={
-        '#r': 'reservation',
-        '#s': 'stage',
-        '#c': 'creation_status'
-    },
-    ExpressionAttributeValues={
-        ':val1': {
-            'S': reservation
-        },
-        ':val2': {
-            'S': STAGE4_VALIDATED
-        },
-        ':val3': {
-            'S': 'success'
-        }
-    }
-)
-
-if response['ResponseMetadata']['HTTPStatusCode'] != 200:
-    logger.error("Failed to update the reservation")
-    sys.exit(1)
+    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+        logger.error("Failed to update the reservation")
+        sys.exit(1)
 
-logger.info(f"Moved {new_sandbox} to {reservation}")
-logger.info(f"Total duration: {round(time.time() - START_TIME)} seconds")
+    logger.info(f"Moved {new_sandbox} to {reservation}")
+    logger.info(f"Total duration: {round(time.time() - START_TIME)} seconds")

From a8abe449cee0d66d62c5172402505e8a8557f952 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Fri, 17 Jan 2025 16:00:25 +0100
Subject: [PATCH 05/11] Add the ability to skip playbook/validation/hcc

---
 playbooks/create_sandbox.py  | 222 +++++++++++++++++++----------------
 playbooks/creation_status.py |  11 +-
 2 files changed, 119 insertions(+), 114 deletions(-)

diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
index 1d1a3981..985c29e5 100755
--- a/playbooks/create_sandbox.py
+++ b/playbooks/create_sandbox.py
@@ -39,6 +39,7 @@
 parser.add_argument('--log-level', required=False, help='The log level', default='info')
 parser.add_argument('--retry', required=False, help='Retry sandbox by passing its name', default=None)
 parser.add_argument('--playbook-output', required=False, help='Print output of ansible-playbook commands?', action=argparse.BooleanOptionalAction, default=True)
+parser.add_argument('--playbook', required=False, help='run the creation playbook?', action=argparse.BooleanOptionalAction, default=True)
 parser.add_argument('--hcc', required=False, help='run the registration step for Gold images?', action=argparse.BooleanOptionalAction, default=True)
 parser.add_argument('--validation', required=False, help='run the validation playbook?', action=argparse.BooleanOptionalAction, default=True)
 parser.add_argument('--guess-strategy', required=False, help='How to guess the next number: smart, end', default='end')
@@ -50,6 +51,7 @@
 target_db = args.target_db
 log_level = args.log_level
 retry = args.retry
+playbook= args.playbook
 playbook_output = args.playbook_output
 hcc = args.hcc
 validation = args.validation
@@ -385,8 +387,6 @@ def lock_sandbox(dynamodb, sandbox):
 
     logger.info(f"Locked {new_sandbox}")
 
-lock_sandbox(dynamodb, new_sandbox)
-
 def exit_handler(db, table, sandbox):
     '''Function to cleanup everything in case something went wrong'''
 
@@ -414,14 +414,24 @@ def exit_handler(db, table, sandbox):
     elif stage == STAGE4_VALIDATED:
         pass
     else:
-        # something went wrong
-        logger.error(f"Unexpected stage: {stage}, missing validation")
-        logger.info(f"You can retry the operation by running the command with --retry {sandbox}")
-        set_(dynamodb, new_sandbox, 'creation_status', 'failed')
-        sys.exit(1)
+        if validation:
+            # something went wrong
+            logger.error(f"Unexpected stage: {stage}, missing validation")
+            logger.info(f"You can retry the operation by running the command with --retry {sandbox}")
+            set_(dynamodb, new_sandbox, 'creation_status', 'failed')
+            sys.exit(1)
+        if hcc:
+            if stage != STAGE3_GOLD_IMAGE:
+                # something went wrong
+                logger.error(f"Unexpected stage: {stage}, missing validation")
+                logger.info(f"You can retry the operation by running the command with --retry {sandbox}")
+                set_(dynamodb, new_sandbox, 'creation_status', 'failed')
+                sys.exit(1)
+
 
 atexit.register(exit_handler, dynamodb, dynamodb_table, new_sandbox)
 
+
 # Prepare the AWS profile for the ansible-playbook command
 # - dynamodb   profile to manage the dynamodb table
 # - pool-manager profile to manage the pool
@@ -450,114 +460,118 @@ def exit_handler(db, table, sandbox):
 aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']}
             ''')
 
-# Prepare args for the ansible-playbook command
-#./create_range.yml -e account_num_start=3001 -e account_count=10 -e ddns_key_name=... -e ddns_key_secret=... -e ddns_server=...
-
-local_path = os.path.dirname(os.path.realpath(__file__))
-playbook = os.path.join(local_path, '..', 'playbooks', 'create_range.yml')
-
-args = [
-    'ansible-playbook',
-    playbook,
-    '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}',
-    '-e', f'account_email={new_email}',
-    '-e', 'account_count=1',
-    '-e', f'ddns_key_name={os.environ["ddns_key_name"]}',
-    '-e', f'ddns_server={os.environ["ddns_server"]}',
-    '-e', f'ddns_ttl={os.environ["ddns_ttl"]}',
-    '-e', f'sandbox={new_sandbox}',
-    '-e', 'update_stage=true',
-    '-e', 'dynamodb_profile=dynamodb',
-    '-e', f'dynamodb_table={dynamodb_table}',
-    '-e', 'aws_master_profile=pool-manager',
-    # Listing all accounts in the organization is a costly operation
-    # it takes currently 47s to execute.
-    # Check the account only in certain scenario, like for a retry
-    '-e', f'check_account_list={True if retry else False}',
-    '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}',
-]
+if playbook:
+    lock_sandbox(dynamodb, new_sandbox)
 
+    # Prepare args for the ansible-playbook command
+    #./create_range.yml -e account_num_start=3001 -e account_count=10 -e ddns_key_name=... -e ddns_key_secret=... -e ddns_server=...
 
-# Run the command
-logger.info(f"Running {' '.join(args)}")
-# Add the ddns_key_secret to the args
-args = args + ['-e', f'ddns_key_secret={os.environ["ddns_key_secret"]}']
-try:
-    completed = subprocess.run(
-        args, check=True,
-        capture_output=(not playbook_output),
-        timeout=1800,
-    )
-except subprocess.CalledProcessError as e:
-    # Sanitize the error message by removing the DDNS key secret
-    e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***')
-    logger.error(f"Failed to run the command: {e_sanitized}")
-    # print stdout and stderr
-    logger.error(e.stdout.decode(), stdout=True)
-    logger.error(e.stderr.decode(), stderr=True)
-
-    # Set sandbox status to failed
-    response = dynamodb.update_item(
-        TableName=dynamodb_table,
-        Key={
-            'name': {
-                'S': new_sandbox
-            }
-        },
-        UpdateExpression='SET #s = :val1',
-        ExpressionAttributeNames={
-            '#s': 'creation_status'
-        },
-        ExpressionAttributeValues={
-            ':val1': {
-                'S': 'failed'
-            }
-        }
-    )
+    local_path = os.path.dirname(os.path.realpath(__file__))
+    playbook = os.path.join(local_path, '..', 'playbooks', 'create_range.yml')
 
-    sys.exit(1)
-except subprocess.TimeoutExpired as e:
-    # Sanitize the error message by removing the DDNS key secret
-    e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***')
-    logger.error(f"Timeout: {e_sanitized}", sandbox=new_sandbox)
-    # Set sandbox status to failed
-    response = dynamodb.update_item(
-        TableName=dynamodb_table,
-        Key={
-            'name': {
-                'S': new_sandbox
+    args = [
+        'ansible-playbook',
+        playbook,
+        '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}',
+        '-e', f'account_email={new_email}',
+        '-e', 'account_count=1',
+        '-e', f'ddns_key_name={os.environ["ddns_key_name"]}',
+        '-e', f'ddns_server={os.environ["ddns_server"]}',
+        '-e', f'ddns_ttl={os.environ["ddns_ttl"]}',
+        '-e', f'sandbox={new_sandbox}',
+        '-e', 'update_stage=true',
+        '-e', 'dynamodb_profile=dynamodb',
+        '-e', f'dynamodb_table={dynamodb_table}',
+        '-e', 'aws_master_profile=pool-manager',
+        # Listing all accounts in the organization is a costly operation
+        # it takes currently 47s to execute.
+        # Check the account only in certain scenario, like for a retry
+        '-e', f'check_account_list={True if retry else False}',
+        '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}',
+    ]
+
+
+    # Run the command
+    logger.info(f"Running {' '.join(args)}")
+    # Add the ddns_key_secret to the args
+    args = args + ['-e', f'ddns_key_secret={os.environ["ddns_key_secret"]}']
+    try:
+        completed = subprocess.run(
+            args, check=True,
+            capture_output=(not playbook_output),
+            timeout=1800,
+        )
+    except subprocess.CalledProcessError as e:
+        # Sanitize the error message by removing the DDNS key secret
+        e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***')
+        logger.error(f"Failed to run the command: {e_sanitized}")
+        # print stdout and stderr
+        logger.error(e.stdout.decode(), stdout=True)
+        logger.error(e.stderr.decode(), stderr=True)
+
+        # Set sandbox status to failed
+        response = dynamodb.update_item(
+            TableName=dynamodb_table,
+            Key={
+                'name': {
+                    'S': new_sandbox
+                }
+            },
+            UpdateExpression='SET #s = :val1',
+            ExpressionAttributeNames={
+                '#s': 'creation_status'
+            },
+            ExpressionAttributeValues={
+                ':val1': {
+                    'S': 'failed'
+                }
             }
-        },
-        UpdateExpression='SET #s = :val1',
-        ExpressionAttributeNames={
-            '#s': 'creation_status'
-        },
-        ExpressionAttributeValues={
-            ':val1': {
-                'S': 'failed'
+        )
+
+        sys.exit(1)
+    except subprocess.TimeoutExpired as e:
+        # Sanitize the error message by removing the DDNS key secret
+        e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***')
+        logger.error(f"Timeout: {e_sanitized}", sandbox=new_sandbox)
+        # Set sandbox status to failed
+        response = dynamodb.update_item(
+            TableName=dynamodb_table,
+            Key={
+                'name': {
+                    'S': new_sandbox
+                }
+            },
+            UpdateExpression='SET #s = :val1',
+            ExpressionAttributeNames={
+                '#s': 'creation_status'
+            },
+            ExpressionAttributeValues={
+                ':val1': {
+                    'S': 'failed'
+                }
             }
-        }
-    )
-    sys.exit(1)
+        )
+        sys.exit(1)
 
-logger.info(f"Created {new_sandbox}")
+    logger.info(f"Created {new_sandbox}")
 
-# Get the account_id from the db
+    # Get the account_id from the db
 
-sandbox_data = get_sandbox(dynamodb, new_sandbox)
+    sandbox_data = get_sandbox(dynamodb, new_sandbox)
 
-if sandbox_data:
-    account_id = sandbox_data.get('account_id', {}).get('S', '')
-    logger.info(f"Account ID: {account_id}")
-    logger = logger.bind(account_id=account_id)
+    if sandbox_data:
+        account_id = sandbox_data.get('account_id', {}).get('S', '')
+        logger.info(f"Account ID: {account_id}")
+        logger = logger.bind(account_id=account_id)
 
-    # Write the account_id and the account name to cloud-automation/new_sandboxes.txt
-    with open('cloud-automation/new_sandboxes.txt', 'w') as f:
-        f.write(f"{new_sandbox} {account_id}\n")
+        # Write the account_id and the account name to cloud-automation/new_sandboxes.txt
+        with open('cloud-automation/new_sandboxes.txt', 'w') as f:
+            f.write(f"{new_sandbox} {account_id}\n")
 
-set_(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED)
-ACCOUNT_CREATED_TIME = time.time()
-logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}")
+    set_(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED)
+    set_(dynamodb, new_sandbox, 'reservation', 'untested')
+    ACCOUNT_CREATED_TIME = time.time()
+    logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}")
 
 
 if hcc:
diff --git a/playbooks/creation_status.py b/playbooks/creation_status.py
index c73fb7b3..af884f4b 100755
--- a/playbooks/creation_status.py
+++ b/playbooks/creation_status.py
@@ -33,16 +33,13 @@ def print_sandbox(item, db):
                 creation_status=item.get('creation_status', {}).get('S', ''),
                 stage= item.get('stage', {}).get('S', ''),
                 reservation= item.get('reservation', {}).get('S', ''),
+                account_id=item.get('account_id', {}).get('S', ''),
                 db=db)
 
 
 response = dynamodb_dev.scan(
     TableName='accounts-dev',
     ConsistentRead=True,
-    ProjectionExpression='#n, creation_status, stage, reservation',
-    ExpressionAttributeNames={
-        '#n': 'name'
-    }
 )
 
 if response['ResponseMetadata']['HTTPStatusCode'] != 200:
@@ -54,8 +51,6 @@ def print_sandbox(item, db):
     response = dynamodb_dev.scan(
         TableName='accounts-dev',
         ConsistentRead=True,
-        ProjectionExpression='#n',
-        ExpressionAttributeNames={'#n': 'name'},
         ExclusiveStartKey=response['LastEvaluatedKey']
     )
     data.extend(response['Items'])
@@ -69,8 +64,6 @@ def print_sandbox(item, db):
 response = dynamodb_prod.scan(
     TableName='accounts',
     ConsistentRead=True,
-    ProjectionExpression='#n',
-    ExpressionAttributeNames={'#n': 'name'}
 )
 
 if response['ResponseMetadata']['HTTPStatusCode'] != 200:
@@ -83,8 +76,6 @@ def print_sandbox(item, db):
     response = dynamodb_prod.scan(
         TableName='accounts',
         ConsistentRead=True,
-        ProjectionExpression='#n',
-        ExpressionAttributeNames={'#n': 'name'},
         ExclusiveStartKey=response['LastEvaluatedKey']
     )
     data.extend(response['Items'])

From 933c28203248d663b9fd8a9891fd30dc18373fe0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Mon, 20 Jan 2025 15:49:34 +0100
Subject: [PATCH 06/11] Handle concurrency better by backing off

---
 playbooks/roles/infra-aws-sandbox/tasks/account.yml | 3 +++
 playbooks/roles/infra-aws-sandbox/tasks/ou.yml      | 2 ++
 2 files changed, 5 insertions(+)

diff --git a/playbooks/roles/infra-aws-sandbox/tasks/account.yml b/playbooks/roles/infra-aws-sandbox/tasks/account.yml
index 980c7a45..9014bdd6 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/account.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/account.yml
@@ -107,6 +107,9 @@
       --query 'CreateAccountStatus.[Id]'
       --output text
     register: _createaccount
+    retries: 10
+    # Make this especially long to avoid the issue with too many requests
+    delay: 120
 
 - when:
     - _createaccount is not skipped
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/ou.yml b/playbooks/roles/infra-aws-sandbox/tasks/ou.yml
index cee38eb6..d14d2d8e 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/ou.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/ou.yml
@@ -51,3 +51,5 @@
     --account-id {{ account_id }}
     --source-parent-id {{ rootid }}
     --destination-parent-id {{ destouid }}
+  retries: 10
+  delay: 120

From 212ac4cea3353d00e7c88d0974032a89985ba4d1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Mon, 20 Jan 2025 15:49:47 +0100
Subject: [PATCH 07/11] Improve retry validation

---
 playbooks/create_sandbox.py | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
index 985c29e5..4224c4f6 100755
--- a/playbooks/create_sandbox.py
+++ b/playbooks/create_sandbox.py
@@ -338,11 +338,11 @@ def decrypt_vaulted_str(secret):
             logger.error(f"Failed to get the stage for {new_sandbox}")
             sys.exit(1)
 
-        creation_status = sandbox_data.get('creation_status', {}).get('S', '')
+        # creation_status = sandbox_data.get('creation_status', {}).get('S', '')
 
-        if not creation_status:
-            logger.error(f"Failed to get the creation_status for {new_sandbox}")
-            sys.exit(1)
+        # if not creation_status:
+        #     logger.error(f"Failed to get the creation_status for {new_sandbox}")
+        #     sys.exit(1)
 
 
 def lock_sandbox(dynamodb, sandbox):
@@ -396,7 +396,7 @@ def exit_handler(db, table, sandbox):
 
     # Check if the stage is STAGE0
     stage = get_stage(db, sandbox)
-    if stage in [ STAGE0, STAGE1_FAILED ]:
+    if stage in [ STAGE0, STAGE1_STARTED, STAGE1_FAILED ]:
         response = db.delete_item(
             TableName=table,
             Key={
@@ -675,6 +675,19 @@ def exit_handler(db, table, sandbox):
     logger.info(f"Source create in HCC")
 
 if validation:
+    # First ensure the current reservation of the sandbox is 'untested'
+
+    sandbox_data = get_sandbox(dynamodb, new_sandbox)
+
+    if sandbox_data:
+        if sandbox_data.get('stage', {}).get('S', '') == STAGE4_VALIDATED:
+            logger.info("Sandbox is already validated. Skipping.")
+            exit(0)
+
+        if sandbox_data.get('reservation', {}).get('S', '') != 'untested':
+            logger.error("Sandbox reservation is not 'untested'. something's off.")
+            exit(1)
+
     # Run the validation playbook operation
 
     local_path = os.path.dirname(os.path.realpath(__file__))
@@ -763,11 +776,12 @@ def exit_handler(db, table, sandbox):
                 'S': new_sandbox
             }
         },
-        UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3',
+        UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3, #a = :val4',
         ExpressionAttributeNames={
             '#r': 'reservation',
             '#s': 'stage',
-            '#c': 'creation_status'
+            '#c': 'creation_status',
+            '#a': 'available'
         },
         ExpressionAttributeValues={
             ':val1': {
@@ -778,6 +792,9 @@ def exit_handler(db, table, sandbox):
             },
             ':val3': {
                 'S': 'success'
+            },
+            ':val4': {
+                'BOOL': True,
             }
         }
     )

From 74ec3147956efa884c12b824cfe240806aca9401 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Tue, 21 Jan 2025 10:53:46 +0100
Subject: [PATCH 08/11] Improve performance + validation

---
 playbooks/create_sandbox.py  | 190 +++++++++++++++++++++--------------
 playbooks/creation_status.py |   1 +
 2 files changed, 117 insertions(+), 74 deletions(-)

diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
index 4224c4f6..4a340949 100755
--- a/playbooks/create_sandbox.py
+++ b/playbooks/create_sandbox.py
@@ -141,76 +141,36 @@
     INFRA_VAULT_SECRET_FILE = f.name
     logger.info(f"Created temporary file {INFRA_VAULT_SECRET_FILE}")
 
-# run `sandbox-list -all --sort name`
-
-response = dynamodb_dev.scan(
-    TableName='accounts-dev',
-    ConsistentRead=True,
-    ProjectionExpression='#n',
-    ExpressionAttributeNames={
-        '#n': 'name'
-    }
-)
-
-if response['ResponseMetadata']['HTTPStatusCode'] != 200:
-    logger.error("Failed to get items from dynamodb")
-    sys.exit(1)
-
-data = response['Items']
-while 'LastEvaluatedKey' in response:
-    response = dynamodb_dev.scan(
-        TableName='accounts-dev',
-        ConsistentRead=True,
-        ProjectionExpression='#n',
-        ExpressionAttributeNames={'#n': 'name'},
-        ExclusiveStartKey=response['LastEvaluatedKey']
-    )
-    data.extend(response['Items'])
-
-if 'Items' in response:
-    sandboxes = [item['name']['S'] for item in data]
-    logger.info(f"Found {len(sandboxes)} sandboxes in dev")
-
-# Now run the command for the prod database
-
-response = dynamodb_prod.scan(
-    TableName='accounts',
-    ConsistentRead=True,
-    ProjectionExpression='#n',
-    ExpressionAttributeNames={'#n': 'name'}
-)
-
-if response['ResponseMetadata']['HTTPStatusCode'] != 200:
-    logger.error("Failed to get items from dynamodb")
-    sys.exit(1)
-
-data = response['Items']
-
-while 'LastEvaluatedKey' in response:
-    response = dynamodb_prod.scan(
-        TableName='accounts',
-        ConsistentRead=True,
-        ProjectionExpression='#n',
-        ExpressionAttributeNames={'#n': 'name'},
-        ExclusiveStartKey=response['LastEvaluatedKey']
-    )
-    data.extend(response['Items'])
-
-if 'Items' in response:
-    sandboxes_prod = [item['name']['S'] for item in data]
-    logger.info(f"Found {len(sandboxes_prod)} sandboxes in prod")
-    sandboxes = sandboxes + sandboxes_prod
 
 def extract_sandbox_number(sandbox):
     """Extract the number from the sandbox name, for example sandbox1234 returns 1234"""
     return int(sandbox.split('sandbox')[1])
 
-sandboxes.sort(key=extract_sandbox_number)
+def set_str(dynamodb, sandbox, key, value):
+    '''Set the key value pair in the DB'''
+    response = dynamodb.update_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': sandbox
+            }
+        },
+        UpdateExpression='SET #k = :val1',
+        ExpressionAttributeNames={
+            '#k': key
+        },
+        ExpressionAttributeValues={
+            ':val1': {
+                'S': value
+            }
+        }
+    )
 
-# transform into a dictionary
-sandboxes_dict = {sandbox: True for sandbox in sandboxes}
+    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+        raise Exception(f"Failed to set {key} to {value}")
 
-def set_(dynamodb, sandbox, key, value):
+# TODO detect type instead of this _bool and _str
+def set_bool(dynamodb, sandbox, key, value):
     '''Set the key value pair in the DB'''
     response = dynamodb.update_item(
         TableName=dynamodb_table,
@@ -225,7 +185,7 @@ def set_(dynamodb, sandbox, key, value):
         },
         ExpressionAttributeValues={
             ':val1': {
-                'S': value
+                'BOOL': value
             }
         }
     )
@@ -288,9 +248,74 @@ def get_sandbox(dynamodb, sandbox):
     else:
         return {}
 
+def get_all_sandboxes(dynamodb_prod, dynamodb_dev):
+    # run `sandbox-list -all --sort name`
+
+    response = dynamodb_dev.scan(
+        TableName='accounts-dev',
+        ConsistentRead=True,
+        ProjectionExpression='#n',
+        ExpressionAttributeNames={
+            '#n': 'name'
+        }
+    )
+
+    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+        logger.error("Failed to get items from dynamodb")
+        sys.exit(1)
+
+    data = response['Items']
+    while 'LastEvaluatedKey' in response:
+        response = dynamodb_dev.scan(
+            TableName='accounts-dev',
+            ConsistentRead=True,
+            ProjectionExpression='#n',
+            ExpressionAttributeNames={'#n': 'name'},
+            ExclusiveStartKey=response['LastEvaluatedKey']
+        )
+        data.extend(response['Items'])
+
+    if 'Items' in response:
+        sandboxes = [item['name']['S'] for item in data]
+        logger.info(f"Found {len(sandboxes)} sandboxes in dev")
 
+    # Now run the command for the prod database
 
-def guess_next_sandbox(sandboxes, sandboxes_dict):
+    response = dynamodb_prod.scan(
+        TableName='accounts',
+        ConsistentRead=True,
+        ProjectionExpression='#n',
+        ExpressionAttributeNames={'#n': 'name'}
+    )
+
+    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+        logger.error("Failed to get items from dynamodb")
+        sys.exit(1)
+
+    data = response['Items']
+
+    while 'LastEvaluatedKey' in response:
+        response = dynamodb_prod.scan(
+            TableName='accounts',
+            ConsistentRead=True,
+            ProjectionExpression='#n',
+            ExpressionAttributeNames={'#n': 'name'},
+            ExclusiveStartKey=response['LastEvaluatedKey']
+        )
+        data.extend(response['Items'])
+
+    if 'Items' in response:
+        sandboxes_prod = [item['name']['S'] for item in data]
+        logger.info(f"Found {len(sandboxes_prod)} sandboxes in prod")
+        sandboxes = sandboxes + sandboxes_prod
+
+    sandboxes.sort(key=extract_sandbox_number)
+
+    return sandboxes
+
+
+
+def guess_next_sandbox(dynamodb_prod, dynamodb_dev):
     """Find the first available sandbox name"""
     # Generate a random email tag sandbox1+RANDSTR@opentlc.com
     # used when we reuse the account name. For some reason, the email is still registered
@@ -300,6 +325,11 @@ def guess_next_sandbox(sandboxes, sandboxes_dict):
     if retry:
         return retry, f"{retry}+{random_email_tag}@{os.environ['email_domain']}"
 
+    sandboxes = get_all_sandboxes(dynamodb_prod, dynamodb_dev)
+
+    # transform into a dictionary
+    sandboxes_dict = {sandbox: True for sandbox in sandboxes}
+
     if guess_strategy == 'smart':
         for i in range(1, len(sandboxes) + 1):
             if not sandboxes_dict.get(f"sandbox{i}", False):
@@ -314,7 +344,7 @@ def decrypt_vaulted_str(secret):
     '''Decrypt the vaulted secret'''
     return Vault(os.environ['INFRA_VAULT_SECRET']).load_raw(secret).decode('utf-8')
 
-new_sandbox, new_email = guess_next_sandbox(sandboxes, sandboxes_dict)
+new_sandbox, new_email = guess_next_sandbox(dynamodb_prod, dynamodb_dev)
 logger = logger.bind(sandbox=new_sandbox)
 
 # Lock the name of the sandbox in DB so another
@@ -328,7 +358,7 @@ def decrypt_vaulted_str(secret):
 
     # Ensure the sandbox is not in use, available should be absent or true
     if retry:
-        if sandbox_data.get('available', {}).get('BOOL', True) is False:
+        if sandbox_data.get('service_uuid', {}).get('S', '') == '':
             logger.info(f"Retry {new_sandbox}")
         else:
             logger.error(f"{new_sandbox} is not available")
@@ -418,14 +448,14 @@ def exit_handler(db, table, sandbox):
             # something went wrong
             logger.error(f"Unexpected stage: {stage}, missing validation")
             logger.info(f"You can retry the operation by running the command with --retry {sandbox}")
-            set_(dynamodb, new_sandbox, 'creation_status', 'failed')
+            set_str(dynamodb, new_sandbox, 'creation_status', 'failed')
             sys.exit(1)
         if hcc:
             if stage != STAGE3_GOLD_IMAGE:
                 # something went wrong
                 logger.error(f"Unexpected stage: {stage}, missing validation")
                 logger.info(f"You can retry the operation by running the command with --retry {sandbox}")
-                set_(dynamodb, new_sandbox, 'creation_status', 'failed')
+                set_str(dynamodb, new_sandbox, 'creation_status', 'failed')
                 sys.exit(1)
 
 
@@ -568,8 +598,8 @@ def exit_handler(db, table, sandbox):
         with open('cloud-automation/new_sandboxes.txt', 'w') as f:
             f.write(f"{new_sandbox} {account_id}\n")
 
-    set_(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED)
-    set_(dynamodb, new_sandbox, 'reservation', 'untested')
+    set_str(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED)
+    set_str(dynamodb, new_sandbox, 'reservation', 'untested')
     ACCOUNT_CREATED_TIME = time.time()
     logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}")
 
@@ -680,12 +710,24 @@ def exit_handler(db, table, sandbox):
     sandbox_data = get_sandbox(dynamodb, new_sandbox)
 
     if sandbox_data:
+        reservation_current = sandbox_data.get('reservation', {}).get('S', '')
         if sandbox_data.get('stage', {}).get('S', '') == STAGE4_VALIDATED:
-            logger.info("Sandbox is already validated. Skipping.")
+
+            if sandbox_data.get('available', {}).get('BOOL', '') == False:
+                set_bool(dynamodb, new_sandbox, 'available', True)
+                logger.info(f"Set {new_sandbox} as available")
+            logger.info("Sandbox is already validated. Skipping validation.")
+
+            if reservation_current != reservation:
+                set_str(dynamodb, new_sandbox, 'reservation', reservation)
+
+                logger.info("Reservation updated",
+                            previous_reservation=reservation_current)
+
             exit(0)
 
-        if sandbox_data.get('reservation', {}).get('S', '') != 'untested':
-            logger.error("Sandbox reservation is not 'untested'. something's off.")
+        if reservation_current != 'untested':
+            logger.error("Sandbox reservation is not 'untested'. something's off.", found=reservation_current)
             exit(1)
 
     # Run the validation playbook operation
diff --git a/playbooks/creation_status.py b/playbooks/creation_status.py
index af884f4b..a1da7e4c 100755
--- a/playbooks/creation_status.py
+++ b/playbooks/creation_status.py
@@ -32,6 +32,7 @@ def print_sandbox(item, db):
     logger.info(item['name']['S'],
                 creation_status=item.get('creation_status', {}).get('S', ''),
                 stage= item.get('stage', {}).get('S', ''),
+                available=item.get('available', {}).get('BOOL', ''),
                 reservation= item.get('reservation', {}).get('S', ''),
                 account_id=item.get('account_id', {}).get('S', ''),
                 db=db)

From 07086535ab73ed3d7cbdd6bf9ab3fb1ae63d3e10 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Thu, 30 Jan 2025 11:59:46 +0100
Subject: [PATCH 09/11] Switch to manual conf for HCC + improvements

- Use CLIENT ID and SECRET instead of username / password credentials
- Switch to manual configuration for console HCC trust, so we can use
  static role and policy
- Generate a random external_id and save it in DB
- Keep the external_id across cleanup
- Filter redhat-HCC-role and policy during cleanup.
---
 playbooks/create_sandbox.py                   | 240 ++++++++++++++++--
 playbooks/creation_status.py                  |   1 +
 .../roles/infra-aws-sandbox/defaults/main.yml |   4 +
 .../roles/infra-aws-sandbox/tasks/pool.yml    |  22 ++
 4 files changed, 248 insertions(+), 19 deletions(-)

diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
index 4a340949..b10d401e 100755
--- a/playbooks/create_sandbox.py
+++ b/playbooks/create_sandbox.py
@@ -1,11 +1,10 @@
 #!/usr/bin/env python3
 
-# First, grab the list of all sandboxes
-
 import subprocess
 import os
 import sys
 import boto3
+import hashlib
 import argparse
 import atexit
 import structlog
@@ -15,6 +14,7 @@
 import string
 import requests
 import time
+import json
 from ansible_vault import Vault
 
 START_TIME = time.time()
@@ -70,6 +70,7 @@
 os.environ.setdefault('ddns_key_algorithm', 'hmac-sha512')
 os.environ.setdefault('ddns_ttl', '600')
 os.environ.setdefault('email_domain', 'opentlc.com')
+os.environ.setdefault('REDHAT_ACCOUNT', '998366406740')
 # set default to ~/.aws/credentials_create
 os.environ.setdefault('AWS_SHARED_CREDENTIALS_FILE', os.path.expanduser('~/.aws/credentials_create'))
 # Create directory if it doesn't exist, chmod 700
@@ -87,8 +88,8 @@
     'INFRA_VAULT_SECRET_PROD',
     'ddns_server',
     'ddns_key_secret',
-    'RH_USERNAME',
-    'RH_PASSWORD',
+    'HCC_CLIENT_ID',
+    'HCC_CLIENT_SECRET',
 ]
 
 # constants: Steps
@@ -462,6 +463,33 @@ def exit_handler(db, table, sandbox):
 atexit.register(exit_handler, dynamodb, dynamodb_table, new_sandbox)
 
 
+def get_sso_access_token():
+    """Return a bearer token for console.redhat.com.
+
+    Does a client-credentials grant with HCC_CLIENT_ID / HCC_CLIENT_SECRET.
+    Raises ValueError on a non-200 reply or when no access_token is returned.
+    """
+    # This is the standard Keycloak endpoint for client_credentials
+    token_url = "https://sso.redhat.com/auth/realms/redhat-external/protocol/openid-connect/token"
+    # Client Credentials Grant
+    payload = {
+        "grant_type": "client_credentials",
+        "client_id": os.environ['HCC_CLIENT_ID'],
+        "client_secret": os.environ['HCC_CLIENT_SECRET']
+    }
+
+    # Bound the call so an unresponsive SSO endpoint cannot hang the run.
+    response = requests.post(token_url, data=payload, timeout=30)
+    if response.status_code != 200:
+        raise ValueError(f"Failed to obtain token: {response.status_code} {response.text}")
+
+    access_token = response.json().get("access_token")
+    if not access_token:
+        raise ValueError("No access token found in the response")
+
+    logger.info("Successfully obtained an access token for console.redhat.com.")
+    return access_token
+
 # Prepare the AWS profile for the ansible-playbook command
 # - dynamodb   profile to manage the dynamodb table
 # - pool-manager profile to manage the pool
@@ -490,6 +518,24 @@ def exit_handler(db, table, sandbox):
 aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']}
             ''')
 
+def assume_role(master_profile, role_arn, role_session_name, region_name='us-east-2'):
+    """Assume role_arn through STS and return the temporary credentials.
+
+    The STS client is built from the named profile master_profile in
+    region_name; the 'Credentials' entry of the reply is returned.
+    """
+    sts_client = boto3.Session(profile_name=master_profile).client(
+        'sts', region_name=region_name
+    )
+    reply = sts_client.assume_role(RoleArn=role_arn, RoleSessionName=role_session_name)
+
+    # Any HTTP status other than 200 is treated as a failed assumption.
+    if reply['ResponseMetadata']['HTTPStatusCode'] != 200:
+        raise Exception("Failed to assume role")
+
+    return reply['Credentials']
+
+
 if playbook:
     lock_sandbox(dynamodb, new_sandbox)
 
@@ -617,19 +663,169 @@ def exit_handler(db, table, sandbox):
         logger.error(f"Failed to get the aws_secret_access_key for {new_sandbox}")
         sys.exit(1)
 
-    plaintext_key = decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key', {}).get('S', '')).strip(' \t\n\r')
-    access_key = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r')
 
-    if not access_key or not plaintext_key:
-        logger.error(f"Failed to get the access key for {new_sandbox}")
+    account_id = sandbox_data.get('account_id', {}).get('S', '')
+    role_arn = f"arn:aws:iam::{account_id}:role/OrganizationAccountAccessRole"
+
+    credentials = assume_role('pool-manager', role_arn, 'hcc-registration')
+
+    if not credentials:
+        logger.error("Failed to assume role", role_arn=role_arn)
+        sys.exit(1)
+
+    # create a new session with the assumed role credentials
+    sandbox_session = boto3.Session(
+        aws_access_key_id=credentials['AccessKeyId'],
+        aws_secret_access_key=credentials['SecretAccessKey'],
+        aws_session_token=credentials['SessionToken'],
+        region_name='us-east-1'
+    )
+
+    policy_name = 'redhat-HCC-policy'
+
+    iam_client = sandbox_session.client('iam')
+    # Customer-managed only: the unscoped listing is paginated and mostly AWS-managed policies.
+    policies = iam_client.list_policies(Scope='Local')
+    policy = {
+        "Version": "2012-10-17",
+        "Statement": [
+            {
+                "Sid": "CloudigradePolicy",
+                "Effect": "Allow",
+                "Action": [
+                    "sts:GetCallerIdentity",
+                    "ec2:DescribeImages",
+                    "ec2:DescribeInstances",
+                    "ec2:ModifySnapshotAttribute",
+                    "ec2:DescribeSnapshotAttribute",
+                    "ec2:DescribeSnapshots",
+                    "ec2:CopyImage",
+                    "ec2:CreateTags",
+                    "ec2:DescribeRegions",
+                    "cloudtrail:CreateTrail",
+                    "cloudtrail:UpdateTrail",
+                    "cloudtrail:PutEventSelectors",
+                    "cloudtrail:DescribeTrails",
+                    "cloudtrail:StartLogging",
+                    "cloudtrail:DeleteTrail"
+                ],
+                "Resource": "*"
+            }
+        ]
+    }
+
+    md5_policy = hashlib.md5(json.dumps(policy).encode()).hexdigest()
+
+    if policy_name not in [policy['PolicyName'] for policy in policies['Policies']]:
+        response = iam_client.create_policy(
+            PolicyName=policy_name,
+            PolicyDocument=json.dumps(policy),
+            Description="Policy to grant access to Red Hat Hybrid Cloud Console to the AWS account"
+        )
+
+        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+            logger.error("Failed to create the policy")
+            sys.exit(1)
+
+        logger.info("Policy created", policy_name=policy_name, md5=md5_policy)
+    else:
+        # update permission
+        response = iam_client.create_policy_version(
+            PolicyArn=f"arn:aws:iam::{account_id}:policy/{policy_name}",
+            PolicyDocument=json.dumps(policy),
+            SetAsDefault=True
+        )
+
+        logger.info("Policy updated", policy_name=policy_name, md5=md5_policy)
+
+
+
+    # The trust policy of redhat-HCC-role (built below) is conditioned on an external_id.
+    # Reuse the one stored in the DB record, or generate and store a new one.
+    external_id = sandbox_data.get('external_id', {}).get('S', '')
+
+    if not external_id:
+        # No external_id in the record yet
+        #external_id = str(uuid.uuid4())
+        # build a 16-character string of uppercase letters and digits, then save it
+        external_id = ''.join(
+            random.choice(string.ascii_uppercase + string.digits)
+            for _ in range(16)
+        )
+        set_str(dynamodb, new_sandbox, 'external_id', external_id)
+        logger.info(f"Generated external_id", hcc_external_id=external_id)
+    else:   # The record already carries an external_id: keep it
+        logger.info(f"External ID already exists", hcc_external_id=external_id)
+
+    role_name = 'redhat-HCC-role'
+
+    roles = iam_client.list_roles()
+
+    policy_document = {
+        "Version": "2012-10-17",
+        "Statement": [
+            {
+                "Effect": "Allow",
+                "Action": "sts:AssumeRole",
+                "Principal": {
+                    "AWS": f"arn:aws:iam::{os.environ['REDHAT_ACCOUNT']}:root"
+                },
+                "Condition": {
+                    "StringEquals": {
+                        "sts:ExternalId": external_id
+                    }
+                }
+            }
+        ]
+    }
+
+    if role_name in [role['RoleName'] for role in roles['Roles']]:
+        # update the role to ensure it has the right external_id
+        response = iam_client.update_assume_role_policy(
+            RoleName=role_name,
+            PolicyDocument=json.dumps(policy_document)
+        )
+        logger.info("Role updated", role_name=role_name, hcc_external_id=external_id)
+    else:
+        response = iam_client.create_role(
+            RoleName=role_name,
+            AssumeRolePolicyDocument=json.dumps(policy_document),
+        )
+
+        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+            logger.error("Failed to create the role")
+            sys.exit(1)
+
+        logger.info("Role created", role_name=role_name)
+
+    logger = logger.bind(hcc_external_id=external_id)
+    role_arn = f"arn:aws:iam::{account_id}:role/{role_name}"
+    logger = logger.bind(role_arn=role_arn)
+
+    # Attach the policy to the role
+
+    response = iam_client.attach_role_policy(
+        RoleName=role_name,
+        PolicyArn=f"arn:aws:iam::{account_id}:policy/{policy_name}"
+    )
+
+    if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+        logger.error("Failed to attach the policy to the role")
         sys.exit(1)
+    else:
+        logger.info("Policy attached to the role", role_name=role_name, policy_name=policy_name)
 
     # use requests and create the POST request
 
-    baseurl = 'https://console.redhat.com/api/sources/v3.1'
+    try:
+        access_token = get_sso_access_token()
+    except Exception as e:
+        logger.error("Error getting the access token to console.redhat.com", error=e)
+        sys.exit(1)  # access_token is unbound past this point; nothing below can work
 
     s = requests.Session()
-    s.auth = (os.environ['RH_USERNAME'], os.environ['RH_PASSWORD'])
+    s.headers.update({"Authorization": f"Bearer {access_token}"})
+    baseurl = 'https://console.redhat.com/api/sources/v3.1'
 
     # delete the source if it exists
     # First get the source_id
@@ -651,9 +847,11 @@ def exit_handler(db, table, sandbox):
         source_id = response.json().get('data', [{}])[0].get('id', '')
 
         if source_id:
+
             response = s.delete(f"{baseurl}/sources/{source_id}")
-            if response.status_code not in [200, 201, 202]:
+            if response.status_code not in [200, 201, 202, 204]:
                 logger.error(f"Failed to delete the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
+                sys.exit(1)  # abort: the existing source could not be deleted
 
             logger.info(f"Deleted the source {source_id} for {new_sandbox}")
 
@@ -671,24 +869,26 @@ def exit_handler(db, table, sandbox):
                 logger.error(f"Failed to delete the source {source_id} for {new_sandbox}")
                 sys.exit(1)
 
+
     payload = {
         "sources": [
             {
                 "name": new_sandbox,
                 "source_type_name": "amazon",
-                "app_creation_workflow": "account_authorization"
+                "app_creation_workflow": "manual_configuration",
             }
         ],
         "authentications": [
             {
-                "resource_type": "source",
-                "resource_name": new_sandbox,
-                "username": access_key,
-                "password": plaintext_key,
-                "authtype": "access_key_secret_key"
+                "resource_type": "application",
+                "resource_name": "cloud-meter",
+                "authtype": "cloud-meter-arn",
+                "username": role_arn,
+                "extra": {
+                    "external_id": external_id
+                }
             }
         ],
-
         "applications": [
             {
                 "source_name": new_sandbox,
@@ -702,7 +902,9 @@ def exit_handler(db, table, sandbox):
         logger.error(f"Failed to create the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox)
         sys.exit(1)
 
-    logger.info(f"Source create in HCC")
+    source_id = response.json().get('sources', [{}])[0].get('id', '')
+    logger.info(f"Source create in HCC", source_id=source_id)
+
 
 if validation:
     # First ensure the current reservation of the sandbox is 'untested'
diff --git a/playbooks/creation_status.py b/playbooks/creation_status.py
index a1da7e4c..1cd0553c 100755
--- a/playbooks/creation_status.py
+++ b/playbooks/creation_status.py
@@ -35,6 +35,7 @@ def print_sandbox(item, db):
                 available=item.get('available', {}).get('BOOL', ''),
                 reservation= item.get('reservation', {}).get('S', ''),
                 account_id=item.get('account_id', {}).get('S', ''),
+                external_id=item.get('external_id', {}).get('S', ''),
                 db=db)
 
 
diff --git a/playbooks/roles/infra-aws-sandbox/defaults/main.yml b/playbooks/roles/infra-aws-sandbox/defaults/main.yml
index 502a4f3f..9ff791f1 100644
--- a/playbooks/roles/infra-aws-sandbox/defaults/main.yml
+++ b/playbooks/roles/infra-aws-sandbox/defaults/main.yml
@@ -78,10 +78,12 @@ aws_nuke_filters_default:
     - AWSServiceRoleForSupport
     - AWSServiceRoleForTrustedAdvisor
     - CloudabilityRole_OU
+    - redhat-HCC-role
 
   IAMRolePolicy:
     - "OrganizationAccountAccessRole -> AdministratorAccess"
     - config-rule-role -> config-rule-policy
+    - redhat-HCC-role -> redhat-HCC-policy
     - CloudabilityRole_OU -> CloudabilityAutomationPolicy
     - CloudabilityRole_OU -> CloudabilityMonitorResourcesPolicy
     - CloudabilityRole_OU -> CloudabilityVerificationPolicy
@@ -103,6 +105,8 @@ aws_nuke_filters_default:
 
   IAMPolicy:
     - arn:aws:iam::{{ account_id }}:policy/config-rule-policy
+    - arn:aws:iam::{{ account_id }}:policy/redhat-HCC-policy
+    - redhat-HCC-policy
 
   EC2KeyPair:
     - opentlc_admin_backdoor
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/pool.yml b/playbooks/roles/infra-aws-sandbox/tasks/pool.yml
index 666ba656..6d0afdef 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/pool.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/pool.yml
@@ -64,11 +64,33 @@
       set_fact:
         sandbox_reservation: "{{ r_get_reservation.stdout }}"
 
+    - name: Get external_id
+      vars:
+        _data:
+          name:
+            S: "{{ account_name }}"
+      command: >-
+        {{ aws_cli }} --profile {{ dynamodb_profile | quote }}
+        --region {{ dynamodb_region | quote }}
+        dynamodb get-item
+        --table-name {{ dynamodb_table }}
+        --key '{{ _data | to_json }}'
+        --query 'Item.external_id'
+        --output text
+      register: r_get_external_id
+      changed_when: false
+
+    - name: Save some values for after cleanup
+      set_fact:
+        external_id: "{{ r_get_external_id.stdout | trim | regex_replace('^(None|null)$', '') }}"
+
     - when: sandbox_reservation | default("", true) not in ["", "None", "null"]
       set_fact:
         additional_data:
           reservation:
             S: "{{ sandbox_reservation }}"
+          external_id:
+            S: "{{ external_id }}"
 
     - when: sandbox_reservation | default("", true) in ["", "None", "null"]
       set_fact:

From 6af0413121c34ed20d93a28574c18856a749fb48 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Fri, 31 Jan 2025 11:07:11 +0100
Subject: [PATCH 10/11] Add script to validate a sandbox or a reservation

---
 playbooks/create_sandbox.py    |  18 ++--
 playbooks/sandbox_functions.py |  51 +++++++++++
 playbooks/validate_sandbox.py  | 150 +++++++++++++++++++++++++++++++++
 3 files changed, 210 insertions(+), 9 deletions(-)
 create mode 100644 playbooks/sandbox_functions.py
 create mode 100755 playbooks/validate_sandbox.py

diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py
index b10d401e..19c9ae60 100755
--- a/playbooks/create_sandbox.py
+++ b/playbooks/create_sandbox.py
@@ -1,20 +1,20 @@
 #!/usr/bin/env python3
 
-import subprocess
-import os
-import sys
-import boto3
-import hashlib
 import argparse
 import atexit
-import structlog
+import boto3
+import hashlib
+import json
 import logging
-import tempfile
+import os
 import random
-import string
 import requests
+import string
+import structlog
+import subprocess
+import sys
+import tempfile
 import time
-import json
 from ansible_vault import Vault
 
 START_TIME = time.time()
diff --git a/playbooks/sandbox_functions.py b/playbooks/sandbox_functions.py
new file mode 100644
index 00000000..1c6d6a4f
--- /dev/null
+++ b/playbooks/sandbox_functions.py
@@ -0,0 +1,51 @@
+#!/usr/bin/env python3
+
+import os
+from ansible_vault import Vault
+
+def extract_sandbox_number(sandbox):
+    """Return, as an int, the text between the first and second 'sandbox' in the name (sandbox1234 -> 1234)."""
+    return int(sandbox.split('sandbox')[1])
+
+def get_sandbox(dynamodb, dynamodb_table, sandbox):
+    """Fetch one sandbox record from the DB by name.
+
+    dynamodb must expose get_item(); the lookup key is the 'name'
+    attribute (type S) of table dynamodb_table.
+
+    Returns the reply's 'Item' mapping, or {} when the reply has none.
+    """
+    reply = dynamodb.get_item(
+        TableName=dynamodb_table,
+        Key={'name': {'S': sandbox}},
+    )
+
+    # No 'Item' in the reply means no record was returned for that name.
+    return reply['Item'] if 'Item' in reply else {}
+
+def decrypt_vaulted_str(secret):
+    """Decrypt a vaulted string using the INFRA_VAULT_SECRET env var and return it decoded as UTF-8."""
+    return Vault(os.environ['INFRA_VAULT_SECRET']).load_raw(secret).decode('utf-8')
+
+def get_all_sandboxes(dynamodb, dynamodb_table):
+    """Return every item stored in dynamodb_table.
+
+    The table is scanned with ConsistentRead=True and pagination is
+    followed through LastEvaluatedKey until the last page.
+
+    Raises Exception when the first scan reply is not HTTP 200.
+    """
+    scan_args = {'TableName': dynamodb_table, 'ConsistentRead': True}
+    page = dynamodb.scan(**scan_args)
+
+    if page['ResponseMetadata']['HTTPStatusCode'] != 200:
+        raise Exception("Failed to get items from dynamodb")
+
+    items = page['Items']
+    # A reply carrying LastEvaluatedKey means more pages remain.
+    while 'LastEvaluatedKey' in page:
+        next_key = page['LastEvaluatedKey']
+        page = dynamodb.scan(ExclusiveStartKey=next_key, **scan_args)
+        items.extend(page['Items'])
+
+    return items
diff --git a/playbooks/validate_sandbox.py b/playbooks/validate_sandbox.py
new file mode 100755
index 00000000..8beb6da8
--- /dev/null
+++ b/playbooks/validate_sandbox.py
@@ -0,0 +1,150 @@
+#!/usr/bin/env python3
+
+import argparse
+import boto3
+import logging
+import os
+import random
+import structlog
+import sys
+import time
+from ansible_vault import Vault
+from sandbox_functions import get_sandbox, decrypt_vaulted_str, get_all_sandboxes
+
+START_TIME = time.time()
+logger = structlog.get_logger()
+structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO))
+
+parser = argparse.ArgumentParser(description='Validate a sandbox')
+parser.add_argument('--sandbox', required=False, help='sandbox to validate, by passing its name', default=None)
+parser.add_argument('--reservation', required=False, help='reservation to validate, by passing its name', default=None)
+parser.add_argument('--target-db', required=False, help='The target database', default='dev')
+args = parser.parse_args()
+
+required_env_vars = [
+    'AWS_ACCESS_KEY_ID',
+    'AWS_SECRET_ACCESS_KEY',
+    'AWS_ACCESS_KEY_ID_DEV',
+    'AWS_SECRET_ACCESS_KEY_DEV',
+    'INFRA_VAULT_SECRET_DEV',
+    'INFRA_VAULT_SECRET_PROD',
+]
+
+for env_var in required_env_vars:
+    if not os.environ.get(env_var):
+        logger.error(f"Environment variable {env_var} not set")  # fatal: the script exits right after
+        sys.exit(1)
+
+sandbox = args.sandbox
+target_db = args.target_db
+reservation = args.reservation
+
+if not sandbox and not reservation:
+    logger.error("Either sandbox or reservation is required")
+    sys.exit(1)
+
+# Build DynamoDB clients for PROD (default credentials) and DEV (the *_DEV keys); DEV is the default target
+session_prod = boto3.Session(region_name='us-east-1')
+dynamodb_prod = session_prod.client('dynamodb')
+
+session_dev = boto3.Session(region_name='us-east-1',
+                            aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID_DEV'],
+                            aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY_DEV'])
+dynamodb_dev = session_dev.client('dynamodb')
+dynamodb_table = 'accounts-dev'
+dynamodb = dynamodb_dev
+
+if target_db == 'prod':
+    logger.info("Using PROD dynamoDB database")
+    dynamodb_table = 'accounts'
+    dynamodb = dynamodb_prod
+    logger = logger.bind(target_db='prod')
+    os.environ['INFRA_VAULT_SECRET'] = os.environ['INFRA_VAULT_SECRET_PROD']  # read by decrypt_vaulted_str
+else:
+    logger.info("Using DEV dynamoDB database")
+    # bind the target DB as a context variable on the logger
+    logger = logger.bind(target_db='dev')
+    os.environ['INFRA_VAULT_SECRET'] = os.environ['INFRA_VAULT_SECRET_DEV']  # read by decrypt_vaulted_str
+
+#def find_rhel_amis(sandbox, account_id, aws_access_key_id, aws_secret_access_key):
+def find_rhel_amis(sandbox, dynamodb, dynamodb_table):
+    """Check every listed region of one sandbox for the RHEL Access AMIs.
+
+    Keys come from the sandbox's record in dynamodb_table; results are only logged.
+    """
+    regions = [
+        'us-east-1', 'us-east-2', 'us-west-1', 'us-west-2',
+        'eu-central-1', 'eu-west-1', 'eu-west-2', 'ap-southeast-1',
+    ]
+    sandbox_data = get_sandbox(dynamodb, dynamodb_table, sandbox)
+    # get_sandbox returns {} for unknown names: default every lookup, never None.get()
+    account_id = sandbox_data.get('account_id', {}).get('S', '')
+    aws_access_key_id = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r')
+    vaulted_key = sandbox_data.get('aws_secret_access_key', {}).get('S', '')
+    if not (account_id and aws_access_key_id and vaulted_key):
+        logger.error("Sandbox not found or missing credentials", sandbox=sandbox)
+        return
+    aws_secret_access_key = decrypt_vaulted_str(vaulted_key).strip(' \t\n\r')
+
+    for region in regions:
+        find_rhel_ami_in_region(sandbox, account_id, region, aws_access_key_id, aws_secret_access_key)
+
+def find_rhel_ami_in_region(sandbox, account_id, region, aws_access_key_id, aws_secret_access_key, max_retries=1, delay=6):
+    """Log whether the private RHEL 9.0 Access AMIs are visible in one region.
+
+    describe_images is called with the given keys up to max_retries times,
+    waiting delay seconds between attempts (150 tries of 6s is ~15 minutes).
+    Errors are logged and use up an attempt; nothing is returned.
+    """
+    ec2_client = boto3.client(
+        'ec2',
+        region_name=region,
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key
+    )
+
+    images = []
+    for attempt in range(max_retries):
+        try:
+            response = ec2_client.describe_images(
+                Owners=['309956199498'],
+                Filters=[
+                    {'Name': 'architecture', 'Values': ['x86_64']},
+                    {'Name': 'name', 'Values': ['RHEL-9.0*Access*']},
+                    {'Name': 'is-public', 'Values': ['false']}
+                ]
+            )
+            images = response.get('Images', [])
+
+            # If we got at least one image, break out of the loop
+            if images:
+                logger.info(f"Found {len(images)} matching image(s).", region=region, sandbox=sandbox, account_id=account_id)
+                break
+
+            logger.info(
+                f"Attempt {attempt + 1}/{max_retries}: No matching images yet. Retrying in {delay} seconds...",
+                region=region,
+                sandbox=sandbox,
+                account_id=account_id
+            )
+        except Exception as e:
+            logger.error(f"Encountered an error: {e}", region=region, sandbox=sandbox, account_id=account_id)
+        # Sleep only when another attempt follows (also after an error).
+        if attempt + 1 < max_retries:
+            time.sleep(delay)
+
+    # After the loop, check if we found images
+    if not images:
+        logger.error("No AMIs found after all retries.", region=region, sandbox=sandbox, account_id=account_id)
+
+if __name__ == "__main__":
+    # A single sandbox was requested: check it directly.
+    if sandbox:
+        find_rhel_amis(sandbox, dynamodb, dynamodb_table)
+
+    # A reservation was requested: check each record tagged with it.
+    if reservation:
+        for record in get_all_sandboxes(dynamodb, dynamodb_table):
+            if record.get('reservation', {}).get('S', '') != reservation:
+                continue
+            find_rhel_amis(record.get('name').get('S'), dynamodb, dynamodb_table)

From 18d4f9ae4fd7c80eb97860314198c074c0c3e97a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= <gucore@redhat.com>
Date: Fri, 31 Jan 2025 11:09:46 +0100
Subject: [PATCH 11/11] Address JK's comment

---
 playbooks/roles/infra-aws-sandbox/tasks/assume.yml    | 2 +-
 playbooks/roles/infra-aws-sandbox/tasks/keypair.yml   | 2 +-
 playbooks/roles/infra-aws-sandbox/tasks/regions.yml   | 2 +-
 playbooks/roles/infra-aws-sandbox/tasks/reset.yml     | 2 +-
 playbooks/roles/infra-aws-sandbox/tasks/route53.yml   | 6 +++---
 playbooks/roles/infra-aws-sandbox/tasks/validate.yaml | 2 +-
 6 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/playbooks/roles/infra-aws-sandbox/tasks/assume.yml b/playbooks/roles/infra-aws-sandbox/tasks/assume.yml
index 00f41438..8753b822 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/assume.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/assume.yml
@@ -10,5 +10,5 @@
     region: aws-global
   register: assumed_role
   retries: 5
-  delay: "{{ 30|random(start=3, step=1) }}"
+  delay: 15
   until: assumed_role is succeeded
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml b/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml
index 0ba94db6..b008479f 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml
@@ -19,5 +19,5 @@
       loop_var: _region
     register: r_import
     retries: 10
-    delay: "{{ 10|random(start=3, step=1) }}"
+    delay: 10
     until: r_import is succeeded
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/regions.yml b/playbooks/roles/infra-aws-sandbox/tasks/regions.yml
index 9e3ab222..5d1af19d 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/regions.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/regions.yml
@@ -11,7 +11,7 @@
       register: _regions
       changed_when: false
       retries: 5
-      delay: "{{ 30|random(start=3, step=1) }}"
+      delay: 15
       until: _regions is succeeded
 
     - set_fact:
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/reset.yml b/playbooks/roles/infra-aws-sandbox/tasks/reset.yml
index 10ecab2e..535fff3a 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/reset.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/reset.yml
@@ -8,7 +8,7 @@
     zone: "{{ account_name }}{{subdomain_base}}."
   register: _route53zone
   retries: 5
-  delay: "{{ 60|random(start=3, step=1) }}"
+  delay: 30
   until: _route53zone is succeeded
 
 - name: Cleanup DNS Zone
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/route53.yml b/playbooks/roles/infra-aws-sandbox/tasks/route53.yml
index 6f5dd22c..8db8008f 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/route53.yml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/route53.yml
@@ -9,7 +9,7 @@
         zone: "{{ account_name }}{{subdomain_base}}."
       register: _route53zone
       retries: 10
-      delay: "{{ 10|random(start=3, step=1) }}"
+      delay: 10
       until: _route53zone is succeeded
 
     - set_fact:
@@ -22,7 +22,7 @@
         hosted_zone_id: "{{ _route53zone.zone_id }}"
       register: _route53facts
       retries: 5
-      delay: "{{ 60|random(start=3, step=1) }}"
+      delay: 30
       until: _route53facts is succeeded
 
     - name: Save NS records
@@ -41,7 +41,7 @@
         overwrite: true
       register: _route53zoneNS
       retries: 5
-      delay: "{{ 60|random(start=3, step=1) }}"
+      delay: 30
       until: _route53zoneNS is succeeded
 
     - name: Add HostedZoneId to the report
diff --git a/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml b/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml
index c8dbf34d..76eb8cee 100644
--- a/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml
+++ b/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml
@@ -50,7 +50,7 @@
     hosted_zone_id: "{{ sandbox_hosted_zone_id }}"
   register: _route53facts
   retries: 5
-  delay: "{{ 60|random(start=3, step=1) }}"
+  delay: 30
   until: _route53facts is succeeded
 
 - name: Validate route53 zone