From aaaa511fe9975feb798c27a56f1c3189fce610c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Wed, 15 Jan 2025 21:36:06 +0100 Subject: [PATCH 01/11] Fix and Containerize the creation of AWS sandboxes - [X] Create a container image that has all the software needed for the creation of a new sandbox - [X] Fix Locales in Containerfile.admin - [X] Create a wrapper script for automation => Python - [X] Automatically guess the next sandbox number from all the DBs (prod and dev) - [X] Add profiling callback to ansible creation playbook runs. - [X] Enable Gold images by using the new HCC (console) APIs instead of Cypress + access.redhat.com There is a transition from access.redhat.com web page to HCC (console.redhat.com) Advantages: - much much faster - fewer dependencies in the images (roughly -500MB) - [X] Status script `creation_status.py` -- list sandboxes being created or freshly created - [X] Add a `--retry sandbox123` capability - [ ] Add an Org Policy to protect anything that is required by HCC (role, ...) - [ ] make slow tasks async in the playbook - [X] New feature: provide the reservation name, by default new sandboxes end up in a 'new' reservation - [X] sandboxes are created in an 'untested' reservation first. After the functional tests, if successful, we move the new sandboxes to the target reservation (default 'new') - [ ] Create monitoring dashboard or at least scripts for the creation - [ ] Add a test to ensure Vault value is correct. Try to read one key with the passed vault secret. If it doesn't work, exit. 
That will prevent accidentally creating sandboxes with a vault different from the one currently in use for the 'target DB' - [ ] allow changing the target OU - [ ] document (upstream and confluence) - [ ] Package everything for OpenShift: use OpenShift job to run the creation --- Containerfile.admin | 34 +- playbooks/create_range.yml | 4 +- playbooks/create_sandbox.py | 759 ++++++++++++++++++ playbooks/creation_status.py | 94 +++ .../roles/infra-aws-sandbox/defaults/main.yml | 6 + .../roles/infra-aws-sandbox/tasks/account.yml | 168 +++- .../roles/infra-aws-sandbox/tasks/assume.yml | 3 + .../roles/infra-aws-sandbox/tasks/iam.yml | 11 +- .../roles/infra-aws-sandbox/tasks/keypair.yml | 54 +- .../roles/infra-aws-sandbox/tasks/main.yml | 2 + .../roles/infra-aws-sandbox/tasks/ou.yml | 23 +- .../roles/infra-aws-sandbox/tasks/pool.yml | 2 +- .../roles/infra-aws-sandbox/tasks/route53.yml | 6 +- .../roles/infra-aws-sandbox/tasks/user.yml | 2 + .../infra-aws-sandbox/tasks/validate.yaml | 20 +- requirements.txt | 26 + 16 files changed, 1120 insertions(+), 94 deletions(-) create mode 100755 playbooks/create_sandbox.py create mode 100755 playbooks/creation_status.py create mode 100644 requirements.txt diff --git a/Containerfile.admin b/Containerfile.admin index 5512c4c2..d10c7c34 100644 --- a/Containerfile.admin +++ b/Containerfile.admin @@ -7,30 +7,49 @@ COPY ./ ./ RUN make FROM registry.access.redhat.com/ubi8/ubi:latest AS deploy +USER root RUN dnf install -y https://download.postgresql.org/pub/repos/yum/reporpms/EL-8-x86_64/pgdg-redhat-repo-latest.noarch.rpm \ && dnf install -y \ bash \ + bzip2 \ bind-utils \ curl \ findutils \ gcc \ git \ + glibc-langpack-en \ jq \ nc \ net-tools \ + nodejs \ + npm \ openssl \ postgresql \ - python39 \ - python39-pip \ + python3.12 \ + python3.12-pip \ rsync \ tar \ unzip \ vim \ wget \ && dnf clean all \ + && sed -i 's/^LANG=.*/LANG="en_US.utf8"/' /etc/locale.conf \ && VERSION=4.1.0 \ && curl --silent --location 
https://github.com/Orange-OpenSource/hurl/releases/download/$VERSION/hurl-$VERSION-x86_64-unknown-linux-gnu.tar.gz \ - | tar -xz -C /usr/local/bin --strip-components=1 --wildcards '*/hurl' '*/hurlfmt' + | tar -xz -C /usr/local/bin --strip-components=1 --wildcards '*/hurl' '*/hurlfmt' \ + && cd /tmp \ + && curl -s -L "https://awscli.amazonaws.com/awscli-exe-linux-x86_64.zip" -o "awscliv2.zip" \ + && unzip "awscliv2.zip" \ + && ./aws/install && rm -rf awscliv2.zip aws + +# Python + +RUN alternatives --set python /usr/bin/python3.12 \ + && alternatives --set python3 /usr/bin/python3.12 \ + && alternatives --install /usr/bin/pip pip /usr/bin/pip3.12 1 +RUN pip install --no-cache-dir --upgrade pip +COPY requirements.txt /tmp/requirements.txt +RUN pip install --no-cache-dir -r /tmp/requirements.txt COPY --from=docker.io/migrate/migrate /usr/local/bin/migrate /usr/local/bin/migrate WORKDIR /sandbox/ @@ -38,11 +57,20 @@ USER ${USER_UID} COPY build/github_known_hosts /ssh/known_hosts env SSH_KNOWN_HOSTS /ssh/known_hosts COPY --from=builder /sandbox/build/sandbox-* ./ +COPY --from=builder /sandbox/build/sandbox-* /usr/bin/ COPY --from=builder /sandbox/tools ./tools COPY --from=builder /sandbox/tests ./tests COPY --from=builder /sandbox/db ./db +COPY --from=builder /sandbox/cloud-automation ./cloud-automation +COPY --from=builder /sandbox/playbooks ./playbooks + +RUN cd cloud-automation && npm ci +COPY conan/ansible.cfg /etc/ansible/ansible.cfg +RUN rm -rf /tmp/* /root/.cache /root/* + CMD ["/bin/bash"] +ENV LANG='en_US.UTF-8' LANGUAGE='en_US:en' LC_ALL='en_US.UTF-8' ENV DESCRIPTION="Image for Admins to interact with the Sandbox API" LABEL name="rhpds/sandbox-admin" \ diff --git a/playbooks/create_range.yml b/playbooks/create_range.yml index 343a6f7a..4cb4d14f 100755 --- a/playbooks/create_range.yml +++ b/playbooks/create_range.yml @@ -1,9 +1,9 @@ #!/usr/bin/env ansible-playbook - hosts: localhost - gather_facts: no + gather_facts: false connection: local - run_once: yes + 
run_once: true tasks: - assert: msg: Please set account_num_start diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py new file mode 100755 index 00000000..efe9a0b7 --- /dev/null +++ b/playbooks/create_sandbox.py @@ -0,0 +1,759 @@ +#!/usr/bin/env python3 + +# First, grab the list of all sandboxes + +import subprocess +import os +import sys +import boto3 +import argparse +import atexit +import structlog +import logging +import tempfile +import random +import string +import requests +import time +from ansible_vault import Vault + +START_TIME = time.time() + +#structlog.configure( + #processors=[ + #structlog.stdlib.filter_by_level, + #structlog.processors.TimeStamper(fmt="iso"), + #structlog.processors.JSONRenderer()], + #context_class=dict, logger_factory=structlog.stdlib.LoggerFactory()) + +logger = structlog.get_logger() +structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO)) + +# args: --reservation reservation_name + +# parse the args + +parser = argparse.ArgumentParser(description='Create a new sandbox') +parser.add_argument('--reservation', required=False, help='The reservation name', default='new') +parser.add_argument('--target-db', required=False, help='The target database', default='dev') +parser.add_argument('--log-level', required=False, help='The log level', default='info') +parser.add_argument('--retry', required=False, help='Retry sandbox by passing its name', default=None) + +args = parser.parse_args() + +reservation = args.reservation +logger = logger.bind(reservation=reservation) +target_db = args.target_db +log_level = args.log_level +retry = args.retry + +if log_level == 'debug': + logger.info("Setting log level to DEBUG") + structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.DEBUG)) + +logger.debug(f"Reservation: {reservation}") + +# Make sure all environment variables are set + +# Set default values for the environment variables +os.environ.setdefault('ddns_key_name', 
'mydynamickey') +os.environ.setdefault('ddns_key_algorithm', 'hmac-sha512') +os.environ.setdefault('ddns_ttl', '600') +os.environ.setdefault('email_domain', 'opentlc.com') +# set default to ~/.aws/credentials_create +os.environ.setdefault('AWS_SHARED_CREDENTIALS_FILE', os.path.expanduser('~/.aws/credentials_create')) +# Create directory if it doesn't exist, chmod 700 +logger.info(f"Creating directory {os.path.dirname(os.environ['AWS_SHARED_CREDENTIALS_FILE'])}") +os.makedirs(os.path.dirname(os.environ['AWS_SHARED_CREDENTIALS_FILE']), exist_ok=True) +os.chmod(os.path.dirname(os.environ['AWS_SHARED_CREDENTIALS_FILE']), 0o700) + + +required_env_vars = [ + 'AWS_ACCESS_KEY_ID', + 'AWS_SECRET_ACCESS_KEY', + 'AWS_ACCESS_KEY_ID_DEV', + 'AWS_SECRET_ACCESS_KEY_DEV', + 'INFRA_VAULT_SECRET_DEV', + 'INFRA_VAULT_SECRET_PROD', + 'ddns_server', + 'ddns_key_secret', + 'RH_USERNAME', + 'RH_PASSWORD', +] + +# constants: Steps + +# step '0 - created in DB only' + +STAGE0 = '0 - created in DB only' +STAGE1_STARTED = "1 - Account Creation Started" +STAGE1_FAILED = "1 - Account Creation Failed" +STAGE2_ACCOUNT_CREATED = "2 - Account Created" +STAGE3_GOLD_IMAGE = "3 - Gold Image Enabled" +STAGE4_VALIDATED = "4 - Account Validated and Ready" + + +for env_var in required_env_vars: + if not os.environ.get(env_var): + logger.info(f"Environment variable {env_var} not set") + sys.exit(1) + + + +session_prod = boto3.Session(region_name='us-east-1') +dynamodb_prod = session_prod.client('dynamodb') + +session_dev = boto3.Session(region_name='us-east-1', + aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID_DEV'], + aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY_DEV']) +dynamodb_dev = session_dev.client('dynamodb') + +# Set the target database +dynamodb_table = 'accounts-dev' +dynamodb = dynamodb_dev + +if target_db == 'prod': + logger.info("Using PROD dynamoDB database") + dynamodb_table = 'accounts' + dynamodb = dynamodb_prod + logger = logger.bind(target_db='prod') + 
os.environ['INFRA_VAULT_SECRET'] = os.environ['INFRA_VAULT_SECRET_PROD'] +else: + logger.info("Using DEV dynamoDB database") + # bind context variable to the logger + logger = logger.bind(target_db='dev') + os.environ['INFRA_VAULT_SECRET'] = os.environ['INFRA_VAULT_SECRET_DEV'] + +# Create temporary file using tempfile with the INFRA_VAULT_SECRET as content, with mode 700 + +with tempfile.NamedTemporaryFile(mode='w', delete=False) as f: + f.write(os.environ['INFRA_VAULT_SECRET']) + INFRA_VAULT_SECRET_FILE = f.name + logger.info(f"Created temporary file {INFRA_VAULT_SECRET_FILE}") + +# run `sandbox-list -all --sort name` + +response = dynamodb_dev.scan( + TableName='accounts-dev', + ConsistentRead=True, + ProjectionExpression='#n', + ExpressionAttributeNames={ + '#n': 'name' + } +) + +if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to get items from dynamodb") + sys.exit(1) + +data = response['Items'] +while 'LastEvaluatedKey' in response: + response = dynamodb_dev.scan( + TableName='accounts-dev', + ConsistentRead=True, + ProjectionExpression='#n', + ExpressionAttributeNames={'#n': 'name'}, + ExclusiveStartKey=response['LastEvaluatedKey'] + ) + data.extend(response['Items']) + +if 'Items' in response: + sandboxes = [item['name']['S'] for item in data] + logger.info(f"Found {len(sandboxes)} sandboxes in dev") + +# Now run the command for the prod database + +response = dynamodb_prod.scan( + TableName='accounts', + ConsistentRead=True, + ProjectionExpression='#n', + ExpressionAttributeNames={'#n': 'name'} +) + +if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to get items from dynamodb") + sys.exit(1) + +data = response['Items'] + +while 'LastEvaluatedKey' in response: + response = dynamodb_prod.scan( + TableName='accounts', + ConsistentRead=True, + ProjectionExpression='#n', + ExpressionAttributeNames={'#n': 'name'}, + ExclusiveStartKey=response['LastEvaluatedKey'] + ) + data.extend(response['Items']) 
+ +if 'Items' in response: + sandboxes_prod = [item['name']['S'] for item in data] + logger.info(f"Found {len(sandboxes_prod)} sandboxes in prod") + sandboxes = sandboxes + sandboxes_prod + +# transform into a dictionary +sandboxes_dict = {sandbox: True for sandbox in sandboxes} + +def set_(dynamodb, sandbox, key, value): + '''Set string attribute key to value on the sandbox item in dynamodb_table; raise if the update does not return HTTP 200''' + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': sandbox + } + }, + UpdateExpression='SET #k = :val1', + ExpressionAttributeNames={ + '#k': key + }, + ExpressionAttributeValues={ + ':val1': { + 'S': value + } + } + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise Exception(f"Failed to set {key} to {value}") + +def set_stage(dynamodb, sandbox, stage): + """Set the 'stage' attribute of the sandbox item; raise if the update does not return HTTP 200""" + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'stage' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': stage + } + } + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise Exception(f"Failed to set the stage to {stage}") + +def get_stage(dynamodb, sandbox): + """Return the 'stage' string of the sandbox item, or '' when the item or the attribute is missing""" + response = dynamodb.get_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': sandbox + } + } + ) + + if 'Item' in response: + return response['Item'].get('stage', {}).get('S', '') + else: + return '' + +def get_sandbox(dynamodb, sandbox): + """Return the raw DynamoDB item for the sandbox, or {} when it does not exist""" + response = dynamodb.get_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': sandbox + } + } + ) + + if 'Item' in response: + return response['Item'] + else: + return {} + + +def extract_sandbox_number(sandbox): + """Extract the number from the sandbox name, for example sandbox1234 returns 1234""" + return int(sandbox.split('sandbox')[1]) + + +def guess_next_sandbox(sandboxes, sandboxes_dict): + """Return (name, email) for the sandbox to create: the --retry name if given, else the first free 
sandbox name""" + # Generate a random email tag sandbox1+RANDSTR@opentlc.com + # used when we reuse the account name. For some reason, the email is still registered + # in AWS and we need to use a different email address even if the previous account is closed. + random_email_tag = ''.join(random.choice(string.ascii_uppercase + string.digits) for _ in range(4)) + + if retry: + return retry, f"{retry}+{random_email_tag}@{os.environ['email_domain']}" + for i in range(1, len(sandboxes) + 1): + if not sandboxes_dict.get(f"sandbox{i}", False): + return f"sandbox{i}", f"sandbox{i}+{random_email_tag}@{os.environ['email_domain']}" + # No free slot in sandbox1..sandboxN (N = len(sandboxes)) means exactly those N names exist, so take N + 1 + s = f"sandbox{len(sandboxes) + 1}" + return s, f"{s}+{random_email_tag}@{os.environ['email_domain']}" + + +def decrypt_vaulted_str(secret): + '''Decrypt the vaulted secret''' + return Vault(os.environ['INFRA_VAULT_SECRET']).load_raw(secret).decode('utf-8') + +new_sandbox, new_email = guess_next_sandbox(sandboxes, sandboxes_dict) +logger.info(f"=> Create {new_sandbox}") + + +# Lock the name of the sandbox in DB so another +# concurrent process won't be able to create the same sandbox. 
+sandbox_data = get_sandbox(dynamodb, new_sandbox) +if sandbox_data: + stage = sandbox_data.get('stage', {}).get('S', '') + if not retry: + logger.info(f"Sandbox {new_sandbox} already exists") + sys.exit(1) + + # Only retry a sandbox whose 'available' flag is explicitly false; absent or true is refused + if retry: + if sandbox_data.get('available', {}).get('BOOL', True) is False: + logger.info(f"Retry {new_sandbox}") + else: + logger.error(f"{new_sandbox} is not available") + sys.exit(1) + + if not stage: + logger.error(f"Failed to get the stage for {new_sandbox}") + sys.exit(1) + + creation_status = sandbox_data.get('creation_status', {}).get('S', '') + + if not creation_status: + logger.error(f"Failed to get the creation_status for {new_sandbox}") + sys.exit(1) + + +def lock_sandbox(dynamodb, sandbox): + '''Reserve the sandbox name with a placeholder item (STAGE0, reservation 'untested'); the write is conditional on the name being unused unless --retry is set''' + item = { + 'name': { + 'S': sandbox + }, + 'available': { + 'BOOL': False + }, + 'to_cleanup': { + 'BOOL': False + }, + 'reservation': { + 'S': 'untested' + }, + 'comment': { + 'S': 'Creating new sandbox' + }, + 'stage': { + 'S': STAGE0 + }, + 'creation_status': { + 'S': 'in progress' + } + } + + response = dynamodb.put_item( + TableName=dynamodb_table, + # If retry, no condition is needed + ConditionExpression='attribute_not_exists(#n)' if not retry else 'attribute_exists(#n) or attribute_not_exists(#n)', + ExpressionAttributeNames={ + '#n': 'name' + }, + Item=item + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to lock the sandbox name") + sys.exit(1) + + logger.info(f"Locked {sandbox}") + +lock_sandbox(dynamodb, new_sandbox) + +def exit_handler(db, table, sandbox): + '''atexit hook: remove the vault secret file, delete the DB record if the stage is STAGE0 or STAGE1_FAILED, otherwise flag it failed unless STAGE4_VALIDATED was reached''' + + # Delete INFRA_VAULT_SECRET_FILE + + os.remove(INFRA_VAULT_SECRET_FILE) + + # Drop the record when the account was never created (STAGE0 or STAGE1_FAILED) + stage = get_stage(db, sandbox) + if stage in [ STAGE0, STAGE1_FAILED ]: + response = db.delete_item( + TableName=table, + Key={ + 'name': { + 'S': sandbox + } + } + 
) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error(f"Failed to delete {sandbox}") + sys.exit(1) + else: + logger.info(f"Deleted {sandbox}") + elif stage == STAGE4_VALIDATED: + pass + else: + # something went wrong + logger.error(f"Unexpected stage: {stage}, missing validation") + logger.info(f"You can retry the operation by running the command with --retry {sandbox}") + set_(db, sandbox, 'creation_status', 'failed') + sys.exit(1) + +atexit.register(exit_handler, dynamodb, dynamodb_table, new_sandbox) + +# Prepare the AWS profile for the ansible-playbook command +# - dynamodb profile to manage the dynamodb table +# - pool-manager profile to manage the pool +# Save the file to AWS_SHARED_CREDENTIALS_FILE +if target_db == 'prod': + with open(os.environ['AWS_SHARED_CREDENTIALS_FILE'], 'w') as f: + f.write( + f''' +[dynamodb] +aws_access_key_id = {os.environ['AWS_ACCESS_KEY_ID']} +aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']} +[pool-manager] +aws_access_key_id = {os.environ['AWS_ACCESS_KEY_ID']} +aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']} +''' + ) +else: + with open(os.environ['AWS_SHARED_CREDENTIALS_FILE'], 'w') as f: + f.write( + f''' +[dynamodb] +aws_access_key_id = {os.environ['AWS_ACCESS_KEY_ID_DEV']} +aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY_DEV']} +[pool-manager] +aws_access_key_id = {os.environ['AWS_ACCESS_KEY_ID']} +aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']} + ''') + +# Prepare args for the ansible-playbook command +#./create_range.yml -e account_num_start=3001 -e account_count=10 -e ddns_key_name=... -e ddns_key_secret=... -e ddns_server=... 
+ +local_path = os.path.dirname(os.path.realpath(__file__)) +playbook = os.path.join(local_path, '..', 'playbooks', 'create_range.yml') + +args = [ + 'ansible-playbook', + playbook, + '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}', + '-e', f'account_email={new_email}', + '-e', 'account_count=1', + '-e', f'ddns_key_name={os.environ["ddns_key_name"]}', + '-e', f'ddns_server={os.environ["ddns_server"]}', + '-e', f'ddns_ttl={os.environ["ddns_ttl"]}', + '-e', f'sandbox={new_sandbox}', + '-e', 'update_stage=true', + '-e', 'dynamodb_profile=dynamodb', + '-e', f'dynamodb_table={dynamodb_table}', + '-e', 'aws_master_profile=pool-manager', + # Listing all accounts in the organization is a costly operation + # it takes currently 47s to execute. + # Check the account only in certain scenario, like for a retry + '-e', f'check_account_list={True if retry else False}', + '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}', +] + + +# Run the command +logger.info(f"Running {' '.join(args)}") +# Add the ddns_key_secret to the args +args = args + ['-e', f'ddns_key_secret={os.environ["ddns_key_secret"]}'] +try: + completed = subprocess.run( + args, check=True, + #capture_output=True, + timeout=1800, + ) +except subprocess.CalledProcessError as e: + # Sanitize the error message by removing the DDNS key secret + e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***') + logger.error(f"Failed to run the command: {e_sanitized}") + # print stdout and stderr (both are None while capture_output stays disabled) + logger.error((e.stdout or b'').decode(), stdout=True) + logger.error((e.stderr or b'').decode(), stderr=True) + + # Set sandbox status to failed + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': 'failed' + } + } + ) + + sys.exit(1) +except subprocess.TimeoutExpired as e: + # Sanitize the error message by removing the DDNS key 
secret + e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***') + logger.error(f"Timeout: {e_sanitized}", sandbox=new_sandbox) + # Set sandbox status to failed + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': 'failed' + } + } + ) + sys.exit(1) + +logger.info(f"Created {new_sandbox}") + +# Get the account_id from the db + +sandbox_data = get_sandbox(dynamodb, new_sandbox) + +if sandbox_data: + account_id = sandbox_data.get('account_id', {}).get('S', '') + logger.info(f"Account ID: {account_id}") + logger = logger.bind(account_id=account_id) + + # Write the account_id and the account name to cloud-automation/new_sandboxes.txt + with open('cloud-automation/new_sandboxes.txt', 'w') as f: + f.write(f"{new_sandbox} {account_id}\n") + +set_(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED) +ACCOUNT_CREATED_TIME = time.time() +logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}") + +# Use https://console.redhat.com/docs/api/sources/v3.1#operations-sources-bulkCreate + +sandbox_data = get_sandbox(dynamodb, new_sandbox) + +if not sandbox_data: + logger.error(f"Failed to get the sandbox data for {new_sandbox}") + sys.exit(1) + +if 'aws_secret_access_key' not in sandbox_data: + logger.error(f"Failed to get the aws_secret_access_key for {new_sandbox}") + sys.exit(1) + +plaintext_key = decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key', {}).get('S', '')).strip(' \t\n\r') +access_key = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r') + +if not access_key or not plaintext_key: + logger.error(f"Failed to get the access key for {new_sandbox}") + sys.exit(1) + +# use requests and create the POST request + +baseurl = 'https://console.redhat.com/api/sources/v3.1' + +s = 
requests.Session() +s.auth = (os.environ['RH_USERNAME'], os.environ['RH_PASSWORD']) + +# delete the source if it exists +# First get the source_id +max_retries = 20 +while True: + response = s.get(f"{baseurl}/sources?filter[name][eq]={new_sandbox}") + if response.status_code == 200: + break + logger.error(f"Failed to get the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) + if max_retries == 0: + sys.exit(1) + + logger.info(f"Retrying: {max_retries} retries left") + max_retries -= 1 + time.sleep(5) + +result = response.json().get('data', []) +if len(result) > 0: + source_id = response.json().get('data', [{}])[0].get('id', '') + + if source_id: + response = s.delete(f"{baseurl}/sources/{source_id}") + if response.status_code not in [200, 201, 202]: + logger.error(f"Failed to delete the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) + + logger.info(f"Deleted the source {source_id} for {new_sandbox}") + + # Wait for the deletion to complete + max_retries = 20 + while max_retries > 0: + response = s.get(f"{baseurl}/sources/{source_id}") + if response.status_code == 404: + break + max_retries -= 1 + logger.info(f"Waiting for the source to be deleted from HCC (console): {max_retries} retries left") + time.sleep(5) + + if max_retries == 0: + logger.error(f"Failed to delete the source {source_id} for {new_sandbox}") + sys.exit(1) + +payload = { + "sources": [ + { + "name": new_sandbox, + "source_type_name": "amazon", + "app_creation_workflow": "account_authorization" + } + ], + "authentications": [ + { + "resource_type": "source", + "resource_name": new_sandbox, + "username": access_key, + "password": plaintext_key, + "authtype": "access_key_secret_key" + } + ], + + "applications": [ + { + "source_name": new_sandbox, + "application_type_name": "cloud-meter" + } + ] +} +response = s.post(f"{baseurl}/bulk_create", json=payload) + +if response.status_code not in [200, 201]: + logger.error(f"Failed to create 
the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) + sys.exit(1) + +# Run the validation playbook operation + +local_path = os.path.dirname(os.path.realpath(__file__)) +playbook = os.path.join(local_path, '..', 'playbooks', 'validate.yml') + +args = [ + 'ansible-playbook', + playbook, + '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}', + '-e', f'account_num_end={extract_sandbox_number(new_sandbox)}', + '-e', f'sandbox={new_sandbox}', + '-e', 'dynamodb_profile=dynamodb', + '-e', f'dynamodb_table={dynamodb_table}', + '-e', 'aws_master_profile=pool-manager', + '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}', + '-e', 'operation=VALIDATE', +] + +# Run the command +logger.info(f"Running {' '.join(args)}") + +try: + completed = subprocess.run( + args, check=True, + #capture_output=True, + timeout=1800, + ) + +except subprocess.CalledProcessError as e: + logger.error(f"Failed to run the command: {e}") + # print stdout and stderr (both are None while capture_output stays disabled) + logger.error((e.stdout or b'').decode(), stdout=True) + logger.error((e.stderr or b'').decode(), stderr=True) + + # Set sandbox status to validation failed + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': 'validation failed' + } + } + ) + + sys.exit(1) + +except subprocess.TimeoutExpired as e: + logger.error(f"Timeout: {e}") + # Set sandbox status to validation timed out + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': 'validation timed out' + } + } + ) + sys.exit(1) + +logger.info(f"Validation successful for {new_sandbox}") + +# Move the sandbox to the final reservation + +response = dynamodb.update_item( + 
TableName=dynamodb_table, + Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3', + ExpressionAttributeNames={ + '#r': 'reservation', + '#s': 'stage', + '#c': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': reservation + }, + ':val2': { + 'S': STAGE4_VALIDATED + }, + ':val3': { + 'S': 'success' + } + } +) + +if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to update the reservation") + sys.exit(1) + +logger.info(f"Moved {new_sandbox} to {reservation}") +logger.info(f"Total duration: {round(time.time() - START_TIME)} seconds") diff --git a/playbooks/creation_status.py b/playbooks/creation_status.py new file mode 100755 index 00000000..c73fb7b3 --- /dev/null +++ b/playbooks/creation_status.py @@ -0,0 +1,94 @@ +#!/usr/bin/env python3 + + +import subprocess +import os +import sys +import boto3 +import argparse +import atexit +import structlog +import logging +import tempfile +import random +import string + +logger = structlog.get_logger() +structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO)) + +session_prod = boto3.Session(region_name='us-east-1') +dynamodb_prod = session_prod.client('dynamodb') + +session_dev = boto3.Session(region_name='us-east-1', + aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID_DEV'], + aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY_DEV']) +dynamodb_dev = session_dev.client('dynamodb') + +# Log name, creation_status, stage and reservation of one scanned item; items without a 'stage' attribute are skipped +def print_sandbox(item, db): + if 'stage' not in item: + return + + logger.info(item['name']['S'], + creation_status=item.get('creation_status', {}).get('S', ''), + stage= item.get('stage', {}).get('S', ''), + reservation= item.get('reservation', {}).get('S', ''), + db=db) + + +response = dynamodb_dev.scan( + TableName='accounts-dev', + ConsistentRead=True, + ProjectionExpression='#n, creation_status, stage, reservation', + ExpressionAttributeNames={ + '#n': 'name' + } +) + +if 
response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to get items from dynamodb") + sys.exit(1) + +data = response['Items'] +while 'LastEvaluatedKey' in response: + response = dynamodb_dev.scan( + TableName='accounts-dev', + ConsistentRead=True, + ProjectionExpression='#n, creation_status, stage, reservation', + ExpressionAttributeNames={'#n': 'name'}, + ExclusiveStartKey=response['LastEvaluatedKey'] + ) + data.extend(response['Items']) + +if 'Items' in response: + for item in data: + print_sandbox(item, 'dev') + +# Now run the command for the prod database + +response = dynamodb_prod.scan( + TableName='accounts', + ConsistentRead=True, + ProjectionExpression='#n, creation_status, stage, reservation', + ExpressionAttributeNames={'#n': 'name'} +) + +if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to get items from dynamodb") + sys.exit(1) + +data = response['Items'] + +while 'LastEvaluatedKey' in response: + response = dynamodb_prod.scan( + TableName='accounts', + ConsistentRead=True, + ProjectionExpression='#n, creation_status, stage, reservation', + ExpressionAttributeNames={'#n': 'name'}, + ExclusiveStartKey=response['LastEvaluatedKey'] + ) + data.extend(response['Items']) + +if 'Items' in response: + for item in data: + print_sandbox(item, 'prod') diff --git a/playbooks/roles/infra-aws-sandbox/defaults/main.yml b/playbooks/roles/infra-aws-sandbox/defaults/main.yml index 5a741a1e..502a4f3f 100644 --- a/playbooks/roles/infra-aws-sandbox/defaults/main.yml +++ b/playbooks/roles/infra-aws-sandbox/defaults/main.yml @@ -8,6 +8,11 @@ account_user: student operation: CREATE +# Listing all accounts in the organization is a costly operation +# it takes currently 47s to execute. 
+# Check the account only in certain scenario, like for a retry +check_account_list: false + available_after_reset: true available_after_create: false alias_suffix: '-gpte' @@ -168,5 +173,6 @@ dynamodb_region: us-east-1 account_altready_exists: false force_create: false +update_stage: false aws_cli: aws diff --git a/playbooks/roles/infra-aws-sandbox/tasks/account.yml b/playbooks/roles/infra-aws-sandbox/tasks/account.yml index 41da4139..36dbb11b 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/account.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/account.yml @@ -21,52 +21,92 @@ account_id: "{{ _getaccountid.stdout }}" account_already_exists: true -- when: - - all_accounts is not defined - - operation == 'CREATE' or account_id is not defined - block: - - name: List all accounts in the organization. - command: >- - {{ aws_cli }} --profile {{ aws_master_profile }} - organizations list-accounts - --query 'Accounts[].{Name: Name, Id: Id}' - register: _listaccounts - changed_when: false - - - set_fact: - all_accounts: "{{_listaccounts.stdout|from_json}}" - -- when: - - operation == 'CREATE' or account_id is not defined - - account_name not in all_accounts|json_query('[].Name') - name: Create New account. +- name: List all accounts in the organization. 
command: >- {{ aws_cli }} --profile {{ aws_master_profile }} - organizations create-account - --email {{ account_email }} - --account-name "{{ account_name }}" - --role-name "{{ aws_role_name }}" - --query 'CreateAccountStatus.[Id]' - --output text - register: _createaccount + organizations list-accounts + --query 'Accounts[].{Name: Name, Id: Id, Email: Email, Status: Status}' + register: _listaccounts + changed_when: false + when: + - check_account_list | bool + - all_accounts is not defined + - operation == 'CREATE' or account_id is not defined - when: + - check_account_list | bool - operation == 'CREATE' or account_id is not defined - - account_name in all_accounts|json_query('[].Name') block: - - name: Find and set the account_id (existing account) - set_fact: - account_id: >- + - set_fact: + all_active_accounts: >- + {{ + _listaccounts.stdout + | from_json + | json_query('[?Status==`ACTIVE`].{Name: Name, Id: Id}') }} + + - name: Fail if the account name exists multiple times + vars: + selected_accounts: >- + {{ + all_active_accounts + | json_query('[?Name==`'~ account_name ~'`]') + | list + }} + fail: + msg: |- + Account name exists multiple times in the organization. 
+ {{ selected_accounts }} + when: selected_accounts | length > 1 + + - name: Reuse the existing account if one exists + vars: + selected_accounts: >- {{ - ( - all_accounts - |selectattr('Name', 'equalto', account_name) - |first - )['Id'] + all_active_accounts + | json_query('[?Name==`'~ account_name ~'`]') + | list }} + when: selected_accounts | length == 1 + set_fact: + account_id: >- + {{ all_active_accounts + | json_query('[?Name==`' ~ account_name ~ '`].Id') + | first }} - - debug: - var: account_id +- when: account_id is defined + debug: + msg: "Reusing existing account with id {{ account_id }}" + +- when: + - operation == 'CREATE' + - account_id is not defined + block: + - name: Save status of the sandbox + when: update_stage + vars: + step1: "1 - Account Creation Started" + _data: + name: + S: "{{ account_name }}" + command: >- + {{ aws_cli }} --profile {{ dynamodb_profile }} + --region {{ dynamodb_region }} + dynamodb update-item + --table-name {{ dynamodb_table }} + --key '{{ _data | to_json }}' + --update-expression 'SET stage = :val' + --expression-attribute-values '{":val": {"S": "{{ step1 }}"}}' + + - name: Create New account. + command: >- + {{ aws_cli }} --profile {{ aws_master_profile }} + organizations create-account + --email {{ account_email }} + --account-name "{{ account_name }}" + --role-name "{{ aws_role_name }}" + --query 'CreateAccountStatus.[Id]' + --output text + register: _createaccount - when: - _createaccount is not skipped @@ -85,9 +125,37 @@ retries: 40 changed_when: false - - fail: - msg: The creation of the account failed. 
- when: _describestatus.stdout == 'FAILED' + - when: update_stage and _describestatus.stdout == 'FAILED' + block: + - name: Save failed status of the sandbox + command: >- + {{ aws_cli }} --profile {{ aws_master_profile }} + organizations describe-create-account-status + --create-account-request-id {{ _createaccount.stdout }} + --output json + register: _describestatus2 + changed_when: false + + - name: Save failed status of the sandbox + vars: + step1: "1 - Account Creation Failed" + _data: + name: + S: "{{ account_name }}" + command: >- + {{ aws_cli }} --profile {{ dynamodb_profile }} + --region {{ dynamodb_region }} + dynamodb update-item + --table-name {{ dynamodb_table }} + --key '{{ _data | to_json }}' + --update-expression 'SET stage = :val' + --expression-attribute-values '{":val": {"S": "{{ step1 }}"}}' + + - debug: + var: _describestatus2 + + - fail: + msg: The creation of the account failed. - name: Get the account ID command: >- @@ -107,6 +175,26 @@ fail: msg: Account Id not defined +- name: Save status of the sandbox + vars: + step1: "1 - Account Creation Succeeded" + _data: + name: + S: "{{ account_name }}" + _expr: + ":val": + "S": "{{ step1 }}" + ":accountid": + "S": "{{ account_id }}" + command: >- + {{ aws_cli }} --profile {{ dynamodb_profile }} + --region {{ dynamodb_region }} + dynamodb update-item + --table-name {{ dynamodb_table }} + --key '{{ _data | to_json }}' + --update-expression 'SET stage = :val, account_id = :accountid' + --expression-attribute-values '{{ _expr | to_json }}' + - name: Add Account Id to the report lineinfile: path: "{{ output_dir }}/{{ account_name }}_report.txt" diff --git a/playbooks/roles/infra-aws-sandbox/tasks/assume.yml b/playbooks/roles/infra-aws-sandbox/tasks/assume.yml index 1a9845d0..00f41438 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/assume.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/assume.yml @@ -1,5 +1,8 @@ --- - name: Get temporary token for the sandbox (Assume Role) + environment: + 
AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" sts_assume_role: profile: "{{ aws_master_profile }}" role_arn: "arn:aws:iam::{{ account_id }}:role/OrganizationAccountAccessRole" diff --git a/playbooks/roles/infra-aws-sandbox/tasks/iam.yml b/playbooks/roles/infra-aws-sandbox/tasks/iam.yml index 999d83d7..0ca6fafe 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/iam.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/iam.yml @@ -1,14 +1,23 @@ --- - name: Create IAM role using Cloudformation + environment: + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" cloudformation: profile: "{{ account_profile }}" template_body: "{{ lookup('file', 'CF-IAM.json') }}" region: "{{ aws_region }}" stack_name: roles + retries: 50 + delay: 2 register: r_cf - ignore_errors: yes + until: r_cf is succeeded + ignore_errors: true - when: r_cf is failed + environment: + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" block: - name: Delete IAM role Cloudformation stack cloudformation: diff --git a/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml b/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml index ea821cbb..0ba94db6 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml @@ -1,33 +1,23 @@ --- -# TODO: create a list of keys and loop over it instead of doing one by one -- name: Import OPENTLC backdoor key - ec2_key: - name: opentlc_admin_backdoor - region: "{{ _region }}" - key_material: "{{ opentlc_admin_backdoor }}" - aws_access_key: "{{ assumed_role.sts_creds.access_key }}" - aws_secret_key: "{{ assumed_role.sts_creds.secret_key }}" - session_token: "{{ assumed_role.sts_creds.session_token }}" - loop: "{{ all_regions }}" - loop_control: - loop_var: _region - register: r_import - retries: 5 - delay: "{{ 30|random(start=3, step=1) }}" - until: r_import is succeeded - -- name: Import OPENTLC ocpkey - ec2_key: - name: ocpkey - region: "{{ _region }}" - key_material: "{{ ocpkey }}" - aws_access_key: "{{ 
assumed_role.sts_creds.access_key }}" - aws_secret_key: "{{ assumed_role.sts_creds.secret_key }}" - session_token: "{{ assumed_role.sts_creds.session_token }}" - loop: "{{ all_regions }}" - loop_control: - loop_var: _region - register: r_import2 - retries: 5 - delay: "{{ 30|random(start=3, step=1) }}" - until: r_import2 is succeeded +# TODO: make sure this is unused across the accounts and retire +- environment: + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" + block: + - name: Import OPENTLC backdoor key + # ap-southeast-4 region breaks the ec2_key module, ignore it + when: _region != "ap-southeast-4" + ec2_key: + name: opentlc_admin_backdoor + region: "{{ _region }}" + key_material: "{{ opentlc_admin_backdoor }}" + aws_access_key: "{{ assumed_role.sts_creds.access_key }}" + aws_secret_key: "{{ assumed_role.sts_creds.secret_key }}" + session_token: "{{ assumed_role.sts_creds.session_token }}" + loop: "{{ all_regions }}" + loop_control: + loop_var: _region + register: r_import + retries: 10 + delay: "{{ 10|random(start=3, step=1) }}" + until: r_import is succeeded diff --git a/playbooks/roles/infra-aws-sandbox/tasks/main.yml b/playbooks/roles/infra-aws-sandbox/tasks/main.yml index 80356498..ce62e22d 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/main.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/main.yml @@ -1,5 +1,7 @@ --- - import_tasks: pre_checks.yml +- when: operation == 'VALIDATE' or operation == 'validate' + include_tasks: validate.yaml - import_tasks: account.yml tags: account - import_tasks: profile.yml diff --git a/playbooks/roles/infra-aws-sandbox/tasks/ou.yml b/playbooks/roles/infra-aws-sandbox/tasks/ou.yml index f6240988..cee38eb6 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/ou.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/ou.yml @@ -30,21 +30,20 @@ - when: accounts_in_ou is not defined block: - - name: List the accounts in the destination OU + - name: Get the OU of the account command: >- - {{ aws_cli }} --profile {{ 
aws_master_profile }} - organizations list-accounts-for-parent - --parent-id {{ destouid }} - --query 'Accounts[].Id' - --output json - register: _listaccounts_in_ou + {{ aws_cli }} --profile {{ aws_master_profile }} + organizations list-parents --child-id {{ account_id }} + register: r_ou changed_when: false - - name: Save organization OU - set_fact: - accounts_in_ou: "{{ _listaccounts_in_ou.stdout | from_json | list }}" - -- when: account_id not in accounts_in_ou +- when: >- + destouid not in + ( r_ou.stdout + | from_json + | json_query('Parents[].Id') + | default([], true) + ) name: Move account to destination OU command: >- {{ aws_cli }} --profile {{ aws_master_profile }} diff --git a/playbooks/roles/infra-aws-sandbox/tasks/pool.yml b/playbooks/roles/infra-aws-sandbox/tasks/pool.yml index ad417d4f..666ba656 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/pool.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/pool.yml @@ -37,7 +37,7 @@ --table-name {{ dynamodb_table }} --item '{{ _data | to_json }}' register: _putaccount - when: _getaccount.stdout == '' or force_create + when: _getaccount.stdout == '' or force_create or update_stage - debug: var: _putaccount diff --git a/playbooks/roles/infra-aws-sandbox/tasks/route53.yml b/playbooks/roles/infra-aws-sandbox/tasks/route53.yml index b6c58c5f..6f5dd22c 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/route53.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/route53.yml @@ -1,13 +1,15 @@ --- - environment: AWS_PROFILE: "{{ account_profile }}" + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" block: - name: Create the public zone route53_zone: zone: "{{ account_name }}{{subdomain_base}}." 
register: _route53zone - retries: 5 - delay: "{{ 60|random(start=3, step=1) }}" + retries: 10 + delay: "{{ 10|random(start=3, step=1) }}" until: _route53zone is succeeded - set_fact: diff --git a/playbooks/roles/infra-aws-sandbox/tasks/user.yml b/playbooks/roles/infra-aws-sandbox/tasks/user.yml index 68ba4dbf..4c423f6e 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/user.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/user.yml @@ -2,6 +2,8 @@ # NOTE: Use 'command' module instead of ansible iam module because it doesn't work well with boto profiles. - environment: AWS_PROFILE: "{{ account_profile }}" + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" block: - name: Check if user already exists command: >- diff --git a/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml b/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml index cd69a1b1..c8dbf34d 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml +++ b/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml @@ -33,6 +33,9 @@ include_tasks: unvault.yml - name: Validate AWS access + environment: + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" aws_caller_info: aws_access_key: "{{ sandbox_aws_access_key_id }}" aws_secret_key: "{{ sandbox_aws_secret_access_key }}" @@ -67,12 +70,20 @@ assert: that: ns_entries == ns_entries_route53 + retries: 60 + delay: 1 - name: Ensure Red Hat GOLD AMI are accessible from within the sandbox ec2_ami_info: aws_access_key: "{{ sandbox_aws_access_key_id }}" aws_secret_key: "{{ sandbox_aws_secret_access_key }}" - region: us-east-1 + region: >- + {{ + ['us-east-1', + 'us-east-2', + 'us-west-1', + 'us-west-2' + ] | shuffle | first }} # Red Hat official owner: 309956199498 filters: @@ -80,6 +91,10 @@ name: RHEL-9.0*Access* is-public: "false" register: r_image + # Try for 15m + retries: 150 + delay: 6 + until: r_image is succeeded and r_image.images | length > 0 - assert: that: >- @@ -89,3 +104,6 @@ or r_image.images[0].platform_details == 'Red Hat BYOL Linux' ) + +- 
when: operation == 'VALIDATE' or operation == 'validate' + meta: end_play diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..a1b085ea --- /dev/null +++ b/requirements.txt @@ -0,0 +1,26 @@ +Jinja2 +MarkupSafe +PyYAML +ansible +ansible_vault +awscli +boto3 +botocore +cffi +colorama +cryptography +distro +dnspython +docutils +jmespath +psutil +pyasn1 +pycparser +python-dateutil +requests +rsa +s3transfer +selinux +six +structlog +urllib3 From 8a2364eb7bff7b1029a60870d69916c34e948fd9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Wed, 15 Jan 2025 21:56:47 +0100 Subject: [PATCH 02/11] Add argument to toggle playbook output easily --- playbooks/create_sandbox.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py index efe9a0b7..ff4ac53a 100755 --- a/playbooks/create_sandbox.py +++ b/playbooks/create_sandbox.py @@ -38,6 +38,7 @@ parser.add_argument('--target-db', required=False, help='The target database', default='dev') parser.add_argument('--log-level', required=False, help='The log level', default='info') parser.add_argument('--retry', required=False, help='Retry sandbox by passing its name', default=None) +parser.add_argument('--playbook-output', required=False, help='Print output of ansible-playbook commands?', action=argparse.BooleanOptionalAction, default=True) args = parser.parse_args() @@ -46,6 +47,7 @@ target_db = args.target_db log_level = args.log_level retry = args.retry +playbook_output = args.playbook_output if log_level == 'debug': logger.info("Setting log level to DEBUG") @@ -473,7 +475,7 @@ def exit_handler(db, table, sandbox): try: completed = subprocess.run( args, check=True, - #capture_output=True, + capture_output=(not playbook_output), timeout=1800, ) except subprocess.CalledProcessError as e: @@ -668,7 +670,7 @@ def exit_handler(db, table, sandbox): try: completed = subprocess.run( args, check=True, - 
#capture_output=True, + capture_output=(not playbook_output), timeout=1800, ) From b008ec7c10fab5b9c116d8ea11a4899e1e3e1fc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Thu, 16 Jan 2025 15:39:44 +0100 Subject: [PATCH 03/11] Add sec, operation and billing info to account --- playbooks/create_sandbox.py | 4 +- .../roles/infra-aws-sandbox/tasks/account.yml | 69 +++++++++++++++++++ 2 files changed, 72 insertions(+), 1 deletion(-) diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py index ff4ac53a..e705b2a9 100755 --- a/playbooks/create_sandbox.py +++ b/playbooks/create_sandbox.py @@ -302,7 +302,7 @@ def decrypt_vaulted_str(secret): return Vault(os.environ['INFRA_VAULT_SECRET']).load_raw(secret).decode('utf-8') new_sandbox, new_email = guess_next_sandbox(sandboxes, sandboxes_dict) -logger.info(f"=> Create {new_sandbox}") +logger = logger.bind(sandbox=new_sandbox) # Lock the name of the sandbox in DB so another @@ -646,6 +646,8 @@ def exit_handler(db, table, sandbox): logger.error(f"Failed to create the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) sys.exit(1) +logger.info(f"Source create in HCC") + # Run the validation playbook operation local_path = os.path.dirname(os.path.realpath(__file__)) diff --git a/playbooks/roles/infra-aws-sandbox/tasks/account.yml b/playbooks/roles/infra-aws-sandbox/tasks/account.yml index 36dbb11b..980c7a45 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/account.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/account.yml @@ -200,3 +200,72 @@ path: "{{ output_dir }}/{{ account_name }}_report.txt" line: "account_id: {{ account_id }}" create: true + +- name: Load environment variables for SECURITY contact + set_fact: + security_email: "{{ lookup('env', 'SECURITY_EMAIL') }}" + security_name: "{{ lookup('env', 'SECURITY_NAME') }}" + security_phone: "{{ lookup('env', 'SECURITY_PHONE') }}" + security_title: "{{ lookup('env', 'SECURITY_TITLE') }}" + +- name: Set 
SECURITY alternate contact + when: + - security_email != '' + - security_name != '' + - security_phone != '' + - security_title != '' + command: >- + aws --profile {{ aws_master_profile | quote }} + account put-alternate-contact + --account-id {{ account_id | quote }} + --alternate-contact-type=SECURITY + --email-address {{ security_email | quote }} + --name {{ security_name | quote }} + --phone-number {{ security_phone | quote }} + --title {{ security_title | quote }} + +- name: Load environment variables for OPERATIONS contact + set_fact: + operations_email: "{{ lookup('env', 'OPERATIONS_EMAIL') }}" + operations_name: "{{ lookup('env', 'OPERATIONS_NAME') }}" + operations_phone: "{{ lookup('env', 'OPERATIONS_PHONE') }}" + operations_title: "{{ lookup('env', 'OPERATIONS_TITLE') }}" + +- name: Set OPERATIONS alternate contact + when: + - operations_email != '' + - operations_name != '' + - operations_phone != '' + - operations_title != '' + command: >- + aws --profile {{ aws_master_profile | quote }} + account put-alternate-contact + --account-id {{ account_id | quote }} + --alternate-contact-type=OPERATIONS + --email-address {{ operations_email | quote }} + --name {{ operations_name | quote }} + --phone-number {{ operations_phone | quote }} + --title {{ operations_title | quote }} + +- name: Load environment variables for BILLING contact + set_fact: + billing_email: "{{ lookup('env', 'BILLING_EMAIL') }}" + billing_name: "{{ lookup('env', 'BILLING_NAME') }}" + billing_phone: "{{ lookup('env', 'BILLING_PHONE') }}" + billing_title: "{{ lookup('env', 'BILLING_TITLE') }}" + +- name: Set BILLING alternate contact + when: + - billing_email != '' + - billing_name != '' + - billing_phone != '' + - billing_title != '' + command: >- + aws --profile {{ aws_master_profile | quote }} + account put-alternate-contact + --account-id {{ account_id | quote }} + --alternate-contact-type=BILLING + --email-address {{ billing_email | quote }} + --name {{ billing_name | quote }} + 
--phone-number {{ billing_phone | quote }} + --title {{ billing_title | quote }} From 17a8a449ce91d0cea4d88b707f2c65b0e1e59248 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Thu, 16 Jan 2025 23:03:44 +0100 Subject: [PATCH 04/11] Add option to disable hcc and validation. Fix guessing --- playbooks/create_sandbox.py | 369 +++++++++++++++++++----------------- 1 file changed, 191 insertions(+), 178 deletions(-) diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py index e705b2a9..1d1a3981 100755 --- a/playbooks/create_sandbox.py +++ b/playbooks/create_sandbox.py @@ -39,6 +39,9 @@ parser.add_argument('--log-level', required=False, help='The log level', default='info') parser.add_argument('--retry', required=False, help='Retry sandbox by passing its name', default=None) parser.add_argument('--playbook-output', required=False, help='Print output of ansible-playbook commands?', action=argparse.BooleanOptionalAction, default=True) +parser.add_argument('--hcc', required=False, help='run the registration step for Gold images?', action=argparse.BooleanOptionalAction, default=True) +parser.add_argument('--validation', required=False, help='run the validation playbook?', action=argparse.BooleanOptionalAction, default=True) +parser.add_argument('--guess-strategy', required=False, help='How to guess the next number: smart, end', default='end') args = parser.parse_args() @@ -48,6 +51,9 @@ log_level = args.log_level retry = args.retry playbook_output = args.playbook_output +hcc = args.hcc +validation = args.validation +guess_strategy = args.guess_strategy if log_level == 'debug': logger.info("Setting log level to DEBUG") @@ -193,6 +199,12 @@ logger.info(f"Found {len(sandboxes_prod)} sandboxes in prod") sandboxes = sandboxes + sandboxes_prod +def extract_sandbox_number(sandbox): + """Extract the number from the sandbox name, for example sandbox1234 returns 1234""" + return int(sandbox.split('sandbox')[1]) + 
+sandboxes.sort(key=extract_sandbox_number) + # transform into a dictionary sandboxes_dict = {sandbox: True for sandbox in sandboxes} @@ -275,10 +287,6 @@ def get_sandbox(dynamodb, sandbox): return {} -def extract_sandbox_number(sandbox): - """Extract the number from the sandbox name, for example sandbox1234 returns 1234""" - return int(sandbox.split('sandbox')[1]) - def guess_next_sandbox(sandboxes, sandboxes_dict): """Find the first available sandbox name""" @@ -289,10 +297,13 @@ def guess_next_sandbox(sandboxes, sandboxes_dict): if retry: return retry, f"{retry}+{random_email_tag}@{os.environ['email_domain']}" - for i in range(1, len(sandboxes) + 1): - if not sandboxes_dict.get(f"sandbox{i}", False): - return f"sandbox{i}", f"sandbox{i}+{random_email_tag}@{os.environ['email_domain']}" + if guess_strategy == 'smart': + for i in range(1, len(sandboxes) + 1): + if not sandboxes_dict.get(f"sandbox{i}", False): + return f"sandbox{i}", f"sandbox{i}+{random_email_tag}@{os.environ['email_domain']}" + + logger.info(f"len(sanboxes) = {len(sandboxes)}") s = f"sandbox{extract_sandbox_number(sandboxes[-1]) + 1}" return s, f"{s}+{random_email_tag}@{os.environ['email_domain']}" @@ -304,7 +315,6 @@ def decrypt_vaulted_str(secret): new_sandbox, new_email = guess_next_sandbox(sandboxes, sandboxes_dict) logger = logger.bind(sandbox=new_sandbox) - # Lock the name of the sandbox in DB so another # concurrent process won't be able to create the same sandbox. 
sandbox_data = get_sandbox(dynamodb, new_sandbox) @@ -549,163 +559,189 @@ def exit_handler(db, table, sandbox): ACCOUNT_CREATED_TIME = time.time() logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}") -# Use https://console.redhat.com/docs/api/sources/v3.1#operations-sources-bulkCreate -sandbox_data = get_sandbox(dynamodb, new_sandbox) +if hcc: + # Use https://console.redhat.com/docs/api/sources/v3.1#operations-sources-bulkCreate -if not sandbox_data: - logger.error(f"Failed to get the sandbox data for {new_sandbox}") - sys.exit(1) + sandbox_data = get_sandbox(dynamodb, new_sandbox) -if 'aws_secret_access_key' not in sandbox_data: - logger.error(f"Failed to get the aws_secret_access_key for {new_sandbox}") - sys.exit(1) - -plaintext_key = decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key', {}).get('S', '')).strip(' \t\n\r') -access_key = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r') - -if not access_key or not plaintext_key: - logger.error(f"Failed to get the access key for {new_sandbox}") - sys.exit(1) - -# use requests and create the POST request - -baseurl = 'https://console.redhat.com/api/sources/v3.1' - -s = requests.Session() -s.auth = (os.environ['RH_USERNAME'], os.environ['RH_PASSWORD']) + if not sandbox_data: + logger.error(f"Failed to get the sandbox data for {new_sandbox}") + sys.exit(1) -# delete the source if it exists -# First get the source_id -max_retries = 20 -while True: - response = s.get(f"{baseurl}/sources?filter[name][eq]={new_sandbox}") - if response.status_code == 200: - break - logger.error(f"Failed to get the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) - if max_retries == 0: + if 'aws_secret_access_key' not in sandbox_data: + logger.error(f"Failed to get the aws_secret_access_key for {new_sandbox}") sys.exit(1) - logger.info(f"Retrying: {max_retries} retries left") - max_retries -= 1 - time.sleep(5) + plaintext_key = 
decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key', {}).get('S', '')).strip(' \t\n\r') + access_key = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r') -result = response.json().get('data', []) -if len(result) > 0: - source_id = response.json().get('data', [{}])[0].get('id', '') + if not access_key or not plaintext_key: + logger.error(f"Failed to get the access key for {new_sandbox}") + sys.exit(1) - if source_id: - response = s.delete(f"{baseurl}/sources/{source_id}") - if response.status_code not in [200, 201, 202]: - logger.error(f"Failed to delete the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) + # use requests and create the POST request - logger.info(f"Deleted the source {source_id} for {new_sandbox}") + baseurl = 'https://console.redhat.com/api/sources/v3.1' - # Wait for the deletion to complete - max_retries = 20 - while max_retries > 0: - response = s.get(f"{baseurl}/sources/{source_id}") - if response.status_code == 404: - break - max_retries -= 1 - logger.info(f"Waiting for the source to be deleted from HCC (console): {max_retries} retries left") - time.sleep(5) + s = requests.Session() + s.auth = (os.environ['RH_USERNAME'], os.environ['RH_PASSWORD']) + # delete the source if it exists + # First get the source_id + max_retries = 20 + while True: + response = s.get(f"{baseurl}/sources?filter[name][eq]={new_sandbox}") + if response.status_code == 200: + break + logger.error(f"Failed to get the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) if max_retries == 0: - logger.error(f"Failed to delete the source {source_id} for {new_sandbox}") sys.exit(1) -payload = { - "sources": [ - { - "name": new_sandbox, - "source_type_name": "amazon", - "app_creation_workflow": "account_authorization" - } - ], - "authentications": [ - { - "resource_type": "source", - "resource_name": new_sandbox, - "username": access_key, - "password": plaintext_key, - "authtype": 
"access_key_secret_key" - } - ], - - "applications": [ - { - "source_name": new_sandbox, - "application_type_name": "cloud-meter" - } - ] -} -response = s.post(f"{baseurl}/bulk_create", json=payload) + logger.info(f"Retrying: {max_retries} retries left") + max_retries -= 1 + time.sleep(5) + + result = response.json().get('data', []) + if len(result) > 0: + source_id = response.json().get('data', [{}])[0].get('id', '') + + if source_id: + response = s.delete(f"{baseurl}/sources/{source_id}") + if response.status_code not in [200, 201, 202]: + logger.error(f"Failed to delete the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) + + logger.info(f"Deleted the source {source_id} for {new_sandbox}") + + # Wait for the deletion to complete + max_retries = 20 + while max_retries > 0: + response = s.get(f"{baseurl}/sources/{source_id}") + if response.status_code == 404: + break + max_retries -= 1 + logger.info(f"Waiting for the source to be deleted from HCC (console): {max_retries} retries left") + time.sleep(5) + + if max_retries == 0: + logger.error(f"Failed to delete the source {source_id} for {new_sandbox}") + sys.exit(1) + + payload = { + "sources": [ + { + "name": new_sandbox, + "source_type_name": "amazon", + "app_creation_workflow": "account_authorization" + } + ], + "authentications": [ + { + "resource_type": "source", + "resource_name": new_sandbox, + "username": access_key, + "password": plaintext_key, + "authtype": "access_key_secret_key" + } + ], -if response.status_code not in [200, 201]: - logger.error(f"Failed to create the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) - sys.exit(1) + "applications": [ + { + "source_name": new_sandbox, + "application_type_name": "cloud-meter" + } + ] + } + response = s.post(f"{baseurl}/bulk_create", json=payload) -logger.info(f"Source create in HCC") + if response.status_code not in [200, 201]: + logger.error(f"Failed to create the source: {response.text}", 
status_code=response.status_code, sandbox=new_sandbox) + sys.exit(1) -# Run the validation playbook operation + logger.info(f"Source create in HCC") + +if validation: + # Run the validation playbook operation + + local_path = os.path.dirname(os.path.realpath(__file__)) + playbook = os.path.join(local_path, '..', 'playbooks', 'validate.yml') + + args = [ + 'ansible-playbook', + playbook, + '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}', + '-e', f'account_num_end={extract_sandbox_number(new_sandbox)}', + '-e', f'sandbox={new_sandbox}', + '-e', 'dynamodb_profile=dynamodb', + '-e', f'dynamodb_table={dynamodb_table}', + '-e', 'aws_master_profile=pool-manager', + '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}', + '-e', 'operation=VALIDATE', + ] -local_path = os.path.dirname(os.path.realpath(__file__)) -playbook = os.path.join(local_path, '..', 'playbooks', 'validate.yml') + # Run the command + logger.info(f"Running {' '.join(args)}") -args = [ - 'ansible-playbook', - playbook, - '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}', - '-e', f'account_num_end={extract_sandbox_number(new_sandbox)}', - '-e', f'sandbox={new_sandbox}', - '-e', 'dynamodb_profile=dynamodb', - '-e', f'dynamodb_table={dynamodb_table}', - '-e', 'aws_master_profile=pool-manager', - '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}', - '-e', 'operation=VALIDATE', -] + try: + completed = subprocess.run( + args, check=True, + capture_output=(not playbook_output), + timeout=1800, + ) -# Run the command -logger.info(f"Running {' '.join(args)}") + except subprocess.CalledProcessError as e: + logger.error(f"Failed to run the command: {e}") + # print stdout and stderr + logger.error(e.stdout.decode(), stdout=True) + logger.error(e.stderr.decode(), stderr=True) -try: - completed = subprocess.run( - args, check=True, - capture_output=(not playbook_output), - timeout=1800, - ) + # Set sandbox status to validation failed + response = dynamodb.update_item( + TableName=dynamodb_table, + 
Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': 'validation failed' + } + } + ) -except subprocess.CalledProcessError as e: - logger.error(f"Failed to run the command: {e}") - # print stdout and stderr - logger.error(e.stdout.decode(), stdout=True) - logger.error(e.stderr.decode(), stderr=True) + sys.exit(1) - # Set sandbox status to validation failed - response = dynamodb.update_item( - TableName=dynamodb_table, - Key={ - 'name': { - 'S': new_sandbox - } - }, - UpdateExpression='SET #s = :val1', - ExpressionAttributeNames={ - '#s': 'creation_status' - }, - ExpressionAttributeValues={ - ':val1': { - 'S': 'validation failed' + except subprocess.TimeoutExpired as e: + logger.error(f"Timeout: {e}") + # Set sandbox status to validation failed + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': 'validation timed out' + } } - } - ) + ) + sys.exit(1) - sys.exit(1) + logger.info(f"Validation successful for {new_sandbox}") + + # Move the sandbox to the final reservation -except subprocess.TimeoutExpired as e: - logger.error(f"Timeout: {e}") - # Set sandbox status to validation failed response = dynamodb.update_item( TableName=dynamodb_table, Key={ @@ -713,51 +749,28 @@ def exit_handler(db, table, sandbox): 'S': new_sandbox } }, - UpdateExpression='SET #s = :val1', + UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3', ExpressionAttributeNames={ - '#s': 'creation_status' + '#r': 'reservation', + '#s': 'stage', + '#c': 'creation_status' }, ExpressionAttributeValues={ ':val1': { - 'S': 'validation timed out' + 'S': reservation + }, + ':val2': { + 'S': STAGE4_VALIDATED + }, + ':val3': { + 'S': 'success' } } ) - sys.exit(1) 
-logger.info(f"Validation successful for {new_sandbox}") - -# Move the sandbox to the final reservation - -response = dynamodb.update_item( - TableName=dynamodb_table, - Key={ - 'name': { - 'S': new_sandbox - } - }, - UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3', - ExpressionAttributeNames={ - '#r': 'reservation', - '#s': 'stage', - '#c': 'creation_status' - }, - ExpressionAttributeValues={ - ':val1': { - 'S': reservation - }, - ':val2': { - 'S': STAGE4_VALIDATED - }, - ':val3': { - 'S': 'success' - } - } -) - -if response['ResponseMetadata']['HTTPStatusCode'] != 200: - logger.error("Failed to update the reservation") - sys.exit(1) + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to update the reservation") + sys.exit(1) -logger.info(f"Moved {new_sandbox} to {reservation}") -logger.info(f"Total duration: {round(time.time() - START_TIME)} seconds") + logger.info(f"Moved {new_sandbox} to {reservation}") + logger.info(f"Total duration: {round(time.time() - START_TIME)} seconds") From a8abe449cee0d66d62c5172402505e8a8557f952 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Fri, 17 Jan 2025 16:00:25 +0100 Subject: [PATCH 05/11] Add the ability to skip playbook/validation/hcc --- playbooks/create_sandbox.py | 222 +++++++++++++++++++---------------- playbooks/creation_status.py | 11 +- 2 files changed, 119 insertions(+), 114 deletions(-) diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py index 1d1a3981..985c29e5 100755 --- a/playbooks/create_sandbox.py +++ b/playbooks/create_sandbox.py @@ -39,6 +39,7 @@ parser.add_argument('--log-level', required=False, help='The log level', default='info') parser.add_argument('--retry', required=False, help='Retry sandbox by passing its name', default=None) parser.add_argument('--playbook-output', required=False, help='Print output of ansible-playbook commands?', action=argparse.BooleanOptionalAction, default=True) +parser.add_argument('--playbook', 
required=False, help='run the creation playbook?', action=argparse.BooleanOptionalAction, default=True) parser.add_argument('--hcc', required=False, help='run the registration step for Gold images?', action=argparse.BooleanOptionalAction, default=True) parser.add_argument('--validation', required=False, help='run the validation playbook?', action=argparse.BooleanOptionalAction, default=True) parser.add_argument('--guess-strategy', required=False, help='How to guess the next number: smart, end', default='end') @@ -50,6 +51,7 @@ target_db = args.target_db log_level = args.log_level retry = args.retry +playbook= args.playbook playbook_output = args.playbook_output hcc = args.hcc validation = args.validation @@ -385,8 +387,6 @@ def lock_sandbox(dynamodb, sandbox): logger.info(f"Locked {new_sandbox}") -lock_sandbox(dynamodb, new_sandbox) - def exit_handler(db, table, sandbox): '''Function to cleanup everything in case something went wrong''' @@ -414,14 +414,24 @@ def exit_handler(db, table, sandbox): elif stage == STAGE4_VALIDATED: pass else: - # something went wrong - logger.error(f"Unexpected stage: {stage}, missing validation") - logger.info(f"You can retry the operation by running the command with --retry {sandbox}") - set_(dynamodb, new_sandbox, 'creation_status', 'failed') - sys.exit(1) + if validation: + # something went wrong + logger.error(f"Unexpected stage: {stage}, missing validation") + logger.info(f"You can retry the operation by running the command with --retry {sandbox}") + set_(dynamodb, new_sandbox, 'creation_status', 'failed') + sys.exit(1) + if hcc: + if stage != STAGE3_GOLD_IMAGE: + # something went wrong + logger.error(f"Unexpected stage: {stage}, missing validation") + logger.info(f"You can retry the operation by running the command with --retry {sandbox}") + set_(dynamodb, new_sandbox, 'creation_status', 'failed') + sys.exit(1) + atexit.register(exit_handler, dynamodb, dynamodb_table, new_sandbox) + # Prepare the AWS profile for the 
ansible-playbook command # - dynamodb profile to manage the dynamodb table # - pool-manager profile to manage the pool @@ -450,114 +460,118 @@ def exit_handler(db, table, sandbox): aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']} ''') -# Prepare args for the ansible-playbook command -#./create_range.yml -e account_num_start=3001 -e account_count=10 -e ddns_key_name=... -e ddns_key_secret=... -e ddns_server=... - -local_path = os.path.dirname(os.path.realpath(__file__)) -playbook = os.path.join(local_path, '..', 'playbooks', 'create_range.yml') - -args = [ - 'ansible-playbook', - playbook, - '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}', - '-e', f'account_email={new_email}', - '-e', 'account_count=1', - '-e', f'ddns_key_name={os.environ["ddns_key_name"]}', - '-e', f'ddns_server={os.environ["ddns_server"]}', - '-e', f'ddns_ttl={os.environ["ddns_ttl"]}', - '-e', f'sandbox={new_sandbox}', - '-e', 'update_stage=true', - '-e', 'dynamodb_profile=dynamodb', - '-e', f'dynamodb_table={dynamodb_table}', - '-e', 'aws_master_profile=pool-manager', - # Listing all accounts in the organization is a costly operation - # it takes currently 47s to execute. - # Check the account only in certain scenario, like for a retry - '-e', f'check_account_list={True if retry else False}', - '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}', -] +if playbook: + lock_sandbox(dynamodb, new_sandbox) + # Prepare args for the ansible-playbook command + #./create_range.yml -e account_num_start=3001 -e account_count=10 -e ddns_key_name=... -e ddns_key_secret=... -e ddns_server=... 
-# Run the command -logger.info(f"Running {' '.join(args)}") -# Add the ddns_key_secret to the args -args = args + ['-e', f'ddns_key_secret={os.environ["ddns_key_secret"]}'] -try: - completed = subprocess.run( - args, check=True, - capture_output=(not playbook_output), - timeout=1800, - ) -except subprocess.CalledProcessError as e: - # Sanitize the error message by removing the DDNS key secret - e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***') - logger.error(f"Failed to run the command: {e_sanitized}") - # print stdout and stderr - logger.error(e.stdout.decode(), stdout=True) - logger.error(e.stderr.decode(), stderr=True) - - # Set sandbox status to failed - response = dynamodb.update_item( - TableName=dynamodb_table, - Key={ - 'name': { - 'S': new_sandbox - } - }, - UpdateExpression='SET #s = :val1', - ExpressionAttributeNames={ - '#s': 'creation_status' - }, - ExpressionAttributeValues={ - ':val1': { - 'S': 'failed' - } - } - ) + local_path = os.path.dirname(os.path.realpath(__file__)) + playbook = os.path.join(local_path, '..', 'playbooks', 'create_range.yml') - sys.exit(1) -except subprocess.TimeoutExpired as e: - # Sanitize the error message by removing the DDNS key secret - e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***') - logger.error(f"Timeout: {e_sanitized}", sandbox=new_sandbox) - # Set sandbox status to failed - response = dynamodb.update_item( - TableName=dynamodb_table, - Key={ - 'name': { - 'S': new_sandbox + args = [ + 'ansible-playbook', + playbook, + '-e', f'account_num_start={extract_sandbox_number(new_sandbox)}', + '-e', f'account_email={new_email}', + '-e', 'account_count=1', + '-e', f'ddns_key_name={os.environ["ddns_key_name"]}', + '-e', f'ddns_server={os.environ["ddns_server"]}', + '-e', f'ddns_ttl={os.environ["ddns_ttl"]}', + '-e', f'sandbox={new_sandbox}', + '-e', 'update_stage=true', + '-e', 'dynamodb_profile=dynamodb', + '-e', f'dynamodb_table={dynamodb_table}', + '-e', 
'aws_master_profile=pool-manager', + # Listing all accounts in the organization is a costly operation + # it takes currently 47s to execute. + # Check the account only in certain scenario, like for a retry + '-e', f'check_account_list={True if retry else False}', + '-e', f'vault_file={INFRA_VAULT_SECRET_FILE}', + ] + + + # Run the command + logger.info(f"Running {' '.join(args)}") + # Add the ddns_key_secret to the args + args = args + ['-e', f'ddns_key_secret={os.environ["ddns_key_secret"]}'] + try: + completed = subprocess.run( + args, check=True, + capture_output=(not playbook_output), + timeout=1800, + ) + except subprocess.CalledProcessError as e: + # Sanitize the error message by removing the DDNS key secret + e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***') + logger.error(f"Failed to run the command: {e_sanitized}") + # print stdout and stderr + logger.error(e.stdout.decode(), stdout=True) + logger.error(e.stderr.decode(), stderr=True) + + # Set sandbox status to failed + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': 'failed' + } } - }, - UpdateExpression='SET #s = :val1', - ExpressionAttributeNames={ - '#s': 'creation_status' - }, - ExpressionAttributeValues={ - ':val1': { - 'S': 'failed' + ) + + sys.exit(1) + except subprocess.TimeoutExpired as e: + # Sanitize the error message by removing the DDNS key secret + e_sanitized = str(e).replace(os.environ['ddns_key_secret'], '***') + logger.error(f"Timeout: {e_sanitized}", sandbox=new_sandbox) + # Set sandbox status to failed + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': new_sandbox + } + }, + UpdateExpression='SET #s = :val1', + ExpressionAttributeNames={ + '#s': 'creation_status' + }, + ExpressionAttributeValues={ + ':val1': { + 'S': 'failed' 
+ } } - } - ) - sys.exit(1) + ) + sys.exit(1) -logger.info(f"Created {new_sandbox}") + logger.info(f"Created {new_sandbox}") -# Get the account_id from the db + # Get the account_id from the db -sandbox_data = get_sandbox(dynamodb, new_sandbox) + sandbox_data = get_sandbox(dynamodb, new_sandbox) -if sandbox_data: - account_id = sandbox_data.get('account_id', {}).get('S', '') - logger.info(f"Account ID: {account_id}") - logger = logger.bind(account_id=account_id) + if sandbox_data: + account_id = sandbox_data.get('account_id', {}).get('S', '') + logger.info(f"Account ID: {account_id}") + logger = logger.bind(account_id=account_id) - # Write the account_id and the account name to cloud-automation/new_sandboxes.txt - with open('cloud-automation/new_sandboxes.txt', 'w') as f: - f.write(f"{new_sandbox} {account_id}\n") + # Write the account_id and the account name to cloud-automation/new_sandboxes.txt + with open('cloud-automation/new_sandboxes.txt', 'w') as f: + f.write(f"{new_sandbox} {account_id}\n") -set_(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED) -ACCOUNT_CREATED_TIME = time.time() -logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}") + set_(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED) + set_(dynamodb, new_sandbox, 'reservation', 'untested') + ACCOUNT_CREATED_TIME = time.time() + logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}") if hcc: diff --git a/playbooks/creation_status.py b/playbooks/creation_status.py index c73fb7b3..af884f4b 100755 --- a/playbooks/creation_status.py +++ b/playbooks/creation_status.py @@ -33,16 +33,13 @@ def print_sandbox(item, db): creation_status=item.get('creation_status', {}).get('S', ''), stage= item.get('stage', {}).get('S', ''), reservation= item.get('reservation', {}).get('S', ''), + account_id=item.get('account_id', {}).get('S', ''), db=db) response = dynamodb_dev.scan( TableName='accounts-dev', 
ConsistentRead=True, - ProjectionExpression='#n, creation_status, stage, reservation', - ExpressionAttributeNames={ - '#n': 'name' - } ) if response['ResponseMetadata']['HTTPStatusCode'] != 200: @@ -54,8 +51,6 @@ def print_sandbox(item, db): response = dynamodb_dev.scan( TableName='accounts-dev', ConsistentRead=True, - ProjectionExpression='#n', - ExpressionAttributeNames={'#n': 'name'}, ExclusiveStartKey=response['LastEvaluatedKey'] ) data.extend(response['Items']) @@ -69,8 +64,6 @@ def print_sandbox(item, db): response = dynamodb_prod.scan( TableName='accounts', ConsistentRead=True, - ProjectionExpression='#n', - ExpressionAttributeNames={'#n': 'name'} ) if response['ResponseMetadata']['HTTPStatusCode'] != 200: @@ -83,8 +76,6 @@ def print_sandbox(item, db): response = dynamodb_prod.scan( TableName='accounts', ConsistentRead=True, - ProjectionExpression='#n', - ExpressionAttributeNames={'#n': 'name'}, ExclusiveStartKey=response['LastEvaluatedKey'] ) data.extend(response['Items']) From 933c28203248d663b9fd8a9891fd30dc18373fe0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Mon, 20 Jan 2025 15:49:34 +0100 Subject: [PATCH 06/11] Handle concurrency better by backing off --- playbooks/roles/infra-aws-sandbox/tasks/account.yml | 3 +++ playbooks/roles/infra-aws-sandbox/tasks/ou.yml | 2 ++ 2 files changed, 5 insertions(+) diff --git a/playbooks/roles/infra-aws-sandbox/tasks/account.yml b/playbooks/roles/infra-aws-sandbox/tasks/account.yml index 980c7a45..9014bdd6 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/account.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/account.yml @@ -107,6 +107,9 @@ --query 'CreateAccountStatus.[Id]' --output text register: _createaccount + retries: 10 + # Make this especially long to avoid the issue with too many requests + delay: 120 - when: - _createaccount is not skipped diff --git a/playbooks/roles/infra-aws-sandbox/tasks/ou.yml b/playbooks/roles/infra-aws-sandbox/tasks/ou.yml index cee38eb6..d14d2d8e 
100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/ou.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/ou.yml @@ -51,3 +51,5 @@ --account-id {{ account_id }} --source-parent-id {{ rootid }} --destination-parent-id {{ destouid }} + retries: 10 + delay: 120 From 212ac4cea3353d00e7c88d0974032a89985ba4d1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Mon, 20 Jan 2025 15:49:47 +0100 Subject: [PATCH 07/11] Improve retry validation --- playbooks/create_sandbox.py | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py index 985c29e5..4224c4f6 100755 --- a/playbooks/create_sandbox.py +++ b/playbooks/create_sandbox.py @@ -338,11 +338,11 @@ def decrypt_vaulted_str(secret): logger.error(f"Failed to get the stage for {new_sandbox}") sys.exit(1) - creation_status = sandbox_data.get('creation_status', {}).get('S', '') + # creation_status = sandbox_data.get('creation_status', {}).get('S', '') - if not creation_status: - logger.error(f"Failed to get the creation_status for {new_sandbox}") - sys.exit(1) + # if not creation_status: + # logger.error(f"Failed to get the creation_status for {new_sandbox}") + # sys.exit(1) def lock_sandbox(dynamodb, sandbox): @@ -396,7 +396,7 @@ def exit_handler(db, table, sandbox): # Check if the stage is STAGE0 stage = get_stage(db, sandbox) - if stage in [ STAGE0, STAGE1_FAILED ]: + if stage in [ STAGE0, STAGE1_STARTED, STAGE1_FAILED ]: response = db.delete_item( TableName=table, Key={ @@ -675,6 +675,19 @@ def exit_handler(db, table, sandbox): logger.info(f"Source create in HCC") if validation: + # First ensure the current reservation of the sandbox is 'untested' + + sandbox_data = get_sandbox(dynamodb, new_sandbox) + + if sandbox_data: + if sandbox_data.get('stage', {}).get('S', '') == STAGE4_VALIDATED: + logger.info("Sandbox is already validated. 
Skipping.") + exit(0) + + if sandbox_data.get('reservation', {}).get('S', '') != 'untested': + logger.error("Sandbox reservation is not 'untested'. something's off.") + exit(1) + # Run the validation playbook operation local_path = os.path.dirname(os.path.realpath(__file__)) @@ -763,11 +776,12 @@ def exit_handler(db, table, sandbox): 'S': new_sandbox } }, - UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3', + UpdateExpression='SET #r = :val1, #s = :val2, #c = :val3, #a = :val4', ExpressionAttributeNames={ '#r': 'reservation', '#s': 'stage', - '#c': 'creation_status' + '#c': 'creation_status', + '#a': 'available' }, ExpressionAttributeValues={ ':val1': { @@ -778,6 +792,9 @@ def exit_handler(db, table, sandbox): }, ':val3': { 'S': 'success' + }, + ':val4': { + 'BOOL': True, } } ) From 74ec3147956efa884c12b824cfe240806aca9401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Tue, 21 Jan 2025 10:53:46 +0100 Subject: [PATCH 08/11] Improve performance + validation --- playbooks/create_sandbox.py | 190 +++++++++++++++++++++-------------- playbooks/creation_status.py | 1 + 2 files changed, 117 insertions(+), 74 deletions(-) diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py index 4224c4f6..4a340949 100755 --- a/playbooks/create_sandbox.py +++ b/playbooks/create_sandbox.py @@ -141,76 +141,36 @@ INFRA_VAULT_SECRET_FILE = f.name logger.info(f"Created temporary file {INFRA_VAULT_SECRET_FILE}") -# run `sandbox-list -all --sort name` - -response = dynamodb_dev.scan( - TableName='accounts-dev', - ConsistentRead=True, - ProjectionExpression='#n', - ExpressionAttributeNames={ - '#n': 'name' - } -) - -if response['ResponseMetadata']['HTTPStatusCode'] != 200: - logger.error("Failed to get items from dynamodb") - sys.exit(1) - -data = response['Items'] -while 'LastEvaluatedKey' in response: - response = dynamodb_dev.scan( - TableName='accounts-dev', - ConsistentRead=True, - ProjectionExpression='#n', - ExpressionAttributeNames={'#n': 
'name'}, - ExclusiveStartKey=response['LastEvaluatedKey'] - ) - data.extend(response['Items']) - -if 'Items' in response: - sandboxes = [item['name']['S'] for item in data] - logger.info(f"Found {len(sandboxes)} sandboxes in dev") - -# Now run the command for the prod database - -response = dynamodb_prod.scan( - TableName='accounts', - ConsistentRead=True, - ProjectionExpression='#n', - ExpressionAttributeNames={'#n': 'name'} -) - -if response['ResponseMetadata']['HTTPStatusCode'] != 200: - logger.error("Failed to get items from dynamodb") - sys.exit(1) - -data = response['Items'] - -while 'LastEvaluatedKey' in response: - response = dynamodb_prod.scan( - TableName='accounts', - ConsistentRead=True, - ProjectionExpression='#n', - ExpressionAttributeNames={'#n': 'name'}, - ExclusiveStartKey=response['LastEvaluatedKey'] - ) - data.extend(response['Items']) - -if 'Items' in response: - sandboxes_prod = [item['name']['S'] for item in data] - logger.info(f"Found {len(sandboxes_prod)} sandboxes in prod") - sandboxes = sandboxes + sandboxes_prod def extract_sandbox_number(sandbox): """Extract the number from the sandbox name, for example sandbox1234 returns 1234""" return int(sandbox.split('sandbox')[1]) -sandboxes.sort(key=extract_sandbox_number) +def set_str(dynamodb, sandbox, key, value): + '''Set the key value pair in the DB''' + response = dynamodb.update_item( + TableName=dynamodb_table, + Key={ + 'name': { + 'S': sandbox + } + }, + UpdateExpression='SET #k = :val1', + ExpressionAttributeNames={ + '#k': key + }, + ExpressionAttributeValues={ + ':val1': { + 'S': value + } + } + ) -# transform into a dictionary -sandboxes_dict = {sandbox: True for sandbox in sandboxes} + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise Exception(f"Failed to set {key} to {value}") -def set_(dynamodb, sandbox, key, value): +# TODO detect type instead of this _bool and _str +def set_bool(dynamodb, sandbox, key, value): '''Set the key value pair in the DB''' response = 
dynamodb.update_item( TableName=dynamodb_table, @@ -225,7 +185,7 @@ def set_(dynamodb, sandbox, key, value): }, ExpressionAttributeValues={ ':val1': { - 'S': value + 'BOOL': value } } ) @@ -288,9 +248,74 @@ def get_sandbox(dynamodb, sandbox): else: return {} +def get_all_sandboxes(dynamodb_prod, dynamodb_dev): + # run `sandbox-list -all --sort name` + + response = dynamodb_dev.scan( + TableName='accounts-dev', + ConsistentRead=True, + ProjectionExpression='#n', + ExpressionAttributeNames={ + '#n': 'name' + } + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to get items from dynamodb") + sys.exit(1) + + data = response['Items'] + while 'LastEvaluatedKey' in response: + response = dynamodb_dev.scan( + TableName='accounts-dev', + ConsistentRead=True, + ProjectionExpression='#n', + ExpressionAttributeNames={'#n': 'name'}, + ExclusiveStartKey=response['LastEvaluatedKey'] + ) + data.extend(response['Items']) + + if 'Items' in response: + sandboxes = [item['name']['S'] for item in data] + logger.info(f"Found {len(sandboxes)} sandboxes in dev") + # Now run the command for the prod database -def guess_next_sandbox(sandboxes, sandboxes_dict): + response = dynamodb_prod.scan( + TableName='accounts', + ConsistentRead=True, + ProjectionExpression='#n', + ExpressionAttributeNames={'#n': 'name'} + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to get items from dynamodb") + sys.exit(1) + + data = response['Items'] + + while 'LastEvaluatedKey' in response: + response = dynamodb_prod.scan( + TableName='accounts', + ConsistentRead=True, + ProjectionExpression='#n', + ExpressionAttributeNames={'#n': 'name'}, + ExclusiveStartKey=response['LastEvaluatedKey'] + ) + data.extend(response['Items']) + + if 'Items' in response: + sandboxes_prod = [item['name']['S'] for item in data] + logger.info(f"Found {len(sandboxes_prod)} sandboxes in prod") + sandboxes = sandboxes + sandboxes_prod + + 
sandboxes.sort(key=extract_sandbox_number) + + return sandboxes + + + +def guess_next_sandbox(dynamodb_prod, dynamodb_dev): """Find the first available sandbox name""" # Generate a random email tag sandbox1+RANDSTR@opentlc.com # used when we reuse the account name. For some reason, the email is still registered @@ -300,6 +325,11 @@ def guess_next_sandbox(sandboxes, sandboxes_dict): if retry: return retry, f"{retry}+{random_email_tag}@{os.environ['email_domain']}" + sandboxes = get_all_sandboxes(dynamodb_prod, dynamodb_dev) + + # transform into a dictionary + sandboxes_dict = {sandbox: True for sandbox in sandboxes} + if guess_strategy == 'smart': for i in range(1, len(sandboxes) + 1): if not sandboxes_dict.get(f"sandbox{i}", False): @@ -314,7 +344,7 @@ def decrypt_vaulted_str(secret): '''Decrypt the vaulted secret''' return Vault(os.environ['INFRA_VAULT_SECRET']).load_raw(secret).decode('utf-8') -new_sandbox, new_email = guess_next_sandbox(sandboxes, sandboxes_dict) +new_sandbox, new_email = guess_next_sandbox(dynamodb_prod, dynamodb_dev) logger = logger.bind(sandbox=new_sandbox) # Lock the name of the sandbox in DB so another @@ -328,7 +358,7 @@ def decrypt_vaulted_str(secret): # Ensure the sandbox is not in use, available should be absent or true if retry: - if sandbox_data.get('available', {}).get('BOOL', True) is False: + if sandbox_data.get('service_uuid', {}).get('S', '') == '': logger.info(f"Retry {new_sandbox}") else: logger.error(f"{new_sandbox} is not available") @@ -418,14 +448,14 @@ def exit_handler(db, table, sandbox): # something went wrong logger.error(f"Unexpected stage: {stage}, missing validation") logger.info(f"You can retry the operation by running the command with --retry {sandbox}") - set_(dynamodb, new_sandbox, 'creation_status', 'failed') + set_str(dynamodb, new_sandbox, 'creation_status', 'failed') sys.exit(1) if hcc: if stage != STAGE3_GOLD_IMAGE: # something went wrong logger.error(f"Unexpected stage: {stage}, missing validation") 
logger.info(f"You can retry the operation by running the command with --retry {sandbox}") - set_(dynamodb, new_sandbox, 'creation_status', 'failed') + set_str(dynamodb, new_sandbox, 'creation_status', 'failed') sys.exit(1) @@ -568,8 +598,8 @@ def exit_handler(db, table, sandbox): with open('cloud-automation/new_sandboxes.txt', 'w') as f: f.write(f"{new_sandbox} {account_id}\n") - set_(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED) - set_(dynamodb, new_sandbox, 'reservation', 'untested') + set_str(dynamodb, new_sandbox, 'stage', STAGE2_ACCOUNT_CREATED) + set_str(dynamodb, new_sandbox, 'reservation', 'untested') ACCOUNT_CREATED_TIME = time.time() logger.info(f"Duration: {round(ACCOUNT_CREATED_TIME - START_TIME)} seconds to create {new_sandbox}") @@ -680,12 +710,24 @@ def exit_handler(db, table, sandbox): sandbox_data = get_sandbox(dynamodb, new_sandbox) if sandbox_data: + reservation_current = sandbox_data.get('reservation', {}).get('S', '') if sandbox_data.get('stage', {}).get('S', '') == STAGE4_VALIDATED: - logger.info("Sandbox is already validated. Skipping.") + + if sandbox_data.get('available', {}).get('BOOL', '') == False: + set_bool(dynamodb, new_sandbox, 'available', True) + logger.info(f"Set {new_sandbox} as available") + logger.info("Sandbox is already validated. Skipping validation.") + + if reservation_current != reservation: + set_str(dynamodb, new_sandbox, 'reservation', reservation) + + logger.info("Reservation updated", + previous_reservation=reservation_current) + exit(0) - if sandbox_data.get('reservation', {}).get('S', '') != 'untested': - logger.error("Sandbox reservation is not 'untested'. something's off.") + if reservation_current != 'untested': + logger.error("Sandbox reservation is not 'untested'. 
something's off.", found=reservation_current) exit(1) # Run the validation playbook operation diff --git a/playbooks/creation_status.py b/playbooks/creation_status.py index af884f4b..a1da7e4c 100755 --- a/playbooks/creation_status.py +++ b/playbooks/creation_status.py @@ -32,6 +32,7 @@ def print_sandbox(item, db): logger.info(item['name']['S'], creation_status=item.get('creation_status', {}).get('S', ''), stage= item.get('stage', {}).get('S', ''), + available=item.get('available', {}).get('BOOL', ''), reservation= item.get('reservation', {}).get('S', ''), account_id=item.get('account_id', {}).get('S', ''), db=db) From 07086535ab73ed3d7cbdd6bf9ab3fb1ae63d3e10 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Thu, 30 Jan 2025 11:59:46 +0100 Subject: [PATCH 09/11] Switch to manual conf for HCC + improvements - Use CLIENT ID and SECRET instead of username / password credentials - Switch to manual configuration for console HCC trust, so we can use static role and policy - Generate a random external_id and save it in DB - Keep the external_id across cleanup - Filter redhat-HCC-role and policy during cleanup. 
--- playbooks/create_sandbox.py | 240 ++++++++++++++++-- playbooks/creation_status.py | 1 + .../roles/infra-aws-sandbox/defaults/main.yml | 4 + .../roles/infra-aws-sandbox/tasks/pool.yml | 22 ++ 4 files changed, 248 insertions(+), 19 deletions(-) diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py index 4a340949..b10d401e 100755 --- a/playbooks/create_sandbox.py +++ b/playbooks/create_sandbox.py @@ -1,11 +1,10 @@ #!/usr/bin/env python3 -# First, grab the list of all sandboxes - import subprocess import os import sys import boto3 +import hashlib import argparse import atexit import structlog @@ -15,6 +14,7 @@ import string import requests import time +import json from ansible_vault import Vault START_TIME = time.time() @@ -70,6 +70,7 @@ os.environ.setdefault('ddns_key_algorithm', 'hmac-sha512') os.environ.setdefault('ddns_ttl', '600') os.environ.setdefault('email_domain', 'opentlc.com') +os.environ.setdefault('REDHAT_ACCOUNT', '998366406740') # set default to ~/.aws/credentials_create os.environ.setdefault('AWS_SHARED_CREDENTIALS_FILE', os.path.expanduser('~/.aws/credentials_create')) # Create directory if it doesn't exist, chmod 700 @@ -87,8 +88,8 @@ 'INFRA_VAULT_SECRET_PROD', 'ddns_server', 'ddns_key_secret', - 'RH_USERNAME', - 'RH_PASSWORD', + 'HCC_CLIENT_ID', + 'HCC_CLIENT_SECRET', ] # constants: Steps @@ -462,6 +463,33 @@ def exit_handler(db, table, sandbox): atexit.register(exit_handler, dynamodb, dynamodb_table, new_sandbox) +def get_sso_access_token(): + """ Create a session token using HCC_CLIENT_ID and HCC_CLIENT_SECRET""" + + # This is the standard Keycloak endpoint for client_credentials + token_url = "https://sso.redhat.com/auth/realms/redhat-external/protocol/openid-connect/token" + # Client Credentials Grant + payload = { + "grant_type": "client_credentials", + "client_id": os.environ['HCC_CLIENT_ID'], + "client_secret": os.environ['HCC_CLIENT_SECRET'] + } + + response = requests.post(token_url, data=payload) + + if 
response.status_code != 200: + raise ValueError(f"Failed to obtain token: {response.status_code} {response.text}") + + + # Parse out the access token + access_token = response.json().get("access_token") + if not access_token: + raise ValueError("No access token found in the response") + + + logger.info("Successfully obtained an access token for console.redhat.com.") + return access_token + # Prepare the AWS profile for the ansible-playbook command # - dynamodb profile to manage the dynamodb table # - pool-manager profile to manage the pool @@ -490,6 +518,24 @@ def exit_handler(db, table, sandbox): aws_secret_access_key = {os.environ['AWS_SECRET_ACCESS_KEY']} ''') +def assume_role(master_profile, role_arn, role_session_name, region_name='us-east-2'): + """Assume a role using the master profile""" + + + session = boto3.Session(profile_name=master_profile) + sts = session.client('sts', region_name=region_name) + + response = sts.assume_role( + RoleArn=role_arn, + RoleSessionName=role_session_name + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + raise Exception("Failed to assume role") + + return response['Credentials'] + + if playbook: lock_sandbox(dynamodb, new_sandbox) @@ -617,19 +663,169 @@ def exit_handler(db, table, sandbox): logger.error(f"Failed to get the aws_secret_access_key for {new_sandbox}") sys.exit(1) - plaintext_key = decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key', {}).get('S', '')).strip(' \t\n\r') - access_key = sandbox_data.get('aws_access_key_id', {}).get('S', '').strip(' \t\n\r') - if not access_key or not plaintext_key: - logger.error(f"Failed to get the access key for {new_sandbox}") + account_id = sandbox_data.get('account_id', {}).get('S', '') + role_arn = f"arn:aws:iam::{account_id}:role/OrganizationAccountAccessRole" + + credentials = assume_role('pool-manager', role_arn, 'hcc-registration') + + if not credentials: + logger.error("Failed to assume role", role_arn=role_arn) + sys.exit(1) + + # create a new 
session with the assumed role credentials + sandbox_session = boto3.Session( + aws_access_key_id=credentials['AccessKeyId'], + aws_secret_access_key=credentials['SecretAccessKey'], + aws_session_token=credentials['SessionToken'], + region_name='us-east-1' + ) + + policy_name = 'redhat-HCC-policy' + + iam_client = sandbox_session.client('iam') + + policies = iam_client.list_policies() + policy = { + "Version": "2012-10-17", + "Statement": [ + { + "Sid": "CloudigradePolicy", + "Effect": "Allow", + "Action": [ + "sts:GetCallerIdentity", + "ec2:DescribeImages", + "ec2:DescribeInstances", + "ec2:ModifySnapshotAttribute", + "ec2:DescribeSnapshotAttribute", + "ec2:DescribeSnapshots", + "ec2:CopyImage", + "ec2:CreateTags", + "ec2:DescribeRegions", + "cloudtrail:CreateTrail", + "cloudtrail:UpdateTrail", + "cloudtrail:PutEventSelectors", + "cloudtrail:DescribeTrails", + "cloudtrail:StartLogging", + "cloudtrail:DeleteTrail" + ], + "Resource": "*" + } + ] + } + + md5_policy = hashlib.md5(json.dumps(policy).encode()).hexdigest() + + if policy_name not in [policy['PolicyName'] for policy in policies['Policies']]: + response = iam_client.create_policy( + PolicyName=policy_name, + PolicyDocument=json.dumps(policy), + Description="Policy to grant access to Red Hat Hybrid Cloud Console to the AWS account" + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to create the policy") + sys.exit(1) + + logger.info("Policy created", policy_name=policy_name, md5=md5_policy) + else: + # update permission + response = iam_client.create_policy_version( + PolicyArn=f"arn:aws:iam::{account_id}:policy/{policy_name}", + PolicyDocument=json.dumps(policy), + SetAsDefault=True + ) + + logger.info("Policy updated", policy_name=policy_name, md5=md5_policy) + + + + # Create the role redhat-HCC-role, using the external_id created earlier + # Get the external_id from db or generate a new one + external_id = sandbox_data.get('external_id', {}).get('S', '') + + if not 
external_id: + # generate a random uuid + #external_id = str(uuid.uuid4()) + # generate a random string + external_id = ''.join( + random.choice(string.ascii_uppercase + string.digits) + for _ in range(16) + ) + set_str(dynamodb, new_sandbox, 'external_id', external_id) + logger.info(f"Generated external_id", hcc_external_id=external_id) + else: # Create, if it doesn't exist, an IAM policy redhat-HCC-policy + logger.info(f"External ID already exists", hcc_external_id=external_id) + + role_name = 'redhat-HCC-role' + + roles = iam_client.list_roles() + + policy_document = { + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": "sts:AssumeRole", + "Principal": { + "AWS": f"arn:aws:iam::{os.environ['REDHAT_ACCOUNT']}:root" + }, + "Condition": { + "StringEquals": { + "sts:ExternalId": external_id + } + } + } + ] + } + + if role_name in [role['RoleName'] for role in roles['Roles']]: + # update the role to ensure it has the right external_id + response = iam_client.update_assume_role_policy( + RoleName=role_name, + PolicyDocument=json.dumps(policy_document) + ) + logger.info("Role updated", role_name=role_name, hcc_external_id=external_id) + else: + response = iam_client.create_role( + RoleName=role_name, + AssumeRolePolicyDocument=json.dumps(policy_document), + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to create the role") + sys.exit(1) + + logger.info("Role created", role_name=role_name) + + logger = logger.bind(hcc_external_id=external_id) + role_arn = f"arn:aws:iam::{account_id}:role/{role_name}" + logger = logger.bind(role_arn=role_arn) + + # Attach the policy to the role + + response = iam_client.attach_role_policy( + RoleName=role_name, + PolicyArn=f"arn:aws:iam::{account_id}:policy/{policy_name}" + ) + + if response['ResponseMetadata']['HTTPStatusCode'] != 200: + logger.error("Failed to attach the policy to the role") sys.exit(1) + else: + logger.info("Policy attached to the role", 
role_name=role_name, policy_name=policy_name) # use requests and create the POST request - baseurl = 'https://console.redhat.com/api/sources/v3.1' + try: + access_token = get_sso_access_token() + except Exception as e: + logger.error("Error getting the access token to console.redhat.com", error=e) + sys.exit(1) s = requests.Session() - s.auth = (os.environ['RH_USERNAME'], os.environ['RH_PASSWORD']) + s.headers.update({"Authorization": f"Bearer {access_token}"}) + baseurl = 'https://console.redhat.com/api/sources/v3.1' # delete the source if it exists # First get the source_id @@ -651,9 +847,11 @@ def exit_handler(db, table, sandbox): source_id = response.json().get('data', [{}])[0].get('id', '') if source_id: + response = s.delete(f"{baseurl}/sources/{source_id}") - if response.status_code not in [200, 201, 202]: + if response.status_code not in [200, 201, 202, 204]: logger.error(f"Failed to delete the source: {response.text}", status_code=response.status_code, sandbox=new_sandbox) + sys.exit(1) logger.info(f"Deleted the source {source_id} for {new_sandbox}") @@ -671,24 +869,26 @@ def exit_handler(db, table, sandbox): logger.error(f"Failed to delete the source {source_id} for {new_sandbox}") sys.exit(1) + payload = { "sources": [ { "name": new_sandbox, "source_type_name": "amazon", - "app_creation_workflow": "account_authorization" + "app_creation_workflow": "manual_configuration", } ], "authentications": [ { - "resource_type": "source", - "resource_name": new_sandbox, - "username": access_key, - "password": plaintext_key, - "authtype": "access_key_secret_key" + "resource_type": "application", + "resource_name": "cloud-meter", + "authtype": "cloud-meter-arn", + "username": role_arn, + "extra": { + "external_id": external_id + } } ], - "applications": [ { "source_name": new_sandbox, @@ -702,7 +902,9 @@ def exit_handler(db, table, sandbox): logger.error(f"Failed to create the source: {response.text}",
status_code=response.status_code, sandbox=new_sandbox) sys.exit(1) - logger.info(f"Source create in HCC") + source_id = response.json().get('sources', [{}])[0].get('id', '') + logger.info(f"Source create in HCC", source_id=source_id) + if validation: # First ensure the current reservation of the sandbox is 'untested' diff --git a/playbooks/creation_status.py b/playbooks/creation_status.py index a1da7e4c..1cd0553c 100755 --- a/playbooks/creation_status.py +++ b/playbooks/creation_status.py @@ -35,6 +35,7 @@ def print_sandbox(item, db): available=item.get('available', {}).get('BOOL', ''), reservation= item.get('reservation', {}).get('S', ''), account_id=item.get('account_id', {}).get('S', ''), + external_id=item.get('external_id', {}).get('S', ''), db=db) diff --git a/playbooks/roles/infra-aws-sandbox/defaults/main.yml b/playbooks/roles/infra-aws-sandbox/defaults/main.yml index 502a4f3f..9ff791f1 100644 --- a/playbooks/roles/infra-aws-sandbox/defaults/main.yml +++ b/playbooks/roles/infra-aws-sandbox/defaults/main.yml @@ -78,10 +78,12 @@ aws_nuke_filters_default: - AWSServiceRoleForSupport - AWSServiceRoleForTrustedAdvisor - CloudabilityRole_OU + - redhat-HCC-role IAMRolePolicy: - "OrganizationAccountAccessRole -> AdministratorAccess" - config-rule-role -> config-rule-policy + - redhat-HCC-role -> redhat-HCC-policy - CloudabilityRole_OU -> CloudabilityAutomationPolicy - CloudabilityRole_OU -> CloudabilityMonitorResourcesPolicy - CloudabilityRole_OU -> CloudabilityVerificationPolicy @@ -103,6 +105,8 @@ aws_nuke_filters_default: IAMPolicy: - arn:aws:iam::{{ account_id }}:policy/config-rule-policy + - arn:aws:iam::{{ account_id }}:policy/redhat-HCC-policy + - redhat-HCC-policy EC2KeyPair: - opentlc_admin_backdoor diff --git a/playbooks/roles/infra-aws-sandbox/tasks/pool.yml b/playbooks/roles/infra-aws-sandbox/tasks/pool.yml index 666ba656..6d0afdef 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/pool.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/pool.yml @@ 
-64,11 +64,33 @@ set_fact: sandbox_reservation: "{{ r_get_reservation.stdout }}" + - name: Get external_id + vars: + _data: + name: + S: "{{ account_name }}" + command: >- + {{ aws_cli }} --profile {{ dynamodb_profile | quote }} + --region {{ dynamodb_region | quote }} + dynamodb get-item + --table-name {{ dynamodb_table }} + --key '{{ _data | to_json }}' + --query 'Item.external_id' + --output text + register: r_get_external_id + changed_when: false + + - name: Save some values for after cleanup + set_fact: + external_id: "{{ r_get_external_id.stdout }}" + - when: sandbox_reservation | default("", true) not in ["", "None", "null"] set_fact: additional_data: reservation: S: "{{ sandbox_reservation }}" + external_id: + S: "{{ external_id }}" - when: sandbox_reservation | default("", true) in ["", "None", "null"] set_fact: From 6af0413121c34ed20d93a28574c18856a749fb48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Fri, 31 Jan 2025 11:07:11 +0100 Subject: [PATCH 10/11] Add script to validate a sandbox or a reservation --- playbooks/create_sandbox.py | 18 ++-- playbooks/sandbox_functions.py | 51 +++++++++++ playbooks/validate_sandbox.py | 150 +++++++++++++++++++++++++++++++++ 3 files changed, 210 insertions(+), 9 deletions(-) create mode 100644 playbooks/sandbox_functions.py create mode 100755 playbooks/validate_sandbox.py diff --git a/playbooks/create_sandbox.py b/playbooks/create_sandbox.py index b10d401e..19c9ae60 100755 --- a/playbooks/create_sandbox.py +++ b/playbooks/create_sandbox.py @@ -1,20 +1,20 @@ #!/usr/bin/env python3 -import subprocess -import os -import sys -import boto3 -import hashlib import argparse import atexit -import structlog +import boto3 +import hashlib +import json import logging -import tempfile +import os import random -import string import requests +import string +import structlog +import subprocess +import sys +import tempfile import time -import json from ansible_vault import Vault START_TIME = time.time() diff 
--git a/playbooks/sandbox_functions.py b/playbooks/sandbox_functions.py
new file mode 100644
index 00000000..1c6d6a4f
--- /dev/null
+++ b/playbooks/sandbox_functions.py
@@ -0,0 +1,48 @@
+#!/usr/bin/env python3
+
+import os
+from ansible_vault import Vault
+
+def extract_sandbox_number(sandbox):
+    """Extract the number from the sandbox name, for example sandbox1234 returns 1234"""
+    return int(sandbox.split('sandbox')[1])
+
+def get_sandbox(dynamodb, dynamodb_table, sandbox):
+    """Get the sandbox from the DB, or an empty dict when it does not exist"""
+    response = dynamodb.get_item(
+        TableName=dynamodb_table,
+        Key={
+            'name': {
+                'S': sandbox
+            }
+        }
+    )
+
+    return response.get('Item', {})
+
+def decrypt_vaulted_str(secret):
+    '''Decrypt the vaulted secret'''
+    return Vault(os.environ['INFRA_VAULT_SECRET']).load_raw(secret).decode('utf-8')
+
+def get_all_sandboxes(dynamodb, dynamodb_table):
+    """Get all the sandboxes from the DB, following the scan pagination"""
+    scan_args = {
+        'TableName': dynamodb_table,
+        'ConsistentRead': True,
+    }
+    sandboxes = []
+
+    while True:
+        response = dynamodb.scan(**scan_args)
+
+        # Check every page, not only the first one
+        if response['ResponseMetadata']['HTTPStatusCode'] != 200:
+            raise Exception("Failed to get items from dynamodb")
+
+        sandboxes.extend(response.get('Items', []))
+
+        # No LastEvaluatedKey means this was the last page
+        if 'LastEvaluatedKey' not in response:
+            return sandboxes
+
+        scan_args['ExclusiveStartKey'] = response['LastEvaluatedKey']
diff --git a/playbooks/validate_sandbox.py b/playbooks/validate_sandbox.py
new file mode 100755
index 00000000..8beb6da8
--- /dev/null
+++ b/playbooks/validate_sandbox.py
@@ -0,0 +1,151 @@
+#!/usr/bin/env python3
+
+
+import boto3
+import time
+import random
+import logging
+import os
+import sys
+import argparse
+import structlog
+from ansible_vault import Vault
+from sandbox_functions import get_sandbox, decrypt_vaulted_str, get_all_sandboxes
+
+START_TIME = time.time()
+logger = structlog.get_logger()
+structlog.configure(wrapper_class=structlog.make_filtering_bound_logger(logging.INFO))
+
+parser
= argparse.ArgumentParser(description='Validate a sandbox')
+parser.add_argument('--sandbox', required=False, help='sandbox to validate, by passing its name', default=None)
+parser.add_argument('--reservation', required=False, help='reservation to validate, by passing its name', default=None)
+parser.add_argument('--target-db', required=False, help='The target database', default='dev')
+args = parser.parse_args()
+
+required_env_vars = [
+    'AWS_ACCESS_KEY_ID',
+    'AWS_SECRET_ACCESS_KEY',
+    'AWS_ACCESS_KEY_ID_DEV',
+    'AWS_SECRET_ACCESS_KEY_DEV',
+    'INFRA_VAULT_SECRET_DEV',
+    'INFRA_VAULT_SECRET_PROD',
+]
+
+for env_var in required_env_vars:
+    if not os.environ.get(env_var):
+        logger.error(f"Environment variable {env_var} not set")
+        sys.exit(1)
+
+sandbox = args.sandbox
+target_db = args.target_db
+reservation = args.reservation
+
+if not sandbox and not reservation:
+    logger.error("Either sandbox or reservation is required")
+    sys.exit(1)
+
+# Set the target database
+session_prod = boto3.Session(region_name='us-east-1')
+dynamodb_prod = session_prod.client('dynamodb')
+
+session_dev = boto3.Session(region_name='us-east-1',
+                            aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID_DEV'],
+                            aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY_DEV'])
+dynamodb_dev = session_dev.client('dynamodb')
+dynamodb_table = 'accounts-dev'
+dynamodb = dynamodb_dev
+
+if target_db == 'prod':
+    logger.info("Using PROD dynamoDB database")
+    dynamodb_table = 'accounts'
+    dynamodb = dynamodb_prod
+    logger = logger.bind(target_db='prod')
+    os.environ['INFRA_VAULT_SECRET'] = os.environ['INFRA_VAULT_SECRET_PROD']
+else:
+    logger.info("Using DEV dynamoDB database")
+    # bind context variable to the logger
+    logger = logger.bind(target_db='dev')
+    os.environ['INFRA_VAULT_SECRET'] = os.environ['INFRA_VAULT_SECRET_DEV']
+
+def find_rhel_amis(sandbox, dynamodb, dynamodb_table):
+    """Look for the RHEL AMIs in every listed region, using the sandbox credentials stored in the DB"""
+    regions = [
+        'us-east-1',
+        'us-east-2',
+        'us-west-1',
+        'us-west-2',
+        'eu-central-1',
+        'eu-west-1',
+        'eu-west-2',
+        'ap-southeast-1',
+    ]
+
+    sandbox_data = get_sandbox(dynamodb, dynamodb_table, sandbox)
+    if not sandbox_data:
+        logger.error("Sandbox not found in the database", sandbox=sandbox)
+        return
+
+    account_id = sandbox_data.get('account_id').get('S')
+    aws_access_key_id = sandbox_data.get('aws_access_key_id').get('S').strip(' \t\n\r')
+    aws_secret_access_key = decrypt_vaulted_str(sandbox_data.get('aws_secret_access_key').get('S')).strip(' \t\n\r')
+
+    for region in regions:
+        find_rhel_ami_in_region(sandbox, account_id, region, aws_access_key_id, aws_secret_access_key)
+
+def find_rhel_ami_in_region(sandbox, account_id, region, aws_access_key_id, aws_secret_access_key):
+    """Log whether the private RHEL-9.0 Access AMIs owned by 309956199498 are visible in the region"""
+    ec2_client = boto3.client(
+        'ec2',
+        region_name=region,
+        aws_access_key_id=aws_access_key_id,
+        aws_secret_access_key=aws_secret_access_key
+    )
+
+    # Single attempt by default. Raise max_retries to poll, e.g. 150 * 6 seconds is ~15 minutes
+    max_retries = 1
+    delay = 6
+
+    images = []
+    for attempt in range(max_retries):
+        try:
+            response = ec2_client.describe_images(
+                Owners=['309956199498'],
+                Filters=[
+                    {'Name': 'architecture', 'Values': ['x86_64']},
+                    {'Name': 'name', 'Values': ['RHEL-9.0*Access*']},
+                    {'Name': 'is-public', 'Values': ['false']}
+                ]
+            )
+            images = response.get('Images', [])
+
+            # If we got at least one image, break out of the loop
+            if images:
+                logger.info(f"Found {len(images)} matching image(s).", region=region, sandbox=sandbox, account_id=account_id)
+                break
+
+            logger.info(
+                f"Attempt {attempt + 1}/{max_retries}: No matching images yet. Retrying in {delay} seconds...",
+                region=region,
+                sandbox=sandbox,
+                account_id=account_id
+            )
+        except Exception as e:
+            logger.error(f"Encountered an error: {e}", region=region, sandbox=sandbox, account_id=account_id)
+
+        # Wait only when another attempt follows, never after the last one
+        if attempt + 1 < max_retries:
+            time.sleep(delay)
+
+    # After the loop, check if we found images
+    if not images:
+        logger.error("No AMIs found after all retries.", region=region, sandbox=sandbox, account_id=account_id)
+
+if __name__ == "__main__":
+    # Validate the single sandbox passed by name
+    if sandbox:
+        find_rhel_amis(sandbox, dynamodb, dynamodb_table)
+
+    if reservation:
+        for item in get_all_sandboxes(dynamodb, dynamodb_table):
+            if item.get('reservation', {}).get('S', '') == reservation:
+                find_rhel_amis(item.get('name').get('S'), dynamodb, dynamodb_table)
From 18d4f9ae4fd7c80eb97860314198c074c0c3e97a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Guillaume=20Cor=C3=A9?= Date: Fri, 31 Jan 2025 11:09:46 +0100 Subject: [PATCH 11/11] Address JK's comment --- playbooks/roles/infra-aws-sandbox/tasks/assume.yml | 2 +- playbooks/roles/infra-aws-sandbox/tasks/keypair.yml | 2 +- playbooks/roles/infra-aws-sandbox/tasks/regions.yml | 2 +- playbooks/roles/infra-aws-sandbox/tasks/reset.yml | 2 +- playbooks/roles/infra-aws-sandbox/tasks/route53.yml | 6 +++--- playbooks/roles/infra-aws-sandbox/tasks/validate.yaml | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) diff --git a/playbooks/roles/infra-aws-sandbox/tasks/assume.yml b/playbooks/roles/infra-aws-sandbox/tasks/assume.yml index 00f41438..8753b822 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/assume.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/assume.yml @@ -10,5 +10,5 @@ region: aws-global register: assumed_role retries: 5 - delay: "{{ 30|random(start=3, step=1) }}" + delay: 15 until: assumed_role is succeeded diff --git a/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml b/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml index
0ba94db6..b008479f 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/keypair.yml @@ -19,5 +19,5 @@ loop_var: _region register: r_import retries: 10 - delay: "{{ 10|random(start=3, step=1) }}" + delay: 10 until: r_import is succeeded diff --git a/playbooks/roles/infra-aws-sandbox/tasks/regions.yml b/playbooks/roles/infra-aws-sandbox/tasks/regions.yml index 9e3ab222..5d1af19d 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/regions.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/regions.yml @@ -11,7 +11,7 @@ register: _regions changed_when: false retries: 5 - delay: "{{ 30|random(start=3, step=1) }}" + delay: 15 until: _regions is succeeded - set_fact: diff --git a/playbooks/roles/infra-aws-sandbox/tasks/reset.yml b/playbooks/roles/infra-aws-sandbox/tasks/reset.yml index 10ecab2e..535fff3a 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/reset.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/reset.yml @@ -8,7 +8,7 @@ zone: "{{ account_name }}{{subdomain_base}}." register: _route53zone retries: 5 - delay: "{{ 60|random(start=3, step=1) }}" + delay: 30 until: _route53zone is succeeded - name: Cleanup DNS Zone diff --git a/playbooks/roles/infra-aws-sandbox/tasks/route53.yml b/playbooks/roles/infra-aws-sandbox/tasks/route53.yml index 6f5dd22c..8db8008f 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/route53.yml +++ b/playbooks/roles/infra-aws-sandbox/tasks/route53.yml @@ -9,7 +9,7 @@ zone: "{{ account_name }}{{subdomain_base}}." 
register: _route53zone retries: 10 - delay: "{{ 10|random(start=3, step=1) }}" + delay: 10 until: _route53zone is succeeded - set_fact: @@ -22,7 +22,7 @@ hosted_zone_id: "{{ _route53zone.zone_id }}" register: _route53facts retries: 5 - delay: "{{ 60|random(start=3, step=1) }}" + delay: 30 until: _route53facts is succeeded - name: Save NS records @@ -41,7 +41,7 @@ overwrite: true register: _route53zoneNS retries: 5 - delay: "{{ 60|random(start=3, step=1) }}" + delay: 30 until: _route53zoneNS is succeeded - name: Add HostedZoneId to the report diff --git a/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml b/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml index c8dbf34d..76eb8cee 100644 --- a/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml +++ b/playbooks/roles/infra-aws-sandbox/tasks/validate.yaml @@ -50,7 +50,7 @@ hosted_zone_id: "{{ sandbox_hosted_zone_id }}" register: _route53facts retries: 5 - delay: "{{ 60|random(start=3, step=1) }}" + delay: 30 until: _route53facts is succeeded - name: Validate route53 zone