diff --git a/.workshop-infra/Makefile b/.workshop-infra/Makefile new file mode 100644 index 0000000..a50a037 --- /dev/null +++ b/.workshop-infra/Makefile @@ -0,0 +1,85 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +STACK_NAME?="textract-transformers-workshop" +DEPLOYMENT_BUCKET_NAME?="UNDEFINED" +DEPLOYMENT_BUCKET_PREFIX?="" +TARGET_REPO?="https://github.com/aws-samples/amazon-textract-transformer-pipeline" + +target: + $(info ${HELP_MESSAGE}) + @exit 0 + +package: ##=> Build SAM template & assets to CloudFormation on S3 + $(info [*] Building AWS SAM stack...) + sam build \ --use-container \ --template template.sam.yaml && \ sam package \ --s3-bucket $(DEPLOYMENT_BUCKET_NAME) \ --s3-prefix $(DEPLOYMENT_BUCKET_PREFIX)sam \ --use-json \ --output-template-file template.tmp.json && \ python sam-postproc.py template.tmp.json template.tmp.json && \ aws s3 cp template.tmp.json \ s3://$(DEPLOYMENT_BUCKET_NAME)/$(DEPLOYMENT_BUCKET_PREFIX)template.cf.json + +# CloudFormation create-stack with --disable-rollback is faster for debugging than sam deploy +create: ##=> Create services stack (only) + $(info [*] Deploying...) + aws cloudformation create-stack \ --template-body file://template.tmp.json \ --stack-name $(STACK_NAME) \ --capabilities CAPABILITY_IAM CAPABILITY_AUTO_EXPAND \ --disable-rollback + # --parameters \ # ParameterKey=ParamName,ParameterValue=$(PARAM_VAR) + +deploy: ##=> Deploy services (flexible create or update) + $(info [*] Deploying...) + sam deploy \ --template-file template.tmp.json \ --stack-name $(STACK_NAME) \ --capabilities CAPABILITY_IAM CAPABILITY_AUTO_EXPAND \ --no-fail-on-empty-changeset + # --parameter-overrides \ # ParamName=$(PARAM_VAR) + +all: ##=> Build and create stack + @$(MAKE) package + @$(MAKE) create + +delete: ##=> Delete services + $(info [*] Deleting stack...) + aws cloudformation delete-stack --stack-name $(STACK_NAME) + + +############# # Helpers # ############# + +define HELP_MESSAGE + + STACK_NAME: "textract-transformers-workshop" + Description: Stack Name to deploy/redeploy to + DEPLOYMENT_BUCKET_NAME: + Description: Amazon S3 bucket for staging built SAM Lambda bundles and assets + DEPLOYMENT_BUCKET_PREFIX: "" + Description: For publishing to a prefix in your deployment bucket, instead of root. Should + include a trailing slash, e.g. 'my-prefix/' + TARGET_REPO: "https://github.com/aws-samples/amazon-textract-transformer-pipeline" + Description: Target repository where your workshop code lives + + Common usage: + + ...::: Build all SAM based services :::... + $ make package + + ...::: Deploy or re-deploy all SAM based services :::... + $ make deploy + + ...::: Create (cannot re-deploy) all SAM based services with rollback disabled :::... + $ make create + + ...::: Delete all SAM based services :::...
+ $ make delete +endef diff --git a/.workshop-infra/README.md b/.workshop-infra/README.md new file mode 100644 index 0000000..c96265f --- /dev/null +++ b/.workshop-infra/README.md @@ -0,0 +1,74 @@ +# Infrastructure for SageMaker Workshop with a CDK solution stack + +This folder provides a helper stack which will: + +- Create a SageMaker Notebook Instance with the repository cloned in +- Create an (IAM-authenticated) SageMaker Studio domain, with a user profile, with the repository cloned in (and some VPC infrastructure required to make that happen) +- Run a one-off AWS CodeBuild build to download the repository, `poetry install` the dependencies, and `cdk deploy --all` the stacks in the solution + +It's intended to help automate setting up workshops on temporary AWS accounts, with CDK-based solutions (like this one) that assume a SageMaker notebook environment will be provisioned separately. + +## Prerequisites and Caveats + +This helper stack assumes that (in your target AWS Region): + +- You have not yet onboarded to SageMaker Studio +- You have a default VPC with standard configuration that you're willing to use, or you're comfortable checking the compatibility of the stack with your custom VPC configuration. + +> ⚠️ This stack is oriented towards the convenience of **getting started** and first exploring SageMaker Studio with the companion solution stack. It is **not recommended for long-lived environments**. > > In particular, **be aware that:** > > - The stack grants broad power user permissions to the CodeBuild job (for whatever resources the CDK deployment may need to create) > - When you delete the stack: > - The SageMaker Studio setup for your target AWS Region will be deleted (and stack deletion should *fail* if any users are running 'apps' in Studio apart from the ones set up by the stack; you can manage these through the [SageMaker console UI](https://console.aws.amazon.com/sagemaker/home?#/studio)) > - The CDK solution deployed by the CodeBuild project will *not* automatically be cleaned up + +## Developing and Deploying Locally + +In addition to having an AWS account, you'll need an environment with: + +- The [AWS CLI](https://aws.amazon.com/cli/) +- The [AWS SAM CLI](https://docs.aws.amazon.com/serverless-application-model/latest/developerguide/serverless-sam-cli-install.html) +- A Docker-compatible container runtime such as [Docker Desktop](https://www.docker.com/products/docker-desktop) +- A `make` utility such as [GNU Make](https://www.gnu.org/software/make/) - probably already installed if your system has build tools. +- *Probably* a UNIX-like (non-Windows) shell if you want things to run smoothly... But you can always give it a try and resort to translating commands from the [Makefile](Makefile) if things go wrong.
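+
+(You can sanity-check that these tools are available on your PATH before starting:)
+
+```sh
+aws --version
+sam --version
+docker --version
+make --version
+```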
+ +You'll also need: + +- Sufficient access (log in with `aws configure`) to deploy the stacks in your target region +- An *[Amazon S3](https://s3.console.aws.amazon.com/s3/home) Bucket* to use for staging deployment assets (Lambda bundles, etc) + +**Step 1: Build the Lambda bundles and final CloudFormation template to S3 with AWS SAM** + +(This command builds your assets and CloudFormation template, and stages them to your nominated Amazon S3 bucket) + +```sh +make package DEPLOYMENT_BUCKET_NAME=DOC-EXAMPLE-BUCKET +``` + +**Step 2: Deploy (create or update) the stack** + +```sh +make deploy STACK_NAME=workshopstack +``` + +**Alternative: Build and create the stack in one go** + +(This option only *creates* stacks, and disables rollback, for easier debugging) + +```sh +make all DEPLOYMENT_BUCKET_NAME=DOC-EXAMPLE-BUCKET STACK_NAME=workshopstack +``` + +There's also a `make delete` option to help with cleaning up. + +## Preparing Templates for Multi-Region Deployment + +If you'd like your template to be deployable in multiple AWS Regions: + +- Set up an asset hosting bucket in each region of interest, and include the AWS Region ID (e.g. `us-east-1`) in the bucket names +- Set up cross-region replication to copy contents from your lead region to other regions +- Run the `make package` script against your lead region + +The generated template will be automatically post-processed (by [sam-postproc.py](sam-postproc.py)) to replace the region suffix in hosted asset S3 URIs with the `${AWS::Region}` placeholder. diff --git a/.workshop-infra/fn-codebuild-run/main.py b/.workshop-infra/fn-codebuild-run/main.py new file mode 100644 index 0000000..7bcc708 --- /dev/null +++ b/.workshop-infra/fn-codebuild-run/main.py @@ -0,0 +1,85 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +"""Custom CloudFormation Resource to kick off CodeBuild project builds + +This custom resource expects a 'ProjectName' property, and will simply kick off a run of that AWS CodeBuild Project on creation. It doesn't wait for the run to complete successfully, and it doesn't do anything on resource UPDATE/DELETE.
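+
+For illustration, wiring this resource into a template might look like the snippet below (logical
+names here are hypothetical; this mirrors the CodeBuildTrigger resource in template.sam.yaml):
+
+    MyBuildTrigger:
+      Type: 'Custom::CodeBuildTrigger'
+      Properties:
+        ServiceToken: !GetAtt CodeBuildTriggerFunction.Arn
+        ProjectName: !Ref MyCodeBuildProject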
+""" +# Python Built-Ins: +import logging +import traceback + +# External Dependencies: +import boto3 +import cfnresponse + +codebuild = boto3.client("codebuild") + + +def lambda_handler(event, context): + try: + request_type = event["RequestType"] + if request_type == "Create": + handle_create(event, context) + elif request_type == "Update": + handle_update(event, context) + elif request_type == "Delete": + handle_delete(event, context) + else: + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + error=f"Unsupported CFN RequestType '{request_type}'", + ) + except Exception as e: + logging.error("Uncaught exception in CFN custom resource handler - reporting failure") + traceback.print_exc() + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + error=str(e), + ) + raise e + + +def handle_create(event, context): + logging.info("**Received create request") + resource_config = event["ResourceProperties"] + logging.info("**Running CodeBuild Job") + result = codebuild.start_build( + projectName=resource_config["ProjectName"], + ) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=result["build"]["arn"], + ) + + +def handle_delete(event, context): + logging.info("**Received delete event - no-op") + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) + + +def handle_update(event, context): + logging.info("**Received update event - no-op") + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) diff --git a/.workshop-infra/fn-codebuild-run/requirements.txt b/.workshop-infra/fn-codebuild-run/requirements.txt new file mode 100644 index 0000000..6ef298a --- /dev/null +++ b/.workshop-infra/fn-codebuild-run/requirements.txt @@ -0,0 +1 @@ +# Nothing else required beyond common Lambda layer diff --git a/.workshop-infra/fn-domain/main.py b/.workshop-infra/fn-domain/main.py new file mode 100644 index 0000000..7d511d8 --- /dev/null +++ b/.workshop-infra/fn-domain/main.py @@ -0,0 +1,266 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 +"""Custom CloudFormation Resource for a SageMaker Studio Domain (with additional outputs) + +As well as creating a SMStudio domain, this implementation: +- Defaults to the default VPC, or to any VPC when exactly one is present, if not explicitly + configured +- Defaults to all default subnets if any are present, or else all subnets in VPC, if not + explicitly set +- Discovers and outputs a list of security group IDs (default+SM-generated) that downstream + resources may use to perform user setup actions on the Elastic File System +""" +# Python Built-Ins: +import logging +import time +import traceback + +# External Dependencies: +import boto3 +import cfnresponse + +# Local Dependencies: +import vpctools + +ec2 = boto3.client("ec2") +smclient = boto3.client("sagemaker") + + +def lambda_handler(event, context): + try: + request_type = event["RequestType"] + if request_type == "Create": + handle_create(event, context) + elif request_type == "Update": + handle_update(event, context) + elif request_type == "Delete": + handle_delete(event, context) + else: + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + error=f"Unsupported CFN RequestType '{request_type}'", + ) + except Exception as e: + logging.error("Uncaught exception in CFN custom resource handler - reporting failure") + traceback.print_exc() + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + error=str(e), + ) + raise e + + +def handle_create(event, context): + logging.info("**Received create request") + resource_config = event["ResourceProperties"] + + # We split out pre- and post-processing because we'd like to always report our correct + # physicalResourceId if erroring out after the actual creation, so that the subsequent deletion + # request can clean up. 
+ logging.info("**Preparing studio domain creation parameters") + create_domain_args = preprocess_create_domain_args(resource_config) + logging.info("**Creating studio domain") + creation = smclient.create_domain(**create_domain_args) + _, _, domain_id = creation["DomainArn"].rpartition("/") + try: + result = post_domain_create(domain_id) + domain_desc = result["DomainDescription"] + response = { + "DomainId": domain_desc["DomainId"], + "DomainName": domain_desc["DomainName"], + "HomeEfsFileSystemId": domain_desc["HomeEfsFileSystemId"], + "SubnetIds": ",".join(domain_desc["SubnetIds"]), + "Url": domain_desc["Url"], + "VpcId": domain_desc["VpcId"], + "ProposedAdminSubnetCidr": result["ProposedAdminSubnetCidr"], + "InboundEFSSecurityGroupId": result["InboundEFSSecurityGroupId"], + "OutboundEFSSecurityGroupId": result["OutboundEFSSecurityGroupId"], + } + print(response) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + response, + physicalResourceId=domain_id, + ) + except Exception as e: + logging.error("Uncaught exception in post-creation processing") + traceback.print_exc() + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + physicalResourceId=domain_id, + error=str(e), + ) + + +def handle_delete(event, context): + logging.info("**Received delete event") + domain_id = event["PhysicalResourceId"] + try: + smclient.describe_domain(DomainId=domain_id) + except smclient.exceptions.ResourceNotFound as exception: + # Already does not exist -> deletion success + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) + return + logging.info("**Deleting studio domain") + delete_domain(domain_id) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) + + +def handle_update(event, context): + logging.info("**Received update event") + domain_id = event["PhysicalResourceId"] + default_user_settings = event["ResourceProperties"]["DefaultUserSettings"] + logging.info("**Updating studio domain") + update_domain(domain_id, default_user_settings) + # TODO: Should we wait here for the domain to enter active state again? + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {"DomainId": domain_id}, + physicalResourceId=event["PhysicalResourceId"], + ) + + +def preprocess_create_domain_args(config): + default_user_settings = config["DefaultUserSettings"] + domain_name = config["DomainName"] + vpc_id = config.get("VPC") + subnet_ids = config.get("SubnetIds") + + if not vpc_id: + # Try to look up the default VPC ID: + # TODO: NextToken handling on this list API? + available_vpcs = ec2.describe_vpcs()["Vpcs"] + if len(available_vpcs) <= 0: + raise ValueError("No default VPC exists - cannot create SageMaker Studio Domain") + + default_vpcs = list(filter(lambda v: v["IsDefault"], available_vpcs)) + if len(default_vpcs) == 1: + vpc = default_vpcs[0] + elif len(default_vpcs) > 1: + raise ValueError("'VPC' not specified in config, and multiple default VPCs found") + else: + if len(available_vpcs) == 1: + vpc = available_vpcs[0] + logging.warning(f"Found exactly one (non-default) VPC: Using {vpc['VpcId']}") + else: + raise ValueError( + "'VPC' not specified in config, and multiple VPCs found with no 'default' VPC" + ) + vpc_id = vpc["VpcId"] + + if not subnet_ids: + # Use all the subnets + # TODO: NextToken handling on this list API? 
+ available_subnets = ec2.describe_subnets( + Filters=[ + { + "Name": "vpc-id", + "Values": [vpc_id], + } + ], + )["Subnets"] + default_subnets = list(filter(lambda n: n["DefaultForAz"], available_subnets)) + subnet_ids = [ + n["SubnetId"] + for n in (default_subnets if len(default_subnets) > 0 else available_subnets) + ] + elif isinstance(subnet_ids, str): + subnet_ids = subnet_ids.split(",") + + return { + "DomainName": domain_name, + "AuthMode": "IAM", + "DefaultUserSettings": default_user_settings, + "SubnetIds": subnet_ids, + "VpcId": vpc_id, + } + + +def post_domain_create(domain_id): + created = False + time.sleep(0.2) + while not created: + description = smclient.describe_domain(DomainId=domain_id) + status_lower = description["Status"].lower() + if status_lower == "inservice": + created = True + break + elif "fail" in status_lower: + raise ValueError(f"Domain {domain_id} entered failed status") + time.sleep(5) + logging.info("**SageMaker domain created successfully: %s", domain_id) + + vpc_id = description["VpcId"] + # Retrieve the VPC security groups set up by SageMaker for EFS communication: + inbound_efs_sg_id, outbound_efs_sg_id = vpctools.get_studio_efs_security_group_ids( + domain_id, + vpc_id, + ) + # Propose a valid subnet to create in this VPC for managing further setup actions: + proposed_admin_subnet = vpctools.propose_subnet(vpc_id) + return { + "DomainDescription": description, + "ProposedAdminSubnetCidr": proposed_admin_subnet["CidrBlock"], + "InboundEFSSecurityGroupId": inbound_efs_sg_id, + "OutboundEFSSecurityGroupId": outbound_efs_sg_id, + } + + +def delete_domain(domain_id): + response = smclient.delete_domain( + DomainId=domain_id, + RetentionPolicy={"HomeEfsFileSystem": "Delete"}, + ) + deleted = False + time.sleep(0.2) + while not deleted: + try: + smclient.describe_domain(DomainId=domain_id) + except smclient.exceptions.ResourceNotFound: + logging.info(f"Deleted domain {domain_id}") + deleted = True + break + time.sleep(5) + return response + + +def update_domain(domain_id, default_user_settings): + response = smclient.update_domain( + DomainId=domain_id, + DefaultUserSettings=default_user_settings, + ) + updated = False + time.sleep(0.2) + while not updated: + response = smclient.describe_domain(DomainId=domain_id) + if response["Status"] == "InService": + updated = True + else: + logging.info("Updating domain %s.. %s", domain_id, response["Status"]) + time.sleep(5) + return response diff --git a/.workshop-infra/fn-domain/requirements.txt b/.workshop-infra/fn-domain/requirements.txt new file mode 100644 index 0000000..6ef298a --- /dev/null +++ b/.workshop-infra/fn-domain/requirements.txt @@ -0,0 +1 @@ +# Nothing else required beyond common Lambda layer diff --git a/.workshop-infra/fn-domain/vpctools.py b/.workshop-infra/fn-domain/vpctools.py new file mode 100644 index 0000000..6fb99ce --- /dev/null +++ b/.workshop-infra/fn-domain/vpctools.py @@ -0,0 +1,145 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 +"""Utilities for analyzing VPCs for use with SageMaker Studio""" +# Python Built-Ins: +import ipaddress +from typing import Tuple, Union + +# External Dependencies: +import boto3 + +ec2 = boto3.client("ec2") + + +def get_studio_efs_security_group_ids( + studio_domain_id: str, vpc_id: str +) -> Tuple[Union[str, None], Union[str, None]]: + """Retrieve the security groups you need for [inbound, outbound] comms with SMStudio EFS + + Returns + ------- + inbound : Union[str, None] + Security Group ID for inbound connection from SMStudio filesystem, or None if not found + outbound : Union[str, None] + Security Group ID for outbound connection to SMStudio filesystem, or None if not found + + Raises + ------ + ValueError : + If multiple potential SGs are found for either inbound or outbound connection (suggests + duplication or otherwise erroneous SMStudio/VPC setup). + Other : + As per boto3 EC2 describe_security_groups() + """ + inbound_sg_name = f"security-group-for-inbound-nfs-{studio_domain_id}" + outbound_sg_name = f"security-group-for-outbound-nfs-{studio_domain_id}" + nfs_sgs = ec2.describe_security_groups( + Filters=[ + {"Name": "vpc-id", "Values": [vpc_id]}, + {"Name": "group-name", "Values": [inbound_sg_name, outbound_sg_name]}, + ], + )["SecurityGroups"] + inbound_sgs = list( + filter( + lambda sg: sg["GroupName"] == inbound_sg_name, + nfs_sgs, + ) + ) + n_inbound_sgs = len(inbound_sgs) + outbound_sgs = list( + filter( + lambda sg: sg["GroupName"] == outbound_sg_name, + nfs_sgs, + ) + ) + n_outbound_sgs = len(outbound_sgs) + if n_inbound_sgs > 1 or n_outbound_sgs > 1: + raise ValueError( + "Found duplicate EFS security groups for SMStudio {}: Got {} inbound, {} outbound".format( + studio_domain_id, + n_inbound_sgs, + n_outbound_sgs, + ) + ) + return ( + inbound_sgs[0]["GroupId"] if n_inbound_sgs else None, + outbound_sgs[0]["GroupId"] if n_outbound_sgs else None, + ) + + +def propose_subnet(vpc_id, new_subnet_prefixlen=26): + """Propose a valid configuration for a new IPv4 subnet to add to the VPC for CF stack purposes + + Parameters + ---------- + vpc_id : str + ID of the VPC to propose a subnet for + new_subnet_prefixlen : int (optional) + CIDR mask length in bits for requested new subnet to propose.
Defaults to 26 bits (64 IPs) + """ + + # Get VPC info: + vpc_list = ec2.describe_vpcs( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], + )["Vpcs"] + if not len(vpc_list): + raise ValueError(f"VPC ID {vpc_id} not found") + vpc_description = vpc_list[0] + existing_subnets = ec2.describe_subnets( + Filters=[{"Name": "vpc-id", "Values": [vpc_id]}], + )["Subnets"] + + # Load CIDRs of provided VPC and existing subnets with Python ipaddress library: + vpc_net = ipaddress.ip_network(vpc_description["CidrBlock"]) + existing_nets = list( + map( + lambda subnet: ipaddress.ip_network(subnet["CidrBlock"]), + existing_subnets, + ) + ) + + # Validate existing configuration: + # (Could probably skip this since we just retrieved fresh data, but might help to prevent any + # weird errors manifesting as harder-to-interpret issues further down) + for subnet in existing_nets: + if not subnet.subnet_of(vpc_net): + raise ValueError(f"Listed 'subnet' {subnet} is not inside VPC {vpc_net}") + for checknet in existing_nets: + if checknet != subnet and subnet.overlaps(checknet): + raise ValueError(f"Listed subnets {subnet} and {checknet} overlap") + + # Calculate remaining vacant ranges: + available_nets = [vpc_net] + for subnet in existing_nets: + next_available = [] + for vacancy in available_nets: + if vacancy.subnet_of(subnet): + # This gap is fully contained by `subnet` + continue + try: + # Preserve the list of subranges in `vacancy` after excluding `subnet`: + next_available += list(vacancy.address_exclude(subnet)) + except ValueError: + # This `vacancy` does not contain `subnet`: + next_available.append(vacancy) + available_nets = next_available + available_nets.sort() + + # Select the first available subnet of requested size: + try: + parent = next( + filter( + lambda n: n.prefixlen <= new_subnet_prefixlen, + available_nets, + ) + ) + except StopIteration: + raise ValueError(f"No vacant subnets of requested size /{new_subnet_prefixlen} left in VPC") + + if parent.prefixlen == new_subnet_prefixlen: + proposed_net = parent + else: + diff = new_subnet_prefixlen - parent.prefixlen + proposed_net = next(parent.subnets(diff)) + + return {"CidrBlock": str(proposed_net)} diff --git a/.workshop-infra/fn-user/main.py b/.workshop-infra/fn-user/main.py new file mode 100644 index 0000000..b996945 --- /dev/null +++ b/.workshop-infra/fn-user/main.py @@ -0,0 +1,194 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
+# SPDX-License-Identifier: MIT-0 +"""Custom CloudFormation Resource for a SageMaker Studio User Profile""" +# Python Built-Ins: +import logging +import time +import traceback + +# External Dependencies: +import boto3 +import cfnresponse + +smclient = boto3.client("sagemaker") + + +def lambda_handler(event, context): + try: + request_type = event["RequestType"] + if request_type == "Create": + handle_create(event, context) + elif request_type == "Update": + handle_update(event, context) + elif request_type == "Delete": + handle_delete(event, context) + else: + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + error=f"Unsupported CFN RequestType '{request_type}'", + ) + except Exception as e: + logging.error("Uncaught exception in CFN custom resource handler - reporting failure") + traceback.print_exc() + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + error=str(e), + ) + raise e + + +def handle_create(event, context): + logging.info("**Received create request") + resource_config = event["ResourceProperties"] + + logging.info("**Creating user profile") + result = create_user_profile(resource_config) + # (create_user_profile() polls until the profile is InService) + response = { + "UserProfileName": result["UserProfileName"], + "HomeEfsFileSystemUid": result["HomeEfsFileSystemUid"], + } + print(response) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + response, + physicalResourceId=result["UserProfileName"], + ) + + +def handle_delete(event, context): + logging.info("**Received delete event") + user_profile_name = event["PhysicalResourceId"] + domain_id = event["ResourceProperties"]["DomainId"] + try: + smclient.describe_user_profile(DomainId=domain_id, UserProfileName=user_profile_name) + except smclient.exceptions.ResourceNotFound: + # Already does not exist -> deletion success + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) + return + delete_user_profile(domain_id, user_profile_name) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) + + +def handle_update(event, context): + logging.info("**Received update event") + user_profile_name = event["PhysicalResourceId"] + domain_id = event["ResourceProperties"]["DomainId"] + user_settings = event["ResourceProperties"]["UserSettings"] + update_user_profile(domain_id, user_profile_name, user_settings) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) + + +def create_user_profile(config): + domain_id = config["DomainId"] + user_profile_name = config["UserProfileName"] + user_settings = config["UserSettings"] + + response = smclient.create_user_profile( + DomainId=domain_id, + UserProfileName=user_profile_name, + UserSettings=user_settings, + ) + created = False + time.sleep(0.2) + while not created: + response = smclient.describe_user_profile( + DomainId=domain_id, + UserProfileName=user_profile_name, + ) + status_lower = response["Status"].lower() + if status_lower == "inservice": + created = True + break + elif "failed" in status_lower: + raise ValueError( + "User '%s' entered Failed state during creation (domain %s)" + % (user_profile_name, domain_id) + ) + time.sleep(5) + + logging.info("**SageMaker user profile '%s' created successfully (domain %s)", user_profile_name, domain_id) + return response + + +def delete_user_profile(domain_id, user_profile_name): + response = smclient.delete_user_profile( + DomainId=domain_id, + UserProfileName=user_profile_name, + ) +
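# Deletion is asynchronous: poll describe_user_profile() until it raises ResourceNotFound. +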
deleted = False + time.sleep(0.2) + while not deleted: + try: + response = smclient.describe_user_profile( + DomainId=domain_id, + UserProfileName=user_profile_name, + ) + status_lower = response["Status"].lower() + if "failed" in status_lower: + raise ValueError( + "User '%s' entered Failed state during deletion (domain %s)" + % (user_profile_name, domain_id) + ) + elif "deleting" not in status_lower: + raise ValueError( + "User '%s' no longer 'Deleting' but not deleted (domain %s)" + % (user_profile_name, domain_id) + ) + except smclient.exceptions.ResourceNotFound: + logging.info("Deleted user %s from domain %s", user_profile_name, domain_id) + deleted = True + break + time.sleep(5) + return response + + +def update_user_profile(domain_id, user_profile_name, user_settings): + response = smclient.update_user_profile( + DomainId=domain_id, + UserProfileName=user_profile_name, + UserSettings=user_settings, + ) + updated = False + time.sleep(0.2) + while not updated: + response = smclient.describe_user_profile( + DomainId=domain_id, + UserProfileName=user_profile_name, + ) + status_lower = response["Status"].lower() + if status_lower == "inservice": + updated = True + break + elif "failed" in status_lower: + raise ValueError( + "User '%s' entered Failed state during update (domain %s)" + % (user_profile_name, domain_id) + ) + time.sleep(5) + return response diff --git a/.workshop-infra/fn-user/requirements.txt b/.workshop-infra/fn-user/requirements.txt new file mode 100644 index 0000000..6ef298a --- /dev/null +++ b/.workshop-infra/fn-user/requirements.txt @@ -0,0 +1 @@ +# Nothing else required beyond common Lambda layer diff --git a/.workshop-infra/fn-usersetup/main.py b/.workshop-infra/fn-usersetup/main.py new file mode 100644 index 0000000..b6347ae --- /dev/null +++ b/.workshop-infra/fn-usersetup/main.py @@ -0,0 +1,160 @@ +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +"""Custom CloudFormation Resource for post-creation setup of a SageMaker Studio user + +Clones a (public) 'GitRepository' into the user's home folder. + +Updating or deleting this resource does not currently do anything. Errors in the setup process are also ignored (we typically don't want to roll back the whole stack just because a repo couldn't be cloned - users can always do it manually!)
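+
+Expected resource properties (as consumed by create_user_setup() below):
+
+- 'DomainId': ID of the SageMaker Studio domain
+- 'UserProfileName': Name of the Studio user profile to set up
+- 'GitRepository': URL of the public Git repository to clone into the user's home folder
+- 'HomeEfsFileSystemUid': POSIX UID of the user's home folder on the Studio EFS filesystem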
+""" +# Python Built-Ins: +import logging +import os +import traceback + +# External Dependencies: +import boto3 +import cfnresponse +from git import Repo + +smclient = boto3.client("sagemaker") + + +def lambda_handler(event, context): + try: + request_type = event["RequestType"] + if request_type == "Create": + handle_create(event, context) + elif request_type == "Update": + handle_update(event, context) + elif request_type == "Delete": + handle_delete(event, context) + else: + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + error=f"Unsupported CFN RequestType '{request_type}'", + ) + except Exception as e: + logging.error("Uncaught exception in CFN custom resource handler - reporting failure") + traceback.print_exc() + cfnresponse.send( + event, + context, + cfnresponse.FAILED, + {}, + error=str(e), + ) + raise e + + +def handle_create(event, context): + logging.info("**Received create request") + resource_config = event["ResourceProperties"] + logging.info("**Setting up user") + result = create_user_setup(resource_config) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {"UserProfileName": result["UserProfileName"]}, + physicalResourceId=result["UserProfileName"], + ) + + +def handle_delete(event, context): + logging.info("**Received delete event") + user_profile_name = event["PhysicalResourceId"] + domain_id = event["ResourceProperties"]["DomainId"] + logging.info("**Deleting user setup") + delete_user_setup(domain_id, user_profile_name) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) + + +def handle_update(event, context): + logging.info("**Received update event") + user_profile_name = event["PhysicalResourceId"] + domain_id = event["ResourceProperties"]["DomainId"] + git_repo = event["ResourceProperties"]["GitRepository"] + logging.info("**Updating user setup") + update_user_setup(domain_id, user_profile_name, git_repo) + cfnresponse.send( + event, + context, + cfnresponse.SUCCESS, + {}, + physicalResourceId=event["PhysicalResourceId"], + ) + + +def chown_recursive(path, uid=-1, gid=-1): + """Workaround for os.chown() not having a recursive option for folders""" + for dirpath, dirnames, filenames in os.walk(path): + os.chown(dirpath, uid, gid) + for filename in filenames: + os.chown(os.path.join(dirpath, filename), uid, gid) + + +def create_user_setup(config): + domain_id = config["DomainId"] + user_profile_name = config["UserProfileName"] + git_repo = config["GitRepository"] + efs_uid = config["HomeEfsFileSystemUid"] + print(f"Setting up user: {config}") + try: + # The root of the EFS contains folders named for each user UID, but these may not be + # created before the user has first logged in (could os.listdir("/mnt/efs") to check): + print("Creating/checking home folder...") + home_folder = f"/mnt/efs/{efs_uid}" + os.makedirs(home_folder, exist_ok=True) + # Set correct ownership permissions for this folder straight away, in case a later process + # errors out + os.chown(home_folder, int(efs_uid), -1) + + # Now ready to clone in Git content (or whatever else...) + print(f"Cloning code... 
{git_repo}") + # Our target folder for Repo.clone_from() needs to be the *actual* target folder, not the + # parent under which a new folder will be created, so we'll infer that from the repo name: + repo_folder_name = git_repo.rpartition("/")[2] + if repo_folder_name.lower().endswith(".git"): + repo_folder_name = repo_folder_name[: -len(".git")] + Repo.clone_from(git_repo, f"{home_folder}/{repo_folder_name}") + + # Remember to set ownership/permissions for all the stuff we just created, to give the user + # write access: + chown_recursive(f"{home_folder}/{repo_folder_name}", uid=int(efs_uid)) + print("All done") + except Exception as e: + # Don't bring the entire CF stack down just because we couldn't copy a repo: + print("IGNORING CONTENT SETUP ERROR") + traceback.print_exc() + + logging.info("**SageMaker Studio user '%s' set up successfully", user_profile_name) + return {"UserProfileName": user_profile_name} + + +def delete_user_setup(domain_id, user_profile_name): + logging.info( + "**Deleting user setup is a no-op: user '%s' on domain '%s", + user_profile_name, + domain_id, + ) + return {"UserProfileName": user_profile_name} + + +def update_user_setup(domain_id, user_profile_name, git_repo): + logging.info( + "**Updating user setup is a no-op: user '%s' on domain '%s", + user_profile_name, + domain_id, + ) + return {"UserProfileName": user_profile_name} diff --git a/.workshop-infra/fn-usersetup/requirements.txt b/.workshop-infra/fn-usersetup/requirements.txt new file mode 100644 index 0000000..afd09d3 --- /dev/null +++ b/.workshop-infra/fn-usersetup/requirements.txt @@ -0,0 +1,4 @@ +# GitPython provides Python bindings for git *assuming you already have the git binaries installed* +# - We've handled this via a 3rd party Lambda Layer, but you could instead consider instead using a +# PyPI package like 'lambda-git' which bundles binaries. +gitpython>=3.1,<4 diff --git a/.workshop-infra/lambda-common/cfnresponse.py b/.workshop-infra/lambda-common/cfnresponse.py new file mode 100644 index 0000000..6346791 --- /dev/null +++ b/.workshop-infra/lambda-common/cfnresponse.py @@ -0,0 +1,54 @@ +# Copyright Amazon Web Services, Inc. or its affiliates. All Rights Reserved. +# This file is licensed to you under the AWS Customer Agreement (the "License"). +# You may not use this file except in compliance with the License. +# A copy of the License is located at http://aws.amazon.com/agreement/ . +# This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, express or implied. +# See the License for the specific language governing permissions and limitations under the License. 
+ +import urllib3 +import json + +http = urllib3.PoolManager() +SUCCESS = "SUCCESS" +FAILED = "FAILED" + + +def send( + event, context, responseStatus, responseData, physicalResourceId=None, noEcho=False, error=None +): + responseUrl = event["ResponseURL"] + + print(responseUrl) + + responseBody = {} + responseBody["Status"] = responseStatus + if error is None: + responseBody["Reason"] = ( + "See the details in CloudWatch Log Stream: " + + context.log_stream_name + + " LogGroup: " + + context.log_group_name + ) + else: + responseBody["Reason"] = error + responseBody["PhysicalResourceId"] = physicalResourceId or context.log_stream_name + responseBody["StackId"] = event["StackId"] + responseBody["RequestId"] = event["RequestId"] + responseBody["LogicalResourceId"] = event["LogicalResourceId"] + responseBody["NoEcho"] = noEcho + responseBody["Data"] = responseData + + json_responseBody = json.dumps(responseBody) + + print("Response body:\n" + json_responseBody) + + headers = {"content-type": "", "content-length": str(len(json_responseBody))} + + try: + response = http.request( + "PUT", responseUrl, body=json_responseBody.encode("utf-8"), headers=headers + ) + print("Status code: " + str(response.status)) + except Exception as e: + print("send(..) failed executing http.request(..): " + str(e)) diff --git a/.workshop-infra/lambda-common/requirements.txt b/.workshop-infra/lambda-common/requirements.txt new file mode 100644 index 0000000..d04a601 --- /dev/null +++ b/.workshop-infra/lambda-common/requirements.txt @@ -0,0 +1 @@ +# Nothing else required beyond standard AWS Lambda environment (boto3, etc) diff --git a/.workshop-infra/sam-postproc.py b/.workshop-infra/sam-postproc.py new file mode 100644 index 0000000..ec36e90 --- /dev/null +++ b/.workshop-infra/sam-postproc.py @@ -0,0 +1,75 @@ +#!/usr/bin/python +# Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +# SPDX-License-Identifier: MIT-0 +"""CLI tool to tokenize AWS regions in asset S3 URIs generated by AWS SAM + +This enables easy cross-region deployment, by setting up cross-region-replicated asset hosting +buckets with region codes in the names. + +You could probably do much the same thing in shell with a tool like jq, but Python gives lots of +flexibility to customize and extend where needed. +""" +# Python Built-Ins: +import argparse +import json +import re + +AWS_REGION_SUFFIX_REGEX = ( + r"(?:af|ap|ca|eu|me|sa|us)-(?:central|north|south|(?:north|south)?east|(?:north|south)?west)-[1-3]$" +) + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Utility to parse SAM-generated JSON templates to multi-region-ify assets" + ) + parser.add_argument( + "infile", + type=str, + help="Path to input JSON template file generated by AWS SAM", + ) + parser.add_argument( + "outfile", + type=str, + help="Path to output file to save modified template", + ) + return parser.parse_args() + + +def main(args): + print(f"Loading input template...
{args.infile}") + with open(args.infile, "r") as fin: + template = json.loads(fin.read()) + + print("\nAdjusting region-suffixed asset URIs...") + resources = template.get("Resources", {}) + n_edited = 0 + for resname in resources: + resprops = resources[resname].get("Properties", {}) + for asset_attr in ("ContentUri", "CodeUri"): + if ( + asset_attr in resprops + and isinstance(resprops[asset_attr], str) + and resprops[asset_attr].lower().startswith("s3://") + ): + bucket, _, key = resprops[asset_attr][len("s3://") :].partition("/") + + bucket_tokenized = re.sub( + AWS_REGION_SUFFIX_REGEX, + r"${AWS::Region}", + bucket, + ) + if bucket != bucket_tokenized: + resprops[asset_attr] = {"Bucket": {"Fn::Sub": bucket_tokenized}, "Key": key} + n_edited += 1 + print(f" - Region-tokenized {resname}.{asset_attr}") + print(f"\nEdited {n_edited} resource properties\n") + + print(f"Writing output to {args.outfile}") + with open(args.outfile, "w") as fout: + fout.write(json.dumps(template, indent=2)) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/.workshop-infra/template.sam.yaml b/.workshop-infra/template.sam.yaml new file mode 100644 index 0000000..833b2a9 --- /dev/null +++ b/.workshop-infra/template.sam.yaml @@ -0,0 +1,541 @@ +--- +AWSTemplateFormatVersion: '2010-09-09' +Transform: 'AWS::Serverless-2016-10-31' + +Description: >- + Workshop CDK bootstrap and deployer stack + +Parameters: + NotebookName: + Type: String + Default: WorkshopNotebook + Description: Enter the name of the SageMaker notebook instance. Default is WorkshopNotebook. + + TargetRepo: + Type: String + Default: https://github.com/aws-samples/amazon-textract-transformer-pipeline + Description: URL of code repository e.g. https://github.com/my-user/my-repo + + VpcId: + Type: String + Description: VPC ID to use (e.g. vpc-xxxxxx), or blank to use default. + Default: '' + + SubnetIds: + Type: String + Description: >- + Comma-separated list of subnet IDs to use (e.g. subnet-xxxxxx), or blank to use all default subnets. 
+ Default: '' + +Mappings: + RegionMap: + us-east-1: + datascience: 'arn:aws:sagemaker:us-east-1:081325390199:image/datascience-1.0' + us-east-2: + datascience: 'arn:aws:sagemaker:us-east-2:429704687514:image/datascience-1.0' + us-west-1: + datascience: 'arn:aws:sagemaker:us-west-1:742091327244:image/datascience-1.0' + us-west-2: + datascience: 'arn:aws:sagemaker:us-west-2:236514542706:image/datascience-1.0' + af-south-1: + datascience: 'arn:aws:sagemaker:af-south-1:559312083959:image/datascience-1.0' + ap-east-1: + datascience: 'arn:aws:sagemaker:ap-east-1:493642496378:image/datascience-1.0' + ap-south-1: + datascience: 'arn:aws:sagemaker:ap-south-1:394103062818:image/datascience-1.0' + ap-northeast-2: + datascience: 'arn:aws:sagemaker:ap-northeast-2:806072073708:image/datascience-1.0' + ap-southeast-1: + datascience: 'arn:aws:sagemaker:ap-southeast-1:492261229750:image/datascience-1.0' + ap-southeast-2: + datascience: 'arn:aws:sagemaker:ap-southeast-2:452832661640:image/datascience-1.0' + ap-northeast-1: + datascience: 'arn:aws:sagemaker:ap-northeast-1:102112518831:image/datascience-1.0' + ca-central-1: + datascience: 'arn:aws:sagemaker:ca-central-1:310906938811:image/datascience-1.0' + eu-central-1: + datascience: 'arn:aws:sagemaker:eu-central-1:936697816551:image/datascience-1.0' + eu-west-1: + datascience: 'arn:aws:sagemaker:eu-west-1:470317259841:image/datascience-1.0' + eu-west-2: + datascience: 'arn:aws:sagemaker:eu-west-2:712779665605:image/datascience-1.0' + eu-west-3: + datascience: 'arn:aws:sagemaker:eu-west-3:615547856133:image/datascience-1.0' + eu-north-1: + datascience: 'arn:aws:sagemaker:eu-north-1:243637512696:image/datascience-1.0' + eu-south-1: + datascience: 'arn:aws:sagemaker:eu-south-1:592751261982:image/datascience-1.0' + sa-east-1: + datascience: 'arn:aws:sagemaker:sa-east-1:782484402741:image/datascience-1.0' + +Resources: + +######## COMMON RESOURCES ######## + LambdaExecutionPolicy: + # TODO: Scope setup Lambda permissions down further + # Would be good to scope more perms down to the current region & account ID for e.g. 
+ Type: 'AWS::IAM::ManagedPolicy' + Properties: + Path: / + PolicyDocument: + Version: '2012-10-17' + Statement: + - Sid: CloudWatchLogsPermissions + Effect: Allow + Action: + - logs:CreateLogGroup + - logs:CreateLogStream + - logs:PutLogEvents + Resource: !Sub 'arn:${AWS::Partition}:logs:*:*:*' + - Sid: GetVPCsAndSubnets + Effect: Allow + Action: + - ec2:AssignPrivateIpAddresses # (Only needed for user post-setup) + - ec2:CreateNetworkInterface # (Only needed for user post-setup) + - ec2:DeleteNetworkInterface # (Only needed for user post-setup) + - ec2:DescribeNetworkInterfaces # (Only needed for user post-setup) + - ec2:DescribeSecurityGroups + - ec2:DescribeSubnets + - ec2:DescribeVpcs + - ec2:UnassignPrivateIpAddresses # (Only needed for user post-setup) + Resource: + - '*' + - Sid: StudioEFSWrite # (Only needed for user post-setup) + Effect: Allow + Action: + - elasticfilesystem:ClientMount + - elasticfilesystem:ClientRootAccess + - elasticfilesystem:ClientWrite + - elasticfilesystem:DescribeMountTargets + Resource: '*' # TODO: Restrict + - Sid: SageMakerDomainPermission + Effect: Allow + Action: + - sagemaker:CreateDomain + - sagemaker:DeleteDomain + - sagemaker:DescribeDomain + - sagemaker:UpdateDomain + - sagemaker:CreateUserProfile + - sagemaker:DeleteUserProfile + - sagemaker:DescribeUserProfile + - sagemaker:UpdateUserProfile + Resource: + - !Sub 'arn:${AWS::Partition}:sagemaker:*:*:domain/*' + - !Sub 'arn:${AWS::Partition}:sagemaker:*:*:user-profile/*' + - Sid: CreateSageMakerServiceLinkedRole + Effect: Allow + Action: + - iam:CreateServiceLinkedRole + Resource: + - !Sub 'arn:${AWS::Partition}:iam::${AWS::AccountId}:role/aws-service-role/sagemaker.amazonaws.com/AWSServiceRoleForAmazonSageMakerNotebooks' + - Sid: SageMakerExecPassRole + Effect: Allow + Action: + - iam:PassRole + Resource: !GetAtt SageMakerExecutionRole.Arn + - Sid: StartCodeBuild + Effect: Allow + Action: + - codebuild:StartBuild + Resource: + - !GetAtt CodeBuildProject.Arn + + # Permissions for the Lambda functions implementing our custom CFN resources: + LambdaExecutionRole: + Type: 'AWS::IAM::Role' + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: + - lambda.amazonaws.com + Action: + - 'sts:AssumeRole' + ManagedPolicyArns: + - !Ref LambdaExecutionPolicy + Path: / + + # Shared layer loading the cfn-response module (doesn't seem to auto-import with SAM Lambdas) + LambdaCommonLayer: + Type: 'AWS::Serverless::LayerVersion' + Properties: + ContentUri: ./lambda-common/ + CompatibleRuntimes: + - python3.8 + - python3.7 + - python3.6 + Metadata: + BuildMethod: python3.8 + +######## END COMMON RESOURCES ######## + +######## CDK STACK DEPLOYMENT VIA CODEBUILD ######## + CodeBuildServiceRole: + Type: 'AWS::IAM::Role' + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - Effect: Allow + Principal: + Service: codebuild.amazonaws.com + Action: sts:AssumeRole + Path: '/workshop-deployment/' + Policies: + - PolicyName: StackDeploymentPerms + PolicyDocument: + Version: '2012-10-17' + Statement: + - Sid: IAMAccess + Effect: Allow + Action: + - 'iam:AttachRolePolicy' + - 'iam:CreatePolicy' + - 'iam:CreatePolicyVersion' + - 'iam:CreateRole' + - 'iam:DeletePolicy' + - 'iam:DeletePolicyVersion' + - 'iam:DeleteRole' + - 'iam:DeleteRolePolicy' + - 'iam:GetPolicy' + - 'iam:GetPolicyVersion' + - 'iam:GetRole' + - 'iam:GetRolePolicy' + - 'iam:PutRolePolicy' + - 'iam:TagPolicy' + - 'iam:TagRole' + - 'iam:UpdateRole' + - 
'iam:UpdateRoleDescription' + Resource: '*' + ManagedPolicyArns: + - 'arn:aws:iam::aws:policy/PowerUserAccess' + + # CodeBuild project to run the CDK deployment: + CodeBuildProject: + Type: 'AWS::CodeBuild::Project' + Properties: + Artifacts: + Type: NO_ARTIFACTS + ConcurrentBuildLimit: 1 + Description: CDK stack deployer + Environment: + # See https://docs.aws.amazon.com/codebuild/latest/userguide/build-env-ref-compute-types.html + ComputeType: BUILD_GENERAL1_SMALL + EnvironmentVariables: + - Name: PUBLIC_REPO + Type: PLAINTEXT + Value: !Ref TargetRepo + Image: aws/codebuild/standard:5.0 + ImagePullCredentialsType: CODEBUILD + PrivilegedMode: true # Enable Docker + Type: LINUX_CONTAINER + QueuedTimeoutInMinutes: 60 # 480 is the max + ServiceRole: !GetAtt CodeBuildServiceRole.Arn + Source: + BuildSpec: | + version: 0.2 + env: + variables: + CDK_NEW_BOOTSTRAP: "1" + phases: + pre_build: + commands: + - set -ex + - curl -sSL https://raw.githubusercontent.com/python-poetry/poetry/master/install-poetry.py | python - + - export PATH="/root/.local/bin:$PATH" + - npm install -g aws-cdk + build: + commands: + - set -ex + - git clone --depth 1 $PUBLIC_REPO code + - cd code + - poetry install + - poetry run cdk bootstrap + - poetry run cdk deploy --all --require-approval never + SourceIdentifier: targetrepo + # Note we use 'NO_SOURCE' and git clone in the job, because GitHub sources require auth + # setup even if the repository is public! + Type: NO_SOURCE + TimeoutInMinutes: 40 # 480 is the max; typically completes in around 10 + + CodeBuildTriggerFunction: + Type: 'AWS::Serverless::Function' + Properties: + Description: CloudFormation custom resource implementation for running CodeBuild project + CodeUri: ./fn-codebuild-run/ + Handler: main.lambda_handler + MemorySize: 128 + Role: !GetAtt LambdaExecutionRole.Arn + Runtime: python3.8 + Timeout: 900 + Layers: + - !Ref LambdaCommonLayer + + # Trigger the CodeBuild project just once on stack create: + CodeBuildTrigger: + Type: 'Custom::CodeBuildTrigger' + Properties: + ServiceToken: !GetAtt CodeBuildTriggerFunction.Arn + ProjectName: !Ref CodeBuildProject + +######## END CDK STACK DEPLOYMENT VIA CODEBUILD ######## + +######## SAGEMAKER ENVIRONMENT BASE SETUP ######## + # Permissions for the SageMaker notebook environment: + SageMakerExecutionRole: + Type: 'AWS::IAM::Role' + Properties: + AssumeRolePolicyDocument: + Version: '2012-10-17' + Statement: + - + Effect: Allow + Principal: + Service: sagemaker.amazonaws.com + Action: sts:AssumeRole + Path: '/' + ManagedPolicyArns: + - 'arn:aws:iam::aws:policy/AmazonSageMakerFullAccess' + - 'arn:aws:iam::aws:policy/AmazonS3FullAccess' + - 'arn:aws:iam::aws:policy/IAMFullAccess' + + # If running on classic notebook instances, make sure widgets library is installed: + NotebookConfig: + Type: 'AWS::SageMaker::NotebookInstanceLifecycleConfig' + Properties: + NotebookInstanceLifecycleConfigName: !Sub '${AWS::StackName}-LifecycleConfig' + OnStart: + - Content: + Fn::Base64: !Sub | + #!/bin/bash + set -e + + # Install extension for ipywidgets interactivity: + sudo -u ec2-user -i <<'EOF' + source /home/ec2-user/anaconda3/bin/activate JupyterSystemEnv + jupyter labextension install @jupyter-widgets/jupyterlab-manager + source /home/ec2-user/anaconda3/bin/deactivate + EOF + + # Classic notebook instance in case you want to run the workshop there: + NotebookInstance: + Type: 'AWS::SageMaker::NotebookInstance' + Properties: + InstanceType: ml.t3.medium + PlatformIdentifier: notebook-al2-v1 + LifecycleConfigName:
!GetAtt NotebookConfig.NotebookInstanceLifecycleConfigName + NotebookInstanceName: !Ref NotebookName + RoleArn: !GetAtt SageMakerExecutionRole.Arn + VolumeSizeInGB: 30 + DefaultCodeRepository: !Ref TargetRepo + + # Custom resource implementation for creating SMStudio domains + StudioDomainFunction: + Type: 'AWS::Serverless::Function' + Properties: + Description: CloudFormation custom resource implementation for SageMaker Studio domain + CodeUri: ./fn-domain/ + Handler: main.lambda_handler + MemorySize: 128 + Role: !GetAtt LambdaExecutionRole.Arn + Runtime: python3.8 + Timeout: 900 + Layers: + - !Ref LambdaCommonLayer + + # Custom resource implementation for creating SMStudio users + UserProfileFunction: + Type: 'AWS::Serverless::Function' + Properties: + Description: CloudFormation custom resource implementation for SageMaker Studio users + CodeUri: ./fn-user/ + Handler: main.lambda_handler + Role: !GetAtt LambdaExecutionRole.Arn + Runtime: python3.8 + Timeout: 900 + Layers: + - !Ref LambdaCommonLayer + + StudioDomain: + Type: 'Custom::StudioDomain' + Properties: + ServiceToken: !GetAtt StudioDomainFunction.Arn + VPC: !Ref VpcId + SubnetIds: !Ref SubnetIds + DomainName: 'PoCDomain' + DefaultUserSettings: + ExecutionRole: !GetAtt SageMakerExecutionRole.Arn + + UserProfile: + Type: 'Custom::UserProfile' + DependsOn: + - StudioDomain + Properties: + ServiceToken: !GetAtt UserProfileFunction.Arn + DomainId: !GetAtt StudioDomain.DomainId + UserProfileName: 'workshop-user' + UserSettings: + ExecutionRole: !GetAtt SageMakerExecutionRole.Arn + + # Pre-warm the JupyterServer app to make opening Studio for the first time faster for participants: + SMJupyterApp: + Type: 'AWS::SageMaker::App' + DependsOn: UserProfile + Properties: + AppName: default + AppType: JupyterServer + DomainId: !GetAtt StudioDomain.DomainId + UserProfileName: !GetAtt UserProfile.UserProfileName + + # Pre-warm the default Python 3 (Data Science) kernel to make first exercise start-up faster: + SMDataScienceApp: + Type: 'AWS::SageMaker::App' + DependsOn: UserProfile + Properties: + AppName: instance-prewarm-datascience-ml-t3-medium + AppType: KernelGateway + DomainId: !GetAtt StudioDomain.DomainId + ResourceSpec: + InstanceType: ml.t3.medium + SageMakerImageArn: !FindInMap + - RegionMap + - !Ref 'AWS::Region' + - datascience + UserProfileName: !GetAtt UserProfile.UserProfileName + +######## END SAGEMAKER ENVIRONMENT BASE SETUP ######## + +######## PRE-CLONING GIT REPO TO SAGEMAKER STUDIO ENVIRONMENT ######## +## This is easy on a notebook instance (see above) but requires a few more steps for Studio + + # First we'll need to set up an access point for the EFS filesystem backing our Studio domain: + StudioEFSAccessPoint: + Type: 'AWS::EFS::AccessPoint' + Properties: + FileSystemId: !GetAtt StudioDomain.HomeEfsFileSystemId + PosixUser: + Gid: '0' + Uid: '0' + + # - To access EFS a Lambda function needs to be deployed in the VPC. + # - VPC-deployed Lambdas do not get public IP addresses by default, so they can't reach the + # internet even if an internet gateway and relevant security groups are in place: hence we + # need a NAT Gateway. + # Hence all this VPC stuff...
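+  # - Resulting traffic path for the setup Lambda: LambdaSubnet -> NatGateway (placed in one of
+  #   the Studio domain's subnets, which are assumed to have internet gateway routes) -> internet.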
+ # (We'll create just a single-AZ deployment for our user setup Lambda to keep things simple) + LambdaSubnet: + Type: 'AWS::EC2::Subnet' + Properties: + AvailabilityZone: !Select [0, !GetAZs ''] + CidrBlock: !GetAtt StudioDomain.ProposedAdminSubnetCidr + # TODO: Support IPv6 + #AssignIpv6AddressOnCreation: false + #Ipv6CidrBlock: !Select [3, !Cidr [!Select [0, !GetAtt 'VPC.Ipv6CidrBlocks'], 4, 64]] + VpcId: !GetAtt StudioDomain.VpcId + Tags: + - Key: Name + Value: 'StudioSetupLambdaSubnet' + LambdaRouteTable: + Type: 'AWS::EC2::RouteTable' + Properties: + VpcId: !GetAtt StudioDomain.VpcId + Tags: + - Key: Name + Value: 'StudioSetupLambdaSubnet' + LambdaRouteTableAssociation: + Type: 'AWS::EC2::SubnetRouteTableAssociation' + Properties: + SubnetId: !Ref LambdaSubnet + RouteTableId: !Ref LambdaRouteTable + NatGatewayEIP: + Type: 'AWS::EC2::EIP' + Properties: + Domain: vpc + NatGateway: + Type: 'AWS::EC2::NatGateway' + Properties: + AllocationId: !GetAtt NatGatewayEIP.AllocationId + SubnetId: !Select [0, !Split [',', !GetAtt StudioDomain.SubnetIds]] + LambdaNatGatewayRoute: + Type: 'AWS::EC2::Route' + Properties: + RouteTableId: !Ref LambdaRouteTable + DestinationCidrBlock: '0.0.0.0/0' + NatGatewayId: !Ref NatGateway + + LambdaPublicAccessSecurityGroup: + Type: 'AWS::EC2::SecurityGroup' + Properties: + GroupDescription: >- + Security group conferring public internet access to SageMaker Studio user setup Lambda + SecurityGroupEgress: + - CidrIp: '0.0.0.0/0' + Description: All traffic + IpProtocol: '-1' + SecurityGroupIngress: + - CidrIp: '0.0.0.0/0' + Description: All traffic + IpProtocol: '-1' + VpcId: !GetAtt StudioDomain.VpcId + + # Now ready to define the implementation of our custom resource: + UserSetupFunction: + Type: 'AWS::Serverless::Function' + DependsOn: + - StudioEFSAccessPoint + - LambdaNatGatewayRoute + - LambdaRouteTableAssociation + - NatGateway + Properties: + Description: CloudFormation custom resource implementation for SageMaker Studio users + CodeUri: ./fn-usersetup/ + Handler: main.lambda_handler + Role: !GetAtt LambdaExecutionRole.Arn + Runtime: python3.8 + Timeout: 300 + Layers: + # 3rd party layer to install Git binaries on Lambda function. + # See: https://github.com/lambci/git-lambda-layer + - !Sub 'arn:aws:lambda:${AWS::Region}:553035198032:layer:git-lambda2:8' + - !Ref LambdaCommonLayer + FileSystemConfigs: + - Arn: !GetAtt StudioEFSAccessPoint.Arn + LocalMountPath: /mnt/efs + VpcConfig: + SecurityGroupIds: + - !Ref LambdaPublicAccessSecurityGroup # Public access for fetching from GitHub/etc + - !GetAtt StudioDomain.OutboundEFSSecurityGroupId # Access to connect to Studio home filesystem + - !GetAtt StudioDomain.InboundEFSSecurityGroupId # TODO: Is this actually needed? 
+ SubnetIds: + - !Ref LambdaSubnet + + # Actual resource instantiation: Clone the TargetRepo into the user's SMStudio EFS home folder: + UserSetup: + Type: 'Custom::UserSetup' + DependsOn: + - UserProfile + Properties: + ServiceToken: !GetAtt UserSetupFunction.Arn + DomainId: !GetAtt StudioDomain.DomainId + UserProfileName: !GetAtt UserProfile.UserProfileName + HomeEfsFileSystemUid: !GetAtt UserProfile.HomeEfsFileSystemUid + GitRepository: !Ref TargetRepo + +######## END PRE-CLONING GIT REPO TO SAGEMAKER STUDIO ENVIRONMENT ######## + +Outputs: + DomainId: + Description: SageMaker Studio Domain ID + Value: !GetAtt StudioDomain.DomainId + UserProfileName: + Description: SageMaker Studio Username + Value: !GetAtt UserProfile.UserProfileName + HomeEfsFileSystemUid: + Description: SageMaker Studio EFS User ID + Value: !GetAtt UserProfile.HomeEfsFileSystemUid + SubnetIds: + Description: VPC Subnet IDs + Value: !GetAtt StudioDomain.SubnetIds