From 4e548b36ecf5ab478d87b8c772de747727f52e97 Mon Sep 17 00:00:00 2001 From: andrewpeng02 Date: Tue, 27 Feb 2024 14:32:50 -0500 Subject: [PATCH] django on fargate, training on ec2 --- .github/workflows/push-django-ecs.yml | 97 ++++++++++++++++++ dlp-terraform/ecs/alb.tf | 2 +- dlp-terraform/ecs/ecr.tf | 14 +++ dlp-terraform/ecs/ecs.tf | 108 ++------------------ dlp-terraform/ecs/ecs_django_service.tf | 88 ++++++++++++++++ dlp-terraform/ecs/ecs_training_service.tf | 117 ++++++++++++++++++++-- training/training/core/authenticator.py | 3 +- training/training/settings.py | 12 +-- 8 files changed, 319 insertions(+), 122 deletions(-) create mode 100644 .github/workflows/push-django-ecs.yml diff --git a/.github/workflows/push-django-ecs.yml b/.github/workflows/push-django-ecs.yml new file mode 100644 index 000000000..0cb50ef72 --- /dev/null +++ b/.github/workflows/push-django-ecs.yml @@ -0,0 +1,97 @@ +# This workflow will build and push a new container image to Amazon ECR, +# and then will deploy a new task definition to Amazon ECS, when the workflow is run manually (workflow_dispatch). +# +# To use this workflow, you will need to complete the following set-up steps: +# +# 1. Create an ECR repository to store your images. +# For example: `aws ecr create-repository --repository-name my-ecr-repo --region us-east-2`. +# Replace the value of the `ECR_REPOSITORY` environment variable in the workflow below with your repository's name. +# Replace the value of the `AWS_REGION` environment variable in the workflow below with your repository's region. +# +# 2. Create an ECS task definition, an ECS cluster, and an ECS service. +# For example, follow the Getting Started guide on the ECS console: +# https://us-east-2.console.aws.amazon.com/ecs/home?region=us-east-2#/firstRun +# Replace the value of the `ECS_SERVICE` environment variable in the workflow below with the name you set for the Amazon ECS service. 
+# Replace the value of the `ECS_CLUSTER` environment variable in the workflow below with the name you set for the cluster. +# +# 3. Register an ECS task definition family named `django` (for example with Terraform). +# The workflow downloads its latest revision with `aws ecs describe-task-definition`, +# so no task definition JSON file needs to be stored in the repository. +# Replace the value of the `CONTAINER_NAME` environment variable in the workflow below with the name of the container +# in the `containerDefinitions` section of the task definition. +# +# 4. Store an IAM user access key in GitHub Actions secrets named `AWS_DEPLOY_ACCESS_KEY_ID` and `AWS_DEPLOY_SECRET_ACCESS_KEY`. +# See the documentation for each action used below for the recommended IAM policies for this IAM user, +# and best practices on handling the access key credentials. + +name: ECS Django Container Deployment + +# Only trigger when user clicks "run workflow" +on: + workflow_dispatch: + +env: + AWS_REGION: "us-east-1" # set this to your preferred AWS region, e.g. 
us-west-1 + ECR_REPOSITORY: "django" # set this to your Amazon ECR repository name + ECS_SERVICE: "django" # set this to your Amazon ECS service name + ECS_CLUSTER: "backend" # set this to your Amazon ECS cluster name + CONTAINER_NAME: "django" # set this to the name of the container in the containerDefinitions section of your task definition + +permissions: + contents: read + actions: write + +jobs: + deploy: + name: Deploy + runs-on: ubuntu-latest + environment: production + steps: + - name: Get current branch + run: echo running on branch ${GITHUB_REF##*/} + + - name: Checkout + uses: actions/checkout@v3 + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v1 + with: + aws-access-key-id: ${{ secrets.AWS_DEPLOY_ACCESS_KEY_ID }} + aws-secret-access-key: ${{ secrets.AWS_DEPLOY_SECRET_ACCESS_KEY }} + aws-region: ${{ env.AWS_REGION }} + + - name: Login to Amazon ECR + id: login-ecr + uses: aws-actions/amazon-ecr-login@v1 + + - name: Build, tag, and push image to Amazon ECR + id: build-image + env: + ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }} + IMAGE_TAG: ${{ github.sha }} + run: | + # Build a docker container and + # push it to ECR so that it can + # be deployed to ECS. 
+ docker build -t $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG training -f training/Dockerfile.prod + docker push $ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG + echo "image=$ECR_REGISTRY/$ECR_REPOSITORY:$IMAGE_TAG" >> $GITHUB_OUTPUT + + - name: Download task definition + run: | + aws ecs describe-task-definition --task-definition django --query taskDefinition > temp-task-definition.json + - name: Fill in the new image ID in the Amazon ECS task definition + id: task-def + uses: aws-actions/amazon-ecs-render-task-definition@v1 + with: + task-definition: temp-task-definition.json + container-name: ${{ env.CONTAINER_NAME }} + image: ${{ steps.build-image.outputs.image }} + + - name: Deploy Amazon ECS task definition + uses: aws-actions/amazon-ecs-deploy-task-definition@v1 + with: + task-definition: ${{ steps.task-def.outputs.task-definition }} + service: ${{ env.ECS_SERVICE }} + cluster: ${{ env.ECS_CLUSTER }} + wait-for-service-stability: true diff --git a/dlp-terraform/ecs/alb.tf b/dlp-terraform/ecs/alb.tf index 985717240..71640852d 100644 --- a/dlp-terraform/ecs/alb.tf +++ b/dlp-terraform/ecs/alb.tf @@ -34,7 +34,7 @@ resource "aws_lb_target_group" "app" { vpc_id = aws_vpc.main.id protocol = "HTTP" port = 8000 - target_type = "instance" + target_type = "ip" health_check { enabled = true diff --git a/dlp-terraform/ecs/ecr.tf b/dlp-terraform/ecs/ecr.tf index 318588be0..c3cf046e5 100644 --- a/dlp-terraform/ecs/ecr.tf +++ b/dlp-terraform/ecs/ecr.tf @@ -8,6 +8,20 @@ resource "aws_ecr_repository" "training" { } } +resource "aws_ecr_repository" "django" { + name = "django" + image_tag_mutability = "MUTABLE" + force_delete = true + + image_scanning_configuration { + scan_on_push = true + } +} + output "training_repo_url" { value = aws_ecr_repository.training.repository_url } + +output "django_repo_url" { + value = aws_ecr_repository.django.repository_url +} diff --git a/dlp-terraform/ecs/ecs.tf b/dlp-terraform/ecs/ecs.tf index eef9c418f..89cdaafdc 100644 --- 
a/dlp-terraform/ecs/ecs.tf +++ b/dlp-terraform/ecs/ecs.tf @@ -31,105 +31,6 @@ resource "aws_iam_instance_profile" "ecs_node" { role = aws_iam_role.ecs_node_role.name } -# --- ECS Node Security Group --- -resource "aws_security_group" "ecs_node_sg" { - name_prefix = "backend-ecs-node-sg-" - vpc_id = aws_vpc.main.id - - ingress { - from_port = 0 - to_port = 0 - protocol = "-1" - # cidr_blocks = [aws_vpc.main.cidr_block] - security_groups = [ aws_security_group.http.id ] - } - - egress { - from_port = 0 - to_port = 0 - protocol = "-1" - cidr_blocks = ["0.0.0.0/0"] - } -} - -# --- ECS Launch Template --- -resource "aws_launch_template" "ecs_lt_training" { - name_prefix = "training-ecs-template-" - image_id = "ami-01ff5874b57a57613" - instance_type = "g4dn.xlarge" - - vpc_security_group_ids = [aws_security_group.ecs_node_sg.id] - iam_instance_profile { - arn = aws_iam_instance_profile.ecs_node.arn - } - monitoring { - enabled = true - } - - user_data = base64encode(<<-EOF - #!/bin/bash - echo ECS_CLUSTER=${aws_ecs_cluster.main.name} >> /etc/ecs/ecs.config; - EOF - ) -} - -# --- ECS ASG --- -resource "aws_autoscaling_group" "training" { - name_prefix = "training-ecs-asg-" - vpc_zone_identifier = aws_subnet.public[*].id - min_size = 0 - max_size = 2 - desired_capacity = 1 - health_check_grace_period = 0 - health_check_type = "EC2" - protect_from_scale_in = false - - launch_template { - id = aws_launch_template.ecs_lt_training.id - version = "$Latest" - } - - tag { - key = "Name" - value = "backend-ecs-cluster" - propagate_at_launch = true - } - - tag { - key = "AmazonECSManaged" - value = "" - propagate_at_launch = true - } -} - -# --- ECS Capacity Provider --- -resource "aws_ecs_capacity_provider" "training" { - name = "training-ecs-ec2" - - auto_scaling_group_provider { - auto_scaling_group_arn = aws_autoscaling_group.training.arn - managed_termination_protection = "DISABLED" - - managed_scaling { - maximum_scaling_step_size = 2 - minimum_scaling_step_size = 1 - status 
= "ENABLED" - target_capacity = 100 - } - } -} - -resource "aws_ecs_cluster_capacity_providers" "main" { - cluster_name = aws_ecs_cluster.main.name - capacity_providers = [aws_ecs_capacity_provider.training.name] - - default_capacity_provider_strategy { - capacity_provider = aws_ecs_capacity_provider.training.name - base = 1 - weight = 100 - } -} - # --- ECS Task Role --- data "aws_iam_policy_document" "ecs_task_doc" { statement { @@ -169,7 +70,12 @@ resource "aws_iam_role_policy_attachment" "ecs_exec_role_policy" { policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy" } -resource "aws_cloudwatch_log_group" "ecs" { - name = "/ecs/backend" +resource "aws_cloudwatch_log_group" "training" { + name = "/ecs/training" retention_in_days = 14 } + +resource "aws_cloudwatch_log_group" "django" { + name = "/ecs/django" + retention_in_days = 14 +} \ No newline at end of file diff --git a/dlp-terraform/ecs/ecs_django_service.tf b/dlp-terraform/ecs/ecs_django_service.tf index e69de29bb..6a2168d72 100644 --- a/dlp-terraform/ecs/ecs_django_service.tf +++ b/dlp-terraform/ecs/ecs_django_service.tf @@ -0,0 +1,88 @@ +resource "aws_ecs_task_definition" "django" { + family = "django" + task_role_arn = aws_iam_role.ecs_task_role.arn + execution_role_arn = aws_iam_role.ecs_exec_role.arn + network_mode = "awsvpc" + requires_compatibilities = ["FARGATE"] + cpu = 1024 + memory = 2048 + + container_definitions = jsonencode([ + { + "name": "django", + "image" : "${aws_ecr_repository.django.repository_url}:latest", + "cpu": 1024, + "memory": 2048, + "essential": true, + "portMappings": [ + { + "name" : "gunicorn-port", + "containerPort" : 8000, + "hostPort" : 8000, + "protocol" : "tcp", + } + ], + "logConfiguration" : { + "logDriver" : "awslogs", + "options" : { + "awslogs-create-group" : "true", + "awslogs-region" : "us-east-1", + "awslogs-group" : aws_cloudwatch_log_group.django.name, + "awslogs-stream-prefix" : "ecs" + } + }, + "environment": [ + { + "name": 
"ALLOWED_HOST", + "value": "${aws_lb.main.dns_name}" + } + ] + } + ]) +} + +# --- ECS Django Security Group --- +resource "aws_security_group" "ecs_django_sg" { + name_prefix = "backend-ecs-django-sg-" + vpc_id = aws_vpc.main.id +} + +resource "aws_vpc_security_group_ingress_rule" "ecs_django_sg_ingress" { + security_group_id = aws_security_group.ecs_django_sg.id + + ip_protocol = "-1" + referenced_security_group_id = aws_security_group.http.id +} + +resource "aws_vpc_security_group_egress_rule" "ecs_django_sg_egress" { + security_group_id = aws_security_group.ecs_django_sg.id + + ip_protocol = "-1" + cidr_ipv4 = "0.0.0.0/0" +} + +resource "aws_ecs_service" "django" { + name = "django" + cluster = aws_ecs_cluster.main.id + task_definition = aws_ecs_task_definition.django.arn + desired_count = 2 + launch_type = "FARGATE" + + network_configuration { + security_groups = [ aws_security_group.ecs_django_sg.id] + subnets = aws_subnet.public[*].id + assign_public_ip = true + } + + lifecycle { + ignore_changes = [desired_count] + } + + load_balancer { + target_group_arn = aws_lb_target_group.app.arn + container_name = "django" + container_port = 8000 + } + + depends_on = [aws_lb_target_group.app] +} \ No newline at end of file diff --git a/dlp-terraform/ecs/ecs_training_service.tf b/dlp-terraform/ecs/ecs_training_service.tf index e9d3e2ff5..02ebb059b 100644 --- a/dlp-terraform/ecs/ecs_training_service.tf +++ b/dlp-terraform/ecs/ecs_training_service.tf @@ -6,7 +6,7 @@ resource "aws_ecs_task_definition" "training" { cpu = 1024 memory = 4096 - container_definitions = jsonencode(([ + container_definitions = jsonencode([ { "name" : "training", "image" : "${aws_ecr_repository.training.repository_url}:latest", @@ -28,12 +28,12 @@ resource "aws_ecs_task_definition" "training" { "options" : { "awslogs-create-group" : "true", "awslogs-region" : "us-east-1", - "awslogs-group" : aws_cloudwatch_log_group.ecs.name, + "awslogs-group" : aws_cloudwatch_log_group.training.name, 
"awslogs-stream-prefix" : "ecs" } } } - ])) + ]) } # --- ECS Service --- @@ -58,13 +58,13 @@ resource "aws_ecs_service" "training" { ignore_changes = [desired_count] } - load_balancer { - target_group_arn = aws_lb_target_group.app.arn - container_name = "training" - container_port = 8000 - } + # load_balancer { + # target_group_arn = aws_lb_target_group.app.arn + # container_name = "training" + # container_port = 8000 + # } - depends_on = [aws_lb_target_group.app] + # depends_on = [aws_lb_target_group.app] } # --- ECS Service Auto Scaling --- @@ -110,4 +110,103 @@ resource "aws_appautoscaling_policy" "training_ecs_target_memory" { scale_in_cooldown = 300 scale_out_cooldown = 300 } +} + +# --- ECS Training Security Group --- +resource "aws_security_group" "ecs_training_sg" { + name_prefix = "backend-ecs-training-sg-" + vpc_id = aws_vpc.main.id +} + +resource "aws_vpc_security_group_ingress_rule" "ecs_training_sg_ingress" { + security_group_id = aws_security_group.ecs_training_sg.id + + ip_protocol = "-1" + # cidr_blocks = [aws_vpc.main.cidr_block] + referenced_security_group_id = aws_security_group.ecs_django_sg.id +} + +resource "aws_vpc_security_group_egress_rule" "ecs_training_sg_egress" { + security_group_id = aws_security_group.ecs_training_sg.id + + ip_protocol = "-1" + cidr_ipv4 = "0.0.0.0/0" +} + +# --- ECS Launch Template --- +resource "aws_launch_template" "ecs_lt_training" { + name_prefix = "training-ecs-template-" + image_id = "ami-01ff5874b57a57613" + instance_type = "g4dn.xlarge" + + vpc_security_group_ids = [aws_security_group.ecs_training_sg.id] + iam_instance_profile { + arn = aws_iam_instance_profile.ecs_node.arn + } + monitoring { + enabled = true + } + + user_data = base64encode(<<-EOF + #!/bin/bash + echo ECS_CLUSTER=${aws_ecs_cluster.main.name} >> /etc/ecs/ecs.config; + EOF + ) +} + +# --- ECS ASG --- +resource "aws_autoscaling_group" "training" { + name_prefix = "training-ecs-asg-" + vpc_zone_identifier = aws_subnet.public[*].id + min_size = 0 
+ max_size = 1 + desired_capacity = 1 + health_check_grace_period = 0 + health_check_type = "EC2" + protect_from_scale_in = false + + launch_template { + id = aws_launch_template.ecs_lt_training.id + version = "$Latest" + } + + tag { + key = "Name" + value = "backend-ecs-cluster" + propagate_at_launch = true + } + + tag { + key = "AmazonECSManaged" + value = "" + propagate_at_launch = true + } +} + +# --- ECS Capacity Provider --- +resource "aws_ecs_capacity_provider" "training" { + name = "training-ecs-ec2" + + auto_scaling_group_provider { + auto_scaling_group_arn = aws_autoscaling_group.training.arn + managed_termination_protection = "DISABLED" + + managed_scaling { + maximum_scaling_step_size = 2 + minimum_scaling_step_size = 1 + status = "ENABLED" + target_capacity = 100 + } + } +} + +resource "aws_ecs_cluster_capacity_providers" "main" { + cluster_name = aws_ecs_cluster.main.name + capacity_providers = [aws_ecs_capacity_provider.training.name] + + default_capacity_provider_strategy { + capacity_provider = aws_ecs_capacity_provider.training.name + base = 1 + weight = 100 + } } \ No newline at end of file diff --git a/training/training/core/authenticator.py b/training/training/core/authenticator.py index e4f5ad7fe..a250215f2 100644 --- a/training/training/core/authenticator.py +++ b/training/training/core/authenticator.py @@ -14,10 +14,11 @@ def authenticate(self, request, token): app = init_firebase() try: firebase_admin.auth.verify_id_token(token) - firebase_admin.delete_app(app) except Exception as e: logger.info(e) return + finally: + firebase_admin.delete_app(app) return token diff --git a/training/training/settings.py b/training/training/settings.py index 12bba5624..74f789f6c 100644 --- a/training/training/settings.py +++ b/training/training/settings.py @@ -28,16 +28,8 @@ import requests, os ALLOWED_HOSTS = [] -if "ECS_CONTAINER_METADATA_URI" in os.environ: - ELB_HEALTHCHECK_HOSTNAMES = [ - ip - for network in 
requests.get(os.environ["ECS_CONTAINER_METADATA_URI"]).json()[ - "Networks" - ] - for ip in network["IPv4Addresses"] - ] - ALLOWED_HOSTS += ELB_HEALTHCHECK_HOSTNAMES - ALLOWED_HOSTS.append("alb-912662400.us-east-1.elb.amazonaws.com") +if "ALLOWED_HOST" in os.environ: + ALLOWED_HOSTS.append(os.environ["ALLOWED_HOST"]) # Application definition