Skip to content

Commit

Permalink
Feature 1137 Create Terraform scripts for ECS backend (#1141)
Browse files Browse the repository at this point in the history
* move backend infra into terraform

* django on fargate, training on ec2
  • Loading branch information
andrewpeng02 authored Mar 5, 2024
1 parent 453d6e9 commit 83d8827
Show file tree
Hide file tree
Showing 9 changed files with 624 additions and 81 deletions.
97 changes: 97 additions & 0 deletions .github/workflows/push-django-ecs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
# This workflow will build and push a new container image to Amazon ECR,
# and then will deploy a new task definition to Amazon ECS, when the workflow is started manually ("Run workflow").
#
# To use this workflow, you will need to complete the following set-up steps:
#
# 1. Create an ECR repository to store your images.
# For example: `aws ecr create-repository --repository-name my-ecr-repo --region us-east-2`.
# Replace the value of the `ECR_REPOSITORY` environment variable in the workflow below with your repository's name.
# Replace the value of the `AWS_REGION` environment variable in the workflow below with your repository's region.
#
# 2. Create an ECS task definition, an ECS cluster, and an ECS service.
# For example, follow the Getting Started guide on the ECS console:
# https://us-east-2.console.aws.amazon.com/ecs/home?region=us-east-2#/firstRun
# Replace the value of the `ECS_SERVICE` environment variable in the workflow below with the name you set for the Amazon ECS service.
# Replace the value of the `ECS_CLUSTER` environment variable in the workflow below with the name you set for the cluster.
#
# 3. Register an ECS task definition for the service.
# This workflow does not read a task definition JSON file from the repository; it downloads the
# latest registered revision of the `django` task definition with `aws ecs describe-task-definition`.
# Replace the value of the `CONTAINER_NAME` environment variable in the workflow below with the name of the container
# in the `containerDefinitions` section of the task definition.
#
# 4. Store an IAM user access key in GitHub Actions secrets named `AWS_DEPLOY_ACCESS_KEY_ID` and `AWS_DEPLOY_SECRET_ACCESS_KEY`.
# See the documentation for each action used below for the recommended IAM policies for this IAM user,
# and best practices on handling the access key credentials.

name: ECS Django Container Deployment

# Only trigger when user clicks "run workflow"
on:
  workflow_dispatch:

env:
  AWS_REGION: "us-east-1"  # set this to your preferred AWS region, e.g. us-west-1
  ECR_REPOSITORY: "django"  # set this to your Amazon ECR repository name
  ECS_SERVICE: "django"  # set this to your Amazon ECS service name
  ECS_CLUSTER: "backend"  # set this to your Amazon ECS cluster name
  ECS_TASK_DEFINITION: "django"  # task definition family whose latest revision is downloaded and re-rendered
  CONTAINER_NAME: "django"  # set this to the name of the container in the containerDefinitions section of your task definition

permissions:
  contents: read
  # NOTE(review): no step below appears to need write access to Actions;
  # confirm and drop this to follow least privilege.
  actions: write

jobs:
  deploy:
    name: Deploy
    runs-on: ubuntu-latest
    environment: production
    steps:
      # GITHUB_REF_NAME keeps the full branch name; stripping up to the last
      # "/" of GITHUB_REF would shorten "feature/foo" to "foo".
      - name: Get current branch
        run: echo "running on branch ${GITHUB_REF_NAME}"

      - name: Checkout
        uses: actions/checkout@v4

      - name: Configure AWS credentials
        uses: aws-actions/configure-aws-credentials@v4
        with:
          aws-access-key-id: ${{ secrets.AWS_DEPLOY_ACCESS_KEY_ID }}
          aws-secret-access-key: ${{ secrets.AWS_DEPLOY_SECRET_ACCESS_KEY }}
          aws-region: ${{ env.AWS_REGION }}

      - name: Login to Amazon ECR
        id: login-ecr
        uses: aws-actions/amazon-ecr-login@v2

      - name: Build, tag, and push image to Amazon ECR
        id: build-image
        env:
          ECR_REGISTRY: ${{ steps.login-ecr.outputs.registry }}
          IMAGE_TAG: ${{ github.sha }}
        run: |
          # Build a docker container and
          # push it to ECR so that it can
          # be deployed to ECS.
          image_uri="${ECR_REGISTRY}/${ECR_REPOSITORY}:${IMAGE_TAG}"
          docker build -t "${image_uri}" -f training/Dockerfile.prod training
          docker push "${image_uri}"
          # Expose the pushed image URI to the render step below.
          echo "image=${image_uri}" >> "${GITHUB_OUTPUT}"

      # The task definition is managed outside this repository, so fetch the
      # currently registered revision instead of reading a checked-in JSON file.
      - name: Download task definition
        run: |
          aws ecs describe-task-definition \
            --task-definition "${ECS_TASK_DEFINITION}" \
            --query taskDefinition > temp-task-definition.json

      - name: Fill in the new image ID in the Amazon ECS task definition
        id: task-def
        uses: aws-actions/amazon-ecs-render-task-definition@v1
        with:
          task-definition: temp-task-definition.json
          container-name: ${{ env.CONTAINER_NAME }}
          image: ${{ steps.build-image.outputs.image }}

      - name: Deploy Amazon ECS task definition
        uses: aws-actions/amazon-ecs-deploy-task-definition@v1
        with:
          task-definition: ${{ steps.task-def.outputs.task-definition }}
          service: ${{ env.ECS_SERVICE }}
          cluster: ${{ env.ECS_CLUSTER }}
          wait-for-service-stability: true
63 changes: 63 additions & 0 deletions dlp-terraform/ecs/alb.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
# --- ALB ---
# Security group attached to the ALB: accepts TCP 80 and 443 from any IPv4
# address and allows all outbound traffic.
resource "aws_security_group" "http" {
  name_prefix = "http-sg-"
  description = "Allow all HTTP/HTTPS traffic from public"
  vpc_id      = aws_vpc.main.id

  # One ingress rule per listed port (HTTP and HTTPS).
  dynamic "ingress" {
    for_each = [80, 443]
    content {
      protocol    = "tcp"
      from_port   = ingress.value
      to_port     = ingress.value
      cidr_blocks = ["0.0.0.0/0"]
    }
  }

  # Protocol "-1" with ports 0-0 means "all traffic".
  egress {
    protocol    = "-1"
    from_port   = 0
    to_port     = 0
    cidr_blocks = ["0.0.0.0/0"]
  }

  # A change that forces replacement must create the new group first;
  # the old one cannot be deleted while the load balancer still uses it.
  lifecycle {
    create_before_destroy = true
  }
}

# Application Load Balancer placed in the public subnets and guarded by the
# "http" security group.
resource "aws_lb" "main" {
  name               = "alb"
  load_balancer_type = "application"

  security_groups = [aws_security_group.http.id]
  subnets         = aws_subnet.public[*].id
}

# Target group the ALB forwards to: HTTP on port 8000, with targets
# registered by IP address.
resource "aws_lb_target_group" "app" {
  name_prefix = "app-"
  vpc_id      = aws_vpc.main.id
  protocol    = "HTTP"
  port        = 8000
  target_type = "ip"

  # A target is healthy after 5 consecutive 200 responses from /health
  # (checked every 30 s, 5 s timeout) and unhealthy after 2 failures.
  health_check {
    enabled             = true
    path                = "/health"
    matcher             = "200" # matcher is a string (it also accepts ranges/lists such as "200-299")
    interval            = 30
    timeout             = 5
    healthy_threshold   = 5
    unhealthy_threshold = 2
  }

  # The listener references this group, so a replacement has to exist
  # before the old group can be destroyed.
  lifecycle {
    create_before_destroy = true
  }
}

# Plain-HTTP listener on port 80 that forwards every request to the app
# target group.
resource "aws_lb_listener" "http" {
  # Both arguments expect ARNs, so reference the explicit `arn` attributes
  # rather than `id`.
  load_balancer_arn = aws_lb.main.arn
  port              = 80
  protocol          = "HTTP"

  default_action {
    type             = "forward"
    target_group_arn = aws_lb_target_group.app.arn
  }
}

# DNS name of the load balancer, exported for use outside this module.
output "alb_url" {
  value = aws_lb.main.dns_name
}
27 changes: 27 additions & 0 deletions dlp-terraform/ecs/ecr.tf
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# ECR repository for the training image. Tags are mutable and every pushed
# image is scanned.
resource "aws_ecr_repository" "training" {
  name = "training"

  # NOTE(review): force_delete presumably lets `terraform destroy` remove the
  # repository even while it still contains images - confirm this is intended.
  force_delete         = true
  image_tag_mutability = "MUTABLE"

  image_scanning_configuration {
    scan_on_push = true
  }
}

# ECR repository for the Django image. Tags are mutable and every pushed
# image is scanned.
resource "aws_ecr_repository" "django" {
  name = "django"

  # NOTE(review): force_delete presumably lets `terraform destroy` remove the
  # repository even while it still contains images - confirm this is intended.
  force_delete         = true
  image_tag_mutability = "MUTABLE"

  image_scanning_configuration {
    scan_on_push = true
  }
}

# URL of the "training" ECR repository.
output "training_repo_url" {
  value = aws_ecr_repository.training.repository_url
}

# URL of the "django" ECR repository.
output "django_repo_url" {
  value = aws_ecr_repository.django.repository_url
}
130 changes: 61 additions & 69 deletions dlp-terraform/ecs/ecs.tf
Original file line number Diff line number Diff line change
@@ -1,89 +1,81 @@
terraform {
required_providers {
aws = {
source = "hashicorp/aws"
version = "~> 4.16"
# ECS cluster named "backend".
resource "aws_ecs_cluster" "main" {
  name = "backend"
}

# --- ECS Node Role ---
data "aws_iam_policy_document" "ecs_node_doc" {
statement {
actions = ["sts:AssumeRole"]
effect = "Allow"

principals {
type = "Service"
identifiers = ["ec2.amazonaws.com"]
}
}

required_version = ">= 1.2.0"
}

provider "aws" {
region = "us-west-2"
# IAM role for the ECS nodes; its trust policy comes from the
# ecs_node_doc policy document.
resource "aws_iam_role" "ecs_node_role" {
  assume_role_policy = data.aws_iam_policy_document.ecs_node_doc.json
  name_prefix        = "backend-ecs-node-role-"
}

resource "aws_ecs_cluster" "deep-learning-playground-kernels" {
name = "deep-learning-playground-kernels-test"
setting {
name = "containerInsights"
value = "enabled"
}
# Attaches the AWS-managed AmazonEC2ContainerServiceforEC2Role policy to the
# ECS node role.
resource "aws_iam_role_policy_attachment" "ecs_node_role_policy" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEC2ContainerServiceforEC2Role"
  role       = aws_iam_role.ecs_node_role.name
}
resource "aws_ecs_service" "dlp-training-service" {
name = "dlp-training-service-test"
cluster = aws_ecs_cluster.deep-learning-playground-kernels.id
task_definition = "arn:aws:ecs:us-west-2:521654603461:task-definition/dlp-training-task:9"
desired_count = 1

launch_type = "FARGATE"
# Instance profile that wraps the ECS node role.
resource "aws_iam_instance_profile" "ecs_node" {
  role        = aws_iam_role.ecs_node_role.name
  path        = "/ecs/instance/"
  name_prefix = "backend-ecs-node-profile-"
}

deployment_maximum_percent = "200"
deployment_minimum_healthy_percent = "100"
scheduling_strategy = "REPLICA"
# --- ECS Task Role ---
data "aws_iam_policy_document" "ecs_task_doc" {
statement {
actions = ["sts:AssumeRole"]
effect = "Allow"

network_configuration {
security_groups = ["sg-09291eb84a19daeed"]
subnets = ["subnet-0bebe768ad78b896c", "subnet-0f3e41ad21cfe6ff5"]
assign_public_ip = true
principals {
type = "Service"
identifiers = ["ecs-tasks.amazonaws.com"]
}
}
}
resource "aws_appautoscaling_target" "dev_to_target" {
max_capacity = 1
min_capacity = 1
resource_id = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"

# IAM role for ECS tasks; its trust policy comes from the ecs_task_doc
# policy document.
# NOTE(review): unlike the node role, this prefix has no trailing "-";
# changing it would force the role to be replaced, so it is left as is.
resource "aws_iam_role" "ecs_task_role" {
  assume_role_policy = data.aws_iam_policy_document.ecs_task_doc.json
  name_prefix        = "backend-ecs-task-role"
}
resource "aws_appautoscaling_policy" "training_service_auto_scaling_policy" {
name = "TrainingServiceAutoScalingPolicy"
policy_type = "StepScaling"
resource_id = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"

step_scaling_policy_configuration {
adjustment_type = "ChangeInCapacity"
cooldown = 30
metric_aggregation_type = "Average"
resource "aws_iam_role_policy_attachment" "ecs_task_role_policy" {
for_each = toset([
"arn:aws:iam::aws:policy/AmazonDynamoDBFullAccess",
"arn:aws:iam::aws:policy/SecretsManagerReadWrite"
])

step_adjustment {
metric_interval_lower_bound = 0
scaling_adjustment = 3
}
}

depends_on = [
aws_appautoscaling_target.dev_to_target
]
role = aws_iam_role.ecs_task_role.name
policy_arn = each.value
}
resource "aws_appautoscaling_policy" "dlp-queue-size-too-small-policy" {
name = "DLPQueueSizeTooSmallPolicy"
policy_type = "StepScaling"
resource_id = "service/${aws_ecs_cluster.deep-learning-playground-kernels.name}/${aws_ecs_service.dlp-training-service.name}"
scalable_dimension = "ecs:service:DesiredCount"
service_namespace = "ecs"

step_scaling_policy_configuration {
adjustment_type = "ExactCapacity"
cooldown = 30
metric_aggregation_type = "Average"

step_adjustment {
# IAM role for ECS task execution; it reuses the ecs_task_doc trust policy.
# NOTE(review): unlike the node role, this prefix has no trailing "-";
# changing it would force the role to be replaced, so it is left as is.
resource "aws_iam_role" "ecs_exec_role" {
  assume_role_policy = data.aws_iam_policy_document.ecs_task_doc.json
  name_prefix        = "backend-ecs-exec-role"
}

metric_interval_upper_bound = 0
scaling_adjustment = 1
}
}
depends_on = [aws_appautoscaling_target.dev_to_target]
# Attaches the AWS-managed AmazonECSTaskExecutionRolePolicy to the ECS
# execution role.
resource "aws_iam_role_policy_attachment" "ecs_exec_role_policy" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonECSTaskExecutionRolePolicy"
  role       = aws_iam_role.ecs_exec_role.name
}

# CloudWatch log group "/ecs/training"; log events are retained for 14 days.
resource "aws_cloudwatch_log_group" "training" {
  retention_in_days = 14
  name              = "/ecs/training"
}

# CloudWatch log group "/ecs/django"; log events are retained for 14 days.
resource "aws_cloudwatch_log_group" "django" {
  retention_in_days = 14
  name              = "/ecs/django"
}
Loading

0 comments on commit 83d8827

Please sign in to comment.