diff --git a/manifests/modules/troubleshooting/alb/.workshop/cleanup.sh b/manifests/modules/troubleshooting/alb/.workshop/cleanup.sh new file mode 100755 index 000000000..3c51870ba --- /dev/null +++ b/manifests/modules/troubleshooting/alb/.workshop/cleanup.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +logmessage "Restoring public subnet tags..." + +# Function to create ftags for subnets ids +remove_tags_from_subnets() { + subnets_vpc=$(aws ec2 describe-subnets --filters "Name=tag:Name,Values=*Public*" "Name=tag:created-by,Values=eks-workshop-v2" --query 'Subnets[*].SubnetId' --output text) + #logmessage "subnets_vpc: $subnets_vpc" + + +#remove tag from subnets with AWS cli + for subnet_id in $subnets_vpc; do + #logmessage "public subnets: $subnet_id" + aws ec2 create-tags --resources "$subnet_id" --tags Key=kubernetes.io/role/elb,Value='1' || logmessage "Failed to create tag from subnet $subnet_id" + done + return 0 +} + +remove_tags_from_subnets \ No newline at end of file diff --git a/manifests/modules/troubleshooting/alb/.workshop/terraform/main.tf b/manifests/modules/troubleshooting/alb/.workshop/terraform/main.tf new file mode 100644 index 000000000..2448576ca --- /dev/null +++ b/manifests/modules/troubleshooting/alb/.workshop/terraform/main.tf @@ -0,0 +1,176 @@ +terraform { + required_providers { + # kubectl = { + # source = "gavinbunney/kubectl" + # version = ">= 1.14" + # } + } +} + + + +provider "aws" { + region = "us-east-1" + alias = "virginia" +} + +locals { + tags = { + module = "troubleshooting" + } +} + +data "aws_vpc" "selected" { + tags = { + created-by = "eks-workshop-v2" + env = var.addon_context.eks_cluster_id + } +} + +data "aws_subnets" "public" { + tags = { + created-by = "eks-workshop-v2" + env = var.addon_context.eks_cluster_id + } + + filter { + name = "tag:Name" + values = ["*Public*"] + } +} + + +resource "time_sleep" "blueprints_addons_sleep" { + depends_on = [ + module.eks_blueprints_addons + ] + + create_duration = "15s" + destroy_duration = "15s" +} + + +resource "null_resource" "break_public_subnet" { + triggers = { + public_subnets = join(" ", data.aws_subnets.public.ids) + always_run = timestamp() + } + count = length(data.aws_subnets.public) + + lifecycle { + create_before_destroy = false + } + + + provisioner "local-exec" { + when = create + command = "aws ec2 delete-tags --resources ${self.triggers.public_subnets} --tags Key=kubernetes.io/role/elb,Value='1'" + } + +} + +module "eks_blueprints_addons" { + source = "aws-ia/eks-blueprints-addons/aws" + version = "1.16.2" + + enable_aws_load_balancer_controller = true + aws_load_balancer_controller = { + wait = true + } + + cluster_name = var.addon_context.eks_cluster_id + cluster_endpoint = var.addon_context.aws_eks_cluster_endpoint + cluster_version = var.eks_cluster_version + oidc_provider_arn = var.addon_context.eks_oidc_provider_arn + + tags = merge( + var.tags, + local.tags + ) + + depends_on = [null_resource.break_public_subnet] + +} + + +# create a new policy from json file +resource "aws_iam_policy" "issue" { + name = "issue" + path = "/" + policy = file("${path.module}/template/other_issue.json") +} + +# attach issue policy to role +resource "aws_iam_role_policy_attachment" "issue_policy_attachment" { + role = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_name + policy_arn = aws_iam_policy.issue.arn + depends_on = [module.eks_blueprints_addons, time_sleep.blueprints_addons_sleep] +} + +resource "null_resource" "detach_existing_policy" { + triggers = { + role_name = 
module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_name, + always_run = timestamp() + } + + provisioner "local-exec" { + command = "aws iam detach-role-policy --role-name ${self.triggers.role_name} --policy-arn ${module.eks_blueprints_addons.aws_load_balancer_controller.iam_policy_arn}" + when = create + } + + depends_on = [aws_iam_role_policy_attachment.issue_policy_attachment] +} + +resource "null_resource" "kustomize_app" { + triggers = { + always_run = timestamp() + } + + provisioner "local-exec" { + command = "kubectl apply -k ~/environment/eks-workshop/modules/troubleshooting/alb/creating-alb" + when = create + } + + depends_on = [aws_iam_role_policy_attachment.issue_policy_attachment] +} + + + +# Example to now how to get variables from add ons outputs DO-NOT-DELETE; AddOns and helms documentaitons does not show exactly the output variables returned +#resource "null_resource" "blue_print_output" { +# for_each = module.eks_blueprints_addons.aws_load_balancer_controller +# triggers = { +# +# timestamp = timestamp() +# } +# +# #count = length(module.eks_blueprints_addons.aws_load_balancer_controller) +# provisioner "local-exec" { +# command = "mkdir -p /eks-workshop/logs; echo \" key: ${each.key} Value:${each.value}\" >> /eks-workshop/logs/action-load-balancer-output.log" +# } +# +# depends_on = [module.eks_blueprints_addons,time_sleep.blueprints_addons_sleep] +#} + +#option to run a bash script file +#resource "null_resource" "break2" { +# provisioner "local-exec" { +# command = "${path.module}/template/break.sh ${path.module} mod2" +# } +# +# triggers = { +# always_run = timestamp() +# } +# depends_on = [module.eks_blueprints_addons,time_sleep.blueprints_addons_sleep] +#} + +#option to run a kubectl manifest +#resource "kubectl_manifest" "alb" { +# yaml_body = templatefile("${path.module}/template/ingress.yaml", { +# +# }) +# +# depends_on = [null_resource.break_policy] +#} + + diff --git a/manifests/modules/troubleshooting/alb/.workshop/terraform/outputs.tf b/manifests/modules/troubleshooting/alb/.workshop/terraform/outputs.tf new file mode 100644 index 000000000..8669df9bd --- /dev/null +++ b/manifests/modules/troubleshooting/alb/.workshop/terraform/outputs.tf @@ -0,0 +1,13 @@ +output "environment_variables" { + description = "Environment variables to be added to the IDE shell" + value = merge({ + VPC_ID = data.aws_vpc.selected.id, + LOAD_BALANCER_CONTROLLER_ROLE_NAME = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_name, + LOAD_BALANCER_CONTROLLER_POLICY_ARN_FIX = module.eks_blueprints_addons.aws_load_balancer_controller.iam_policy_arn, + LOAD_BALANCER_CONTROLLER_POLICY_ARN_ISSUE = aws_iam_policy.issue.arn, + LOAD_BALANCER_CONTROLLER_ROLE_ARN = module.eks_blueprints_addons.aws_load_balancer_controller.iam_role_arn + }, { + for index, id in data.aws_subnets.public.ids : "PUBLIC_SUBNET_${index + 1}" => id + } + ) +} \ No newline at end of file diff --git a/manifests/modules/troubleshooting/alb/.workshop/terraform/template/break.sh b/manifests/modules/troubleshooting/alb/.workshop/terraform/template/break.sh new file mode 100755 index 000000000..02011b4e9 --- /dev/null +++ b/manifests/modules/troubleshooting/alb/.workshop/terraform/template/break.sh @@ -0,0 +1,155 @@ +#!/usr/bin/env bash +#. 
.env + +set -e + +mkdir -p /eks-workshop/logs +log_file=/eks-workshop/logs/action-$(date +%s).log + +exec 2>&1 + +logmessage() { + echo "$@" >&7 + echo "$@" >&1 +} +export -f logmessage + +# Function to get the role name from a role ARN +get_role_name_from_arn() { + local role_arn=$1 + + # Extract the role name from the ARN + role_name=$(logmessage "$role_arn" | awk -F'/' '{print $NF}') + + if [ -n "$role_name" ]; then + logmessage "$role_name" + else + logmessage "Failed to retrieve role name from ARN: $role_arn" + return 1 + fi +} + +# Function to get the Kubernetes role attached to a service account +get_service_account_role() { + local namespace=$1 + local service_account=$2 + + # Get the role ARN associated with the service account + role_arn=$(kubectl get serviceaccount "$service_account" -n "$namespace" -o jsonpath="{.metadata.annotations['eks\.amazonaws\.com\/role-arn']}") + + if [ -n "$role_arn" ]; then + logmessage "Service Account: $service_account" + logmessage "Namespace: $namespace" + logmessage "Role ARN: $role_arn" + get_role_name_from_arn "$role_arn" + return 0 + else + logmessage "Failed to retrieve role for service account '$service_account' in namespace '$namespace'" + return 1 + fi + +} + +# Function to get the first policy ARN attached to a role ARN +get_first_policy_arn_from_role_arn() { + local role_arn=$1 + + # Get the list of policies attached to the role + policy_arn=$(aws iam list-attached-role-policies --role-name "$role_arn" --query 'AttachedPolicies[0].PolicyArn' --output text) + + if [ -n "$policy_arn" ]; then + logmessage "First Policy ARN attached to role '$role_arn':" + logmessage "Policy: $policy_arn" + return 0 + else + logmessage "Failed to retrieve policy ARN for role '$role_arn'" + return 1 + fi +} + +# Function to update the policy with new statement +update_policy_with_new_statement() { + local policy_arn=$1 + local new_statement=$2 + + logmessage "PolicyARN: $policy_arn" + logmessage "Statement: $new_statement" + aws iam create-policy-version --policy-arn $policy_arn --policy-document $new_statement --set-as-default + +} + +# Function to remove an action from a policy statement +remove_action_from_policy_statement() { + local policy_name=$1 + local action_to_remove=$2 + + # Get the current policy document + policy_document=$(aws iam get-policy-version --policy-arn "$policy_arn" --query 'PolicyVersion.Document' --version-id v1 --output json) + + # Remove the specified action from the statements + new_statements=$(logmessage "$policy_document" | jq ".Statement[] | select(.Action[] | contains('$action_to_remove')) | .Action = [.Action[] | select(. != '$action_to_remove')]") + new_policy_document=$(logmessage '{"Version": "2012-10-17", "Statement": '"$new_statements"'}') ++ + # Update the policy with the modified document + logmessage "Policy Document" + logmessage $new_policy_document + #aws iam create-policy-version --policy-arn "$policy_arn" --policy-document "$new_policy_document" --set-as-default + + if [ $? -eq 0 ]; then + logmessage "Action removed from policy statement successfully." + return 0 + else + logmessage "Failed to remove action from policy statement." 
+ return 1 + fi +} + +# Function to remove tags from subnets ids +remove_tags_from_subnets() { + local tag_key="Key=kubernetes.io/role/elb,Value=1" + + logmessage "retrive subnets ids with tag key assigned to specific vpc_id via aws cli" + logmessage "getting public subnets from VPC: $vpc_id " + + + subnets_vpc=$(aws ec2 describe-subnets --filters "Name=vpc-id,Values=$vpc_id" --query 'Subnets[*].SubnetId' --output text) + logmessage "subnets_vpc: $subnets_vpc" + + +#remove tag from subnets with AWS cli + for subnet_id in $subnets_vpc; do + logmessage "public subnets: $subnet_id" + aws ec2 delete-tags --resources "$subnet_id" --tags "Key=$tag_key" || logmessage "Failed to remove tag from subnet $subnet_id" + done + return 0 +} + +# Getting the service role +path_tofile=$1 +mode=$2 +vpc_id=$3 +public_subnets=$4 +namespace="kube-system" +service_account="aws-load-balancer-controller-sa" +#new_statement="file://$path_tofile/template/iam_policy_incorrect.json" +new_statement="file://$path_tofile/template/other_issue.json" + +logmessage "path_sent: $path_tofile" + + +# validate if mode is equal to mod1 +logmessage "mode: $mode" +if [ "$mode" == "mod1" ]; then + logmessage "Removing subnet tags" + remove_tags_from_subnets +else + logmessage "Removing permissions" + get_service_account_role "$namespace" "$service_account" + get_first_policy_arn_from_role_arn "$role_name" + update_policy_with_new_statement "$policy_arn" "$new_statement" + +fi + + + + diff --git a/manifests/modules/troubleshooting/alb/.workshop/terraform/template/other_issue.json b/manifests/modules/troubleshooting/alb/.workshop/terraform/template/other_issue.json new file mode 100644 index 000000000..4503e4e9b --- /dev/null +++ b/manifests/modules/troubleshooting/alb/.workshop/terraform/template/other_issue.json @@ -0,0 +1,207 @@ +{ + "Version": "2012-10-17", + "Statement": [ + { + "Effect": "Allow", + "Action": ["iam:CreateServiceLinkedRole"], + "Resource": "*", + "Condition": { + "StringEquals": { + "iam:AWSServiceName": "elasticloadbalancing.amazonaws.com" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ec2:DescribeAccountAttributes", + "ec2:DescribeAddresses", + "ec2:DescribeAvailabilityZones", + "ec2:DescribeInternetGateways", + "ec2:DescribeVpcs", + "ec2:DescribeVpcPeeringConnections", + "ec2:DescribeSubnets", + "ec2:DescribeSecurityGroups", + "ec2:DescribeInstances", + "ec2:DescribeNetworkInterfaces", + "ec2:DescribeTags", + "ec2:GetCoipPoolUsage", + "ec2:DescribeCoipPools", + "elasticloadbalancing:DescribeLoadBalancers", + "elasticloadbalancing:DescribeLoadBalancerAttributes", + "elasticloadbalancing:DescribeListeners", + "elasticloadbalancing:DescribeListenerCertificates", + "elasticloadbalancing:DescribeSSLPolicies", + "elasticloadbalancing:DescribeRules", + "elasticloadbalancing:DescribeTargetGroups", + "elasticloadbalancing:DescribeTargetGroupAttributes", + "elasticloadbalancing:DescribeTargetHealth", + "elasticloadbalancing:DescribeTags" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "cognito-idp:DescribeUserPoolClient", + "acm:ListCertificates", + "acm:DescribeCertificate", + "iam:ListServerCertificates", + "iam:GetServerCertificate", + "waf-regional:GetWebACL", + "waf-regional:GetWebACLForResource", + "waf-regional:AssociateWebACL", + "waf-regional:DisassociateWebACL", + "wafv2:GetWebACL", + "wafv2:GetWebACLForResource", + "wafv2:AssociateWebACL", + "wafv2:DisassociateWebACL", + "shield:GetSubscriptionState", + "shield:DescribeProtection", + "shield:CreateProtection", + 
"shield:DeleteProtection" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": ["ec2:CreateSecurityGroup"], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": ["ec2:CreateTags"], + "Resource": "arn:aws:ec2:*:*:security-group/*", + "Condition": { + "StringEquals": { + "ec2:CreateAction": "CreateSecurityGroup" + }, + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": ["ec2:CreateTags", "ec2:DeleteTags"], + "Resource": "arn:aws:ec2:*:*:security-group/*", + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "true", + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "ec2:AuthorizeSecurityGroupIngress", + "ec2:RevokeSecurityGroupIngress", + "ec2:DeleteSecurityGroup" + ], + "Resource": "*", + "Condition": { + "Null": { + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": ["elasticloadbalancing:CreateTargetGroup"], + "Resource": "*", + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:CreateListener", + "elasticloadbalancing:DeleteListener", + "elasticloadbalancing:CreateRule", + "elasticloadbalancing:DeleteRule" + ], + "Resource": "*" + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:AddTags", + "elasticloadbalancing:RemoveTags" + ], + "Resource": [ + "arn:aws:elasticloadbalancing:*:*:targetgroup/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/net/*/*", + "arn:aws:elasticloadbalancing:*:*:loadbalancer/app/*/*" + ], + "Condition": { + "Null": { + "aws:RequestTag/elbv2.k8s.aws/cluster": "true", + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:AddTags", + "elasticloadbalancing:RemoveTags" + ], + "Resource": [ + "arn:aws:elasticloadbalancing:*:*:listener/net/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener/app/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener-rule/net/*/*/*", + "arn:aws:elasticloadbalancing:*:*:listener-rule/app/*/*/*" + ] + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:ModifyLoadBalancerAttributes", + "elasticloadbalancing:SetIpAddressType", + "elasticloadbalancing:SetSecurityGroups", + "elasticloadbalancing:SetSubnets", + "elasticloadbalancing:DeleteLoadBalancer", + "elasticloadbalancing:ModifyTargetGroup", + "elasticloadbalancing:ModifyTargetGroupAttributes", + "elasticloadbalancing:DeleteTargetGroup" + ], + "Resource": "*", + "Condition": { + "Null": { + "aws:ResourceTag/elbv2.k8s.aws/cluster": "false" + } + } + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:RegisterTargets", + "elasticloadbalancing:DeregisterTargets" + ], + "Resource": "arn:aws:elasticloadbalancing:*:*:targetgroup/*/*" + }, + { + "Effect": "Allow", + "Action": [ + "elasticloadbalancing:SetWebAcl", + "elasticloadbalancing:ModifyListener", + "elasticloadbalancing:AddListenerCertificates", + "elasticloadbalancing:RemoveListenerCertificates", + "elasticloadbalancing:ModifyRule" + ], + "Resource": "*" + } + ] +} diff --git a/manifests/modules/troubleshooting/alb/.workshop/terraform/vars.tf b/manifests/modules/troubleshooting/alb/.workshop/terraform/vars.tf new file mode 100644 index 000000000..812087dc5 --- /dev/null +++ 
b/manifests/modules/troubleshooting/alb/.workshop/terraform/vars.tf @@ -0,0 +1,35 @@ +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_id" { + description = "EKS cluster name" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "eks_cluster_version" { + description = "EKS cluster version" + type = string +} + +# tflint-ignore: terraform_unused_declarations +variable "cluster_security_group_id" { + description = "EKS cluster security group ID" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "addon_context" { + description = "Addon context that can be passed directly to blueprints addon modules" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "tags" { + description = "Tags to apply to AWS resources" + type = any +} + +# tflint-ignore: terraform_unused_declarations +variable "resources_precreated" { + description = "Have expensive resources been created already" + type = bool +} diff --git a/manifests/modules/troubleshooting/alb/creating-alb/fix_ingress/ingress.yaml b/manifests/modules/troubleshooting/alb/creating-alb/fix_ingress/ingress.yaml new file mode 100644 index 000000000..521dbf7c1 --- /dev/null +++ b/manifests/modules/troubleshooting/alb/creating-alb/fix_ingress/ingress.yaml @@ -0,0 +1,21 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: ui + namespace: ui + annotations: + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/healthcheck-path: /actuator/health/liveness +spec: + ingressClassName: alb + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: ui + port: + number: 80 diff --git a/manifests/modules/troubleshooting/alb/creating-alb/fix_ingress/kustomization.yaml b/manifests/modules/troubleshooting/alb/creating-alb/fix_ingress/kustomization.yaml new file mode 100644 index 000000000..972f3ed06 --- /dev/null +++ b/manifests/modules/troubleshooting/alb/creating-alb/fix_ingress/kustomization.yaml @@ -0,0 +1,4 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ingress.yaml diff --git a/manifests/modules/troubleshooting/alb/creating-alb/fix_ui/kustomization.yaml b/manifests/modules/troubleshooting/alb/creating-alb/fix_ui/kustomization.yaml new file mode 100644 index 000000000..5400397eb --- /dev/null +++ b/manifests/modules/troubleshooting/alb/creating-alb/fix_ui/kustomization.yaml @@ -0,0 +1,6 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../../../../../base-application/ui +patches: + - path: service.yaml diff --git a/manifests/modules/troubleshooting/alb/creating-alb/fix_ui/service.yaml b/manifests/modules/troubleshooting/alb/creating-alb/fix_ui/service.yaml new file mode 100644 index 000000000..8a78041ce --- /dev/null +++ b/manifests/modules/troubleshooting/alb/creating-alb/fix_ui/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: ui + labels: + helm.sh/chart: ui-0.0.1 + app.kubernetes.io/name: ui + app.kubernetes.io/instance: ui + app.kubernetes.io/component: service + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/created-by: eks-workshop +spec: + selector: + app.kubernetes.io/name: ui + app.kubernetes.io/instance: ui + app.kubernetes.io/component: service diff --git a/manifests/modules/troubleshooting/alb/creating-alb/ingress.yaml b/manifests/modules/troubleshooting/alb/creating-alb/ingress.yaml new file mode 100644 index 000000000..7f4d06f31 --- 
/dev/null +++ b/manifests/modules/troubleshooting/alb/creating-alb/ingress.yaml @@ -0,0 +1,21 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: ui + namespace: ui + annotations: + alb.ingress.kubernetes.io/scheme: internet-facing + alb.ingress.kubernetes.io/target-type: ip + alb.ingress.kubernetes.io/healthcheck-path: /actuator/health/liveness +spec: + ingressClassName: alb + rules: + - http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: service-ui + port: + number: 80 diff --git a/manifests/modules/troubleshooting/alb/creating-alb/kustomization.yaml b/manifests/modules/troubleshooting/alb/creating-alb/kustomization.yaml new file mode 100644 index 000000000..439dd9a06 --- /dev/null +++ b/manifests/modules/troubleshooting/alb/creating-alb/kustomization.yaml @@ -0,0 +1,7 @@ +apiVersion: kustomize.config.k8s.io/v1beta1 +kind: Kustomization +resources: + - ../../../../base-application/ui + - ingress.yaml +patches: + - path: service.yaml diff --git a/manifests/modules/troubleshooting/alb/creating-alb/service.yaml b/manifests/modules/troubleshooting/alb/creating-alb/service.yaml new file mode 100644 index 000000000..5d02e9441 --- /dev/null +++ b/manifests/modules/troubleshooting/alb/creating-alb/service.yaml @@ -0,0 +1,16 @@ +apiVersion: v1 +kind: Service +metadata: + name: ui + labels: + helm.sh/chart: ui-0.0.1 + app.kubernetes.io/name: ui + app.kubernetes.io/instance: ui + app.kubernetes.io/component: service + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/created-by: eks-workshop +spec: + selector: + app.kubernetes.io/name: ui-app + app.kubernetes.io/instance: ui + app.kubernetes.io/component: service diff --git a/website/docs/troubleshooting/_category_.json b/website/docs/troubleshooting/_category_.json new file mode 100644 index 000000000..0d4475828 --- /dev/null +++ b/website/docs/troubleshooting/_category_.json @@ -0,0 +1,3 @@ +{ + "collapsed": false +} diff --git a/website/docs/troubleshooting/alb/alb_fix_1.md b/website/docs/troubleshooting/alb/alb_fix_1.md new file mode 100644 index 000000000..6b663f183 --- /dev/null +++ b/website/docs/troubleshooting/alb/alb_fix_1.md @@ -0,0 +1,166 @@ +--- +title: "Section 1 - Fixing Tag Issue" +sidebar_position: 30 +--- + +The task for you in this troubleshooting scenario is to investigate the deployment for AWS Load Balancer Controller as well as the ingress object created by following the prompts with the script. At the end of this session, you should be able to see the ui app on your EKS cluster using ALB ingress through the browsers as depicted in the image. + +![ingress](./assets/ingress.webp) + +## Let's start the troubleshooting + +### Step 1 + +First, we need to verify the status of our pods and get ingress for ingress object creation. To do so, we will use `kubectl` tool. + +```bash +$ kubectl get pod -n ui +NAME READY STATUS RESTARTS AGE +ui-68495c748c-jkh2z 1/1 Running 0 85s +``` + +### Step 2 + +In _Step 1_, we checked the pods status for our application and aws-load-balancer-controller. The _aws-load-balancer-controller_ deployment is responsible for ALB creation for any ingress objects applied to the cluster. + +Upon looking for ingress object, did you observe any ALB DNS name to access your application with the ingress object? You can also verify ALB creation in the AWS Management Console. In a successful installation scenario, the ingress object should have an ALB DNS name shown like the example below. 
However, in this case the ADDRESS column, where the ALB DNS name should be populated, is empty.
+
+```bash
+$ kubectl get ingress/ui -n ui
+NAME   CLASS   HOSTS   ADDRESS   PORTS   AGE
+ui     alb     *                 80      105s
+
+#---This is the expected output when the ingress was deployed correctly--
+NAME           CLASS   HOSTS   ADDRESS                                                                PORTS   AGE
+ingress-2048           *       k8s-ui-ingress2-xxxxxxxxxx-yyyyyyyyyy.region-code.elb.amazonaws.com    80      2m32s
+```
+
+### Step 3
+
+Check the ingress for any events indicating why we do not see the ALB DNS name. You can retrieve them by running the following command. The event logs should point you towards the issue with the ingress creation.
+
+```bash
+$ kubectl describe ingress/ui -n ui
+Name:             ui
+Labels:
+Namespace:        ui
+Address:
+Ingress Class:    alb
+Default backend:
+Rules:
+  Host        Path  Backends
+  ----        ----  --------
+  *
+              /   service-ui:80 ()
+Annotations:  alb.ingress.kubernetes.io/healthcheck-path: /actuator/health/liveness
+              alb.ingress.kubernetes.io/scheme: internet-facing
+              alb.ingress.kubernetes.io/target-type: ip
+Events:
+  Type     Reason            Age                    From     Message
+  ----     ------            ----                   ----     -------
+  Warning  FailedBuildModel  2m23s (x16 over 5m9s)  ingress  Failed build model due to couldn't auto-discover subnets: unable to resolve at least one subnet (0 match VPC and tags: [kubernetes.io/role/elb])
+
+```
+
+Refer to the documentation on the prerequisites for setting up [ALB with EKS](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.4/deploy/subnet_discovery/).
+
+### Step 4
+
+_Step 3_ points to an issue with subnet auto-discovery for the load balancer controller deployment. Ensure that all the public subnets carry the tag `kubernetes.io/role/elb` with the value `1`.
+
+:::info
+Keep in mind that a public subnet is one whose route table has a route to an Internet Gateway, allowing traffic to and from the internet.
+:::
+
+**1** To find all the cluster subnets from the command line, filter on the following tag: "Key: `alpha.eksctl.io/cluster-name` Value: `${EKS_CLUSTER_NAME}`". There should be six subnets. **Note:** _For your convenience, the cluster name is available in the `$EKS_CLUSTER_NAME` environment variable._
+
+```bash
+$ aws ec2 describe-subnets --filters "Name=tag:alpha.eksctl.io/cluster-name,Values=${EKS_CLUSTER_NAME}" --query 'Subnets[].SubnetId[]'
+[
+  "subnet-xxxxxxxxxxxxxxxxx",
+  "subnet-xxxxxxxxxxxxxxxxx",
+  "subnet-xxxxxxxxxxxxxxxxx",
+  "subnet-xxxxxxxxxxxxxxxxx",
+  "subnet-xxxxxxxxxxxxxxxxx",
+  "subnet-xxxxxxxxxxxxxxxxx"
+]
+```
+
+**2** Then identify which subnets are public by adding each subnet ID to the route tables CLI filter, one at a time: `--filters 'Name=association.subnet-id,Values=subnet-xxxxxxxxxxxxxxxxx'`.
+
+```text
+aws ec2 describe-route-tables --filters 'Name=association.subnet-id,Values=' --query 'RouteTables[].Routes[].[DestinationCidrBlock,GatewayId]'
+
+```
+
+Here is a script that will iterate over the list of subnets for you:
+
+```bash
+$ for subnet_id in $(aws ec2 describe-subnets --filters "Name=tag:alpha.eksctl.io/cluster-name,Values=${EKS_CLUSTER_NAME}" --query 'Subnets[].SubnetId[]' --output text); do echo "Subnet: ${subnet_id}"; aws ec2 describe-route-tables --filters "Name=association.subnet-id,Values=${subnet_id}" --query 'RouteTables[].Routes[].[DestinationCidrBlock,GatewayId]'; done
+```
+
+If the output shows a `0.0.0.0/0` route to an Internet Gateway ID, it is a public subnet. See the example below.
+ +```text +aws ec2 describe-route-tables --filters "Name=association.subnet-id,Values=subnet-xxxxxxxxxxxxx0470" --query 'RouteTables[].Routes[].[DestinationCidrBlock,GatewayId]' +[ + [ + "10.42.0.0/16", + "local" + ], + [ + "0.0.0.0/0", + "igw-xxxxxxxxxxxxxxxxx" + ] +] +``` + +**3** Once you have all the public subnet ID's, describe subnets with the appropriate tag and confirm that the public subnet ID's that you identified are missing. In our case, none of our subnets have the correct tags. + +```bash +$ aws ec2 describe-subnets --filters 'Name=tag:kubernetes.io/role/elb,Values=1' --query 'Subnets[].SubnetId' +[] +``` + +**4** Then add the correct tags. To help you a little bit, we have added the 3 public subnets to the `env` variables with the names `PUBLIC_SUBNET_1, PUBLIC_SUBNET_2 and PUBLIC_SUBNET_3` + +```text +aws ec2 create-tags --resources subnet-xxxxxxxxxxxxxxxxx subnet-xxxxxxxxxxxxxxxxx subnet-xxxxxxxxxxxxxxxxx --tags 'Key="kubernetes.io/role/elb",Value=1' + +``` + +```bash +$ aws ec2 create-tags --resources $PUBLIC_SUBNET_1 $PUBLIC_SUBNET_2 $PUBLIC_SUBNET_3 --tags 'Key="kubernetes.io/role/elb",Value=1' +``` + +**5** Confirm the tags are created. You should see the public subnet ID's populated following the command below. + +```bash +$ aws ec2 describe-subnets --filters 'Name=tag:kubernetes.io/role/elb,Values=1' --query 'Subnets[].SubnetId' +[ + "subnet-xxxxxxxxxxxxxxxxx", + "subnet-xxxxxxxxxxxxxxxxx", + "subnet-xxxxxxxxxxxxxxxxx" +] +``` + +**6** Now restart the controller deployment using the kubectl rollout restart command: + +```bash timeout=180 +$ kubectl -n kube-system rollout restart deploy aws-load-balancer-controller +deployment.apps/aws-load-balancer-controller restarted +``` + +**7** Now, check again the ingress deployment: + +```bash expectError=true timeout=180 hook=fix-1 hookTimeout=600 +$ kubectl describe ingress/ui -n ui + Warning FailedDeployModel 68s ingress Failed deploy model due to AccessDenied: User: arn:aws:sts::xxxxxxxxxxxx:assumed-role/alb-controller-20240611131524228000000002/1718115201989397805 is not authorized to perform: elasticloadbalancing:CreateLoadBalancer on resource: arn:aws:elasticloadbalancing:us-west-2:xxxxxxxxxxxx:loadbalancer/app/k8s-ui-ui-5ddc3ba496/* because no identity-based policy allows the elasticloadbalancing:CreateLoadBalancer action + status code: 403, request id: b862fb9c-480b-44b5-ba6f-426a3884b6b6 + Warning FailedDeployModel 26s (x5 over 66s) ingress (combined from similar events): Failed deploy model due to AccessDenied: User: arn:aws:sts::xxxxxxxxxxxx:assumed-role/alb-controller-20240611131524228000000002/1718115201989397805 is not authorized to perform: elasticloadbalancing:CreateLoadBalancer on resource: arn:aws:elasticloadbalancing:us-west-2:xxxxxxxxxxxx:loadbalancer/app/k8s-ui-ui-5ddc3ba496/* because no identity-based policy allows the elasticloadbalancing:CreateLoadBalancer action + status code: 403, request id: 197cf2f7-2f68-44f2-92ae-ff5b36cb150f +``` + +:::tip +In AWS generally for creation/deletion/update of any resource, you will observe a corresponding API call which are recorded in CloudTrail. Look for any CloudTrail events for CreateLoadBalancer API calls. Do you observe any such calls in the last 1 hour of this lab setup? 
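+
+For example, here is one way you could check this from the command line. This is only a sketch: it assumes GNU `date` (as in the workshop IDE) and uses `aws cloudtrail lookup-events`, which searches recent management events and can take a moment to return.
+
+```bash
+$ aws cloudtrail lookup-events \
+    --lookup-attributes AttributeKey=EventName,AttributeValue=CreateLoadBalancer \
+    --start-time $(date -u -d '1 hour ago' '+%Y-%m-%dT%H:%M:%SZ') \
+    --query 'Events[].[EventTime,Username,EventName]' --output table
+```
+
+If any matching events exist, the full event record also carries an `errorCode` field when the call was denied, so this is a quick way to distinguish "never attempted" from "attempted but not authorized".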
+:::
diff --git a/website/docs/troubleshooting/alb/alb_fix_5.md b/website/docs/troubleshooting/alb/alb_fix_5.md
new file mode 100644
index 000000000..bb2536fea
--- /dev/null
+++ b/website/docs/troubleshooting/alb/alb_fix_5.md
@@ -0,0 +1,93 @@
+---
+title: "Section 2 - Fixing Policy Issue"
+sidebar_position: 31
+---
+
+In this section we troubleshoot the IAM policy issue that is preventing the AWS Load Balancer Controller from creating the ALB. It provides step-by-step instructions and relevant configuration examples to help resolve this problem.
+
+### Step 5
+
+With this setup we are leveraging IAM Roles for Service Accounts (IRSA), which allows pods to assume IAM roles through Kubernetes service accounts and the OIDC provider associated with your EKS cluster. Locate the service account that the load balancer controller is using and find the IAM role associated with it, to identify the IAM entity that makes the API calls to provision your load balancer.
+Try running:
+
+```bash
+$ kubectl get serviceaccounts -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller -o yaml
+```
+
+```yaml {8}
+apiVersion: v1
+items:
+  - apiVersion: v1
+    automountServiceAccountToken: true
+    kind: ServiceAccount
+    metadata:
+      annotations:
+        eks.amazonaws.com/role-arn: arn:aws:iam::xxxxxxxxxxxx:role/alb-controller-20240611131524228000000002
+        meta.helm.sh/release-name: aws-load-balancer-controller
+        meta.helm.sh/release-namespace: kube-system
+      creationTimestamp: "2024-06-11T13:15:32Z"
+      labels:
+        app.kubernetes.io/instance: aws-load-balancer-controller
+        app.kubernetes.io/managed-by: Helm
+        app.kubernetes.io/name: aws-load-balancer-controller
+        app.kubernetes.io/version: v2.7.1
+        helm.sh/chart: aws-load-balancer-controller-1.7.1
+      name: aws-load-balancer-controller-sa
+      namespace: kube-system
+      resourceVersion: "4950707"
+      uid: 6d842045-f2b4-4406-869b-f2addc67ff4d
+kind: List
+metadata:
+  resourceVersion: ""
+```
+
+:::tip
+Can you verify if there is a call in your CloudTrail events made by the IAM role listed in the output of the above command? If not, take a look at the logs from your controller.
+:::
+
+### Step 6
+
+You can check the logs from the controller pods for additional details on what is preventing the load balancer from being created. Let's check the logs using the command below.
+
+```bash
+$ kubectl logs -n kube-system -l app.kubernetes.io/name=aws-load-balancer-controller
+```
+
+For example, the output may show something similar to the following:
+
+```text
+{"level":"error","ts":"2024-06-11T14:24:24Z","msg":"Reconciler error","controller":"ingress","object":{"name":"ui","namespace":"ui"},"namespace":"ui","name":"ui","reconcileID":"49d27bbb-96e5-43b4-b115-b7a07e757148","error":"AccessDenied: User: arn:aws:sts::xxxxxxxxxxxx:assumed-role/alb-controller-20240611131524228000000002/1718115201989397805 is not authorized to perform: elasticloadbalancing:CreateLoadBalancer on resource: arn:aws:elasticloadbalancing:us-west-2:xxxxxxxxxxxx:loadbalancer/app/k8s-ui-ui-5ddc3ba496/* because no identity-based policy allows the elasticloadbalancing:CreateLoadBalancer action\n\tstatus code: 403, request id: a24a1620-3a75-46b7-b3c3-9c80fada159e"}
+```
+
+As you can see, the error indicates the IAM role is missing a required permission, in this case `elasticloadbalancing:CreateLoadBalancer`, the permission to create the load balancer.
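+
+Before jumping to the fix, you can confirm what the role is actually allowed to do. The sketch below is one way to do that: it lists the policies attached to the controller's IAM role and checks each default policy version for the missing action. It assumes the `LOAD_BALANCER_CONTROLLER_ROLE_NAME` environment variable exported by this lab; substitute the role name from the service account annotation if you are running it elsewhere.
+
+```bash
+$ for policy_arn in $(aws iam list-attached-role-policies \
+      --role-name ${LOAD_BALANCER_CONTROLLER_ROLE_NAME} \
+      --query 'AttachedPolicies[].PolicyArn' --output text); do
+    # Policies are versioned; inspect the version currently in effect
+    version=$(aws iam get-policy --policy-arn $policy_arn \
+      --query 'Policy.DefaultVersionId' --output text)
+    echo "--- $policy_arn ($version)"
+    # Naive text check: wildcard actions such as elasticloadbalancing:* would not be caught
+    aws iam get-policy-version --policy-arn $policy_arn --version-id $version \
+      --query 'PolicyVersion.Document.Statement[].Action' --output json \
+      | grep -q CreateLoadBalancer \
+      && echo "allows elasticloadbalancing:CreateLoadBalancer" \
+      || echo "does NOT allow elasticloadbalancing:CreateLoadBalancer"
+  done
+```
+
+If you cross-check with the Terraform for this module, you will see it attaches an intentionally incomplete `issue` policy to the role and detaches the original one, which is what this output reflects.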
+ +:::tip +Verify the correct permissions required by the IAM role in the documentations here [[1]](https://kubernetes-sigs.github.io/aws-load-balancer-controller/v2.4/deploy/installation/#setup-iam-manually) where you can find the latest IAM permissions json file required for the LB Controller. After the changes, you have to wait a few minutes for the changes to reflect, since IAM uses an eventual consistency model. To make the changes, locate the IAM role through the AWS console and add the missing permissions that are shown in the log. In this case CreateLoadBalancer is missing. +::: + +Now let's fix it. To avoid conflicts with the automation of the workshop, we have already provisioned the correct permissions into the account and added the environment variable `LOAD_BALANCER_CONTROLLER_ROLE_NAME` that contains the role name and `LOAD_BALANCER_CONTROLLER_POLICY_ARN_FIX` which contains the correct IAM policy arn, and `LOAD_BALANCER_CONTROLLER_POLICY_ARN_ISSUE` that contains the incorrect IAM policy arn. + +So, to fix it we will just need to attach the correct IAM policy, as follows: + +```bash +$ aws iam attach-role-policy --role-name ${LOAD_BALANCER_CONTROLLER_ROLE_NAME} --policy-arn ${LOAD_BALANCER_CONTROLLER_POLICY_ARN_FIX} +``` + +and detach the incorrect IAM policy from the role: + +```bash +$ aws iam detach-role-policy --role-name ${LOAD_BALANCER_CONTROLLER_ROLE_NAME} --policy-arn ${LOAD_BALANCER_CONTROLLER_POLICY_ARN_ISSUE} +``` + +Try accessing the new Ingress URL in the browser as before to check if you can access the UI app: + +```bash timeout=180 hook=fix-5 hookTimeout=600 +$ kubectl get ingress -n ui ui -o jsonpath="{.status.loadBalancer.ingress[*].hostname}{'\n'}" +k8s-ui-ui-5ddc3ba496-1208241872.us-west-2.elb.amazonaws.com +``` + +:::tip +It can take a couple of minutes for the Load Balancer to be available once created. +::: + +Also, feel free to go to CloudTrail again and verify the API call for CreateLoadBalancer is there. diff --git a/website/docs/troubleshooting/alb/alb_fix_7.md b/website/docs/troubleshooting/alb/alb_fix_7.md new file mode 100644 index 000000000..21ce2198f --- /dev/null +++ b/website/docs/troubleshooting/alb/alb_fix_7.md @@ -0,0 +1,239 @@ +--- +title: "Section 3 - Fixing Manifest Configs" +sidebar_position: 32 +--- + +We are almost done, now let's troubleshoot a scenario where the ALB is not properly registering the Kubernetes service endpoints. Again, it offers detailed guidance and configuration samples to assist in identifying and fixing this type of issue. + +### Step 7 + +Even though the ingress creation succeeded, when you try accessing the app in browser there is an error stating, "Backend service does not exist". + +![ALb-Backend-DoesNotExist](./assets/alb-does-not-exist.webp) + +Since ingress is created, that would mean that there is an issue with communication from the Kubernetes ingress to the service. Check the deployment and service using: + +```bash +$ kubectl -n ui get service/ui -o yaml +``` + +```yaml {27} +apiVersion: v1 +kind: Service +metadata: + annotations: + ... + labels: + app.kubernetes.io/component: service + app.kubernetes.io/created-by: eks-workshop + app.kubernetes.io/instance: ui + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: ui + helm.sh/chart: ui-0.0.1 + name: ui + namespace: ui + resourceVersion: "4950875" + uid: dc832144-b2a1-41cd-b7a1-8979111da677 +spec: + ... 
+  ports:
+  - name: http
+    port: 80
+    protocol: TCP
+    targetPort: http
+  selector:
+    app.kubernetes.io/component: service
+    app.kubernetes.io/instance: ui
+    app.kubernetes.io/name: ui-app
+  sessionAffinity: None
+  type: ClusterIP
+status:
+  loadBalancer: {}
+```
+
+Now check the ingress configuration:
+
+```bash
+$ kubectl get ingress/ui -n ui -o yaml
+```
+
+```yaml {23}
+apiVersion: networking.k8s.io/v1
+kind: Ingress
+metadata:
+  annotations:
+    alb.ingress.kubernetes.io/healthcheck-path: /actuator/health/liveness
+    alb.ingress.kubernetes.io/scheme: internet-facing
+    alb.ingress.kubernetes.io/target-type: ip
+    ...
+  finalizers:
+    - ingress.k8s.aws/resources
+  generation: 1
+  name: ui
+  namespace: ui
+  resourceVersion: "4950883"
+  uid: 327b899c-405e-431b-8d67-32578435f0b9
+spec:
+  ingressClassName: alb
+  rules:
+    - http:
+        paths:
+          - backend:
+              service:
+                name: service-ui
+                port:
+                  number: 80
+            path: /
+            pathType: Prefix
+...
+```
+
+From the outputs, observe that the ingress spec points to a service named `service-ui`, while the service it should reference is actually named `ui`.
+
+We will need to edit the ingress spec to point to the correct service name. Apply the command below, which contains the fix:
+
+```bash
+$ kubectl apply -k ~/environment/eks-workshop/modules/troubleshooting/alb/creating-alb/fix_ingress
+```
+
+The spec should now look like this:
+
+```yaml {10}
+spec:
+  ingressClassName: alb
+  rules:
+    - http:
+        paths:
+          - path: /
+            pathType: Prefix
+            backend:
+              service:
+                name: ui
+                port:
+                  number: 80
+```
+
+Try accessing the ALB again using the domain name from the get ingress output and check whether you can access the app now.
+
+### Step 8
+
+Now we observe a 503 error when accessing the ALB:
+
+![ALb-503-ERROR](./assets/alb-503.webp)
+
+A 503 suggests a server-side issue, specifically that the service is unavailable. But we confirmed the service was present on the cluster when we ran the get service command in _Step 7_.
+
+In Kubernetes, a service is just a construct to expose deployments either externally or within the cluster. Services rely on selectors to send traffic to the correct backend pods. To verify that our service is pointing to the correct pods, check the endpoints that Kubernetes populates automatically for every service whose selector matches running pods. Run the following command:
+
+```bash
+$ kubectl -n ui get endpoints ui
+NAME   ENDPOINTS   AGE
+ui                 13d
+```
+
+The ENDPOINTS column above should list the IPs of the app pods running in the _ui_ namespace, but it is empty. Can you identify whether the selectors are set up correctly in the service?
+
+### Step 9
+
+Take a look at the deployment spec using the command below and compare the labels applied to the pods with the selector used in your service.
+
+```bash
+$ kubectl -n ui get deploy/ui -o yaml
+```
+
+```yaml {34}
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  annotations:
+    ...
+  name: ui
+  namespace: ui
+  ...
+spec:
+  progressDeadlineSeconds: 600
+  replicas: 1
+  revisionHistoryLimit: 10
+  selector:
+    matchLabels:
+      app.kubernetes.io/component: service
+      app.kubernetes.io/instance: ui
+      app.kubernetes.io/name: ui
+  strategy:
+    rollingUpdate:
+      maxSurge: 25%
+      maxUnavailable: 25%
+    type: RollingUpdate
+  template:
+    metadata:
+      annotations:
+        prometheus.io/path: /actuator/prometheus
+        prometheus.io/port: "8080"
+        prometheus.io/scrape: "true"
+      creationTimestamp: null
+      labels:
+        app.kubernetes.io/component: service
+        app.kubernetes.io/created-by: eks-workshop
+        app.kubernetes.io/instance: ui
+        app.kubernetes.io/name: ui
+    spec:
+      containers:
+...
+ +``` + +And + +```bash +$ kubectl -n ui get svc ui -o yaml +``` + +```yaml {22} +apiVersion: v1 +kind: Service +metadata: + annotations: + ... + labels: + app.kubernetes.io/component: service + app.kubernetes.io/created-by: eks-workshop + app.kubernetes.io/instance: ui + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/name: ui + helm.sh/chart: ui-0.0.1 + name: ui + namespace: ui + resourceVersion: "5000404" + uid: dc832144-b2a1-41cd-b7a1-8979111da677 +spec: + ... + selector: + app.kubernetes.io/component: service + app.kubernetes.io/instance: ui + app.kubernetes.io/name: ui-app + sessionAffinity: None + type: ClusterIP +... +``` + +Notice what the `service/ui` selector is using and what the actual `deployment/ui` labels are. To fix the issue, we need to update the `service/ui` selector `app.kubernetes.io/name: ui-app` to `app.kubernetes.io/name: ui`. + +:::tip +You can either update the service selector with: + +- `kubectl edit service -n ` or +- `kubectl patch service -n --type='json' -p='[{"op": "replace", "path": "/spec/selector", "value": {"key1": "value1", "key2": "value2"}}]'` + +::: + +for your convenience, we have added a kustomize script that update the selector, just execute the following command: + +```bash timeout=180 hook=fix-7 hookTimeout=600 +$ kubectl apply -k ~/environment/eks-workshop/modules/troubleshooting/alb/creating-alb/fix_ui +``` + +Now refresh the browsers and you should see the ui application: + +![ALB-UI-APP](./assets/alb-working.webp) + +### Go ahead and enjoy a break, you’ve earned it diff --git a/website/docs/troubleshooting/alb/alb_fix_wrapping_up.md b/website/docs/troubleshooting/alb/alb_fix_wrapping_up.md new file mode 100644 index 000000000..9dcee5a2a --- /dev/null +++ b/website/docs/troubleshooting/alb/alb_fix_wrapping_up.md @@ -0,0 +1,20 @@ +--- +title: "Wrapping it up" +sidebar_position: 33 +--- + +## Wrapping it up + +Here’s the general flow of how Load Balancer Controller works: + +1. The controller watches for [ingress events](https://kubernetes.io/docs/concepts/services-networking/ingress/#ingress-controllers) from the API server. When it finds ingress resources that satisfy its requirements, it begins the creation of AWS resources. + +2. An [ALB](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/introduction.html) (ELBv2) is created in AWS for the new ingress resource. This ALB can be internet-facing or internal. You can also specify the subnets it's created in using annotations. + +3. [Target Groups](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-target-groups.html) are created in AWS for each unique Kubernetes service described in the ingress resource. + +4. [Listeners](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/load-balancer-listeners.html) are created for every port detailed in your ingress resource annotations. When no port is specified, sensible defaults (80 or 443) are used. Certificates may also be attached via annotations. + +5. [Rules](https://docs.aws.amazon.com/elasticloadbalancing/latest/application/listener-update-rules.html) are created for each path specified in your ingress resource. This ensures traffic to a specific path is routed to the correct Kubernetes Service. 
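+
+If you want to see that chain of resources for yourself, the following sketch walks it with the AWS CLI, starting from the hostname published on the ingress. It is only an illustration: it assumes the `ui` ingress from this module is still deployed and that the ALB has finished provisioning.
+
+```bash
+# 1-2. The ingress status carries the DNS name of the ALB the controller created
+$ ALB_HOSTNAME=$(kubectl get ingress -n ui ui \
+    -o jsonpath='{.status.loadBalancer.ingress[0].hostname}')
+$ ALB_ARN=$(aws elbv2 describe-load-balancers \
+    --query "LoadBalancers[?DNSName=='${ALB_HOSTNAME}'].LoadBalancerArn" --output text)
+
+# 3. Target groups created for the Kubernetes service referenced by the ingress
+$ aws elbv2 describe-target-groups --load-balancer-arn $ALB_ARN \
+    --query 'TargetGroups[].[TargetGroupName,Port,TargetType]' --output table
+
+# 4. Listeners created for the ports in the ingress
+$ LISTENER_ARN=$(aws elbv2 describe-listeners --load-balancer-arn $ALB_ARN \
+    --query 'Listeners[0].ListenerArn' --output text)
+
+# 5. Rules routing each path to the right target group
+$ aws elbv2 describe-rules --listener-arn $LISTENER_ARN \
+    --query 'Rules[].[Priority,Actions[0].Type]' --output table
+```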
+ +--- diff --git a/website/docs/troubleshooting/alb/assets/IntScen-ALBC-1-2.webp b/website/docs/troubleshooting/alb/assets/IntScen-ALBC-1-2.webp new file mode 100644 index 000000000..c7cf4200f Binary files /dev/null and b/website/docs/troubleshooting/alb/assets/IntScen-ALBC-1-2.webp differ diff --git a/website/docs/troubleshooting/alb/assets/alb-503.webp b/website/docs/troubleshooting/alb/assets/alb-503.webp new file mode 100644 index 000000000..05f876830 Binary files /dev/null and b/website/docs/troubleshooting/alb/assets/alb-503.webp differ diff --git a/website/docs/troubleshooting/alb/assets/alb-does-not-exist.webp b/website/docs/troubleshooting/alb/assets/alb-does-not-exist.webp new file mode 100644 index 000000000..3bc725a2e Binary files /dev/null and b/website/docs/troubleshooting/alb/assets/alb-does-not-exist.webp differ diff --git a/website/docs/troubleshooting/alb/assets/alb-working.webp b/website/docs/troubleshooting/alb/assets/alb-working.webp new file mode 100644 index 000000000..1d3bfb1b4 Binary files /dev/null and b/website/docs/troubleshooting/alb/assets/alb-working.webp differ diff --git a/website/docs/troubleshooting/alb/assets/ingress.webp b/website/docs/troubleshooting/alb/assets/ingress.webp new file mode 100644 index 000000000..2ad91baa6 Binary files /dev/null and b/website/docs/troubleshooting/alb/assets/ingress.webp differ diff --git a/website/docs/troubleshooting/alb/index.md b/website/docs/troubleshooting/alb/index.md new file mode 100644 index 000000000..62b95f1c2 --- /dev/null +++ b/website/docs/troubleshooting/alb/index.md @@ -0,0 +1,73 @@ +--- +title: "Load balancer scenario" +sidebar_position: 20 +chapter: true +sidebar_custom_props: { "module": true } +description: "Expose HTTP and HTTPS routes to the outside world using Ingress API on Amazon Elastic Kubernetes Service And introduces an issue to the configuration" +--- + +::required-time + +On this scenario we will learn how to troubleshoot various AWS Load Balancer Controller deployment issues, as well as ingress objects created. If you want to learn more about how a Load balancer controller works please check out the [Fundamentals module](/docs/fundamentals/) + +:::tip Before you start +Prepare your environment for this section: + +```bash timeout=600 wait=300 +$ prepare-environment troubleshooting/alb +``` + +The preparation of the lab might take a couple of minutes and it will make the following changes to your lab environment: + +- Pre-configure the base application from the introduction module +- Configure the AWS Load Balancer Controller in the Amazon EKS cluster +- Configure an ingress to get access to the UI via an AWS Load Balancer +- Introduce an issue to the configuration, so we can learn how to troubleshoot these types of issues + +::: + +You can view the Terraform that applies these changes [here](https://github.com/VAR::MANIFESTS_OWNER/VAR::MANIFESTS_REPOSITORY/tree/VAR::MANIFESTS_REF/manifests/modules/troubleshooting/alb/.workshop/terraform). + +:::info Root Cause Analysis (RCA) Methodology + +While we wait for the scenario to finalize its configuration, lets talk about the _RCA Methodology_ really quick. + +The Root Cause Analysis (RCA) helps in identifying how and why an event or failure happened, allowing for corrective and preventive measures to be put in place and the RCA generally serves as input to a remediation process whereby corrective actions are taken to prevent the problem from reoccurring. + +**_The method steps:_** + +1. Identify and describe the problem clearly. +2. 
Collect data +3. Establish a timeline from the normal situation until the problem occurs. +4. Identify Root Cause +5. Distinguish between the root cause and other causal factors (e.g., using event correlation). +6. Establish a causal graph between the root cause and the problem. +7. Although the word "cause" is singular in RCA, experience shows that generally causes are plural. Therefore, look for multiple causes when carrying out RCA. + +::: + +Now let's verify if the service and ingress is up and running, so we can start troubleshooting the scenario. + +```bash +$ kubectl get svc -n ui +NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE +ui ClusterIP 172.20.224.112 80/TCP 12d +``` + +and + +```bash +$ kubectl get ingress -n ui +NAME CLASS HOSTS ADDRESS PORTS AGE +ui alb * 80 11m + +``` + +Now, do not panic!! the output is expected since it is supposed the ingress/alb shouldn't be created. Let's verify the load balancer was indeed not created: + +```bash +$ aws elbv2 describe-load-balancers --query 'LoadBalancers[?contains(LoadBalancerName, `k8s-ui-ui`) == `true`]' +[] +``` + +If you get the same outputs, it means you are ready to start the troubleshooting. So please, continue with the next page. diff --git a/website/docs/troubleshooting/alb/tests/hook-fix-1.sh b/website/docs/troubleshooting/alb/tests/hook-fix-1.sh new file mode 100644 index 000000000..030f5577b --- /dev/null +++ b/website/docs/troubleshooting/alb/tests/hook-fix-1.sh @@ -0,0 +1,24 @@ +set -Eeuo pipefail + +before() { + echo "noop" +} + +after() { + sleep 20 + + number_of_subnets=$(aws ec2 describe-subnets --filters 'Name=tag:kubernetes.io/role/elb,Values=1' --query 'Subnets[].SubnetId' --output json | jq 'length') + + echo "# of subnets: ${number_of_subnets}" + + output_message=$(kubectl describe ingress/ui -n ui) + + if [[ $output_message == *"Failed deploy model due to AccessDenied"* ]]; then + >&2 echo "text Not found: Failed deploy model due to AccessDenied" + exit 1 + fi + + EXIT_CODE=0 +} + +"$@" diff --git a/website/docs/troubleshooting/alb/tests/hook-fix-5.sh b/website/docs/troubleshooting/alb/tests/hook-fix-5.sh new file mode 100644 index 000000000..90c5f6345 --- /dev/null +++ b/website/docs/troubleshooting/alb/tests/hook-fix-5.sh @@ -0,0 +1,19 @@ +set -Eeuo pipefail + +before() { + echo "noop" +} + +after() { + sleep 120 + + export ui_endpoint=$(kubectl -n kube-system get ingress -n ui ui -o json | jq -r '.status.loadBalancer.ingress[0].hostname') + + if [ -z "$ui_endpoint" ]; then + >&2 echo "Failed to retrieve hostname from Service" + exit 1 + fi + +} + +"$@" diff --git a/website/docs/troubleshooting/alb/tests/hook-fix-7.sh b/website/docs/troubleshooting/alb/tests/hook-fix-7.sh new file mode 100644 index 000000000..e04d20ffb --- /dev/null +++ b/website/docs/troubleshooting/alb/tests/hook-fix-7.sh @@ -0,0 +1,32 @@ +set -Eeuo pipefail + +before() { + echo "noop" +} + +after() { + sleep 120 + + export ui_endpoint=$(kubectl -n kube-system get ingress -n ui ui -o json | jq -r '.status.loadBalancer.ingress[0].hostname') + + if [ -z "$ui_endpoint" ]; then + >&2 echo "Failed to retrieve hostname from Ingress" + exit 1 + fi + + EXIT_CODE=0 + + timeout -s TERM 400 bash -c \ + 'while [[ "$(curl -s -o /dev/null -L -w ''%{http_code}'' ${ui_endpoint}/home)" != "200" ]];\ + do sleep 20;\ + done' || EXIT_CODE=$? 
+ + echo "Timeout completed" + + if [ $EXIT_CODE -ne 0 ]; then + >&2 echo "Ingress did not become available after 400 seconds" + exit 1 + fi +} + +"$@" diff --git a/website/docs/troubleshooting/alb/tests/hook-suite.sh b/website/docs/troubleshooting/alb/tests/hook-suite.sh new file mode 100644 index 000000000..8b5a4baea --- /dev/null +++ b/website/docs/troubleshooting/alb/tests/hook-suite.sh @@ -0,0 +1,11 @@ +set -e + +before() { + echo "noop" +} + +after() { + prepare-environment +} + +"$@" diff --git a/website/docs/troubleshooting/index.md b/website/docs/troubleshooting/index.md new file mode 100644 index 000000000..1a006a9b3 --- /dev/null +++ b/website/docs/troubleshooting/index.md @@ -0,0 +1,30 @@ +--- +title: "Troubleshooting Scenarios" +sidebar_position: 1 +weight: 40 +--- + +Even with careful planning and preparation, unexpected issues can sometimes arise when working with technology or completing complex tasks. This module provides examples of common troubleshooting scenarios to issues reported to AWS support, along with step-by-step guidance on how to diagnose and resolve the problems. + +Keep in mind that we will use previous concepts from the other chapters while going through each scenario. + +### These are the scenarios covered in this module + +- **AWS Load Balancer Controller** +- **Node not ready (Coming soon)** +- others.. + +:::info Troubleshooting Methodologies +As you progress through the scenarios, we will be introducing an overview of different troubleshooting methodologies. For example, all our scenarios are based in the **Reproductions method**. + +#### Reproductions Method + +Systems and applications come in varying sizes and complexities, which means that you cannot always rely on a full-scale reproduction. We recommend starting with a cut-down reproduction, focusing solely on the components involved. There are times where the issue is specific to the environment and there is a combination of factors necessary for it to occur - in this case, you may need a more complex or even full-scale reproduction, but in our experience that is far less common. + +Being able to reproduce an issue allows you: + +- Observe and experiment in a controlled environment, without affecting users of the system. +- Allows your team to hand over the problem to the team responsible for the failing components, so your team can continue to focus on more pressing matters and mitigation. +- Provide reproduction instructions to the designers or builders of that component, they can perform a deep, targeted investigation. 
+ +::: diff --git a/website/docusaurus.config.js b/website/docusaurus.config.js index f712159a3..c1831a3d0 100644 --- a/website/docusaurus.config.js +++ b/website/docusaurus.config.js @@ -165,6 +165,12 @@ const config = { position: "left", label: "AI/ML", }, + { + type: "doc", + docId: "troubleshooting/index", + position: "left", + label: "Troubleshooting", + }, { href: "https://github.com/aws-samples/eks-workshop-v2", position: "right", diff --git a/website/sidebars.js b/website/sidebars.js index 7da64994c..4375d0340 100644 --- a/website/sidebars.js +++ b/website/sidebars.js @@ -22,6 +22,7 @@ const sidebars = { observability: [{ type: "autogenerated", dirName: "observability" }], automation: [{ type: "autogenerated", dirName: "automation" }], aiml: [{ type: "autogenerated", dirName: "aiml" }], + troubleshooting: [{ type: "autogenerated", dirName: "troubleshooting" }], }; module.exports = sidebars; diff --git a/website/test-durations.json b/website/test-durations.json index ebbbc9f31..30575364c 100644 --- a/website/test-durations.json +++ b/website/test-durations.json @@ -190,5 +190,6 @@ "/security/secrets-management/secrets-manager/create-secret.md": 1273, "/security/secrets-management/secrets-manager/external-secrets.md": 14963, "/security/secrets-management/secrets-manager/index.md": 281009, - "/security/secrets-management/secrets-manager/mounting-secrets.md": 16049 + "/security/secrets-management/secrets-manager/mounting-secrets.md": 16049, + "/troubleshooting/alb/index.md": 16049 }