Skip to content

Commit

Permalink
Merge pull request containers#286 from cevich/murder_ec2_orphans
Browse files Browse the repository at this point in the history
[CI:TOOLING] Automatic termination of EC2 VMs
  • Loading branch information
cevich authored Aug 16, 2023
2 parents ca033fc + 4e37a05 commit dcc416c
Show file tree
Hide file tree
Showing 4 changed files with 42 additions and 4 deletions.
3 changes: 2 additions & 1 deletion .cirrus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ tooling_images_task:
alias: tooling_images
name: 'Build Tooling image ${TARGET_NAME}'
only_if: $CIRRUS_CRON == ''
skip: &ci_docs $CIRRUS_CHANGE_TITLE =~ '.*CI:DOCS.*'
skip: *ci_docs
depends_on:
- imgts_build
timeout_in: 30m
Expand Down Expand Up @@ -396,6 +396,7 @@ test_orphanvms_task:
GCPPROJECT: 'libpod-218412'
GCPPROJECTS: 'libpod-218412' # value for testing, otherwise see gcpprojects.txt
AWSINI: ENCRYPTED[1ab89ff7bc1515dc964efe7ef6e094e01164ba8dd2e11c9a01259c6af3b3968ab841dbe473fe4ab5b573f2f5fa3653e8]
DRY_RUN: 1
EVERYTHING: 1 # Alter age-limit from 3-days -> 3 seconds for a test-run.
script: /usr/local/bin/entrypoint.sh

Expand Down
2 changes: 1 addition & 1 deletion IMG_SFX
Original file line number Diff line number Diff line change
@@ -1 +1 @@
20230810t190003z-f38f37d13
20230815t160849z-f38f37d13
39 changes: 37 additions & 2 deletions orphanvms/_ec2
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,10 @@ fi
# I don't expect there will ever be more than maybe 0-20 instances at any time.
for instance_index in $(seq 1 $(jq -e 'length'<<<"$simple_inst_list")); do
instance=$(jq -e ".[$instance_index - 1]"<<<"$simple_inst_list")
# aws commands require an instance ID
instid=$(jq -r ".ID"<<<"$instance")
# A Name-tag isn't guaranteed; default to the stupid, unreadable, generated ID.
name=$(jq -r ".ID"<<<"$instance")
name=$instid
if name_tag=$(get_tag_value "Name" "$instance"); then
# This is MUCH more human-friendly and easier to find in the WebUI.
# If it was an instance leaked by Cirrus-CI, it may even include the
Expand Down Expand Up @@ -69,14 +71,47 @@ for instance_index in $(seq 1 $(jq -e 'length'<<<"$simple_inst_list")); do
continue
fi

# First part of the status line item to append in the e-mail
line="* VM $name running $age_days days"

# It would be nice to list all the tags like we do for GCE VMs,
# but it's a PITA to do for AWS in a human-readable format.
# Only print this one handy tag (set by get_ci_vm) if it's present.
if inuseby_tag=$(get_tag_value "in-use-by" "$instance"); then
dbg "Found instance '$name' tagged in-use-by=$inuseby_tag."
line+=" tagged in-use-by=$inuseby_tag"
line+="; likely get_ci_vm, in-use-by=$inuseby_tag"
elif ((DRY_RUN==0)); then # NOT a persistent or a get_ci_vm instance
# Around Jun/Jul '23 an annoyingly steady stream of EC2 orphans was
# reported to Cirrus-support. They've taken actions to resolve it,
# but the failure-modes are many and complex. Since most of the EC2
# instances are rather expensive to keep needlessly running, and manual
# cleanup is annoying, try to terminate them automatically.
dbg "Attempting to terminate instance '$name'"

# The operation runs asynchronously; no error is reported for an already-terminated instance.
# Any stdout/stderr here would make the eventual e-mail unreadable.
if ! termout=$(aws ec2 terminate-instances --no-paginate --output json --instance-ids "$instid" 2>&1)
then
echo "::error::Auto-term. of '$instid' failed, 'aws' output: $termout" > /dev/stderr

# Catch a rare TOCTOU race: the instance was running, then got terminated and pruned while looping.
# (terminated instances stick around for a while until purged automatically)
if [[ "$termout" =~ InvalidInstanceID ]]; then
line+="; auto-term. failed, instance vanished"
else # Something else horrible broke, let the operators know.
line+="; auto-term. failed, see GHA workflow log"
fi
else
dbg "Successful term. command output: '$termout'"
# At this point, the script could sit around in a poll-loop, waiting to confirm
# the `$termout` JSON contains `CurrentState: { Code: 48, Name: terminated }`.
# However this could take _minutes_, and there may be a LOT of instances left
# to process. Do the next best thing: Hope the termination eventually works,
# but also let the operator know an attempt was made.
line+="; probably successful auto-termination"
fi
else # no in-use-by tag, DRY_RUN==1
dbg "DRY_RUN: Would normally have tried to terminate instance '$name' (ID $instid)"
fi

echo "$line" >> "$OUTPUT"
Expand Down
2 changes: 2 additions & 0 deletions orphanvms/entrypoint.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,9 @@ req_env_vars GCPJSON GCPNAME GCPPROJECT GCPPROJECTS AWSINI
NOW=$(date +%s)
TOO_OLD='3 days ago' # Detect Friday Orphans on Monday
EVERYTHING=${EVERYTHING:-0} # set to '1' for testing
DRY_RUN=${DRY_RUN:-0}
if ((EVERYTHING)); then
DRY_RUN=1
TOO_OLD="3 seconds ago"
fi
# Anything older than this is "too old"
Expand Down

0 comments on commit dcc416c

Please sign in to comment.