From b3a106cf137ffe0fb8527874b561655fffb5782b Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Mon, 14 Aug 2023 14:10:38 -0400 Subject: [PATCH 1/2] Minor: Fix duplicate YAML anchor Signed-off-by: Chris Evich --- .cirrus.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.cirrus.yml b/.cirrus.yml index 7a6b6098..5f8386ee 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -132,7 +132,7 @@ tooling_images_task: alias: tooling_images name: 'Build Tooling image ${TARGET_NAME}' only_if: $CIRRUS_CRON == '' - skip: &ci_docs $CIRRUS_CHANGE_TITLE =~ '.*CI:DOCS.*' + skip: *ci_docs depends_on: - imgts_build timeout_in: 30m From 4e37a053314600f33a41cccda16fa72012fe905d Mon Sep 17 00:00:00 2001 From: Chris Evich Date: Mon, 31 Jul 2023 16:15:17 -0400 Subject: [PATCH 2/2] Automatic termination of EC2 VMs Around the time of this commit, an annoyingly steady stream of EC2 orphans was reported to Cirrus-support. They've taken action to resolve this, but the failure-modes are many and complex. Since most of the EC2 instances are rather expensive to keep needlessly running, and manual cleanup is annoying, enhance the monitoring script to attempt termination automatically. This isn't perfect: it's possible for the script to break in strange ways and it's not practical to check for all of them. Instead, include some helpful indications in the monitoring e-mail regarding what was attempted. 
Signed-off-by: Chris Evich --- .cirrus.yml | 1 + IMG_SFX | 2 +- orphanvms/_ec2 | 39 +++++++++++++++++++++++++++++++++++++-- orphanvms/entrypoint.sh | 2 ++ 4 files changed, 41 insertions(+), 3 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 5f8386ee..37961f7c 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -396,6 +396,7 @@ test_orphanvms_task: GCPPROJECT: 'libpod-218412' GCPPROJECTS: 'libpod-218412' # value for testing, otherwise see gcpprojects.txt AWSINI: ENCRYPTED[1ab89ff7bc1515dc964efe7ef6e094e01164ba8dd2e11c9a01259c6af3b3968ab841dbe473fe4ab5b573f2f5fa3653e8] + DRY_RUN: 1 EVERYTHING: 1 # Alter age-limit from 3-days -> 3 seconds for a test-run. script: /usr/local/bin/entrypoint.sh diff --git a/IMG_SFX b/IMG_SFX index 51e0758f..3b73ff7c 100644 --- a/IMG_SFX +++ b/IMG_SFX @@ -1 +1 @@ -20230810t190003z-f38f37d13 +20230815t160849z-f38f37d13 diff --git a/orphanvms/_ec2 b/orphanvms/_ec2 index 6f0825f3..3a6bc189 100644 --- a/orphanvms/_ec2 +++ b/orphanvms/_ec2 @@ -40,8 +40,10 @@ fi # I don't expect there will ever be more than maybe 0-20 instances at any time. for instance_index in $(seq 1 $(jq -e 'length'<<<"$simple_inst_list")); do instance=$(jq -e ".[$instance_index - 1]"<<<"$simple_inst_list") + # aws commands require an instance ID + instid=$(jq -r ".ID"<<<"$instance") # A Name-tag isn't guaranteed, default to stupid, unreadable, generated ID - name=$(jq -r ".ID"<<<"$instance") + name=$instid if name_tag=$(get_tag_value "Name" "$instance"); then # This is MUCH more human-friendly and easier to find in the WebUI. 
# If it was an instance leaked by Cirrus-CI, it may even include the @@ -69,6 +71,7 @@ for instance_index in $(seq 1 $(jq -e 'length'<<<"$simple_inst_list")); do continue fi + # First part of the status line item to append in the e-mail line="* VM $name running $age_days days" # It would be nice to list all the tags like we do for GCE VMs, @@ -76,7 +79,39 @@ for instance_index in $(seq 1 $(jq -e 'length'<<<"$simple_inst_list")); do # Only print this handy-one (set by get_ci_vm) if it's there. if inuseby_tag=$(get_tag_value "in-use-by" "$instance"); then dbg "Found instance '$name' tagged in-use-by=$inuseby_tag." - line+=" tagged in-use-by=$inuseby_tag" + line+="; likely get_ci_vm, in-use-by=$inuseby_tag" + elif ((DRY_RUN==0)); then # NOT a persistent or a get_ci_vm instance + # Around Jun/Jul '23 an annoyingly steady stream of EC2 orphans were + # reported to Cirrus-support. They've taken actions to resolve, + # but the failure-modes are many and complex. Since most of the EC2 + # instances are rather expensive to keep needlessly running, and manual + # cleanup is annoying, try to terminate them automatically. + dbg "Attempting to terminate instance '$name'" + + # Operation runs asynchronously, no error reported for already terminated instance. + # Any stdout/stderr here would make the eventual e-mail unreadable. + if ! termout=$(aws ec2 terminate-instances --no-paginate --output json --instance-ids "$instid" 2>&1) + then + echo "::error::Auto-term. of '$instid' failed, 'aws' output: $termout" > /dev/stderr + + # Catch rare TOCTOU race, instance was running, terminated, and pruned while looping. + # (terminated instances stick around for a while until purged automatically) + if [[ "$termout" =~ InvalidInstanceID ]]; then + line+="; auto-term. failed, instance vanished" + else # Something else horrible broke, let the operators know. + line+="; auto-term. failed, see GHA workflow log" + fi + else + dbg "Successful term. 
command output: '$termout'" + # At this point, the script could sit around in a poll-loop, waiting to confirm + # the `$termout` JSON contains `CurrentState: { Code: 48, Name: terminated }`. + # However this could take _minutes_, and there may be a LOT of instances left + # to process. Do the next best thing: Hope the termination eventually works, + # but also let the operator know an attempt was made. + line+="; probably successful auto-termination" + fi + else # no in-use-by tag, DRY_RUN==1 + dbg "DRY_RUN: Would normally have tried to terminate instance '$name' (ID $instid)" fi echo "$line" >> "$OUTPUT" diff --git a/orphanvms/entrypoint.sh b/orphanvms/entrypoint.sh index 7dd98133..0b2a9e53 100644 --- a/orphanvms/entrypoint.sh +++ b/orphanvms/entrypoint.sh @@ -18,7 +18,9 @@ req_env_vars GCPJSON GCPNAME GCPPROJECT GCPPROJECTS AWSINI NOW=$(date +%s) TOO_OLD='3 days ago' # Detect Friday Orphans on Monday EVERYTHING=${EVERYTHING:-0} # set to '1' for testing +DRY_RUN=${DRY_RUN:-0} if ((EVERYTHING)); then + DRY_RUN=1 TOO_OLD="3 seconds ago" fi # Anything older than this is "too old"