From b3a106cf137ffe0fb8527874b561655fffb5782b Mon Sep 17 00:00:00 2001
From: Chris Evich <cevich@redhat.com>
Date: Mon, 14 Aug 2023 14:10:38 -0400
Subject: [PATCH 1/2] Minor: Fix duplicate YAML anchor

Signed-off-by: Chris Evich <cevich@redhat.com>
---
 .cirrus.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 7a6b6098..5f8386ee 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -132,7 +132,7 @@ tooling_images_task:
     alias: tooling_images
     name: 'Build Tooling image ${TARGET_NAME}'
     only_if: $CIRRUS_CRON == ''
-    skip: &ci_docs $CIRRUS_CHANGE_TITLE =~ '.*CI:DOCS.*'
+    skip: *ci_docs
     depends_on:
         - imgts_build
     timeout_in: 30m

From 4e37a053314600f33a41cccda16fa72012fe905d Mon Sep 17 00:00:00 2001
From: Chris Evich <cevich@redhat.com>
Date: Mon, 31 Jul 2023 16:15:17 -0400
Subject: [PATCH 2/2] Automatic termination of EC2 VMs

Around the time of this commit, an annoyingly steady stream of EC2
orphans were reported to Cirrus-support.  They've taken actions to
resolve, but the failure-modes are many and complex.  Since most of the
EC2 instances are rather expensive to keep needlessly running, and manual
cleanup is annoying, enhance the monitoring script to attempt
termination automatically.

This isn't perfect: the script can still break in strange ways, and
it's not practical to check for all of them.  Instead, include some
helpful indications in the monitoring e-mail regarding what was
attempted.

Signed-off-by: Chris Evich <cevich@redhat.com>
---
 .cirrus.yml             |  1 +
 IMG_SFX                 |  2 +-
 orphanvms/_ec2          | 39 +++++++++++++++++++++++++++++++++++++--
 orphanvms/entrypoint.sh |  2 ++
 4 files changed, 41 insertions(+), 3 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 5f8386ee..37961f7c 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -396,6 +396,7 @@ test_orphanvms_task:
         GCPPROJECT: 'libpod-218412'
         GCPPROJECTS: 'libpod-218412' # value for testing, otherwise see gcpprojects.txt
         AWSINI: ENCRYPTED[1ab89ff7bc1515dc964efe7ef6e094e01164ba8dd2e11c9a01259c6af3b3968ab841dbe473fe4ab5b573f2f5fa3653e8]
+        DRY_RUN: 1
         EVERYTHING: 1  # Alter age-limit from 3-days -> 3 seconds for a test-run.
     script: /usr/local/bin/entrypoint.sh
 
diff --git a/IMG_SFX b/IMG_SFX
index 51e0758f..3b73ff7c 100644
--- a/IMG_SFX
+++ b/IMG_SFX
@@ -1 +1 @@
-20230810t190003z-f38f37d13
+20230815t160849z-f38f37d13
diff --git a/orphanvms/_ec2 b/orphanvms/_ec2
index 6f0825f3..3a6bc189 100644
--- a/orphanvms/_ec2
+++ b/orphanvms/_ec2
@@ -40,8 +40,10 @@ fi
 # I don't expect there will ever be more than maybe 0-20 instances at any time.
 for instance_index in $(seq 1 $(jq -e 'length'<<<"$simple_inst_list")); do
     instance=$(jq -e ".[$instance_index - 1]"<<<"$simple_inst_list")
+    # aws commands require an instance ID
+    instid=$(jq -r ".ID"<<<"$instance")
     # A Name-tag isn't guaranteed, default to stupid, unreadable, generated ID
-    name=$(jq -r ".ID"<<<"$instance")
+    name=$instid
     if name_tag=$(get_tag_value "Name" "$instance"); then
         # This is MUCH more human-friendly and easier to find in the WebUI.
         # If it was an instance leaked by Cirrus-CI, it may even include the
@@ -69,6 +71,7 @@ for instance_index in $(seq 1 $(jq -e 'length'<<<"$simple_inst_list")); do
         continue
     fi
 
+    # First part of the status line item to append in the e-mail
     line="* VM $name running $age_days days"
 
     # It would be nice to list all the tags like we do for GCE VMs,
@@ -76,7 +79,39 @@ for instance_index in $(seq 1 $(jq -e 'length'<<<"$simple_inst_list")); do
     # Only print this handy-one (set by get_ci_vm) if it's there.
     if inuseby_tag=$(get_tag_value "in-use-by" "$instance"); then
         dbg "Found instance '$name' tagged in-use-by=$inuseby_tag."
-        line+=" tagged in-use-by=$inuseby_tag"
+        line+="; likely get_ci_vm, in-use-by=$inuseby_tag"
+    elif ((DRY_RUN==0)); then # NOT a persistent or a get_ci_vm instance
+        # Around Jun/Jul '23 an annoyingly steady stream of EC2 orphans were
+        # reported to Cirrus-support.  They've taken actions to resolve,
+        # but the failure-modes are many and complex.  Since most of the EC2
+        # instances are rather expensive to keep needlessly running, and manual
+        # cleanup is annoying, try to terminate them automatically.
+        dbg "Attempting to terminate instance '$name'"
+
+        # Operation runs asynchronously, no error reported for already terminated instance.
+        # Any stdout/stderr here would make the eventual e-mail unreadable.
+        if ! termout=$(aws ec2 terminate-instances --no-paginate --output json --instance-ids "$instid" 2>&1)
+        then
+            echo "::error::Auto-term. of '$instid' failed, 'aws' output: $termout" > /dev/stderr
+
+            # Catch rare TOCTOU race, instance was running, terminated, and pruned while looping.
+            # (terminated instances stick around for a while until purged automatically)
+            if [[ "$termout" =~ InvalidInstanceID ]]; then
+                line+="; auto-term. failed, instance vanished"
+            else  # Something else horrible broke, let the operators know.
+                line+="; auto-term. failed, see GHA workflow log"
+            fi
+        else
+            dbg "Successful term. command output: '$termout'"
+            # At this point, the script could sit around in a poll-loop, waiting to confirm
+            # the `$termout` JSON contains `CurrentState: { Code: 48, Name: terminated }`.
+            # However this could take _minutes_, and there may be a LOT of instances left
+            # to process.  Do the next best thing: Hope the termination eventually works,
+            # but also let the operator know an attempt was made.
+            line+="; probably successful auto-termination"
+        fi
+    else  # no in-use-by tag, DRY_RUN==1
+        dbg "DRY_RUN: Would normally have tried to terminate instance '$name' (ID $instid)"
     fi
 
     echo "$line" >> "$OUTPUT"
diff --git a/orphanvms/entrypoint.sh b/orphanvms/entrypoint.sh
index 7dd98133..0b2a9e53 100644
--- a/orphanvms/entrypoint.sh
+++ b/orphanvms/entrypoint.sh
@@ -18,7 +18,9 @@ req_env_vars GCPJSON GCPNAME GCPPROJECT GCPPROJECTS AWSINI
 NOW=$(date +%s)
 TOO_OLD='3 days ago'  # Detect Friday Orphans on Monday
 EVERYTHING=${EVERYTHING:-0}  # set to '1' for testing
+DRY_RUN=${DRY_RUN:-0}
 if ((EVERYTHING)); then
+    DRY_RUN=1
     TOO_OLD="3 seconds ago"
 fi
 # Anything older than this is "too old"