From d571ca6536c2f49ad1d95e2c93f982e18201c45a Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 09:19:52 -0600
Subject: [PATCH 1/9] system test parallelization: enable two-pass approach

For the past two months we've been splitting system tests
into two categories: those that CAN be run in parallel,
and those that CANNOT. Much work has been done to replace
hardcoded names (mycontainer, mypod) with safename().
Hundreds of test runs, in CI and on Ed's laptop, have
proven this approach viable.

make {local,remote}system now runs in two steps: first
the serial ones, then the parallel ones. hack/bats will
now recognize the 'ci:parallel' tag and add --jobs (nprocs).

This requires some tweaking of leak_check, because there
can be umpteen tests running (affecting image/container/pod/etc
state) when any given test completes.

Rules for enabling parallelization in tests:

   * use unique container/pod/volume/network names (safename)
   * do not run 'podman rm -a' or 'rmi -a'
   * never use the -l (--latest) option
   * do not run 'podman ps/images' and expect precise output

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 Makefile                     |  6 ++++--
 hack/bats                    |  9 +++++---
 test/system/helpers.bash     | 42 +++++++++++++++++++++++-------------
 test/system/setup_suite.bash | 15 ++++++++-----
 4 files changed, 47 insertions(+), 25 deletions(-)

diff --git a/Makefile b/Makefile
index 799ca84390ce..3cdae32101f7 100644
--- a/Makefile
+++ b/Makefile
@@ -688,7 +688,8 @@ localmachine:
 localsystem:
 	# Wipe existing config, database, and cache: start with clean slate.
 	$(RM) -rf ${HOME}/.local/share/containers ${HOME}/.config/containers
-	if timeout -v 1 true; then PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T test/system/; else echo "Skipping $@: 'timeout -v' unavailable'"; fi
+	PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags '!ci:parallel' test/system/
+	PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags ci:parallel -j $$(nproc) test/system/
 
 .PHONY: remotesystem
 remotesystem:
@@ -717,7 +718,8 @@ remotesystem:
 			echo "Error: ./bin/podman system service did not come up" >&2;\
 			exit 1;\
 		fi;\
-		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T test/system/ ;\
+		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags '!ci:parallel' test/system/ ;\
+		env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ ;\
 		rc=$$?;\
 		kill %1;\
 	else \
diff --git a/hack/bats b/hack/bats
index 3002c0199bf3..3184c95b7621 100755
--- a/hack/bats
+++ b/hack/bats
@@ -83,7 +83,10 @@ for i;do
         --rootless) TEST_ROOT= ;;
         --remote)   REMOTE=remote ;;
         --ts|-T)    bats_opts+=("-T") ;;
-        --tag=*)    bats_filter=("--filter-tags" "$value") ;;
+        --tag=*)    bats_filter=("--filter-tags" "$value")
+                    if [[ "$value" = "ci:parallel" ]]; then
+                        bats_opts+=("--jobs" $(nproc))
+                    fi;;
         */*.bats)   TESTS=$i ;;
         *)
             if [[ $i =~ : ]]; then
@@ -130,7 +133,7 @@ export PODMAN_BATS_LEAK_CHECK=1
 
 # Root
 if [[ "$TEST_ROOT" ]]; then
-    echo "# bats ${bats_filter[*]} $TESTS"
+    echo "# bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
     sudo    --preserve-env=PODMAN \
             --preserve-env=QUADLET \
             --preserve-env=PODMAN_TEST_DEBUG \
@@ -144,7 +147,7 @@ fi
 # Rootless. (Only if we're not already root)
 if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then
     echo "--------------------------------------------------"
-    echo "\$ bats ${bats_filter[*]} $TESTS"
+    echo "\$ bats ${bats_opts[*]} ${bats_filter[*]} $TESTS"
     bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS
     rc=$((rc | $?))
 fi
diff --git a/test/system/helpers.bash b/test/system/helpers.bash
index 665c19210528..280729c5ab1b 100644
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@@ -222,22 +222,34 @@ function basic_teardown() {
     # This must be directly after immediate-assertion-failures to capture the error code
     local exit_code=$?
 
-    # Only checks for leaks on a successful run (BATS_TEST_COMPLETED is set 1),
-    # immediate-assertion-failures didn't fail (exit_code -eq 0)
-    # and PODMAN_BATS_LEAK_CHECK is set.
-    # As these podman commands are slow we do not want to do this by default
-    # and only provide this as opt in option. (#22909)
-    if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [ $exit_code -eq 0 ] && [ -n "$PODMAN_BATS_LEAK_CHECK" ]; then
-        leak_check
-        exit_code=$((exit_code + $?))
+    # Leak check and state reset. Only run these when running tests serially!
+    # (For parallel tests, we run a leak check only at the very end of all tests)
+    if [[ -z "$PARALLEL_JOBSLOT" ]]; then
+        # Check for leaks, but only if:
+        #  1) test was successful (BATS_TEST_COMPLETED is set 1); and
+        #  2) immediate-assertion-failures didn't fail (exit_code -eq 0); and
+        #  3) PODMAN_BATS_LEAK_CHECK is set (usually only in cron).
+        # As these podman commands are slow we do not want to do this by default
+        # and only provide this as opt-in option. (#22909)
+        if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [[ $exit_code -eq 0 ]] && [[ -n "$PODMAN_BATS_LEAK_CHECK" ]]; then
+            leak_check
+            exit_code=$((exit_code + $?))
+        fi
+
+        # Some error happened (either in teardown itself or the actual test failed)
+        # so do a full cleanup to ensure following tests start with a clean env.
+        if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
+            clean_setup
+            exit_code=$((exit_code + $?))
+        fi
     fi
 
-    # Some error happened (either in teardown itself or the actual test failed)
-    # so do a full cleanup to ensure following tests start with a clean env.
-    if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then
-        clean_setup
-        exit_code=$((exit_code + $?))
+    # Status file used in teardown_suite() to decide whether or not
+    # to check for leaks
+    if [[ "$BATS_TEST_COMPLETED" -ne 1 ]]; then
+        rm -f "$BATS_SUITE_TMPDIR/all-tests-passed"
     fi
+
     command rm -rf $PODMAN_TMPDIR
     exit_code=$((exit_code + $?))
     return $exit_code
@@ -426,12 +438,12 @@ function clean_setup() {
         else
             # Always remove image that doesn't match by name
             echo "# setup(): removing stray image $1" >&3
-            run_podman rmi --force "$1" >/dev/null 2>&1 || true
+            _run_podman_quiet rmi --force "$1"
 
             # Tagged image will have same IID as our test image; don't rmi it.
             if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then
                 echo "# setup(): removing stray image $2" >&3
-                run_podman rmi --force "$2" >/dev/null 2>&1 || true
+                _run_podman_quiet rmi --force "$2"
             fi
         fi
     done
diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash
index 4dd59981db0c..8a3f910839fd 100644
--- a/test/system/setup_suite.bash
+++ b/test/system/setup_suite.bash
@@ -32,6 +32,9 @@ function setup_suite() {
            "Unable to set PODMAN_LOGIN_REGISTRY_PORT"
 
     clean_setup
+
+    # Canary file. Will be removed if any individual test fails.
+    touch "$BATS_SUITE_TMPDIR/all-tests-passed"
 }
 
 # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs.
@@ -39,11 +42,13 @@ function teardown_suite() {
     stop_registry
     local exit_code=$?
 
-    # After all tests make sure there are no leaks and cleanup if there are
-    leak_check
-    if [ $? -gt 0 ]; then
-        exit_code=$((exit_code + 1))
-        clean_setup
+    # At end, if all tests have passed, check for leaks.
+    # Don't do this if there were errors: failing tests may not clean up.
+    if [[ -e "$BATS_SUITE_TMPDIR/all-tests-passed" ]]; then
+        leak_check
+        if [ $? -gt 0 ]; then
+            exit_code=$((exit_code + 1))
+        fi
     fi
 
     return $exit_code

From bcffa9ce301cc0d69e589b7ffe51f4a5816603d9 Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 11:15:56 -0600
Subject: [PATCH 2/9] clean_setup: create pause image

Workaround for #23292, where simultaneous 'pod create' commands
will all start a podman-build of the pause image, but only
one of them will be tagged, and the others will leak <none>
images.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/helpers.bash | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/test/system/helpers.bash b/test/system/helpers.bash
index 280729c5ab1b..c38273d78b0b 100644
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@@ -452,6 +452,17 @@ function clean_setup() {
     if [[ -z "$found_needed_image" ]]; then
         _prefetch $PODMAN_TEST_IMAGE_FQN
     fi
+
+    # When running in parallel, load (create, actually) the pause image.
+    # This way, all pod tests will have it available. Without this,
+    # parallel pod tests will leave behind <none>:<none> images.
+    # FIXME: #23292 -- this should not be necessary.
+    if [[ -n "$PARALLEL_JOBSLOT" ]]; then
+        run_podman pod create mypod
+        run_podman pod rm mypod
+        # And now, we have a pause image, and each test does not
+        # need to build their own.
+    fi
 }
 
 # END   setup/teardown tools

From 6b621d9571421489c3028975ddf2fcc728e16bfe Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 16 Jul 2024 12:56:59 -0600
Subject: [PATCH 3/9] ci: bump system tests to fastvm

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 .cirrus.yml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.cirrus.yml b/.cirrus.yml
index 9cba9792bb38..120aeaa1dc25 100644
--- a/.cirrus.yml
+++ b/.cirrus.yml
@@ -836,8 +836,8 @@ local_system_test_task: &local_system_test_task
         (changesInclude('**/*.go', '**/*.c', '**/*.h') && !changesIncludeOnly('test/**', 'pkg/machine/e2e/**'))
     depends_on: *build
     matrix: *platform_axis
-    gce_instance: *standardvm
-    timeout_in: 35m
+    gce_instance: *fastvm
+    timeout_in: 25m
     env:
         TEST_FLAVOR: sys
     clone_script: *get_gosrc
@@ -890,8 +890,8 @@ rootless_remote_system_test_task:
               CTR_FQIN: ${FEDORA_CONTAINER_FQIN}
     <<: *local_system_test_task
     alias: rootless_remote_system_test
-    gce_instance: *standardvm
-    timeout_in: 35m
+    gce_instance: *fastvm
+    timeout_in: 25m
     env:
         TEST_FLAVOR: sys
         PODBIN_NAME: remote
@@ -905,8 +905,8 @@ rootless_system_test_task:
     only_if: *only_if_system_test
     depends_on: *build
     matrix: *platform_axis
-    gce_instance: *standardvm
-    timeout_in: 35m
+    gce_instance: *fastvm
+    timeout_in: 25m
     env:
         TEST_FLAVOR: sys
         PRIV_NAME: rootless

From 6502e30cfd71dc8435e8da220936752c2bec1599 Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Thu, 25 Jul 2024 13:05:10 -0600
Subject: [PATCH 4/9] bats log: differentiate parallel tests from sequential

For tests run in parallel, show file number as |nnn| (vs [nnn])

Teach logformatter to distinguish the two, adding 'p' to anchors
in parallel tests. Necessary because in this scheme we run bats
twice, thus see 'ok 1' twice, and we want to differentiate them.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 contrib/cirrus/logformatter | 11 ++++++++---
 test/system/helpers.bash    |  7 ++++++-
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/contrib/cirrus/logformatter b/contrib/cirrus/logformatter
index 78ac1850d719..ed7135e8d2d9 100755
--- a/contrib/cirrus/logformatter
+++ b/contrib/cirrus/logformatter
@@ -349,7 +349,7 @@ END_HTML
         # BATS handling. This will recognize num_tests both at start and end
         if ($line =~ /^1\.\.(\d+)$/) {
             $looks_like_bats = 1;
-            $bats_count{expected_total} = $1;
+            $bats_count{expected_total} += $1;
             undef $looks_like_python;
         }
         # Since the number of tests can't always be predicted, recognize
@@ -445,9 +445,14 @@ END_HTML
             if ($css) {
                 # Make it linkable, e.g. foo.html#t--00001
                 if ($line =~ /^(not\s+)?ok\s+(\d+)\s+(.*)/) {
-                    $line = sprintf("<a name='t--%05d'>%s</a>", $2, $line);
+                    my ($notok, $num, $name) = ($1, $2, $3);
+                    # Parallel system tests are shown as |nnn| instead of [nnn]
+                    my $p = '';
+                    $p = 'p' if $name =~ /^\s*\|\d+\|/;
+
+                    $line = sprintf("<a name='t--%05d%s'>%s</a>", $num, $p, $line);
 
-                    push @{$bats_count{__fail_list}}, [ $2, $3 ] if $1;
+                    push @{$bats_count{__fail_list}}, [ $num, $name ] if $notok;
                 }
                 $bats_count{$css}++;
                 $css = "bats-$css";
diff --git a/test/system/helpers.bash b/test/system/helpers.bash
index c38273d78b0b..3b6b5b00ed5b 100644
--- a/test/system/helpers.bash
+++ b/test/system/helpers.bash
@@ -160,7 +160,12 @@ function basic_setup() {
 
     # Test filenames must match ###-name.bats; use "[###] " as prefix
     run expr "$BATS_TEST_FILENAME" : "^.*/\([0-9]\{3\}\)-[^/]\+\.bats\$"
-    BATS_TEST_NAME_PREFIX="[${output}] "
+    # If parallel, use |nnn|. Serial, [nnn]
+    if [[ -n "$PARALLEL_JOBSLOT" ]]; then
+        BATS_TEST_NAME_PREFIX="|${output}| "
+    else
+        BATS_TEST_NAME_PREFIX="[${output}] "
+    fi
 
     # By default, assert() and die() cause an immediate test failure.
     # Under special circumstances (usually long test loops), tests

From bf6131780accc2725805493bc07f443f615b2cd3 Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 09:54:49 -0600
Subject: [PATCH 5/9] Update system test template and README

Add a few best-practices examples, and add a whole section
describing the dos and donts of writing parallel-safe tests.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/000-TEMPLATE | 42 ++++++++++++++++++++++++++++++++++++++++
 test/system/README.md    | 17 ++++++++++------
 2 files changed, 53 insertions(+), 6 deletions(-)

diff --git a/test/system/000-TEMPLATE b/test/system/000-TEMPLATE
index a6a9ca141873..cff5de7263a3 100644
--- a/test/system/000-TEMPLATE
+++ b/test/system/000-TEMPLATE
@@ -11,6 +11,22 @@ load helpers
     args="some sort of argument list"
     run_podman subcmd $args
     assert "$output" == "what we expect"   "output from 'podman subcmd $args'"
+
+    # safename() provides a lower-case string with both the BATS test number
+    # and a pseudorandom element. Its use is strongly encouraged for container
+    # names, volumes, networks, images, everything, because the test number
+    # makes it possible to find leaked elements.
+    cname="c-$(safename)"
+
+    # Run "top" in a detached container with a safe name
+    run_podman run -d --name $cname --this-option --that $IMAGE top
+
+    # A number ("17") as the first run_podman arg means "expect this exit code".
+    # By default, run_podman expects success, and will barf on nonzero status.
+    run_podman 17 run --this-option --that $IMAGE exit 17
+
+    # Give a hoot, don't pollute
+    run_podman rm -f -t0 $cname
 }
 
 # vim: filetype=sh
@@ -110,5 +126,31 @@ size      | -\\\?[0-9]\\\+
     done < <(parse_table "$tests")
 }
 
+# Whenever possible, please add the ci:parallel tag to new tests,
+# not only for speed but for stress testing.
+#
+# This is an example of what NOT to do when enabling parallel tests.
+#
+# bats test_tags=ci:parallel
+@test "this test is completely broken in parallel" {
+    # Never use "--name HARDCODED". Define 'cname=c-$(safename)' instead.
+    # Not only does that guarantee uniqueness, it also gives the test number
+    # in the name, so if there's a leak (at end of tests) it will be possible
+    # to identify the culprit.
+    run_podman --name mycontainer $IMAGE top
+
+    # Same thing for build and -t
+    run_podman build -t myimage ...
+
+    # Never assume exact output from podman ps! Many other containers may be running.
+    run_podman ps
+    assert "$output" = "mycontainer"
+
+    # Never use "-l". It is meaningless when other processes are running.
+    run_podman container inspect -l
+
+    # Never 'rm -a'!!! OMG like seriously just don't.
+    run_podman rm -f -a
+}
 
 # vim: filetype=sh
diff --git a/test/system/README.md b/test/system/README.md
index 8e4a98d4cd3b..1800a1fd17f2 100644
--- a/test/system/README.md
+++ b/test/system/README.md
@@ -5,11 +5,10 @@ debug failures.
 Quick Start
 ===========
 
-Look at [030-run.bats](030-run.bats) for a simple but packed example.
+Look at [000-TEMPLATE](000-TEMPLATE) for a simple starting point.
 This introduces the basic set of helper functions:
 
-* `setup` (implicit) - resets container storage so there's
-one and only one (standard) image, and no running containers.
+* `setup` (implicit) - establishes a test environment.
 
 * `parse_table` - you can define tables of inputs and expected results,
 then read those in a `while` loop. This makes it easy to add new tests.
@@ -21,7 +20,7 @@ examples of how to deal with the more typical such issues.
 but could also be './bin/podman' or 'podman-remote'), with a timeout.
 Checks its exit status.
 
-* `is` - compare actual vs expected output. Emits a useful diagnostic
+* `assert` - compare actual vs expected output. Emits a useful diagnostic
 on failure.
 
 * `die` - output a properly-formatted message to stderr, and fail test
@@ -30,7 +29,13 @@ on failure.
 
 * `skip_if_remote` - like the above, but skip if testing `podman-remote`
 
-* `random_string` - returns a pseudorandom alphanumeric string
+* `safename` - generates a pseudorandom lower-case string suitable
+for use in names for containers, images, volumes, any object. String
+includes the BATS test number, making it possible to identify the
+source of leaks (failure to clean up) at the end of tests.
+
+* `random_string` - returns a pseudorandom alphanumeric string suitable
+for verifying I/O.
 
 Test files are of the form `NNN-name.bats` where NNN is a three-digit
 number. Please preserve this convention, it simplifies viewing the
@@ -59,7 +64,7 @@ commands, their output and exit codes. In a normal run you will never
 see this, but BATS will display it on failure. The goal here is to
 give you everything you need to diagnose without having to rerun tests.
 
-The `is` comparison function is designed to emit useful diagnostics,
+The `assert` comparison function is designed to emit useful diagnostics,
 in particular, the actual and expected strings. Please do not use
 the horrible BATS standard of `[ x = y ]`; that's nearly useless
 for tracking down failures.

From 5fc3de55837be1849b33b41490e56dd8e3835794 Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 10:06:34 -0600
Subject: [PATCH 6/9] registry: lock start attempts

When running parallel, multiple tests could be trying to start
the registry at once. Make this parallel-safe.

Also, use a safer port range for the registry. Something
outside of /proc/sys/net/ipv4/ip_local_port_range

Sorry, I'm including a FIXME section that I haven't investigated
deeply enough.

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/helpers.registry.bash | 39 +++++++++++++++++++++++++------
 test/system/setup_suite.bash      |  2 +-
 2 files changed, 33 insertions(+), 8 deletions(-)

diff --git a/test/system/helpers.registry.bash b/test/system/helpers.registry.bash
index 643a0e51510c..ff57ad8b876a 100644
--- a/test/system/helpers.registry.bash
+++ b/test/system/helpers.registry.bash
@@ -17,8 +17,26 @@ unset REGISTRY_AUTH_FILE
 # Start a local registry. Only needed on demand (e.g. by 150-login.bats)
 # and then only once: if we start, leave it running until final teardown.
 function start_registry() {
-    if [[ -d "$PODMAN_LOGIN_WORKDIR/auth" ]]; then
-        # Already started
+    AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
+
+    local startflag=${PODMAN_LOGIN_WORKDIR}/OK
+
+    if ! mkdir $AUTHDIR; then
+        # *Possibly* already started. Or, possibly (when running
+        # parallel tests) another process is trying to start it.
+        # Give it some time.
+        local timeout=30
+        while [[ $timeout -gt 0 ]]; do
+            if [[ -e $startflag ]]; then
+                echo "Registry has already been started by another process"
+                return
+            fi
+
+            sleep 1
+            timeout=$((timeout - 1))
+        done
+
+        die "Internal error: timed out waiting for another process to start registry"
 
         # Fixes very obscure corner case in root system tests:
         #  1) we run 150-login tests, starting a registry; then
@@ -26,11 +44,15 @@ function start_registry() {
         #  3) run 700-play, the "private" test, which needs the
         #     already-started registry, but its port is now DROPped,
         #     so the test times out trying to talk to registry
-        run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
+
+        ###### FIXME FIXME FIXME TEMPORARY!
+        ###### Trying to understand flake #23725. What happens if we stop
+        ###### doing the network reload?
+        ###### FIXME FIXME FIXME, should we do it in stop_registry??
+        ###### run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all
         return
     fi
 
-    AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth
     mkdir -p $AUTHDIR
 
     # Registry image; copy of docker.io, but on our own registry
@@ -79,6 +101,9 @@ function start_registry() {
     wait_for_port 127.0.0.1 ${PODMAN_LOGIN_REGISTRY_PORT}
     # ...so we look in container logs for confirmation that registry is running.
     _PODMAN_TEST_OPTS="${PODMAN_LOGIN_ARGS}" wait_for_output "listening on .::.:5000" $cid
+
+    touch $startflag
+    echo "I have started the registry"
 }
 
 function stop_registry() {
@@ -103,10 +128,10 @@ function stop_registry() {
         mount | grep ${PODMAN_LOGIN_WORKDIR} | awk '{print $3}' | xargs --no-run-if-empty umount
 
         if [[ $(id -u) -eq 0 ]]; then
-            rm -rf ${PODMAN_LOGIN_WORKDIR}
+            rm -rf ${PODMAN_LOGIN_WORKDIR}/*
         else
             # rootless image data is owned by a subuid
-            run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}
+            run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}/*
         fi
     fi
 
@@ -119,7 +144,7 @@ function stop_registry() {
         echo ""
         echo "lsof -i -P"
         lsof -i -P
-        die "Socket still seems open"
+        die "Socket $PODMAN_LOGIN_REGISTRY_PORT still seems open"
     fi
 }
 
diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash
index 8a3f910839fd..fa7169a12782 100644
--- a/test/system/setup_suite.bash
+++ b/test/system/setup_suite.bash
@@ -25,7 +25,7 @@ function setup_suite() {
 
     # FIXME: racy! It could be many minutes between now and when we start it.
     # To mitigate, we use a range not used anywhere else in system tests.
-    export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 42000-42999)
+    export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 27000-27999)
 
     # The above does not handle errors. Do a final confirmation.
     assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \

From b3da5be2b1fdaffb0d5cf5571b6468a6952f3b66 Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 10:37:08 -0600
Subject: [PATCH 7/9] Add workaround for buildah parallel bug

Need --layers=false in podman build, otherwise a buildah race
can trigger "layer not known" failures:

   https://github.com/containers/buildah/issues/5674

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/030-run.bats   | 6 ++++--
 test/system/060-mount.bats | 5 +++--
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/test/system/030-run.bats b/test/system/030-run.bats
index e254fe8183df..117e90cc4857 100644
--- a/test/system/030-run.bats
+++ b/test/system/030-run.bats
@@ -874,8 +874,9 @@ RUN rm /etc/mtab
 EOF
     expected="'/etc/mtab' -> '/proc/mounts'"
 
+    # --layers=false needed to work around buildah#5674 parallel flake
     local iname=nomtab-$(safename)
-    run_podman build -t $iname $tmpdir
+    run_podman build -t $iname --layers=false $tmpdir
     run_podman run --rm $iname stat -c %N /etc/mtab
     is "$output" "$expected" "/etc/mtab should be created"
 
@@ -1080,8 +1081,9 @@ echo -e "#!/bin/sh\nfalse" >> /usr/bin/nsenter; \
 chmod +x /usr/bin/nsenter
 EOF
 
+    # --layers=false needed to work around buildah#5674 parallel flake
     test_image="cve_2022_1227_test-$(safename)"
-    run_podman build -t $test_image $tmpbuilddir
+    run_podman build -t $test_image --layers=false $tmpbuilddir
     run_podman run -d ${keepid} $test_image top
     ctr="$output"
     run_podman top $ctr huser,user
diff --git a/test/system/060-mount.bats b/test/system/060-mount.bats
index 3dbcd8631215..792c77716521 100644
--- a/test/system/060-mount.bats
+++ b/test/system/060-mount.bats
@@ -409,8 +409,9 @@ FROM $IMAGE
 RUN mkdir /mountroot && echo ${datacontent[img]} > /mountroot/data
 EOF
 
+    # --layers=false needed to work around buildah#5674 parallel flake
     img="localhost/img-$(safename)"
-    run_podman build -t $img -f $dockerfile
+    run_podman build -t $img --layers=false -f $dockerfile
 
     # Each test is set up in exactly the same way:
     #
@@ -554,7 +555,7 @@ glob | /*    | /mountroot/           | in
     echo "$datafile_contents" > $workdir/$datafile
     ln -s $datafile $workdir/link
 
-    run_podman create --mount type=glob,src=$workdir/*,dst=/mountroot/,no-dereference --privileged $img sh -c "stat -c '%N' /mountroot/link; cat /mountroot/link"
+    run_podman create --mount type=glob,src=$workdir/*,dst=/mountroot/,no-dereference --privileged $img sh -c "stat -c '%N' /mountroot/link; cat /mountroot/link; ls -l /mountroot"
     cid="$output"
     run_podman start -a $cid
     assert "${lines[0]}" = "'/mountroot/link' -> '$datafile'" "symlink is preserved, on start"

From 7fcf94d7b5d641068db99aad72bc13c38e93a7cf Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 10:41:32 -0600
Subject: [PATCH 8/9] Add network namespace leak check

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/setup_suite.bash | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash
index fa7169a12782..8a920c614167 100644
--- a/test/system/setup_suite.bash
+++ b/test/system/setup_suite.bash
@@ -35,6 +35,9 @@ function setup_suite() {
 
     # Canary file. Will be removed if any individual test fails.
     touch "$BATS_SUITE_TMPDIR/all-tests-passed"
+
+    # Track network namespaces, so we can check for leaks at test end
+    ip netns list > $BATS_SUITE_TMPDIR/netns-pre
 }
 
 # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs.
@@ -49,6 +52,15 @@ function teardown_suite() {
         if [ $? -gt 0 ]; then
             exit_code=$((exit_code + 1))
         fi
+
+        # Network namespace leak check. List should match what we saw above.
+        echo
+        ip netns list > $BATS_SUITE_TMPDIR/netns-post
+        if ! diff -u $BATS_SUITE_TMPDIR/netns-{pre,post}; then
+            echo
+            echo "^^^^^ Leaks found in /run/netns ^^^^^"
+            exit_code=$((exit_code + 1))
+        fi
     fi
 
     return $exit_code

From 8402b6535f287fbf061c001a6454b061553b6884 Mon Sep 17 00:00:00 2001
From: Ed Santiago <santiago@redhat.com>
Date: Tue, 17 Sep 2024 10:50:20 -0600
Subject: [PATCH 9/9] Misc minor test fixes

...for dealing with flakes in parallel mode

Signed-off-by: Ed Santiago <santiago@redhat.com>
---
 test/system/055-rm.bats          |  4 ++--
 test/system/220-healthcheck.bats |  7 ++++++-
 test/system/252-quadlet.bats     |  6 ++++--
 test/system/500-networking.bats  | 19 +++++++++++++------
 4 files changed, 25 insertions(+), 11 deletions(-)

diff --git a/test/system/055-rm.bats b/test/system/055-rm.bats
index e9a23dba5200..3673b7dc2581 100644
--- a/test/system/055-rm.bats
+++ b/test/system/055-rm.bats
@@ -127,10 +127,10 @@ function __run_healthcheck_container() {
     run_podman run -d --name $1 \
                --health-cmd /bin/false \
                --health-interval 1s \
-               --health-retries 1 \
+               --health-retries 2 \
                --health-timeout 1s \
                --health-on-failure=stop \
-               --stop-timeout=1 \
+               --stop-timeout=2 \
                --health-start-period 0 \
                --stop-signal SIGTERM \
                $IMAGE sleep infinity
diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats
index 7e01a5545bca..1cf1f9c0b325 100644
--- a/test/system/220-healthcheck.bats
+++ b/test/system/220-healthcheck.bats
@@ -18,7 +18,12 @@ function _check_health {
     local hc_status="$5"
 
     # Loop-wait (up to a few seconds) for healthcheck event (#20342)
+    # Allow a margin when running parallel, because of system load
     local timeout=5
+    if [[ -n "$PARALLEL_JOBSLOT" ]]; then
+        timeout=$((timeout + 3))
+    fi
+
     while :; do
         run_podman events --filter container=$ctrname --filter event=health_status \
                    --since "$since" --stream=false --format "{{.HealthStatus}}"
@@ -157,7 +162,7 @@ Log[-1].Output   | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\"
 
         # Wait for the container in the background and create the $wait_file to
         # signal the specified wait condition was met.
-        (timeout --foreground -v --kill=5 5 $PODMAN wait --condition=$condition $ctr && touch $wait_file) &
+        (timeout --foreground -v --kill=5 10 $PODMAN wait --condition=$condition $ctr && touch $wait_file) &
 
         # Sleep 1 second to make sure above commands are running
         sleep 1
diff --git a/test/system/252-quadlet.bats b/test/system/252-quadlet.bats
index f13f15169fa0..3ef297b86932 100644
--- a/test/system/252-quadlet.bats
+++ b/test/system/252-quadlet.bats
@@ -428,12 +428,14 @@ EOF
 
 # A quadlet container depends on a named quadlet volume
 @test "quadlet - named volume dependency" {
+    local volume_name="v-$(safename)"
+
     # Save the unit name to use as the volume for the container
     local quadlet_vol_unit=dep_$(safename).volume
     local quadlet_vol_file=$PODMAN_TMPDIR/${quadlet_vol_unit}
     cat > $quadlet_vol_file <<EOF
 [Volume]
-VolumeName=foo
+VolumeName=$volume_name
 EOF
 
     # Have quadlet create the systemd unit file for the volume unit
@@ -442,7 +444,6 @@ EOF
 
     # Save the volume service name since the variable will be overwritten
     local vol_service=$QUADLET_SERVICE_NAME
-    local volume_name="foo"
 
     local quadlet_file=$PODMAN_TMPDIR/user_$(safename).container
     cat > $quadlet_file <<EOF
@@ -975,6 +976,7 @@ EOF
 
       service_setup $QUADLET_SERVICE_NAME
 
+      # FIXME: log.91: Starting, not Started
       # Ensure we have output. Output is synced via sd-notify (socat in Exec)
       run journalctl "--since=$STARTED_TIME" --unit="$QUADLET_SERVICE_NAME"
       is "$output" '.*Started.*\.service.*'
diff --git a/test/system/500-networking.bats b/test/system/500-networking.bats
index 9bfaf05aa44c..4018206cd0e3 100644
--- a/test/system/500-networking.bats
+++ b/test/system/500-networking.bats
@@ -169,18 +169,25 @@ load helpers.network
     local con2_name=con2-$(safename)
     run_podman pod create --name $pod_name  --infra-name $infra_name
     pid="$output"
-    run_podman run --pod $pod_name --name $con1_name $IMAGE cat /etc/hosts
-    is "$output" ".*\s$pod_name $infra_name.*" "Pod hostname in /etc/hosts"
-    is "$output" ".*127.0.0.1\s$con1_name.*" "Container1 name in /etc/hosts"
+    run_podman run --rm --pod $pod_name --name $con1_name $IMAGE cat /etc/hosts
+    assert "$output" =~ ".*\s$pod_name $infra_name.*" \
+           "Pod hostname in /etc/hosts"
+    assert "$output" =~ ".*127.0.0.1\s$con1_name.*" \
+           "Container1 name in /etc/hosts"
     # get the length of the hosts file
     old_lines=${#lines[@]}
 
     # since the first container should be cleaned up now we should only see the
     # new host entry and the old one should be removed (lines check)
     run_podman run --pod $pod_name --name $con2_name $IMAGE cat /etc/hosts
-    is "$output" ".*\s$pod_name $infra_name.*" "Pod hostname in /etc/hosts"
-    is "$output" ".*127.0.0.1\s$con2_name.*" "Container2 name in /etc/hosts"
-    is "${#lines[@]}" "$old_lines" "Number of hosts lines is equal"
+    assert "$output" =~ ".*\s$pod_name $infra_name.*" \
+           "Pod hostname in /etc/hosts"
+    assert "$output" =~ ".*127.0.0.1\s$con2_name.*" \
+           "Container2 name in /etc/hosts"
+    assert "$output" !~ "$con1_name" \
+           "Container1 name should not be in /etc/hosts"
+    is "${#lines[@]}" "$old_lines" \
+       "Number of hosts lines is equal"
 
     run_podman run --pod $pod_name  $IMAGE sh -c  "hostname && cat /etc/hostname"
     is "${lines[0]}" "$pod_name" "hostname is the pod hostname"