From d571ca6536c2f49ad1d95e2c93f982e18201c45a Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 17 Sep 2024 09:19:52 -0600 Subject: [PATCH 1/9] system test parallelization: enable two-pass approach For the past two months we've been splitting system tests into two categories: those that CAN be run in parallel, and those that CANNOT. Much work has been done to replace hardcoded names (mycontainer, mypod) with safename(). Hundreds of test runs, in CI and on Ed's laptop, have proven this approach viable. make {local,remote}system now runs in two steps: first the serial ones, then the parallel ones. hack/bats will now recognize the 'ci:parallel' tag and add --jobs (nprocs). This requires some tweaking of leak_check, because there can be umpteen tests running (affecting image/container/pod/etc state) when any given test completes. Rules for enabling parallelization in tests: * use unique container/pod/volume/network names (safename) * do not run 'podman rm -a' or 'rmi -a' * never use the -l (--latest) option * do not run 'podman ps/images' and expect precise output Signed-off-by: Ed Santiago --- Makefile | 6 ++++-- hack/bats | 9 +++++--- test/system/helpers.bash | 42 +++++++++++++++++++++++------------- test/system/setup_suite.bash | 15 ++++++++----- 4 files changed, 47 insertions(+), 25 deletions(-) diff --git a/Makefile b/Makefile index 799ca84390ce..3cdae32101f7 100644 --- a/Makefile +++ b/Makefile @@ -688,7 +688,8 @@ localmachine: localsystem: # Wipe existing config, database, and cache: start with clean slate. $(RM) -rf ${HOME}/.local/share/containers ${HOME}/.config/containers - if timeout -v 1 true; then PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T test/system/; else echo "Skipping $@: 'timeout -v' unavailable'"; fi + PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags '!ci:parallel' test/system/ + PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ .PHONY: remotesystem remotesystem: @@ -717,7 +718,8 @@ remotesystem: echo "Error: ./bin/podman system service did not come up" >&2;\ exit 1;\ fi;\ - env PODMAN="$(CURDIR)/bin/podman-remote" bats -T test/system/ ;\ + env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags '!ci:parallel' test/system/ ;\ + env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ ;\ rc=$$?;\ kill %1;\ else \ diff --git a/hack/bats b/hack/bats index 3002c0199bf3..3184c95b7621 100755 --- a/hack/bats +++ b/hack/bats @@ -83,7 +83,10 @@ for i;do --rootless) TEST_ROOT= ;; --remote) REMOTE=remote ;; --ts|-T) bats_opts+=("-T") ;; - --tag=*) bats_filter=("--filter-tags" "$value") ;; + --tag=*) bats_filter=("--filter-tags" "$value") + if [[ "$value" = "ci:parallel" ]]; then + bats_opts+=("--jobs" $(nproc)) + fi;; */*.bats) TESTS=$i ;; *) if [[ $i =~ : ]]; then @@ -130,7 +133,7 @@ export PODMAN_BATS_LEAK_CHECK=1 # Root if [[ "$TEST_ROOT" ]]; then - echo "# bats ${bats_filter[*]} $TESTS" + echo "# bats ${bats_opts[*]} ${bats_filter[*]} $TESTS" sudo --preserve-env=PODMAN \ --preserve-env=QUADLET \ --preserve-env=PODMAN_TEST_DEBUG \ @@ -144,7 +147,7 @@ fi # Rootless. (Only if we're not already root) if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then echo "--------------------------------------------------" - echo "\$ bats ${bats_filter[*]} $TESTS" + echo "\$ bats ${bats_opts[*]} ${bats_filter[*]} $TESTS" bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS rc=$((rc | $?)) fi diff --git a/test/system/helpers.bash b/test/system/helpers.bash index 665c19210528..280729c5ab1b 100644 --- a/test/system/helpers.bash +++ b/test/system/helpers.bash @@ -222,22 +222,34 @@ function basic_teardown() { # This must be directly after immediate-assertion-failures to capture the error code local exit_code=$? - # Only checks for leaks on a successful run (BATS_TEST_COMPLETED is set 1), - # immediate-assertion-failures didn't fail (exit_code -eq 0) - # and PODMAN_BATS_LEAK_CHECK is set. - # As these podman commands are slow we do not want to do this by default - # and only provide this as opt in option. (#22909) - if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [ $exit_code -eq 0 ] && [ -n "$PODMAN_BATS_LEAK_CHECK" ]; then - leak_check - exit_code=$((exit_code + $?)) + # Leak check and state reset. Only run these when running tests serially! + # (For parallel tests, we run a leak check only at the very end of all tests) + if [[ -z "$PARALLEL_JOBSLOT" ]]; then + # Check for leaks, but only if: + # 1) test was successful (BATS_TEST_COMPLETED is set 1); and + # 2) immediate-assertion-failures didn't fail (exit_code -eq 0); and + # 3) PODMAN_BATS_LEAK_CHECK is set (usually only in cron). + # As these podman commands are slow we do not want to do this by default + # and only provide this as opt-in option. (#22909) + if [[ "$BATS_TEST_COMPLETED" -eq 1 ]] && [[ $exit_code -eq 0 ]] && [[ -n "$PODMAN_BATS_LEAK_CHECK" ]]; then + leak_check + exit_code=$((exit_code + $?)) + fi + + # Some error happened (either in teardown itself or the actual test failed) + # so do a full cleanup to ensure following tests start with a clean env. + if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then + clean_setup + exit_code=$((exit_code + $?)) + fi fi - # Some error happened (either in teardown itself or the actual test failed) - # so do a full cleanup to ensure following tests start with a clean env. - if [ $exit_code -gt 0 ] || [ -z "$BATS_TEST_COMPLETED" ]; then - clean_setup - exit_code=$((exit_code + $?)) + # Status file used in teardown_suite() to decide whether or not + # to check for leaks + if [[ "$BATS_TEST_COMPLETED" -ne 1 ]]; then + rm -f "$BATS_SUITE_TMPDIR/all-tests-passed" fi + command rm -rf $PODMAN_TMPDIR exit_code=$((exit_code + $?)) return $exit_code @@ -426,12 +438,12 @@ function clean_setup() { else # Always remove image that doesn't match by name echo "# setup(): removing stray image $1" >&3 - run_podman rmi --force "$1" >/dev/null 2>&1 || true + _run_podman_quiet rmi --force "$1" # Tagged image will have same IID as our test image; don't rmi it. if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then echo "# setup(): removing stray image $2" >&3 - run_podman rmi --force "$2" >/dev/null 2>&1 || true + _run_podman_quiet rmi --force "$2" fi fi done diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash index 4dd59981db0c..8a3f910839fd 100644 --- a/test/system/setup_suite.bash +++ b/test/system/setup_suite.bash @@ -32,6 +32,9 @@ function setup_suite() { "Unable to set PODMAN_LOGIN_REGISTRY_PORT" clean_setup + + # Canary file. Will be removed if any individual test fails. + touch "$BATS_SUITE_TMPDIR/all-tests-passed" } # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs. @@ -39,11 +42,13 @@ function teardown_suite() { stop_registry local exit_code=$? - # After all tests make sure there are no leaks and cleanup if there are - leak_check - if [ $? -gt 0 ]; then - exit_code=$((exit_code + 1)) - clean_setup + # At end, if all tests have passed, check for leaks. + # Don't do this if there were errors: failing tests may not clean up. + if [[ -e "$BATS_SUITE_TMPDIR/all-tests-passed" ]]; then + leak_check + if [ $? -gt 0 ]; then + exit_code=$((exit_code + 1)) + fi fi return $exit_code From bcffa9ce301cc0d69e589b7ffe51f4a5816603d9 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 17 Sep 2024 11:15:56 -0600 Subject: [PATCH 2/9] clean_setup: create pause image Workaround for #23292, where simultaneous 'pod create' commands will all start a podman-build of the pause image, but only one of them will be tagged, and the others will leak images. Signed-off-by: Ed Santiago --- test/system/helpers.bash | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/test/system/helpers.bash b/test/system/helpers.bash index 280729c5ab1b..c38273d78b0b 100644 --- a/test/system/helpers.bash +++ b/test/system/helpers.bash @@ -452,6 +452,17 @@ function clean_setup() { if [[ -z "$found_needed_image" ]]; then _prefetch $PODMAN_TEST_IMAGE_FQN fi + + # When running in parallel, load (create, actually) the pause image. + # This way, all pod tests will have it available. Without this, + # parallel pod tests will leave behind : images. + # FIXME: #23292 -- this should not be necessary. + if [[ -n "$PARALLEL_JOBSLOT" ]]; then + run_podman pod create mypod + run_podman pod rm mypod + # And now, we have a pause image, and each test does not + # need to build their own. + fi } # END setup/teardown tools From 6b621d9571421489c3028975ddf2fcc728e16bfe Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 16 Jul 2024 12:56:59 -0600 Subject: [PATCH 3/9] ci: bump system tests to fastvm Signed-off-by: Ed Santiago --- .cirrus.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.cirrus.yml b/.cirrus.yml index 9cba9792bb38..120aeaa1dc25 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -836,8 +836,8 @@ local_system_test_task: &local_system_test_task (changesInclude('**/*.go', '**/*.c', '**/*.h') && !changesIncludeOnly('test/**', 'pkg/machine/e2e/**')) depends_on: *build matrix: *platform_axis - gce_instance: *standardvm - timeout_in: 35m + gce_instance: *fastvm + timeout_in: 25m env: TEST_FLAVOR: sys clone_script: *get_gosrc @@ -890,8 +890,8 @@ rootless_remote_system_test_task: CTR_FQIN: ${FEDORA_CONTAINER_FQIN} <<: *local_system_test_task alias: rootless_remote_system_test - gce_instance: *standardvm - timeout_in: 35m + gce_instance: *fastvm + timeout_in: 25m env: TEST_FLAVOR: sys PODBIN_NAME: remote @@ -905,8 +905,8 @@ rootless_system_test_task: only_if: *only_if_system_test depends_on: *build matrix: *platform_axis - gce_instance: *standardvm - timeout_in: 35m + gce_instance: *fastvm + timeout_in: 25m env: TEST_FLAVOR: sys PRIV_NAME: rootless From 6502e30cfd71dc8435e8da220936752c2bec1599 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Thu, 25 Jul 2024 13:05:10 -0600 Subject: [PATCH 4/9] bats log: differentiate parallel tests from sequential For tests run in parallel, show file number as |nnn| (vs [nnn]) Teach logformatter to distinguish the two, adding 'p' to anchors in parallel tests. Necessary because in this scheme we run bats twice, thus see 'ok 1' twice, and we want to differentiate them. Signed-off-by: Ed Santiago --- contrib/cirrus/logformatter | 11 ++++++++--- test/system/helpers.bash | 7 ++++++- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/contrib/cirrus/logformatter b/contrib/cirrus/logformatter index 78ac1850d719..ed7135e8d2d9 100755 --- a/contrib/cirrus/logformatter +++ b/contrib/cirrus/logformatter @@ -349,7 +349,7 @@ END_HTML # BATS handling. This will recognize num_tests both at start and end if ($line =~ /^1\.\.(\d+)$/) { $looks_like_bats = 1; - $bats_count{expected_total} = $1; + $bats_count{expected_total} += $1; undef $looks_like_python; } # Since the number of tests can't always be predicted, recognize @@ -445,9 +445,14 @@ END_HTML if ($css) { # Make it linkable, e.g. foo.html#t--00001 if ($line =~ /^(not\s+)?ok\s+(\d+)\s+(.*)/) { - $line = sprintf("%s", $2, $line); + my ($notok, $num, $name) = ($1, $2, $3); + # Parallel system tests are shown as |nnn| instead of [nnn] + my $p = ''; + $p = 'p' if $name =~ /^\s*\|\d+\|/; + + $line = sprintf("%s", $num, $p, $line); - push @{$bats_count{__fail_list}}, [ $2, $3 ] if $1; + push @{$bats_count{__fail_list}}, [ $num, $name ] if $notok; } $bats_count{$css}++; $css = "bats-$css"; diff --git a/test/system/helpers.bash b/test/system/helpers.bash index c38273d78b0b..3b6b5b00ed5b 100644 --- a/test/system/helpers.bash +++ b/test/system/helpers.bash @@ -160,7 +160,12 @@ function basic_setup() { # Test filenames must match ###-name.bats; use "[###] " as prefix run expr "$BATS_TEST_FILENAME" : "^.*/\([0-9]\{3\}\)-[^/]\+\.bats\$" - BATS_TEST_NAME_PREFIX="[${output}] " + # If parallel, use |nnn|. Serial, [nnn] + if [[ -n "$PARALLEL_JOBSLOT" ]]; then + BATS_TEST_NAME_PREFIX="|${output}| " + else + BATS_TEST_NAME_PREFIX="[${output}] " + fi # By default, assert() and die() cause an immediate test failure. # Under special circumstances (usually long test loops), tests From bf6131780accc2725805493bc07f443f615b2cd3 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 17 Sep 2024 09:54:49 -0600 Subject: [PATCH 5/9] Update system test template and README Add a few best-practices examples, and add a whole section describing the dos and donts of writing parallel-safe tests. Signed-off-by: Ed Santiago --- test/system/000-TEMPLATE | 42 ++++++++++++++++++++++++++++++++++++++++ test/system/README.md | 17 ++++++++++------ 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/test/system/000-TEMPLATE b/test/system/000-TEMPLATE index a6a9ca141873..cff5de7263a3 100644 --- a/test/system/000-TEMPLATE +++ b/test/system/000-TEMPLATE @@ -11,6 +11,22 @@ load helpers args="some sort of argument list" run_podman subcmd $args assert "$output" == "what we expect" "output from 'podman subcmd $args'" + + # safename() provides a lower-case string with both the BATS test number + # and a pseudorandom element. Its use is strongly encouraged for container + # names, volumes, networks, images, everything, because the test number + # makes it possible to find leaked elements. + cname="c-$(safename)" + + # Run "top" in a detached container with a safe name + run_podman run -d --name $cname --this-option --that $IMAGE top + + # A number ("17") as the first run_podman arg means "expect this exit code". + # By default, run_podman expects success, and will barf on nonzero status. + run_podman 17 run --this-option --that $IMAGE exit 17 + + # Give a hoot, don't pollute + run_podman rm -f -t0 $cname } # vim: filetype=sh @@ -110,5 +126,31 @@ size | -\\\?[0-9]\\\+ done < <(parse_table "$tests") } +# Whenever possible, please add the ci:parallel tag to new tests, +# not only for speed but for stress testing. +# +# This is an example of what NOT to do when enabling parallel tests. +# +# bats test_tags=ci:parallel +@test "this test is completely broken in parallel" { + # Never use "--name HARDCODED". Define 'cname=c-$(safename)' instead. + # Not only does that guarantee uniqueness, it also gives the test number + # in the name, so if there's a leak (at end of tests) it will be possible + # to identify the culprit. + run_podman --name mycontainer $IMAGE top + + # Same thing for build and -t + run_podman build -t myimage ... + + # Never assume exact output from podman ps! Many other containers may be running. + run_podman ps + assert "$output" = "mycontainer" + + # Never use "-l". It is meaningless when other processes are running. + run_podman container inspect -l + + # Never 'rm -a'!!! OMG like seriously just don't. + run_podman rm -f -a +} # vim: filetype=sh diff --git a/test/system/README.md b/test/system/README.md index 8e4a98d4cd3b..1800a1fd17f2 100644 --- a/test/system/README.md +++ b/test/system/README.md @@ -5,11 +5,10 @@ debug failures. Quick Start =========== -Look at [030-run.bats](030-run.bats) for a simple but packed example. +Look at [000-TEMPLATE](000-TEMPLATE) for a simple starting point. This introduces the basic set of helper functions: -* `setup` (implicit) - resets container storage so there's -one and only one (standard) image, and no running containers. +* `setup` (implicit) - establishes a test environment. * `parse_table` - you can define tables of inputs and expected results, then read those in a `while` loop. This makes it easy to add new tests. @@ -21,7 +20,7 @@ examples of how to deal with the more typical such issues. but could also be './bin/podman' or 'podman-remote'), with a timeout. Checks its exit status. -* `is` - compare actual vs expected output. Emits a useful diagnostic +* `assert` - compare actual vs expected output. Emits a useful diagnostic on failure. * `die` - output a properly-formatted message to stderr, and fail test @@ -30,7 +29,13 @@ on failure. * `skip_if_remote` - like the above, but skip if testing `podman-remote` -* `random_string` - returns a pseudorandom alphanumeric string +* `safename` - generates a pseudorandom lower-case string suitable +for use in names for containers, images, volumes, any object. String +includes the BATS test number, making it possible to identify the +source of leaks (failure to clean up) at the end of tests. + +* `random_string` - returns a pseudorandom alphanumeric string suitable +for verifying I/O. Test files are of the form `NNN-name.bats` where NNN is a three-digit number. Please preserve this convention, it simplifies viewing the @@ -59,7 +64,7 @@ commands, their output and exit codes. In a normal run you will never see this, but BATS will display it on failure. The goal here is to give you everything you need to diagnose without having to rerun tests. -The `is` comparison function is designed to emit useful diagnostics, +The `assert` comparison function is designed to emit useful diagnostics, in particular, the actual and expected strings. Please do not use the horrible BATS standard of `[ x = y ]`; that's nearly useless for tracking down failures. From 5fc3de55837be1849b33b41490e56dd8e3835794 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 17 Sep 2024 10:06:34 -0600 Subject: [PATCH 6/9] registry: lock start attempts When running parallel, multiple tests could be trying to start the registry at once. Make this parallel-safe. Also, use a safer port range for the registry. Something outside of /proc/sys/net/ipv4/ip_local_port_range Sorry, I'm including a FIXME section that I haven't investigated deeply enough. Signed-off-by: Ed Santiago --- test/system/helpers.registry.bash | 39 +++++++++++++++++++++++++------ test/system/setup_suite.bash | 2 +- 2 files changed, 33 insertions(+), 8 deletions(-) diff --git a/test/system/helpers.registry.bash b/test/system/helpers.registry.bash index 643a0e51510c..ff57ad8b876a 100644 --- a/test/system/helpers.registry.bash +++ b/test/system/helpers.registry.bash @@ -17,8 +17,26 @@ unset REGISTRY_AUTH_FILE # Start a local registry. Only needed on demand (e.g. by 150-login.bats) # and then only once: if we start, leave it running until final teardown. function start_registry() { - if [[ -d "$PODMAN_LOGIN_WORKDIR/auth" ]]; then - # Already started + AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth + + local startflag=${PODMAN_LOGIN_WORKDIR}/OK + + if ! mkdir $AUTHDIR; then + # *Possibly* already started. Or, possibly (when running + # parallel tests) another process is trying to start it. + # Give it some time. + local timeout=30 + while [[ $timeout -gt 0 ]]; do + if [[ -e $startflag ]]; then + echo "Registry has already been started by another process" + return + fi + + sleep 1 + timeout=$((timeout - 1)) + done + + die "Internal error: timed out waiting for another process to start registry" # Fixes very obscure corner case in root system tests: # 1) we run 150-login tests, starting a registry; then @@ -26,11 +44,15 @@ function start_registry() { # 3) run 700-play, the "private" test, which needs the # already-started registry, but its port is now DROPped, # so the test times out trying to talk to registry - run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all + + ###### FIXME FIXME FIXME TEMPORARY! + ###### Trying to understand flake #23725. What happens if we stop + ###### doing the network reload? + ###### FIXME FIXME FIXME, should we do it in stop_registry?? + ###### run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all return fi - AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth mkdir -p $AUTHDIR # Registry image; copy of docker.io, but on our own registry @@ -79,6 +101,9 @@ function start_registry() { wait_for_port 127.0.0.1 ${PODMAN_LOGIN_REGISTRY_PORT} # ...so we look in container logs for confirmation that registry is running. _PODMAN_TEST_OPTS="${PODMAN_LOGIN_ARGS}" wait_for_output "listening on .::.:5000" $cid + + touch $startflag + echo "I have started the registry" } function stop_registry() { @@ -103,10 +128,10 @@ function stop_registry() { mount | grep ${PODMAN_LOGIN_WORKDIR} | awk '{print $3}' | xargs --no-run-if-empty umount if [[ $(id -u) -eq 0 ]]; then - rm -rf ${PODMAN_LOGIN_WORKDIR} + rm -rf ${PODMAN_LOGIN_WORKDIR}/* else # rootless image data is owned by a subuid - run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR} + run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}/* fi fi @@ -119,7 +144,7 @@ function stop_registry() { echo "" echo "lsof -i -P" lsof -i -P - die "Socket still seems open" + die "Socket $PODMAN_LOGIN_REGISTRY_PORT still seems open" fi } diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash index 8a3f910839fd..fa7169a12782 100644 --- a/test/system/setup_suite.bash +++ b/test/system/setup_suite.bash @@ -25,7 +25,7 @@ function setup_suite() { # FIXME: racy! It could be many minutes between now and when we start it. # To mitigate, we use a range not used anywhere else in system tests. - export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 42000-42999) + export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 27000-27999) # The above does not handle errors. Do a final confirmation. assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \ From b3da5be2b1fdaffb0d5cf5571b6468a6952f3b66 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 17 Sep 2024 10:37:08 -0600 Subject: [PATCH 7/9] Add workaround for buildah parallel bug Need --layers=false in podman build, otherwise a buildah race can trigger "layer not known" failures: https://github.com/containers/buildah/issues/5674 Signed-off-by: Ed Santiago --- test/system/030-run.bats | 6 ++++-- test/system/060-mount.bats | 5 +++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/test/system/030-run.bats b/test/system/030-run.bats index e254fe8183df..117e90cc4857 100644 --- a/test/system/030-run.bats +++ b/test/system/030-run.bats @@ -874,8 +874,9 @@ RUN rm /etc/mtab EOF expected="'/etc/mtab' -> '/proc/mounts'" + # --layers=false needed to work around buildah#5674 parallel flake local iname=nomtab-$(safename) - run_podman build -t $iname $tmpdir + run_podman build -t $iname --layers=false $tmpdir run_podman run --rm $iname stat -c %N /etc/mtab is "$output" "$expected" "/etc/mtab should be created" @@ -1080,8 +1081,9 @@ echo -e "#!/bin/sh\nfalse" >> /usr/bin/nsenter; \ chmod +x /usr/bin/nsenter EOF + # --layers=false needed to work around buildah#5674 parallel flake test_image="cve_2022_1227_test-$(safename)" - run_podman build -t $test_image $tmpbuilddir + run_podman build -t $test_image --layers=false $tmpbuilddir run_podman run -d ${keepid} $test_image top ctr="$output" run_podman top $ctr huser,user diff --git a/test/system/060-mount.bats b/test/system/060-mount.bats index 3dbcd8631215..792c77716521 100644 --- a/test/system/060-mount.bats +++ b/test/system/060-mount.bats @@ -409,8 +409,9 @@ FROM $IMAGE RUN mkdir /mountroot && echo ${datacontent[img]} > /mountroot/data EOF + # --layers=false needed to work around buildah#5674 parallel flake img="localhost/img-$(safename)" - run_podman build -t $img -f $dockerfile + run_podman build -t $img --layers=false -f $dockerfile # Each test is set up in exactly the same way: # @@ -554,7 +555,7 @@ glob | /* | /mountroot/ | in echo "$datafile_contents" > $workdir/$datafile ln -s $datafile $workdir/link - run_podman create --mount type=glob,src=$workdir/*,dst=/mountroot/,no-dereference --privileged $img sh -c "stat -c '%N' /mountroot/link; cat /mountroot/link" + run_podman create --mount type=glob,src=$workdir/*,dst=/mountroot/,no-dereference --privileged $img sh -c "stat -c '%N' /mountroot/link; cat /mountroot/link; ls -l /mountroot" cid="$output" run_podman start -a $cid assert "${lines[0]}" = "'/mountroot/link' -> '$datafile'" "symlink is preserved, on start" From 7fcf94d7b5d641068db99aad72bc13c38e93a7cf Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 17 Sep 2024 10:41:32 -0600 Subject: [PATCH 8/9] Add network namespace leak check Signed-off-by: Ed Santiago --- test/system/setup_suite.bash | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash index fa7169a12782..8a920c614167 100644 --- a/test/system/setup_suite.bash +++ b/test/system/setup_suite.bash @@ -35,6 +35,9 @@ function setup_suite() { # Canary file. Will be removed if any individual test fails. touch "$BATS_SUITE_TMPDIR/all-tests-passed" + + # Track network namespaces, so we can check for leaks at test end + ip netns list > $BATS_SUITE_TMPDIR/netns-pre } # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs. @@ -49,6 +52,15 @@ function teardown_suite() { if [ $? -gt 0 ]; then exit_code=$((exit_code + 1)) fi + + # Network namespace leak check. List should match what we saw above. + echo + ip netns list > $BATS_SUITE_TMPDIR/netns-post + if ! diff -u $BATS_SUITE_TMPDIR/netns-{pre,post}; then + echo + echo "^^^^^ Leaks found in /run/netns ^^^^^" + exit_code=$((exit_code + 1)) + fi fi return $exit_code From 8402b6535f287fbf061c001a6454b061553b6884 Mon Sep 17 00:00:00 2001 From: Ed Santiago Date: Tue, 17 Sep 2024 10:50:20 -0600 Subject: [PATCH 9/9] Misc minor test fixes ...for dealing with flakes in parallel mode Signed-off-by: Ed Santiago --- test/system/055-rm.bats | 4 ++-- test/system/220-healthcheck.bats | 7 ++++++- test/system/252-quadlet.bats | 6 ++++-- test/system/500-networking.bats | 19 +++++++++++++------ 4 files changed, 25 insertions(+), 11 deletions(-) diff --git a/test/system/055-rm.bats b/test/system/055-rm.bats index e9a23dba5200..3673b7dc2581 100644 --- a/test/system/055-rm.bats +++ b/test/system/055-rm.bats @@ -127,10 +127,10 @@ function __run_healthcheck_container() { run_podman run -d --name $1 \ --health-cmd /bin/false \ --health-interval 1s \ - --health-retries 1 \ + --health-retries 2 \ --health-timeout 1s \ --health-on-failure=stop \ - --stop-timeout=1 \ + --stop-timeout=2 \ --health-start-period 0 \ --stop-signal SIGTERM \ $IMAGE sleep infinity diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats index 7e01a5545bca..1cf1f9c0b325 100644 --- a/test/system/220-healthcheck.bats +++ b/test/system/220-healthcheck.bats @@ -18,7 +18,12 @@ function _check_health { local hc_status="$5" # Loop-wait (up to a few seconds) for healthcheck event (#20342) + # Allow a margin when running parallel, because of system load local timeout=5 + if [[ -n "$PARALLEL_JOBSLOT" ]]; then + timeout=$((timeout + 3)) + fi + while :; do run_podman events --filter container=$ctrname --filter event=health_status \ --since "$since" --stream=false --format "{{.HealthStatus}}" @@ -157,7 +162,7 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\" # Wait for the container in the background and create the $wait_file to # signal the specified wait condition was met. - (timeout --foreground -v --kill=5 5 $PODMAN wait --condition=$condition $ctr && touch $wait_file) & + (timeout --foreground -v --kill=5 10 $PODMAN wait --condition=$condition $ctr && touch $wait_file) & # Sleep 1 second to make sure above commands are running sleep 1 diff --git a/test/system/252-quadlet.bats b/test/system/252-quadlet.bats index f13f15169fa0..3ef297b86932 100644 --- a/test/system/252-quadlet.bats +++ b/test/system/252-quadlet.bats @@ -428,12 +428,14 @@ EOF # A quadlet container depends on a named quadlet volume @test "quadlet - named volume dependency" { + local volume_name="v-$(safename)" + # Save the unit name to use as the volume for the container local quadlet_vol_unit=dep_$(safename).volume local quadlet_vol_file=$PODMAN_TMPDIR/${quadlet_vol_unit} cat > $quadlet_vol_file < $quadlet_file <