diff --git a/.cirrus.yml b/.cirrus.yml index 9cba9792bb..120aeaa1dc 100644 --- a/.cirrus.yml +++ b/.cirrus.yml @@ -836,8 +836,8 @@ local_system_test_task: &local_system_test_task (changesInclude('**/*.go', '**/*.c', '**/*.h') && !changesIncludeOnly('test/**', 'pkg/machine/e2e/**')) depends_on: *build matrix: *platform_axis - gce_instance: *standardvm - timeout_in: 35m + gce_instance: *fastvm + timeout_in: 25m env: TEST_FLAVOR: sys clone_script: *get_gosrc @@ -890,8 +890,8 @@ rootless_remote_system_test_task: CTR_FQIN: ${FEDORA_CONTAINER_FQIN} <<: *local_system_test_task alias: rootless_remote_system_test - gce_instance: *standardvm - timeout_in: 35m + gce_instance: *fastvm + timeout_in: 25m env: TEST_FLAVOR: sys PODBIN_NAME: remote @@ -905,8 +905,8 @@ rootless_system_test_task: only_if: *only_if_system_test depends_on: *build matrix: *platform_axis - gce_instance: *standardvm - timeout_in: 35m + gce_instance: *fastvm + timeout_in: 25m env: TEST_FLAVOR: sys PRIV_NAME: rootless diff --git a/Makefile b/Makefile index 799ca84390..3cdae32101 100644 --- a/Makefile +++ b/Makefile @@ -688,7 +688,8 @@ localmachine: localsystem: # Wipe existing config, database, and cache: start with clean slate. $(RM) -rf ${HOME}/.local/share/containers ${HOME}/.config/containers - if timeout -v 1 true; then PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T test/system/; else echo "Skipping $@: 'timeout -v' unavailable'"; fi + PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags '!ci:parallel' test/system/ + PODMAN=$(CURDIR)/bin/podman QUADLET=$(CURDIR)/bin/quadlet bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ .PHONY: remotesystem remotesystem: @@ -717,7 +718,8 @@ remotesystem: echo "Error: ./bin/podman system service did not come up" >&2;\ exit 1;\ fi;\ - env PODMAN="$(CURDIR)/bin/podman-remote" bats -T test/system/ ;\ + env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags '!ci:parallel' test/system/ ;\ + env PODMAN="$(CURDIR)/bin/podman-remote" bats -T --filter-tags ci:parallel -j $$(nproc) test/system/ ;\ rc=$$?;\ kill %1;\ else \ diff --git a/contrib/cirrus/logformatter b/contrib/cirrus/logformatter index 78ac1850d7..ed7135e8d2 100755 --- a/contrib/cirrus/logformatter +++ b/contrib/cirrus/logformatter @@ -349,7 +349,7 @@ END_HTML # BATS handling. This will recognize num_tests both at start and end if ($line =~ /^1\.\.(\d+)$/) { $looks_like_bats = 1; - $bats_count{expected_total} = $1; + $bats_count{expected_total} += $1; undef $looks_like_python; } # Since the number of tests can't always be predicted, recognize @@ -445,9 +445,14 @@ END_HTML if ($css) { # Make it linkable, e.g. foo.html#t--00001 if ($line =~ /^(not\s+)?ok\s+(\d+)\s+(.*)/) { - $line = sprintf("%s", $2, $line); + my ($notok, $num, $name) = ($1, $2, $3); + # Parallel system tests are shown as |nnn| instead of [nnn] + my $p = ''; + $p = 'p' if $name =~ /^\s*\|\d+\|/; + + $line = sprintf("%s", $num, $p, $line); - push @{$bats_count{__fail_list}}, [ $2, $3 ] if $1; + push @{$bats_count{__fail_list}}, [ $num, $name ] if $notok; } $bats_count{$css}++; $css = "bats-$css"; diff --git a/hack/bats b/hack/bats index 3002c0199b..3184c95b76 100755 --- a/hack/bats +++ b/hack/bats @@ -83,7 +83,10 @@ for i;do --rootless) TEST_ROOT= ;; --remote) REMOTE=remote ;; --ts|-T) bats_opts+=("-T") ;; - --tag=*) bats_filter=("--filter-tags" "$value") ;; + --tag=*) bats_filter=("--filter-tags" "$value") + if [[ "$value" = "ci:parallel" ]]; then + bats_opts+=("--jobs" $(nproc)) + fi;; */*.bats) TESTS=$i ;; *) if [[ $i =~ : ]]; then @@ -130,7 +133,7 @@ export PODMAN_BATS_LEAK_CHECK=1 # Root if [[ "$TEST_ROOT" ]]; then - echo "# bats ${bats_filter[*]} $TESTS" + echo "# bats ${bats_opts[*]} ${bats_filter[*]} $TESTS" sudo --preserve-env=PODMAN \ --preserve-env=QUADLET \ --preserve-env=PODMAN_TEST_DEBUG \ @@ -144,7 +147,7 @@ fi # Rootless. (Only if we're not already root) if [[ "$TEST_ROOTLESS" && "$(id -u)" != 0 ]]; then echo "--------------------------------------------------" - echo "\$ bats ${bats_filter[*]} $TESTS" + echo "\$ bats ${bats_opts[*]} ${bats_filter[*]} $TESTS" bats "${bats_opts[@]}" "${bats_filter[@]}" $TESTS rc=$((rc | $?)) fi diff --git a/test/system/000-TEMPLATE b/test/system/000-TEMPLATE index a6a9ca1418..cff5de7263 100644 --- a/test/system/000-TEMPLATE +++ b/test/system/000-TEMPLATE @@ -11,6 +11,22 @@ load helpers args="some sort of argument list" run_podman subcmd $args assert "$output" == "what we expect" "output from 'podman subcmd $args'" + + # safename() provides a lower-case string with both the BATS test number + # and a pseudorandom element. Its use is strongly encouraged for container + # names, volumes, networks, images, everything, because the test number + # makes it possible to find leaked elements. + cname="c-$(safename)" + + # Run "top" in a detached container with a safe name + run_podman run -d --name $cname --this-option --that $IMAGE top + + # A number ("17") as the first run_podman arg means "expect this exit code". + # By default, run_podman expects success, and will barf on nonzero status. + run_podman 17 run --this-option --that $IMAGE exit 17 + + # Give a hoot, don't pollute + run_podman rm -f -t0 $cname } # vim: filetype=sh @@ -110,5 +126,31 @@ size | -\\\?[0-9]\\\+ done < <(parse_table "$tests") } +# Whenever possible, please add the ci:parallel tag to new tests, +# not only for speed but for stress testing. +# +# This is an example of what NOT to do when enabling parallel tests. +# +# bats test_tags=ci:parallel +@test "this test is completely broken in parallel" { + # Never use "--name HARDCODED". Define 'cname=c-$(safename)' instead. + # Not only does that guarantee uniqueness, it also gives the test number + # in the name, so if there's a leak (at end of tests) it will be possible + # to identify the culprit. + run_podman --name mycontainer $IMAGE top + + # Same thing for build and -t + run_podman build -t myimage ... + + # Never assume exact output from podman ps! Many other containers may be running. + run_podman ps + assert "$output" = "mycontainer" + + # Never use "-l". It is meaningless when other processes are running. + run_podman container inspect -l + + # Never 'rm -a'!!! OMG like seriously just don't. + run_podman rm -f -a +} # vim: filetype=sh diff --git a/test/system/030-run.bats b/test/system/030-run.bats index e254fe8183..117e90cc48 100644 --- a/test/system/030-run.bats +++ b/test/system/030-run.bats @@ -874,8 +874,9 @@ RUN rm /etc/mtab EOF expected="'/etc/mtab' -> '/proc/mounts'" + # --layers=false needed to work around buildah#5674 parallel flake local iname=nomtab-$(safename) - run_podman build -t $iname $tmpdir + run_podman build -t $iname --layers=false $tmpdir run_podman run --rm $iname stat -c %N /etc/mtab is "$output" "$expected" "/etc/mtab should be created" @@ -1080,8 +1081,9 @@ echo -e "#!/bin/sh\nfalse" >> /usr/bin/nsenter; \ chmod +x /usr/bin/nsenter EOF + # --layers=false needed to work around buildah#5674 parallel flake test_image="cve_2022_1227_test-$(safename)" - run_podman build -t $test_image $tmpbuilddir + run_podman build -t $test_image --layers=false $tmpbuilddir run_podman run -d ${keepid} $test_image top ctr="$output" run_podman top $ctr huser,user diff --git a/test/system/055-rm.bats b/test/system/055-rm.bats index e9a23dba52..3673b7dc25 100644 --- a/test/system/055-rm.bats +++ b/test/system/055-rm.bats @@ -127,10 +127,10 @@ function __run_healthcheck_container() { run_podman run -d --name $1 \ --health-cmd /bin/false \ --health-interval 1s \ - --health-retries 1 \ + --health-retries 2 \ --health-timeout 1s \ --health-on-failure=stop \ - --stop-timeout=1 \ + --stop-timeout=2 \ --health-start-period 0 \ --stop-signal SIGTERM \ $IMAGE sleep infinity diff --git a/test/system/060-mount.bats b/test/system/060-mount.bats index 3dbcd86312..792c777165 100644 --- a/test/system/060-mount.bats +++ b/test/system/060-mount.bats @@ -409,8 +409,9 @@ FROM $IMAGE RUN mkdir /mountroot && echo ${datacontent[img]} > /mountroot/data EOF + # --layers=false needed to work around buildah#5674 parallel flake img="localhost/img-$(safename)" - run_podman build -t $img -f $dockerfile + run_podman build -t $img --layers=false -f $dockerfile # Each test is set up in exactly the same way: # @@ -554,7 +555,7 @@ glob | /* | /mountroot/ | in echo "$datafile_contents" > $workdir/$datafile ln -s $datafile $workdir/link - run_podman create --mount type=glob,src=$workdir/*,dst=/mountroot/,no-dereference --privileged $img sh -c "stat -c '%N' /mountroot/link; cat /mountroot/link" + run_podman create --mount type=glob,src=$workdir/*,dst=/mountroot/,no-dereference --privileged $img sh -c "stat -c '%N' /mountroot/link; cat /mountroot/link; ls -l /mountroot" cid="$output" run_podman start -a $cid assert "${lines[0]}" = "'/mountroot/link' -> '$datafile'" "symlink is preserved, on start" diff --git a/test/system/220-healthcheck.bats b/test/system/220-healthcheck.bats index 7e01a5545b..1cf1f9c0b3 100644 --- a/test/system/220-healthcheck.bats +++ b/test/system/220-healthcheck.bats @@ -18,7 +18,12 @@ function _check_health { local hc_status="$5" # Loop-wait (up to a few seconds) for healthcheck event (#20342) + # Allow a margin when running parallel, because of system load local timeout=5 + if [[ -n "$PARALLEL_JOBSLOT" ]]; then + timeout=$((timeout + 3)) + fi + while :; do run_podman events --filter container=$ctrname --filter event=health_status \ --since "$since" --stream=false --format "{{.HealthStatus}}" @@ -157,7 +162,7 @@ Log[-1].Output | \"Uh-oh on stdout!\\\nUh-oh on stderr!\\\n\" # Wait for the container in the background and create the $wait_file to # signal the specified wait condition was met. - (timeout --foreground -v --kill=5 5 $PODMAN wait --condition=$condition $ctr && touch $wait_file) & + (timeout --foreground -v --kill=5 10 $PODMAN wait --condition=$condition $ctr && touch $wait_file) & # Sleep 1 second to make sure above commands are running sleep 1 diff --git a/test/system/252-quadlet.bats b/test/system/252-quadlet.bats index f13f15169f..3ef297b869 100644 --- a/test/system/252-quadlet.bats +++ b/test/system/252-quadlet.bats @@ -428,12 +428,14 @@ EOF # A quadlet container depends on a named quadlet volume @test "quadlet - named volume dependency" { + local volume_name="v-$(safename)" + # Save the unit name to use as the volume for the container local quadlet_vol_unit=dep_$(safename).volume local quadlet_vol_file=$PODMAN_TMPDIR/${quadlet_vol_unit} cat > $quadlet_vol_file < $quadlet_file <&3 - run_podman rmi --force "$1" >/dev/null 2>&1 || true + _run_podman_quiet rmi --force "$1" # Tagged image will have same IID as our test image; don't rmi it. if [[ $2 != "$PODMAN_TEST_IMAGE_ID" ]]; then echo "# setup(): removing stray image $2" >&3 - run_podman rmi --force "$2" >/dev/null 2>&1 || true + _run_podman_quiet rmi --force "$2" fi fi done @@ -440,6 +457,17 @@ function clean_setup() { if [[ -z "$found_needed_image" ]]; then _prefetch $PODMAN_TEST_IMAGE_FQN fi + + # When running in parallel, load (create, actually) the pause image. + # This way, all pod tests will have it available. Without this, + # parallel pod tests will leave behind : images. + # FIXME: #23292 -- this should not be necessary. + if [[ -n "$PARALLEL_JOBSLOT" ]]; then + run_podman pod create mypod + run_podman pod rm mypod + # And now, we have a pause image, and each test does not + # need to build their own. + fi } # END setup/teardown tools diff --git a/test/system/helpers.registry.bash b/test/system/helpers.registry.bash index 643a0e5151..ff57ad8b87 100644 --- a/test/system/helpers.registry.bash +++ b/test/system/helpers.registry.bash @@ -17,8 +17,26 @@ unset REGISTRY_AUTH_FILE # Start a local registry. Only needed on demand (e.g. by 150-login.bats) # and then only once: if we start, leave it running until final teardown. function start_registry() { - if [[ -d "$PODMAN_LOGIN_WORKDIR/auth" ]]; then - # Already started + AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth + + local startflag=${PODMAN_LOGIN_WORKDIR}/OK + + if ! mkdir $AUTHDIR; then + # *Possibly* already started. Or, possibly (when running + # parallel tests) another process is trying to start it. + # Give it some time. + local timeout=30 + while [[ $timeout -gt 0 ]]; do + if [[ -e $startflag ]]; then + echo "Registry has already been started by another process" + return + fi + + sleep 1 + timeout=$((timeout - 1)) + done + + die "Internal error: timed out waiting for another process to start registry" # Fixes very obscure corner case in root system tests: # 1) we run 150-login tests, starting a registry; then @@ -26,11 +44,15 @@ function start_registry() { # 3) run 700-play, the "private" test, which needs the # already-started registry, but its port is now DROPped, # so the test times out trying to talk to registry - run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all + + ###### FIXME FIXME FIXME TEMPORARY! + ###### Trying to understand flake #23725. What happens if we stop + ###### doing the network reload? + ###### FIXME FIXME FIXME, should we do it in stop_registry?? + ###### run_podman --storage-driver vfs $(podman_isolation_opts ${PODMAN_LOGIN_WORKDIR}) network reload --all return fi - AUTHDIR=${PODMAN_LOGIN_WORKDIR}/auth mkdir -p $AUTHDIR # Registry image; copy of docker.io, but on our own registry @@ -79,6 +101,9 @@ function start_registry() { wait_for_port 127.0.0.1 ${PODMAN_LOGIN_REGISTRY_PORT} # ...so we look in container logs for confirmation that registry is running. _PODMAN_TEST_OPTS="${PODMAN_LOGIN_ARGS}" wait_for_output "listening on .::.:5000" $cid + + touch $startflag + echo "I have started the registry" } function stop_registry() { @@ -103,10 +128,10 @@ function stop_registry() { mount | grep ${PODMAN_LOGIN_WORKDIR} | awk '{print $3}' | xargs --no-run-if-empty umount if [[ $(id -u) -eq 0 ]]; then - rm -rf ${PODMAN_LOGIN_WORKDIR} + rm -rf ${PODMAN_LOGIN_WORKDIR}/* else # rootless image data is owned by a subuid - run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR} + run_podman unshare rm -rf ${PODMAN_LOGIN_WORKDIR}/* fi fi @@ -119,7 +144,7 @@ function stop_registry() { echo "" echo "lsof -i -P" lsof -i -P - die "Socket still seems open" + die "Socket $PODMAN_LOGIN_REGISTRY_PORT still seems open" fi } diff --git a/test/system/setup_suite.bash b/test/system/setup_suite.bash index 4dd59981db..8a920c6141 100644 --- a/test/system/setup_suite.bash +++ b/test/system/setup_suite.bash @@ -25,13 +25,19 @@ function setup_suite() { # FIXME: racy! It could be many minutes between now and when we start it. # To mitigate, we use a range not used anywhere else in system tests. - export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 42000-42999) + export PODMAN_LOGIN_REGISTRY_PORT=$(random_free_port 27000-27999) # The above does not handle errors. Do a final confirmation. assert "$PODMAN_LOGIN_REGISTRY_PORT" != "" \ "Unable to set PODMAN_LOGIN_REGISTRY_PORT" clean_setup + + # Canary file. Will be removed if any individual test fails. + touch "$BATS_SUITE_TMPDIR/all-tests-passed" + + # Track network namespaces, so we can check for leaks at test end + ip netns list > $BATS_SUITE_TMPDIR/netns-pre } # Run at the very end of all tests. Useful for cleanup of non-BATS tmpdirs. @@ -39,11 +45,22 @@ function teardown_suite() { stop_registry local exit_code=$? - # After all tests make sure there are no leaks and cleanup if there are - leak_check - if [ $? -gt 0 ]; then - exit_code=$((exit_code + 1)) - clean_setup + # At end, if all tests have passed, check for leaks. + # Don't do this if there were errors: failing tests may not clean up. + if [[ -e "$BATS_SUITE_TMPDIR/all-tests-passed" ]]; then + leak_check + if [ $? -gt 0 ]; then + exit_code=$((exit_code + 1)) + fi + + # Network namespace leak check. List should match what we saw above. + echo + ip netns list > $BATS_SUITE_TMPDIR/netns-post + if ! diff -u $BATS_SUITE_TMPDIR/netns-{pre,post}; then + echo + echo "^^^^^ Leaks found in /run/netns ^^^^^" + exit_code=$((exit_code + 1)) + fi fi return $exit_code