From 5397c2b8c1bc147ef17347813f5b24dcac2a5f26 Mon Sep 17 00:00:00 2001
From: Christian Theune
Date: Sat, 18 Nov 2023 18:54:28 +0100
Subject: [PATCH] migration: fix crash due to failed heartbeats not propagating

tests: reduce test flakiness

This introduces pytest-patterns as a replacement for the current use of
Ellipsis.

Increase general timeouts so tests are a bit less flaky.

Improve the lock logging so that acquiring and releasing locks is easier
to understand and debug when reading logs; I think this also fixes some
race conditions.

Fix VM kill handling to include our little supervisor process and to
decide selectively whether it needs to be killed.

Add more test output about the environment so that we have something to
debug with in case the tests fail.

Fixes PL-131744
---
 pytest.ini                       |   2 +-
 setup.py                         |   3 +
 src/fc/qemu/agent.py             |  27 ++--
 src/fc/qemu/conftest.py          |  63 +++++++-
 src/fc/qemu/ellipsis.py          |   2 -
 src/fc/qemu/hazmat/qemu.py       |  29 +++-
 src/fc/qemu/incoming.py          |   6 +
 src/fc/qemu/outgoing.py          |  22 ++-
 src/fc/qemu/tests/test_consul.py |  52 +++++--
 src/fc/qemu/tests/vm_test.py     | 242 ++++++++++++++++++++-----------
 10 files changed, 325 insertions(+), 123 deletions(-)

diff --git a/pytest.ini b/pytest.ini
index 7c0c157..fde24ec 100644
--- a/pytest.ini
+++ b/pytest.ini
@@ -1,5 +1,5 @@
 [pytest]
-addopts = --tb=native --timeout=3 --cov=src --cov-report=html --junitxml=/tmp/fc.qemu-report.xml
+addopts = --tb=native --timeout=20 --cov=src --cov-report=html --junitxml=/tmp/fc.qemu-report.xml
 
 markers =
     slow: mark test as slow.
diff --git a/setup.py b/setup.py
index 12bafef..fe212ce 100644
--- a/setup.py
+++ b/setup.py
@@ -35,6 +35,9 @@
         "setuptools",
         "structlog>=16.1.0",
     ],
+    tests_require=[
+        "pytest-patterns",
+    ],
     entry_points={
         "console_scripts": [
             "fc-qemu = fc.qemu.main:main",
diff --git a/src/fc/qemu/agent.py b/src/fc/qemu/agent.py
index fc021a0..028e1cf 100644
--- a/src/fc/qemu/agent.py
+++ b/src/fc/qemu/agent.py
@@ -203,22 +203,24 @@ def locked_func(self, *args, **kw):
                 mode="nonblocking",
             )
             return os.EX_TEMPFAIL
+        self._lock_count += 1
         self.log.debug(
-            "acquire-lock", target=self.lockfile, result="locked"
+            "acquire-lock",
+            target=self.lockfile,
+            result="locked",
+            count=self._lock_count,
         )
-
-        self._lock_count += 1
-        self.log.debug("lock-status", count=self._lock_count)
         try:
             return f(self, *args, **kw)
         finally:
             self._lock_count -= 1
-            self.log.debug("lock-status", count=self._lock_count)
-
+            self.log.debug(
+                "release-lock",
+                target=self.lockfile,
+                count=self._lock_count,
+            )
             if self._lock_count == 0:
-                # New try:
-                self.log.debug("release-lock", target=self.lockfile)
                 fcntl.flock(self._lockfile_fd, fcntl.LOCK_UN)
                 self.log.debug(
                     "release-lock",
@@ -232,7 +234,8 @@ def locked_func(self, *args, **kw):
                     target=self.lockfile,
                     result="error",
                 )
-            pass
+            os.close(self._lockfile_fd)
+            self._lockfile_fd = None
 
     return locked_func
 
@@ -1005,7 +1008,7 @@ def ensure_(self):
             self.log.error(
                 "inconsistent-state", action="destroy", exc_info=True
             )
-            self.qemu.destroy()
+            self.qemu.destroy(kill_supervisor=True)
 
     def ensure_offline(self):
         if self.qemu.is_running():
@@ -1335,7 +1338,7 @@ def restart(self):
     def kill(self):
         self.log.info("kill-vm")
         timeout = TimeOut(15, interval=1, raise_on_timeout=True)
-        self.qemu.destroy()
+        self.qemu.destroy(kill_supervisor=True)
         while timeout.tick():
             if not self.qemu.is_running():
                 self.log.info("killed-vm")
@@ -1396,6 +1399,8 @@ def outmigrate(self):
         client = Outgoing(self)
         exitcode = client()
         if not exitcode:
+            # XXX I think this can lead to inconsistent behaviour
+            # if the local VM is
destroyed during migration? self.consul_deregister() self.log.info("outmigrate-finished", exitcode=exitcode) return exitcode diff --git a/src/fc/qemu/conftest.py b/src/fc/qemu/conftest.py index d57299f..f24f523 100644 --- a/src/fc/qemu/conftest.py +++ b/src/fc/qemu/conftest.py @@ -2,6 +2,7 @@ import shutil import subprocess import sys +import time import traceback import pkg_resources @@ -38,7 +39,7 @@ def setup_structlog(): from . import util # set to True to temporarily get detailed tracebacks - log_exceptions = False + log_exceptions = True def test_logger(logger, method_name, event): stack = event.pop("stack", None) @@ -112,23 +113,77 @@ def reset_structlog(setup_structlog): def pytest_assertrepr_compare(op, left, right): if left.__class__.__name__ == "Ellipsis": return left.compare(right).diff - elif right.__class__.__name__ == "Ellipsis": + if right.__class__.__name__ == "Ellipsis": return right.compare(left).diff @pytest.fixture def clean_environment(): def clean(): - subprocess.call("pkill -f qemu", shell=True) + subprocess.call(["pkill", "-f", "supervised-qemu"]) + subprocess.call(["pkill", "-f", "qemu-system-x86_64"]) subprocess.call("rbd rm rbd.ssd/simplevm.swap", shell=True) subprocess.call("rbd snap purge rbd.ssd/simplevm.root", shell=True) subprocess.call("rbd rm rbd.ssd/simplevm.root", shell=True) subprocess.call("rbd rm rbd.ssd/simplevm.tmp", shell=True) + time.sleep(1) clean() + + print( + subprocess.check_output( + "rbd ls rbd.ssd", + shell=True, + ).decode("ascii") + ) + yield clean() + print( + subprocess.check_output( + "free && sync && echo 3 > /proc/sys/vm/drop_caches && free", + shell=True, + ).decode("ascii") + ) + print( + subprocess.check_output( + "ceph df", + shell=True, + ).decode("ascii") + ) + print( + subprocess.check_output( + "ps auxf", + shell=True, + ).decode("ascii") + ) + print( + subprocess.check_output( + "df -h", + shell=True, + ).decode("ascii") + ) + print( + subprocess.check_output( + "rbd showmapped", + shell=True, + ).decode("ascii") + ) + + +@pytest.fixture(autouse=True) +def clean_tmpdir_with_flakefinder(tmpdir, pytestconfig): + """The `tmpdir` is normally not cleaned out for debugging purposes. + + Running with flakefinder causes the tmpdir to quickly grow too big. + So, if flakefinder is active, we clean it out to avoid running out of + disk space. + """ + yield + if pytestconfig.getoption("flake_finder_enable") > 0: + shutil.rmtree(tmpdir) + @pytest.fixture def vm(clean_environment, monkeypatch, tmpdir): @@ -146,7 +201,7 @@ def vm(clean_environment, monkeypatch, tmpdir): vm.qemu.vm_expected_overhead = 128 for snapshot in vm.ceph.root.snapshots: snapshot.remove() - vm.qemu.destroy() + vm.qemu.destroy(kill_supervisor=True) vm.unlock() get_log() yield vm diff --git a/src/fc/qemu/ellipsis.py b/src/fc/qemu/ellipsis.py index a63865a..0a4ef47 100644 --- a/src/fc/qemu/ellipsis.py +++ b/src/fc/qemu/ellipsis.py @@ -3,7 +3,6 @@ class Report(object): - matches = None def __init__(self): @@ -49,7 +48,6 @@ def match(pattern, line): class Ellipsis(object): - # other = other.replace('\t', ' '*8) oder allgemein white-space unsensibel # multi-line support diff --git a/src/fc/qemu/hazmat/qemu.py b/src/fc/qemu/hazmat/qemu.py index 08de25a..d79a4e9 100644 --- a/src/fc/qemu/hazmat/qemu.py +++ b/src/fc/qemu/hazmat/qemu.py @@ -53,6 +53,7 @@ def detect_current_machine_type( def locked_global(f): LOCK = "/run/fc-qemu.lock" + # This is thread-safe *AS LONG* as every thread uses a separate instance # of the agent. 
Using multiple file descriptors will guarantee that the # lock can only be held once even within a single process. @@ -85,7 +86,6 @@ def locked(self, *args, **kw): class Qemu(object): - prefix = "" executable = "qemu-system-x86_64" @@ -576,20 +576,41 @@ def graceful_shutdown(self): ], ) - def destroy(self): + def destroy(self, kill_supervisor=False): # We use this destroy command in "fire-and-forget"-style because # sometimes the init script will complain even if we achieve what # we want: that the VM isn't running any longer. We check this # by contacting the monitor instead. - timeout = TimeOut(100, interval=1, raise_on_timeout=True) p = self.proc() if not p: return + + # Check whether the parent is the supervising process. + # Kill that one first so we avoid immediate restarts. + if kill_supervisor: + parent = p.parent() + if "supervised-qemu-wrapped" in parent.cmdline()[1]: + timeout = TimeOut(100, interval=2, raise_on_timeout=True) + attempt = 0 + while parent.is_running() and timeout.tick(): + attempt += 1 + self.log.debug( + "vm-destroy-kill-supervisor", attempt=attempt + ) + try: + parent.terminate() + except psutil.NoSuchProcess: + break + + timeout = TimeOut(100, interval=2, raise_on_timeout=True) + attempt = 0 while p.is_running() and timeout.tick(): + attempt += 1 + self.log.debug("vm-destroy-kill-vm", attempt=attempt) try: p.terminate() except psutil.NoSuchProcess: - pass + break def resize_root(self, size): self.qmp.command("block_resize", device="virtio0", size=size) diff --git a/src/fc/qemu/incoming.py b/src/fc/qemu/incoming.py index 32630b8..771c090 100644 --- a/src/fc/qemu/incoming.py +++ b/src/fc/qemu/incoming.py @@ -139,6 +139,9 @@ def run(self): if self.finished == "success": return 0 else: + # Do not kill the supervisor. We will be giving up here, but the + # supervisor will restart the process, potentially with updated + # config data so we get a "free" retry. self.qemu.destroy() return 1 @@ -229,6 +232,9 @@ def destroy(self): log.info("destroying", machine=self.name) self.finished = "destroyed" try: + # Do not kill the supervisor. We will be giving up here, but the + # supervisor will restart the process, potentially with updated + # config data so we get a "free" retry. 
self.qemu.destroy() except QemuNotRunning: pass diff --git a/src/fc/qemu/outgoing.py b/src/fc/qemu/outgoing.py index 10e3528..3963dce 100644 --- a/src/fc/qemu/outgoing.py +++ b/src/fc/qemu/outgoing.py @@ -19,6 +19,7 @@ class Heartbeat(object): """ cookie = None + failed = False def __init__(self, log): self.thread = threading.Thread(target=self.run) @@ -48,13 +49,18 @@ def run(self): self.connection.ping(self.cookie) except Exception: self.log.exception("ping-failed", exc_info=True) + self.failed = True time.sleep(10) finally: self.log.debug("stopped-heartbeat-ping") + def propagate(self): + if self.failed: + self.log.warning("heartbeat-propagate") + raise RuntimeError("Heartbeat failed.") -class Outgoing(object): +class Outgoing(object): migration_exitcode = None target = None cookie = None @@ -209,6 +215,7 @@ def acquire_migration_locks(self): if self.agent.has_new_config(): self.target.cancel(self.cookie) raise ConfigChanged() + self.heartbeat.propagate() # Try to acquire local lock if self.agent.qemu.acquire_migration_lock(): @@ -230,6 +237,7 @@ def acquire_migration_locks(self): self.log.debug( "acquire-remote-migration-lock", result="success" ) + self.heartbeat.propagate() except Exception: self.log.exception( "acquire-remote-migration-lock", @@ -264,6 +272,7 @@ def migrate(self): mbps=mbps, output=pprint.pformat(stat), ) + self.heartbeat.propagate() except Exception: self.log.exception("error-waiting-for-migration", exc_info=True) raise @@ -272,12 +281,13 @@ def migrate(self): assert not status["running"], status assert status["status"] == "postmigrate", status self.log.info("finish-migration") + self.destroy(kill_supervisor=True) try: - self.heartbeat.stop() + self.log.info("finish-remote") self.target.finish_incoming(self.cookie) except Exception: self.log.exception("error-finish-remote", exc_info=True) - self.agent.qemu.destroy() + self.heartbeat.stop() def rescue(self): """Outgoing rescue: try to rescue the remote side first.""" @@ -287,7 +297,7 @@ def rescue(self): self.target.rescue(self.cookie) self.target.finish_incoming(self.cookie) self.log.info("rescue-remote-success", action="destroy local") - self.destroy() + self.destroy(kill_supervisor=True) # We managed to rescue on the remote side - hooray! 
self.migration_exitcode = 0 return @@ -317,6 +327,6 @@ def rescue(self): else: self.log.info("continue-locally", result="success") - def destroy(self): - self.agent.qemu.destroy() + def destroy(self, kill_supervisor=False): + self.agent.qemu.destroy(kill_supervisor) self.agent.qemu.clean_run_files() diff --git a/src/fc/qemu/tests/test_consul.py b/src/fc/qemu/tests/test_consul.py index 182f313..67270c4 100644 --- a/src/fc/qemu/tests/test_consul.py +++ b/src/fc/qemu/tests/test_consul.py @@ -43,12 +43,21 @@ def test_empty_event(): def test_no_key_event(): stdin = StringIO('[{"ModifyIndex": 123, "Value": ""}]') Agent.handle_consul_event(stdin) - assert util.log_data == [ - "start-consul-events count=1", - "handle-key-failed exc_info=True key=None", - "finish-handle-key key=None", - "finish-consul-events", - ] + + assert ( + Ellipsis( + """\ +start-consul-events count=1 +handle-key-failed key=None +Traceback (most recent call last): + File "/nix/store/...-python3-...-env/lib/python3.8/site-packages/fc/qemu/agent.py", line ..., in handle + log.debug("handle-key", key=event["Key"]) +KeyError: 'Key' +finish-handle-key key=None +finish-consul-events""" + ) + == get_log() + ) @pytest.fixture @@ -260,9 +269,24 @@ def test_snapshot_online_vm(vm): freeze-failed action=continue machine=simplevm reason=Unable to sync with guest agent after 10 tries. snapshot-ignore machine=simplevm reason=not frozen ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed exc_info=True machine=simplevm subsystem=qemu +guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu +Traceback (most recent call last): +... + raise RuntimeError("VM not frozen, not making snapshot.") +RuntimeError: VM not frozen, not making snapshot. + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): +... + raise ClientError( +fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries. -handle-key-failed exc_info=True key=snapshot/7468743 +handle-key-failed key=snapshot/7468743 +Traceback (most recent call last): +... + raise RuntimeError("VM not frozen, not making snapshot.") +RuntimeError: VM not frozen, not making snapshot. 
finish-consul-events""" ) == get_log() @@ -290,7 +314,7 @@ def test_snapshot_nonexisting_vm(): @pytest.mark.live @pytest.mark.timeout(60) def test_snapshot_offline_vm(vm): - util.test_log_options["show_events"] = ["consul", "snapshot"] + # util.test_log_options["show_events"] = ["consul", "snapshot"] vm.enc["parameters"]["kvm_host"] = "foobar" vm.stage_new_config() @@ -298,7 +322,8 @@ def test_snapshot_offline_vm(vm): vm.ceph.ensure_root_volume() vm.ensure_offline() - get_log() + + print(get_log()) snapshot = {"vm": "simplevm", "snapshot": "backy-1234"} event = prepare_consul_event("snapshot/7468743", snapshot, 123) @@ -307,8 +332,15 @@ def test_snapshot_offline_vm(vm): get_log() == """\ start-consul-events count=1 +handle-key key=snapshot/7468743 +connect-rados machine=simplevm subsystem=ceph snapshot machine=simplevm snapshot=backy-1234 +acquire-lock machine=simplevm target=/run/qemu.simplevm.lock +acquire-lock count=1 machine=simplevm result=locked target=/run/qemu.simplevm.lock snapshot expected=VM running machine=simplevm +release-lock count=0 machine=simplevm target=/run/qemu.simplevm.lock +release-lock machine=simplevm result=unlocked target=/run/qemu.simplevm.lock +finish-handle-key key=snapshot/7468743 finish-consul-events""" ) diff --git a/src/fc/qemu/tests/vm_test.py b/src/fc/qemu/tests/vm_test.py index d8727be..4d5daef 100644 --- a/src/fc/qemu/tests/vm_test.py +++ b/src/fc/qemu/tests/vm_test.py @@ -64,8 +64,7 @@ def test_simple_vm_lifecycle_start_stop(vm): assert out == Ellipsis( """\ acquire-lock machine=simplevm target=/run/qemu.simplevm.lock -acquire-lock machine=simplevm result=locked target=/run/qemu.simplevm.lock -lock-status count=1 machine=simplevm +acquire-lock count=1 machine=simplevm result=locked target=/run/qemu.simplevm.lock generate-config machine=simplevm ensure-root machine=simplevm subsystem=ceph create-vm machine=simplevm subsystem=ceph @@ -148,8 +147,7 @@ def test_simple_vm_lifecycle_start_stop(vm): block_set_io_throttle arguments={'device': 'virtio2', 'iops': 10000, 'iops_rd': 0, 'iops_wr': 0, 'bps': 0, 'bps_wr': 0, 'bps_rd': 0} id=None machine=simplevm subsystem=qemu/qmp ensure-watchdog action=none machine=simplevm human-monitor-command arguments={'command-line': 'watchdog_action action=none'} id=None machine=simplevm subsystem=qemu/qmp -lock-status count=0 machine=simplevm -release-lock machine=simplevm target=/run/qemu.simplevm.lock +release-lock count=0 machine=simplevm target=/run/qemu.simplevm.lock release-lock machine=simplevm result=unlocked target=/run/qemu.simplevm.lock""" ) @@ -188,6 +186,11 @@ def test_simple_vm_lifecycle_start_stop(vm): @pytest.mark.timeout(60) @pytest.mark.live def test_simple_vm_lifecycle_ensure_going_offline(vm, capsys, caplog): + print( + subprocess.check_output( + "top -b -n 1 -o %MEM | head -n 20", shell=True + ).decode("ascii") + ) util.test_log_options["show_events"] = [ "vm-status", "rbd-status", @@ -360,6 +363,10 @@ def test_crashed_vm_clean_restart(vm): graceful-shutdown machine=simplevm graceful-shutdown-failed machine=simplevm reason=timeout kill-vm machine=simplevm +vm-destroy-kill-supervisor attempt=1 machine=simplevm subsystem=qemu +vm-destroy-kill-supervisor attempt=2 machine=simplevm subsystem=qemu +vm-destroy-kill-vm attempt=1 machine=simplevm subsystem=qemu +vm-destroy-kill-vm attempt=2 machine=simplevm subsystem=qemu killed-vm machine=simplevm unlock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.root unlock machine=simplevm subsystem=ceph volume=rbd.ssd/simplevm.swap @@ -379,8 +386,9 @@ def 
test_crashed_vm_clean_restart(vm): def test_do_not_clean_up_crashed_vm_that_doesnt_get_restarted(vm): vm.ensure() assert vm.qemu.is_running() is True - vm.qemu.proc().kill() - vm.qemu.proc().wait(2) + proc = vm.qemu.proc() + proc.kill() + proc.wait(2) assert vm.ceph.locked_by_me() is True vm.enc["parameters"]["online"] = False vm.enc["consul-generation"] += 1 @@ -428,7 +436,26 @@ def test_vm_snapshot_with_missing_guest_agent(vm, monkeypatch): freeze-failed action=continue machine=simplevm reason=Unable to sync with guest agent after 10 tries. snapshot-ignore machine=simplevm reason=not frozen ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed exc_info=True machine=simplevm subsystem=qemu +guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu +Traceback (most recent call last): + File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in frozen_vm + yield frozen + File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in snapshot + raise RuntimeError("VM not frozen, not making snapshot.") +RuntimeError: VM not frozen, not making snapshot. + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in thaw + self._thaw_via_guest_agent() + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in _thaw_via_guest_agent + with self.guestagent as guest: + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in __enter__ + self.sync() + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in sync + raise ClientError( +fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries.""" ) == get_log() @@ -444,7 +471,26 @@ def test_vm_snapshot_with_missing_guest_agent(vm, monkeypatch): freeze-failed action=continue machine=simplevm reason=... snapshot-ignore machine=simplevm reason=not frozen ensure-thawed machine=simplevm volume=root -guest-fsfreeze-thaw-failed exc_info=True machine=simplevm subsystem=qemu +guest-fsfreeze-thaw-failed machine=simplevm subsystem=qemu +Traceback (most recent call last): + File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in frozen_vm + yield frozen + File "/nix/store/...-python3-.../site-packages/fc/qemu/agent.py", line ..., in snapshot + raise RuntimeError("VM not frozen, not making snapshot.") +RuntimeError: VM not frozen, not making snapshot. + +During handling of the above exception, another exception occurred: + +Traceback (most recent call last): + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in thaw + self._thaw_via_guest_agent() + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/qemu.py", line ..., in _thaw_via_guest_agent + with self.guestagent as guest: + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in __enter__ + self.sync() + File "/nix/store/...-python3-.../site-packages/fc/qemu/hazmat/guestagent.py", line ..., in sync + raise ClientError( +fc.qemu.hazmat.guestagent.ClientError: Unable to sync with guest agent after 10 tries. 
ensure-thawed-failed machine=simplevm reason=Unable to sync with guest agent after 10 tries.""" ) == get_log() @@ -560,95 +606,116 @@ def _kill_vms(): _kill_vms() -def assert_outmigrate_log(log): - filtered_log = [] - for line in log.split("\n"): - # The heartbeat pings show up in varying places. We need to filter - # those out. Note the leading space to avoid eliminating the - # heartbeat-initialized message. - if " heartbeat-ping" in line: - continue - if "stopped-heartbeat-ping" in line: - continue - filtered_log.append(line) - filtered_log = "\n".join(filtered_log) - - assert filtered_log == Ellipsis( - """\ +@pytest.fixture +def outmigrate_pattern(patterns): + outmigrate = patterns.outmigrate + outmigrate.in_order( + """ /nix/store/.../bin/fc-qemu -v outmigrate simplevm load-system-config simplevm connect-rados subsystem='ceph' simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock result='locked' target='/run/qemu.simplevm.lock' -simplevm lock-status count=1 +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' simplevm query-status arguments={} id=None subsystem='qemu/qmp' + simplevm outmigrate simplevm consul-register -simplevm heartbeat-initialized simplevm locate-inmigration-service simplevm check-staging-config result='none' simplevm located-inmigration-service url='http://host2.mgm.test.gocept.net:...' -simplevm started-heartbeat-ping + simplevm acquire-migration-locks simplevm check-staging-config result='none' simplevm acquire-migration-lock result='success' subsystem='qemu' simplevm acquire-local-migration-lock result='success' simplevm acquire-remote-migration-lock simplevm acquire-remote-migration-lock result='success' + simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.root' simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.swap' simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' + simplevm prepare-remote-environment simplevm start-migration target='tcp:192.168.4.7:...' simplevm migrate subsystem='qemu' simplevm migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None subsystem='qemu/qmp' simplevm migrate-set-parameters arguments={'compress-level': 0, 'downtime-limit': 4000, 'max-bandwidth': 22500} id=None subsystem='qemu/qmp' simplevm migrate arguments={'uri': 'tcp:192.168.4.7:...'} id=None subsystem='qemu/qmp' + simplevm query-migrate-parameters arguments={} id=None subsystem='qemu/qmp' simplevm migrate-parameters announce-initial=50 announce-max=550 announce-rounds=5 announce-step=100 block-incremental=False compress-level=0 compress-threads=8 compress-wait-thread=True cpu-throttle-increment=10 cpu-throttle-initial=20 cpu-throttle-tailslow=False decompress-threads=2 downtime-limit=4000 max-bandwidth=22500 max-cpu-throttle=99 max-postcopy-bandwidth=0 multifd-channels=2 multifd-compression='none' multifd-zlib-level=1 multifd-zstd-level=1 subsystem='qemu' throttle-trigger-threshold=50 tls-authz='' tls-creds='' tls-hostname='' x-checkpoint-delay=20000 xbzrle-cache-size=67108864 + simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' simplevm migration-status mbps='-' remaining='0' status='setup' -simplevm> {'blocked': False, 'status': 'setup'} -simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' -simplevm migration-status mbps=... remaining='...' 
status='active' -simplevm> {'blocked': False, -simplevm> 'expected-downtime': 4000, -... -simplevm> 'status': 'active', -simplevm> 'total-time': ...} -... + simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' simplevm migration-status mbps=... remaining='...' status='active' -... + simplevm migration-status mbps=... remaining='...' status='completed' -simplevm> {'blocked': False, -simplevm> 'downtime': ..., -... -simplevm> 'status': 'completed', -simplevm> 'total-time': ...} + simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm finish-migration + +simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' +simplevm clean-run-files subsystem='qemu' +simplevm finish-remote simplevm consul-deregister simplevm outmigrate-finished exitcode=0 -simplevm lock-status count=0 -simplevm release-lock target='/run/qemu.simplevm.lock' +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) + # There are a couple of steps in the migration process that may repeat due to + # timings,so this may or may not appear more often: + outmigrate.optional( + """ +simplevm waiting interval=3 remaining=... +simplevm check-staging-config result='none' +simplevm query-migrate arguments={} id=None subsystem='qemu/qmp' +simplevm migration-status mbps=... remaining='...' status='active' +simplevm vm-destroy-kill-vm attempt=... subsystem='qemu' + """ + ) + # Expect debug output that doesn't matter as much + patterns.debug.optional("simplevm> ...") + # This part of the heartbeats must show up + patterns.heartbeat.in_order( + """\ +simplevm heartbeat-initialized +simplevm started-heartbeat-ping +simplevm heartbeat-ping +""" + ) + # The pings may happen more times and sometimes the stopping part + # isn't visible because we terminate too fast. + patterns.heartbeat.optional( + """ +simplevm heartbeat-ping +simplevm stopped-heartbeat-ping +""" + ) + + outmigrate.merge("heartbeat", "debug") -def test_vm_migration_pattern(): + return outmigrate + + +def test_vm_migration_pattern(outmigrate_pattern): # This is a variety of outputs we have seen that are valid and where we want to be # sure that the Ellipsis matches them properly. 
- assert_outmigrate_log( - """\ + assert ( + outmigrate_pattern + == """\ /nix/store/c5kfr1yx6920s7lcswsv9dn61g8dc5xk-python3.8-fc.qemu-dev/bin/fc-qemu -v outmigrate simplevm load-system-config simplevm connect-rados subsystem='ceph' simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock result='locked' target='/run/qemu.simplevm.lock' -simplevm lock-status count=1 +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm outmigrate @@ -669,7 +736,6 @@ def test_vm_migration_pattern(): simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.swap' simplevm unlock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' simplevm prepare-remote-environment -simplevm heartbeat-ping simplevm start-migration target='tcp:192.168.4.7:2345' simplevm migrate subsystem='qemu' simplevm migrate-set-capabilities arguments={'capabilities': [{'capability': 'xbzrle', 'state': False}, {'capability': 'auto-converge', 'state': True}]} id=None subsystem='qemu/qmp' @@ -898,11 +964,16 @@ def test_vm_migration_pattern(): simplevm> 'total-time': 68420} simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm finish-migration +simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' +simplevm clean-run-files subsystem='qemu' +simplevm finish-remote simplevm stopped-heartbeat-ping simplevm consul-deregister simplevm outmigrate-finished exitcode=0 -simplevm lock-status count=0 -simplevm release-lock target='/run/qemu.simplevm.lock' +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) @@ -910,14 +981,14 @@ def test_vm_migration_pattern(): # This one is missing the "stopped-heartbeat-ping". This can happen # because the heartbeat has a sleep cycle of 10s and only finishes with # this log output when it actually triggers at the right moment. 
- assert_outmigrate_log( - """\ + assert ( + outmigrate_pattern + == """\ /nix/store/kj63j38ji0c8yk037yvzj9c5f27mzh58-python3.8-fc.qemu-d26a0eae29efd95fe5c328d8a9978eb5a6a4529e/bin/fc-qemu -v outmigrate simplevm load-system-config simplevm connect-rados subsystem='ceph' simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock result='locked' target='/run/qemu.simplevm.lock' -simplevm lock-status count=1 +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm outmigrate @@ -1167,18 +1238,23 @@ def test_vm_migration_pattern(): simplevm> 'total-time': 68526} simplevm query-status arguments={} id=None subsystem='qemu/qmp' simplevm finish-migration +simplevm vm-destroy-kill-supervisor attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-supervisor attempt=2 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=1 subsystem='qemu' +simplevm vm-destroy-kill-vm attempt=2 subsystem='qemu' +simplevm clean-run-files subsystem='qemu' +simplevm finish-remote simplevm consul-deregister simplevm outmigrate-finished exitcode=0 -simplevm lock-status count=0 -simplevm release-lock target='/run/qemu.simplevm.lock' +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) @pytest.mark.live -@pytest.mark.timeout(240) -def test_vm_migration(vm, kill_vms): +@pytest.mark.timeout(300) +def test_vm_migration(vm, kill_vms, outmigrate_pattern, patterns): def call(cmd, wait=True, host=None): for ssh_cmd in ["scp", "ssh"]: if not cmd.startswith(ssh_cmd): @@ -1226,37 +1302,31 @@ def communicate_progress(p): inmigrate = call("ssh host2 'fc-qemu -v inmigrate simplevm'", wait=False) outmigrate = call("fc-qemu -v outmigrate simplevm", wait=False) - outmigrate_result = communicate_progress(outmigrate) - assert_outmigrate_log(outmigrate_result) + assert outmigrate_pattern == communicate_progress(outmigrate) assert outmigrate.returncode == 0 inmigrate_result = communicate_progress(inmigrate) - assert inmigrate_result == Ellipsis( - """\ + inmigrate_pattern = patterns.inmigrate + inmigrate_pattern.in_order( + """ /nix/store/.../bin/fc-qemu -v inmigrate simplevm load-system-config simplevm connect-rados subsystem='ceph' simplevm acquire-lock target='/run/qemu.simplevm.lock' -simplevm acquire-lock result='locked' target='/run/qemu.simplevm.lock' -simplevm lock-status count=1 +simplevm acquire-lock count=1 result='locked' target='/run/qemu.simplevm.lock' + simplevm inmigrate simplevm start-server type='incoming' url='http://host2.mgm.test.gocept.net:.../' simplevm setup-incoming-api cookie='...' 
simplevm consul-register-inmigrate -simplevm received-ping timeout=60 -simplevm waiting interval=0 remaining=59 + simplevm received-acquire-migration-lock simplevm acquire-migration-lock result='success' subsystem='qemu' -simplevm reset-timeout -simplevm waiting interval=0 remaining=59 -simplevm received-ping timeout=60 -simplevm waiting interval=0 remaining=59 simplevm received-acquire-ceph-locks simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.root' simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.swap' simplevm lock subsystem='ceph' volume='rbd.ssd/simplevm.tmp' -simplevm reset-timeout -simplevm waiting interval=0 remaining=59 + simplevm received-prepare-incoming simplevm acquire-global-lock subsystem='qemu' target='/run/fc-qemu.lock' simplevm global-lock-acquire result='locked' subsystem='qemu' target='/run/fc-qemu.lock' @@ -1266,32 +1336,34 @@ def communicate_progress(p): simplevm qemu-system-x86_64 additional_args=['-incoming tcp:192.168.4.7:...'] local_args=['-nodefaults', '-only-migratable', '-cpu qemu64,enforce', '-name simplevm,process=kvm.simplevm', '-chroot /srv/vm/simplevm', '-runas nobody', '-serial file:/var/log/vm/simplevm.log', '-display vnc=127.0.0.1:2345', '-pidfile /run/qemu.simplevm.pid', '-vga std', '-m 256', '-readconfig /run/qemu.simplevm.cfg'] subsystem='qemu' simplevm exec cmd='supervised-qemu qemu-system-x86_64 -nodefaults -only-migratable -cpu qemu64,enforce -name simplevm,process=kvm.simplevm -chroot /srv/vm/simplevm -runas nobody -serial file:/var/log/vm/simplevm.log -display vnc=127.0.0.1:2345 -pidfile /run/qemu.simplevm.pid -vga std -m 256 -readconfig /run/qemu.simplevm.cfg -incoming tcp:192.168.4.7:2345 -D /var/log/vm/simplevm.qemu.internal.log simplevm /var/log/vm/simplevm.supervisor.log' subsystem='qemu' simplevm supervised-qemu-stdout subsystem='qemu' -simplevm> simplevm supervised-qemu-stderr subsystem='qemu' -simplevm> + simplevm global-lock-status count=0 subsystem='qemu' target='/run/fc-qemu.lock' simplevm global-lock-release subsystem='qemu' target='/run/fc-qemu.lock' simplevm global-lock-release result='unlocked' subsystem='qemu' simplevm qmp_capabilities arguments={} id=None subsystem='qemu/qmp' simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm reset-timeout -simplevm waiting interval=0 remaining=59 -simplevm received-ping timeout=60 -simplevm waiting interval=0 remaining=59 -simplevm received-ping timeout=60 -... + simplevm received-finish-incoming simplevm query-status arguments={} id=None subsystem='qemu/qmp' -simplevm reset-timeout simplevm consul-deregister-inmigrate simplevm stop-server result='success' type='incoming' simplevm consul-register simplevm inmigrate-finished exitcode=0 -simplevm lock-status count=0 -simplevm release-lock target='/run/qemu.simplevm.lock' +simplevm release-lock count=0 target='/run/qemu.simplevm.lock' simplevm release-lock result='unlocked' target='/run/qemu.simplevm.lock' """ ) + inmigrate_pattern.optional( + """ +simplevm> +simplevm received-ping timeout=60 +simplevm reset-timeout +simplevm waiting interval=0 remaining=... +""" + ) + + assert inmigrate_pattern == inmigrate_result assert inmigrate.returncode == 0 # The consul check is a bit flaky as it only checks every 5 seconds
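
The three sketches below spell out the invariants behind the main changes;
they are illustrative toy versions written for review, not the patched code
itself.

First, the agent.py locking change: `_lock_count` counts reentrant uses of
the lock within one process, and the flock is only dropped (and, with this
patch, the file descriptor closed) once the count returns to zero. Every
acquire and release now logs the count, which is what makes interleaved lock
activity readable in the test logs. A blocking toy version of the counting
scheme (the real decorator acquires non-blocking, returns os.EX_TEMPFAIL
when the lock is busy, and logs every transition):

    import fcntl
    import functools
    import os

    def locked(f):
        @functools.wraps(f)
        def locked_func(self, *args, **kw):
            if self._lockfile_fd is None:
                self._lockfile_fd = os.open(self.lockfile, os.O_RDWR | os.O_CREAT)
                fcntl.flock(self._lockfile_fd, fcntl.LOCK_EX)
            self._lock_count += 1
            try:
                return f(self, *args, **kw)
            finally:
                self._lock_count -= 1
                if self._lock_count == 0:
                    # Last one out: drop the flock and close the fd so the
                    # next acquire starts from a clean state.
                    fcntl.flock(self._lockfile_fd, fcntl.LOCK_UN)
                    os.close(self._lockfile_fd)
                    self._lockfile_fd = None

        return locked_func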
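
Second, the destroy(kill_supervisor=...) change in hazmat/qemu.py is an
ordering rule: when the VM really has to go away, the supervising parent
process must be terminated first, because otherwise it restarts QEMU
immediately. A compressed sketch using psutil; the "supervised-qemu-wrapped"
name check mirrors the patch, while the terminate() helper and its timeout
handling are simplified stand-ins for the TimeOut loops in the real code:

    import time

    import psutil

    def destroy(qemu_pid, kill_supervisor=False):
        try:
            qemu = psutil.Process(qemu_pid)
        except psutil.NoSuchProcess:
            return  # already gone

        def terminate(proc, attempts=50, interval=2):
            # Keep sending TERM until the process disappears.
            for _ in range(attempts):
                if not proc.is_running():
                    return
                try:
                    proc.terminate()
                except psutil.NoSuchProcess:
                    return
                time.sleep(interval)
            raise TimeoutError(f"pid {proc.pid} did not terminate")

        if kill_supervisor:
            parent = qemu.parent()
            # Only touch the parent when it really is our supervisor, so
            # we never terminate an unrelated process such as a shell.
            if parent and "supervised-qemu-wrapped" in " ".join(parent.cmdline()):
                terminate(parent)  # first, to prevent an instant restart
        terminate(qemu)

This is also why incoming.py keeps calling destroy() without kill_supervisor:
when the incoming side gives up, the surviving supervisor restarts the
process, potentially with updated config data, which gives us a "free" retry.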
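
Finally, the heartbeat fix in outgoing.py, which is the crash the subject
line refers to: an exception raised in the ping thread dies with that thread,
so the thread only records the failure, and the migration control flow
re-raises it at well-defined checkpoints via propagate(). In the patch those
checkpoints sit in acquire_migration_locks() and in the migrate()
status-polling loop. A minimal self-contained sketch of the pattern; only
the Heartbeat name, the failed flag, and propagate() are taken from the
patch, the ping callable and the threading scaffolding are illustrative:

    import threading

    class Heartbeat:
        """Ping the peer in the background; record failures, never raise."""

        def __init__(self, ping, interval=10):
            self.ping = ping  # callable that raises when the peer is gone
            self.interval = interval
            self.failed = False
            self.stopped = threading.Event()
            self.thread = threading.Thread(target=self.run, daemon=True)

        def start(self):
            self.thread.start()

        def stop(self):
            self.stopped.set()
            self.thread.join()

        def run(self):
            while not self.stopped.is_set():
                try:
                    self.ping()
                except Exception:
                    # Raising here would only kill this thread, invisibly.
                    # Record the failure for the main thread instead.
                    self.failed = True
                self.stopped.wait(self.interval)

        def propagate(self):
            # Called between long-running migration steps, so a dead peer
            # aborts the migration early, at a clean checkpoint.
            if self.failed:
                raise RuntimeError("Heartbeat failed.")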