Skip to content

Commit

Permalink
migration: fix crash due to failed heartbeats not propagating
Browse files Browse the repository at this point in the history
Fixes PL-131744
  • Loading branch information
ctheune committed Nov 18, 2023
1 parent 6c56e40 commit 774533d
Show file tree
Hide file tree
Showing 10 changed files with 427 additions and 114 deletions.
2 changes: 1 addition & 1 deletion pytest.ini
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[pytest]
addopts = --tb=native --timeout=3 --cov=src --cov-report=html --junitxml=/tmp/fc.qemu-report.xml
addopts = --tb=native --timeout=20 --cov=src --cov-report=html --junitxml=/tmp/fc.qemu-report.xml

markers =
slow: mark test as slow.
Expand Down
27 changes: 16 additions & 11 deletions src/fc/qemu/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,22 +203,24 @@ def locked_func(self, *args, **kw):
mode="nonblocking",
)
return os.EX_TEMPFAIL
self._lock_count += 1
self.log.debug(
"acquire-lock", target=self.lockfile, result="locked"
"acquire-lock",
target=self.lockfile,
result="locked",
count=self._lock_count,
)

self._lock_count += 1
self.log.debug("lock-status", count=self._lock_count)
try:
return f(self, *args, **kw)
finally:
self._lock_count -= 1
self.log.debug("lock-status", count=self._lock_count)

self.log.debug(
"release-lock",
target=self.lockfile,
count=self._lock_count,
)
if self._lock_count == 0:
# New
try:
self.log.debug("release-lock", target=self.lockfile)
fcntl.flock(self._lockfile_fd, fcntl.LOCK_UN)
self.log.debug(
"release-lock",
Expand All @@ -232,7 +234,8 @@ def locked_func(self, *args, **kw):
target=self.lockfile,
result="error",
)
pass
os.close(self._lockfile_fd)
self._lockfile_fd = None

return locked_func

Expand Down Expand Up @@ -1005,7 +1008,7 @@ def ensure_(self):
self.log.error(
"inconsistent-state", action="destroy", exc_info=True
)
self.qemu.destroy()
self.qemu.destroy(kill_supervisor=True)

def ensure_offline(self):
if self.qemu.is_running():
Expand Down Expand Up @@ -1335,7 +1338,7 @@ def restart(self):
def kill(self):
self.log.info("kill-vm")
timeout = TimeOut(15, interval=1, raise_on_timeout=True)
self.qemu.destroy()
self.qemu.destroy(kill_supervisor=True)
while timeout.tick():
if not self.qemu.is_running():
self.log.info("killed-vm")
Expand Down Expand Up @@ -1396,6 +1399,8 @@ def outmigrate(self):
client = Outgoing(self)
exitcode = client()
if not exitcode:
# XXX I think this can lead to inconsistent behaviour
# if the local VM is destroyed during migration?
self.consul_deregister()
self.log.info("outmigrate-finished", exitcode=exitcode)
return exitcode
Expand Down
119 changes: 119 additions & 0 deletions src/fc/qemu/audit.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
#
# Show reporting of all expectations and which of those lines were found
#
#
import re
import typing

WARN_SYMBOL = "🟡"
OK_SYMBOL = "⚪️"


def match(pattern, line):
pattern = pattern.replace("\t", " " * 8)
line = line.replace("\t", " " * 8)
pattern = re.escape(pattern)
pattern = pattern.replace(r"\.\.\.", ".+?")
pattern = re.compile("^" + pattern + "$")
return pattern.match(line)


class Line:
matched = False

def __init__(self, data: str):
self.data = data

def matches(self, expectation: str):
matched = bool(match(expectation, self.data))
self.matched |= matched
return matched


class Audit:
lines: typing.List[Line]
unmatched_expectations: typing.List[str]

def __init__(self, content):
self.lines = []
self.unmatched_expectations = []

for line in content.splitlines():
self.lines.append(Line(line))

def expect(self, lines: str):
"""Expect all lines exist and come in order, but they
may be interleaved with other lines."""
lines = lines.strip()
scan = iter(self.lines)
for expected in lines.splitlines():
if not expected:
continue
for line in scan:
if line.matches(expected):
break
else:
self.unmatched_expectations.append(expected)
# Reset the scan, maybe the other lines will match
scan = iter(self.lines)

def maybe(self, lines: str):
"""Those lines may exist and then they may appear anywhere
a number of times, or they may not exist.
"""
lines = lines.strip()
for expected in lines.splitlines():
scan = iter(self.lines)
for line in scan:
line.matches(expected)

def report(self):
yield "String did not meet the expectations."
yield ""
yield OK_SYMBOL + "=OK "
yield WARN_SYMBOL + "=UNEXPECTED"
yield ""
yield "Here is string that was tested: "
yield ""
for line in self.lines:
if line.matched:
prefix = OK_SYMBOL
else:
prefix = WARN_SYMBOL
yield prefix + " " + line.data
if self.unmatched_expectations:
yield ""
yield "Unmatched expectations: "
yield ""
for line in self.unmatched_expectations:
yield WARN_SYMBOL + " " + line

def is_ok(self):
if self.unmatched_expectations:
return False
for line in self.lines:
if not line.matched:
return False
return True


class Pattern:
def __init__(self):
self.ops = []

def expect(self, lines: str):
self.ops.append(("expect", lines))

def maybe(self, lines: str):
self.ops.append(("maybe", lines))

def _audit(self, other):
audit = Audit(other)
for op, *args in self.ops:
getattr(audit, op)(*args)
return audit

def __eq__(self, other):
assert isinstance(other, str)
report = self._audit(other)
return report.is_ok()
62 changes: 58 additions & 4 deletions src/fc/qemu/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import shutil
import subprocess
import sys
import time
import traceback

import pkg_resources
Expand Down Expand Up @@ -38,7 +39,7 @@ def setup_structlog():
from . import util

# set to True to temporarily get detailed tracebacks
log_exceptions = False
log_exceptions = True

def test_logger(logger, method_name, event):
stack = event.pop("stack", None)
Expand Down Expand Up @@ -112,23 +113,76 @@ def reset_structlog(setup_structlog):
def pytest_assertrepr_compare(op, left, right):
if left.__class__.__name__ == "Ellipsis":
return left.compare(right).diff
elif right.__class__.__name__ == "Ellipsis":
if right.__class__.__name__ == "Ellipsis":
return right.compare(left).diff

if left.__class__.__name__ == "Pattern":
return left._audit(right).report()
if right.__class__.__name__ == "Pattern":
return right._audit(left).report()


@pytest.fixture
def clean_environment():
def clean():
subprocess.call("pkill -f qemu", shell=True)
subprocess.call(["pkill", "-f", "supervised-qemu"])
subprocess.call(["pkill", "-f", "qemu-system-x86_64"])
subprocess.call("rbd rm rbd.ssd/simplevm.swap", shell=True)
subprocess.call("rbd snap purge rbd.ssd/simplevm.root", shell=True)
subprocess.call("rbd rm rbd.ssd/simplevm.root", shell=True)
subprocess.call("rbd rm rbd.ssd/simplevm.tmp", shell=True)
time.sleep(1)

clean()

print(
subprocess.check_output(
"rbd ls rbd.ssd",
shell=True,
).decode("ascii")
)

yield
clean()

print(
subprocess.check_output(
"free && sync && echo 3 > /proc/sys/vm/drop_caches && free",
shell=True,
).decode("ascii")
)
print(
subprocess.check_output(
"ceph df",
shell=True,
).decode("ascii")
)
print(
subprocess.check_output(
"df -h",
shell=True,
).decode("ascii")
)
print(
subprocess.check_output(
"rbd showmapped",
shell=True,
).decode("ascii")
)


@pytest.fixture(autouse=True)
def clean_tmpdir_with_flakefinder(tmpdir, pytestconfig):
"""The `tmpdir` is normally not cleaned out for debugging purposes.
Running with flakefinder causes the tmpdir to quickly grow too big.
So, if flakefinder is active, we clean it out to avoid running out of
disk space.
"""
yield
if pytestconfig.getoption("flake_finder_enable") > 0:
shutil.rmtree(tmpdir)


@pytest.fixture
def vm(clean_environment, monkeypatch, tmpdir):
Expand All @@ -146,7 +200,7 @@ def vm(clean_environment, monkeypatch, tmpdir):
vm.qemu.vm_expected_overhead = 128
for snapshot in vm.ceph.root.snapshots:
snapshot.remove()
vm.qemu.destroy()
vm.qemu.destroy(kill_supervisor=True)
vm.unlock()
get_log()
yield vm
Expand Down
2 changes: 0 additions & 2 deletions src/fc/qemu/ellipsis.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@


class Report(object):

matches = None

def __init__(self):
Expand Down Expand Up @@ -49,7 +48,6 @@ def match(pattern, line):


class Ellipsis(object):

# other = other.replace('\t', ' '*8) oder allgemein white-space unsensibel
# multi-line support

Expand Down
29 changes: 25 additions & 4 deletions src/fc/qemu/hazmat/qemu.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def detect_current_machine_type(

def locked_global(f):
LOCK = "/run/fc-qemu.lock"

# This is thread-safe *AS LONG* as every thread uses a separate instance
# of the agent. Using multiple file descriptors will guarantee that the
# lock can only be held once even within a single process.
Expand Down Expand Up @@ -85,7 +86,6 @@ def locked(self, *args, **kw):


class Qemu(object):

prefix = ""
executable = "qemu-system-x86_64"

Expand Down Expand Up @@ -576,20 +576,41 @@ def graceful_shutdown(self):
],
)

def destroy(self):
def destroy(self, kill_supervisor=False):
# We use this destroy command in "fire-and-forget"-style because
# sometimes the init script will complain even if we achieve what
# we want: that the VM isn't running any longer. We check this
# by contacting the monitor instead.
timeout = TimeOut(100, interval=1, raise_on_timeout=True)
p = self.proc()
if not p:
return

# Check whether the parent is the supervising process.
# Kill that one first so we avoid immediate restarts.
if kill_supervisor:
parent = p.parent()
if "supervised-qemu-wrapped" in parent.cmdline()[1]:
timeout = TimeOut(100, interval=2, raise_on_timeout=True)
attempt = 0
while parent.is_running() and timeout.tick():
attempt += 1
self.log.debug(
"vm-destroy-kill-supervisor", attempt=attempt
)
try:
parent.terminate()
except psutil.NoSuchProcess:
break

timeout = TimeOut(100, interval=2, raise_on_timeout=True)
attempt = 0
while p.is_running() and timeout.tick():
attempt += 1
self.log.debug("vm-destroy-kill-vm", attempt=attempt)
try:
p.terminate()
except psutil.NoSuchProcess:
pass
break

def resize_root(self, size):
self.qmp.command("block_resize", device="virtio0", size=size)
Expand Down
6 changes: 6 additions & 0 deletions src/fc/qemu/incoming.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,9 @@ def run(self):
if self.finished == "success":
return 0
else:
# Do not kill the supervisor. We will be giving up here, but the
# supervisor will restart the process, potentially with updated
# config data so we get a "free" retry.
self.qemu.destroy()
return 1

Expand Down Expand Up @@ -229,6 +232,9 @@ def destroy(self):
log.info("destroying", machine=self.name)
self.finished = "destroyed"
try:
# Do not kill the supervisor. We will be giving up here, but the
# supervisor will restart the process, potentially with updated
# config data so we get a "free" retry.
self.qemu.destroy()
except QemuNotRunning:
pass
Expand Down
Loading

0 comments on commit 774533d

Please sign in to comment.