Skip to content

Commit

Permalink
Watchdog check
Browse files Browse the repository at this point in the history
  • Loading branch information
happz committed Oct 19, 2023
1 parent 7d67df0 commit 89a69d6
Show file tree
Hide file tree
Showing 5 changed files with 338 additions and 1 deletion.
14 changes: 14 additions & 0 deletions tests/test/check/data/main.fmf
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,17 @@
sudo bash -c "passwd --help &> /root/passwd.log; \
ls -alZ /root/passwd.log; \
rm -f /root/passwd.log" || /bin/true

/watchdog/ping:
test: |
set -x
echo "hello!"
sleep 20
echo "bye!"

check:
- name: watchdog
interval: 5

ping: true
ping-packets: 1
12 changes: 12 additions & 0 deletions tests/test/check/main.fmf
Original file line number Diff line number Diff line change
Expand Up @@ -24,3 +24,15 @@ tier: 2
environment:
PROVISION_METHODS: local virtual
tag+: [additional_coverage]

/watchdog:
test: ./test-watchdog.sh

environment:
PROVISION_METHODS: local

adjust:
- when: how == full
environment:
PROVISION_METHODS: local virtual
tag+: [additional_coverage]
43 changes: 43 additions & 0 deletions tests/test/check/test-watchdog.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#!/bin/bash

. /usr/share/beakerlib/beakerlib.sh || exit 1

function assert_check_result () {
rlAssertEquals "$1" "avc:$2" "$(yq -r ".[] | .check | .[] | select(.event == \"$3\") | \"\\(.name):\\(.result)\"" $results)"
}


rlJournalStart
rlPhaseStartSetup
rlRun "run=\$(mktemp -d)" 0 "Create run directory"

rlRun "results=$run/plan/execute/results.yaml"

rlRun "pushd data"
rlRun "set -o pipefail"
rlPhaseEnd

for method in ${PROVISION_METHODS:-local}; do
rlPhaseStartTest "Test guest watchdog ping with $method provisioning"
# rlRun "avc_log=$run/plan/execute/data/guest/default-0/avc/harmless-1/tmt-avc-after-test.txt"
#
rlRun "tmt run --id $run --scratch -a -vv provision -h $method test -n /watchdog"
rlRun "tmt run --id $run --scratch -a -vv provision -h $method provision --insert -h $method test -n /watchdog"
#
# rlRun "cat $results"
# rlRun "cat $avc_log"
#
# rlAssertExists "$avc_log"
#
# assert_check_result "avc as an after-test should pass" "pass" "after-test"
#
# rlAssertGrep "<no matches>" "$avc_log"
rlPhaseEnd
done

rlPhaseStartCleanup
rlRun "popd"

# rlRun "rm -rf $run"
rlPhaseEnd
rlJournalEnd
268 changes: 268 additions & 0 deletions tmt/checks/watchdog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,268 @@
import dataclasses
import datetime
import re
import threading
import time
from typing import List, Optional, Iterable, Iterator, Dict

import tmt.log
import tmt.steps.execute
import tmt.steps.provision
import tmt.utils
from tmt.checks import Check, CheckPlugin, provides_check
from tmt.result import CheckResult, ResultOutcome
from tmt.utils import Path, field, render_run_exception_streams

PING_OUTPUT_PATTERN = re.compile(
r'(?m)(?P<transmitted>\d+) packets transmitted, (?P<received>\d+) received')


REPORT_FILENAME = 'tmt-watchdog.txt'


def render_report_path(
plugin: tmt.steps.execute.ExecutePlugin,
guest: tmt.steps.provision.Guest,
test: 'tmt.base.Test') -> Path:
""" Render path to a watchdog report file from necessary components """

return plugin.data_path(
test,
guest,
filename=REPORT_FILENAME,
create=True,
full=True)


def report_progress(
log: Path,
report: Iterable[str],
command_output: Optional[str] = None) -> None:
"""
Add new report into a report file.
:param log: path to the report file.
:param report: iterable of report lines to add. Each line is emitted on its
own line in the file.
:param command_output: if set, the string is added to the report file once
``report`` lines are written into it.
"""

timestamp = tmt.steps.execute.ExecutePlugin.format_timestamp(
datetime.datetime.now(datetime.timezone.utc))

with open(log, mode='a') as f:
f.write(f'# Reported at {timestamp}\n')

for line in report:
f.write(line)
f.write('\n')

if command_output:
f.write('\n')
f.write(command_output)

f.write('\n')


@dataclasses.dataclass
class GuestContext:
""" Per-guest watchdog context """

#: Current number of failed watchdog checks.
ping_failures: int = 0

#: If set, contains a daemonized thread running the watchdog checks.
thread: Optional[threading.Thread] = None

#: As long as this field is set to ``True``, the watchdog will run its
#: internal loop and run relevant checks. It is unset when terminating
#: the watchdog check to notify the thread it's time to quit.
keep_running: bool = True


@dataclasses.dataclass
class WatchdogCheck(Check):
interval: int = field(default=60)

ping: bool = field(default=False)
ping_packets: int = field(default=1)
ping_threshold: int = field(default=10)

#: Per-guest context entries. One for each guest name.
guest_context: Dict[str, GuestContext] = field(
default_factory=dict,
internal=True,
# It is not necessary to preserve guest context.
serialize=lambda context: {})

def do_ping(
self,
plugin: tmt.steps.execute.ExecutePlugin,
guest: tmt.steps.provision.Guest,
test: 'tmt.base.Test',
logger: tmt.log.Logger) -> None:
""" Perform a ping check """

logger.verbose('pinging', level=2)

log = render_report_path(plugin, guest, test)

guest_context = self.guest_context[guest.name]

def _fail_parse_error(ping_output: str) -> None:
""" Handle unparseable ``ping`` output """

logger.fail('failed to parse ping output')

guest_context.ping_failures += 1

report_progress(
log,
[
'# failed to parse ping output',
f'# failed {guest_context.ping_failures} of {self.ping_threshold} allowed',
],
command_output=ping_output
)

def _fail_lost_packets(ping_output: str, transmitted: int, received: int) -> None:
""" Handle missing response packets """

logger.fail(f'not all packets returned: {transmitted=} {received=}')

guest_context.ping_failures += 1

report_progress(
log,
[
'# not all packets returned',
f'# failed {guest_context.ping_failures} of {self.ping_threshold} allowed',
],
command_output=ping_output
)

def _success(ping_output: str) -> None:
""" Handle successfull response """

logger.verbose('successful response')

report = [
'# successfull response'
]

if guest_context.ping_failures != 0:
report.append(f'# replenished failure budget back to {self.ping_threshold}')

guest_context.ping_failures = 0

report_progress(
log,
report,
command_output=ping_output
)

def _handle_output(ping_output: str) -> None:
""" Process ``ping`` output and decide on its outcome """

match = PING_OUTPUT_PATTERN.search(ping_output)

if match is None:
_fail_parse_error(ping_output)

else:
groups = match.groupdict()

transmitted = int(groups['transmitted'])
received = int(groups['received'])

if transmitted != received:
_fail_lost_packets(ping_output, transmitted, received)

else:
_success(ping_output)

if guest_context.ping_failures >= self.ping_threshold:
logger.fail(f'exhausted {self.ping_threshold} ping attempts')

try:
assert guest.guest is not None # narrow type

output = tmt.utils.Command('ping', '-c', str(self.ping_packets), guest.guest) \
.run(cwd=Path.cwd(), logger=logger)

_handle_output(output.stdout or '')

except tmt.utils.RunError as exc:
if exc.returncode == 1:
_handle_output(exc.stdout or '')

else:
_handle_output('\n'.join(render_run_exception_streams(exc.stdout, exc.stderr)))


@provides_check('watchdog')
class Watchdog(CheckPlugin):
_check_class = WatchdogCheck

@classmethod
def before_test(
cls,
*,
check: WatchdogCheck,
plugin: tmt.steps.execute.ExecutePlugin,
guest: tmt.steps.provision.Guest,
test: 'tmt.base.Test',
environment: Optional[tmt.utils.EnvironmentType] = None,
logger: tmt.log.Logger) -> List[CheckResult]:

# Setup a logger
watchdog_logger = logger.clone()
watchdog_logger.labels.append('watchdog')

# Create a guest context for the guest we've been given
guest_context = check.guest_context[guest.name] = GuestContext()

def watchdog() -> None:
""" Watchdog thread code """

while guest_context.keep_running:
if check.ping:
check.do_ping(plugin, guest, test, watchdog_logger)

time.sleep(check.interval)

guest_context.thread = threading.Thread(
target=watchdog,
name=f'watchdog-{guest.name}',
daemon=True)

guest_context.thread.start()

return []

@classmethod
def after_test(
cls,
*,
check: WatchdogCheck,
plugin: tmt.steps.execute.ExecutePlugin,
guest: tmt.steps.provision.Guest,
test: 'tmt.base.Test',
environment: Optional[tmt.utils.EnvironmentType] = None,
logger: tmt.log.Logger) -> List[CheckResult]:

guest_context = check.guest_context.pop(guest.name)

if guest_context.thread:
guest_context.keep_running = False
guest_context.thread.join()

guest_context.thread = None

return [
CheckResult(
name='watchdog',
result=ResultOutcome.PASS,
log=[render_report_path(plugin, guest, test)]
)]
2 changes: 1 addition & 1 deletion tmt/log.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,7 +523,7 @@ def clone(self) -> 'Logger':
return Logger(
self._logger,
base_shift=self._base_shift,
labels=self.labels,
labels=self.labels[:],
labels_padding=self.labels_padding,
verbosity_level=self.verbosity_level,
debug_level=self.debug_level,
Expand Down

0 comments on commit 89a69d6

Please sign in to comment.