Skip to content

Commit

Permalink
fix(validator): fix various issues with sstable scrub validator
Browse files Browse the repository at this point in the history
We occasionally run into issues with the sstable scrub validator, such as
delayed log messages or validation running on a node that has not finished bootstrapping.

fixes: #7440
  • Loading branch information
soyacz authored and fruch committed Jun 26, 2024
1 parent 90252fb commit 8fe625d
Showing 1 changed file with 12 additions and 2 deletions.
14 changes: 12 additions & 2 deletions sdcm/teardown_validators/sstables.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import logging
from functools import partial

from sdcm import wait
from sdcm.cluster import BaseNode
from sdcm.exceptions import WaitForTimeoutError
from sdcm.sct_events import Severity
from sdcm.sct_events.teardown_validators import ValidatorEvent, ScrubValidationErrorEvent
from sdcm.teardown_validators.base import TeardownValidator
Expand Down Expand Up @@ -31,7 +33,12 @@ def _upload_corrupted_files(self, node: BaseNode, quarantine_log_lines):
return s3_link

def _run_nodetool_scrub(self, node: BaseNode, keyspace: str, table: str, timeout=1200):
node.wait_db_up(timeout=300)
try:
node.wait_db_up(timeout=300)
except WaitForTimeoutError as ex:
# sometimes a node can take a very long time to boot after the last nemesis (e.g. bootstrapping a new node).
LOGGER.error("Error waiting for node %s to be up in sstable validator: %s\nskipping validation", node.name, ex)
return
finish_scrub_follower = node.follow_system_log(patterns=['Finished scrubbing in validate mode'])
quarantine_lines = node.follow_system_log(patterns=['sstable - Moving sstable'], start_from_beginning=True)
result = node.run_nodetool(sub_cmd='scrub', args=f"--mode VALIDATE --no-snapshot {keyspace} {table}".strip(),
Expand All @@ -40,7 +47,10 @@ def _run_nodetool_scrub(self, node: BaseNode, keyspace: str, table: str, timeout
ValidatorEvent(
message=f'Error running nodetool scrub on node {node.name}: {result.stdout}\n{result.stderr}',
severity=Severity.ERROR).publish()
scrub_finish_lines = list(finish_scrub_follower)
# sometimes logs might be delayed, so we need to wait for them
scrub_finish_lines = wait.wait_for(func=lambda: list(finish_scrub_follower), step=10,
text="Waiting for 'Finished scrubbing in validate mode' logs",
timeout=300, throw_exc=False)
if not scrub_finish_lines:
ValidatorEvent(
message=f'No scrubbing validation message found in db logs on node: {node.name}', severity=Severity.ERROR).publish()
Expand Down

0 comments on commit 8fe625d

Please sign in to comment.