From bf7d02772c956f357a7be88796eac88791e03307 Mon Sep 17 00:00:00 2001 From: Dmitriy Kruglov Date: Tue, 3 Sep 2024 20:15:12 +0200 Subject: [PATCH] fix(disrupt_mgr): skip take_snapshot errors per issue The change adds a decorator helper that uses SkipPerIssues mechanism and applies (enters) provided contexts to a function if the issue in question is open. Also, the new decorator is used to decorate mgmt_backup Nemesis disruption to skip take_snapshot-related errors, until issue https://github.com/scylladb/scylla-manager/issues/3389 is resolved. --- sdcm/nemesis.py | 5 ++++ sdcm/sct_events/group_common_events.py | 35 ++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/sdcm/nemesis.py b/sdcm/nemesis.py index eb78452f3e..ea23a325ce 100644 --- a/sdcm/nemesis.py +++ b/sdcm/nemesis.py @@ -87,6 +87,8 @@ decorate_with_context, ignore_reactor_stall_errors, ignore_disk_quota_exceeded_errors, + decorate_with_context_if_issues_open, + ignore_take_snapshot_failing, ) from sdcm.sct_events.health import DataValidatorEvent from sdcm.sct_events.loaders import CassandraStressLogEvent, ScyllaBenchEvent @@ -3004,6 +3006,9 @@ def _delete_existing_backups(self, mgr_cluster): self.log.warning("Deleted the following backup tasks before the nemesis starts: %s", ", ".join(deleted_tasks)) + @decorate_with_context_if_issues_open( + ignore_take_snapshot_failing, + issue_refs=['https://github.com/scylladb/scylla-manager/issues/3389']) def _mgmt_backup(self, backup_specific_tables): if not self.cluster.params.get('use_mgmt') and not self.cluster.params.get('use_cloud_manager'): raise UnsupportedNemesis('Scylla-manager configuration is not defined!') diff --git a/sdcm/sct_events/group_common_events.py b/sdcm/sct_events/group_common_events.py index 1878f33d9b..f9aedb7bd8 100644 --- a/sdcm/sct_events/group_common_events.py +++ b/sdcm/sct_events/group_common_events.py @@ -15,11 +15,13 @@ from functools import wraps from typing import ContextManager, Callable, Sequence +from 
sdcm.cluster import TestConfig from sdcm.sct_events import Severity from sdcm.sct_events.filters import DbEventsFilter, EventsSeverityChangerFilter, EventsFilter from sdcm.sct_events.loaders import YcsbStressEvent from sdcm.sct_events.database import DatabaseLogEvent from sdcm.sct_events.monitors import PrometheusAlertManagerEvent +from sdcm.utils.issues import SkipPerIssues @contextmanager @@ -347,6 +349,22 @@ def ignore_raft_transport_failing(): yield +@contextmanager +def ignore_take_snapshot_failing(): + with ExitStack() as stack: + stack.enter_context(EventsSeverityChangerFilter( + new_severity=Severity.WARNING, + event_class=DatabaseLogEvent, + regex=r".*api - take_snapshot failed: std::filesystem::__cxx11::filesystem_error.*No such file or directory", + extra_time_to_expiration=60)) + stack.enter_context(EventsSeverityChangerFilter( + new_severity=Severity.WARNING, + event_class=DatabaseLogEvent, + regex=r".*api - take_snapshot failed: std::runtime_error \(Keyspace.*snapshot.*already exists", + extra_time_to_expiration=60)) + yield + + def decorate_with_context(context_list: list[Callable | ContextManager] | Callable | ContextManager): """ helper to decorate a function to run with a list of callables that return context managers @@ -373,3 +391,20 @@ def inner_func(*args, **kwargs): return func(*args, **kwargs) return inner_func return inner_decorator + + +def decorate_with_context_if_issues_open( + contexts: list[Callable | ContextManager] | Callable | ContextManager, issue_refs: list[str]): + """ + Helper to decorate a function, to apply the provided contexts only if referenced GitHub issues are opened. + """ + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + if SkipPerIssues(issue_refs, TestConfig().tester_obj().params): + decorated_func = decorate_with_context(contexts)(func) + return decorated_func(*args, **kwargs) + else: + return func(*args, **kwargs) + return wrapper + return decorator