From 07500577d3feb2203523b62ea16cdbdf4a420d35 Mon Sep 17 00:00:00 2001 From: "Michael J. Sullivan" Date: Fri, 1 Mar 2024 11:07:52 -0800 Subject: [PATCH] try retries on the signal_sysevent --- edb/common/retryloop.py | 33 +++++++++++++++++++++++++++++---- edb/server/tenant.py | 19 +++++++++++++++++-- 2 files changed, 46 insertions(+), 6 deletions(-) diff --git a/edb/common/retryloop.py b/edb/common/retryloop.py index be9feed0e70..d6c7d927579 100644 --- a/edb/common/retryloop.py +++ b/edb/common/retryloop.py @@ -52,14 +52,18 @@ def __init__( self, *, backoff: Callable[[int], float] = const_backoff(0.5), - timeout: float, + timeout: float | None = None, + iterations: int | None = None, ignore: Type[Exception] | Tuple[Type[Exception], ...] | None = None, wait_for: Type[Exception] | Tuple[Type[Exception], ...] | None = None, retry_cb: Callable[[Optional[BaseException]], None] | None = None, ) -> None: + assert timeout is not None or iterations is not None + self._iteration = 0 self._backoff = backoff self._timeout = timeout + self._max_iterations = iterations self._ignore = ignore self._wait_for = wait_for self._started_at = 0.0 @@ -113,10 +117,19 @@ async def __aexit__( # Propagate, it's not the error we expected. return False - if elapsed > self._loop._timeout: + if ( + self._loop._timeout is not None + and elapsed > self._loop._timeout + ): # Propagate -- we've run it enough times. return False + if ( + self._loop._max_iterations is not None + and self._loop._iteration >= self._loop._max_iterations + ): + return False + if self._loop._retry_cb is not None: self._loop._retry_cb(e) @@ -137,10 +150,22 @@ async def __aexit__( # Propagate, it's not the error we expected. return False - if elapsed > self._loop._timeout: + if ( + self._loop._timeout is not None + and elapsed > self._loop._timeout + ): + raise TimeoutError( + f'exception matching {self._loop._wait_for!r} ' + f'has not happened in {self._loop._timeout} seconds') + + if ( + self._loop._max_iterations is not None + and self._loop._iteration >= self._loop._max_iterations + ): raise TimeoutError( f'exception matching {self._loop._wait_for!r} ' - f'has not happen in {self._loop._timeout} seconds') + f'has not happened in {self._loop._max_iterations} ' + f'iterations') # Ignore the exception until next run. return True diff --git a/edb/server/tenant.py b/edb/server/tenant.py index d47c456f46e..57dcc5c64f1 100644 --- a/edb/server/tenant.py +++ b/edb/server/tenant.py @@ -1308,9 +1308,24 @@ async def signal_sysevent(self, event: str, **kwargs) -> None: # in flight. return + rloop = retryloop.RetryLoop( + iterations=3, + ignore=pgcon.BackendError, + backoff=retryloop.const_backoff(0.0), + ) async with self.use_sys_pgcon() as con: - await con.signal_sysevent(event, **kwargs) - except Exception: + async for iteration in rloop: + async with iteration: + try: + await con.signal_sysevent(event, **kwargs) + except Exception as ex: + print("=== YES IT IS FUCKED", ex) + raise + + except Exception as ex: + print("SYSEVENT ERROR") + debug.dump(ex) + sys.stdout.flush() metrics.background_errors.inc( 1.0, self._instance_name, "signal_sysevent" )