Skip to content

Commit

Permalink
Merge pull request ceph#989 from leonidc/fix_rebalance_exeption_handling
Browse files Browse the repository at this point in the history
fix exception handling in rebalane thread
  • Loading branch information
leonidc authored Dec 19, 2024
2 parents 2539ac4 + e8d349c commit 719a6ee
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 2 deletions.
7 changes: 5 additions & 2 deletions control/rebalance.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,11 @@ def __init__(self, gateway_service):
self.ceph_utils = gateway_service.ceph_utils
self.rebalance_period_sec = gateway_service.config.getint_with_default("gateway", "rebalance_period_sec", 7)
self.rebalance_max_ns_to_change_lb_grp = gateway_service.config.getint_with_default("gateway", "max_ns_to_change_lb_grp", 8)
self.auto_rebalance = threading.Thread(target=self.auto_rebalance_task, daemon=True)
self.rebalance_event = threading.Event()
self.auto_rebalance = threading.Thread(target=self.auto_rebalance_task, daemon=True, args=(self.rebalance_event,))
self.auto_rebalance.start() #start the thread

def auto_rebalance_task(self ):
def auto_rebalance_task(self, death_event):
"""Periodically calls for auto rebalance."""
while (self.rebalance_period_sec > 0):
for i in range(self.rebalance_max_ns_to_change_lb_grp):
Expand All @@ -41,6 +42,8 @@ def auto_rebalance_task(self ):
break
except Exception:
self.logger.exception(f"Exception in auto rebalance")
if death_event:
death_event.set()
raise
time.sleep(0.01) #release lock for 10ms after rebalancing each 1 NS
time.sleep(self.rebalance_period_sec)
Expand Down
4 changes: 4 additions & 0 deletions control/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -692,6 +692,10 @@ def keep_alive(self):
spdk_ping_interval_in_seconds = 0.0

while True:
if self.gateway_rpc:
if self.gateway_rpc.rebalance.rebalance_event.is_set():
self.logger.critical(f"Failure in rebalance, aborting")
raise SystemExit(f"Failure in rebalance, quitting gateway")
timedout = self.server.wait_for_termination(timeout=1)
if not timedout:
break
Expand Down

0 comments on commit 719a6ee

Please sign in to comment.