Skip to content

Commit

Permalink
CURATOR-724. Fix LeaderLatch recover on reconnected and missing leaderPath
Browse files Browse the repository at this point in the history

Signed-off-by: tison <[email protected]>
  • Loading branch information
tisonkun committed Dec 17, 2024
1 parent ad19795 commit a054902
Showing 1 changed file with 21 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -509,7 +509,7 @@ public void processResult(CuratorFramework client, CuratorEvent event) throws Ex
getChildren();
}
} else {
log.error("getChildren() failed. rc = {}", event.getResultCode());
log.error("creatingParentContainersIfNeeded() failed (rc = {})", event.getResultCode());
}
}
};
Expand All @@ -528,7 +528,7 @@ private synchronized void internalStart() {
reset();
} catch (Exception e) {
ThreadUtils.checkInterrupted(e);
log.error("An error occurred checking resetting leadership.", e);
log.error("failed to check resetting leadership.", e);
}
}
}
Expand All @@ -548,7 +548,7 @@ private void checkLeadership(List<String> children) throws Exception {
log.debug("checkLeadership with id: {}, ourPath: {}, children: {}", id, localOurPath, sortedChildren);

if (ourIndex < 0) {
log.error("Can't find our node. Resetting. Index: {}", ourIndex);
log.error("failed to find our node; resetting (index: {})", ourIndex);
reset();
return;
}
Expand Down Expand Up @@ -582,7 +582,7 @@ public void process(WatchedEvent event) {
getChildren();
} catch (Exception ex) {
ThreadUtils.checkInterrupted(ex);
log.error("An error occurred checking the leadership.", ex);
log.error("failed to check the leadership.", ex);
}
}
}
Expand All @@ -607,6 +607,17 @@ private void getChildren() throws Exception {
public void processResult(CuratorFramework client, CuratorEvent event) throws Exception {
if (event.getResultCode() == KeeperException.Code.OK.intValue()) {
checkLeadership(event.getChildren());
} else if (event.getResultCode() == KeeperException.Code.NONODE.intValue()) {
// latchPath has gone - reset
//
// This is possible when RECONNECTED during:
// (1) Scale the zk cluster to 0 nodes.
// (2) Scale it back.
//
// See also https://issues.apache.org/jira/browse/CURATOR-724
reset();
} else {
log.error("getChildren() failed (rc = {})", event.getResultCode());
}
}
};
Expand All @@ -616,11 +627,6 @@ public void processResult(CuratorFramework client, CuratorEvent event) throws Ex
@VisibleForTesting
protected void handleStateChange(ConnectionState newState) {
switch (newState) {
default: {
// NOP
break;
}

case RECONNECTED: {
try {
if (client.getConnectionStateErrorPolicy().isErrorState(ConnectionState.SUSPENDED)
Expand All @@ -629,7 +635,7 @@ protected void handleStateChange(ConnectionState newState) {
}
} catch (Exception e) {
ThreadUtils.checkInterrupted(e);
log.error("Could not reset leader latch", e);
log.error("failed to reset leader latch", e);
setLeadership(false);
}
break;
Expand All @@ -646,6 +652,11 @@ protected void handleStateChange(ConnectionState newState) {
setLeadership(false);
break;
}

default: {
// NOP
break;
}
}
}

Expand Down

0 comments on commit a054902

Please sign in to comment.