From ea32f84864ac43c450fc1206580397359d23744b Mon Sep 17 00:00:00 2001 From: Heesung Sohn <103456639+heesung-sn@users.noreply.github.com> Date: Thu, 8 Aug 2024 01:19:38 -0700 Subject: [PATCH 1/3] [fix][broker] Fix the bug that elected leader thinks it's a follower (#23138) (cherry picked from commit 3560ddb64f44fb2a53d52ef3df0624bb9bda1af6) --- .../coordination/impl/LeaderElectionImpl.java | 15 ++++- .../pulsar/metadata/LeaderElectionTest.java | 2 + .../apache/pulsar/metadata/ZKSessionTest.java | 55 +++++++++++++++++++ 3 files changed, 69 insertions(+), 3 deletions(-) diff --git a/pulsar-metadata/src/main/java/org/apache/pulsar/metadata/coordination/impl/LeaderElectionImpl.java b/pulsar-metadata/src/main/java/org/apache/pulsar/metadata/coordination/impl/LeaderElectionImpl.java index 8e1d2cd4aa756..965155802afb8 100644 --- a/pulsar-metadata/src/main/java/org/apache/pulsar/metadata/coordination/impl/LeaderElectionImpl.java +++ b/pulsar-metadata/src/main/java/org/apache/pulsar/metadata/coordination/impl/LeaderElectionImpl.java @@ -133,8 +133,11 @@ private synchronized CompletableFuture handleExistingLeader // If the value is the same as our proposed value, it means this instance was the leader at some // point before. The existing value can either be for this same session or for a previous one. if (res.getStat().isCreatedBySelf()) { + log.info("Keeping the existing value {} for {} as it's from the same session stat={}", existingValue, + path, res.getStat()); // The value is still valid because it was created in the same session changeState(LeaderElectionState.Leading); + return CompletableFuture.completedFuture(LeaderElectionState.Leading); } else { // Since the value was created in a different session, it might be expiring. We need to delete it // and try the election again. @@ -259,7 +262,13 @@ public synchronized CompletableFuture asyncClose() { return CompletableFuture.completedFuture(null); } - return store.delete(path, version); + return store.delete(path, version) + .thenAccept(__ -> { + synchronized (LeaderElectionImpl.this) { + leaderElectionState = LeaderElectionState.NoLeader; + } + } + ); } @Override @@ -280,8 +289,8 @@ public Optional getLeaderValueIfPresent() { private void handleSessionNotification(SessionEvent event) { // Ensure we're only processing one session event at a time. executor.execute(SafeRunnable.safeRun(() -> { - if (event == SessionEvent.SessionReestablished) { - log.info("Revalidating leadership for {}", path); + if (event == SessionEvent.Reconnected || event == SessionEvent.SessionReestablished) { + log.info("Revalidating leadership for {}, event:{}", path, event); try { LeaderElectionState les = elect().get(); diff --git a/pulsar-metadata/src/test/java/org/apache/pulsar/metadata/LeaderElectionTest.java b/pulsar-metadata/src/test/java/org/apache/pulsar/metadata/LeaderElectionTest.java index 97587b1520e75..6c540a0761b8f 100644 --- a/pulsar-metadata/src/test/java/org/apache/pulsar/metadata/LeaderElectionTest.java +++ b/pulsar-metadata/src/test/java/org/apache/pulsar/metadata/LeaderElectionTest.java @@ -69,6 +69,8 @@ public void basicTest(String provider, Supplier urlSupplier) throws Exce leaderElection.close(); + assertEquals(leaderElection.getState(), LeaderElectionState.NoLeader); + assertEquals(cache.get("/my/leader-election").join(), Optional.empty()); } diff --git a/pulsar-metadata/src/test/java/org/apache/pulsar/metadata/ZKSessionTest.java b/pulsar-metadata/src/test/java/org/apache/pulsar/metadata/ZKSessionTest.java index 126383c3afeb1..21b5eca077fcd 100644 --- a/pulsar-metadata/src/test/java/org/apache/pulsar/metadata/ZKSessionTest.java +++ b/pulsar-metadata/src/test/java/org/apache/pulsar/metadata/ZKSessionTest.java @@ -27,6 +27,7 @@ import java.util.concurrent.LinkedBlockingQueue; import java.util.concurrent.TimeUnit; import lombok.Cleanup; +import org.apache.commons.lang3.reflect.FieldUtils; import org.apache.pulsar.metadata.api.MetadataStoreConfig; import org.apache.pulsar.metadata.api.coordination.CoordinationService; import org.apache.pulsar.metadata.api.coordination.LeaderElection; @@ -180,4 +181,58 @@ public void testReacquireLeadershipAfterSessionLost() throws Exception { .untilAsserted(()-> assertEquals(le1.getState(),LeaderElectionState.Leading)); assertTrue(store.get(path).join().isPresent()); } + + + @Test + public void testElectAfterReconnected() throws Exception { + // --- init + @Cleanup + MetadataStoreExtended store = MetadataStoreExtended.create(zks.getConnectionString(), + MetadataStoreConfig.builder() + .sessionTimeoutMillis(2_000) + .build()); + + + BlockingQueue sessionEvents = new LinkedBlockingQueue<>(); + store.registerSessionListener(sessionEvents::add); + BlockingQueue leaderElectionEvents = new LinkedBlockingQueue<>(); + String path = newKey(); + + @Cleanup + CoordinationService coordinationService = new CoordinationServiceImpl(store); + @Cleanup + LeaderElection le1 = coordinationService.getLeaderElection(String.class, path, + leaderElectionEvents::add); + + // --- test manual elect + String proposed = "value-1"; + le1.elect(proposed).join(); + assertEquals(le1.getState(), LeaderElectionState.Leading); + LeaderElectionState les = leaderElectionEvents.poll(5, TimeUnit.SECONDS); + assertEquals(les, LeaderElectionState.Leading); + + + // simulate no leader state + FieldUtils.writeDeclaredField(le1, "leaderElectionState", LeaderElectionState.NoLeader, true); + + // reconnect + zks.stop(); + + SessionEvent e = sessionEvents.poll(5, TimeUnit.SECONDS); + assertEquals(e, SessionEvent.ConnectionLost); + + zks.start(); + + + // --- test le1 can be leader + e = sessionEvents.poll(10, TimeUnit.SECONDS); + assertEquals(e, SessionEvent.Reconnected); + Awaitility.await().atMost(Duration.ofSeconds(15)) + .untilAsserted(()-> { + assertEquals(le1.getState(),LeaderElectionState.Leading); + }); // reacquire leadership + + + assertTrue(store.get(path).join().isPresent()); + } } From 8d5b4bc5a5f4fdd03544206988488c664d50b294 Mon Sep 17 00:00:00 2001 From: nikhil-ctds Date: Thu, 16 Jan 2025 13:34:43 +0530 Subject: [PATCH 2/3] fix cgget command not found --- .github/actions/tune-runner-vm/action.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.github/actions/tune-runner-vm/action.yml b/.github/actions/tune-runner-vm/action.yml index e8914dbe74f6c..e636a961678fd 100644 --- a/.github/actions/tune-runner-vm/action.yml +++ b/.github/actions/tune-runner-vm/action.yml @@ -71,6 +71,10 @@ runs: # stop Azure Linux agent to save RAM sudo systemctl stop walinuxagent.service || true + # Install cgroup-tools package to use cgget command + sudo apt-get update + sudo apt-get install -y cgroup-tools + # show memory free -m # show disk From 8f5c8dbb6aad6156ab005304b10b6e378777bd25 Mon Sep 17 00:00:00 2001 From: nikhil-ctds Date: Thu, 16 Jan 2025 17:30:42 +0530 Subject: [PATCH 3/3] fix docker cleanup in Github CI --- .github/workflows/ci-cpp.yaml | 2 +- .github/workflows/ci-integration-backwards-compatibility.yaml | 2 +- .github/workflows/ci-integration-cli.yaml | 2 +- .github/workflows/ci-integration-function.yaml | 2 +- .github/workflows/ci-integration-messaging.yaml | 2 +- .github/workflows/ci-integration-process.yaml | 2 +- .github/workflows/ci-integration-pulsar-io-ora.yaml | 2 +- .github/workflows/ci-integration-pulsar-io.yaml | 2 +- .github/workflows/ci-integration-schema.yaml | 2 +- .github/workflows/ci-integration-sql.yaml | 2 +- .github/workflows/ci-integration-standalone.yaml | 2 +- .github/workflows/ci-integration-thread.yaml | 2 +- .github/workflows/ci-integration-tiered-filesystem.yaml | 2 +- .github/workflows/ci-integration-tiered-jcloud.yaml | 2 +- .github/workflows/ci-integration-transaction.yaml | 2 +- .github/workflows/ci-owasp-dep-check.yaml | 2 +- .github/workflows/ci-pulsar-website-build.yaml | 2 +- .github/workflows/ci-shade-test.yaml | 2 +- 18 files changed, 18 insertions(+), 18 deletions(-) diff --git a/.github/workflows/ci-cpp.yaml b/.github/workflows/ci-cpp.yaml index 85cc8966ca723..730909b2d60bd 100644 --- a/.github/workflows/ci-cpp.yaml +++ b/.github/workflows/ci-cpp.yaml @@ -76,7 +76,7 @@ jobs: if: ${{ steps.check_changes.outputs.docs_only != 'true' }} run: | sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: build package diff --git a/.github/workflows/ci-integration-backwards-compatibility.yaml b/.github/workflows/ci-integration-backwards-compatibility.yaml index 509fa3ac6eb6d..d60fd8b2feefc 100644 --- a/.github/workflows/ci-integration-backwards-compatibility.yaml +++ b/.github/workflows/ci-integration-backwards-compatibility.yaml @@ -79,7 +79,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-cli.yaml b/.github/workflows/ci-integration-cli.yaml index 1c8e323fa7fcb..d90f8b5a93b13 100644 --- a/.github/workflows/ci-integration-cli.yaml +++ b/.github/workflows/ci-integration-cli.yaml @@ -79,7 +79,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-function.yaml b/.github/workflows/ci-integration-function.yaml index 205e44dab6634..e6dbac790b458 100644 --- a/.github/workflows/ci-integration-function.yaml +++ b/.github/workflows/ci-integration-function.yaml @@ -79,7 +79,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-messaging.yaml b/.github/workflows/ci-integration-messaging.yaml index 13e16aa1894a2..6e7a8a87a2548 100644 --- a/.github/workflows/ci-integration-messaging.yaml +++ b/.github/workflows/ci-integration-messaging.yaml @@ -79,7 +79,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-process.yaml b/.github/workflows/ci-integration-process.yaml index a3e46cb538f3a..134feb26734bd 100644 --- a/.github/workflows/ci-integration-process.yaml +++ b/.github/workflows/ci-integration-process.yaml @@ -78,7 +78,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-pulsar-io-ora.yaml b/.github/workflows/ci-integration-pulsar-io-ora.yaml index db905dd857974..1663e4623ceac 100644 --- a/.github/workflows/ci-integration-pulsar-io-ora.yaml +++ b/.github/workflows/ci-integration-pulsar-io-ora.yaml @@ -80,7 +80,7 @@ jobs: sudo swapoff -a sudo rm -rf /swapfile /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-pulsar-io.yaml b/.github/workflows/ci-integration-pulsar-io.yaml index f8eac1b83fda1..5919574afddfc 100644 --- a/.github/workflows/ci-integration-pulsar-io.yaml +++ b/.github/workflows/ci-integration-pulsar-io.yaml @@ -80,7 +80,7 @@ jobs: sudo swapoff -a sudo rm -rf /swapfile /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-schema.yaml b/.github/workflows/ci-integration-schema.yaml index bc0b1b3cfa6af..8ba0fe8c81aa2 100644 --- a/.github/workflows/ci-integration-schema.yaml +++ b/.github/workflows/ci-integration-schema.yaml @@ -78,7 +78,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-sql.yaml b/.github/workflows/ci-integration-sql.yaml index c5bd21ffc1c5f..3ae9888a5e453 100644 --- a/.github/workflows/ci-integration-sql.yaml +++ b/.github/workflows/ci-integration-sql.yaml @@ -79,7 +79,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-standalone.yaml b/.github/workflows/ci-integration-standalone.yaml index 87df27eff7e77..c7f6efe0417c6 100644 --- a/.github/workflows/ci-integration-standalone.yaml +++ b/.github/workflows/ci-integration-standalone.yaml @@ -78,7 +78,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-thread.yaml b/.github/workflows/ci-integration-thread.yaml index fd7168fc01731..7cd1e5098c301 100644 --- a/.github/workflows/ci-integration-thread.yaml +++ b/.github/workflows/ci-integration-thread.yaml @@ -78,7 +78,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-tiered-filesystem.yaml b/.github/workflows/ci-integration-tiered-filesystem.yaml index 11b66edc8d4a8..f5ce4d6994f80 100644 --- a/.github/workflows/ci-integration-tiered-filesystem.yaml +++ b/.github/workflows/ci-integration-tiered-filesystem.yaml @@ -78,7 +78,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-tiered-jcloud.yaml b/.github/workflows/ci-integration-tiered-jcloud.yaml index 4952a40b1d965..10ae5f1286e0b 100644 --- a/.github/workflows/ci-integration-tiered-jcloud.yaml +++ b/.github/workflows/ci-integration-tiered-jcloud.yaml @@ -78,7 +78,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-integration-transaction.yaml b/.github/workflows/ci-integration-transaction.yaml index d06a9599cb6fa..8fc238da1a288 100644 --- a/.github/workflows/ci-integration-transaction.yaml +++ b/.github/workflows/ci-integration-transaction.yaml @@ -78,7 +78,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-owasp-dep-check.yaml b/.github/workflows/ci-owasp-dep-check.yaml index 10dc48f2b3a48..2519ad3ac49c2 100644 --- a/.github/workflows/ci-owasp-dep-check.yaml +++ b/.github/workflows/ci-owasp-dep-check.yaml @@ -78,7 +78,7 @@ jobs: sudo swapoff -a sudo rm -rf /swapfile /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h # Projects dependent on flume, hdfs, hbase, and presto currently excluded from the scan. diff --git a/.github/workflows/ci-pulsar-website-build.yaml b/.github/workflows/ci-pulsar-website-build.yaml index 9d47f90e834f8..d0763fd1757df 100644 --- a/.github/workflows/ci-pulsar-website-build.yaml +++ b/.github/workflows/ci-pulsar-website-build.yaml @@ -60,7 +60,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests diff --git a/.github/workflows/ci-shade-test.yaml b/.github/workflows/ci-shade-test.yaml index c65c3414cf4ce..0ee12dc3f713b 100644 --- a/.github/workflows/ci-shade-test.yaml +++ b/.github/workflows/ci-shade-test.yaml @@ -79,7 +79,7 @@ jobs: run: | sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc sudo apt clean - docker rmi $(docker images -q) -f + docker rmi $(docker images -q) -f || echo "No Docker images to remove." df -h - name: run install by skip tests