From 6a0989758fac9e7e3ee9db3db50ade8f82795d07 Mon Sep 17 00:00:00 2001 From: John Morman Date: Tue, 18 Jul 2023 08:39:48 -0700 Subject: [PATCH 1/3] Ensure Unregister Removes from local cache first before removing from grain directory so we stop serving bad entries as soon as possible --- .../GrainDirectory/CachedGrainLocator.cs | 13 ++++------ .../Directory/CachedGrainLocatorTests.cs | 26 +++++++++++++++++++ 2 files changed, 31 insertions(+), 8 deletions(-) diff --git a/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs b/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs index 1f823a9f1e..b78abc9d63 100644 --- a/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs +++ b/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs @@ -110,14 +110,11 @@ public async Task Register(GrainAddress address, GrainAddress prev public async Task Unregister(GrainAddress address, UnregistrationCause cause) { - try - { - await GetGrainDirectory(address.GrainId.Type).Unregister(address); - } - finally - { - this.cache.Remove(address); - } + // Remove from local cache first so we don't return it anymore + this.cache.Remove(address); + + // Remove from grain directory which may take significantly longer + await GetGrainDirectory(address.GrainId.Type).Unregister(address); } public void Participate(ISiloLifecycle lifecycle) diff --git a/test/NonSilo.Tests/Directory/CachedGrainLocatorTests.cs b/test/NonSilo.Tests/Directory/CachedGrainLocatorTests.cs index f538e342f0..316ee77396 100644 --- a/test/NonSilo.Tests/Directory/CachedGrainLocatorTests.cs +++ b/test/NonSilo.Tests/Directory/CachedGrainLocatorTests.cs @@ -6,6 +6,7 @@ using Microsoft.Extensions.DependencyInjection; using Microsoft.Extensions.Logging; using NSubstitute; +using NSubstitute.ReceivedExtensions; using Orleans; using Orleans.GrainDirectory; using Orleans.Metadata; @@ -363,6 +364,31 @@ public async Task UnregisterCallDirectoryAndCleanCache() Assert.False(this.grainLocator.TryLookupInCache(expectedAddr.GrainId, out _)); } + [Fact] + public async Task UnregisterRemovesFromCacheFirst() + { + var expectedSilo = GenerateSiloAddress(); + + // Setup membership service + this.mockMembershipService.UpdateSiloStatus(expectedSilo, SiloStatus.Active, "exp"); + await this.lifecycle.OnStart(); + await WaitUntilClusterChangePropagated(); + + var expectedAddr = GenerateGrainAddress(expectedSilo); + + // Give up control then Run forever + this.grainDirectory.When(v => v.Unregister(expectedAddr)).Do(async (t) => { await Task.Yield(); while (true) { } }); + + this.grainDirectory.Register(expectedAddr, previousAddress: null).Returns(expectedAddr); + + // Register to populate cache + await this.grainLocator.Register(expectedAddr, previousAddress: null); + + // Unregister and check if cache was cleaned + _ = this.grainLocator.Unregister(expectedAddr, UnregistrationCause.Force); + Assert.False(this.grainLocator.TryLookupInCache(expectedAddr.GrainId, out _)); + } + private GrainAddress GenerateGrainAddress(SiloAddress siloAddress = null) { return new GrainAddress From d191ec4442a9fc8a2f1fec49e81a32bd59d6b36b Mon Sep 17 00:00:00 2001 From: John Morman Date: Fri, 4 Aug 2023 06:50:27 -0700 Subject: [PATCH 2/3] Add check post Unregister to ensure that any race with Unregister that might re-add to cache is cleared when Unregister is done --- .../GrainDirectory/CachedGrainLocator.cs | 9 +- .../Directory/CachedGrainLocatorTests.cs | 86 ++++++++++++++++++- 2 files changed, 92 insertions(+), 3 deletions(-) diff --git a/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs b/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs index b78abc9d63..aabbdc9da3 100644 --- a/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs +++ b/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs @@ -103,7 +103,7 @@ public async Task Register(GrainAddress address, GrainAddress prev } // Cache update - this.cache.AddOrUpdate(result, (int) result.MembershipVersion.Value); + this.cache.AddOrUpdate(result, (int)result.MembershipVersion.Value); return result; } @@ -112,9 +112,14 @@ public async Task Unregister(GrainAddress address, UnregistrationCause cause) { // Remove from local cache first so we don't return it anymore this.cache.Remove(address); - + // Remove from grain directory which may take significantly longer await GetGrainDirectory(address.GrainId.Type).Unregister(address); + + if (this.cache.LookUp(address.GrainId, out var entry, out _) && entry == address) + { + this.cache.Remove(address); + } } public void Participate(ISiloLifecycle lifecycle) diff --git a/test/NonSilo.Tests/Directory/CachedGrainLocatorTests.cs b/test/NonSilo.Tests/Directory/CachedGrainLocatorTests.cs index 316ee77396..611838ae0f 100644 --- a/test/NonSilo.Tests/Directory/CachedGrainLocatorTests.cs +++ b/test/NonSilo.Tests/Directory/CachedGrainLocatorTests.cs @@ -377,7 +377,7 @@ public async Task UnregisterRemovesFromCacheFirst() var expectedAddr = GenerateGrainAddress(expectedSilo); // Give up control then Run forever - this.grainDirectory.When(v => v.Unregister(expectedAddr)).Do(async (t) => { await Task.Yield(); while (true) { } }); + this.grainDirectory.Unregister(expectedAddr).Returns(async (t) => { await Task.Yield(); while (true) { } }); this.grainDirectory.Register(expectedAddr, previousAddress: null).Returns(expectedAddr); @@ -389,6 +389,90 @@ public async Task UnregisterRemovesFromCacheFirst() Assert.False(this.grainLocator.TryLookupInCache(expectedAddr.GrainId, out _)); } + [Fact] + public async Task UnregisterRacesWithLookupSameId() + { + var expectedSilo = GenerateSiloAddress(); + + // Setup membership service + this.mockMembershipService.UpdateSiloStatus(expectedSilo, SiloStatus.Active, "exp"); + await this.lifecycle.OnStart(); + await WaitUntilClusterChangePropagated(); + + var expectedAddr = GenerateGrainAddress(expectedSilo); + + ManualResetEvent mre = new ManualResetEvent(false); + + // Give up control then Run forever + this.grainDirectory.Unregister(expectedAddr).Returns(async (t) => + { + await Task.Yield(); + mre.WaitOne(); + }); + + this.grainDirectory.Register(expectedAddr, previousAddress: null).Returns(expectedAddr); + + // Register to populate cache + await this.grainLocator.Register(expectedAddr, previousAddress: null); + + // Unregister and check if cache was cleaned + Task t = this.grainLocator.Unregister(expectedAddr, UnregistrationCause.Force); + Assert.False(this.grainLocator.TryLookupInCache(expectedAddr.GrainId, out _)); + + // Add back to cache simulating a race from lookup + await this.grainLocator.Register(expectedAddr, previousAddress: null); + Assert.True(this.grainLocator.TryLookupInCache(expectedAddr.GrainId, out _)); + + // Ensure when Unregister finishes if the race occured on the same id that it was removed + mre.Set(); + await t; + Assert.False(this.grainLocator.TryLookupInCache(expectedAddr.GrainId, out _)); + } + + [Fact] + public async Task UnregisterRacesWithLookupDifferentId() + { + var expectedSilo = GenerateSiloAddress(); + var secondSilo = GenerateSiloAddress(); + + // Setup membership service + this.mockMembershipService.UpdateSiloStatus(expectedSilo, SiloStatus.Active, "exp"); + this.mockMembershipService.UpdateSiloStatus(secondSilo, SiloStatus.Active, "exp"); + await this.lifecycle.OnStart(); + await WaitUntilClusterChangePropagated(); + + var expectedAddr = GenerateGrainAddress(expectedSilo); + var secondAddr = GenerateGrainAddress(secondSilo); + + ManualResetEvent mre = new ManualResetEvent(false); + + // Give up control then Run forever + this.grainDirectory.Unregister(expectedAddr).Returns(async (t) => + { + await Task.Yield(); + mre.WaitOne(); + }); + + this.grainDirectory.Register(expectedAddr, previousAddress: null).Returns(expectedAddr); + this.grainDirectory.Register(secondAddr, previousAddress: null).Returns(secondAddr); + + // Register to populate cache + await this.grainLocator.Register(expectedAddr, previousAddress: null); + + // Unregister and check if cache was cleaned + Task t = this.grainLocator.Unregister(expectedAddr, UnregistrationCause.Force); + Assert.False(this.grainLocator.TryLookupInCache(expectedAddr.GrainId, out _)); + + // Add back to cache simulating a race from lookup + await this.grainLocator.Register(secondAddr, previousAddress: null); + Assert.True(this.grainLocator.TryLookupInCache(secondAddr.GrainId, out _)); + + // Ensure when Unregister finishes if the race occured on the same id that it was removed + mre.Set(); + await t; + Assert.True(this.grainLocator.TryLookupInCache(secondAddr.GrainId, out _)); + } + private GrainAddress GenerateGrainAddress(SiloAddress siloAddress = null) { return new GrainAddress From 0bbed12b87458f4490cff52f903d65cef301c3c0 Mon Sep 17 00:00:00 2001 From: John Morman Date: Fri, 4 Aug 2023 06:52:55 -0700 Subject: [PATCH 3/3] Add comment for race condition --- src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs b/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs index aabbdc9da3..5d28a003b4 100644 --- a/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs +++ b/src/Orleans.Runtime/GrainDirectory/CachedGrainLocator.cs @@ -116,6 +116,7 @@ public async Task Unregister(GrainAddress address, UnregistrationCause cause) // Remove from grain directory which may take significantly longer await GetGrainDirectory(address.GrainId.Type).Unregister(address); + // There is the potential for a lookup to race with the Unregister and add the bad entry back to the cache. if (this.cache.LookUp(address.GrainId, out var entry, out _) && entry == address) { this.cache.Remove(address);