-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add ProcessorIdCache from .NET internals
- Loading branch information
1 parent
3636804
commit 1402fc3
Showing
12 changed files
with
231 additions
and
13 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
[*] | ||
|
||
resharper_csharp_keep_existing_attribute_arrangement = true |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,181 @@ | ||
// Licensed to the .NET Foundation under one or more agreements. | ||
// The .NET Foundation licenses this file to you under the MIT license. | ||
// See the LICENSE file in the project root for more information. | ||
|
||
using System; | ||
using System.Diagnostics; | ||
using System.Runtime.CompilerServices; | ||
using System.Runtime.InteropServices; | ||
|
||
namespace Spreads.Native | ||
{ | ||
public static class ProcessorIdCache | ||
{ | ||
private const string NativeLibraryName = UnsafeEx.NativeLibraryName; | ||
|
||
[DllImport(NativeLibraryName, EntryPoint = "spreads_get_cpu_number", | ||
CallingConvention = CallingConvention.Cdecl)] | ||
private static extern int spreads_get_cpu_number(); | ||
|
||
public static int get_cpu_number() | ||
{ | ||
try | ||
{ | ||
return spreads_get_cpu_number(); | ||
} | ||
catch | ||
{ | ||
return -1; | ||
} | ||
} | ||
|
||
|
||
// The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of | ||
// the t_currentProcessorIdCache are counting down to get it periodically refreshed. | ||
// TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar | ||
// actions that are likely to result in changing the executing core | ||
[ThreadStatic] | ||
private static int t_currentProcessorIdCache; | ||
|
||
private const int ProcessorIdCacheShift = 16; | ||
|
||
private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1; | ||
|
||
// Refresh rate of the cache. Will be derived from a speed check of GetCurrentProcessorNumber API. | ||
private static int s_processorIdRefreshRate; | ||
|
||
// We will not adjust higher than this though. | ||
private const int MaxIdRefreshRate = 5000; | ||
|
||
private static int RefreshCurrentProcessorId() | ||
{ | ||
int currentProcessorId = get_cpu_number(); | ||
|
||
// On Unix, GetCurrentProcessorNumber() is implemented in terms of sched_getcpu, which | ||
// doesn't exist on all platforms. On those it doesn't exist on, GetCurrentProcessorNumber() | ||
// returns -1. As a fallback in that case and to spread the threads across the buckets | ||
// by default, we use the current managed thread ID as a proxy. | ||
if (currentProcessorId < 0) | ||
currentProcessorId = Environment.CurrentManagedThreadId; | ||
|
||
Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask); | ||
|
||
// Mask with int.MaxValue to ensure the execution Id is not negative | ||
t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) | | ||
s_processorIdRefreshRate; | ||
|
||
return currentProcessorId; | ||
} | ||
|
||
[MethodImpl(MethodImplOptions.AggressiveInlining)] | ||
public static int GetCurrentProcessorId() | ||
{ | ||
int currentProcessorIdCache = t_currentProcessorIdCache--; | ||
if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0) | ||
{ | ||
return RefreshCurrentProcessorId(); | ||
} | ||
|
||
return currentProcessorIdCache >> ProcessorIdCacheShift; | ||
} | ||
|
||
// If GetCurrentProcessorNumber takes any nontrivial time (compared to TLS access), return false. | ||
// Check more than once - to make sure it was not because TLS was delayed by GC or a context switch. | ||
private static bool ProcessorNumberSpeedCheck() | ||
{ | ||
// NOTE: We do not check the frequency of the Stopwatch. | ||
// The frequency often does not match the actual timer refresh rate anyways. | ||
// If the resolution, precision or access time to the timer are inadequate for our measures here, | ||
// the test will fail anyways. | ||
|
||
double minID = double.MaxValue; | ||
double minTLS = double.MaxValue; | ||
|
||
// warm up the code paths. | ||
UninlinedThreadStatic(); | ||
// also check if API is actually functional (-1 means not supported) | ||
if (get_cpu_number() < 0) | ||
{ | ||
s_processorIdRefreshRate = ProcessorIdCacheCountDownMask; | ||
return false; | ||
} | ||
|
||
long oneMicrosecond = Stopwatch.Frequency / 1000000 + 1; | ||
for (int i = 0; i < 10; i++) | ||
{ | ||
// we will measure at least 16 iterations and at least 1 microsecond | ||
long t; | ||
int iters = 8; | ||
do | ||
{ | ||
iters *= 2; | ||
t = Stopwatch.GetTimestamp(); | ||
for (int j = 0; j < iters; j++) | ||
{ | ||
get_cpu_number(); | ||
} | ||
|
||
t = Stopwatch.GetTimestamp() - t; | ||
} while (t < oneMicrosecond); | ||
|
||
minID = Math.Min(minID, (double) t / iters); | ||
|
||
// we will measure at least 1 microsecond, | ||
// and use at least 1/2 of ProcID iterations | ||
// we assume that TLS can't be more than 2x slower than ProcID | ||
iters /= 4; | ||
do | ||
{ | ||
iters *= 2; | ||
t = Stopwatch.GetTimestamp(); | ||
for (int j = 0; j < iters; j++) | ||
{ | ||
UninlinedThreadStatic(); | ||
} | ||
|
||
t = Stopwatch.GetTimestamp() - t; | ||
} while (t < oneMicrosecond); | ||
|
||
minTLS = Math.Min(minTLS, (double) t / iters); | ||
} | ||
|
||
// A few words about choosing cache refresh rate: | ||
// | ||
// There are too reasons why data structures use core affinity: | ||
// 1) To improve locality - avoid running on one core and using data in other core's cache. | ||
// 2) To reduce sharing - avoid multiple threads using the same piece of data. | ||
// | ||
// Scenarios with large footprint, like striped caches, are sensitive to both parts. It is desirable to access | ||
// large data from the "right" core. | ||
// In scenarios where the state is small, like a striped counter, it is mostly about sharing. | ||
// Otherwise the state is small and occasionally moving counter to a different core via cache miss is not a big deal. | ||
// | ||
// In scenarios that care more about sharing precise results of GetCurrentProcessorNumber may not justify | ||
// the cost unless the underlying implementation is very cheap. | ||
// In such cases it is desirable to amortize the cost over multiple accesses by caching in a ThreadStatic. | ||
// | ||
// In addition to the data structure, the benefits also depend on use pattern and on concurrency level. | ||
// I.E. if an array pool user only rents array "just in case" but does not actually use it, and concurrency level is low, | ||
// a longer refresh would be beneficial since that could lower the API cost. | ||
// If array is actually used, then there is benefit from higher precision of the API and shorter refresh is more attractive. | ||
// | ||
// Overall we do not know the ideal refresh rate and using some kind of dynamic feedback is unlikely to be feasible. | ||
// Experiments have shown, however, that 5x amortization rate is a good enough balance between precision and cost of the API. | ||
s_processorIdRefreshRate = Math.Min((int) (minID * 5 / minTLS), MaxIdRefreshRate); | ||
|
||
// In a case if GetCurrentProcessorNumber is particularly fast, like it happens on platforms supporting RDPID instruction, | ||
// caching is not an improvement, thus it is desirable to bypass the cache entirely. | ||
// Such systems consistently derive the refresh rate at or below 2-3, while the next tier, RDTSCP based implementations result in ~10, | ||
// so we use "5" as a criteria to separate "fast" machines from the rest. | ||
return s_processorIdRefreshRate <= 5; | ||
} | ||
|
||
// NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access. | ||
// Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching. | ||
[MethodImpl(MethodImplOptions.NoInlining)] | ||
private static int UninlinedThreadStatic() | ||
{ | ||
return t_currentProcessorIdCache; | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
27 changes: 27 additions & 0 deletions
27
dotnet/tests/Spreads.Native.Tests/ProcessorIdCacheTests.cs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,27 @@ | ||
using System; | ||
using System.Runtime.InteropServices; | ||
using NUnit.Framework; | ||
|
||
namespace Spreads.Native.Tests | ||
{ | ||
[TestFixture] | ||
public class ProcessorIdCacheTests | ||
{ | ||
[Test] | ||
public void CouldGetCpuNumberFromCache() | ||
{ | ||
var nativeCpuId = ProcessorIdCache.get_cpu_number(); | ||
Console.WriteLine($"native: {nativeCpuId}"); | ||
|
||
var cpuId = ProcessorIdCache.GetCurrentProcessorId(); | ||
if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows) | ||
|| RuntimeInformation.IsOSPlatform(OSPlatform.Linux)) | ||
{ | ||
Assert.IsTrue(nativeCpuId >= 0); | ||
} | ||
|
||
Assert.IsTrue(cpuId >= 0); | ||
Console.WriteLine($"cached: {cpuId}"); | ||
} | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,18 +1,22 @@ | ||
#[cfg(windows)] | ||
pub fn get_cpu_number() -> libc::c_int { | ||
#[no_mangle] | ||
pub extern "C" fn spreads_get_cpu_number() -> libc::c_int { | ||
unsafe { | ||
return winapi::um::processthreadsapi::GetCurrentProcessorNumber() as i32; | ||
} | ||
} | ||
|
||
#[cfg(any(linux, unix))] | ||
pub fn get_cpu_number() -> libc::c_int { | ||
#[no_mangle] | ||
pub extern "C" fn spreads_get_cpu_number() -> libc::c_int { | ||
unsafe { | ||
return libc::sched_getcpu(); | ||
let cpu_id = libc::sched_getcpu(); | ||
return if cpu_id < 0 { -1 } else { cpu_id }; | ||
} | ||
} | ||
|
||
#[cfg(not(any(windows, linux, unix)))] | ||
pub fn get_cpu_number() -> libc::c_int { | ||
#[no_mangle] | ||
pub extern "C" fn spreads_get_cpu_number() -> libc::c_int { | ||
return -1; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters