Skip to content

Commit

Permalink
Add ProcessorIdCache from .NET internals
Browse files Browse the repository at this point in the history
  • Loading branch information
buybackoff committed Jan 28, 2020
1 parent 3636804 commit 1402fc3
Show file tree
Hide file tree
Showing 12 changed files with 231 additions and 13 deletions.
3 changes: 3 additions & 0 deletions dotnet/.editorconfig
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
[*]

resharper_csharp_keep_existing_attribute_arrangement = true
1 change: 1 addition & 0 deletions dotnet/Spreads.Native.sln
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "Solution Items", "Solution
..\README.md = ..\README.md
build\Spreads.props = build\Spreads.props
test.bat = test.bat
.editorconfig = .editorconfig
EndProjectSection
EndProject
Project("{2150E333-8FDC-42A3-9474-1A3956D46DE8}") = "tests", "tests", "{ED8079DD-2B06-4030-9F0F-DC548F98E1C4}"
Expand Down
Binary file modified dotnet/lib/runtimes/linux-x64/native/libspreads_native.so
Binary file not shown.
Binary file modified dotnet/lib/runtimes/win-x64/native/libspreads_native.dll
Binary file not shown.
Binary file modified dotnet/lib/runtimes/win-x64/native/libspreads_native.x86.dll
Binary file not shown.
Binary file modified dotnet/lib/runtimes/win-x86/native/libspreads_native.dll
Binary file not shown.
4 changes: 2 additions & 2 deletions dotnet/src/Spreads.Native/Compression.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ namespace Spreads.Native
[SuppressUnmanagedCodeSecurity]
public unsafe class Compression
{
internal const string NativeLibraryName = "libspreads_native";

private const string NativeLibraryName = UnsafeEx.NativeLibraryName;
internal static IntPtr compress_copy_ptr = UnsafeEx.CopyCompressMethod();
internal static IntPtr decompress_copy_ptr = UnsafeEx.CopyDecompressMethod();

Expand Down
181 changes: 181 additions & 0 deletions dotnet/src/Spreads.Native/ProcessorIdCache.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;

namespace Spreads.Native
{
/// <summary>
/// Per-thread cache of the id of the processor the calling thread is running on.
/// The id is obtained from the native <c>spreads_get_cpu_number</c> export and is
/// re-queried only after a calibrated number of cached reads, which amortizes the
/// cost of the native call.
/// </summary>
public static class ProcessorIdCache
{
    private const string NativeLibraryName = UnsafeEx.NativeLibraryName;

    // The upper bits of t_currentProcessorIdCache are the currentProcessorId. The lower bits of
    // the t_currentProcessorIdCache are counting down to get it periodically refreshed.
    // TODO: Consider flushing the currentProcessorIdCache on Wait operations or similar
    // actions that are likely to result in changing the executing core
    [ThreadStatic]
    private static int t_currentProcessorIdCache;

    private const int ProcessorIdCacheShift = 16;

    private const int ProcessorIdCacheCountDownMask = (1 << ProcessorIdCacheShift) - 1;

    // Refresh rate of the cache. Derived once, in the static constructor, from a speed check
    // of the native processor-number call.
    private static int s_processorIdRefreshRate;

    // We will not adjust higher than this though.
    private const int MaxIdRefreshRate = 5000;

    // Calibrates s_processorIdRefreshRate before the first use of the type. Without this call
    // the refresh rate stays at its default of 0, the count-down stored by
    // RefreshCurrentProcessorId is always 0, and every GetCurrentProcessorId call goes to the
    // native function - i.e. nothing is ever served from the cache.
    // The speed check cannot throw: get_cpu_number swallows interop failures and the rest is
    // Stopwatch reads and arithmetic, so type initialization cannot fail here.
    static ProcessorIdCache()
    {
        ProcessorNumberSpeedCheck();
    }

    [DllImport(NativeLibraryName, EntryPoint = "spreads_get_cpu_number",
        CallingConvention = CallingConvention.Cdecl)]
    private static extern int spreads_get_cpu_number();

    /// <summary>
    /// Calls the native <c>spreads_get_cpu_number</c> export directly, bypassing the cache.
    /// </summary>
    /// <returns>
    /// The value returned by the native call, or -1 if the call throws
    /// (for example because the native library or the entry point cannot be loaded).
    /// </returns>
    public static int get_cpu_number()
    {
        try
        {
            return spreads_get_cpu_number();
        }
        catch
        {
            // Deliberate best effort: any failure of the native call is reported as
            // "not supported" (-1) and callers fall back to another id source.
            return -1;
        }
    }

    // Queries the processor id, stores it in the thread-static cache together with a fresh
    // count-down, and returns the id that was just obtained.
    private static int RefreshCurrentProcessorId()
    {
        int currentProcessorId = get_cpu_number();

        // get_cpu_number() returns -1 when the native call is not supported or fails.
        // As a fallback in that case and to spread the threads across the buckets
        // by default, we use the current managed thread ID as a proxy.
        if (currentProcessorId < 0)
        {
            currentProcessorId = Environment.CurrentManagedThreadId;
        }

        Debug.Assert(s_processorIdRefreshRate <= ProcessorIdCacheCountDownMask);

        // Mask with int.MaxValue to ensure the execution Id is not negative
        t_currentProcessorIdCache = ((currentProcessorId << ProcessorIdCacheShift) & int.MaxValue) |
                                    s_processorIdRefreshRate;

        return currentProcessorId;
    }

    /// <summary>
    /// Returns the processor id cached for the current thread, refreshing it from the
    /// native call whenever the per-thread count-down has reached zero.
    /// </summary>
    /// <returns>
    /// A cached processor id, or the current managed thread id when the native call
    /// reports that the processor number is unavailable. The value may be stale
    /// between refreshes.
    /// </returns>
    [MethodImpl(MethodImplOptions.AggressiveInlining)]
    public static int GetCurrentProcessorId()
    {
        // Post-decrement: the value read here still carries the count-down before this call.
        int currentProcessorIdCache = t_currentProcessorIdCache--;
        if ((currentProcessorIdCache & ProcessorIdCacheCountDownMask) == 0)
        {
            return RefreshCurrentProcessorId();
        }

        return currentProcessorIdCache >> ProcessorIdCacheShift;
    }

    // If the native processor-number call takes any nontrivial time (compared to TLS access), return false.
    // Check more than once - to make sure it was not because TLS was delayed by GC or a context switch.
    // Side effect: sets s_processorIdRefreshRate.
    private static bool ProcessorNumberSpeedCheck()
    {
        // NOTE: We do not check the frequency of the Stopwatch.
        //       The frequency often does not match the actual timer refresh rate anyways.
        //       If the resolution, precision or access time to the timer are inadequate for our measures here,
        //       the test will fail anyways.

        double minID = double.MaxValue;
        double minTLS = double.MaxValue;

        // warm up the code paths.
        UninlinedThreadStatic();
        // also check if API is actually functional (-1 means not supported)
        if (get_cpu_number() < 0)
        {
            // The fallback id (managed thread id) never changes, so refresh as rarely as the
            // count-down bits allow.
            s_processorIdRefreshRate = ProcessorIdCacheCountDownMask;
            return false;
        }

        long oneMicrosecond = Stopwatch.Frequency / 1000000 + 1;
        for (int i = 0; i < 10; i++)
        {
            // we will measure at least 16 iterations and at least 1 microsecond
            long t;
            int iters = 8;
            do
            {
                iters *= 2;
                t = Stopwatch.GetTimestamp();
                for (int j = 0; j < iters; j++)
                {
                    get_cpu_number();
                }

                t = Stopwatch.GetTimestamp() - t;
            } while (t < oneMicrosecond);

            minID = Math.Min(minID, (double) t / iters);

            // we will measure at least 1 microsecond,
            // and use at least 1/2 of ProcID iterations
            // we assume that TLS can't be more than 2x slower than ProcID
            iters /= 4;
            do
            {
                iters *= 2;
                t = Stopwatch.GetTimestamp();
                for (int j = 0; j < iters; j++)
                {
                    UninlinedThreadStatic();
                }

                t = Stopwatch.GetTimestamp() - t;
            } while (t < oneMicrosecond);

            minTLS = Math.Min(minTLS, (double) t / iters);
        }

        // A few words about choosing cache refresh rate:
        //
        // There are two reasons why data structures use core affinity:
        //  1) To improve locality - avoid running on one core and using data in other core's cache.
        //  2) To reduce sharing - avoid multiple threads using the same piece of data.
        //
        // Scenarios with large footprint, like striped caches, are sensitive to both parts. It is desirable to access
        // large data from the "right" core.
        // In scenarios where the state is small, like a striped counter, it is mostly about sharing.
        // Otherwise the state is small and occasionally moving counter to a different core via cache miss is not a big deal.
        //
        // In scenarios that care more about sharing precise results of GetCurrentProcessorNumber may not justify
        // the cost unless the underlying implementation is very cheap.
        // In such cases it is desirable to amortize the cost over multiple accesses by caching in a ThreadStatic.
        //
        // In addition to the data structure, the benefits also depend on use pattern and on concurrency level.
        // I.E. if an array pool user only rents array "just in case" but does not actually use it, and concurrency level is low,
        // a longer refresh would be beneficial since that could lower the API cost.
        // If array is actually used, then there is benefit from higher precision of the API and shorter refresh is more attractive.
        //
        // Overall we do not know the ideal refresh rate and using some kind of dynamic feedback is unlikely to be feasible.
        // Experiments have shown, however, that 5x amortization rate is a good enough balance between precision and cost of the API.
        s_processorIdRefreshRate = Math.Min((int) (minID * 5 / minTLS), MaxIdRefreshRate);

        // In a case if GetCurrentProcessorNumber is particularly fast, like it happens on platforms supporting RDPID instruction,
        // caching is not an improvement, thus it is desirable to bypass the cache entirely.
        // Such systems consistently derive the refresh rate at or below 2-3, while the next tier, RDTSCP based implementations result in ~10,
        // so we use "5" as a criteria to separate "fast" machines from the rest.
        return s_processorIdRefreshRate <= 5;
    }

    // NoInlining is to make sure JIT does not CSE and to have a better perf proxy for TLS access.
    // Measuring inlined ThreadStatic in a loop results in underestimates and unnecessary caching.
    [MethodImpl(MethodImplOptions.NoInlining)]
    private static int UninlinedThreadStatic()
    {
        return t_currentProcessorIdCache;
    }
}
}
2 changes: 2 additions & 0 deletions dotnet/src/Spreads.Native/UnsafeEx.cs
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ namespace Spreads.Native
/// <seealso cref="System.Runtime.CompilerServices.Unsafe"/>
public static unsafe class UnsafeEx
{
internal const string NativeLibraryName = "libspreads_native";

[EditorBrowsable(EditorBrowsableState.Never)]
[MethodImpl(MethodImplOptions.ForwardRef)]
// ReSharper disable once UnusedMember.Local
Expand Down
27 changes: 27 additions & 0 deletions dotnet/tests/Spreads.Native.Tests/ProcessorIdCacheTests.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
using System;
using System.Runtime.InteropServices;
using NUnit.Framework;

namespace Spreads.Native.Tests
{
/// <summary>
/// Smoke tests for <see cref="ProcessorIdCache"/>: the direct native call and the
/// cached accessor must both produce non-negative ids.
/// </summary>
[TestFixture]
public class ProcessorIdCacheTests
{
    /// <summary>
    /// Reads the processor id through the native call and through the cache and checks
    /// that neither is negative. The native value is only asserted on Windows and Linux;
    /// on other platforms it is logged but not checked.
    /// </summary>
    [Test]
    public void CouldGetCpuNumberFromCache()
    {
        var nativeCpuId = ProcessorIdCache.get_cpu_number();
        var cpuId = ProcessorIdCache.GetCurrentProcessorId();

        // Log both values before asserting so they are visible in the output of a failed run.
        Console.WriteLine($"native: {nativeCpuId}");
        Console.WriteLine($"cached: {cpuId}");

        if (RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
            || RuntimeInformation.IsOSPlatform(OSPlatform.Linux))
        {
            // Constraint-style assertion reports the actual value on failure.
            Assert.That(nativeCpuId, Is.GreaterThanOrEqualTo(0));
        }

        Assert.That(cpuId, Is.GreaterThanOrEqualTo(0));
    }
}
}
12 changes: 8 additions & 4 deletions rs/spreads-pal/src/cpu.rs
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
#[cfg(windows)]
pub fn get_cpu_number() -> libc::c_int {
#[no_mangle]
pub extern "C" fn spreads_get_cpu_number() -> libc::c_int {
unsafe {
return winapi::um::processthreadsapi::GetCurrentProcessorNumber() as i32;
}
}

#[cfg(any(linux, unix))]
pub fn get_cpu_number() -> libc::c_int {
#[no_mangle]
pub extern "C" fn spreads_get_cpu_number() -> libc::c_int {
unsafe {
return libc::sched_getcpu();
let cpu_id = libc::sched_getcpu();
return if cpu_id < 0 { -1 } else { cpu_id };
}
}

#[cfg(not(any(windows, linux, unix)))]
pub fn get_cpu_number() -> libc::c_int {
#[no_mangle]
pub extern "C" fn spreads_get_cpu_number() -> libc::c_int {
return -1;
}
14 changes: 7 additions & 7 deletions rs/spreads-pal/src/lib.rs
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
mod cpu;
pub mod cpu;

#[no_mangle]
pub extern "C" fn spreads_get_cpu_number() -> libc::c_int {
return cpu::get_cpu_number();
}
// #[no_mangle]
// pub extern "C" fn spreads_get_cpu_number() -> libc::c_int {
// return cpu::get_cpu_number();
// }

#[cfg(test)]
mod tests {
#[test]
#[cfg(any(windows,linux, unix))]
fn can_get_cpu_number_current() {
// TODO xplat set affinity
let result = super::spreads_get_cpu_number();
let result = super::cpu::spreads_get_cpu_number();
println!("CPU number: {}", result);
assert!(result >= 0);
}
Expand All @@ -33,7 +33,7 @@ mod tests {
std::io::Error::last_os_error()
);
}
let result = super::spreads_get_cpu_number();
let result = super::cpu::spreads_get_cpu_number();
println!("CPU number: {}", result);
assert_eq!(cpu_num as i32, result);
}
Expand Down

0 comments on commit 1402fc3

Please sign in to comment.