From ef16a17e0cded996449add464af4904c16edb53f Mon Sep 17 00:00:00 2001
From: Tomas Grosup
Date: Tue, 14 Mar 2023 14:50:12 +0100
Subject: [PATCH 1/2] Array.Parallel.groupBy added

---
 src/FSharp.Core/array.fs                      | 131 ++++++++++++++++++
 src/FSharp.Core/array.fsi                     |  27 ++++
 .../ArrayModule.fs                            |  71 ++++++++++
 3 files changed, 229 insertions(+)

diff --git a/src/FSharp.Core/array.fs b/src/FSharp.Core/array.fs
index 8c2fa8470fa..5d493207c36 100644
--- a/src/FSharp.Core/array.fs
+++ b/src/FSharp.Core/array.fs
@@ -1932,7 +1932,9 @@ module Array =
         result
 
     module Parallel =
+        open System.Threading
         open System.Threading.Tasks
+        open System.Collections.Concurrent
 
         [<CompiledName("Choose")>]
         let choose chooser (array: 'T[]) =
@@ -2014,6 +2016,135 @@ module Array =
 
             result
 
+        // The following two parameters were benchmarked and found to be optimal.
+        // Benchmark was run using: 11th Gen Intel Core i9-11950H 2.60GHz, 1 CPU, 16 logical and 8 physical cores
+        let private maxPartitions = Environment.ProcessorCount // The maximum number of partitions to use
+        let private minChunkSize = 256 // The minimum input size that is worth splitting into several chunks
+
+        /// Splits the first maxIdxExclusive elements of the array into contiguous ArraySegment chunks.
+        /// Inputs shorter than minChunkSize yield a single chunk; otherwise the chunk size is
+        /// ceil(maxIdxExclusive / maxPartitions), so at most maxPartitions chunks are produced.
+        let private createPartitionsUpTo maxIdxExclusive (array: 'T[]) =
+            [|
+                let chunkSize =
+                    match maxIdxExclusive with
+                    | smallSize when smallSize < minChunkSize -> smallSize
+                    | biggerSize when biggerSize % maxPartitions = 0 -> biggerSize / maxPartitions
+                    | biggerSize -> (biggerSize / maxPartitions) + 1
+
+                let mutable offset = 0
+
+                while (offset + chunkSize) < maxIdxExclusive do
+                    yield new ArraySegment<'T>(array, offset, chunkSize)
+                    offset <- offset + chunkSize
+
+                yield new ArraySegment<'T>(array, offset, maxIdxExclusive - offset)
+            |]
+
+        /// Two-pass parallel grouping. Pass 1 projects every element to its key (cached in projectedValues)
+        /// and counts the members of each key; pass 2 writes every element into its group's pre-sized array.
+        /// getKey converts the internal 'SafeKey back into the 'Key that is exposed in the result.
+        let inline groupByImplParallel
+            (comparer: IEqualityComparer<'SafeKey>)
+            ([<InlineIfLambda>] keyf: 'T -> 'SafeKey)
+            ([<InlineIfLambda>] getKey: 'SafeKey -> 'Key)
+            (array: 'T[])
+            =
+            let counts =
+                new ConcurrentDictionary<_, _>(
+                    concurrencyLevel = maxPartitions,
+                    capacity = Operators.min (array.Length) 1_000,
+                    comparer = comparer
+                )
+
+            let valueFactory = new Func<_, _>(fun _ -> ref 0)
+
+            let projectedValues =
+                Microsoft.FSharp.Primitives.Basics.Array.zeroCreateUnchecked array.Length
+
+            let inputChunks = createPartitionsUpTo array.Length array
+
+            Parallel.For(
+                0,
+                inputChunks.Length,
+                fun chunkIdx ->
+                    let chunk = inputChunks[chunkIdx]
+
+                    for elemIdx = chunk.Offset to (chunk.Offset + chunk.Count - 1) do
+                        let projected = keyf array[elemIdx]
+                        projectedValues[elemIdx] <- projected
+                        // valueFactory may run more than once under contention, but GetOrAdd returns the one stored counter
+                        let counter = counts.GetOrAdd(projected, valueFactory = valueFactory)
+                        Interlocked.Increment(counter) |> ignore
+            )
+            |> ignore
+
+            let finalResults =
+                Microsoft.FSharp.Primitives.Basics.Array.zeroCreateUnchecked counts.Count
+
+            let mutable finalIdx = 0
+
+            let finalResultsLookup =
+                new Dictionary<'SafeKey, int ref * 'T[]>(capacity = counts.Count, comparer = comparer)
+
+            for kvp in counts do
+                let arrayForThisGroup =
+                    Microsoft.FSharp.Primitives.Basics.Array.zeroCreateUnchecked kvp.Value.Value
+
+                finalResults.[finalIdx] <- getKey kvp.Key, arrayForThisGroup
+                finalResultsLookup[kvp.Key] <- kvp.Value, arrayForThisGroup
+                finalIdx <- finalIdx + 1
+
+            Parallel.For(
+                0,
+                inputChunks.Length,
+                fun chunkIdx ->
+                    let chunk = inputChunks[chunkIdx]
+
+                    for elemIdx = chunk.Offset to (chunk.Offset + chunk.Count - 1) do
+                        let key = projectedValues[elemIdx]
+                        let (counter, arrayForThisGroup) = finalResultsLookup[key]
+                        // The counter starts at the group size, so every decrement claims a distinct slot in the group array
+                        let idxToWrite = Interlocked.Decrement(counter)
+                        arrayForThisGroup[idxToWrite] <- array[elemIdx]
+            )
+            |> ignore
+
+            finalResults
+
+        let groupByValueTypeParallel (keyf: 'T -> 'Key) (array: 'T[]) =
+            // Is it a bad idea to put floating points as keys for grouping? Yes
+            // But would the implementation fail with KeyNotFound "nan" if we just leave it? Also yes
+            // Here we enforce nan=nan equality to prevent throwing
+            if typeof<'Key> = typeof<float> || typeof<'Key> = typeof<float32> then
+                let genericCmp =
+                    HashIdentity.FromFunctions<'Key>
+                        (LanguagePrimitives.GenericHash)
+                        (LanguagePrimitives.GenericEqualityER)
+
+                groupByImplParallel genericCmp keyf id array
+            else
+                groupByImplParallel HashIdentity.Structural<'Key> keyf id array
+
+        // Just like in regular Array.groupBy: Wrap a StructBox around all keys in order to avoid nulls
+        // (dotnet doesn't allow null keys in dictionaries)
+        let groupByRefTypeParallel (keyf: 'T -> 'Key) (array: 'T[]) =
+            groupByImplParallel
+                RuntimeHelpers.StructBox<'Key>.Comparer
+                (fun t -> RuntimeHelpers.StructBox(keyf t))
+                (fun sb -> sb.Value)
+                array
+
+        [<CompiledName("GroupBy")>]
+        let groupBy (projection: 'T -> 'Key) (array: 'T[]) =
+            checkNonNull "array" array
+
+            // Value-type keys can never be null; reference-type keys get wrapped in a StructBox so null keys work
+            if typeof<'Key>.IsValueType then
+                groupByValueTypeParallel projection array
+            else
+                groupByRefTypeParallel projection array
+
         [<CompiledName("Iterate")>]
         let iter action (array: 'T[]) =
             checkNonNull "array" array
diff --git a/src/FSharp.Core/array.fsi b/src/FSharp.Core/array.fsi
index 2d08d1116e1..fa7c8ccbf76 100644
--- a/src/FSharp.Core/array.fsi
+++ b/src/FSharp.Core/array.fsi
@@ -3211,6 +3211,33 @@ module Array =
         [<CompiledName("MapIndexed")>]
         val mapi: mapping:(int -> 'T -> 'U) -> array:'T[] -> 'U[]
 
+        /// <summary>Applies a key-generating function to each element of an array in parallel and yields an array of
+        /// unique keys. Each unique key contains an array of all elements that match
+        /// to this key.</summary>
+        ///
+        /// <remarks>Performs the operation in parallel using <see cref="M:System.Threading.Tasks.Parallel.For" />.
+        /// The order in which the given function is applied to elements of the input array is not specified.
+        /// The order of the keys and values in the result is also not specified.</remarks>
+
+        /// <param name="projection">A function that transforms an element of the array into a comparable key.</param>
+        /// <param name="array">The input array.</param>
+        ///
+        /// <returns>The result array.</returns>
+        ///
+        /// <exception cref="T:System.ArgumentNullException">Thrown when the input array is null.</exception>
+ /// + /// + /// + /// let inputs = [| 1; 2; 3; 4; 5 |] + /// + /// inputs |> Array.Parallel.groupBy (fun n -> n % 2) + /// + /// Evaluates to [| (1, [| 1; 3; 5 |]); (0, [| 2; 4 |]) |] + /// + [] + [] + val groupBy: projection:('T -> 'Key) -> array:'T[] -> ('Key * 'T[])[] when 'Key : equality + /// Apply the given function to each element of the array. /// /// Performs the operation in parallel using . diff --git a/tests/FSharp.Core.UnitTests/FSharp.Core/Microsoft.FSharp.Collections/ArrayModule.fs b/tests/FSharp.Core.UnitTests/FSharp.Core/Microsoft.FSharp.Collections/ArrayModule.fs index 3d9185468d9..705c05fea52 100644 --- a/tests/FSharp.Core.UnitTests/FSharp.Core/Microsoft.FSharp.Collections/ArrayModule.fs +++ b/tests/FSharp.Core.UnitTests/FSharp.Core/Microsoft.FSharp.Collections/ArrayModule.fs @@ -1237,6 +1237,77 @@ type ArrayModule() = CheckThrowsArgumentNullException(fun () -> Array.groupBy funcInt (null : int array) |> ignore) () + [] + member _.ParallelGroupBy() = + + let assertEqualityOfGroupings opName (seqGroup: ('TKey * 'TVal[])[]) (paraGroup: ('TKey * 'TVal[])[]) = + seqGroup |> Array.sortInPlaceBy fst + paraGroup |> Array.sortInPlaceBy fst + + seqGroup |> Array.iter (snd >> Array.sortInPlace) + paraGroup |> Array.iter (snd >> Array.sortInPlace) + + if seqGroup.Length <> paraGroup.Length then + Assert.Fail($"{opName} produced different lengths of results. Seq={seqGroup.Length};Para={paraGroup.Length}.") + + let seqKeys = seqGroup |> Array.map fst + let paraKeys = paraGroup |> Array.map fst + if(seqKeys <> paraKeys) then + Assert.Fail($"{opName} produced different keys. Seq=%A{seqKeys};Para=%A{paraKeys}.") + + Array.zip seqGroup paraGroup + |> Array.iter (fun ((seqKey,seqGroup), (paraKey,paraGroup)) -> + Assert.AreEqual(seqKey,paraKey,opName) + if seqGroup <> paraGroup then + Assert.Fail($"{opName} produced different results for key={seqKey}. 
Seq=%A{seqGroup};Para=%A{paraGroup}.")) + + Assert.True((seqGroup=paraGroup), $"{opName} produced different results. Seq=%A{seqGroup};Para=%A{paraGroup}.") + + + let compareAndAssert opName array projection = + let seqGroup = array |> Array.groupBy projection + let paraGroup = array |> Array.Parallel.groupBy projection + assertEqualityOfGroupings opName seqGroup paraGroup + + // int array + let funcInt x = x%5 + let IntArray = [| 0 .. 250 |] + compareAndAssert "Int grouping" IntArray funcInt + + + // string array + let funcStr (x:string) = x.Length + let strArray = Array.init 177 (fun i -> string i) + compareAndAssert "String grouping" strArray funcStr + + + // Empty array + compareAndAssert "Empty group" [||] funcInt + + // Reference key which can be null + let sampleStringsCanBeNull = [|"a";null;"abc";String.Empty|] + let pickStringByIdx idx = sampleStringsCanBeNull[idx % sampleStringsCanBeNull.Length] + compareAndAssert "Key can be null" IntArray pickStringByIdx + + //String array w/ null keys and values + let strArray = Array.init 222 (fun i -> if i%3=0 then String.Empty else null ) + compareAndAssert "String grouping w/ nulls" strArray id + + // Keys being special floats. Array.groupBy does not work here, we test results manually + let specialFloats = [|infinity; -infinity;-0.0; 0.0; 1.0; -1.0; -0.0/0.0; -nan|] + let pickSpecialFloatByIdx idx = specialFloats[idx % specialFloats.Length] + + let paraGroup = IntArray |> Array.Parallel.groupBy pickSpecialFloatByIdx + Assert.AreEqual(6, paraGroup.Length, "There should be 6 special floats!") + let (nan,nansGroup) = paraGroup |> Array.find (fun (k,_) -> Double.IsNaN(k)) + // Both -0.0/0.0; -nan are a Nan. 
=> 2/8 => every 4th element goes to the NaN bucket + Assert.AreEqual((IntArray.Length / 4), nansGroup.Length, $"There should be {(IntArray.Length / 4)} NaNs!") + + + + CheckThrowsArgumentNullException(fun () -> Array.Parallel.groupBy funcInt (null : int array) |> ignore) + () + member private this.InitTester initInt initString = // integer array let resultInt : int[] = initInt 3 (fun x -> x + 3) From d29b252719ef6f8a547e7b2a30ab1226522d4394 Mon Sep 17 00:00:00 2001 From: Tomas Grosup Date: Tue, 14 Mar 2023 15:09:20 +0100 Subject: [PATCH 2/2] surface area updated --- .../FSharp.Core.SurfaceArea.netstandard20.debug.bsl | 1 + .../FSharp.Core.SurfaceArea.netstandard20.release.bsl | 1 + .../FSharp.Core.SurfaceArea.netstandard21.debug.bsl | 1 + .../FSharp.Core.SurfaceArea.netstandard21.release.bsl | 1 + 4 files changed, 4 insertions(+) diff --git a/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard20.debug.bsl b/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard20.debug.bsl index 348c413de1b..44e13a1ea3a 100644 --- a/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard20.debug.bsl +++ b/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard20.debug.bsl @@ -40,6 +40,7 @@ Microsoft.FSharp.Collections.Array4DModule: T[,,,] Create[T](Int32, Int32, Int32 Microsoft.FSharp.Collections.Array4DModule: T[,,,] Initialize[T](Int32, Int32, Int32, Int32, Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,T]]]]) Microsoft.FSharp.Collections.Array4DModule: T[,,,] ZeroCreate[T](Int32, Int32, Int32, Int32) Microsoft.FSharp.Collections.Array4DModule: Void Set[T](T[,,,], Int32, Int32, Int32, Int32, T) +Microsoft.FSharp.Collections.ArrayModule+Parallel: System.Tuple`2[TKey,T[]][] GroupBy[T,TKey](Microsoft.FSharp.Core.FSharpFunc`2[T,TKey], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: 
System.Tuple`2[T[],T[]] Partition[T](Microsoft.FSharp.Core.FSharpFunc`2[T,System.Boolean], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: TResult[] Choose[T,TResult](Microsoft.FSharp.Core.FSharpFunc`2[T,Microsoft.FSharp.Core.FSharpOption`1[TResult]], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: TResult[] Collect[T,TResult](Microsoft.FSharp.Core.FSharpFunc`2[T,TResult[]], T[]) diff --git a/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard20.release.bsl b/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard20.release.bsl index 2d5ddb40c7a..a29ee6615ca 100644 --- a/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard20.release.bsl +++ b/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard20.release.bsl @@ -40,6 +40,7 @@ Microsoft.FSharp.Collections.Array4DModule: T[,,,] Create[T](Int32, Int32, Int32 Microsoft.FSharp.Collections.Array4DModule: T[,,,] Initialize[T](Int32, Int32, Int32, Int32, Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,T]]]]) Microsoft.FSharp.Collections.Array4DModule: T[,,,] ZeroCreate[T](Int32, Int32, Int32, Int32) Microsoft.FSharp.Collections.Array4DModule: Void Set[T](T[,,,], Int32, Int32, Int32, Int32, T) +Microsoft.FSharp.Collections.ArrayModule+Parallel: System.Tuple`2[TKey,T[]][] GroupBy[T,TKey](Microsoft.FSharp.Core.FSharpFunc`2[T,TKey], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: System.Tuple`2[T[],T[]] Partition[T](Microsoft.FSharp.Core.FSharpFunc`2[T,System.Boolean], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: TResult[] Choose[T,TResult](Microsoft.FSharp.Core.FSharpFunc`2[T,Microsoft.FSharp.Core.FSharpOption`1[TResult]], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: TResult[] Collect[T,TResult](Microsoft.FSharp.Core.FSharpFunc`2[T,TResult[]], T[]) diff --git 
a/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard21.debug.bsl b/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard21.debug.bsl index 45805d53ae0..94cfec50f39 100644 --- a/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard21.debug.bsl +++ b/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard21.debug.bsl @@ -40,6 +40,7 @@ Microsoft.FSharp.Collections.Array4DModule: T[,,,] Create[T](Int32, Int32, Int32 Microsoft.FSharp.Collections.Array4DModule: T[,,,] Initialize[T](Int32, Int32, Int32, Int32, Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,T]]]]) Microsoft.FSharp.Collections.Array4DModule: T[,,,] ZeroCreate[T](Int32, Int32, Int32, Int32) Microsoft.FSharp.Collections.Array4DModule: Void Set[T](T[,,,], Int32, Int32, Int32, Int32, T) +Microsoft.FSharp.Collections.ArrayModule+Parallel: System.Tuple`2[TKey,T[]][] GroupBy[T,TKey](Microsoft.FSharp.Core.FSharpFunc`2[T,TKey], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: System.Tuple`2[T[],T[]] Partition[T](Microsoft.FSharp.Core.FSharpFunc`2[T,System.Boolean], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: TResult[] Choose[T,TResult](Microsoft.FSharp.Core.FSharpFunc`2[T,Microsoft.FSharp.Core.FSharpOption`1[TResult]], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: TResult[] Collect[T,TResult](Microsoft.FSharp.Core.FSharpFunc`2[T,TResult[]], T[]) diff --git a/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard21.release.bsl b/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard21.release.bsl index 275087e02e0..3d12e057a97 100644 --- a/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard21.release.bsl +++ b/tests/FSharp.Core.UnitTests/FSharp.Core.SurfaceArea.netstandard21.release.bsl @@ -40,6 +40,7 @@ Microsoft.FSharp.Collections.Array4DModule: T[,,,] Create[T](Int32, 
Int32, Int32 Microsoft.FSharp.Collections.Array4DModule: T[,,,] Initialize[T](Int32, Int32, Int32, Int32, Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,Microsoft.FSharp.Core.FSharpFunc`2[System.Int32,T]]]]) Microsoft.FSharp.Collections.Array4DModule: T[,,,] ZeroCreate[T](Int32, Int32, Int32, Int32) Microsoft.FSharp.Collections.Array4DModule: Void Set[T](T[,,,], Int32, Int32, Int32, Int32, T) +Microsoft.FSharp.Collections.ArrayModule+Parallel: System.Tuple`2[TKey,T[]][] GroupBy[T,TKey](Microsoft.FSharp.Core.FSharpFunc`2[T,TKey], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: System.Tuple`2[T[],T[]] Partition[T](Microsoft.FSharp.Core.FSharpFunc`2[T,System.Boolean], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: TResult[] Choose[T,TResult](Microsoft.FSharp.Core.FSharpFunc`2[T,Microsoft.FSharp.Core.FSharpOption`1[TResult]], T[]) Microsoft.FSharp.Collections.ArrayModule+Parallel: TResult[] Collect[T,TResult](Microsoft.FSharp.Core.FSharpFunc`2[T,TResult[]], T[])