From f0c7efa9a44d137f202e6f14263800d630e641a0 Mon Sep 17 00:00:00 2001 From: Muhammed Fatih BALIN Date: Mon, 11 Mar 2024 08:23:36 -0400 Subject: [PATCH] [GraphBolt] Implement dependent minibatching for labor. (#7205) --- graphbolt/include/graphbolt/continuous_seed.h | 97 +++++++++++++++++++ .../graphbolt/fused_csc_sampling_graph.h | 3 +- graphbolt/src/cuda/neighbor_sampler.cu | 21 ++-- graphbolt/src/fused_csc_sampling_graph.cc | 25 ++++- graphbolt/src/random.h | 26 ----- 5 files changed, 126 insertions(+), 46 deletions(-) create mode 100644 graphbolt/include/graphbolt/continuous_seed.h diff --git a/graphbolt/include/graphbolt/continuous_seed.h b/graphbolt/include/graphbolt/continuous_seed.h new file mode 100644 index 000000000000..c659b1753cf4 --- /dev/null +++ b/graphbolt/include/graphbolt/continuous_seed.h @@ -0,0 +1,97 @@ +/** + * Copyright (c) 2023, GT-TDAlab (Muhammed Fatih Balin & Umit V. Catalyurek) + * All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * @file graphbolt/continuous_seed.h + * @brief CPU and CUDA implementation for continuous random seeds + */ +#ifndef GRAPHBOLT_CONTINUOUS_SEED_H_ +#define GRAPHBOLT_CONTINUOUS_SEED_H_ + +#include + +#include + +#ifdef __CUDACC__ +#include +#else +#include +#include +#endif // __CUDA_ARCH__ + +#ifndef M_SQRT1_2 +#define M_SQRT1_2 0.707106781186547524401 +#endif // M_SQRT1_2 + +namespace graphbolt { + +class continuous_seed { + uint64_t s[2]; + float c[2]; + + public: + /* implicit */ continuous_seed(const int64_t seed) { // NOLINT + s[0] = s[1] = seed; + c[0] = c[1] = 0; + } + + continuous_seed(torch::Tensor seed_arr, float r) { + auto seed = seed_arr.data_ptr(); + s[0] = seed[0]; + s[1] = seed[seed_arr.size(0) - 1]; + const auto pi = std::acos(-1.0); + c[0] = std::cos(pi * r / 2); + c[1] = std::sin(pi * r / 2); + } + +#ifdef __CUDACC__ + __device__ inline float uniform(const uint64_t t) const { + const uint64_t kCurandSeed = 999961; // Could be any random number. + curandStatePhilox4_32_10_t rng; + curand_init(kCurandSeed, s[0], t, &rng); + float rnd; + if (s[0] != s[1]) { + rnd = c[0] * curand_normal(&rng); + curand_init(kCurandSeed, s[1], t, &rng); + rnd += c[1] * curand_normal(&rng); + rnd = normcdff(rnd); + } else { + rnd = curand_uniform(&rng); + } + return rnd; + } +#else + inline float uniform(const uint64_t t) const { + pcg32 ng0(s[0], t); + float rnd; + if (s[0] != s[1]) { + std::normal_distribution norm; + rnd = c[0] * norm(ng0); + pcg32 ng1(s[1], t); + norm.reset(); + rnd += c[1] * norm(ng1); + rnd = std::erfc(-rnd * static_cast(M_SQRT1_2)) / 2.0f; + } else { + std::uniform_real_distribution uni; + rnd = uni(ng0); + } + return rnd; + } +#endif // __CUDA_ARCH__ +}; + +} // namespace graphbolt + +#endif // GRAPHBOLT_CONTINUOUS_SEED_H_ diff --git a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h index e7eadd3f36a1..33437c1336f6 100644 --- a/graphbolt/include/graphbolt/fused_csc_sampling_graph.h +++ b/graphbolt/include/graphbolt/fused_csc_sampling_graph.h @@ -6,6 +6,7 @@ #ifndef GRAPHBOLT_CSC_SAMPLING_GRAPH_H_ #define GRAPHBOLT_CSC_SAMPLING_GRAPH_H_ +#include #include #include #include @@ -27,7 +28,7 @@ struct SamplerArgs {}; template <> struct SamplerArgs { const torch::Tensor& indices; - int64_t random_seed; + continuous_seed random_seed; int64_t num_nodes; }; diff --git a/graphbolt/src/cuda/neighbor_sampler.cu b/graphbolt/src/cuda/neighbor_sampler.cu index 3292abd9905c..49fb727a630c 100644 --- a/graphbolt/src/cuda/neighbor_sampler.cu +++ b/graphbolt/src/cuda/neighbor_sampler.cu @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include @@ -41,27 +42,17 @@ __global__ void _ComputeRandoms( const int64_t num_edges, const indptr_t* const sliced_indptr, const indptr_t* const sub_indptr, const indices_t* const csr_rows, const weights_t* const sliced_weights, const indices_t* const indices, - const uint64_t random_seed, float_t* random_arr, edge_id_t* edge_ids) { + const continuous_seed random_seed, float_t* random_arr, + edge_id_t* edge_ids) { int64_t i = blockIdx.x * blockDim.x + threadIdx.x; const int stride = gridDim.x * blockDim.x; - curandStatePhilox4_32_10_t rng; const auto labor = indices != nullptr; - if (!labor) { - curand_init(random_seed, i, 0, &rng); - } - while (i < num_edges) { const auto row_position = csr_rows[i]; const auto row_offset = i - sub_indptr[row_position]; const auto in_idx = sliced_indptr[row_position] + row_offset; - - if (labor) { - constexpr uint64_t kCurandSeed = 999961; - curand_init(kCurandSeed, random_seed, indices[in_idx], &rng); - } - - const auto rnd = curand_uniform(&rng); + const auto rnd = random_seed.uniform(labor ? indices[in_idx] : i); const auto prob = sliced_weights ? sliced_weights[i] : static_cast(1); const auto exp_rnd = -__logf(rnd); @@ -211,8 +202,8 @@ c10::intrusive_ptr SampleNeighbors( auto coo_rows = ExpandIndptrImpl( sub_indptr, indices.scalar_type(), torch::nullopt, num_edges); num_edges = coo_rows.size(0); - const auto random_seed = RandomEngine::ThreadLocal()->RandInt( - static_cast(0), std::numeric_limits::max()); + const continuous_seed random_seed(RandomEngine::ThreadLocal()->RandInt( + static_cast(0), std::numeric_limits::max())); auto output_indptr = torch::empty_like(sub_indptr); torch::Tensor picked_eids; torch::Tensor output_indices; diff --git a/graphbolt/src/fused_csc_sampling_graph.cc b/graphbolt/src/fused_csc_sampling_graph.cc index 4dacb9792448..86bbdda7cb4c 100644 --- a/graphbolt/src/fused_csc_sampling_graph.cc +++ b/graphbolt/src/fused_csc_sampling_graph.cc @@ -1417,6 +1417,25 @@ inline void safe_divide(T& a, U b) { a = b > 0 ? (T)(a / b) : std::numeric_limits::infinity(); } +namespace labor { + +template +inline T invcdf(T u, int64_t n, T rem) { + constexpr T one = 1; + return rem * (one - std::pow(one - u, one / n)); +} + +template +inline T jth_sorted_uniform_random( + continuous_seed seed, int64_t t, int64_t c, int64_t j, T& rem, int64_t n) { + const T u = seed.uniform(t + j * c); + // https://mathematica.stackexchange.com/a/256707 + rem -= invcdf(u, n, rem); + return 1 - rem; +} + +}; // namespace labor + /** * @brief Perform uniform-nonuniform sampling of elements depending on the * template parameter NonUniform and return the sampled indices. @@ -1563,8 +1582,7 @@ inline int64_t LaborPick( // O(num_neighbors). for (uint32_t i = 0; i < fanout; ++i) { const auto t = local_indices_data[i]; - auto rnd = - labor::uniform_random(args.random_seed, t); // r_t + auto rnd = args.random_seed.uniform(t); // r_t if constexpr (NonUniform) { safe_divide(rnd, local_probs_data[i]); } // r_t / \pi_t @@ -1575,8 +1593,7 @@ inline int64_t LaborPick( } for (uint32_t i = fanout; i < num_neighbors; ++i) { const auto t = local_indices_data[i]; - auto rnd = - labor::uniform_random(args.random_seed, t); // r_t + auto rnd = args.random_seed.uniform(t); // r_t if constexpr (NonUniform) { safe_divide(rnd, local_probs_data[i]); } // r_t / \pi_t diff --git a/graphbolt/src/random.h b/graphbolt/src/random.h index f7c5d87b5fbe..1c74123c945f 100644 --- a/graphbolt/src/random.h +++ b/graphbolt/src/random.h @@ -76,32 +76,6 @@ class RandomEngine { pcg32 rng_; }; -namespace labor { - -template -inline T uniform_random(int64_t random_seed, int64_t t) { - pcg32 ng(random_seed, t); - std::uniform_real_distribution uni; - return uni(ng); -} - -template -inline T invcdf(T u, int64_t n, T rem) { - constexpr T one = 1; - return rem * (one - std::pow(one - u, one / n)); -} - -template -inline T jth_sorted_uniform_random( - int64_t random_seed, int64_t t, int64_t c, int64_t j, T& rem, int64_t n) { - const auto u = uniform_random(random_seed, t + j * c); - // https://mathematica.stackexchange.com/a/256707 - rem -= invcdf(u, n, rem); - return 1 - rem; -} - -}; // namespace labor - } // namespace graphbolt #endif // GRAPHBOLT_RANDOM_H_