Use UBJSON for serializing splits for vertical data split. (#10059)

dmlc · Feb 24, 2024 · 0ce4372 · 0ce4372
1 parent f7005d3
commit 0ce4372
Show file tree

Hide file tree

Showing 14 changed files with 162 additions and 165 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -174,7 +174,7 @@ jobs:
     - uses: actions/checkout@e2f20e631ae6d7dd3b768f56a5d2af784dd54791 # v2.5.0
       with:
         submodules: 'true'
-    - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
         architecture: 'x64'

diff --git a/.github/workflows/python_tests.yml b/.github/workflows/python_tests.yml
@@ -310,7 +310,7 @@ jobs:
           submodules: 'true'
 
       - name: Set up Python 3.8
-        uses: actions/setup-python@v4
+        uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
         with:
           python-version: 3.8
 

diff --git a/.github/workflows/python_wheels.yml b/.github/workflows/python_wheels.yml
@@ -21,7 +21,7 @@ jobs:
       with:
         submodules: 'true'
     - name: Setup Python
-      uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+      uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
     - name: Build wheels

diff --git a/.github/workflows/r_tests.yml b/.github/workflows/r_tests.yml
@@ -74,7 +74,7 @@ jobs:
         key: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
         restore-keys: ${{ runner.os }}-r-${{ matrix.config.r }}-6-${{ hashFiles('R-package/DESCRIPTION') }}
 
-    - uses: actions/setup-python@7f80679172b057fc5e90d70d197929d454754a5a # v4.3.0
+    - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0
       with:
         python-version: "3.8"
         architecture: 'x64'

diff --git a/R-package/src/Makevars.in b/R-package/src/Makevars.in
@@ -104,6 +104,7 @@ OBJECTS= \
     $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/coll.o \
+    $(PKGROOT)/src/collective/communicator-inl.o \
     $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
     $(PKGROOT)/src/collective/in_memory_communicator.o \

diff --git a/R-package/src/Makevars.win b/R-package/src/Makevars.win
@@ -104,6 +104,7 @@ OBJECTS= \
     $(PKGROOT)/src/collective/broadcast.o \
     $(PKGROOT)/src/collective/comm.o \
     $(PKGROOT)/src/collective/coll.o \
+    $(PKGROOT)/src/collective/communicator-inl.o \
     $(PKGROOT)/src/collective/tracker.o \
     $(PKGROOT)/src/collective/communicator.o \
     $(PKGROOT)/src/collective/in_memory_communicator.o \

diff --git a/src/collective/communicator-inl.cc b/src/collective/communicator-inl.cc
@@ -0,0 +1,34 @@
+/**
+ * Copyright 2024, XGBoost contributors
+ */
+#include "communicator-inl.h"
+
+namespace xgboost::collective {
+[[nodiscard]] std::vector<std::vector<char>> VectorAllgatherV(
+    std::vector<std::vector<char>> const &input) {  // gathers the byte vectors of all workers
+  auto n_inputs = input.size();  // number of vectors contributed by this worker
+  std::vector<std::int64_t> sizes(n_inputs);
+  std::transform(input.cbegin(), input.cend(), sizes.begin(),
+                 [](auto const &vec) { return vec.size(); });  // sizes[i] = length of input[i]
+  // Share the per-vector lengths first; they are needed below to split the flat gathered buffer.
+  std::vector<std::int64_t> global_sizes = AllgatherV(sizes);  // AllgatherV is defined outside this file
+  std::vector<std::int64_t> offset(global_sizes.size() + 1);  // extra slot holds the total length
+  offset[0] = 0;
+  for (std::size_t i = 1; i < offset.size(); i++) {
+    offset[i] = offset[i - 1] + global_sizes[i - 1];  // running sum: offset[i] = end of vector i - 1
+  }
+  // Flatten the local vectors into one contiguous buffer, then gather the flattened buffers.
+  std::vector<char> collected;
+  for (auto const &vec : input) {
+    collected.insert(collected.end(), vec.cbegin(), vec.cend());
+  }
+  auto out = AllgatherV(collected);
+  // NOTE(review): the slicing assumes both AllgatherV calls order the workers identically - confirm.
+  std::vector<std::vector<char>> result;
+  for (std::size_t i = 1; i < offset.size(); ++i) {
+    std::vector<char> local(out.cbegin() + offset[i - 1], out.cbegin() + offset[i]);  // bytes [offset[i-1], offset[i])
+    result.emplace_back(std::move(local));
+  }
+  return result;  // one entry per element of global_sizes
+}
+}  // namespace xgboost::collective
diff --git a/src/collective/communicator-inl.h b/src/collective/communicator-inl.h
@@ -1,5 +1,5 @@
 /**
- * Copyright 2022-2023 by XGBoost contributors
+ * Copyright 2022-2024, XGBoost contributors
  */
 #pragma once
 #include <string>
@@ -192,6 +192,18 @@ inline std::vector<T> AllgatherV(std::vector<T> const &input) {
   return result;
 }
 
+/**
+ * @brief Gathers variable-length data from all processes and distributes it to all processes.
+ *
+ * @param input All the inputs from the local worker. Both the number of inputs and the
+ *              size of each individual input vector may differ from one worker to
+ *              another.
+ *
+ * @return The AllgatherV result, containing vectors from all workers.
+ */
+[[nodiscard]] std::vector<std::vector<char>> VectorAllgatherV(
+    std::vector<std::vector<char>> const &input);
+
 /**
  * @brief Gathers variable-length strings from all processes and distributes them to all processes.
  * @param input Variable-length list of variable-length strings.
@@ -294,38 +306,5 @@ template <Operation op>
 inline void Allreduce(double *send_receive_buffer, size_t count) {
   Communicator::Get()->AllReduce(send_receive_buffer, count, DataType::kDouble, op);
 }
-
-template <typename T>
-struct SpecialAllgatherVResult {
-  std::vector<std::size_t> offsets;
-  std::vector<std::size_t> sizes;
-  std::vector<T> result;
-};
-
-/**
- * @brief Gathers variable-length data from all processes and distributes it to all processes.
- *
- * We assume each worker has the same number of inputs, but each input may be of a different size.
- *
- * @param inputs All the inputs from the local worker.
- * @param sizes  Sizes of each input.
- */
-template <typename T>
-inline SpecialAllgatherVResult<T> SpecialAllgatherV(std::vector<T> const &inputs,
-                                                    std::vector<std::size_t> const &sizes) {
-  // Gather the sizes across all workers.
-  auto const all_sizes = Allgather(sizes);
-
-  // Calculate input offsets (std::exclusive_scan).
-  std::vector<std::size_t> offsets(all_sizes.size());
-  for (std::size_t i = 1; i < offsets.size(); i++) {
-    offsets[i] = offsets[i - 1] + all_sizes[i - 1];
-  }
-
-  // Gather all the inputs.
-  auto const all_inputs = AllgatherV(inputs);
-
-  return {offsets, all_sizes, all_inputs};
-}
 }  // namespace collective
 }  // namespace xgboost