From b4cc350ec5c057adc9d06b728e97b428a1b1cb4e Mon Sep 17 00:00:00 2001 From: Jiaming Yuan Date: Tue, 18 Jun 2024 04:34:54 +0800 Subject: [PATCH] Fix categorical data with external memory. (#10433) --- demo/guide-python/external_memory.py | 2 +- src/common/hist_util.h | 3 +-- src/data/gradient_index.cc | 5 ++--- src/data/histogram_cut_format.h | 13 ++++++++++++- tests/python/test_data_iterator.py | 15 +++++++++++++++ 5 files changed, 31 insertions(+), 7 deletions(-) diff --git a/demo/guide-python/external_memory.py b/demo/guide-python/external_memory.py index b19f550c9149..e1bcbe99ae62 100644 --- a/demo/guide-python/external_memory.py +++ b/demo/guide-python/external_memory.py @@ -43,7 +43,7 @@ def make_batches( class Iterator(xgboost.DataIter): """A custom iterator for loading files in batches.""" - def __init__(self, file_paths: List[Tuple[str, str]]): + def __init__(self, file_paths: List[Tuple[str, str]]) -> None: self._file_paths = file_paths self._it = 0 # XGBoost will generate some cache files under current directory with the prefix diff --git a/src/common/hist_util.h b/src/common/hist_util.h index e829752dae3d..8f940500f73c 100644 --- a/src/common/hist_util.h +++ b/src/common/hist_util.h @@ -1,5 +1,5 @@ /** - * Copyright 2017-2024 by XGBoost Contributors + * Copyright 2017-2024, XGBoost Contributors * \file hist_util.h * \brief Utility for fast histogram aggregation * \author Philip Cho, Tianqi Chen @@ -11,7 +11,6 @@ #include // for uint32_t #include #include -#include #include #include diff --git a/src/data/gradient_index.cc b/src/data/gradient_index.cc index 493aded70098..e600892db90f 100644 --- a/src/data/gradient_index.cc +++ b/src/data/gradient_index.cc @@ -4,7 +4,6 @@ */ #include "gradient_index.h" -#include #include #include #include // for forward @@ -126,8 +125,8 @@ INSTANTIATION_PUSH(data::ColumnarAdapterBatch) void GHistIndexMatrix::ResizeIndex(const size_t n_index, const bool isDense) { auto make_index = [this, n_index](auto t, 
common::BinTypeSize t_size) { // Must resize instead of allocating a new one. This function is called everytime a - // new batch is pushed, and we grow the size accordingly without loosing the data the - // previous batches. + // new batch is pushed, and we grow the size accordingly without losing the data in + // the previous batches. using T = decltype(t); std::size_t n_bytes = sizeof(T) * n_index; CHECK_GE(n_bytes, this->data.size()); diff --git a/src/data/histogram_cut_format.h b/src/data/histogram_cut_format.h index 45a96134f8d0..d4eb81ad2849 100644 --- a/src/data/histogram_cut_format.h +++ b/src/data/histogram_cut_format.h @@ -1,5 +1,5 @@ /** - * Copyright 2021-2023, XGBoost contributors + * Copyright 2021-2024, XGBoost contributors */ #ifndef XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ #define XGBOOST_DATA_HISTOGRAM_CUT_FORMAT_H_ @@ -23,6 +23,15 @@ inline bool ReadHistogramCuts(common::HistogramCuts *cuts, common::AlignedResour if (!common::ReadVec(fi, &cuts->min_vals_.HostVector())) { return false; } + bool has_cat{false}; + if (!fi->Read(&has_cat)) { + return false; + } + decltype(cuts->MaxCategory()) max_cat{0}; + if (!fi->Read(&max_cat)) { + return false; + } + cuts->SetCategorical(has_cat, max_cat); return true; } @@ -32,6 +41,8 @@ inline std::size_t WriteHistogramCuts(common::HistogramCuts const &cuts, bytes += common::WriteVec(fo, cuts.Values()); bytes += common::WriteVec(fo, cuts.Ptrs()); bytes += common::WriteVec(fo, cuts.MinValues()); + bytes += fo->Write(cuts.HasCategorical()); + bytes += fo->Write(cuts.MaxCategory()); return bytes; } } // namespace xgboost::data diff --git a/tests/python/test_data_iterator.py b/tests/python/test_data_iterator.py index 7f0153565c4b..e665bcb10d9f 100644 --- a/tests/python/test_data_iterator.py +++ b/tests/python/test_data_iterator.py @@ -52,6 +52,21 @@ def test_single_batch(tree_method: str = "approx") -> None: assert from_np.get_dump() == from_it.get_dump() +def test_with_cat_single() -> None: + X, y = 
tm.make_categorical( + n_samples=128, n_features=3, n_categories=6, onehot=False + ) + Xy = xgb.DMatrix(SingleBatch(data=X, label=y), enable_categorical=True) + from_it = xgb.train({}, Xy, num_boost_round=3) + + Xy = xgb.DMatrix(X, y, enable_categorical=True) + from_Xy = xgb.train({}, Xy, num_boost_round=3) + + jit = from_it.save_raw(raw_format="json") + jxy = from_Xy.save_raw(raw_format="json") + assert jit == jxy + + def run_data_iterator( n_samples_per_batch: int, n_features: int,