[EM] Allow staging ellpack on host for GPU external memory. (#10488)

- New parameter `on_host`.
- Abstract format creation and stream creation into policy classes.
trivialfis authored Jun 27, 2024
1 parent 824fba7 commit e8a9625
Showing 36 changed files with 846 additions and 321 deletions.
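
A minimal sketch (not part of this commit) of how the new `on_host` flag is exposed through the Python `DataIter` interface touched below; the `BatchIter` class and its data layout are illustrative assumptions, not code from the repository:

    import xgboost as xgb


    class BatchIter(xgb.DataIter):
        """Feed pre-split (X, y) batches to an external-memory DMatrix."""

        def __init__(self, batches, on_host=False):
            self._batches = batches  # list of (X, y) array pairs
            self._it = 0
            # `on_host` asks XGBoost to stage ellpack pages in host memory
            # instead of on disk when training on GPU.
            super().__init__(cache_prefix="cache", on_host=on_host)

        def next(self, input_data):
            if self._it == len(self._batches):
                return 0  # end of iteration
            X, y = self._batches[self._it]
            input_data(data=X, label=y)
            self._it += 1
            return 1

        def reset(self):
            self._it = 0
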
1 change: 1 addition & 0 deletions R-package/src/Makevars.in
@@ -72,6 +72,7 @@ OBJECTS= \
$(PKGROOT)/src/data/gradient_index_page_source.o \
$(PKGROOT)/src/data/gradient_index_format.o \
$(PKGROOT)/src/data/sparse_page_dmatrix.o \
$(PKGROOT)/src/data/sparse_page_source.o \
$(PKGROOT)/src/data/proxy_dmatrix.o \
$(PKGROOT)/src/data/iterative_dmatrix.o \
$(PKGROOT)/src/predictor/predictor.o \
1 change: 1 addition & 0 deletions R-package/src/Makevars.win
@@ -72,6 +72,7 @@ OBJECTS= \
$(PKGROOT)/src/data/gradient_index_page_source.o \
$(PKGROOT)/src/data/gradient_index_format.o \
$(PKGROOT)/src/data/sparse_page_dmatrix.o \
$(PKGROOT)/src/data/sparse_page_source.o \
$(PKGROOT)/src/data/proxy_dmatrix.o \
$(PKGROOT)/src/data/iterative_dmatrix.o \
$(PKGROOT)/src/predictor/predictor.o \
49 changes: 25 additions & 24 deletions include/xgboost/data.h
@@ -50,7 +50,7 @@ class MetaInfo {
static constexpr uint64_t kNumField = 12;

/*! \brief number of rows in the data */
uint64_t num_row_{0}; // NOLINT
bst_idx_t num_row_{0}; // NOLINT
/*! \brief number of columns in the data */
uint64_t num_col_{0}; // NOLINT
/*! \brief number of nonzero entries in the data */
@@ -535,10 +535,11 @@ class DMatrix {
template <typename T>
[[nodiscard]] bool PageExists() const;

// the following are column meta data, should be able to answer them fast.
/*! \return Whether the data columns single column block. */
/**
* @return Whether the data is stored in a single column block.
*/
[[nodiscard]] virtual bool SingleColBlock() const = 0;
/*! \brief virtual destructor */

virtual ~DMatrix();

/**
@@ -600,34 +601,34 @@
int nthread, bst_bin_t max_bin);

/**
* \brief Create an external memory DMatrix with callbacks.
* @brief Create an external memory DMatrix with callbacks.
*
* \tparam DataIterHandle External iterator type, defined in C API.
* \tparam DMatrixHandle DMatrix handle, defined in C API.
* \tparam DataIterResetCallback Callback for reset, prototype defined in C API.
* \tparam XGDMatrixCallbackNext Callback for next, prototype defined in C API.
* @tparam DataIterHandle External iterator type, defined in C API.
* @tparam DMatrixHandle DMatrix handle, defined in C API.
* @tparam DataIterResetCallback Callback for reset, prototype defined in C API.
* @tparam XGDMatrixCallbackNext Callback for next, prototype defined in C API.
*
* \param iter External data iterator
* \param proxy A hanlde to ProxyDMatrix
* \param reset Callback for reset
* \param next Callback for next
* \param missing Value that should be treated as missing.
* \param nthread number of threads used for initialization.
* \param cache Prefix of cache file path.
* @param iter External data iterator
* @param proxy A handle to ProxyDMatrix
* @param reset Callback for reset
* @param next Callback for next
* @param missing Value that should be treated as missing.
* @param nthread Number of threads used for initialization.
* @param cache Prefix of the cache file path.
* @param on_host Used for GPU; whether the data should be cached in host memory.
*
* \return A created external memory DMatrix.
* @return A created external memory DMatrix.
*/
template <typename DataIterHandle, typename DMatrixHandle,
typename DataIterResetCallback, typename XGDMatrixCallbackNext>
static DMatrix *Create(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing,
int32_t nthread, std::string cache);
template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
typename XGDMatrixCallbackNext>
static DMatrix* Create(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback* reset,
XGDMatrixCallbackNext* next, float missing, int32_t nthread,
std::string cache, bool on_host);

virtual DMatrix *Slice(common::Span<int32_t const> ridxs) = 0;

/**
* \brief Slice a DMatrix by columns.
* @brief Slice a DMatrix by columns.
*
* @param num_slices Total number of slices
* @param slice_id Index of the current slice
27 changes: 19 additions & 8 deletions python-package/xgboost/core.py
@@ -503,18 +503,29 @@ class DataIter(ABC): # pylint: disable=too-many-instance-attributes
----------
cache_prefix :
Prefix to the cache files, only used in external memory.
release_data :
Whether the iterator should release the data during iteration. Set it to True if
the data transformation (converting data to np.float32) is memory intensive.
Otherwise, if the transformation is computation intensive, we can keep the cache.
on_host :
Whether the data should be cached in host memory instead of on the hard drive
when using the GPU with external memory. If set to True, the "external memory"
is simply CPU (host) memory. This is still a work in progress and is not yet
ready for use.
"""

def __init__(
self, cache_prefix: Optional[str] = None, release_data: bool = True
self,
cache_prefix: Optional[str] = None,
release_data: bool = True,
on_host: bool = False,
) -> None:
self.cache_prefix = cache_prefix
self.on_host = on_host

self._handle = _ProxyDMatrix()
self._exception: Optional[Exception] = None
@@ -905,20 +916,20 @@ def __init__(

def _init_from_iter(self, iterator: DataIter, enable_categorical: bool) -> None:
it = iterator
args = {
"missing": self.missing,
"nthread": self.nthread,
"cache_prefix": it.cache_prefix if it.cache_prefix else "",
}
args_cstr = from_pystr_to_cstr(json.dumps(args))
args = make_jcargs(
missing=self.missing,
nthread=self.nthread,
cache_prefix=it.cache_prefix if it.cache_prefix else "",
on_host=it.on_host,
)
handle = ctypes.c_void_p()
reset_callback, next_callback = it.get_callbacks(enable_categorical)
ret = _LIB.XGDMatrixCreateFromCallback(
None,
it.proxy.handle,
reset_callback,
next_callback,
args_cstr,
args,
ctypes.byref(handle),
)
it.reraise()
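
For reference, a hedged sketch of how `_init_from_iter` above is reached from user code: constructing a `DMatrix` from the iterator packs `on_host` via `make_jcargs` and hands it to `XGDMatrixCreateFromCallback`. It reuses the hypothetical `BatchIter` sketched earlier and assumes a CUDA build; since the docstring marks `on_host` as work in progress, this is illustrative only:

    import numpy as np

    batches = [(np.random.rand(1024, 16), np.random.rand(1024)) for _ in range(4)]
    it = BatchIter(batches, on_host=True)  # stage pages in host memory, not on disk

    Xy = xgb.DMatrix(it)  # _init_from_iter forwards it.on_host via make_jcargs
    booster = xgb.train({"device": "cuda", "tree_method": "hist"}, Xy)
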
11 changes: 8 additions & 3 deletions python-package/xgboost/testing/__init__.py
@@ -198,19 +198,20 @@ def skip_win() -> PytestSkip:
class IteratorForTest(xgb.core.DataIter):
"""Iterator for testing streaming DMatrix. (external memory, quantile)"""

def __init__(
def __init__( # pylint: disable=too-many-arguments
self,
X: Sequence,
y: Sequence,
w: Optional[Sequence],
cache: Optional[str],
on_host: bool = False,
) -> None:
assert len(X) == len(y)
self.X = X
self.y = y
self.w = w
self.it = 0
super().__init__(cache_prefix=cache)
super().__init__(cache_prefix=cache, on_host=on_host)

def next(self, input_data: Callable) -> int:
if self.it == len(self.X):
@@ -367,7 +368,11 @@ def get_external_dmat(self) -> xgb.DMatrix:
weight.append(w)

it = IteratorForTest(
predictor, response, weight if weight else None, cache="cache"
predictor,
response,
weight if weight else None,
cache="cache",
on_host=False,
)
return xgb.DMatrix(it)

2 changes: 1 addition & 1 deletion python-package/xgboost/testing/data_iter.py
@@ -22,7 +22,7 @@ def run_mixed_sparsity(device: str) -> None:

X = [cp.array(batch) for batch in X]

it = tm.IteratorForTest(X, y, None, None)
it = tm.IteratorForTest(X, y, None, None, on_host=False)
Xy_0 = xgboost.QuantileDMatrix(it)

X_1, y_1 = tm.make_sparse_regression(256, 16, 0.1, True)
1 change: 1 addition & 0 deletions python-package/xgboost/testing/updater.py
@@ -207,6 +207,7 @@ def check_get_quantile_cut_device(tree_method: str, use_cupy: bool) -> None:
it = tm.IteratorForTest(
*tm.make_batches(n_samples_per_batch, n_features, n_batches, use_cupy),
cache="cache",
on_host=False,
)
Xy: xgb.DMatrix = xgb.DMatrix(it)
xgb.train({"tree_method": tree_method, "max_bin": max_bin}, Xyw)
3 changes: 2 additions & 1 deletion src/c_api/c_api.cc
@@ -298,13 +298,14 @@ XGB_DLL int XGDMatrixCreateFromCallback(DataIterHandle iter, DMatrixHandle proxy
auto missing = GetMissing(jconfig);
std::string cache = RequiredArg<String>(jconfig, "cache_prefix", __func__);
auto n_threads = OptionalArg<Integer, int64_t>(jconfig, "nthread", 0);
auto on_host = OptionalArg<Boolean, bool>(jconfig, "on_host", false);

xgboost_CHECK_C_ARG_PTR(next);
xgboost_CHECK_C_ARG_PTR(reset);
xgboost_CHECK_C_ARG_PTR(out);

*out = new std::shared_ptr<xgboost::DMatrix>{
xgboost::DMatrix::Create(iter, proxy, reset, next, missing, n_threads, cache)};
xgboost::DMatrix::Create(iter, proxy, reset, next, missing, n_threads, cache, on_host)};
API_END();
}
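
The function above reads `on_host` from the same JSON config string that carries `missing`, `cache_prefix`, and `nthread`; the flag is optional and defaults to false. A rough sketch of that config as assembled on the Python side (values are placeholders, and `missing` is normally NaN):

    import json

    config = json.dumps(
        {
            "missing": -999.0,       # value treated as missing
            "nthread": 0,            # 0 lets XGBoost pick the thread count
            "cache_prefix": "cache",
            "on_host": True,         # new in this commit; GPU only
        }
    )
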

6 changes: 3 additions & 3 deletions src/common/device_helpers.cuh
@@ -429,7 +429,7 @@ struct XGBDefaultDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBDefaultDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_default, rmm::mr::get_current_device_resource()) {}
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
};

@@ -484,8 +484,8 @@ struct XGBCachingDeviceAllocatorImpl : XGBBaseDeviceAllocator<T> {
}
#if defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBCachingDeviceAllocatorImpl()
: SuperT(rmm::cuda_stream_default, rmm::mr::get_current_device_resource()),
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
: SuperT(rmm::cuda_stream_per_thread, rmm::mr::get_current_device_resource()),
use_cub_allocator_(!xgboost::GlobalConfigThreadLocalStore::Get()->use_rmm) {}
#endif // defined(XGBOOST_USE_RMM) && XGBOOST_USE_RMM == 1
XGBOOST_DEVICE void construct(T *) {} // NOLINT
private:
8 changes: 7 additions & 1 deletion src/common/error_msg.h
@@ -6,7 +6,7 @@
#ifndef XGBOOST_COMMON_ERROR_MSG_H_
#define XGBOOST_COMMON_ERROR_MSG_H_

#include <cinttypes> // for uint64_t
#include <cstdint> // for uint64_t
#include <limits> // for numeric_limits
#include <string> // for string

@@ -103,5 +103,11 @@ inline auto NoFederated() { return "XGBoost is not compiled with federated learn
inline auto NoCategorical(std::string name) {
return name + " doesn't support categorical features.";
}

inline void NoOnHost(bool on_host) {
if (on_host) {
LOG(FATAL) << "Caching on host memory is only available for GPU.";
}
}
} // namespace xgboost::error
#endif // XGBOOST_COMMON_ERROR_MSG_H_
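
The new `NoOnHost` guard means a CPU-only run that asks for host staging should fail loudly rather than silently ignore the flag. A hedged sketch of the expected behavior, reusing the hypothetical `BatchIter` from earlier:

    import xgboost as xgb

    it = BatchIter(batches, on_host=True)
    try:
        xgb.DMatrix(it)  # CPU data path; host staging is GPU-only
    except xgb.core.XGBoostError as err:
        # Expected to surface: "Caching on host memory is only available for GPU."
        print(err)
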
2 changes: 1 addition & 1 deletion src/common/hist_util.h
@@ -163,7 +163,7 @@ class HistogramCuts {
return vals[bin_idx - 1];
}

void SetDevice(DeviceOrd d) const {
void SetDevice(DeviceOrd d) {
this->cut_ptrs_.SetDevice(d);
this->cut_ptrs_.ConstDevicePointer();

24 changes: 11 additions & 13 deletions src/data/data.cc
@@ -901,15 +901,12 @@ DMatrix* DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy, std::shared_p
return new data::IterativeDMatrix(iter, proxy, ref, reset, next, missing, nthread, max_bin);
}

template <typename DataIterHandle, typename DMatrixHandle,
typename DataIterResetCallback, typename XGDMatrixCallbackNext>
DMatrix *DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing,
int32_t n_threads,
std::string cache) {
return new data::SparsePageDMatrix(iter, proxy, reset, next, missing, n_threads,
cache);
template <typename DataIterHandle, typename DMatrixHandle, typename DataIterResetCallback,
typename XGDMatrixCallbackNext>
DMatrix* DMatrix::Create(DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback* reset,
XGDMatrixCallbackNext* next, float missing, int32_t n_threads,
std::string cache, bool on_host) {
return new data::SparsePageDMatrix{iter, proxy, reset, next, missing, n_threads, cache, on_host};
}

template DMatrix* DMatrix::Create<DataIterHandle, DMatrixHandle, DataIterResetCallback,
@@ -919,10 +916,11 @@ template DMatrix* DMatrix::Create<DataIterHandle, DMatrixHandle, DataIterResetCa
XGDMatrixCallbackNext* next, float missing,
int nthread, int max_bin);

template DMatrix *DMatrix::Create<DataIterHandle, DMatrixHandle,
DataIterResetCallback, XGDMatrixCallbackNext>(
DataIterHandle iter, DMatrixHandle proxy, DataIterResetCallback *reset,
XGDMatrixCallbackNext *next, float missing, int32_t n_threads, std::string);
template DMatrix* DMatrix::Create<DataIterHandle, DMatrixHandle, DataIterResetCallback,
XGDMatrixCallbackNext>(DataIterHandle iter, DMatrixHandle proxy,
DataIterResetCallback* reset,
XGDMatrixCallbackNext* next, float missing,
int32_t n_threads, std::string, bool);

template <typename AdapterT>
DMatrix* DMatrix::Create(AdapterT* adapter, float missing, int nthread, const std::string&,
2 changes: 1 addition & 1 deletion src/data/ellpack_page.cc
@@ -36,7 +36,7 @@ void EllpackPage::SetBaseRowId(std::size_t) {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
"EllpackPage is required";
}
size_t EllpackPage::Size() const {
bst_idx_t EllpackPage::Size() const {
LOG(FATAL) << "Internal Error: XGBoost is not compiled with CUDA but "
"EllpackPage is required";
return 0;
25 changes: 13 additions & 12 deletions src/data/ellpack_page.cu
@@ -29,7 +29,7 @@ EllpackPage::~EllpackPage() = default;

EllpackPage::EllpackPage(EllpackPage&& that) { std::swap(impl_, that.impl_); }

size_t EllpackPage::Size() const { return impl_->Size(); }
[[nodiscard]] bst_idx_t EllpackPage::Size() const { return impl_->Size(); }

void EllpackPage::SetBaseRowId(std::size_t row_id) { impl_->SetBaseRowId(row_id); }

@@ -91,13 +91,13 @@ __global__ void CompressBinEllpackKernel(
// Construct an ELLPACK matrix with the given number of empty rows.
EllpackPageImpl::EllpackPageImpl(DeviceOrd device,
std::shared_ptr<common::HistogramCuts const> cuts, bool is_dense,
size_t row_stride, size_t n_rows)
: is_dense(is_dense), cuts_(std::move(cuts)), row_stride(row_stride), n_rows(n_rows) {
bst_idx_t row_stride, bst_idx_t n_rows)
: is_dense(is_dense), cuts_(std::move(cuts)), row_stride{row_stride}, n_rows{n_rows} {
monitor_.Init("ellpack_page");
dh::safe_cuda(cudaSetDevice(device.ordinal));

monitor_.Start("InitCompressedData");
InitCompressedData(device);
this->InitCompressedData(device);
monitor_.Stop("InitCompressedData");
}

@@ -403,7 +403,7 @@ struct CopyPage {
// Copy the data from the given EllpackPage to the current page.
size_t EllpackPageImpl::Copy(DeviceOrd device, EllpackPageImpl const* page, size_t offset) {
monitor_.Start("Copy");
size_t num_elements = page->n_rows * page->row_stride;
bst_idx_t num_elements = page->n_rows * page->row_stride;
CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_GE(n_rows * row_stride, offset + num_elements);
@@ -461,16 +461,17 @@ struct CompactPage {
};

// Compacts the data from the given EllpackPage into the current page.
void EllpackPageImpl::Compact(DeviceOrd device, EllpackPageImpl const* page,
void EllpackPageImpl::Compact(Context const* ctx, EllpackPageImpl const* page,
common::Span<size_t> row_indexes) {
monitor_.Start("Compact");
monitor_.Start(__func__);
CHECK_EQ(row_stride, page->row_stride);
CHECK_EQ(NumSymbols(), page->NumSymbols());
CHECK_LE(page->base_rowid + page->n_rows, row_indexes.size());
gidx_buffer.SetDevice(device);
page->gidx_buffer.SetDevice(device);
dh::LaunchN(page->n_rows, CompactPage(this, page, row_indexes));
monitor_.Stop("Compact");
gidx_buffer.SetDevice(ctx->Device());
page->gidx_buffer.SetDevice(ctx->Device());
auto cuctx = ctx->CUDACtx();
dh::LaunchN(page->n_rows, cuctx->Stream(), CompactPage(this, page, row_indexes));
monitor_.Stop(__func__);
}

// Initialize the buffer to stored compressed features.
@@ -551,7 +552,7 @@ void EllpackPageImpl::CreateHistIndices(DeviceOrd device,
}

// Return the number of rows contained in this page.
size_t EllpackPageImpl::Size() const { return n_rows; }
[[nodiscard]] bst_idx_t EllpackPageImpl::Size() const { return n_rows; }

// Return the memory cost for storing the compressed features.
size_t EllpackPageImpl::MemCostBytes(size_t num_rows, size_t row_stride,