SYCL. Unify calculations for objectives reg:absoluteerror, reg:quantileerror, binary:hinge (#10993)



---------

Co-authored-by: Dmitry Razdoburdin <>
razdoburdin authored Nov 21, 2024
1 parent 96d60ad commit a502f67
Showing 26 changed files with 267 additions and 150 deletions.
16 changes: 8 additions & 8 deletions include/xgboost/linalg.h
@@ -855,22 +855,22 @@ class Tensor {
    * @brief Get a @ref TensorView for this tensor.
    */
   auto View(DeviceOrd device) {
-    if (device.IsCUDA()) {
-      data_.SetDevice(device);
-      auto span = data_.DeviceSpan();
+    if (device.IsCPU()) {
+      auto span = data_.HostSpan();
       return TensorView<T, kDim>{span, shape_, device, order_};
     } else {
-      auto span = data_.HostSpan();
+      data_.SetDevice(device);
+      auto span = data_.DeviceSpan();
       return TensorView<T, kDim>{span, shape_, device, order_};
     }
   }
   auto View(DeviceOrd device) const {
-    if (device.IsCUDA()) {
-      data_.SetDevice(device);
-      auto span = data_.ConstDeviceSpan();
+    if (device.IsCPU()) {
+      auto span = data_.ConstHostSpan();
       return TensorView<T const, kDim>{span, shape_, device, order_};
     } else {
-      auto span = data_.ConstHostSpan();
+      data_.SetDevice(device);
+      auto span = data_.ConstDeviceSpan();
       return TensorView<T const, kDim>{span, shape_, device, order_};
     }
   }
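
The branch inversion above is the heart of the unification: the CPU becomes the explicit special case, so every non-CPU device, CUDA or SYCL alike, takes the SetDevice()/DeviceSpan() path. A minimal usage sketch (hedged: ViewDispatchExample is illustrative only, and DeviceOrd::SyclDefault() follows the factory naming used elsewhere in XGBoost's context header):

    #include "xgboost/context.h"  // for DeviceOrd
    #include "xgboost/linalg.h"   // for Tensor, TensorView

    void ViewDispatchExample(xgboost::linalg::Tensor<float, 1>* t) {
      // CPU: host span, no transfer.
      auto host_view = t->View(xgboost::DeviceOrd::CPU());
      // Any non-CPU device now hits the else-branch; before this commit
      // only IsCUDA() did, and SYCL devices silently got a host view.
      auto sycl_view = t->View(xgboost::DeviceOrd::SyclDefault());
      (void)host_view;
      (void)sycl_view;
    }
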
4 changes: 4 additions & 0 deletions plugin/CMakeLists.txt
@@ -1,6 +1,10 @@
 if(PLUGIN_SYCL)
   set(CMAKE_CXX_COMPILER "icpx")
   file(GLOB_RECURSE SYCL_SOURCES "sycl/*.cc")
+  list(APPEND SYCL_SOURCES
+    ${xgboost_SOURCE_DIR}/src/objective/regression_obj.cc
+    ${xgboost_SOURCE_DIR}/src/objective/hinge.cc
+    ${xgboost_SOURCE_DIR}/src/objective/quantile_obj.cc)
   add_library(plugin_sycl OBJECT ${SYCL_SOURCES})
   target_include_directories(plugin_sycl
     PRIVATE
41 changes: 41 additions & 0 deletions plugin/sycl/common/linalg_op.h
@@ -9,13 +9,17 @@
 #include <utility>
 
 #include "../data.h"
+#include "../device_manager.h"
 
 #include <sycl/sycl.hpp>
 
 namespace xgboost {
 namespace sycl {
 namespace linalg {
 
+template<typename T, std::int32_t D>
+using TensorView = xgboost::linalg::TensorView<T, D>;
+
 struct WorkGroupsParams {
   size_t n_workgroups;
   size_t workgroup_size;
@@ -40,7 +44,44 @@ ::sycl::event GroupWiseKernel(::sycl::queue* qu, int* flag_ptr,
   });
   return event;
 }
+
+template<typename Fn, typename TupleType, size_t ... I>
+auto call(Fn&& fn, TupleType t, std::index_sequence<I ...>) {
+  return fn(std::get<I>(t) ...);
+}
+
+template<typename Fn, typename TupleType>
+auto call(Fn&& fn, TupleType t) {
+  static constexpr auto size = std::tuple_size<TupleType>::value;
+  return call(fn, t, std::make_index_sequence<size>{});
+}
+
+template <typename T, int32_t D, typename Fn>
+void ElementWiseKernel(TensorView<T, D> t, Fn&& fn) {
+  sycl::DeviceManager device_manager;
+  auto* qu = device_manager.GetQueue(t.Device());
+  qu->submit([&](::sycl::handler& cgh) {
+    cgh.parallel_for<>(::sycl::range<1>(t.Size()),
+                       [=](::sycl::id<1> pid) {
+      const size_t idx = pid[0];
+      call(const_cast<Fn&&>(fn), xgboost::linalg::UnravelIndex(idx, t.Shape()));
+    });
+  }).wait_and_throw();
+}
+
 }  // namespace linalg
 }  // namespace sycl
+
+namespace linalg {
+template <typename T, int32_t D, typename Fn>
+void ElementWiseKernel(Context const* ctx, TensorView<T, D> t, Fn&& fn) {
+  if (ctx->IsSycl()) {
+    sycl::linalg::ElementWiseKernel(t, fn);
+  } else {
+    ElementWiseKernelHost(t, ctx->Threads(), fn);
+  }
+}
+
+}  // namespace linalg
 }  // namespace xgboost
 #endif  // PLUGIN_SYCL_COMMON_LINALG_OP_H_
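
The call helpers added above are a hand-rolled std::apply: xgboost::linalg::UnravelIndex returns the multi-dimensional index as a tuple, and call expands that tuple into separate arguments so functors written as fn(i, j) keep working inside the SYCL kernel. A standalone sketch of the same pattern (renamed to Call; plain C++, no SYCL or XGBoost required):

    #include <cstddef>
    #include <iostream>
    #include <tuple>
    #include <utility>

    template <typename Fn, typename TupleType, std::size_t... I>
    auto Call(Fn&& fn, TupleType t, std::index_sequence<I...>) {
      return fn(std::get<I>(t)...);  // expand tuple elements into arguments
    }

    template <typename Fn, typename TupleType>
    auto Call(Fn&& fn, TupleType t) {
      static constexpr auto size = std::tuple_size<TupleType>::value;
      return Call(std::forward<Fn>(fn), t, std::make_index_sequence<size>{});
    }

    int main() {
      auto fn = [](std::size_t i, std::size_t j) { return i * 10 + j; };
      // Simulates calling fn with the unraveled (row, column) index (2, 3).
      std::cout << Call(fn, std::make_tuple(std::size_t{2}, std::size_t{3}))
                << "\n";  // prints 23
    }
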
80 changes: 0 additions & 80 deletions plugin/sycl/objective/regression_obj.cc
@@ -195,86 +195,6 @@ XGBOOST_REGISTER_OBJECTIVE(LogisticRaw,
                   "before logistic transformation with SYCL backend.")
 .set_body([]() { return new RegLossObj<xgboost::obj::LogisticRaw>(); });
 
-class MeanAbsoluteError : public ObjFunction {
- public:
-  void Configure(Args const&) override {}
-
-  ObjInfo Task() const override {
-    return {ObjInfo::kRegression, true, true};
-  }
-
-  bst_target_t Targets(MetaInfo const& info) const override {
-    return std::max(static_cast<std::size_t>(1), info.labels.Shape(1));
-  }
-
-  void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
-                   std::int32_t, xgboost::linalg::Matrix<GradientPair>* out_gpair) override {
-    if (qu_ == nullptr) {
-      qu_ = device_manager.GetQueue(ctx_->Device());
-    }
-
-    size_t const ndata = preds.Size();
-    auto const n_targets = this->Targets(info);
-
-    xgboost::obj::CheckInitInputs(info);
-    CHECK_EQ(info.labels.Size(), preds.Size()) << "Invalid shape of labels.";
-    const bst_float* label_ptr = info.labels.Data()->ConstDevicePointer();
-
-    out_gpair->SetDevice(ctx_->Device());
-    out_gpair->Reshape(info.num_row_, this->Targets(info));
-    GradientPair* out_gpair_ptr = out_gpair->Data()->DevicePointer();
-
-    preds.SetDevice(ctx_->Device());
-    const bst_float* preds_ptr = preds.ConstDevicePointer();
-    auto predt = xgboost::linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
-    info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
-                                                 : info.weights_.ConstDeviceSpan()};
-
-    qu_->submit([&](::sycl::handler& cgh) {
-      cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) {
-        int idx = pid[0];
-        auto sign = [](auto x) {
-          return (x > static_cast<decltype(x)>(0)) - (x < static_cast<decltype(x)>(0));
-        };
-        const bst_float pred = preds_ptr[idx];
-        const bst_float label = label_ptr[idx];
-
-        bst_float hess = weight[idx/n_targets];
-        bst_float grad = sign(pred - label) * hess;
-        out_gpair_ptr[idx] = GradientPair{grad, hess};
-      });
-    });
-    qu_->wait_and_throw();
-  }
-
-  void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
-                      float learning_rate, HostDeviceVector<float> const& prediction,
-                      std::int32_t group_idx, RegTree* p_tree) const override {
-    ::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction, 0.5,
-                                   p_tree);
-  }
-
-  const char* DefaultEvalMetric() const override { return "mae"; }
-
-  void SaveConfig(Json* p_out) const override {
-    auto& out = *p_out;
-    out["name"] = String("reg:absoluteerror");
-  }
-
-  void LoadConfig(Json const& in) override {
-    CHECK_EQ(StringView{get<String const>(in["name"])}, StringView{"reg:absoluteerror"});
-  }
-
- protected:
-  sycl::DeviceManager device_manager;
-  mutable ::sycl::queue* qu_ = nullptr;
-};
-
-XGBOOST_REGISTER_OBJECTIVE(MeanAbsoluteError, "reg:absoluteerror_sycl")
-    .describe("Mean absoluate error.")
-    .set_body([]() { return new MeanAbsoluteError(); });
-
 }  // namespace obj
 }  // namespace sycl
 }  // namespace xgboost
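
With the duplicate deleted, reg:absoluteerror on a SYCL device resolves to the unified implementation in src/objective/regression_obj.cu, compiled for SYCL via the CMake changes in this commit. A hedged creation sketch using only the public C++ API (the exact parameter plumbing may differ):

    #include <memory>

    #include "xgboost/base.h"       // for Args
    #include "xgboost/context.h"    // for Context
    #include "xgboost/objective.h"  // for ObjFunction

    int main() {
      xgboost::Context ctx;
      ctx.UpdateAllowUnknown(xgboost::Args{{"device", "sycl"}});
      // One registration now serves every device; the "reg:absoluteerror_sycl"
      // duplicate deleted above is no longer needed.
      std::unique_ptr<xgboost::ObjFunction> obj{
          xgboost::ObjFunction::Create("reg:absoluteerror", &ctx)};
    }
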
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
@@ -1,6 +1,12 @@
 file(GLOB_RECURSE CPU_SOURCES *.cc *.h)
 list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/cli_main.cc)
 
+if(PLUGIN_SYCL)
+  list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/objective/regression_obj.cc)
+  list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/objective/hinge.cc)
+  list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/objective/quantile_obj.cc)
+endif()
+
 #-- Object library
 # Object library is necessary for jvm-package, which creates its own shared library.
 add_library(objxgboost OBJECT)
4 changes: 2 additions & 2 deletions src/common/linalg_op.h
@@ -48,7 +48,7 @@ void ElementWiseKernelHost(linalg::TensorView<T, D> t, std::int32_t n_threads, F
   }
 }
 
-#if !defined(XGBOOST_USE_CUDA)
+#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_SYCL)
 template <typename T, int32_t D, typename Fn>
 void ElementWiseKernelDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr) {
   common::AssertGPUSupport();
@@ -66,7 +66,7 @@ void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn)
   }
   ElementWiseKernelHost(t, ctx->Threads(), fn);
 }
-#endif  // !defined(XGBOOST_USE_CUDA)
+#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_SYCL)
 
 template <typename T, std::int32_t kDim>
 auto cbegin(TensorView<T, kDim> const& v) {  // NOLINT
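
The widened guard prevents a redefinition: with SYCL enabled, the plugin header above supplies xgboost::linalg::ElementWiseKernel, so the generic fallback here must drop out of the build. A simplified, self-contained illustration of the pattern (the macro and function names are placeholders, not XGBoost's):

    #include <cstdio>

    // Exactly one definition survives preprocessing, mirroring how the CUDA
    // and SYCL headers each provide their own ElementWiseKernel overload.
    #if defined(USE_SYCL)
    void Kernel() { std::puts("SYCL dispatch from the plugin header"); }
    #elif defined(USE_CUDA)
    void Kernel() { std::puts("CUDA dispatch from linalg_op.cuh"); }
    #else
    void Kernel() { std::puts("host-only fallback"); }
    #endif

    int main() { Kernel(); }
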
18 changes: 11 additions & 7 deletions src/metric/elementwise_metric.cu
@@ -45,7 +45,9 @@ namespace {
 template <typename Fn>
 PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
   PackedReduceResult result;
-  auto labels = info.labels.View(ctx->Device());
+  // This function doesn't have a SYCL-specific implementation yet,
+  // so when SYCL is used the data is moved to the host for proper execution.
+  auto labels = info.labels.View(ctx->Device().IsSycl() ? DeviceOrd::CPU() : ctx->Device());
   if (ctx->IsCUDA()) {
 #if defined(XGBOOST_USE_CUDA)
     thrust::counting_iterator<size_t> begin(0);
@@ -182,10 +184,11 @@ class PseudoErrorLoss : public MetricNoCache {
 
   double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
     CHECK_EQ(info.labels.Shape(0), info.num_row_);
-    auto labels = info.labels.View(ctx_->Device());
-    preds.SetDevice(ctx_->Device());
+    auto device = ctx_->Device().IsSycl() ? DeviceOrd::CPU() : ctx_->Device();
+    auto labels = info.labels.View(device);
+    preds.SetDevice(device);
     auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
-    info.weights_.SetDevice(ctx_->Device());
+    info.weights_.SetDevice(device);
     common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
                                                    : info.weights_.ConstHostSpan());
     float slope = this->param_.huber_slope;
@@ -349,11 +352,12 @@ struct EvalEWiseBase : public MetricNoCache {
     if (info.labels.Size() != 0) {
       CHECK_NE(info.labels.Shape(1), 0);
     }
-    auto labels = info.labels.View(ctx_->Device());
-    info.weights_.SetDevice(ctx_->Device());
+    auto device = ctx_->Device().IsSycl() ? DeviceOrd::CPU() : ctx_->Device();
+    auto labels = info.labels.View(device);
+    info.weights_.SetDevice(device);
     common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
                                                    : info.weights_.ConstHostSpan());
-    preds.SetDevice(ctx_->Device());
+    preds.SetDevice(device);
     auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
 
     auto d_policy = policy_;
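
The same is-SYCL-then-host fallback appears three times in this file and again in the CPU predictor below; a small helper would capture it. A hedged sketch (FallbackDevice is our name, not XGBoost's; the types and calls are taken from the diff itself):

    #include "xgboost/context.h"  // for Context, DeviceOrd

    namespace {
    // Metrics have no SYCL kernels yet, so SYCL contexts read labels, weights,
    // and predictions through host views until one lands.
    xgboost::DeviceOrd FallbackDevice(xgboost::Context const* ctx) {
      return ctx->Device().IsSycl() ? xgboost::DeviceOrd::CPU() : ctx->Device();
    }
    }  // namespace
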
7 changes: 5 additions & 2 deletions src/objective/hinge.cu
@@ -12,6 +12,9 @@
 #if defined(XGBOOST_USE_CUDA)
 #include "../common/linalg_op.cuh"
 #endif
+#if defined(XGBOOST_USE_SYCL)
+#include "../../plugin/sycl/common/linalg_op.h"
+#endif
 #include "../common/linalg_op.h"
 #include "../common/optional_weight.h"  // for OptionalWeights
 #include "../common/transform.h"        // for Transform
@@ -58,8 +61,8 @@ class HingeObj : public FitIntercept {
     auto labels = info.labels.View(ctx_->Device());
 
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
-                                                  : info.weights_.ConstHostSpan()};
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                 : info.weights_.ConstDeviceSpan()};
 
     linalg::ElementWiseKernel(this->ctx_, labels,
                               [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
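
The flipped predicate is not cosmetic: on a SYCL device IsCUDA() is false, so the old code handed host weight spans to a kernel that now runs on the device. The same inversion recurs in quantile_obj.cu and regression_obj.cu below. A self-contained before/after check (the Ctx stand-in is ours, simplified from xgboost::Context):

    #include <cassert>

    enum class Dev { kCPU, kCUDA, kSycl };
    struct Ctx {
      Dev dev;
      bool IsCPU() const { return dev == Dev::kCPU; }
      bool IsCUDA() const { return dev == Dev::kCUDA; }
    };

    // Whether the device span is chosen, under the old and new predicates.
    bool OldUsesDeviceSpan(Ctx const& c) { return c.IsCUDA(); }
    bool NewUsesDeviceSpan(Ctx const& c) { return !c.IsCPU(); }

    int main() {
      Ctx sycl{Dev::kSycl};
      assert(!OldUsesDeviceSpan(sycl));  // old: host span on a SYCL device
      assert(NewUsesDeviceSpan(sycl));   // new: device span, as the kernel expects
    }
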
1 change: 0 additions & 1 deletion src/objective/objective.cc
@@ -48,7 +48,6 @@ std::string ObjFunction::GetSyclImplementationName(const std::string& name) {
     return name + sycl_postfix;
   } else {
     // Function doesn't have a specific sycl implementation
-    LOG(FATAL) << "`" << name << "` doesn't have sycl implementation yet\n";
     return name;
   }
 }
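
Dropping the LOG(FATAL) turns a missing _sycl variant from a hard error into a fallback onto the now-unified implementation. A hedged sketch of the resulting lookup (rewritten as a free function with an explicit registry; the real method consults XGBoost's objective registry):

    #include <set>
    #include <string>

    std::string GetSyclImplementationName(std::string const& name,
                                          std::set<std::string> const& registry) {
      std::string const sycl_postfix = "_sycl";
      if (registry.count(name + sycl_postfix) > 0) {
        return name + sycl_postfix;  // a dedicated SYCL objective exists
      }
      // Previously LOG(FATAL); now the unified objective handles the device.
      return name;
    }
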
10 changes: 7 additions & 3 deletions src/objective/quantile_obj.cu
@@ -27,6 +27,10 @@
 
 #endif  // defined(XGBOOST_USE_CUDA)
 
+#if defined(XGBOOST_USE_SYCL)
+#include "../../plugin/sycl/common/linalg_op.h"  // ElementWiseKernel
+#endif
+
 namespace xgboost::obj {
 class QuantileRegression : public ObjFunction {
   common::QuantileLossParam param_;
@@ -71,14 +75,14 @@ class QuantileRegression : public ObjFunction {
     auto gpair = out_gpair->View(ctx_->Device());
 
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
-                                                  : info.weights_.ConstHostSpan()};
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                 : info.weights_.ConstDeviceSpan()};
 
     preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, n_targets);
 
     alpha_.SetDevice(ctx_->Device());
-    auto alpha = ctx_->IsCUDA() ? alpha_.ConstDeviceSpan() : alpha_.ConstHostSpan();
+    auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();
 
     linalg::ElementWiseKernel(ctx_, gpair,
                               [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
12 changes: 8 additions & 4 deletions src/objective/regression_obj.cu
@@ -43,6 +43,10 @@
 #include "../common/linalg_op.cuh"
 #endif  // defined(XGBOOST_USE_CUDA)
 
+#if defined(XGBOOST_USE_SYCL)
+#include "../../plugin/sycl/common/linalg_op.h"
+#endif
+
 namespace xgboost::obj {
 namespace {
 void CheckRegInputs(MetaInfo const& info, HostDeviceVector<bst_float> const& preds) {
@@ -253,8 +257,8 @@ class PseudoHuberRegression : public FitIntercept {
     auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
 
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
-                                                  : info.weights_.ConstHostSpan()};
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                 : info.weights_.ConstDeviceSpan()};
 
     linalg::ElementWiseKernel(
         ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
@@ -632,8 +636,8 @@ class MeanAbsoluteError : public ObjFunction {
     preds.SetDevice(ctx_->Device());
     auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
     info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
-                                                  : info.weights_.ConstHostSpan()};
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                 : info.weights_.ConstDeviceSpan()};
 
     linalg::ElementWiseKernel(
         ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
5 changes: 3 additions & 2 deletions src/predictor/cpu_predictor.cc
@@ -720,8 +720,9 @@ class CPUPredictor : public Predictor {
     CHECK_NE(ngroup, 0);
     size_t const ncolumns = num_feature + 1;
     CHECK_NE(ncolumns, 0);
-    auto base_margin = info.base_margin_.View(ctx_->Device());
-    auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0);
+    auto device = ctx_->Device().IsSycl() ? DeviceOrd::CPU() : ctx_->Device();
+    auto base_margin = info.base_margin_.View(device);
+    auto base_score = model.learner_model_param->BaseScore(device)(0);
 
     // parallel over local batch
     common::ParallelFor(batch.Size(), this->ctx_->Threads(), [&](auto i) {
7 changes: 4 additions & 3 deletions tests/cpp/objective/test_hinge.cc
@@ -6,11 +6,12 @@
 #include <limits>
 
 #include "../helpers.h"
+#include "test_hinge.h"
 #include "../../../src/common/linalg_op.h"
 namespace xgboost {
-TEST(Objective, DeclareUnifiedTest(HingeObj)) {
-  Context ctx = MakeCUDACtx(GPUIDX);
-  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("binary:hinge", &ctx)};
+
+void TestHingeObj(const Context* ctx) {
+  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("binary:hinge", ctx)};
 
   float eps = std::numeric_limits<xgboost::bst_float>::min();
   std::vector<float> predt{-1.0f, -0.5f, 0.5f, 1.0f, -1.0f, -0.5f, 0.5f, 1.0f};
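
Extracting the TEST body into TestHingeObj (declared in the new test_hinge.h) lets each backend provide a thin driver around the shared logic. A hedged sketch of what such a driver looks like (the actual renamed test files may differ):

    #include <gtest/gtest.h>

    #include "../helpers.h"  // for MakeCUDACtx, GPUIDX, DeclareUnifiedTest
    #include "test_hinge.h"  // for TestHingeObj

    namespace xgboost {
    TEST(Objective, DeclareUnifiedTest(HingeObj)) {
      Context ctx = MakeCUDACtx(GPUIDX);  // CPU context when GPUIDX is -1
      TestHingeObj(&ctx);
    }
    }  // namespace xgboost
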
2 changes: 1 addition & 1 deletion tests/cpp/objective/test_hinge.cu
@@ -1 +1 @@
-#include "test_hinge.cc"
+#include "test_hinge_cpu.cc"