
SYCL. Unify calculations for objectives reg:absoluteerror, reg:quantileerror, binary:hinge #10993

Merged: 12 commits, Nov 21, 2024
16 changes: 8 additions & 8 deletions include/xgboost/linalg.h
@@ -855,22 +855,22 @@ class Tensor {
* @brief Get a @ref TensorView for this tensor.
*/
  auto View(DeviceOrd device) {
-   if (device.IsCUDA()) {
-     data_.SetDevice(device);
-     auto span = data_.DeviceSpan();
+   if (device.IsCPU()) {
+     auto span = data_.HostSpan();
      return TensorView<T, kDim>{span, shape_, device, order_};
    } else {
-     auto span = data_.HostSpan();
+     data_.SetDevice(device);
+     auto span = data_.DeviceSpan();
      return TensorView<T, kDim>{span, shape_, device, order_};
    }
  }
  auto View(DeviceOrd device) const {
-   if (device.IsCUDA()) {
-     data_.SetDevice(device);
-     auto span = data_.ConstDeviceSpan();
+   if (device.IsCPU()) {
+     auto span = data_.ConstHostSpan();
      return TensorView<T const, kDim>{span, shape_, device, order_};
    } else {
-     auto span = data_.ConstHostSpan();
+     data_.SetDevice(device);
+     auto span = data_.ConstDeviceSpan();
      return TensorView<T const, kDim>{span, shape_, device, order_};
    }
  }
4 changes: 4 additions & 0 deletions plugin/CMakeLists.txt
@@ -1,6 +1,10 @@
if(PLUGIN_SYCL)
set(CMAKE_CXX_COMPILER "icpx")
file(GLOB_RECURSE SYCL_SOURCES "sycl/*.cc")
list(APPEND SYCL_SOURCES
${xgboost_SOURCE_DIR}/src/objective/regression_obj.cc
${xgboost_SOURCE_DIR}/src/objective/hinge.cc
${xgboost_SOURCE_DIR}/src/objective/quantile_obj.cc)
add_library(plugin_sycl OBJECT ${SYCL_SOURCES})
target_include_directories(plugin_sycl
PRIVATE
41 changes: 41 additions & 0 deletions plugin/sycl/common/linalg_op.h
@@ -9,13 +9,17 @@
#include <utility>

#include "../data.h"
#include "../device_manager.h"

#include <sycl/sycl.hpp>

namespace xgboost {
namespace sycl {
namespace linalg {

template<typename T, std::int32_t D>
using TensorView = xgboost::linalg::TensorView<T, D>;

struct WorkGroupsParams {
size_t n_workgroups;
size_t workgroup_size;
@@ -40,7 +44,44 @@ ::sycl::event GroupWiseKernel(::sycl::queue* qu, int* flag_ptr,
});
return event;
}

template<typename Fn, typename TupleType, size_t ... I>
auto call(Fn&& fn, TupleType t, std::index_sequence<I ...>) {
return fn(std::get<I>(t) ...);
}

template<typename Fn, typename TupleType>
auto call(Fn&& fn, TupleType t) {
static constexpr auto size = std::tuple_size<TupleType>::value;
return call(fn, t, std::make_index_sequence<size>{});
}
Comment on lines +48 to +57

Member:
Isn't this just std::apply?

Contributor Author:
It is, but std::apply cannot be executed inside a SYCL kernel.
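As a standalone illustration of that point, here is a minimal sketch of the same index_sequence-based unpacking; the names Call, fn, and idx are illustrative only and are not part of the PR:

// Minimal sketch of the tuple-unpacking helper above (names are illustrative,
// not from the PR). It expands a tuple of indices into separate arguments,
// like std::apply, but is written so it can also be instantiated inside a
// device kernel where std::apply may not be usable.
#include <cstddef>
#include <tuple>
#include <utility>

template <typename Fn, typename TupleType, std::size_t... I>
auto Call(Fn&& fn, TupleType t, std::index_sequence<I...>) {
  return fn(std::get<I>(t)...);
}

template <typename Fn, typename TupleType>
auto Call(Fn&& fn, TupleType t) {
  constexpr auto size = std::tuple_size<TupleType>::value;
  return Call(std::forward<Fn>(fn), t, std::make_index_sequence<size>{});
}

int main() {
  // A functor with the (i, j) signature used by the element-wise kernels.
  auto fn = [](std::size_t i, std::size_t j) { return i * 10 + j; };
  // UnravelIndex would produce such an index tuple from a flat index and a shape.
  std::tuple<std::size_t, std::size_t> idx{3, 7};
  return Call(fn, idx) == 37 ? 0 : 1;
}

Inside ElementWiseKernel below, the same mechanism expands the tuple produced by xgboost::linalg::UnravelIndex into the (i, j, ...) arguments of the user-supplied functor.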


template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(TensorView<T, D> t, Fn&& fn) {
sycl::DeviceManager device_manager;
auto* qu = device_manager.GetQueue(t.Device());
qu->submit([&](::sycl::handler& cgh) {
cgh.parallel_for<>(::sycl::range<1>(t.Size()),
[=](::sycl::id<1> pid) {
const size_t idx = pid[0];
call(const_cast<Fn&&>(fn), xgboost::linalg::UnravelIndex(idx, t.Shape()));
});
}).wait_and_throw();
}

} // namespace linalg
} // namespace sycl

namespace linalg {
template <typename T, int32_t D, typename Fn>
void ElementWiseKernel(Context const* ctx, TensorView<T, D> t, Fn&& fn) {
if (ctx->IsSycl()) {
sycl::linalg::ElementWiseKernel(t, fn);
} else {
ElementWiseKernelHost(t, ctx->Threads(), fn);
}
}

} // namespace linalg
} // namespace xgboost
#endif // PLUGIN_SYCL_COMMON_LINALG_OP_H_
80 changes: 0 additions & 80 deletions plugin/sycl/objective/regression_obj.cc
@@ -195,86 +195,6 @@ XGBOOST_REGISTER_OBJECTIVE(LogisticRaw,
"before logistic transformation with SYCL backend.")
.set_body([]() { return new RegLossObj<xgboost::obj::LogisticRaw>(); });

class MeanAbsoluteError : public ObjFunction {
public:
void Configure(Args const&) override {}

ObjInfo Task() const override {
return {ObjInfo::kRegression, true, true};
}

bst_target_t Targets(MetaInfo const& info) const override {
return std::max(static_cast<std::size_t>(1), info.labels.Shape(1));
}

void GetGradient(HostDeviceVector<float> const& preds, const MetaInfo& info,
std::int32_t, xgboost::linalg::Matrix<GradientPair>* out_gpair) override {
if (qu_ == nullptr) {
qu_ = device_manager.GetQueue(ctx_->Device());
}

size_t const ndata = preds.Size();
auto const n_targets = this->Targets(info);

xgboost::obj::CheckInitInputs(info);
CHECK_EQ(info.labels.Size(), preds.Size()) << "Invalid shape of labels.";
const bst_float* label_ptr = info.labels.Data()->ConstDevicePointer();

out_gpair->SetDevice(ctx_->Device());
out_gpair->Reshape(info.num_row_, this->Targets(info));
GradientPair* out_gpair_ptr = out_gpair->Data()->DevicePointer();

preds.SetDevice(ctx_->Device());
const bst_float* preds_ptr = preds.ConstDevicePointer();
auto predt = xgboost::linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
info.weights_.SetDevice(ctx_->Device());
common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
: info.weights_.ConstDeviceSpan()};

qu_->submit([&](::sycl::handler& cgh) {
cgh.parallel_for<>(::sycl::range<1>(ndata), [=](::sycl::id<1> pid) {
int idx = pid[0];
auto sign = [](auto x) {
return (x > static_cast<decltype(x)>(0)) - (x < static_cast<decltype(x)>(0));
};
const bst_float pred = preds_ptr[idx];
const bst_float label = label_ptr[idx];

bst_float hess = weight[idx/n_targets];
bst_float grad = sign(pred - label) * hess;
out_gpair_ptr[idx] = GradientPair{grad, hess};
});
});
qu_->wait_and_throw();
}

void UpdateTreeLeaf(HostDeviceVector<bst_node_t> const& position, MetaInfo const& info,
float learning_rate, HostDeviceVector<float> const& prediction,
std::int32_t group_idx, RegTree* p_tree) const override {
::xgboost::obj::UpdateTreeLeaf(ctx_, position, group_idx, info, learning_rate, prediction, 0.5,
p_tree);
}

const char* DefaultEvalMetric() const override { return "mae"; }

void SaveConfig(Json* p_out) const override {
auto& out = *p_out;
out["name"] = String("reg:absoluteerror");
}

void LoadConfig(Json const& in) override {
CHECK_EQ(StringView{get<String const>(in["name"])}, StringView{"reg:absoluteerror"});
}

protected:
sycl::DeviceManager device_manager;
mutable ::sycl::queue* qu_ = nullptr;
};

XGBOOST_REGISTER_OBJECTIVE(MeanAbsoluteError, "reg:absoluteerror_sycl")
.describe("Mean absoluate error.")
.set_body([]() { return new MeanAbsoluteError(); });

} // namespace obj
} // namespace sycl
} // namespace xgboost
6 changes: 6 additions & 0 deletions src/CMakeLists.txt
@@ -1,6 +1,12 @@
file(GLOB_RECURSE CPU_SOURCES *.cc *.h)
list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/cli_main.cc)

if(PLUGIN_SYCL)
list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/objective/regression_obj.cc)
list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/objective/hinge.cc)
list(REMOVE_ITEM CPU_SOURCES ${xgboost_SOURCE_DIR}/src/objective/quantile_obj.cc)
endif()

#-- Object library
# Object library is necessary for jvm-package, which creates its own shared library.
add_library(objxgboost OBJECT)
2 changes: 2 additions & 0 deletions src/common/linalg_op.h
@@ -49,6 +49,7 @@ void ElementWiseKernelHost(linalg::TensorView<T, D> t, std::int32_t n_threads, F
}

#if !defined(XGBOOST_USE_CUDA)
#if !defined(XGBOOST_USE_SYCL)
Member:
#if !(use_cuda or use_sycl)

Contributor Author:
done
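For reference, a sketch of what the merged guard presumably looks like after that change, reusing the fallback stub shown in the diff below (this assumes the stub body itself is unchanged):

// Host-only fallback, compiled only when neither device backend is enabled.
#if !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_SYCL)
template <typename T, std::int32_t D, typename Fn>
void ElementWiseKernelDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr) {
  common::AssertGPUSupport();
}
#endif  // !defined(XGBOOST_USE_CUDA) && !defined(XGBOOST_USE_SYCL)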

template <typename T, int32_t D, typename Fn>
void ElementWiseKernelDevice(linalg::TensorView<T, D>, Fn&&, void* = nullptr) {
common::AssertGPUSupport();
@@ -66,6 +67,7 @@ void ElementWiseKernel(Context const* ctx, linalg::TensorView<T, D> t, Fn&& fn)
}
ElementWiseKernelHost(t, ctx->Threads(), fn);
}
#endif // !defined(XGBOOST_USE_SYCL)
#endif // !defined(XGBOOST_USE_CUDA)

template <typename T, std::int32_t kDim>
16 changes: 9 additions & 7 deletions src/metric/elementwise_metric.cu
@@ -45,7 +45,7 @@ namespace {
template <typename Fn>
PackedReduceResult Reduce(Context const* ctx, MetaInfo const& info, Fn&& loss) {
PackedReduceResult result;
-  auto labels = info.labels.View(ctx->Device());
+  auto labels = info.labels.View(ctx->Device().IsSycl() ? DeviceOrd::CPU() : ctx->Device());
Member:
I assume you want to use IsCPU instead of IsSycl?

Contributor Author:
No. This function doesn't have a SYCL-specific implementation yet, so for proper execution the data has to be transferred back to the host and processed on the CPU. I plan to add a SYCL implementation in the future; I didn't include it here to keep the PR size readable.
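A minimal sketch of that fallback idiom as it appears throughout this PR's metric and predictor changes; the helper name KernelDeviceOrFallback is hypothetical, the real code simply inlines the ternary expression:

#include <xgboost/context.h>  // for DeviceOrd

// Hypothetical helper illustrating the pattern: kernels that have no SYCL
// implementation yet take their data views on the host, so a SYCL device
// ordinal falls back to the CPU ordinal.
inline xgboost::DeviceOrd KernelDeviceOrFallback(xgboost::DeviceOrd device) {
  return device.IsSycl() ? xgboost::DeviceOrd::CPU() : device;
}

// Usage mirroring the diff: auto device = KernelDeviceOrFallback(ctx->Device());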

if (ctx->IsCUDA()) {
#if defined(XGBOOST_USE_CUDA)
thrust::counting_iterator<size_t> begin(0);
@@ -182,10 +182,11 @@ class PseudoErrorLoss : public MetricNoCache {

double Eval(const HostDeviceVector<bst_float>& preds, const MetaInfo& info) override {
CHECK_EQ(info.labels.Shape(0), info.num_row_);
-    auto labels = info.labels.View(ctx_->Device());
-    preds.SetDevice(ctx_->Device());
+    auto device = ctx_->Device().IsSycl() ? DeviceOrd::CPU() : ctx_->Device();
+    auto labels = info.labels.View(device);
+    preds.SetDevice(device);
     auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();
-    info.weights_.SetDevice(ctx_->Device());
+    info.weights_.SetDevice(device);
common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
: info.weights_.ConstHostSpan());
float slope = this->param_.huber_slope;
@@ -349,11 +350,12 @@ struct EvalEWiseBase : public MetricNoCache {
if (info.labels.Size() != 0) {
CHECK_NE(info.labels.Shape(1), 0);
}
-    auto labels = info.labels.View(ctx_->Device());
-    info.weights_.SetDevice(ctx_->Device());
+    auto device = ctx_->Device().IsSycl() ? DeviceOrd::CPU() : ctx_->Device();
+    auto labels = info.labels.View(device);
+    info.weights_.SetDevice(device);
     common::OptionalWeights weights(ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
                                                    : info.weights_.ConstHostSpan());
-    preds.SetDevice(ctx_->Device());
+    preds.SetDevice(device);
auto predts = ctx_->IsCUDA() ? preds.ConstDeviceSpan() : preds.ConstHostSpan();

auto d_policy = policy_;
7 changes: 5 additions & 2 deletions src/objective/hinge.cu
@@ -12,6 +12,9 @@
#if defined(XGBOOST_USE_CUDA)
#include "../common/linalg_op.cuh"
#endif
#if defined(XGBOOST_USE_SYCL)
#include "../../plugin/sycl/common/linalg_op.h"
#endif
#include "../common/linalg_op.h"
#include "../common/optional_weight.h" // for OptionalWeights
#include "../common/transform.h" // for Transform
@@ -58,8 +61,8 @@ class HingeObj : public FitIntercept {
auto labels = info.labels.View(ctx_->Device());

info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
-                                                   : info.weights_.ConstHostSpan()};
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                  : info.weights_.ConstDeviceSpan()};

linalg::ElementWiseKernel(this->ctx_, labels,
[=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
2 changes: 1 addition & 1 deletion src/objective/objective.cc
@@ -48,7 +48,7 @@ std::string ObjFunction::GetSyclImplementationName(const std::string& name) {
return name + sycl_postfix;
} else {
// Function hasn't specific sycl implementation
-    LOG(FATAL) << "`" << name << "` doesn't have sycl implementation yet\n";
+    // LOG(FATAL) << "`" << name << "` doesn't have sycl implementation yet\n";
Member:
Please remove the commented code.

Contributor Author:
done

return name;
}
}
10 changes: 7 additions & 3 deletions src/objective/quantile_obj.cu
@@ -27,6 +27,10 @@

#endif // defined(XGBOOST_USE_CUDA)

#if defined(XGBOOST_USE_SYCL)
#include "../../plugin/sycl/common/linalg_op.h" // ElementWiseKernel
#endif

namespace xgboost::obj {
class QuantileRegression : public ObjFunction {
common::QuantileLossParam param_;
@@ -71,14 +75,14 @@ class QuantileRegression : public ObjFunction {
auto gpair = out_gpair->View(ctx_->Device());

info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
-                                                   : info.weights_.ConstHostSpan()};
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                  : info.weights_.ConstDeviceSpan()};

preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, n_targets);

alpha_.SetDevice(ctx_->Device());
-    auto alpha = ctx_->IsCUDA() ? alpha_.ConstDeviceSpan() : alpha_.ConstHostSpan();
+    auto alpha = ctx_->IsCPU() ? alpha_.ConstHostSpan() : alpha_.ConstDeviceSpan();

linalg::ElementWiseKernel(ctx_, gpair,
[=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
12 changes: 8 additions & 4 deletions src/objective/regression_obj.cu
@@ -43,6 +43,10 @@
#include "../common/linalg_op.cuh"
#endif // defined(XGBOOST_USE_CUDA)

#if defined(XGBOOST_USE_SYCL)
#include "../../plugin/sycl/common/linalg_op.h"
#endif

namespace xgboost::obj {
namespace {
void CheckRegInputs(MetaInfo const& info, HostDeviceVector<bst_float> const& preds) {
@@ -253,8 +257,8 @@ class PseudoHuberRegression : public FitIntercept {
auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));

info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
-                                                   : info.weights_.ConstHostSpan()};
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                  : info.weights_.ConstDeviceSpan()};

linalg::ElementWiseKernel(
ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
@@ -632,8 +636,8 @@ class MeanAbsoluteError : public ObjFunction {
preds.SetDevice(ctx_->Device());
auto predt = linalg::MakeTensorView(ctx_, &preds, info.num_row_, this->Targets(info));
info.weights_.SetDevice(ctx_->Device());
-    common::OptionalWeights weight{ctx_->IsCUDA() ? info.weights_.ConstDeviceSpan()
-                                                   : info.weights_.ConstHostSpan()};
+    common::OptionalWeights weight{ctx_->IsCPU() ? info.weights_.ConstHostSpan()
+                                                  : info.weights_.ConstDeviceSpan()};

linalg::ElementWiseKernel(
ctx_, labels, [=] XGBOOST_DEVICE(std::size_t i, std::size_t j) mutable {
5 changes: 3 additions & 2 deletions src/predictor/cpu_predictor.cc
@@ -720,8 +720,9 @@ class CPUPredictor : public Predictor {
CHECK_NE(ngroup, 0);
size_t const ncolumns = num_feature + 1;
CHECK_NE(ncolumns, 0);
-    auto base_margin = info.base_margin_.View(ctx_->Device());
-    auto base_score = model.learner_model_param->BaseScore(ctx_->Device())(0);
+    auto device = ctx_->Device().IsSycl() ? DeviceOrd::CPU() : ctx_->Device();
+    auto base_margin = info.base_margin_.View(device);
+    auto base_score = model.learner_model_param->BaseScore(device)(0);

// parallel over local batch
common::ParallelFor(batch.Size(), this->ctx_->Threads(), [&](auto i) {
7 changes: 4 additions & 3 deletions tests/cpp/objective/test_hinge.cc
@@ -6,11 +6,12 @@
#include <limits>

#include "../helpers.h"
#include "test_hinge.h"
#include "../../../src/common/linalg_op.h"
namespace xgboost {
-TEST(Objective, DeclareUnifiedTest(HingeObj)) {
-  Context ctx = MakeCUDACtx(GPUIDX);
-  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("binary:hinge", &ctx)};
+
+void TestHingeObj(const Context* ctx) {
+  std::unique_ptr<ObjFunction> obj{ObjFunction::Create("binary:hinge", ctx)};

float eps = std::numeric_limits<xgboost::bst_float>::min();
std::vector<float> predt{-1.0f, -0.5f, 0.5f, 1.0f, -1.0f, -0.5f, 0.5f, 1.0f};
15 changes: 15 additions & 0 deletions tests/cpp/objective/test_hinge.h
@@ -0,0 +1,15 @@
/**
* Copyright 2020-2024 by XGBoost Contributors
*/
#ifndef XGBOOST_TEST_HINGE_H_
#define XGBOOST_TEST_HINGE_H_

#include <xgboost/context.h> // for Context

namespace xgboost {

void TestHingeObj(const Context* ctx);

} // namespace xgboost

#endif  // XGBOOST_TEST_HINGE_H_