Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: forest rework #2799

Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions WORKSPACE
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,12 @@ ccl_repo(
],
)

load("@onedal//dev/bazel/deps:dpl.bzl", "dpl_repo")
dpl_repo(
name = "dpl",
root_env_var = "DPL_ROOT",
)

load("@onedal//dev/bazel/deps:mkl.bzl", "mkl_repo")
mkl_repo(
name = "mkl",
Expand Down
2 changes: 2 additions & 0 deletions cpp/daal/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -130,6 +130,8 @@ daal_module(
"@config//:backend_ref": [
":services",
"@onedal//cpp/daal/src/algorithms/engines:kernel",
"@micromkl_dpc//:headers",
"@dpl//:headers",
],
"//conditions:default": [
":services",
Expand Down
1 change: 1 addition & 0 deletions cpp/oneapi/dal/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ dal_module(
"@onedal//cpp/daal:data_management",
],
dpc_deps = [
"@dpl//:headers",
"@micromkl_dpc//:mkl_dpc",
],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,11 @@ void infer_kernel_impl<Float, Index, Task>::validate_input(const descriptor_t& d
if (data.get_row_count() > de::limits<Index>::max()) {
throw domain_error(dal::detail::error_messages::invalid_range_of_rows());
}

if (data.get_column_count() > de::limits<Index>::max()) {
throw domain_error(dal::detail::error_messages::invalid_range_of_columns());
}

if (model.get_tree_count() > de::limits<Index>::max()) {
throw domain_error(dal::detail::error_messages::invalid_number_of_trees());
}
Expand All @@ -67,6 +69,7 @@ void infer_kernel_impl<Float, Index, Task>::init_params(infer_context_t& ctx,
ctx.class_count = de::integral_cast<Index>(desc.get_class_count());
ctx.voting_mode = desc.get_voting_mode();
}

ctx.row_count = de::integral_cast<Index>(data.get_row_count());
ctx.column_count = de::integral_cast<Index>(data.get_column_count());

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,7 @@ class indexed_features {
const pr::ndarray<Index, 1>& bin_offsets_nd,
pr::ndarray<Float, 1>& bin_borders_nd,
Index max_bins,
pr::ndarray<Index, 1>& unique_offsets_nd,
const bk::event_vector& deps = {});

std::tuple<pr::ndarray<Float, 1>, Index, sycl::event> gather_bin_borders(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,12 @@ namespace de = dal::detail;
namespace bk = dal::backend;
namespace pr = dal::backend::primitives;

template <typename Float>
std::int64_t propose_block_size(const sycl::queue& q, const std::int64_t r) {
constexpr std::int64_t fsize = sizeof(Float);
return 0x10000l * (8 / fsize);
}

template <typename Float, typename Index>
inline sycl::event sort_inplace(sycl::queue& queue_,
pr::ndarray<Float, 1>& src,
Expand Down Expand Up @@ -56,18 +62,29 @@ sycl::event indexed_features<Float, Bin, Index>::extract_column(
Float* values = values_nd.get_mutable_data();
Index* indices = indices_nd.get_mutable_data();
auto column_count = column_count_;

const sycl::range<1> range = de::integral_cast<std::size_t>(row_count_);

auto event = queue_.submit([&](sycl::handler& h) {
h.depends_on(deps);
h.parallel_for(range, [=](sycl::id<1> idx) {
values[idx] = data[idx * column_count + feature_id];
indices[idx] = idx;
const auto block_size = propose_block_size<Float>(queue_, row_count_);
const bk::uniform_blocking blocking(row_count_, block_size);

std::vector<sycl::event> events(blocking.get_block_count());
for (std::int64_t block_index = 0; block_index < blocking.get_block_count(); ++block_index) {
const auto first_row = blocking.get_block_start_index(block_index);
const auto last_row = blocking.get_block_end_index(block_index);
const auto curr_block = last_row - first_row;
ONEDAL_ASSERT(curr_block > 0);

auto event = queue_.submit([&](sycl::handler& cgh) {
cgh.depends_on(deps);
cgh.parallel_for<>(de::integral_cast<std::size_t>(curr_block), [=](sycl::id<1> idx) {
const std::int64_t row = idx + first_row;

values[row] = data[row * column_count + feature_id];
indices[row] = row;
});
});
});

return event;
events.push_back(event);
}
return bk::wait_or_pass(events);
}

template <typename Float, typename Bin, typename Index>
Expand All @@ -77,6 +94,7 @@ sycl::event indexed_features<Float, Bin, Index>::collect_bin_borders(
const pr::ndarray<Index, 1>& bin_offsets_nd,
pr::ndarray<Float, 1>& bin_borders_nd,
Index max_bins,
pr::ndarray<Index, 1>& unique_nd,
const bk::event_vector& deps) {
ONEDAL_ASSERT(values_nd.get_count() == row_count);
ONEDAL_ASSERT(bin_offsets_nd.get_count() == max_bins);
Expand All @@ -87,11 +105,22 @@ sycl::event indexed_features<Float, Bin, Index>::collect_bin_borders(
const Float* values = values_nd.get_data();
const Index* bin_offsets = bin_offsets_nd.get_data();
Float* bin_borders = bin_borders_nd.get_mutable_data();

Index* unique_bin_count = unique_nd.get_mutable_data();
auto event = queue_.submit([&](sycl::handler& cgh) {
cgh.depends_on(deps);
cgh.parallel_for(range, [=](sycl::id<1> idx) {
bin_borders[idx] = values[bin_offsets[idx]];
Float bin_border_value = values[bin_offsets[idx]];

// Filtering unique bin borders
if (idx == 0 || bin_border_value != bin_borders[idx - 1]) {
bin_borders[idx] = bin_border_value;
sycl::atomic_ref<Index,
sycl::memory_order::relaxed,
sycl::memory_scope::device,
sycl::access::address_space::ext_intel_global_device_space>
counter_atomic(unique_bin_count[0]);
counter_atomic.fetch_add(1);
}
});
});

Expand Down Expand Up @@ -165,15 +194,14 @@ std::tuple<pr::ndarray<Float, 1>, Index, sycl::event>
indexed_features<Float, Bin, Index>::gather_bin_borders(const pr::ndarray<Float, 1>& values_nd,
Index row_count,
const bk::event_vector& deps) {
ONEDAL_PROFILER_TASK(indexed_features.gather_bin_borders, queue_);
ONEDAL_PROFILER_TASK(indexed_features.gather_bin_borders_local, queue_);

ONEDAL_ASSERT(values_nd.get_count() == row_count);

sycl::event::wait_and_throw(deps);

const Index max_bins = std::min(max_bins_, row_count);

auto bin_offsets_nd_host = pr::ndarray<Index, 1>::empty({ max_bins });
auto [bin_unique, bin_unique_event] =
pr::ndarray<Index, 1>::full(queue_, 1, 0, sycl::usm::alloc::device);
auto bin_borders_nd_device =
pr::ndarray<Float, 1>::empty(queue_, { max_bins }, sycl::usm::alloc::device);

Expand All @@ -192,21 +220,14 @@ indexed_features<Float, Bin, Index>::gather_bin_borders(const pr::ndarray<Float,
bin_offsets_nd_,
bin_borders_nd_device,
max_bins,
{ deps });
bin_unique,
{ bin_unique_event });

Index bin_count = 0;
auto bin_borders_nd_host = bin_borders_nd_device.to_host(queue_, { last_event });
auto bin_borders_ptr = bin_borders_nd_host.get_mutable_data();

for (Index i = 0; i < max_bins; ++i) {
if (0 == bin_count ||
(bin_count > 0 && bin_borders_ptr[i] != bin_borders_ptr[bin_count - 1])) {
bin_borders_ptr[bin_count] = bin_borders_ptr[i];
bin_count++;
}
}
last_event.wait_and_throw();

bin_borders_nd_device = bin_borders_nd_host.slice(0, bin_count).to_device(queue_);
Index bin_count = bin_unique.to_host(queue_).get_data()[0];

bin_borders_nd_device = bin_borders_nd_device.slice(0, bin_count);

return std::make_tuple(bin_borders_nd_device, bin_count, last_event);
}
Expand All @@ -217,7 +238,7 @@ indexed_features<Float, Bin, Index>::gather_bin_borders_distr(
const pr::ndarray<Float, 1>& values_nd,
Index row_count,
const bk::event_vector& deps) {
ONEDAL_PROFILER_TASK(indexed_features.gather_bin_borders, queue_);
ONEDAL_PROFILER_TASK(indexed_features.gather_bin_borders_distr, queue_);

ONEDAL_ASSERT(values_nd.get_count() == row_count);

Expand All @@ -234,9 +255,10 @@ indexed_features<Float, Bin, Index>::gather_bin_borders_distr(
last_event = event;

Index com_bin_count = 0;
std::int64_t rank_count = comm_.get_rank_count();
// using std::int64_t instead of Index because of it is used as displ in gatherv
auto com_bin_count_arr = pr::ndarray<std::int64_t, 1>::empty({ comm_.get_rank_count() });
auto com_bin_offset_arr = pr::ndarray<std::int64_t, 1>::empty({ comm_.get_rank_count() });
auto com_bin_count_arr = pr::ndarray<std::int64_t, 1>::empty({ rank_count });
auto com_bin_offset_arr = pr::ndarray<std::int64_t, 1>::empty({ rank_count });

std::int64_t lbc_64 = static_cast<std::int64_t>(local_bin_count);
{
Expand All @@ -250,14 +272,14 @@ indexed_features<Float, Bin, Index>::gather_bin_borders_distr(
comm_.allreduce(com_bin_count).wait();
}

pr::ndarray<Float, 1> com_bin_brd;
com_bin_brd = pr::ndarray<Float, 1>::empty(queue_, { com_bin_count }, sycl::usm::alloc::device);
auto com_bin_brd =
pr::ndarray<Float, 1>::empty(queue_, { com_bin_count }, sycl::usm::alloc::device);

const std::int64_t* com_bin_count_ptr = com_bin_count_arr.get_data();
std::int64_t* com_bin_offset_ptr = com_bin_offset_arr.get_mutable_data();

std::int64_t offset = 0;
for (Index i = 0; i < comm_.get_rank_count(); ++i) {
for (Index i = 0; i < rank_count; ++i) {
com_bin_offset_ptr[i] = offset;
offset += com_bin_count_ptr[i];
}
Expand All @@ -273,11 +295,12 @@ indexed_features<Float, Bin, Index>::gather_bin_borders_distr(
}

if (comm_.is_root_rank()) {
ONEDAL_PROFILER_TASK(sort_and_gather_on_main_rank);
last_event = sort_inplace<Float, Index>(queue_, com_bin_brd, { last_event });

// filter out fin bin set
auto [fin_borders_nd_device_temp, fin_bin_count_temp, event] =
gather_bin_borders(com_bin_brd, com_bin_count);
gather_bin_borders(com_bin_brd, com_bin_count, { last_event });
event.wait_and_throw();

bin_borders_nd_device = fin_borders_nd_device_temp;
Expand Down Expand Up @@ -317,10 +340,10 @@ sycl::event indexed_features<Float, Bin, Index>::compute_bins(
sycl::event::wait_and_throw(deps);

sycl::event last_event;

const std::int64_t rank_count = comm_.get_rank_count();
auto [bin_borders_nd_device, bin_count, event] =
comm_.get_rank_count() > 1 ? gather_bin_borders_distr(values_nd, row_count_, deps)
: gather_bin_borders(values_nd, row_count_, deps);
rank_count > 1 ? gather_bin_borders_distr(values_nd, row_count_, deps)
: gather_bin_borders(values_nd, row_count_, deps);
last_event = event;

const Index local_size = bk::device_max_sg_size(queue_);
Expand Down
27 changes: 27 additions & 0 deletions dev/bazel/deps/dpl.bzl
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#===============================================================================
# Copyright 2024 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#===============================================================================

load("@onedal//dev/bazel:repos.bzl", "repos")

dpl_repo = repos.prebuilt_libs_repo_rule(
includes = [
"include",
],
libs = [
"lib",
],
build_template = "@onedal//dev/bazel/deps:dpl.tpl.BUILD",
)
8 changes: 8 additions & 0 deletions dev/bazel/deps/dpl.tpl.BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
package(default_visibility = ["//visibility:public"])

cc_library(
name = "headers",
hdrs = glob(["include/**/**/*"]),
includes = [ "include" ],
)

2 changes: 1 addition & 1 deletion dev/make/compiler_definitions/icc.mkl.32e.mk
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ CORE.SERV.COMPILER.icc = generic
-Qopt = $(if $(OS_is_win),-Qopt-,-qopt-)

COMPILER.lnx.icc = $(if $(COVFILE),cov01 -1; covc --no-banner -i )icc -qopenmp-simd \
-Werror -Wreturn-type -diag-disable=10441
-Werror -Wreturn-type -diag-disable=10441 -Werror-Wdeprecated-declarations
COMPILER.lnx.icc += $(if $(COVFILE), $(-Q)m64)
COMPILER.win.icc = icl $(if $(MSVC_RT_is_release),-MD, -MDd /debug:none) -nologo -WX -Qopenmp-simd -Qdiag-disable:10441
COMPILER.mac.icc = icc -stdlib=libc++ -mmacosx-version-min=10.15 \
Expand Down
2 changes: 1 addition & 1 deletion dev/make/compiler_definitions/icx.mkl.32e.mk
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ CORE.SERV.COMPILER.icx = generic
-DEBC.icx = -g

COMPILER.lnx.icx = icpx -m64 \
-Werror -Wreturn-type
-Werror -Wreturn-type -Werror -Wdeprecated-declarations


link.dynamic.lnx.icx = icpx -m64
Expand Down
Loading