
Commit

Use batched memcpy when writing ORC statistics (#17572)
This PR replaces a set of per-column, per-rowgroup device-to-device (D2D) memcopies with a single call to the `batched_memcpy_async` utility. This should improve performance when writing wide tables.
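
For readers outside the cudf codebase, here is a minimal, self-contained sketch of the batching pattern the diff below adopts: gather (source, destination, size) triplets on the host, upload them to the device, and issue one batched copy instead of one `cudaMemcpyAsync` per buffer. It uses the public `cub::DeviceMemcpy::Batched` API rather than cudf's internal `cudf::detail::batched_memcpy_async` helper, so the function name, allocation strategy, and the omitted error handling are illustrative assumptions, not code from this commit.

// Illustrative sketch only (not from the commit): batch many small
// device-to-device copies into a single CUB call.
#include <cub/device/device_memcpy.cuh>
#include <cuda_runtime.h>
#include <cstddef>
#include <cstdint>
#include <vector>

// h_srcs/h_dsts hold device pointers; h_lens holds each buffer's copy size in bytes.
void batched_d2d_copy(std::vector<void*> const& h_srcs,
                      std::vector<void*> const& h_dsts,
                      std::vector<std::size_t> const& h_lens,
                      cudaStream_t stream)
{
  auto const num_buffers = static_cast<std::uint32_t>(h_srcs.size());

  // Upload the (src, dst, len) triplets; the batched copy reads them on the device.
  void** d_srcs{};
  void** d_dsts{};
  std::size_t* d_lens{};
  cudaMalloc(&d_srcs, num_buffers * sizeof(void*));
  cudaMalloc(&d_dsts, num_buffers * sizeof(void*));
  cudaMalloc(&d_lens, num_buffers * sizeof(std::size_t));
  cudaMemcpyAsync(d_srcs, h_srcs.data(), num_buffers * sizeof(void*), cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_dsts, h_dsts.data(), num_buffers * sizeof(void*), cudaMemcpyHostToDevice, stream);
  cudaMemcpyAsync(d_lens, h_lens.data(), num_buffers * sizeof(std::size_t), cudaMemcpyHostToDevice, stream);

  // CUB's two-phase pattern: the first call queries temporary storage, the second does the copies.
  void* d_temp_storage           = nullptr;
  std::size_t temp_storage_bytes = 0;
  cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, d_srcs, d_dsts, d_lens, num_buffers, stream);
  cudaMalloc(&d_temp_storage, temp_storage_bytes);
  // One launch replaces num_buffers individual cudaMemcpyAsync calls.
  cub::DeviceMemcpy::Batched(d_temp_storage, temp_storage_bytes, d_srcs, d_dsts, d_lens, num_buffers, stream);

  cudaStreamSynchronize(stream);
  cudaFree(d_temp_storage);
  cudaFree(d_srcs);
  cudaFree(d_dsts);
  cudaFree(d_lens);
}

In the actual change, cudf builds the host-side triplet vectors with `make_empty_host_vector`, uploads them with `make_device_uvector_async`, and calls `cudf::detail::batched_memcpy_async`, as shown in the diff below.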

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Muhammad Haseeb (https://github.com/mhaseeb123)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #17572
vuule authored Dec 12, 2024
1 parent 32548b0 commit 78e5c0d
Showing 1 changed file with 21 additions and 15 deletions.
cpp/src/io/orc/writer_impl.cu (36 changes: 21 additions & 15 deletions)
@@ -28,6 +28,7 @@

 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/null_mask.hpp>
+#include <cudf/detail/utilities/batched_memcpy.hpp>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/detail/utilities/stream_pool.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
@@ -1386,29 +1387,34 @@ encoded_footer_statistics finish_statistic_blobs(Footer const& footer,
   // we know the size of each array. The number of stripes per column in a chunk array can
   // be calculated by dividing the number of chunks by the number of columns.
   // That many chunks need to be copied at a time to the proper destination.
-  size_t num_entries_seen = 0;
+  size_t num_entries_seen        = 0;
+  auto const num_buffers_to_copy = per_chunk_stats.stripe_stat_chunks.size() * num_columns * 2;
+  auto h_srcs = cudf::detail::make_empty_host_vector<void*>(num_buffers_to_copy, stream);
+  auto h_dsts = cudf::detail::make_empty_host_vector<void*>(num_buffers_to_copy, stream);
+  auto h_lens = cudf::detail::make_empty_host_vector<size_t>(num_buffers_to_copy, stream);
+
   for (size_t i = 0; i < per_chunk_stats.stripe_stat_chunks.size(); ++i) {
     auto const stripes_per_col = per_chunk_stats.stripe_stat_chunks[i].size() / num_columns;

-    auto const chunk_bytes = stripes_per_col * sizeof(statistics_chunk);
-    auto const merge_bytes = stripes_per_col * sizeof(statistics_merge_group);
     for (size_t col = 0; col < num_columns; ++col) {
-      CUDF_CUDA_TRY(
-        cudaMemcpyAsync(stat_chunks.data() + (num_stripes * col) + num_entries_seen,
-                        per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col,
-                        chunk_bytes,
-                        cudaMemcpyDefault,
-                        stream.value()));
-      CUDF_CUDA_TRY(
-        cudaMemcpyAsync(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen,
-                        per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col,
-                        merge_bytes,
-                        cudaMemcpyDefault,
-                        stream.value()));
+      h_srcs.push_back(per_chunk_stats.stripe_stat_chunks[i].data() + col * stripes_per_col);
+      h_dsts.push_back(stat_chunks.data() + (num_stripes * col) + num_entries_seen);
+      h_lens.push_back(stripes_per_col * sizeof(statistics_chunk));
+
+      h_srcs.push_back(per_chunk_stats.stripe_stat_merge[i].device_ptr() + col * stripes_per_col);
+      h_dsts.push_back(stats_merge.device_ptr() + (num_stripes * col) + num_entries_seen);
+      h_lens.push_back(stripes_per_col * sizeof(statistics_merge_group));
     }
     num_entries_seen += stripes_per_col;
   }

+  auto const& mr = cudf::get_current_device_resource_ref();
+  auto const d_srcs = cudf::detail::make_device_uvector_async(h_srcs, stream, mr);
+  auto const d_dsts = cudf::detail::make_device_uvector_async(h_dsts, stream, mr);
+  auto const d_lens = cudf::detail::make_device_uvector_async(h_lens, stream, mr);
+  cudf::detail::batched_memcpy_async(
+    d_srcs.begin(), d_dsts.begin(), d_lens.begin(), d_srcs.size(), stream);
+
   auto file_stats_merge =
     cudf::detail::make_host_vector<statistics_merge_group>(num_file_blobs, stream);
   for (auto i = 0u; i < num_file_blobs; ++i) {
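
To make the destination arithmetic above concrete: within chunk array `i`, column `col`'s statistics occupy a contiguous block of `stripes_per_col` entries starting at `col * stripes_per_col`, and they are copied to offset `num_stripes * col + num_entries_seen` of the output, so each column ends up with all `num_stripes` of its entries contiguous. The small host-only walk-through below prints the offsets that would be batched; its sizes (2 chunk arrays, 3 columns, 2 stripes per column in each array) are hypothetical, not taken from the commit.

#include <cstddef>
#include <cstdio>

int main()
{
  // Hypothetical sizes: 2 per-chunk statistics arrays, 3 columns,
  // 2 stripes per column in each array, so 4 stripes per column overall.
  std::size_t const num_columns         = 3;
  std::size_t const stripes_per_chunk[] = {2, 2};
  std::size_t const num_stripes         = 4;

  std::size_t num_entries_seen = 0;
  for (std::size_t i = 0; i < 2; ++i) {
    auto const stripes_per_col = stripes_per_chunk[i];
    for (std::size_t col = 0; col < num_columns; ++col) {
      auto const src_off = col * stripes_per_col;                 // block within chunk array i
      auto const dst_off = num_stripes * col + num_entries_seen;  // block within the merged array
      std::printf("chunk %zu, column %zu: %zu entries, src offset %zu -> dst offset %zu\n",
                  i, col, stripes_per_col, src_off, dst_off);
    }
    num_entries_seen += stripes_per_col;
  }
  // Column 0 fills dst[0..3], column 1 fills dst[4..7], column 2 fills dst[8..11]:
  // each column's per-stripe statistics become contiguous in the output.
  return 0;
}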
