From 1c0ea5e7f7968fbeb6852a533df30795ad754b2b Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 3 Mar 2025 11:18:37 -0800 Subject: [PATCH] Reduce memory use when writing tables with very short columns to ORC (#18136) Closes #18059 To avoid estimating the maximum compressed size for each actual block in the file, the ORC writer uses the estimate for the (uncompressed) block size limit, which defaults to 256KB. However, when we write many small blocks, this compressed block size estimate is much larger than what is needed, leading to high memory use for wide/short tables. This PR adds logic to take the actual block size into account, and to use the size of the actual largest block in the file, not the largest possible block. This reduces the memory usage by orders of magnitude in some tests. --------- Co-authored-by: Bradley Dice --- cpp/src/io/orc/writer_impl.cu | 20 +++++++++++++++++++- cpp/src/utilities/host_memory.cpp | 1 + cpp/tests/CMakeLists.txt | 4 ++-- 3 files changed, 22 insertions(+), 3 deletions(-) diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index 3a20ffbce19..217aff48d5e 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -2226,6 +2226,22 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table, std::move(dict_order_owner)}; } +[[nodiscard]] uint32_t find_largest_stream_size(device_2dspan<gpu::StripeStream const> ss, + rmm::cuda_stream_view stream) +{ + auto const longest_stream = thrust::max_element( + rmm::exec_policy(stream), + ss.data(), + ss.data() + ss.count(), + cuda::proclaim_return_type<bool>([] __device__(auto const& lhs, auto const& rhs) { + return lhs.stream_size < rhs.stream_size; + })); + + auto const h_longest_stream = cudf::detail::make_host_vector_sync( + device_span<gpu::StripeStream const>{longest_stream, 1}, stream); + return h_longest_stream[0].stream_size; +} + /** * @brief Perform the processing steps needed to convert the input table into the output ORC data * for writing, such as compression and ORC 
encoding. @@ -2319,7 +2335,9 @@ auto convert_table_to_orc_data(table_view const& input, size_t compressed_bfr_size = 0; size_t num_compressed_blocks = 0; - auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize); + auto const largest_stream_size = find_largest_stream_size(strm_descs, stream); + auto const max_compressed_block_size = + max_compressed_size(compression, std::min<size_t>(largest_stream_size, compression_blocksize)); auto const padded_max_compressed_block_size = util::round_up_unsafe(max_compressed_block_size, block_align); auto const padded_block_header_size = diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 94d27d976c3..e41d772a479 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -29,6 +29,7 @@ namespace cudf { namespace { + class fixed_pinned_pool_memory_resource { using upstream_mr = rmm::mr::pinned_host_memory_resource; using host_pooled_mr = rmm::mr::pool_memory_resource<upstream_mr>; diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index cfc6a0dc425..e3ca8b70b87 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -309,7 +309,7 @@ ConfigureTest( ConfigureTest( ORC_TEST io/orc_chunked_reader_test.cu io/orc_test.cpp GPUS 1 - PERCENT 30 + PERCENT 100 ) ConfigureTest( PARQUET_TEST @@ -340,7 +340,7 @@ ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 - PERCENT 30 + PERCENT 100 ) target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB) ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu)