diff --git a/ci/run_cudf_polars_pytests.sh b/ci/run_cudf_polars_pytests.sh
index e881055e9e3..5a1d5f56bf0 100755
--- a/ci/run_cudf_polars_pytests.sh
+++ b/ci/run_cudf_polars_pytests.sh
@@ -17,5 +17,5 @@ python -m pytest --cache-clear "$@" tests --executor dask-experimental
 # Test the "dask-experimental" executor with Distributed cluster
 # Not all tests pass yet, deselecting by name those that are failing.
 python -m pytest --cache-clear "$@" tests --executor dask-experimental --dask-cluster \
-    -k "not test_groupby_maintain_order_random and not test_scan_csv_multi and not test_select_literal_series" \
-    --cov-fail-under=89  # Override coverage, Distributed cluster coverage not yet 100%
+    -k "not test_groupby_maintain_order_random and not test_scan_csv_multi and not test_select_literal_series and not test_can_convert_lists and not test_executor_basics and not test_replace_literal and not test_hconcat_different_heights and not test_join and not test_dataframescan and not test_strip_chars" \
+    --cov-fail-under=80  # Override coverage, Distributed cluster coverage not yet 100%
diff --git a/cpp/include/cudf/strings/string_view.cuh b/cpp/include/cudf/strings/string_view.cuh
index b91748cfc7d..15539c50da9 100644
--- a/cpp/include/cudf/strings/string_view.cuh
+++ b/cpp/include/cudf/strings/string_view.cuh
@@ -443,10 +443,12 @@ __device__ inline size_type string_view::rfind(char_utf8 chr, size_type pos, siz
 __device__ inline string_view string_view::substr(size_type pos, size_type count) const
 {
   if (pos < 0 || pos >= length()) { return string_view{}; }
-  auto const itr  = begin() + pos;
-  auto const spos = itr.byte_offset();
-  auto const epos = count >= 0 ? (itr + count).byte_offset() : size_bytes();
-  return {data() + spos, epos - spos};
+  auto const spos = begin() + pos;
+  auto const epos = count >= 0 ? (spos + count) : const_iterator{*this, _length, size_bytes()};
+  auto ss = string_view{data() + spos.byte_offset(), epos.byte_offset() - spos.byte_offset()};
+  // this potentially saves redundant character counting downstream
+  if (_length != UNKNOWN_STRING_LENGTH) { ss._length = epos.position() - spos.position(); }
+  return ss;
 }
 
 __device__ inline size_type string_view::character_offset(size_type bytepos) const
diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu
index 3a20ffbce19..217aff48d5e 100644
--- a/cpp/src/io/orc/writer_impl.cu
+++ b/cpp/src/io/orc/writer_impl.cu
@@ -2226,6 +2226,22 @@ stripe_dictionaries build_dictionaries(orc_table_view& orc_table,
           std::move(dict_order_owner)};
 }
 
+[[nodiscard]] uint32_t find_largest_stream_size(device_2dspan<stripe_stream const> ss,
+                                                rmm::cuda_stream_view stream)
+{
+  auto const longest_stream = thrust::max_element(
+    rmm::exec_policy(stream),
+    ss.data(),
+    ss.data() + ss.count(),
+    cuda::proclaim_return_type<bool>([] __device__(auto const& lhs, auto const& rhs) {
+      return lhs.stream_size < rhs.stream_size;
+    }));
+
+  auto const h_longest_stream = cudf::detail::make_host_vector_sync(
+    device_span<stripe_stream const>{longest_stream, 1}, stream);
+  return h_longest_stream[0].stream_size;
+}
+
 /**
  * @brief Perform the processing steps needed to convert the input table into the output ORC data
  * for writing, such as compression and ORC encoding.
@@ -2319,7 +2335,9 @@ auto convert_table_to_orc_data(table_view const& input,
   size_t compressed_bfr_size   = 0;
   size_t num_compressed_blocks = 0;
 
-  auto const max_compressed_block_size = max_compressed_size(compression, compression_blocksize);
+  auto const largest_stream_size = find_largest_stream_size(strm_descs, stream);
+  auto const max_compressed_block_size =
+    max_compressed_size(compression, std::min<size_t>(largest_stream_size, compression_blocksize));
   auto const padded_max_compressed_block_size =
     util::round_up_unsafe<size_t>(max_compressed_block_size, block_align);
   auto const padded_block_header_size =
diff --git a/cpp/src/io/utilities/config_utils.cpp b/cpp/src/io/utilities/config_utils.cpp
index 46816604918..fa6f04eed73 100644
--- a/cpp/src/io/utilities/config_utils.cpp
+++ b/cpp/src/io/utilities/config_utils.cpp
@@ -36,10 +36,10 @@ void set_up_kvikio()
     cudaFree(nullptr);
 
     auto const compat_mode = kvikio::getenv_or("KVIKIO_COMPAT_MODE", kvikio::CompatMode::ON);
-    kvikio::defaults::compat_mode_reset(compat_mode);
+    kvikio::defaults::set_compat_mode(compat_mode);
 
     auto const nthreads = getenv_or<unsigned int>("KVIKIO_NTHREADS", 4u);
-    kvikio::defaults::thread_pool_nthreads_reset(nthreads);
+    kvikio::defaults::set_thread_pool_nthreads(nthreads);
   });
 }
 
diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp
index 94d27d976c3..e41d772a479 100644
--- a/cpp/src/utilities/host_memory.cpp
+++ b/cpp/src/utilities/host_memory.cpp
@@ -29,6 +29,7 @@
 namespace cudf {
 
 namespace {
+
 class fixed_pinned_pool_memory_resource {
   using upstream_mr    = rmm::mr::pinned_host_memory_resource;
   using host_pooled_mr = rmm::mr::pool_memory_resource<upstream_mr>;
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index cfc6a0dc425..e3ca8b70b87 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -309,7 +309,7 @@ ConfigureTest(
 ConfigureTest(
   ORC_TEST io/orc_chunked_reader_test.cu io/orc_test.cpp
   GPUS 1
-  PERCENT 30
+  PERCENT 100
 )
 ConfigureTest(
   PARQUET_TEST
@@ -340,7 +340,7 @@ ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu)
 ConfigureTest(
   DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp
   GPUS 1
-  PERCENT 30
+  PERCENT 100
 )
 target_link_libraries(DATA_CHUNK_SOURCE_TEST PRIVATE ZLIB::ZLIB)
 ConfigureTest(LOGICAL_STACK_TEST io/fst/logical_stack_test.cu)
diff --git a/docs/cudf/source/user_guide/cupy-interop.ipynb b/docs/cudf/source/user_guide/cupy-interop.ipynb
index 112f0bcfca6..93e62d90c0f 100644
--- a/docs/cudf/source/user_guide/cupy-interop.ipynb
+++ b/docs/cudf/source/user_guide/cupy-interop.ipynb
@@ -566,7 +566,7 @@
     "%%timeit\n",
     "\n",
     "fortran_arr = cp.asfortranarray(reshaped_arr)\n",
-    "reshaped_df = cudf.from_dlpack(fortran_arr.toDlpack())"
+    "reshaped_df = cudf.from_dlpack(fortran_arr.__dlpack__())"
    ]
   },
   {
@@ -1418,7 +1418,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.12"
+   "version": "3.12.9"
   }
  },
  "nbformat": 4,
diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py
index c59a16f99f5..36f9eea0619 100644
--- a/python/cudf/cudf/core/subword_tokenizer.py
+++ b/python/cudf/cudf/core/subword_tokenizer.py
@@ -20,7 +20,7 @@ def _cast_to_appropriate_type(ar, cast_type):
     elif cast_type == "tf":
         from tensorflow.experimental.dlpack import from_dlpack
 
-    return from_dlpack(ar.astype(np.dtype(np.int32)).toDlpack())
+    return from_dlpack(ar.astype(np.dtype(np.int32)).__dlpack__())
 
 
 class SubwordTokenizer:
diff --git a/python/cudf/cudf/tests/test_dlpack.py b/python/cudf/cudf/tests/test_dlpack.py
index 20c24bd7564..187a5524e8e 100644
--- a/python/cudf/cudf/tests/test_dlpack.py
+++ b/python/cudf/cudf/tests/test_dlpack.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.
 
 import itertools
 from contextlib import ExitStack as does_not_raise
@@ -140,7 +140,7 @@ def test_to_dlpack_cupy_2d(data_2d):
 def test_from_dlpack_cupy_1d(data_1d):
     cupy_array = cupy.array(data_1d)
     cupy_host_array = cupy_array.get()
-    dlt = cupy_array.toDlpack()
+    dlt = cupy_array.__dlpack__()
 
     gs = cudf.from_dlpack(dlt)
     cudf_host_array = gs.to_numpy(na_value=np.nan)
@@ -151,7 +151,7 @@ def test_from_dlpack_cupy_1d(data_1d):
 def test_from_dlpack_cupy_2d(data_2d):
     cupy_array = cupy.array(data_2d, order="F")
     cupy_host_array = cupy_array.get().flatten()
-    dlt = cupy_array.toDlpack()
+    dlt = cupy_array.__dlpack__()
 
     gdf = cudf.from_dlpack(dlt)
     cudf_host_array = np.array(gdf.to_pandas()).flatten()