diff --git a/apis/python/tests/ht/_ht_test_config.py b/apis/python/tests/ht/_ht_test_config.py index b489b06ae9..4364a34ee7 100644 --- a/apis/python/tests/ht/_ht_test_config.py +++ b/apis/python/tests/ht/_ht_test_config.py @@ -7,7 +7,7 @@ # # data corruption due to incorrect Arrow array offset handling # See also sc-62104 - "sc-61239_workaround": True, + "sc-61239_workaround": False, # creating array with timestamp==0 fails in 1.15 (regression) "sc-61054_workaround": True, # Tables returned by SparseNDArray.read have incorrect nullability in schema fields diff --git a/apis/python/tests/ht/_ht_util.py b/apis/python/tests/ht/_ht_util.py index c0efa83e65..828a65336e 100644 --- a/apis/python/tests/ht/_ht_util.py +++ b/apis/python/tests/ht/_ht_util.py @@ -342,25 +342,51 @@ def arrow_slice(draw: st.DrawFn, size: int) -> ArrowSlice: return (offset, length) -def pad_array(arr: npt.NDArray[Any], draw: st.DrawFn) -> pa.Array: - """Strategy helper: add padding to one or both ends of the array. This tests for Arrow array "offset" handling.""" +def pad_array(arr: pa.Array | npt.NDArray[Any], draw: st.DrawFn) -> pa.Array: + """Strategy helper: add padding to one or both ends of the array. This tests for Arrow array + offset & length handling.""" if HT_TEST_CONFIG.get("sc-61239_workaround", False): return pa.array(arr) + if not isinstance(arr, pa.Array): + arr = pa.array(arr) + head = draw(st.integers(min_value=0, max_value=16)) tail = draw(st.integers(min_value=0, max_value=16)) if not bool(head or tail): - return pa.array(arr) + return arr + + if pa.types.is_dictionary(arr.type): + padding = draw(st.integers(min_value=0, max_value=len(arr.dictionary) - 1)) + head_arr = pa.DictionaryArray.from_arrays( + indices=pa.array([padding] * head, type=arr.type.index_type), + dictionary=arr.dictionary, + ordered=arr.type.ordered, + ) + tail_arr = pa.DictionaryArray.from_arrays( + indices=pa.array([padding] * tail, type=arr.type.index_type), + dictionary=arr.dictionary, + ordered=arr.type.ordered, + ) + + else: + if pa.types.is_large_string(arr.type) or pa.types.is_string(arr.type): + pad_type = str + elif pa.types.is_large_binary(arr.type) or pa.types.is_binary(arr.type): + pad_type = bytes + elif pa.types.is_timestamp(arr.type): + pad_type = np.int64 + else: + pad_type = np.dtype(arr.type.to_pandas_dtype()).type - padding = draw(st.from_type(arr.dtype.type)) + padding = draw(st.from_type(pad_type)) + head_arr = pa.array([padding] * head).cast(arr.type) + tail_arr = pa.array([padding] * tail).cast(arr.type) - shape = (arr.shape[0] + head + tail, *arr.shape[1:]) - padded_arr = np.empty_like(arr, shape=shape) - padded_arr[0:head] = padding - padded_arr[head : head + len(arr)] = arr - padded_arr[head + len(arr) :] = padding - return pa.array(padded_arr)[head : head + len(arr)] + assert arr.type == head_arr.type == tail_arr.type + padded_arr = pa.chunked_array([head_arr, arr, tail_arr]).combine_chunks() + return padded_arr.slice(head, len(arr)) @st.composite diff --git a/apis/python/tests/ht/test_ht_dataframe.py b/apis/python/tests/ht/test_ht_dataframe.py index ffee6dbad3..288c3fa1a3 100644 --- a/apis/python/tests/ht/test_ht_dataframe.py +++ b/apis/python/tests/ht/test_ht_dataframe.py @@ -603,10 +603,7 @@ def get_max_size() -> int: if draw(st.booleans()) and not HT_TEST_CONFIG["sc-61239_workaround"]: batches = tbl.to_batches() batch_to_pad = draw(st.integers(min_value=0, max_value=len(batches) - 1)) - batch_arrays = [ - pad_array(arr.to_numpy(zero_copy_only=(arr.type != pa.bool_())), draw) - for arr in batches[batch_to_pad].columns - ] + batch_arrays = [pad_array(arr, draw) for arr in batches[batch_to_pad].columns] batches[batch_to_pad] = pa.RecordBatch.from_arrays( batch_arrays, schema=tbl.schema )