From d84582b746500237c52701975e006ba8a813d229 Mon Sep 17 00:00:00 2001 From: Oliver Borchert Date: Wed, 6 Dec 2023 16:18:28 +0000 Subject: [PATCH 1/2] Fix null handling for Arrow data (#6227) --- include/LightGBM/arrow.tpp | 2 +- tests/cpp_tests/test_arrow.cpp | 6 ++++-- tests/python_package_test/test_arrow.py | 11 +++++++++++ 3 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/LightGBM/arrow.tpp b/include/LightGBM/arrow.tpp index 67b481c9497e..8d1ce4f4c0c1 100644 --- a/include/LightGBM/arrow.tpp +++ b/include/LightGBM/arrow.tpp @@ -144,7 +144,7 @@ struct ArrayIndexAccessor { // - The structure of validity bitmasks is taken from here: // https://arrow.apache.org/docs/format/Columnar.html#validity-bitmaps // - If the bitmask is NULL, all indices are valid - if (validity == nullptr || !(validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) { + if (validity == nullptr || (validity[buffer_idx / 8] & (1 << (buffer_idx % 8)))) { // In case the index is valid, we take it from the data buffer auto data = static_cast(array->buffers[1]); return static_cast(data[buffer_idx]); diff --git a/tests/cpp_tests/test_arrow.cpp b/tests/cpp_tests/test_arrow.cpp index 7e3c57c401f4..e975b6ba374b 100644 --- a/tests/cpp_tests/test_arrow.cpp +++ b/tests/cpp_tests/test_arrow.cpp @@ -41,10 +41,12 @@ class ArrowChunkedArrayTest : public testing::Test { // 1) Create validity bitmap char* validity = nullptr; if (!null_indices.empty()) { - validity = static_cast(calloc(values.size() + sizeof(char) - 1, sizeof(char))); + auto num_bytes = (values.size() + 7) / 8; + validity = static_cast(calloc(num_bytes, sizeof(char))); + memset(validity, 0xff, num_bytes * sizeof(char)); for (size_t i = 0; i < values.size(); ++i) { if (std::find(null_indices.begin(), null_indices.end(), i) != null_indices.end()) { - validity[i / 8] |= (1 << (i % 8)); + validity[i / 8] &= ~(1 << (i % 8)); } } } diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index fd20df25dd87..5e09465e34b3 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -46,6 +46,16 @@ def generate_simple_arrow_table() -> pa.Table: return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))]) +def generate_nullable_arrow_table() -> pa.Table: + columns = [ + pa.chunked_array([[1, None, 3, 4, 5]], type=pa.float32()), + pa.chunked_array([[None, 2, 3, 4, 5]], type=pa.float32()), + pa.chunked_array([[1, 2, 3, 4, None]], type=pa.float32()), + pa.chunked_array([[None, None, None, None, None]], type=pa.float32()), + ] + return pa.Table.from_arrays(columns, names=[f"col_{i}" for i in range(len(columns))]) + + def generate_dummy_arrow_table() -> pa.Table: col1 = pa.chunked_array([[1, 2, 3], [4, 5]], type=pa.uint8()) col2 = pa.chunked_array([[0.5, 0.6], [0.1, 0.8, 1.5]], type=pa.float32()) @@ -95,6 +105,7 @@ def dummy_dataset_params() -> Dict[str, Any]: [ # Use lambda functions here to minimize memory consumption (lambda: generate_simple_arrow_table(), dummy_dataset_params()), (lambda: generate_dummy_arrow_table(), dummy_dataset_params()), + (lambda: generate_nullable_arrow_table(), dummy_dataset_params()), (lambda: generate_random_arrow_table(3, 1000, 42), {}), (lambda: generate_random_arrow_table(100, 10000, 43), {}), ], From 4aba4fc1326210a1501f144bd54d77a64d127362 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Wed, 6 Dec 2023 12:56:27 -0600 Subject: [PATCH 2/2] [R-package] change CRAN maintainer (#6224) --- R-package/DESCRIPTION | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/R-package/DESCRIPTION b/R-package/DESCRIPTION index 1193c0d463b9..62b479530b4a 100755 --- a/R-package/DESCRIPTION +++ b/R-package/DESCRIPTION @@ -4,10 +4,10 @@ Title: Light Gradient Boosting Machine Version: ~~VERSION~~ Date: ~~DATE~~ Authors@R: c( - person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut", "cre")), + person("Yu", "Shi", email = "yushi2@microsoft.com", role = c("aut")), person("Guolin", "Ke", email = "guolin.ke@outlook.com", role = c("aut")), person("Damien", "Soukhavong", email = "damien.soukhavong@skema.edu", role = c("aut")), - person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut")), + person("James", "Lamb", email="jaylamb20@gmail.com", role = c("aut", "cre")), person("Qi", "Meng", role = c("aut")), person("Thomas", "Finley", role = c("aut")), person("Taifeng", "Wang", role = c("aut")),