From 54cda90b99ddcf89259e3244f86f4b4efbecbec7 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 9 Jan 2025 18:23:49 +0100 Subject: [PATCH 1/7] Infer feature names from pyarrow Table. --- python-package/lightgbm/basic.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 7b152fd2b006..b07ad92c792e 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2126,6 +2126,8 @@ def _lazy_init( categorical_feature=categorical_feature, pandas_categorical=self.pandas_categorical, ) + elif isinstance(data, pa_Table): + feature_name = data.column_names # process for args params = {} if params is None else params From 7ba4fd6d90b6a23f592d2e1ca762720d94e8a2ff Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 9 Jan 2025 21:17:04 +0100 Subject: [PATCH 2/7] Assigning once is enough. --- python-package/lightgbm/basic.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index b07ad92c792e..caa11b39f6d8 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2187,7 +2187,6 @@ def _lazy_init( self.__init_from_np2d(data, params_str, ref_dataset) elif _is_pyarrow_table(data): self.__init_from_pyarrow_table(data, params_str, ref_dataset) - feature_name = data.column_names elif isinstance(data, list) and len(data) > 0: if _is_list_of_numpy_arrays(data): self.__init_from_list_np2d(data, params_str, ref_dataset) From 2bfe09ec97640c9088524496d65dfa8f92c7c2d2 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 9 Jan 2025 21:17:11 +0100 Subject: [PATCH 3/7] Add test. 
--- tests/python_package_test/test_arrow.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index 3a7e0f8d4fce..7ddb49927401 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -432,3 +432,15 @@ def test_predict_ranking(): num_boost_round=5, ) assert_equal_predict_arrow_pandas(booster, data) + + +def test_arrow_categorical(): + data = generate_random_arrow_table(10, 10000, 42) + dataset = lgb.Dataset( + data, + label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(4)), + params=dummy_dataset_params(), + categorical_feature=["col_0"] + ) + booster = lgb.train({"num_leaves": 7}, dataset, num_boost_round=5) + assert_equal_predict_arrow_pandas(booster, data) From 9ea6d2e336b260cc1b63a2adc50c70c5d17c64bc Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Thu, 9 Jan 2025 21:18:47 +0100 Subject: [PATCH 4/7] Use _is_pyarrow_table --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index caa11b39f6d8..36b6baf55ddf 100644 --- a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2126,7 +2126,7 @@ def _lazy_init( categorical_feature=categorical_feature, pandas_categorical=self.pandas_categorical, ) - elif isinstance(data, pa_Table): + elif _is_pyarrow_table(data): feature_name = data.column_names # process for args From c989d9d91ef5a9ef9217860d479e72e0025f447d Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 10 Jan 2025 10:05:46 +0100 Subject: [PATCH 5/7] Only reassign feature_name if feature_name == 'auto' --- python-package/lightgbm/basic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/lightgbm/basic.py b/python-package/lightgbm/basic.py index 36b6baf55ddf..c3abb62469dd 100644 --- 
a/python-package/lightgbm/basic.py +++ b/python-package/lightgbm/basic.py @@ -2126,7 +2126,7 @@ def _lazy_init( categorical_feature=categorical_feature, pandas_categorical=self.pandas_categorical, ) - elif _is_pyarrow_table(data): + elif _is_pyarrow_table(data) and feature_name == "auto": feature_name = data.column_names # process for args From b316f07056063acdaf8f313bc6cec6cd31ce00eb Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 10 Jan 2025 10:13:10 +0100 Subject: [PATCH 6/7] Update tests. --- tests/python_package_test/test_arrow.py | 22 +++++++++++++++++----- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index 7ddb49927401..2059f3ba6442 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -434,13 +434,25 @@ def test_predict_ranking(): assert_equal_predict_arrow_pandas(booster, data) -def test_arrow_categorical(): - data = generate_random_arrow_table(10, 10000, 42) +def test_arrow_feature_name_auto(): + data = generate_dummy_arrow_table() dataset = lgb.Dataset( data, - label=generate_random_arrow_array(10000, 43, generate_nulls=False, values=np.arange(4)), + label=pa.array([0, 1, 0, 0, 1]), params=dummy_dataset_params(), - categorical_feature=["col_0"] + categorical_feature=["a"] ) booster = lgb.train({"num_leaves": 7}, dataset, num_boost_round=5) - assert_equal_predict_arrow_pandas(booster, data) + assert booster.feature_name() == ["a", "b"] + +def test_arrow_feature_name_manual(): + data = generate_dummy_arrow_table() + dataset = lgb.Dataset( + data, + label=pa.array([0, 1, 0, 0, 1]), + params=dummy_dataset_params(), + feature_name=["c", "d"], + categorical_feature=["c"] + ) + booster = lgb.train({"num_leaves": 7}, dataset, num_boost_round=5) + assert booster.feature_name() == ["c", "d"] From 86ead3f836a89e62fc2f5be5443c5fcd46f8d739 Mon Sep 17 00:00:00 2001 From: Malte Londschien Date: Fri, 10 Jan 2025 
10:17:00 +0100 Subject: [PATCH 7/7] Format. --- tests/python_package_test/test_arrow.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/python_package_test/test_arrow.py b/tests/python_package_test/test_arrow.py index 2059f3ba6442..d8246f3842de 100644 --- a/tests/python_package_test/test_arrow.py +++ b/tests/python_package_test/test_arrow.py @@ -437,14 +437,12 @@ def test_predict_ranking(): def test_arrow_feature_name_auto(): data = generate_dummy_arrow_table() dataset = lgb.Dataset( - data, - label=pa.array([0, 1, 0, 0, 1]), - params=dummy_dataset_params(), - categorical_feature=["a"] + data, label=pa.array([0, 1, 0, 0, 1]), params=dummy_dataset_params(), categorical_feature=["a"] ) booster = lgb.train({"num_leaves": 7}, dataset, num_boost_round=5) assert booster.feature_name() == ["a", "b"] + def test_arrow_feature_name_manual(): data = generate_dummy_arrow_table() dataset = lgb.Dataset( @@ -452,7 +450,7 @@ def test_arrow_feature_name_manual(): label=pa.array([0, 1, 0, 0, 1]), params=dummy_dataset_params(), feature_name=["c", "d"], - categorical_feature=["c"] + categorical_feature=["c"], ) booster = lgb.train({"num_leaves": 7}, dataset, num_boost_round=5) assert booster.feature_name() == ["c", "d"]