From 6bf8675b096b35023c13c488d26195552f101d7e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mutlu=20=C5=9Fim=C5=9Fek?= Date: Tue, 19 Nov 2024 18:48:25 +0300 Subject: [PATCH 1/2] max_cat fix --- Cargo.toml | 4 ++-- python-package/Cargo.toml | 4 ++-- python-package/pyproject.toml | 2 +- python-package/python/perpetual/utils.py | 1 - python-package/tests/test_booster.py | 14 ++++++++++++++ scripts/run-single-python-test.ps1 | 2 +- 6 files changed, 20 insertions(+), 7 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 34e025f..9f3c160 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "perpetual" -version = "0.7.4" +version = "0.7.5" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -22,7 +22,7 @@ codegen-units = 1 [dependencies] rayon = "1.8" thiserror = "2.0.3" -serde_json = { version = "1.0.132", features = ["float_roundtrip"] } +serde_json = { version = "1.0.133", features = ["float_roundtrip"] } serde = { version = "1.0.215", features = ["derive"] } approx = "0.5" log = "0.4" diff --git a/python-package/Cargo.toml b/python-package/Cargo.toml index 9986d1a..ac40204 100644 --- a/python-package/Cargo.toml +++ b/python-package/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "py-perpetual" -version = "0.7.4" +version = "0.7.5" edition = "2021" authors = ["Mutlu Simsek "] homepage = "https://perpetual-ml.com" @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"] [dependencies] pyo3 = { version = "0.22.6", features = ["extension-module"] } -perpetual_rs = {package="perpetual", version = "0.7.4", path = "../" } +perpetual_rs = {package="perpetual", version = "0.7.5", path = "../" } numpy = "0.22.1" ndarray = "0.16.1" serde_plain = { version = "1.0" } diff --git a/python-package/pyproject.toml b/python-package/pyproject.toml index 49c9be4..ce38f51 100644 --- a/python-package/pyproject.toml +++ b/python-package/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "maturin" [project] name = "perpetual" -version = "0.7.4" +version = "0.7.5" description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization" license = { file = "LICENSE" } keywords = [ diff --git a/python-package/python/perpetual/utils.py b/python-package/python/perpetual/utils.py index e0c9f68..92b454e 100644 --- a/python-package/python/perpetual/utils.py +++ b/python-package/python/perpetual/utils.py @@ -123,7 +123,6 @@ def convert_input_frame( logger.warning( f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold." ) - continue categories = [c for c in list(categories) if c != "nan"] categories.insert(0, "nan") cat_mapping[features_[i]] = categories diff --git a/python-package/tests/test_booster.py b/python-package/tests/test_booster.py index e39f3a7..e4c63d9 100644 --- a/python-package/tests/test_booster.py +++ b/python-package/tests/test_booster.py @@ -27,6 +27,20 @@ def X_y() -> Tuple[pd.DataFrame, pd.Series]: return X, y +def test_booster_max_cat(X_y): + df = pd.read_csv("../resources/titanic.csv") + X = df.drop(columns="survived").reset_index(drop=True) + y = df["survived"] + + num_cols = X.select_dtypes(include=np.number).columns.tolist() + all_cols = X.columns.tolist() + cat_cols = [x for x in all_cols if x not in num_cols] + X[cat_cols] = X[cat_cols].astype('category') + + model = PerpetualBooster(objective="LogLoss", max_cat=4) + model.fit(X, y) + + def test_booster_no_variance(X_y): X, y = X_y X.iloc[:, 3] = 1 diff --git a/scripts/run-single-python-test.ps1 b/scripts/run-single-python-test.ps1 index 5cfd0c9..6817433 100644 --- a/scripts/run-single-python-test.ps1 +++ b/scripts/run-single-python-test.ps1 @@ -1,4 +1,4 @@ Set-Location python-package maturin develop --release -pytest tests/test_openml.py::test_sensory -s +pytest tests/test_booster.py::test_booster_max_cat -s Set-Location .. \ No newline at end of file From a9384419c481d31190de25464e0c5947c8530c97 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?mutlu=20=C5=9Fim=C5=9Fek?= Date: Tue, 19 Nov 2024 18:56:25 +0300 Subject: [PATCH 2/2] formatting update --- python-package/tests/test_booster.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python-package/tests/test_booster.py b/python-package/tests/test_booster.py index e4c63d9..f8430bd 100644 --- a/python-package/tests/test_booster.py +++ b/python-package/tests/test_booster.py @@ -35,7 +35,7 @@ def test_booster_max_cat(X_y): num_cols = X.select_dtypes(include=np.number).columns.tolist() all_cols = X.columns.tolist() cat_cols = [x for x in all_cols if x not in num_cols] - X[cat_cols] = X[cat_cols].astype('category') + X[cat_cols] = X[cat_cols].astype("category") model = PerpetualBooster(objective="LogLoss", max_cat=4) model.fit(X, y)