Skip to content

Commit

Permalink
Merge pull request #28 from perpetual-ml/max_cat
Browse files Browse the repository at this point in the history
max_cat added
  • Loading branch information
deadsoul44 authored Nov 19, 2024
2 parents da5a4bd + c12f32b commit 21ac336
Show file tree
Hide file tree
Showing 8 changed files with 138 additions and 35 deletions.
2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "perpetual"
version = "0.7.3"
version = "0.7.4"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ Documentation for the Python API can be found [here](https://perpetual-ml.github

## Installation

The package can be installed directly from [pypi](https://pypi.org/project/perpetual).
The package can be installed directly from [pypi](https://pypi.org/project/perpetual):

```shell
pip install perpetual
Expand All @@ -64,10 +64,10 @@ Using [conda-forge](https://anaconda.org/conda-forge/perpetual):
conda install conda-forge::perpetual
```

To use in a Rust project, add the following to your Cargo.toml file to get the package from [crates.io](https://crates.io/crates/perpetual).
To use it in a Rust project, add the package from [crates.io](https://crates.io/crates/perpetual):

```shell
perpetual = "0.7.3"
cargo add perpetual
```

## Contribution
Expand Down
4 changes: 2 additions & 2 deletions python-package/Cargo.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[package]
name = "py-perpetual"
version = "0.7.3"
version = "0.7.4"
edition = "2021"
authors = ["Mutlu Simsek <[email protected]>"]
homepage = "https://perpetual-ml.com"
Expand All @@ -19,7 +19,7 @@ crate-type = ["cdylib", "rlib"]

[dependencies]
pyo3 = { version = "0.22.6", features = ["extension-module"] }
perpetual_rs = {package="perpetual", version = "0.7.3", path = "../" }
perpetual_rs = {package="perpetual", version = "0.7.4", path = "../" }
numpy = "0.22.1"
ndarray = "0.16.1"
serde_plain = { version = "1.0" }
Expand Down
81 changes: 81 additions & 0 deletions python-package/examples/fetch_openml.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from perpetual import PerpetualBooster\n",
"from sklearn.datasets import fetch_openml"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data, target = fetch_openml(data_id=41147, return_X_y=True, as_frame=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"data.dtypes"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = PerpetualBooster()\n",
"model.fit(data, target, budget=0.1)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model.number_of_trees"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "py311",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
2 changes: 1 addition & 1 deletion python-package/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "maturin"

[project]
name = "perpetual"
version = "0.7.3"
version = "0.7.4"
description = "A self-generalizing gradient boosting machine which doesn't need hyperparameter optimization"
license = { file = "LICENSE" }
keywords = [
Expand Down
51 changes: 29 additions & 22 deletions python-package/python/perpetual/booster.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,10 +57,11 @@ def __init__(
memory_limit: Optional[float] = None,
stopping_rounds: Optional[int] = None,
max_bin: int = 256,
max_cat: int = 1000,
):
"""PerpetualBooster class, used to generate gradient boosted decision tree ensembles.
The following parameters can also be specified in the fit method to override the values in the constructor:
budget, alpha, reset, categorical_features, timeout, iteration_limit, and memory_limit.
budget, alpha, reset, categorical_features, timeout, iteration_limit, memory_limit, and stopping_rounds.
Args:
objective (str, optional): Learning objective function to be used for optimization.
Expand Down Expand Up @@ -104,21 +105,24 @@ def __init__(
- "AverageNodeWeight": Set the missing node to be equal to the weighted average weight of the left and the right nodes.
log_iterations (int, optional): Setting to a value (N) other than zero will result in information being logged about every N iterations. The logged information can be accessed directly with the Python [`logging`](https://docs.python.org/3/howto/logging.html) module. For an example of how to utilize the logging information, see the example [here](/#logging-output).
feature_importance_method (str, optional): The feature importance method type that will be used to calculate the `feature_importances_` attribute on the booster.
budget: a positive number for fitting budget. Increasing this number will more
budget (float, optional): a positive number for fitting budget. Increasing this number will
likely result in more boosting rounds and greater predictive power.
Default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: The names or indices for categorical features.
`auto` for Polars or Pandas categorical data type.
timeout: optional fit timeout in seconds
iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
alpha (float, optional): only used in quantile regression.
reset (bool, optional): whether to reset the model or continue training.
categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
Defaults to `auto` for Polars or Pandas categorical data types.
timeout (float, optional): optional fit timeout in seconds
iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
In most cases, the algorithm stops automatically before hitting this limit.
If you want to experiment with a very high budget (>2.0), you can also increase this limit.
memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
available memory and the algorithm requirements.
stopping_rounds: optional limit for auto stopping.
max_bin: number bins for feature discretization.
stopping_rounds (int, optional): optional limit for auto stopping.
max_bin (int, optional): maximum number of bins for feature discretization. Defaults to 256.
max_cat (int, optional): Maximum number of unique categories for a categorical feature.
Features with more categories will be treated as numerical.
Defaults to 1000.
Raises:
TypeError: Raised if an invalid dtype is passed.
Expand Down Expand Up @@ -181,6 +185,7 @@ def __init__(
self.memory_limit = memory_limit
self.stopping_rounds = stopping_rounds
self.max_bin = max_bin
self.max_cat = max_cat

booster = CratePerpetualBooster(
objective=self.objective,
Expand Down Expand Up @@ -220,24 +225,26 @@ def fit(
sample_weight (Union[ArrayLike, None], optional): Instance weights to use when
training the model. If None is passed, a weight of 1 will be used for every record.
Defaults to None.
budget: a positive number for fitting budget. Increasing this number will more
budget (float, optional): a positive number for fitting budget. Increasing this number will
likely result in more boosting rounds and greater predictive power.
Default value is 1.0.
alpha: only used in quantile regression.
reset: whether to reset the model or continue training.
categorical_features: The names or indices for categorical features.
`auto` for Polars or Pandas categorical data type.
timeout: optional fit timeout in seconds
iteration_limit: optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
Defaults to 1.0.
alpha (float, optional): only used in quantile regression.
reset (bool, optional): whether to reset the model or continue training.
categorical_features (Union[Iterable[int], Iterable[str], str, None], optional): The names or indices for categorical features.
Defaults to `auto` for Polars or Pandas categorical data types.
timeout (float, optional): optional fit timeout in seconds
iteration_limit (int, optional): optional limit for the number of boosting rounds. The default value is 1000 boosting rounds.
In most cases, the algorithm stops automatically before hitting this limit.
If you want to experiment with a very high budget (>2.0), you can also increase this limit.
memory_limit: optional limit for memory allocation in GB. If not set, the memory will be allocated based on
memory_limit (float, optional): optional limit for memory allocation in GB. If not set, the memory will be allocated based on
available memory and the algorithm requirements.
stopping_rounds: optional limit for auto stopping. Defaults to 3.
stopping_rounds (int, optional): optional limit for auto stopping. Defaults to 3.
"""

features_, flat_data, rows, cols, categorical_features_, cat_mapping = (
convert_input_frame(X, categorical_features or self.categorical_features)
convert_input_frame(
X, categorical_features or self.categorical_features, self.max_cat
)
)
self.n_features_ = cols
self.cat_mapping = cat_mapping
Expand Down
23 changes: 19 additions & 4 deletions python-package/python/perpetual/utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import logging
import numpy as np
from typing import Dict, Iterable, List, Optional, Tuple


logger = logging.getLogger(__name__)


def type_df(df):
library_name = type(df).__module__.split(".")[0]
if type(df).__name__ == "DataFrame":
Expand Down Expand Up @@ -61,7 +65,7 @@ def convert_input_array(x, objective) -> np.ndarray:


def convert_input_frame(
X, categorical_features
X, categorical_features, max_cat
) -> Tuple[List[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[Dict]]:
"""Convert data to format needed by booster.
Expand Down Expand Up @@ -110,18 +114,28 @@ def convert_input_frame(
categorical_features_ = [features_.index(c) for c in categorical_features]

cat_mapping = {} # key: feature_name, value: ordered category names
cat_to_num = []
if categorical_features_:
for i in categorical_features_:
categories = np.unique(X_[:, i].astype(dtype="str", copy=False))
if len(categories) > max_cat:
cat_to_num.append(i)
logger.warning(
f"Feature {features_[i]} will be treated as numerical since the number of categories ({len(categories)}) exceeds max_cat ({max_cat}) threshold."
)
continue
categories = [c for c in list(categories) if c != "nan"]
categories.insert(0, "nan")
cat_mapping[features_[i]] = categories
categorical_features_ = [
x for x in categorical_features_ if x not in cat_to_num
]

if cat_mapping:
print(f"Categorical features: {categorical_features_}")
print(f"Mapping of categories: {cat_mapping}")
logger.info(f"Categorical features: {categorical_features_}")
logger.info(f"Mapping of categories: {cat_mapping}")

for feature_name, categories in cat_mapping.items():
feature_index = features_.index(feature_name)

def f(x):
try:
Expand All @@ -133,6 +147,7 @@ def f(x):
except (ValueError, IndexError):
return np.nan

feature_index = features_.index(feature_name)
X_[:, feature_index] = np.apply_along_axis(f, 1, X_)

if not np.issubdtype(X_.dtype, "float64"):
Expand Down
4 changes: 2 additions & 2 deletions scripts/make_resources.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@

data_train, data_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

features_, titanic_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto")
features_, titanic_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto", 1000)
features_, titanic_test_flat, rows, cols = transform_input_frame(data_test, cat_mapping)

data_test.to_csv("resources/titanic_test_df.csv", index=False)
Expand All @@ -97,6 +97,6 @@
df = fetch_openml(data_id=546)
X = df.data
y = df.target
features_, sensory_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, "auto")
features_, sensory_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(X, "auto", 1000)
pd.Series(sensory_flat).to_csv("resources/sensory_flat.csv", index=False, header=False)
pd.Series(y).to_csv("resources/sensory_y.csv", index=False, header=False)

0 comments on commit 21ac336

Please sign in to comment.