diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index dee7036..d2a13f1 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -32,7 +32,7 @@ Prior to running the tests, you should install `python-package` in editable mode cd python-package # Install the project in editable mode and all development dependencies python -m pip install -e .[dev] -# You can now return to the rood directory and run the tests... +# You can now return to the root directory and run the tests... cd .. # Prior to running the tests, build all required test artifacts diff --git a/python-package/python/perpetual/__init__.py b/python-package/python/perpetual/__init__.py index 7d290f3..9f292b6 100644 --- a/python-package/python/perpetual/__init__.py +++ b/python-package/python/perpetual/__init__.py @@ -144,11 +144,13 @@ def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str], Returns: tuple[list[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[dict]]: Return column names, the flat data, number of rows, the number of columns, cat_index, cat_mapping """ + categorical_features_ = None if isinstance(X, pd.DataFrame): X_ = X.to_numpy() features_ = X.columns.to_list() if categorical_features == "auto": - categorical_features_ = [features_.index(c) for c in X.select_dtypes(include=['category']).columns.tolist()] or None + categorical_columns = X.select_dtypes(include=['category']).columns.tolist() + categorical_features_ = [features_.index(c) for c in categorical_columns] or None else: # Assume it's a numpy array. X_ = X @@ -158,8 +160,7 @@ def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str], categorical_features_ = categorical_features elif categorical_features and all(isinstance(s, str) for s in categorical_features) and isinstance(categorical_features, list): categorical_features_ = [features_.index(c) for c in categorical_features] - else: - categorical_features_ = None + cat_mapping = {} # key: feature_name, value: ordered category names if categorical_features_: diff --git a/python-package/tests/test_booster.py b/python-package/tests/test_booster.py index 7b4db9c..2b210ec 100644 --- a/python-package/tests/test_booster.py +++ b/python-package/tests/test_booster.py @@ -730,3 +730,11 @@ def test_booster_saving_with_monotone_constraints( save_func(model, f64_model_path) model_loaded = load_func(f64_model_path) assert all(preds == model_loaded.predict(X)) + +def test_categorical(X_y): + X = pd.read_csv("../resources/adult_test_df.csv", index_col=False) + y = np.array(pd.read_csv("../resources/adult_test_y.csv", index_col=False, header=None).squeeze('columns')) + cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country'] + X[cols] = X[cols].astype('category') + model = PerpetualBooster() + model.fit(X, y) \ No newline at end of file diff --git a/scripts/make_resources.py b/scripts/make_resources.py index e0711da..59a4bc6 100644 --- a/scripts/make_resources.py +++ b/scripts/make_resources.py @@ -83,6 +83,8 @@ features_, adult_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto") features_, adult_test_flat, rows, cols = transform_input_frame(data_test, cat_mapping) + data_test.to_csv("resources/adult_test_df.csv", index=False) + pd.Series(adult_train_flat).to_csv("resources/adult_train_flat.csv", index=False, header=False) pd.Series(adult_test_flat).to_csv("resources/adult_test_flat.csv", index=False, header=False) pd.Series(y_train).to_csv("resources/adult_train_y.csv", index=False, header=False)