seventh commit

perpetual-ml · Jun 7, 2024 · e34ec03 · e34ec03
1 parent 2da3358
commit e34ec03
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 4 deletions.
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
@@ -32,7 +32,7 @@ Prior to running the tests, you should install `python-package` in editable mode
 cd python-package
 # Install the project in editable mode and all development dependencies
 python -m pip install -e .[dev]
-# You can now return to the rood directory and run the tests...
+# You can now return to the root directory and run the tests...
 cd ..
 
 # Prior to running the tests, build all required test artifacts

diff --git a/python-package/python/perpetual/__init__.py b/python-package/python/perpetual/__init__.py
@@ -144,11 +144,13 @@ def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str],
     Returns:
         tuple[list[str], np.ndarray, int, int, Optional[Iterable[int]], Optional[dict]]: Return column names, the flat data, number of rows, the number of columns, cat_index, cat_mapping
     """
+    categorical_features_ = None
     if isinstance(X, pd.DataFrame):
         X_ = X.to_numpy()
         features_ = X.columns.to_list()
         if categorical_features == "auto":
-            categorical_features_ = [features_.index(c) for c in X.select_dtypes(include=['category']).columns.tolist()] or None
+            categorical_columns = X.select_dtypes(include=['category']).columns.tolist()
+            categorical_features_ = [features_.index(c) for c in categorical_columns] or None
     else:
         # Assume it's a numpy array.
         X_ = X
@@ -158,8 +160,7 @@ def convert_input_frame(X: FrameLike, categorical_features) -> tuple[list[str],
         categorical_features_ = categorical_features
     elif categorical_features and all(isinstance(s, str) for s in categorical_features) and isinstance(categorical_features, list):
         categorical_features_ = [features_.index(c) for c in categorical_features]
-    else:
-        categorical_features_ = None
+
 
     cat_mapping = {}  # key: feature_name, value: ordered category names
     if categorical_features_:

diff --git a/python-package/tests/test_booster.py b/python-package/tests/test_booster.py
@@ -730,3 +730,11 @@ def test_booster_saving_with_monotone_constraints(
         save_func(model, f64_model_path)
         model_loaded = load_func(f64_model_path)
         assert all(preds == model_loaded.predict(X))
+
+def test_categorical(X_y):
+    X = pd.read_csv("../resources/adult_test_df.csv", index_col=False)
+    y = np.array(pd.read_csv("../resources/adult_test_y.csv", index_col=False, header=None).squeeze('columns'))
+    cols = ['workclass', 'education', 'marital-status', 'occupation', 'relationship', 'race', 'native-country']
+    X[cols] = X[cols].astype('category')
+    model = PerpetualBooster()
+    model.fit(X, y)
diff --git a/scripts/make_resources.py b/scripts/make_resources.py
@@ -83,6 +83,8 @@
     features_, adult_train_flat, rows, cols, categorical_features_, cat_mapping = convert_input_frame(data_train, "auto")
     features_, adult_test_flat, rows, cols = transform_input_frame(data_test, cat_mapping)
 
+    data_test.to_csv("resources/adult_test_df.csv", index=False)
+
     pd.Series(adult_train_flat).to_csv("resources/adult_train_flat.csv", index=False, header=False)
     pd.Series(adult_test_flat).to_csv("resources/adult_test_flat.csv", index=False, header=False)
     pd.Series(y_train).to_csv("resources/adult_train_y.csv", index=False, header=False)