fix: Issue #18 error while fitting and predicting on dataset with boolean columns

cezary.maszczyk · cezary.maszczyk · commit 4132386d815b · 2024-02-05T10:27:46.000+01:00
diff --git a/rulekit/_helpers.py b/rulekit/_helpers.py
@@ -175,6 +175,17 @@ def set_attribute_role(example_set, attribute: str, role: str) -> object:
     return role_setter.apply(example_set)
 
 
+def _sanitize_dataset_columns(data: pd.DataFrame) -> pd.DataFrame:
+    """Return ``data`` with bool columns cast to ``str``, input untouched."""
+    # RuleKit's internal ExampleSet does not support boolean columns (see
+    # Issue #18). The frame is copied only when a cast is actually needed.
+    bool_positions = [i for i, t in enumerate(data.dtypes) if t.name == 'bool']
+    sanitized = data.copy() if bool_positions else data
+    for position in bool_positions:
+        sanitized.isetitem(position, data.iloc[:, position].astype(str))
+    return sanitized
+
+
 def create_example_set(
     values: Union[pd.DataFrame, np.ndarray],
     labels: Union[pd.Series, np.ndarray] = None,
@@ -201,6 +212,7 @@ def create_example_set(
     attributes_names = None
     label_name = None
     if isinstance(values, pd.DataFrame):
+        values = _sanitize_dataset_columns(values)
         attributes_names = values.columns.values
         values = values.to_numpy()
     if isinstance(labels, pd.Series):
diff --git a/rulekit/classification.py b/rulekit/classification.py
@@ -192,10 +192,17 @@ def _get_unique_label_values(self, labels: Data):
 
     def _prepare_labels(self, labels: Data) -> Data:
         if isinstance(labels, pd.DataFrame) or isinstance(labels, pd.Series):
+            if labels.dtypes.name == 'bool':
+                return labels.astype(str)
             if isinstance(labels.iloc[0], Number):
                 self._remap_to_numeric = True
                 return labels.astype(str)
         else:
+            if (
+                isinstance(labels[0], bool) or
+                (isinstance(labels, np.ndarray) and labels.dtype.name == 'bool')
+            ):
+                return list(map(str, labels))
             if isinstance(labels[0], Number):
                 self._remap_to_numeric = True
                 return list(map(str, labels))
diff --git a/rulekit/regression.py b/rulekit/regression.py
@@ -112,7 +112,8 @@ def _validate_labels(self, labels: Data):
             first_label = labels[0]
         if not isinstance(first_label, Number):
             raise ValueError(
-                'DecisionTreeRegressor requires lables values to be numeric')
+                f'{self.__class__.__name__} requires lables values to be numeric'
+            )
 
     def fit(self, values: Data, labels: Data) -> RuleRegressor:  # pylint: disable=arguments-differ
         """Train model on given dataset.
diff --git a/tests/test_classifier.py b/tests/test_classifier.py
@@ -12,7 +12,12 @@
 from sklearn import metrics
 import numpy as np
 
-from tests.utils import get_test_cases, assert_rules_are_equals, assert_accuracy_is_greater
+from tests.utils import (
+    dir_path,
+    get_test_cases,
+    assert_rules_are_equals,
+    assert_accuracy_is_greater,
+)
 
 
 class TestClassifier(unittest.TestCase):
@@ -152,7 +157,7 @@ def test_compare_with_java_results(self):
     def test_predict_proba(self):
         test_case = get_test_cases('ClassificationSnCTest')[0]
         params = test_case.induction_params
-        clf = classification.ExpertRuleClassifier(**params)
+        clf = classification.RuleClassifier(**params)
         example_set = test_case.example_set
         clf.fit(
             example_set.values,
@@ -174,6 +179,24 @@ def test_predict_proba(self):
             'Predicted probabilities should be in range [0, 1]'
         )
 
+    def test_fit_and_predict_on_boolean_columns(self):
+        """Fit and predict with boolean features and labels (Issue #18)."""
+        test_case = get_test_cases('ClassificationSnCTest')[0]
+        clf = classification.RuleClassifier(**test_case.induction_params)
+        X, y = test_case.example_set.values, test_case.example_set.labels
+        # Seeded generator: a failing run can be replayed with the same column
+        X['boolean_column'] = np.random.default_rng(0).random(X.shape[0]) < 0.5
+        clf.fit(X, y)
+        clf.predict(X)
+
+        y = y.astype(bool)
+        clf.fit(X, y)
+        clf.predict(X)
+
+        y = pd.Series(y)
+        clf.fit(X, y)
+        clf.predict(X)
+
 
 class TestExperClassifier(unittest.TestCase):
 
@@ -222,6 +245,46 @@ def test_predict_proba(self):
             'Predicted probabilities should be in range [0, 1]'
         )
 
+    def test_left_open_intervals_in_expert_induction(self):
+        """Fit an expert classifier on left-open interval rules (Issue #17)."""
+        df = pd.DataFrame(arff.loadarff(
+            f'{dir_path}/resources/data/seismic-bumps-train-minimal.arff')[0]
+        )
+        X = df.drop('class', axis=1)
+        y = df['class']
+
+        expert_rules = [
+            ('rule-0', 'IF [[gimpuls = <-inf, 750)]] THEN class = {0}'),
+            ('rule-1', 'IF [[gimpuls = (750, inf)]] THEN class = {1}')
+        ]
+
+        expert_preferred_conditions = [
+            ('preferred-condition-0',
+             '1: IF [[seismic = {a}]] THEN class = {0}'),
+            ('preferred-attribute-0',
+             '1: IF [[gimpuls = Any]] THEN class = {1}')
+        ]
+
+        expert_forbidden_conditions = [
+            ('forb-attribute-0',
+             '1: IF [[seismoacoustic  = Any]] THEN class = {0}'),
+            ('forb-attribute-1', 'inf: IF [[ghazard  = Any]] THEN class = {1}')
+        ]
+        clf = classification.ExpertRuleClassifier(
+            minsupp_new=8,
+            max_growing=0,
+            extend_using_preferred=True,
+            extend_using_automatic=True,
+            induce_using_preferred=True,
+            induce_using_automatic=True
+        )
+        clf.fit(
+            X, y,
+            expert_rules=expert_rules,
+            expert_preferred_conditions=expert_preferred_conditions,
+            expert_forbidden_conditions=expert_forbidden_conditions
+        )
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/tests/test_regression.py b/tests/test_regression.py
@@ -1,5 +1,7 @@
 import unittest
 import threading
+import numpy as np
+import pandas as pd
 
 from rulekit.main import RuleKit
 from rulekit import regression
@@ -70,14 +72,27 @@ def test_compare_with_java_results(self):
             assert_score_is_greater(tree.predict(
                 example_set.values), example_set.labels, 0.7)
 
+    def test_fit_and_predict_on_boolean_columns(self):
+        """Fit and predict on a seeded random boolean feature (Issue #18)."""
+        test_case = get_test_cases('RegressionSnCTest')[0]
+        clf = regression.RuleRegressor(**test_case.induction_params)
+        X, y = test_case.example_set.values, test_case.example_set.labels
+        X['boolean_column'] = np.random.default_rng(0).random(X.shape[0]) < 0.5
+        clf.fit(X, y)
+        clf.predict(X)
+
+        y = pd.Series(y)
+        clf.fit(X, y)
+        clf.predict(X)
+
 
 class TestExpertRegressor(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
         RuleKit.init()
 
-    @unittest.skip("TODO skipping due to Issue #17")
+    @unittest.skip("TODO skipping due to Issue #19")
     def test_compare_with_java_results(self):
         test_cases = get_test_cases('RegressionExpertSnCTest')
 
diff --git a/tests/test_survival.py b/tests/test_survival.py
@@ -1,5 +1,7 @@
 import unittest
 import threading
+import numpy as np
+import pandas as pd
 
 from rulekit.main import RuleKit
 from rulekit import survival
@@ -65,14 +67,30 @@ def test_compare_with_java_results(self):
             actual = list(map(lambda e: str(e), model.rules))
             assert_rules_are_equals(expected, actual)
 
+    def test_fit_and_predict_on_boolean_columns(self):
+        """Fit and predict on a seeded random boolean feature (Issue #18)."""
+        test_case = get_test_cases('SurvivalLogRankSnCTest')[0]
+        params = test_case.induction_params
+        clf = survival.SurvivalRules(
+            **params, survival_time_attr=test_case.survival_time
+        )
+        X, y = test_case.example_set.values, test_case.example_set.labels
+        X['boolean_column'] = np.random.default_rng(0).random(X.shape[0]) < 0.5
+        clf.fit(X, y)
+        clf.predict(X)
+
+        y = pd.Series(y)
+        clf.fit(X, y)
+        clf.predict(X)
+
 
 class TestExpertSurvivalLogRankTree(unittest.TestCase):
 
     @classmethod
     def setUpClass(cls):
         RuleKit.init()
 
-    @unittest.skip("TODO skipping due to Issue #17")
+    @unittest.skip("TODO skipping due to Issue #19")
     def test_compare_with_java_results(self):
         test_cases = get_test_cases('SurvivalLogRankExpertSnCTest')