Skip to content

Commit 4132386

Browse files
author
cezary.maszczyk
committed
fix: Issue #18 error while fitting and predicting on dataset with boolean columns
1 parent 8ae1292 commit 4132386

File tree

6 files changed

+121
-5
lines changed

6 files changed

+121
-5
lines changed

rulekit/_helpers.py

+12
Original file line numberDiff line numberDiff line change
@@ -175,6 +175,17 @@ def set_attribute_role(example_set, attribute: str, role: str) -> object:
175175
return role_setter.apply(example_set)
176176

177177

178+
def _sanitize_dataset_columns(
179+
data: pd.DataFrame
180+
) -> pd.DataFrame:
181+
for column_index in range(data.shape[1]):
182+
if data.iloc[:, column_index].dtypes.name == 'bool':
183+
# ExampleSet class that RuleKit internally uses does not
184+
# support boolean columns at the moment (see Issue #18)
185+
data.iloc[:, column_index] = data.iloc[:, column_index].astype(str)
186+
return data
187+
188+
178189
def create_example_set(
179190
values: Union[pd.DataFrame, np.ndarray],
180191
labels: Union[pd.Series, np.ndarray] = None,
@@ -201,6 +212,7 @@ def create_example_set(
201212
attributes_names = None
202213
label_name = None
203214
if isinstance(values, pd.DataFrame):
215+
values = _sanitize_dataset_columns(values)
204216
attributes_names = values.columns.values
205217
values = values.to_numpy()
206218
if isinstance(labels, pd.Series):

rulekit/classification.py

+7
Original file line numberDiff line numberDiff line change
@@ -192,10 +192,17 @@ def _get_unique_label_values(self, labels: Data):
192192

193193
def _prepare_labels(self, labels: Data) -> Data:
194194
if isinstance(labels, pd.DataFrame) or isinstance(labels, pd.Series):
195+
if labels.dtypes.name == 'bool':
196+
return labels.astype(str)
195197
if isinstance(labels.iloc[0], Number):
196198
self._remap_to_numeric = True
197199
return labels.astype(str)
198200
else:
201+
if (
202+
isinstance(labels[0], bool) or
203+
(isinstance(labels, np.ndarray) and labels.dtype.name == 'bool')
204+
):
205+
return list(map(str, labels))
199206
if isinstance(labels[0], Number):
200207
self._remap_to_numeric = True
201208
return list(map(str, labels))

rulekit/regression.py

+2-1
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,8 @@ def _validate_labels(self, labels: Data):
112112
first_label = labels[0]
113113
if not isinstance(first_label, Number):
114114
raise ValueError(
115-
'DecisionTreeRegressor requires lables values to be numeric')
115+
f'{self.__class__.__name__} requires lables values to be numeric'
116+
)
116117

117118
def fit(self, values: Data, labels: Data) -> RuleRegressor: # pylint: disable=arguments-differ
118119
"""Train model on given dataset.

tests/test_classifier.py

+65-2
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,12 @@
1212
from sklearn import metrics
1313
import numpy as np
1414

15-
from tests.utils import get_test_cases, assert_rules_are_equals, assert_accuracy_is_greater
15+
from tests.utils import (
16+
dir_path,
17+
get_test_cases,
18+
assert_rules_are_equals,
19+
assert_accuracy_is_greater,
20+
)
1621

1722

1823
class TestClassifier(unittest.TestCase):
@@ -152,7 +157,7 @@ def test_compare_with_java_results(self):
152157
def test_predict_proba(self):
153158
test_case = get_test_cases('ClassificationSnCTest')[0]
154159
params = test_case.induction_params
155-
clf = classification.ExpertRuleClassifier(**params)
160+
clf = classification.RuleClassifier(**params)
156161
example_set = test_case.example_set
157162
clf.fit(
158163
example_set.values,
@@ -174,6 +179,24 @@ def test_predict_proba(self):
174179
'Predicted probabilities should be in range [0, 1]'
175180
)
176181

182+
def test_fit_and_predict_on_boolean_columns(self):
    """Smoke test: fit and predict must not fail on boolean data.

    Regression test for Issue #18. A boolean attribute column is added
    to the dataset and the classifier is fitted three times: with the
    original labels, with labels cast to ``bool`` and with those labels
    wrapped in a ``pd.Series``. No assertions are made; the test passes
    when no call raises.
    """
    test_case = get_test_cases('ClassificationSnCTest')[0]
    params = test_case.induction_params
    clf = classification.RuleClassifier(**params)
    X, y = test_case.example_set.values, test_case.example_set.labels
    # A locally seeded generator keeps the generated column (and thus the
    # test) reproducible without touching NumPy's global random state.
    rng = np.random.RandomState(0)
    X['boolean_column'] = rng.randint(
        low=0, high=2, size=X.shape[0]).astype(bool)
    clf.fit(X, y)
    clf.predict(X)

    # Boolean labels.
    y = y.astype(bool)
    clf.fit(X, y)
    clf.predict(X)

    # Boolean labels wrapped in a pandas Series.
    y = pd.Series(y)
    clf.fit(X, y)
    clf.predict(X)
199+
177200

178201
class TestExperClassifier(unittest.TestCase):
179202

@@ -222,6 +245,46 @@ def test_predict_proba(self):
222245
'Predicted probabilities should be in range [0, 1]'
223246
)
224247

248+
# Issue #17
249+
def test_left_open_intervals_in_expert_induction(self):
    """Fit an expert classifier with rules using open-ended intervals.

    Smoke test related to Issue #17: the expert rules contain intervals
    that are unbounded on one side (``<-inf, 750)`` and ``(750, inf)``).
    No assertions are made; the test passes when ``fit`` does not raise.
    """
    dataset_path = (
        f'{dir_path}/resources/data/seismic-bumps-train-minimal.arff'
    )
    frame = pd.DataFrame(arff.loadarff(dataset_path)[0])
    target = frame['class']
    features = frame.drop('class', axis=1)

    # Expert knowledge passed to ``fit`` as keyword arguments.
    expert_knowledge = {
        'expert_rules': [
            ('rule-0', 'IF [[gimpuls = <-inf, 750)]] THEN class = {0}'),
            ('rule-1', 'IF [[gimpuls = (750, inf)]] THEN class = {1}'),
        ],
        'expert_preferred_conditions': [
            ('preferred-condition-0',
             '1: IF [[seismic = {a}]] THEN class = {0}'),
            ('preferred-attribute-0',
             '1: IF [[gimpuls = Any]] THEN class = {1}'),
        ],
        'expert_forbidden_conditions': [
            ('forb-attribute-0',
             '1: IF [[seismoacoustic = Any]] THEN class = {0}'),
            ('forb-attribute-1',
             'inf: IF [[ghazard = Any]] THEN class = {1}'),
        ],
    }
    induction_options = {
        'minsupp_new': 8,
        'max_growing': 0,
        'extend_using_preferred': True,
        'extend_using_automatic': True,
        'induce_using_preferred': True,
        'induce_using_automatic': True,
    }

    clf = classification.ExpertRuleClassifier(**induction_options)
    clf.fit(features, target, **expert_knowledge)
287+
225288

226289
if __name__ == '__main__':
227290
unittest.main()

tests/test_regression.py

+16-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import unittest
22
import threading
3+
import numpy as np
4+
import pandas as pd
35

46
from rulekit.main import RuleKit
57
from rulekit import regression
@@ -70,14 +72,27 @@ def test_compare_with_java_results(self):
7072
assert_score_is_greater(tree.predict(
7173
example_set.values), example_set.labels, 0.7)
7274

75+
def test_fit_and_predict_on_boolean_columns(self):
    """Smoke test: fit and predict must not fail on a boolean column.

    Regression test for Issue #18. A boolean attribute column is added
    to the dataset and the regressor is fitted twice: with the original
    labels and with the labels wrapped in a ``pd.Series``. No assertions
    are made; the test passes when no call raises.
    """
    test_case = get_test_cases('RegressionSnCTest')[0]
    params = test_case.induction_params
    clf = regression.RuleRegressor(**params)
    X, y = test_case.example_set.values, test_case.example_set.labels
    # A locally seeded generator keeps the generated column (and thus the
    # test) reproducible without touching NumPy's global random state.
    rng = np.random.RandomState(0)
    X['boolean_column'] = rng.randint(
        low=0, high=2, size=X.shape[0]).astype(bool)
    clf.fit(X, y)
    clf.predict(X)

    # Same data with the labels wrapped in a pandas Series.
    y = pd.Series(y)
    clf.fit(X, y)
    clf.predict(X)
87+
7388

7489
class TestExpertRegressor(unittest.TestCase):
7590

7691
@classmethod
7792
def setUpClass(cls):
7893
RuleKit.init()
7994

80-
@unittest.skip("TODO skipping due to Issue #17")
95+
@unittest.skip("TODO skipping due to Issue #19")
8196
def test_compare_with_java_results(self):
8297
test_cases = get_test_cases('RegressionExpertSnCTest')
8398

tests/test_survival.py

+19-1
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
import unittest
22
import threading
3+
import numpy as np
4+
import pandas as pd
35

46
from rulekit.main import RuleKit
57
from rulekit import survival
@@ -65,14 +67,30 @@ def test_compare_with_java_results(self):
6567
actual = list(map(lambda e: str(e), model.rules))
6668
assert_rules_are_equals(expected, actual)
6769

70+
def test_fit_and_predict_on_boolean_columns(self):
    """Smoke test: fit and predict must not fail on a boolean column.

    Regression test for Issue #18. A boolean attribute column is added
    to the dataset and the survival model is fitted twice: with the
    original labels and with the labels wrapped in a ``pd.Series``. No
    assertions are made; the test passes when no call raises.
    """
    test_case = get_test_cases('SurvivalLogRankSnCTest')[0]
    params = test_case.induction_params
    clf = survival.SurvivalRules(
        **params, survival_time_attr=test_case.survival_time
    )
    X, y = test_case.example_set.values, test_case.example_set.labels
    # A locally seeded generator keeps the generated column (and thus the
    # test) reproducible without touching NumPy's global random state.
    rng = np.random.RandomState(0)
    X['boolean_column'] = rng.randint(
        low=0, high=2, size=X.shape[0]).astype(bool)
    clf.fit(X, y)
    clf.predict(X)

    # Same data with the labels wrapped in a pandas Series.
    y = pd.Series(y)
    clf.fit(X, y)
    clf.predict(X)
85+
6886

6987
class TestExpertSurvivalLogRankTree(unittest.TestCase):
7088

7189
@classmethod
7290
def setUpClass(cls):
7391
RuleKit.init()
7492

75-
@unittest.skip("TODO skipping due to Issue #17")
93+
@unittest.skip("TODO skipping due to Issue #19")
7694
def test_compare_with_java_results(self):
7795
test_cases = get_test_cases('SurvivalLogRankExpertSnCTest')
7896

0 commit comments

Comments
 (0)