diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4d73e5c..87f3158 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -15,8 +15,9 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ['3.7', '3.8', '3.9', '3.10'] - os: [ubuntu-latest, windows-latest] + python-version: ["3.8", "3.9", "3.10"] + os: [ubuntu-latest, macos-latest, windows-latest] + fail-fast: false steps: - uses: actions/checkout@v2 - name: Set up Python ${{ matrix.python-version }} diff --git a/feature/text_based.py b/feature/text_based.py index fff1ba5..274b4d7 100644 --- a/feature/text_based.py +++ b/feature/text_based.py @@ -124,6 +124,10 @@ def __init__(self, num_features: int, seed: int = Constants.default_seed, trials self.matrix = None self.text_embeddings = None + # Initialize models + self.set_cover_model = None + self.max_cover_model = None + def run(self, input_df: pd.DataFrame, labels: pd.DataFrame, num_features: int, featurization_method: TextWiser, optimization_method: str = "exact", cost_metric: str = "diverse", trials: int = 10) -> List: @@ -484,22 +488,22 @@ def _select_random(self, trials: int = 10) -> List[int]: def _solve_set_cover(self, data: Data) -> List: # Create Model object - model = Model("Set Cover Model") + self.set_cover_model = Model("Set Cover Model") # Variables - x = [model.add_var(var_type=BINARY) for _ in data.X] + x = [self.set_cover_model.add_var(var_type=BINARY) for _ in data.X] # Constraint: every row should be covered for row in data.rows: - model.add_constr(xsum(data.matrix[row, i] * x[i] for i in data.X) >= 1) + self.set_cover_model.add_constr(xsum(data.matrix[row, i] * x[i] for i in data.X) >= 1) # Objective: minimize - model.objective = minimize(xsum(data.cost[i] * x[i] for i in data.X)) + self.set_cover_model.objective = minimize(xsum(data.cost[i] * x[i] for i in data.X)) # Solve - model.verbose = False - model.optimize() - check_true(model.status == OptimizationStatus.OPTIMAL, ValueError("Max 
Cover Error: " + self.set_cover_model.verbose = False + self.set_cover_model.optimize() + check_true(self.set_cover_model.status == OptimizationStatus.OPTIMAL, ValueError("Set Cover Error: " "optimal solution not found.")) # Solution @@ -507,9 +511,9 @@ def _solve_set_cover(self, data: Data) -> List: if self.verbose: print("=" * 40) - print("SET COVER OBJECTIVE:", model.objective_value) + print("SET COVER OBJECTIVE:", self.set_cover_model.objective_value) print("SELECTED:", selected) - print("STATUS:", model.status) + print("STATUS:", self.set_cover_model.status) print("=" * 40) # Return @@ -518,54 +522,54 @@ def _solve_max_cover(self, data: Data, selected: List) -> List: # Model - model = Model("Max Cover Model") + self.max_cover_model = Model("Max Cover Model") # Variables - x = [model.add_var(var_type=BINARY) for _ in data.X] - is_row_covered = [model.add_var(var_type=BINARY) for _ in data.rows] - num_row_covered = model.add_var(var_type=INTEGER) + x = [self.max_cover_model.add_var(var_type=BINARY) for _ in data.X] + is_row_covered = [self.max_cover_model.add_var(var_type=BINARY) for _ in data.rows] + num_row_covered = self.max_cover_model.add_var(var_type=INTEGER) # Constraint: Link between x and is_row_covered for row in data.rows: for i in data.X: # if any selected column has the label, then the row would be covered - model.add_constr(data.matrix[row, i] * x[i] <= is_row_covered[row]) + self.max_cover_model.add_constr(data.matrix[row, i] * x[i] <= is_row_covered[row]) # total selected - model.add_constr(xsum(data.matrix[row, i] * x[i] for i in data.X) >= is_row_covered[row]) + self.max_cover_model.add_constr(xsum(data.matrix[row, i] * x[i] for i in data.X) >= is_row_covered[row]) # Constraint: Link is_row_covered with num_row_covered - model.add_constr(xsum(is_row_covered[row] for row in data.rows) == num_row_covered) + self.max_cover_model.add_constr(xsum(is_row_covered[row] for row in data.rows) == 
num_row_covered) # Constraint: If selected is given, discard columns that are not part of selection for i in data.X: if i not in selected: - model.add_constr(x[i] == 0) + self.max_cover_model.add_constr(x[i] == 0) # Constraint: limit number of selected to max_cover_size - model.add_constr(xsum(x[i] for i in data.X) <= self.num_features) + self.max_cover_model.add_constr(xsum(x[i] for i in data.X) <= self.num_features) # Objective: maximize "row" coverage (not the whole coverage of 1s) - model.objective = maximize(xsum(is_row_covered[row] for row in data.rows)) + self.max_cover_model.objective = maximize(xsum(is_row_covered[row] for row in data.rows)) # Solve - model.verbose = False - model.optimize() + self.max_cover_model.verbose = False + self.max_cover_model.optimize() # Solution selected = [i for i in data.X if float(x[i].x) >= 0.99] if self.verbose: print("=" * 40) - print("MAX COVER OBJECTIVE:", model.objective_value) + print("MAX COVER OBJECTIVE:", self.max_cover_model.objective_value) print("NUM ROWS COVERED:", num_row_covered.x, "coverage: {:.2f}".format(num_row_covered.x / data.matrix.shape[0])) print("SIZE:", len(selected), "reduction: {:.2f}".format((data.matrix.shape[1] - len(selected)) / data.matrix.shape[1])) print("SELECTED:", selected) - print("STATUS:", model.status) + print("STATUS:", self.max_cover_model.status) print("=" * 40) - check_true(model.status == OptimizationStatus.OPTIMAL, ValueError("Max Cover Error: " + check_true(self.max_cover_model.status == OptimizationStatus.OPTIMAL, ValueError("Max Cover Error: " "optimal solution not found.")) # Return diff --git a/tests/test_base.py b/tests/test_base.py index e4986a7..d9aedb1 100644 --- a/tests/test_base.py +++ b/tests/test_base.py @@ -3,8 +3,6 @@ # SPDX-License-Identifier: GNU GPLv3 import unittest -from sklearn.datasets import load_iris -from feature.utils import get_data_label, reduce_memory, DataTransformer class BaseTest(unittest.TestCase): @@ -23,18 +21,3 @@ def 
assertListAlmostEqual(self, list1, list2): for index, val in enumerate(list1): self.assertAlmostEqual(val, list2[index], delta=0.01) - - @staticmethod - def test_mem_usage(): - data, label = get_data_label(load_iris()) - data_reduced = reduce_memory(data, verbose=False) - - @staticmethod - def test_cap_floor(): - data, label = get_data_label(load_iris()) - - # Fit transformer and transform to numeric contexts - data_transformer = DataTransformer() - contexts = data_transformer.fit(data) - contexts = data_transformer.transform(data) - contexts = data_transformer.fit_transform(data) diff --git a/tests/test_text.py b/tests/test_text.py index b99dc53..56fc395 100644 --- a/tests/test_text.py +++ b/tests/test_text.py @@ -165,7 +165,7 @@ def test_text_based_random_diverse(self): self.assertEqual(data.shape[1], labels.shape[1]) method = SelectionMethod.TextBased(num_features=None, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="random", @@ -285,7 +285,7 @@ def test_text_based_greedy_num_feature_unicost_diverse(self): self.assertEqual(data.shape[1], labels.shape[1]) method = SelectionMethod.TextBased(num_features=3, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="greedy", @@ -323,7 +323,7 @@ def test_text_based_greedy_unicost_diverse_identity(self): self.assertEqual(data.shape[1], labels.shape[1]) method = SelectionMethod.TextBased(num_features=None, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="greedy", @@ -358,7 +358,7 @@ def test_text_based_kmeans_num_feature(self): "item7": [1, 0, 0, 1, 0, 0, 1]}) method = 
SelectionMethod.TextBased(num_features=2, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="kmeans", @@ -396,7 +396,7 @@ def test_text_based_kmeans_unicost(self): "item7": [0, 1, 0, 0, 0, 0, 1]}) method = SelectionMethod.TextBased(num_features=None, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="kmeans", @@ -433,7 +433,7 @@ def test_text_based_kmeans_diverse(self): "item7": [0, 1, 0, 0, 0, 0, 1]}) method = SelectionMethod.TextBased(num_features=None, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="kmeans", @@ -563,7 +563,7 @@ def test_text_based_exact_diverse(self): self.assertEqual(data.shape[1], labels.shape[1]) method = SelectionMethod.TextBased(num_features=None, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="exact", @@ -571,7 +571,7 @@ def test_text_based_exact_diverse(self): trials=1) method2 = SelectionMethod.TextBased(num_features=None, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="exact", @@ -580,20 +580,13 @@ def test_text_based_exact_diverse(self): selector = Selective(method) selector.fit(data, labels) - selected_features = selector.transform(data) - - self.assertEqual(selector.selection_method.trials, 1) # Only run once - self.assertTrue(isinstance(selected_features, pd.DataFrame)) selector2 = Selective(method2) 
selector2.fit(data, labels) - selected_features2 = selector2.transform(data) # Verify the consistency of selected features with the initial run - self.assertTrue(selected_features.equals(selected_features2)) - - # Verify that the features selected - self.assertListEqual(list(selected_features2.columns), ['item3', 'item4', 'item7']) + self.assertEqual(selector._imp.content_selector.set_cover_model.objective_value, + selector2._imp.content_selector.set_cover_model.objective_value) # Verify selection for the Exact method, diverse, and fixed number of features with the same seed # (the same features should select) @@ -615,7 +608,7 @@ def test_text_based_exact_num_feature_diverse(self): self.assertEqual(data.shape[1], labels.shape[1]) method = SelectionMethod.TextBased(num_features=2, # num_features is less than the solution of set cover - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="exact", @@ -623,7 +616,7 @@ def test_text_based_exact_num_feature_diverse(self): trials=1) # Default cost metric is diverse method2 = SelectionMethod.TextBased(num_features=2, - featurization_method=TextWiser(Embedding.TfIdf(min_df=0), + featurization_method=TextWiser(Embedding.TfIdf(min_df=0.), Transformation.NMF(n_components=10, random_state=123)), optimization_method="exact", @@ -632,17 +625,16 @@ def test_text_based_exact_num_feature_diverse(self): selector = Selective(method) selector.fit(data, labels) - selected_features = selector.transform(data) selector2 = Selective(method2) selector2.fit(data, labels) - selected_features2 = selector2.transform(data) # Verify the consistency of selected features with the initial run - self.assertTrue(selected_features.equals(selected_features2)) + self.assertEqual(selector._imp.content_selector.set_cover_model.objective_value, + selector2._imp.content_selector.set_cover_model.objective_value) - # 
Verify that the features selected - self.assertListEqual(list(selected_features2.columns), ['item3', 'item7']) + self.assertEqual(selector._imp.content_selector.max_cover_model.objective_value, + selector2._imp.content_selector.max_cover_model.objective_value) ################################################ ########## Verify invalid tests ############### diff --git a/tests/test_utils.py b/tests/test_utils.py new file mode 100644 index 0000000..c56ae6a --- /dev/null +++ b/tests/test_utils.py @@ -0,0 +1,29 @@ +# -*- coding: utf-8 -*- +# Copyright FMR LLC +# SPDX-License-Identifier: GNU GPLv3 + +import numpy as np +from sklearn.datasets import load_iris +from feature.utils import get_data_label, reduce_memory, DataTransformer + +from tests.test_base import BaseTest + + +class TestUtils(BaseTest): + + def test_mem_usage(self): + data, label = get_data_label(load_iris()) + data_reduced = reduce_memory(data, verbose=False) + self.assertEqual(data_reduced.shape, (150, 4)) + self.assertFalse(data_reduced.isna().any().any()) + + def test_cap_floor(self): + data, label = get_data_label(load_iris()) + + # Fit transformer and transform to numeric contexts + data_transformer = DataTransformer() + contexts = data_transformer.fit(data) + contexts = data_transformer.transform(data) + contexts = data_transformer.fit_transform(data) + self.assertFalse(np.isnan(contexts).any()) + self.assertEqual(contexts.shape, (150, 4))