
Py Version and Fix text based tests (#16)
bkleyn authored Sep 6, 2023
1 parent 6001d64 · commit 62a5b74
Showing 5 changed files with 76 additions and 67 deletions.
5 changes: 3 additions & 2 deletions .github/workflows/ci.yml
@@ -15,8 +15,9 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: ['3.7', '3.8', '3.9', '3.10']
os: [ubuntu-latest, windows-latest]
python-version: ["3.8", "3.9", "3.10"]
os: [ubuntu-latest, macos-latest, windows-latest]
fail-fast: false
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
52 changes: 28 additions & 24 deletions feature/text_based.py
@@ -124,6 +124,10 @@ def __init__(self, num_features: int, seed: int = Constants.default_seed, trials
self.matrix = None
self.text_embeddings = None

# Initialize models
self.set_cover_model = None
self.max_cover_model = None

def run(self, input_df: pd.DataFrame, labels: pd.DataFrame, num_features: int,
featurization_method: TextWiser, optimization_method: str = "exact",
cost_metric: str = "diverse", trials: int = 10) -> List:
@@ -484,32 +488,32 @@ def _select_random(self, trials: int = 10) -> List[int]:
def _solve_set_cover(self, data: Data) -> List:

# Create Model object
model = Model("Set Cover Model")
self.set_cover_model = Model("Set Cover Model")

# Variables
x = [model.add_var(var_type=BINARY) for _ in data.X]
x = [self.set_cover_model.add_var(var_type=BINARY) for _ in data.X]

# Constraint: every row should be covered
for row in data.rows:
model.add_constr(xsum(data.matrix[row, i] * x[i] for i in data.X) >= 1)
self.set_cover_model.add_constr(xsum(data.matrix[row, i] * x[i] for i in data.X) >= 1)

# Objective: minimize
model.objective = minimize(xsum(data.cost[i] * x[i] for i in data.X))
self.set_cover_model.objective = minimize(xsum(data.cost[i] * x[i] for i in data.X))

# Solve
model.verbose = False
model.optimize()
check_true(model.status == OptimizationStatus.OPTIMAL, ValueError("Max Cover Error: "
self.set_cover_model.verbose = False
self.set_cover_model.optimize()
check_true(self.set_cover_model.status == OptimizationStatus.OPTIMAL, ValueError("Max Cover Error: "
"optimal solution not found."))

# Solution
selected = [i for i in data.X if float(x[i].x) >= 0.99]

if self.verbose:
print("=" * 40)
print("SET COVER OBJECTIVE:", model.objective_value)
print("SET COVER OBJECTIVE:", self.set_cover_model.objective_value)
print("SELECTED:", selected)
print("STATUS:", model.status)
print("STATUS:", self.set_cover_model.status)
print("=" * 40)

# Return
@@ -518,54 +522,54 @@ def _solve_set_cover(self, data: Data) -> List:
def _solve_max_cover(self, data: Data, selected: List) -> List:

# Model
model = Model("Max Cover Model")
self.max_cover_model = Model("Max Cover Model")

# Variables
x = [model.add_var(var_type=BINARY) for _ in data.X]
is_row_covered = [model.add_var(var_type=BINARY) for _ in data.rows]
num_row_covered = model.add_var(var_type=INTEGER)
x = [self.max_cover_model.add_var(var_type=BINARY) for _ in data.X]
is_row_covered = [self.max_cover_model.add_var(var_type=BINARY) for _ in data.rows]
num_row_covered = self.max_cover_model.add_var(var_type=INTEGER)

# Constraint: Link between x and is_row_covered
for row in data.rows:
for i in data.X:
# if any selected column has the label, then the row would be covered
model.add_constr(data.matrix[row, i] * x[i] <= is_row_covered[row])
self.max_cover_model.add_constr(data.matrix[row, i] * x[i] <= is_row_covered[row])
# total selected
model.add_constr(xsum(data.matrix[row, i] * x[i] for i in data.X) >= is_row_covered[row])
self.max_cover_model.add_constr(xsum(data.matrix[row, i] * x[i] for i in data.X) >= is_row_covered[row])

# Constraint: Link is_row_covered with num_row_covered
model.add_constr(xsum(is_row_covered[row] for row in data.rows) == num_row_covered)
self.max_cover_model.add_constr(xsum(is_row_covered[row] for row in data.rows) == num_row_covered)

# Constraint: If selected is given, discard columns that are not part of selection
for i in data.X:
if i not in selected:
model.add_constr(x[i] == 0)
self.max_cover_model.add_constr(x[i] == 0)

# Constraint: limit number of selected to max_cover_size
model.add_constr(xsum(x[i] for i in data.X) <= self.num_features)
self.max_cover_model.add_constr(xsum(x[i] for i in data.X) <= self.num_features)

# Objective: maximize "row" coverage (not the whole coverage of 1s)
model.objective = maximize(xsum(is_row_covered[row] for row in data.rows))
self.max_cover_model.objective = maximize(xsum(is_row_covered[row] for row in data.rows))

# Solve
model.verbose = False
model.optimize()
self.max_cover_model.verbose = False
self.max_cover_model.optimize()

# Solution
selected = [i for i in data.X if float(x[i].x) >= 0.99]

if self.verbose:
print("=" * 40)
print("MAX COVER OBJECTIVE:", model.objective_value)
print("MAX COVER OBJECTIVE:", self.max_cover_model.objective_value)
print("NUM ROWS COVERED:", num_row_covered.x,
"coverage: {:.2f}".format(num_row_covered.x / data.matrix.shape[0]))
print("SIZE:", len(selected),
"reduction: {:.2f}".format((data.matrix.shape[1] - len(selected)) / data.matrix.shape[1]))
print("SELECTED:", selected)
print("STATUS:", model.status)
print("STATUS:", self.max_cover_model.status)
print("=" * 40)

check_true(model.status == OptimizationStatus.OPTIMAL, ValueError("Max Cover Error: "
check_true(self.max_cover_model.status == OptimizationStatus.OPTIMAL, ValueError("Max Cover Error: "
"optimal solution not found."))

# Return
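
Note: the refactor above keeps the solved MIP models on the selector instance (self.set_cover_model and self.max_cover_model) so that the tests can inspect their objective values after fitting. For reference, a minimal, self-contained sketch of the set-cover pattern used here (assuming only that python-mip is installed; the coverage matrix and costs below are illustrative, not taken from the repository):

import numpy as np
from mip import Model, xsum, minimize, BINARY, OptimizationStatus

# Rows are items to cover, columns are candidate features;
# matrix[r, c] == 1 means column c covers row r.
matrix = np.array([[1, 0, 1],
                   [0, 1, 1],
                   [1, 1, 0]])
n_rows, n_cols = matrix.shape
cost = [1.0] * n_cols  # unicost; the selector can also weight columns via its "diverse" cost metric

set_cover_model = Model("Set Cover Sketch")
x = [set_cover_model.add_var(var_type=BINARY) for _ in range(n_cols)]

# Constraint: every row must be covered by at least one selected column
for row in range(n_rows):
    set_cover_model.add_constr(xsum(int(matrix[row, i]) * x[i] for i in range(n_cols)) >= 1)

# Objective: minimize the total cost of the selected columns
set_cover_model.objective = minimize(xsum(cost[i] * x[i] for i in range(n_cols)))

set_cover_model.verbose = False
status = set_cover_model.optimize()
assert status == OptimizationStatus.OPTIMAL

selected = [i for i in range(n_cols) if x[i].x >= 0.99]
print("objective:", set_cover_model.objective_value, "selected:", selected)

The max-cover model follows the same pattern, adding binary row-coverage indicators linked to the selection variables, a cap of num_features on the number of selected columns, and a maximize objective over the covered rows.
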
17 changes: 0 additions & 17 deletions tests/test_base.py
@@ -3,8 +3,6 @@
# SPDX-License-Identifier: GNU GPLv3

import unittest
from sklearn.datasets import load_iris
from feature.utils import get_data_label, reduce_memory, DataTransformer


class BaseTest(unittest.TestCase):
@@ -23,18 +21,3 @@ def assertListAlmostEqual(self, list1, list2):

for index, val in enumerate(list1):
self.assertAlmostEqual(val, list2[index], delta=0.01)

@staticmethod
def test_mem_usage():
data, label = get_data_label(load_iris())
data_reduced = reduce_memory(data, verbose=False)

@staticmethod
def test_cap_floor():
data, label = get_data_label(load_iris())

# Fit transformer and transform to numeric contexts
data_transformer = DataTransformer()
contexts = data_transformer.fit(data)
contexts = data_transformer.transform(data)
contexts = data_transformer.fit_transform(data)
40 changes: 16 additions & 24 deletions tests/test_text.py
@@ -165,7 +165,7 @@ def test_text_based_random_diverse(self):
self.assertEqual(data.shape[1], labels.shape[1])

method = SelectionMethod.TextBased(num_features=None,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="random",
@@ -285,7 +285,7 @@ def test_text_based_greedy_num_feature_unicost_diverse(self):
self.assertEqual(data.shape[1], labels.shape[1])

method = SelectionMethod.TextBased(num_features=3,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="greedy",
@@ -323,7 +323,7 @@ def test_text_based_greedy_unicost_diverse_identity(self):
self.assertEqual(data.shape[1], labels.shape[1])

method = SelectionMethod.TextBased(num_features=None,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="greedy",
@@ -358,7 +358,7 @@ def test_text_based_kmeans_num_feature(self):
"item7": [1, 0, 0, 1, 0, 0, 1]})

method = SelectionMethod.TextBased(num_features=2,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="kmeans",
@@ -396,7 +396,7 @@ def test_text_based_kmeans_unicost(self):
"item7": [0, 1, 0, 0, 0, 0, 1]})

method = SelectionMethod.TextBased(num_features=None,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="kmeans",
@@ -433,7 +433,7 @@ def test_text_based_kmeans_diverse(self):
"item7": [0, 1, 0, 0, 0, 0, 1]})

method = SelectionMethod.TextBased(num_features=None,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="kmeans",
@@ -563,15 +563,15 @@ def test_text_based_exact_diverse(self):
self.assertEqual(data.shape[1], labels.shape[1])

method = SelectionMethod.TextBased(num_features=None,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="exact",
cost_metric="diverse",
trials=1)

method2 = SelectionMethod.TextBased(num_features=None,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="exact",
@@ -580,20 +580,13 @@

selector = Selective(method)
selector.fit(data, labels)
selected_features = selector.transform(data)

self.assertEqual(selector.selection_method.trials, 1) # Only run once
self.assertTrue(isinstance(selected_features, pd.DataFrame))

selector2 = Selective(method2)
selector2.fit(data, labels)
selected_features2 = selector2.transform(data)

# Verify the consistency of selected features with the initial run
self.assertTrue(selected_features.equals(selected_features2))

# Verify that the features selected
self.assertListEqual(list(selected_features2.columns), ['item3', 'item4', 'item7'])
self.assertEqual(selector._imp.content_selector.set_cover_model.objective_value,
selector2._imp.content_selector.set_cover_model.objective_value)

# Verify selection for the Exact method, diverse, and fixed number of features with the same seed
# (the same features should select)
@@ -615,15 +608,15 @@ def test_text_based_exact_num_feature_diverse(self):
self.assertEqual(data.shape[1], labels.shape[1])

method = SelectionMethod.TextBased(num_features=2, # num_features is less than the solution of set cover
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="exact",
cost_metric="diverse",
trials=1) # Default cost metric is diverse

method2 = SelectionMethod.TextBased(num_features=2,
featurization_method=TextWiser(Embedding.TfIdf(min_df=0),
featurization_method=TextWiser(Embedding.TfIdf(min_df=0.),
Transformation.NMF(n_components=10,
random_state=123)),
optimization_method="exact",
@@ -632,17 +625,16 @@

selector = Selective(method)
selector.fit(data, labels)
selected_features = selector.transform(data)

selector2 = Selective(method2)
selector2.fit(data, labels)
selected_features2 = selector2.transform(data)

# Verify the consistency of selected features with the initial run
self.assertTrue(selected_features.equals(selected_features2))
self.assertEqual(selector._imp.content_selector.set_cover_model.objective_value,
selector2._imp.content_selector.set_cover_model.objective_value)

# Verify that the features selected
self.assertListEqual(list(selected_features2.columns), ['item3', 'item7'])
self.assertEqual(selector._imp.content_selector.max_cover_model.objective_value,
selector2._imp.content_selector.max_cover_model.objective_value)

################################################
########## Verify invalid tests ###############
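
Note: the tests above switch min_df from the integer 0 to the float 0. in the TF-IDF settings. For reference, a minimal sketch of the featurization step these tests configure, assuming the textwiser package is installed; the example documents are illustrative, and the expected output shape is an assumption based on NMF reducing each document to n_components dimensions:

from textwiser import TextWiser, Embedding, Transformation

docs = ["a short document about sports",
        "another short document about travel",
        "a third document about sports and travel"]

# TF-IDF embedding followed by an NMF transformation, mirroring the configuration used in the tests above
featurizer = TextWiser(Embedding.TfIdf(min_df=0.),
                       Transformation.NMF(n_components=2, random_state=123))
embeddings = featurizer.fit_transform(docs)
print(embeddings.shape)  # expected: (3, 2), one n_components-dimensional vector per document
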
29 changes: 29 additions & 0 deletions tests/test_utils.py
@@ -0,0 +1,29 @@
# -*- coding: utf-8 -*-
# Copyright FMR LLC <[email protected]>
# SPDX-License-Identifier: GNU GPLv3

import numpy as np
from sklearn.datasets import load_iris
from feature.utils import get_data_label, reduce_memory, DataTransformer

from tests.test_base import BaseTest


class TestUtils(BaseTest):

def test_mem_usage(self):
data, label = get_data_label(load_iris())
data_reduced = reduce_memory(data, verbose=False)
self.assertEqual(data_reduced.shape, (150, 4))
self.assertFalse(data_reduced.isna().any().any())

def test_cap_floor(self):
data, label = get_data_label(load_iris())

# Fit transformer and transform to numeric contexts
data_transformer = DataTransformer()
contexts = data_transformer.fit(data)
contexts = data_transformer.transform(data)
contexts = data_transformer.fit_transform(data)
self.assertFalse(np.isnan(contexts).any())
self.assertEqual(contexts.shape, (150, 4))
