Add BaselineRankMultiFeature (#871)
* changing PercentileRankOneFeature descending parameter

* update parameter name

* add deprecated descend parameter for backwards compatibility

* multi feature ranker

* multi feature ranker tests

* debug test

* fix typo

* avoid duplicating logic in tests

* deprecation warning for PercentileRankOneFeature
shaycrk authored Dec 7, 2021
1 parent 8bd22d8 commit b4ff916
Showing 6 changed files with 230 additions and 30 deletions.
8 changes: 4 additions & 4 deletions docs/sources/dirtyduck/eis.md
@@ -327,7 +327,7 @@ We will begin defining some basic models as baselines.
    'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
      feature: ['risks_entity_id_all_risk_high_sum', 'inspections_entity_id_all_total_count', 'results_entity_id_all_result_fail_sum']
-     descend: [True]
+     low_value_high_score: [True]

    'sklearn.dummy.DummyClassifier':
      strategy: ['prior', 'stratified']
@@ -475,9 +475,9 @@ After the experiment finishes, we can create the following table:
1 | SimpleThresholder | {"rules": ["inspections\_entity\_id\_1month\_total\_count > 0"]} | {inspection,inspections,results,risks} | {1,19,37,55,73,91} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.358"," 0.231"," 0.321"," 0.267"," 0.355"," 0.239"}
2 | SimpleThresholder | {"rules": ["results\_entity\_id\_1month\_result\_fail\_sum > 0"]} | {inspection,inspections,results,risks} | {2,20,38,56,74,92} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.316"," 0.316"," 0.323"," 0.344"," 0.330"," 0.312"}
3 | SimpleThresholder | {"rules": ["risks\_entity\_id\_1month\_risk\_high\_sum > 0"]} | {inspection,inspections,results,risks} | {3,21,39,57,75,93} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.364"," 0.248"," 0.355"," 0.286"," 0.371"," 0.257"}
4 | PercentileRankOneFeature | {"descend": true, "feature": "risks\_entity\_id\_all\_risk\_high\_sum"} | {inspection,inspections,results,risks} | {4,22,40,58,76,94} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.121"," 0.193"," 0.124"," 0.230"," 0.112"," 0.161"}
5 | PercentileRankOneFeature | {"descend": true, "feature": "inspections\_entity\_id\_all\_total\_count"} | {inspection,inspections,results,risks} | {5,23,41,59,77,95} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.076"," 0.133"," 0.098"," 0.101"," 0.086"," 0.082"}
6 | PercentileRankOneFeature | {"descend": true, "feature": "results\_entity\_id\_all\_result\_fail\_sum"} | {inspection,inspections,results,risks} | {6,24,42,60,78,96} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.237"," 0.274"," 0.250"," 0.275"," 0.225"," 0.221"}
4 | PercentileRankOneFeature | {"low_value_high_score": true, "feature": "risks\_entity\_id\_all\_risk\_high\_sum"} | {inspection,inspections,results,risks} | {4,22,40,58,76,94} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.121"," 0.193"," 0.124"," 0.230"," 0.112"," 0.161"}
5 | PercentileRankOneFeature | {"low_value_high_score": true, "feature": "inspections\_entity\_id\_all\_total\_count"} | {inspection,inspections,results,risks} | {5,23,41,59,77,95} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.076"," 0.133"," 0.098"," 0.101"," 0.086"," 0.082"}
6 | PercentileRankOneFeature | {"low_value_high_score": true, "feature": "results\_entity\_id\_all\_result\_fail\_sum"} | {inspection,inspections,results,risks} | {6,24,42,60,78,96} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.237"," 0.274"," 0.250"," 0.275"," 0.225"," 0.221"}
7 | DecisionTreeClassifier | {"criterion": "gini", "max\_depth": 1, "max\_features": "sqrt", "min\_samples\_split": 2} | {inspection,inspections,results,risks} | {7,25,43,61,79,97} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.284"," 0.441"," 0.559"," 0.479"," 0.463"," 0.412"}
8 | DecisionTreeClassifier | {"criterion": "gini", "max\_depth": 2, "max\_features": "sqrt", "min\_samples\_split": 2} | {inspection,inspections,results,risks} | {8,26,44,62,80,98} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.401"," 0.388"," 0.533"," 0.594"," 0.519"," 0.649"}
9 | DecisionTreeClassifier | {"criterion": "gini", "max\_depth": 5, "max\_features": "sqrt", "min\_samples\_split": 2} | {inspection,inspections,results,risks} | {9,27,45,63,81,99} | {2014-12-01,2015-06-01,2015-12-01,2016-06-01,2016-12-01,2017-06-01} | {" 0.594"," 0.876"," 0.764"," 0.843"," 0.669"," 0.890"}
13 changes: 12 additions & 1 deletion example/config/experiment.yaml
@@ -474,7 +474,18 @@ grid_config:
    # predictive models to simply ranking entities on a single feature.
    'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
        feature: ['feature_one', 'feature_two']
-       descend: True
+       low_value_high_score: [True]
+    # catwalk's BaselineRankMultiFeature baseline will score based on the ranking
+    # by one or more features (note that the scores don't map to percentiles as
+    # in PercentileRankOneFeature). This provides a slightly more complex baseline
+    # than above, but still realistic for what might be encountered in practice.
+    # The example below will create two ranker "models": one ranking by two features
+    # and the other just by a single feature. Note that the rules are lists of
+    # dictionaries.
+    'triage.component.catwalk.baselines.rankers.BaselineRankMultiFeature':
+        rules:
+            - [{feature: 'feature_1', low_value_high_score: True}, {feature: 'feature_2', low_value_high_score: False}]
+            - [{feature: 'feature_3', low_value_high_score: True}]
    # catwalk's SimpleThresholder baseline will evaluate each entity against
    # a list of rules and classify entities as 1 based on whether they meet
    # any or all of these rules, depending on whether 'or' or 'and' is
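To make the new grid entry concrete, here is a minimal sketch (not part of this commit) of the direct construction that the first rule set above corresponds to; `feature_1` and `feature_2` are the placeholder names from the example config:

```python
from triage.component.catwalk.baselines.rankers import BaselineRankMultiFeature

# First rule set from the grid above: rank primarily by feature_1 (low values
# score high), breaking ties by feature_2 (high values score high).
ranker = BaselineRankMultiFeature(rules=[
    {"feature": "feature_1", "low_value_high_score": True},
    {"feature": "feature_2", "low_value_high_score": False},
])
```

In the grid, each element of `rules` — itself a list of rule dictionaries — produces one such ranker, so the example above defines two baseline models.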
2 changes: 1 addition & 1 deletion example/dirtyduck/experiments/eis_01.yaml
@@ -177,7 +177,7 @@ grid_config:

    'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature':
      feature: ['risks_entity_id_all_risk_high_sum', 'inspections_entity_id_all_total_count', 'results_entity_id_all_result_fail_sum']
-     descend: [True]
+     low_value_high_score: [True]

    'sklearn.dummy.DummyClassifier':
      strategy: ['prior', 'stratified']
115 changes: 99 additions & 16 deletions src/tests/catwalk_tests/test_baselines.py
@@ -6,6 +6,7 @@
from unittest import TestCase

from triage.component.catwalk.baselines.rankers import PercentileRankOneFeature
+from triage.component.catwalk.baselines.rankers import BaselineRankMultiFeature
from triage.component.catwalk.baselines.thresholders import SimpleThresholder
from triage.component.catwalk.baselines.thresholders import get_operator_method
from triage.component.catwalk.baselines.thresholders import OPERATOR_METHODS
@@ -46,6 +47,52 @@ def rules(request):
    request.cls.rules = ["x1 > 0", "x2 <= 1"]


+def scores_align_with_ranks(expected_ranks, returned_scores):
+    '''
+    Helper function to check that scores align with ranks
+    correctly for the ranking baselines (e.g., higher ranks
+    get higher scores and ties have the same score)
+    '''
+    df = pd.DataFrame({
+        'rank': expected_ranks,
+        'score': returned_scores
+    }).sort_values('rank', ascending=True)
+
+    curr_rank = None
+    curr_score = None
+
+    # Loop through the sorted records to check for any inconsistencies
+    for ix, rec in df.iterrows():
+        if curr_rank is None:
+            curr_rank = rec['rank']
+            curr_score = rec['score']
+            continue
+
+        if rec['rank'] < curr_rank:
+            raise RuntimeError('Something has gone wrong with df.sort_values!')
+        elif rec['rank'] == curr_rank and rec['score'] != curr_score:
+            return False
+        elif rec['rank'] > curr_rank and rec['score'] <= curr_score:
+            return False
+
+        curr_rank = rec['rank']
+        curr_score = rec['score']
+
+    # If we got through the loop without any issues, return True
+    return True
+
+
+def test_scores_align_with_ranks():
+    # correct, no ties
+    assert scores_align_with_ranks([1, 2, 3], [0, 0.5, 1.0])
+    # correct, with ties
+    assert scores_align_with_ranks([1, 2, 2, 3], [0, 0.5, 0.5, 1.0])
+    # incorrect, no ties
+    assert not scores_align_with_ranks([1, 2, 3], [1.0, 0.5, 0.8])
+    # incorrect, ties with different scores
+    assert not scores_align_with_ranks([1, 2, 2, 3], [0, 0.5, 0.7, 1.0])
+

@pytest.mark.usefixtures("data")
class TestRankOneFeature(TestCase):
    def test_fit(self):
@@ -62,25 +109,61 @@ def test_ranking_on_unavailable_feature_raises_error(self):
            ranker.fit(x=self.data["X_train"], y=self.data["y_train"])

    def test_predict_proba(self):
-        for descend_value in [True, False]:
-            ranker = PercentileRankOneFeature(feature="x3", descend=descend_value)
+        for direction_value in [True, False]:
+            ranker = PercentileRankOneFeature(feature="x3", low_value_high_score=direction_value)
            ranker.fit(x=self.data["X_train"], y=self.data["y_train"])
            results = ranker.predict_proba(self.data["X_test"])
-            if descend_value:
-                expected_results = np.array(
-                    [
-                        np.zeros(len(self.data["X_test"])),
-                        [0.875, 0.125, 0.375, 0, 0.625, 0.25, 0.5, 0.625],
-                    ]
-                ).transpose()
+            if direction_value:
+                expected_ranks = [6, 1, 3, 0, 5, 2, 4, 5]
            else:
-                expected_results = np.array(
-                    [
-                        np.zeros(len(self.data["X_test"])),
-                        [0, 0.75, 0.5, 0.875, 0.125, 0.625, 0.375, 0.125],
-                    ]
-                ).transpose()
-            np.testing.assert_array_equal(results, expected_results)
+                expected_ranks = [0, 5, 3, 6, 1, 4, 2, 1]
+
+            assert scores_align_with_ranks(expected_ranks, results[:, 1])


@pytest.mark.usefixtures("data")
class TestRankMultiFeature(TestCase):
def test_fit(self):
rules = {'feature': 'x3', 'low_value_high_score': False}
ranker = BaselineRankMultiFeature(rules=rules)
assert ranker.feature_importances_ is None
ranker.fit(x=self.data["X_train"], y=self.data["y_train"])
np.testing.assert_array_equal(
ranker.feature_importances_, np.array([0, 0, 1, 0])
)

def test_ranking_on_unavailable_feature_raises_error(self):
rules = [{'feature': 'x5', 'low_value_high_score': False}]
ranker = BaselineRankMultiFeature(rules=rules)
with self.assertRaises(BaselineFeatureNotInMatrix):
ranker.fit(x=self.data["X_train"], y=self.data["y_train"])

def test_predict_proba_one_feature(self):
for direction_value in [True, False]:
rules = {'feature': 'x3', 'low_value_high_score': direction_value}
ranker = BaselineRankMultiFeature(rules=rules)
ranker.fit(x=self.data["X_train"], y=self.data["y_train"])
results = ranker.predict_proba(self.data["X_test"])
if direction_value:
expected_ranks = [6, 1, 3, 0, 5, 2, 4, 5]
else:
expected_ranks = [0, 5, 3, 6, 1, 4, 2, 1]

assert scores_align_with_ranks(expected_ranks, results[:,1])

def test_predict_proba_multi_feature(self):
rules = [
{'feature': 'x3', 'low_value_high_score': True},
{'feature': 'x2', 'low_value_high_score': False}
]

ranker = BaselineRankMultiFeature(rules=rules)
ranker.fit(x=self.data["X_train"], y=self.data["y_train"])
results = ranker.predict_proba(self.data["X_test"])

expected_ranks = [7, 1, 3, 0, 5, 2, 4, 6]

assert scores_align_with_ranks(expected_ranks, results[:,1])


@pytest.mark.parametrize('operator', OPERATOR_METHODS.keys())
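For a standalone illustration of the scoring behavior these tests check, a sketch with a made-up matrix (not the test fixture used above):

```python
import pandas as pd

from triage.component.catwalk.baselines.rankers import BaselineRankMultiFeature

# Made-up matrix: five rows of a single feature, with ties.
X = pd.DataFrame({"f1": [0, 0, 1, 2, 2]})

ranker = BaselineRankMultiFeature(rules={"feature": "f1", "low_value_high_score": False})
ranker.fit(x=X, y=None)  # y is accepted for API compatibility but unused

# Dense ranks over the sorted rows are [0, 0, 1, 2, 2]; dividing by the
# maximum rank (2) gives positive-class scores [0.0, 0.0, 0.5, 1.0, 1.0].
print(ranker.predict_proba(X)[:, 1])
```

Ties share a score and scores always span 0 to 1, but unlike PercentileRankOneFeature they are not percentiles.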
2 changes: 1 addition & 1 deletion src/tests/conftest.py
@@ -259,7 +259,7 @@ def sample_grid_config():
        },
        'triage.component.catwalk.baselines.rankers.PercentileRankOneFeature': {
            'feature': ['feature_one', 'feature_two'],
-            'descend': [True]
+            'low_value_high_score': [True]
        },
        'triage.component.catwalk.baselines.thresholders.SimpleThresholder': {
            'rules': [['feature_one > 3', 'feature_two <= 5']],
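If the sample grid were extended to cover the new baseline, the entry might look like the following sketch (illustrative only; this commit does not add one):

```python
sample_grid_config = {
    'triage.component.catwalk.baselines.rankers.BaselineRankMultiFeature': {
        'rules': [
            [{'feature': 'feature_one', 'low_value_high_score': True}],
        ],
    },
}
```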
120 changes: 113 additions & 7 deletions src/triage/component/catwalk/baselines/rankers.py
@@ -1,13 +1,39 @@
import verboselogs, logging
logger = verboselogs.VerboseLogger(__name__)
from scipy import stats
import numpy as np
import pandas as pd
from triage.component.catwalk.exceptions import BaselineFeatureNotInMatrix

+REQUIRED_KEYS = frozenset(["feature", "low_value_high_score"])


class PercentileRankOneFeature:
-    def __init__(self, feature, descend=False):
+    def __init__(self, feature, low_value_high_score=None, descend=None):
logger.warning("DEPRECATION WARNING: PercentileRankOneFeature is being replaced by "
"BaselineRankMultiFeature. Note, however, that the scores returned by the new "
"ranker cannot be interpreted as percentiles."
)
if descend is not None:
# If the deprecated `descend` parameter has been specified, raise a
# warning, then use this value for low_value_high_score, which has
# the same behavior
logger.warning("DEPRECATION WARNING: parameter `descend` is deprecated for "
"PercentileRankOneFeature. Use `low_value_high_score` instead."
)
if low_value_high_score is not None:
raise ValueError("Only one of `descend` or `low_value_high_score` can be "
"specified for PercentileRankOneFeature."
)
low_value_high_score = descend

# set default this way so we can check if both have been specified above
if low_value_high_score is None:
low_value_high_score = False

        self.feature = feature  # which feature to rank on
-        self.descend = (
-            descend
+        self.low_value_high_score = (
+            low_value_high_score
        )  # should feature be ranked so lower values -> higher scores
        self.feature_importances_ = None

@@ -48,7 +74,7 @@ def predict_proba(self, x):
        # values of the feature. so if the entities have values [0, 0, 1, 2, 2],
        # the first two entities will have the lowest ranks (and therefore the
        # lowest risk scores) and the last two will have the highest ranks (and
-        # highest risk scores). for the descending method, we need to reverse
+        # highest risk scores). for the "low_value_high_score" method, we need to reverse
        # this, and for both sorting directions, we need to convert the ranks to
        # percentiles.

@@ -60,7 +86,7 @@
method = "min"
subtract = 1

-        # when descending: tied entities should get the *highest* rank, so for
+        # when `low_value_high_score=True`: tied entities should get the *highest* rank, so for
        # [0, 0, 1, 2, 2] the ranks should be [2, 2, 3, 5, 5]. if we reverse
        # these ranks by subtracting all items from the maximum rank (5), we
        # end up with the correct ranks for calculating percentiles:
@@ -70,15 +96,95 @@
        # ([5, 5, 5, 5, 5] - [2, 2, 3, 5, 5]) / 5 = [0.6, 0.6, 0.4, 0, 0]
        # and
        # [1, 1, 1, 1, 1] - ([2, 2, 3, 5, 5] / 5) = [0.6, 0.6, 0.4, 0, 0]
-        if self.descend:
+        if self.low_value_high_score:
            method = "max"
            subtract = 0

        # get the ranks and convert to percentiles
        ranks = stats.rankdata(x, method)
        ranks = [(rank - subtract) / len(x) for rank in ranks]
-        if self.descend:
+        if self.low_value_high_score:
            ranks = [1 - rank for rank in ranks]

        # format it like sklearn output and return
        return np.array([np.zeros(len(x)), ranks]).transpose()


+class BaselineRankMultiFeature:
+    def __init__(self, rules):
+        if not isinstance(rules, list):
+            rules = [rules]
+
+        # validate rules: must have feature and sort order
+        for rule in rules:
+            if not isinstance(rule, dict):
+                raise ValueError('Rules for BaselineRankMultiFeature must be of type dict')
+            if not rule.keys() >= REQUIRED_KEYS:
+                raise ValueError(f'BaselineRankMultiFeature rule "{rule}" missing one or more required keys ({REQUIRED_KEYS})')
+
+        self.rules = rules
+        self.feature_importances_ = None
+
+    @property
+    def all_feature_names(self):
+        return [rule["feature"] for rule in self.rules]
+
+    @property
+    def all_sort_directions(self):
+        # note that ascending=True sort will mean low values get low scores,
+        # so negate the parameter direction to get the right relationship
+        return [not rule['low_value_high_score'] for rule in self.rules]
+
+    def _set_feature_importances_(self, x):
""" Assigns feature importances following the rule: 1 for the features
we are thresholding on, 0 for all other features.
"""
+        feature_importances = [0] * len(x.columns)
+        for feature_name in self.all_feature_names:
+            try:
+                position = x.columns.get_loc(feature_name)
+            except KeyError:
+                raise BaselineFeatureNotInMatrix(
+                    (
+                        "Rules refer to a feature ({feature_name}) not included in "
+                        "the training matrix!".format(feature_name=feature_name)
+                    )
+                )
+            feature_importances[position] = 1
+        self.feature_importances_ = np.array(feature_importances)
+
+    def fit(self, x, y):
+        """ Set feature importances and return self.
+        """
+        self._set_feature_importances_(x)
+        return self
+
+    def predict_proba(self, x):
+        """ Generate the rank scores and return these.
+        """
+        # reduce x to the selected set of features
+        x = x[self.all_feature_names].reset_index(drop=True)
+
+        x = x.sort_values(self.all_feature_names, ascending=self.all_sort_directions)
+
+        # initialize curr_rank to -1 so the first record will have rank 0 (hence "score"
+        # will range from 0 to 1)
+        ranks = []
+        curr_rank = -1
+        prev = []
+
+        # calculate ranks over sorted records, giving ties the same rank
+        for rec in x.values:
+            if not np.array_equal(prev, rec):
+                curr_rank += 1
+            ranks.append(curr_rank)
+            prev = rec
+
+        # normalize to 0 to 1 range
+        x['score'] = [r / max(ranks) for r in ranks]
+
+        # reset back to original sort order, calculate "score" for "0 class"
+        scores_1 = x.sort_index()['score'].values
+        scores_0 = np.array([1 - s for s in scores_1])
+
+        return np.array([scores_0, scores_1]).transpose()
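The percentile arithmetic worked through in the comments above can be checked directly; a short sketch assuming only numpy and scipy:

```python
import numpy as np
from scipy import stats

x = np.array([0, 0, 1, 2, 2])

# default direction: tied entities get the *lowest* rank ("min");
# subtract 1 so percentiles start at 0
ranks = stats.rankdata(x, "min")   # [1, 1, 3, 4, 4]
print((ranks - 1) / len(x))        # [0.  0.  0.4 0.6 0.6]

# low_value_high_score=True: tied entities get the *highest* rank ("max"),
# then reflect so low feature values map to high scores
ranks = stats.rankdata(x, "max")   # [2, 2, 3, 5, 5]
print(1 - ranks / len(x))          # [0.6 0.6 0.4 0.  0. ]
```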
