21483: Adjusts settings for matrix normalizations, MAJOR (#281)

jackx111 · web-flow · commit 14c9f42f42f8 · 2024-09-05T17:21:02.000-04:00
diff --git a/howso/client/typing.py b/howso/client/typing.py
@@ -52,7 +52,7 @@ class Evaluation(TypedDict):
 NewCaseThreshold: TypeAlias = Literal["max", "min", "most_similar"]
 """Valid values for ``new_case_threshold`` parameters."""
 
-NormalizeMethod: TypeAlias = Literal["feature_count", "fractional", "relative"]
+NormalizeMethod: TypeAlias = Literal["fractional_absolute", "fractional", "relative"]
 """Valid values for ``normalize_method`` parameters."""
 
 PathLike: TypeAlias = Union[str, os.PathLike]
diff --git a/howso/engine/tests/test_engine.py b/howso/engine/tests/test_engine.py
@@ -295,7 +295,6 @@ def test_get_contribution_matrix(self, trainee):
         """Test `get_contribution_matrix`."""
         matrix = trainee.get_contribution_matrix(
             normalize=True,
-            absolute=True,
             fill_diagonal=True
         )
         assert len(matrix) == 5
@@ -311,7 +310,6 @@ def test_get_contribution_matrix(self, trainee):
         saved_matrix = matrix_processing(
             saved_matrix['contribution'],
             normalize=True,
-            absolute=True,
             fill_diagonal=True
         )
 
@@ -320,7 +318,6 @@ def test_get_contribution_matrix(self, trainee):
     def test_get_mda_matrix(self, trainee):
         """Test `get_mda_matrix`."""
         matrix = trainee.get_mda_matrix(
-            normalize=True,
             absolute=True,
             fill_diagonal=True
         )
@@ -336,7 +333,6 @@ def test_get_mda_matrix(self, trainee):
 
         saved_matrix = matrix_processing(
             saved_matrix['mda'],
-            normalize=True,
             absolute=True,
             fill_diagonal=True
         )
diff --git a/howso/engine/trainee.py b/howso/engine/trainee.py
@@ -3585,11 +3585,7 @@ def get_contribution_matrix(
         directional: bool = False,
         robust: bool = True,
         targeted: bool = False,
-        normalize: bool = False,
-        normalize_method: NormalizeMethod | Callable | Iterable[
-            NormalizeMethod | Callable
-        ] = "relative",
-        absolute: bool = False,
+        normalize: bool = True,
         fill_diagonal: bool = True,
         fill_diagonal_value: float | int = 1,
     ) -> DataFrame:
@@ -3609,25 +3605,8 @@ def get_contribution_matrix(
         targeted : bool, default False
             Whether to do a targeted re-analyze before each feature's contribution is calculated.
-        normalize : bool, default False
+        normalize : bool, default True
-            Whether to normalize the matrix row wise. Normalization method is set by the ``normalize_method``
-            parameter.
-        normalize_method : str or callable or iterable of str or callable, default "relative"
-            The normalization method. The method may either one of the strings below that correspond to a
-            default method or a custom callable.
-
-            These methods may be passed in as an individual string or in a iterable where they will
-            be processed sequentially.
-
-            Default Methods:
-            - 'relative': normalizes each row by dividing each value by the maximum absolute value in the row.
-            - 'fractional': normalizes each row by dividing each value by the sum of absolute values in the row.
-            - 'feature_count': normalizes each row by dividing by the feature count.
-
-            Custom Callable:
-            - If a custom Callable is provided, then it will be passed onto the DataFrame apply function:
-                ``matrix.apply(Callable)``
-        absolute : bool, default False
-            Whether to transform the matrix values into the absolute values.
+            Whether to normalize the matrix row-wise. If True, normalizes each row by dividing each value
+            by the sum of the values in the row, so that the values in each row sum to 1.
-        fill_diagonal : bool, default False
+        fill_diagonal : bool, default True
             Whether to fill in the diagonals of the matrix. If set to true,
             the diagonal values will be filled in based on the ``fill_diagonal_value`` value.
@@ -3675,8 +3654,7 @@ def get_contribution_matrix(
         matrix = matrix_processing(
             matrix,
             normalize=normalize,
-            normalize_method=normalize_method,
-            absolute=absolute,
+            normalize_method="fractional",
             fill_diagonal=fill_diagonal,
             fill_diagonal_value=fill_diagonal_value
         )
@@ -3721,8 +3699,9 @@ def get_mda_matrix(
 
             Default Methods:
             - 'relative': normalizes each row by dividing each value by the maximum absolute value in the row.
-            - 'fractional': normalizes each row by dividing each value by the sum of absolute values in the row.
-            - 'feature_count': normalizes each row by dividing by the feature count.
+            - 'fractional': normalizes each row by dividing each value by the sum of the values in the row, so the
+              normalized values sum to 1.
+            - 'fractional_absolute': normalizes each row by dividing each value by the sum of absolute values in the row.
 
             Custom Callable:
             - If a custom Callable is provided, then it will be passed onto the DataFrame apply function:
diff --git a/howso/utilities/tests/test_utilities.py b/howso/utilities/tests/test_utilities.py
@@ -352,6 +352,7 @@ def test_matrix_processing(
         processed_matrix = matrix_processing(
             matrix=df,
             normalize=normalize,
+            normalize_method='relative',
             ignore_diagonals_normalize=ignore_diagonals_normalize,
             absolute=absolute,
             fill_diagonal=fill_diagonal,
@@ -364,6 +365,7 @@ def test_matrix_processing(
         processed_matrix = matrix_processing(
             matrix=df,
             normalize=normalize,
+            normalize_method='relative',
             ignore_diagonals_normalize=ignore_diagonals_normalize,
             absolute=absolute,
             fill_diagonal=fill_diagonal,
@@ -384,6 +386,7 @@ def test_matrix_processing(
         processed_matrix = matrix_processing(
             matrix=df,
             normalize=normalize,
+            normalize_method='relative',
             ignore_diagonals_normalize=ignore_diagonals_normalize,
             absolute=absolute,
             fill_diagonal=fill_diagonal,
@@ -403,6 +406,7 @@ def test_matrix_processing(
         processed_matrix = matrix_processing(
             matrix=df,
             normalize=normalize,
+            normalize_method='relative',
             ignore_diagonals_normalize=ignore_diagonals_normalize,
             absolute=absolute,
             fill_diagonal=fill_diagonal,
@@ -421,6 +425,7 @@ def test_matrix_processing(
         processed_matrix = matrix_processing(
             matrix=df,
             normalize=normalize,
+            normalize_method='relative',
             ignore_diagonals_normalize=ignore_diagonals_normalize,
             absolute=absolute,
             fill_diagonal=fill_diagonal,
@@ -439,6 +444,7 @@ def test_matrix_processing(
         processed_matrix = matrix_processing(
             matrix=df,
             normalize=normalize,
+            normalize_method='relative',
             ignore_diagonals_normalize=ignore_diagonals_normalize,
             absolute=absolute,
             fill_diagonal=fill_diagonal,
@@ -456,9 +462,8 @@ def test_matrix_processing(
 @pytest.mark.parametrize(
     'normalize_method',
     (
-        ('sum'),
-        ('absolute_sum'),
-        ('feature_count'),
+        ('fractional'),
+        ('fractional_absolute'),
     )
 )
 def test_matrix_processing_normalization_single_method(
@@ -471,7 +476,7 @@ def test_matrix_processing_normalization_single_method(
         'c': [0.5, -1.5, 3.0],
     }, index=['a', 'b', 'c']).T
 
-    if normalize_method == 'sum':
+    if normalize_method == 'fractional':
         processed_matrix = round(
             matrix_processing(
                 matrix=df,
@@ -487,7 +492,7 @@ def test_matrix_processing_normalization_single_method(
 
         assert_frame_equal(processed_matrix, correct_matrix)
 
-    if normalize_method == 'absolute_sum':
+    if normalize_method == 'fractional_absolute':
         processed_matrix = round(
             matrix_processing(
                 matrix=df,
@@ -503,59 +508,6 @@ def test_matrix_processing_normalization_single_method(
 
         assert_frame_equal(processed_matrix, correct_matrix)
 
-    if normalize_method == 'feature_count':
-        processed_matrix = round(
-            matrix_processing(
-                matrix=df,
-                normalize=True,
-                ignore_diagonals_normalize=False,
-                normalize_method=normalize_method
-            ), 2)
-        correct_matrix = pd.DataFrame({
-            'a': [0.33, -1.00, 2.00],
-            'b': [0.17, -0.17, 0.33],
-            'c': [0.17, -0.50, 1.00]
-        }, index=['a', 'b', 'c']).T
-
-        assert_frame_equal(processed_matrix, correct_matrix)
-
-
-def test_matrix_processing_normalization_list(
-    normalize_method=['feature_count', 'sum'],
-):
-    """Tests that `matrix_processing` normalization parameters with lists works properly."""
-    df = pd.DataFrame({
-        'a': [1.0, -3.0, 6.0],
-        'b': [0.5, -0.5, 1.0],
-        'c': [0.5, -1.5, 3.0],
-    }, index=['a', 'b', 'c']).T
-
-    processed_matrix = round(
-        matrix_processing(
-            matrix=df,
-            normalize=True,
-            ignore_diagonals_normalize=False,
-            normalize_method=normalize_method
-        ), 2)
-
-    # When a list is the parameter, the methods inside are calculated sequentialy. It should be the
-    # same as doing the two methods sequentially independently as well.
-    correct_matrix = matrix_processing(
-        matrix=df,
-        normalize=True,
-        ignore_diagonals_normalize=False,
-        normalize_method=normalize_method[0]
-    )
-    correct_matrix = round(
-        matrix_processing(
-            matrix=correct_matrix,
-            normalize=True,
-            ignore_diagonals_normalize=False,
-            normalize_method=normalize_method[1]
-        ), 2)
-
-    assert_frame_equal(processed_matrix, correct_matrix)
-
 
 def test_matrix_processing_normalization_callable():
     """Tests that `matrix_processing` normalization parameters with Callable works properly."""
@@ -565,7 +517,7 @@ def test_matrix_processing_normalization_callable():
         'c': [0.5, -1.5, 3.0],
     }, index=['a', 'b', 'c']).T
 
-    # This is the exact same function as the 'absolute_sum' normalization method,
+    # This is the exact same function as the 'fractional_absolute' normalization method,
     # thus it should return the same results.
     def divide_by_sum_abs(row):
         sum_abs = row.abs().sum()
@@ -584,7 +536,7 @@ def divide_by_sum_abs(row):
             matrix=df,
             normalize=True,
             ignore_diagonals_normalize=False,
-            normalize_method='absolute_sum'
+            normalize_method='fractional_absolute'
         ), 2)
 
     assert_frame_equal(processed_matrix, correct_matrix)
@@ -605,7 +557,7 @@ def test_matrix_processing_normalization_zero_division():
                 matrix=df,
                 normalize=True,
                 ignore_diagonals_normalize=False,
-                normalize_method='sum'
+                normalize_method='fractional'
             ), 2)
 
     # First row is returned unnormalized
diff --git a/howso/utilities/utilities.py b/howso/utilities/utilities.py
@@ -1176,7 +1176,7 @@ def deep_update(base, updates):
 def matrix_processing( # noqa
     matrix: pd.DataFrame,
     normalize: bool = False,
-    normalize_method: Iterable[NormalizeMethod | Callable] | NormalizeMethod | Callable = "relative",
+    normalize_method: Iterable[NormalizeMethod | Callable] | NormalizeMethod | Callable = "fractional",
     ignore_diagonals_normalize: bool = True,
     absolute: bool = False,
     fill_diagonal: bool = False,
@@ -1203,8 +1203,9 @@ def matrix_processing( # noqa
 
         Default Methods:
         - 'relative': normalizes each row by dividing each value by the maximum absolute value in the row.
-        - 'fractional': normalizes each row by dividing each value by the sum of absolute values in the row.
-        - 'feature_count': normalizes each row by dividing by the feature count.
+        - 'fractional': normalizes each row by dividing each value by the sum of the values in the row, so the
+          normalized values sum to 1.
+        - 'fractional_absolute': normalizes each row by dividing each value by the sum of absolute values in the row.
 
         Custom Callable:
         - If a custom Callable is provided, then it will be passed onto the DataFrame apply function:
@@ -1286,18 +1287,16 @@ def divide_by_max_abs(row):
         for method in normalize_method:
             if method == "relative":
                 matrix = matrix.apply(divide_by_max_abs, axis=1)  # type: ignore
-            elif method == "sum":
+            elif method == "fractional":
                 matrix = matrix.apply(sum_if_not_zero, axis=1)  # type: ignore
-            elif method == "absolute_sum":
+            elif method == "fractional_absolute":
                 matrix = matrix.apply(abs_sum_if_not_zero, axis=1)  # type: ignore
-            elif method == "feature_count":
-                matrix = matrix.apply(lambda x: x / len(matrix.columns), axis=1)  # type: ignore
             elif callable(method):
                 matrix = matrix.apply(method, axis=1)   # type: ignore
             else:
                 raise ValueError(
                     f"Invalid normalization method: {normalize_method}. "
-                    "Must be 'relative', 'sum', 'absolute_sum', 'feature_count' or a Callable."
+                    "Must be 'relative', 'fractional', 'fractional_absolute', or a Callable."
                 )
         if ignore_diagonals_normalize:
             for i, value in enumerate(diagonal_values):