GH-16328: remove enum predictors based on best performing coefficient…

… level [nocheck] (#16334) * GH-16328: Remove enum predictor only when its best performing level has big p-value * GH-16328: fixed criteria to discard categorical predictors and fixed test. * Incorporate Tomas F comment to change function name to clarify its purpose. * change dataset from aws path to local path. * Incorporate maurever comments.
h2oai · Aug 6, 2024 · 9d90b05 · 9d90b05
1 parent 04188e5
commit 9d90b05
Show file tree

Hide file tree

Showing 3 changed files with 41 additions and 27 deletions.
diff --git a/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java b/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java
@@ -1063,10 +1063,10 @@ public static int findMinZValue(GLMModel model, List<String> numPredNames, List<
         }
         // grab min z-values for numerical and categorical columns
         PredNameMinZVal numericalPred = findNumMinZVal(numPredNames, zValList, coeffNames);
-        PredNameMinZVal categoricalPred = findCatMinZVal(model, zValList);
+       PredNameMinZVal categoricalPred = findCatMinOfMaxZScore(model, zValList); // null if all predictors are inactive
 
         // choose the min z-value from numerical and categorical predictors and return its index in predNames
-        if (categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value
+        if (categoricalPred != null && categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value
             return predNames.indexOf(categoricalPred._predName);
         } else {    // numerical pred has minimum z-value
             return predNames.indexOf(numericalPred._predName);
@@ -1095,26 +1095,28 @@ public static PredNameMinZVal findNumMinZVal(List<String> numPredNames, List<Dou
     }
 
     /***
-     * This method extracts the categorical coefficient z-value by using the following method:
+     * This method extracts the categorical coefficient z-score (abs(z-value)) by using the following method:
      * 1. From GLMModel model, it extracts the column names of the dinfo._adaptedFrame that is used to build the glm 
      * model and generate the glm coefficients.  The column names will be in exactly the same order as the coefficient
      * names with the exception that each enum levels will not be given a name in the column names.
      * 2. To figure out which coefficient name corresponds to which column name, we use the catOffsets which will tell
      * us how many enum levels are used in the glm model coefficients.  If the catOffset for the first coefficient
      * says 3, that means that column will have three enum levels represented in the glm model coefficients.
+     * 
+     * For categorical predictors with multiple enum levels, we will look at the max z-score.  This will show the best
+     * performing enum levels.  We will remove the enum predictor if its best z-score is not good enough when compared
+     * to the z-score of other predictors.
      */
-    public static PredNameMinZVal findCatMinZVal(GLMModel model, List<Double> zValList) {
+    public static PredNameMinZVal findCatMinOfMaxZScore(GLMModel model, List<Double> zValList) {
         String[] columnNames = model.names(); // column names of dinfo._adaptedFrame
         int[] catOffsets = model._output.getDinfo()._catOffsets;
-        double minCatVal = -1;
-        String catPredMinZ = null;
+        List<Double> bestZValues = new ArrayList<>();
+        List<String> catPredNames = new ArrayList<>();
         if (catOffsets != null) {
-            minCatVal = Double.MAX_VALUE;
             int numCatCol = catOffsets.length - 1;
-
             int numNaN = (int) zValList.stream().filter(x -> Double.isNaN(x)).count();
             if (numNaN == zValList.size()) {    // if all levels are NaN, this predictor is redundant
-                new PredNameMinZVal(catPredMinZ, Double.POSITIVE_INFINITY);
+                return null;
             } else {
                 for (int catInd = 0; catInd < numCatCol; catInd++) {    // go through each categorical column
                     List<Double> catZValues = new ArrayList<>();
@@ -1130,15 +1132,17 @@ public static PredNameMinZVal findCatMinZVal(GLMModel model, List<Double> zValLi
                     }
                     if (catZValues.size() > 0) {
                         double oneCatMinZ = catZValues.stream().max(Double::compare).get(); // choose the best z-value here
-                        if (oneCatMinZ < minCatVal) {
-                            minCatVal = oneCatMinZ;
-                            catPredMinZ = columnNames[catInd];
-                        }
+                        bestZValues.add(oneCatMinZ);
+                        catPredNames.add(columnNames[catInd]);
                     }
                 }
             }
         }
-        return new PredNameMinZVal(catPredMinZ, minCatVal);
+        if (bestZValues.size() < 1)
+            return null;
+        double maxCatLevel = bestZValues.stream().min(Double::compare).get();
+        String catPredBestZ = catPredNames.get(bestZValues.indexOf(maxCatLevel));
+        return new PredNameMinZVal(catPredBestZ, maxCatLevel);
     }
 
     static class PredNameMinZVal {

diff --git a/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py b/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py
@@ -8,7 +8,8 @@
 from h2o.estimators.glm import H2OGeneralizedLinearEstimator
 
 # Megan Kurka found that categorical columns do not work with modelselection backward mode.  I fixed the bug and 
-# extended her test to check that each time a predictor is dropped, it must has the smallest z-value magnitude.
+# extended her test to check that each time a predictor is dropped, the best performing level is compared to other 
+# predictors.  If the best level is not good enough, the whole enum predictor is dropped.
 def test_megan_failure():
     df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/demos/bank-additional-full.csv")
     y = "y"
@@ -28,7 +29,6 @@ def test_megan_failure():
     best_predictor_subset = backward_model.get_best_model_predictors()
 
     counter = 0
-    back_coef = backward_model.coef()
     for ind in list(range(num_models-1, 0, -1)):
         pred_large = coefficient_orders[ind]
         pred_small = coefficient_orders[ind-1]
@@ -40,11 +40,11 @@ def test_megan_failure():
 
         # assert z-values removed has smallest magnitude
         x = best_predictor_subset[ind]
-        assert_smallest_z_removed(back_coef[ind], z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df)
+        assert_correct_z_removed(z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df)
 
         counter += 1
 
-def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df):
+def assert_correct_z_removed(z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df):
     glm_model = H2OGeneralizedLinearEstimator(seed=1234, remove_collinear_columns=True, lambda_=0.0, compute_p_values=True)
     glm_model.train(x=x, y=y, training_frame=df)
     cat_predictors = extractCatCols(df, x)
@@ -53,11 +53,21 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co
     model_z_values = glm_model._model_json["output"]["coefficients_table"]["z_value"]
     model_coeffs = glm_model._model_json["output"]["coefficients_table"]["names"]
 
-    assert_equal_z_values(back_coef, glm_model.coef(), z_values_backward, coeff_backward, model_z_values, model_coeffs)
-    min_z_value = min(z_values_removed)
+    assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, model_coeffs)
+
+    num_predictor_removed = False
+    for one_value in predictor_removed:
+        if one_value in num_predictors:
+            num_predictor_removed = True
+            break
+    if num_predictor_removed:
+        min_z_value = min(z_values_removed)
+    else:    
+        min_z_value = max(z_values_removed)
+
     # check that predictor with smallest z-value magnitude is removed
-    assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values)
-    assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values)
+    assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values)
+    assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values)
 
     for name in cat_predictors:
         for coeff_name in predictor_removed:
@@ -66,7 +76,7 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co
                 return
     x.remove(predictor_removed[0])   # numerical predictor is removed
 
-def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values):
+def assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values):
     for name in cat_predictors:
         model_z = []
         for coeff_name in model_coeffs:
@@ -80,7 +90,7 @@ def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeff
                                             "than mininum_z_values {2}".format(name, model_z, min_z_value)
 
 
-def assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values):
+def assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values):
     for name in num_predictors:
         pred_ind = model_coeffs.index(name)
         val = model_z_values[pred_ind]
@@ -96,7 +106,7 @@ def extractCatCols(df, x):
             cat_pred.append(name)
     return cat_pred
 
-def assert_equal_z_values(back_coef, curr_coef, z_values_backward, coeff_backward, model_z_values, glm_coeff):
+def assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, glm_coeff):
     for coeff in glm_coeff:
         backward_z_value = z_values_backward[coeff_backward.index(coeff)]
         model_z_value = model_z_values[glm_coeff.index(coeff)]

diff --git a/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R b/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R
@@ -3,8 +3,8 @@ source("../../../scripts/h2o-r-test-setup.R")
 
 # add test from Erin Ledell
 glmBetaConstraints <- function() {
-  df <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
-  test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
+  df <- h2o.importFile(locate("smalldata/higgs/higgs_train_10k.csv"))
+  test <- h2o.importFile(locate("smalldata/higgs/higgs_test_5k.csv"))
 
   y <- "response"
   x <- setdiff(names(df), y)