diff --git a/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java b/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java index c70b76f91ee7..ec60fde3ac8c 100644 --- a/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java +++ b/h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java @@ -1063,10 +1063,10 @@ public static int findMinZValue(GLMModel model, List numPredNames, List< } // grab min z-values for numerical and categorical columns PredNameMinZVal numericalPred = findNumMinZVal(numPredNames, zValList, coeffNames); - PredNameMinZVal categoricalPred = findCatMinZVal(model, zValList); + PredNameMinZVal categoricalPred = findCatMinOfMaxZScore(model, zValList); // null if all predictors are inactive // choose the min z-value from numerical and categorical predictors and return its index in predNames - if (categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value + if (categoricalPred != null && categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value return predNames.indexOf(categoricalPred._predName); } else { // numerical pred has minimum z-value return predNames.indexOf(numericalPred._predName); @@ -1095,26 +1095,28 @@ public static PredNameMinZVal findNumMinZVal(List numPredNames, List zValList) { + public static PredNameMinZVal findCatMinOfMaxZScore(GLMModel model, List zValList) { String[] columnNames = model.names(); // column names of dinfo._adaptedFrame int[] catOffsets = model._output.getDinfo()._catOffsets; - double minCatVal = -1; - String catPredMinZ = null; + List bestZValues = new ArrayList<>(); + List catPredNames = new ArrayList<>(); if (catOffsets != null) { - minCatVal = Double.MAX_VALUE; int numCatCol = catOffsets.length - 1; - int numNaN = (int) zValList.stream().filter(x -> Double.isNaN(x)).count(); if (numNaN == zValList.size()) { // if all levels are NaN, this predictor is redundant - new PredNameMinZVal(catPredMinZ, Double.POSITIVE_INFINITY); + return null; } else { for (int catInd = 0; catInd < numCatCol; catInd++) { // go through each categorical column List catZValues = new ArrayList<>(); @@ -1130,15 +1132,17 @@ public static PredNameMinZVal findCatMinZVal(GLMModel model, List zValLi } if (catZValues.size() > 0) { double oneCatMinZ = catZValues.stream().max(Double::compare).get(); // choose the best z-value here - if (oneCatMinZ < minCatVal) { - minCatVal = oneCatMinZ; - catPredMinZ = columnNames[catInd]; - } + bestZValues.add(oneCatMinZ); + catPredNames.add(columnNames[catInd]); } } } } - return new PredNameMinZVal(catPredMinZ, minCatVal); + if (bestZValues.size() < 1) + return null; + double maxCatLevel = bestZValues.stream().min(Double::compare).get(); + String catPredBestZ = catPredNames.get(bestZValues.indexOf(maxCatLevel)); + return new PredNameMinZVal(catPredBestZ, maxCatLevel); } static class PredNameMinZVal { diff --git a/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py b/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py index 7540abf1dc5c..186062fba82f 100644 --- a/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py +++ b/h2o-py/tests/testdir_algos/modelselection/pyunit_PUBDEV_8675_modelselection_fail.py @@ -8,7 +8,8 @@ from h2o.estimators.glm import H2OGeneralizedLinearEstimator # Megan Kurka found that categorical columns do not work with modelselection backward mode. I fixed the bug and -# extended her test to check that each time a predictor is dropped, it must has the smallest z-value magnitude. +# extended her test to check that each time a predictor is dropped, the best performing level is compared to other +# predictors. If the best level is not good enough, the whole enum predictor is dropped. def test_megan_failure(): df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/demos/bank-additional-full.csv") y = "y" @@ -28,7 +29,6 @@ def test_megan_failure(): best_predictor_subset = backward_model.get_best_model_predictors() counter = 0 - back_coef = backward_model.coef() for ind in list(range(num_models-1, 0, -1)): pred_large = coefficient_orders[ind] pred_small = coefficient_orders[ind-1] @@ -40,11 +40,11 @@ def test_megan_failure(): # assert z-values removed has smallest magnitude x = best_predictor_subset[ind] - assert_smallest_z_removed(back_coef[ind], z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df) + assert_correct_z_removed(z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df) counter += 1 -def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df): +def assert_correct_z_removed(z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df): glm_model = H2OGeneralizedLinearEstimator(seed=1234, remove_collinear_columns=True, lambda_=0.0, compute_p_values=True) glm_model.train(x=x, y=y, training_frame=df) cat_predictors = extractCatCols(df, x) @@ -53,11 +53,21 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co model_z_values = glm_model._model_json["output"]["coefficients_table"]["z_value"] model_coeffs = glm_model._model_json["output"]["coefficients_table"]["names"] - assert_equal_z_values(back_coef, glm_model.coef(), z_values_backward, coeff_backward, model_z_values, model_coeffs) - min_z_value = min(z_values_removed) + assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, model_coeffs) + + num_predictor_removed = False + for one_value in predictor_removed: + if one_value in num_predictors: + num_predictor_removed = True + break + if num_predictor_removed: + min_z_value = min(z_values_removed) + else: + min_z_value = max(z_values_removed) + # check that predictor with smallest z-value magnitude is removed - assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values) - assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values) + assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values) + assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values) for name in cat_predictors: for coeff_name in predictor_removed: @@ -66,7 +76,7 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co return x.remove(predictor_removed[0]) # numerical predictor is removed -def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values): +def assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values): for name in cat_predictors: model_z = [] for coeff_name in model_coeffs: @@ -80,7 +90,7 @@ def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeff "than mininum_z_values {2}".format(name, model_z, min_z_value) -def assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values): +def assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values): for name in num_predictors: pred_ind = model_coeffs.index(name) val = model_z_values[pred_ind] @@ -96,7 +106,7 @@ def extractCatCols(df, x): cat_pred.append(name) return cat_pred -def assert_equal_z_values(back_coef, curr_coef, z_values_backward, coeff_backward, model_z_values, glm_coeff): +def assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, glm_coeff): for coeff in glm_coeff: backward_z_value = z_values_backward[coeff_backward.index(coeff)] model_z_value = model_z_values[glm_coeff.index(coeff)] diff --git a/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R b/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R index 914c524d5c61..a92b1acfe66f 100644 --- a/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R +++ b/h2o-r/tests/testdir_algos/glm/runit_pubdev_4641_glm_beta_constraints_bad_results.R @@ -3,8 +3,8 @@ source("../../../scripts/h2o-r-test-setup.R") # add test from Erin Ledell glmBetaConstraints <- function() { - df <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv") - test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv") + df <- h2o.importFile(locate("smalldata/higgs/higgs_train_10k.csv")) + test <- h2o.importFile(locate("smalldata/higgs/higgs_test_5k.csv")) y <- "response" x <- setdiff(names(df), y)