Skip to content

Commit

Permalink
GH-16328: remove enum predictors based on best performing coefficient…
Browse files Browse the repository at this point in the history
… level [nocheck] (#16334)

* GH-16328: Remove enum predictor only when its best performing level has big p-value
* GH-16328: fixed criteria to discard categorical predictors and fixed test.
* Incorporate Tomas F comment to change function name to clarify its purpose.
* change dataset from aws path to local path.
* Incorporate maurever comments.
  • Loading branch information
wendycwong authored Aug 6, 2024
1 parent 04188e5 commit 9d90b05
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 27 deletions.
32 changes: 18 additions & 14 deletions h2o-algos/src/main/java/hex/modelselection/ModelSelectionUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -1063,10 +1063,10 @@ public static int findMinZValue(GLMModel model, List<String> numPredNames, List<
}
// grab min z-values for numerical and categorical columns
PredNameMinZVal numericalPred = findNumMinZVal(numPredNames, zValList, coeffNames);
PredNameMinZVal categoricalPred = findCatMinZVal(model, zValList);
PredNameMinZVal categoricalPred = findCatMinOfMaxZScore(model, zValList); // null if all predictors are inactive

// choose the min z-value from numerical and categorical predictors and return its index in predNames
if (categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value
if (categoricalPred != null && categoricalPred._minZVal >= 0 && categoricalPred._minZVal < numericalPred._minZVal) { // categorical pred has minimum z-value
return predNames.indexOf(categoricalPred._predName);
} else { // numerical pred has minimum z-value
return predNames.indexOf(numericalPred._predName);
Expand Down Expand Up @@ -1095,26 +1095,28 @@ public static PredNameMinZVal findNumMinZVal(List<String> numPredNames, List<Dou
}

/***
* This method extracts the categorical coefficient z-value by using the following method:
* This method extracts the categorical coefficient z-score (abs(z-value)) by using the following method:
* 1. From GLMModel model, it extracts the column names of the dinfo._adaptedFrame that is used to build the glm
* model and generate the glm coefficients. The column names will be in exactly the same order as the coefficient
* names with the exception that each enum levels will not be given a name in the column names.
* 2. To figure out which coefficient name corresponds to which column name, we use the catOffsets which will tell
* us how many enum levels are used in the glm model coefficients. If the catOffset for the first coefficient
* says 3, that means that column will have three enum levels represented in the glm model coefficients.
*
* For categorical predictors with multiple enum levels, we will look at the max z-score. This will show the best
* performing enum levels. We will remove the enum predictor if its best z-score is not good enough when compared
* to the z-score of other predictors.
*/
public static PredNameMinZVal findCatMinZVal(GLMModel model, List<Double> zValList) {
public static PredNameMinZVal findCatMinOfMaxZScore(GLMModel model, List<Double> zValList) {
String[] columnNames = model.names(); // column names of dinfo._adaptedFrame
int[] catOffsets = model._output.getDinfo()._catOffsets;
double minCatVal = -1;
String catPredMinZ = null;
List<Double> bestZValues = new ArrayList<>();
List<String> catPredNames = new ArrayList<>();
if (catOffsets != null) {
minCatVal = Double.MAX_VALUE;
int numCatCol = catOffsets.length - 1;

int numNaN = (int) zValList.stream().filter(x -> Double.isNaN(x)).count();
if (numNaN == zValList.size()) { // if all levels are NaN, this predictor is redundant
new PredNameMinZVal(catPredMinZ, Double.POSITIVE_INFINITY);
return null;
} else {
for (int catInd = 0; catInd < numCatCol; catInd++) { // go through each categorical column
List<Double> catZValues = new ArrayList<>();
Expand All @@ -1130,15 +1132,17 @@ public static PredNameMinZVal findCatMinZVal(GLMModel model, List<Double> zValLi
}
if (catZValues.size() > 0) {
double oneCatMinZ = catZValues.stream().max(Double::compare).get(); // choose the best z-value here
if (oneCatMinZ < minCatVal) {
minCatVal = oneCatMinZ;
catPredMinZ = columnNames[catInd];
}
bestZValues.add(oneCatMinZ);
catPredNames.add(columnNames[catInd]);
}
}
}
}
return new PredNameMinZVal(catPredMinZ, minCatVal);
if (bestZValues.size() < 1)
return null;
double maxCatLevel = bestZValues.stream().min(Double::compare).get();
String catPredBestZ = catPredNames.get(bestZValues.indexOf(maxCatLevel));
return new PredNameMinZVal(catPredBestZ, maxCatLevel);
}

static class PredNameMinZVal {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from h2o.estimators.glm import H2OGeneralizedLinearEstimator

# Megan Kurka found that categorical columns do not work with modelselection backward mode. I fixed the bug and
# extended her test to check that each time a predictor is dropped, it must has the smallest z-value magnitude.
# extended her test to check that each time a predictor is dropped, the best performing level is compared to other
# predictors. If the best level is not good enough, the whole enum predictor is dropped.
def test_megan_failure():
df = h2o.import_file("https://s3.amazonaws.com/h2o-public-test-data/smalldata/demos/bank-additional-full.csv")
y = "y"
Expand All @@ -28,7 +29,6 @@ def test_megan_failure():
best_predictor_subset = backward_model.get_best_model_predictors()

counter = 0
back_coef = backward_model.coef()
for ind in list(range(num_models-1, 0, -1)):
pred_large = coefficient_orders[ind]
pred_small = coefficient_orders[ind-1]
Expand All @@ -40,11 +40,11 @@ def test_megan_failure():

# assert z-values removed has smallest magnitude
x = best_predictor_subset[ind]
assert_smallest_z_removed(back_coef[ind], z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df)
assert_correct_z_removed(z_values_list, z_values_removed, pred_large, predictor_removed, x, y, df)

counter += 1

def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df):
def assert_correct_z_removed(z_values_backward, z_values_removed, coeff_backward, predictor_removed, x, y, df):
glm_model = H2OGeneralizedLinearEstimator(seed=1234, remove_collinear_columns=True, lambda_=0.0, compute_p_values=True)
glm_model.train(x=x, y=y, training_frame=df)
cat_predictors = extractCatCols(df, x)
Expand All @@ -53,11 +53,21 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co
model_z_values = glm_model._model_json["output"]["coefficients_table"]["z_value"]
model_coeffs = glm_model._model_json["output"]["coefficients_table"]["names"]

assert_equal_z_values(back_coef, glm_model.coef(), z_values_backward, coeff_backward, model_z_values, model_coeffs)
min_z_value = min(z_values_removed)
assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, model_coeffs)

num_predictor_removed = False
for one_value in predictor_removed:
if one_value in num_predictors:
num_predictor_removed = True
break
if num_predictor_removed:
min_z_value = min(z_values_removed)
else:
min_z_value = max(z_values_removed)

# check that predictor with smallest z-value magnitude is removed
assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values)
assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values)
assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values)
assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values)

for name in cat_predictors:
for coeff_name in predictor_removed:
Expand All @@ -66,7 +76,7 @@ def assert_smallest_z_removed(back_coef, z_values_backward, z_values_removed, co
return
x.remove(predictor_removed[0]) # numerical predictor is removed

def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values):
def assert_correct_z_value_categorical(cat_predictors, min_z_value, model_coeffs, model_z_values):
for name in cat_predictors:
model_z = []
for coeff_name in model_coeffs:
Expand All @@ -80,7 +90,7 @@ def assert_smallest_z_value_categorical(cat_predictors, min_z_value, model_coeff
"than mininum_z_values {2}".format(name, model_z, min_z_value)


def assert_smallest_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values):
def assert_correct_z_value_numerical(num_predictors, min_z_value, model_coeffs, model_z_values):
for name in num_predictors:
pred_ind = model_coeffs.index(name)
val = model_z_values[pred_ind]
Expand All @@ -96,7 +106,7 @@ def extractCatCols(df, x):
cat_pred.append(name)
return cat_pred

def assert_equal_z_values(back_coef, curr_coef, z_values_backward, coeff_backward, model_z_values, glm_coeff):
def assert_equal_z_values(z_values_backward, coeff_backward, model_z_values, glm_coeff):
for coeff in glm_coeff:
backward_z_value = z_values_backward[coeff_backward.index(coeff)]
model_z_value = model_z_values[glm_coeff.index(coeff)]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@ source("../../../scripts/h2o-r-test-setup.R")

# add test from Erin Ledell
glmBetaConstraints <- function() {
df <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_train_10k.csv")
test <- h2o.importFile("https://s3.amazonaws.com/erin-data/higgs/higgs_test_5k.csv")
df <- h2o.importFile(locate("smalldata/higgs/higgs_train_10k.csv"))
test <- h2o.importFile(locate("smalldata/higgs/higgs_test_5k.csv"))

y <- "response"
x <- setdiff(names(df), y)
Expand Down

0 comments on commit 9d90b05

Please sign in to comment.