Commit
subsample on some of the ML methods to make them work
paulbkoch committed Aug 30, 2024
1 parent 47404c3 commit b293d23
Showing 1 changed file with 33 additions and 43 deletions.
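For context: this commit removes the Gaussian-process ("gp") method and replaces the per-method train_test_split subsampling that was scattered through the method branches with a single max_samples cap, applied once right before fitting. Below is a minimal, self-contained sketch of that pattern, assuming scikit-learn and hypothetical stand-in data; the names max_samples, fit_params, and seed mirror the notebook.

    # Sketch of the consolidated subsampling pattern (stand-in data, not the benchmark's).
    import numpy as np
    from sklearn.model_selection import train_test_split

    seed = 42
    X = np.random.rand(250_000, 10)
    y = np.random.rand(250_000)
    fit_params = {"X": X, "y": y}  # the training split now lives in fit_params

    max_samples = 1_000_000_000_000  # default: effectively no cap
    # A method branch lowers the cap when that estimator crashes or runs too long:
    max_samples = 100_000

    if max_samples < len(fit_params["y"]):
        # Subsample only the training data; the held-out test split is untouched.
        fit_params["X"], _, fit_params["y"], _ = train_test_split(
            fit_params["X"], fit_params["y"],
            test_size=len(fit_params["y"]) - max_samples,
            random_state=seed,
        )

    assert len(fit_params["y"]) == max_samples

One consequence of this layout is that the subsampling logic lives in exactly one place: a method branch only needs to set max_samples (as the lm_svm, svm, rf_sk, ert, knn, and aplr branches do in the diff) instead of repeating the split-and-reassign boilerplate.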
76 changes: 33 additions & 43 deletions docs/benchmarks/ebm-benchmark.ipynb
@@ -122,7 +122,6 @@
" \"svm\",\n",
" \"nn\",\n",
" \"knn\",\n",
" \"gp\",\n",
" # \"aplr\",\n",
" ]"
]
@@ -136,6 +135,7 @@
"source": [
"def trial_runner(trial):\n",
" seed=42 + int(trial.replicate_num)\n",
" max_samples = 1000000000000\n",
"\n",
" from interpret.glassbox import ExplainableBoostingClassifier, ExplainableBoostingRegressor\n",
" from xgboost import XGBClassifier, XGBRegressor, XGBRFClassifier, XGBRFRegressor\n",
@@ -146,7 +146,6 @@
" from sklearn.svm import LinearSVC, LinearSVR, SVC, SVR\n",
" from sklearn.neural_network import MLPClassifier, MLPRegressor\n",
" from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor\n",
" from sklearn.gaussian_process import GaussianProcessClassifier, GaussianProcessRegressor\n",
" from sklearn.calibration import CalibratedClassifierCV\n",
" from sklearn.preprocessing import StandardScaler\n",
" from aplr import APLRClassifier, APLRRegressor\n",
@@ -178,14 +177,16 @@
" # stratification = y\n",
" pass # Re-enable stratification if dataset fails from absent class in train/test sets (PMLB)\n",
" \n",
" X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=stratification, random_state=seed)\n",
" fit_params = {}\n",
" fit_params[\"X\"], X_test, fit_params[\"y\"], y_test = train_test_split(X, y, test_size=0.3, stratify=stratification, random_state=seed)\n",
" del X\n",
"\n",
" # Build optional preprocessor for use by methods below\n",
" # missing categoricals already handled above by making new \"nan\" category\n",
" cat_encoder = OneHotEncoder(handle_unknown=\"ignore\", sparse_output=True, dtype=np.int16)\n",
" num_imputer = Pipeline(steps=[('imputer', SimpleImputer(strategy='mean')), ('scaler', StandardScaler())])\n",
" transformers = [(\"cat\", cat_encoder, cat_cols), (\"num\", num_imputer, num_cols)]\n",
" ct = ColumnTransformer(transformers=transformers, sparse_threshold=1.0) # densify or sparsify\n",
" ct = ColumnTransformer(transformers=transformers) #, sparse_threshold=1.0) # densify or sparsify\n",
"\n",
"\n",
" ebm_params = {}\n",
@@ -201,7 +202,6 @@
" svm_params = {}\n",
" nn_params = {}\n",
" knn_params = {}\n",
" gp_params = {}\n",
" aplr_params = {}\n",
"\n",
" ebm_params[\"feature_types\"] = [\"nominal\" if cat else \"continuous\" for cat in cat_bools]\n",
@@ -222,29 +222,33 @@
" lm_svm_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
" nn_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
" knn_params[\"n_jobs\"] = -1\n",
" gp_params[\"random_state\"] = seed # TODO: is this needed for reproducibility?\n",
" aplr_params[\"m\"] = 3000\n",
"\n",
" if 1000 < trial.task.scalar_measure(\"n_cols\"):\n",
" # TODO: EBMs can crash for now with too many interactions, so limit it until we have better fix\n",
" ebm_params[\"interactions\"] = 0\n",
"\n",
" # DEBUG params to make the algorithms super fast\n",
" #if 10000 < len(y_train):\n",
" # debug_stratify = y_train if trial.task.problem in [\"binary\", \"multiclass\"] else None\n",
" # X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=len(y_train) - 5000, stratify=debug_stratify, random_state=seed)\n",
" #if 10000 < len(fit_params[\"y\"]):\n",
" # debug_stratify = fit_params[\"y\"] if trial.task.problem in [\"binary\", \"multiclass\"] else None\n",
" # fit_params[\"X\"], _, fit_params[\"y\"], _ = train_test_split(fit_params[\"X\"], fit_params[\"y\"], test_size=len(fit_params[\"y\"]) - 5000, stratify=debug_stratify, random_state=seed)\n",
" #ebm_params[\"max_rounds\"] = 1\n",
" #ebm_params[\"interactions\"] = 0\n",
" #xgb_params[\"n_estimators\"] = 1\n",
" #lgbm_params[\"n_estimators\"] = 1\n",
" #catboost_params[\"n_estimators\"] = 1\n",
" #rf_xgb_params[\"n_estimators\"] = 1\n",
" #rf_sk_params[\"n_estimators\"] = 1\n",
" #ert_params[\"n_estimators\"] = 1\n",
" #elastic_params[\"max_iter\"] = 1\n",
" #lm_svm_params[\"max_iter\"] = 1\n",
" #nn_params[\"max_iter\"] = 1\n",
" #knn_params[\"n_neighbors\"] = 1\n",
" #knn_params[\"leaf_size\"] = 1\n",
" #aplr_params[\"m\"] = 1\n",
" \n",
" # Specify method\n",
" if trial.task.problem in [\"binary\", \"multiclass\"]:\n",
" fit_params = {\"X\":X_train, \"y\":y_train}\n",
" if trial.method.name == \"ebm\":\n",
" est = ExplainableBoostingClassifier(**ebm_params)\n",
" elif trial.method.name == \"xgb\":\n",
@@ -271,22 +275,14 @@
" est = Pipeline([(\"ct\", ct), (\"est\", LogisticRegression(**lm_params))])\n",
" elif trial.method.name == \"lm_svm\":\n",
" if trial.task.name in {\"CIFAR_10\", \"Devnagari-Script\"}:\n",
" # fit time too long without subsampling\n",
" # TODO: see if we can include more samples\n",
" X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=len(y_train) - 10000, random_state=seed)\n",
" fit_params[\"X\"] = X_train\n",
" fit_params[\"y\"] = y_train\n",
" max_samples = 10000 # crashes or fit time too long without subsampling\n",
" if trial.task.problem == \"multiclass\":\n",
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(OneVsRestClassifier(LinearSVC(**lm_svm_params), n_jobs=-1)))])\n",
" else:\n",
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(LinearSVC(**lm_svm_params), n_jobs=-1))])\n",
" elif trial.method.name == \"svm\":\n",
" if trial.task.name in {\"CIFAR_10\", \"Devnagari-Script\"}:\n",
" # fit time too long without subsampling\n",
" # TODO: see if we can include more samples\n",
" X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=len(y_train) - 10000, random_state=seed)\n",
" fit_params[\"X\"] = X_train\n",
" fit_params[\"y\"] = y_train\n",
" max_samples = 10000 # crashes or fit time too long without subsampling\n",
" svm_params[\"random_state\"] = seed\n",
" if trial.task.problem == \"multiclass\":\n",
" est = Pipeline([(\"ct\", ct), (\"est\", CalibratedClassifierCV(OneVsRestClassifier(SVC(**svm_params), n_jobs=-1)))])\n",
@@ -296,21 +292,16 @@
" est = Pipeline([(\"ct\", ct), (\"est\", MLPClassifier(**nn_params))])\n",
" elif trial.method.name == \"knn\":\n",
" est = Pipeline([(\"ct\", ct), (\"est\", KNeighborsClassifier(**knn_params))])\n",
" elif trial.method.name == \"gp\":\n",
" ct.sparse_threshold = 0\n",
" gp_params[\"n_jobs\"] = -1\n",
" est = Pipeline([(\"ct\", ct), (\"est\", GaussianProcessClassifier(**gp_params))])\n",
" elif trial.method.name == \"aplr\":\n",
" ct.sparse_threshold = 0 # APLR only handles dense\n",
" est = Pipeline([(\"ct\", ct), (\"est\", APLRClassifier(**aplr_params))])\n",
" y_train = y_train.astype(str).to_numpy()\n",
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
" y_test = y_test.astype(str).to_numpy()\n",
" fit_params[\"y\"] = y_train\n",
" else:\n",
" raise Exception(f\"Unrecognized method name {trial.method.name}\")\n",
"\n",
" predict_fn = est.predict_proba\n",
" elif trial.task.problem == \"regression\":\n",
" fit_params = {\"X\":X_train, \"y\":y_train}\n",
" if trial.method.name == \"ebm\":\n",
" est = ExplainableBoostingRegressor(**ebm_params)\n",
" elif trial.method.name == \"xgb\":\n",
@@ -327,13 +318,11 @@
" fit_params[\"verbose\"] = False\n",
" elif trial.method.name == \"rf_sk\":\n",
" if trial.task.name in {\"Airlines_DepDelay_10M\", \"Allstate_Claims_Severity\"}:\n",
" # fit time too long without subsampling\n",
" # TODO: see if we can include more samples\n",
" X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=len(y_train) - 100000, random_state=seed)\n",
" fit_params[\"X\"] = X_train\n",
" fit_params[\"y\"] = y_train\n",
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
" est = Pipeline([(\"ct\", ct), (\"est\", RandomForestRegressor(**rf_sk_params))])\n",
" elif trial.method.name == \"ert\":\n",
" if trial.task.name in {\"Airlines_DepDelay_10M\", \"Allstate_Claims_Severity\"}:\n",
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
" est = Pipeline([(\"ct\", ct), (\"est\", ExtraTreesRegressor(**ert_params))])\n",
" elif trial.method.name == \"elastic\":\n",
" est = Pipeline([(\"ct\", ct), (\"est\", ElasticNet(**elastic_params))])\n",
@@ -343,31 +332,32 @@
" est = Pipeline([(\"ct\", ct), (\"est\", LinearSVR(**lm_svm_params))])\n",
" elif trial.method.name == \"svm\":\n",
" if trial.task.name in {\"Buzzinsocialmedia_Twitter\", \"nyc-taxi-green-dec-2016\", \"Airlines_DepDelay_10M\"}:\n",
" # fit time too long without subsampling\n",
" # TODO: see if we can include more samples\n",
" X_train, _, y_train, _ = train_test_split(X_train, y_train, test_size=len(y_train) - 100000, random_state=seed)\n",
" fit_params[\"X\"] = X_train\n",
" fit_params[\"y\"] = y_train\n",
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
" est = Pipeline([(\"ct\", ct), (\"est\", SVR(**svm_params))])\n",
" elif trial.method.name == \"nn\":\n",
" est = Pipeline([(\"ct\", ct), (\"est\", MLPRegressor(**nn_params))])\n",
" elif trial.method.name == \"knn\":\n",
" if trial.task.name in {\"Airlines_DepDelay_10M\"}:\n",
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
" est = Pipeline([(\"ct\", ct), (\"est\", KNeighborsRegressor(**knn_params))])\n",
" elif trial.method.name == \"gp\":\n",
" ct.sparse_threshold = 0\n",
" est = Pipeline([(\"ct\", ct), (\"est\", GaussianProcessRegressor(**gp_params))])\n",
" elif trial.method.name == \"aplr\":\n",
" if trial.task.name in {\"Airlines_DepDelay_10M\"}:\n",
" max_samples = 100000 # crashes or fit time too long without subsampling\n",
" ct.sparse_threshold = 0 # APLR only handles dense\n",
" est = Pipeline([(\"ct\", ct), (\"est\", APLRRegressor(**aplr_params))])\n",
" y_train = y_train.astype(str).to_numpy()\n",
" fit_params[\"y\"] = fit_params[\"y\"].astype(str).to_numpy()\n",
" y_test = y_test.astype(str).to_numpy()\n",
" fit_params[\"y\"] = y_train\n",
" else:\n",
" raise Exception(f\"Unrecognized method name {trial.method.name}\")\n",
"\n",
" predict_fn = est.predict\n",
" else:\n",
" raise Exception(f\"Unrecognized problem {trial.task.problem}\")\n",
"\n",
" if max_samples < len(fit_params[\"y\"]):\n",
" # subsample because the ML method crashes or takes too long\n",
" fit_params[\"X\"], _, fit_params[\"y\"], _ = train_test_split(fit_params[\"X\"], fit_params[\"y\"], test_size=len(fit_params[\"y\"]) - max_samples, random_state=seed)\n",
" \n",
" global global_counter\n",
" try:\n",
" global_counter += 1\n",
@@ -502,7 +492,7 @@
" executor = AzureContainerInstance(\n",
" store, azure_tenant_id, azure_client_id, azure_client_secret, subscription_id, resource_group, credential,\n",
" image=\"mcr.microsoft.com/devcontainers/python:latest\",\n",
" pip_install= requirements + \" psycopg2-binary\",\n",
" pip_install= requirements + \" psycopg2-binary\" + \" azure-mgmt-containerinstance azure-identity\", #TODO remove azure-mgmt-containerinstance azure-identity once our powerlift image is updated\n",
" wheel_filepaths=wheel_filepaths,\n",
" n_running_containers=n_containers, num_cores=4, mem_size_gb=16, delete_group_container_on_complete=True\n",
" )\n",