feat: template changes for some kaggle competitions (#484)

* change tpl metric codes
* change tpl
microsoft · Nov 12, 2024 · 2e38000 · 2e38000
1 parent 92975fd
commit 2e38000
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 37 deletions.
diff --git a/rdagent/scenarios/kaggle/experiment/tabular-playground-series-dec-2021_template/fea_share_preprocess.py b/rdagent/scenarios/kaggle/experiment/tabular-playground-series-dec-2021_template/fea_share_preprocess.py
@@ -4,23 +4,7 @@
 import pandas as pd
 from sklearn.impute import SimpleImputer
 from sklearn.model_selection import train_test_split
-
-
-def prepreprocess():
-    """
-    This method loads the data, drops the unnecessary columns, and splits it into train and validation sets.
-    """
-    # Load and preprocess the data
-    data_df = pd.read_csv("/kaggle/input/train.csv")
-    data_df = data_df.drop(["Id"], axis=1)
-
-    X = data_df.drop(["Cover_Type"], axis=1)
-    y = data_df["Cover_Type"] - 1
-
-    # Split the data into training and validation sets
-    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
-
-    return X_train, X_valid, y_train, y_valid
+from sklearn.preprocessing import LabelEncoder
 
 
 def preprocess_script():
@@ -37,14 +21,23 @@ def preprocess_script():
 
         return X_train, X_valid, y_train, y_valid, X_test, *others
 
-    X_train, X_valid, y_train, y_valid = prepreprocess()
+    label_encoder = LabelEncoder()
+    data_df = pd.read_csv("/kaggle/input/train.csv")
+    data_df = data_df.drop(["Id"], axis=1)
+    data_df["Cover_Type"] = label_encoder.fit_transform(data_df["Cover_Type"])
+
+    X = data_df.drop(["Cover_Type", "Soil_Type7", "Soil_Type15"], axis=1)
+    y = data_df["Cover_Type"]
+
+    # Split the data into training and validation sets
+    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)
 
     # Load and preprocess the test data
-    submission_df = pd.read_csv("/kaggle/input/test.csv")
-    ids = submission_df["Id"]
-    X_test = submission_df.drop(["Id"], axis=1)
+    test_df = pd.read_csv("/kaggle/input/test.csv")
+    ids = test_df["Id"]
+    X_test = test_df.drop(["Id", "Soil_Type7", "Soil_Type15"], axis=1)
 
-    return X_train, X_valid, y_train, y_valid, X_test, ids
+    return X_train, X_valid, y_train, y_valid, X_test, ids, label_encoder
 
 
 def clean_and_impute_data(X_train, X_valid, X_test):

diff --git a/rdagent/scenarios/kaggle/experiment/tabular-playground-series-dec-2021_template/model/model_xgboost.py b/rdagent/scenarios/kaggle/experiment/tabular-playground-series-dec-2021_template/model/model_xgboost.py
@@ -15,13 +15,14 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
         "objective": "multi:softmax",  # Use softmax for multi-class classification
         "num_class": len(set(y_train)),  # Number of classes
         "nthread": -1,
-        "tree_method": "gpu_hist",
+        "tree_method": "hist",
         "device": "cuda",
+        "eval_metric": "merror",
     }
     num_round = 100
 
-    evallist = [(dtrain, "train"), (dvalid, "eval")]
-    bst = xgb.train(params, dtrain, num_round, evallist)
+    evallist = [(dtrain, "train"), (dvalid, "valid")]
+    bst = xgb.train(params, dtrain, num_round, evallist, verbose_eval=10)
 
     return bst
 

diff --git a/rdagent/scenarios/kaggle/experiment/tabular-playground-series-dec-2021_template/train.py b/rdagent/scenarios/kaggle/experiment/tabular-playground-series-dec-2021_template/train.py
@@ -4,8 +4,8 @@
 
 import numpy as np
 import pandas as pd
-from fea_share_preprocess import clean_and_impute_data, preprocess_script
-from sklearn.metrics import accuracy_score, matthews_corrcoef
+from fea_share_preprocess import preprocess_script
+from sklearn.metrics import accuracy_score
 
 # Set random seed for reproducibility
 SEED = 42
@@ -14,12 +14,6 @@
 DIRNAME = Path(__file__).absolute().resolve().parent
 
 
-def compute_metrics_for_classification(y_true, y_pred):
-    """Compute MCC for classification."""
-    mcc = matthews_corrcoef(y_true, y_pred)
-    return mcc
-
-
 def import_module_from_path(module_name, module_path):
     spec = importlib.util.spec_from_file_location(module_name, module_path)
     module = importlib.util.module_from_spec(spec)
@@ -28,7 +22,7 @@ def import_module_from_path(module_name, module_path):
 
 
 # 1) Preprocess the data
-X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
+X_train, X_valid, y_train, y_valid, X_test, ids, label_encoder = preprocess_script()
 
 # 2) Auto feature engineering
 X_train_l, X_valid_l = [], []
@@ -45,6 +39,7 @@ def import_module_from_path(module_name, module_path):
         X_train_l.append(X_train_f)
         X_valid_l.append(X_valid_f)
         X_test_l.append(X_test_f)
+        print(f"Feature [{f.stem}] has been added to the feature list")
 
 X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
 X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
@@ -53,7 +48,7 @@ def import_module_from_path(module_name, module_path):
 print(X_train.shape, X_valid.shape, X_test.shape)
 
 # Handle inf and -inf values
-X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)
+# X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)
 
 
 model_l = []  # list[tuple[model, predict_func]]
@@ -65,14 +60,15 @@ def import_module_from_path(module_name, module_path):
 
     m = import_module_from_path(f.stem, f)
     model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
+    print(f"Model [{f.stem}] has been trained")
 
 # 4) Evaluate the model on the validation set
 metrics_all = []
 for model, predict_func, select_m in model_l:
     X_valid_selected = select_m.select(X_valid.copy())
     y_valid_pred = predict_func(model, X_valid_selected)
     accuracy = accuracy_score(y_valid, y_valid_pred)
-    print(f"final accuracy on valid set: {accuracy}")
+    print(f"[{type(model).__name__}] accuracy on valid set: {accuracy}")
     metrics_all.append(accuracy)
 
 # 5) Save the validation accuracy
@@ -81,7 +77,7 @@ def import_module_from_path(module_name, module_path):
 
 # 6) Make predictions on the test set and save them
 X_test_selected = model_l[max_index][2].select(X_test.copy())
-y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1
+y_test_pred = label_encoder.inverse_transform(np.asarray(model_l[max_index][1](model_l[max_index][0], X_test_selected)).ravel().astype(int))
 
 
 # 7) Submit predictions for the test set