Skip to content

Commit

Permalink
feat: template changes for some kaggle competitions (#484)
Browse files: browse the repository at this point in the history.
* change tpl metric codes

* change tpl
  • Loading branch information
XianBW authored Nov 12, 2024
1 parent 92975fd commit 2e38000
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 37 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,7 @@
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split


def prepreprocess():
    """
    Load the training CSV and produce a train/validation split.

    Reads ``/kaggle/input/train.csv``, discards the ``Id`` column, separates
    the ``Cover_Type`` target from the remaining feature columns and
    subtracts 1 from every target value (presumably to make the class labels
    zero-based -- confirm against the model code). 20% of the rows are held
    out for validation using a fixed ``random_state`` of 42.

    Returns:
        The tuple ``(X_train, X_valid, y_train, y_valid)`` produced by
        ``train_test_split``.
    """
    # Read the raw training table and discard the identifier column.
    raw = pd.read_csv("/kaggle/input/train.csv").drop(columns=["Id"])

    # Target first, then everything else as features.
    target = raw["Cover_Type"] - 1
    features = raw.drop(columns=["Cover_Type"])

    # Hold out a fifth of the rows for validation; the seed keeps the split reproducible.
    return tuple(train_test_split(features, target, test_size=0.20, random_state=42))
from sklearn.preprocessing import LabelEncoder


def preprocess_script():
Expand All @@ -37,14 +21,23 @@ def preprocess_script():

return X_train, X_valid, y_train, y_valid, X_test, *others

X_train, X_valid, y_train, y_valid = prepreprocess()
label_encoder = LabelEncoder()
data_df = pd.read_csv("/kaggle/input/train.csv")
data_df = data_df.drop(["Id"], axis=1)
data_df["Cover_Type"] = label_encoder.fit_transform(data_df["Cover_Type"])

X = data_df.drop(["Cover_Type", "Soil_Type7", "Soil_Type15"], axis=1)
y = data_df["Cover_Type"]

# Split the data into training and validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

# Load and preprocess the test data
submission_df = pd.read_csv("/kaggle/input/test.csv")
ids = submission_df["Id"]
X_test = submission_df.drop(["Id"], axis=1)
test_df = pd.read_csv("/kaggle/input/test.csv")
ids = test_df["Id"]
X_test = test_df.drop(["Id", "Soil_Type7", "Soil_Type15"], axis=1)

return X_train, X_valid, y_train, y_valid, X_test, ids
return X_train, X_valid, y_train, y_valid, X_test, ids, label_encoder


def clean_and_impute_data(X_train, X_valid, X_test):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,13 +15,14 @@ def fit(X_train: pd.DataFrame, y_train: pd.DataFrame, X_valid: pd.DataFrame, y_v
"objective": "multi:softmax", # Use softmax for multi-class classification
"num_class": len(set(y_train)), # Number of classes
"nthread": -1,
"tree_method": "gpu_hist",
"tree_method": "hist",
"device": "cuda",
"eval_metric": "merror",
}
num_round = 100

evallist = [(dtrain, "train"), (dvalid, "eval")]
bst = xgb.train(params, dtrain, num_round, evallist)
evallist = [(dtrain, "train"), (dvalid, "valid")]
bst = xgb.train(params, dtrain, num_round, evallist, verbose_eval=10)

return bst

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@

import numpy as np
import pandas as pd
from fea_share_preprocess import clean_and_impute_data, preprocess_script
from sklearn.metrics import accuracy_score, matthews_corrcoef
from fea_share_preprocess import preprocess_script
from sklearn.metrics import accuracy_score

# Set random seed for reproducibility
SEED = 42
Expand All @@ -14,12 +14,6 @@
DIRNAME = Path(__file__).absolute().resolve().parent


def compute_metrics_for_classification(y_true, y_pred):
    """Return the Matthews correlation coefficient (MCC) of the predictions.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        The value computed by ``matthews_corrcoef(y_true, y_pred)``.
    """
    return matthews_corrcoef(y_true, y_pred)


def import_module_from_path(module_name, module_path):
spec = importlib.util.spec_from_file_location(module_name, module_path)
module = importlib.util.module_from_spec(spec)
Expand All @@ -28,7 +22,7 @@ def import_module_from_path(module_name, module_path):


# 1) Preprocess the data
X_train, X_valid, y_train, y_valid, X_test, ids = preprocess_script()
X_train, X_valid, y_train, y_valid, X_test, ids, label_encoder = preprocess_script()

# 2) Auto feature engineering
X_train_l, X_valid_l = [], []
Expand All @@ -45,6 +39,7 @@ def import_module_from_path(module_name, module_path):
X_train_l.append(X_train_f)
X_valid_l.append(X_valid_f)
X_test_l.append(X_test_f)
print(f"Feature [{f.stem}] has been added to the feature list")

X_train = pd.concat(X_train_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_train_l))])
X_valid = pd.concat(X_valid_l, axis=1, keys=[f"feature_{i}" for i in range(len(X_valid_l))])
Expand All @@ -53,7 +48,7 @@ def import_module_from_path(module_name, module_path):
print(X_train.shape, X_valid.shape, X_test.shape)

# Handle inf and -inf values
X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)
# X_train, X_valid, X_test = clean_and_impute_data(X_train, X_valid, X_test)


model_l = [] # list[tuple[model, predict_func]]
Expand All @@ -65,14 +60,15 @@ def import_module_from_path(module_name, module_path):

m = import_module_from_path(f.stem, f)
model_l.append((m.fit(X_train_selected, y_train, X_valid_selected, y_valid), m.predict, select_m))
print(f"Model [{f.stem}] has been trained")

# 4) Evaluate the model on the validation set
metrics_all = []
for model, predict_func, select_m in model_l:
X_valid_selected = select_m.select(X_valid.copy())
y_valid_pred = predict_func(model, X_valid_selected)
accuracy = accuracy_score(y_valid, y_valid_pred)
print(f"final accuracy on valid set: {accuracy}")
print(f"[{type(model).__name__}] MCC on valid set: {accuracy}")
metrics_all.append(accuracy)

# 5) Save the validation accuracy
Expand All @@ -81,7 +77,7 @@ def import_module_from_path(module_name, module_path):

# 6) Make predictions on the test set and save them
X_test_selected = model_l[max_index][2].select(X_test.copy())
y_test_pred = model_l[max_index][1](model_l[max_index][0], X_test_selected).flatten() + 1
y_test_pred = label_encoder.inverse_transform(model_l[max_index][1](model_l[max_index][0], X_test_selected))


# 7) Submit predictions for the test set
Expand Down

0 comments on commit 2e38000

Please sign in to comment.