diff --git a/lab-ensemble.ipynb b/lab-ensemble.ipynb index 5c5b3c7..83397f7 100644 --- a/lab-ensemble.ipynb +++ b/lab-ensemble.ipynb @@ -31,7 +31,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 304, "metadata": {}, "outputs": [], "source": [ @@ -43,7 +43,29 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 305, + "metadata": {}, + "outputs": [], + "source": [ + "spaceship = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\")\n", + "spaceship.shape\n", + "\n", + "spaceship.dropna(inplace=True)\n", + "spaceship.reset_index(inplace=True, drop=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now perform the same as before:\n", + "- Feature Scaling\n", + "- Feature Selection\n" + ] + }, + { + "cell_type": "code", + "execution_count": 306, "metadata": {}, "outputs": [ { @@ -67,200 +89,1734 @@ " \n", " \n", " \n", - " PassengerId\n", - " HomePlanet\n", - " CryoSleep\n", - " Cabin\n", - " Destination\n", " Age\n", - " VIP\n", " RoomService\n", " FoodCourt\n", " ShoppingMall\n", " Spa\n", " VRDeck\n", - " Name\n", " Transported\n", " \n", " \n", " \n", " \n", + " Age\n", + " 1.000000\n", + " 0.074783\n", + " 0.135844\n", + " 0.042314\n", + " 0.123820\n", + " 0.105031\n", + " -0.082553\n", + " \n", + " \n", + " RoomService\n", + " 0.074783\n", + " 1.000000\n", + " -0.013614\n", + " 0.060478\n", + " 0.012472\n", + " -0.026002\n", + " -0.247291\n", + " \n", + " \n", + " FoodCourt\n", + " 0.135844\n", + " -0.013614\n", + " 1.000000\n", + " -0.012320\n", + " 0.215995\n", + " 0.216997\n", + " 0.055025\n", + " \n", + " \n", + " ShoppingMall\n", + " 0.042314\n", + " 0.060478\n", + " -0.012320\n", + " 1.000000\n", + " 0.022168\n", + " 0.000383\n", + " 0.011602\n", + " \n", + " \n", + " Spa\n", + " 0.123820\n", + " 0.012472\n", + " 0.215995\n", + " 0.022168\n", + " 1.000000\n", + " 0.149447\n", + " -0.219854\n", + " \n", + " \n", + " VRDeck\n", + " 
0.105031\n", + " -0.026002\n", + " 0.216997\n", + " 0.000383\n", + " 0.149447\n", + " 1.000000\n", + " -0.207950\n", + " \n", + " \n", + " Transported\n", + " -0.082553\n", + " -0.247291\n", + " 0.055025\n", + " 0.011602\n", + " -0.219854\n", + " -0.207950\n", + " 1.000000\n", + " \n", + " \n", + "\n", + "" + ], + "text/plain": [ + " Age RoomService FoodCourt ShoppingMall Spa \\\n", + "Age 1.000000 0.074783 0.135844 0.042314 0.123820 \n", + "RoomService 0.074783 1.000000 -0.013614 0.060478 0.012472 \n", + "FoodCourt 0.135844 -0.013614 1.000000 -0.012320 0.215995 \n", + "ShoppingMall 0.042314 0.060478 -0.012320 1.000000 0.022168 \n", + "Spa 0.123820 0.012472 0.215995 0.022168 1.000000 \n", + "VRDeck 0.105031 -0.026002 0.216997 0.000383 0.149447 \n", + "Transported -0.082553 -0.247291 0.055025 0.011602 -0.219854 \n", + "\n", + " VRDeck Transported \n", + "Age 0.105031 -0.082553 \n", + "RoomService -0.026002 -0.247291 \n", + "FoodCourt 0.216997 0.055025 \n", + "ShoppingMall 0.000383 0.011602 \n", + "Spa 0.149447 -0.219854 \n", + "VRDeck 1.000000 -0.207950 \n", + "Transported -0.207950 1.000000 " + ] + }, + "execution_count": 306, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# 1. Feature selection\n", + "\n", + "spaceship.select_dtypes(include=['number', 'bool']).corr()" + ] + }, + { + "cell_type": "code", + "execution_count": 307, + "metadata": {}, + "outputs": [], + "source": [ + "#your code here\n", + "\n", + "features = spaceship.select_dtypes(include='number') # Mejor manual\n", + "\n", + "target = spaceship['Transported'] # Estamos haciendo una clasificación\n" + ] + }, + { + "cell_type": "code", + "execution_count": 308, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", " \n", - " \n", - " \n", - " \n", - " \n", - " \n", " \n", - " \n", " \n", " \n", " \n", " \n", " \n", - " \n", - " \n", " \n", " \n", "
AgeRoomServiceFoodCourtShoppingMallSpaVRDeck
00001_01EuropaFalseB/0/PTRAPPIST-1e39.0False0.00.00.00.00.0Maham OfracculyFalse
10002_01EarthFalseF/0/STRAPPIST-1e24.0False109.09.025.0549.044.0Juanna VinesTrue
20003_01EuropaFalseA/0/STRAPPIST-1e58.0True43.03576.00.06715.049.0Altark SusentFalse
30003_02EuropaFalseA/0/STRAPPIST-1e33.0False0.01283.0371.03329.0193.0Solam SusentFalse
40004_01EarthFalseF/1/STRAPPIST-1e16.0False303.070.0151.0565.02.0Willy SantantinesTrue
\n", "
" ], "text/plain": [ - " PassengerId HomePlanet CryoSleep Cabin Destination Age VIP \\\n", - "0 0001_01 Europa False B/0/P TRAPPIST-1e 39.0 False \n", - "1 0002_01 Earth False F/0/S TRAPPIST-1e 24.0 False \n", - "2 0003_01 Europa False A/0/S TRAPPIST-1e 58.0 True \n", - "3 0003_02 Europa False A/0/S TRAPPIST-1e 33.0 False \n", - "4 0004_01 Earth False F/1/S TRAPPIST-1e 16.0 False \n", - "\n", - " RoomService FoodCourt ShoppingMall Spa VRDeck Name \\\n", - "0 0.0 0.0 0.0 0.0 0.0 Maham Ofracculy \n", - "1 109.0 9.0 25.0 549.0 44.0 Juanna Vines \n", - "2 43.0 3576.0 0.0 6715.0 49.0 Altark Susent \n", - "3 0.0 1283.0 371.0 3329.0 193.0 Solam Susent \n", - "4 303.0 70.0 151.0 565.0 2.0 Willy Santantines \n", - "\n", - " Transported \n", - "0 False \n", - "1 True \n", - "2 False \n", - "3 False \n", - "4 True " + " Age RoomService FoodCourt ShoppingMall Spa VRDeck\n", + "0 39.0 0.0 0.0 0.0 0.0 0.0\n", + "1 24.0 109.0 9.0 25.0 549.0 44.0\n", + "2 58.0 43.0 3576.0 0.0 6715.0 49.0\n", + "3 33.0 0.0 1283.0 371.0 3329.0 193.0\n", + "4 16.0 303.0 70.0 151.0 565.0 2.0" ] }, - "execution_count": 2, + "execution_count": 308, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "spaceship = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/spaceship_titanic.csv\")\n", - "spaceship.head()" + "features.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Now perform the same as before:\n", - "- Feature Scaling\n", - "- Feature Selection\n" + "**Perform Train Test Split**" ] }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 309, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "#your code here\n", + "\n", + "X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.20, random_state=42) " ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "**Perform Train Test Split**" + "**Model Selection** - now you will try to apply different ensemble methods in order to 
get a better model" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- Bagging and Pasting" ] }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 310, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "from sklearn.ensemble import BaggingClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "\n", + "bagging_clf = BaggingClassifier(DecisionTreeClassifier(max_depth=20),\n", + " n_estimators=100, # 100 arboles\n", + " max_samples=1000 # Del 80% del X_train va a coger paquetes de 1000\n", + " )\n" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 311, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=20),\n",
+       "                  max_samples=1000, n_estimators=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=20),\n", + " max_samples=1000, n_estimators=100)" + ] + }, + "execution_count": 311, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bagging_clf.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 312, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " False 0.83 0.74 0.78 653\n", + " True 0.77 0.85 0.81 669\n", + "\n", + " accuracy 0.80 1322\n", + " macro avg 0.80 0.79 0.79 1322\n", + "weighted avg 0.80 0.80 0.79 1322\n", + "\n" + ] + } + ], "source": [ - "**Model Selection** - now you will try to apply different ensemble methods in order to get a better model" + "from sklearn.metrics import classification_report\n", + "\n", + "# Make predictions\n", + "y_pred = bagging_clf.predict(X_test)\n", + "\n", + "# Generate the classification report\n", + "report = classification_report(y_test, y_pred)\n", + "print(report)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 313, "metadata": {}, + "outputs": [], "source": [ - "- Bagging and Pasting" + "# 2. 
Repetición con Normalization\n", + "\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "\n", + "normalizer = MinMaxScaler()\n", + "\n", + "X_train_norm = normalizer.fit_transform(X_train) \n", + "\n", + "X_test_norm = normalizer.transform(X_test)" + ] + }, + { + "cell_type": "code", + "execution_count": 314, + "metadata": {}, + "outputs": [], + "source": [ + "X_train_norm = pd.DataFrame(X_train_norm, columns = X_train.columns) # Vuelvo a dar formato de df\n", + "X_test_norm = pd.DataFrame(X_test_norm, columns = X_test.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 315, + "metadata": {}, + "outputs": [], + "source": [ + "bagging_clf_norm = BaggingClassifier(DecisionTreeClassifier(max_depth=20),\n", + " n_estimators=100, \n", + " bootstrap=True, # Esto es para que sea bagging o pasting (no es obligatorio especificar) \n", + " max_samples=1000 # Toma 1000 muestras para cada uno (si pusiera 0.X cogeria el X0% de las muestras) \n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 316, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=20),\n",
+       "                  max_samples=1000, n_estimators=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=20),\n", + " max_samples=1000, n_estimators=100)" + ] + }, + "execution_count": 316, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bagging_clf_norm.fit(X_train_norm, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 317, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " False 0.83 0.74 0.78 653\n", + " True 0.77 0.85 0.81 669\n", + "\n", + " accuracy 0.80 1322\n", + " macro avg 0.80 0.80 0.80 1322\n", + "weighted avg 0.80 0.80 0.80 1322\n", + "\n" + ] + } + ], + "source": [ + "# Make predictions\n", + "y_pred_norm = bagging_clf_norm.predict(X_test_norm)\n", + "\n", + "# Generate the classification report\n", + "report = classification_report(y_test, y_pred_norm)\n", + "print(report)" + ] + }, + { + "cell_type": "code", + "execution_count": 318, + "metadata": {}, + "outputs": [], + "source": [ + "# 3. Repetición con Scaling\n", + "from sklearn.preprocessing import StandardScaler\n", + "\n", + "scaler = StandardScaler() \n", + "X_train_scaling = scaler.fit_transform(X_train)\n", + "X_train_scaling = pd.DataFrame(X_train_scaling, columns = X_train.columns)\n", + "\n", + "X_test_scaling = scaler.transform(X_test)\n", + "X_test_scaling = pd.DataFrame(X_test_scaling, columns = X_test.columns)" + ] + }, + { + "cell_type": "code", + "execution_count": 319, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=20),\n",
+       "                  max_samples=1000, n_estimators=100)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" + ], + "text/plain": [ + "BaggingClassifier(estimator=DecisionTreeClassifier(max_depth=20),\n", + " max_samples=1000, n_estimators=100)" + ] + }, + "execution_count": 319, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "bagging_clf_scaling = BaggingClassifier(DecisionTreeClassifier(max_depth=20),\n", + " n_estimators=100, \n", + " max_samples=1000 \n", + " )\n", + "\n", + "bagging_clf_scaling.fit(X_train_scaling, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 320, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " False 0.83 0.74 0.78 653\n", + " True 0.77 0.85 0.81 669\n", + "\n", + " accuracy 0.80 1322\n", + " macro avg 0.80 0.80 0.79 1322\n", + "weighted avg 0.80 0.80 0.80 1322\n", + "\n" + ] + } + ], + "source": [ + "# Make predictions\n", + "y_pred_scaling = bagging_clf_scaling.predict(X_test_scaling)\n", + "\n", + "# Generate the classification report\n", + "report = classification_report(y_test, y_pred_scaling)\n", + "print(report)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 321, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "### EN VISTA DE LAS 3 FORMAS DE REALIZAR ESTA PREDICCIÓN, LA MEJOR MANERA, ATENDIENDO AL f1-score (POR UNA LIGERA DIFERENCIA) ES LA NORMALIZACIÓN" ] }, { @@ -272,11 +1828,46 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 322, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " False 0.83 0.74 0.78 653\n", + " True 0.77 0.85 0.81 669\n", + "\n", + " accuracy 0.79 1322\n", + " macro avg 0.80 0.79 0.79 1322\n", + "weighted avg 0.80 0.79 0.79 1322\n", + "\n", + "Accuracy: 0.7934947049924357\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "# Import necessary libraries\n", + "from 
sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score\n", + "from sklearn.metrics import classification_report, accuracy_score\n", + "# Create and train the Random Forest classifier\n", + "\n", + "rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "rf_classifier.fit(X_train_norm, y_train)\n", + "\n", + "# Make predictions\n", + "y_pred_norm = rf_classifier.predict(X_test_norm)\n", + "\n", + "# Evaluate the model\n", + "report = classification_report(y_test, y_pred_norm) \n", + "print(report)\n", + "\n", + "accuracy = accuracy_score(y_test, y_pred_norm)\n", + "print(f\"Accuracy: {accuracy}\")\n" ] }, { @@ -288,49 +1879,201 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 323, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " False 0.83 0.74 0.78 653\n", + " True 0.77 0.85 0.81 669\n", + "\n", + " accuracy 0.79 1322\n", + " macro avg 0.80 0.79 0.79 1322\n", + "weighted avg 0.80 0.79 0.79 1322\n", + "\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "from sklearn.ensemble import BaggingRegressor, RandomForestRegressor, GradientBoostingClassifier\n", + "\n", + "gb_clas = GradientBoostingClassifier(max_depth=20,\n", + " n_estimators=100)\n", + "\n", + "gb_clas.fit(X_train_norm, y_train)\n", + "\n", + "pred_gb_norm = gb_clas.predict(X_test_norm)\n", + "\n", + "report = classification_report(y_test, y_pred_norm) \n", + "print(report)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "- Adaptive Boosting" + "- Adaptive Boosting (AdaBoost)" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 324, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ +
"c:\\Anaconda\\Lib\\site-packages\\sklearn\\ensemble\\_weight_boosting.py:519: FutureWarning: The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. Use the SAMME algorithm to circumvent this warning.\n", + " warnings.warn(\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + " precision recall f1-score support\n", + "\n", + " False 0.83 0.74 0.78 653\n", + " True 0.77 0.85 0.81 669\n", + "\n", + " accuracy 0.79 1322\n", + " macro avg 0.80 0.79 0.79 1322\n", + "weighted avg 0.80 0.79 0.79 1322\n", + "\n" + ] + } + ], "source": [ - "#your code here" + "#your code here\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.ensemble import AdaBoostClassifier\n", + "\n", + "ada_clas = AdaBoostClassifier(DecisionTreeClassifier(max_depth=20),\n", + " n_estimators=200)\n", + "\n", + "ada_clas.fit(X_train_norm, y_train)\n", + "\n", + "report = classification_report(y_test, y_pred_norm) \n", + "print(report)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Which model is the best and why?" + "## Which model is the best and why?" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### The best model is bagging with normalization (MinMaxScaler), though only marginally: its macro and weighted f1-scores are both 0.80, versus 0.79-0.80 for the other bagging variants and 0.79 for Random Forest. Note that the Gradient Boosting and AdaBoost reports above were computed from `y_pred_norm` (the Random Forest predictions), so they do not measure those two models." + ] + }, + { + "cell_type": "code", + "execution_count": 325, + "metadata": {}, + "outputs": [], + "source": [ + "# Una vez que tenemos el mejor modelo, podríamos predecir los resultados de la Spaceship.\n", + "\n", + "predictions = bagging_clf_norm.predict(features)\n", + "\n", + "\n", + "predictions = pd.DataFrame(predictions, columns=['Resuls_predicted'])" + ] + }, + { + "cell_type": "code", + "execution_count": 326, + "metadata": {}, + "outputs": [], + "source": [ + "spaceship[\"predictions\"] = predictions " + ] + }, + { + "cell_type": "code", + "execution_count": 327, + "metadata": {}, + "outputs": [], + "source": [ + "spaceship[\"Resta\"] = spaceship[\"Transported\"].astype(int) - spaceship[\"predictions\"].astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 328, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.7275204359673024" + ] + }, + "execution_count": 328, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "spaceship[\"Resta\"].value_counts()[0] / spaceship[\"Resta\"].value_counts().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 329, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from sklearn.metrics import confusion_matrix\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "cm = confusion_matrix(spaceship[\"Transported\"], spaceship[\"predictions\"])\n", + "sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')\n", + "plt.xlabel('Predicción')\n", + "plt.ylabel('Real')\n", + "plt.title('Matriz de Confusión')\n", + "plt.show()\n", + "\n", + "\n", + "# ESTO ES MUY ÚTIL PERO SOLO EN CLASIFICACIÓN" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 330, "metadata": {}, "outputs": [], "source": [ - "#comment here" + "# La razón de que el f1-score en test sea 0.80 pero el % de aciertos sobre todo el dataset sea de apenas el 73% (0.7275) es, probablemente, que bagging_clf_norm se entrenó con datos normalizados (X_train_norm) y aquí predice sobre `features` sin normalizar; habría que aplicar normalizer.transform(features) antes de predecir." ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "base", "language": "python", "name": "python3" }, @@ -344,7 +2087,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.12.4" } }, "nbformat": 4,