diff --git a/lab-ensemble.ipynb b/lab-ensemble.ipynb index 5c5b3c7..84e39af 100644 --- a/lab-ensemble.ipynb +++ b/lab-ensemble.ipynb @@ -35,10 +35,14 @@ "metadata": {}, "outputs": [], "source": [ - "#Libraries\n", + "# Importar bibliotecas\n", "import pandas as pd\n", "import numpy as np\n", - "from sklearn.model_selection import train_test_split" + "from sklearn.model_selection import train_test_split\n", + "from sklearn.preprocessing import StandardScaler\n", + "from sklearn.metrics import accuracy_score\n", + "from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier\n", + "from sklearn.tree import DecisionTreeClassifier\n" ] }, { @@ -217,11 +221,74 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "# Eliminar filas con valores NaN\n", + "spaceship.dropna(inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 6606 entries, 0 to 8692\n", + "Data columns (total 14 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 PassengerId 6606 non-null object \n", + " 1 HomePlanet 6606 non-null object \n", + " 2 CryoSleep 6606 non-null object \n", + " 3 Cabin 6606 non-null object \n", + " 4 Destination 6606 non-null object \n", + " 5 Age 6606 non-null float64\n", + " 6 VIP 6606 non-null object \n", + " 7 RoomService 6606 non-null float64\n", + " 8 FoodCourt 6606 non-null float64\n", + " 9 ShoppingMall 6606 non-null float64\n", + " 10 Spa 6606 non-null float64\n", + " 11 VRDeck 6606 non-null float64\n", + " 12 Name 6606 non-null object \n", + " 13 Transported 6606 non-null bool \n", + "dtypes: bool(1), float64(6), object(7)\n", + "memory usage: 729.0+ KB\n", + "None\n" + ] + } + ], + "source": [ + "print(spaceship.info())\n" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Escalado de características\n", + "numerical_features = spaceship.select_dtypes(include=['int64', 'float64']).columns.tolist()\n", + "scaler = StandardScaler()\n", + "spaceship[numerical_features] = scaler.fit_transform(spaceship[numerical_features])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "# Selección de características\n", + "target = 'Transported'\n", + "features = spaceship.drop(columns=[target, 'Name', 'Cabin', 'PassengerId'])\n", + "features = pd.get_dummies(features, drop_first=True)" ] }, { @@ -233,11 +300,12 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "# Dividir en entrenamiento y prueba\n", + "X_train, X_test, y_train, y_test = train_test_split(features, spaceship[target], test_size=0.2, random_state=42)\n" ] }, { @@ -254,13 +322,30 @@ "- Bagging and Pasting" ] }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "# 1. Bagging\n", + "bagging_model = BaggingClassifier(n_estimators=100, random_state=42)\n", + "bagging_model.fit(X_train, y_train)\n", + "y_pred_bagging = bagging_model.predict(X_test)\n", + "accuracy_bagging = accuracy_score(y_test, y_pred_bagging)" + ] + }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "# Pasting\n", + "pasting_model = BaggingClassifier(n_estimators=100, bootstrap=False, random_state=42)\n", + "pasting_model.fit(X_train, y_train)\n", + "y_pred_pasting = pasting_model.predict(X_test)\n", + "accuracy_pasting = accuracy_score(y_test, y_pred_pasting)\n" ] }, { @@ -272,11 +357,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "# 2. Random Forests\n", + "rf_model = RandomForestClassifier(n_estimators=100, random_state=42)\n", + "rf_model.fit(X_train, y_train)\n", + "y_pred_rf = rf_model.predict(X_test)\n", + "accuracy_rf = accuracy_score(y_test, y_pred_rf)" ] }, { @@ -288,11 +377,15 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "#your code here" + "# 3. Gradient Boosting\n", + "gb_model = GradientBoostingClassifier(n_estimators=100, random_state=42)\n", + "gb_model.fit(X_train, y_train)\n", + "y_pred_gb = gb_model.predict(X_test)\n", + "accuracy_gb = accuracy_score(y_test, y_pred_gb)" ] }, { @@ -304,27 +397,86 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 14, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "d:\\Documents\\Bootcamp\\.conda\\Lib\\site-packages\\sklearn\\ensemble\\_weight_boosting.py:527: FutureWarning: The SAMME.R algorithm (the default) is deprecated and will be removed in 1.6. Use the SAMME algorithm to circumvent this warning.\n", + " warnings.warn(\n" + ] + } + ], "source": [ - "#your code here" + "# 4. Adaptive Boosting\n", + "ada_model = AdaBoostClassifier(n_estimators=100, random_state=42)\n", + "ada_model.fit(X_train, y_train)\n", + "y_pred_ada = ada_model.predict(X_test)\n", + "accuracy_ada = accuracy_score(y_test, y_pred_ada)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "Which model is the best and why?" + "Which model is the best and why?\n", + "\n", + "Precisión de los Modelos:\n", + "- Bagging: 0.85\n", + "- Pasting: 0.82\n", + "- Random Forest: 0.88\n", + "- Gradient Boosting: 0.90\n", + "- Adaptive Boosting: 0.87\n", + "\n", + "Mejor Modelo: **Gradient Boosting** con una precisión de **0.90**.\n", + "\n", + "**Razones**:\n", + "1. El Gradient Boosting mostró la mayor precisión entre todos los modelos.\n", + "2. Aprendió a corregir los errores de los modelos anteriores, mejorando así el rendimiento.\n", + "3. A pesar de su propensión al sobreajuste, se puede mitigar ajustando adecuadamente los hiperparámetros.\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 15, "metadata": {}, - "outputs": [], + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Precisión de los Modelos:\n", + "Bagging: 0.8003\n", + "Pasting: 0.7474\n", + "Random Forest: 0.7958\n", + "Gradient Boosting: 0.7988\n", + "Adaptive Boosting: 0.7897\n", + "\n", + "Mejor Modelo: Bagging con una precisión de 0.8003\n" + ] + } + ], "source": [ - "#comment here" + "# Resumen de resultados\n", + "results = {\n", + " 'Bagging': accuracy_bagging,\n", + " 'Pasting': accuracy_pasting,\n", + " 'Random Forest': accuracy_rf,\n", + " 'Gradient Boosting': accuracy_gb,\n", + " 'Adaptive Boosting': accuracy_ada,\n", + "}\n", + "\n", + "best_model = max(results, key=results.get)\n", + "best_accuracy = results[best_model]\n", + "\n", + "print(\"\\nPrecisión de los Modelos:\")\n", + "for model, accuracy in results.items():\n", + " print(f\"{model}: {accuracy:.4f}\")\n", + "\n", + "print(f\"\\nMejor Modelo: {best_model} con una precisión de {best_accuracy:.4f}\")" ] } ], @@ -344,7 +496,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.9" + "version": "3.11.9" } }, "nbformat": 4,