data-bootcamp-v4 · JayEm65 · Oct 14, 2024
diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb
@@ -299,9 +299,36 @@
    "cell_type": "code",
    "execution_count": 1,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "T-Statistic: 3.3349632905124063, P-Value: 0.0015987219490841197\n",
+      "We reject the null hypothesis. Dragon-type Pokémon have significantly different HP than Grass-type.\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "import pandas as pd\n",
+    "import scipy.stats as st\n",
+    "\n",
+    "# Read the dataset from the course data repository\n",
+    "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n",
+    "\n",
+    "# Challenge 1: HP of Dragon rows vs Grass rows, selected on the 'Type 1' column\n",
+    "dragon_hp = df.loc[df['Type 1'] == 'Dragon', 'HP']\n",
+    "grass_hp = df.loc[df['Type 1'] == 'Grass', 'HP']\n",
+    "\n",
+    "# Two-sample t-test without assuming equal variances (Welch)\n",
+    "t_stat, p_value = st.ttest_ind(dragon_hp, grass_hp, equal_var=False)\n",
+    "print(f\"T-Statistic: {t_stat}, P-Value: {p_value}\")\n",
+    "\n",
+    "# Report the conclusion at the 0.05 significance level\n",
+    "is_significant = p_value < 0.05\n",
+    "reject_msg = \"We reject the null hypothesis. Dragon-type Pokémon have significantly different HP than Grass-type.\"\n",
+    "keep_msg = \"We fail to reject the null hypothesis. No significant difference in HP.\"\n",
+    "print(reject_msg if is_significant else keep_msg)\n"
    ]
   },
   {
@@ -313,11 +340,60 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 18,
+   "execution_count": 3,
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "HP - T-Statistic: 8.981370483625046, P-Value: 1.0026911708035284e-13\n",
+      "We reject the null hypothesis for HP. Legendary Pokémon have significantly different HP compared to Non-Legendary.\n",
+      "Attack - T-Statistic: 10.438133539322203, P-Value: 2.520372449236646e-16\n",
+      "We reject the null hypothesis for Attack. Legendary Pokémon have significantly different Attack compared to Non-Legendary.\n",
+      "Defense - T-Statistic: 7.637078164784618, P-Value: 4.826998494919331e-11\n",
+      "We reject the null hypothesis for Defense. Legendary Pokémon have significantly different Defense compared to Non-Legendary.\n",
+      "Sp. Atk - T-Statistic: 13.417449984138461, P-Value: 1.5514614112239816e-21\n",
+      "We reject the null hypothesis for Sp. Atk. Legendary Pokémon have significantly different Sp. Atk compared to Non-Legendary.\n",
+      "Sp. Def - T-Statistic: 10.015696613114878, P-Value: 2.2949327864052826e-15\n",
+      "We reject the null hypothesis for Sp. Def. Legendary Pokémon have significantly different Sp. Def compared to Non-Legendary.\n",
+      "Speed - T-Statistic: 11.47504444631443, P-Value: 1.0490163118824507e-18\n",
+      "We reject the null hypothesis for Speed. Legendary Pokémon have significantly different Speed compared to Non-Legendary.\n"
+     ]
+    }
+   ],
    "source": [
-    "#code here"
+    "import pandas as pd\n",
+    "import scipy.stats as st\n",
+    "\n",
+    "# Load the dataset\n",
+    "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n",
+    "\n",
+    "# Split data into Legendary and Non-Legendary\n",
+    "legendary = df[df['Legendary'] == True]\n",
+    "non_legendary = df[df['Legendary'] == False]\n",
+    "\n",
+    "# Define the stats we want to test\n",
+    "stats = ['HP', 'Attack', 'Defense', 'Sp. Atk', 'Sp. Def', 'Speed']\n",
+    "\n",
+    "# Bonferroni correction: split alpha = 0.05 across all tests; it is the same for every stat, so compute it once\n",
+    "corrected_alpha = 0.05 / len(stats)\n",
+    "\n",
+    "# Run one test per stat and compare each p-value against the corrected threshold\n",
+    "for stat in stats:\n",
+    "    legendary_stat = legendary[stat]\n",
+    "    non_legendary_stat = non_legendary[stat]\n",
+    "    \n",
+    "    # Two-sample t-test without assuming equal variances (Welch)\n",
+    "    t_stat, p_value = st.ttest_ind(legendary_stat, non_legendary_stat, equal_var=False)\n",
+    "    \n",
+    "    # Print results\n",
+    "    print(f\"{stat} - T-Statistic: {t_stat}, P-Value: {p_value}\")\n",
+    "    \n",
+    "    if p_value < corrected_alpha:\n",
+    "        print(f\"We reject the null hypothesis for {stat}. Legendary Pokémon have significantly different {stat} compared to Non-Legendary.\")\n",
+    "    else:\n",
+    "        print(f\"We fail to reject the null hypothesis for {stat}. No significant difference in {stat}.\")\n"
    ]
   },
   {
@@ -483,17 +559,54 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "metadata": {},
-   "outputs": [],
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "T-Statistic: -17.174167998688404, P-Value: 5.220018561223529e-05\n",
+      "We reject the null hypothesis. Houses close to schools or hospitals are less expensive.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import scipy.stats as st\n",
+    "\n",
+    "# Load the dataset\n",
+    "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n",
+    "\n",
+    "# Coordinates of the school and hospital, as (longitude, latitude)\n",
+    "school_coords = (-118, 37)\n",
+    "hospital_coords = (-122, 34)\n",
+    "\n",
+    "# Euclidean distance in coordinate units; accepts scalars or whole columns\n",
+    "def euclidean_distance(x1, y1, x2, y2):\n",
+    "    return np.sqrt((x2 - x1)**2 + (y2 - y1)**2)\n",
+    "\n",
+    "# Vectorised: pass the coordinate columns directly instead of a row-wise apply\n",
+    "df['dist_school'] = euclidean_distance(df['longitude'], df['latitude'], *school_coords)\n",
+    "df['dist_hospital'] = euclidean_distance(df['longitude'], df['latitude'], *hospital_coords)\n",
+    "\n",
+    "# Consider a house 'close' if it's within 0.50 distance to either school or hospital\n",
+    "df['close'] = np.where((df['dist_school'] < 0.5) | (df['dist_hospital'] < 0.5), 'close', 'far')\n",
+    "\n",
+    "# Welch's two-sample t-test (two-sided): close vs far median house value\n",
+    "close_prices = df[df['close'] == 'close']['median_house_value']\n",
+    "far_prices = df[df['close'] == 'far']['median_house_value']\n",
+    "t_stat, p_value = st.ttest_ind(close_prices, far_prices, equal_var=False)\n",
+    "print(f\"T-Statistic: {t_stat}, P-Value: {p_value}\")\n",
+    "\n",
+    "# A two-sided test only detects a difference; the sign of t (close - far) gives its direction\n",
+    "if p_value < 0.05:\n",
+    "    direction = 'more' if t_stat > 0 else 'less'\n",
+    "    print(f\"We reject the null hypothesis. Houses close to schools or hospitals are {direction} expensive.\")\n",
+    "else:\n",
+    "    print(\"We fail to reject the null hypothesis. No significant difference in house prices.\")\n"
+   ]
   }
  ],
  "metadata": {
@@ -512,7 +625,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.9"
+   "version": "3.12.6"
   }
  },
  "nbformat": 4,