From b716349e5209393d5fe2e985b4c7f0462d02a199 Mon Sep 17 00:00:00 2001 From: Diego-Llorente Date: Sat, 12 Oct 2024 09:41:49 +0100 Subject: [PATCH] lab complete --- .../lab-hypothesis-testing-checkpoint.ipynb | 1288 +++++++++++++++++ lab-hypothesis-testing.ipynb | 1004 +++++++++++-- 2 files changed, 2174 insertions(+), 118 deletions(-) create mode 100644 .ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb diff --git a/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb new file mode 100644 index 0000000..c1fa91c --- /dev/null +++ b/.ipynb_checkpoints/lab-hypothesis-testing-checkpoint.ipynb @@ -0,0 +1,1288 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab | Hypothesis Testing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Objective**\n", + "\n", + "Welcome to the Hypothesis Testing Lab, where we embark on an enlightening journey through the realm of statistical decision-making! In this laboratory, we delve into various scenarios, applying the powerful tools of hypothesis testing to scrutinize and interpret data.\n", + "\n", + "From testing the mean of a single sample (One Sample T-Test), to investigating differences between independent groups (Two Sample T-Test), and exploring relationships within dependent samples (Paired Sample T-Test), our exploration knows no bounds. Furthermore, we'll venture into the realm of Analysis of Variance (ANOVA), unraveling the complexities of comparing means across multiple groups.\n", + "\n", + "So, grab your statistical tools, prepare your hypotheses, and let's embark on this fascinating journey of exploration and discovery in the world of hypothesis testing!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 1**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with pokemon data. The data can be found here:\n", + "\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#libraries\n", + "import pandas as pd\n", + "import scipy.stats as st\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2HPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
0BulbasaurGrassPoison4549496565451False
1IvysaurGrassPoison6062638080601False
2VenusaurGrassPoison808283100100801False
3Mega VenusaurGrassPoison80100123122120801False
4CharmanderFireNaN3952436050651False
....................................
795DiancieRockFairy50100150100150506True
796Mega DiancieRockFairy501601101601101106True
797Hoopa ConfinedPsychicGhost8011060150130706True
798Hoopa UnboundPsychicDark8016060170130806True
799VolcanionFireWater8011012013090706True
\n", + "

800 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Name Type 1 Type 2 HP Attack Defense Sp. Atk Sp. Def \\\n", + "0 Bulbasaur Grass Poison 45 49 49 65 65 \n", + "1 Ivysaur Grass Poison 60 62 63 80 80 \n", + "2 Venusaur Grass Poison 80 82 83 100 100 \n", + "3 Mega Venusaur Grass Poison 80 100 123 122 120 \n", + "4 Charmander Fire NaN 39 52 43 60 50 \n", + ".. ... ... ... .. ... ... ... ... \n", + "795 Diancie Rock Fairy 50 100 150 100 150 \n", + "796 Mega Diancie Rock Fairy 50 160 110 160 110 \n", + "797 Hoopa Confined Psychic Ghost 80 110 60 150 130 \n", + "798 Hoopa Unbound Psychic Dark 80 160 60 170 130 \n", + "799 Volcanion Fire Water 80 110 120 130 90 \n", + "\n", + " Speed Generation Legendary \n", + "0 45 1 False \n", + "1 60 1 False \n", + "2 80 1 False \n", + "3 80 1 False \n", + "4 65 1 False \n", + ".. ... ... ... \n", + "795 50 6 True \n", + "796 110 6 True \n", + "797 70 6 True \n", + "798 80 6 True \n", + "799 70 6 True \n", + "\n", + "[800 rows x 11 columns]" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/pokemon.csv\")\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Pokemons of type Dragon have, on average, more HP stats than Grass. Choose the propper test and, with 5% significance, comment your findings." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "#H0: avg dragon HP <= avg grass HP\n", + "#H1: avg dragon HP > avg grass HP" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "alpha: 0.05, p_value: 0.0002567969150153481\n", + "We can reject the hypothesis with a 95% degree of confidence. THe average HP for dragon pokemons is greater than grass pokemons\n" + ] + } + ], + "source": [ + "#code here\n", + "#I need to use One tailed t-test\n", + "\n", + "alpha = 0.05\n", + "\n", + "grass_hp = df[df[\"Type 1\"] == \"Grass\"][\"HP\"]\n", + "dragon_hp = df[df[\"Type 1\"] == \"Dragon\"][\"HP\"]\n", + "\n", + "t_stat, p_value = st.ttest_ind(dragon_hp, grass_hp, alternative = \"greater\")\n", + "\n", + "print(f\"alpha: {alpha}, p_value: {p_value}\")\n", + "\n", + "if p_value < alpha:\n", + " print(\"We can reject the hypothesis with a 95% degree of confidence. THe average HP for dragon pokemons is greater than grass pokemons\")\n", + "else:\n", + " print(\"We don't have enough data to be able to reject the null hypothesis.\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "67.27142857142857" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "grass_hp.mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "83.3125" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "dragon_hp.mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. 
Choose the proper test and, with 5% significance, comment on your findings.\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#H0: legendary stats = non-legendary stats\n",
+ "#H1: legendary stats != non-legendary stats"
+ ]
+ },
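+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Editor's sketch, not part of the original lab: st.ttest_ind pools the two\n",
+ "# group variances by default, and running six tests at alpha = 0.05 inflates\n",
+ "# the family-wise error rate. A hedged alternative is Welch's t-test\n",
+ "# (equal_var=False) combined with a Bonferroni-corrected threshold. This\n",
+ "# assumes only df as loaded in the cell above.\n",
+ "legendary = df[df[\"Legendary\"] == True]\n",
+ "not_legendary = df[df[\"Legendary\"] == False]\n",
+ "bonferroni_alpha = 0.05 / 6  # one shared threshold across the six stats tested\n",
+ "for stat in [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]:\n",
+ "    t_stat, p_value = st.ttest_ind(legendary[stat], not_legendary[stat], equal_var=False)\n",
+ "    print(f\"{stat}: Welch t = {t_stat:.2f}, p = {p_value:.2e}, significant at corrected alpha: {p_value < bonferroni_alpha}\")"
+ ]
+ },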
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
NameType 1Type 2HPAttackDefenseSp. AtkSp. DefSpeedGenerationLegendary
\n", + "
" + ], + "text/plain": [ + "Empty DataFrame\n", + "Columns: [Name, Type 1, Type 2, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed, Generation, Legendary]\n", + "Index: []" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(0)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For HP, t_stat: 8.036124405043928, p_value: 3.330647684846191e-15\n", + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant difference between legendary and non legeendary in HP\n", + "For Attack, t_stat: 10.397321023700622, p_value: 7.827253003205333e-24\n", + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant difference between legendary and non legeendary in Attack\n", + "For Defense, t_stat: 7.181240122992339, p_value: 1.5842226094427255e-12\n", + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant difference between legendary and non legeendary in Defense\n", + "For Sp. Atk, t_stat: 14.191406210846289, p_value: 6.314915770427266e-41\n", + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant difference between legendary and non legeendary in Sp. Atk\n", + "For Sp. Def, t_stat: 11.03775106120522, p_value: 1.8439809580409594e-26\n", + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant difference between legendary and non legeendary in Sp. Def\n", + "For Speed, t_stat: 9.765234331931898, p_value: 2.3540754436898437e-21\n", + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant difference between legendary and non legeendary in Speed\n" + ] + } + ], + "source": [ + "#code here\n", + "#Two_sample-t-Test\n", + "#WE're going to compare lengendary and non-legendary stas one by one to test if the are equal.\n", + "\n", + "alpha = 0.05\n", + "\n", + "stats = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n", + "\n", + "legendary = df[df[\"Legendary\"] == True]\n", + "\n", + "not_legendary = df[df[\"Legendary\"] == False]\n", + "\n", + "for stat in stats:\n", + " t_stat, p_value = st.ttest_ind(legendary[stat], not_legendary[stat])\n", + " print(f\"For {stat}, t_stat: {t_stat}, p_value: {p_value}\")\n", + " \n", + " if p_value < alpha:\n", + " print(f\"We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant difference between legendary and non legeendary in {stat}\")\n", + " else:\n", + " print(f\"We don't have enough data to reject the null hypothesis for {stat}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "stats = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. 
Def\", \"Speed\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "92.73846153846154" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"HP\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "67.18231292517007" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"HP\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "116.67692307692307" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Attack\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "75.66938775510204" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Attack\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "99.66153846153846" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Defense\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "71.55918367346939" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Defense\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "122.18461538461538" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Sp. Atk\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "68.45442176870748" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Sp. Atk\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "105.93846153846154" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Sp. Def\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "68.89251700680272" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Sp. 
Def\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100.18461538461538" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Speed\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "65.45578231292517" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Speed\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 2**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with california-housing data. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0
5-114.5833.6329.01387.0236.0671.0239.03.343874000.0
6-114.5833.6125.02907.0680.01841.0633.02.676882400.0
7-114.5934.8341.0812.0168.0375.0158.01.708348500.0
8-114.5933.6134.04789.01175.03134.01056.02.178258400.0
9-114.6034.8346.01497.0309.0787.0271.02.190848100.0
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "5 -114.58 33.63 29.0 1387.0 236.0 \n", + "6 -114.58 33.61 25.0 2907.0 680.0 \n", + "7 -114.59 34.83 41.0 812.0 168.0 \n", + "8 -114.59 33.61 34.0 4789.0 1175.0 \n", + "9 -114.60 34.83 46.0 1497.0 309.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 \n", + "5 671.0 239.0 3.3438 74000.0 \n", + "6 1841.0 633.0 2.6768 82400.0 \n", + "7 375.0 158.0 1.7083 48500.0 \n", + "8 3134.0 1056.0 2.1782 58400.0 \n", + "9 787.0 271.0 2.1908 48100.0 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We posit that houses close to either a school or a hospital are more expensive.**\n", + "\n", + "- School coordinates (-118, 37)\n", + "- Hospital coordinates (-122, 34)\n", + "\n", + "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", + "\n", + "Hint:\n", + "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", + "- Divide your dataset into houses close and far from either a hospital or school.\n", + "- Choose the propper test and, with 5% significance, comment your findings.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "#Hypothesis:\n", + "#H0: prices of houses near schools or hospitals = houses far from schools or hospitals \n", + "#H1: prices of houses near schools or hospitals != houses far from schools or hospitals" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "#Euclidean distance\n", + "def euc_dist(x1, x2, y1, y2):\n", + " return np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "#Dividing the dataset into houses close and far from EITHER a hospital OR school\n", + "\n", + "school = (-118, 37)\n", + "hospital = (-122, 34)\n", + "\n", + "#distance from school\n", + "df[\"dist_school\"] = df.apply(lambda row: euc_dist(row[\"longitude\"], school[0], row[\"latitude\"], school[1]), axis = 1)\n", + "\n", + "df[\"dist_hospital\"] = df.apply(lambda row: euc_dist(row[\"longitude\"], hospital[0], row[\"latitude\"], hospital[1]), axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valuedist_schooldist_hospital
0-114.3134.1915.05612.01283.01015.0472.01.493666900.04.6381257.692347
1-114.4734.4019.07650.01901.01129.0463.01.820080100.04.3841657.540617
2-114.5633.6917.0720.0174.0333.0117.01.650985700.04.7738567.446456
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "\n", + " population households median_income median_house_value dist_school \\\n", + "0 1015.0 472.0 1.4936 66900.0 4.638125 \n", + "1 1129.0 463.0 1.8200 80100.0 4.384165 \n", + "2 333.0 117.0 1.6509 85700.0 4.773856 \n", + "\n", + " dist_hospital \n", + "0 7.692347 \n", + "1 7.540617 \n", + "2 7.446456 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "longitude float64\n", + "latitude float64\n", + "housing_median_age float64\n", + "total_rooms float64\n", + "total_bedrooms float64\n", + "population float64\n", + "households float64\n", + "median_income float64\n", + "median_house_value float64\n", + "dist_school float64\n", + "dist_hospital float64\n", + "dtype: object" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df[\"close_to_school_hospital\"] = ((df[\"dist_school\"] < 0.5) | (df[\"dist_hospital\"] < 0.5))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "close_to_school_hospital\n", + "False 16995\n", + "True 5\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"close_to_school_hospital\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "#We need to be cautios because we only have 5 datapoints which are near to a hospital or a school." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "alpha = 0.05\n", + "\n", + "close_price = df[df[\"close_to_school_hospital\"] == True][\"median_house_value\"]\n", + "\n", + "far_price = df[df[\"close_to_school_hospital\"] == False][\"median_house_value\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.026799733071128685" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t_stat, p_value = st.ttest_ind(close_price, far_price)\n", + "p_value" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant \n", + " difference between house prices of those near and those far fromm schools or hospitals\n" + ] + } + ], + "source": [ + "if p_value < alpha:\n", + " print(f'''We reject the null hypothesis. 
We can say, with 95% degree of confidence that there is a significant \n", + " difference between house prices of those near and those far fromm schools or hospitals''')\n", + "else: \n", + " print(f\"We don't have enough data to reject the null hypothesis.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/lab-hypothesis-testing.ipynb b/lab-hypothesis-testing.ipynb index 18ad6d5..c1fa91c 100644 --- a/lab-hypothesis-testing.ipynb +++ b/lab-hypothesis-testing.ipynb @@ -45,13 +45,12 @@ "#libraries\n", "import pandas as pd\n", "import scipy.stats as st\n", - "import numpy as np\n", - "\n" + "import numpy as np" ] }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 2, "metadata": {}, "outputs": [ { @@ -278,7 +277,7 @@ "[800 rows x 11 columns]" ] }, - "execution_count": 3, + "execution_count": 2, "metadata": {}, "output_type": "execute_result" } @@ -297,47 +296,107 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ - "#code here" + "#H0: avg dragon HP <= avg grass HP\n", + "#H1: avg dragon HP > avg grass HP" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 4, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "alpha: 0.05, p_value: 0.0002567969150153481\n", + "We can reject the hypothesis with a 95% degree of confidence. THe average HP for dragon pokemons is greater than grass pokemons\n" + ] + } + ], "source": [ - "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. Choose the propper test and, with 5% significance, comment your findings.\n" + "#code here\n", + "#I need to use One tailed t-test\n", + "\n", + "alpha = 0.05\n", + "\n", + "grass_hp = df[df[\"Type 1\"] == \"Grass\"][\"HP\"]\n", + "dragon_hp = df[df[\"Type 1\"] == \"Dragon\"][\"HP\"]\n", + "\n", + "t_stat, p_value = st.ttest_ind(dragon_hp, grass_hp, alternative = \"greater\")\n", + "\n", + "print(f\"alpha: {alpha}, p_value: {p_value}\")\n", + "\n", + "if p_value < alpha:\n", + " print(\"We can reject the hypothesis with a 95% degree of confidence. THe average HP for dragon pokemons is greater than grass pokemons\")\n", + "else:\n", + " print(\"We don't have enough data to be able to reject the null hypothesis.\")" ] }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 5, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "67.27142857142857" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#code here" + "grass_hp.mean()" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 6, "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "83.3125" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "**Challenge 2**" + "dragon_hp.mean()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "In this challenge, we will be working with california-housing data. 
The data can be found here:\n", - "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + "- We posit that Legendary Pokemons have different stats (HP, Attack, Defense, Sp.Atk, Sp.Def, Speed) when comparing with Non-Legendary. Choose the propper test and, with 5% significance, comment your findings.\n" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "#H0: legendary stats = non-legendary stats\n", + "#H1: legendary stats != non-legendary stats" + ] + }, + { + "cell_type": "code", + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -361,144 +420,853 @@ " \n", " \n", " \n", - " longitude\n", - " latitude\n", - " housing_median_age\n", - " total_rooms\n", - " total_bedrooms\n", - " population\n", - " households\n", - " median_income\n", - " median_house_value\n", + " Name\n", + " Type 1\n", + " Type 2\n", + " HP\n", + " Attack\n", + " Defense\n", + " Sp. Atk\n", + " Sp. Def\n", + " Speed\n", + " Generation\n", + " Legendary\n", " \n", " \n", " \n", - " \n", - " 0\n", - " -114.31\n", - " 34.19\n", - " 15.0\n", - " 5612.0\n", - " 1283.0\n", - " 1015.0\n", - " 472.0\n", - " 1.4936\n", - " 66900.0\n", - " \n", - " \n", - " 1\n", - " -114.47\n", - " 34.40\n", - " 19.0\n", - " 7650.0\n", - " 1901.0\n", - " 1129.0\n", - " 463.0\n", - " 1.8200\n", - " 80100.0\n", - " \n", - " \n", - " 2\n", - " -114.56\n", - " 33.69\n", - " 17.0\n", - " 720.0\n", - " 174.0\n", - " 333.0\n", - " 117.0\n", - " 1.6509\n", - " 85700.0\n", - " \n", - " \n", - " 3\n", - " -114.57\n", - " 33.64\n", - " 14.0\n", - " 1501.0\n", - " 337.0\n", - " 515.0\n", - " 226.0\n", - " 3.1917\n", - " 73400.0\n", - " \n", - " \n", - " 4\n", - " -114.57\n", - " 33.57\n", - " 20.0\n", - " 1454.0\n", - " 326.0\n", - " 624.0\n", - " 262.0\n", - " 1.9250\n", - " 65500.0\n", - " \n", " \n", "\n", "" ], "text/plain": [ - " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", - "0 -114.31 34.19 15.0 5612.0 1283.0 \n", - "1 -114.47 34.40 19.0 7650.0 1901.0 \n", - "2 -114.56 33.69 17.0 720.0 174.0 \n", - "3 -114.57 33.64 14.0 1501.0 337.0 \n", - "4 -114.57 33.57 20.0 1454.0 326.0 \n", - "\n", - " population households median_income median_house_value \n", - "0 1015.0 472.0 1.4936 66900.0 \n", - "1 1129.0 463.0 1.8200 80100.0 \n", - "2 333.0 117.0 1.6509 85700.0 \n", - "3 515.0 226.0 3.1917 73400.0 \n", - "4 624.0 262.0 1.9250 65500.0 " + "Empty DataFrame\n", + "Columns: [Name, Type 1, Type 2, HP, Attack, Defense, Sp. Atk, Sp. Def, Speed, Generation, Legendary]\n", + "Index: []" ] }, - "execution_count": 5, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", - "df.head()" + "df.head(0)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 9, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "For HP, t_stat: 8.036124405043928, p_value: 3.330647684846191e-15\n", + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant difference between legendary and non legeendary in HP\n", + "For Attack, t_stat: 10.397321023700622, p_value: 7.827253003205333e-24\n", + "We reject the null hypothesis. 
We can say, with a 95% degree of confidence, that there is a significant difference between legendary and non-legendary in Attack\n",
+ "For Defense, t_stat: 7.181240122992339, p_value: 1.5842226094427255e-12\n",
+ "We reject the null hypothesis. We can say, with a 95% degree of confidence, that there is a significant difference between legendary and non-legendary in Defense\n",
+ "For Sp. Atk, t_stat: 14.191406210846289, p_value: 6.314915770427266e-41\n",
+ "We reject the null hypothesis. We can say, with a 95% degree of confidence, that there is a significant difference between legendary and non-legendary in Sp. Atk\n",
+ "For Sp. Def, t_stat: 11.03775106120522, p_value: 1.8439809580409594e-26\n",
+ "We reject the null hypothesis. We can say, with a 95% degree of confidence, that there is a significant difference between legendary and non-legendary in Sp. Def\n",
+ "For Speed, t_stat: 9.765234331931898, p_value: 2.3540754436898437e-21\n",
+ "We reject the null hypothesis. We can say, with a 95% degree of confidence, that there is a significant difference between legendary and non-legendary in Speed\n"
+ ]
+ }
+ ],
+ "source": [
+ "#code here\n",
+ "# Two-sample t-test\n",
+ "# We're going to compare legendary and non-legendary stats one by one to test whether they are equal.\n",
+ "\n",
+ "alpha = 0.05\n",
+ "\n",
+ "stats = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. Def\", \"Speed\"]\n",
+ "\n",
+ "legendary = df[df[\"Legendary\"] == True]\n",
+ "\n",
+ "not_legendary = df[df[\"Legendary\"] == False]\n",
+ "\n",
+ "for stat in stats:\n",
+ "    t_stat, p_value = st.ttest_ind(legendary[stat], not_legendary[stat])\n",
+ "    print(f\"For {stat}, t_stat: {t_stat}, p_value: {p_value}\")\n",
+ "    \n",
+ "    if p_value < alpha:\n",
+ "        print(f\"We reject the null hypothesis. We can say, with a 95% degree of confidence, that there is a significant difference between legendary and non-legendary in {stat}\")\n",
+ "    else:\n",
+ "        print(f\"We don't have enough evidence to reject the null hypothesis for {stat}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "stats = [\"HP\", \"Attack\", \"Defense\", \"Sp. Atk\", \"Sp. 
Def\", \"Speed\"]\n" + ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/plain": [ + "92.73846153846154" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"HP\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "67.18231292517007" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"HP\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "116.67692307692307" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Attack\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "75.66938775510204" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Attack\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "99.66153846153846" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Defense\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "71.55918367346939" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Defense\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "122.18461538461538" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Sp. Atk\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "68.45442176870748" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Sp. Atk\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "105.93846153846154" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Sp. Def\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "68.89251700680272" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Sp. 
Def\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "100.18461538461538" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "legendary[\"Speed\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "65.45578231292517" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "not_legendary[\"Speed\"].mean()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Challenge 2**" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "In this challenge, we will be working with california-housing data. The data can be found here:\n", + "- https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_value
0-114.3134.1915.05612.01283.01015.0472.01.493666900.0
1-114.4734.4019.07650.01901.01129.0463.01.820080100.0
2-114.5633.6917.0720.0174.0333.0117.01.650985700.0
3-114.5733.6414.01501.0337.0515.0226.03.191773400.0
4-114.5733.5720.01454.0326.0624.0262.01.925065500.0
5-114.5833.6329.01387.0236.0671.0239.03.343874000.0
6-114.5833.6125.02907.0680.01841.0633.02.676882400.0
7-114.5934.8341.0812.0168.0375.0158.01.708348500.0
8-114.5933.6134.04789.01175.03134.01056.02.178258400.0
9-114.6034.8346.01497.0309.0787.0271.02.190848100.0
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "3 -114.57 33.64 14.0 1501.0 337.0 \n", + "4 -114.57 33.57 20.0 1454.0 326.0 \n", + "5 -114.58 33.63 29.0 1387.0 236.0 \n", + "6 -114.58 33.61 25.0 2907.0 680.0 \n", + "7 -114.59 34.83 41.0 812.0 168.0 \n", + "8 -114.59 33.61 34.0 4789.0 1175.0 \n", + "9 -114.60 34.83 46.0 1497.0 309.0 \n", + "\n", + " population households median_income median_house_value \n", + "0 1015.0 472.0 1.4936 66900.0 \n", + "1 1129.0 463.0 1.8200 80100.0 \n", + "2 333.0 117.0 1.6509 85700.0 \n", + "3 515.0 226.0 3.1917 73400.0 \n", + "4 624.0 262.0 1.9250 65500.0 \n", + "5 671.0 239.0 3.3438 74000.0 \n", + "6 1841.0 633.0 2.6768 82400.0 \n", + "7 375.0 158.0 1.7083 48500.0 \n", + "8 3134.0 1056.0 2.1782 58400.0 \n", + "9 787.0 271.0 2.1908 48100.0 " + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df = pd.read_csv(\"https://raw.githubusercontent.com/data-bootcamp-v4/data/main/california_housing.csv\")\n", + "df.head(10)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**We posit that houses close to either a school or a hospital are more expensive.**\n", + "\n", + "- School coordinates (-118, 37)\n", + "- Hospital coordinates (-122, 34)\n", + "\n", + "We consider a house (neighborhood) to be close to a school or hospital if the distance is lower than 0.50.\n", + "\n", + "Hint:\n", + "- Write a function to calculate euclidean distance from each house (neighborhood) to the school and to the hospital.\n", + "- Divide your dataset into houses close and far from either a hospital or school.\n", + "- Choose the propper test and, with 5% significance, comment your findings.\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "#Hypothesis:\n", + "#H0: prices of houses near schools or hospitals = houses far from schools or hospitals \n", + "#H1: prices of houses near schools or hospitals != houses far from schools or hospitals" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "#Euclidean distance\n", + "def euc_dist(x1, x2, y1, y2):\n", + " return np.sqrt((x2 - x1) ** 2 + (y2 - y1) ** 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "#Dividing the dataset into houses close and far from EITHER a hospital OR school\n", + "\n", + "school = (-118, 37)\n", + "hospital = (-122, 34)\n", + "\n", + "#distance from school\n", + "df[\"dist_school\"] = df.apply(lambda row: euc_dist(row[\"longitude\"], school[0], row[\"latitude\"], school[1]), axis = 1)\n", + "\n", + "df[\"dist_hospital\"] = df.apply(lambda row: euc_dist(row[\"longitude\"], hospital[0], row[\"latitude\"], hospital[1]), axis = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 27, + "metadata": { + "scrolled": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
longitudelatitudehousing_median_agetotal_roomstotal_bedroomspopulationhouseholdsmedian_incomemedian_house_valuedist_schooldist_hospital
0-114.3134.1915.05612.01283.01015.0472.01.493666900.04.6381257.692347
1-114.4734.4019.07650.01901.01129.0463.01.820080100.04.3841657.540617
2-114.5633.6917.0720.0174.0333.0117.01.650985700.04.7738567.446456
\n", + "
" + ], + "text/plain": [ + " longitude latitude housing_median_age total_rooms total_bedrooms \\\n", + "0 -114.31 34.19 15.0 5612.0 1283.0 \n", + "1 -114.47 34.40 19.0 7650.0 1901.0 \n", + "2 -114.56 33.69 17.0 720.0 174.0 \n", + "\n", + " population households median_income median_house_value dist_school \\\n", + "0 1015.0 472.0 1.4936 66900.0 4.638125 \n", + "1 1129.0 463.0 1.8200 80100.0 4.384165 \n", + "2 333.0 117.0 1.6509 85700.0 4.773856 \n", + "\n", + " dist_hospital \n", + "0 7.692347 \n", + "1 7.540617 \n", + "2 7.446456 " + ] + }, + "execution_count": 27, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head(3)" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "longitude float64\n", + "latitude float64\n", + "housing_median_age float64\n", + "total_rooms float64\n", + "total_bedrooms float64\n", + "population float64\n", + "households float64\n", + "median_income float64\n", + "median_house_value float64\n", + "dist_school float64\n", + "dist_hospital float64\n", + "dtype: object" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.dtypes" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "df[\"close_to_school_hospital\"] = ((df[\"dist_school\"] < 0.5) | (df[\"dist_hospital\"] < 0.5))" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "close_to_school_hospital\n", + "False 16995\n", + "True 5\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"close_to_school_hospital\"].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [], + "source": [ + "#We need to be cautios because we only have 5 datapoints which are near to a hospital or a school." + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "alpha = 0.05\n", + "\n", + "close_price = df[df[\"close_to_school_hospital\"] == True][\"median_house_value\"]\n", + "\n", + "far_price = df[df[\"close_to_school_hospital\"] == False][\"median_house_value\"]\n" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0.026799733071128685" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "t_stat, p_value = st.ttest_ind(close_price, far_price)\n", + "p_value" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "We reject the null hypothesis. We can say, with 95% degree of confidence that there is a significant \n", + " difference between house prices of those near and those far fromm schools or hospitals\n" + ] + } + ], + "source": [ + "if p_value < alpha:\n", + " print(f'''We reject the null hypothesis. 
We can say, with a 95% degree of confidence, that there is a significant \n",
+ "    difference between house prices of those near and those far from schools or hospitals''')\n",
+ "else: \n",
+ "    print(f\"We don't have enough evidence to reject the null hypothesis.\")"
+ ]
+ }
 ],
 "metadata": {
 "kernelspec": {
- "display_name": "Python 3",
+ "display_name": "Python 3 (ipykernel)",
 "language": "python",
 "name": "python3"
 },
 "language_info": {
@@ -512,7 +1280,7 @@
 "name": "python",
 "nbconvert_exporter": "python",
 "pygments_lexer": "ipython3",
- "version": "3.10.9"
+ "version": "3.11.7"
 }
 },
 "nbformat": 4,
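
Reviewer note (appended by the editor, not part of the patch itself): the
close-vs-far comparison in Challenge 2 rests on only 5 "close" tracts against
16,995 "far" ones, and the stated claim ("more expensive") is directional,
while the test that was run is two-sided. A hedged alternative sketch, reusing
close_price and far_price exactly as built in the notebook:

    import scipy.stats as st

    # One-sided Mann-Whitney U test: rank-based, so it assumes neither
    # normality nor equal variances, which matters with only 5 observations
    # in one group.
    u_stat, p_value = st.mannwhitneyu(close_price, far_price, alternative="greater")
    print(f"U = {u_stat}, one-sided p = {p_value:.4f}")

With n = 5 any p-value is fragile; widening the 0.50-degree radius, or simply
reporting the two group means with a caveat, may be more informative than the
choice of test alone.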