From 48acb60e7a7fd7b9fe5b8668659e1f3cf28334c7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carlota=20Gordillo=20=C3=81lvarez?= Date: Sat, 5 Oct 2024 13:00:26 +0200 Subject: [PATCH] first commit --- lab-dw-pandas.ipynb | 2115 ++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 2101 insertions(+), 14 deletions(-) diff --git a/lab-dw-pandas.ipynb b/lab-dw-pandas.ipynb index fbd468314..e0c23a00b 100644 --- a/lab-dw-pandas.ipynb +++ b/lab-dw-pandas.ipynb @@ -82,12 +82,1201 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "id": "d807707b", + "metadata": {}, + "outputs": [], + "source": [ + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 4, "id": "dd4e8cd8-a6f6-486c-a5c4-1745b0c035f4", "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "# Input the data set \n", + "\n", + "url = 'https://raw.githubusercontent.com/data-bootcamp-v4/data/main/file1.csv'\n", + "df = pd.read_csv(url)" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "dbfafc8f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "f9273a62", + "metadata": {}, + "source": [ + "- Identify the dimensions of the dataset by determining the number of rows and columns it contains." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "424441f8", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of rows: 4008\n", + "Number of columns: 11\n" + ] + } + ], + "source": [ + "df.shape\n", + "print(f\"Number of rows: {df.shape[0]}\")\n", + "print(f\"Number of columns: {df.shape[1]}\")" + ] + }, + { + "cell_type": "markdown", + "id": "f2988285", + "metadata": {}, + "source": [ + "- Determine the data types of each column and evaluate whether they are appropriate for the nature of the variable. You should also provide suggestions for fixing any incorrect data types." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "02d94c0b", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 4008 entries, 0 to 4007\n", + "Data columns (total 11 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 Customer 1071 non-null object \n", + " 1 ST 1071 non-null object \n", + " 2 GENDER 954 non-null object \n", + " 3 Education 1071 non-null object \n", + " 4 Customer Lifetime Value 1068 non-null object \n", + " 5 Income 1071 non-null float64\n", + " 6 Monthly Premium Auto 1071 non-null float64\n", + " 7 Number of Open Complaints 1071 non-null object \n", + " 8 Policy Type 1071 non-null object \n", + " 9 Vehicle Class 1071 non-null object \n", + " 10 Total Claim Amount 1071 non-null float64\n", + "dtypes: float64(3), object(8)\n", + "memory usage: 344.6+ KB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "1b2d3c0b", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Customer object\n", + "ST object\n", + "GENDER object\n", + "Education object\n", + "Customer Lifetime Value object\n", + "Income float64\n", + "Monthly Premium Auto float64\n", + "Number of Open Complaints object\n", + "Policy 
Type object\n", + "Vehicle Class object\n", + "Total Claim Amount float64\n", + "dtype: object" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# types of data\n", + "df.dtypes" + ] + }, + { + "cell_type": "markdown", + "id": "4d35b079", + "metadata": {}, + "source": [ + "- Customer -> Object (Correct)\n", + "- ST -> Object (Correct)\n", + "- GENDER -> Object (Correct)\n", + "- Education -> Object (Correct)\n", + "- Customer Lifetime Value -> Object (Not correct: the values are stored as strings with a trailing '%', e.g. '697953.59%'). In my opinion, this should be converted to float (after removing the '%') for later calculations\n", + "- Income float64 (Correct)\n", + "- Monthly Premium Auto float64 (Correct)\n", + "- Number of Open Complaints -> Object (Not correct: the values are strings such as '1/0/00'; presumably the middle number is the complaint count and should be extracted as an integer)\n", + "- Policy Type object (Correct)\n", + "- Vehicle Class object (Correct)\n", + "- Total Claim Amount float64 (Correct)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "70db2edc", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['1/0/00', '1/2/00', '1/1/00', '1/3/00', '1/5/00', '1/4/00', nan],\n", + " dtype=object)" + ] + }, + "execution_count": 17, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Number of Open Complaints\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "ed13492d", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.info" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "9f765875", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
0RB50392WashingtonNaNMasterNaN0.01000.01/0/00Personal AutoFour-Door Car2.704934
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
3WW63253CaliforniaMBachelor764586.18%0.0106.01/0/00Corporate AutoSUV529.881344
4GA49547WashingtonMHigh School or Below536307.65%36357.068.01/0/00Personal AutoFour-Door Car17.269323
....................................
4003NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4004NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4005NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4006NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
4007NaNNaNNaNNaNNaNNaNNaNNaNNaNNaNNaN
\n", + "

4008 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education \\\n", + "0 RB50392 Washington NaN Master \n", + "1 QZ44356 Arizona F Bachelor \n", + "2 AI49188 Nevada F Bachelor \n", + "3 WW63253 California M Bachelor \n", + "4 GA49547 Washington M High School or Below \n", + "... ... ... ... ... \n", + "4003 NaN NaN NaN NaN \n", + "4004 NaN NaN NaN NaN \n", + "4005 NaN NaN NaN NaN \n", + "4006 NaN NaN NaN NaN \n", + "4007 NaN NaN NaN NaN \n", + "\n", + " Customer Lifetime Value Income Monthly Premium Auto \\\n", + "0 NaN 0.0 1000.0 \n", + "1 697953.59% 0.0 94.0 \n", + "2 1288743.17% 48767.0 108.0 \n", + "3 764586.18% 0.0 106.0 \n", + "4 536307.65% 36357.0 68.0 \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \\\n", + "0 1/0/00 Personal Auto Four-Door Car \n", + "1 1/0/00 Personal Auto Four-Door Car \n", + "2 1/0/00 Personal Auto Two-Door Car \n", + "3 1/0/00 Corporate Auto SUV \n", + "4 1/0/00 Personal Auto Four-Door Car \n", + "... ... ... ... \n", + "4003 NaN NaN NaN \n", + "4004 NaN NaN NaN \n", + "4005 NaN NaN NaN \n", + "4006 NaN NaN NaN \n", + "4007 NaN NaN NaN \n", + "\n", + " Total Claim Amount \n", + "0 2.704934 \n", + "1 1131.464935 \n", + "2 566.472247 \n", + "3 529.881344 \n", + "4 17.269323 \n", + "... ... \n", + "4003 NaN \n", + "4004 NaN \n", + "4005 NaN \n", + "4006 NaN \n", + "4007 NaN \n", + "\n", + "[4008 rows x 11 columns]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df" + ] + }, + { + "cell_type": "markdown", + "id": "fe2c79f9", + "metadata": {}, + "source": [ + "- Identify the number of unique values for each column and determine which columns appear to be categorical. You should also describe the unique values of each categorical column and the range of values for numerical columns, and give your insights." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "39e32c3a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Customer 1071\n", + "ST 8\n", + "GENDER 5\n", + "Education 6\n", + "Customer Lifetime Value 1027\n", + "Income 774\n", + "Monthly Premium Auto 132\n", + "Number of Open Complaints 6\n", + "Policy Type 3\n", + "Vehicle Class 6\n", + "Total Claim Amount 761\n", + "dtype: int64" + ] + }, + "execution_count": 18, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Number of unique values for each column\n", + "df.nunique()" + ] + }, + { + "cell_type": "markdown", + "id": "0efe1f2e", + "metadata": {}, + "source": [ + "Columnas categoricas -> Customer, ST, GENDER, Education, Policy Type, Vehicle Class" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "id": "47e89b48", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['RB50392', 'QZ44356', 'AI49188', ..., 'CW49887', 'MY31220', nan],\n", + " dtype=object)" + ] + }, + "execution_count": 19, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Customer\"].unique() # hace referencia al código del cliente y por tanto, hay uno diferente para cada uno" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "57fda049", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Washington', 'Arizona', 'Nevada', 'California', 'Oregon', 'Cali',\n", + " 'AZ', 'WA', nan], dtype=object)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"ST\"].unique() #Hace referencia al estado en el que vive el cliente y hay 8" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "31277ff8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([nan, 'F', 'M', 'Femal', 'Male', 'female'], dtype=object)" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "df[\"GENDER\"].unique() #Es el genero de la persona, deberia haber unicamente dos. Hay un problema con la notacion" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "id": "a57af54f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Master', 'Bachelor', 'High School or Below', 'College',\n", + " 'Bachelors', 'Doctor', nan], dtype=object)" + ] + }, + "execution_count": 24, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Education\"].unique() #hay un problema con la notacion tambien, hay repetidas" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "id": "e38bcbec", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Personal Auto', 'Corporate Auto', 'Special Auto', nan],\n", + " dtype=object)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Policy Type\"].unique() #tipo de poliza de seguro, hay 3" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d9167648", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Four-Door Car', 'Two-Door Car', 'SUV', 'Luxury SUV', 'Sports Car',\n", + " 'Luxury Car', nan], dtype=object)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Vehicle Class\"].unique() #tipo de vehiculo" + ] + }, + { + "cell_type": "markdown", + "id": "03bd14c5", + "metadata": {}, + "source": [ + "- Compute summary statistics such as mean, median, mode, standard deviation, and quartiles to understand the central tendency and distribution of the data for numerical columns. You should also provide your conclusions based on these summary statistics." 
+ ] + }, + { + "cell_type": "markdown", + "id": "f2bc8479", + "metadata": {}, + "source": [ + "El estudio descriptivo solo lo podemos hacer de las variables cuantitativas y no de las cualitativas. En el resumen, podemos observar que la media de \"Income\" es 39295.701214 y también que, en este caso, el máximo será un outlier, pues hay mucha diferencia entre la media y ese valor. Además, podemos reconocer que el 75% de los usuarios está por debajo de un valor mucho más pequeño. " + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "id": "e51329d6", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
IncomeMonthly Premium AutoTotal Claim Amount
count1071.0000001071.0000001071.000000
mean39295.701214193.234360404.986909
std30469.4270601601.190369293.027260
min0.00000061.0000000.382107
25%14072.00000068.000000202.157702
50%36234.00000083.000000354.729129
75%64631.000000109.500000532.800000
max99960.00000035354.0000002893.239678
\n", + "
" + ], + "text/plain": [ + " Income Monthly Premium Auto Total Claim Amount\n", + "count 1071.000000 1071.000000 1071.000000\n", + "mean 39295.701214 193.234360 404.986909\n", + "std 30469.427060 1601.190369 293.027260\n", + "min 0.000000 61.000000 0.382107\n", + "25% 14072.000000 68.000000 202.157702\n", + "50% 36234.000000 83.000000 354.729129\n", + "75% 64631.000000 109.500000 532.800000\n", + "max 99960.000000 35354.000000 2893.239678" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe(include=\"float\")" + ] + }, + { + "cell_type": "markdown", + "id": "c26659ba", + "metadata": {}, + "source": [ + "- Compute summary statistics for categorical columns and providing your conclusions based on these statistics." + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "id": "b967342f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueNumber of Open ComplaintsPolicy TypeVehicle Class
count1071107195410711068107110711071
unique10718561027636
topRB50392OregonFBachelor445811.34%1/0/00Personal AutoFour-Door Car
freq13204573244830780576
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "count 1071 1071 954 1071 1068 \n", + "unique 1071 8 5 6 1027 \n", + "top RB50392 Oregon F Bachelor 445811.34% \n", + "freq 1 320 457 324 4 \n", + "\n", + " Number of Open Complaints Policy Type Vehicle Class \n", + "count 1071 1071 1071 \n", + "unique 6 3 6 \n", + "top 1/0/00 Personal Auto Four-Door Car \n", + "freq 830 780 576 " + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe(include = \"object\")" + ] + }, + { + "cell_type": "markdown", + "id": "fa700686", + "metadata": {}, + "source": [ + "La mayoría de los clientes son mujeres, el nivel de educación más frecuente es Bachelor, el valor más frecuente de quejas abiertas es '1/0/00' (presumiblemente 0 quejas, ya que solo cambia el número central), poseen coches de cuatro puertas y su póliza es de tipo Personal Auto" ] }, { @@ -116,12 +1305,31 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 43, "id": "2dca5073-4520-4f42-9390-4b92733284ed", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "ST\n", + "AZ 25\n", + "WA 30\n", + "Washington 81\n", + "Nevada 98\n", + "Cali 120\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "less_common_location = pd.Series(df[\"ST\"])\n", + "top_5 = less_common_location.value_counts().sort_values(ascending=True)[:5]\n", + "top_5" ] }, { @@ -146,12 +1354,27 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "id": "bcfad6c1-9af2-4b0b-9aa9-0dc5c17473c0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "Policy Type\n", + "Personal Auto 780\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + 
"policy_types = pd.Series(df[\"Policy Type\"]).value_counts().sort_values(ascending=False)\n", +
"max_number_policies_sold = policy_types[:1]\n", + "max_number_policies_sold" ] }, { @@ -176,12 +1399,68 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 53, + "id": "fa0b637f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Personal Auto', 'Corporate Auto', 'Special Auto', nan],\n", + " dtype=object)" + ] + }, + "execution_count": 53, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[\"Policy Type\"].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 68, "id": "0c0563cf-6f8b-463d-a321-651a972f82e5", "metadata": {}, "outputs": [], "source": [ - "# Your code here" + "personal = df[df[\"Policy Type\"] == \"Personal Auto\"]\n", + "average_personal = personal[\"Income\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "id": "6813e0b7", + "metadata": {}, + "outputs": [], + "source": [ + "corporate = df[df[\"Policy Type\"] == \"Corporate Auto\"]\n", + "average_corporate = corporate[\"Income\"].mean()" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "id": "56af80db", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Average income Personal Policy's costumer: 38180.69871794872\n", + "Average Income Corporate Policy's costumer: 41390.31196581197\n" + ] + } + ], + "source": [ + "print(f\"Average income Personal Policy's costumer: {average_personal}\")\n", + "print(f\"Average Income Corporate Policy's costumer: {average_corporate}\")\n", + "\n", + "# En media, el salario de los usarios que tiene una poliza corporte es mayor que de aquellos que tienen una personal" ] }, { @@ -224,15 +1503,823 @@ "*Hint 2: check `Boolean selection according to the values of a single column` in https://towardsdatascience.com/filtering-data-frames-in-pandas-b570b1f834b9*" ] }, + { + "cell_type": "markdown", + "id": "5e04819e", + "metadata": {}, + "source": [ + ">75% - > 532.800000" + ] + }, { "cell_type": 
"code", - "execution_count": null, + "execution_count": 81, + "id": "6cc4cf9c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "532.8" + ] + }, + "execution_count": 81, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "quantile_75 = df[\"Total Claim Amount\"].quantile(0.75)\n", + "quantile_75" + ] + }, + { + "cell_type": "code", + "execution_count": 72, "id": "b731bca6-a760-4860-a27b-a33efa712ce0", "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
count10711071954107110681071.0000001071.0000001071107110711071.000000
unique10718561027NaNNaN636NaN
topRB50392OregonFBachelor445811.34%NaNNaN1/0/00Personal AutoFour-Door CarNaN
freq13204573244NaNNaN830780576NaN
meanNaNNaNNaNNaNNaN39295.701214193.234360NaNNaNNaN404.986909
stdNaNNaNNaNNaNNaN30469.4270601601.190369NaNNaNNaN293.027260
minNaNNaNNaNNaNNaN0.00000061.000000NaNNaNNaN0.382107
25%NaNNaNNaNNaNNaN14072.00000068.000000NaNNaNNaN202.157702
50%NaNNaNNaNNaNNaN36234.00000083.000000NaNNaNNaN354.729129
75%NaNNaNNaNNaNNaN64631.000000109.500000NaNNaNNaN532.800000
maxNaNNaNNaNNaNNaN99960.00000035354.000000NaNNaNNaN2893.239678
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "count 1071 1071 954 1071 1068 \n", + "unique 1071 8 5 6 1027 \n", + "top RB50392 Oregon F Bachelor 445811.34% \n", + "freq 1 320 457 324 4 \n", + "mean NaN NaN NaN NaN NaN \n", + "std NaN NaN NaN NaN NaN \n", + "min NaN NaN NaN NaN NaN \n", + "25% NaN NaN NaN NaN NaN \n", + "50% NaN NaN NaN NaN NaN \n", + "75% NaN NaN NaN NaN NaN \n", + "max NaN NaN NaN NaN NaN \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints \\\n", + "count 1071.000000 1071.000000 1071 \n", + "unique NaN NaN 6 \n", + "top NaN NaN 1/0/00 \n", + "freq NaN NaN 830 \n", + "mean 39295.701214 193.234360 NaN \n", + "std 30469.427060 1601.190369 NaN \n", + "min 0.000000 61.000000 NaN \n", + "25% 14072.000000 68.000000 NaN \n", + "50% 36234.000000 83.000000 NaN \n", + "75% 64631.000000 109.500000 NaN \n", + "max 99960.000000 35354.000000 NaN \n", + "\n", + " Policy Type Vehicle Class Total Claim Amount \n", + "count 1071 1071 1071.000000 \n", + "unique 3 6 NaN \n", + "top Personal Auto Four-Door Car NaN \n", + "freq 780 576 NaN \n", + "mean NaN NaN 404.986909 \n", + "std NaN NaN 293.027260 \n", + "min NaN NaN 0.382107 \n", + "25% NaN NaN 202.157702 \n", + "50% NaN NaN 354.729129 \n", + "75% NaN NaN 532.800000 \n", + "max NaN NaN 2893.239678 " + ] + }, + "execution_count": 72, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "# Your code here" + "df.describe(include='all')" ] + }, + { + "cell_type": "code", + "execution_count": 82, + "id": "f5ee6f8a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
1QZ44356ArizonaFBachelor697953.59%0.094.01/0/00Personal AutoFour-Door Car1131.464935
2AI49188NevadaFBachelor1288743.17%48767.0108.01/0/00Personal AutoTwo-Door Car566.472247
17OE15005CaliNaNCollege394524.16%28855.0101.01/0/00Personal AutoSUV647.442031
23TZ98966NevadaNaNBachelor245019.10%0.073.01/3/00Corporate AutoFour-Door Car554.376763
26US89481CaliforniaNaNBachelor394637.21%0.0111.01/0/00Personal AutoFour-Door Car799.200000
....................................
1059YG44474OregonMCollege1401472.13%54193.0117.01/0/00Corporate AutoSUV720.752945
1061RY92647CaliFBachelor1050677.17%0.092.01/0/00Personal AutoFour-Door Car546.524896
1068GS98873ArizonaFBachelor323912.47%16061.088.01/0/00Personal AutoFour-Door Car633.600000
1069CW49887CaliforniaFMaster462680.11%79487.0114.01/0/00Special AutoSUV547.200000
1070MY31220CaliforniaFCollege899704.02%54230.0112.01/0/00Personal AutoTwo-Door Car537.600000
\n", + "

264 rows × 11 columns

\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value Income \\\n", + "1 QZ44356 Arizona F Bachelor 697953.59% 0.0 \n", + "2 AI49188 Nevada F Bachelor 1288743.17% 48767.0 \n", + "17 OE15005 Cali NaN College 394524.16% 28855.0 \n", + "23 TZ98966 Nevada NaN Bachelor 245019.10% 0.0 \n", + "26 US89481 California NaN Bachelor 394637.21% 0.0 \n", + "... ... ... ... ... ... ... \n", + "1059 YG44474 Oregon M College 1401472.13% 54193.0 \n", + "1061 RY92647 Cali F Bachelor 1050677.17% 0.0 \n", + "1068 GS98873 Arizona F Bachelor 323912.47% 16061.0 \n", + "1069 CW49887 California F Master 462680.11% 79487.0 \n", + "1070 MY31220 California F College 899704.02% 54230.0 \n", + "\n", + " Monthly Premium Auto Number of Open Complaints Policy Type \\\n", + "1 94.0 1/0/00 Personal Auto \n", + "2 108.0 1/0/00 Personal Auto \n", + "17 101.0 1/0/00 Personal Auto \n", + "23 73.0 1/3/00 Corporate Auto \n", + "26 111.0 1/0/00 Personal Auto \n", + "... ... ... ... \n", + "1059 117.0 1/0/00 Corporate Auto \n", + "1061 92.0 1/0/00 Personal Auto \n", + "1068 88.0 1/0/00 Personal Auto \n", + "1069 114.0 1/0/00 Special Auto \n", + "1070 112.0 1/0/00 Personal Auto \n", + "\n", + " Vehicle Class Total Claim Amount \n", + "1 Four-Door Car 1131.464935 \n", + "2 Two-Door Car 566.472247 \n", + "17 SUV 647.442031 \n", + "23 Four-Door Car 554.376763 \n", + "26 Four-Door Car 799.200000 \n", + "... ... ... 
\n", + "1059 SUV 720.752945 \n", + "1061 Four-Door Car 546.524896 \n", + "1068 Four-Door Car 633.600000 \n", + "1069 SUV 547.200000 \n", + "1070 Two-Door Car 537.600000 \n", + "\n", + "[264 rows x 11 columns]" + ] + }, + "execution_count": 82, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_claim = df[df[\"Total Claim Amount\"]>quantile_75]\n", + "max_claim #aqui estan los clientes que tiene un total Claim Amount superior al 75%" + ] + }, + { + "cell_type": "code", + "execution_count": 76, + "id": "62d7c63e", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CustomerSTGENDEREducationCustomer Lifetime ValueIncomeMonthly Premium AutoNumber of Open ComplaintsPolicy TypeVehicle ClassTotal Claim Amount
count264264238264264264.000000264.000000264264264264.000000
unique264755256NaNNaN636NaN
topQZ44356OregonFBachelor578018.22%NaNNaN1/0/00Personal AutoSUVNaN
freq190115853NaNNaN206191101NaN
meanNaNNaNNaNNaNNaN23677.344697165.193182NaNNaNNaN782.228263
stdNaNNaNNaNNaNNaN27013.483721623.930992NaNNaNNaN292.751640
minNaNNaNNaNNaNNaN0.00000063.000000NaNNaNNaN537.600000
25%NaNNaNNaNNaNNaN0.00000099.000000NaNNaNNaN606.521741
50%NaNNaNNaNNaNNaN18807.000000114.000000NaNNaNNaN679.597985
75%NaNNaNNaNNaNNaN42423.750000133.250000NaNNaNNaN851.400000
maxNaNNaNNaNNaNNaN99316.00000010202.000000NaNNaNNaN2893.239678
\n", + "
" + ], + "text/plain": [ + " Customer ST GENDER Education Customer Lifetime Value \\\n", + "count 264 264 238 264 264 \n", + "unique 264 7 5 5 256 \n", + "top QZ44356 Oregon F Bachelor 578018.22% \n", + "freq 1 90 115 85 3 \n", + "mean NaN NaN NaN NaN NaN \n", + "std NaN NaN NaN NaN NaN \n", + "min NaN NaN NaN NaN NaN \n", + "25% NaN NaN NaN NaN NaN \n", + "50% NaN NaN NaN NaN NaN \n", + "75% NaN NaN NaN NaN NaN \n", + "max NaN NaN NaN NaN NaN \n", + "\n", + " Income Monthly Premium Auto Number of Open Complaints \\\n", + "count 264.000000 264.000000 264 \n", + "unique NaN NaN 6 \n", + "top NaN NaN 1/0/00 \n", + "freq NaN NaN 206 \n", + "mean 23677.344697 165.193182 NaN \n", + "std 27013.483721 623.930992 NaN \n", + "min 0.000000 63.000000 NaN \n", + "25% 0.000000 99.000000 NaN \n", + "50% 18807.000000 114.000000 NaN \n", + "75% 42423.750000 133.250000 NaN \n", + "max 99316.000000 10202.000000 NaN \n", + "\n", + " Policy Type Vehicle Class Total Claim Amount \n", + "count 264 264 264.000000 \n", + "unique 3 6 NaN \n", + "top Personal Auto SUV NaN \n", + "freq 191 101 NaN \n", + "mean NaN NaN 782.228263 \n", + "std NaN NaN 292.751640 \n", + "min NaN NaN 537.600000 \n", + "25% NaN NaN 606.521741 \n", + "50% NaN NaN 679.597985 \n", + "75% NaN NaN 851.400000 \n", + "max NaN NaN 2893.239678 " + ] + }, + "execution_count": 76, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_claim.describe(include=\"all\")" + ] + }, + { + "cell_type": "code", + "execution_count": 83, + "id": "ac8fbd36", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(264, 11)" + ] + }, + "execution_count": 83, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "max_claim.shape" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "319705ac", + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "0fa81645", + "metadata": {}, + "outputs": [], + 
"source": [] } ], "metadata": { @@ -251,7 +2338,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.13" + "version": "3.11.7" } }, "nbformat": 4,