From 5f99a28d10da4bc175c7fffeeb9654f1bddba852 Mon Sep 17 00:00:00 2001 From: Oriol Codina Vallori Date: Thu, 19 Oct 2023 16:59:20 +0200 Subject: [PATCH] lab done --- code/pandas_1_concat-merge-join.ipynb | 649 +++++++++++++++++++++----- 1 file changed, 545 insertions(+), 104 deletions(-) diff --git a/code/pandas_1_concat-merge-join.ipynb b/code/pandas_1_concat-merge-join.ipynb index c66e580..17312e7 100644 --- a/code/pandas_1_concat-merge-join.ipynb +++ b/code/pandas_1_concat-merge-join.ipynb @@ -2,11 +2,12 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 52, "metadata": {}, "outputs": [], "source": [ - "import pandas as pd" + "import pandas as pd\n", + "import numpy as np" ] }, { @@ -41,16 +42,7 @@ }, { "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "code", - "execution_count": 3, + "execution_count": 67, "metadata": { "scrolled": true }, @@ -114,7 +106,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -194,7 +186,7 @@ "5 a5 b5 c5" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -205,10 +197,94 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DEF
0d0e0f0
1d1e1f1
2d2e2f2
3d3e3f3
4d4e4f4
5d5e5f5
\n", + "
" + ], + "text/plain": [ + " D E F\n", + "0 d0 e0 f0\n", + "1 d1 e1 f1\n", + "2 d2 e2 f2\n", + "3 d3 e3 f3\n", + "4 d4 e4 f4\n", + "5 d5 e5 f5" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df3,df4])" + ] }, { "cell_type": "markdown", @@ -223,10 +299,175 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 10, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDEF
0a0b0c0NaNNaNNaN
1a1b1c1NaNNaNNaN
2a2b2c2NaNNaNNaN
3a3b3c3NaNNaNNaN
4a4b4c4NaNNaNNaN
5a5b5c5NaNNaNNaN
0NaNNaNNaNd0e0f0
1NaNNaNNaNd1e1f1
2NaNNaNNaNd2e2f2
3NaNNaNNaNd3e3f3
4NaNNaNNaNd4e4f4
5NaNNaNNaNd5e5f5
\n", + "
" + ], + "text/plain": [ + " A B C D E F\n", + "0 a0 b0 c0 NaN NaN NaN\n", + "1 a1 b1 c1 NaN NaN NaN\n", + "2 a2 b2 c2 NaN NaN NaN\n", + "3 a3 b3 c3 NaN NaN NaN\n", + "4 a4 b4 c4 NaN NaN NaN\n", + "5 a5 b5 c5 NaN NaN NaN\n", + "0 NaN NaN NaN d0 e0 f0\n", + "1 NaN NaN NaN d1 e1 f1\n", + "2 NaN NaN NaN d2 e2 f2\n", + "3 NaN NaN NaN d3 e3 f3\n", + "4 NaN NaN NaN d4 e4 f4\n", + "5 NaN NaN NaN d5 e5 f5" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1,df2,df3, df4])" + ] }, { "cell_type": "markdown", @@ -244,17 +485,184 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 11, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDEF
0a0b0c0NaNNaNNaN
1a1b1c1NaNNaNNaN
2a2b2c2NaNNaNNaN
3a3b3c3NaNNaNNaN
4a4b4c4NaNNaNNaN
5a5b5c5NaNNaNNaN
6NaNNaNNaNd0e0f0
7NaNNaNNaNd1e1f1
8NaNNaNNaNd2e2f2
9NaNNaNNaNd3e3f3
10NaNNaNNaNd4e4f4
11NaNNaNNaNd5e5f5
\n", + "
" + ], + "text/plain": [ + " A B C D E F\n", + "0 a0 b0 c0 NaN NaN NaN\n", + "1 a1 b1 c1 NaN NaN NaN\n", + "2 a2 b2 c2 NaN NaN NaN\n", + "3 a3 b3 c3 NaN NaN NaN\n", + "4 a4 b4 c4 NaN NaN NaN\n", + "5 a5 b5 c5 NaN NaN NaN\n", + "6 NaN NaN NaN d0 e0 f0\n", + "7 NaN NaN NaN d1 e1 f1\n", + "8 NaN NaN NaN d2 e2 f2\n", + "9 NaN NaN NaN d3 e3 f3\n", + "10 NaN NaN NaN d4 e4 f4\n", + "11 NaN NaN NaN d5 e5 f5" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1,df2,df3, df4], ignore_index=True)" + ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], - "source": [] + "source": [ + "#now doesn't retain original index, each row has individual index" + ] }, { "cell_type": "markdown", @@ -277,7 +685,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -293,7 +701,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -352,7 +760,7 @@ "2 i2 a2 b2" ] }, - "execution_count": 8, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -363,7 +771,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -422,7 +830,7 @@ "2 i3 c3 d3" ] }, - "execution_count": 9, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } @@ -447,56 +855,7 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And you see, `join` disregards the row of `right` with the unmatching index `i3`. It retains the row of `left` with the unmatching index `i0` but uses `NaN` for the missing data after joining." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### There are other options we can explore with the `merge()` and `join()` functions. \n", - "\n", - "Specifically, we can specify `how`. This argument in the function tells us whether we are performing an inner, left, right, or outer join.\n", - "\n", - "We can also specify a different column for joining in the `merge()` function using the `left_on` and `right_on` arguments. Check out the following documentations if you want to explore more:\n", - "\n", - "[pandas.DataFrame.merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html)\n", - "\n", - "[pandas.DataFrame.join](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.join.html)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bonus Question\n", - "\n", - "Now if you look back on `merge` and `join`, you realize that in order to perform these functions on a set of dataframes, these dataframes must share a common column as the index. Only rows that have the same index values will be joined. This is similar to the [`join` function in MySQL](https://www.w3schools.com/sql/sql_join.asp), isn't it?\n", - "\n", - "The bonus question for you is to figure out how to join and concatenate `df1`, `df2`, `df3`, and `df4` we created at the beginning of this challenge. Your end product should look like this:\n", - "\n", - "![df1-2-3-4.png](../images/df1-2-3-4.png)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": 50, "metadata": {}, "outputs": [ { @@ -523,50 +882,97 @@ " A\n", " B\n", " C\n", + " D\n", + " \n", + " \n", + " idx\n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", - " 0\n", + " i0\n", " a0\n", " b0\n", - " c0\n", + " NaN\n", + " NaN\n", " \n", " \n", - " 1\n", + " i1\n", " a1\n", " b1\n", " c1\n", + " d1\n", " \n", " \n", - " 2\n", + " i2\n", " a2\n", " b2\n", " c2\n", + " d2\n", " \n", " \n", "\n", "" ], "text/plain": [ - " A B C\n", - "0 a0 b0 c0\n", - "1 a1 b1 c1\n", - "2 a2 b2 c2" + " A B C D\n", + "idx \n", + "i0 a0 b0 NaN NaN\n", + "i1 a1 b1 c1 d1\n", + "i2 a2 b2 c2 d2" ] }, - "execution_count": 11, + "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df1" + "left.set_index('idx').join(right.set_index('idx'))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And you see, `join` disregards the row of `right` with the unmatching index `i3`. It retains the row of `left` with the unmatching index `i0` but uses `NaN` for the missing data after joining." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### There are other options we can explore with the `merge()` and `join()` functions. \n", + "\n", + "Specifically, we can specify `how`. This argument in the function tells us whether we are performing an inner, left, right, or outer join.\n", + "\n", + "We can also specify a different column for joining in the `merge()` function using the `left_on` and `right_on` arguments. Check out the following documentations if you want to explore more:\n", + "\n", + "[pandas.DataFrame.merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html)\n", + "\n", + "[pandas.DataFrame.join](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.join.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus Question\n", + "\n", + "Now if you look back on `merge` and `join`, you realize that in order to perform these functions on a set of dataframes, these dataframes must share a common column as the index. Only rows that have the same index values will be joined. This is similar to the [`join` function in MySQL](https://www.w3schools.com/sql/sql_join.asp), isn't it?\n", + "\n", + "The bonus question for you is to figure out how to join and concatenate `df1`, `df2`, `df3`, and `df4` we created at the beginning of this challenge. Your end product should look like this:\n", + "\n", + "![df1-2-3-4.png](../images/df1-2-3-4.png)" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 86, "metadata": {}, "outputs": [ { @@ -593,53 +999,88 @@ " A\n", " B\n", " C\n", + " D\n", + " E\n", + " F\n", " \n", " \n", " \n", " \n", + " 0\n", + " a0\n", + " b0\n", + " c0\n", + " d0\n", + " e0\n", + " f0\n", + " \n", + " \n", + " 1\n", + " a1\n", + " b1\n", + " c1\n", + " d1\n", + " e1\n", + " f1\n", + " \n", + " \n", + " 2\n", + " a2\n", + " b2\n", + " c2\n", + " d2\n", + " e2\n", + " f2\n", + " \n", + " \n", " 3\n", " a3\n", " b3\n", " c3\n", + " d3\n", + " e3\n", + " f3\n", " \n", " \n", " 4\n", " a4\n", " b4\n", " c4\n", + " d4\n", + " e4\n", + " f4\n", " \n", " \n", " 5\n", " a5\n", " b5\n", " c5\n", + " d5\n", + " e5\n", + " f5\n", " \n", " \n", "\n", "" ], "text/plain": [ - " A B C\n", - "3 a3 b3 c3\n", - "4 a4 b4 c4\n", - "5 a5 b5 c5" + " A B C D E F\n", + "0 a0 b0 c0 d0 e0 f0\n", + "1 a1 b1 c1 d1 e1 f1\n", + "2 a2 b2 c2 d2 e2 f2\n", + "3 a3 b3 c3 d3 e3 f3\n", + "4 a4 b4 c4 d4 e4 f4\n", + "5 a5 b5 c5 d5 e5 f5" ] }, - "execution_count": 12, + "execution_count": 86, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df2" + "(pd.concat([df1, df2])).join(pd.concat([df3, df4]))" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -658,7 +1099,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.11.5" }, "toc": { "base_numbering": 1,