diff --git a/code/pandas_1_concat-merge-join.ipynb b/code/pandas_1_concat-merge-join.ipynb index c66e580..117b95c 100644 --- a/code/pandas_1_concat-merge-join.ipynb +++ b/code/pandas_1_concat-merge-join.ipynb @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -114,7 +114,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -194,7 +194,7 @@ "5 a5 b5 c5" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -205,10 +205,94 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DEF
0d0e0f0
1d1e1f1
2d2e2f2
3d3e3f3
4d4e4f4
5d5e5f5
\n", + "
" + ], + "text/plain": [ + " D E F\n", + "0 d0 e0 f0\n", + "1 d1 e1 f1\n", + "2 d2 e2 f2\n", + "3 d3 e3 f3\n", + "4 d4 e4 f4\n", + "5 d5 e5 f5" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df3,df4])" + ] }, { "cell_type": "markdown", @@ -223,10 +307,175 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 48, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDEF
0a0b0c0NaNNaNNaN
1a1b1c1NaNNaNNaN
2a2b2c2NaNNaNNaN
3a3b3c3NaNNaNNaN
4a4b4c4NaNNaNNaN
5a5b5c5NaNNaNNaN
0NaNNaNNaNd0e0f0
1NaNNaNNaNd1e1f1
2NaNNaNNaNd2e2f2
3NaNNaNNaNd3e3f3
4NaNNaNNaNd4e4f4
5NaNNaNNaNd5e5f5
\n", + "
" + ], + "text/plain": [ + " A B C D E F\n", + "0 a0 b0 c0 NaN NaN NaN\n", + "1 a1 b1 c1 NaN NaN NaN\n", + "2 a2 b2 c2 NaN NaN NaN\n", + "3 a3 b3 c3 NaN NaN NaN\n", + "4 a4 b4 c4 NaN NaN NaN\n", + "5 a5 b5 c5 NaN NaN NaN\n", + "0 NaN NaN NaN d0 e0 f0\n", + "1 NaN NaN NaN d1 e1 f1\n", + "2 NaN NaN NaN d2 e2 f2\n", + "3 NaN NaN NaN d3 e3 f3\n", + "4 NaN NaN NaN d4 e4 f4\n", + "5 NaN NaN NaN d5 e5 f5" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2, df3, df4])" + ] }, { "cell_type": "markdown", @@ -244,56 +493,858 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Merging and Joining\n", - "\n", - "Pandas has two functions for joining datasets: `merge()` and `join()`. They perform the same task but have different options and syntax. \n", - "\n", - "Below is an example of `merge` and `join`. \n", - "HINT (uses the column that repeats in both dataframes )" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html" - ] - }, - { - "cell_type": "code", - "execution_count": 7, + "execution_count": 9, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCDEF
0a0b0c0NaNNaNNaN
1a1b1c1NaNNaNNaN
2a2b2c2NaNNaNNaN
3a3b3c3NaNNaNNaN
4a4b4c4NaNNaNNaN
5a5b5c5NaNNaNNaN
6NaNNaNNaNd0e0f0
7NaNNaNNaNd1e1f1
8NaNNaNNaNd2e2f2
9NaNNaNNaNd3e3f3
10NaNNaNNaNd4e4f4
11NaNNaNNaNd5e5f5
\n", + "
" + ], + "text/plain": [ + " A B C D E F\n", + "0 a0 b0 c0 NaN NaN NaN\n", + "1 a1 b1 c1 NaN NaN NaN\n", + "2 a2 b2 c2 NaN NaN NaN\n", + "3 a3 b3 c3 NaN NaN NaN\n", + "4 a4 b4 c4 NaN NaN NaN\n", + "5 a5 b5 c5 NaN NaN NaN\n", + "6 NaN NaN NaN d0 e0 f0\n", + "7 NaN NaN NaN d1 e1 f1\n", + "8 NaN NaN NaN d2 e2 f2\n", + "9 NaN NaN NaN d3 e3 f3\n", + "10 NaN NaN NaN d4 e4 f4\n", + "11 NaN NaN NaN d5 e5 f5" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "pd.concat([df1, df2, df3, df4], ignore_index=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Merging and Joining\n", + "\n", + "Pandas has two functions for joining datasets: `merge()` and `join()`. They perform the same task but have different options and syntax. \n", + "\n", + "Below is an example of `merge` and `join`. \n", + "HINT (uses the column that repeats in both dataframes )" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.merge.html" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "left = pd.DataFrame({'idx': ['i'+str(x) for x in range(3)],\n", + " 'A': ['a'+str(x) for x in range(3)],\n", + " 'B': ['b'+str(x) for x in range(3)]})\n", + "\n", + "\n", + "right = pd.DataFrame({'idx': ['i'+str(x) for x in range(1,4)],\n", + " 'C': ['c'+str(x) for x in range(1,4)],\n", + " 'D': ['d'+str(x) for x in range(1,4)]})" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxAB
0i0a0b0
1i1a1b1
2i2a2b2
\n", + "
" + ], + "text/plain": [ + " idx A B\n", + "0 i0 a0 b0\n", + "1 i1 a1 b1\n", + "2 i2 a2 b2" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idxCD
0i1c1d1
1i2c2d2
2i3c3d3
\n", + "
" + ], + "text/plain": [ + " idx C D\n", + "0 i1 c1 d1\n", + "1 i2 c2 d2\n", + "2 i3 c3 d3" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "right" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`join` is identical to `merge`. But when using join, we need to explicitly set the index column of the dataframes to join using `set_index`:" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABCD
idx
i0a0b0NaNNaN
i1a1b1c1d1
i2a2b2c2d2
\n", + "
" + ], + "text/plain": [ + " A B C D\n", + "idx \n", + "i0 a0 b0 NaN NaN\n", + "i1 a1 b1 c1 d1\n", + "i2 a2 b2 c2 d2" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "left_join = left.set_index('idx').join(right.set_index('idx'))\n", + "left_join" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "And you see, `join` disregards the row of `right` with the unmatching index `i3`. It retains the row of `left` with the unmatching index `i0` but uses `NaN` for the missing data after joining." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### There are other options we can explore with the `merge()` and `join()` functions. \n", + "\n", + "Specifically, we can specify `how`. This argument in the function tells us whether we are performing an inner, left, right, or outer join.\n", + "\n", + "We can also specify a different column for joining in the `merge()` function using the `left_on` and `right_on` arguments. Check out the following documentations if you want to explore more:\n", + "\n", + "[pandas.DataFrame.merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html)\n", + "\n", + "[pandas.DataFrame.join](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.join.html)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Bonus Question\n", + "\n", + "Now if you look back on `merge` and `join`, you realize that in order to perform these functions on a set of dataframes, these dataframes must share a common column as the index. Only rows that have the same index values will be joined. This is similar to the [`join` function in MySQL](https://www.w3schools.com/sql/sql_join.asp), isn't it?\n", + "\n", + "The bonus question for you is to figure out how to join and concatenate `df1`, `df2`, `df3`, and `df4` we created at the beginning of this challenge. Your end product should look like this:\n", + "\n", + "![df1-2-3-4.png](../images/df1-2-3-4.png)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0a0b0c0
1a1b1c1
2a2b2c2
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 a0 b0 c0\n", + "1 a1 b1 c1\n", + "2 a2 b2 c2" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
3a3b3c3
4a4b4c4
5a5b5c5
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "3 a3 b3 c3\n", + "4 a4 b4 c4\n", + "5 a5 b5 c5" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df2" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DEF
0d0e0f0
1d1e1f1
2d2e2f2
\n", + "
" + ], + "text/plain": [ + " D E F\n", + "0 d0 e0 f0\n", + "1 d1 e1 f1\n", + "2 d2 e2 f2" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df3" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
DEF
3d3e3f3
4d4e4f4
5d5e5f5
\n", + "
" + ], + "text/plain": [ + " D E F\n", + "3 d3 e3 f3\n", + "4 d4 e4 f4\n", + "5 d5 e5 f5" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df4" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ABC
0a0b0c0
1a1b1c1
2a2b2c2
3a3b3c3
4a4b4c4
5a5b5c5
\n", + "
" + ], + "text/plain": [ + " A B C\n", + "0 a0 b0 c0\n", + "1 a1 b1 c1\n", + "2 a2 b2 c2\n", + "3 a3 b3 c3\n", + "4 a4 b4 c4\n", + "5 a5 b5 c5" + ] + }, + "execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "left = pd.DataFrame({'idx': ['i'+str(x) for x in range(3)],\n", - " 'A': ['a'+str(x) for x in range(3)],\n", - " 'B': ['b'+str(x) for x in range(3)]})\n", - "\n", - "\n", - "right = pd.DataFrame({'idx': ['i'+str(x) for x in range(1,4)],\n", - " 'C': ['c'+str(x) for x in range(1,4)],\n", - " 'D': ['d'+str(x) for x in range(1,4)]})" + "concat12 = pd.concat([df1,df2])\n", + "concat12" ] }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 37, "metadata": {}, "outputs": [ { @@ -317,54 +1368,78 @@ " \n", " \n", " \n", - " idx\n", - " A\n", - " B\n", + " D\n", + " E\n", + " F\n", " \n", " \n", " \n", " \n", " 0\n", - " i0\n", - " a0\n", - " b0\n", + " d0\n", + " e0\n", + " f0\n", " \n", " \n", " 1\n", - " i1\n", - " a1\n", - " b1\n", + " d1\n", + " e1\n", + " f1\n", " \n", " \n", " 2\n", - " i2\n", - " a2\n", - " b2\n", + " d2\n", + " e2\n", + " f2\n", + " \n", + " \n", + " 3\n", + " d3\n", + " e3\n", + " f3\n", + " \n", + " \n", + " 4\n", + " d4\n", + " e4\n", + " f4\n", + " \n", + " \n", + " 5\n", + " d5\n", + " e5\n", + " f5\n", " \n", " \n", "\n", "" ], "text/plain": [ - " idx A B\n", - "0 i0 a0 b0\n", - "1 i1 a1 b1\n", - "2 i2 a2 b2" + " D E F\n", + "0 d0 e0 f0\n", + "1 d1 e1 f1\n", + "2 d2 e2 f2\n", + "3 d3 e3 f3\n", + "4 d4 e4 f4\n", + "5 d5 e5 f5" ] }, - "execution_count": 8, + "execution_count": 37, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "left" + "concat34 = pd.concat([df3,df4])\n", + "concat34" ] }, { "cell_type": "code", - "execution_count": 9, - "metadata": {}, + "execution_count": 40, + "metadata": { + "scrolled": true + }, "outputs": [ { "data": { @@ -387,116 +1462,82 @@ " \n", " \n", " \n", - " idx\n", + " A\n", + " B\n", " C\n", - " D\n", + " \n", + " \n", + " Index\n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " 0\n", - " i1\n", - " c1\n", - " d1\n", + " a0\n", + " b0\n", + " c0\n", " \n", " \n", " 1\n", - " i2\n", - " c2\n", - " d2\n", + " a1\n", + " b1\n", + " c1\n", " \n", " \n", " 2\n", - " i3\n", + " a2\n", + " b2\n", + " c2\n", + " \n", + " \n", + " 3\n", + " a3\n", + " b3\n", " c3\n", - " d3\n", + " \n", + " \n", + " 4\n", + " a4\n", + " b4\n", + " c4\n", + " \n", + " \n", + " 5\n", + " a5\n", + " b5\n", + " c5\n", " \n", " \n", "\n", "" ], "text/plain": [ - " idx C D\n", - "0 i1 c1 d1\n", - "1 i2 c2 d2\n", - "2 i3 c3 d3" + " A B C\n", + "Index \n", + "0 a0 b0 c0\n", + "1 a1 b1 c1\n", + "2 a2 b2 c2\n", + "3 a3 b3 c3\n", + "4 a4 b4 c4\n", + "5 a5 b5 c5" ] }, - "execution_count": 9, + "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "right" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "`join` is identical to `merge`. But when using join, we need to explicitly set the index column of the dataframes to join using `set_index`:" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.join.html" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "And you see, `join` disregards the row of `right` with the unmatching index `i3`. It retains the row of `left` with the unmatching index `i0` but uses `NaN` for the missing data after joining." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### There are other options we can explore with the `merge()` and `join()` functions. \n", - "\n", - "Specifically, we can specify `how`. This argument in the function tells us whether we are performing an inner, left, right, or outer join.\n", - "\n", - "We can also specify a different column for joining in the `merge()` function using the `left_on` and `right_on` arguments. Check out the following documentations if you want to explore more:\n", - "\n", - "[pandas.DataFrame.merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html)\n", - "\n", - "[pandas.DataFrame.join](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.join.html)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Bonus Question\n", - "\n", - "Now if you look back on `merge` and `join`, you realize that in order to perform these functions on a set of dataframes, these dataframes must share a common column as the index. Only rows that have the same index values will be joined. This is similar to the [`join` function in MySQL](https://www.w3schools.com/sql/sql_join.asp), isn't it?\n", - "\n", - "The bonus question for you is to figure out how to join and concatenate `df1`, `df2`, `df3`, and `df4` we created at the beginning of this challenge. Your end product should look like this:\n", - "\n", - "![df1-2-3-4.png](../images/df1-2-3-4.png)" + "concat12_a = concat12.rename_axis(\"Index\")\n", + "concat12_a" ] }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": 11, + "execution_count": 41, "metadata": {}, "outputs": [ { @@ -520,53 +1561,82 @@ " \n", " \n", " \n", - " A\n", - " B\n", - " C\n", + " D\n", + " E\n", + " F\n", + " \n", + " \n", + " Index\n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", " 0\n", - " a0\n", - " b0\n", - " c0\n", + " d0\n", + " e0\n", + " f0\n", " \n", " \n", " 1\n", - " a1\n", - " b1\n", - " c1\n", + " d1\n", + " e1\n", + " f1\n", " \n", " \n", " 2\n", - " a2\n", - " b2\n", - " c2\n", + " d2\n", + " e2\n", + " f2\n", + " \n", + " \n", + " 3\n", + " d3\n", + " e3\n", + " f3\n", + " \n", + " \n", + " 4\n", + " d4\n", + " e4\n", + " f4\n", + " \n", + " \n", + " 5\n", + " d5\n", + " e5\n", + " f5\n", " \n", " \n", "\n", "" ], "text/plain": [ - " A B C\n", - "0 a0 b0 c0\n", - "1 a1 b1 c1\n", - "2 a2 b2 c2" + " D E F\n", + "Index \n", + "0 d0 e0 f0\n", + "1 d1 e1 f1\n", + "2 d2 e2 f2\n", + "3 d3 e3 f3\n", + "4 d4 e4 f4\n", + "5 d5 e5 f5" ] }, - "execution_count": 11, + "execution_count": 41, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df1" + "concat34_a = concat34.rename_axis(\"Index\")\n", + "concat34_a" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 53, "metadata": {}, "outputs": [ { @@ -593,53 +1663,99 @@ " A\n", " B\n", " C\n", + " D\n", + " E\n", + " F\n", + " \n", + " \n", + " Index\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", " \n", " \n", " \n", " \n", + " 0\n", + " a0\n", + " b0\n", + " c0\n", + " d0\n", + " e0\n", + " f0\n", + " \n", + " \n", + " 1\n", + " a1\n", + " b1\n", + " c1\n", + " d1\n", + " e1\n", + " f1\n", + " \n", + " \n", + " 2\n", + " a2\n", + " b2\n", + " c2\n", + " d2\n", + " e2\n", + " f2\n", + " \n", + " \n", " 3\n", " a3\n", " b3\n", " c3\n", + " d3\n", + " e3\n", + " f3\n", " \n", " \n", " 4\n", " a4\n", " b4\n", " c4\n", + " d4\n", + " e4\n", + " f4\n", " \n", " \n", " 5\n", " a5\n", " b5\n", " c5\n", + " d5\n", + " e5\n", + " f5\n", " \n", " \n", "\n", "" ], "text/plain": [ - " A B C\n", - "3 a3 b3 c3\n", - "4 a4 b4 c4\n", - "5 a5 b5 c5" + " A B C D E F\n", + "Index \n", + "0 a0 b0 c0 d0 e0 f0\n", + "1 a1 b1 c1 d1 e1 f1\n", + "2 a2 b2 c2 d2 e2 f2\n", + "3 a3 b3 c3 d3 e3 f3\n", + "4 a4 b4 c4 d4 e4 f4\n", + "5 a5 b5 c5 d5 e5 f5" ] }, - "execution_count": 12, + "execution_count": 53, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df2" + "final_concat = pd.concat([concat12_a, concat34_a], axis = 1)\n", + "final_concat" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -658,7 +1774,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.11.4" }, "toc": { "base_numbering": 1,