diff --git a/code/pandas_1_concat-merge-join.ipynb b/code/pandas_1_concat-merge-join.ipynb
index c66e580..17312e7 100644
--- a/code/pandas_1_concat-merge-join.ipynb
+++ b/code/pandas_1_concat-merge-join.ipynb
@@ -2,11 +2,12 @@
"cells": [
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 52,
"metadata": {},
"outputs": [],
"source": [
- "import pandas as pd"
+ "import pandas as pd\n",
+ "import numpy as np"
]
},
{
@@ -41,16 +42,7 @@
},
{
"cell_type": "code",
- "execution_count": 2,
- "metadata": {},
- "outputs": [],
- "source": [
- "import pandas as pd"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
+ "execution_count": 67,
"metadata": {
"scrolled": true
},
@@ -114,7 +106,7 @@
},
{
"cell_type": "code",
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
"outputs": [
{
@@ -194,7 +186,7 @@
"5 a5 b5 c5"
]
},
- "execution_count": 5,
+ "execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
@@ -205,10 +197,94 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 5,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " D | \n",
+ " E | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " d0 | \n",
+ " e0 | \n",
+ " f0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " d1 | \n",
+ " e1 | \n",
+ " f1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " d2 | \n",
+ " e2 | \n",
+ " f2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " d3 | \n",
+ " e3 | \n",
+ " f3 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " d4 | \n",
+ " e4 | \n",
+ " f4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " d5 | \n",
+ " e5 | \n",
+ " f5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " D E F\n",
+ "0 d0 e0 f0\n",
+ "1 d1 e1 f1\n",
+ "2 d2 e2 f2\n",
+ "3 d3 e3 f3\n",
+ "4 d4 e4 f4\n",
+ "5 d5 e5 f5"
+ ]
+ },
+ "execution_count": 5,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([df3,df4])"
+ ]
},
{
"cell_type": "markdown",
@@ -223,10 +299,175 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 10,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " A | \n",
+ " B | \n",
+ " C | \n",
+ " D | \n",
+ " E | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " a0 | \n",
+ " b0 | \n",
+ " c0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " a1 | \n",
+ " b1 | \n",
+ " c1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " a2 | \n",
+ " b2 | \n",
+ " c2 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " a3 | \n",
+ " b3 | \n",
+ " c3 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " a4 | \n",
+ " b4 | \n",
+ " c4 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " a5 | \n",
+ " b5 | \n",
+ " c5 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d0 | \n",
+ " e0 | \n",
+ " f0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d1 | \n",
+ " e1 | \n",
+ " f1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d2 | \n",
+ " e2 | \n",
+ " f2 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d3 | \n",
+ " e3 | \n",
+ " f3 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d4 | \n",
+ " e4 | \n",
+ " f4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d5 | \n",
+ " e5 | \n",
+ " f5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " A B C D E F\n",
+ "0 a0 b0 c0 NaN NaN NaN\n",
+ "1 a1 b1 c1 NaN NaN NaN\n",
+ "2 a2 b2 c2 NaN NaN NaN\n",
+ "3 a3 b3 c3 NaN NaN NaN\n",
+ "4 a4 b4 c4 NaN NaN NaN\n",
+ "5 a5 b5 c5 NaN NaN NaN\n",
+ "0 NaN NaN NaN d0 e0 f0\n",
+ "1 NaN NaN NaN d1 e1 f1\n",
+ "2 NaN NaN NaN d2 e2 f2\n",
+ "3 NaN NaN NaN d3 e3 f3\n",
+ "4 NaN NaN NaN d4 e4 f4\n",
+ "5 NaN NaN NaN d5 e5 f5"
+ ]
+ },
+ "execution_count": 10,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([df1,df2,df3, df4])"
+ ]
},
{
"cell_type": "markdown",
@@ -244,17 +485,184 @@
},
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 11,
"metadata": {},
- "outputs": [],
- "source": []
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " A | \n",
+ " B | \n",
+ " C | \n",
+ " D | \n",
+ " E | \n",
+ " F | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " a0 | \n",
+ " b0 | \n",
+ " c0 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " a1 | \n",
+ " b1 | \n",
+ " c1 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " a2 | \n",
+ " b2 | \n",
+ " c2 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " a3 | \n",
+ " b3 | \n",
+ " c3 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " a4 | \n",
+ " b4 | \n",
+ " c4 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " a5 | \n",
+ " b5 | \n",
+ " c5 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d0 | \n",
+ " e0 | \n",
+ " f0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d1 | \n",
+ " e1 | \n",
+ " f1 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d2 | \n",
+ " e2 | \n",
+ " f2 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d3 | \n",
+ " e3 | \n",
+ " f3 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d4 | \n",
+ " e4 | \n",
+ " f4 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " NaN | \n",
+ " d5 | \n",
+ " e5 | \n",
+ " f5 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " A B C D E F\n",
+ "0 a0 b0 c0 NaN NaN NaN\n",
+ "1 a1 b1 c1 NaN NaN NaN\n",
+ "2 a2 b2 c2 NaN NaN NaN\n",
+ "3 a3 b3 c3 NaN NaN NaN\n",
+ "4 a4 b4 c4 NaN NaN NaN\n",
+ "5 a5 b5 c5 NaN NaN NaN\n",
+ "6 NaN NaN NaN d0 e0 f0\n",
+ "7 NaN NaN NaN d1 e1 f1\n",
+ "8 NaN NaN NaN d2 e2 f2\n",
+ "9 NaN NaN NaN d3 e3 f3\n",
+ "10 NaN NaN NaN d4 e4 f4\n",
+ "11 NaN NaN NaN d5 e5 f5"
+ ]
+ },
+ "execution_count": 11,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "pd.concat([df1,df2,df3, df4], ignore_index=True)"
+ ]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
- "source": []
+ "source": [
+ "# With ignore_index=True the original row labels are discarded: instead of repeating 0-5 for each frame, the rows are renumbered 0-11."
+ ]
},
{
"cell_type": "markdown",
@@ -277,7 +685,7 @@
},
{
"cell_type": "code",
- "execution_count": 7,
+ "execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
@@ -293,7 +701,7 @@
},
{
"cell_type": "code",
- "execution_count": 8,
+ "execution_count": 14,
"metadata": {},
"outputs": [
{
@@ -352,7 +760,7 @@
"2 i2 a2 b2"
]
},
- "execution_count": 8,
+ "execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
@@ -363,7 +771,7 @@
},
{
"cell_type": "code",
- "execution_count": 9,
+ "execution_count": 15,
"metadata": {},
"outputs": [
{
@@ -422,7 +830,7 @@
"2 i3 c3 d3"
]
},
- "execution_count": 9,
+ "execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
@@ -447,56 +855,7 @@
},
{
"cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "And you see, `join` disregards the row of `right` with the unmatching index `i3`. It retains the row of `left` with the unmatching index `i0` but uses `NaN` for the missing data after joining."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "#### There are other options we can explore with the `merge()` and `join()` functions. \n",
- "\n",
- "Specifically, we can specify `how`. This argument in the function tells us whether we are performing an inner, left, right, or outer join.\n",
- "\n",
- "We can also specify a different column for joining in the `merge()` function using the `left_on` and `right_on` arguments. Check out the following documentations if you want to explore more:\n",
- "\n",
- "[pandas.DataFrame.merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html)\n",
- "\n",
- "[pandas.DataFrame.join](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.join.html)"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {},
- "source": [
- "## Bonus Question\n",
- "\n",
- "Now if you look back on `merge` and `join`, you realize that in order to perform these functions on a set of dataframes, these dataframes must share a common column as the index. Only rows that have the same index values will be joined. This is similar to the [`join` function in MySQL](https://www.w3schools.com/sql/sql_join.asp), isn't it?\n",
- "\n",
- "The bonus question for you is to figure out how to join and concatenate `df1`, `df2`, `df3`, and `df4` we created at the beginning of this challenge. Your end product should look like this:\n",
- "\n",
- "![df1-2-3-4.png](../images/df1-2-3-4.png)"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
- },
- {
- "cell_type": "code",
- "execution_count": 11,
+ "execution_count": 50,
"metadata": {},
"outputs": [
{
@@ -523,50 +882,97 @@
" A | \n",
" B | \n",
" C | \n",
+ " D | \n",
+ " \n",
+ " \n",
+ " idx | \n",
+ " | \n",
+ " | \n",
+ " | \n",
+ " | \n",
"
\n",
" \n",
" \n",
" \n",
- " 0 | \n",
+ " i0 | \n",
" a0 | \n",
" b0 | \n",
- " c0 | \n",
+ " NaN | \n",
+ " NaN | \n",
"
\n",
" \n",
- " 1 | \n",
+ " i1 | \n",
" a1 | \n",
" b1 | \n",
" c1 | \n",
+ " d1 | \n",
"
\n",
" \n",
- " 2 | \n",
+ " i2 | \n",
" a2 | \n",
" b2 | \n",
" c2 | \n",
+ " d2 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " A B C\n",
- "0 a0 b0 c0\n",
- "1 a1 b1 c1\n",
- "2 a2 b2 c2"
+ " A B C D\n",
+ "idx \n",
+ "i0 a0 b0 NaN NaN\n",
+ "i1 a1 b1 c1 d1\n",
+ "i2 a2 b2 c2 d2"
]
},
- "execution_count": 11,
+ "execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df1"
+ "left.set_index('idx').join(right.set_index('idx'))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "As you can see, `join` disregards the row of `right` with the non-matching index `i3`. It retains the row of `left` with the non-matching index `i0` but uses `NaN` for the missing data after joining."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### There are other options we can explore with the `merge()` and `join()` functions. \n",
+ "\n",
+ "Specifically, we can specify `how`. This argument in the function tells us whether we are performing an inner, left, right, or outer join.\n",
+ "\n",
+ "We can also specify a different column for joining in the `merge()` function using the `left_on` and `right_on` arguments. Check out the following documentation pages if you want to explore more:\n",
+ "\n",
+ "[pandas.DataFrame.merge](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.merge.html)\n",
+ "\n",
+ "[pandas.DataFrame.join](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.join.html)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Bonus Question\n",
+ "\n",
+ "Now if you look back on `merge` and `join`, you realize that in order to perform these functions on a set of dataframes, these dataframes must share a common column as the index. Only rows that have the same index values will be joined. This is similar to the [`join` function in MySQL](https://www.w3schools.com/sql/sql_join.asp), isn't it?\n",
+ "\n",
+ "The bonus question for you is to figure out how to join and concatenate `df1`, `df2`, `df3`, and `df4` we created at the beginning of this challenge. Your end product should look like this:\n",
+ "\n",
+ "![df1-2-3-4.png](../images/df1-2-3-4.png)"
]
},
{
"cell_type": "code",
- "execution_count": 12,
+ "execution_count": 86,
"metadata": {},
"outputs": [
{
@@ -593,53 +999,88 @@
" A | \n",
" B | \n",
" C | \n",
+ " D | \n",
+ " E | \n",
+ " F | \n",
" \n",
" \n",
" \n",
" \n",
+ " 0 | \n",
+ " a0 | \n",
+ " b0 | \n",
+ " c0 | \n",
+ " d0 | \n",
+ " e0 | \n",
+ " f0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " a1 | \n",
+ " b1 | \n",
+ " c1 | \n",
+ " d1 | \n",
+ " e1 | \n",
+ " f1 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " a2 | \n",
+ " b2 | \n",
+ " c2 | \n",
+ " d2 | \n",
+ " e2 | \n",
+ " f2 | \n",
+ "
\n",
+ " \n",
" 3 | \n",
" a3 | \n",
" b3 | \n",
" c3 | \n",
+ " d3 | \n",
+ " e3 | \n",
+ " f3 | \n",
"
\n",
" \n",
" 4 | \n",
" a4 | \n",
" b4 | \n",
" c4 | \n",
+ " d4 | \n",
+ " e4 | \n",
+ " f4 | \n",
"
\n",
" \n",
" 5 | \n",
" a5 | \n",
" b5 | \n",
" c5 | \n",
+ " d5 | \n",
+ " e5 | \n",
+ " f5 | \n",
"
\n",
" \n",
"\n",
""
],
"text/plain": [
- " A B C\n",
- "3 a3 b3 c3\n",
- "4 a4 b4 c4\n",
- "5 a5 b5 c5"
+ " A B C D E F\n",
+ "0 a0 b0 c0 d0 e0 f0\n",
+ "1 a1 b1 c1 d1 e1 f1\n",
+ "2 a2 b2 c2 d2 e2 f2\n",
+ "3 a3 b3 c3 d3 e3 f3\n",
+ "4 a4 b4 c4 d4 e4 f4\n",
+ "5 a5 b5 c5 d5 e5 f5"
]
},
- "execution_count": 12,
+ "execution_count": 86,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
- "df2"
+ "(pd.concat([df1, df2])).join(pd.concat([df3, df4]))"
]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {},
- "outputs": [],
- "source": []
}
],
"metadata": {
@@ -658,7 +1099,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.9.12"
+ "version": "3.11.5"
},
"toc": {
"base_numbering": 1,