2 | 2 | "cells": [
3 | 3 | {
4 | 4 | "cell_type": "code",
5 | | - "execution_count": 57, |
| 5 | + "execution_count": 1, |
6 | 6 | "metadata": {},
7 | 7 | "outputs": [],
8 | 8 | "source": [
19 | 19 | },
20 | 20 | {
21 | 21 | "cell_type": "code",
22 | | - "execution_count": 58, |
| 22 | + "execution_count": 2, |
23 | 23 | "metadata": {},
24 | 24 | "outputs": [],
25 | 25 | "source": [
39 | 39 | },
40 | 40 | {
41 | 41 | "cell_type": "code",
42 | | - "execution_count": 59, |
| 42 | + "execution_count": 3, |
43 | 43 | "metadata": {},
44 | 44 | "outputs": [
45 | 45 | {
83 | 83 | },
84 | 84 | {
85 | 85 | "cell_type": "code",
86 | | - "execution_count": null, |
| 86 | + "execution_count": 4, |
87 | 87 | "metadata": {},
88 | 88 | "outputs": [
89 | 89 | {
118 | 118 | " con.sql('COMMIT') # committing the transaction\n",
119 | 119 | " print(con.sql('SELECT * FROM data_append').fetchdf())"
120 | 120 | ]
| 121 | + }, |
| 122 | + { |
| 123 | + "cell_type": "markdown", |
| 124 | + "metadata": {}, |
| 125 | + "source": [ |
| 126 | + "## Incremental load method\n" |
| 127 | + ] |
| 128 | + }, |
| 129 | + { |
| 130 | + "cell_type": "code", |
| 131 | + "execution_count": null, |
| 132 | + "metadata": {}, |
| 133 | + "outputs": [ |
| 134 | + { |
| 135 | + "name": "stdout", |
| 136 | + "output_type": "stream", |
| 137 | + "text": [ |
| 138 | + " id name last_update\n", |
| 139 | + "0 2 Alice 2021-01-01\n", |
| 140 | + "1 3 Charlie 2021-01-01\n", |
| 141 | + "2 4 David 2022-01-01\n", |
| 142 | + "3 5 Eve 2022-01-01\n" |
| 143 | + ] |
| 144 | + } |
| 145 | + ], |
| 146 | + "source": [ |
| 147 | + "def incremental_load(con, data):\n", |
| 148 | + " # creating a copy of the original data (not necessary in general)\n", |
| 149 | + "    con.sql(\"CREATE OR REPLACE TABLE data_incremental AS SELECT * FROM original_data\")\n", |
| 150 | + "\n", |
| 151 | + " # deleting \"outdated\" rows\n", |
| 152 | + " new_earliest_date = data['last_update'].min().strftime('%Y-%m-%d')\n", |
| 153 | + " con.sql(f\"DELETE FROM data_incremental WHERE last_update < '{new_earliest_date}'\")\n", |
| 154 | + "\n", |
| 155 | + "\n", |
| 156 | + " # removing rows that will be updated\n", |
| 157 | + " ids_timestamps = con.sql('SELECT id, last_update FROM data_incremental').fetchdf()\n", |
| 158 | + "    # merging in the new data to compare last_update values\n", |
| 159 | + " ids_timestamps = ids_timestamps.merge(data[['id','last_update']], on='id', \n", |
| 160 | + " suffixes=('_old', '_new'))\n", |
| 161 | + " # finding ids for which the dates are different\n", |
| 162 | + " ids_to_remove = ids_timestamps[ids_timestamps['last_update_old'] != ids_timestamps['last_update_new']]['id']\n", |
| 163 | + " # removing rows\n", |
| 164 | + "    for row_id in ids_to_remove:\n", |
| 165 | + "        con.sql(f\"DELETE FROM data_incremental WHERE id = {row_id}\")\n", |
| 166 | + "\n", |
| 167 | + "\n", |
| 168 | + " # appending the new data\n", |
| 169 | + " most_recent_date = con.sql('SELECT MAX(last_update) FROM data_incremental').fetchdf().values[0][0]\n", |
| 170 | + " data = data[data['last_update'] > most_recent_date]\n", |
| 171 | + " for _, row in data.iterrows():\n", |
| 172 | + " date_str = row['last_update'].strftime('%Y-%m-%d')\n", |
| 173 | + " con.sql(f\"INSERT INTO data_incremental VALUES ({row['id']}, '{row['name']}', '{date_str}')\")\n", |
| 174 | + "\n", |
| 175 | + "with duckdb.connect(file) as con:\n", |
| 176 | + "    con.sql('BEGIN TRANSACTION') # starting a transaction -- all changes are committed at once, which improves performance\n", |
| 177 | + " incremental_load(con, new_data)\n", |
| 178 | + " con.sql('COMMIT') # committing the transaction \n", |
| 179 | + " print(con.sql('SELECT * FROM data_incremental').fetchdf())\n" |
| 180 | + ] |
| 181 | + }, |
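The incremental_load above deletes and inserts row by row from Python. A set-based variant can push the same work into two SQL statements; the sketch below is a simplified assumption (it covers updated and brand-new ids, but skips the step that drops rows older than the batch's earliest date) and relies on DuckDB resolving the in-scope pandas DataFrame new_data by name:

def incremental_load_sql(con, new_data):
    # remove rows whose id arrives in the new batch with a different last_update
    con.sql("""
        DELETE FROM data_incremental
        WHERE id IN (
            SELECT d.id
            FROM data_incremental d
            JOIN new_data n USING (id)
            WHERE d.last_update <> n.last_update
        )
    """)
    # append every incoming row that is not (or no longer) present in the table
    con.sql("""
        INSERT INTO data_incremental
        SELECT n.id, n.name, n.last_update
        FROM new_data n
        WHERE n.id NOT IN (SELECT id FROM data_incremental)
    """)

Usage would mirror the cell above: call it between BEGIN TRANSACTION and COMMIT on a duckdb.connect(file) connection.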
| 182 | + { |
| 183 | + "cell_type": "markdown", |
| 184 | + "metadata": {}, |
| 185 | + "source": [ |
| 186 | + "## Truncate and load method\n" |
| 187 | + ] |
| 188 | + }, |
| 189 | + { |
| 190 | + "cell_type": "code", |
| 191 | + "execution_count": 62, |
| 192 | + "metadata": {}, |
| 193 | + "outputs": [ |
| 194 | + { |
| 195 | + "name": "stdout", |
| 196 | + "output_type": "stream", |
| 197 | + "text": [ |
| 198 | + " id name last_update\n", |
| 199 | + "0 2 Alice 2021-01-01\n", |
| 200 | + "1 3 Charlie 2021-01-01\n", |
| 201 | + "2 4 David 2022-01-01\n", |
| 202 | + "3 5 Eve 2022-01-01\n" |
| 203 | + ] |
| 204 | + } |
| 205 | + ], |
| 206 | + "source": [ |
| 207 | + "def trunc_and_load(con, data):\n", |
| 208 | + "    # we simply replace the whole table with the new data\n", |
| 209 | + "    con.sql(\"CREATE OR REPLACE TABLE data_trunc AS SELECT * FROM data\")\n", |
| 210 | + "\n", |
| 211 | + "with duckdb.connect(file) as con:\n", |
| 212 | + " trunc_and_load(con, new_data)\n", |
| 213 | + " print(con.sql('SELECT * FROM data_trunc').fetchdf()) " |
| 214 | + ] |
| 215 | + }, |
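CREATE OR REPLACE rebuilds data_trunc with whatever schema DuckDB infers from the DataFrame. If the table's declared schema should stay fixed, an assumed alternative is to empty and refill the existing table instead:

def trunc_and_load_keep_schema(con, data):
    # empty the existing table while keeping its column types
    con.sql("DELETE FROM data_trunc")
    # refill it from the in-scope pandas DataFrame (column order must match the table)
    con.sql("INSERT INTO data_trunc SELECT * FROM data")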
| 216 | + { |
| 217 | + "cell_type": "markdown", |
| 218 | + "metadata": {}, |
| 219 | + "source": [ |
| 220 | + "## Getting latest data at a given date" |
| 221 | + ] |
| 222 | + }, |
| 223 | + { |
| 224 | + "cell_type": "code", |
| 225 | + "execution_count": null, |
| 226 | + "metadata": {}, |
| 227 | + "outputs": [ |
| 228 | + { |
| 229 | + "data": { |
| 230 | + "text/html": [ |
| 231 | + "<div>\n", |
| 232 | + "<style scoped>\n", |
| 233 | + " .dataframe tbody tr th:only-of-type {\n", |
| 234 | + " vertical-align: middle;\n", |
| 235 | + " }\n", |
| 236 | + "\n", |
| 237 | + " .dataframe tbody tr th {\n", |
| 238 | + " vertical-align: top;\n", |
| 239 | + " }\n", |
| 240 | + "\n", |
| 241 | + " .dataframe thead th {\n", |
| 242 | + " text-align: right;\n", |
| 243 | + " }\n", |
| 244 | + "</style>\n", |
| 245 | + "<table border=\"1\" class=\"dataframe\">\n", |
| 246 | + " <thead>\n", |
| 247 | + " <tr style=\"text-align: right;\">\n", |
| 248 | + " <th></th>\n", |
| 249 | + " <th>DATE</th>\n", |
| 250 | + " <th>PCPI21M1</th>\n", |
| 251 | + " </tr>\n", |
| 252 | + " </thead>\n", |
| 253 | + " <tbody>\n", |
| 254 | + " <tr>\n", |
| 255 | + " <th>0</th>\n", |
| 256 | + " <td>1947:01</td>\n", |
| 257 | + " <td>21.48</td>\n", |
| 258 | + " </tr>\n", |
| 259 | + " <tr>\n", |
| 260 | + " <th>1</th>\n", |
| 261 | + " <td>1947:02</td>\n", |
| 262 | + " <td>21.62</td>\n", |
| 263 | + " </tr>\n", |
| 264 | + " <tr>\n", |
| 265 | + " <th>2</th>\n", |
| 266 | + " <td>1947:03</td>\n", |
| 267 | + " <td>22.00</td>\n", |
| 268 | + " </tr>\n", |
| 269 | + " <tr>\n", |
| 270 | + " <th>3</th>\n", |
| 271 | + " <td>1947:04</td>\n", |
| 272 | + " <td>22.00</td>\n", |
| 273 | + " </tr>\n", |
| 274 | + " <tr>\n", |
| 275 | + " <th>4</th>\n", |
| 276 | + " <td>1947:05</td>\n", |
| 277 | + " <td>21.95</td>\n", |
| 278 | + " </tr>\n", |
| 279 | + " </tbody>\n", |
| 280 | + "</table>\n", |
| 281 | + "</div>" |
| 282 | + ], |
| 283 | + "text/plain": [ |
| 284 | + " DATE PCPI21M1\n", |
| 285 | + "0 1947:01 21.48\n", |
| 286 | + "1 1947:02 21.62\n", |
| 287 | + "2 1947:03 22.00\n", |
| 288 | + "3 1947:04 22.00\n", |
| 289 | + "4 1947:05 21.95" |
| 290 | + ] |
| 291 | + }, |
| 292 | + "execution_count": 2, |
| 293 | + "metadata": {}, |
| 294 | + "output_type": "execute_result" |
| 295 | + } |
| 296 | + ], |
| 297 | + "source": [ |
| 298 | + "def get_data(filename, date):\n", |
| 299 | + "\n", |
| 300 | + " # split date into year, month, day\n", |
| 301 | + " year, month, _ = date.split('-')\n", |
| 302 | + " year = year[2:]\n", |
| 303 | + " month = str(int(month))\n", |
| 304 | + "\n", |
| 305 | + " # read the data\n", |
| 306 | + " data = pd.read_excel(filename)\n", |
| 307 | + "\n", |
| 308 | + " # construct column name\n", |
| 309 | + " col_name = f'PCPI{year}M{month}'\n", |
| 310 | + "\n", |
| 311 | + " data = data[['DATE',col_name]]\n", |
| 312 | + "\n", |
| 313 | + " return data\n", |
| 314 | + "\n", |
| 315 | + "data = get_data('pcpiMvMd.xlsx','2021-01-01')\n", |
| 316 | + "data.head()" |
| 317 | + ] |
121 | 318 | }
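The vintage file stores observation dates as 'YYYY:MM' strings. If the returned frame is to be compared or joined against the date-typed tables above, a likely follow-up (an assumption, not part of the notebook) is converting DATE to real timestamps:

data = get_data('pcpiMvMd.xlsx', '2021-01-01')
# '1947:01' -> Timestamp('1947-01-01'); assumes the DATE column is read in as strings
data['DATE'] = pd.to_datetime(data['DATE'], format='%Y:%m')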
122 | 319 | ],
123 | 320 | "metadata": {