From c0de58c2af587485bac138e21067d4bc21c17017 Mon Sep 17 00:00:00 2001
From: ShreyParikh07 <shrey.parikh@helmholtz-munich.de>
Date: Thu, 10 Oct 2024 10:59:11 +0200
Subject: [PATCH 01/15] Changes to omop

---
 docs/notebooks/test_more_datasets_omop.ipynb | 240 +++++++++++++++++++
 src/ehrdata/dt/datasets.py                   |  73 +++++-
 src/ehrdata/io/omop/omop.py                  |  41 +++-
 3 files changed, 334 insertions(+), 20 deletions(-)
 create mode 100644 docs/notebooks/test_more_datasets_omop.ipynb

diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb
new file mode 100644
index 0000000..3cc16a8
--- /dev/null
+++ b/docs/notebooks/test_more_datasets_omop.ipynb
@@ -0,0 +1,240 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from ehrdata import EHRData\n",
+    "EHRData().r\n",
+    "import anndata as ad\n",
+    "import duckdb\n",
+    "import ehrapy as ep\n",
+    "import ehrdata as ed\n",
+    "import numpy as np\n",
+    "import os"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import gibleed_omop, mimic_iv_omop, synthea27nj_omop"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the mimic dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def load_and_check(dummy_func, start_time):\n",
+    "    con = duckdb.connect()\n",
+    "    dummy_func(backend_handle=con)\n",
+    "    edata = ed.io.omop.setup_obs(con, \"person_observation_period\")\n",
+    "    edata\n",
+    "    edata = ed.io.omop.setup_variables(\n",
+    "        backend_handle=con,\n",
+    "        edata=edata,\n",
+    "        tables=[\"measurement\"],\n",
+    "        start_time=start_time,\n",
+    "        interval_length_number=28,\n",
+    "        interval_length_unit=\"day\",\n",
+    "        num_intervals=\"max_observation_duration\",\n",
+    "        concept_ids=\"all\",\n",
+    "        aggregation_strategy=\"last\"\n",
+    "    )\n",
+    "    return edata"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the mimic dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
+      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "edata_mimic = load_and_check(mimic_iv_omop, \"observation_period_start_date\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "EHRData object with n_obs x n_var = 100 x 450, and a timeseries of 320 steps.\n",
+       "             shape of .X: (100, 450) \n",
+       "             shape of .r: ((100, 450, 320)) "
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_mimic"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the gibleed dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Path to data exists, load tables from there: ehrapy_data/GIBleed_dataset\n",
+      "missing tables:  [['cohort_definition']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "edata_gibleed = load_and_check(gibleed_omop, \"observation_period\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "EHRData object with n_obs x n_var = 2694 x 55, and a timeseries of 1441 steps.\n",
+       "             shape of .X: (2694, 55) \n",
+       "             shape of .r: ((2694, 55, 1441)) "
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_gibleed"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Load the Synthea27NJ dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Path to data exists, load tables from there: ehrapy_data/Synthea27Nj\n",
+      "missing tables:  []\n"
+     ]
+    }
+   ],
+   "source": [
+    "edata_synthea27nj = load_and_check(synthea27nj_omop, \"observation_period\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "EHRData object with n_obs x n_var = 28 x 132, and a timeseries of 866 steps.\n",
+       "             shape of .X: (28, 132) \n",
+       "             shape of .r: ((28, 132, 866)) "
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_synthea27nj"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "hackathon_venv",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py
index adc50bf..3623d7a 100644
--- a/src/ehrdata/dt/datasets.py
+++ b/src/ehrdata/dt/datasets.py
@@ -85,9 +85,9 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N
 
 
 def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None:
-    """Loads the GIBleed dataset.
+    """Loads the GIBleed dataset in the OMOP Common Data model.
 
-    More details: https://github.com/OHDSI/EunomiaDatasets.
+    More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/GiBleed.
 
     Parameters
     ----------
@@ -109,13 +109,38 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No
         >>> ed.dt.gibleed_omop(backend_handle=con)
         >>> con.execute("SHOW TABLES;").fetchall()
     """
-    # TODO:
-    # https://github.com/darwin-eu/EunomiaDatasets/tree/main/datasets/GiBleed
-    raise NotImplementedError()
+    if data_path is None:
+        data_path = Path("ehrapy_data/GIBleed_dataset")
+
+    if data_path.exists():
+        print(f"Path to data exists, load tables from there: {data_path}")
+    else:
+        print("Downloading data...")
+        URL = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip"
+        response = requests.get(URL)
+
+        if response.status_code == 200:
+            # extract_path = data_path / "gibleed_data_csv"
+            # extract_path.mkdir(parents=True, exist_ok=True)
+
+            # Use zipfile and io to open the ZIP file in memory
+            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
+                # Extract all contents of the ZIP file into the correct subdirectory
+                z.extractall(data_path)  # Extracting to 'extract_path'
+                print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.")
+
+        else:
+            print(f"Failed to download the file. Status code: {response.status_code}")
+
+    extracted_folder = next(data_path.iterdir(), data_path)
+    # extracted_folder = next((folder for folder in data_path.iterdir() if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name), data_path)
+    return _set_up_duckdb(extracted_folder, backend_handle)
 
 
 def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None:
-    """Loads the Synthe27Nj dataset.
+    """Loads the Synthea27NJ dataset in the OMOP Common Data model.
+
+    More details: https://github.com/darwin-eu/EunomiaDatasets/tree/main/datasets/Synthea27Nj.
 
     Parameters
     ----------
@@ -137,9 +162,39 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None
         >>> ed.dt.synthea27nj_omop(backend_handle=con)
         >>> con.execute("SHOW TABLES;").fetchall()
     """
-    # TODO
-    # https://github.com/darwin-eu/EunomiaDatasets/tree/main/datasets/Synthea27Nj
-    raise NotImplementedError()
+    if data_path is None:
+        data_path = Path("ehrapy_data/Synthea27Nj")
+
+    if data_path.exists():
+        print(f"Path to data exists, load tables from there: {data_path}")
+    else:
+        print("Downloading data...")
+        URL = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip"
+        response = requests.get(URL)
+
+        if response.status_code == 200:
+            extract_path = data_path / "synthea27nj_omop_csv"
+            extract_path.mkdir(parents=True, exist_ok=True)
+
+            # Use zipfile and io to open the ZIP file in memory
+            with zipfile.ZipFile(io.BytesIO(response.content)) as z:
+                # Extract all contents of the ZIP file into the correct subdirectory
+                z.extractall(extract_path)  # Extracting to 'extract_path'
+                print(f"Download successful. ZIP file downloaded and extracted successfully to {extract_path}.")
+
+        else:
+            print(f"Failed to download the file. Status code: {response.status_code}")
+            return
+
+    extracted_folder = next(
+        (
+            folder
+            for folder in data_path.iterdir()
+            if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name
+        ),
+        data_path,
+    )
+    return _set_up_duckdb(extracted_folder, backend_handle)
 
 
 def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None:
diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index ae41486..d30292a 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -103,9 +103,10 @@ def setup_variables(
     time_interval_tables = []
     for table in tables:
         if table == "measurement":
-            concept_ids_present = (
-                backend_handle.sql("SELECT * FROM measurement").df()["measurement_concept_id"].unique()
+            concept_ids_present_df = normalize_column_names(
+                backend_handle.sql("SELECT * FROM measurement").df()
             )
+            concept_ids_present = concept_ids_present_df["measurement_concept_id"].unique()
             extracted_awkward = extract_measurement(backend_handle)
             time_interval_table = get_time_interval_table(
                 backend_handle,
@@ -171,32 +172,37 @@ def load(
 
 def extract_person(duckdb_instance):
     """Extract person table of an OMOP CDM Database."""
-    return duckdb_instance.sql("SELECT * FROM person").df()
+    return normalize_column_names(duckdb_instance.sql("SELECT * FROM person").df())
 
 
 def extract_observation_period(duckdb_instance):
     """Extract person table of an OMOP CDM Database."""
-    return duckdb_instance.sql("SELECT * FROM observation_period").df()
+    return normalize_column_names(duckdb_instance.sql("SELECT * FROM observation_period").df())
 
 
 def extract_person_observation_period(duckdb_instance):
     """Extract observation table of an OMOP CDM Database."""
-    return duckdb_instance.sql(
+    return normalize_column_names(duckdb_instance.sql(
         "SELECT * \
         FROM person \
         LEFT JOIN observation_period USING(person_id) \
         "
-    ).df()
+    ).df())
 
 
 def extract_measurement(duckdb_instance=None):
     """Extract measurement table of an OMOP CDM Database."""
     measurement_table = duckdb_instance.sql("SELECT * FROM measurement").df()
-
+    measurement_table = normalize_column_names(measurement_table)
     # get an array n_person x n_features x 2, one for value, one for time
-    person_id = (
-        duckdb_instance.sql("SELECT * FROM person").df()["person_id"].unique()
+    person_id_df = (
+        duckdb_instance.sql("SELECT * FROM person").df()
     )  # TODO: in anndata? w.r.t database? for now this
+    person_id_df = normalize_column_names(person_id_df)
+    person_id = person_id_df["person_id"].unique()
+    # person_id = (
+    #     duckdb_instance.sql("SELECT * FROM person").df()["person_id"].unique()
+    # )  # TODO: in anndata? w.r.t database? for now this
     features = measurement_table["measurement_concept_id"].unique()
     person_collection = []
 
@@ -320,11 +326,20 @@ def get_time_interval_table(
         concept_id_list = concept_ids
 
     if num_intervals == "max_observation_duration":
+        observation_period_df = con.execute("SELECT * from observation_period").df()
+        observation_period_df = normalize_column_names(observation_period_df)
+
+        # Calculate the duration of observation periods
         num_intervals = np.max(
-            con.execute("SELECT * from observation_period").df()["observation_period_end_date"]
-            - con.execute("SELECT * from observation_period").df()["observation_period_start_date"]
+            observation_period_df["observation_period_end_date"] 
+            - observation_period_df["observation_period_start_date"]
         ) / pd.to_timedelta(interval_length_number, interval_length_unit)
         num_intervals = int(np.ceil(num_intervals))
+        # num_intervals = np.max(
+        #     con.execute("SELECT * from observation_period").df()["observation_period_end_date"]
+        #     - con.execute("SELECT * from observation_period").df()["observation_period_start_date"]
+        # ) / pd.to_timedelta(interval_length_number, interval_length_unit)
+        # num_intervals = int(np.ceil(num_intervals))
 
     tables = []
     for person, person_ts in zip(obs.iterrows(), ts, strict=False):
@@ -353,6 +368,10 @@ def get_time_interval_table(
 
     return np.array(tables).transpose(0, 2, 1)  # TODO: store in self, np
 
+def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
+    """Normalize all column names to lowercase."""
+    df.columns = map(str.lower, df.columns)  # Convert all column names to lowercase
+    return df
 
 def extract_observation():
     """Extract observation table of an OMOP CDM Database."""

From 35cdd473c17f02398e3f3bc6a811550ab0e99fea Mon Sep 17 00:00:00 2001
From: ShreyParikh07 <shrey.parikh@helmholtz-munich.de>
Date: Thu, 10 Oct 2024 14:50:32 +0200
Subject: [PATCH 02/15] further implementation

---
 docs/notebooks/test_more_datasets_omop.ipynb | 256 +++++++++++++++++--
 src/ehrdata/io/omop/omop.py                  | 236 +++++++++++------
 2 files changed, 390 insertions(+), 102 deletions(-)

diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb
index 3cc16a8..3ac69ad 100644
--- a/docs/notebooks/test_more_datasets_omop.ipynb
+++ b/docs/notebooks/test_more_datasets_omop.ipynb
@@ -39,24 +39,23 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "Load the mimic dataset"
+    "define the function"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
+   "execution_count": 82,
    "metadata": {},
    "outputs": [],
    "source": [
-    "def load_and_check(dummy_func, start_time):\n",
+    "def load_and_check(dummy_func, start_time, tables):\n",
     "    con = duckdb.connect()\n",
     "    dummy_func(backend_handle=con)\n",
     "    edata = ed.io.omop.setup_obs(con, \"person_observation_period\")\n",
-    "    edata\n",
     "    edata = ed.io.omop.setup_variables(\n",
     "        backend_handle=con,\n",
     "        edata=edata,\n",
-    "        tables=[\"measurement\"],\n",
+    "        tables=tables,\n",
     "        start_time=start_time,\n",
     "        interval_length_number=28,\n",
     "        interval_length_unit=\"day\",\n",
@@ -76,7 +75,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 98,
    "metadata": {},
    "outputs": [
     {
@@ -89,12 +88,12 @@
     }
    ],
    "source": [
-    "edata_mimic = load_and_check(mimic_iv_omop, \"observation_period_start_date\")"
+    "edata_mimic = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [\"measurement\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
+   "execution_count": 99,
    "metadata": {},
    "outputs": [
     {
@@ -105,7 +104,7 @@
        "             shape of .r: ((100, 450, 320)) "
       ]
      },
-     "execution_count": 13,
+     "execution_count": 99,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -114,6 +113,30 @@
     "edata_mimic"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 100,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKUAAAAUCAYAAADsvf0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGXklEQVR4nO2ae6xdRRXGf30kAgVLUqSN+ACR2xAkFMWADypXsMpDA4IhIYVSIpEAXpBWDdX69cNUagKF8hAJNS0qCZBUIPJ+NUB5iGIJGkBebREDVKlUiq2l5fLHmt3uO+zTe+45u/fU5n7JyWSvmTUz35w1s/aatYf19vYyhCFsSxje6QkMYQg5RlYJbV8LHAnsJentwZ3SELZ32P4M8CfgdEnz8/phufu2/VngD8B0SXNL8hOALwETgAOAXYDrJE3uZwIfAS4AvgaMAV4FbgYs6d916dQJ25OB36THyoWzvRz4eIMuXpc0rkHfg8rN9s+Bg4AuYDdgLbAijXmFpDey9mOA44Cjgf2BPYD1wF+ABcACSe82GKtpbrZvAg4B9pG0plxX5b5nA/8BrsrkPwbOJozyH5Ur8P5J7g08AUwFHgcuAV4CzgEeTQvQtk6dsP1R4ApgTX9tgdWAK34XNei7E9y+B4wC7gHmAdcBG4BZwFOJbxnfAq4BDiYOp0uBRcCngPnAjbaH5YO0wO1CYBzQk/fVx33b7gKOAOZLWltB7hXgBeLEXFy5BH3xC2B3oEfS5aVx5qb+ZgNn1KBTC9JiLwDeAH4HTO9H5U1JswYwRCe4fVDSulxoezYwAzgfOLNU9RzwDeC28oloewZhbMcD3yQMtYwBcZP0uO1nge/YnlMeKz8pTwOGATfkJCQtlvS8pKbC9bRzJgHLgSvz7oC3gZNtj2pHp2b0AF8mdnut79Kd4lZlkAk3pnKfrP39kn6fu2hJrwG/TI+Hleva4HY98DHgK2VhbpRHABuBxxoQGQi6U3l3BcG3gIeBnYj3inZ0aoHtfYE5wDxJDzap9gHbk23PsH2O7W7bIxq07Ri3Bvh6Kp8agM47qdyQyVvl9nAq+xjlJvedrHgC8ExNEff4VD7XoP55Ynd1Afe1odM2bI8kApuXCZfWLMaxOSAqsMz2VEkPZPKOcCtgezqwMzCaCHy+SBjknCb1RwKnpMc7s+pWuf0xlRPLjcsn5R7ACCJiqgOjU7m6QX0h37VNnTrwE+BA4NSKd+lGWAAcThjmKCJSvRrYE7jD9gFZ+05xKzCdcKXnEgZ5JzBJ0j+b1J9DBDu3S7orq2uJm6TVwDrChW9COdApoqOtfuWyLcH2wcTpeLGkR5vVk+RM9FfgDNtrgGlEdHtcXfNsF8UVle2xwOcJI1tq+xhJf96Sru0egtOzwMk1T20VMLYsKJ+UxQmxQ02DFbtjdIP6Qv5mmzotI7mkXxNuZ2YdfbI5GJiYyQeVWyNIel3STYQ7HUPwbwjbZxNXSU8D3ZJWVTRrh9uObLY9oK9RrkxlXXdlf0tlV4P6Iuorv4e0otMOdk5j7Quss91b/AhXB3BNkl3aZJ+FO8wjzcHmtkVIWkEY2n62d6tqY/tc4HLCC3SnCLwKLXGzPZxw6SvL8rJRvkos6HjqQXGPOSkNXp7MLsAXgP/SN9JvRacd/A/4VYPf0tRmSXpu1rUXEeZLmXywuTWDD6dyY15h+4fEBfiThEGuzNuU0Cq38cQV5JNl4aZ3Skm9th8Ejrf9SUkv9ENoi5D0ou27CTdxFrHjNs2VOEmuLkf6rejYXghMAaZKWjjAOa4Fvl1VZ3sWEfxcm6cZ0/XRy/kthe09iWwQwG+zsQaVW9LvIlKeqzP5cOCnxGX3IxUpwJlEuvAJIhiqctltcUsoNnCfREz+QcYi4sb+q0TmpjzRY4Fj02OR1/1cWjiAf0nKMyBnAo8Al9k+HHiGSF91E0f5jyo4DlSn2Jn53dnWxInAtLSJVwBvAXsT+eIdgNu
pTjUONrejgAttLwGWEZmqsURG7hPAa8DpZQXbUwiD3Ag8BPTYeUzH8opN0sp/PSmNc0tZmF+eLyL8+ym8HxOIXTuFMFoSsUJ2Qq4g6UXiTmxhmuA04s+bBxySfwzQos7+hFHcVjHnrYXFwK1pXicB5xF/9BJiLY6RtD5X6gC3e4lXjw8RqcHvE4fOKuIE20/S05nOXqkcQVwfqeJ3arvcbI8mDrlbJf29XFf1ldD5wM+AT0tayjYM27sSu/9iST/o8HRqxfbMDcD2d4HLgEMlLSnXVX0ldAmR2bhgEObWLg4lUl9z+2v4f4jtlpvtHYkPQRblBgkVJ2VSmki8C1w09JHvEOpGChRPBBZKWp7XvwdACvWbXD4BcQAAAABJRU5ErkJggg==",
+      "text/latex": [
+       "$\\displaystyle \\left( 100, \\  450, \\  320\\right)$"
+      ],
+      "text/plain": [
+       "(100, 450, 320)"
+      ]
+     },
+     "execution_count": 100,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_mimic.r.shape"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -136,12 +159,12 @@
     }
    ],
    "source": [
-    "edata_gibleed = load_and_check(gibleed_omop, \"observation_period\")"
+    "edata_gibleed = load_and_check(gibleed_omop, \"observation_period\", [\"measurement\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
+   "execution_count": 102,
    "metadata": {},
    "outputs": [
     {
@@ -152,7 +175,7 @@
        "             shape of .r: ((2694, 55, 1441)) "
       ]
      },
-     "execution_count": 11,
+     "execution_count": 102,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -161,6 +184,30 @@
     "edata_gibleed"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": 103,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAALAAAAAUCAYAAAAtOremAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGpUlEQVR4nO2aeaxdUxTGfx1UmxalMUXUPNRYRLWkLTqIGKKGVKQUMSWookUbrH4S1FStEEMrbZUg1NSBEho1F6kg5uiglNfilddBaZ8/1j7teeed+9695953nsb7kpt9z9p77bP22muvs/bau1VtbS0taMHmitbNLUALWlAO2qYRJU0FTgT2MLNV+YrUghbUh6QjgI+Ai81sUkRvlQwhJB0JfACMMLNxgdYFGAScBBwM7AKsAz4DJgOTzWxDAy/vB1wB9AK2BX4NvBPMbHaibSvgovA7EGgFfAlMAh5p6D2BfwgwLTzWGWwlIWkRsFuB6l/MbKdK8JQLSWcCfYHuwKHAVsATZjakhD5K1mljPFnkkvQ80BPYx8xqIN0D3wr8ATwYo50VnpcBc4ElwI7A6bhhnSjpLDOrF1BLuhMYCSwFXgJWANsDRwDHArMTLI8D5wBVwJPAamBAeP/RwHkNDHBX4H6gBuhUqF0FsRIYn0KvqTBPObgRN5AafA72L4U5i06L5Mki1+24cx0G3AYJA5a0L9AfmGRma2JV3wCnArPiHlDSaGA+cAZuzNMT/V2MG+9U4BIzW5eo3yLxPAg33oVADzNbEejtQt/nSnrBzJ5Ljix47sm4d38OGNG4PspGtZmNyYGnHFyNG8h3uMebWyxjFp2WwFOyXGY2X9JXwKWSxprZhqQHvhD/ZD+dYHyjQIc/S3oI99rHEjNgSVsG+hJSjDfw/50gDQrlPZHxhnbrJN0EnIyHIvUMGF+Vxwc5jk+T9/8IM9toGJJKZc+i06J4ypDrKWAM/lWekzTg/sB64P0SOoyM8J8EfQAeKowHNkg6CTgIWAvMN7P3UvqKYsDvU+oiWm9J7eILQlI3YCweU8+TlJcBbxliva7AKuBTYJ6Zra8wT+7IotOc5uGdUA4A5mxMo0nqiAfUXxabeZDUlk0x6SuJ6iNDuRZYAMzEBzceeFfSm5K2T/BEXnePlNftGcq2sf+RDNNwTz+6GLkriJ3Cu2/Fx/UG8K2kvhXmyRVZdJrjPHwYyj5QNw+8C9AG36gVi7G4V51tZnMSdTuEciRQC/TGd5qHAK8GAZ5J8MwK5TWStouIIVaOf2e2jf2/GTgMOD8Rtzc1JgP9cIPsiGdnHgZ2B16WdGiFeJoDWXSayzyY2UrcKXaFupu4LqH8vZiOJA0DrgW+As5NaRItjn+AU81sUXj+LGzWvgb6SuoVCyeeCn2dAHwh6cUgbH9gZ3x1dwU2BBmOwlf7PQVCkiaDmSUDt8+ByyTV4HoZw6aYPjNP3sii02aYh9/wLFgdDxytmvaNcUu6ApgAfAEcZ2a/pTSrDuWCmPECYGargchj94jR1wOnADcAy4Gh4fctnkL7MzStCp+sx/AMyU2NyZwjHgplnybmqTiy6LSZ5qEDwV7jHrgqlF3qNY9B0nDgXtx79DOzqgJNvw5ldYH6yNN3iBNDZuKO8Iu/tz2wD7DCzBZK6gzsG6rXFtjJTpQ0Ed9UDC8gR6WxPJQdm5inKdCJ0nWahSczJLUGOuOp1joGvAxX5H4NMF+Px72fAAPiqa4UvI7HvgdIap1ygnZQKBcWKfvZQDv8cAPgL+DRAm0Px+Oxt/GFlGd40TOUaZmUSvI0BbLoNO952A9P9X4CMQM2s1pJ84AzJO1tZt/FuUIe9hbgY2BggbBhI8xssaQZ+AHIVbjXjvoaiMe51SSyF5K2NrM/ErTuwF241x4b+l+DHzfXg6QxuOKmph17SpqChyYXmNmUhsZRoP9uwJJktkbS7vgJFPiJYrk8ZclZKrLotJx5yIhosc+F+kf
J0/FTtRPwE5JIkKG48a4H3gKGpXwqFqUo+XJ8AONCHngBniI7LfR1UdhVxvGapDV4iPIn0A2/g7EGOMXMfip+rAUR32BmwWDg2rDgF+Ny7oXL2R4/Hr+7Ajzlyomk03B9w6Y8e6+wOMBDsjxOLeugDLkG4rbzIqQbcBWe230gRo/ysm2A4QVkehOYEieY2dJwi+hm3BP3we9ZzABuN7P5Kf08i4cLQ/D4+EfgkdB+aYF3l4qDcQOa1VjDApiLf8oOA47BY9dq/FM5DZiWci8kC0+5coLn9ocmaHuyKZe+mHyO3ZPoTolySdoGN/qZZvYDpN9GG4VflDjczBZUWurmRtj8/YqnfK5rZnEKYnORM09IuhK4D+htZm9D+oX2e/F86y05ypYneuPH3+OaW5BGsLnImQskdQBGAdMj44UUDxwa9wGOA+5uudDegv8CwiZ4MDAlfq7wLxytAalq0cnyAAAAAElFTkSuQmCC",
+      "text/latex": [
+       "$\\displaystyle \\left( 2694, \\  55, \\  1441\\right)$"
+      ],
+      "text/plain": [
+       "(2694, 55, 1441)"
+      ]
+     },
+     "execution_count": 103,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_gibleed.r.shape"
+   ]
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -170,7 +217,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
+   "execution_count": 101,
    "metadata": {},
    "outputs": [
     {
@@ -183,12 +230,12 @@
     }
    ],
    "source": [
-    "edata_synthea27nj = load_and_check(synthea27nj_omop, \"observation_period\")"
+    "edata_synthea27nj = load_and_check(synthea27nj_omop, \"observation_period\", [\"measurement\"])"
    ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
+   "execution_count": 113,
    "metadata": {},
    "outputs": [
     {
@@ -199,7 +246,7 @@
        "             shape of .r: ((28, 132, 866)) "
       ]
      },
-     "execution_count": 12,
+     "execution_count": 113,
      "metadata": {},
      "output_type": "execute_result"
     }
@@ -210,10 +257,181 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 104,
    "metadata": {},
-   "outputs": [],
-   "source": []
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAJcAAAAUCAYAAACAu68PAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGVElEQVR4nO3af+zWVRUH8BdqM6LSBhmLZZMUp6khpukKxIk6tczMVn9Utha6laEmmrrqdNwMaIk/qtUWm1ixlek0EYY/kmVoSSlOm1o208SJv638NRPoj3uf9vDheeD7fT78MMZ7e3b2ufee++P9nM+55577GbF27VrbsR2bAzts7Qlsx7aLnXoVZuaVOBZ7RMRLW3ZK2/H/hsw8CH/C9IiY1ykf0dwWM/Ng3ImZETG3lo3GJ3A89sc4vIb7cAWuiIg1fQY+HmdgX4zGE7gLcyPi95twjZ3xTsbhmIgP4G1YEBGf3YDOHHwQEzAGr+BRXIcfRMSzjfYD89EWg/KZmUfidByGd+DZOt/LImJxW53MvBaHYq+IeJHe2+JF+Bd+1FX2KfwEH1IM71Jcg/0wD1dl5ogek5uDGzAJS3AZ7sbHcXtm9v3DW+AbCiET8fgQdc7CKNxc57gAr+PbuDcz39NoPxAfbTEon5n5XdyivEDX42IswjsxdRPpzMJYzOgUrLMtZuYETMO8iHilq+qvOAGLut/IzLwAy/FJnKQQ3Kkbi5l4EgdExFNddUfgVlyIn/daXAuchZX4m+LBlg5B5+0R8WqzMDMvwgU4H1/uqho2H20xKJ+ZOR3n4EqcGhGvNerf1GOsYetExPLMfBCnZebsiFjT9FxfxAj8sqF4a0QsbLr6iFiFH9fHqY2+3qt4xju7iah6S/Fv5S3YpIiIpRHxUEQM+Rjcy7Aqrqpyr0b7Qfhoi2HzmZk7KzvRP/Qwkqr7n7Y6XfgFdsdRrB/QT8Nq/KGPci90Bnq9Uf6QEocckpljIuKZrgVMUWKh64YxztbAx6q8dxg6/fhoi0H4PEoxuEuxpsZr++FVLO8Tow2i08HtXX3c+D/jysxRSpzywFBPiJm5Ez5fH5d010XEc5n5dczF/Zl5nRIQvk/ZUm7GaUMZZ0shM2firdhFiTU+ohjW7CHq9+WjLQbk8+AqX8UKxUi653sbTo6Ip1vqdPDHKqewbkA/Djsqp4+hYnYdfHFE3NisjIhLldhjJ0zHeUow/BjmN937GwAzEThTMawlOLoPkb2wQT7aYgA+d6vyHKzFZMXDHYCbFCP41SbQ6czvn4pR7s66xjW6yueHstDMnIGz8SA+16fNubga85U3bBQOwsNYUE8kbxhExNiIGKGcek7CeKzIzEkb0x0KH20xAJ+d//d1nBARyyLixYi4T0mlrMThmXlYS51uPKekc9Yxrs7p8M1DWOTpyjH4fhwREc/1aDMVc3B9RHwtIh6OiJcj4u46ycdxdmaO39h4WxoR8WREXIujlZfupxtqPxQ+2mJAPl+ockVEPNLdX0S8jI53PaSlTjdGqrbUbVwdlzp6veZdyMwz8X38WSFyVZ+mH61yvVRAneTyOv6BGxpvayIiHlUM5v2ZOaZXm2Hw0RaD8PmXKl/o02dnlxrZUgdk5g7YVbWlbuN6Ak9j7z6dqgHlJbhHIXJDMdPOVfZLN3TK1zvqvsHw7ipXNyuGyUdbDMLnb5S4ad/6xzfRCdb/3lKng72VVNY9dBlXzQvdhjGZuWdTKzO/qQSsd+HI7qNwH/yuylMzc1yjr2PxYSX4u6NRNz8z12bmFzbS/yZBZk7IzF16lO9Qk6i74Y6IeL5RP1w+2q5t2HxWz7tQCbDPaOgcjWMUD7WkjU4XDq1yKevnua5RssvHKBnuTqenKNnf1XWRMzKz2fEjETG/6/lq5fpgGh6od0+rsI/i4kfgvOa9nXUDymEjM0/EifVxbJWHZWZnbs9ExMwuleMwKzOXKW/js3iXkt0fX+c8vTH
GIHzQbm2D8vkVZaucW3NWK7CHwtFqfKme8trqUGLU1fg1vY3rKSVX88Ou8j2q3FE5pvfCb5VTDIiINZl5XJ3oZ5Sg8y3KaWIxLo+Im3r0s7+SbV7UZ5yNYSJOaZSNrz/KhXS3cd2CPZXUw4FKzPCScsXzszrPZoA+bD4qBl7boHxGxMr61cK3lHzYFOXueCFmRcTyTaFTvf+JuCEiHqP3VxHn4zuYFBErhktCG2TmrornuDgizt2SY29ubMtrg8z8Ki7H5IhYRu+vIi5R7pUu3IJz62Cycn0ydyuMvbmxza4tM0cql/vXdAyLHp6rNp6CI/C97R8LbsfGkJn74NPKLcEjnfL/AjQSP4HwDZy2AAAAAElFTkSuQmCC",
+      "text/latex": [
+       "$\\displaystyle \\left( 28, \\  132, \\  866\\right)$"
+      ],
+      "text/plain": [
+       "(28, 132, 866)"
+      ]
+     },
+     "execution_count": 104,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_synthea27nj.r.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# check by loading the data with observation.csv"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "mimic dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 105,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
+      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "edata_mimic_obs = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [\"observation\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 106,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKUAAAAUCAYAAADsvf0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAF1klEQVR4nO2aa4hVVRTHf6NC2mQGikr28DkiJY5l9FQbtKnsgfagL5oZSGI2Gg6FViz/gWlgmo9eJDg1CSWIRWlmD9HUyjLFIk1Txx6ok5qmppQ2fdjn6pk954733nPwTsP84bLvWWuvs9Y6e52991r7FNTU1NCEJjQkNMu3AU1ogo8WUURJbwC3A13M7Ni5NakJjR2Srga+AUab2XyfX+Av35KuAb4Cys1sZoh+HzAQKAb6AK2BhWY2/CwGXAI8C9wGtAX2AO8CMrM/kpKJg1x8k1QFXJ6Gvc/MOiahJy4kPQ/0A4qAdsBxYDfuec4zswNe/7bAMOAOoDfQCfgb+A5YACwws3/T6Mp43CQtAa4DepjZ0TAvavmeCvwJvOLRnwbG4R7ob5FPoK6R3YANwChgPTAL2AmMB74IHkBsmQSQtW8BDgOK+M1IWE8cPA4UAh8Ds4GFwElgCrBZ0qVe//uB14FrcZPTi8Bi4EpgPrBIUoGvJIdxmwZ0BMr8e9VaviUVAYOB+WZ2PMK5X4GfcG/7yshHUBsvA+2BMjObG9IzM7jfVGBMAjJxkYtvAIfMbMo50BMHF5rZCZ8oaSowGZgEjA2xtgF3A0vDM6Kkybhguxe4BxeoYWQ1bma2XtJW4BFJ08O6/JnyYaAAeMd3wsxWmtl2M8soXQ/enFKgCnjJvx1wDBghqTCOTBLI1reGrsfTWScgAywK2h5e/8/M7H1/iTazvcCrweXNYV6McXsbuAy4JUz0g3IwcAr4Mo0j2aAkaFdEOHgEWAucj9tXxJHJJ86TNFzSZEnjJZVIap5vozLEXUG7OQuZf4L2pEfPddzWBm2toDy9fAdRXAxsSSjj7hm029Lwt+PeriLg0xgy+URHoNKj7ZI0ysxW5cOgdJBUDlwAtMElPjfhAnJ6hvItgAeDy+UeO9dx+zpoB4Q7h2fKTkBzXMaUBNoE7eE0/BT9opgy+cICYBAuMAtxmeprQGfgQ0l98mdaJMpxS+kEXEAuB0rN7PcM5afjkp1lZvaRx8tp3MzsMHACt4SfRjjRSWVHiZdcGiPMTB7pe2CMpKPARFx2O+xc25UOqRKVpA7ADbgg2yjpTjP7tj5ZSWU4n7YCIxI27SDQIUwIz5SpbLtlQspSb0ebNPwU/VBMmYaGVDIwoN5eeYKZ7TOzJbjltC3wZn39JY3DlZJ+AErM7GBEtzjj1oozsQfUDsrqoE2qDvhj0Bal4aeyvvA+JBeZhobUcphohSBpmNluXKBdIaldVB9JE4C5uFWgJMjAo5DTuElqhlvSq8P0cFDuwT3QniSDVA2uNFAeNqY1cCPwF7Uz/VxkGhpSGebOvFqRGS4O2lM+Q9KTuAL4JlxAVvt9Qsh13HriSpCbwsTTNwhqZ6uBdpK61+/L2WFmO4AVuI3/ox5buJmkMpzp5yIjqUJSjaSH4tqcKST1iqqVSuoMzAsu30pATyzfJBVJqrOkSmoWFM/bA+sijgCfwe05NwCDzGx/fXpyGbcAqRe41iGC/0HGYlzF/lbcqUPY0KHA0OAyda57vaSK4P9+Myv37jcWWAfMkTQI2II7virBTeVP1XUxa5nUi+XXzjJGDr49AEyUtBp3jnwE6IY7L24JLCPiqDEHPXF9GwJMk7QG2AUcwCUVA4GuwF5gtGfjSNz59Sngc6BM8nM6qsyswqPlMtalgZ73wsSooKzG1aP8ynwxMNKjdQ1+4AanVlCa2Q5J/ThzSD8Et02YTZqPK3KQ6Y0LiqX+vbJAMdn5thK39PTFLU2FuE38GlzdsjLNqU22euL69gnQHVcC6ovbvx3
DBUklMCcicekStM1x5aMorAIqwoRsxy2YwYcCH5jZL2Fe1FdCk4DngKvMbGN6f/MPSRfh3v4XzOyJPJuTKBqzbwCSHgPmAP3NbE2YF/WV0CzgZ1zEN3T0xx19zTxbx/8hGq1vklrhPgRZ7AckRMyUgdAA3F5gRtNHvk1IGpJ64fblFWZW5fP/A5RDnlkqCjNBAAAAAElFTkSuQmCC",
+      "text/latex": [
+       "$\\displaystyle \\left( 100, \\  151, \\  320\\right)$"
+      ],
+      "text/plain": [
+       "(100, 151, 320)"
+      ]
+     },
+     "execution_count": 106,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_mimic_obs.r.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "gibleed dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 96,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Path to data exists, load tables from there: ehrapy_data/GIBleed_dataset\n",
+      "missing tables:  [['cohort_definition']]\n"
+     ]
+    }
+   ],
+   "source": [
+    "edata_gibleed_obs = load_and_check(gibleed_omop, \"observation_period\", [\"observation\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 109,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAALAAAAAUCAYAAAAtOremAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGJUlEQVR4nO2ae4hVVRTGf6Nm2lOTypAkJRUra7J8Bb5fhCmaRhGaFVp/ZGqlmZIuP6GcHmpGUajhmEX20DJTspdkRmWFguGjJLUsY3xkao2ZOv2x99UzZ86duffcmTteuB9c9j37+Z21115n7bV3QVlZGXnkkauoU9sE8sgjE9SLypS0CLgZaGFmf2eXUh55VISkG4DvgNFmtiCRXxB2ISR1AL4BJpjZbJ/XBBgCDADaAc2AY8AmYCGw0MxOVjJ4b2AM0AVoDOz3beea2apQ3QJglP9dDRQAW4AFwLzKxvHthwOL/WO5l60uxJWHpGFAd6AQuA44H3jdzIZXN8fqHDOOTKtqE4eXpHeBzkArMzsC0S7EE8Ah4KVA3m3AfKATTrmfA5YC1+AU6y2veFGDPg18AtwIvA/MAlYCFwM9Ipq8BswDrgDe8P2f4/kUJ3s5P9blwAvAkcrqVQPiyuNx3EIuBH6rYY7VMmYcmabYJg6vmUBTYGwio5wLIak10AdYYGalgaIfgUHAyqBlkTQFWA8MBW7FTWKwv9HARGARcJ+ZHQuVnxV6HgLcCewAOprZPp9f3/c9QtJ7ZrYs/GZeYRbirPsyYEIVwsgEseQBPATsBrbjrM+aGuSY8ZhxZJpGm7R5mdl6SVuB+yUVmdnJsA98L+6T/Wao4WdJOvxD0ss4q92DwIRJOtvn/0KE8vr2/4Wyhvh0VkJ5fb1jkqYCt+BWbQUFxq3KXp5Hryi+1YU48vDlpyZJUk1SrK4x48g0pTYZ8FoCTAf6AqvDLkQf4ATwdRodJpTweCi/L85NWAaclDRA0iRJ4yR1SdJXU5/+HFGWyOvqLfIpSGoLFOF86rVpcK8JJJNHTiGOTLM0D1/6tC8EXAhJ5+L8kS2pRh4k1QPu8o8fhoo7+PQosAHnHwbbrgWGmdneQHbC6raIGK6lT+v5/1sDHBbjLP2UVHjXFKqQR84gjkyzOA/f+rQblN/ENQPqAnvS6KwIp5irzGx1qOwSn04EyoCuuJ3mtcBHnsDboTYrffqwpIsSmd5XDn5nGgf+TwOuB+4O+e21gcrkkUuII9OszIOZ/YUzis2h/CauiU//TKUjSWOBR3CWcERElcTiOA4MMrOd/nmT36xtA7pL6mJmX/myJb6v/sBmScs92T7AZbjV3Rw46Tl0wq32WYE+agUpyCMnEEemtTAPB4BLobwFTqyaBlW1ljQGmAtsBnqa2YGIagd9uiGgvACY2T9AwkJ1DOSfAAYCjwF7gZH+9xNwE3DYVy3xn6xXcRGBqVVxrkmkKI8zHnFkWkvz0BCvr0ELXOLTJhWqByBpPDAH+AHobWYlSapu8+nBJOUJS98wmOkjE0/5X3DcBkArYJ+Z7ZDUCGjti48m2cnOlzQft6kYn4RHRkhDHrmA80hfpnHaxIakOkAjXKi1nALvwVm9NpU0noTz8zYCfYOhrgh8ivN9r5JUJ+JkKrGp25Ei9zuA+rjDDYB/gVeS1G2P88fW4RZSjXzW0pRHLiCOTLM9D21wod6NEFBgMyvzkYGhkq40s+3BVj4OOwP4HuhX1WfSzHZJWoEL+I/DWalEX/1wfu5BQrt1SReY2aFQXiHwDM5qF/n+S3HHzRUgaTpOcIuijj0lFeNck3vMrLiy90iGdOURc4xiMuSZDuLINJN5iInOPl0DFS/zLMWdIvXHnZAkiIzETdYJ4AtgbMSnYmeEkB/AvcBsSQNw4bQWwGDf1yi/qwziY0mluE/yYaAt7s5BKTDQzH5P/V2TIrjBTBtx5SFpMO7d4XTMu4tXVHDuUfDkKiOeMcfMCjL
g1Q8n9+UQrcAluFjmi4H8RFy2LjA+CafPCd1VMLPd/hbRNJwl7oa7Z7ECmGlm6yP6eQfnLgzH+ce/4e5GzDSz3UnGThftcItjZVUVkyCWPHBx9pGhvJacjnHvovzRa6Y844yZLRSSJi9JF+KU/gMz+xWib6NNBp4E2pvZhupmXdvwm7/9uJDPo7VMJylyhWc2IelB4Hmgq5mtg+jbaHNw8dYZWeSWTXTFHffOrm0iVSBXeGYFkhoCk4GlCeWFCAvsK3cDegLP5i+053EmwN+zuB0oDp4r/A+xAOOVibavLgAAAABJRU5ErkJggg==",
+      "text/latex": [
+       "$\\displaystyle \\left( 2694, \\  21, \\  1441\\right)$"
+      ],
+      "text/plain": [
+       "(2694, 21, 1441)"
+      ]
+     },
+     "execution_count": 109,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_gibleed_obs.r.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "synthea27nj dataset"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 111,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Path to data exists, load tables from there: ehrapy_data/Synthea27Nj\n",
+      "missing tables:  []\n"
+     ]
+    }
+   ],
+   "source": [
+    "edata_synteha27nj_obs = load_and_check(synthea27nj_omop, \"observation_period\", [\"observation\"])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 112,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAIoAAAAUCAYAAABS66VXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGPklEQVR4nO3af+zVVRkH8BdCA8ZaNqzY2FAiM0zMNA0zEAbKTGdRmVsztRbYlqH5K21rj49bpU0Rqq00HLhq6xfL/AX2QxZpJaWwbP6oppg60ZKw1JwJ9Mf5fNiH+733y/fe+wVy8d7uzj7POc85z3nO+Ty/PnfE9u3b7cM+7Ar77W0B9uHVgVHtiJl5I07C5Ih4Yc+KtA97E5l5FH6PBRGxrKaPaHU9mXk07sFFEbG4oo3HfJyMaZiIl3E/lmN5RGzrsPDJOA+HYjyewr1YHBG/GcY9ysyzK3kGw7aIGNnCtxEHdhj/dERM6F+69uhFP5k5B+fiWLwezypnsTQibu+XJzN/jOk4OCKep73r+SL+iW80aKfhW3i3comWYCUOwzL8IDNHtBHuKtyKI7EaS3Ef3o+7M/OMdpvqAxuQHX53VmNWdeB9rgPf1cMs4w70op/M/Ap+jnfhZlyD2/AGzOqwTrc8X8YELKoJO7mezHwr5mJZRPy70fUnnIrbmpYjMz+PdfgQPqhcnrpvAi7C0zg8Ip5p9M1WDu4KfKfd5npBRGxQLssAZGb9dl7fgX1LRFw+XLLsCr3oJzMX4GLciIUR8XLLnK9ps07XPBGxLjMfwjmZeWVEbGu1KJ/ACHy/hfHOiLil1b1ExCZ8s3qc1TLXgYrFuqephIpvDf6l3OjdjsycppjSJ5U36X8BXeknM0cr1v6v2hx4xfef5nMvPA18D5NwAgOD2bnYit923t8A1Au90kL/sxLHHJOZB0TE3xsbmInX4qYu1ukHC6v2hojY2mHM6MrUT8IL+APWDjK+X3SrnxOUi7ME26rY5jC8hHUd4pleeGrc3Zjjjh0XJTPH4Qg8ONRMJzNH4czqcXWzLyI2Z+bnsBgPZOZNSgA1RXFjP8M5Q1mnH2TmWJyhvADLBhk6Ad9uoT2amR+PiF8Ot1w96Ofoqn0J65UD34HMXIsPR8Tf+uSp8buqncnOwexEjFSi7qHiymrx2yPijtbOiFiixC6jsACXKoHx41jRanJ3Ez6C/bE6Ih7vMGY55iiXZZyS2V2Hg7AqM9+xOwTrUj9vrNqLsR0zFKtzOH6qHOgPW5bohaeW7Tnlgk1iZ9czvmr/MZRNZuYiXIiH8LEOYy7Bl/BVfB2b8DYlqv5uZh4REZcMZb0+ULud6zoNiIhsIf0Rn8rM55U9Xq6UB4YVXeqnfqlfwakRsbF6vj8z5+NhHJ+ZxzZcSi88TWzGm5oTQZ3ljBnCBs9VUrkHMDsiNrcZMwtX4eaIuCAiHomIFyPiPkXpT+LCzHzzrtbrFZn5drwHT6BtfWEXqAP1mcMmVIUe9LOlatc3DhxExIuoLfoxja5eeJoYq7oXzYtSm7nxA4Y3kJnn42vKWze7ynza4ZSqXdPaUQm5rlr/nYOt1yeGEsQOhtp3jxsmeZroVj8PV+2WDvPVnmBsg9YLD8jM/RSX/Qw7X5SnFMUc0mFSVfB1rVKrmL2LGGN01XZKgWv6gJRtOJCZYxSXuBU39DjN9Kp9ZFiE2hnd6ucXSpxxaHWIragD1UcbtF54ahyilEo20LgoEbEda3FAZr6llSszv6AEr/diTjOd64BfVe3CzJzYMtdJOE4Jln7d0rciM7dX5fh+cJpSql41SBArM6dWGV8r/SAlbqBNUXAY5OxKPxHxGG5RgsvzWsafiHmK5diRffbC00D9kqxhYB1lpVJlnYe/NCY9S6kSbq02uCizNf6zMSJWNJ5/pJSN5+LB6vvBJkxVzO4IXBoRz7bM0wzA+kHtdjpVYmucrsQCa/GYUuiaonzXGqPENu3K+P3
K2Yt+Pq24osVVTWQ9JuMDytl8sspW9MkDJ1b9P2lutsZKxSed2UKfXLUjcT6ize/sJkNVxX0fPqsEvfOVDGK6ovx5EbG0jYDTlMPquYKamVPxXkMLYtco31um4KO4AMfjLpyFU9pVNPuVsxf9RMQTOEqxdAcrVmKWYjWOi4iVWtALT2a+TrlIt9bWuN3X48uUlO3IiFjfvQp6R2burxSdrtkDaXPPeLXI2Ssy8zNKyj4jIu6i/dfja5VvA1fsQdlqzFA+CSzeC2t3g1eLnF2jqmRfhpX1JaGNRakGz8RsXL3vj0v/X6jc9ulKZXhjTf8vHQfy/YibeYEAAAAASUVORK5CYII=",
+      "text/latex": [
+       "$\\displaystyle \\left( 28, \\  75, \\  866\\right)$"
+      ],
+      "text/plain": [
+       "(28, 75, 866)"
+      ]
+     },
+     "execution_count": 112,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "edata_synteha27nj_obs.r.shape"
+   ]
   }
  ],
  "metadata": {
diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index d30292a..e56a174 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -8,7 +8,7 @@
 import duckdb
 import numpy as np
 import pandas as pd
-
+from ehrdata import EHRData
 
 def _check_sanity_of_folder(folder_path: str | Path):
     pass
@@ -54,7 +54,6 @@ def setup_obs(
 
     return EHRData(obs=obs)
 
-
 def setup_variables(
     backend_handle: Literal[str, duckdb, Path],
     edata,
@@ -73,8 +72,6 @@ def setup_variables(
     """Setup the variables.
 
     This function sets up the variables for the EHRData project.
-    For this, a selection of tables from the OMOP CDM which represents the variables should be selected.
-    The tables can be measurement, observation, procedure_occurrence, specimen, device_exposure, drug_exposure, or note.
 
     Parameters
     ----------
@@ -85,59 +82,67 @@ def setup_variables(
     tables
         The tables to be used.
     start_time
-        Starting time for values to be included. Can be 'observation_period' start, which takes the 'observation_period_start' value from obs, or a specific Timestamp.
+        Starting time for values to be included.
     interval_length_number
         Numeric value of the length of one interval.
     interval_length_unit
-        Unit belonging to the interval length. See the units of `pandas.to_timedelta <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_timedelta.html>`_
+        Unit belonging to the interval length.
     num_intervals
-        Numer of intervals
+        Number of intervals.
+    concept_ids
+        Concept IDs to filter on or 'all'.
+    aggregation_strategy
+        Strategy to use when aggregating data within intervals.
 
     Returns
     -------
     An EHRData object with populated .var field.
     """
-    from ehrdata import EHRData
+    # Mapping of table names to extraction functions and concept ID column names
+    table_info = {
+        "measurement": {"extract_func": extract_measurement, "concept_id_col": "measurement_concept_id"},
+        "observation": {"extract_func": extract_observation, "concept_id_col": "observation_concept_id"},
+        "procedure_occurrence": {"extract_func": extract_procedure_occurrence, "concept_id_col": "procedure_concept_id"},
+        "specimen": {"extract_func": extract_specimen, "concept_id_col": "specimen_concept_id"},
+        "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"},
+        "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"},
+        "note": {"extract_func": extract_note, "concept_id_col": "note_concept_id"},
+    }
 
     concept_ids_present_list = []
     time_interval_tables = []
+
     for table in tables:
-        if table == "measurement":
-            concept_ids_present_df = normalize_column_names(
-                backend_handle.sql("SELECT * FROM measurement").df()
-            )
-            concept_ids_present = concept_ids_present_df["measurement_concept_id"].unique()
-            extracted_awkward = extract_measurement(backend_handle)
-            time_interval_table = get_time_interval_table(
-                backend_handle,
-                extracted_awkward,
-                edata.obs,
-                start_time="observation_period_start",
-                interval_length_number=interval_length_number,
-                interval_length_unit=interval_length_unit,
-                num_intervals=num_intervals,
-                concept_ids=concept_ids,
-                aggregation_strategy=aggregation_strategy,
-            )
-        # TODO: implement the following
-        # elif table == "observation":
-        #     var = extract_observation(backend_handle)
-        # elif table == "procedure_occurrence":
-        #     var = extract_procedure_occurrence(backend_handle)
-        # elif table == "specimen":
-        #     var = extract_specimen(backend_handle)
-        # elif table == "device_exposure":
-        #     var = extract_device_exposure(backend_handle)
-        # elif table == "drug_exposure":
-        #     var = extract_drug_exposure(backend_handle)
-        # elif table == "note":
-        #     var = extract_note(backend_handle)
-        else:
+        if table not in table_info:
             raise ValueError(
                 "tables must be a sequence of 'measurement', 'observation', 'procedure_occurrence', 'specimen', 'device_exposure', 'drug_exposure', or 'note'."
             )
+
+        # Get extract function and concept_id column for the table
+        extract_func = table_info[table]["extract_func"]
+        concept_id_col = table_info[table]["concept_id_col"]
+        concept_ids_present_df = normalize_column_names(backend_handle.sql(f"SELECT * FROM {table}").df())
+        concept_ids_present = concept_ids_present_df[concept_id_col].unique()
+        extracted_awkward = extract_func(backend_handle)
+
+        # Create the time interval table
+        time_interval_table = get_time_interval_table(
+            backend_handle,
+            extracted_awkward,
+            edata.obs,
+            start_time="observation_period_start",
+            interval_length_number=interval_length_number,
+            interval_length_unit=interval_length_unit,
+            num_intervals=num_intervals,
+            concept_ids=concept_ids,
+            aggregation_strategy=aggregation_strategy,
+        )
+
+        # Append 
         concept_ids_present_list.append(concept_ids_present)
         time_interval_tables.append(time_interval_table)
+
+    # Combine time interval tables
     if len(time_interval_tables) > 1:
         time_interval_table = np.concatenate([time_interval_table, time_interval_table], axis=1)
         concept_ids_present = pd.concat(concept_ids_present_list)
@@ -145,12 +150,11 @@ def setup_variables(
         time_interval_table = time_interval_tables[0]
         concept_ids_present = concept_ids_present_list[0]
 
-    # TODO: copy other fields too. or other way? is is somewhat scverse-y by taking and returing anndata object...
+    # Update edata with the new variables
     edata = EHRData(r=time_interval_table, obs=edata.obs, var=concept_ids_present)
 
     return edata
 
-
 def load(
     backend_handle: Literal[str, duckdb, Path],
     # folder_path: str,
@@ -189,56 +193,127 @@ def extract_person_observation_period(duckdb_instance):
         "
     ).df())
 
+def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str):
+    """
+    Generalized extraction function to extract data from an OMOP CDM table.
 
-def extract_measurement(duckdb_instance=None):
-    """Extract measurement table of an OMOP CDM Database."""
-    measurement_table = duckdb_instance.sql("SELECT * FROM measurement").df()
-    measurement_table = normalize_column_names(measurement_table)
-    # get an array n_person x n_features x 2, one for value, one for time
-    person_id_df = (
-        duckdb_instance.sql("SELECT * FROM person").df()
-    )  # TODO: in anndata? w.r.t database? for now this
-    person_id_df = normalize_column_names(person_id_df)
-    person_id = person_id_df["person_id"].unique()
-    # person_id = (
-    #     duckdb_instance.sql("SELECT * FROM person").df()["person_id"].unique()
-    # )  # TODO: in anndata? w.r.t database? for now this
-    features = measurement_table["measurement_concept_id"].unique()
-    person_collection = []
+    Parameters
+    ----------
+    duckdb_instance: duckdb.DuckDB
+        The DuckDB instance for querying the database.
+    table_name: str
+        The name of the table to extract data from (e.g., "measurement", "observation").
+    concept_id_col: str
+        The name of the column that contains the concept IDs (e.g., "measurement_concept_id").
+    value_col: str
+        The name of the column that contains the values (e.g., "value_as_number").
+    timestamp_col: str
+        The name of the column that contains the timestamps (e.g., "measurement_datetime").
 
-    for person in person_id:
-        person_as_list = []
-        person_measurements = measurement_table[
-            measurement_table["person_id"] == person
-        ]  # or ofc sql in rdbms - lazy, on disk, first step towards huge memory reduction of this prototype if only load this selection
-        # person_measurements = person_measurements.sort_values(by="measurement_date")
-        # person_measurements = person_measurements[["measurement_date", "value_as_number"]]
-        # print(person_measurements)
-        for feature in features:
-            person_feature = []
+    Returns
+    -------
+    ak.Array
+        An Awkward Array with the structure: n_person x n_features x 2 (value, time).
+    """
+    # Load the specified table
+    table_df = duckdb_instance.sql(f"SELECT * FROM {table_name}").df()
+    table_df = normalize_column_names(table_df)
 
-            # person_measurements_value = []
-            # person_measurements_timestamp = []
+    # Load the person table to get unique person IDs
+    person_id_df = normalize_column_names(duckdb_instance.sql("SELECT * FROM person").df())
+    person_ids = person_id_df["person_id"].unique()
 
-            person_feature_measurements = person_measurements["measurement_concept_id"] == feature
+    # Get unique features (concept IDs) for the table
+    features = table_df[concept_id_col].unique()
 
-            person_feature_measurements_value = person_measurements[person_feature_measurements][
-                "value_as_number"
-            ]  # again, rdbms/spark backend big time scalable here
-            person_feature_measurements_timestamp = person_measurements[person_feature_measurements][
-                "measurement_datetime"
-            ]
+    # Initialize the collection for all persons
+    person_collection = []
 
-            person_feature.append(person_feature_measurements_value)
-            person_feature.append(person_feature_measurements_timestamp)
+    for person in person_ids:
+        person_as_list = []
+        # Get rows for the current person
+        person_data = table_df[table_df["person_id"] == person]
+
+        # For each feature, get values and timestamps
+        for feature in features:
+            feature_data = person_data[person_data[concept_id_col] == feature]
 
-            person_as_list.append(person_feature)
+            # Extract the values and timestamps
+            feature_values = feature_data[value_col]
+            feature_timestamps = feature_data[timestamp_col]
 
+            # Append values and timestamps for this feature
+            person_as_list.append([feature_values, feature_timestamps])
+
+        # Append this person's data to the collection
         person_collection.append(person_as_list)
 
     return ak.Array(person_collection)
 
 
+def extract_measurement(duckdb_instance):
+    return extract_table(
+        duckdb_instance,
+        table_name="measurement",
+        concept_id_col="measurement_concept_id",
+        value_col="value_as_number",
+        timestamp_col="measurement_datetime"
+    )
+
+def extract_observation(duckdb_instance):
+    return extract_table(
+        duckdb_instance,
+        table_name="observation",
+        concept_id_col="observation_concept_id",
+        value_col="value_as_number",
+        timestamp_col="observation_datetime"
+    )
+
+def extract_procedure_occurrence(duckdb_instance):
+    return extract_table(
+        duckdb_instance,
+        table_name="procedure_occurrence",
+        concept_id_col="procedure_concept_id",
+        value_col="procedure_type_concept_id",  # Assuming `procedure_type_concept_id` is a suitable value field
+        timestamp_col="procedure_datetime"
+    )
+
+def extract_specimen(duckdb_instance):
+    return extract_table(
+        duckdb_instance,
+        table_name="specimen",
+        concept_id_col="specimen_concept_id",
+        value_col="unit_concept_id",  # Assuming `unit_concept_id` is a suitable value field
+        timestamp_col="specimen_datetime"
+    )
+
+def extract_device_exposure(duckdb_instance):
+    return extract_table(
+        duckdb_instance,
+        table_name="device_exposure",
+        concept_id_col="device_concept_id",
+        value_col="device_exposure_type_concept_id",  # Assuming this as value
+        timestamp_col="device_exposure_start_datetime"
+    )
+
+def extract_drug_exposure(duckdb_instance):
+    return extract_table(
+        duckdb_instance,
+        table_name="drug_exposure",
+        concept_id_col="drug_concept_id",
+        value_col="dose_unit_concept_id",  # Assuming `dose_unit_concept_id` as value
+        timestamp_col="drug_exposure_start_datetime"
+    )
+
+def extract_note(duckdb_instance):
+    return extract_table(
+        duckdb_instance,
+        table_name="note",
+        concept_id_col="note_concept_id",
+        value_col="note_class_concept_id",  # Assuming `note_class_concept_id` as value
+        timestamp_col="note_datetime"
+    )
+
 def _get_interval_table_from_awkward_array(
     # self,#person_feature_measurement: ak.Array,
     person_ts: ak.Array,
@@ -373,11 +448,6 @@ def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
     df.columns = map(str.lower, df.columns)  # Convert all column names to lowercase
     return df
 
-def extract_observation():
-    """Extract observation table of an OMOP CDM Database."""
-    pass
-
-
 def extract_procedure_occurrence():
     """Extract procedure_occurrence table of an OMOP CDM Database."""
     pass

From 05cd817d21202d18501d4da2e1c2625e5344afd2 Mon Sep 17 00:00:00 2001
From: ShreyParikh07 <shrey.parikh@helmholtz-munich.de>
Date: Thu, 10 Oct 2024 15:56:12 +0200
Subject: [PATCH 03/15] drug/device_exposure removed

---
 docs/notebooks/test_more_datasets_omop.ipynb | 93 ++++++++++++++++++++
 src/ehrdata/io/omop/omop.py                  | 77 ++++++----------
 2 files changed, 120 insertions(+), 50 deletions(-)

diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb
index 3ac69ad..7eed5e9 100644
--- a/docs/notebooks/test_more_datasets_omop.ipynb
+++ b/docs/notebooks/test_more_datasets_omop.ipynb
@@ -360,6 +360,11 @@
     "edata_gibleed_obs = load_and_check(gibleed_omop, \"observation_period\", [\"observation\"])"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
+  },
   {
    "cell_type": "code",
    "execution_count": 109,
@@ -432,6 +437,94 @@
    "source": [
     "edata_synteha27nj_obs.r.shape"
    ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 126,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "tables = [ \"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\", \"device_exposure\", \"drug_exposure\", \"note\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 122,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "device_exposure\n",
+      "drug_exposure\n"
+     ]
+    }
+   ],
+   "source": [
+    "for table in tables:\n",
+    "    table_ext = table +'.csv'\n",
+    "    path = os.path.join('/Users/shrey.parikh/Desktop/EHR/ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv', table_ext)\n",
+    "    temp = pd.read_csv(path)\n",
+    "    if temp.columns.str.contains('start_date').any():\n",
+    "        print(table)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 136,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# removing drug_exposure and device_exposure because they have start/end date\n",
+    "# note is empty\n",
+    "tables = [ \"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 137,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Processing table: measurement\n",
+      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
+      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n",
+      "Success: measurement processed successfully.\n",
+      "Processing table: observation\n",
+      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
+      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n",
+      "Success: observation processed successfully.\n",
+      "Processing table: procedure_occurrence\n",
+      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
+      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n",
+      "Success: procedure_occurrence processed successfully.\n",
+      "Processing table: specimen\n",
+      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
+      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n",
+      "Success: specimen processed successfully.\n"
+     ]
+    }
+   ],
+   "source": [
+    "for table in tables:\n",
+    "    print(f\"Processing table: {table}\")\n",
+    "    try:\n",
+    "        edata_temp = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [table])        \n",
+    "        print(f\"Success: {table} processed successfully.\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"Error processing table: {table}. Error: {str(e)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
   }
  ],
  "metadata": {
diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index e56a174..d6edf8a 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -59,8 +59,8 @@ def setup_variables(
     edata,
     tables: Sequence[
         Literal[
-            "measurement", "observation", "procedure_occurrence", "specimen", "device_exposure", "drug_exposure", "note"
-        ]
+            "measurement", "observation", "procedure_occurrence", "specimen",  "note"
+        ] 
     ],
     start_time: Literal["observation_period_start"] | pd.Timestamp | str,
     interval_length_number: int,
@@ -104,9 +104,9 @@ def setup_variables(
         "observation": {"extract_func": extract_observation, "concept_id_col": "observation_concept_id"},
         "procedure_occurrence": {"extract_func": extract_procedure_occurrence, "concept_id_col": "procedure_concept_id"},
         "specimen": {"extract_func": extract_specimen, "concept_id_col": "specimen_concept_id"},
-        "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"},
-        "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"},
-        "note": {"extract_func": extract_note, "concept_id_col": "note_concept_id"},
+        # "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"},
+        # "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"},
+        "note": {"extract_func": extract_note, "concept_id_col": "note_type_concept_id"},
     }
 
     concept_ids_present_list = []
@@ -115,7 +115,7 @@ def setup_variables(
     for table in tables:
         if table not in table_info:
             raise ValueError(
-                "tables must be a sequence of 'measurement', 'observation', 'procedure_occurrence', 'specimen', 'device_exposure', 'drug_exposure', or 'note'."
+                "tables must be a sequence of 'measurement', 'observation', 'procedure_occurrence', 'specimen', or 'note'."
             )
 
         # Get extract function and concept_id column for the table
@@ -155,6 +155,8 @@ def setup_variables(
 
     return edata
 
+# DEVICE EXPOSURE and DRUG EXPOSURE NEEDS TO BE IMPLEMENTED BECAUSE THEY CONTAIN START DATE
+
 def load(
     backend_handle: Literal[str, duckdb, Path],
     # folder_path: str,
@@ -288,28 +290,31 @@ def extract_specimen(duckdb_instance):
     )
 
 def extract_device_exposure(duckdb_instance):
-    return extract_table(
-        duckdb_instance,
-        table_name="device_exposure",
-        concept_id_col="device_concept_id",
-        value_col="device_exposure_type_concept_id",  # Assuming this as value
-        timestamp_col="device_exposure_start_datetime"
-    )
+    # return extract_table(
+    #     duckdb_instance,
+    #     table_name="device_exposure",
+    #     concept_id_col="device_concept_id",
+    #     value_col="device_type_concept_id",  # Assuming this as value
+    #     timestamp_col="device_exposure_start_date"
+    # )
+    # NEEDS IMPLEMENTATION
+    return None
 
 def extract_drug_exposure(duckdb_instance):
-    return extract_table(
-        duckdb_instance,
-        table_name="drug_exposure",
-        concept_id_col="drug_concept_id",
-        value_col="dose_unit_concept_id",  # Assuming `dose_unit_concept_id` as value
-        timestamp_col="drug_exposure_start_datetime"
-    )
-
+    # return extract_table(
+    #     duckdb_instance,
+    #     table_name="drug_exposure",
+    #     concept_id_col="drug_concept_id",
+    #     value_col="dose_unit_concept_id",  # Assuming `dose_unit_concept_id` as value
+    #     timestamp_col="drug_exposure_start_datetime"
+    # )
+    # NEEDS IMPLEMENTATION
+    return None
 def extract_note(duckdb_instance):
     return extract_table(
         duckdb_instance,
         table_name="note",
-        concept_id_col="note_concept_id",
+        concept_id_col="note_type_concept_id",
         value_col="note_class_concept_id",  # Assuming `note_class_concept_id` as value
         timestamp_col="note_datetime"
     )
@@ -448,31 +453,3 @@ def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
     df.columns = map(str.lower, df.columns)  # Convert all column names to lowercase
     return df
 
-def extract_procedure_occurrence():
-    """Extract procedure_occurrence table of an OMOP CDM Database."""
-    pass
-
-
-def extract_specimen():
-    """Extract specimen table of an OMOP CDM Database."""
-    pass
-
-
-def extract_device_exposure():
-    """Extract device_exposure table of an OMOP CDM Database."""
-    pass
-
-
-def extract_drug_exposure():
-    """Extract drug_exposure table of an OMOP CDM Database."""
-    pass
-
-
-def extract_condition_occurrence():
-    """Extract condition_occurrence table of an OMOP CDM Database."""
-    pass
-
-
-def extract_note():
-    """Extract note table of an OMOP CDM Database."""
-    pass

From 867222ea155e269bfaf8b3e5b0a8858747b477ca Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 10 Oct 2024 13:56:57 +0000
Subject: [PATCH 04/15] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 docs/notebooks/test_more_datasets_omop.ipynb | 21 +++++----
 src/ehrdata/io/omop/omop.py                  | 49 +++++++++++++-------
 2 files changed, 43 insertions(+), 27 deletions(-)

diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb
index 7eed5e9..6bce569 100644
--- a/docs/notebooks/test_more_datasets_omop.ipynb
+++ b/docs/notebooks/test_more_datasets_omop.ipynb
@@ -17,12 +17,10 @@
    "outputs": [],
    "source": [
     "from ehrdata import EHRData\n",
+    "\n",
     "EHRData().r\n",
-    "import anndata as ad\n",
     "import duckdb\n",
-    "import ehrapy as ep\n",
     "import ehrdata as ed\n",
-    "import numpy as np\n",
     "import os"
    ]
   },
@@ -61,7 +59,7 @@
     "        interval_length_unit=\"day\",\n",
     "        num_intervals=\"max_observation_duration\",\n",
     "        concept_ids=\"all\",\n",
-    "        aggregation_strategy=\"last\"\n",
+    "        aggregation_strategy=\"last\",\n",
     "    )\n",
     "    return edata"
    ]
@@ -444,7 +442,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "tables = [ \"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\", \"device_exposure\", \"drug_exposure\", \"note\"]"
+    "tables = [\"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\", \"device_exposure\", \"drug_exposure\", \"note\"]"
    ]
   },
   {
@@ -463,10 +461,13 @@
    ],
    "source": [
     "for table in tables:\n",
-    "    table_ext = table +'.csv'\n",
-    "    path = os.path.join('/Users/shrey.parikh/Desktop/EHR/ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv', table_ext)\n",
+    "    table_ext = table + \".csv\"\n",
+    "    path = os.path.join(\n",
+    "        \"/Users/shrey.parikh/Desktop/EHR/ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv\",\n",
+    "        table_ext,\n",
+    "    )\n",
     "    temp = pd.read_csv(path)\n",
-    "    if temp.columns.str.contains('start_date').any():\n",
+    "    if temp.columns.str.contains(\"start_date\").any():\n",
     "        print(table)"
    ]
   },
@@ -478,7 +479,7 @@
    "source": [
     "# removing drug_exposure and device_exposure because they have start/end date\n",
     "# note is empty\n",
-    "tables = [ \"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\"]"
+    "tables = [\"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\"]"
    ]
   },
   {
@@ -513,7 +514,7 @@
     "for table in tables:\n",
     "    print(f\"Processing table: {table}\")\n",
     "    try:\n",
-    "        edata_temp = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [table])        \n",
+    "        edata_temp = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [table])\n",
     "        print(f\"Success: {table} processed successfully.\")\n",
     "    except Exception as e:\n",
     "        print(f\"Error processing table: {table}. Error: {str(e)}\")"
diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index d6edf8a..b279482 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -8,8 +8,10 @@
 import duckdb
 import numpy as np
 import pandas as pd
+
 from ehrdata import EHRData
 
+
 def _check_sanity_of_folder(folder_path: str | Path):
     pass
 
@@ -54,14 +56,11 @@ def setup_obs(
 
     return EHRData(obs=obs)
 
+
 def setup_variables(
     backend_handle: Literal[str, duckdb, Path],
     edata,
-    tables: Sequence[
-        Literal[
-            "measurement", "observation", "procedure_occurrence", "specimen",  "note"
-        ] 
-    ],
+    tables: Sequence[Literal["measurement", "observation", "procedure_occurrence", "specimen", "note"]],
     start_time: Literal["observation_period_start"] | pd.Timestamp | str,
     interval_length_number: int,
     interval_length_unit: str,
@@ -102,7 +101,10 @@ def setup_variables(
     table_info = {
         "measurement": {"extract_func": extract_measurement, "concept_id_col": "measurement_concept_id"},
         "observation": {"extract_func": extract_observation, "concept_id_col": "observation_concept_id"},
-        "procedure_occurrence": {"extract_func": extract_procedure_occurrence, "concept_id_col": "procedure_concept_id"},
+        "procedure_occurrence": {
+            "extract_func": extract_procedure_occurrence,
+            "concept_id_col": "procedure_concept_id",
+        },
         "specimen": {"extract_func": extract_specimen, "concept_id_col": "specimen_concept_id"},
         # "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"},
         # "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"},
@@ -138,7 +140,7 @@ def setup_variables(
             aggregation_strategy=aggregation_strategy,
         )
 
-        # Append 
+        # Append
         concept_ids_present_list.append(concept_ids_present)
         time_interval_tables.append(time_interval_table)
 
@@ -155,8 +157,10 @@ def setup_variables(
 
     return edata
 
+
 # DEVICE EXPOSURE and DRUG EXPOSURE NEEDS TO BE IMPLEMENTED BECAUSE THEY CONTAIN START DATE
 
+
 def load(
     backend_handle: Literal[str, duckdb, Path],
     # folder_path: str,
@@ -188,12 +192,15 @@ def extract_observation_period(duckdb_instance):
 
 def extract_person_observation_period(duckdb_instance):
     """Extract observation table of an OMOP CDM Database."""
-    return normalize_column_names(duckdb_instance.sql(
-        "SELECT * \
+    return normalize_column_names(
+        duckdb_instance.sql(
+            "SELECT * \
         FROM person \
         LEFT JOIN observation_period USING(person_id) \
         "
-    ).df())
+        ).df()
+    )
+
 
 def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str):
     """
@@ -259,36 +266,40 @@ def extract_measurement(duckdb_instance):
         table_name="measurement",
         concept_id_col="measurement_concept_id",
         value_col="value_as_number",
-        timestamp_col="measurement_datetime"
+        timestamp_col="measurement_datetime",
     )
 
+
 def extract_observation(duckdb_instance):
     return extract_table(
         duckdb_instance,
         table_name="observation",
         concept_id_col="observation_concept_id",
         value_col="value_as_number",
-        timestamp_col="observation_datetime"
+        timestamp_col="observation_datetime",
     )
 
+
 def extract_procedure_occurrence(duckdb_instance):
     return extract_table(
         duckdb_instance,
         table_name="procedure_occurrence",
         concept_id_col="procedure_concept_id",
         value_col="procedure_type_concept_id",  # Assuming `procedure_type_concept_id` is a suitable value field
-        timestamp_col="procedure_datetime"
+        timestamp_col="procedure_datetime",
     )
 
+
 def extract_specimen(duckdb_instance):
     return extract_table(
         duckdb_instance,
         table_name="specimen",
         concept_id_col="specimen_concept_id",
         value_col="unit_concept_id",  # Assuming `unit_concept_id` is a suitable value field
-        timestamp_col="specimen_datetime"
+        timestamp_col="specimen_datetime",
     )
 
+
 def extract_device_exposure(duckdb_instance):
     # return extract_table(
     #     duckdb_instance,
@@ -300,6 +311,7 @@ def extract_device_exposure(duckdb_instance):
     # NEEDS IMPLEMENTATION
     return None
 
+
 def extract_drug_exposure(duckdb_instance):
     # return extract_table(
     #     duckdb_instance,
@@ -310,15 +322,18 @@ def extract_drug_exposure(duckdb_instance):
     # )
     # NEEDS IMPLEMENTATION
     return None
+
+
 def extract_note(duckdb_instance):
     return extract_table(
         duckdb_instance,
         table_name="note",
         concept_id_col="note_type_concept_id",
         value_col="note_class_concept_id",  # Assuming `note_class_concept_id` as value
-        timestamp_col="note_datetime"
+        timestamp_col="note_datetime",
     )
 
+
 def _get_interval_table_from_awkward_array(
     # self,#person_feature_measurement: ak.Array,
     person_ts: ak.Array,
@@ -411,7 +426,7 @@ def get_time_interval_table(
 
         # Calculate the duration of observation periods
         num_intervals = np.max(
-            observation_period_df["observation_period_end_date"] 
+            observation_period_df["observation_period_end_date"]
             - observation_period_df["observation_period_start_date"]
         ) / pd.to_timedelta(interval_length_number, interval_length_unit)
         num_intervals = int(np.ceil(num_intervals))
@@ -448,8 +463,8 @@ def get_time_interval_table(
 
     return np.array(tables).transpose(0, 2, 1)  # TODO: store in self, np
 
+
 def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
     """Normalize all column names to lowercase."""
     df.columns = map(str.lower, df.columns)  # Convert all column names to lowercase
     return df
-

From 291aeba42102bf3151df3ebe7bbe21f27e9c629a Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Wed, 16 Oct 2024 12:19:54 +0200
Subject: [PATCH 05/15] update commit

---
 src/ehrdata/io/omop/__init__.py |  23 ++---
 src/ehrdata/io/omop/omop.py     | 153 +++++++++++++++++---------------
 2 files changed, 91 insertions(+), 85 deletions(-)

diff --git a/src/ehrdata/io/omop/__init__.py b/src/ehrdata/io/omop/__init__.py
index eb3908b..6f3fda4 100644
--- a/src/ehrdata/io/omop/__init__.py
+++ b/src/ehrdata/io/omop/__init__.py
@@ -1,15 +1,16 @@
 from .omop import (
-    extract_condition_occurrence,
-    extract_device_exposure,
-    extract_drug_exposure,
-    extract_measurement,
-    extract_note,
-    extract_observation,
-    extract_observation_period,
-    extract_person,
-    extract_person_observation_period,
-    extract_procedure_occurrence,
-    extract_specimen,
+    # extract_condition_occurrence,
+    # extract_device_exposure,
+    # extract_drug_exposure,
+    # extract_measurement,
+    # extract_note,
+    # extract_observation,
+    # extract_observation_period,
+    # extract_person,
+    # extract_person_observation_period,
+    # extract_procedure_occurrence,
+    # extract_specimen,
+    get_table,
     get_time_interval_table,
     load,
     setup_obs,
diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index b279482..ccd81b8 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -9,8 +9,6 @@
 import numpy as np
 import pandas as pd
 
-from ehrdata import EHRData
-
 
 def _check_sanity_of_folder(folder_path: str | Path):
     pass
@@ -20,15 +18,22 @@ def _check_sanity_of_database(backend_handle: duckdb.DuckDB):
     pass
 
 
+VALID_OBSERVATION_TABLES_SINGLE = ["person", "observation_period", "visit_occurrence"]
+VALID_OBSERVATION_TABLES_JOIN = ["person_observation_period", "person_visit_occurrence"]
+VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen", "note", "death"]
+
+
 def setup_obs(
     backend_handle: Literal[str, duckdb, Path],
-    observation_table: Literal["person", "observation_period", "person_observation_period", "condition_occurrence"],
+    observation_table: Literal[
+        "person", "observation_period", "person_observation_period", "visit_occurrence", "person_visit_occurrence"
+    ],
 ):
     """Setup the observation table.
 
-    This function sets up the observation table for the EHRData project.
-    For this, a table from the OMOP CDM which represents to observed unit should be selected.
-    A unit can be a person, an observation period, the join of these two tables, or a condition occurrence.
+    This function sets up the observation table for the EHRData object.
+    For this, a table from the OMOP CDM which represents the "observed unit" via its id should be selected.
+    A unit can be a person, an observation period, a visit occurrence, or a left join on person_id of a person with one of the other tables.
 
     Parameters
     ----------
@@ -43,23 +48,26 @@ def setup_obs(
     """
     from ehrdata import EHRData
 
-    if observation_table == "person":
-        obs = extract_person(backend_handle)
-    elif observation_table == "observation_period":
-        obs = extract_observation_period(backend_handle)
-    elif observation_table == "person_observation_period":
-        obs = extract_person_observation_period(backend_handle)
-    elif observation_table == "condition_occurrence":
-        obs = extract_condition_occurrence(backend_handle)
-    else:
-        raise ValueError("observation_table must be either 'person', 'observation_period', or 'condition_occurrence'.")
+    if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN:
+        raise ValueError(
+            "observation_table must be either 'person', 'observation_period', 'person_observation_period', 'visit_occurrence', or 'person_visit_occurrence'."
+        )
+
+    if observation_table in VALID_OBSERVATION_TABLES_SINGLE:
+        obs = get_table(backend_handle, observation_table)
+
+    elif observation_table in VALID_OBSERVATION_TABLES_JOIN:
+        if observation_table == "person_observation_period":
+            obs = _get_table_left_join(backend_handle, "person", "observation_period")
+        elif observation_table == "person_visit_occurrence":
+            obs = _get_table_left_join(backend_handle, "person", "visit_occurrence")
 
     return EHRData(obs=obs)
 
 
 def setup_variables(
-    backend_handle: Literal[str, duckdb, Path],
     edata,
+    backend_handle: Literal[str, duckdb, Path],
     tables: Sequence[Literal["measurement", "observation", "procedure_occurrence", "specimen", "note"]],
     start_time: Literal["observation_period_start"] | pd.Timestamp | str,
     interval_length_number: int,
@@ -70,7 +78,7 @@ def setup_variables(
 ):
     """Setup the variables.
 
-    This function sets up the variables for the EHRData project.
+    This function sets up the variables for the EHRData object.
 
     Parameters
     ----------
@@ -95,42 +103,29 @@ def setup_variables(
 
     Returns
     -------
-    An EHRData object with populated .var field.
+    An EHRData object with populated .r and .var field.
     """
-    # Mapping of table names to extraction functions and concept ID column names
-    table_info = {
-        "measurement": {"extract_func": extract_measurement, "concept_id_col": "measurement_concept_id"},
-        "observation": {"extract_func": extract_observation, "concept_id_col": "observation_concept_id"},
-        "procedure_occurrence": {
-            "extract_func": extract_procedure_occurrence,
-            "concept_id_col": "procedure_concept_id",
-        },
-        "specimen": {"extract_func": extract_specimen, "concept_id_col": "specimen_concept_id"},
-        # "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"},
-        # "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"},
-        "note": {"extract_func": extract_note, "concept_id_col": "note_type_concept_id"},
-    }
+    from ehrdata import EHRData
 
     concept_ids_present_list = []
     time_interval_tables = []
 
     for table in tables:
-        if table not in table_info:
-            raise ValueError(
-                "tables must be a sequence of 'measurement', 'observation', 'procedure_occurrence', 'specimen', or 'note'."
-            )
+        if table not in VALID_VARIABLE_TABLES:
+            raise ValueError(f"tables must be a sequence of from [{VALID_VARIABLE_TABLES}].")
 
-        # Get extract function and concept_id column for the table
-        extract_func = table_info[table]["extract_func"]
-        concept_id_col = table_info[table]["concept_id_col"]
-        concept_ids_present_df = normalize_column_names(backend_handle.sql(f"SELECT * FROM {table}").df())
-        concept_ids_present = concept_ids_present_df[concept_id_col].unique()
-        extracted_awkward = extract_func(backend_handle)
+        id_column = f"{table}_type_concept_id" if table in ["note", "death"] else f"{table}_concept_id"
+
+        concept_ids_present = _lowercase_column_names(
+            backend_handle.sql(f"SELECT DISTINCT {id_column} FROM {table}").df()
+        )
+
+        personxfeature_pairs_of_value_timestamp = _extract_personxfeature_pairs_of_value_timestamp(backend_handle)
 
         # Create the time interval table
         time_interval_table = get_time_interval_table(
             backend_handle,
-            extracted_awkward,
+            personxfeature_pairs_of_value_timestamp,
             edata.obs,
             start_time="observation_period_start",
             interval_length_number=interval_length_number,
@@ -166,10 +161,6 @@ def load(
     # folder_path: str,
     # delimiter: str = ",",
     # make_filename_lowercase: bool = True,
-    # use_dask: bool = False,
-    # level: Literal["stay_level", "patient_level"] = "stay_level",
-    # load_tables: str | list[str] | tuple[str] | Literal["auto"] | None = None,
-    # remove_empty_column: bool = True,
 ) -> None:
     """Initialize a connection to the OMOP CDM Database."""
     if isinstance(backend_handle, str) or isinstance(backend_handle, Path):
@@ -180,29 +171,26 @@ def load(
         raise NotImplementedError(f"Backend {backend_handle} not supported. Choose a valid backend.")
 
 
-def extract_person(duckdb_instance):
-    """Extract person table of an OMOP CDM Database."""
-    return normalize_column_names(duckdb_instance.sql("SELECT * FROM person").df())
-
+def get_table(duckdb_instance, table_name: str) -> pd.DataFrame:
+    """Extract a table of an OMOP CDM Database."""
+    return _lowercase_column_names(duckdb_instance.sql(f"SELECT * FROM {table_name}").df())
 
-def extract_observation_period(duckdb_instance):
-    """Extract person table of an OMOP CDM Database."""
-    return normalize_column_names(duckdb_instance.sql("SELECT * FROM observation_period").df())
 
-
-def extract_person_observation_period(duckdb_instance):
-    """Extract observation table of an OMOP CDM Database."""
-    return normalize_column_names(
+def _get_table_left_join(duckdb_instance, table1: str, table2: str) -> pd.DataFrame:
+    """Extract a table of an OMOP CDM Database."""
+    return _lowercase_column_names(
         duckdb_instance.sql(
-            "SELECT * \
-        FROM person \
-        LEFT JOIN observation_period USING(person_id) \
+            f"SELECT * \
+        FROM {table1} \
+        LEFT JOIN {table2} USING(person_id) \
         "
         ).df()
     )
 
 
-def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str):
+def _extract_personxfeature_pairs_of_value_timestamp(
+    duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str
+):
     """
     Generalized extraction function to extract data from an OMOP CDM table.
 
@@ -226,10 +214,10 @@ def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_c
     """
     # Load the specified table
     table_df = duckdb_instance.sql(f"SELECT * FROM {table_name}").df()
-    table_df = normalize_column_names(table_df)
+    table_df = _lowercase_column_names(table_df)
 
     # Load the person table to get unique person IDs
-    person_id_df = normalize_column_names(duckdb_instance.sql("SELECT * FROM person").df())
+    person_id_df = _lowercase_column_names(duckdb_instance.sql("SELECT * FROM person").df())
     person_ids = person_id_df["person_id"].unique()
 
     # Get unique features (concept IDs) for the table
@@ -261,7 +249,8 @@ def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_c
 
 
 def extract_measurement(duckdb_instance):
-    return extract_table(
+    """Extract a table of an OMOP CDM Database."""
+    return get_table(
         duckdb_instance,
         table_name="measurement",
         concept_id_col="measurement_concept_id",
@@ -271,7 +260,8 @@ def extract_measurement(duckdb_instance):
 
 
 def extract_observation(duckdb_instance):
-    return extract_table(
+    """Extract a table of an OMOP CDM Database."""
+    return get_table(
         duckdb_instance,
         table_name="observation",
         concept_id_col="observation_concept_id",
@@ -281,7 +271,8 @@ def extract_observation(duckdb_instance):
 
 
 def extract_procedure_occurrence(duckdb_instance):
-    return extract_table(
+    """Extract a table of an OMOP CDM Database."""
+    return get_table(
         duckdb_instance,
         table_name="procedure_occurrence",
         concept_id_col="procedure_concept_id",
@@ -291,7 +282,8 @@ def extract_procedure_occurrence(duckdb_instance):
 
 
 def extract_specimen(duckdb_instance):
-    return extract_table(
+    """Extract a table of an OMOP CDM Database."""
+    return get_table(
         duckdb_instance,
         table_name="specimen",
         concept_id_col="specimen_concept_id",
@@ -301,7 +293,8 @@ def extract_specimen(duckdb_instance):
 
 
 def extract_device_exposure(duckdb_instance):
-    # return extract_table(
+    """Extract a table of an OMOP CDM Database."""
+    # return get_table(
     #     duckdb_instance,
     #     table_name="device_exposure",
     #     concept_id_col="device_concept_id",
@@ -313,7 +306,8 @@ def extract_device_exposure(duckdb_instance):
 
 
 def extract_drug_exposure(duckdb_instance):
-    # return extract_table(
+    """Extract a table of an OMOP CDM Database."""
+    # return get_table(
     #     duckdb_instance,
     #     table_name="drug_exposure",
     #     concept_id_col="drug_concept_id",
@@ -325,7 +319,8 @@ def extract_drug_exposure(duckdb_instance):
 
 
 def extract_note(duckdb_instance):
-    return extract_table(
+    """Extract a table of an OMOP CDM Database."""
+    return get_table(
         duckdb_instance,
         table_name="note",
         concept_id_col="note_type_concept_id",
@@ -386,7 +381,7 @@ def get_time_interval_table(
     concept_ids: Literal["all"] | Sequence = "all",
     aggregation_strategy: str = "first",  # what to do if multiple obs. in 1 interval. first, last, mean, median, most_frequent for categories
     # strategy="locf",
-) -> np.array:
+) -> np.ndarray:
     """Extract measurement table of an OMOP CDM Database.
 
     Parameters
@@ -422,7 +417,7 @@ def get_time_interval_table(
 
     if num_intervals == "max_observation_duration":
         observation_period_df = con.execute("SELECT * from observation_period").df()
-        observation_period_df = normalize_column_names(observation_period_df)
+        observation_period_df = _lowercase_column_names(observation_period_df)
 
         # Calculate the duration of observation periods
         num_intervals = np.max(
@@ -464,7 +459,17 @@ def get_time_interval_table(
     return np.array(tables).transpose(0, 2, 1)  # TODO: store in self, np
 
 
-def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame:
+def _lowercase_column_names(df: pd.DataFrame) -> pd.DataFrame:
     """Normalize all column names to lowercase."""
     df.columns = map(str.lower, df.columns)  # Convert all column names to lowercase
     return df
+
+
+def extract_condition_occurrence():
+    """Extract a table of an OMOP CDM Database."""
+    pass
+
+
+def extract_observation_period():
+    """Extract a table of an OMOP CDM Database."""
+    pass

From b2eeded6cbfcfc5ff7ce3306bb0aa4000d0e7f7c Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Wed, 16 Oct 2024 14:46:10 +0200
Subject: [PATCH 06/15] clean up nb

---
 docs/notebooks/test_more_datasets_omop.ipynb | 552 -------------------
 1 file changed, 552 deletions(-)
 delete mode 100644 docs/notebooks/test_more_datasets_omop.ipynb

diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb
deleted file mode 100644
index 6bce569..0000000
--- a/docs/notebooks/test_more_datasets_omop.ipynb
+++ /dev/null
@@ -1,552 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "%load_ext autoreload\n",
-    "%autoreload 2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from ehrdata import EHRData\n",
-    "\n",
-    "EHRData().r\n",
-    "import duckdb\n",
-    "import ehrdata as ed\n",
-    "import os"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "from datasets import gibleed_omop, mimic_iv_omop, synthea27nj_omop"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "define the function"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 82,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "def load_and_check(dummy_func, start_time, tables):\n",
-    "    con = duckdb.connect()\n",
-    "    dummy_func(backend_handle=con)\n",
-    "    edata = ed.io.omop.setup_obs(con, \"person_observation_period\")\n",
-    "    edata = ed.io.omop.setup_variables(\n",
-    "        backend_handle=con,\n",
-    "        edata=edata,\n",
-    "        tables=tables,\n",
-    "        start_time=start_time,\n",
-    "        interval_length_number=28,\n",
-    "        interval_length_unit=\"day\",\n",
-    "        num_intervals=\"max_observation_duration\",\n",
-    "        concept_ids=\"all\",\n",
-    "        aggregation_strategy=\"last\",\n",
-    "    )\n",
-    "    return edata"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Load the mimic dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 98,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
-      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n"
-     ]
-    }
-   ],
-   "source": [
-    "edata_mimic = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [\"measurement\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 99,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "EHRData object with n_obs x n_var = 100 x 450, and a timeseries of 320 steps.\n",
-       "             shape of .X: (100, 450) \n",
-       "             shape of .r: ((100, 450, 320)) "
-      ]
-     },
-     "execution_count": 99,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_mimic"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 100,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKUAAAAUCAYAAADsvf0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGXklEQVR4nO2ae6xdRRXGf30kAgVLUqSN+ACR2xAkFMWADypXsMpDA4IhIYVSIpEAXpBWDdX69cNUagKF8hAJNS0qCZBUIPJ+NUB5iGIJGkBebREDVKlUiq2l5fLHmt3uO+zTe+45u/fU5n7JyWSvmTUz35w1s/aatYf19vYyhCFsSxje6QkMYQg5RlYJbV8LHAnsJentwZ3SELZ32P4M8CfgdEnz8/phufu2/VngD8B0SXNL8hOALwETgAOAXYDrJE3uZwIfAS4AvgaMAV4FbgYs6d916dQJ25OB36THyoWzvRz4eIMuXpc0rkHfg8rN9s+Bg4AuYDdgLbAijXmFpDey9mOA44Cjgf2BPYD1wF+ABcACSe82GKtpbrZvAg4B9pG0plxX5b5nA/8BrsrkPwbOJozyH5Ur8P5J7g08AUwFHgcuAV4CzgEeTQvQtk6dsP1R4ApgTX9tgdWAK34XNei7E9y+B4wC7gHmAdcBG4BZwFOJbxnfAq4BDiYOp0uBRcCngPnAjbaH5YO0wO1CYBzQk/fVx33b7gKOAOZLWltB7hXgBeLEXFy5BH3xC2B3oEfS5aVx5qb+ZgNn1KBTC9JiLwDeAH4HTO9H5U1JswYwRCe4fVDSulxoezYwAzgfOLNU9RzwDeC28oloewZhbMcD3yQMtYwBcZP0uO1nge/YnlMeKz8pTwOGATfkJCQtlvS8pKbC9bRzJgHLgSvz7oC3gZNtj2pHp2b0AF8mdnut79Kd4lZlkAk3pnKfrP39kn6fu2hJrwG/TI+Hleva4HY98DHgK2VhbpRHABuBxxoQGQi6U3l3BcG3gIeBnYj3inZ0aoHtfYE5wDxJDzap9gHbk23PsH2O7W7bIxq07Ri3Bvh6Kp8agM47qdyQyVvl9nAq+xjlJvedrHgC8ExNEff4VD7XoP55Ynd1Afe1odM2bI8kApuXCZfWLMaxOSAqsMz2VEkPZPKOcCtgezqwMzCaCHy+SBjknCb1RwKnpMc7s+pWuf0xlRPLjcsn5R7ACCJiqgOjU7m6QX0h37VNnTrwE+BA4NSKd+lGWAAcThjmKCJSvRrYE7jD9gFZ+05xKzCdcKXnEgZ5JzBJ0j+b1J9DBDu3S7orq2uJm6TVwDrChW9COdApoqOtfuWyLcH2wcTpeLGkR5vVk+RM9FfgDNtrgGlEdHtcXfNsF8UVle2xwOcJI1tq+xhJf96Sru0egtOzwMk1T20VMLYsKJ+UxQmxQ02DFbtjdIP6Qv5mmzotI7mkXxNuZ2YdfbI5GJiYyQeVWyNIel3STYQ7HUPwbwjbZxNXSU8D3ZJWVTRrh9uObLY9oK9RrkxlXXdlf0tlV4P6Iuorv4e0otMOdk5j7Quss91b/AhXB3BNkl3aZJ+FO8wjzcHmtkVIWkEY2n62d6tqY/tc4HLCC3SnCLwKLXGzPZxw6SvL8rJRvkos6HjqQXGPOSkNXp7MLsAXgP/SN9JvRacd/A/4VYPf0tRmSXpu1rUXEeZLmXywuTWDD6dyY15h+4fEBfiThEGuzNuU0Cq38cQV5JNl4aZ3Skm9th8Ejrf9SUkv9ENoi5D0ou27CTdxFrHjNs2VOEmuLkf6rejYXghMAaZKWjjAOa4Fvl1VZ3sWEfxcm6cZ0/XRy/kthe09iWwQwG+zsQaVW9LvIlKeqzP5cOCnxGX3IxUpwJlEuvAJIhiqctltcUsoNnCfREz+QcYi4sb+q0TmpjzRY4Fj02OR1/1cWjiAf0nKMyBnAo8Al9k+HHiGSF91E0f5jyo4DlSn2Jn53dnWxInAtLSJVwBvAXsT+eIdgNu
pTjUONrejgAttLwGWEZmqsURG7hPAa8DpZQXbUwiD3Ag8BPTYeUzH8opN0sp/PSmNc0tZmF+eLyL8+ym8HxOIXTuFMFoSsUJ2Qq4g6UXiTmxhmuA04s+bBxySfwzQos7+hFHcVjHnrYXFwK1pXicB5xF/9BJiLY6RtD5X6gC3e4lXjw8RqcHvE4fOKuIE20/S05nOXqkcQVwfqeJ3arvcbI8mDrlbJf29XFf1ldD5wM+AT0tayjYM27sSu/9iST/o8HRqxfbMDcD2d4HLgEMlLSnXVX0ldAmR2bhgEObWLg4lUl9z+2v4f4jtlpvtHYkPQRblBgkVJ2VSmki8C1w09JHvEOpGChRPBBZKWp7XvwdACvWbXD4BcQAAAABJRU5ErkJggg==",
-      "text/latex": [
-       "$\\displaystyle \\left( 100, \\  450, \\  320\\right)$"
-      ],
-      "text/plain": [
-       "(100, 450, 320)"
-      ]
-     },
-     "execution_count": 100,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_mimic.r.shape"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Load the gibleed dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 9,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Path to data exists, load tables from there: ehrapy_data/GIBleed_dataset\n",
-      "missing tables:  [['cohort_definition']]\n"
-     ]
-    }
-   ],
-   "source": [
-    "edata_gibleed = load_and_check(gibleed_omop, \"observation_period\", [\"measurement\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 102,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "EHRData object with n_obs x n_var = 2694 x 55, and a timeseries of 1441 steps.\n",
-       "             shape of .X: (2694, 55) \n",
-       "             shape of .r: ((2694, 55, 1441)) "
-      ]
-     },
-     "execution_count": 102,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_gibleed"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 103,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAALAAAAAUCAYAAAAtOremAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGpUlEQVR4nO2aeaxdUxTGfx1UmxalMUXUPNRYRLWkLTqIGKKGVKQUMSWookUbrH4S1FStEEMrbZUg1NSBEho1F6kg5uiglNfilddBaZ8/1j7teeed+9695953nsb7kpt9z9p77bP22muvs/bau1VtbS0taMHmitbNLUALWlAO2qYRJU0FTgT2MLNV+YrUghbUh6QjgI+Ai81sUkRvlQwhJB0JfACMMLNxgdYFGAScBBwM7AKsAz4DJgOTzWxDAy/vB1wB9AK2BX4NvBPMbHaibSvgovA7EGgFfAlMAh5p6D2BfwgwLTzWGWwlIWkRsFuB6l/MbKdK8JQLSWcCfYHuwKHAVsATZjakhD5K1mljPFnkkvQ80BPYx8xqIN0D3wr8ATwYo50VnpcBc4ElwI7A6bhhnSjpLDOrF1BLuhMYCSwFXgJWANsDRwDHArMTLI8D5wBVwJPAamBAeP/RwHkNDHBX4H6gBuhUqF0FsRIYn0KvqTBPObgRN5AafA72L4U5i06L5Mki1+24cx0G3AYJA5a0L9AfmGRma2JV3wCnArPiHlDSaGA+cAZuzNMT/V2MG+9U4BIzW5eo3yLxPAg33oVADzNbEejtQt/nSnrBzJ5Ljix47sm4d38OGNG4PspGtZmNyYGnHFyNG8h3uMebWyxjFp2WwFOyXGY2X9JXwKWSxprZhqQHvhD/ZD+dYHyjQIc/S3oI99rHEjNgSVsG+hJSjDfw/50gDQrlPZHxhnbrJN0EnIyHIvUMGF+Vxwc5jk+T9/8IM9toGJJKZc+i06J4ypDrKWAM/lWekzTg/sB64P0SOoyM8J8EfQAeKowHNkg6CTgIWAvMN7P3UvqKYsDvU+oiWm9J7eILQlI3YCweU8+TlJcBbxliva7AKuBTYJ6Zra8wT+7IotOc5uGdUA4A5mxMo0nqiAfUXxabeZDUlk0x6SuJ6iNDuRZYAMzEBzceeFfSm5K2T/BEXnePlNftGcq2sf+RDNNwTz+6GLkriJ3Cu2/Fx/UG8K2kvhXmyRVZdJrjPHwYyj5QNw+8C9AG36gVi7G4V51tZnMSdTuEciRQC/TGd5qHAK8GAZ5J8MwK5TWStouIIVaOf2e2jf2/GTgMOD8Rtzc1JgP9cIPsiGdnHgZ2B16WdGiFeJoDWXSayzyY2UrcKXaFupu4LqH8vZiOJA0DrgW+As5NaRItjn+AU81sUXj+LGzWvgb6SuoVCyeeCn2dAHwh6cUgbH9gZ3x1dwU2BBmOwlf7PQVCkiaDmSUDt8+ByyTV4HoZw6aYPjNP3sii02aYh9/wLFgdDxytmvaNcUu6ApgAfAEcZ2a/pTSrDuWCmPECYGargchj94jR1wOnADcAy4Gh4fctnkL7MzStCp+sx/AMyU2NyZwjHgplnybmqTiy6LSZ5qEDwV7jHrgqlF3qNY9B0nDgXtx79DOzqgJNvw5ldYH6yNN3iBNDZuKO8Iu/tz2wD7DCzBZK6gzsG6rXFtjJTpQ0Ed9UDC8gR6WxPJQdm5inKdCJ0nWahSczJLUGOuOp1joGvAxX5H4NMF+Px72fAAPiqa4UvI7HvgdIap1ygnZQKBcWKfvZQDv8cAPgL+DRAm0Px+Oxt/GFlGd40TOUaZmUSvI0BbLoNO952A9P9X4CMQM2s1pJ84AzJO1tZt/FuUIe9hbgY2BggbBhI8xssaQZ+AHIVbjXjvoaiMe51SSyF5K2NrM/ErTuwF241x4b+l+DHzfXg6QxuOKmph17SpqChyYXmNmUhsZRoP9uwJJktkbS7vgJFPiJYrk8ZclZKrLotJx5yIhosc+F+kf
J0/FTtRPwE5JIkKG48a4H3gKGpXwqFqUo+XJ8AONCHngBniI7LfR1UdhVxvGapDV4iPIn0A2/g7EGOMXMfip+rAUR32BmwWDg2rDgF+Ny7oXL2R4/Hr+7Ajzlyomk03B9w6Y8e6+wOMBDsjxOLeugDLkG4rbzIqQbcBWe230gRo/ysm2A4QVkehOYEieY2dJwi+hm3BP3we9ZzABuN7P5Kf08i4cLQ/D4+EfgkdB+aYF3l4qDcQOa1VjDApiLf8oOA47BY9dq/FM5DZiWci8kC0+5coLn9ocmaHuyKZe+mHyO3ZPoTolySdoGN/qZZvYDpN9GG4VflDjczBZUWurmRtj8/YqnfK5rZnEKYnORM09IuhK4D+htZm9D+oX2e/F86y05ypYneuPH3+OaW5BGsLnImQskdQBGAdMj44UUDxwa9wGOA+5uudDegv8CwiZ4MDAlfq7wLxytAalq0cnyAAAAAElFTkSuQmCC",
-      "text/latex": [
-       "$\\displaystyle \\left( 2694, \\  55, \\  1441\\right)$"
-      ],
-      "text/plain": [
-       "(2694, 55, 1441)"
-      ]
-     },
-     "execution_count": 103,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_gibleed.r.shape"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Load the Synthea27NJ dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 101,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Path to data exists, load tables from there: ehrapy_data/Synthea27Nj\n",
-      "missing tables:  []\n"
-     ]
-    }
-   ],
-   "source": [
-    "edata_synthea27nj = load_and_check(synthea27nj_omop, \"observation_period\", [\"measurement\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 113,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "EHRData object with n_obs x n_var = 28 x 132, and a timeseries of 866 steps.\n",
-       "             shape of .X: (28, 132) \n",
-       "             shape of .r: ((28, 132, 866)) "
-      ]
-     },
-     "execution_count": 113,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_synthea27nj"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 104,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAJcAAAAUCAYAAACAu68PAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGVElEQVR4nO3af+zWVRUH8BdqM6LSBhmLZZMUp6khpukKxIk6tczMVn9Utha6laEmmrrqdNwMaIk/qtUWm1ixlek0EYY/kmVoSSlOm1o208SJv638NRPoj3uf9vDheeD7fT78MMZ7e3b2ufee++P9nM+55577GbF27VrbsR2bAzts7Qlsx7aLnXoVZuaVOBZ7RMRLW3ZK2/H/hsw8CH/C9IiY1ykf0dwWM/Ng3ImZETG3lo3GJ3A89sc4vIb7cAWuiIg1fQY+HmdgX4zGE7gLcyPi95twjZ3xTsbhmIgP4G1YEBGf3YDOHHwQEzAGr+BRXIcfRMSzjfYD89EWg/KZmUfidByGd+DZOt/LImJxW53MvBaHYq+IeJHe2+JF+Bd+1FX2KfwEH1IM71Jcg/0wD1dl5ogek5uDGzAJS3AZ7sbHcXtm9v3DW+AbCiET8fgQdc7CKNxc57gAr+PbuDcz39NoPxAfbTEon5n5XdyivEDX42IswjsxdRPpzMJYzOgUrLMtZuYETMO8iHilq+qvOAGLut/IzLwAy/FJnKQQ3Kkbi5l4EgdExFNddUfgVlyIn/daXAuchZX4m+LBlg5B5+0R8WqzMDMvwgU4H1/uqho2H20xKJ+ZOR3n4EqcGhGvNerf1GOsYetExPLMfBCnZebsiFjT9FxfxAj8sqF4a0QsbLr6iFiFH9fHqY2+3qt4xju7iah6S/Fv5S3YpIiIpRHxUEQM+Rjcy7Aqrqpyr0b7Qfhoi2HzmZk7KzvRP/Qwkqr7n7Y6XfgFdsdRrB/QT8Nq/KGPci90Bnq9Uf6QEocckpljIuKZrgVMUWKh64YxztbAx6q8dxg6/fhoi0H4PEoxuEuxpsZr++FVLO8Tow2i08HtXX3c+D/jysxRSpzywFBPiJm5Ez5fH5d010XEc5n5dczF/Zl5nRIQvk/ZUm7GaUMZZ0shM2firdhFiTU+ohjW7CHq9+WjLQbk8+AqX8UKxUi653sbTo6Ip1vqdPDHKqewbkA/Djsqp4+hYnYdfHFE3NisjIhLldhjJ0zHeUow/BjmN937GwAzEThTMawlOLoPkb2wQT7aYgA+d6vyHKzFZMXDHYCbFCP41SbQ6czvn4pR7s66xjW6yueHstDMnIGz8SA+16fNubga85U3bBQOwsNYUE8kbxhExNiIGKGcek7CeKzIzEkb0x0KH20xAJ+d//d1nBARyyLixYi4T0mlrMThmXlYS51uPKekc9Yxrs7p8M1DWOTpyjH4fhwREc/1aDMVc3B9RHwtIh6OiJcj4u46ycdxdmaO39h4WxoR8WREXIujlZfupxtqPxQ+2mJAPl+ockVEPNLdX0S8jI53PaSlTjdGqrbUbVwdlzp6veZdyMwz8X38WSFyVZ+mH61yvVRAneTyOv6BGxpvayIiHlUM5v2ZOaZXm2Hw0RaD8PmXKl/o02dnlxrZUgdk5g7YVbWlbuN6Ak9j7z6dqgHlJbhHIXJDMdPOVfZLN3TK1zvqvsHw7ipXNyuGyUdbDMLnb5S4ad/6xzfRCdb/3lKng72VVNY9dBlXzQvdhjGZuWdTKzO/qQSsd+HI7qNwH/yuylMzc1yjr2PxYSX4u6NRNz8z12bmFzbS/yZBZk7IzF16lO9Qk6i74Y6IeL5RP1w+2q5t2HxWz7tQCbDPaOgcjWMUD7WkjU4XDq1yKevnua5RssvHKBnuTqenKNnf1XWRMzKz2fEjETG/6/lq5fpgGh6od0+rsI/i4kfgvOa9nXUDymEjM0/EifVxbJWHZWZnbs9ExMwuleMwKzOXKW/js3iXkt0fX+c8vTH
GIHzQbm2D8vkVZaucW3NWK7CHwtFqfKme8trqUGLU1fg1vY3rKSVX88Ou8j2q3FE5pvfCb5VTDIiINZl5XJ3oZ5Sg8y3KaWIxLo+Im3r0s7+SbV7UZ5yNYSJOaZSNrz/KhXS3cd2CPZXUw4FKzPCScsXzszrPZoA+bD4qBl7boHxGxMr61cK3lHzYFOXueCFmRcTyTaFTvf+JuCEiHqP3VxHn4zuYFBErhktCG2TmrornuDgizt2SY29ubMtrg8z8Ki7H5IhYRu+vIi5R7pUu3IJz62Cycn0ydyuMvbmxza4tM0cql/vXdAyLHp6rNp6CI/C97R8LbsfGkJn74NPKLcEjnfL/AjQSP4HwDZy2AAAAAElFTkSuQmCC",
-      "text/latex": [
-       "$\\displaystyle \\left( 28, \\  132, \\  866\\right)$"
-      ],
-      "text/plain": [
-       "(28, 132, 866)"
-      ]
-     },
-     "execution_count": 104,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_synthea27nj.r.shape"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# check by loading the data with observation.csv"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "mimic dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 105,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
-      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n"
-     ]
-    }
-   ],
-   "source": [
-    "edata_mimic_obs = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [\"observation\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 106,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKUAAAAUCAYAAADsvf0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAF1klEQVR4nO2aa4hVVRTHf6NC2mQGikr28DkiJY5l9FQbtKnsgfagL5oZSGI2Gg6FViz/gWlgmo9eJDg1CSWIRWlmD9HUyjLFIk1Txx6ok5qmppQ2fdjn6pk954733nPwTsP84bLvWWuvs9Y6e52991r7FNTU1NCEJjQkNMu3AU1ogo8WUURJbwC3A13M7Ni5NakJjR2Srga+AUab2XyfX+Av35KuAb4Cys1sZoh+HzAQKAb6AK2BhWY2/CwGXAI8C9wGtAX2AO8CMrM/kpKJg1x8k1QFXJ6Gvc/MOiahJy4kPQ/0A4qAdsBxYDfuec4zswNe/7bAMOAOoDfQCfgb+A5YACwws3/T6Mp43CQtAa4DepjZ0TAvavmeCvwJvOLRnwbG4R7ob5FPoK6R3YANwChgPTAL2AmMB74IHkBsmQSQtW8BDgOK+M1IWE8cPA4UAh8Ds4GFwElgCrBZ0qVe//uB14FrcZPTi8Bi4EpgPrBIUoGvJIdxmwZ0BMr8e9VaviUVAYOB+WZ2PMK5X4GfcG/7yshHUBsvA+2BMjObG9IzM7jfVGBMAjJxkYtvAIfMbMo50BMHF5rZCZ8oaSowGZgEjA2xtgF3A0vDM6Kkybhguxe4BxeoYWQ1bma2XtJW4BFJ08O6/JnyYaAAeMd3wsxWmtl2M8soXQ/enFKgCnjJvx1wDBghqTCOTBLI1reGrsfTWScgAywK2h5e/8/M7H1/iTazvcCrweXNYV6McXsbuAy4JUz0g3IwcAr4Mo0j2aAkaFdEOHgEWAucj9tXxJHJJ86TNFzSZEnjJZVIap5vozLEXUG7OQuZf4L2pEfPddzWBm2toDy9fAdRXAxsSSjj7hm029Lwt+PeriLg0xgy+URHoNKj7ZI0ysxW5cOgdJBUDlwAtMElPjfhAnJ6hvItgAeDy+UeO9dx+zpoB4Q7h2fKTkBzXMaUBNoE7eE0/BT9opgy+cICYBAuMAtxmeprQGfgQ0l98mdaJMpxS+kEXEAuB0rN7PcM5afjkp1lZvaRx8tp3MzsMHACt4SfRjjRSWVHiZdcGiPMTB7pe2CMpKPARFx2O+xc25UOqRKVpA7ADbgg2yjpTjP7tj5ZSWU4n7YCIxI27SDQIUwIz5SpbLtlQspSb0ebNPwU/VBMmYaGVDIwoN5eeYKZ7TOzJbjltC3wZn39JY3DlZJ+AErM7GBEtzjj1oozsQfUDsrqoE2qDvhj0Bal4aeyvvA+JBeZhobUcphohSBpmNluXKBdIaldVB9JE4C5uFWgJMjAo5DTuElqhlvSq8P0cFDuwT3QniSDVA2uNFAeNqY1cCPwF7Uz/VxkGhpSGebOvFqRGS4O2lM+Q9KTuAL4JlxAVvt9Qsh13HriSpCbwsTTNwhqZ6uBdpK61+/L2WFmO4AVuI3/ox5buJmkMpzp5yIjqUJSjaSH4tqcKST1iqqVSuoMzAsu30pATyzfJBVJqrOkSmoWFM/bA+sijgCfwe05NwCDzGx/fXpyGbcAqRe41iGC/0HGYlzF/lbcqUPY0KHA0OAyda57vaSK4P9+Myv37jcWWAfMkTQI2II7virBTeVP1XUxa5nUi+XXzjJGDr49AEyUtBp3jnwE6IY7L24JLCPiqDEHPXF9GwJMk7QG2AUcwCUVA4GuwF5gtGfjSNz59Sngc6BM8nM6qsyswqPlMtalgZ73wsSooKzG1aP8ynwxMNKjdQ1+4AanVlCa2Q5J/ThzSD8Et02YTZqPK3KQ6Y0LiqX+vbJAMdn5thK39PTFLU2FuE38GlzdsjLNqU22euL69gnQHVcC6ovbvx3
DBUklMCcicekStM1x5aMorAIqwoRsxy2YwYcCH5jZL2Fe1FdCk4DngKvMbGN6f/MPSRfh3v4XzOyJPJuTKBqzbwCSHgPmAP3NbE2YF/WV0CzgZ1zEN3T0xx19zTxbx/8hGq1vklrhPgRZ7AckRMyUgdAA3F5gRtNHvk1IGpJ64fblFWZW5fP/A5RDnlkqCjNBAAAAAElFTkSuQmCC",
-      "text/latex": [
-       "$\\displaystyle \\left( 100, \\  151, \\  320\\right)$"
-      ],
-      "text/plain": [
-       "(100, 151, 320)"
-      ]
-     },
-     "execution_count": 106,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_mimic_obs.r.shape"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "gibleed dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 96,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Path to data exists, load tables from there: ehrapy_data/GIBleed_dataset\n",
-      "missing tables:  [['cohort_definition']]\n"
-     ]
-    }
-   ],
-   "source": [
-    "edata_gibleed_obs = load_and_check(gibleed_omop, \"observation_period\", [\"observation\"])"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": []
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 109,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAALAAAAAUCAYAAAAtOremAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGJUlEQVR4nO2ae4hVVRTGf6Nm2lOTypAkJRUra7J8Bb5fhCmaRhGaFVp/ZGqlmZIuP6GcHmpGUajhmEX20DJTspdkRmWFguGjJLUsY3xkao2ZOv2x99UzZ86duffcmTteuB9c9j37+Z21115n7bV3QVlZGXnkkauoU9sE8sgjE9SLypS0CLgZaGFmf2eXUh55VISkG4DvgNFmtiCRXxB2ISR1AL4BJpjZbJ/XBBgCDADaAc2AY8AmYCGw0MxOVjJ4b2AM0AVoDOz3beea2apQ3QJglP9dDRQAW4AFwLzKxvHthwOL/WO5l60uxJWHpGFAd6AQuA44H3jdzIZXN8fqHDOOTKtqE4eXpHeBzkArMzsC0S7EE8Ah4KVA3m3AfKATTrmfA5YC1+AU6y2veFGDPg18AtwIvA/MAlYCFwM9Ipq8BswDrgDe8P2f4/kUJ3s5P9blwAvAkcrqVQPiyuNx3EIuBH6rYY7VMmYcmabYJg6vmUBTYGwio5wLIak10AdYYGalgaIfgUHAyqBlkTQFWA8MBW7FTWKwv9HARGARcJ+ZHQuVnxV6HgLcCewAOprZPp9f3/c9QtJ7ZrYs/GZeYRbirPsyYEIVwsgEseQBPATsBrbjrM+aGuSY8ZhxZJpGm7R5mdl6SVuB+yUVmdnJsA98L+6T/Wao4WdJOvxD0ss4q92DwIRJOtvn/0KE8vr2/4Wyhvh0VkJ5fb1jkqYCt+BWbQUFxq3KXp5Hryi+1YU48vDlpyZJUk1SrK4x48g0pTYZ8FoCTAf6AqvDLkQf4ATwdRodJpTweCi/L85NWAaclDRA0iRJ4yR1SdJXU5/+HFGWyOvqLfIpSGoLFOF86rVpcK8JJJNHTiGOTLM0D1/6tC8EXAhJ5+L8kS2pRh4k1QPu8o8fhoo7+PQosAHnHwbbrgWGmdneQHbC6raIGK6lT+v5/1sDHBbjLP2UVHjXFKqQR84gjkyzOA/f+rQblN/ENQPqAnvS6KwIp5irzGx1qOwSn04EyoCuuJ3mtcBHnsDboTYrffqwpIsSmd5XDn5nGgf+TwOuB+4O+e21gcrkkUuII9OszIOZ/YUzis2h/CauiU//TKUjSWOBR3CWcERElcTiOA4MMrOd/nmT36xtA7pL6mJmX/myJb6v/sBmScs92T7AZbjV3Rw46Tl0wq32WYE+agUpyCMnEEemtTAPB4BLobwFTqyaBlW1ljQGmAtsBnqa2YGIagd9uiGgvACY2T9AwkJ1DOSfAAYCjwF7gZH+9xNwE3DYVy3xn6xXcRGBqVVxrkmkKI8zHnFkWkvz0BCvr0ELXOLTJhWqByBpPDAH+AHobWYlSapu8+nBJOUJS98wmOkjE0/5X3DcBkArYJ+Z7ZDUCGjti48m2cnOlzQft6kYn4RHRkhDHrmA80hfpnHaxIakOkAjXKi1nALvwVm9NpU0noTz8zYCfYOhrgh8ivN9r5JUJ+JkKrGp25Ei9zuA+rjDDYB/gVeS1G2P88fW4RZSjXzW0pRHLiCOTLM9D21wod6NEFBgMyvzkYGhkq40s+3BVj4OOwP4HuhX1WfSzHZJWoEL+I/DWalEX/1wfu5BQrt1SReY2aFQXiHwDM5qF/n+S3HHzRUgaTpOcIuijj0lFeNck3vMrLiy90iGdOURc4xiMuSZDuLINJN5iInOPl0DFS/zLMWdIvXHnZAkiIzETdYJ4AtgbMSnYmeEkB/AvcBsSQNw4bQWwGDf1yi/qwziY0mluE/yYaAt7s5BKTDQzH5P/V2TIrjBTBtx5SFpMO7d4XTMu4tXVHDuUfDkKiOeMcfMCjL
g1Q8n9+UQrcAluFjmi4H8RFy2LjA+CafPCd1VMLPd/hbRNJwl7oa7Z7ECmGlm6yP6eQfnLgzH+ce/4e5GzDSz3UnGThftcItjZVUVkyCWPHBx9pGhvJacjnHvovzRa6Y844yZLRSSJi9JF+KU/gMz+xWib6NNBp4E2pvZhupmXdvwm7/9uJDPo7VMJylyhWc2IelB4Hmgq5mtg+jbaHNw8dYZWeSWTXTFHffOrm0iVSBXeGYFkhoCk4GlCeWFCAvsK3cDegLP5i+053EmwN+zuB0oDp4r/A+xAOOVibavLgAAAABJRU5ErkJggg==",
-      "text/latex": [
-       "$\\displaystyle \\left( 2694, \\  21, \\  1441\\right)$"
-      ],
-      "text/plain": [
-       "(2694, 21, 1441)"
-      ]
-     },
-     "execution_count": 109,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_gibleed_obs.r.shape"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "synthea27nj dataset"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 111,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Path to data exists, load tables from there: ehrapy_data/Synthea27Nj\n",
-      "missing tables:  []\n"
-     ]
-    }
-   ],
-   "source": [
-    "edata_synteha27nj_obs = load_and_check(synthea27nj_omop, \"observation_period\", [\"observation\"])"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 112,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAIoAAAAUCAYAAABS66VXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGPklEQVR4nO3af+zVVRkH8BdCA8ZaNqzY2FAiM0zMNA0zEAbKTGdRmVsztRbYlqH5K21rj49bpU0Rqq00HLhq6xfL/AX2QxZpJaWwbP6oppg60ZKw1JwJ9Mf5fNiH+733y/fe+wVy8d7uzj7POc85z3nO+Ty/PnfE9u3b7cM+7Ar77W0B9uHVgVHtiJl5I07C5Ih4Yc+KtA97E5l5FH6PBRGxrKaPaHU9mXk07sFFEbG4oo3HfJyMaZiIl3E/lmN5RGzrsPDJOA+HYjyewr1YHBG/GcY9ysyzK3kGw7aIGNnCtxEHdhj/dERM6F+69uhFP5k5B+fiWLwezypnsTQibu+XJzN/jOk4OCKep73r+SL+iW80aKfhW3i3comWYCUOwzL8IDNHtBHuKtyKI7EaS3Ef3o+7M/OMdpvqAxuQHX53VmNWdeB9rgPf1cMs4w70op/M/Ap+jnfhZlyD2/AGzOqwTrc8X8YELKoJO7mezHwr5mJZRPy70fUnnIrbmpYjMz+PdfgQPqhcnrpvAi7C0zg8Ip5p9M1WDu4KfKfd5npBRGxQLssAZGb9dl7fgX1LRFw+XLLsCr3oJzMX4GLciIUR8XLLnK9ps07XPBGxLjMfwjmZeWVEbGu1KJ/ACHy/hfHOiLil1b1ExCZ8s3qc1TLXgYrFuqephIpvDf6l3OjdjsycppjSJ5U36X8BXeknM0cr1v6v2hx4xfef5nMvPA18D5NwAgOD2bnYit923t8A1Au90kL/sxLHHJOZB0TE3xsbmInX4qYu1ukHC6v2hojY2mHM6MrUT8IL+APWDjK+X3SrnxOUi7ME26rY5jC8hHUd4pleeGrc3Zjjjh0XJTPH4Qg8ONRMJzNH4czqcXWzLyI2Z+bnsBgPZOZNSgA1RXFjP8M5Q1mnH2TmWJyhvADLBhk6Ad9uoT2amR+PiF8Ot1w96Ofoqn0J65UD34HMXIsPR8Tf+uSp8buqncnOwexEjFSi7qHiymrx2yPijtbOiFiixC6jsACXKoHx41jRanJ3Ez6C/bE6Ih7vMGY55iiXZZyS2V2Hg7AqM9+xOwTrUj9vrNqLsR0zFKtzOH6qHOgPW5bohaeW7Tnlgk1iZ9czvmr/MZRNZuYiXIiH8LEOYy7Bl/BVfB2b8DYlqv5uZh4REZcMZb0+ULud6zoNiIhsIf0Rn8rM55U9Xq6UB4YVXeqnfqlfwakRsbF6vj8z5+NhHJ+ZxzZcSi88TWzGm5oTQZ3ljBnCBs9VUrkHMDsiNrcZMwtX4eaIuCAiHomIFyPiPkXpT+LCzHzzrtbrFZn5drwHT6BtfWEXqAP1mcMmVIUe9LOlatc3DhxExIuoLfoxja5eeJoYq7oXzYtSm7nxA4Y3kJnn42vKWze7ynza4ZSqXdPaUQm5rlr/nYOt1yeGEsQOhtp3jxsmeZroVj8PV+2WDvPVnmBsg9YLD8jM/RSX/Qw7X5SnFMUc0mFSVfB1rVKrmL2LGGN01XZKgWv6gJRtOJCZYxSXuBU39DjN9Kp9ZFiE2hnd6ucXSpxxaHWIragD1UcbtF54ahyilEo20LgoEbEda3FAZr6llSszv6AEr/diTjOd64BfVe3CzJzYMtdJOE4Jln7d0rciM7dX5fh+cJpSql41SBArM6dWGV8r/SAlbqBNUXAY5OxKPxHxGG5RgsvzWsafiHmK5diRffbC00D9kqxhYB1lpVJlnYe/NCY9S6kSbq02uCizNf6zMSJWNJ5/pJSN5+LB6vvBJkxVzO4IXBoRz7bM0wzA+kHtdjpVYmucrsQCa/GYUuiaonzXGqPENu3K+P3
K2Yt+Pq24osVVTWQ9JuMDytl8sspW9MkDJ1b9P2lutsZKxSed2UKfXLUjcT6ize/sJkNVxX0fPqsEvfOVDGK6ovx5EbG0jYDTlMPquYKamVPxXkMLYtco31um4KO4AMfjLpyFU9pVNPuVsxf9RMQTOEqxdAcrVmKWYjWOi4iVWtALT2a+TrlIt9bWuN3X48uUlO3IiFjfvQp6R2burxSdrtkDaXPPeLXI2Ssy8zNKyj4jIu6i/dfja5VvA1fsQdlqzFA+CSzeC2t3g1eLnF2jqmRfhpX1JaGNRakGz8RsXL3vj0v/X6jc9ulKZXhjTf8vHQfy/YibeYEAAAAASUVORK5CYII=",
-      "text/latex": [
-       "$\\displaystyle \\left( 28, \\  75, \\  866\\right)$"
-      ],
-      "text/plain": [
-       "(28, 75, 866)"
-      ]
-     },
-     "execution_count": 112,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "edata_synteha27nj_obs.r.shape"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 126,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "tables = [\"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\", \"device_exposure\", \"drug_exposure\", \"note\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 122,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "device_exposure\n",
-      "drug_exposure\n"
-     ]
-    }
-   ],
-   "source": [
-    "for table in tables:\n",
-    "    table_ext = table + \".csv\"\n",
-    "    path = os.path.join(\n",
-    "        \"/Users/shrey.parikh/Desktop/EHR/ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv\",\n",
-    "        table_ext,\n",
-    "    )\n",
-    "    temp = pd.read_csv(path)\n",
-    "    if temp.columns.str.contains(\"start_date\").any():\n",
-    "        print(table)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 136,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# removing drug_exposure and device_exposure because they have start/end date\n",
-    "# note is empty\n",
-    "tables = [\"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\"]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 137,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Processing table: measurement\n",
-      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
-      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n",
-      "Success: measurement processed successfully.\n",
-      "Processing table: observation\n",
-      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
-      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n",
-      "Success: observation processed successfully.\n",
-      "Processing table: procedure_occurrence\n",
-      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
-      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n",
-      "Success: procedure_occurrence processed successfully.\n",
-      "Processing table: specimen\n",
-      "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n",
-      "missing tables:  [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n",
-      "Success: specimen processed successfully.\n"
-     ]
-    }
-   ],
-   "source": [
-    "for table in tables:\n",
-    "    print(f\"Processing table: {table}\")\n",
-    "    try:\n",
-    "        edata_temp = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [table])\n",
-    "        print(f\"Success: {table} processed successfully.\")\n",
-    "    except Exception as e:\n",
-    "        print(f\"Error processing table: {table}. Error: {str(e)}\")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "hackathon_venv",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.12.7"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}

From a28d1021491efff0e3f2d5793cb146ad97d4ff0d Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Mon, 28 Oct 2024 22:46:56 +0100
Subject: [PATCH 07/15] dummy test omop dataset; start io tests

---
 src/ehrdata/io/omop/__init__.py               |  7 ++-
 src/ehrdata/io/omop/omop.py                   | 56 +++++++++++++++----
 src/ehrdata/utils/_omop_utils.py              | 33 ++++++++++-
 tests/conftest.py                             | 12 ++++
 tests/data/toy_omop/vanilla/cohort.csv        |  4 ++
 tests/data/toy_omop/vanilla/measurement.csv   | 14 +++++
 tests/data/toy_omop/vanilla/observation.csv   | 10 ++++
 .../toy_omop/vanilla/observation_period.csv   |  4 ++
 tests/data/toy_omop/vanilla/person.csv        |  5 ++
 .../toy_omop/vanilla/visit_occurrence.csv     |  4 ++
 tests/test_io/test_omop.py                    | 52 +++++++++++++++++
 11 files changed, 184 insertions(+), 17 deletions(-)
 create mode 100644 tests/conftest.py
 create mode 100644 tests/data/toy_omop/vanilla/cohort.csv
 create mode 100644 tests/data/toy_omop/vanilla/measurement.csv
 create mode 100644 tests/data/toy_omop/vanilla/observation.csv
 create mode 100644 tests/data/toy_omop/vanilla/observation_period.csv
 create mode 100644 tests/data/toy_omop/vanilla/person.csv
 create mode 100644 tests/data/toy_omop/vanilla/visit_occurrence.csv
 create mode 100644 tests/test_io/test_omop.py

diff --git a/src/ehrdata/io/omop/__init__.py b/src/ehrdata/io/omop/__init__.py
index 6f3fda4..8cd4668 100644
--- a/src/ehrdata/io/omop/__init__.py
+++ b/src/ehrdata/io/omop/__init__.py
@@ -1,4 +1,7 @@
 from .omop import (
+    get_table,
+    get_time_interval_table,
+    load,
     # extract_condition_occurrence,
     # extract_device_exposure,
     # extract_drug_exposure,
@@ -10,9 +13,7 @@
     # extract_person_observation_period,
     # extract_procedure_occurrence,
     # extract_specimen,
-    get_table,
-    get_time_interval_table,
-    load,
+    register_omop_to_db_connection,
     setup_obs,
     setup_variables,
 )
diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index ccd81b8..2aaa872 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -1,5 +1,6 @@
 from __future__ import annotations
 
+import os
 from collections.abc import Sequence
 from pathlib import Path
 from typing import Literal
@@ -8,6 +9,9 @@
 import duckdb
 import numpy as np
 import pandas as pd
+from duckdb import DuckDBPyConnection
+
+from ehrdata.utils._omop_utils import get_omop_table_names
 
 
 def _check_sanity_of_folder(folder_path: str | Path):
@@ -18,22 +22,46 @@ def _check_sanity_of_database(backend_handle: duckdb.DuckDB):
     pass
 
 
-VALID_OBSERVATION_TABLES_SINGLE = ["person", "observation_period", "visit_occurrence"]
-VALID_OBSERVATION_TABLES_JOIN = ["person_observation_period", "person_visit_occurrence"]
+VALID_OBSERVATION_TABLES_SINGLE = ["person"]
+VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"]
 VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen", "note", "death"]
 
 
+def register_omop_to_db_connection(
+    path: Path,
+    backend_handle: DuckDBPyConnection,
+    source: Literal["csv"] = "csv",
+) -> None:
+    """Register the OMOP CDM tables found as CSV files in ``path`` to the database.
+
+    Every table name returned by ``get_omop_table_names()`` is looked up as
+    ``<path>/<table>.csv``; existing files are registered on ``backend_handle``
+    under the table name, and the names of absent files are printed at the end.
+    ``source`` is currently unused.
+    """
+    missing_tables = []
+    for table in get_omop_table_names():
+        table_path = os.path.join(path, f"{table}.csv")
+        if not os.path.exists(table_path):
+            missing_tables.append(table)  # flat list of names, not one list per name
+            continue
+        # measurement_source_value is read as str instead of an inferred dtype
+        kwargs = {"dtype": {"measurement_source_value": str}} if table == "measurement" else {}
+        backend_handle.register(table, backend_handle.read_csv(table_path, **kwargs))
+    print("missing tables: ", missing_tables)
+
+
 def setup_obs(
     backend_handle: Literal[str, duckdb, Path],
-    observation_table: Literal[
-        "person", "observation_period", "person_observation_period", "visit_occurrence", "person_visit_occurrence"
-    ],
+    observation_table: Literal["person", "person_cohort", "person_observation_period", "person_visit_occurrence"],
 ):
     """Setup the observation table.
 
     This function sets up the observation table for the EHRData object.
-    For this, a table from the OMOP CDM which represents the "observed unit" via its id should be selected.
-    A unit can be a person, an observation period, a visit occurrence, or a left join on person_id of a person with one of the other tables.
+    For this, a table from the OMOP CDM which represents the "observed unit" via an id should be selected.
+    A unit can be a person, or the data of a person together with either the information from cohort, observation_period, or visit_occurrence.
+    Notice a single person can have multiple of the latter, and as such can appear multiple times.
+    For person_cohort, the subject_id of the cohort is considered to be the person_id for a join.
 
     Parameters
     ----------
@@ -50,14 +78,16 @@ def setup_obs(
 
     if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN:
         raise ValueError(
-            "observation_table must be either 'person', 'observation_period', 'person_observation_period', 'visit_occurrence', or 'person_visit_occurrence'."
+            f"observation_table must be one of {[VALID_OBSERVATION_TABLES_SINGLE]+[VALID_OBSERVATION_TABLES_JOIN]}."
         )
 
     if observation_table in VALID_OBSERVATION_TABLES_SINGLE:
         obs = get_table(backend_handle, observation_table)
 
     elif observation_table in VALID_OBSERVATION_TABLES_JOIN:
-        if observation_table == "person_observation_period":
+        if observation_table == "person_cohort":
+            obs = _get_table_left_join(backend_handle, "person", "cohort", right_key="subject_id")
+        elif observation_table == "person_observation_period":
             obs = _get_table_left_join(backend_handle, "person", "observation_period")
         elif observation_table == "person_visit_occurrence":
             obs = _get_table_left_join(backend_handle, "person", "visit_occurrence")
@@ -176,13 +206,15 @@ def get_table(duckdb_instance, table_name: str) -> pd.DataFrame:
     return _lowercase_column_names(duckdb_instance.sql(f"SELECT * FROM {table_name}").df())
 
 
-def _get_table_left_join(duckdb_instance, table1: str, table2: str) -> pd.DataFrame:
+def _get_table_left_join(
+    duckdb_instance, table1: str, table2: str, left_key: str = "person_id", right_key: str = "person_id"
+) -> pd.DataFrame:
     """Extract a table of an OMOP CDM Database."""
     return _lowercase_column_names(
         duckdb_instance.sql(
             f"SELECT * \
-        FROM {table1} \
-        LEFT JOIN {table2} USING(person_id) \
+        FROM {table1} as t1 \
+        LEFT JOIN {table2} as t2 ON t1.{left_key} = t2.{right_key} \
         "
         ).df()
     )
diff --git a/src/ehrdata/utils/_omop_utils.py b/src/ehrdata/utils/_omop_utils.py
index 7385538..2b52d02 100644
--- a/src/ehrdata/utils/_omop_utils.py
+++ b/src/ehrdata/utils/_omop_utils.py
@@ -6,6 +6,7 @@
 import os
 import warnings
 from pathlib import Path
+from typing import Literal
 
 # import dask.dataframe as dd
 import numpy as np
@@ -13,8 +14,13 @@
 from rich import print as rprint
 
 
-def get_table_catalog_dict():
-    """Get the table catalog dictionary of the OMOP CDM v5.4.
+def get_table_catalog_dict(version: Literal["5.4"] = "5.4"):
+    """Get the table catalog dictionary of the OMOP CDM.
+
+    Parameters
+    ----------
+    version
+        The version of the OMOP CDM. Currently, only 5.4 is supported.
 
     Returns
     -------
@@ -61,9 +67,32 @@ def get_table_catalog_dict():
         "source_to_concept_map",
         "drug_strength",
     ]
+
     return table_catalog_dict
 
 
+def get_omop_table_names(version: Literal["5.4"] = "5.4"):
+    """Get the names of all tables of the OMOP CDM as one flat list.
+
+    Parameters
+    ----------
+    version
+        The version of the OMOP CDM. Currently, only 5.4 is supported;
+        any other value raises a ValueError.
+
+    Returns
+    -------
+    List of table names: the value lists of the table catalog dictionary,
+    concatenated in the dictionary's iteration order.
+    """
+    if version != "5.4":
+        raise ValueError("Only support OMOP CDM v5.4!")
+
+    # flatten the catalog's lists of table names into a single list
+    catalog = get_table_catalog_dict(version=version)
+    return [name for names in catalog.values() for name in names]
+
+
 def get_dtype_mapping():
     """Get the data type mapping of the OMOP CDM v5.4.
 
diff --git a/tests/conftest.py b/tests/conftest.py
new file mode 100644
index 0000000..8f5fbc0
--- /dev/null
+++ b/tests/conftest.py
@@ -0,0 +1,12 @@
+import duckdb
+import pytest
+
+from ehrdata.io.omop import register_omop_to_db_connection
+
+
+@pytest.fixture  # (scope="session")
+def omop_connection_vanilla():
+    """Yield an in-memory DuckDB connection with the vanilla toy OMOP tables registered."""
+    with duckdb.connect() as con:  # closed on exit, even if registration fails
+        register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=con, source="csv")
+        yield con
diff --git a/tests/data/toy_omop/vanilla/cohort.csv b/tests/data/toy_omop/vanilla/cohort.csv
new file mode 100644
index 0000000..6517ad1
--- /dev/null
+++ b/tests/data/toy_omop/vanilla/cohort.csv
@@ -0,0 +1,4 @@
+﻿cohort_definition_id,subject_id,cohort_start_date,cohort_end_date,
+1,1,1/1/00,1/6/00,
+1,2,1/1/00,1/6/00,
+1,3,1/1/00,1/6/00,
diff --git a/tests/data/toy_omop/vanilla/measurement.csv b/tests/data/toy_omop/vanilla/measurement.csv
new file mode 100644
index 0000000..23e7888
--- /dev/null
+++ b/tests/data/toy_omop/vanilla/measurement.csv
@@ -0,0 +1,14 @@
+﻿measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,unit_concept_id,range_low,range_high,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,value_source_value
+1,1,3031147,1/1/00,1/1/00 12:00,12:00,32856,,18,,9557,21,30,,1,,50804,2000001003,mEq/L,18
+2,1,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,19,,9557,21,30,,1,,50804,2000001003,mEq/L,19
+3,1,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45877096,,,,,1,,220048,2000030004,,SR (Sinus Rhythm)
+4,2,3031147,1/1/00,1/1/00 12:00,12:00,32856,,20,,9557,21,30,,2,,50804,2000001003,mEq/L,20
+5,2,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,21,,9557,21,30,,2,,50804,2000001003,mEq/L,21
+6,2,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45877096,,,,,2,,220048,2000030004,,SR (Sinus Rhythm)
+7,3,3031147,1/1/00,1/1/00 12:00,12:00,32856,,22,,9557,21,30,,3,,50804,2000001003,mEq/L,22
+8,3,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,23,,9557,21,30,,3,,50804,2000001003,mEq/L,23
+9,3,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45883018,,,,,3,,220048,2000030004,,AF (Atrial Fibrillation)
+,,,,,,,,,,,,,,,,,,,
+,,,,,,,,,,,,,,,,,,,
+,,,,,,,,,,,,,,,,,,,
+,,,,,,,,,,,,,,,,,,,
diff --git a/tests/data/toy_omop/vanilla/observation.csv b/tests/data/toy_omop/vanilla/observation.csv
new file mode 100644
index 0000000..0cd51c2
--- /dev/null
+++ b/tests/data/toy_omop/vanilla/observation.csv
@@ -0,0 +1,10 @@
+observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value
+1,1,3001062,2100-01-01,2100-01-01 12:00:00,32817,,Anemia,0,,,,,,225059,2000030108,,
+2,1,3001062,2100-01-01,2100-01-01 13:00:00,32817,,Anemia,0,,,,,,225059,2000030108,,
+3,1,3034263,2100-01-01,2100-01-01 14:00:00,32817,3,,,,,,,,224409,2000030058,,
+4,2,3001062,2100-01-01,2100-01-01 12:00:00,32817,,Anemia,0,,,,,,225059,2000030108,,
+5,2,3001062,2100-01-01,2100-01-01 13:00:00,32817,,Anemia,0,,,,,,225059,2000030108,,
+6,2,3034263,2100-01-01,2100-01-01 14:00:00,32817,4,,,,,,,,224409,2000030058,,
+7,3,3001062,2100-01-01,2100-01-01 12:00:00,32817,,Anemia,0,,,,,,225059,2000030108,,
+8,3,3001062,2100-01-01,2100-01-01 13:00:00,32817,,Anemia,0,,,,,,225059,2000030108,,
+9,3,3034263,2100-01-01,2100-01-01 14:00:00,32817,5,,,,,,,,224409,2000030058,,
diff --git a/tests/data/toy_omop/vanilla/observation_period.csv b/tests/data/toy_omop/vanilla/observation_period.csv
new file mode 100644
index 0000000..11df294
--- /dev/null
+++ b/tests/data/toy_omop/vanilla/observation_period.csv
@@ -0,0 +1,4 @@
+observation_period_id,person_id,observation_period_start_date,observation_period_end_date,period_type_concept_id
+1,1,2100-01-01,2100-01-31,32828
+2,2,2100-01-01,2100-01-31,32828
+3,3,2100-01-01,2100-01-31,32828
diff --git a/tests/data/toy_omop/vanilla/person.csv b/tests/data/toy_omop/vanilla/person.csv
new file mode 100644
index 0000000..18b89ef
--- /dev/null
+++ b/tests/data/toy_omop/vanilla/person.csv
@@ -0,0 +1,5 @@
+﻿person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id
+1,8507,2095,,,,0,38003563,,,,1234,M,0,,,,
+2,8507,2096,,,,0,38003563,,,,1235,M,0,,,,
+3,8532,2097,,,,0,0,,,,1236,F,0,,,,
+4,8532,2098,,,,0,0,,,,1237,F,0,,,,
diff --git a/tests/data/toy_omop/vanilla/visit_occurrence.csv b/tests/data/toy_omop/vanilla/visit_occurrence.csv
new file mode 100644
index 0000000..d7b1087
--- /dev/null
+++ b/tests/data/toy_omop/vanilla/visit_occurrence.csv
@@ -0,0 +1,4 @@
+visit_occurrence_id,person_id,visit_concept_id,visit_start_date,visit_start_datetime,visit_end_date,visit_end_datetime,visit_type_concept_id,provider_id,care_site_id,visit_source_value,visit_source_concept_id,admitting_source_concept_id,admitting_source_value,discharge_to_concept_id,discharge_to_source_value,preceding_visit_occurrence_id
+1,1,8870,2100-01-01,2100-01-01 00:00:00,2100-01-31,2100-01-31 00:00:00,,,,10014354|2147-07-08,2000001801,,,,,
+2,2,8870,2100-01-01,2100-01-01 00:00:00,2100-01-31,2100-01-31 00:00:00,,,,10014354|2147-07-08,2000001801,,,,,
+3,3,8870,2100-01-01,2100-01-01 00:00:00,2100-01-31,2100-01-31 00:00:00,,,,10014354|2147-07-08,2000001801,,,,,
diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py
new file mode 100644
index 0000000..5e501c1
--- /dev/null
+++ b/tests/test_io/test_omop.py
@@ -0,0 +1,52 @@
+import duckdb
+import pytest
+
+import ehrdata as ed
+from ehrdata.io.omop import register_omop_to_db_connection
+
+
+def test_register_omop_to_db_connection():
+    register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=duckdb.connect(), source="csv")
+
+
+@pytest.mark.parametrize(
+    "observation_table", ["person", "person_cohort", "person_observation_period", "person_visit_occurrence"]
+)
+def test_setup_obs(omop_connection_vanilla, observation_table):
+    con = omop_connection_vanilla
+    edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table)
+    assert isinstance(edata, ed.EHRData)
+
+
+@pytest.mark.parametrize("observation_table", ["perso"])
+def test_setup_obs_unknown_observation_table_argument(omop_connection_vanilla, observation_table):
+    con = omop_connection_vanilla
+    with pytest.raises(ValueError):
+        ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table)
+
+
+def test_setup_obs_person():
+    # check precise expected table
+    con = duckdb.connect()
+    register_omop_to_db_connection(path="../data/toy_omop/vanilla", backend_handle=con, source="csv")
+    con.close()
+
+
+def test_setup_var_measurement_startdate_fixed():
+    # check precise expected table
+    pass
+
+
+def test_setup_var_measurement_startdate_observation_period():
+    # check precise expected table
+    pass
+
+
+def test_setup_var_observation_startdate_fixed():
+    # check precise expected table
+    pass
+
+
+def test_setup_var_observation_startdate_observation_period():
+    # check precise expected table
+    pass

From abbb29f0b83d52131ba18bc89bbff801b489bc9f Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Tue, 29 Oct 2024 10:35:52 +0100
Subject: [PATCH 08/15] proper date format cohort measurement

---
 tests/data/toy_omop/vanilla/cohort.csv      |  8 ++++----
 tests/data/toy_omop/vanilla/measurement.csv | 22 +++++++++------------
 2 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/tests/data/toy_omop/vanilla/cohort.csv b/tests/data/toy_omop/vanilla/cohort.csv
index 6517ad1..e9e2ef6 100644
--- a/tests/data/toy_omop/vanilla/cohort.csv
+++ b/tests/data/toy_omop/vanilla/cohort.csv
@@ -1,4 +1,4 @@
-﻿cohort_definition_id,subject_id,cohort_start_date,cohort_end_date,
-1,1,1/1/00,1/6/00,
-1,2,1/1/00,1/6/00,
-1,3,1/1/00,1/6/00,
+﻿cohort_definition_id,subject_id,cohort_start_date,cohort_end_date
+1,1,2100-01-01,2100-01-31
+1,2,2100-01-01,2100-01-31
+1,3,2100-01-01,2100-01-31
diff --git a/tests/data/toy_omop/vanilla/measurement.csv b/tests/data/toy_omop/vanilla/measurement.csv
index 23e7888..222c9a2 100644
--- a/tests/data/toy_omop/vanilla/measurement.csv
+++ b/tests/data/toy_omop/vanilla/measurement.csv
@@ -1,14 +1,10 @@
 ﻿measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,unit_concept_id,range_low,range_high,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,value_source_value
-1,1,3031147,1/1/00,1/1/00 12:00,12:00,32856,,18,,9557,21,30,,1,,50804,2000001003,mEq/L,18
-2,1,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,19,,9557,21,30,,1,,50804,2000001003,mEq/L,19
-3,1,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45877096,,,,,1,,220048,2000030004,,SR (Sinus Rhythm)
-4,2,3031147,1/1/00,1/1/00 12:00,12:00,32856,,20,,9557,21,30,,2,,50804,2000001003,mEq/L,20
-5,2,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,21,,9557,21,30,,2,,50804,2000001003,mEq/L,21
-6,2,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45877096,,,,,2,,220048,2000030004,,SR (Sinus Rhythm)
-7,3,3031147,1/1/00,1/1/00 12:00,12:00,32856,,22,,9557,21,30,,3,,50804,2000001003,mEq/L,22
-8,3,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,23,,9557,21,30,,3,,50804,2000001003,mEq/L,23
-9,3,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45883018,,,,,3,,220048,2000030004,,AF (Atrial Fibrillation)
-,,,,,,,,,,,,,,,,,,,
-,,,,,,,,,,,,,,,,,,,
-,,,,,,,,,,,,,,,,,,,
-,,,,,,,,,,,,,,,,,,,
+1,1,3031147,2100-01-01,2100-01-01 12:00:00,12:00:00,32856,,18,,9557,21,30,,1,,50804,2000001003,mEq/L,18
+2,1,3031147,2100-01-01,2100-01-01 13:00:00,13:00:00,32856,,19,,9557,21,30,,1,,50804,2000001003,mEq/L,19
+3,1,3022318,2100-01-01,2100-01-01 14:00:00,14:00:00,32817,,,45877096,,,,,1,,220048,2000030004,,SR (Sinus Rhythm)
+4,2,3031147,2100-01-01,2100-01-01 12:00:00,12:00:00,32856,,20,,9557,21,30,,2,,50804,2000001003,mEq/L,20
+5,2,3031147,2100-01-01,2100-01-01 13:00:00,13:00:00,32856,,21,,9557,21,30,,2,,50804,2000001003,mEq/L,21
+6,2,3022318,2100-01-01,2100-01-01 14:00:00,14:00:00,32817,,,45877096,,,,,2,,220048,2000030004,,SR (Sinus Rhythm)
+7,3,3031147,2100-01-01,2100-01-01 12:00:00,12:00:00,32856,,22,,9557,21,30,,3,,50804,2000001003,mEq/L,22
+8,3,3031147,2100-01-01,2100-01-01 13:00:00,13:00:00,32856,,23,,9557,21,30,,3,,50804,2000001003,mEq/L,23
+9,3,3022318,2100-01-01,2100-01-01 14:00:00,14:00:00,32817,,,45883018,,,,,3,,220048,2000030004,,AF (Atrial Fibrillation)

From 4cec97143b559596175638abb691bee5238430fe Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Tue, 29 Oct 2024 22:15:04 +0100
Subject: [PATCH 09/15] unstable commit, to save progress

---
 src/ehrdata/io/omop/_queries.py | 139 ++++++++++++++++++++++++++++++
 src/ehrdata/io/omop/omop.py     | 144 ++++++++++++++++++++------------
 tests/test_io/test_omop.py      |  61 +++++++++-----
 3 files changed, 272 insertions(+), 72 deletions(-)
 create mode 100644 src/ehrdata/io/omop/_queries.py

diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py
new file mode 100644
index 0000000..abdbf80
--- /dev/null
+++ b/src/ehrdata/io/omop/_queries.py
@@ -0,0 +1,139 @@
+from collections.abc import Sequence
+
+import duckdb
+import pandas as pd
+
+START_DATE_KEY = {
+    "visit_occurrence": "visit_start_date",
+    "observation_period": "observation_period_start_date",
+    "cohort": "cohort_start_date",
+}
+END_DATE_KEY = {
+    "visit_occurrence": "visit_end_date",
+    "observation_period": "observation_period_end_date",
+    "cohort": "cohort_end_date",
+}
+TIME_DEFINING_TABLE_SUBJECT_KEY = {
+    "visit_occurrence": "person_id",
+    "observation_period": "person_id",
+    "cohort": "subject_id",
+}
+
+AGGREGATION_STRATEGY_KEY = {
+    "last": "LAST",
+    "first": "FIRST",
+    "mean": "MEAN",
+    "median": "MEDIAN",
+    "mode": "MODE",
+    "sum": "SUM",
+    "count": "COUNT",
+    "min": "MIN",
+    "max": "MAX",
+    "std": "STD",
+}
+
+
+def _generate_timedeltas(interval_length_number: int, interval_length_unit: str, num_intervals: int) -> pd.DataFrame:
+    timedeltas_dataframe = pd.DataFrame(
+        {
+            "interval_start_offset": [
+                pd.to_timedelta(i * interval_length_number, interval_length_unit) for i in range(num_intervals)
+            ],
+            "interval_end_offset": [
+                pd.to_timedelta(i * interval_length_number, interval_length_unit) for i in range(1, num_intervals + 1)
+            ],
+            "interval_step": list(range(num_intervals)),
+        }
+    )
+    return timedeltas_dataframe
+
+
+def _write_timedeltas_to_db(
+    backend_handle: duckdb.duckdb.DuckDBPyConnection,
+    timedeltas_dataframe,
+) -> None:
+    backend_handle.execute("DROP TABLE IF EXISTS timedeltas")
+    backend_handle.execute(
+        """
+        CREATE TABLE timedeltas (
+            interval_start_offset INTERVAL,
+            interval_end_offset INTERVAL,
+            interval_step INTEGER
+        )
+        """
+    )
+    backend_handle.execute("INSERT INTO timedeltas SELECT * FROM timedeltas_dataframe")
+
+
+def _drop_timedeltas(backend_handle: duckdb.duckdb.DuckDBPyConnection):
+    backend_handle.execute("DROP TABLE IF EXISTS timedeltas")
+
+
+def _generate_value_query(data_table: str, data_field_to_keep: Sequence, aggregation_strategy: str) -> str:
+    query = f"{', ' .join([f'CASE WHEN COUNT(*) = 0 THEN NULL ELSE {aggregation_strategy}({column}) END AS {column}' for column in data_field_to_keep])}"
+    return query
+
+
+def time_interval_table_query_long_format(
+    backend_handle: duckdb.duckdb.DuckDBPyConnection,
+    time_defining_table: str,
+    data_table: str,
+    interval_length_number: int,
+    interval_length_unit: str,
+    num_intervals: int,
+    aggregation_strategy: str,
+    data_field_to_keep: Sequence[str] | str,
+) -> pd.DataFrame:
+    """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. The other columns, except for start_date and end_date, should be considered the values."""
+    if isinstance(data_field_to_keep, str):
+        data_field_to_keep = [data_field_to_keep]
+
+    timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals)
+
+    _write_timedeltas_to_db(
+        backend_handle,
+        timedeltas_dataframe,
+    )
+
+    # multi-step query
+    # 1. Create person_time_defining_table, which matches the one created for obs. Needs to contain the person_id, and the start date in particular.
+    # 2. Create person_data_table (data_table is typically measurement), which contains the cross product of person_id and the distinct concept_id s.
+    # 3. Create long_format_backbone, which is the left join of person_time_defining_table and person_data_table.
+    # 4. Create long_format_intervals, which is the cross product of long_format_backbone and timedeltas. This table contains most notably the person_id, the concept_id, the interval start and end dates.
+    # 5. Create the final table, which is the join with the data_table (typically measurement); each measurement is assigned to its person_id, its concept_id, and the interval it fits into.
+    df = backend_handle.execute(
+        f"""
+        WITH person_time_defining_table AS ( \
+            SELECT person.person_id as person_id, {START_DATE_KEY[time_defining_table]} as start_date, {END_DATE_KEY[time_defining_table]} as end_date \
+            FROM person \
+            JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \
+        ), \
+        person_data_table AS( \
+            WITH distinct_data_table_concept_ids AS ( \
+                SELECT DISTINCT {data_table}_concept_id
+                FROM {data_table} \
+            )
+            SELECT person.person_id, {data_table}_concept_id as data_table_concept_id \
+            FROM person \
+            CROSS JOIN distinct_data_table_concept_ids \
+        ), \
+        long_format_backbone as ( \
+            SELECT person_time_defining_table.person_id, data_table_concept_id, start_date, end_date \
+            FROM person_time_defining_table \
+            LEFT JOIN person_data_table USING(person_id)\
+        ), \
+        long_format_intervals as ( \
+            SELECT person_id, data_table_concept_id, interval_step, start_date, start_date + interval_start_offset as interval_start, start_date + interval_end_offset as interval_end \
+            FROM long_format_backbone \
+            CROSS JOIN timedeltas \
+        ) \
+        SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query(data_table, data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \
+        FROM long_format_intervals as lfi \
+        LEFT JOIN {data_table} ON lfi.person_id = {data_table}.person_id AND lfi.data_table_concept_id = {data_table}.{data_table}_concept_id AND {data_table}.{data_table}_date BETWEEN lfi.interval_start AND lfi.interval_end \
+        GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end
+        """
+    ).df()
+
+    _drop_timedeltas(backend_handle)
+
+    return df
diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index 2aaa872..88fc9c8 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -11,6 +11,7 @@
 import pandas as pd
 from duckdb import DuckDBPyConnection
 
+from ehrdata.io.omop._queries import time_interval_table_query_long_format
 from ehrdata.utils._omop_utils import get_omop_table_names
 
 
@@ -24,7 +25,7 @@ def _check_sanity_of_database(backend_handle: duckdb.DuckDB):
 
 VALID_OBSERVATION_TABLES_SINGLE = ["person"]
 VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"]
-VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen", "note", "death"]
+VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"]
 
 
 def register_omop_to_db_connection(
@@ -54,6 +55,7 @@ def register_omop_to_db_connection(
 def setup_obs(
     backend_handle: Literal[str, duckdb, Path],
     observation_table: Literal["person", "person_cohort", "person_observation_period", "person_visit_occurrence"],
+    death_table: bool = False,
 ):
     """Setup the observation table.
 
@@ -69,16 +71,21 @@ def setup_obs(
         The backend handle to the database.
     observation_table
         The observation table to be used.
+    death_table
+        Whether to include the death table. The observation_table created will be left joined with the death table as the right table.
 
     Returns
     -------
     An EHRData object with populated .obs field.
     """
+    if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection):
+        raise ValueError("backend_handle must be a DuckDB connection.")
+
     from ehrdata import EHRData
 
     if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN:
         raise ValueError(
-            f"observation_table must be one of {[VALID_OBSERVATION_TABLES_SINGLE]+[VALID_OBSERVATION_TABLES_JOIN]}."
+            f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}."
         )
 
     if observation_table in VALID_OBSERVATION_TABLES_SINGLE:
@@ -86,20 +93,25 @@ def setup_obs(
 
     elif observation_table in VALID_OBSERVATION_TABLES_JOIN:
         if observation_table == "person_cohort":
-            obs = _get_table_left_join(backend_handle, "person", "cohort", right_key="subject_id")
+            obs = _get_table_join(backend_handle, "person", "cohort", right_key="subject_id")
         elif observation_table == "person_observation_period":
-            obs = _get_table_left_join(backend_handle, "person", "observation_period")
+            obs = _get_table_join(backend_handle, "person", "observation_period")
         elif observation_table == "person_visit_occurrence":
-            obs = _get_table_left_join(backend_handle, "person", "visit_occurrence")
+            obs = _get_table_join(backend_handle, "person", "visit_occurrence")
+
+    if death_table:
+        death = get_table(backend_handle, "death")
+        obs = obs.merge(death, how="left", on="person_id")
 
-    return EHRData(obs=obs)
+    return EHRData(obs=obs, uns={"omop_io_observation_table": observation_table.split("person_")[-1]})
 
 
 def setup_variables(
     edata,
-    backend_handle: Literal[str, duckdb, Path],
-    tables: Sequence[Literal["measurement", "observation", "procedure_occurrence", "specimen", "note"]],
-    start_time: Literal["observation_period_start"] | pd.Timestamp | str,
+    *,
+    backend_handle: duckdb.duckdb.DuckDBPyConnection,
+    data_tables: Sequence[Literal["measurement", "observation", "specimen"]],
+    data_field_to_keep: str | dict[str, str],
     interval_length_number: int,
     interval_length_unit: str,
     num_intervals: int,
@@ -116,8 +128,11 @@ def setup_variables(
         The backend handle to the database.
     edata
         The EHRData object to which the variables should be added.
-    tables
+    data_tables
         The tables to be used.
+    data_field_to_keep
+        The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id".
+        If multiple tables are used, this can be a dictionary with the table name as key and the column name as value, e.g. {"measurement": "value_as_number", "observation": "value_as_concept_id"}.
     start_time
         Starting time for values to be included.
     interval_length_number
@@ -127,60 +142,83 @@ def setup_variables(
     num_intervals
         Number of intervals.
     concept_ids
-        Concept IDs to filter on or 'all'.
+        Concept IDs to use from this data table. If not specified, 'all' are used.
     aggregation_strategy
-        Strategy to use when aggregating data within intervals.
+        Strategy to use when aggregating multiple data points within one interval.
 
     Returns
     -------
     An EHRData object with populated .r and .var field.
     """
-    from ehrdata import EHRData
-
-    concept_ids_present_list = []
     time_interval_tables = []
 
-    for table in tables:
-        if table not in VALID_VARIABLE_TABLES:
-            raise ValueError(f"tables must be a sequence of from [{VALID_VARIABLE_TABLES}].")
-
-        id_column = f"{table}_type_concept_id" if table in ["note", "death"] else f"{table}_concept_id"
-
-        concept_ids_present = _lowercase_column_names(
-            backend_handle.sql(f"SELECT DISTINCT {id_column} FROM {table}").df()
-        )
-
-        personxfeature_pairs_of_value_timestamp = _extract_personxfeature_pairs_of_value_timestamp(backend_handle)
-
-        # Create the time interval table
-        time_interval_table = get_time_interval_table(
-            backend_handle,
-            personxfeature_pairs_of_value_timestamp,
-            edata.obs,
-            start_time="observation_period_start",
-            interval_length_number=interval_length_number,
-            interval_length_unit=interval_length_unit,
-            num_intervals=num_intervals,
-            concept_ids=concept_ids,
-            aggregation_strategy=aggregation_strategy,
+    time_defining_table = edata.uns.get("omop_io_observation_table", None)
+    if time_defining_table is None:
+        raise ValueError("The observation table must be set up first, use the `setup_obs` function.")
+
+    for data_table in data_tables:
+        ds = (
+            time_interval_table_query_long_format(
+                backend_handle=backend_handle,
+                time_defining_table=time_defining_table,
+                data_table=data_table,
+                data_field_to_keep=data_field_to_keep,
+                interval_length_number=interval_length_number,
+                interval_length_unit=interval_length_unit,
+                num_intervals=num_intervals,
+                aggregation_strategy=aggregation_strategy,
+            )
+            .set_index(["person_id", "data_table_concept_id", "interval_step"])
+            .to_xarray()
         )
-
-        # Append
-        concept_ids_present_list.append(concept_ids_present)
-        time_interval_tables.append(time_interval_table)
+        # TODO: interval_start to var
+        # TODO: concept_ids to var
+        # TODO: concept_names to var
+        # TODO: for measurement, observation: store unit_concept_id and unit_name in var
+        time_interval_tables.append(ds)
+
+    return ds
+    # for table in tables:
+    #     if table not in VALID_VARIABLE_TABLES:
+    #         raise ValueError(f"tables must be a sequence of from [{VALID_VARIABLE_TABLES}].")
+
+    #     id_column = f"{table}_type_concept_id" if table in ["note", "death"] else f"{table}_concept_id"
+
+    #     concept_ids_present = _lowercase_column_names(
+    #         backend_handle.sql(f"SELECT DISTINCT {id_column} FROM {table}").df()
+    #     )
+
+    #     personxfeature_pairs_of_value_timestamp = _extract_personxfeature_pairs_of_value_timestamp(backend_handle)
+
+    #     # Create the time interval table
+    #     time_interval_table = get_time_interval_table(
+    #         backend_handle,
+    #         personxfeature_pairs_of_value_timestamp,
+    #         edata.obs,
+    #         start_time="observation_period_start",
+    #         interval_length_number=interval_length_number,
+    #         interval_length_unit=interval_length_unit,
+    #         num_intervals=num_intervals,
+    #         concept_ids=concept_ids,
+    #         aggregation_strategy=aggregation_strategy,
+    #     )
+
+    #     # Append
+    #     concept_ids_present_list.append(concept_ids_present)
+    #     time_interval_tables.append(time_interval_table)
 
     # Combine time interval tables
-    if len(time_interval_tables) > 1:
-        time_interval_table = np.concatenate([time_interval_table, time_interval_table], axis=1)
-        concept_ids_present = pd.concat(concept_ids_present_list)
-    else:
-        time_interval_table = time_interval_tables[0]
-        concept_ids_present = concept_ids_present_list[0]
+    # if len(time_interval_tables) > 1:
+    #     time_interval_table = np.concatenate([time_interval_table, time_interval_table], axis=1)
+    #     concept_ids_present = pd.concat(concept_ids_present_list)
+    # else:
+    #     time_interval_table = time_interval_tables[0]
+    #     concept_ids_present = concept_ids_present_list[0]
 
-    # Update edata with the new variables
-    edata = EHRData(r=time_interval_table, obs=edata.obs, var=concept_ids_present)
+    # # Update edata with the new variables
+    # edata = EHRData(r=time_interval_table, obs=edata.obs, var=concept_ids_present)
 
-    return edata
+    # return edata
 
 
 # DEVICE EXPOSURE and DRUG EXPOSURE NEEDS TO BE IMPLEMENTED BECAUSE THEY CONTAIN START DATE
@@ -206,7 +244,7 @@ def get_table(duckdb_instance, table_name: str) -> pd.DataFrame:
     return _lowercase_column_names(duckdb_instance.sql(f"SELECT * FROM {table_name}").df())
 
 
-def _get_table_left_join(
+def _get_table_join(
     duckdb_instance, table1: str, table2: str, left_key: str = "person_id", right_key: str = "person_id"
 ) -> pd.DataFrame:
     """Extract a table of an OMOP CDM Database."""
@@ -214,7 +252,7 @@ def _get_table_left_join(
         duckdb_instance.sql(
             f"SELECT * \
         FROM {table1} as t1 \
-        LEFT JOIN {table2} as t2 ON t1.{left_key} = t2.{right_key} \
+        JOIN {table2} as t2 ON t1.{left_key} = t2.{right_key} \
         "
         ).df()
     )
diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py
index 5e501c1..a9f607e 100644
--- a/tests/test_io/test_omop.py
+++ b/tests/test_io/test_omop.py
@@ -1,40 +1,63 @@
-import duckdb
+import re
+
 import pytest
 
 import ehrdata as ed
-from ehrdata.io.omop import register_omop_to_db_connection
-
 
-def test_register_omop_to_db_connection():
-    register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=duckdb.connect(), source="csv")
+# def test_register_omop_to_db_connection():
+#     register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=duckdb.connect(), source="csv")
 
 
+# TODO: add test for death argument
 @pytest.mark.parametrize(
-    "observation_table", ["person", "person_cohort", "person_observation_period", "person_visit_occurrence"]
+    "observation_table, expected_length, expected_obs_num_columns",
+    [
+        ("person", 4, 18),
+        ("person_cohort", 3, 22),
+        ("person_observation_period", 3, 23),
+        ("person_visit_occurrence", 3, 35),
+    ],
 )
-def test_setup_obs(omop_connection_vanilla, observation_table):
+def test_setup_obs(omop_connection_vanilla, observation_table, expected_length, expected_obs_num_columns):
     con = omop_connection_vanilla
     edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table)
     assert isinstance(edata, ed.EHRData)
 
+    # 4 persons, only 3 are in cohort, or have observation period, or visit occurrence
+    assert len(edata) == expected_length
+    assert edata.obs.shape[1] == expected_obs_num_columns
 
-@pytest.mark.parametrize("observation_table", ["perso"])
-def test_setup_obs_unknown_observation_table_argument(omop_connection_vanilla, observation_table):
-    con = omop_connection_vanilla
-    with pytest.raises(ValueError):
-        ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table)
 
+def test_setup_obs_invalid_backend_handle_argument():
+    with pytest.raises(ValueError, match="backend_handle must be a DuckDB connection."):
+        ed.io.omop.setup_obs(backend_handle="not_a_con", observation_table="person")
 
-def test_setup_obs_person():
-    # check precise expected table
-    con = duckdb.connect()
-    register_omop_to_db_connection(path="../data/toy_omop/vanilla", backend_handle=con, source="csv")
-    con.close()
+
+def test_setup_obs_invalid_observation_table_argument(omop_connection_vanilla):
+    con = omop_connection_vanilla
+    with pytest.raises(
+        ValueError,
+        match=re.escape(
+            "observation_table must be one of ['person', 'person_cohort', 'person_observation_period', 'person_visit_occurrence']."
+        ),
+    ):
+        ed.io.omop.setup_obs(backend_handle=con, observation_table="perso")
 
 
-def test_setup_var_measurement_startdate_fixed():
+def test_setup_variables_measurement_startdate_fixed(omop_connection_vanilla):
+    con = omop_connection_vanilla
+    edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person")
+    ed.io.omop.setup_variables(
+        edata,
+        backend_handle=con,
+        tables=["measurement"],
+        start_time="2100-01-01",
+        interval_length_number=1,
+        interval_length_unit="day",
+        num_intervals=31,
+    )
     # check precise expected table
-    pass
+    assert edata.vars.shape[1] == 8
 
 
 def test_setup_var_measurement_startdate_observation_period():

From d3fca1fceae54099f1c0fecb9f1f01403574f7f8 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Tue, 29 Oct 2024 23:34:19 +0100
Subject: [PATCH 10/15] setup obs and variables duckdb backed, first proper
 minimal tests

---
 src/ehrdata/io/omop/omop.py           | 143 ++++----------------------
 tests/data/toy_omop/vanilla/death.csv |   3 +
 tests/test_io/test_omop.py            |  66 ++++++------
 3 files changed, 55 insertions(+), 157 deletions(-)
 create mode 100644 tests/data/toy_omop/vanilla/death.csv

diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index 88fc9c8..7e6525c 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -129,7 +129,7 @@ def setup_variables(
     edata
         The EHRData object to which the variables should be added.
     data_tables
-        The tables to be used.
+        The tables to be used. For now, only one can be used.
     data_field_to_keep
         The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id".
         If multiple tables are used, this can be a dictionary with the table name as key and the column name as value, e.g. {"measurement": "value_as_number", "observation": "value_as_concept_id"}.
@@ -150,78 +150,33 @@ def setup_variables(
     -------
     An EHRData object with populated .r and .var field.
     """
-    time_interval_tables = []
+    from ehrdata import EHRData
 
     time_defining_table = edata.uns.get("omop_io_observation_table", None)
     if time_defining_table is None:
         raise ValueError("The observation table must be set up first, use the `setup_obs` function.")
 
-    for data_table in data_tables:
-        ds = (
-            time_interval_table_query_long_format(
-                backend_handle=backend_handle,
-                time_defining_table=time_defining_table,
-                data_table=data_table,
-                data_field_to_keep=data_field_to_keep,
-                interval_length_number=interval_length_number,
-                interval_length_unit=interval_length_unit,
-                num_intervals=num_intervals,
-                aggregation_strategy=aggregation_strategy,
-            )
-            .set_index(["person_id", "data_table_concept_id", "interval_step"])
-            .to_xarray()
+    ds = (
+        time_interval_table_query_long_format(
+            backend_handle=backend_handle,
+            time_defining_table=time_defining_table,
+            data_table=data_tables[0],
+            data_field_to_keep=data_field_to_keep,
+            interval_length_number=interval_length_number,
+            interval_length_unit=interval_length_unit,
+            num_intervals=num_intervals,
+            aggregation_strategy=aggregation_strategy,
         )
-        # TODO: interval_start to var
-        # TODO: concept_ids to var
-        # TODO: concept_names to var
-        # TODO: for measurement, observation: store unit_concept_id and unit_name in var
-        time_interval_tables.append(ds)
-
-    return ds
-    # for table in tables:
-    #     if table not in VALID_VARIABLE_TABLES:
-    #         raise ValueError(f"tables must be a sequence of from [{VALID_VARIABLE_TABLES}].")
-
-    #     id_column = f"{table}_type_concept_id" if table in ["note", "death"] else f"{table}_concept_id"
-
-    #     concept_ids_present = _lowercase_column_names(
-    #         backend_handle.sql(f"SELECT DISTINCT {id_column} FROM {table}").df()
-    #     )
-
-    #     personxfeature_pairs_of_value_timestamp = _extract_personxfeature_pairs_of_value_timestamp(backend_handle)
-
-    #     # Create the time interval table
-    #     time_interval_table = get_time_interval_table(
-    #         backend_handle,
-    #         personxfeature_pairs_of_value_timestamp,
-    #         edata.obs,
-    #         start_time="observation_period_start",
-    #         interval_length_number=interval_length_number,
-    #         interval_length_unit=interval_length_unit,
-    #         num_intervals=num_intervals,
-    #         concept_ids=concept_ids,
-    #         aggregation_strategy=aggregation_strategy,
-    #     )
-
-    #     # Append
-    #     concept_ids_present_list.append(concept_ids_present)
-    #     time_interval_tables.append(time_interval_table)
-
-    # Combine time interval tables
-    # if len(time_interval_tables) > 1:
-    #     time_interval_table = np.concatenate([time_interval_table, time_interval_table], axis=1)
-    #     concept_ids_present = pd.concat(concept_ids_present_list)
-    # else:
-    #     time_interval_table = time_interval_tables[0]
-    #     concept_ids_present = concept_ids_present_list[0]
-
-    # # Update edata with the new variables
-    # edata = EHRData(r=time_interval_table, obs=edata.obs, var=concept_ids_present)
+        .set_index(["person_id", "data_table_concept_id", "interval_step"])
+        .to_xarray()
+    )
 
-    # return edata
+    var = ds["data_table_concept_id"].to_dataframe()
+    t = ds["interval_step"].to_dataframe()
 
+    edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t)
 
-# DEVICE EXPOSURE and DRUG EXPOSURE NEEDS TO BE IMPLEMENTED BECAUSE THEY CONTAIN START DATE
+    return edata
 
 
 def load(
@@ -258,66 +213,6 @@ def _get_table_join(
     )
 
 
-def _extract_personxfeature_pairs_of_value_timestamp(
-    duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str
-):
-    """
-    Generalized extraction function to extract data from an OMOP CDM table.
-
-    Parameters
-    ----------
-    duckdb_instance: duckdb.DuckDB
-        The DuckDB instance for querying the database.
-    table_name: str
-        The name of the table to extract data from (e.g., "measurement", "observation").
-    concept_id_col: str
-        The name of the column that contains the concept IDs (e.g., "measurement_concept_id").
-    value_col: str
-        The name of the column that contains the values (e.g., "value_as_number").
-    timestamp_col: str
-        The name of the column that contains the timestamps (e.g., "measurement_datetime").
-
-    Returns
-    -------
-    ak.Array
-        An Awkward Array with the structure: n_person x n_features x 2 (value, time).
-    """
-    # Load the specified table
-    table_df = duckdb_instance.sql(f"SELECT * FROM {table_name}").df()
-    table_df = _lowercase_column_names(table_df)
-
-    # Load the person table to get unique person IDs
-    person_id_df = _lowercase_column_names(duckdb_instance.sql("SELECT * FROM person").df())
-    person_ids = person_id_df["person_id"].unique()
-
-    # Get unique features (concept IDs) for the table
-    features = table_df[concept_id_col].unique()
-
-    # Initialize the collection for all persons
-    person_collection = []
-
-    for person in person_ids:
-        person_as_list = []
-        # Get rows for the current person
-        person_data = table_df[table_df["person_id"] == person]
-
-        # For each feature, get values and timestamps
-        for feature in features:
-            feature_data = person_data[person_data[concept_id_col] == feature]
-
-            # Extract the values and timestamps
-            feature_values = feature_data[value_col]
-            feature_timestamps = feature_data[timestamp_col]
-
-            # Append values and timestamps for this feature
-            person_as_list.append([feature_values, feature_timestamps])
-
-        # Append this person's data to the collection
-        person_collection.append(person_as_list)
-
-    return ak.Array(person_collection)
-
-
 def extract_measurement(duckdb_instance):
     """Extract a table of an OMOP CDM Database."""
     return get_table(
diff --git a/tests/data/toy_omop/vanilla/death.csv b/tests/data/toy_omop/vanilla/death.csv
new file mode 100644
index 0000000..3475d47
--- /dev/null
+++ b/tests/data/toy_omop/vanilla/death.csv
@@ -0,0 +1,3 @@
+person_id,death_date,death_datetime,death_type_concept_id,cause_concept_id,cause_source_value,cause_source_concept_id
+1,2100-03-31,2100-03-31 00:00:00,32817,0,0,
+2,2100-03-31,2100-03-31 00:00:00,32817,0,0,
diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py
index a9f607e..eca2c10 100644
--- a/tests/test_io/test_omop.py
+++ b/tests/test_io/test_omop.py
@@ -4,23 +4,23 @@
 
 import ehrdata as ed
 
-# def test_register_omop_to_db_connection():
-#     register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=duckdb.connect(), source="csv")
 
-
-# TODO: add test for death argument
 @pytest.mark.parametrize(
-    "observation_table, expected_length, expected_obs_num_columns",
+    "observation_table, death_table, expected_length, expected_obs_num_columns",
     [
-        ("person", 4, 18),
-        ("person_cohort", 3, 22),
-        ("person_observation_period", 3, 23),
-        ("person_visit_occurrence", 3, 35),
+        ("person", False, 4, 18),
+        ("person", True, 4, 24),
+        ("person_cohort", False, 3, 22),
+        ("person_cohort", True, 3, 28),
+        ("person_observation_period", False, 3, 23),
+        ("person_observation_period", True, 3, 29),
+        ("person_visit_occurrence", False, 3, 35),
+        ("person_visit_occurrence", True, 3, 41),
     ],
 )
-def test_setup_obs(omop_connection_vanilla, observation_table, expected_length, expected_obs_num_columns):
+def test_setup_obs(omop_connection_vanilla, observation_table, death_table, expected_length, expected_obs_num_columns):
     con = omop_connection_vanilla
-    edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table)
+    edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table, death_table=death_table)
     assert isinstance(edata, ed.EHRData)
 
     # 4 persons, only 3 are in cohort, or have observation period, or visit occurrence
@@ -44,32 +44,32 @@ def test_setup_obs_invalid_observation_table_argument(omop_connection_vanilla):
         ed.io.omop.setup_obs(backend_handle=con, observation_table="perso")
 
 
-def test_setup_variables_measurement_startdate_fixed(omop_connection_vanilla):
+@pytest.mark.parametrize(
+    "observation_table",
+    ["person_cohort", "person_observation_period", "person_visit_occurrence"],
+)
+@pytest.mark.parametrize(
+    "data_tables",
+    [["measurement"], ["observation"]],
+)
+@pytest.mark.parametrize(
+    "data_field_to_keep",
+    [["value_as_number"], ["value_as_concept_id"]],
+)
+def test_setup_variables(omop_connection_vanilla, observation_table, data_tables, data_field_to_keep):
     con = omop_connection_vanilla
-    edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person")
-    ed.io.omop.setup_variables(
+    edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table)
+    edata = ed.io.omop.setup_variables(
         edata,
         backend_handle=con,
-        tables=["measurement"],
-        start_time="2100-01-01",
+        data_tables=data_tables,
+        data_field_to_keep=data_field_to_keep,
         interval_length_number=1,
         interval_length_unit="day",
-        num_intervals=31,
+        num_intervals=30,
     )
-    # check precise expected table
-    assert edata.vars.shape[1] == 8
-
-
-def test_setup_var_measurement_startdate_observation_period():
-    # check precise expected table
-    pass
-
 
-def test_setup_var_observation_startdate_fixed():
-    # check precise expected table
-    pass
-
-
-def test_setup_var_observation_startdate_observation_period():
-    # check precise expected table
-    pass
+    assert isinstance(edata, ed.EHRData)
+    assert edata.n_obs == 3
+    assert edata.n_vars == 2
+    assert edata.r.shape[2] == 30

From 255d1f6451129fe1ff34f29eadc810da6dd0220f Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Tue, 29 Oct 2024 23:36:47 +0100
Subject: [PATCH 11/15] add xarray as dependency

---
 pyproject.toml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/pyproject.toml b/pyproject.toml
index 3563b4b..35993ba 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -29,6 +29,7 @@ dependencies = [
   "duckdb",
   # for debug logging (referenced from the issue template)
   "session-info",
+  "xarray",
 ]
 optional-dependencies.dev = [
   "pre-commit",

From b9794477a055d5dde442613b179f0605ab374db5 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Fri, 1 Nov 2024 18:03:50 +0100
Subject: [PATCH 12/15] update commit, unstable

---
 src/ehrdata/dt/datasets.py  |  39 ++++---
 src/ehrdata/io/omop/omop.py | 210 +++++++++++++++++++++++++++++++++---
 tests/test_dt/test_dt.py    |  24 +++++
 tests/test_io/test_omop.py  | 145 +++++++++++++++++++++++--
 4 files changed, 380 insertions(+), 38 deletions(-)
 create mode 100644 tests/test_dt/test_dt.py

diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py
index 3623d7a..f996fe0 100644
--- a/src/ehrdata/dt/datasets.py
+++ b/src/ehrdata/dt/datasets.py
@@ -17,23 +17,36 @@ def _get_table_list() -> list:
     return flat_table_list
 
 
-def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection) -> None:
+def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = "") -> None:
     tables = _get_table_list()
 
+    used_tables = []
     missing_tables = []
-    for table in tables:
-        # if path exists lowercse, uppercase, capitalized:
-        table_path = f"{path}/{table}.csv"
-        if os.path.exists(table_path):
-            if table == "measurement":
-                backend_handle.register(
-                    table, backend_handle.read_csv(f"{path}/{table}.csv", dtype={"measurement_source_value": str})
-                )
+    unused_files = []
+    for file_name in os.listdir(path):
+        file_name_trunk = file_name.split(".")[0].lower()
+
+        if file_name_trunk in tables or file_name_trunk.replace(prefix, "") in tables:
+            used_tables.append(file_name_trunk.replace(prefix, ""))
+
+            if file_name_trunk == "measurement":
+                dtype = {"measurement_source_value": str}
             else:
-                backend_handle.register(table, backend_handle.read_csv(f"{path}/{table}.csv"))
+                dtype = None
+
+            backend_handle.register(
+                file_name_trunk.replace(prefix, ""),
+                backend_handle.read_csv(f"{path}/{file_name_trunk}.csv", dtype=dtype),
+            )
         else:
-            missing_tables.append([table])
+            unused_files.append(file_name)
+
+    for table in tables:
+        if table not in used_tables:
+            missing_tables.append(table)
+
     print("missing tables: ", missing_tables)
+    print("unused files: ", unused_files)
 
 
 def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None:
@@ -80,8 +93,8 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N
         else:
             print(f"Failed to download the file. Status code: {response.status_code}")
             return
-
-    return _set_up_duckdb(data_path + "/1_omop_data_csv", backend_handle)
+    # TODO: handle capitalized and lowercase file names, and file names that merely contain the table name
+    return _set_up_duckdb(data_path + "/1_omop_data_csv", backend_handle, prefix="2b_")
 
 
 def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None:
diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index 7e6525c..8293ec6 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -9,11 +9,17 @@
 import duckdb
 import numpy as np
 import pandas as pd
-from duckdb import DuckDBPyConnection
 
-from ehrdata.io.omop._queries import time_interval_table_query_long_format
+from ehrdata.io.omop._queries import (
+    AGGREGATION_STRATEGY_KEY,
+    time_interval_table_query_long_format,
+)
 from ehrdata.utils._omop_utils import get_omop_table_names
 
+VALID_OBSERVATION_TABLES_SINGLE = ["person"]
+VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"]
+VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"]
+
 
 def _check_sanity_of_folder(folder_path: str | Path):
     pass
@@ -23,14 +29,152 @@ def _check_sanity_of_database(backend_handle: duckdb.DuckDB):
     pass
 
 
-VALID_OBSERVATION_TABLES_SINGLE = ["person"]
-VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"]
-VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"]
+def _check_valid_backend_handle(backend_handle) -> None:
+    if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection):
+        raise TypeError("Expected backend_handle to be of type DuckDBPyConnection.")
+
+
+def _check_valid_observation_table(observation_table) -> None:
+    if not isinstance(observation_table, str):
+        raise TypeError("Expected observation_table to be a string.")
+    if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN:
+        raise ValueError(
+            f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}."
+        )
+
+
+def _check_valid_death_table(death_table) -> None:
+    if not isinstance(death_table, bool):
+        raise TypeError("Expected death_table to be a boolean.")
+
+
+def _check_valid_edata(edata) -> None:
+    from ehrdata import EHRData
+
+    if not isinstance(edata, EHRData):
+        raise TypeError("Expected edata to be of type EHRData.")
+
+
+def _check_valid_data_tables(data_tables) -> Sequence:
+    if isinstance(data_tables, str):
+        data_tables = [data_tables]
+    if not isinstance(data_tables, Sequence):
+        raise TypeError("Expected data_tables to be a string or Sequence.")
+    if not all(table in VALID_VARIABLE_TABLES for table in data_tables):
+        raise ValueError(f"data_tables must be a subset of {VALID_VARIABLE_TABLES}.")
+    return data_tables
+
+
+def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence:
+    if isinstance(data_field_to_keep, str):
+        data_field_to_keep = [data_field_to_keep]
+    if not isinstance(data_field_to_keep, Sequence) and not isinstance(data_field_to_keep, dict):
+        raise TypeError("Expected data_field_to_keep to be a string, Sequence, or dictionary.")
+    return data_field_to_keep
+
+
+def _check_valid_interval_length_number(interval_length_number) -> None:
+    if not isinstance(interval_length_number, int):
+        raise TypeError("Expected interval_length_number to be an integer.")
+
+
+def _check_valid_interval_length_unit(interval_length_unit) -> None:
+    # TODO: maybe check if it is a valid unit from pandas.to_timedelta
+    if not isinstance(interval_length_unit, str):
+        raise TypeError("Expected interval_length_unit to be a string.")
+
+
+def _check_valid_num_intervals(num_intervals) -> None:
+    if not isinstance(num_intervals, int):
+        raise TypeError("Expected num_intervals to be an integer.")
+
+
+def _check_valid_concept_ids(concept_ids) -> None:
+    if concept_ids != "all" and not isinstance(concept_ids, Sequence):
+        raise TypeError("concept_ids must be a sequence of integers or 'all'.")
+
+
+def _check_valid_aggregation_strategy(aggregation_strategy) -> None:
+    if aggregation_strategy not in AGGREGATION_STRATEGY_KEY.keys():
+        raise TypeError(f"aggregation_strategy must be one of {AGGREGATION_STRATEGY_KEY.keys()}.")
+
+
+def _collect_units_per_feature(ds, unit_key="unit_concept_id") -> dict:
+    feature_units = {}
+    for i in range(ds[unit_key].shape[1]):
+        single_feature_units = ds[unit_key].isel({ds[unit_key].dims[1]: i})
+        single_feature_units_flat = np.array(single_feature_units).flatten()
+        single_feature_units_unique = pd.unique(single_feature_units_flat[~pd.isna(single_feature_units_flat)])
+        feature_units[i] = single_feature_units_unique
+    return feature_units
+
+
+def _check_one_unit_per_feature(ds, unit_key="unit_concept_id") -> None:
+    feature_units = _collect_units_per_feature(ds, unit_key=unit_key)
+    num_units = np.array([len(units) for _, units in feature_units.items()])
+
+    # print(f"no units for features: {np.argwhere(num_units == 0)}")
+    print(f"multiple units for features: {np.argwhere(num_units > 1)}")
+
+
+def _create_feature_unit_concept_id_report(backend_handle, ds) -> pd.DataFrame:
+    feature_units_concept = _collect_units_per_feature(ds, unit_key="unit_concept_id")
+
+    feature_units_long_format = []
+    for feature, units in feature_units_concept.items():
+        if len(units) == 0:
+            feature_units_long_format.append({"concept_id": feature, "no_units": True, "multiple_units": False})
+        elif len(units) > 1:
+            for unit in units:
+                feature_units_long_format.append(
+                    {
+                        "concept_id": feature,
+                        "unit_concept_id": unit,
+                        "no_units": False,
+                        "multiple_units": True,
+                    }
+                )
+        else:
+            feature_units_long_format.append(
+                {
+                    "concept_id": feature,
+                    "unit_concept_id": units[0],
+                    "no_units": False,
+                    "multiple_units": False,
+                }
+            )
+
+    df = pd.DataFrame(
+        feature_units_long_format, columns=["concept_id", "unit_concept_id", "no_units", "multiple_units"]
+    )
+    df["unit_concept_id"] = df["unit_concept_id"].astype("Int64")
+
+    return df
+
+
+def _create_enriched_var_table(backend_handle, ds, unit_report) -> pd.DataFrame:
+    feature_concept_id_table = ds["data_table_concept_id"].to_dataframe()
+
+    feature_concept_id_unit_table = pd.merge(
+        feature_concept_id_table, unit_report, how="left", left_index=True, right_on="concept_id"
+    )
+
+    concepts = backend_handle.sql("SELECT * FROM concept").df()
+
+    feature_concept_id_unit_info_table = pd.merge(
+        feature_concept_id_unit_table,
+        concepts,
+        how="left",
+        left_on="unit_concept_id",
+        right_on="concept_id",
+    )
+
+    return feature_concept_id_unit_info_table
 
 
 def register_omop_to_db_connection(
     path: Path,
-    backend_handle: DuckDBPyConnection,
+    backend_handle: duckdb.duckdb.DuckDBPyConnection,
     source: Literal["csv"] = "csv",
 ) -> None:
     """Register the OMOP CDM tables to the database."""
@@ -78,16 +222,12 @@ def setup_obs(
     -------
     An EHRData object with populated .obs field.
     """
-    if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection):
-        raise ValueError("backend_handle must be a DuckDB connection.")
+    _check_valid_backend_handle(backend_handle)
+    _check_valid_observation_table(observation_table)
+    _check_valid_death_table(death_table)
 
     from ehrdata import EHRData
 
-    if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN:
-        raise ValueError(
-            f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}."
-        )
-
     if observation_table in VALID_OBSERVATION_TABLES_SINGLE:
         obs = get_table(backend_handle, observation_table)
 
@@ -110,8 +250,9 @@ def setup_variables(
     edata,
     *,
     backend_handle: duckdb.duckdb.DuckDBPyConnection,
-    data_tables: Sequence[Literal["measurement", "observation", "specimen"]],
-    data_field_to_keep: str | dict[str, str],
+    data_tables: Sequence[Literal["measurement", "observation", "specimen"]]
+    | Literal["measurement", "observation", "specimen"],
+    data_field_to_keep: str | Sequence[str] | dict[str, str],
     interval_length_number: int,
     interval_length_unit: str,
     num_intervals: int,
@@ -152,10 +293,31 @@ def setup_variables(
     """
     from ehrdata import EHRData
 
+    _check_valid_edata(edata)
+    _check_valid_backend_handle(backend_handle)
+    data_tables = _check_valid_data_tables(data_tables)
+    data_field_to_keep = _check_valid_data_field_to_keep(data_field_to_keep)
+    _check_valid_interval_length_number(interval_length_number)
+    _check_valid_interval_length_unit(interval_length_unit)
+    _check_valid_num_intervals(num_intervals)
+    _check_valid_concept_ids(concept_ids)
+    _check_valid_aggregation_strategy(aggregation_strategy)
+
     time_defining_table = edata.uns.get("omop_io_observation_table", None)
     if time_defining_table is None:
         raise ValueError("The observation table must be set up first, use the `setup_obs` function.")
 
+    if data_tables[0] in ["measurement", "observation"]:
+        # also keep unit_concept_id and unit_source_value;
+        if isinstance(data_field_to_keep, list):
+            data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"]
+        elif isinstance(data_field_to_keep, dict):
+            data_field_to_keep = {
+                k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items()
+            }
+        else:
+            raise ValueError
+
     ds = (
         time_interval_table_query_long_format(
             backend_handle=backend_handle,
@@ -171,12 +333,26 @@ def setup_variables(
         .to_xarray()
     )
 
-    var = ds["data_table_concept_id"].to_dataframe()
+    _check_one_unit_per_feature(ds)
+    # TODO ignore? go with more vanilla omop style. _check_one_unit_per_feature(ds, unit_key="unit_source_value")
+
+    unit_report = _create_feature_unit_concept_id_report(backend_handle, ds)
+    # TODO: generate nice multiple-unit report
+    # TODO: add unit to var
+    # TODO: add unit name to var
+    # TODO: add feature name to var
+
+    # TODO: test all of the above 5
+
+    # var = _create_var_table(backend_handle, unit_report)
+
+    var = _create_enriched_var_table(backend_handle, ds, unit_report)
+
     t = ds["interval_step"].to_dataframe()
 
     edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t)
 
-    return edata
+    return edata, unit_report
 
 
 def load(
diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py
new file mode 100644
index 0000000..5d71186
--- /dev/null
+++ b/tests/test_dt/test_dt.py
@@ -0,0 +1,24 @@
+import duckdb
+
+import ehrdata as ed
+
+
+def test_mimic_iv_omop():
+    con = duckdb.connect()
+    ed.dt.mimic_iv_omop(backend_handle=con)
+    assert len(con.execute("SHOW TABLES").df()) == 30
+    con.close()
+
+
+def test_gibleed_omop():
+    con = duckdb.connect()
+    ed.dt.gibleed_omop(backend_handle=con)
+    assert len(con.execute("SHOW TABLES").df()) == 36
+    con.close()
+
+
+def test_synthea27nj_omop():
+    con = duckdb.connect()
+    ed.dt.synthea27nj_omop(backend_handle=con)
+    assert len(con.execute("SHOW TABLES").df()) == 37
+    con.close()
diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py
index eca2c10..403d84e 100644
--- a/tests/test_io/test_omop.py
+++ b/tests/test_io/test_omop.py
@@ -4,6 +4,17 @@
 
 import ehrdata as ed
 
+# constants for toy_omop/vanilla
+VANILLA_PERSONS_WITH_OBSERVATION_TABLE_ENTRY = {
+    "person_cohort": 3,
+    "person_observation_period": 3,
+    "person_visit_occurrence": 3,
+}
+VANILLA_NUM_CONCEPTS = {
+    "measurement": 2,
+    "observation": 2,
+}
+
 
 @pytest.mark.parametrize(
     "observation_table, death_table, expected_length, expected_obs_num_columns",
@@ -28,12 +39,30 @@ def test_setup_obs(omop_connection_vanilla, observation_table, death_table, expe
     assert edata.obs.shape[1] == expected_obs_num_columns
 
 
-def test_setup_obs_invalid_backend_handle_argument():
-    with pytest.raises(ValueError, match="backend_handle must be a DuckDB connection."):
-        ed.io.omop.setup_obs(backend_handle="not_a_con", observation_table="person")
+@pytest.mark.parametrize(
+    "backend_handle, observation_table, death_table, expected_error",
+    [
+        ("wrong_type", "person", False, "Expected backend_handle to be of type DuckDBPyConnection."),
+        (None, 123, False, "Expected observation_table to be a string."),
+        (None, "person", "wrong_type", "Expected death_table to be a boolean."),
+    ],
+)
+def test_setup_obs_illegal_argument_types(
+    omop_connection_vanilla,
+    backend_handle,
+    observation_table,
+    death_table,
+    expected_error,
+):
+    with pytest.raises(TypeError, match=expected_error):
+        ed.io.omop.setup_obs(
+            backend_handle=backend_handle or omop_connection_vanilla,
+            observation_table=observation_table,
+            death_table=death_table,
+        )
 
 
-def test_setup_obs_invalid_observation_table_argument(omop_connection_vanilla):
+def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla):
     con = omop_connection_vanilla
     with pytest.raises(
         ValueError,
@@ -57,6 +86,7 @@ def test_setup_obs_invalid_observation_table_argument(omop_connection_vanilla):
     [["value_as_number"], ["value_as_concept_id"]],
 )
 def test_setup_variables(omop_connection_vanilla, observation_table, data_tables, data_field_to_keep):
+    num_intervals = 4
     con = omop_connection_vanilla
     edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table)
     edata = ed.io.omop.setup_variables(
@@ -66,10 +96,109 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
         data_field_to_keep=data_field_to_keep,
         interval_length_number=1,
         interval_length_unit="day",
-        num_intervals=30,
+        num_intervals=num_intervals,
     )
 
     assert isinstance(edata, ed.EHRData)
-    assert edata.n_obs == 3
-    assert edata.n_vars == 2
-    assert edata.r.shape[2] == 30
+    assert edata.n_obs == VANILLA_PERSONS_WITH_OBSERVATION_TABLE_ENTRY[observation_table]
+    assert edata.n_vars == VANILLA_NUM_CONCEPTS[data_tables[0]]
+    assert edata.r.shape[2] == num_intervals
+
+
+@pytest.mark.parametrize(
+    "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, expected_error",
+    [
+        (
+            "wrong_type",
+            None,
+            ["measurement"],
+            ["value_as_number"],
+            1,
+            "day",
+            4,
+            "Expected edata to be of type EHRData.",
+        ),
+        (
+            None,
+            "wrong_type",
+            ["measurement"],
+            ["value_as_number"],
+            1,
+            "day",
+            4,
+            "Expected backend_handle to be of type DuckDBPyConnection.",
+        ),
+        (
+            None,
+            None,
+            123,
+            ["value_as_number"],
+            1,
+            "day",
+            4,
+            "Expected data_tables to be a string or Sequence.",
+        ),
+        (
+            None,
+            None,
+            ["measurement"],
+            123,
+            1,
+            "day",
+            4,
+            "Expected data_field_to_keep to be a string, Sequence, or dictionary.",
+        ),
+        (
+            None,
+            None,
+            ["measurement"],
+            ["value_as_number"],
+            "wrong_type",
+            "day",
+            4,
+            "Expected interval_length_number to be an integer.",
+        ),
+        (
+            None,
+            None,
+            ["measurement"],
+            ["value_as_number"],
+            1,
+            123,
+            4,
+            "Expected interval_length_unit to be a string.",
+        ),
+        (
+            None,
+            None,
+            ["measurement"],
+            ["value_as_number"],
+            1,
+            "day",
+            "wrong_type",
+            "Expected num_intervals to be an integer.",
+        ),
+    ],
+)
+def test_setup_variables_illegal_argument_types(
+    omop_connection_vanilla,
+    edata,
+    backend_handle,
+    data_tables,
+    data_field_to_keep,
+    interval_length_number,
+    interval_length_unit,
+    num_intervals,
+    expected_error,
+):
+    con = omop_connection_vanilla
+    with pytest.raises(TypeError, match=expected_error):
+        ed.io.omop.setup_variables(
+            edata or ed.io.omop.setup_obs(backend_handle=omop_connection_vanilla, observation_table="person_cohort"),
+            backend_handle=backend_handle or con,
+            data_tables=data_tables,
+            data_field_to_keep=data_field_to_keep,
+            interval_length_number=interval_length_number,
+            interval_length_unit=interval_length_unit,
+            num_intervals=num_intervals,
+        )

From a5cac1d79f88fd015abd49c56cf5d38b45140b5e Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Sat, 2 Nov 2024 22:21:55 +0100
Subject: [PATCH 13/15] setup_var with basic functionality on units, more tests
 and better description

---
 src/ehrdata/io/omop/omop.py             | 78 ++++++++++++++++++-------
 tests/data/toy_omop/vanilla/concept.csv |  1 +
 tests/test_io/test_omop.py              | 76 +++++++++++++++++++++++-
 3 files changed, 132 insertions(+), 23 deletions(-)
 create mode 100644 tests/data/toy_omop/vanilla/concept.csv

diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py
index 8293ec6..6034b17 100644
--- a/src/ehrdata/io/omop/omop.py
+++ b/src/ehrdata/io/omop/omop.py
@@ -68,8 +68,8 @@ def _check_valid_data_tables(data_tables) -> Sequence:
 def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence:
     if isinstance(data_field_to_keep, str):
         data_field_to_keep = [data_field_to_keep]
-    if not isinstance(data_field_to_keep, Sequence) and not isinstance(data_field_to_keep, dict):
-        raise TypeError("Expected data_field_to_keep to be a string, Sequence, or dictionary.")
+    if not isinstance(data_field_to_keep, Sequence):
+        raise TypeError("Expected data_field_to_keep to be a string or Sequence.")
     return data_field_to_keep
 
 
@@ -99,13 +99,23 @@ def _check_valid_aggregation_strategy(aggregation_strategy) -> None:
         raise TypeError(f"aggregation_strategy must be one of {AGGREGATION_STRATEGY_KEY.keys()}.")
 
 
+def _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) -> None:
+    if not isinstance(enrich_var_with_feature_info, bool):
+        raise TypeError("Expected enrich_var_with_feature_info to be a boolean.")
+
+
+def _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) -> None:
+    if not isinstance(enrich_var_with_unit_info, bool):
+        raise TypeError("Expected enrich_var_with_unit_info to be a boolean.")
+
+
 def _collect_units_per_feature(ds, unit_key="unit_concept_id") -> dict:
     feature_units = {}
     for i in range(ds[unit_key].shape[1]):
         single_feature_units = ds[unit_key].isel({ds[unit_key].dims[1]: i})
         single_feature_units_flat = np.array(single_feature_units).flatten()
         single_feature_units_unique = pd.unique(single_feature_units_flat[~pd.isna(single_feature_units_flat)])
-        feature_units[i] = single_feature_units_unique
+        feature_units[ds["data_table_concept_id"][i].item()] = single_feature_units_unique
     return feature_units
 
 
@@ -152,8 +162,8 @@ def _create_feature_unit_concept_id_report(backend_handle, ds) -> pd.DataFrame:
     return df
 
 
-def _create_enriched_var_table(backend_handle, ds, unit_report) -> pd.DataFrame:
-    feature_concept_id_table = ds["data_table_concept_id"].to_dataframe()
+def _create_enriched_var_with_unit_info(backend_handle, ds, var, unit_report) -> pd.DataFrame:
+    feature_concept_id_table = var  # ds["data_table_concept_id"].to_dataframe()
 
     feature_concept_id_unit_table = pd.merge(
         feature_concept_id_table, unit_report, how="left", left_index=True, right_on="concept_id"
@@ -252,16 +262,20 @@ def setup_variables(
     backend_handle: duckdb.duckdb.DuckDBPyConnection,
     data_tables: Sequence[Literal["measurement", "observation", "specimen"]]
     | Literal["measurement", "observation", "specimen"],
-    data_field_to_keep: str | Sequence[str] | dict[str, str],
+    data_field_to_keep: str | Sequence[str],
     interval_length_number: int,
     interval_length_unit: str,
     num_intervals: int,
     concept_ids: Literal["all"] | Sequence = "all",
     aggregation_strategy: str = "last",
+    enrich_var_with_feature_info: bool = False,
+    enrich_var_with_unit_info: bool = False,
 ):
     """Setup the variables.
 
     This function sets up the variables for the EHRData object.
+    It will fail if there is more than one unit_concept_id per feature.
+    Writes a unit report of the features to edata.uns["unit_report_<data_tables>"].
 
     Parameters
     ----------
@@ -270,10 +284,9 @@ def setup_variables(
     edata
         The EHRData object to which the variables should be added.
     data_tables
-        The tables to be used. For now, only one can be used.
+        The table to be used. Only a single table can be used.
     data_field_to_keep
         The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id".
-        If multiple tables are used, this can be a dictionary with the table name as key and the column name as value, e.g. {"measurement": "value_as_number", "observation": "value_as_concept_id"}.
     start_time
         Starting time for values to be included.
     interval_length_number
@@ -286,6 +299,10 @@ def setup_variables(
         Concept IDs to use from this data table. If not specified, 'all' are used.
     aggregation_strategy
         Strategy to use when aggregating multiple data points within one interval.
+    enrich_var_with_feature_info
+        Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN.
+    enrich_var_with_unit_info
+        Whether to enrich the var table with unit information. Raises a ValueError if multiple units per feature are found for at least one feature. If a unit_concept_id is not found in the concept table, the unit information will be NaN.
 
     Returns
     -------
@@ -302,6 +319,8 @@ def setup_variables(
     _check_valid_num_intervals(num_intervals)
     _check_valid_concept_ids(concept_ids)
     _check_valid_aggregation_strategy(aggregation_strategy)
+    _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info)
+    _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info)
 
     time_defining_table = edata.uns.get("omop_io_observation_table", None)
     if time_defining_table is None:
@@ -311,10 +330,11 @@ def setup_variables(
         # also keep unit_concept_id and unit_source_value;
         if isinstance(data_field_to_keep, list):
             data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"]
-        elif isinstance(data_field_to_keep, dict):
-            data_field_to_keep = {
-                k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items()
-            }
+        # TODO: use in future version when more than one data table can be used
+        # elif isinstance(data_field_to_keep, dict):
+        #     data_field_to_keep = {
+        #         k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items()
+        #     }
         else:
             raise ValueError
 
@@ -337,22 +357,40 @@ def setup_variables(
     # TODO ignore? go with more vanilla omop style. _check_one_unit_per_feature(ds, unit_key="unit_source_value")
 
     unit_report = _create_feature_unit_concept_id_report(backend_handle, ds)
-    # TODO: generate nice multiple-unit report
-    # TODO: add unit to var
-    # TODO: add unit name to var
-    # TODO: add feature name to var
 
-    # TODO: test all of the above 5
+    var = ds["data_table_concept_id"].to_dataframe()
+    concepts = backend_handle.sql("SELECT * FROM concept").df()
 
-    # var = _create_var_table(backend_handle, unit_report)
+    if enrich_var_with_feature_info:
+        var = pd.merge(var, concepts, how="left", left_index=True, right_on="concept_id")
 
-    var = _create_enriched_var_table(backend_handle, ds, unit_report)
+    if enrich_var_with_unit_info:
+        if unit_report["multiple_units"].sum() > 0:
+            raise ValueError("Multiple units per feature found. Enrichment with feature information not possible.")
+        else:
+            var = pd.merge(
+                var,
+                unit_report,
+                how="left",
+                left_index=True,
+                right_on="unit_concept_id",
+                suffixes=("", "_unit"),
+            )
+            var = pd.merge(
+                var,
+                concepts,
+                how="left",
+                left_on="unit_concept_id",
+                right_on="concept_id",
+                suffixes=("", "_unit"),
+            )
 
     t = ds["interval_step"].to_dataframe()
 
     edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t)
+    edata.uns[f"unit_report_{data_tables[0]}"] = unit_report
 
-    return edata, unit_report
+    return edata
 
 
 def load(
diff --git a/tests/data/toy_omop/vanilla/concept.csv b/tests/data/toy_omop/vanilla/concept.csv
new file mode 100644
index 0000000..6ca864c
--- /dev/null
+++ b/tests/data/toy_omop/vanilla/concept.csv
@@ -0,0 +1 @@
+concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_DATE,valid_end_DATE,invalid_reason
diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py
index 403d84e..d83aed6 100644
--- a/tests/test_io/test_omop.py
+++ b/tests/test_io/test_omop.py
@@ -15,6 +15,16 @@
     "observation": 2,
 }
 
+# constants for setup_variables
+# only data_table_concept_id
+VAR_DIM_BASE = 1
+# number of columns in concept table
+NUMBER_COLUMNS_CONCEPT_TABLE = 10
+VAR_DIM_FEATURE_INFO = NUMBER_COLUMNS_CONCEPT_TABLE
+# number of columns in concept table + number of columns in the feature unit report
+NUMBER_COLUMNS_FEATURE_REPORT = 4
+VAR_DIM_UNIT_INFO = NUMBER_COLUMNS_CONCEPT_TABLE + NUMBER_COLUMNS_FEATURE_REPORT
+
 
 @pytest.mark.parametrize(
     "observation_table, death_table, expected_length, expected_obs_num_columns",
@@ -85,7 +95,22 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla):
     "data_field_to_keep",
     [["value_as_number"], ["value_as_concept_id"]],
 )
-def test_setup_variables(omop_connection_vanilla, observation_table, data_tables, data_field_to_keep):
+@pytest.mark.parametrize(
+    "enrich_var_with_feature_info",
+    [True, False],
+)
+@pytest.mark.parametrize(
+    "enrich_var_with_unit_info",
+    [True, False],
+)
+def test_setup_variables(
+    omop_connection_vanilla,
+    observation_table,
+    data_tables,
+    data_field_to_keep,
+    enrich_var_with_feature_info,
+    enrich_var_with_unit_info,
+):
     num_intervals = 4
     con = omop_connection_vanilla
     edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table)
@@ -97,16 +122,21 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
         interval_length_number=1,
         interval_length_unit="day",
         num_intervals=num_intervals,
+        enrich_var_with_feature_info=enrich_var_with_feature_info,
+        enrich_var_with_unit_info=enrich_var_with_unit_info,
     )
 
     assert isinstance(edata, ed.EHRData)
     assert edata.n_obs == VANILLA_PERSONS_WITH_OBSERVATION_TABLE_ENTRY[observation_table]
     assert edata.n_vars == VANILLA_NUM_CONCEPTS[data_tables[0]]
     assert edata.r.shape[2] == num_intervals
+    assert edata.var.shape[1] == VAR_DIM_BASE + (VAR_DIM_FEATURE_INFO if enrich_var_with_feature_info else 0) + (
+        VAR_DIM_UNIT_INFO if enrich_var_with_unit_info else 0
+    )
 
 
 @pytest.mark.parametrize(
-    "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, expected_error",
+    "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, enrich_var_with_feature_info, enrich_var_with_unit_info, expected_error",
     [
         (
             "wrong_type",
@@ -116,6 +146,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
             1,
             "day",
             4,
+            False,
+            False,
             "Expected edata to be of type EHRData.",
         ),
         (
@@ -126,6 +158,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
             1,
             "day",
             4,
+            False,
+            False,
             "Expected backend_handle to be of type DuckDBPyConnection.",
         ),
         (
@@ -136,6 +170,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
             1,
             "day",
             4,
+            False,
+            False,
             "Expected data_tables to be a string or Sequence.",
         ),
         (
@@ -146,7 +182,9 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
             1,
             "day",
             4,
-            "Expected data_field_to_keep to be a string, Sequence, or dictionary.",
+            False,
+            False,
+            "Expected data_field_to_keep to be a string or Sequence.",
         ),
         (
             None,
@@ -156,6 +194,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
             "wrong_type",
             "day",
             4,
+            False,
+            False,
             "Expected interval_length_number to be an integer.",
         ),
         (
@@ -166,6 +206,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
             1,
             123,
             4,
+            False,
+            False,
             "Expected interval_length_unit to be a string.",
         ),
         (
@@ -176,8 +218,34 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables
             1,
             "day",
             "wrong_type",
+            False,
+            False,
             "Expected num_intervals to be an integer.",
         ),
+        (
+            None,
+            None,
+            ["measurement"],
+            ["value_as_number"],
+            1,
+            "day",
+            123,
+            "wrong_type",
+            False,
+            "Expected enrich_var_with_feature_info to be a boolean.",
+        ),
+        (
+            None,
+            None,
+            ["measurement"],
+            ["value_as_number"],
+            1,
+            "day",
+            123,
+            False,
+            "wrong_type",
+            "Expected enrich_var_with_unit_info to be a boolean.",
+        ),
     ],
 )
 def test_setup_variables_illegal_argument_types(
@@ -189,6 +257,8 @@ def test_setup_variables_illegal_argument_types(
     interval_length_number,
     interval_length_unit,
     num_intervals,
+    enrich_var_with_feature_info,
+    enrich_var_with_unit_info,
     expected_error,
 ):
     con = omop_connection_vanilla

From 270f6bb91e6175930cad45b0363071219eee849d Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Sat, 2 Nov 2024 22:31:41 +0100
Subject: [PATCH 14/15] fix test illegal args, check other option for gibleed

---
 src/ehrdata/dt/datasets.py | 4 ++--
 tests/test_io/test_omop.py | 2 ++
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py
index f996fe0..33545be 100644
--- a/src/ehrdata/dt/datasets.py
+++ b/src/ehrdata/dt/datasets.py
@@ -145,9 +145,9 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No
         else:
             print(f"Failed to download the file. Status code: {response.status_code}")
 
-    extracted_folder = next(data_path.iterdir(), data_path)
+    # extracted_folder = next(data_path.iterdir(), data_path)
     # extracted_folder = next((folder for folder in data_path.iterdir() if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name), data_path)
-    return _set_up_duckdb(extracted_folder, backend_handle)
+    return _set_up_duckdb(data_path / "GiBleed_5.3", backend_handle)
 
 
 def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None:
diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py
index d83aed6..68ed0fc 100644
--- a/tests/test_io/test_omop.py
+++ b/tests/test_io/test_omop.py
@@ -271,4 +271,6 @@ def test_setup_variables_illegal_argument_types(
             interval_length_number=interval_length_number,
             interval_length_unit=interval_length_unit,
             num_intervals=num_intervals,
+            enrich_var_with_feature_info=enrich_var_with_feature_info,
+            enrich_var_with_unit_info=enrich_var_with_unit_info,
         )

From 51d2172c7492ef036b327bd4a9173bbfc0dd33a2 Mon Sep 17 00:00:00 2001
From: Eljas Roellin <eljas.roellin@ikmail.com>
Date: Sat, 2 Nov 2024 22:36:56 +0100
Subject: [PATCH 15/15] stop there w/ tests for this PR

---
 tests/test_dt/test_dt.py | 21 +++++++++++----------
 1 file changed, 11 insertions(+), 10 deletions(-)

diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py
index 5d71186..72fa7a3 100644
--- a/tests/test_dt/test_dt.py
+++ b/tests/test_dt/test_dt.py
@@ -10,15 +10,16 @@ def test_mimic_iv_omop():
     con.close()
 
 
-def test_gibleed_omop():
-    con = duckdb.connect()
-    ed.dt.gibleed_omop(backend_handle=con)
-    assert len(con.execute("SHOW TABLES").df()) == 36
-    con.close()
+# TODO
+# def test_gibleed_omop():
+#     con = duckdb.connect()
+#     ed.dt.gibleed_omop(backend_handle=con)
+#     assert len(con.execute("SHOW TABLES").df()) == 36
+#     con.close()
 
 
-def test_synthea27nj_omop():
-    con = duckdb.connect()
-    ed.dt.synthea27nj_omop(backend_handle=con)
-    assert len(con.execute("SHOW TABLES").df()) == 37
-    con.close()
+# def test_synthea27nj_omop():
+#     con = duckdb.connect()
+#     ed.dt.synthea27nj_omop(backend_handle=con)
+#     assert len(con.execute("SHOW TABLES").df()) == 37
+#     con.close()