From c0de58c2af587485bac138e21067d4bc21c17017 Mon Sep 17 00:00:00 2001 From: ShreyParikh07 Date: Thu, 10 Oct 2024 10:59:11 +0200 Subject: [PATCH 01/15] Changes to omop --- docs/notebooks/test_more_datasets_omop.ipynb | 240 +++++++++++++++++++ src/ehrdata/dt/datasets.py | 73 +++++- src/ehrdata/io/omop/omop.py | 41 +++- 3 files changed, 334 insertions(+), 20 deletions(-) create mode 100644 docs/notebooks/test_more_datasets_omop.ipynb diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb new file mode 100644 index 0000000..3cc16a8 --- /dev/null +++ b/docs/notebooks/test_more_datasets_omop.ipynb @@ -0,0 +1,240 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "from ehrdata import EHRData\n", + "EHRData().r\n", + "import anndata as ad\n", + "import duckdb\n", + "import ehrapy as ep\n", + "import ehrdata as ed\n", + "import numpy as np\n", + "import os" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "from datasets import gibleed_omop, mimic_iv_omop, synthea27nj_omop" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the mimic dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def load_and_check(dummy_func, start_time):\n", + " con = duckdb.connect()\n", + " dummy_func(backend_handle=con)\n", + " edata = ed.io.omop.setup_obs(con, \"person_observation_period\")\n", + " edata\n", + " edata = ed.io.omop.setup_variables(\n", + " backend_handle=con,\n", + " edata=edata,\n", + " tables=[\"measurement\"],\n", + " start_time=start_time,\n", + " interval_length_number=28,\n", + " interval_length_unit=\"day\",\n", + " num_intervals=\"max_observation_duration\",\n", + " concept_ids=\"all\",\n", + " aggregation_strategy=\"last\"\n", + " )\n", + " return edata" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the mimic dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", + "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n" + ] + } + ], + "source": [ + "edata_mimic = load_and_check(mimic_iv_omop, \"observation_period_start_date\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "EHRData object with n_obs x n_var = 100 x 450, and a timeseries of 320 steps.\n", + " shape of .X: (100, 450) \n", + " shape of .r: ((100, 450, 320)) " + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_mimic" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the gibleed dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to data exists, load tables from there: ehrapy_data/GIBleed_dataset\n", + "missing tables: [['cohort_definition']]\n" + ] + } + ], + "source": [ + "edata_gibleed = load_and_check(gibleed_omop, \"observation_period\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "EHRData object with n_obs x n_var = 2694 x 55, and a timeseries of 1441 steps.\n", + " shape of .X: (2694, 55) \n", + " shape of .r: ((2694, 55, 1441)) " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_gibleed" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load the Synthea27NJ dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to data exists, load tables from there: ehrapy_data/Synthea27Nj\n", + "missing tables: []\n" + ] + } + ], + "source": [ + "edata_synthea27nj = load_and_check(synthea27nj_omop, \"observation_period\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "EHRData object with n_obs x n_var = 28 x 132, and a timeseries of 866 steps.\n", + " shape of .X: (28, 132) \n", + " shape of .r: ((28, 132, 866)) " + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_synthea27nj" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "hackathon_venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.7" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index adc50bf..3623d7a 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -85,9 +85,9 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: - """Loads the GIBleed dataset. + """Loads the GIBleed dataset in the OMOP Common Data model. - More details: https://github.com/OHDSI/EunomiaDatasets. + More details: https://github.com/OHDSI/EunomiaDatasets/tree/main/datasets/GiBleed. Parameters ---------- @@ -109,13 +109,38 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No >>> ed.dt.gibleed_omop(backend_handle=con) >>> con.execute("SHOW TABLES;").fetchall() """ - # TODO: - # https://github.com/darwin-eu/EunomiaDatasets/tree/main/datasets/GiBleed - raise NotImplementedError() + if data_path is None: + data_path = Path("ehrapy_data/GIBleed_dataset") + + if data_path.exists(): + print(f"Path to data exists, load tables from there: {data_path}") + else: + print("Downloading data...") + URL = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/GiBleed/GiBleed_5.3.zip" + response = requests.get(URL) + + if response.status_code == 200: + # extract_path = data_path / "gibleed_data_csv" + # extract_path.mkdir(parents=True, exist_ok=True) + + # Use zipfile and io to open the ZIP file in memory + with zipfile.ZipFile(io.BytesIO(response.content)) as z: + # Extract all contents of the ZIP file into the correct subdirectory + z.extractall(data_path) # Extracting to 'extract_path' + print(f"Download successful. ZIP file downloaded and extracted successfully to {data_path}.") + + else: + print(f"Failed to download the file. Status code: {response.status_code}") + + extracted_folder = next(data_path.iterdir(), data_path) + # extracted_folder = next((folder for folder in data_path.iterdir() if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name), data_path) + return _set_up_duckdb(extracted_folder, backend_handle) def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: - """Loads the Synthe27Nj dataset. + """Loads the Synthea27NJ dataset in the OMOP Common Data model. + + More details: https://github.com/darwin-eu/EunomiaDatasets/tree/main/datasets/Synthea27Nj. Parameters ---------- @@ -137,9 +162,39 @@ def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None >>> ed.dt.synthea27nj_omop(backend_handle=con) >>> con.execute("SHOW TABLES;").fetchall() """ - # TODO - # https://github.com/darwin-eu/EunomiaDatasets/tree/main/datasets/Synthea27Nj - raise NotImplementedError() + if data_path is None: + data_path = Path("ehrapy_data/Synthea27Nj") + + if data_path.exists(): + print(f"Path to data exists, load tables from there: {data_path}") + else: + print("Downloading data...") + URL = "https://github.com/OHDSI/EunomiaDatasets/raw/main/datasets/Synthea27Nj/Synthea27Nj_5.4.zip" + response = requests.get(URL) + + if response.status_code == 200: + extract_path = data_path / "synthea27nj_omop_csv" + extract_path.mkdir(parents=True, exist_ok=True) + + # Use zipfile and io to open the ZIP file in memory + with zipfile.ZipFile(io.BytesIO(response.content)) as z: + # Extract all contents of the ZIP file into the correct subdirectory + z.extractall(extract_path) # Extracting to 'extract_path' + print(f"Download successful. ZIP file downloaded and extracted successfully to {extract_path}.") + + else: + print(f"Failed to download the file. Status code: {response.status_code}") + return + + extracted_folder = next( + ( + folder + for folder in data_path.iterdir() + if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name + ), + data_path, + ) + return _set_up_duckdb(extracted_folder, backend_handle) def mimic_ii(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index ae41486..d30292a 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -103,9 +103,10 @@ def setup_variables( time_interval_tables = [] for table in tables: if table == "measurement": - concept_ids_present = ( - backend_handle.sql("SELECT * FROM measurement").df()["measurement_concept_id"].unique() + concept_ids_present_df = normalize_column_names( + backend_handle.sql("SELECT * FROM measurement").df() ) + concept_ids_present = concept_ids_present_df["measurement_concept_id"].unique() extracted_awkward = extract_measurement(backend_handle) time_interval_table = get_time_interval_table( backend_handle, @@ -171,32 +172,37 @@ def load( def extract_person(duckdb_instance): """Extract person table of an OMOP CDM Database.""" - return duckdb_instance.sql("SELECT * FROM person").df() + return normalize_column_names(duckdb_instance.sql("SELECT * FROM person").df()) def extract_observation_period(duckdb_instance): """Extract person table of an OMOP CDM Database.""" - return duckdb_instance.sql("SELECT * FROM observation_period").df() + return normalize_column_names(duckdb_instance.sql("SELECT * FROM observation_period").df()) def extract_person_observation_period(duckdb_instance): """Extract observation table of an OMOP CDM Database.""" - return duckdb_instance.sql( + return normalize_column_names(duckdb_instance.sql( "SELECT * \ FROM person \ LEFT JOIN observation_period USING(person_id) \ " - ).df() + ).df()) def extract_measurement(duckdb_instance=None): """Extract measurement table of an OMOP CDM Database.""" measurement_table = duckdb_instance.sql("SELECT * FROM measurement").df() - + measurement_table = normalize_column_names(measurement_table) # get an array n_person x n_features x 2, one for value, one for time - person_id = ( - duckdb_instance.sql("SELECT * FROM person").df()["person_id"].unique() + person_id_df = ( + duckdb_instance.sql("SELECT * FROM person").df() ) # TODO: in anndata? w.r.t database? for now this + person_id_df = normalize_column_names(person_id_df) + person_id = person_id_df["person_id"].unique() + # person_id = ( + # duckdb_instance.sql("SELECT * FROM person").df()["person_id"].unique() + # ) # TODO: in anndata? w.r.t database? for now this features = measurement_table["measurement_concept_id"].unique() person_collection = [] @@ -320,11 +326,20 @@ def get_time_interval_table( concept_id_list = concept_ids if num_intervals == "max_observation_duration": + observation_period_df = con.execute("SELECT * from observation_period").df() + observation_period_df = normalize_column_names(observation_period_df) + + # Calculate the duration of observation periods num_intervals = np.max( - con.execute("SELECT * from observation_period").df()["observation_period_end_date"] - - con.execute("SELECT * from observation_period").df()["observation_period_start_date"] + observation_period_df["observation_period_end_date"] + - observation_period_df["observation_period_start_date"] ) / pd.to_timedelta(interval_length_number, interval_length_unit) num_intervals = int(np.ceil(num_intervals)) + # num_intervals = np.max( + # con.execute("SELECT * from observation_period").df()["observation_period_end_date"] + # - con.execute("SELECT * from observation_period").df()["observation_period_start_date"] + # ) / pd.to_timedelta(interval_length_number, interval_length_unit) + # num_intervals = int(np.ceil(num_intervals)) tables = [] for person, person_ts in zip(obs.iterrows(), ts, strict=False): @@ -353,6 +368,10 @@ def get_time_interval_table( return np.array(tables).transpose(0, 2, 1) # TODO: store in self, np +def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame: + """Normalize all column names to lowercase.""" + df.columns = map(str.lower, df.columns) # Convert all column names to lowercase + return df def extract_observation(): """Extract observation table of an OMOP CDM Database.""" From 35cdd473c17f02398e3f3bc6a811550ab0e99fea Mon Sep 17 00:00:00 2001 From: ShreyParikh07 Date: Thu, 10 Oct 2024 14:50:32 +0200 Subject: [PATCH 02/15] further implementation --- docs/notebooks/test_more_datasets_omop.ipynb | 256 +++++++++++++++++-- src/ehrdata/io/omop/omop.py | 236 +++++++++++------ 2 files changed, 390 insertions(+), 102 deletions(-) diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb index 3cc16a8..3ac69ad 100644 --- a/docs/notebooks/test_more_datasets_omop.ipynb +++ b/docs/notebooks/test_more_datasets_omop.ipynb @@ -39,24 +39,23 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Load the mimic dataset" + "define the function" ] }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 82, "metadata": {}, "outputs": [], "source": [ - "def load_and_check(dummy_func, start_time):\n", + "def load_and_check(dummy_func, start_time, tables):\n", " con = duckdb.connect()\n", " dummy_func(backend_handle=con)\n", " edata = ed.io.omop.setup_obs(con, \"person_observation_period\")\n", - " edata\n", " edata = ed.io.omop.setup_variables(\n", " backend_handle=con,\n", " edata=edata,\n", - " tables=[\"measurement\"],\n", + " tables=tables,\n", " start_time=start_time,\n", " interval_length_number=28,\n", " interval_length_unit=\"day\",\n", @@ -76,7 +75,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 98, "metadata": {}, "outputs": [ { @@ -89,12 +88,12 @@ } ], "source": [ - "edata_mimic = load_and_check(mimic_iv_omop, \"observation_period_start_date\")" + "edata_mimic = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [\"measurement\"])" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 99, "metadata": {}, "outputs": [ { @@ -105,7 +104,7 @@ " shape of .r: ((100, 450, 320)) " ] }, - "execution_count": 13, + "execution_count": 99, "metadata": {}, "output_type": "execute_result" } @@ -114,6 +113,30 @@ "edata_mimic" ] }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKUAAAAUCAYAAADsvf0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGXklEQVR4nO2ae6xdRRXGf30kAgVLUqSN+ACR2xAkFMWADypXsMpDA4IhIYVSIpEAXpBWDdX69cNUagKF8hAJNS0qCZBUIPJ+NUB5iGIJGkBebREDVKlUiq2l5fLHmt3uO+zTe+45u/fU5n7JyWSvmTUz35w1s/aatYf19vYyhCFsSxje6QkMYQg5RlYJbV8LHAnsJentwZ3SELZ32P4M8CfgdEnz8/phufu2/VngD8B0SXNL8hOALwETgAOAXYDrJE3uZwIfAS4AvgaMAV4FbgYs6d916dQJ25OB36THyoWzvRz4eIMuXpc0rkHfg8rN9s+Bg4AuYDdgLbAijXmFpDey9mOA44Cjgf2BPYD1wF+ABcACSe82GKtpbrZvAg4B9pG0plxX5b5nA/8BrsrkPwbOJozyH5Ur8P5J7g08AUwFHgcuAV4CzgEeTQvQtk6dsP1R4ApgTX9tgdWAK34XNei7E9y+B4wC7gHmAdcBG4BZwFOJbxnfAq4BDiYOp0uBRcCngPnAjbaH5YO0wO1CYBzQk/fVx33b7gKOAOZLWltB7hXgBeLEXFy5BH3xC2B3oEfS5aVx5qb+ZgNn1KBTC9JiLwDeAH4HTO9H5U1JswYwRCe4fVDSulxoezYwAzgfOLNU9RzwDeC28oloewZhbMcD3yQMtYwBcZP0uO1nge/YnlMeKz8pTwOGATfkJCQtlvS8pKbC9bRzJgHLgSvz7oC3gZNtj2pHp2b0AF8mdnut79Kd4lZlkAk3pnKfrP39kn6fu2hJrwG/TI+Hleva4HY98DHgK2VhbpRHABuBxxoQGQi6U3l3BcG3gIeBnYj3inZ0aoHtfYE5wDxJDzap9gHbk23PsH2O7W7bIxq07Ri3Bvh6Kp8agM47qdyQyVvl9nAq+xjlJvedrHgC8ExNEff4VD7XoP55Ynd1Afe1odM2bI8kApuXCZfWLMaxOSAqsMz2VEkPZPKOcCtgezqwMzCaCHy+SBjknCb1RwKnpMc7s+pWuf0xlRPLjcsn5R7ACCJiqgOjU7m6QX0h37VNnTrwE+BA4NSKd+lGWAAcThjmKCJSvRrYE7jD9gFZ+05xKzCdcKXnEgZ5JzBJ0j+b1J9DBDu3S7orq2uJm6TVwDrChW9COdApoqOtfuWyLcH2wcTpeLGkR5vVk+RM9FfgDNtrgGlEdHtcXfNsF8UVle2xwOcJI1tq+xhJf96Sru0egtOzwMk1T20VMLYsKJ+UxQmxQ02DFbtjdIP6Qv5mmzotI7mkXxNuZ2YdfbI5GJiYyQeVWyNIel3STYQ7HUPwbwjbZxNXSU8D3ZJWVTRrh9uObLY9oK9RrkxlXXdlf0tlV4P6Iuorv4e0otMOdk5j7Quss91b/AhXB3BNkl3aZJ+FO8wjzcHmtkVIWkEY2n62d6tqY/tc4HLCC3SnCLwKLXGzPZxw6SvL8rJRvkos6HjqQXGPOSkNXp7MLsAXgP/SN9JvRacd/A/4VYPf0tRmSXpu1rUXEeZLmXywuTWDD6dyY15h+4fEBfiThEGuzNuU0Cq38cQV5JNl4aZ3Skm9th8Ejrf9SUkv9ENoi5D0ou27CTdxFrHjNs2VOEmuLkf6rejYXghMAaZKWjjAOa4Fvl1VZ3sWEfxcm6cZ0/XRy/kthe09iWwQwG+zsQaVW9LvIlKeqzP5cOCnxGX3IxUpwJlEuvAJIhiqctltcUsoNnCfREz+QcYi4sb+q0TmpjzRY4Fj02OR1/1cWjiAf0nKMyBnAo8Al9k+HHiGSF91E0f5jyo4DlSn2Jn53dnWxInAtLSJVwBvAXsT+eIdgNupTjUONrejgAttLwGWEZmqsURG7hPAa8DpZQXbUwiD3Ag8BPTYeUzH8opN0sp/PSmNc0tZmF+eLyL8+ym8HxOIXTuFMFoSsUJ2Qq4g6UXiTmxhmuA04s+bBxySfwzQos7+hFHcVjHnrYXFwK1pXicB5xF/9BJiLY6RtD5X6gC3e4lXjw8RqcHvE4fOKuIE20/S05nOXqkcQVwfqeJ3arvcbI8mDrlbJf29XFf1ldD5wM+AT0tayjYM27sSu/9iST/o8HRqxfbMDcD2d4HLgEMlLSnXVX0ldAmR2bhgEObWLg4lUl9z+2v4f4jtlpvtHYkPQRblBgkVJ2VSmki8C1w09JHvEOpGChRPBBZKWp7XvwdACvWbXD4BcQAAAABJRU5ErkJggg==", + "text/latex": [ + "$\\displaystyle \\left( 100, \\ 450, \\ 320\\right)$" + ], + "text/plain": [ + "(100, 450, 320)" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_mimic.r.shape" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -136,12 +159,12 @@ } ], "source": [ - "edata_gibleed = load_and_check(gibleed_omop, \"observation_period\")" + "edata_gibleed = load_and_check(gibleed_omop, \"observation_period\", [\"measurement\"])" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 102, "metadata": {}, "outputs": [ { @@ -152,7 +175,7 @@ " shape of .r: ((2694, 55, 1441)) " ] }, - "execution_count": 11, + "execution_count": 102, "metadata": {}, "output_type": "execute_result" } @@ -161,6 +184,30 @@ "edata_gibleed" ] }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAALAAAAAUCAYAAAAtOremAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGpUlEQVR4nO2aeaxdUxTGfx1UmxalMUXUPNRYRLWkLTqIGKKGVKQUMSWookUbrH4S1FStEEMrbZUg1NSBEho1F6kg5uiglNfilddBaZ8/1j7teeed+9695953nsb7kpt9z9p77bP22muvs/bau1VtbS0taMHmitbNLUALWlAO2qYRJU0FTgT2MLNV+YrUghbUh6QjgI+Ai81sUkRvlQwhJB0JfACMMLNxgdYFGAScBBwM7AKsAz4DJgOTzWxDAy/vB1wB9AK2BX4NvBPMbHaibSvgovA7EGgFfAlMAh5p6D2BfwgwLTzWGWwlIWkRsFuB6l/MbKdK8JQLSWcCfYHuwKHAVsATZjakhD5K1mljPFnkkvQ80BPYx8xqIN0D3wr8ATwYo50VnpcBc4ElwI7A6bhhnSjpLDOrF1BLuhMYCSwFXgJWANsDRwDHArMTLI8D5wBVwJPAamBAeP/RwHkNDHBX4H6gBuhUqF0FsRIYn0KvqTBPObgRN5AafA72L4U5i06L5Mki1+24cx0G3AYJA5a0L9AfmGRma2JV3wCnArPiHlDSaGA+cAZuzNMT/V2MG+9U4BIzW5eo3yLxPAg33oVADzNbEejtQt/nSnrBzJ5Ljix47sm4d38OGNG4PspGtZmNyYGnHFyNG8h3uMebWyxjFp2WwFOyXGY2X9JXwKWSxprZhqQHvhD/ZD+dYHyjQIc/S3oI99rHEjNgSVsG+hJSjDfw/50gDQrlPZHxhnbrJN0EnIyHIvUMGF+Vxwc5jk+T9/8IM9toGJJKZc+i06J4ypDrKWAM/lWekzTg/sB64P0SOoyM8J8EfQAeKowHNkg6CTgIWAvMN7P3UvqKYsDvU+oiWm9J7eILQlI3YCweU8+TlJcBbxliva7AKuBTYJ6Zra8wT+7IotOc5uGdUA4A5mxMo0nqiAfUXxabeZDUlk0x6SuJ6iNDuRZYAMzEBzceeFfSm5K2T/BEXnePlNftGcq2sf+RDNNwTz+6GLkriJ3Cu2/Fx/UG8K2kvhXmyRVZdJrjPHwYyj5QNw+8C9AG36gVi7G4V51tZnMSdTuEciRQC/TGd5qHAK8GAZ5J8MwK5TWStouIIVaOf2e2jf2/GTgMOD8Rtzc1JgP9cIPsiGdnHgZ2B16WdGiFeJoDWXSayzyY2UrcKXaFupu4LqH8vZiOJA0DrgW+As5NaRItjn+AU81sUXj+LGzWvgb6SuoVCyeeCn2dAHwh6cUgbH9gZ3x1dwU2BBmOwlf7PQVCkiaDmSUDt8+ByyTV4HoZw6aYPjNP3sii02aYh9/wLFgdDxytmvaNcUu6ApgAfAEcZ2a/pTSrDuWCmPECYGargchj94jR1wOnADcAy4Gh4fctnkL7MzStCp+sx/AMyU2NyZwjHgplnybmqTiy6LSZ5qEDwV7jHrgqlF3qNY9B0nDgXtx79DOzqgJNvw5ldYH6yNN3iBNDZuKO8Iu/tz2wD7DCzBZK6gzsG6rXFtjJTpQ0Ed9UDC8gR6WxPJQdm5inKdCJ0nWahSczJLUGOuOp1joGvAxX5H4NMF+Px72fAAPiqa4UvI7HvgdIap1ygnZQKBcWKfvZQDv8cAPgL+DRAm0Px+Oxt/GFlGd40TOUaZmUSvI0BbLoNO952A9P9X4CMQM2s1pJ84AzJO1tZt/FuUIe9hbgY2BggbBhI8xssaQZ+AHIVbjXjvoaiMe51SSyF5K2NrM/ErTuwF241x4b+l+DHzfXg6QxuOKmph17SpqChyYXmNmUhsZRoP9uwJJktkbS7vgJFPiJYrk8ZclZKrLotJx5yIhosc+F+kfJ0/FTtRPwE5JIkKG48a4H3gKGpXwqFqUo+XJ8AONCHngBniI7LfR1UdhVxvGapDV4iPIn0A2/g7EGOMXMfip+rAUR32BmwWDg2rDgF+Ny7oXL2R4/Hr+7Ajzlyomk03B9w6Y8e6+wOMBDsjxOLeugDLkG4rbzIqQbcBWe230gRo/ysm2A4QVkehOYEieY2dJwi+hm3BP3we9ZzABuN7P5Kf08i4cLQ/D4+EfgkdB+aYF3l4qDcQOa1VjDApiLf8oOA47BY9dq/FM5DZiWci8kC0+5coLn9ocmaHuyKZe+mHyO3ZPoTolySdoGN/qZZvYDpN9GG4VflDjczBZUWurmRtj8/YqnfK5rZnEKYnORM09IuhK4D+htZm9D+oX2e/F86y05ypYneuPH3+OaW5BGsLnImQskdQBGAdMj44UUDxwa9wGOA+5uudDegv8CwiZ4MDAlfq7wLxytAalq0cnyAAAAAElFTkSuQmCC", + "text/latex": [ + "$\\displaystyle \\left( 2694, \\ 55, \\ 1441\\right)$" + ], + "text/plain": [ + "(2694, 55, 1441)" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_gibleed.r.shape" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -170,7 +217,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 101, "metadata": {}, "outputs": [ { @@ -183,12 +230,12 @@ } ], "source": [ - "edata_synthea27nj = load_and_check(synthea27nj_omop, \"observation_period\")" + "edata_synthea27nj = load_and_check(synthea27nj_omop, \"observation_period\", [\"measurement\"])" ] }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 113, "metadata": {}, "outputs": [ { @@ -199,7 +246,7 @@ " shape of .r: ((28, 132, 866)) " ] }, - "execution_count": 12, + "execution_count": 113, "metadata": {}, "output_type": "execute_result" } @@ -210,10 +257,181 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 104, "metadata": {}, - "outputs": [], - "source": [] + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAJcAAAAUCAYAAACAu68PAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGVElEQVR4nO3af+zWVRUH8BdqM6LSBhmLZZMUp6khpukKxIk6tczMVn9Utha6laEmmrrqdNwMaIk/qtUWm1ixlek0EYY/kmVoSSlOm1o208SJv638NRPoj3uf9vDheeD7fT78MMZ7e3b2ufee++P9nM+55577GbF27VrbsR2bAzts7Qlsx7aLnXoVZuaVOBZ7RMRLW3ZK2/H/hsw8CH/C9IiY1ykf0dwWM/Ng3ImZETG3lo3GJ3A89sc4vIb7cAWuiIg1fQY+HmdgX4zGE7gLcyPi95twjZ3xTsbhmIgP4G1YEBGf3YDOHHwQEzAGr+BRXIcfRMSzjfYD89EWg/KZmUfidByGd+DZOt/LImJxW53MvBaHYq+IeJHe2+JF+Bd+1FX2KfwEH1IM71Jcg/0wD1dl5ogek5uDGzAJS3AZ7sbHcXtm9v3DW+AbCiET8fgQdc7CKNxc57gAr+PbuDcz39NoPxAfbTEon5n5XdyivEDX42IswjsxdRPpzMJYzOgUrLMtZuYETMO8iHilq+qvOAGLut/IzLwAy/FJnKQQ3Kkbi5l4EgdExFNddUfgVlyIn/daXAuchZX4m+LBlg5B5+0R8WqzMDMvwgU4H1/uqho2H20xKJ+ZOR3n4EqcGhGvNerf1GOsYetExPLMfBCnZebsiFjT9FxfxAj8sqF4a0QsbLr6iFiFH9fHqY2+3qt4xju7iah6S/Fv5S3YpIiIpRHxUEQM+Rjcy7Aqrqpyr0b7Qfhoi2HzmZk7KzvRP/Qwkqr7n7Y6XfgFdsdRrB/QT8Nq/KGPci90Bnq9Uf6QEocckpljIuKZrgVMUWKh64YxztbAx6q8dxg6/fhoi0H4PEoxuEuxpsZr++FVLO8Tow2i08HtXX3c+D/jysxRSpzywFBPiJm5Ez5fH5d010XEc5n5dczF/Zl5nRIQvk/ZUm7GaUMZZ0shM2firdhFiTU+ohjW7CHq9+WjLQbk8+AqX8UKxUi653sbTo6Ip1vqdPDHKqewbkA/Djsqp4+hYnYdfHFE3NisjIhLldhjJ0zHeUow/BjmN937GwAzEThTMawlOLoPkb2wQT7aYgA+d6vyHKzFZMXDHYCbFCP41SbQ6czvn4pR7s66xjW6yueHstDMnIGz8SA+16fNubga85U3bBQOwsNYUE8kbxhExNiIGKGcek7CeKzIzEkb0x0KH20xAJ+d//d1nBARyyLixYi4T0mlrMThmXlYS51uPKekc9Yxrs7p8M1DWOTpyjH4fhwREc/1aDMVc3B9RHwtIh6OiJcj4u46ycdxdmaO39h4WxoR8WREXIujlZfupxtqPxQ+2mJAPl+ockVEPNLdX0S8jI53PaSlTjdGqrbUbVwdlzp6veZdyMwz8X38WSFyVZ+mH61yvVRAneTyOv6BGxpvayIiHlUM5v2ZOaZXm2Hw0RaD8PmXKl/o02dnlxrZUgdk5g7YVbWlbuN6Ak9j7z6dqgHlJbhHIXJDMdPOVfZLN3TK1zvqvsHw7ipXNyuGyUdbDMLnb5S4ad/6xzfRCdb/3lKng72VVNY9dBlXzQvdhjGZuWdTKzO/qQSsd+HI7qNwH/yuylMzc1yjr2PxYSX4u6NRNz8z12bmFzbS/yZBZk7IzF16lO9Qk6i74Y6IeL5RP1w+2q5t2HxWz7tQCbDPaOgcjWMUD7WkjU4XDq1yKevnua5RssvHKBnuTqenKNnf1XWRMzKz2fEjETG/6/lq5fpgGh6od0+rsI/i4kfgvOa9nXUDymEjM0/EifVxbJWHZWZnbs9ExMwuleMwKzOXKW/js3iXkt0fX+c8vTHGIHzQbm2D8vkVZaucW3NWK7CHwtFqfKme8trqUGLU1fg1vY3rKSVX88Ou8j2q3FE5pvfCb5VTDIiINZl5XJ3oZ5Sg8y3KaWIxLo+Im3r0s7+SbV7UZ5yNYSJOaZSNrz/KhXS3cd2CPZXUw4FKzPCScsXzszrPZoA+bD4qBl7boHxGxMr61cK3lHzYFOXueCFmRcTyTaFTvf+JuCEiHqP3VxHn4zuYFBErhktCG2TmrornuDgizt2SY29ubMtrg8z8Ki7H5IhYRu+vIi5R7pUu3IJz62Cycn0ydyuMvbmxza4tM0cql/vXdAyLHp6rNp6CI/C97R8LbsfGkJn74NPKLcEjnfL/AjQSP4HwDZy2AAAAAElFTkSuQmCC", + "text/latex": [ + "$\\displaystyle \\left( 28, \\ 132, \\ 866\\right)$" + ], + "text/plain": [ + "(28, 132, 866)" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_synthea27nj.r.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# check by loading the data with observation.csv" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "mimic dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", + "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n" + ] + } + ], + "source": [ + "edata_mimic_obs = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [\"observation\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKUAAAAUCAYAAADsvf0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAF1klEQVR4nO2aa4hVVRTHf6NC2mQGikr28DkiJY5l9FQbtKnsgfagL5oZSGI2Gg6FViz/gWlgmo9eJDg1CSWIRWlmD9HUyjLFIk1Txx6ok5qmppQ2fdjn6pk954733nPwTsP84bLvWWuvs9Y6e52991r7FNTU1NCEJjQkNMu3AU1ogo8WUURJbwC3A13M7Ni5NakJjR2Srga+AUab2XyfX+Av35KuAb4Cys1sZoh+HzAQKAb6AK2BhWY2/CwGXAI8C9wGtAX2AO8CMrM/kpKJg1x8k1QFXJ6Gvc/MOiahJy4kPQ/0A4qAdsBxYDfuec4zswNe/7bAMOAOoDfQCfgb+A5YACwws3/T6Mp43CQtAa4DepjZ0TAvavmeCvwJvOLRnwbG4R7ob5FPoK6R3YANwChgPTAL2AmMB74IHkBsmQSQtW8BDgOK+M1IWE8cPA4UAh8Ds4GFwElgCrBZ0qVe//uB14FrcZPTi8Bi4EpgPrBIUoGvJIdxmwZ0BMr8e9VaviUVAYOB+WZ2PMK5X4GfcG/7yshHUBsvA+2BMjObG9IzM7jfVGBMAjJxkYtvAIfMbMo50BMHF5rZCZ8oaSowGZgEjA2xtgF3A0vDM6Kkybhguxe4BxeoYWQ1bma2XtJW4BFJ08O6/JnyYaAAeMd3wsxWmtl2M8soXQ/enFKgCnjJvx1wDBghqTCOTBLI1reGrsfTWScgAywK2h5e/8/M7H1/iTazvcCrweXNYV6McXsbuAy4JUz0g3IwcAr4Mo0j2aAkaFdEOHgEWAucj9tXxJHJJ86TNFzSZEnjJZVIap5vozLEXUG7OQuZf4L2pEfPddzWBm2toDy9fAdRXAxsSSjj7hm029Lwt+PeriLg0xgy+URHoNKj7ZI0ysxW5cOgdJBUDlwAtMElPjfhAnJ6hvItgAeDy+UeO9dx+zpoB4Q7h2fKTkBzXMaUBNoE7eE0/BT9opgy+cICYBAuMAtxmeprQGfgQ0l98mdaJMpxS+kEXEAuB0rN7PcM5afjkp1lZvaRx8tp3MzsMHACt4SfRjjRSWVHiZdcGiPMTB7pe2CMpKPARFx2O+xc25UOqRKVpA7ADbgg2yjpTjP7tj5ZSWU4n7YCIxI27SDQIUwIz5SpbLtlQspSb0ebNPwU/VBMmYaGVDIwoN5eeYKZ7TOzJbjltC3wZn39JY3DlZJ+AErM7GBEtzjj1oozsQfUDsrqoE2qDvhj0Bal4aeyvvA+JBeZhobUcphohSBpmNluXKBdIaldVB9JE4C5uFWgJMjAo5DTuElqhlvSq8P0cFDuwT3QniSDVA2uNFAeNqY1cCPwF7Uz/VxkGhpSGebOvFqRGS4O2lM+Q9KTuAL4JlxAVvt9Qsh13HriSpCbwsTTNwhqZ6uBdpK61+/L2WFmO4AVuI3/ox5buJmkMpzp5yIjqUJSjaSH4tqcKST1iqqVSuoMzAsu30pATyzfJBVJqrOkSmoWFM/bA+sijgCfwe05NwCDzGx/fXpyGbcAqRe41iGC/0HGYlzF/lbcqUPY0KHA0OAyda57vaSK4P9+Myv37jcWWAfMkTQI2II7virBTeVP1XUxa5nUi+XXzjJGDr49AEyUtBp3jnwE6IY7L24JLCPiqDEHPXF9GwJMk7QG2AUcwCUVA4GuwF5gtGfjSNz59Sngc6BM8nM6qsyswqPlMtalgZ73wsSooKzG1aP8ynwxMNKjdQ1+4AanVlCa2Q5J/ThzSD8Et02YTZqPK3KQ6Y0LiqX+vbJAMdn5thK39PTFLU2FuE38GlzdsjLNqU22euL69gnQHVcC6ovbvx3DBUklMCcicekStM1x5aMorAIqwoRsxy2YwYcCH5jZL2Fe1FdCk4DngKvMbGN6f/MPSRfh3v4XzOyJPJuTKBqzbwCSHgPmAP3NbE2YF/WV0CzgZ1zEN3T0xx19zTxbx/8hGq1vklrhPgRZ7AckRMyUgdAA3F5gRtNHvk1IGpJ64fblFWZW5fP/A5RDnlkqCjNBAAAAAElFTkSuQmCC", + "text/latex": [ + "$\\displaystyle \\left( 100, \\ 151, \\ 320\\right)$" + ], + "text/plain": [ + "(100, 151, 320)" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_mimic_obs.r.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "gibleed dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to data exists, load tables from there: ehrapy_data/GIBleed_dataset\n", + "missing tables: [['cohort_definition']]\n" + ] + } + ], + "source": [ + "edata_gibleed_obs = load_and_check(gibleed_omop, \"observation_period\", [\"observation\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAALAAAAAUCAYAAAAtOremAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGJUlEQVR4nO2ae4hVVRTGf6Nm2lOTypAkJRUra7J8Bb5fhCmaRhGaFVp/ZGqlmZIuP6GcHmpGUajhmEX20DJTspdkRmWFguGjJLUsY3xkao2ZOv2x99UzZ86duffcmTteuB9c9j37+Z21115n7bV3QVlZGXnkkauoU9sE8sgjE9SLypS0CLgZaGFmf2eXUh55VISkG4DvgNFmtiCRXxB2ISR1AL4BJpjZbJ/XBBgCDADaAc2AY8AmYCGw0MxOVjJ4b2AM0AVoDOz3beea2apQ3QJglP9dDRQAW4AFwLzKxvHthwOL/WO5l60uxJWHpGFAd6AQuA44H3jdzIZXN8fqHDOOTKtqE4eXpHeBzkArMzsC0S7EE8Ah4KVA3m3AfKATTrmfA5YC1+AU6y2veFGDPg18AtwIvA/MAlYCFwM9Ipq8BswDrgDe8P2f4/kUJ3s5P9blwAvAkcrqVQPiyuNx3EIuBH6rYY7VMmYcmabYJg6vmUBTYGwio5wLIak10AdYYGalgaIfgUHAyqBlkTQFWA8MBW7FTWKwv9HARGARcJ+ZHQuVnxV6HgLcCewAOprZPp9f3/c9QtJ7ZrYs/GZeYRbirPsyYEIVwsgEseQBPATsBrbjrM+aGuSY8ZhxZJpGm7R5mdl6SVuB+yUVmdnJsA98L+6T/Wao4WdJOvxD0ss4q92DwIRJOtvn/0KE8vr2/4Wyhvh0VkJ5fb1jkqYCt+BWbQUFxq3KXp5Hryi+1YU48vDlpyZJUk1SrK4x48g0pTYZ8FoCTAf6AqvDLkQf4ATwdRodJpTweCi/L85NWAaclDRA0iRJ4yR1SdJXU5/+HFGWyOvqLfIpSGoLFOF86rVpcK8JJJNHTiGOTLM0D1/6tC8EXAhJ5+L8kS2pRh4k1QPu8o8fhoo7+PQosAHnHwbbrgWGmdneQHbC6raIGK6lT+v5/1sDHBbjLP2UVHjXFKqQR84gjkyzOA/f+rQblN/ENQPqAnvS6KwIp5irzGx1qOwSn04EyoCuuJ3mtcBHnsDboTYrffqwpIsSmd5XDn5nGgf+TwOuB+4O+e21gcrkkUuII9OszIOZ/YUzis2h/CauiU//TKUjSWOBR3CWcERElcTiOA4MMrOd/nmT36xtA7pL6mJmX/myJb6v/sBmScs92T7AZbjV3Rw46Tl0wq32WYE+agUpyCMnEEemtTAPB4BLobwFTqyaBlW1ljQGmAtsBnqa2YGIagd9uiGgvACY2T9AwkJ1DOSfAAYCjwF7gZH+9xNwE3DYVy3xn6xXcRGBqVVxrkmkKI8zHnFkWkvz0BCvr0ELXOLTJhWqByBpPDAH+AHobWYlSapu8+nBJOUJS98wmOkjE0/5X3DcBkArYJ+Z7ZDUCGjti48m2cnOlzQft6kYn4RHRkhDHrmA80hfpnHaxIakOkAjXKi1nALvwVm9NpU0noTz8zYCfYOhrgh8ivN9r5JUJ+JkKrGp25Ei9zuA+rjDDYB/gVeS1G2P88fW4RZSjXzW0pRHLiCOTLM9D21wod6NEFBgMyvzkYGhkq40s+3BVj4OOwP4HuhX1WfSzHZJWoEL+I/DWalEX/1wfu5BQrt1SReY2aFQXiHwDM5qF/n+S3HHzRUgaTpOcIuijj0lFeNck3vMrLiy90iGdOURc4xiMuSZDuLINJN5iInOPl0DFS/zLMWdIvXHnZAkiIzETdYJ4AtgbMSnYmeEkB/AvcBsSQNw4bQWwGDf1yi/qwziY0mluE/yYaAt7s5BKTDQzH5P/V2TIrjBTBtx5SFpMO7d4XTMu4tXVHDuUfDkKiOeMcfMCjLg1Q8n9+UQrcAluFjmi4H8RFy2LjA+CafPCd1VMLPd/hbRNJwl7oa7Z7ECmGlm6yP6eQfnLgzH+ce/4e5GzDSz3UnGThftcItjZVUVkyCWPHBx9pGhvJacjnHvovzRa6Y844yZLRSSJi9JF+KU/gMz+xWib6NNBp4E2pvZhupmXdvwm7/9uJDPo7VMJylyhWc2IelB4Hmgq5mtg+jbaHNw8dYZWeSWTXTFHffOrm0iVSBXeGYFkhoCk4GlCeWFCAvsK3cDegLP5i+053EmwN+zuB0oDp4r/A+xAOOVibavLgAAAABJRU5ErkJggg==", + "text/latex": [ + "$\\displaystyle \\left( 2694, \\ 21, \\ 1441\\right)$" + ], + "text/plain": [ + "(2694, 21, 1441)" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_gibleed_obs.r.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "synthea27nj dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Path to data exists, load tables from there: ehrapy_data/Synthea27Nj\n", + "missing tables: []\n" + ] + } + ], + "source": [ + "edata_synteha27nj_obs = load_and_check(synthea27nj_omop, \"observation_period\", [\"observation\"])" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAIoAAAAUCAYAAABS66VXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGPklEQVR4nO3af+zVVRkH8BdCA8ZaNqzY2FAiM0zMNA0zEAbKTGdRmVsztRbYlqH5K21rj49bpU0Rqq00HLhq6xfL/AX2QxZpJaWwbP6oppg60ZKw1JwJ9Mf5fNiH+733y/fe+wVy8d7uzj7POc85z3nO+Ty/PnfE9u3b7cM+7Ar77W0B9uHVgVHtiJl5I07C5Ih4Yc+KtA97E5l5FH6PBRGxrKaPaHU9mXk07sFFEbG4oo3HfJyMaZiIl3E/lmN5RGzrsPDJOA+HYjyewr1YHBG/GcY9ysyzK3kGw7aIGNnCtxEHdhj/dERM6F+69uhFP5k5B+fiWLwezypnsTQibu+XJzN/jOk4OCKep73r+SL+iW80aKfhW3i3comWYCUOwzL8IDNHtBHuKtyKI7EaS3Ef3o+7M/OMdpvqAxuQHX53VmNWdeB9rgPf1cMs4w70op/M/Ap+jnfhZlyD2/AGzOqwTrc8X8YELKoJO7mezHwr5mJZRPy70fUnnIrbmpYjMz+PdfgQPqhcnrpvAi7C0zg8Ip5p9M1WDu4KfKfd5npBRGxQLssAZGb9dl7fgX1LRFw+XLLsCr3oJzMX4GLciIUR8XLLnK9ps07XPBGxLjMfwjmZeWVEbGu1KJ/ACHy/hfHOiLil1b1ExCZ8s3qc1TLXgYrFuqephIpvDf6l3OjdjsycppjSJ5U36X8BXeknM0cr1v6v2hx4xfef5nMvPA18D5NwAgOD2bnYit923t8A1Au90kL/sxLHHJOZB0TE3xsbmInX4qYu1ukHC6v2hojY2mHM6MrUT8IL+APWDjK+X3SrnxOUi7ME26rY5jC8hHUd4pleeGrc3Zjjjh0XJTPH4Qg8ONRMJzNH4czqcXWzLyI2Z+bnsBgPZOZNSgA1RXFjP8M5Q1mnH2TmWJyhvADLBhk6Ad9uoT2amR+PiF8Ot1w96Ofoqn0J65UD34HMXIsPR8Tf+uSp8buqncnOwexEjFSi7qHiymrx2yPijtbOiFiixC6jsACXKoHx41jRanJ3Ez6C/bE6Ih7vMGY55iiXZZyS2V2Hg7AqM9+xOwTrUj9vrNqLsR0zFKtzOH6qHOgPW5bohaeW7Tnlgk1iZ9czvmr/MZRNZuYiXIiH8LEOYy7Bl/BVfB2b8DYlqv5uZh4REZcMZb0+ULud6zoNiIhsIf0Rn8rM55U9Xq6UB4YVXeqnfqlfwakRsbF6vj8z5+NhHJ+ZxzZcSi88TWzGm5oTQZ3ljBnCBs9VUrkHMDsiNrcZMwtX4eaIuCAiHomIFyPiPkXpT+LCzHzzrtbrFZn5drwHT6BtfWEXqAP1mcMmVIUe9LOlatc3DhxExIuoLfoxja5eeJoYq7oXzYtSm7nxA4Y3kJnn42vKWze7ynza4ZSqXdPaUQm5rlr/nYOt1yeGEsQOhtp3jxsmeZroVj8PV+2WDvPVnmBsg9YLD8jM/RSX/Qw7X5SnFMUc0mFSVfB1rVKrmL2LGGN01XZKgWv6gJRtOJCZYxSXuBU39DjN9Kp9ZFiE2hnd6ucXSpxxaHWIragD1UcbtF54ahyilEo20LgoEbEda3FAZr6llSszv6AEr/diTjOd64BfVe3CzJzYMtdJOE4Jln7d0rciM7dX5fh+cJpSql41SBArM6dWGV8r/SAlbqBNUXAY5OxKPxHxGG5RgsvzWsafiHmK5diRffbC00D9kqxhYB1lpVJlnYe/NCY9S6kSbq02uCizNf6zMSJWNJ5/pJSN5+LB6vvBJkxVzO4IXBoRz7bM0wzA+kHtdjpVYmucrsQCa/GYUuiaonzXGqPENu3K+P3K2Yt+Pq24osVVTWQ9JuMDytl8sspW9MkDJ1b9P2lutsZKxSed2UKfXLUjcT6ize/sJkNVxX0fPqsEvfOVDGK6ovx5EbG0jYDTlMPquYKamVPxXkMLYtco31um4KO4AMfjLpyFU9pVNPuVsxf9RMQTOEqxdAcrVmKWYjWOi4iVWtALT2a+TrlIt9bWuN3X48uUlO3IiFjfvQp6R2burxSdrtkDaXPPeLXI2Ssy8zNKyj4jIu6i/dfja5VvA1fsQdlqzFA+CSzeC2t3g1eLnF2jqmRfhpX1JaGNRakGz8RsXL3vj0v/X6jc9ulKZXhjTf8vHQfy/YibeYEAAAAASUVORK5CYII=", + "text/latex": [ + "$\\displaystyle \\left( 28, \\ 75, \\ 866\\right)$" + ], + "text/plain": [ + "(28, 75, 866)" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "edata_synteha27nj_obs.r.shape" + ] } ], "metadata": { diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index d30292a..e56a174 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -8,7 +8,7 @@ import duckdb import numpy as np import pandas as pd - +from ehrdata import EHRData def _check_sanity_of_folder(folder_path: str | Path): pass @@ -54,7 +54,6 @@ def setup_obs( return EHRData(obs=obs) - def setup_variables( backend_handle: Literal[str, duckdb, Path], edata, @@ -73,8 +72,6 @@ def setup_variables( """Setup the variables. This function sets up the variables for the EHRData project. - For this, a selection of tables from the OMOP CDM which represents the variables should be selected. - The tables can be measurement, observation, procedure_occurrence, specimen, device_exposure, drug_exposure, or note. Parameters ---------- @@ -85,59 +82,67 @@ def setup_variables( tables The tables to be used. start_time - Starting time for values to be included. Can be 'observation_period' start, which takes the 'observation_period_start' value from obs, or a specific Timestamp. + Starting time for values to be included. interval_length_number Numeric value of the length of one interval. interval_length_unit - Unit belonging to the interval length. See the units of `pandas.to_timedelta `_ + Unit belonging to the interval length. num_intervals - Numer of intervals + Number of intervals. + concept_ids + Concept IDs to filter on or 'all'. + aggregation_strategy + Strategy to use when aggregating data within intervals. Returns ------- An EHRData object with populated .var field. """ - from ehrdata import EHRData + # Mapping of table names to extraction functions and concept ID column names + table_info = { + "measurement": {"extract_func": extract_measurement, "concept_id_col": "measurement_concept_id"}, + "observation": {"extract_func": extract_observation, "concept_id_col": "observation_concept_id"}, + "procedure_occurrence": {"extract_func": extract_procedure_occurrence, "concept_id_col": "procedure_concept_id"}, + "specimen": {"extract_func": extract_specimen, "concept_id_col": "specimen_concept_id"}, + "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"}, + "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"}, + "note": {"extract_func": extract_note, "concept_id_col": "note_concept_id"}, + } concept_ids_present_list = [] time_interval_tables = [] + for table in tables: - if table == "measurement": - concept_ids_present_df = normalize_column_names( - backend_handle.sql("SELECT * FROM measurement").df() - ) - concept_ids_present = concept_ids_present_df["measurement_concept_id"].unique() - extracted_awkward = extract_measurement(backend_handle) - time_interval_table = get_time_interval_table( - backend_handle, - extracted_awkward, - edata.obs, - start_time="observation_period_start", - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - concept_ids=concept_ids, - aggregation_strategy=aggregation_strategy, - ) - # TODO: implement the following - # elif table == "observation": - # var = extract_observation(backend_handle) - # elif table == "procedure_occurrence": - # var = extract_procedure_occurrence(backend_handle) - # elif table == "specimen": - # var = extract_specimen(backend_handle) - # elif table == "device_exposure": - # var = extract_device_exposure(backend_handle) - # elif table == "drug_exposure": - # var = extract_drug_exposure(backend_handle) - # elif table == "note": - # var = extract_note(backend_handle) - else: + if table not in table_info: raise ValueError( "tables must be a sequence of 'measurement', 'observation', 'procedure_occurrence', 'specimen', 'device_exposure', 'drug_exposure', or 'note'." ) + + # Get extract function and concept_id column for the table + extract_func = table_info[table]["extract_func"] + concept_id_col = table_info[table]["concept_id_col"] + concept_ids_present_df = normalize_column_names(backend_handle.sql(f"SELECT * FROM {table}").df()) + concept_ids_present = concept_ids_present_df[concept_id_col].unique() + extracted_awkward = extract_func(backend_handle) + + # Create the time interval table + time_interval_table = get_time_interval_table( + backend_handle, + extracted_awkward, + edata.obs, + start_time="observation_period_start", + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + concept_ids=concept_ids, + aggregation_strategy=aggregation_strategy, + ) + + # Append concept_ids_present_list.append(concept_ids_present) time_interval_tables.append(time_interval_table) + + # Combine time interval tables if len(time_interval_tables) > 1: time_interval_table = np.concatenate([time_interval_table, time_interval_table], axis=1) concept_ids_present = pd.concat(concept_ids_present_list) @@ -145,12 +150,11 @@ def setup_variables( time_interval_table = time_interval_tables[0] concept_ids_present = concept_ids_present_list[0] - # TODO: copy other fields too. or other way? is is somewhat scverse-y by taking and returing anndata object... + # Update edata with the new variables edata = EHRData(r=time_interval_table, obs=edata.obs, var=concept_ids_present) return edata - def load( backend_handle: Literal[str, duckdb, Path], # folder_path: str, @@ -189,56 +193,127 @@ def extract_person_observation_period(duckdb_instance): " ).df()) +def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str): + """ + Generalized extraction function to extract data from an OMOP CDM table. -def extract_measurement(duckdb_instance=None): - """Extract measurement table of an OMOP CDM Database.""" - measurement_table = duckdb_instance.sql("SELECT * FROM measurement").df() - measurement_table = normalize_column_names(measurement_table) - # get an array n_person x n_features x 2, one for value, one for time - person_id_df = ( - duckdb_instance.sql("SELECT * FROM person").df() - ) # TODO: in anndata? w.r.t database? for now this - person_id_df = normalize_column_names(person_id_df) - person_id = person_id_df["person_id"].unique() - # person_id = ( - # duckdb_instance.sql("SELECT * FROM person").df()["person_id"].unique() - # ) # TODO: in anndata? w.r.t database? for now this - features = measurement_table["measurement_concept_id"].unique() - person_collection = [] + Parameters + ---------- + duckdb_instance: duckdb.DuckDB + The DuckDB instance for querying the database. + table_name: str + The name of the table to extract data from (e.g., "measurement", "observation"). + concept_id_col: str + The name of the column that contains the concept IDs (e.g., "measurement_concept_id"). + value_col: str + The name of the column that contains the values (e.g., "value_as_number"). + timestamp_col: str + The name of the column that contains the timestamps (e.g., "measurement_datetime"). - for person in person_id: - person_as_list = [] - person_measurements = measurement_table[ - measurement_table["person_id"] == person - ] # or ofc sql in rdbms - lazy, on disk, first step towards huge memory reduction of this prototype if only load this selection - # person_measurements = person_measurements.sort_values(by="measurement_date") - # person_measurements = person_measurements[["measurement_date", "value_as_number"]] - # print(person_measurements) - for feature in features: - person_feature = [] + Returns + ------- + ak.Array + An Awkward Array with the structure: n_person x n_features x 2 (value, time). + """ + # Load the specified table + table_df = duckdb_instance.sql(f"SELECT * FROM {table_name}").df() + table_df = normalize_column_names(table_df) - # person_measurements_value = [] - # person_measurements_timestamp = [] + # Load the person table to get unique person IDs + person_id_df = normalize_column_names(duckdb_instance.sql("SELECT * FROM person").df()) + person_ids = person_id_df["person_id"].unique() - person_feature_measurements = person_measurements["measurement_concept_id"] == feature + # Get unique features (concept IDs) for the table + features = table_df[concept_id_col].unique() - person_feature_measurements_value = person_measurements[person_feature_measurements][ - "value_as_number" - ] # again, rdbms/spark backend big time scalable here - person_feature_measurements_timestamp = person_measurements[person_feature_measurements][ - "measurement_datetime" - ] + # Initialize the collection for all persons + person_collection = [] - person_feature.append(person_feature_measurements_value) - person_feature.append(person_feature_measurements_timestamp) + for person in person_ids: + person_as_list = [] + # Get rows for the current person + person_data = table_df[table_df["person_id"] == person] + + # For each feature, get values and timestamps + for feature in features: + feature_data = person_data[person_data[concept_id_col] == feature] - person_as_list.append(person_feature) + # Extract the values and timestamps + feature_values = feature_data[value_col] + feature_timestamps = feature_data[timestamp_col] + # Append values and timestamps for this feature + person_as_list.append([feature_values, feature_timestamps]) + + # Append this person's data to the collection person_collection.append(person_as_list) return ak.Array(person_collection) +def extract_measurement(duckdb_instance): + return extract_table( + duckdb_instance, + table_name="measurement", + concept_id_col="measurement_concept_id", + value_col="value_as_number", + timestamp_col="measurement_datetime" + ) + +def extract_observation(duckdb_instance): + return extract_table( + duckdb_instance, + table_name="observation", + concept_id_col="observation_concept_id", + value_col="value_as_number", + timestamp_col="observation_datetime" + ) + +def extract_procedure_occurrence(duckdb_instance): + return extract_table( + duckdb_instance, + table_name="procedure_occurrence", + concept_id_col="procedure_concept_id", + value_col="procedure_type_concept_id", # Assuming `procedure_type_concept_id` is a suitable value field + timestamp_col="procedure_datetime" + ) + +def extract_specimen(duckdb_instance): + return extract_table( + duckdb_instance, + table_name="specimen", + concept_id_col="specimen_concept_id", + value_col="unit_concept_id", # Assuming `unit_concept_id` is a suitable value field + timestamp_col="specimen_datetime" + ) + +def extract_device_exposure(duckdb_instance): + return extract_table( + duckdb_instance, + table_name="device_exposure", + concept_id_col="device_concept_id", + value_col="device_exposure_type_concept_id", # Assuming this as value + timestamp_col="device_exposure_start_datetime" + ) + +def extract_drug_exposure(duckdb_instance): + return extract_table( + duckdb_instance, + table_name="drug_exposure", + concept_id_col="drug_concept_id", + value_col="dose_unit_concept_id", # Assuming `dose_unit_concept_id` as value + timestamp_col="drug_exposure_start_datetime" + ) + +def extract_note(duckdb_instance): + return extract_table( + duckdb_instance, + table_name="note", + concept_id_col="note_concept_id", + value_col="note_class_concept_id", # Assuming `note_class_concept_id` as value + timestamp_col="note_datetime" + ) + def _get_interval_table_from_awkward_array( # self,#person_feature_measurement: ak.Array, person_ts: ak.Array, @@ -373,11 +448,6 @@ def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame: df.columns = map(str.lower, df.columns) # Convert all column names to lowercase return df -def extract_observation(): - """Extract observation table of an OMOP CDM Database.""" - pass - - def extract_procedure_occurrence(): """Extract procedure_occurrence table of an OMOP CDM Database.""" pass From 05cd817d21202d18501d4da2e1c2625e5344afd2 Mon Sep 17 00:00:00 2001 From: ShreyParikh07 Date: Thu, 10 Oct 2024 15:56:12 +0200 Subject: [PATCH 03/15] drug/device_exposure removed --- docs/notebooks/test_more_datasets_omop.ipynb | 93 ++++++++++++++++++++ src/ehrdata/io/omop/omop.py | 77 ++++++---------- 2 files changed, 120 insertions(+), 50 deletions(-) diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb index 3ac69ad..7eed5e9 100644 --- a/docs/notebooks/test_more_datasets_omop.ipynb +++ b/docs/notebooks/test_more_datasets_omop.ipynb @@ -360,6 +360,11 @@ "edata_gibleed_obs = load_and_check(gibleed_omop, \"observation_period\", [\"observation\"])" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [] + }, { "cell_type": "code", "execution_count": 109, @@ -432,6 +437,94 @@ "source": [ "edata_synteha27nj_obs.r.shape" ] + }, + { + "cell_type": "code", + "execution_count": 126, + "metadata": {}, + "outputs": [], + "source": [ + "tables = [ \"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\", \"device_exposure\", \"drug_exposure\", \"note\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 122, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "device_exposure\n", + "drug_exposure\n" + ] + } + ], + "source": [ + "for table in tables:\n", + " table_ext = table +'.csv'\n", + " path = os.path.join('/Users/shrey.parikh/Desktop/EHR/ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv', table_ext)\n", + " temp = pd.read_csv(path)\n", + " if temp.columns.str.contains('start_date').any():\n", + " print(table)" + ] + }, + { + "cell_type": "code", + "execution_count": 136, + "metadata": {}, + "outputs": [], + "source": [ + "# removing drug_exposure and device_exposure because they have start/end date\n", + "# note is empty\n", + "tables = [ \"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 137, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Processing table: measurement\n", + "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", + "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n", + "Success: measurement processed successfully.\n", + "Processing table: observation\n", + "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", + "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n", + "Success: observation processed successfully.\n", + "Processing table: procedure_occurrence\n", + "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", + "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n", + "Success: procedure_occurrence processed successfully.\n", + "Processing table: specimen\n", + "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", + "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n", + "Success: specimen processed successfully.\n" + ] + } + ], + "source": [ + "for table in tables:\n", + " print(f\"Processing table: {table}\")\n", + " try:\n", + " edata_temp = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [table]) \n", + " print(f\"Success: {table} processed successfully.\")\n", + " except Exception as e:\n", + " print(f\"Error processing table: {table}. Error: {str(e)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index e56a174..d6edf8a 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -59,8 +59,8 @@ def setup_variables( edata, tables: Sequence[ Literal[ - "measurement", "observation", "procedure_occurrence", "specimen", "device_exposure", "drug_exposure", "note" - ] + "measurement", "observation", "procedure_occurrence", "specimen", "note" + ] ], start_time: Literal["observation_period_start"] | pd.Timestamp | str, interval_length_number: int, @@ -104,9 +104,9 @@ def setup_variables( "observation": {"extract_func": extract_observation, "concept_id_col": "observation_concept_id"}, "procedure_occurrence": {"extract_func": extract_procedure_occurrence, "concept_id_col": "procedure_concept_id"}, "specimen": {"extract_func": extract_specimen, "concept_id_col": "specimen_concept_id"}, - "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"}, - "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"}, - "note": {"extract_func": extract_note, "concept_id_col": "note_concept_id"}, + # "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"}, + # "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"}, + "note": {"extract_func": extract_note, "concept_id_col": "note_type_concept_id"}, } concept_ids_present_list = [] @@ -115,7 +115,7 @@ def setup_variables( for table in tables: if table not in table_info: raise ValueError( - "tables must be a sequence of 'measurement', 'observation', 'procedure_occurrence', 'specimen', 'device_exposure', 'drug_exposure', or 'note'." + "tables must be a sequence of 'measurement', 'observation', 'procedure_occurrence', 'specimen', or 'note'." ) # Get extract function and concept_id column for the table @@ -155,6 +155,8 @@ def setup_variables( return edata +# DEVICE EXPOSURE and DRUG EXPOSURE NEEDS TO BE IMPLEMENTED BECAUSE THEY CONTAIN START DATE + def load( backend_handle: Literal[str, duckdb, Path], # folder_path: str, @@ -288,28 +290,31 @@ def extract_specimen(duckdb_instance): ) def extract_device_exposure(duckdb_instance): - return extract_table( - duckdb_instance, - table_name="device_exposure", - concept_id_col="device_concept_id", - value_col="device_exposure_type_concept_id", # Assuming this as value - timestamp_col="device_exposure_start_datetime" - ) + # return extract_table( + # duckdb_instance, + # table_name="device_exposure", + # concept_id_col="device_concept_id", + # value_col="device_type_concept_id", # Assuming this as value + # timestamp_col="device_exposure_start_date" + # ) + # NEEDS IMPLEMENTATION + return None def extract_drug_exposure(duckdb_instance): - return extract_table( - duckdb_instance, - table_name="drug_exposure", - concept_id_col="drug_concept_id", - value_col="dose_unit_concept_id", # Assuming `dose_unit_concept_id` as value - timestamp_col="drug_exposure_start_datetime" - ) - + # return extract_table( + # duckdb_instance, + # table_name="drug_exposure", + # concept_id_col="drug_concept_id", + # value_col="dose_unit_concept_id", # Assuming `dose_unit_concept_id` as value + # timestamp_col="drug_exposure_start_datetime" + # ) + # NEEDS IMPLEMENTATION + return None def extract_note(duckdb_instance): return extract_table( duckdb_instance, table_name="note", - concept_id_col="note_concept_id", + concept_id_col="note_type_concept_id", value_col="note_class_concept_id", # Assuming `note_class_concept_id` as value timestamp_col="note_datetime" ) @@ -448,31 +453,3 @@ def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame: df.columns = map(str.lower, df.columns) # Convert all column names to lowercase return df -def extract_procedure_occurrence(): - """Extract procedure_occurrence table of an OMOP CDM Database.""" - pass - - -def extract_specimen(): - """Extract specimen table of an OMOP CDM Database.""" - pass - - -def extract_device_exposure(): - """Extract device_exposure table of an OMOP CDM Database.""" - pass - - -def extract_drug_exposure(): - """Extract drug_exposure table of an OMOP CDM Database.""" - pass - - -def extract_condition_occurrence(): - """Extract condition_occurrence table of an OMOP CDM Database.""" - pass - - -def extract_note(): - """Extract note table of an OMOP CDM Database.""" - pass From 867222ea155e269bfaf8b3e5b0a8858747b477ca Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 10 Oct 2024 13:56:57 +0000 Subject: [PATCH 04/15] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- docs/notebooks/test_more_datasets_omop.ipynb | 21 +++++---- src/ehrdata/io/omop/omop.py | 49 +++++++++++++------- 2 files changed, 43 insertions(+), 27 deletions(-) diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb index 7eed5e9..6bce569 100644 --- a/docs/notebooks/test_more_datasets_omop.ipynb +++ b/docs/notebooks/test_more_datasets_omop.ipynb @@ -17,12 +17,10 @@ "outputs": [], "source": [ "from ehrdata import EHRData\n", + "\n", "EHRData().r\n", - "import anndata as ad\n", "import duckdb\n", - "import ehrapy as ep\n", "import ehrdata as ed\n", - "import numpy as np\n", "import os" ] }, @@ -61,7 +59,7 @@ " interval_length_unit=\"day\",\n", " num_intervals=\"max_observation_duration\",\n", " concept_ids=\"all\",\n", - " aggregation_strategy=\"last\"\n", + " aggregation_strategy=\"last\",\n", " )\n", " return edata" ] @@ -444,7 +442,7 @@ "metadata": {}, "outputs": [], "source": [ - "tables = [ \"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\", \"device_exposure\", \"drug_exposure\", \"note\"]" + "tables = [\"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\", \"device_exposure\", \"drug_exposure\", \"note\"]" ] }, { @@ -463,10 +461,13 @@ ], "source": [ "for table in tables:\n", - " table_ext = table +'.csv'\n", - " path = os.path.join('/Users/shrey.parikh/Desktop/EHR/ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv', table_ext)\n", + " table_ext = table + \".csv\"\n", + " path = os.path.join(\n", + " \"/Users/shrey.parikh/Desktop/EHR/ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv\",\n", + " table_ext,\n", + " )\n", " temp = pd.read_csv(path)\n", - " if temp.columns.str.contains('start_date').any():\n", + " if temp.columns.str.contains(\"start_date\").any():\n", " print(table)" ] }, @@ -478,7 +479,7 @@ "source": [ "# removing drug_exposure and device_exposure because they have start/end date\n", "# note is empty\n", - "tables = [ \"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\"]" + "tables = [\"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\"]" ] }, { @@ -513,7 +514,7 @@ "for table in tables:\n", " print(f\"Processing table: {table}\")\n", " try:\n", - " edata_temp = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [table]) \n", + " edata_temp = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [table])\n", " print(f\"Success: {table} processed successfully.\")\n", " except Exception as e:\n", " print(f\"Error processing table: {table}. Error: {str(e)}\")" diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index d6edf8a..b279482 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -8,8 +8,10 @@ import duckdb import numpy as np import pandas as pd + from ehrdata import EHRData + def _check_sanity_of_folder(folder_path: str | Path): pass @@ -54,14 +56,11 @@ def setup_obs( return EHRData(obs=obs) + def setup_variables( backend_handle: Literal[str, duckdb, Path], edata, - tables: Sequence[ - Literal[ - "measurement", "observation", "procedure_occurrence", "specimen", "note" - ] - ], + tables: Sequence[Literal["measurement", "observation", "procedure_occurrence", "specimen", "note"]], start_time: Literal["observation_period_start"] | pd.Timestamp | str, interval_length_number: int, interval_length_unit: str, @@ -102,7 +101,10 @@ def setup_variables( table_info = { "measurement": {"extract_func": extract_measurement, "concept_id_col": "measurement_concept_id"}, "observation": {"extract_func": extract_observation, "concept_id_col": "observation_concept_id"}, - "procedure_occurrence": {"extract_func": extract_procedure_occurrence, "concept_id_col": "procedure_concept_id"}, + "procedure_occurrence": { + "extract_func": extract_procedure_occurrence, + "concept_id_col": "procedure_concept_id", + }, "specimen": {"extract_func": extract_specimen, "concept_id_col": "specimen_concept_id"}, # "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"}, # "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"}, @@ -138,7 +140,7 @@ def setup_variables( aggregation_strategy=aggregation_strategy, ) - # Append + # Append concept_ids_present_list.append(concept_ids_present) time_interval_tables.append(time_interval_table) @@ -155,8 +157,10 @@ def setup_variables( return edata + # DEVICE EXPOSURE and DRUG EXPOSURE NEEDS TO BE IMPLEMENTED BECAUSE THEY CONTAIN START DATE + def load( backend_handle: Literal[str, duckdb, Path], # folder_path: str, @@ -188,12 +192,15 @@ def extract_observation_period(duckdb_instance): def extract_person_observation_period(duckdb_instance): """Extract observation table of an OMOP CDM Database.""" - return normalize_column_names(duckdb_instance.sql( - "SELECT * \ + return normalize_column_names( + duckdb_instance.sql( + "SELECT * \ FROM person \ LEFT JOIN observation_period USING(person_id) \ " - ).df()) + ).df() + ) + def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str): """ @@ -259,36 +266,40 @@ def extract_measurement(duckdb_instance): table_name="measurement", concept_id_col="measurement_concept_id", value_col="value_as_number", - timestamp_col="measurement_datetime" + timestamp_col="measurement_datetime", ) + def extract_observation(duckdb_instance): return extract_table( duckdb_instance, table_name="observation", concept_id_col="observation_concept_id", value_col="value_as_number", - timestamp_col="observation_datetime" + timestamp_col="observation_datetime", ) + def extract_procedure_occurrence(duckdb_instance): return extract_table( duckdb_instance, table_name="procedure_occurrence", concept_id_col="procedure_concept_id", value_col="procedure_type_concept_id", # Assuming `procedure_type_concept_id` is a suitable value field - timestamp_col="procedure_datetime" + timestamp_col="procedure_datetime", ) + def extract_specimen(duckdb_instance): return extract_table( duckdb_instance, table_name="specimen", concept_id_col="specimen_concept_id", value_col="unit_concept_id", # Assuming `unit_concept_id` is a suitable value field - timestamp_col="specimen_datetime" + timestamp_col="specimen_datetime", ) + def extract_device_exposure(duckdb_instance): # return extract_table( # duckdb_instance, @@ -300,6 +311,7 @@ def extract_device_exposure(duckdb_instance): # NEEDS IMPLEMENTATION return None + def extract_drug_exposure(duckdb_instance): # return extract_table( # duckdb_instance, @@ -310,15 +322,18 @@ def extract_drug_exposure(duckdb_instance): # ) # NEEDS IMPLEMENTATION return None + + def extract_note(duckdb_instance): return extract_table( duckdb_instance, table_name="note", concept_id_col="note_type_concept_id", value_col="note_class_concept_id", # Assuming `note_class_concept_id` as value - timestamp_col="note_datetime" + timestamp_col="note_datetime", ) + def _get_interval_table_from_awkward_array( # self,#person_feature_measurement: ak.Array, person_ts: ak.Array, @@ -411,7 +426,7 @@ def get_time_interval_table( # Calculate the duration of observation periods num_intervals = np.max( - observation_period_df["observation_period_end_date"] + observation_period_df["observation_period_end_date"] - observation_period_df["observation_period_start_date"] ) / pd.to_timedelta(interval_length_number, interval_length_unit) num_intervals = int(np.ceil(num_intervals)) @@ -448,8 +463,8 @@ def get_time_interval_table( return np.array(tables).transpose(0, 2, 1) # TODO: store in self, np + def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame: """Normalize all column names to lowercase.""" df.columns = map(str.lower, df.columns) # Convert all column names to lowercase return df - From 291aeba42102bf3151df3ebe7bbe21f27e9c629a Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Wed, 16 Oct 2024 12:19:54 +0200 Subject: [PATCH 05/15] update commit --- src/ehrdata/io/omop/__init__.py | 23 ++--- src/ehrdata/io/omop/omop.py | 153 +++++++++++++++++--------------- 2 files changed, 91 insertions(+), 85 deletions(-) diff --git a/src/ehrdata/io/omop/__init__.py b/src/ehrdata/io/omop/__init__.py index eb3908b..6f3fda4 100644 --- a/src/ehrdata/io/omop/__init__.py +++ b/src/ehrdata/io/omop/__init__.py @@ -1,15 +1,16 @@ from .omop import ( - extract_condition_occurrence, - extract_device_exposure, - extract_drug_exposure, - extract_measurement, - extract_note, - extract_observation, - extract_observation_period, - extract_person, - extract_person_observation_period, - extract_procedure_occurrence, - extract_specimen, + # extract_condition_occurrence, + # extract_device_exposure, + # extract_drug_exposure, + # extract_measurement, + # extract_note, + # extract_observation, + # extract_observation_period, + # extract_person, + # extract_person_observation_period, + # extract_procedure_occurrence, + # extract_specimen, + get_table, get_time_interval_table, load, setup_obs, diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index b279482..ccd81b8 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -9,8 +9,6 @@ import numpy as np import pandas as pd -from ehrdata import EHRData - def _check_sanity_of_folder(folder_path: str | Path): pass @@ -20,15 +18,22 @@ def _check_sanity_of_database(backend_handle: duckdb.DuckDB): pass +VALID_OBSERVATION_TABLES_SINGLE = ["person", "observation_period", "visit_occurrence"] +VALID_OBSERVATION_TABLES_JOIN = ["person_observation_period", "person_visit_occurrence"] +VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen", "note", "death"] + + def setup_obs( backend_handle: Literal[str, duckdb, Path], - observation_table: Literal["person", "observation_period", "person_observation_period", "condition_occurrence"], + observation_table: Literal[ + "person", "observation_period", "person_observation_period", "visit_occurrence", "person_visit_occurrence" + ], ): """Setup the observation table. - This function sets up the observation table for the EHRData project. - For this, a table from the OMOP CDM which represents to observed unit should be selected. - A unit can be a person, an observation period, the join of these two tables, or a condition occurrence. + This function sets up the observation table for the EHRData object. + For this, a table from the OMOP CDM which represents the "observed unit" via its id should be selected. + A unit can be a person, an observation period, a visit occurrence, or a left join on person_id of a person with one of the other tables. Parameters ---------- @@ -43,23 +48,26 @@ def setup_obs( """ from ehrdata import EHRData - if observation_table == "person": - obs = extract_person(backend_handle) - elif observation_table == "observation_period": - obs = extract_observation_period(backend_handle) - elif observation_table == "person_observation_period": - obs = extract_person_observation_period(backend_handle) - elif observation_table == "condition_occurrence": - obs = extract_condition_occurrence(backend_handle) - else: - raise ValueError("observation_table must be either 'person', 'observation_period', or 'condition_occurrence'.") + if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: + raise ValueError( + "observation_table must be either 'person', 'observation_period', 'person_observation_period', 'visit_occurrence', or 'person_visit_occurrence'." + ) + + if observation_table in VALID_OBSERVATION_TABLES_SINGLE: + obs = get_table(backend_handle, observation_table) + + elif observation_table in VALID_OBSERVATION_TABLES_JOIN: + if observation_table == "person_observation_period": + obs = _get_table_left_join(backend_handle, "person", "observation_period") + elif observation_table == "person_visit_occurrence": + obs = _get_table_left_join(backend_handle, "person", "visit_occurrence") return EHRData(obs=obs) def setup_variables( - backend_handle: Literal[str, duckdb, Path], edata, + backend_handle: Literal[str, duckdb, Path], tables: Sequence[Literal["measurement", "observation", "procedure_occurrence", "specimen", "note"]], start_time: Literal["observation_period_start"] | pd.Timestamp | str, interval_length_number: int, @@ -70,7 +78,7 @@ def setup_variables( ): """Setup the variables. - This function sets up the variables for the EHRData project. + This function sets up the variables for the EHRData object. Parameters ---------- @@ -95,42 +103,29 @@ def setup_variables( Returns ------- - An EHRData object with populated .var field. + An EHRData object with populated .r and .var field. """ - # Mapping of table names to extraction functions and concept ID column names - table_info = { - "measurement": {"extract_func": extract_measurement, "concept_id_col": "measurement_concept_id"}, - "observation": {"extract_func": extract_observation, "concept_id_col": "observation_concept_id"}, - "procedure_occurrence": { - "extract_func": extract_procedure_occurrence, - "concept_id_col": "procedure_concept_id", - }, - "specimen": {"extract_func": extract_specimen, "concept_id_col": "specimen_concept_id"}, - # "device_exposure": {"extract_func": extract_device_exposure, "concept_id_col": "device_concept_id"}, - # "drug_exposure": {"extract_func": extract_drug_exposure, "concept_id_col": "drug_concept_id"}, - "note": {"extract_func": extract_note, "concept_id_col": "note_type_concept_id"}, - } + from ehrdata import EHRData concept_ids_present_list = [] time_interval_tables = [] for table in tables: - if table not in table_info: - raise ValueError( - "tables must be a sequence of 'measurement', 'observation', 'procedure_occurrence', 'specimen', or 'note'." - ) + if table not in VALID_VARIABLE_TABLES: + raise ValueError(f"tables must be a sequence of from [{VALID_VARIABLE_TABLES}].") - # Get extract function and concept_id column for the table - extract_func = table_info[table]["extract_func"] - concept_id_col = table_info[table]["concept_id_col"] - concept_ids_present_df = normalize_column_names(backend_handle.sql(f"SELECT * FROM {table}").df()) - concept_ids_present = concept_ids_present_df[concept_id_col].unique() - extracted_awkward = extract_func(backend_handle) + id_column = f"{table}_type_concept_id" if table in ["note", "death"] else f"{table}_concept_id" + + concept_ids_present = _lowercase_column_names( + backend_handle.sql(f"SELECT DISTINCT {id_column} FROM {table}").df() + ) + + personxfeature_pairs_of_value_timestamp = _extract_personxfeature_pairs_of_value_timestamp(backend_handle) # Create the time interval table time_interval_table = get_time_interval_table( backend_handle, - extracted_awkward, + personxfeature_pairs_of_value_timestamp, edata.obs, start_time="observation_period_start", interval_length_number=interval_length_number, @@ -166,10 +161,6 @@ def load( # folder_path: str, # delimiter: str = ",", # make_filename_lowercase: bool = True, - # use_dask: bool = False, - # level: Literal["stay_level", "patient_level"] = "stay_level", - # load_tables: str | list[str] | tuple[str] | Literal["auto"] | None = None, - # remove_empty_column: bool = True, ) -> None: """Initialize a connection to the OMOP CDM Database.""" if isinstance(backend_handle, str) or isinstance(backend_handle, Path): @@ -180,29 +171,26 @@ def load( raise NotImplementedError(f"Backend {backend_handle} not supported. Choose a valid backend.") -def extract_person(duckdb_instance): - """Extract person table of an OMOP CDM Database.""" - return normalize_column_names(duckdb_instance.sql("SELECT * FROM person").df()) - +def get_table(duckdb_instance, table_name: str) -> pd.DataFrame: + """Extract a table of an OMOP CDM Database.""" + return _lowercase_column_names(duckdb_instance.sql(f"SELECT * FROM {table_name}").df()) -def extract_observation_period(duckdb_instance): - """Extract person table of an OMOP CDM Database.""" - return normalize_column_names(duckdb_instance.sql("SELECT * FROM observation_period").df()) - -def extract_person_observation_period(duckdb_instance): - """Extract observation table of an OMOP CDM Database.""" - return normalize_column_names( +def _get_table_left_join(duckdb_instance, table1: str, table2: str) -> pd.DataFrame: + """Extract a table of an OMOP CDM Database.""" + return _lowercase_column_names( duckdb_instance.sql( - "SELECT * \ - FROM person \ - LEFT JOIN observation_period USING(person_id) \ + f"SELECT * \ + FROM {table1} \ + LEFT JOIN {table2} USING(person_id) \ " ).df() ) -def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str): +def _extract_personxfeature_pairs_of_value_timestamp( + duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str +): """ Generalized extraction function to extract data from an OMOP CDM table. @@ -226,10 +214,10 @@ def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_c """ # Load the specified table table_df = duckdb_instance.sql(f"SELECT * FROM {table_name}").df() - table_df = normalize_column_names(table_df) + table_df = _lowercase_column_names(table_df) # Load the person table to get unique person IDs - person_id_df = normalize_column_names(duckdb_instance.sql("SELECT * FROM person").df()) + person_id_df = _lowercase_column_names(duckdb_instance.sql("SELECT * FROM person").df()) person_ids = person_id_df["person_id"].unique() # Get unique features (concept IDs) for the table @@ -261,7 +249,8 @@ def extract_table(duckdb_instance, table_name: str, concept_id_col: str, value_c def extract_measurement(duckdb_instance): - return extract_table( + """Extract a table of an OMOP CDM Database.""" + return get_table( duckdb_instance, table_name="measurement", concept_id_col="measurement_concept_id", @@ -271,7 +260,8 @@ def extract_measurement(duckdb_instance): def extract_observation(duckdb_instance): - return extract_table( + """Extract a table of an OMOP CDM Database.""" + return get_table( duckdb_instance, table_name="observation", concept_id_col="observation_concept_id", @@ -281,7 +271,8 @@ def extract_observation(duckdb_instance): def extract_procedure_occurrence(duckdb_instance): - return extract_table( + """Extract a table of an OMOP CDM Database.""" + return get_table( duckdb_instance, table_name="procedure_occurrence", concept_id_col="procedure_concept_id", @@ -291,7 +282,8 @@ def extract_procedure_occurrence(duckdb_instance): def extract_specimen(duckdb_instance): - return extract_table( + """Extract a table of an OMOP CDM Database.""" + return get_table( duckdb_instance, table_name="specimen", concept_id_col="specimen_concept_id", @@ -301,7 +293,8 @@ def extract_specimen(duckdb_instance): def extract_device_exposure(duckdb_instance): - # return extract_table( + """Extract a table of an OMOP CDM Database.""" + # return get_table( # duckdb_instance, # table_name="device_exposure", # concept_id_col="device_concept_id", @@ -313,7 +306,8 @@ def extract_device_exposure(duckdb_instance): def extract_drug_exposure(duckdb_instance): - # return extract_table( + """Extract a table of an OMOP CDM Database.""" + # return get_table( # duckdb_instance, # table_name="drug_exposure", # concept_id_col="drug_concept_id", @@ -325,7 +319,8 @@ def extract_drug_exposure(duckdb_instance): def extract_note(duckdb_instance): - return extract_table( + """Extract a table of an OMOP CDM Database.""" + return get_table( duckdb_instance, table_name="note", concept_id_col="note_type_concept_id", @@ -386,7 +381,7 @@ def get_time_interval_table( concept_ids: Literal["all"] | Sequence = "all", aggregation_strategy: str = "first", # what to do if multiple obs. in 1 interval. first, last, mean, median, most_frequent for categories # strategy="locf", -) -> np.array: +) -> np.ndarray: """Extract measurement table of an OMOP CDM Database. Parameters @@ -422,7 +417,7 @@ def get_time_interval_table( if num_intervals == "max_observation_duration": observation_period_df = con.execute("SELECT * from observation_period").df() - observation_period_df = normalize_column_names(observation_period_df) + observation_period_df = _lowercase_column_names(observation_period_df) # Calculate the duration of observation periods num_intervals = np.max( @@ -464,7 +459,17 @@ def get_time_interval_table( return np.array(tables).transpose(0, 2, 1) # TODO: store in self, np -def normalize_column_names(df: pd.DataFrame) -> pd.DataFrame: +def _lowercase_column_names(df: pd.DataFrame) -> pd.DataFrame: """Normalize all column names to lowercase.""" df.columns = map(str.lower, df.columns) # Convert all column names to lowercase return df + + +def extract_condition_occurrence(): + """Extract a table of an OMOP CDM Database.""" + pass + + +def extract_observation_period(): + """Extract a table of an OMOP CDM Database.""" + pass From b2eeded6cbfcfc5ff7ce3306bb0aa4000d0e7f7c Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Wed, 16 Oct 2024 14:46:10 +0200 Subject: [PATCH 06/15] clean up nb --- docs/notebooks/test_more_datasets_omop.ipynb | 552 ------------------- 1 file changed, 552 deletions(-) delete mode 100644 docs/notebooks/test_more_datasets_omop.ipynb diff --git a/docs/notebooks/test_more_datasets_omop.ipynb b/docs/notebooks/test_more_datasets_omop.ipynb deleted file mode 100644 index 6bce569..0000000 --- a/docs/notebooks/test_more_datasets_omop.ipynb +++ /dev/null @@ -1,552 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "%load_ext autoreload\n", - "%autoreload 2" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "from ehrdata import EHRData\n", - "\n", - "EHRData().r\n", - "import duckdb\n", - "import ehrdata as ed\n", - "import os" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "from datasets import gibleed_omop, mimic_iv_omop, synthea27nj_omop" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "define the function" - ] - }, - { - "cell_type": "code", - "execution_count": 82, - "metadata": {}, - "outputs": [], - "source": [ - "def load_and_check(dummy_func, start_time, tables):\n", - " con = duckdb.connect()\n", - " dummy_func(backend_handle=con)\n", - " edata = ed.io.omop.setup_obs(con, \"person_observation_period\")\n", - " edata = ed.io.omop.setup_variables(\n", - " backend_handle=con,\n", - " edata=edata,\n", - " tables=tables,\n", - " start_time=start_time,\n", - " interval_length_number=28,\n", - " interval_length_unit=\"day\",\n", - " num_intervals=\"max_observation_duration\",\n", - " concept_ids=\"all\",\n", - " aggregation_strategy=\"last\",\n", - " )\n", - " return edata" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the mimic dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 98, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", - "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n" - ] - } - ], - "source": [ - "edata_mimic = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [\"measurement\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "EHRData object with n_obs x n_var = 100 x 450, and a timeseries of 320 steps.\n", - " shape of .X: (100, 450) \n", - " shape of .r: ((100, 450, 320)) " - ] - }, - "execution_count": 99, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_mimic" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKUAAAAUCAYAAADsvf0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGXklEQVR4nO2ae6xdRRXGf30kAgVLUqSN+ACR2xAkFMWADypXsMpDA4IhIYVSIpEAXpBWDdX69cNUagKF8hAJNS0qCZBUIPJ+NUB5iGIJGkBebREDVKlUiq2l5fLHmt3uO+zTe+45u/fU5n7JyWSvmTUz35w1s/aatYf19vYyhCFsSxje6QkMYQg5RlYJbV8LHAnsJentwZ3SELZ32P4M8CfgdEnz8/phufu2/VngD8B0SXNL8hOALwETgAOAXYDrJE3uZwIfAS4AvgaMAV4FbgYs6d916dQJ25OB36THyoWzvRz4eIMuXpc0rkHfg8rN9s+Bg4AuYDdgLbAijXmFpDey9mOA44Cjgf2BPYD1wF+ABcACSe82GKtpbrZvAg4B9pG0plxX5b5nA/8BrsrkPwbOJozyH5Ur8P5J7g08AUwFHgcuAV4CzgEeTQvQtk6dsP1R4ApgTX9tgdWAK34XNei7E9y+B4wC7gHmAdcBG4BZwFOJbxnfAq4BDiYOp0uBRcCngPnAjbaH5YO0wO1CYBzQk/fVx33b7gKOAOZLWltB7hXgBeLEXFy5BH3xC2B3oEfS5aVx5qb+ZgNn1KBTC9JiLwDeAH4HTO9H5U1JswYwRCe4fVDSulxoezYwAzgfOLNU9RzwDeC28oloewZhbMcD3yQMtYwBcZP0uO1nge/YnlMeKz8pTwOGATfkJCQtlvS8pKbC9bRzJgHLgSvz7oC3gZNtj2pHp2b0AF8mdnut79Kd4lZlkAk3pnKfrP39kn6fu2hJrwG/TI+Hleva4HY98DHgK2VhbpRHABuBxxoQGQi6U3l3BcG3gIeBnYj3inZ0aoHtfYE5wDxJDzap9gHbk23PsH2O7W7bIxq07Ri3Bvh6Kp8agM47qdyQyVvl9nAq+xjlJvedrHgC8ExNEff4VD7XoP55Ynd1Afe1odM2bI8kApuXCZfWLMaxOSAqsMz2VEkPZPKOcCtgezqwMzCaCHy+SBjknCb1RwKnpMc7s+pWuf0xlRPLjcsn5R7ACCJiqgOjU7m6QX0h37VNnTrwE+BA4NSKd+lGWAAcThjmKCJSvRrYE7jD9gFZ+05xKzCdcKXnEgZ5JzBJ0j+b1J9DBDu3S7orq2uJm6TVwDrChW9COdApoqOtfuWyLcH2wcTpeLGkR5vVk+RM9FfgDNtrgGlEdHtcXfNsF8UVle2xwOcJI1tq+xhJf96Sru0egtOzwMk1T20VMLYsKJ+UxQmxQ02DFbtjdIP6Qv5mmzotI7mkXxNuZ2YdfbI5GJiYyQeVWyNIel3STYQ7HUPwbwjbZxNXSU8D3ZJWVTRrh9uObLY9oK9RrkxlXXdlf0tlV4P6Iuorv4e0otMOdk5j7Quss91b/AhXB3BNkl3aZJ+FO8wjzcHmtkVIWkEY2n62d6tqY/tc4HLCC3SnCLwKLXGzPZxw6SvL8rJRvkos6HjqQXGPOSkNXp7MLsAXgP/SN9JvRacd/A/4VYPf0tRmSXpu1rUXEeZLmXywuTWDD6dyY15h+4fEBfiThEGuzNuU0Cq38cQV5JNl4aZ3Skm9th8Ejrf9SUkv9ENoi5D0ou27CTdxFrHjNs2VOEmuLkf6rejYXghMAaZKWjjAOa4Fvl1VZ3sWEfxcm6cZ0/XRy/kthe09iWwQwG+zsQaVW9LvIlKeqzP5cOCnxGX3IxUpwJlEuvAJIhiqctltcUsoNnCfREz+QcYi4sb+q0TmpjzRY4Fj02OR1/1cWjiAf0nKMyBnAo8Al9k+HHiGSF91E0f5jyo4DlSn2Jn53dnWxInAtLSJVwBvAXsT+eIdgNupTjUONrejgAttLwGWEZmqsURG7hPAa8DpZQXbUwiD3Ag8BPTYeUzH8opN0sp/PSmNc0tZmF+eLyL8+ym8HxOIXTuFMFoSsUJ2Qq4g6UXiTmxhmuA04s+bBxySfwzQos7+hFHcVjHnrYXFwK1pXicB5xF/9BJiLY6RtD5X6gC3e4lXjw8RqcHvE4fOKuIE20/S05nOXqkcQVwfqeJ3arvcbI8mDrlbJf29XFf1ldD5wM+AT0tayjYM27sSu/9iST/o8HRqxfbMDcD2d4HLgEMlLSnXVX0ldAmR2bhgEObWLg4lUl9z+2v4f4jtlpvtHYkPQRblBgkVJ2VSmki8C1w09JHvEOpGChRPBBZKWp7XvwdACvWbXD4BcQAAAABJRU5ErkJggg==", - "text/latex": [ - "$\\displaystyle \\left( 100, \\ 450, \\ 320\\right)$" - ], - "text/plain": [ - "(100, 450, 320)" - ] - }, - "execution_count": 100, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_mimic.r.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the gibleed dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path to data exists, load tables from there: ehrapy_data/GIBleed_dataset\n", - "missing tables: [['cohort_definition']]\n" - ] - } - ], - "source": [ - "edata_gibleed = load_and_check(gibleed_omop, \"observation_period\", [\"measurement\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "EHRData object with n_obs x n_var = 2694 x 55, and a timeseries of 1441 steps.\n", - " shape of .X: (2694, 55) \n", - " shape of .r: ((2694, 55, 1441)) " - ] - }, - "execution_count": 102, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_gibleed" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAALAAAAAUCAYAAAAtOremAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGpUlEQVR4nO2aeaxdUxTGfx1UmxalMUXUPNRYRLWkLTqIGKKGVKQUMSWookUbrH4S1FStEEMrbZUg1NSBEho1F6kg5uiglNfilddBaZ8/1j7teeed+9695953nsb7kpt9z9p77bP22muvs/bau1VtbS0taMHmitbNLUALWlAO2qYRJU0FTgT2MLNV+YrUghbUh6QjgI+Ai81sUkRvlQwhJB0JfACMMLNxgdYFGAScBBwM7AKsAz4DJgOTzWxDAy/vB1wB9AK2BX4NvBPMbHaibSvgovA7EGgFfAlMAh5p6D2BfwgwLTzWGWwlIWkRsFuB6l/MbKdK8JQLSWcCfYHuwKHAVsATZjakhD5K1mljPFnkkvQ80BPYx8xqIN0D3wr8ATwYo50VnpcBc4ElwI7A6bhhnSjpLDOrF1BLuhMYCSwFXgJWANsDRwDHArMTLI8D5wBVwJPAamBAeP/RwHkNDHBX4H6gBuhUqF0FsRIYn0KvqTBPObgRN5AafA72L4U5i06L5Mki1+24cx0G3AYJA5a0L9AfmGRma2JV3wCnArPiHlDSaGA+cAZuzNMT/V2MG+9U4BIzW5eo3yLxPAg33oVADzNbEejtQt/nSnrBzJ5Ljix47sm4d38OGNG4PspGtZmNyYGnHFyNG8h3uMebWyxjFp2WwFOyXGY2X9JXwKWSxprZhqQHvhD/ZD+dYHyjQIc/S3oI99rHEjNgSVsG+hJSjDfw/50gDQrlPZHxhnbrJN0EnIyHIvUMGF+Vxwc5jk+T9/8IM9toGJJKZc+i06J4ypDrKWAM/lWekzTg/sB64P0SOoyM8J8EfQAeKowHNkg6CTgIWAvMN7P3UvqKYsDvU+oiWm9J7eILQlI3YCweU8+TlJcBbxliva7AKuBTYJ6Zra8wT+7IotOc5uGdUA4A5mxMo0nqiAfUXxabeZDUlk0x6SuJ6iNDuRZYAMzEBzceeFfSm5K2T/BEXnePlNftGcq2sf+RDNNwTz+6GLkriJ3Cu2/Fx/UG8K2kvhXmyRVZdJrjPHwYyj5QNw+8C9AG36gVi7G4V51tZnMSdTuEciRQC/TGd5qHAK8GAZ5J8MwK5TWStouIIVaOf2e2jf2/GTgMOD8Rtzc1JgP9cIPsiGdnHgZ2B16WdGiFeJoDWXSayzyY2UrcKXaFupu4LqH8vZiOJA0DrgW+As5NaRItjn+AU81sUXj+LGzWvgb6SuoVCyeeCn2dAHwh6cUgbH9gZ3x1dwU2BBmOwlf7PQVCkiaDmSUDt8+ByyTV4HoZw6aYPjNP3sii02aYh9/wLFgdDxytmvaNcUu6ApgAfAEcZ2a/pTSrDuWCmPECYGargchj94jR1wOnADcAy4Gh4fctnkL7MzStCp+sx/AMyU2NyZwjHgplnybmqTiy6LSZ5qEDwV7jHrgqlF3qNY9B0nDgXtx79DOzqgJNvw5ldYH6yNN3iBNDZuKO8Iu/tz2wD7DCzBZK6gzsG6rXFtjJTpQ0Ed9UDC8gR6WxPJQdm5inKdCJ0nWahSczJLUGOuOp1joGvAxX5H4NMF+Px72fAAPiqa4UvI7HvgdIap1ygnZQKBcWKfvZQDv8cAPgL+DRAm0Px+Oxt/GFlGd40TOUaZmUSvI0BbLoNO952A9P9X4CMQM2s1pJ84AzJO1tZt/FuUIe9hbgY2BggbBhI8xssaQZ+AHIVbjXjvoaiMe51SSyF5K2NrM/ErTuwF241x4b+l+DHzfXg6QxuOKmph17SpqChyYXmNmUhsZRoP9uwJJktkbS7vgJFPiJYrk8ZclZKrLotJx5yIhosc+F+kfJ0/FTtRPwE5JIkKG48a4H3gKGpXwqFqUo+XJ8AONCHngBniI7LfR1UdhVxvGapDV4iPIn0A2/g7EGOMXMfip+rAUR32BmwWDg2rDgF+Ny7oXL2R4/Hr+7Ajzlyomk03B9w6Y8e6+wOMBDsjxOLeugDLkG4rbzIqQbcBWe230gRo/ysm2A4QVkehOYEieY2dJwi+hm3BP3we9ZzABuN7P5Kf08i4cLQ/D4+EfgkdB+aYF3l4qDcQOa1VjDApiLf8oOA47BY9dq/FM5DZiWci8kC0+5coLn9ocmaHuyKZe+mHyO3ZPoTolySdoGN/qZZvYDpN9GG4VflDjczBZUWurmRtj8/YqnfK5rZnEKYnORM09IuhK4D+htZm9D+oX2e/F86y05ypYneuPH3+OaW5BGsLnImQskdQBGAdMj44UUDxwa9wGOA+5uudDegv8CwiZ4MDAlfq7wLxytAalq0cnyAAAAAElFTkSuQmCC", - "text/latex": [ - "$\\displaystyle \\left( 2694, \\ 55, \\ 1441\\right)$" - ], - "text/plain": [ - "(2694, 55, 1441)" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_gibleed.r.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Load the Synthea27NJ dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path to data exists, load tables from there: ehrapy_data/Synthea27Nj\n", - "missing tables: []\n" - ] - } - ], - "source": [ - "edata_synthea27nj = load_and_check(synthea27nj_omop, \"observation_period\", [\"measurement\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "EHRData object with n_obs x n_var = 28 x 132, and a timeseries of 866 steps.\n", - " shape of .X: (28, 132) \n", - " shape of .r: ((28, 132, 866)) " - ] - }, - "execution_count": 113, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_synthea27nj" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAJcAAAAUCAYAAACAu68PAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGVElEQVR4nO3af+zWVRUH8BdqM6LSBhmLZZMUp6khpukKxIk6tczMVn9Utha6laEmmrrqdNwMaIk/qtUWm1ixlek0EYY/kmVoSSlOm1o208SJv638NRPoj3uf9vDheeD7fT78MMZ7e3b2ufee++P9nM+55577GbF27VrbsR2bAzts7Qlsx7aLnXoVZuaVOBZ7RMRLW3ZK2/H/hsw8CH/C9IiY1ykf0dwWM/Ng3ImZETG3lo3GJ3A89sc4vIb7cAWuiIg1fQY+HmdgX4zGE7gLcyPi95twjZ3xTsbhmIgP4G1YEBGf3YDOHHwQEzAGr+BRXIcfRMSzjfYD89EWg/KZmUfidByGd+DZOt/LImJxW53MvBaHYq+IeJHe2+JF+Bd+1FX2KfwEH1IM71Jcg/0wD1dl5ogek5uDGzAJS3AZ7sbHcXtm9v3DW+AbCiET8fgQdc7CKNxc57gAr+PbuDcz39NoPxAfbTEon5n5XdyivEDX42IswjsxdRPpzMJYzOgUrLMtZuYETMO8iHilq+qvOAGLut/IzLwAy/FJnKQQ3Kkbi5l4EgdExFNddUfgVlyIn/daXAuchZX4m+LBlg5B5+0R8WqzMDMvwgU4H1/uqho2H20xKJ+ZOR3n4EqcGhGvNerf1GOsYetExPLMfBCnZebsiFjT9FxfxAj8sqF4a0QsbLr6iFiFH9fHqY2+3qt4xju7iah6S/Fv5S3YpIiIpRHxUEQM+Rjcy7Aqrqpyr0b7Qfhoi2HzmZk7KzvRP/Qwkqr7n7Y6XfgFdsdRrB/QT8Nq/KGPci90Bnq9Uf6QEocckpljIuKZrgVMUWKh64YxztbAx6q8dxg6/fhoi0H4PEoxuEuxpsZr++FVLO8Tow2i08HtXX3c+D/jysxRSpzywFBPiJm5Ez5fH5d010XEc5n5dczF/Zl5nRIQvk/ZUm7GaUMZZ0shM2firdhFiTU+ohjW7CHq9+WjLQbk8+AqX8UKxUi653sbTo6Ip1vqdPDHKqewbkA/Djsqp4+hYnYdfHFE3NisjIhLldhjJ0zHeUow/BjmN937GwAzEThTMawlOLoPkb2wQT7aYgA+d6vyHKzFZMXDHYCbFCP41SbQ6czvn4pR7s66xjW6yueHstDMnIGz8SA+16fNubga85U3bBQOwsNYUE8kbxhExNiIGKGcek7CeKzIzEkb0x0KH20xAJ+d//d1nBARyyLixYi4T0mlrMThmXlYS51uPKekc9Yxrs7p8M1DWOTpyjH4fhwREc/1aDMVc3B9RHwtIh6OiJcj4u46ycdxdmaO39h4WxoR8WREXIujlZfupxtqPxQ+2mJAPl+ockVEPNLdX0S8jI53PaSlTjdGqrbUbVwdlzp6veZdyMwz8X38WSFyVZ+mH61yvVRAneTyOv6BGxpvayIiHlUM5v2ZOaZXm2Hw0RaD8PmXKl/o02dnlxrZUgdk5g7YVbWlbuN6Ak9j7z6dqgHlJbhHIXJDMdPOVfZLN3TK1zvqvsHw7ipXNyuGyUdbDMLnb5S4ad/6xzfRCdb/3lKng72VVNY9dBlXzQvdhjGZuWdTKzO/qQSsd+HI7qNwH/yuylMzc1yjr2PxYSX4u6NRNz8z12bmFzbS/yZBZk7IzF16lO9Qk6i74Y6IeL5RP1w+2q5t2HxWz7tQCbDPaOgcjWMUD7WkjU4XDq1yKevnua5RssvHKBnuTqenKNnf1XWRMzKz2fEjETG/6/lq5fpgGh6od0+rsI/i4kfgvOa9nXUDymEjM0/EifVxbJWHZWZnbs9ExMwuleMwKzOXKW/js3iXkt0fX+c8vTHGIHzQbm2D8vkVZaucW3NWK7CHwtFqfKme8trqUGLU1fg1vY3rKSVX88Ou8j2q3FE5pvfCb5VTDIiINZl5XJ3oZ5Sg8y3KaWIxLo+Im3r0s7+SbV7UZ5yNYSJOaZSNrz/KhXS3cd2CPZXUw4FKzPCScsXzszrPZoA+bD4qBl7boHxGxMr61cK3lHzYFOXueCFmRcTyTaFTvf+JuCEiHqP3VxHn4zuYFBErhktCG2TmrornuDgizt2SY29ubMtrg8z8Ki7H5IhYRu+vIi5R7pUu3IJz62Cycn0ydyuMvbmxza4tM0cql/vXdAyLHp6rNp6CI/C97R8LbsfGkJn74NPKLcEjnfL/AjQSP4HwDZy2AAAAAElFTkSuQmCC", - "text/latex": [ - "$\\displaystyle \\left( 28, \\ 132, \\ 866\\right)$" - ], - "text/plain": [ - "(28, 132, 866)" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_synthea27nj.r.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# check by loading the data with observation.csv" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "mimic dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", - "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n" - ] - } - ], - "source": [ - "edata_mimic_obs = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [\"observation\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAKUAAAAUCAYAAADsvf0KAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAF1klEQVR4nO2aa4hVVRTHf6NC2mQGikr28DkiJY5l9FQbtKnsgfagL5oZSGI2Gg6FViz/gWlgmo9eJDg1CSWIRWlmD9HUyjLFIk1Txx6ok5qmppQ2fdjn6pk954733nPwTsP84bLvWWuvs9Y6e52991r7FNTU1NCEJjQkNMu3AU1ogo8WUURJbwC3A13M7Ni5NakJjR2Srga+AUab2XyfX+Av35KuAb4Cys1sZoh+HzAQKAb6AK2BhWY2/CwGXAI8C9wGtAX2AO8CMrM/kpKJg1x8k1QFXJ6Gvc/MOiahJy4kPQ/0A4qAdsBxYDfuec4zswNe/7bAMOAOoDfQCfgb+A5YACwws3/T6Mp43CQtAa4DepjZ0TAvavmeCvwJvOLRnwbG4R7ob5FPoK6R3YANwChgPTAL2AmMB74IHkBsmQSQtW8BDgOK+M1IWE8cPA4UAh8Ds4GFwElgCrBZ0qVe//uB14FrcZPTi8Bi4EpgPrBIUoGvJIdxmwZ0BMr8e9VaviUVAYOB+WZ2PMK5X4GfcG/7yshHUBsvA+2BMjObG9IzM7jfVGBMAjJxkYtvAIfMbMo50BMHF5rZCZ8oaSowGZgEjA2xtgF3A0vDM6Kkybhguxe4BxeoYWQ1bma2XtJW4BFJ08O6/JnyYaAAeMd3wsxWmtl2M8soXQ/enFKgCnjJvx1wDBghqTCOTBLI1reGrsfTWScgAywK2h5e/8/M7H1/iTazvcCrweXNYV6McXsbuAy4JUz0g3IwcAr4Mo0j2aAkaFdEOHgEWAucj9tXxJHJJ86TNFzSZEnjJZVIap5vozLEXUG7OQuZf4L2pEfPddzWBm2toDy9fAdRXAxsSSjj7hm029Lwt+PeriLg0xgy+URHoNKj7ZI0ysxW5cOgdJBUDlwAtMElPjfhAnJ6hvItgAeDy+UeO9dx+zpoB4Q7h2fKTkBzXMaUBNoE7eE0/BT9opgy+cICYBAuMAtxmeprQGfgQ0l98mdaJMpxS+kEXEAuB0rN7PcM5afjkp1lZvaRx8tp3MzsMHACt4SfRjjRSWVHiZdcGiPMTB7pe2CMpKPARFx2O+xc25UOqRKVpA7ADbgg2yjpTjP7tj5ZSWU4n7YCIxI27SDQIUwIz5SpbLtlQspSb0ebNPwU/VBMmYaGVDIwoN5eeYKZ7TOzJbjltC3wZn39JY3DlZJ+AErM7GBEtzjj1oozsQfUDsrqoE2qDvhj0Bal4aeyvvA+JBeZhobUcphohSBpmNluXKBdIaldVB9JE4C5uFWgJMjAo5DTuElqhlvSq8P0cFDuwT3QniSDVA2uNFAeNqY1cCPwF7Uz/VxkGhpSGebOvFqRGS4O2lM+Q9KTuAL4JlxAVvt9Qsh13HriSpCbwsTTNwhqZ6uBdpK61+/L2WFmO4AVuI3/ox5buJmkMpzp5yIjqUJSjaSH4tqcKST1iqqVSuoMzAsu30pATyzfJBVJqrOkSmoWFM/bA+sijgCfwe05NwCDzGx/fXpyGbcAqRe41iGC/0HGYlzF/lbcqUPY0KHA0OAyda57vaSK4P9+Myv37jcWWAfMkTQI2II7virBTeVP1XUxa5nUi+XXzjJGDr49AEyUtBp3jnwE6IY7L24JLCPiqDEHPXF9GwJMk7QG2AUcwCUVA4GuwF5gtGfjSNz59Sngc6BM8nM6qsyswqPlMtalgZ73wsSooKzG1aP8ynwxMNKjdQ1+4AanVlCa2Q5J/ThzSD8Et02YTZqPK3KQ6Y0LiqX+vbJAMdn5thK39PTFLU2FuE38GlzdsjLNqU22euL69gnQHVcC6ovbvx3DBUklMCcicekStM1x5aMorAIqwoRsxy2YwYcCH5jZL2Fe1FdCk4DngKvMbGN6f/MPSRfh3v4XzOyJPJuTKBqzbwCSHgPmAP3NbE2YF/WV0CzgZ1zEN3T0xx19zTxbx/8hGq1vklrhPgRZ7AckRMyUgdAA3F5gRtNHvk1IGpJ64fblFWZW5fP/A5RDnlkqCjNBAAAAAElFTkSuQmCC", - "text/latex": [ - "$\\displaystyle \\left( 100, \\ 151, \\ 320\\right)$" - ], - "text/plain": [ - "(100, 151, 320)" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_mimic_obs.r.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "gibleed dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 96, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path to data exists, load tables from there: ehrapy_data/GIBleed_dataset\n", - "missing tables: [['cohort_definition']]\n" - ] - } - ], - "source": [ - "edata_gibleed_obs = load_and_check(gibleed_omop, \"observation_period\", [\"observation\"])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAALAAAAAUCAYAAAAtOremAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGJUlEQVR4nO2ae4hVVRTGf6Nm2lOTypAkJRUra7J8Bb5fhCmaRhGaFVp/ZGqlmZIuP6GcHmpGUajhmEX20DJTspdkRmWFguGjJLUsY3xkao2ZOv2x99UzZ86duffcmTteuB9c9j37+Z21115n7bV3QVlZGXnkkauoU9sE8sgjE9SLypS0CLgZaGFmf2eXUh55VISkG4DvgNFmtiCRXxB2ISR1AL4BJpjZbJ/XBBgCDADaAc2AY8AmYCGw0MxOVjJ4b2AM0AVoDOz3beea2apQ3QJglP9dDRQAW4AFwLzKxvHthwOL/WO5l60uxJWHpGFAd6AQuA44H3jdzIZXN8fqHDOOTKtqE4eXpHeBzkArMzsC0S7EE8Ah4KVA3m3AfKATTrmfA5YC1+AU6y2veFGDPg18AtwIvA/MAlYCFwM9Ipq8BswDrgDe8P2f4/kUJ3s5P9blwAvAkcrqVQPiyuNx3EIuBH6rYY7VMmYcmabYJg6vmUBTYGwio5wLIak10AdYYGalgaIfgUHAyqBlkTQFWA8MBW7FTWKwv9HARGARcJ+ZHQuVnxV6HgLcCewAOprZPp9f3/c9QtJ7ZrYs/GZeYRbirPsyYEIVwsgEseQBPATsBrbjrM+aGuSY8ZhxZJpGm7R5mdl6SVuB+yUVmdnJsA98L+6T/Wao4WdJOvxD0ss4q92DwIRJOtvn/0KE8vr2/4Wyhvh0VkJ5fb1jkqYCt+BWbQUFxq3KXp5Hryi+1YU48vDlpyZJUk1SrK4x48g0pTYZ8FoCTAf6AqvDLkQf4ATwdRodJpTweCi/L85NWAaclDRA0iRJ4yR1SdJXU5/+HFGWyOvqLfIpSGoLFOF86rVpcK8JJJNHTiGOTLM0D1/6tC8EXAhJ5+L8kS2pRh4k1QPu8o8fhoo7+PQosAHnHwbbrgWGmdneQHbC6raIGK6lT+v5/1sDHBbjLP2UVHjXFKqQR84gjkyzOA/f+rQblN/ENQPqAnvS6KwIp5irzGx1qOwSn04EyoCuuJ3mtcBHnsDboTYrffqwpIsSmd5XDn5nGgf+TwOuB+4O+e21gcrkkUuII9OszIOZ/YUzis2h/CauiU//TKUjSWOBR3CWcERElcTiOA4MMrOd/nmT36xtA7pL6mJmX/myJb6v/sBmScs92T7AZbjV3Rw46Tl0wq32WYE+agUpyCMnEEemtTAPB4BLobwFTqyaBlW1ljQGmAtsBnqa2YGIagd9uiGgvACY2T9AwkJ1DOSfAAYCjwF7gZH+9xNwE3DYVy3xn6xXcRGBqVVxrkmkKI8zHnFkWkvz0BCvr0ELXOLTJhWqByBpPDAH+AHobWYlSapu8+nBJOUJS98wmOkjE0/5X3DcBkArYJ+Z7ZDUCGjti48m2cnOlzQft6kYn4RHRkhDHrmA80hfpnHaxIakOkAjXKi1nALvwVm9NpU0noTz8zYCfYOhrgh8ivN9r5JUJ+JkKrGp25Ei9zuA+rjDDYB/gVeS1G2P88fW4RZSjXzW0pRHLiCOTLM9D21wod6NEFBgMyvzkYGhkq40s+3BVj4OOwP4HuhX1WfSzHZJWoEL+I/DWalEX/1wfu5BQrt1SReY2aFQXiHwDM5qF/n+S3HHzRUgaTpOcIuijj0lFeNck3vMrLiy90iGdOURc4xiMuSZDuLINJN5iInOPl0DFS/zLMWdIvXHnZAkiIzETdYJ4AtgbMSnYmeEkB/AvcBsSQNw4bQWwGDf1yi/qwziY0mluE/yYaAt7s5BKTDQzH5P/V2TIrjBTBtx5SFpMO7d4XTMu4tXVHDuUfDkKiOeMcfMCjLg1Q8n9+UQrcAluFjmi4H8RFy2LjA+CafPCd1VMLPd/hbRNJwl7oa7Z7ECmGlm6yP6eQfnLgzH+ce/4e5GzDSz3UnGThftcItjZVUVkyCWPHBx9pGhvJacjnHvovzRa6Y844yZLRSSJi9JF+KU/gMz+xWib6NNBp4E2pvZhupmXdvwm7/9uJDPo7VMJylyhWc2IelB4Hmgq5mtg+jbaHNw8dYZWeSWTXTFHffOrm0iVSBXeGYFkhoCk4GlCeWFCAvsK3cDegLP5i+053EmwN+zuB0oDp4r/A+xAOOVibavLgAAAABJRU5ErkJggg==", - "text/latex": [ - "$\\displaystyle \\left( 2694, \\ 21, \\ 1441\\right)$" - ], - "text/plain": [ - "(2694, 21, 1441)" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_gibleed_obs.r.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "synthea27nj dataset" - ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Path to data exists, load tables from there: ehrapy_data/Synthea27Nj\n", - "missing tables: []\n" - ] - } - ], - "source": [ - "edata_synteha27nj_obs = load_and_check(synthea27nj_omop, \"observation_period\", [\"observation\"])" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "iVBORw0KGgoAAAANSUhEUgAAAIoAAAAUCAYAAABS66VXAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjkuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8hTgPZAAAACXBIWXMAABJ0AAASdAHeZh94AAAGPklEQVR4nO3af+zVVRkH8BdCA8ZaNqzY2FAiM0zMNA0zEAbKTGdRmVsztRbYlqH5K21rj49bpU0Rqq00HLhq6xfL/AX2QxZpJaWwbP6oppg60ZKw1JwJ9Mf5fNiH+733y/fe+wVy8d7uzj7POc85z3nO+Ty/PnfE9u3b7cM+7Ar77W0B9uHVgVHtiJl5I07C5Ih4Yc+KtA97E5l5FH6PBRGxrKaPaHU9mXk07sFFEbG4oo3HfJyMaZiIl3E/lmN5RGzrsPDJOA+HYjyewr1YHBG/GcY9ysyzK3kGw7aIGNnCtxEHdhj/dERM6F+69uhFP5k5B+fiWLwezypnsTQibu+XJzN/jOk4OCKep73r+SL+iW80aKfhW3i3comWYCUOwzL8IDNHtBHuKtyKI7EaS3Ef3o+7M/OMdpvqAxuQHX53VmNWdeB9rgPf1cMs4w70op/M/Ap+jnfhZlyD2/AGzOqwTrc8X8YELKoJO7mezHwr5mJZRPy70fUnnIrbmpYjMz+PdfgQPqhcnrpvAi7C0zg8Ip5p9M1WDu4KfKfd5npBRGxQLssAZGb9dl7fgX1LRFw+XLLsCr3oJzMX4GLciIUR8XLLnK9ps07XPBGxLjMfwjmZeWVEbGu1KJ/ACHy/hfHOiLil1b1ExCZ8s3qc1TLXgYrFuqephIpvDf6l3OjdjsycppjSJ5U36X8BXeknM0cr1v6v2hx4xfef5nMvPA18D5NwAgOD2bnYit923t8A1Au90kL/sxLHHJOZB0TE3xsbmInX4qYu1ukHC6v2hojY2mHM6MrUT8IL+APWDjK+X3SrnxOUi7ME26rY5jC8hHUd4pleeGrc3Zjjjh0XJTPH4Qg8ONRMJzNH4czqcXWzLyI2Z+bnsBgPZOZNSgA1RXFjP8M5Q1mnH2TmWJyhvADLBhk6Ad9uoT2amR+PiF8Ot1w96Ofoqn0J65UD34HMXIsPR8Tf+uSp8buqncnOwexEjFSi7qHiymrx2yPijtbOiFiixC6jsACXKoHx41jRanJ3Ez6C/bE6Ih7vMGY55iiXZZyS2V2Hg7AqM9+xOwTrUj9vrNqLsR0zFKtzOH6qHOgPW5bohaeW7Tnlgk1iZ9czvmr/MZRNZuYiXIiH8LEOYy7Bl/BVfB2b8DYlqv5uZh4REZcMZb0+ULud6zoNiIhsIf0Rn8rM55U9Xq6UB4YVXeqnfqlfwakRsbF6vj8z5+NhHJ+ZxzZcSi88TWzGm5oTQZ3ljBnCBs9VUrkHMDsiNrcZMwtX4eaIuCAiHomIFyPiPkXpT+LCzHzzrtbrFZn5drwHT6BtfWEXqAP1mcMmVIUe9LOlatc3DhxExIuoLfoxja5eeJoYq7oXzYtSm7nxA4Y3kJnn42vKWze7ynza4ZSqXdPaUQm5rlr/nYOt1yeGEsQOhtp3jxsmeZroVj8PV+2WDvPVnmBsg9YLD8jM/RSX/Qw7X5SnFMUc0mFSVfB1rVKrmL2LGGN01XZKgWv6gJRtOJCZYxSXuBU39DjN9Kp9ZFiE2hnd6ucXSpxxaHWIragD1UcbtF54ahyilEo20LgoEbEda3FAZr6llSszv6AEr/diTjOd64BfVe3CzJzYMtdJOE4Jln7d0rciM7dX5fh+cJpSql41SBArM6dWGV8r/SAlbqBNUXAY5OxKPxHxGG5RgsvzWsafiHmK5diRffbC00D9kqxhYB1lpVJlnYe/NCY9S6kSbq02uCizNf6zMSJWNJ5/pJSN5+LB6vvBJkxVzO4IXBoRz7bM0wzA+kHtdjpVYmucrsQCa/GYUuiaonzXGqPENu3K+P3K2Yt+Pq24osVVTWQ9JuMDytl8sspW9MkDJ1b9P2lutsZKxSed2UKfXLUjcT6ize/sJkNVxX0fPqsEvfOVDGK6ovx5EbG0jYDTlMPquYKamVPxXkMLYtco31um4KO4AMfjLpyFU9pVNPuVsxf9RMQTOEqxdAcrVmKWYjWOi4iVWtALT2a+TrlIt9bWuN3X48uUlO3IiFjfvQp6R2burxSdrtkDaXPPeLXI2Ssy8zNKyj4jIu6i/dfja5VvA1fsQdlqzFA+CSzeC2t3g1eLnF2jqmRfhpX1JaGNRakGz8RsXL3vj0v/X6jc9ulKZXhjTf8vHQfy/YibeYEAAAAASUVORK5CYII=", - "text/latex": [ - "$\\displaystyle \\left( 28, \\ 75, \\ 866\\right)$" - ], - "text/plain": [ - "(28, 75, 866)" - ] - }, - "execution_count": 112, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "edata_synteha27nj_obs.r.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 126, - "metadata": {}, - "outputs": [], - "source": [ - "tables = [\"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\", \"device_exposure\", \"drug_exposure\", \"note\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "device_exposure\n", - "drug_exposure\n" - ] - } - ], - "source": [ - "for table in tables:\n", - " table_ext = table + \".csv\"\n", - " path = os.path.join(\n", - " \"/Users/shrey.parikh/Desktop/EHR/ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9/1_omop_data_csv\",\n", - " table_ext,\n", - " )\n", - " temp = pd.read_csv(path)\n", - " if temp.columns.str.contains(\"start_date\").any():\n", - " print(table)" - ] - }, - { - "cell_type": "code", - "execution_count": 136, - "metadata": {}, - "outputs": [], - "source": [ - "# removing drug_exposure and device_exposure because they have start/end date\n", - "# note is empty\n", - "tables = [\"measurement\", \"observation\", \"procedure_occurrence\", \"specimen\"]" - ] - }, - { - "cell_type": "code", - "execution_count": 137, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Processing table: measurement\n", - "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", - "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n", - "Success: measurement processed successfully.\n", - "Processing table: observation\n", - "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", - "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n", - "Success: observation processed successfully.\n", - "Processing table: procedure_occurrence\n", - "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", - "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n", - "Success: procedure_occurrence processed successfully.\n", - "Processing table: specimen\n", - "Path to data exists, load tables from there: ehrapy_data/mimic-iv-demo-data-in-the-omop-common-data-model-0.9\n", - "missing tables: [['concept'], ['vocabulary'], ['domain'], ['concept_class'], ['concept_relationship'], ['relationship'], ['concept_synonym'], ['concept_ancestor'], ['source_to_concept_map'], ['drug_strength']]\n", - "Success: specimen processed successfully.\n" - ] - } - ], - "source": [ - "for table in tables:\n", - " print(f\"Processing table: {table}\")\n", - " try:\n", - " edata_temp = load_and_check(mimic_iv_omop, \"observation_period_start_date\", [table])\n", - " print(f\"Success: {table} processed successfully.\")\n", - " except Exception as e:\n", - " print(f\"Error processing table: {table}. Error: {str(e)}\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "hackathon_venv", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.12.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} From a28d1021491efff0e3f2d5793cb146ad97d4ff0d Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Mon, 28 Oct 2024 22:46:56 +0100 Subject: [PATCH 07/15] dummy test omop dataset; start io tests --- src/ehrdata/io/omop/__init__.py | 7 ++- src/ehrdata/io/omop/omop.py | 56 +++++++++++++++---- src/ehrdata/utils/_omop_utils.py | 33 ++++++++++- tests/conftest.py | 12 ++++ tests/data/toy_omop/vanilla/cohort.csv | 4 ++ tests/data/toy_omop/vanilla/measurement.csv | 14 +++++ tests/data/toy_omop/vanilla/observation.csv | 10 ++++ .../toy_omop/vanilla/observation_period.csv | 4 ++ tests/data/toy_omop/vanilla/person.csv | 5 ++ .../toy_omop/vanilla/visit_occurrence.csv | 4 ++ tests/test_io/test_omop.py | 52 +++++++++++++++++ 11 files changed, 184 insertions(+), 17 deletions(-) create mode 100644 tests/conftest.py create mode 100644 tests/data/toy_omop/vanilla/cohort.csv create mode 100644 tests/data/toy_omop/vanilla/measurement.csv create mode 100644 tests/data/toy_omop/vanilla/observation.csv create mode 100644 tests/data/toy_omop/vanilla/observation_period.csv create mode 100644 tests/data/toy_omop/vanilla/person.csv create mode 100644 tests/data/toy_omop/vanilla/visit_occurrence.csv create mode 100644 tests/test_io/test_omop.py diff --git a/src/ehrdata/io/omop/__init__.py b/src/ehrdata/io/omop/__init__.py index 6f3fda4..8cd4668 100644 --- a/src/ehrdata/io/omop/__init__.py +++ b/src/ehrdata/io/omop/__init__.py @@ -1,4 +1,7 @@ from .omop import ( + get_table, + get_time_interval_table, + load, # extract_condition_occurrence, # extract_device_exposure, # extract_drug_exposure, @@ -10,9 +13,7 @@ # extract_person_observation_period, # extract_procedure_occurrence, # extract_specimen, - get_table, - get_time_interval_table, - load, + register_omop_to_db_connection, setup_obs, setup_variables, ) diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index ccd81b8..2aaa872 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -1,5 +1,6 @@ from __future__ import annotations +import os from collections.abc import Sequence from pathlib import Path from typing import Literal @@ -8,6 +9,9 @@ import duckdb import numpy as np import pandas as pd +from duckdb import DuckDBPyConnection + +from ehrdata.utils._omop_utils import get_omop_table_names def _check_sanity_of_folder(folder_path: str | Path): @@ -18,22 +22,46 @@ def _check_sanity_of_database(backend_handle: duckdb.DuckDB): pass -VALID_OBSERVATION_TABLES_SINGLE = ["person", "observation_period", "visit_occurrence"] -VALID_OBSERVATION_TABLES_JOIN = ["person_observation_period", "person_visit_occurrence"] +VALID_OBSERVATION_TABLES_SINGLE = ["person"] +VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen", "note", "death"] +def register_omop_to_db_connection( + path: Path, + backend_handle: DuckDBPyConnection, + source: Literal["csv"] = "csv", +) -> None: + """Register the OMOP CDM tables to the database.""" + missing_tables = [] + for table in get_omop_table_names(): + # if path exists lowercse, uppercase, capitalized: + table_path = f"{path}/{table}.csv" + if os.path.exists(table_path): + if table == "measurement": + backend_handle.register( + table, backend_handle.read_csv(f"{path}/{table}.csv", dtype={"measurement_source_value": str}) + ) + else: + backend_handle.register(table, backend_handle.read_csv(f"{path}/{table}.csv")) + else: + missing_tables.append([table]) + print("missing tables: ", missing_tables) + + return None + + def setup_obs( backend_handle: Literal[str, duckdb, Path], - observation_table: Literal[ - "person", "observation_period", "person_observation_period", "visit_occurrence", "person_visit_occurrence" - ], + observation_table: Literal["person", "person_cohort", "person_observation_period", "person_visit_occurrence"], ): """Setup the observation table. This function sets up the observation table for the EHRData object. - For this, a table from the OMOP CDM which represents the "observed unit" via its id should be selected. - A unit can be a person, an observation period, a visit occurrence, or a left join on person_id of a person with one of the other tables. + For this, a table from the OMOP CDM which represents the "observed unit" via an id should be selected. + A unit can be a person, or the data of a person together with either the information from cohort, observation_period, or visit_occurrence. + Notice a single person can have multiple of the latter, and as such can appear multiple times. + For person_cohort, the subject_id of the cohort is considered to be the person_id for a join. Parameters ---------- @@ -50,14 +78,16 @@ def setup_obs( if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: raise ValueError( - "observation_table must be either 'person', 'observation_period', 'person_observation_period', 'visit_occurrence', or 'person_visit_occurrence'." + f"observation_table must be one of {[VALID_OBSERVATION_TABLES_SINGLE]+[VALID_OBSERVATION_TABLES_JOIN]}." ) if observation_table in VALID_OBSERVATION_TABLES_SINGLE: obs = get_table(backend_handle, observation_table) elif observation_table in VALID_OBSERVATION_TABLES_JOIN: - if observation_table == "person_observation_period": + if observation_table == "person_cohort": + obs = _get_table_left_join(backend_handle, "person", "cohort", right_key="subject_id") + elif observation_table == "person_observation_period": obs = _get_table_left_join(backend_handle, "person", "observation_period") elif observation_table == "person_visit_occurrence": obs = _get_table_left_join(backend_handle, "person", "visit_occurrence") @@ -176,13 +206,15 @@ def get_table(duckdb_instance, table_name: str) -> pd.DataFrame: return _lowercase_column_names(duckdb_instance.sql(f"SELECT * FROM {table_name}").df()) -def _get_table_left_join(duckdb_instance, table1: str, table2: str) -> pd.DataFrame: +def _get_table_left_join( + duckdb_instance, table1: str, table2: str, left_key: str = "person_id", right_key: str = "person_id" +) -> pd.DataFrame: """Extract a table of an OMOP CDM Database.""" return _lowercase_column_names( duckdb_instance.sql( f"SELECT * \ - FROM {table1} \ - LEFT JOIN {table2} USING(person_id) \ + FROM {table1} as t1 \ + LEFT JOIN {table2} as t2 ON t1.{left_key} = t2.{right_key} \ " ).df() ) diff --git a/src/ehrdata/utils/_omop_utils.py b/src/ehrdata/utils/_omop_utils.py index 7385538..2b52d02 100644 --- a/src/ehrdata/utils/_omop_utils.py +++ b/src/ehrdata/utils/_omop_utils.py @@ -6,6 +6,7 @@ import os import warnings from pathlib import Path +from typing import Literal # import dask.dataframe as dd import numpy as np @@ -13,8 +14,13 @@ from rich import print as rprint -def get_table_catalog_dict(): - """Get the table catalog dictionary of the OMOP CDM v5.4. +def get_table_catalog_dict(version: Literal["5.4"] = "5.4"): + """Get the table catalog dictionary of the OMOP CDM. + + Parameters + ---------- + version + The version of the OMOP CDM. Currently, only 5.4 is supported. Returns ------- @@ -61,9 +67,32 @@ def get_table_catalog_dict(): "source_to_concept_map", "drug_strength", ] + return table_catalog_dict +def get_omop_table_names(version: Literal["5.4"] = "5.4"): + """Get the table names of the OMOP CDM. + + Args + ---- + version: str, the version of the OMOP CDM. Currently, only 5.4 is supported. + + Returns + ------- + List of table names + """ + if version != "5.4": + raise ValueError("Only support OMOP CDM v5.4!") + + table_catalog_dict = get_table_catalog_dict(version=version) + tables = [] + for _, value_list in table_catalog_dict.items(): + for value in value_list: + tables.append(value) + return tables + + def get_dtype_mapping(): """Get the data type mapping of the OMOP CDM v5.4. diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 0000000..8f5fbc0 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,12 @@ +import duckdb +import pytest + +from ehrdata.io.omop import register_omop_to_db_connection + + +@pytest.fixture # (scope="session") +def omop_connection_vanilla(): + con = duckdb.connect() + register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=con, source="csv") + yield con + con.close() diff --git a/tests/data/toy_omop/vanilla/cohort.csv b/tests/data/toy_omop/vanilla/cohort.csv new file mode 100644 index 0000000..6517ad1 --- /dev/null +++ b/tests/data/toy_omop/vanilla/cohort.csv @@ -0,0 +1,4 @@ +cohort_definition_id,subject_id,cohort_start_date,cohort_end_date, +1,1,1/1/00,1/6/00, +1,2,1/1/00,1/6/00, +1,3,1/1/00,1/6/00, diff --git a/tests/data/toy_omop/vanilla/measurement.csv b/tests/data/toy_omop/vanilla/measurement.csv new file mode 100644 index 0000000..23e7888 --- /dev/null +++ b/tests/data/toy_omop/vanilla/measurement.csv @@ -0,0 +1,14 @@ +measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,unit_concept_id,range_low,range_high,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,value_source_value +1,1,3031147,1/1/00,1/1/00 12:00,12:00,32856,,18,,9557,21,30,,1,,50804,2000001003,mEq/L,18 +2,1,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,19,,9557,21,30,,1,,50804,2000001003,mEq/L,19 +3,1,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45877096,,,,,1,,220048,2000030004,,SR (Sinus Rhythm) +4,2,3031147,1/1/00,1/1/00 12:00,12:00,32856,,20,,9557,21,30,,2,,50804,2000001003,mEq/L,20 +5,2,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,21,,9557,21,30,,2,,50804,2000001003,mEq/L,21 +6,2,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45877096,,,,,2,,220048,2000030004,,SR (Sinus Rhythm) +7,3,3031147,1/1/00,1/1/00 12:00,12:00,32856,,22,,9557,21,30,,3,,50804,2000001003,mEq/L,22 +8,3,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,23,,9557,21,30,,3,,50804,2000001003,mEq/L,23 +9,3,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45883018,,,,,3,,220048,2000030004,,AF (Atrial Fibrillation) +,,,,,,,,,,,,,,,,,,, +,,,,,,,,,,,,,,,,,,, +,,,,,,,,,,,,,,,,,,, +,,,,,,,,,,,,,,,,,,, diff --git a/tests/data/toy_omop/vanilla/observation.csv b/tests/data/toy_omop/vanilla/observation.csv new file mode 100644 index 0000000..0cd51c2 --- /dev/null +++ b/tests/data/toy_omop/vanilla/observation.csv @@ -0,0 +1,10 @@ +observation_id,person_id,observation_concept_id,observation_date,observation_datetime,observation_type_concept_id,value_as_number,value_as_string,value_as_concept_id,qualifier_concept_id,unit_concept_id,provider_id,visit_occurrence_id,visit_detail_id,observation_source_value,observation_source_concept_id,unit_source_value,qualifier_source_value +1,1,3001062,2100-01-01,2100-01-01 12:00:00,32817,,Anemia,0,,,,,,225059,2000030108,, +2,1,3001062,2100-01-01,2100-01-01 13:00:00,32817,,Anemia,0,,,,,,225059,2000030108,, +3,1,3034263,2100-01-01,2100-01-01 14:00:00,32817,3,,,,,,,,224409,2000030058,, +4,2,3001062,2100-01-01,2100-01-01 12:00:00,32817,,Anemia,0,,,,,,225059,2000030108,, +5,2,3001062,2100-01-01,2100-01-01 13:00:00,32817,,Anemia,0,,,,,,225059,2000030108,, +6,2,3034263,2100-01-01,2100-01-01 14:00:00,32817,4,,,,,,,,224409,2000030058,, +7,3,3001062,2100-01-01,2100-01-01 12:00:00,32817,,Anemia,0,,,,,,225059,2000030108,, +8,3,3001062,2100-01-01,2100-01-01 13:00:00,32817,,Anemia,0,,,,,,225059,2000030108,, +9,3,3034263,2100-01-01,2100-01-01 14:00:00,32817,5,,,,,,,,224409,2000030058,, diff --git a/tests/data/toy_omop/vanilla/observation_period.csv b/tests/data/toy_omop/vanilla/observation_period.csv new file mode 100644 index 0000000..11df294 --- /dev/null +++ b/tests/data/toy_omop/vanilla/observation_period.csv @@ -0,0 +1,4 @@ +observation_period_id,person_id,observation_period_start_date,observation_period_end_date,period_type_concept_id +1,1,2100-01-01,2100-01-31,32828 +2,2,2100-01-01,2100-01-31,32828 +3,3,2100-01-01,2100-01-31,32828 diff --git a/tests/data/toy_omop/vanilla/person.csv b/tests/data/toy_omop/vanilla/person.csv new file mode 100644 index 0000000..18b89ef --- /dev/null +++ b/tests/data/toy_omop/vanilla/person.csv @@ -0,0 +1,5 @@ +person_id,gender_concept_id,year_of_birth,month_of_birth,day_of_birth,birth_datetime,race_concept_id,ethnicity_concept_id,location_id,provider_id,care_site_id,person_source_value,gender_source_value,gender_source_concept_id,race_source_value,race_source_concept_id,ethnicity_source_value,ethnicity_source_concept_id +1,8507,2095,,,,0,38003563,,,,1234,M,0,,,, +2,8507,2096,,,,0,38003563,,,,1235,M,0,,,, +3,8532,2097,,,,0,0,,,,1236,F,0,,,, +4,8532,2098,,,,0,0,,,,1237,F,0,,,, diff --git a/tests/data/toy_omop/vanilla/visit_occurrence.csv b/tests/data/toy_omop/vanilla/visit_occurrence.csv new file mode 100644 index 0000000..d7b1087 --- /dev/null +++ b/tests/data/toy_omop/vanilla/visit_occurrence.csv @@ -0,0 +1,4 @@ +visit_occurrence_id,person_id,visit_concept_id,visit_start_date,visit_start_datetime,visit_end_date,visit_end_datetime,visit_type_concept_id,provider_id,care_site_id,visit_source_value,visit_source_concept_id,admitting_source_concept_id,admitting_source_value,discharge_to_concept_id,discharge_to_source_value,preceding_visit_occurrence_id +1,1,8870,2100-01-01,2100-01-01 00:00:00,2100-01-31,2100-01-31 00:00:00,,,,10014354|2147-07-08,2000001801,,,,, +2,2,8870,2100-01-01,2100-01-01 00:00:00,2100-01-31,2100-01-31 00:00:00,,,,10014354|2147-07-08,2000001801,,,,, +3,3,8870,2100-01-01,2100-01-01 00:00:00,2100-01-31,2100-01-31 00:00:00,,,,10014354|2147-07-08,2000001801,,,,, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py new file mode 100644 index 0000000..5e501c1 --- /dev/null +++ b/tests/test_io/test_omop.py @@ -0,0 +1,52 @@ +import duckdb +import pytest + +import ehrdata as ed +from ehrdata.io.omop import register_omop_to_db_connection + + +def test_register_omop_to_db_connection(): + register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=duckdb.connect(), source="csv") + + +@pytest.mark.parametrize( + "observation_table", ["person", "person_cohort", "person_observation_period", "person_visit_occurrence"] +) +def test_setup_obs(omop_connection_vanilla, observation_table): + con = omop_connection_vanilla + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) + assert isinstance(edata, ed.EHRData) + + +@pytest.mark.parametrize("observation_table", ["perso"]) +def test_setup_obs_unknown_observation_table_argument(omop_connection_vanilla, observation_table): + con = omop_connection_vanilla + with pytest.raises(ValueError): + ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) + + +def test_setup_obs_person(): + # check precise expected table + con = duckdb.connect() + register_omop_to_db_connection(path="../data/toy_omop/vanilla", backend_handle=con, source="csv") + con.close() + + +def test_setup_var_measurement_startdate_fixed(): + # check precise expected table + pass + + +def test_setup_var_measurement_startdate_observation_period(): + # check precise expected table + pass + + +def test_setup_var_observation_startdate_fixed(): + # check precise expected table + pass + + +def test_setup_var_observation_startdate_observation_period(): + # check precise expected table + pass From abbb29f0b83d52131ba18bc89bbff801b489bc9f Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Tue, 29 Oct 2024 10:35:52 +0100 Subject: [PATCH 08/15] proper date format cohort measurement --- tests/data/toy_omop/vanilla/cohort.csv | 8 ++++---- tests/data/toy_omop/vanilla/measurement.csv | 22 +++++++++------------ 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/tests/data/toy_omop/vanilla/cohort.csv b/tests/data/toy_omop/vanilla/cohort.csv index 6517ad1..e9e2ef6 100644 --- a/tests/data/toy_omop/vanilla/cohort.csv +++ b/tests/data/toy_omop/vanilla/cohort.csv @@ -1,4 +1,4 @@ -cohort_definition_id,subject_id,cohort_start_date,cohort_end_date, -1,1,1/1/00,1/6/00, -1,2,1/1/00,1/6/00, -1,3,1/1/00,1/6/00, +cohort_definition_id,subject_id,cohort_start_date,cohort_end_date +1,1,2100-01-01,2100-01-31 +1,2,2100-01-01,2100-01-31 +1,3,2100-01-01,2100-01-31 diff --git a/tests/data/toy_omop/vanilla/measurement.csv b/tests/data/toy_omop/vanilla/measurement.csv index 23e7888..222c9a2 100644 --- a/tests/data/toy_omop/vanilla/measurement.csv +++ b/tests/data/toy_omop/vanilla/measurement.csv @@ -1,14 +1,10 @@ measurement_id,person_id,measurement_concept_id,measurement_date,measurement_datetime,measurement_time,measurement_type_concept_id,operator_concept_id,value_as_number,value_as_concept_id,unit_concept_id,range_low,range_high,provider_id,visit_occurrence_id,visit_detail_id,measurement_source_value,measurement_source_concept_id,unit_source_value,value_source_value -1,1,3031147,1/1/00,1/1/00 12:00,12:00,32856,,18,,9557,21,30,,1,,50804,2000001003,mEq/L,18 -2,1,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,19,,9557,21,30,,1,,50804,2000001003,mEq/L,19 -3,1,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45877096,,,,,1,,220048,2000030004,,SR (Sinus Rhythm) -4,2,3031147,1/1/00,1/1/00 12:00,12:00,32856,,20,,9557,21,30,,2,,50804,2000001003,mEq/L,20 -5,2,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,21,,9557,21,30,,2,,50804,2000001003,mEq/L,21 -6,2,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45877096,,,,,2,,220048,2000030004,,SR (Sinus Rhythm) -7,3,3031147,1/1/00,1/1/00 12:00,12:00,32856,,22,,9557,21,30,,3,,50804,2000001003,mEq/L,22 -8,3,3031147,1/1/00,1/1/2100  13:00:00 PM,13:00,32856,,23,,9557,21,30,,3,,50804,2000001003,mEq/L,23 -9,3,3022318,1/1/00,1/1/2100  14:00:00 PM,14:00,32817,,,45883018,,,,,3,,220048,2000030004,,AF (Atrial Fibrillation) -,,,,,,,,,,,,,,,,,,, -,,,,,,,,,,,,,,,,,,, -,,,,,,,,,,,,,,,,,,, -,,,,,,,,,,,,,,,,,,, +1,1,3031147,2100-01-01,2100-01-01 12:00:00,12:00:00,32856,,18,,9557,21,30,,1,,50804,2000001003,mEq/L,18 +2,1,3031147,2100-01-01,2100-01-01 13:00:00,13:00:00,32856,,19,,9557,21,30,,1,,50804,2000001003,mEq/L,19 +3,1,3022318,2100-01-01,2100-01-01 14:00:00,14:00:00,32817,,,45877096,,,,,1,,220048,2000030004,,SR (Sinus Rhythm) +4,2,3031147,2100-01-01,2100-01-01 12:00:00,12:00:00,32856,,20,,9557,21,30,,2,,50804,2000001003,mEq/L,20 +5,2,3031147,2100-01-01,2100-01-01 13:00:00,13:00:00,32856,,21,,9557,21,30,,2,,50804,2000001003,mEq/L,21 +6,2,3022318,2100-01-01,2100-01-01 14:00:00,14:00:00,32817,,,45877096,,,,,2,,220048,2000030004,,SR (Sinus Rhythm) +7,3,3031147,2100-01-01,2100-01-01 12:00:00,12:00:00,32856,,22,,9557,21,30,,3,,50804,2000001003,mEq/L,22 +8,3,3031147,2100-01-01,2100-01-01 13:00:00,13:00:00,32856,,23,,9557,21,30,,3,,50804,2000001003,mEq/L,23 +9,3,3022318,2100-01-01,2100-01-01 14:00:00,14:00:00,32817,,,45883018,,,,,3,,220048,2000030004,,AF (Atrial Fibrillation) From 4cec97143b559596175638abb691bee5238430fe Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Tue, 29 Oct 2024 22:15:04 +0100 Subject: [PATCH 09/15] unstable commit, to save progress --- src/ehrdata/io/omop/_queries.py | 139 ++++++++++++++++++++++++++++++ src/ehrdata/io/omop/omop.py | 144 ++++++++++++++++++++------------ tests/test_io/test_omop.py | 61 +++++++++----- 3 files changed, 272 insertions(+), 72 deletions(-) create mode 100644 src/ehrdata/io/omop/_queries.py diff --git a/src/ehrdata/io/omop/_queries.py b/src/ehrdata/io/omop/_queries.py new file mode 100644 index 0000000..abdbf80 --- /dev/null +++ b/src/ehrdata/io/omop/_queries.py @@ -0,0 +1,139 @@ +from collections.abc import Sequence + +import duckdb +import pandas as pd + +START_DATE_KEY = { + "visit_occurrence": "visit_start_date", + "observation_period": "observation_period_start_date", + "cohort": "cohort_start_date", +} +END_DATE_KEY = { + "visit_occurrence": "visit_end_date", + "observation_period": "observation_period_end_date", + "cohort": "cohort_end_date", +} +TIME_DEFINING_TABLE_SUBJECT_KEY = { + "visit_occurrence": "person_id", + "observation_period": "person_id", + "cohort": "subject_id", +} + +AGGREGATION_STRATEGY_KEY = { + "last": "LAST", + "first": "FIRST", + "mean": "MEAN", + "median": "MEDIAN", + "mode": "MODE", + "sum": "SUM", + "count": "COUNT", + "min": "MIN", + "max": "MAX", + "std": "STD", +} + + +def _generate_timedeltas(interval_length_number: int, interval_length_unit: str, num_intervals: int) -> pd.DataFrame: + timedeltas_dataframe = pd.DataFrame( + { + "interval_start_offset": [ + pd.to_timedelta(i * interval_length_number, interval_length_unit) for i in range(num_intervals) + ], + "interval_end_offset": [ + pd.to_timedelta(i * interval_length_number, interval_length_unit) for i in range(1, num_intervals + 1) + ], + "interval_step": list(range(num_intervals)), + } + ) + return timedeltas_dataframe + + +def _write_timedeltas_to_db( + backend_handle: duckdb.duckdb.DuckDBPyConnection, + timedeltas_dataframe, +) -> None: + backend_handle.execute("DROP TABLE IF EXISTS timedeltas") + backend_handle.execute( + """ + CREATE TABLE timedeltas ( + interval_start_offset INTERVAL, + interval_end_offset INTERVAL, + interval_step INTEGER + ) + """ + ) + backend_handle.execute("INSERT INTO timedeltas SELECT * FROM timedeltas_dataframe") + + +def _drop_timedeltas(backend_handle: duckdb.duckdb.DuckDBPyConnection): + backend_handle.execute("DROP TABLE IF EXISTS timedeltas") + + +def _generate_value_query(data_table: str, data_field_to_keep: Sequence, aggregation_strategy: str) -> str: + query = f"{', ' .join([f'CASE WHEN COUNT(*) = 0 THEN NULL ELSE {aggregation_strategy}({column}) END AS {column}' for column in data_field_to_keep])}" + return query + + +def time_interval_table_query_long_format( + backend_handle: duckdb.duckdb.DuckDBPyConnection, + time_defining_table: str, + data_table: str, + interval_length_number: int, + interval_length_unit: str, + num_intervals: int, + aggregation_strategy: str, + data_field_to_keep: Sequence[str] | str, +) -> pd.DataFrame: + """Returns a long format DataFrame from the data_table. The following columns should be considered the indices of this long format: person_id, data_table_concept_id, interval_step. The other columns, except for start_date and end_date, should be considered the values.""" + if isinstance(data_field_to_keep, str): + data_field_to_keep = [data_field_to_keep] + + timedeltas_dataframe = _generate_timedeltas(interval_length_number, interval_length_unit, num_intervals) + + _write_timedeltas_to_db( + backend_handle, + timedeltas_dataframe, + ) + + # multi-step query + # 1. Create person_time_defining_table, which matches the one created for obs. Needs to contain the person_id, and the start date in particular. + # 2. Create person_data_table (data_table is typically measurement), which contains the cross product of person_id and the distinct concept_id s. + # 3. Create long_format_backbone, which is the left join of person_time_defining_table and person_data_table. + # 4. Create long_format_intervals, which is the cross product of long_format_backbone and timedeltas. This table contains most notably the person_id, the concept_id, the interval start and end dates. + # 5. Create the final table, which is the join with the data_table (typically measurement); each measurement is assigned to its person_id, its concept_id, and the interval it fits into. + df = backend_handle.execute( + f""" + WITH person_time_defining_table AS ( \ + SELECT person.person_id as person_id, {START_DATE_KEY[time_defining_table]} as start_date, {END_DATE_KEY[time_defining_table]} as end_date \ + FROM person \ + JOIN {time_defining_table} ON person.person_id = {time_defining_table}.{TIME_DEFINING_TABLE_SUBJECT_KEY[time_defining_table]} \ + ), \ + person_data_table AS( \ + WITH distinct_data_table_concept_ids AS ( \ + SELECT DISTINCT {data_table}_concept_id + FROM {data_table} \ + ) + SELECT person.person_id, {data_table}_concept_id as data_table_concept_id \ + FROM person \ + CROSS JOIN distinct_data_table_concept_ids \ + ), \ + long_format_backbone as ( \ + SELECT person_time_defining_table.person_id, data_table_concept_id, start_date, end_date \ + FROM person_time_defining_table \ + LEFT JOIN person_data_table USING(person_id)\ + ), \ + long_format_intervals as ( \ + SELECT person_id, data_table_concept_id, interval_step, start_date, start_date + interval_start_offset as interval_start, start_date + interval_end_offset as interval_end \ + FROM long_format_backbone \ + CROSS JOIN timedeltas \ + ) \ + SELECT lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end, {_generate_value_query(data_table, data_field_to_keep, AGGREGATION_STRATEGY_KEY[aggregation_strategy])} \ + FROM long_format_intervals as lfi \ + LEFT JOIN {data_table} ON lfi.person_id = {data_table}.person_id AND lfi.data_table_concept_id = {data_table}.{data_table}_concept_id AND {data_table}.{data_table}_date BETWEEN lfi.interval_start AND lfi.interval_end \ + GROUP BY lfi.person_id, lfi.data_table_concept_id, interval_step, interval_start, interval_end + """ + ).df() + + _drop_timedeltas(backend_handle) + + return df diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 2aaa872..88fc9c8 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -11,6 +11,7 @@ import pandas as pd from duckdb import DuckDBPyConnection +from ehrdata.io.omop._queries import time_interval_table_query_long_format from ehrdata.utils._omop_utils import get_omop_table_names @@ -24,7 +25,7 @@ def _check_sanity_of_database(backend_handle: duckdb.DuckDB): VALID_OBSERVATION_TABLES_SINGLE = ["person"] VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] -VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen", "note", "death"] +VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] def register_omop_to_db_connection( @@ -54,6 +55,7 @@ def register_omop_to_db_connection( def setup_obs( backend_handle: Literal[str, duckdb, Path], observation_table: Literal["person", "person_cohort", "person_observation_period", "person_visit_occurrence"], + death_table: bool = False, ): """Setup the observation table. @@ -69,16 +71,21 @@ def setup_obs( The backend handle to the database. observation_table The observation table to be used. + death_table + Whether to include the death table. The observation_table created will be left joined with the death table as the right table. Returns ------- An EHRData object with populated .obs field. """ + if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection): + raise ValueError("backend_handle must be a DuckDB connection.") + from ehrdata import EHRData if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: raise ValueError( - f"observation_table must be one of {[VALID_OBSERVATION_TABLES_SINGLE]+[VALID_OBSERVATION_TABLES_JOIN]}." + f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}." ) if observation_table in VALID_OBSERVATION_TABLES_SINGLE: @@ -86,20 +93,25 @@ def setup_obs( elif observation_table in VALID_OBSERVATION_TABLES_JOIN: if observation_table == "person_cohort": - obs = _get_table_left_join(backend_handle, "person", "cohort", right_key="subject_id") + obs = _get_table_join(backend_handle, "person", "cohort", right_key="subject_id") elif observation_table == "person_observation_period": - obs = _get_table_left_join(backend_handle, "person", "observation_period") + obs = _get_table_join(backend_handle, "person", "observation_period") elif observation_table == "person_visit_occurrence": - obs = _get_table_left_join(backend_handle, "person", "visit_occurrence") + obs = _get_table_join(backend_handle, "person", "visit_occurrence") + + if death_table: + death = get_table(backend_handle, "death") + obs = obs.merge(death, how="left", on="person_id") - return EHRData(obs=obs) + return EHRData(obs=obs, uns={"omop_io_observation_table": observation_table.split("person_")[-1]}) def setup_variables( edata, - backend_handle: Literal[str, duckdb, Path], - tables: Sequence[Literal["measurement", "observation", "procedure_occurrence", "specimen", "note"]], - start_time: Literal["observation_period_start"] | pd.Timestamp | str, + *, + backend_handle: duckdb.duckdb.DuckDBPyConnection, + data_tables: Sequence[Literal["measurement", "observation", "specimen"]], + data_field_to_keep: str | dict[str, str], interval_length_number: int, interval_length_unit: str, num_intervals: int, @@ -116,8 +128,11 @@ def setup_variables( The backend handle to the database. edata The EHRData object to which the variables should be added. - tables + data_tables The tables to be used. + data_field_to_keep + The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". + If multiple tables are used, this can be a dictionary with the table name as key and the column name as value, e.g. {"measurement": "value_as_number", "observation": "value_as_concept_id"}. start_time Starting time for values to be included. interval_length_number @@ -127,60 +142,83 @@ def setup_variables( num_intervals Number of intervals. concept_ids - Concept IDs to filter on or 'all'. + Concept IDs to use from this data table. If not specified, 'all' are used. aggregation_strategy - Strategy to use when aggregating data within intervals. + Strategy to use when aggregating multiple data points within one interval. Returns ------- An EHRData object with populated .r and .var field. """ - from ehrdata import EHRData - - concept_ids_present_list = [] time_interval_tables = [] - for table in tables: - if table not in VALID_VARIABLE_TABLES: - raise ValueError(f"tables must be a sequence of from [{VALID_VARIABLE_TABLES}].") - - id_column = f"{table}_type_concept_id" if table in ["note", "death"] else f"{table}_concept_id" - - concept_ids_present = _lowercase_column_names( - backend_handle.sql(f"SELECT DISTINCT {id_column} FROM {table}").df() - ) - - personxfeature_pairs_of_value_timestamp = _extract_personxfeature_pairs_of_value_timestamp(backend_handle) - - # Create the time interval table - time_interval_table = get_time_interval_table( - backend_handle, - personxfeature_pairs_of_value_timestamp, - edata.obs, - start_time="observation_period_start", - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - concept_ids=concept_ids, - aggregation_strategy=aggregation_strategy, + time_defining_table = edata.uns.get("omop_io_observation_table", None) + if time_defining_table is None: + raise ValueError("The observation table must be set up first, use the `setup_obs` function.") + + for data_table in data_tables: + ds = ( + time_interval_table_query_long_format( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_table, + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, + ) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() ) - - # Append - concept_ids_present_list.append(concept_ids_present) - time_interval_tables.append(time_interval_table) + # TODO: interval_start to var + # TODO: concept_ids to var + # TODO: concept_names to var + # TODO: for measurement, observation: store unit_concept_id and unit_name in var + time_interval_tables.append(ds) + + return ds + # for table in tables: + # if table not in VALID_VARIABLE_TABLES: + # raise ValueError(f"tables must be a sequence of from [{VALID_VARIABLE_TABLES}].") + + # id_column = f"{table}_type_concept_id" if table in ["note", "death"] else f"{table}_concept_id" + + # concept_ids_present = _lowercase_column_names( + # backend_handle.sql(f"SELECT DISTINCT {id_column} FROM {table}").df() + # ) + + # personxfeature_pairs_of_value_timestamp = _extract_personxfeature_pairs_of_value_timestamp(backend_handle) + + # # Create the time interval table + # time_interval_table = get_time_interval_table( + # backend_handle, + # personxfeature_pairs_of_value_timestamp, + # edata.obs, + # start_time="observation_period_start", + # interval_length_number=interval_length_number, + # interval_length_unit=interval_length_unit, + # num_intervals=num_intervals, + # concept_ids=concept_ids, + # aggregation_strategy=aggregation_strategy, + # ) + + # # Append + # concept_ids_present_list.append(concept_ids_present) + # time_interval_tables.append(time_interval_table) # Combine time interval tables - if len(time_interval_tables) > 1: - time_interval_table = np.concatenate([time_interval_table, time_interval_table], axis=1) - concept_ids_present = pd.concat(concept_ids_present_list) - else: - time_interval_table = time_interval_tables[0] - concept_ids_present = concept_ids_present_list[0] + # if len(time_interval_tables) > 1: + # time_interval_table = np.concatenate([time_interval_table, time_interval_table], axis=1) + # concept_ids_present = pd.concat(concept_ids_present_list) + # else: + # time_interval_table = time_interval_tables[0] + # concept_ids_present = concept_ids_present_list[0] - # Update edata with the new variables - edata = EHRData(r=time_interval_table, obs=edata.obs, var=concept_ids_present) + # # Update edata with the new variables + # edata = EHRData(r=time_interval_table, obs=edata.obs, var=concept_ids_present) - return edata + # return edata # DEVICE EXPOSURE and DRUG EXPOSURE NEEDS TO BE IMPLEMENTED BECAUSE THEY CONTAIN START DATE @@ -206,7 +244,7 @@ def get_table(duckdb_instance, table_name: str) -> pd.DataFrame: return _lowercase_column_names(duckdb_instance.sql(f"SELECT * FROM {table_name}").df()) -def _get_table_left_join( +def _get_table_join( duckdb_instance, table1: str, table2: str, left_key: str = "person_id", right_key: str = "person_id" ) -> pd.DataFrame: """Extract a table of an OMOP CDM Database.""" @@ -214,7 +252,7 @@ def _get_table_left_join( duckdb_instance.sql( f"SELECT * \ FROM {table1} as t1 \ - LEFT JOIN {table2} as t2 ON t1.{left_key} = t2.{right_key} \ + JOIN {table2} as t2 ON t1.{left_key} = t2.{right_key} \ " ).df() ) diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 5e501c1..a9f607e 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -1,40 +1,63 @@ -import duckdb +import re + import pytest import ehrdata as ed -from ehrdata.io.omop import register_omop_to_db_connection - -def test_register_omop_to_db_connection(): - register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=duckdb.connect(), source="csv") +# def test_register_omop_to_db_connection(): +# register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=duckdb.connect(), source="csv") +# TODO: add test for death argument @pytest.mark.parametrize( - "observation_table", ["person", "person_cohort", "person_observation_period", "person_visit_occurrence"] + "observation_table, expected_length, expected_obs_num_columns", + [ + ("person", 4, 18), + ("person_cohort", 3, 22), + ("person_observation_period", 3, 23), + ("person_visit_occurrence", 3, 35), + ], ) -def test_setup_obs(omop_connection_vanilla, observation_table): +def test_setup_obs(omop_connection_vanilla, observation_table, expected_length, expected_obs_num_columns): con = omop_connection_vanilla edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) assert isinstance(edata, ed.EHRData) + # 4 persons, only 3 are in cohort, or have observation period, or visit occurrence + assert len(edata) == expected_length + assert edata.obs.shape[1] == expected_obs_num_columns -@pytest.mark.parametrize("observation_table", ["perso"]) -def test_setup_obs_unknown_observation_table_argument(omop_connection_vanilla, observation_table): - con = omop_connection_vanilla - with pytest.raises(ValueError): - ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) +def test_setup_obs_invalid_backend_handle_argument(): + with pytest.raises(ValueError, match="backend_handle must be a DuckDB connection."): + ed.io.omop.setup_obs(backend_handle="not_a_con", observation_table="person") -def test_setup_obs_person(): - # check precise expected table - con = duckdb.connect() - register_omop_to_db_connection(path="../data/toy_omop/vanilla", backend_handle=con, source="csv") - con.close() + +def test_setup_obs_invalid_observation_table_argument(omop_connection_vanilla): + con = omop_connection_vanilla + with pytest.raises( + ValueError, + match=re.escape( + "observation_table must be one of ['person', 'person_cohort', 'person_observation_period', 'person_visit_occurrence']." + ), + ): + ed.io.omop.setup_obs(backend_handle=con, observation_table="perso") -def test_setup_var_measurement_startdate_fixed(): +def test_setup_variables_measurement_startdate_fixed(omop_connection_vanilla): + con = omop_connection_vanilla + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person") + ed.io.omop.setup_variables( + edata, + backend_handle=con, + tables=["measurement"], + start_time="2100-01-01", + interval_length_number=1, + interval_length_unit="day", + num_intervals=31, + ) # check precise expected table - pass + assert edata.vars.shape[1] == 8 def test_setup_var_measurement_startdate_observation_period(): From d3fca1fceae54099f1c0fecb9f1f01403574f7f8 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Tue, 29 Oct 2024 23:34:19 +0100 Subject: [PATCH 10/15] setup obs and variables duckdb backed, first proper minimal tests --- src/ehrdata/io/omop/omop.py | 143 ++++---------------------- tests/data/toy_omop/vanilla/death.csv | 3 + tests/test_io/test_omop.py | 66 ++++++------ 3 files changed, 55 insertions(+), 157 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/death.csv diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 88fc9c8..7e6525c 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -129,7 +129,7 @@ def setup_variables( edata The EHRData object to which the variables should be added. data_tables - The tables to be used. + The tables to be used. For now, only one can be used. data_field_to_keep The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". If multiple tables are used, this can be a dictionary with the table name as key and the column name as value, e.g. {"measurement": "value_as_number", "observation": "value_as_concept_id"}. @@ -150,78 +150,33 @@ def setup_variables( ------- An EHRData object with populated .r and .var field. """ - time_interval_tables = [] + from ehrdata import EHRData time_defining_table = edata.uns.get("omop_io_observation_table", None) if time_defining_table is None: raise ValueError("The observation table must be set up first, use the `setup_obs` function.") - for data_table in data_tables: - ds = ( - time_interval_table_query_long_format( - backend_handle=backend_handle, - time_defining_table=time_defining_table, - data_table=data_table, - data_field_to_keep=data_field_to_keep, - interval_length_number=interval_length_number, - interval_length_unit=interval_length_unit, - num_intervals=num_intervals, - aggregation_strategy=aggregation_strategy, - ) - .set_index(["person_id", "data_table_concept_id", "interval_step"]) - .to_xarray() + ds = ( + time_interval_table_query_long_format( + backend_handle=backend_handle, + time_defining_table=time_defining_table, + data_table=data_tables[0], + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + aggregation_strategy=aggregation_strategy, ) - # TODO: interval_start to var - # TODO: concept_ids to var - # TODO: concept_names to var - # TODO: for measurement, observation: store unit_concept_id and unit_name in var - time_interval_tables.append(ds) - - return ds - # for table in tables: - # if table not in VALID_VARIABLE_TABLES: - # raise ValueError(f"tables must be a sequence of from [{VALID_VARIABLE_TABLES}].") - - # id_column = f"{table}_type_concept_id" if table in ["note", "death"] else f"{table}_concept_id" - - # concept_ids_present = _lowercase_column_names( - # backend_handle.sql(f"SELECT DISTINCT {id_column} FROM {table}").df() - # ) - - # personxfeature_pairs_of_value_timestamp = _extract_personxfeature_pairs_of_value_timestamp(backend_handle) - - # # Create the time interval table - # time_interval_table = get_time_interval_table( - # backend_handle, - # personxfeature_pairs_of_value_timestamp, - # edata.obs, - # start_time="observation_period_start", - # interval_length_number=interval_length_number, - # interval_length_unit=interval_length_unit, - # num_intervals=num_intervals, - # concept_ids=concept_ids, - # aggregation_strategy=aggregation_strategy, - # ) - - # # Append - # concept_ids_present_list.append(concept_ids_present) - # time_interval_tables.append(time_interval_table) - - # Combine time interval tables - # if len(time_interval_tables) > 1: - # time_interval_table = np.concatenate([time_interval_table, time_interval_table], axis=1) - # concept_ids_present = pd.concat(concept_ids_present_list) - # else: - # time_interval_table = time_interval_tables[0] - # concept_ids_present = concept_ids_present_list[0] - - # # Update edata with the new variables - # edata = EHRData(r=time_interval_table, obs=edata.obs, var=concept_ids_present) + .set_index(["person_id", "data_table_concept_id", "interval_step"]) + .to_xarray() + ) - # return edata + var = ds["data_table_concept_id"].to_dataframe() + t = ds["interval_step"].to_dataframe() + edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t) -# DEVICE EXPOSURE and DRUG EXPOSURE NEEDS TO BE IMPLEMENTED BECAUSE THEY CONTAIN START DATE + return edata def load( @@ -258,66 +213,6 @@ def _get_table_join( ) -def _extract_personxfeature_pairs_of_value_timestamp( - duckdb_instance, table_name: str, concept_id_col: str, value_col: str, timestamp_col: str -): - """ - Generalized extraction function to extract data from an OMOP CDM table. - - Parameters - ---------- - duckdb_instance: duckdb.DuckDB - The DuckDB instance for querying the database. - table_name: str - The name of the table to extract data from (e.g., "measurement", "observation"). - concept_id_col: str - The name of the column that contains the concept IDs (e.g., "measurement_concept_id"). - value_col: str - The name of the column that contains the values (e.g., "value_as_number"). - timestamp_col: str - The name of the column that contains the timestamps (e.g., "measurement_datetime"). - - Returns - ------- - ak.Array - An Awkward Array with the structure: n_person x n_features x 2 (value, time). - """ - # Load the specified table - table_df = duckdb_instance.sql(f"SELECT * FROM {table_name}").df() - table_df = _lowercase_column_names(table_df) - - # Load the person table to get unique person IDs - person_id_df = _lowercase_column_names(duckdb_instance.sql("SELECT * FROM person").df()) - person_ids = person_id_df["person_id"].unique() - - # Get unique features (concept IDs) for the table - features = table_df[concept_id_col].unique() - - # Initialize the collection for all persons - person_collection = [] - - for person in person_ids: - person_as_list = [] - # Get rows for the current person - person_data = table_df[table_df["person_id"] == person] - - # For each feature, get values and timestamps - for feature in features: - feature_data = person_data[person_data[concept_id_col] == feature] - - # Extract the values and timestamps - feature_values = feature_data[value_col] - feature_timestamps = feature_data[timestamp_col] - - # Append values and timestamps for this feature - person_as_list.append([feature_values, feature_timestamps]) - - # Append this person's data to the collection - person_collection.append(person_as_list) - - return ak.Array(person_collection) - - def extract_measurement(duckdb_instance): """Extract a table of an OMOP CDM Database.""" return get_table( diff --git a/tests/data/toy_omop/vanilla/death.csv b/tests/data/toy_omop/vanilla/death.csv new file mode 100644 index 0000000..3475d47 --- /dev/null +++ b/tests/data/toy_omop/vanilla/death.csv @@ -0,0 +1,3 @@ +person_id,death_date,death_datetime,death_type_concept_id,cause_concept_id,cause_source_value,cause_source_concept_id +1,2100-03-31,2100-03-31 00:00:00,32817,0,0, +2,2100-03-31,2100-03-31 00:00:00,32817,0,0, diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index a9f607e..eca2c10 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -4,23 +4,23 @@ import ehrdata as ed -# def test_register_omop_to_db_connection(): -# register_omop_to_db_connection(path="tests/data/toy_omop/vanilla", backend_handle=duckdb.connect(), source="csv") - -# TODO: add test for death argument @pytest.mark.parametrize( - "observation_table, expected_length, expected_obs_num_columns", + "observation_table, death_table, expected_length, expected_obs_num_columns", [ - ("person", 4, 18), - ("person_cohort", 3, 22), - ("person_observation_period", 3, 23), - ("person_visit_occurrence", 3, 35), + ("person", False, 4, 18), + ("person", True, 4, 24), + ("person_cohort", False, 3, 22), + ("person_cohort", True, 3, 28), + ("person_observation_period", False, 3, 23), + ("person_observation_period", True, 3, 29), + ("person_visit_occurrence", False, 3, 35), + ("person_visit_occurrence", True, 3, 41), ], ) -def test_setup_obs(omop_connection_vanilla, observation_table, expected_length, expected_obs_num_columns): +def test_setup_obs(omop_connection_vanilla, observation_table, death_table, expected_length, expected_obs_num_columns): con = omop_connection_vanilla - edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table, death_table=death_table) assert isinstance(edata, ed.EHRData) # 4 persons, only 3 are in cohort, or have observation period, or visit occurrence @@ -44,32 +44,32 @@ def test_setup_obs_invalid_observation_table_argument(omop_connection_vanilla): ed.io.omop.setup_obs(backend_handle=con, observation_table="perso") -def test_setup_variables_measurement_startdate_fixed(omop_connection_vanilla): +@pytest.mark.parametrize( + "observation_table", + ["person_cohort", "person_observation_period", "person_visit_occurrence"], +) +@pytest.mark.parametrize( + "data_tables", + [["measurement"], ["observation"]], +) +@pytest.mark.parametrize( + "data_field_to_keep", + [["value_as_number"], ["value_as_concept_id"]], +) +def test_setup_variables(omop_connection_vanilla, observation_table, data_tables, data_field_to_keep): con = omop_connection_vanilla - edata = ed.io.omop.setup_obs(backend_handle=con, observation_table="person") - ed.io.omop.setup_variables( + edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) + edata = ed.io.omop.setup_variables( edata, backend_handle=con, - tables=["measurement"], - start_time="2100-01-01", + data_tables=data_tables, + data_field_to_keep=data_field_to_keep, interval_length_number=1, interval_length_unit="day", - num_intervals=31, + num_intervals=30, ) - # check precise expected table - assert edata.vars.shape[1] == 8 - - -def test_setup_var_measurement_startdate_observation_period(): - # check precise expected table - pass - -def test_setup_var_observation_startdate_fixed(): - # check precise expected table - pass - - -def test_setup_var_observation_startdate_observation_period(): - # check precise expected table - pass + assert isinstance(edata, ed.EHRData) + assert edata.n_obs == 3 + assert edata.n_vars == 2 + assert edata.r.shape[2] == 30 From 255d1f6451129fe1ff34f29eadc810da6dd0220f Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Tue, 29 Oct 2024 23:36:47 +0100 Subject: [PATCH 11/15] add xarray as dependency --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 3563b4b..35993ba 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -29,6 +29,7 @@ dependencies = [ "duckdb", # for debug logging (referenced from the issue template) "session-info", + "xarray", ] optional-dependencies.dev = [ "pre-commit", From b9794477a055d5dde442613b179f0605ab374db5 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Fri, 1 Nov 2024 18:03:50 +0100 Subject: [PATCH 12/15] update commit, unstable --- src/ehrdata/dt/datasets.py | 39 ++++--- src/ehrdata/io/omop/omop.py | 210 +++++++++++++++++++++++++++++++++--- tests/test_dt/test_dt.py | 24 +++++ tests/test_io/test_omop.py | 145 +++++++++++++++++++++++-- 4 files changed, 380 insertions(+), 38 deletions(-) create mode 100644 tests/test_dt/test_dt.py diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index 3623d7a..f996fe0 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -17,23 +17,36 @@ def _get_table_list() -> list: return flat_table_list -def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection) -> None: +def _set_up_duckdb(path: Path, backend_handle: DuckDBPyConnection, prefix: str = "") -> None: tables = _get_table_list() + used_tables = [] missing_tables = [] - for table in tables: - # if path exists lowercse, uppercase, capitalized: - table_path = f"{path}/{table}.csv" - if os.path.exists(table_path): - if table == "measurement": - backend_handle.register( - table, backend_handle.read_csv(f"{path}/{table}.csv", dtype={"measurement_source_value": str}) - ) + unused_files = [] + for file_name in os.listdir(path): + file_name_trunk = file_name.split(".")[0].lower() + + if file_name_trunk in tables or file_name_trunk.replace(prefix, "") in tables: + used_tables.append(file_name_trunk.replace(prefix, "")) + + if file_name_trunk == "measurement": + dtype = {"measurement_source_value": str} else: - backend_handle.register(table, backend_handle.read_csv(f"{path}/{table}.csv")) + dtype = None + + backend_handle.register( + file_name_trunk.replace(prefix, ""), + backend_handle.read_csv(f"{path}/{file_name_trunk}.csv", dtype=dtype), + ) else: - missing_tables.append([table]) + unused_files.append(file_name) + + for table in tables: + if table not in used_tables: + missing_tables.append(table) + print("missing tables: ", missing_tables) + print("unused files: ", unused_files) def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: @@ -80,8 +93,8 @@ def mimic_iv_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = N else: print(f"Failed to download the file. Status code: {response.status_code}") return - - return _set_up_duckdb(data_path + "/1_omop_data_csv", backend_handle) + # TODO: capitalization, and lowercase, and containing the name + return _set_up_duckdb(data_path + "/1_omop_data_csv", backend_handle, prefix="2b_") def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 7e6525c..8293ec6 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -9,11 +9,17 @@ import duckdb import numpy as np import pandas as pd -from duckdb import DuckDBPyConnection -from ehrdata.io.omop._queries import time_interval_table_query_long_format +from ehrdata.io.omop._queries import ( + AGGREGATION_STRATEGY_KEY, + time_interval_table_query_long_format, +) from ehrdata.utils._omop_utils import get_omop_table_names +VALID_OBSERVATION_TABLES_SINGLE = ["person"] +VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] +VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] + def _check_sanity_of_folder(folder_path: str | Path): pass @@ -23,14 +29,152 @@ def _check_sanity_of_database(backend_handle: duckdb.DuckDB): pass -VALID_OBSERVATION_TABLES_SINGLE = ["person"] -VALID_OBSERVATION_TABLES_JOIN = ["person_cohort", "person_observation_period", "person_visit_occurrence"] -VALID_VARIABLE_TABLES = ["measurement", "observation", "specimen"] +def _check_valid_backend_handle(backend_handle) -> None: + if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection): + raise TypeError("Expected backend_handle to be of type DuckDBPyConnection.") + + +def _check_valid_observation_table(observation_table) -> None: + if not isinstance(observation_table, str): + raise TypeError("Expected observation_table to be a string.") + if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: + raise ValueError( + f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}." + ) + + +def _check_valid_death_table(death_table) -> None: + if not isinstance(death_table, bool): + raise TypeError("Expected death_table to be a boolean.") + + +def _check_valid_edata(edata) -> None: + from ehrdata import EHRData + + if not isinstance(edata, EHRData): + raise TypeError("Expected edata to be of type EHRData.") + + +def _check_valid_data_tables(data_tables) -> Sequence: + if isinstance(data_tables, str): + data_tables = [data_tables] + if not isinstance(data_tables, Sequence): + raise TypeError("Expected data_tables to be a string or Sequence.") + if not all(table in VALID_VARIABLE_TABLES for table in data_tables): + raise ValueError(f"data_tables must be a subset of {VALID_VARIABLE_TABLES}.") + return data_tables + + +def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence: + if isinstance(data_field_to_keep, str): + data_field_to_keep = [data_field_to_keep] + if not isinstance(data_field_to_keep, Sequence) and not isinstance(data_field_to_keep, dict): + raise TypeError("Expected data_field_to_keep to be a string, Sequence, or dictionary.") + return data_field_to_keep + + +def _check_valid_interval_length_number(interval_length_number) -> None: + if not isinstance(interval_length_number, int): + raise TypeError("Expected interval_length_number to be an integer.") + + +def _check_valid_interval_length_unit(interval_length_unit) -> None: + # TODO: maybe check if it is a valid unit from pandas.to_timedelta + if not isinstance(interval_length_unit, str): + raise TypeError("Expected interval_length_unit to be a string.") + + +def _check_valid_num_intervals(num_intervals) -> None: + if not isinstance(num_intervals, int): + raise TypeError("Expected num_intervals to be an integer.") + + +def _check_valid_concept_ids(concept_ids) -> None: + if concept_ids != "all" and not isinstance(concept_ids, Sequence): + raise TypeError("concept_ids must be a sequence of integers or 'all'.") + + +def _check_valid_aggregation_strategy(aggregation_strategy) -> None: + if aggregation_strategy not in AGGREGATION_STRATEGY_KEY.keys(): + raise TypeError(f"aggregation_strategy must be one of {AGGREGATION_STRATEGY_KEY.keys()}.") + + +def _collect_units_per_feature(ds, unit_key="unit_concept_id") -> dict: + feature_units = {} + for i in range(ds[unit_key].shape[1]): + single_feature_units = ds[unit_key].isel({ds[unit_key].dims[1]: i}) + single_feature_units_flat = np.array(single_feature_units).flatten() + single_feature_units_unique = pd.unique(single_feature_units_flat[~pd.isna(single_feature_units_flat)]) + feature_units[i] = single_feature_units_unique + return feature_units + + +def _check_one_unit_per_feature(ds, unit_key="unit_concept_id") -> None: + feature_units = _collect_units_per_feature(ds, unit_key=unit_key) + num_units = np.array([len(units) for _, units in feature_units.items()]) + + # print(f"no units for features: {np.argwhere(num_units == 0)}") + print(f"multiple units for features: {np.argwhere(num_units > 1)}") + + +def _create_feature_unit_concept_id_report(backend_handle, ds) -> pd.DataFrame: + feature_units_concept = _collect_units_per_feature(ds, unit_key="unit_concept_id") + + feature_units_long_format = [] + for feature, units in feature_units_concept.items(): + if len(units) == 0: + feature_units_long_format.append({"concept_id": feature, "no_units": True, "multiple_units": False}) + elif len(units) > 1: + for unit in units: + feature_units_long_format.append( + { + "concept_id": feature, + "unit_concept_id": unit, + "no_units": False, + "multiple_units": True, + } + ) + else: + feature_units_long_format.append( + { + "concept_id": feature, + "unit_concept_id": units[0], + "no_units": False, + "multiple_units": False, + } + ) + + df = pd.DataFrame( + feature_units_long_format, columns=["concept_id", "unit_concept_id", "no_units", "multiple_units"] + ) + df["unit_concept_id"] = df["unit_concept_id"].astype("Int64") + + return df + + +def _create_enriched_var_table(backend_handle, ds, unit_report) -> pd.DataFrame: + feature_concept_id_table = ds["data_table_concept_id"].to_dataframe() + + feature_concept_id_unit_table = pd.merge( + feature_concept_id_table, unit_report, how="left", left_index=True, right_on="concept_id" + ) + + concepts = backend_handle.sql("SELECT * FROM concept").df() + + feature_concept_id_unit_info_table = pd.merge( + feature_concept_id_unit_table, + concepts, + how="left", + left_on="unit_concept_id", + right_on="concept_id", + ) + + return feature_concept_id_unit_info_table def register_omop_to_db_connection( path: Path, - backend_handle: DuckDBPyConnection, + backend_handle: duckdb.duckdb.DuckDBPyConnection, source: Literal["csv"] = "csv", ) -> None: """Register the OMOP CDM tables to the database.""" @@ -78,16 +222,12 @@ def setup_obs( ------- An EHRData object with populated .obs field. """ - if not isinstance(backend_handle, duckdb.duckdb.DuckDBPyConnection): - raise ValueError("backend_handle must be a DuckDB connection.") + _check_valid_backend_handle(backend_handle) + _check_valid_observation_table(observation_table) + _check_valid_death_table(death_table) from ehrdata import EHRData - if observation_table not in VALID_OBSERVATION_TABLES_SINGLE + VALID_OBSERVATION_TABLES_JOIN: - raise ValueError( - f"observation_table must be one of {VALID_OBSERVATION_TABLES_SINGLE+VALID_OBSERVATION_TABLES_JOIN}." - ) - if observation_table in VALID_OBSERVATION_TABLES_SINGLE: obs = get_table(backend_handle, observation_table) @@ -110,8 +250,9 @@ def setup_variables( edata, *, backend_handle: duckdb.duckdb.DuckDBPyConnection, - data_tables: Sequence[Literal["measurement", "observation", "specimen"]], - data_field_to_keep: str | dict[str, str], + data_tables: Sequence[Literal["measurement", "observation", "specimen"]] + | Literal["measurement", "observation", "specimen"], + data_field_to_keep: str | Sequence[str] | dict[str, str], interval_length_number: int, interval_length_unit: str, num_intervals: int, @@ -152,10 +293,31 @@ def setup_variables( """ from ehrdata import EHRData + _check_valid_edata(edata) + _check_valid_backend_handle(backend_handle) + data_tables = _check_valid_data_tables(data_tables) + data_field_to_keep = _check_valid_data_field_to_keep(data_field_to_keep) + _check_valid_interval_length_number(interval_length_number) + _check_valid_interval_length_unit(interval_length_unit) + _check_valid_num_intervals(num_intervals) + _check_valid_concept_ids(concept_ids) + _check_valid_aggregation_strategy(aggregation_strategy) + time_defining_table = edata.uns.get("omop_io_observation_table", None) if time_defining_table is None: raise ValueError("The observation table must be set up first, use the `setup_obs` function.") + if data_tables[0] in ["measurement", "observation"]: + # also keep unit_concept_id and unit_source_value; + if isinstance(data_field_to_keep, list): + data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"] + elif isinstance(data_field_to_keep, dict): + data_field_to_keep = { + k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items() + } + else: + raise ValueError + ds = ( time_interval_table_query_long_format( backend_handle=backend_handle, @@ -171,12 +333,26 @@ def setup_variables( .to_xarray() ) - var = ds["data_table_concept_id"].to_dataframe() + _check_one_unit_per_feature(ds) + # TODO ignore? go with more vanilla omop style. _check_one_unit_per_feature(ds, unit_key="unit_source_value") + + unit_report = _create_feature_unit_concept_id_report(backend_handle, ds) + # TODO: generate nice multiple-unit report + # TODO: add unit to var + # TODO: add unit name to var + # TODO: add feature name to var + + # TODO: test all of the above 5 + + # var = _create_var_table(backend_handle, unit_report) + + var = _create_enriched_var_table(backend_handle, ds, unit_report) + t = ds["interval_step"].to_dataframe() edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t) - return edata + return edata, unit_report def load( diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py new file mode 100644 index 0000000..5d71186 --- /dev/null +++ b/tests/test_dt/test_dt.py @@ -0,0 +1,24 @@ +import duckdb + +import ehrdata as ed + + +def test_mimic_iv_omop(): + con = duckdb.connect() + ed.dt.mimic_iv_omop(backend_handle=con) + assert len(con.execute("SHOW TABLES").df()) == 30 + con.close() + + +def test_gibleed_omop(): + con = duckdb.connect() + ed.dt.gibleed_omop(backend_handle=con) + assert len(con.execute("SHOW TABLES").df()) == 36 + con.close() + + +def test_synthea27nj_omop(): + con = duckdb.connect() + ed.dt.synthea27nj_omop(backend_handle=con) + assert len(con.execute("SHOW TABLES").df()) == 37 + con.close() diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index eca2c10..403d84e 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -4,6 +4,17 @@ import ehrdata as ed +# constants for toy_omop/vanilla +VANILLA_PERSONS_WITH_OBSERVATION_TABLE_ENTRY = { + "person_cohort": 3, + "person_observation_period": 3, + "person_visit_occurrence": 3, +} +VANILLA_NUM_CONCEPTS = { + "measurement": 2, + "observation": 2, +} + @pytest.mark.parametrize( "observation_table, death_table, expected_length, expected_obs_num_columns", @@ -28,12 +39,30 @@ def test_setup_obs(omop_connection_vanilla, observation_table, death_table, expe assert edata.obs.shape[1] == expected_obs_num_columns -def test_setup_obs_invalid_backend_handle_argument(): - with pytest.raises(ValueError, match="backend_handle must be a DuckDB connection."): - ed.io.omop.setup_obs(backend_handle="not_a_con", observation_table="person") +@pytest.mark.parametrize( + "backend_handle, observation_table, death_table, expected_error", + [ + ("wrong_type", "person", False, "Expected backend_handle to be of type DuckDBPyConnection."), + (None, 123, False, "Expected observation_table to be a string."), + (None, "person", "wrong_type", "Expected death_table to be a boolean."), + ], +) +def test_setup_obs_illegal_argument_types( + omop_connection_vanilla, + backend_handle, + observation_table, + death_table, + expected_error, +): + with pytest.raises(TypeError, match=expected_error): + ed.io.omop.setup_obs( + backend_handle=backend_handle or omop_connection_vanilla, + observation_table=observation_table, + death_table=death_table, + ) -def test_setup_obs_invalid_observation_table_argument(omop_connection_vanilla): +def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): con = omop_connection_vanilla with pytest.raises( ValueError, @@ -57,6 +86,7 @@ def test_setup_obs_invalid_observation_table_argument(omop_connection_vanilla): [["value_as_number"], ["value_as_concept_id"]], ) def test_setup_variables(omop_connection_vanilla, observation_table, data_tables, data_field_to_keep): + num_intervals = 4 con = omop_connection_vanilla edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) edata = ed.io.omop.setup_variables( @@ -66,10 +96,109 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables data_field_to_keep=data_field_to_keep, interval_length_number=1, interval_length_unit="day", - num_intervals=30, + num_intervals=num_intervals, ) assert isinstance(edata, ed.EHRData) - assert edata.n_obs == 3 - assert edata.n_vars == 2 - assert edata.r.shape[2] == 30 + assert edata.n_obs == VANILLA_PERSONS_WITH_OBSERVATION_TABLE_ENTRY[observation_table] + assert edata.n_vars == VANILLA_NUM_CONCEPTS[data_tables[0]] + assert edata.r.shape[2] == num_intervals + + +@pytest.mark.parametrize( + "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, expected_error", + [ + ( + "wrong_type", + None, + ["measurement"], + ["value_as_number"], + 1, + "day", + 4, + "Expected edata to be of type EHRData.", + ), + ( + None, + "wrong_type", + ["measurement"], + ["value_as_number"], + 1, + "day", + 4, + "Expected backend_handle to be of type DuckDBPyConnection.", + ), + ( + None, + None, + 123, + ["value_as_number"], + 1, + "day", + 4, + "Expected data_tables to be a string or Sequence.", + ), + ( + None, + None, + ["measurement"], + 123, + 1, + "day", + 4, + "Expected data_field_to_keep to be a string, Sequence, or dictionary.", + ), + ( + None, + None, + ["measurement"], + ["value_as_number"], + "wrong_type", + "day", + 4, + "Expected interval_length_number to be an integer.", + ), + ( + None, + None, + ["measurement"], + ["value_as_number"], + 1, + 123, + 4, + "Expected interval_length_unit to be a string.", + ), + ( + None, + None, + ["measurement"], + ["value_as_number"], + 1, + "day", + "wrong_type", + "Expected num_intervals to be an integer.", + ), + ], +) +def test_setup_variables_illegal_argument_types( + omop_connection_vanilla, + edata, + backend_handle, + data_tables, + data_field_to_keep, + interval_length_number, + interval_length_unit, + num_intervals, + expected_error, +): + con = omop_connection_vanilla + with pytest.raises(TypeError, match=expected_error): + ed.io.omop.setup_variables( + edata or ed.io.omop.setup_obs(backend_handle=omop_connection_vanilla, observation_table="person_cohort"), + backend_handle=backend_handle or con, + data_tables=data_tables, + data_field_to_keep=data_field_to_keep, + interval_length_number=interval_length_number, + interval_length_unit=interval_length_unit, + num_intervals=num_intervals, + ) From a5cac1d79f88fd015abd49c56cf5d38b45140b5e Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sat, 2 Nov 2024 22:21:55 +0100 Subject: [PATCH 13/15] setup_var with basic functionality on units, more tests and better description --- src/ehrdata/io/omop/omop.py | 78 ++++++++++++++++++------- tests/data/toy_omop/vanilla/concept.csv | 1 + tests/test_io/test_omop.py | 76 +++++++++++++++++++++++- 3 files changed, 132 insertions(+), 23 deletions(-) create mode 100644 tests/data/toy_omop/vanilla/concept.csv diff --git a/src/ehrdata/io/omop/omop.py b/src/ehrdata/io/omop/omop.py index 8293ec6..6034b17 100644 --- a/src/ehrdata/io/omop/omop.py +++ b/src/ehrdata/io/omop/omop.py @@ -68,8 +68,8 @@ def _check_valid_data_tables(data_tables) -> Sequence: def _check_valid_data_field_to_keep(data_field_to_keep) -> Sequence: if isinstance(data_field_to_keep, str): data_field_to_keep = [data_field_to_keep] - if not isinstance(data_field_to_keep, Sequence) and not isinstance(data_field_to_keep, dict): - raise TypeError("Expected data_field_to_keep to be a string, Sequence, or dictionary.") + if not isinstance(data_field_to_keep, Sequence): + raise TypeError("Expected data_field_to_keep to be a string or Sequence.") return data_field_to_keep @@ -99,13 +99,23 @@ def _check_valid_aggregation_strategy(aggregation_strategy) -> None: raise TypeError(f"aggregation_strategy must be one of {AGGREGATION_STRATEGY_KEY.keys()}.") +def _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) -> None: + if not isinstance(enrich_var_with_feature_info, bool): + raise TypeError("Expected enrich_var_with_feature_info to be a boolean.") + + +def _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) -> None: + if not isinstance(enrich_var_with_unit_info, bool): + raise TypeError("Expected enrich_var_with_unit_info to be a boolean.") + + def _collect_units_per_feature(ds, unit_key="unit_concept_id") -> dict: feature_units = {} for i in range(ds[unit_key].shape[1]): single_feature_units = ds[unit_key].isel({ds[unit_key].dims[1]: i}) single_feature_units_flat = np.array(single_feature_units).flatten() single_feature_units_unique = pd.unique(single_feature_units_flat[~pd.isna(single_feature_units_flat)]) - feature_units[i] = single_feature_units_unique + feature_units[ds["data_table_concept_id"][i].item()] = single_feature_units_unique return feature_units @@ -152,8 +162,8 @@ def _create_feature_unit_concept_id_report(backend_handle, ds) -> pd.DataFrame: return df -def _create_enriched_var_table(backend_handle, ds, unit_report) -> pd.DataFrame: - feature_concept_id_table = ds["data_table_concept_id"].to_dataframe() +def _create_enriched_var_with_unit_info(backend_handle, ds, var, unit_report) -> pd.DataFrame: + feature_concept_id_table = var # ds["data_table_concept_id"].to_dataframe() feature_concept_id_unit_table = pd.merge( feature_concept_id_table, unit_report, how="left", left_index=True, right_on="concept_id" @@ -252,16 +262,20 @@ def setup_variables( backend_handle: duckdb.duckdb.DuckDBPyConnection, data_tables: Sequence[Literal["measurement", "observation", "specimen"]] | Literal["measurement", "observation", "specimen"], - data_field_to_keep: str | Sequence[str] | dict[str, str], + data_field_to_keep: str | Sequence[str], interval_length_number: int, interval_length_unit: str, num_intervals: int, concept_ids: Literal["all"] | Sequence = "all", aggregation_strategy: str = "last", + enrich_var_with_feature_info: bool = False, + enrich_var_with_unit_info: bool = False, ): """Setup the variables. This function sets up the variables for the EHRData object. + It will fail if there is more than one unit_concept_id per feature. + Writes a unit report of the features to edata.uns["unit_report_"]. Parameters ---------- @@ -270,10 +284,9 @@ def setup_variables( edata The EHRData object to which the variables should be added. data_tables - The tables to be used. For now, only one can be used. + The table to be used. Only a single table can be used. data_field_to_keep The CDM Field in the data table to be kept. Can be e.g. "value_as_number" or "value_as_concept_id". - If multiple tables are used, this can be a dictionary with the table name as key and the column name as value, e.g. {"measurement": "value_as_number", "observation": "value_as_concept_id"}. start_time Starting time for values to be included. interval_length_number @@ -286,6 +299,10 @@ def setup_variables( Concept IDs to use from this data table. If not specified, 'all' are used. aggregation_strategy Strategy to use when aggregating multiple data points within one interval. + enrich_var_with_feature_info + Whether to enrich the var table with feature information. If a concept_id is not found in the concept table, the feature information will be NaN. + enrich_var_with_unit_info + Whether to enrich the var table with unit information. Raises an Error if a) multiple units per feature are found for at least one feature. If a concept_id is not found in the concept table, the feature information will be NaN. Returns ------- @@ -302,6 +319,8 @@ def setup_variables( _check_valid_num_intervals(num_intervals) _check_valid_concept_ids(concept_ids) _check_valid_aggregation_strategy(aggregation_strategy) + _check_valid_enrich_var_with_feature_info(enrich_var_with_feature_info) + _check_valid_enrich_var_with_unit_info(enrich_var_with_unit_info) time_defining_table = edata.uns.get("omop_io_observation_table", None) if time_defining_table is None: @@ -311,10 +330,11 @@ def setup_variables( # also keep unit_concept_id and unit_source_value; if isinstance(data_field_to_keep, list): data_field_to_keep = list(data_field_to_keep) + ["unit_concept_id", "unit_source_value"] - elif isinstance(data_field_to_keep, dict): - data_field_to_keep = { - k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items() - } + # TODO: use in future version when more than one data table can be used + # elif isinstance(data_field_to_keep, dict): + # data_field_to_keep = { + # k: v + ["unit_concept_id", "unit_source_value"] for k, v in data_field_to_keep.items() + # } else: raise ValueError @@ -337,22 +357,40 @@ def setup_variables( # TODO ignore? go with more vanilla omop style. _check_one_unit_per_feature(ds, unit_key="unit_source_value") unit_report = _create_feature_unit_concept_id_report(backend_handle, ds) - # TODO: generate nice multiple-unit report - # TODO: add unit to var - # TODO: add unit name to var - # TODO: add feature name to var - # TODO: test all of the above 5 + var = ds["data_table_concept_id"].to_dataframe() + concepts = backend_handle.sql("SELECT * FROM concept").df() - # var = _create_var_table(backend_handle, unit_report) + if enrich_var_with_feature_info: + var = pd.merge(var, concepts, how="left", left_index=True, right_on="concept_id") - var = _create_enriched_var_table(backend_handle, ds, unit_report) + if enrich_var_with_unit_info: + if unit_report["multiple_units"].sum() > 0: + raise ValueError("Multiple units per feature found. Enrichment with feature information not possible.") + else: + var = pd.merge( + var, + unit_report, + how="left", + left_index=True, + right_on="unit_concept_id", + suffixes=("", "_unit"), + ) + var = pd.merge( + var, + concepts, + how="left", + left_on="unit_concept_id", + right_on="concept_id", + suffixes=("", "_unit"), + ) t = ds["interval_step"].to_dataframe() edata = EHRData(r=ds[data_field_to_keep[0]].values, obs=edata.obs, var=var, uns=edata.uns, t=t) + edata.uns[f"unit_report_{data_tables[0]}"] = unit_report - return edata, unit_report + return edata def load( diff --git a/tests/data/toy_omop/vanilla/concept.csv b/tests/data/toy_omop/vanilla/concept.csv new file mode 100644 index 0000000..6ca864c --- /dev/null +++ b/tests/data/toy_omop/vanilla/concept.csv @@ -0,0 +1 @@ +concept_id,concept_name,domain_id,vocabulary_id,concept_class_id,standard_concept,concept_code,valid_start_DATE,valid_end_DATE,invalid_reason diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index 403d84e..d83aed6 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -15,6 +15,16 @@ "observation": 2, } +# constants for setup_variables +# only data_table_concept_id +VAR_DIM_BASE = 1 +# number of columns in concept table +NUMBER_COLUMNS_CONCEPT_TABLE = 10 +VAR_DIM_FEATURE_INFO = NUMBER_COLUMNS_CONCEPT_TABLE +# number of columns in concept table + number of columns +NUMBER_COLUMNS_FEATURE_REPORT = 4 +VAR_DIM_UNIT_INFO = NUMBER_COLUMNS_CONCEPT_TABLE + NUMBER_COLUMNS_FEATURE_REPORT + @pytest.mark.parametrize( "observation_table, death_table, expected_length, expected_obs_num_columns", @@ -85,7 +95,22 @@ def test_setup_obs_invalid_observation_table_value(omop_connection_vanilla): "data_field_to_keep", [["value_as_number"], ["value_as_concept_id"]], ) -def test_setup_variables(omop_connection_vanilla, observation_table, data_tables, data_field_to_keep): +@pytest.mark.parametrize( + "enrich_var_with_feature_info", + [True, False], +) +@pytest.mark.parametrize( + "enrich_var_with_unit_info", + [True, False], +) +def test_setup_variables( + omop_connection_vanilla, + observation_table, + data_tables, + data_field_to_keep, + enrich_var_with_feature_info, + enrich_var_with_unit_info, +): num_intervals = 4 con = omop_connection_vanilla edata = ed.io.omop.setup_obs(backend_handle=con, observation_table=observation_table) @@ -97,16 +122,21 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables interval_length_number=1, interval_length_unit="day", num_intervals=num_intervals, + enrich_var_with_feature_info=enrich_var_with_feature_info, + enrich_var_with_unit_info=enrich_var_with_unit_info, ) assert isinstance(edata, ed.EHRData) assert edata.n_obs == VANILLA_PERSONS_WITH_OBSERVATION_TABLE_ENTRY[observation_table] assert edata.n_vars == VANILLA_NUM_CONCEPTS[data_tables[0]] assert edata.r.shape[2] == num_intervals + assert edata.var.shape[1] == VAR_DIM_BASE + (VAR_DIM_FEATURE_INFO if enrich_var_with_feature_info else 0) + ( + VAR_DIM_UNIT_INFO if enrich_var_with_unit_info else 0 + ) @pytest.mark.parametrize( - "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, expected_error", + "edata, backend_handle, data_tables, data_field_to_keep, interval_length_number, interval_length_unit, num_intervals, enrich_var_with_feature_info, enrich_var_with_unit_info, expected_error", [ ( "wrong_type", @@ -116,6 +146,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables 1, "day", 4, + False, + False, "Expected edata to be of type EHRData.", ), ( @@ -126,6 +158,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables 1, "day", 4, + False, + False, "Expected backend_handle to be of type DuckDBPyConnection.", ), ( @@ -136,6 +170,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables 1, "day", 4, + False, + False, "Expected data_tables to be a string or Sequence.", ), ( @@ -146,7 +182,9 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables 1, "day", 4, - "Expected data_field_to_keep to be a string, Sequence, or dictionary.", + False, + False, + "Expected data_field_to_keep to be a string or Sequence.", ), ( None, @@ -156,6 +194,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables "wrong_type", "day", 4, + False, + False, "Expected interval_length_number to be an integer.", ), ( @@ -166,6 +206,8 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables 1, 123, 4, + False, + False, "Expected interval_length_unit to be a string.", ), ( @@ -176,8 +218,34 @@ def test_setup_variables(omop_connection_vanilla, observation_table, data_tables 1, "day", "wrong_type", + False, + False, "Expected num_intervals to be an integer.", ), + ( + None, + None, + ["measurement"], + ["value_as_number"], + 1, + "day", + 123, + "wrong_type", + False, + "Expected enrich_var_with_feature_info to be a boolean.", + ), + ( + None, + None, + ["measurement"], + ["value_as_number"], + 1, + "day", + 123, + False, + "wrong_type", + "Expected enrich_var_with_unit_info to be a boolean.", + ), ], ) def test_setup_variables_illegal_argument_types( @@ -189,6 +257,8 @@ def test_setup_variables_illegal_argument_types( interval_length_number, interval_length_unit, num_intervals, + enrich_var_with_feature_info, + enrich_var_with_unit_info, expected_error, ): con = omop_connection_vanilla From 270f6bb91e6175930cad45b0363071219eee849d Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sat, 2 Nov 2024 22:31:41 +0100 Subject: [PATCH 14/15] fix test illegal args, check other option for gibleed --- src/ehrdata/dt/datasets.py | 4 ++-- tests/test_io/test_omop.py | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/ehrdata/dt/datasets.py b/src/ehrdata/dt/datasets.py index f996fe0..33545be 100644 --- a/src/ehrdata/dt/datasets.py +++ b/src/ehrdata/dt/datasets.py @@ -145,9 +145,9 @@ def gibleed_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = No else: print(f"Failed to download the file. Status code: {response.status_code}") - extracted_folder = next(data_path.iterdir(), data_path) + # extracted_folder = next(data_path.iterdir(), data_path) # extracted_folder = next((folder for folder in data_path.iterdir() if folder.is_dir() and "_csv" in folder.name and "__MACOSX" not in folder.name), data_path) - return _set_up_duckdb(extracted_folder, backend_handle) + return _set_up_duckdb(data_path / "GiBleed_5.3", backend_handle) def synthea27nj_omop(backend_handle: DuckDBPyConnection, data_path: Path | None = None) -> None: diff --git a/tests/test_io/test_omop.py b/tests/test_io/test_omop.py index d83aed6..68ed0fc 100644 --- a/tests/test_io/test_omop.py +++ b/tests/test_io/test_omop.py @@ -271,4 +271,6 @@ def test_setup_variables_illegal_argument_types( interval_length_number=interval_length_number, interval_length_unit=interval_length_unit, num_intervals=num_intervals, + enrich_var_with_feature_info=enrich_var_with_feature_info, + enrich_var_with_unit_info=enrich_var_with_unit_info, ) From 51d2172c7492ef036b327bd4a9173bbfc0dd33a2 Mon Sep 17 00:00:00 2001 From: Eljas Roellin Date: Sat, 2 Nov 2024 22:36:56 +0100 Subject: [PATCH 15/15] stop there w/ tests for this PR --- tests/test_dt/test_dt.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_dt/test_dt.py b/tests/test_dt/test_dt.py index 5d71186..72fa7a3 100644 --- a/tests/test_dt/test_dt.py +++ b/tests/test_dt/test_dt.py @@ -10,15 +10,16 @@ def test_mimic_iv_omop(): con.close() -def test_gibleed_omop(): - con = duckdb.connect() - ed.dt.gibleed_omop(backend_handle=con) - assert len(con.execute("SHOW TABLES").df()) == 36 - con.close() +# TODO +# def test_gibleed_omop(): +# con = duckdb.connect() +# ed.dt.gibleed_omop(backend_handle=con) +# assert len(con.execute("SHOW TABLES").df()) == 36 +# con.close() -def test_synthea27nj_omop(): - con = duckdb.connect() - ed.dt.synthea27nj_omop(backend_handle=con) - assert len(con.execute("SHOW TABLES").df()) == 37 - con.close() +# def test_synthea27nj_omop(): +# con = duckdb.connect() +# ed.dt.synthea27nj_omop(backend_handle=con) +# assert len(con.execute("SHOW TABLES").df()) == 37 +# con.close()