From b1ac9a58c8ee681a91bb0213f16f44752951dccc Mon Sep 17 00:00:00 2001 From: SphtKr Date: Wed, 26 Apr 2023 00:41:15 +0000 Subject: [PATCH 01/20] First "working" version (all SQL runs except for a few pivot concepts)...committing before gutting dead code. --- .../duckdb/concepts/icustay_hours.sql | 21 + .../buildmimic/duckdb/duckdb_concepts.py | 425 ++++++++++++++++++ 2 files changed, 446 insertions(+) create mode 100644 mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql create mode 100644 mimic-iii/buildmimic/duckdb/duckdb_concepts.py diff --git a/mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql b/mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql new file mode 100644 index 000000000..71ecfc1c4 --- /dev/null +++ b/mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql @@ -0,0 +1,21 @@ +WITH all_hours AS ( + SELECT + it.icustay_id, /* ceiling the intime to the nearest hour by adding 59 minutes then truncating */ + DATE_TRUNC('hour', it.intime_hr + INTERVAL '59' minute) AS endtime, /* create integers for each charttime in hours from admission */ + /* so 0 is admission time, 1 is one hour after admission, etc, up to ICU disch */ + GENERATE_SERIES( + -24, + CAST(CEIL(EXTRACT(EPOCH FROM it.outtime_hr - it.intime_hr) / 60.0 / 60.0) AS INT) + ) AS hr + FROM icustay_times AS it +) +SELECT + ah.icustay_id, + unnest(ah.hr) as hr, + /* endtime now indexes the end time of every hour for each patient */ + unnest(list_transform(hr, ahr -> ah.endtime + ahr*INTERVAL 1 hour)) AS endtime + --ah.endtime+ hr*INTERVAL 1 hour as endtime +FROM all_hours AS ah +ORDER BY + ah.icustay_id NULLS LAST +limit 20 \ No newline at end of file diff --git a/mimic-iii/buildmimic/duckdb/duckdb_concepts.py b/mimic-iii/buildmimic/duckdb/duckdb_concepts.py new file mode 100644 index 000000000..b14c53365 --- /dev/null +++ b/mimic-iii/buildmimic/duckdb/duckdb_concepts.py @@ -0,0 +1,425 @@ +import cProfile + +import sys +import os +import re +import argparse + +import duckdb +import datetime + +#import sqlparse +import sqlglot +import sqlglot.dialects.bigquery +import sqlglot.dialects.duckdb +from sqlglot import exp, generator, parser, tokens, transforms +from sqlglot.helper import seq_get + +from pprint import pprint + +concept_name_map = { + #'icustay_times': {"path": "../../concepts_postgres/demographics/icustay_times.sql"}, + 'icustay_times': {"path": "../../concepts/demographics/icustay_times.sql", "db": "bigquery"}, + #'icustay_hours': {"path": "../../concepts/demographics/icustay_hours.sql", "db": "bigquery"}, + 'icustay_hours': {"path": "./concepts/icustay_hours.sql", "db": "duckdb"}, + 'echo_data': {"path": "../../concepts/echo_data.sql", "db": "bigquery"}, + #'code_status': {"path": "../../concepts_postgres/code_status.sql"}, + 'code_status': {"path": "../../concepts/code_status.sql", "db": "bigquery"}, + 'weight_durations': {"path": "../../concepts/durations/weight_durations.sql", "db": "bigquery"}, + #'rrt': {"path": "../../concepts_postgres/rrt.sql"}, + 'rrt': {"path": "../../concepts/rrt.sql", "db": "bigquery"}, + 'heightweight': {"path": "../../concepts/demographics/heightweight.sql", "db": "bigquery"}, + 'icustay_detail': {"path": "../../concepts/demographics/icustay_detail.sql", "db": "bigquery"}, + + 'ventilation_classification': {"path": "../../concepts/durations/ventilation_classification.sql", "db": "bigquery"}, + 'ventilation_durations': {"path": "../../concepts/durations/ventilation_durations.sql", "db": "bigquery"}, + 'crrt_durations': {"path": "../../concepts/durations/crrt_durations.sql", 
"db": "bigquery"}, + 'adenosine_durations': {"path": "../../concepts/durations/adenosine_durations.sql", "db": "bigquery"}, + 'dobutamine_durations': {"path": "../../concepts/durations/dobutamine_durations.sql", "db": "bigquery"}, + 'dopamine_durations': {"path": "../../concepts/durations/dopamine_durations.sql", "db": "bigquery"}, + 'epinephrine_durations': {"path": "../../concepts/durations/epinephrine_durations.sql", "db": "bigquery"}, + 'isuprel_durations': {"path": "../../concepts/durations/isuprel_durations.sql", "db": "bigquery"}, + 'milrinone_durations': {"path": "../../concepts/durations/milrinone_durations.sql", "db": "bigquery"}, + 'norepinephrine_durations': {"path": "../../concepts/durations/norepinephrine_durations.sql", "db": "bigquery"}, + 'phenylephrine_durations': {"path": "../../concepts/durations/phenylephrine_durations.sql", "db": "bigquery"}, + 'vasopressin_durations': {"path": "../../concepts/durations/vasopressin_durations.sql", "db": "bigquery"}, + 'vasopressor_durations': {"path": "../../concepts/durations/vasopressor_durations.sql", "db": "bigquery"}, + # move weight_durations here + + 'dobutamine_dose': {"path": "../../concepts/durations/dobutamine_dose.sql", "db": "bigquery"}, + 'dopamine_dose': {"path": "../../concepts/durations/dopamine_dose.sql", "db": "bigquery"}, + 'epinephrine_dose': {"path": "../../concepts/durations/epinephrine_dose.sql", "db": "bigquery"}, + 'norepinephrine_dose': {"path": "../../concepts/durations/norepinephrine_dose.sql", "db": "bigquery"}, + 'phenylephrine_dose': {"path": "../../concepts/durations/phenylephrine_dose.sql", "db": "bigquery"}, + 'vasopressin_dose': {"path": "../../concepts/durations/vasopressin_dose.sql", "db": "bigquery"}, + + 'pivoted_vital': {"path": "../../concepts/pivot/pivoted_vital.sql", "db": "bigquery"}, + 'pivoted_uo': {"path": "../../concepts/pivot/pivoted_uo.sql", "db": "bigquery"}, + 'pivoted_rrt': {"path": "../../concepts/pivot/pivoted_rrt.sql", "db": "bigquery"}, + 'pivoted_lab': {"path": "../../concepts/pivot/pivoted_lab.sql", "db": "bigquery"}, + 'pivoted_invasive_lines': {"path": "../../concepts/pivot/pivoted_invasive_lines.sql", "db": "bigquery"}, + 'pivoted_icp': {"path": "../../concepts/pivot/pivoted_icp.sql", "db": "bigquery"}, + 'pivoted_height': {"path": "../../concepts/pivot/pivoted_height.sql", "db": "bigquery"}, + 'pivoted_gcs': {"path": "../../concepts/pivot/pivoted_gcs.sql", "db": "bigquery"}, + 'pivoted_fio2': {"path": "../../concepts/pivot/pivoted_fio2.sql", "db": "bigquery"}, + 'pivoted_bg': {"path": "../../concepts/pivot/pivoted_bg.sql", "db": "bigquery"}, + # pivoted_bg_art must be run after pivoted_bg + 'pivoted_bg_art': {"path": "../../concepts/pivot/pivoted_bg_art.sql", "db": "bigquery"}, + # Difficult error here, the original query seems to reference something non-existent... + # the `pivot` queries are omitted from the Postgres version... we may have to do the same? + # pivoted oasis depends on icustay_hours in demographics + #'pivoted_oasis': {"path": "../../concepts/pivot/pivoted_oasis.sql", "db": "bigquery"}, + # Another puzzling error here, duckdb doesn't like something on the `WITH` line! 
+ # pivoted sofa depends on many above pivoted views, ventilation_durations, and dose queries + #'pivoted_sofa': {"path": "../../concepts/pivot/pivoted_sofa.sql", "db": "bigquery"}, + + 'elixhauser_ahrq_v37': {"path": "../../concepts/comorbidity/elixhauser_ahrq_v37.sql", "db": "bigquery"}, + 'elixhauser_ahrq_v37_no_drg': {"path": "../../concepts/comorbidity/elixhauser_ahrq_v37_no_drg.sql", "db": "bigquery"}, + 'elixhauser_quan': {"path": "../../concepts/comorbidity/elixhauser_quan.sql", "db": "bigquery"}, + 'elixhauser_score_ahrq': {"path": "../../concepts/comorbidity/elixhauser_score_ahrq.sql", "db": "bigquery"}, + 'elixhauser_score_quan': {"path": "../../concepts/comorbidity/elixhauser_score_quan.sql", "db": "bigquery"}, + + 'blood_gas_first_day': {"path": "../../concepts/firstday/blood_gas_first_day.sql", "db": "bigquery"}, + 'blood_gas_first_day_arterial': {"path": "../../concepts/firstday/blood_gas_first_day_arterial.sql", "db": "bigquery"}, + 'gcs_first_day': {"path": "../../concepts/firstday/gcs_first_day.sql", "db": "bigquery"}, + 'labs_first_day': {"path": "../../concepts/firstday/labs_first_day.sql", "db": "bigquery"}, + 'rrt_first_day': {"path": "../../concepts/firstday/rrt_first_day.sql", "db": "bigquery"}, + 'urine_output_first_day': {"path": "../../concepts/firstday/urine_output_first_day.sql", "db": "bigquery"}, + 'ventilation_first_day': {"path": "../../concepts/firstday/ventilation_first_day.sql", "db": "bigquery"}, + 'vitals_first_day': {"path": "../../concepts/firstday/vitals_first_day.sql", "db": "bigquery"}, + 'weight_first_day': {"path": "../../concepts/firstday/weight_first_day.sql", "db": "bigquery"}, + + 'urine_output': {"path": "../../concepts/fluid_balance/urine_output.sql", "db": "bigquery"}, + + 'angus': {"path": "../../concepts/sepsis/angus.sql", "db": "bigquery"}, + 'martin': {"path": "../../concepts/sepsis/martin.sql", "db": "bigquery"}, + 'explicit': {"path": "../../concepts/sepsis/explicit.sql", "db": "bigquery"}, + + #FIXME: Must load ccs_multi_dx lookup table first! 
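+    # (The load is sketched in main() below: the CREATE TABLE ccs_multi_dx DDL
+    #  followed by COPY ccs_multi_dx FROM '.../ccs_multi_dx.csv.gz'
+    #  (FORMAT CSV, DELIMITER ',', HEADER); both are still commented out there.)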
+ 'ccs_dx': {"path": "../../concepts/diagnosis/ccs_dx.sql", "db": "bigquery"}, + + 'kdigo_creatinine': {"path": "../../concepts/organfailure/kdigo_creatinine.sql", "db": "bigquery"}, + 'kdigo_uo': {"path": "../../concepts/organfailure/kdigo_uo.sql", "db": "bigquery"}, + 'kdigo_stages': {"path": "../../concepts/organfailure/kdigo_stages.sql", "db": "bigquery"}, + 'kdigo_stages_7day': {"path": "../../concepts/organfailure/kdigo_stages_7day.sql", "db": "bigquery"}, + 'kdigo_stages_48hr': {"path": "../../concepts/organfailure/kdigo_stages_48hr.sql", "db": "bigquery"}, + 'meld': {"path": "../../concepts/organfailure/meld.sql", "db": "bigquery"}, + + 'oasis': {"path": "../../concepts/severityscores/oasis.sql", "db": "bigquery"}, + 'sofa': {"path": "../../concepts/severityscores/sofa.sql", "db": "bigquery"}, + 'saps': {"path": "../../concepts/severityscores/saps.sql", "db": "bigquery"}, + 'sapsii': {"path": "../../concepts/severityscores/sapsii.sql", "db": "bigquery"}, + 'apsiii': {"path": "../../concepts/severityscores/apsiii.sql", "db": "bigquery"}, + 'lods': {"path": "../../concepts/severityscores/lods.sql", "db": "bigquery"}, + 'sirs': {"path": "../../concepts/severityscores/sirs.sql", "db": "bigquery"}, + +} + +# BigQuery monkey patches +sqlglot.dialects.bigquery.BigQuery.Parser.FUNCTIONS["PARSE_DATETIME"] = lambda args: exp.StrToTime( + this=seq_get(args, 1), format=seq_get(args, 0) +) +sqlglot.dialects.bigquery.BigQuery.Parser.FUNCTIONS["FORMAT_DATE"] = lambda args: exp.TimeToStr( + this=seq_get(args, 1), format=seq_get(args, 0) +) +sqlglot.dialects.bigquery.BigQuery.Parser.STRICT_CAST = False + +# DuckDB monkey patches +macros = [ + #"CREATE MACRO PARSE_DATETIME(a, b) AS strptime(b, a);", + #"CREATE MACRO FORMAT_DATE(a, b) AS strftime(CAST(b AS DATE), CAST(a AS VARCHAR));", + #"CREATE OR REPLACE MACRO DATETIME_DIFF(a, b, u := 'DAY') AS date_diff(CAST(u AS VARCHAR), CAST(a AS TIME), CAST(b AS TIME));", + #"CREATE OR REPLACE MACRO DATETIME_DIFF_MACRO(u, a, b) AS date_diff(u, CAST(a AS TIME), CAST(b AS TIME));", +] +def duckdb_date_sub_sql(self, expression): + #print("CALLING duckdb._date_sub") + this = self.sql(expression, "this") + unit = self.sql(expression, "unit") or "DAY" # .strip("'") + return f"{this} - {self.sql(exp.Interval(this=expression.expression, unit=unit))}" +#sqlglot.dialects.duckdb._date_sub_sql = duckdb_date_sub_sql +#sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateSub] = duckdb_date_sub_sql +sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = duckdb_date_sub_sql +sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeAdd] = sqlglot.dialects.duckdb._date_add + +def duckdb_date_diff_sql(self, expression): + #print("CALLING duckdb._date_diff") + this = self.sql(expression, "this") + unit = self.sql(expression, "unit") or "DAY" + return f"DATE_DIFF('{unit}', {this}, {self.sql(expression.expression)})" +#sqlglot.dialects.duckdb._date_diff_sql = duckdb_date_diff_sql +sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_sql +#sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = sqlglot.dialects.duckdb._date_diff_sql +#sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] +sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_sql + +if False: + concept_name_map = { + 'icustay_times': {"path": "../../concepts_postgres/demographics/icustay_times.sql"}, + 'icustay_hours': 
{"path": "./demographics/icustay_hours.sql", "db": "duckdb"}, + 'echo_data': {"path": "../../concepts/echo_data.sql", "db": "bigquery"}, + 'code_status': {"path": "../../concepts_postgres/code_status.sql"}, + 'weight_durations': {"path": "../../concepts/durations/weight_durations.sql", "db": "bigquery"}, + 'rrt': {"path": "../../concepts_postgres/rrt.sql"}, + 'urine_output': {"path": "../../concepts/fluid_balance/urine_output.sql", "db": "bigquery"}, + 'kdigo_uo': {"path": "../../concepts/organfailure/kdigo_uo.sql", "db": "bigquery"} + } + + ##sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = sqlglot.dialects.dialect.rename_func("date_sub") + sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = sqlglot.dialects.duckdb._date_sub_sql + sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeAdd] = sqlglot.dialects.duckdb._date_add + sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateSub] = sqlglot.dialects.duckdb._date_sub_sql + ##sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = sqlglot.dialects.dialect.rename_func("date_diff") + ##sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = lambda self, e: self.func( + ## "DATE_DIFF", ("'"+(e.args.get("unit") or exp.Literal.string("day")).name+"'"), e.expression, e.this + ##) + ##sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.Cast] = exp.TryCast + + + # BigQuery monkey patches + + # Postgres monkey patches + sqlglot.dialects.postgres.Postgres.Parser.FUNCTIONS["PARSE_DATETIME"] = lambda args: exp.StrToTime( + this=seq_get(args, 1), format=seq_get(args, 0) + ) + sqlglot.dialects.postgres.Postgres.Parser.FUNCTIONS["FORMAT_DATE"] = lambda args: exp.TimeToStr( + this=seq_get(args, 1), format=seq_get(args, 0) + ) + sqlglot.dialects.postgres.Postgres.Parser.STRICT_CAST = False + +_time_unit_kwargs_map = { + "HOUR": "hour", + "DAY": "day", + "MINUTE": "minute", + "SECOND": "second", + "YEAR": "year" +} +def _bigquery_duckdb_transformer(node): + if isinstance(node, exp.Var) and node.name in _time_unit_kwargs_map: + #print(f"{node=}") + return sqlglot.parse_one(f"{node.name}") + return node + + if isinstance(node, exp.Anonymous): + #print(f"{node.name=}") + #return sqlglot.parse_one(node.name) #no-op + return node + if isinstance(node, exp.DatetimeSub): + #print(f"{node=}") + return node + if isinstance(node, exp.Column): + #print(f"COLUMN {node.name=}") + return node + return node + + +def _make_duckdb_query_bigquery(qname: str, conn): + _multischema_trunc_re = re.compile("\"physionet-data\.mimiciii_\w+\.") + + #TODO: better anwer here? should only hit ccs_dx.sql! + _too_many_backslashes_re = re.compile("\\\\([\[\.\]])", ) + + #_whole_line_comments_strip_re = re.compile("^\s*--.*$", flags=re.MULTILINE) + qfile = concept_name_map[qname]["path"] + with open(qfile, "r") as fp: + sql = fp.read() + ##strip comments manually... one weird thing happening in pivoted_sofa...?? 
+ #sql = re.sub(_whole_line_comments_strip_re, '', sql) + sql = re.sub(_too_many_backslashes_re, '\\$1', sql) + try: + #print(repr(sqlglot.parse_one(sql.replace('`','"')))) + sql_list = sqlglot.transpile(sql, read="bigquery", write="duckdb", pretty=True) + except Exception as e: + print(sql) + raise e + print() + for st in sql_list: + sql = re.sub(_multischema_trunc_re, "\"", st) + #ast = sqlglot.parse_one(sql) + #ast2 = ast.transform(_bigquery_duckdb_transformer) + #sql = ast2.sql(pretty=True) + #print(sql) + + if concept_name_map[qname].get("nocreate", False): + cursor = conn.cursor() + try: + cursor.execute(sql) + except Exception as e: + print(sql) + print(repr(sqlglot.parse_one(sql))) + raise e + result = cursor.fetchone() + print(result) + cursor.close() + return sql + + conn.execute(f"DROP VIEW IF EXISTS {qname}") + try: + conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) + except Exception as e: + print(sql) + #print(repr(sqlglot.parse_one(sql))) + raise e + print(f"CREATED VIEW {qname}") + + #print() + +def _make_duckdb_query_postgres(qname: str, conn): + _uncreate_re = re.compile("DROP.*?;\s+CREATE.*?AS") + _multischema_trunc_re = re.compile("\"physionet-data\.mimiciii_\w+\.") + qfile = concept_name_map[qname]["path"] + with open(qfile, "r") as fp: + sql = fp.read() + sql = re.sub(_uncreate_re, "", sql).strip() + sql_list = sqlglot.transpile(sql, read="postgres", write="duckdb", pretty=True) + for st in sql_list: + if st == '': + continue + sql = re.sub(_multischema_trunc_re, "\"", st) + #ast = sqlglot.parse_one(sql) + #ast2 = ast.transform(_bigquery_sqlite_transformer) + #sql = ast2.sql() + #print(sql) + + if concept_name_map[qname].get("nocreate", False): + cursor = conn.cursor() + try: + cursor.execute(sql) + except Exception as e: + print(sql) + print(repr(sqlglot.parse_one(sql))) + raise e + result = cursor.fetchone() + print(result) + cursor.close() + return sql + + conn.execute(f"DROP VIEW IF EXISTS {qname}") + try: + conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) + conn.execute(sql) + except Exception as e: + print(sql) + #print(repr(sqlglot.parse_one(sql))) + raise e + print(f"CREATED VIEW {qname}") + + #print() + +def _make_duckdb_query_duckdb(qname: str, conn): + qfile = concept_name_map[qname]["path"] + with open(qfile, "r") as fp: + sql = fp.read() + if concept_name_map[qname].get("nocreate", False): + cursor = conn.cursor() + try: + cursor.execute(sql) + except Exception as e: + print(sql) + raise e + result = cursor.fetchone() + print(result) + cursor.close() + return sql + try: + conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) + except Exception as e: + print(sql) + raise e + print(f"CREATED VIEW {qname}") + + +def main() -> int: + global concept_name_map + + parser = argparse.ArgumentParser( + prog='buildmimic_duckdb', + description='Creates the MIMIC-III database in DuckDB and optionally the concepts views.', + ) + parser.add_argument('output_db_file', help="The destination DuckDB file to be written", default="./mimiciii.db") + parser.add_argument('--data-path', required=True) + parser.add_argument('--make-concepts', action="store_true") + parser.add_argument('--mimic-code-root', default='../../../') + args = parser.parse_args() + output_db_file = args.output_db_file + data_path = args.data_path + make_concepts = args.make_concepts + mimic_code_root = args.mimic_code_root + + if make_concepts: + connection = duckdb.connect(output_db_file) + print("Connected to duckdb...") + + #print("Defining macros...") + #for macro in macros: + # 
connection.execute(macro) + + print("Creating tables...") + + # ccs_dx is an outlier...this is adapted from the BigQuery version... + ccs_multi_dx_create = """ + DROP TABLE IF EXISTS ccs_multi_dx; + CREATE TABLE ccs_multi_dx + ( + icd9_code CHAR(5) NOT NULL, + -- CCS levels and names based on position in hierarchy + ccs_level1 VARCHAR(10), + ccs_group1 VARCHAR(100), + ccs_level2 VARCHAR(10), + ccs_group2 VARCHAR(100), + ccs_level3 VARCHAR(10), + ccs_group3 VARCHAR(100), + ccs_level4 VARCHAR(10), + ccs_group4 VARCHAR(100) + ); + """ + + print("Loading data...") + try: + #FIXME: Turn this line back on! + #connection.execute(ccs_multi_dx_create) + #connection.execute(...) + data_path = os.path.join(mimic_code_root, 'mimic-iii','concepts_postgres','diagnosis','ccs_multi_dx.csv.gz') + #connection.from_csv_auto( + # name=data_path, + # header=True) + #FIXME: Turn this line back on! + #connection.execute(f"COPY ccs_multi_dx from '{data_path}' (FORMAT CSV, DELIMITER ',', HEADER);") + + print(connection.sql("SELECT * FROM ccs_multi_dx LIMIT 10;")) + except Exception as error: + print("Failed to setup ccs_multi_dx: ", error) + raise error + finally: + if connection: + connection.close() + print("duckdb connection is closed") + + connection = duckdb.connect(output_db_file) + + print("Creating views...") + try: + for key in concept_name_map: + #cProfile.run('...') + #print(f"Making view {key}...") + db = concept_name_map[key].get("db", "postgres") + if db == "duckdb": + _make_duckdb_query_duckdb(key, connection) + elif db == "bigquery": + _make_duckdb_query_bigquery(key, connection) + elif db == "postgres": + _make_duckdb_query_postgres(key, connection) + + except Exception as error: + print("Failed to execute translated SQL: ", error) + raise error + finally: + if connection: + connection.close() + print("duckdb connection is closed") + +if __name__ == '__main__': + sys.exit(main()) + + + + From 4da73b277a56d08f5e3eaa40c1d47080696a4527 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Wed, 26 Apr 2023 10:34:19 +0000 Subject: [PATCH 02/20] Stripped dead code --- .../buildmimic/duckdb/duckdb_concepts.py | 120 +----------------- 1 file changed, 1 insertion(+), 119 deletions(-) diff --git a/mimic-iii/buildmimic/duckdb/duckdb_concepts.py b/mimic-iii/buildmimic/duckdb/duckdb_concepts.py index b14c53365..a01391840 100644 --- a/mimic-iii/buildmimic/duckdb/duckdb_concepts.py +++ b/mimic-iii/buildmimic/duckdb/duckdb_concepts.py @@ -18,15 +18,11 @@ from pprint import pprint concept_name_map = { - #'icustay_times': {"path": "../../concepts_postgres/demographics/icustay_times.sql"}, 'icustay_times': {"path": "../../concepts/demographics/icustay_times.sql", "db": "bigquery"}, - #'icustay_hours': {"path": "../../concepts/demographics/icustay_hours.sql", "db": "bigquery"}, 'icustay_hours': {"path": "./concepts/icustay_hours.sql", "db": "duckdb"}, 'echo_data': {"path": "../../concepts/echo_data.sql", "db": "bigquery"}, - #'code_status': {"path": "../../concepts_postgres/code_status.sql"}, 'code_status': {"path": "../../concepts/code_status.sql", "db": "bigquery"}, 'weight_durations': {"path": "../../concepts/durations/weight_durations.sql", "db": "bigquery"}, - #'rrt': {"path": "../../concepts_postgres/rrt.sql"}, 'rrt': {"path": "../../concepts/rrt.sql", "db": "bigquery"}, 'heightweight': {"path": "../../concepts/demographics/heightweight.sql", "db": "bigquery"}, 'icustay_detail': {"path": "../../concepts/demographics/icustay_detail.sql", "db": "bigquery"}, @@ -44,7 +40,6 @@ 'phenylephrine_durations': {"path": 
"../../concepts/durations/phenylephrine_durations.sql", "db": "bigquery"}, 'vasopressin_durations': {"path": "../../concepts/durations/vasopressin_durations.sql", "db": "bigquery"}, 'vasopressor_durations': {"path": "../../concepts/durations/vasopressor_durations.sql", "db": "bigquery"}, - # move weight_durations here 'dobutamine_dose': {"path": "../../concepts/durations/dobutamine_dose.sql", "db": "bigquery"}, 'dopamine_dose': {"path": "../../concepts/durations/dopamine_dose.sql", "db": "bigquery"}, @@ -95,7 +90,6 @@ 'martin': {"path": "../../concepts/sepsis/martin.sql", "db": "bigquery"}, 'explicit': {"path": "../../concepts/sepsis/explicit.sql", "db": "bigquery"}, - #FIXME: Must load ccs_multi_dx lookup table first! 'ccs_dx': {"path": "../../concepts/diagnosis/ccs_dx.sql", "db": "bigquery"}, 'kdigo_creatinine': {"path": "../../concepts/organfailure/kdigo_creatinine.sql", "db": "bigquery"}, @@ -125,19 +119,11 @@ sqlglot.dialects.bigquery.BigQuery.Parser.STRICT_CAST = False # DuckDB monkey patches -macros = [ - #"CREATE MACRO PARSE_DATETIME(a, b) AS strptime(b, a);", - #"CREATE MACRO FORMAT_DATE(a, b) AS strftime(CAST(b AS DATE), CAST(a AS VARCHAR));", - #"CREATE OR REPLACE MACRO DATETIME_DIFF(a, b, u := 'DAY') AS date_diff(CAST(u AS VARCHAR), CAST(a AS TIME), CAST(b AS TIME));", - #"CREATE OR REPLACE MACRO DATETIME_DIFF_MACRO(u, a, b) AS date_diff(u, CAST(a AS TIME), CAST(b AS TIME));", -] def duckdb_date_sub_sql(self, expression): #print("CALLING duckdb._date_sub") this = self.sql(expression, "this") unit = self.sql(expression, "unit") or "DAY" # .strip("'") return f"{this} - {self.sql(exp.Interval(this=expression.expression, unit=unit))}" -#sqlglot.dialects.duckdb._date_sub_sql = duckdb_date_sub_sql -#sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateSub] = duckdb_date_sub_sql sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = duckdb_date_sub_sql sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeAdd] = sqlglot.dialects.duckdb._date_add @@ -146,76 +132,14 @@ def duckdb_date_diff_sql(self, expression): this = self.sql(expression, "this") unit = self.sql(expression, "unit") or "DAY" return f"DATE_DIFF('{unit}', {this}, {self.sql(expression.expression)})" -#sqlglot.dialects.duckdb._date_diff_sql = duckdb_date_diff_sql sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_sql -#sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = sqlglot.dialects.duckdb._date_diff_sql -#sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_sql -if False: - concept_name_map = { - 'icustay_times': {"path": "../../concepts_postgres/demographics/icustay_times.sql"}, - 'icustay_hours': {"path": "./demographics/icustay_hours.sql", "db": "duckdb"}, - 'echo_data': {"path": "../../concepts/echo_data.sql", "db": "bigquery"}, - 'code_status': {"path": "../../concepts_postgres/code_status.sql"}, - 'weight_durations': {"path": "../../concepts/durations/weight_durations.sql", "db": "bigquery"}, - 'rrt': {"path": "../../concepts_postgres/rrt.sql"}, - 'urine_output': {"path": "../../concepts/fluid_balance/urine_output.sql", "db": "bigquery"}, - 'kdigo_uo': {"path": "../../concepts/organfailure/kdigo_uo.sql", "db": "bigquery"} - } - - ##sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = sqlglot.dialects.dialect.rename_func("date_sub") - 
sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = sqlglot.dialects.duckdb._date_sub_sql - sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeAdd] = sqlglot.dialects.duckdb._date_add - sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateSub] = sqlglot.dialects.duckdb._date_sub_sql - ##sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = sqlglot.dialects.dialect.rename_func("date_diff") - ##sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = lambda self, e: self.func( - ## "DATE_DIFF", ("'"+(e.args.get("unit") or exp.Literal.string("day")).name+"'"), e.expression, e.this - ##) - ##sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.Cast] = exp.TryCast - - - # BigQuery monkey patches - - # Postgres monkey patches - sqlglot.dialects.postgres.Postgres.Parser.FUNCTIONS["PARSE_DATETIME"] = lambda args: exp.StrToTime( - this=seq_get(args, 1), format=seq_get(args, 0) - ) - sqlglot.dialects.postgres.Postgres.Parser.FUNCTIONS["FORMAT_DATE"] = lambda args: exp.TimeToStr( - this=seq_get(args, 1), format=seq_get(args, 0) - ) - sqlglot.dialects.postgres.Postgres.Parser.STRICT_CAST = False - -_time_unit_kwargs_map = { - "HOUR": "hour", - "DAY": "day", - "MINUTE": "minute", - "SECOND": "second", - "YEAR": "year" -} -def _bigquery_duckdb_transformer(node): - if isinstance(node, exp.Var) and node.name in _time_unit_kwargs_map: - #print(f"{node=}") - return sqlglot.parse_one(f"{node.name}") - return node - - if isinstance(node, exp.Anonymous): - #print(f"{node.name=}") - #return sqlglot.parse_one(node.name) #no-op - return node - if isinstance(node, exp.DatetimeSub): - #print(f"{node=}") - return node - if isinstance(node, exp.Column): - #print(f"COLUMN {node.name=}") - return node - return node - def _make_duckdb_query_bigquery(qname: str, conn): _multischema_trunc_re = re.compile("\"physionet-data\.mimiciii_\w+\.") - #TODO: better anwer here? should only hit ccs_dx.sql! + #TODO: better answer here? should only hit ccs_dx.sql! 
_too_many_backslashes_re = re.compile("\\\\([\[\.\]])", ) #_whole_line_comments_strip_re = re.compile("^\s*--.*$", flags=re.MULTILINE) @@ -263,47 +187,6 @@ def _make_duckdb_query_bigquery(qname: str, conn): #print() -def _make_duckdb_query_postgres(qname: str, conn): - _uncreate_re = re.compile("DROP.*?;\s+CREATE.*?AS") - _multischema_trunc_re = re.compile("\"physionet-data\.mimiciii_\w+\.") - qfile = concept_name_map[qname]["path"] - with open(qfile, "r") as fp: - sql = fp.read() - sql = re.sub(_uncreate_re, "", sql).strip() - sql_list = sqlglot.transpile(sql, read="postgres", write="duckdb", pretty=True) - for st in sql_list: - if st == '': - continue - sql = re.sub(_multischema_trunc_re, "\"", st) - #ast = sqlglot.parse_one(sql) - #ast2 = ast.transform(_bigquery_sqlite_transformer) - #sql = ast2.sql() - #print(sql) - - if concept_name_map[qname].get("nocreate", False): - cursor = conn.cursor() - try: - cursor.execute(sql) - except Exception as e: - print(sql) - print(repr(sqlglot.parse_one(sql))) - raise e - result = cursor.fetchone() - print(result) - cursor.close() - return sql - - conn.execute(f"DROP VIEW IF EXISTS {qname}") - try: - conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) - conn.execute(sql) - except Exception as e: - print(sql) - #print(repr(sqlglot.parse_one(sql))) - raise e - print(f"CREATED VIEW {qname}") - - #print() def _make_duckdb_query_duckdb(qname: str, conn): qfile = concept_name_map[qname]["path"] @@ -329,7 +212,6 @@ def _make_duckdb_query_duckdb(qname: str, conn): def main() -> int: - global concept_name_map parser = argparse.ArgumentParser( prog='buildmimic_duckdb', From 4eb302fe1068b4603e77d4c1f3ac485f99abf823 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Wed, 26 Apr 2023 11:33:08 +0000 Subject: [PATCH 03/20] Simplify pathing, strip more dead code --- .../buildmimic/duckdb/duckdb_concepts.py | 180 +++++++++--------- 1 file changed, 85 insertions(+), 95 deletions(-) diff --git a/mimic-iii/buildmimic/duckdb/duckdb_concepts.py b/mimic-iii/buildmimic/duckdb/duckdb_concepts.py index a01391840..014c88ea7 100644 --- a/mimic-iii/buildmimic/duckdb/duckdb_concepts.py +++ b/mimic-iii/buildmimic/duckdb/duckdb_concepts.py @@ -18,94 +18,94 @@ from pprint import pprint concept_name_map = { - 'icustay_times': {"path": "../../concepts/demographics/icustay_times.sql", "db": "bigquery"}, - 'icustay_hours': {"path": "./concepts/icustay_hours.sql", "db": "duckdb"}, - 'echo_data': {"path": "../../concepts/echo_data.sql", "db": "bigquery"}, - 'code_status': {"path": "../../concepts/code_status.sql", "db": "bigquery"}, - 'weight_durations': {"path": "../../concepts/durations/weight_durations.sql", "db": "bigquery"}, - 'rrt': {"path": "../../concepts/rrt.sql", "db": "bigquery"}, - 'heightweight': {"path": "../../concepts/demographics/heightweight.sql", "db": "bigquery"}, - 'icustay_detail': {"path": "../../concepts/demographics/icustay_detail.sql", "db": "bigquery"}, - - 'ventilation_classification': {"path": "../../concepts/durations/ventilation_classification.sql", "db": "bigquery"}, - 'ventilation_durations': {"path": "../../concepts/durations/ventilation_durations.sql", "db": "bigquery"}, - 'crrt_durations': {"path": "../../concepts/durations/crrt_durations.sql", "db": "bigquery"}, - 'adenosine_durations': {"path": "../../concepts/durations/adenosine_durations.sql", "db": "bigquery"}, - 'dobutamine_durations': {"path": "../../concepts/durations/dobutamine_durations.sql", "db": "bigquery"}, - 'dopamine_durations': {"path": "../../concepts/durations/dopamine_durations.sql", "db": 
"bigquery"}, - 'epinephrine_durations': {"path": "../../concepts/durations/epinephrine_durations.sql", "db": "bigquery"}, - 'isuprel_durations': {"path": "../../concepts/durations/isuprel_durations.sql", "db": "bigquery"}, - 'milrinone_durations': {"path": "../../concepts/durations/milrinone_durations.sql", "db": "bigquery"}, - 'norepinephrine_durations': {"path": "../../concepts/durations/norepinephrine_durations.sql", "db": "bigquery"}, - 'phenylephrine_durations': {"path": "../../concepts/durations/phenylephrine_durations.sql", "db": "bigquery"}, - 'vasopressin_durations': {"path": "../../concepts/durations/vasopressin_durations.sql", "db": "bigquery"}, - 'vasopressor_durations': {"path": "../../concepts/durations/vasopressor_durations.sql", "db": "bigquery"}, - - 'dobutamine_dose': {"path": "../../concepts/durations/dobutamine_dose.sql", "db": "bigquery"}, - 'dopamine_dose': {"path": "../../concepts/durations/dopamine_dose.sql", "db": "bigquery"}, - 'epinephrine_dose': {"path": "../../concepts/durations/epinephrine_dose.sql", "db": "bigquery"}, - 'norepinephrine_dose': {"path": "../../concepts/durations/norepinephrine_dose.sql", "db": "bigquery"}, - 'phenylephrine_dose': {"path": "../../concepts/durations/phenylephrine_dose.sql", "db": "bigquery"}, - 'vasopressin_dose': {"path": "../../concepts/durations/vasopressin_dose.sql", "db": "bigquery"}, - - 'pivoted_vital': {"path": "../../concepts/pivot/pivoted_vital.sql", "db": "bigquery"}, - 'pivoted_uo': {"path": "../../concepts/pivot/pivoted_uo.sql", "db": "bigquery"}, - 'pivoted_rrt': {"path": "../../concepts/pivot/pivoted_rrt.sql", "db": "bigquery"}, - 'pivoted_lab': {"path": "../../concepts/pivot/pivoted_lab.sql", "db": "bigquery"}, - 'pivoted_invasive_lines': {"path": "../../concepts/pivot/pivoted_invasive_lines.sql", "db": "bigquery"}, - 'pivoted_icp': {"path": "../../concepts/pivot/pivoted_icp.sql", "db": "bigquery"}, - 'pivoted_height': {"path": "../../concepts/pivot/pivoted_height.sql", "db": "bigquery"}, - 'pivoted_gcs': {"path": "../../concepts/pivot/pivoted_gcs.sql", "db": "bigquery"}, - 'pivoted_fio2': {"path": "../../concepts/pivot/pivoted_fio2.sql", "db": "bigquery"}, - 'pivoted_bg': {"path": "../../concepts/pivot/pivoted_bg.sql", "db": "bigquery"}, + 'icustay_times': {"path": "demographics/icustay_times.sql"}, + 'icustay_hours': {"path": "icustay_hours.sql", "db": "duckdb"}, + 'echo_data': {"path": "echo_data.sql"}, + 'code_status': {"path": "code_status.sql"}, + 'weight_durations': {"path": "durations/weight_durations.sql"}, + 'rrt': {"path": "rrt.sql"}, + 'heightweight': {"path": "demographics/heightweight.sql"}, + 'icustay_detail': {"path": "demographics/icustay_detail.sql"}, + + 'ventilation_classification': {"path": "durations/ventilation_classification.sql"}, + 'ventilation_durations': {"path": "durations/ventilation_durations.sql"}, + 'crrt_durations': {"path": "durations/crrt_durations.sql"}, + 'adenosine_durations': {"path": "durations/adenosine_durations.sql"}, + 'dobutamine_durations': {"path": "durations/dobutamine_durations.sql"}, + 'dopamine_durations': {"path": "durations/dopamine_durations.sql"}, + 'epinephrine_durations': {"path": "durations/epinephrine_durations.sql"}, + 'isuprel_durations': {"path": "durations/isuprel_durations.sql"}, + 'milrinone_durations': {"path": "durations/milrinone_durations.sql"}, + 'norepinephrine_durations': {"path": "durations/norepinephrine_durations.sql"}, + 'phenylephrine_durations': {"path": "durations/phenylephrine_durations.sql"}, + 'vasopressin_durations': {"path": 
"durations/vasopressin_durations.sql"}, + 'vasopressor_durations': {"path": "durations/vasopressor_durations.sql"}, + + 'dobutamine_dose': {"path": "durations/dobutamine_dose.sql"}, + 'dopamine_dose': {"path": "durations/dopamine_dose.sql"}, + 'epinephrine_dose': {"path": "durations/epinephrine_dose.sql"}, + 'norepinephrine_dose': {"path": "durations/norepinephrine_dose.sql"}, + 'phenylephrine_dose': {"path": "durations/phenylephrine_dose.sql"}, + 'vasopressin_dose': {"path": "durations/vasopressin_dose.sql"}, + + 'pivoted_vital': {"path": "pivot/pivoted_vital.sql"}, + 'pivoted_uo': {"path": "pivot/pivoted_uo.sql"}, + 'pivoted_rrt': {"path": "pivot/pivoted_rrt.sql"}, + 'pivoted_lab': {"path": "pivot/pivoted_lab.sql"}, + 'pivoted_invasive_lines': {"path": "pivot/pivoted_invasive_lines.sql"}, + 'pivoted_icp': {"path": "pivot/pivoted_icp.sql"}, + 'pivoted_height': {"path": "pivot/pivoted_height.sql"}, + 'pivoted_gcs': {"path": "pivot/pivoted_gcs.sql"}, + 'pivoted_fio2': {"path": "pivot/pivoted_fio2.sql"}, + 'pivoted_bg': {"path": "pivot/pivoted_bg.sql"}, # pivoted_bg_art must be run after pivoted_bg - 'pivoted_bg_art': {"path": "../../concepts/pivot/pivoted_bg_art.sql", "db": "bigquery"}, + 'pivoted_bg_art': {"path": "pivot/pivoted_bg_art.sql"}, # Difficult error here, the original query seems to reference something non-existent... # the `pivot` queries are omitted from the Postgres version... we may have to do the same? # pivoted oasis depends on icustay_hours in demographics - #'pivoted_oasis': {"path": "../../concepts/pivot/pivoted_oasis.sql", "db": "bigquery"}, + #'pivoted_oasis': {"path": "pivot/pivoted_oasis.sql"}, # Another puzzling error here, duckdb doesn't like something on the `WITH` line! # pivoted sofa depends on many above pivoted views, ventilation_durations, and dose queries - #'pivoted_sofa': {"path": "../../concepts/pivot/pivoted_sofa.sql", "db": "bigquery"}, - - 'elixhauser_ahrq_v37': {"path": "../../concepts/comorbidity/elixhauser_ahrq_v37.sql", "db": "bigquery"}, - 'elixhauser_ahrq_v37_no_drg': {"path": "../../concepts/comorbidity/elixhauser_ahrq_v37_no_drg.sql", "db": "bigquery"}, - 'elixhauser_quan': {"path": "../../concepts/comorbidity/elixhauser_quan.sql", "db": "bigquery"}, - 'elixhauser_score_ahrq': {"path": "../../concepts/comorbidity/elixhauser_score_ahrq.sql", "db": "bigquery"}, - 'elixhauser_score_quan': {"path": "../../concepts/comorbidity/elixhauser_score_quan.sql", "db": "bigquery"}, - - 'blood_gas_first_day': {"path": "../../concepts/firstday/blood_gas_first_day.sql", "db": "bigquery"}, - 'blood_gas_first_day_arterial': {"path": "../../concepts/firstday/blood_gas_first_day_arterial.sql", "db": "bigquery"}, - 'gcs_first_day': {"path": "../../concepts/firstday/gcs_first_day.sql", "db": "bigquery"}, - 'labs_first_day': {"path": "../../concepts/firstday/labs_first_day.sql", "db": "bigquery"}, - 'rrt_first_day': {"path": "../../concepts/firstday/rrt_first_day.sql", "db": "bigquery"}, - 'urine_output_first_day': {"path": "../../concepts/firstday/urine_output_first_day.sql", "db": "bigquery"}, - 'ventilation_first_day': {"path": "../../concepts/firstday/ventilation_first_day.sql", "db": "bigquery"}, - 'vitals_first_day': {"path": "../../concepts/firstday/vitals_first_day.sql", "db": "bigquery"}, - 'weight_first_day': {"path": "../../concepts/firstday/weight_first_day.sql", "db": "bigquery"}, + #'pivoted_sofa': {"path": "pivot/pivoted_sofa.sql"}, + + 'elixhauser_ahrq_v37': {"path": "comorbidity/elixhauser_ahrq_v37.sql"}, + 'elixhauser_ahrq_v37_no_drg': {"path": 
"comorbidity/elixhauser_ahrq_v37_no_drg.sql"}, + 'elixhauser_quan': {"path": "comorbidity/elixhauser_quan.sql"}, + 'elixhauser_score_ahrq': {"path": "comorbidity/elixhauser_score_ahrq.sql"}, + 'elixhauser_score_quan': {"path": "comorbidity/elixhauser_score_quan.sql"}, + + 'blood_gas_first_day': {"path": "firstday/blood_gas_first_day.sql"}, + 'blood_gas_first_day_arterial': {"path": "firstday/blood_gas_first_day_arterial.sql"}, + 'gcs_first_day': {"path": "firstday/gcs_first_day.sql"}, + 'labs_first_day': {"path": "firstday/labs_first_day.sql"}, + 'rrt_first_day': {"path": "firstday/rrt_first_day.sql"}, + 'urine_output_first_day': {"path": "firstday/urine_output_first_day.sql"}, + 'ventilation_first_day': {"path": "firstday/ventilation_first_day.sql"}, + 'vitals_first_day': {"path": "firstday/vitals_first_day.sql"}, + 'weight_first_day': {"path": "firstday/weight_first_day.sql"}, - 'urine_output': {"path": "../../concepts/fluid_balance/urine_output.sql", "db": "bigquery"}, + 'urine_output': {"path": "fluid_balance/urine_output.sql"}, - 'angus': {"path": "../../concepts/sepsis/angus.sql", "db": "bigquery"}, - 'martin': {"path": "../../concepts/sepsis/martin.sql", "db": "bigquery"}, - 'explicit': {"path": "../../concepts/sepsis/explicit.sql", "db": "bigquery"}, + 'angus': {"path": "sepsis/angus.sql"}, + 'martin': {"path": "sepsis/martin.sql"}, + 'explicit': {"path": "sepsis/explicit.sql"}, - 'ccs_dx': {"path": "../../concepts/diagnosis/ccs_dx.sql", "db": "bigquery"}, + 'ccs_dx': {"path": "diagnosis/ccs_dx.sql"}, - 'kdigo_creatinine': {"path": "../../concepts/organfailure/kdigo_creatinine.sql", "db": "bigquery"}, - 'kdigo_uo': {"path": "../../concepts/organfailure/kdigo_uo.sql", "db": "bigquery"}, - 'kdigo_stages': {"path": "../../concepts/organfailure/kdigo_stages.sql", "db": "bigquery"}, - 'kdigo_stages_7day': {"path": "../../concepts/organfailure/kdigo_stages_7day.sql", "db": "bigquery"}, - 'kdigo_stages_48hr': {"path": "../../concepts/organfailure/kdigo_stages_48hr.sql", "db": "bigquery"}, - 'meld': {"path": "../../concepts/organfailure/meld.sql", "db": "bigquery"}, + 'kdigo_creatinine': {"path": "organfailure/kdigo_creatinine.sql"}, + 'kdigo_uo': {"path": "organfailure/kdigo_uo.sql"}, + 'kdigo_stages': {"path": "organfailure/kdigo_stages.sql"}, + 'kdigo_stages_7day': {"path": "organfailure/kdigo_stages_7day.sql"}, + 'kdigo_stages_48hr': {"path": "organfailure/kdigo_stages_48hr.sql"}, + 'meld': {"path": "organfailure/meld.sql"}, - 'oasis': {"path": "../../concepts/severityscores/oasis.sql", "db": "bigquery"}, - 'sofa': {"path": "../../concepts/severityscores/sofa.sql", "db": "bigquery"}, - 'saps': {"path": "../../concepts/severityscores/saps.sql", "db": "bigquery"}, - 'sapsii': {"path": "../../concepts/severityscores/sapsii.sql", "db": "bigquery"}, - 'apsiii': {"path": "../../concepts/severityscores/apsiii.sql", "db": "bigquery"}, - 'lods': {"path": "../../concepts/severityscores/lods.sql", "db": "bigquery"}, - 'sirs': {"path": "../../concepts/severityscores/sirs.sql", "db": "bigquery"}, + 'oasis': {"path": "severityscores/oasis.sql"}, + 'sofa': {"path": "severityscores/sofa.sql"}, + 'saps': {"path": "severityscores/saps.sql"}, + 'sapsii': {"path": "severityscores/sapsii.sql"}, + 'apsiii': {"path": "severityscores/apsiii.sql"}, + 'lods': {"path": "severityscores/lods.sql"}, + 'sirs': {"path": "severityscores/sirs.sql"}, } @@ -136,21 +136,16 @@ def duckdb_date_diff_sql(self, expression): sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_sql -def 
_make_duckdb_query_bigquery(qname: str, conn): +def _make_duckdb_query_bigquery(qname: str, qfile: str, conn): _multischema_trunc_re = re.compile("\"physionet-data\.mimiciii_\w+\.") #TODO: better answer here? should only hit ccs_dx.sql! - _too_many_backslashes_re = re.compile("\\\\([\[\.\]])", ) + _too_many_backslashes_re = re.compile("\\\\([\[\.\]])") - #_whole_line_comments_strip_re = re.compile("^\s*--.*$", flags=re.MULTILINE) - qfile = concept_name_map[qname]["path"] with open(qfile, "r") as fp: sql = fp.read() - ##strip comments manually... one weird thing happening in pivoted_sofa...?? - #sql = re.sub(_whole_line_comments_strip_re, '', sql) sql = re.sub(_too_many_backslashes_re, '\\$1', sql) try: - #print(repr(sqlglot.parse_one(sql.replace('`','"')))) sql_list = sqlglot.transpile(sql, read="bigquery", write="duckdb", pretty=True) except Exception as e: print(sql) @@ -158,10 +153,6 @@ def _make_duckdb_query_bigquery(qname: str, conn): print() for st in sql_list: sql = re.sub(_multischema_trunc_re, "\"", st) - #ast = sqlglot.parse_one(sql) - #ast2 = ast.transform(_bigquery_duckdb_transformer) - #sql = ast2.sql(pretty=True) - #print(sql) if concept_name_map[qname].get("nocreate", False): cursor = conn.cursor() @@ -188,8 +179,7 @@ def _make_duckdb_query_bigquery(qname: str, conn): #print() -def _make_duckdb_query_duckdb(qname: str, conn): - qfile = concept_name_map[qname]["path"] +def _make_duckdb_query_duckdb(qname: str, qfile: str, conn): with open(qfile, "r") as fp: sql = fp.read() if concept_name_map[qname].get("nocreate", False): @@ -283,13 +273,13 @@ def main() -> int: for key in concept_name_map: #cProfile.run('...') #print(f"Making view {key}...") - db = concept_name_map[key].get("db", "postgres") + db = concept_name_map[key].get("db", "bigquery") if db == "duckdb": - _make_duckdb_query_duckdb(key, connection) + qpath = os.path.join(mimic_code_root, 'mimic-iii', 'buildmimic', 'duckdb', 'concepts', concept_name_map[key]['path']) + _make_duckdb_query_duckdb(key, qpath, connection) elif db == "bigquery": - _make_duckdb_query_bigquery(key, connection) - elif db == "postgres": - _make_duckdb_query_postgres(key, connection) + qpath = os.path.join(mimic_code_root, 'mimic-iii', 'concepts', concept_name_map[key]['path']) + _make_duckdb_query_bigquery(key, qpath, connection) except Exception as error: print("Failed to execute translated SQL: ", error) From d1fb76b0119af23eab0633e9992ff672ebb1c374 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Wed, 26 Apr 2023 11:44:37 +0000 Subject: [PATCH 04/20] Align with shell script version --- mimic-iii/buildmimic/duckdb/import_duckdb.py | 297 +++++++++++++++++++ 1 file changed, 297 insertions(+) create mode 100644 mimic-iii/buildmimic/duckdb/import_duckdb.py diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py new file mode 100644 index 000000000..6c0635066 --- /dev/null +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py @@ -0,0 +1,297 @@ +import cProfile + +import sys +import os +import re +import argparse + +import duckdb +import datetime + +#import sqlparse +import sqlglot +import sqlglot.dialects.bigquery +import sqlglot.dialects.duckdb +from sqlglot import exp, generator, parser, tokens, transforms +from sqlglot.helper import seq_get + +from pprint import pprint + +concept_name_map = { + 'icustay_times': {"path": "demographics/icustay_times.sql"}, + 'icustay_hours': {"path": "icustay_hours.sql", "db": "duckdb"}, + 'echo_data': {"path": "echo_data.sql"}, + 'code_status': {"path": 
"code_status.sql"}, + 'weight_durations': {"path": "durations/weight_durations.sql"}, + 'rrt': {"path": "rrt.sql"}, + 'heightweight': {"path": "demographics/heightweight.sql"}, + 'icustay_detail': {"path": "demographics/icustay_detail.sql"}, + + 'ventilation_classification': {"path": "durations/ventilation_classification.sql"}, + 'ventilation_durations': {"path": "durations/ventilation_durations.sql"}, + 'crrt_durations': {"path": "durations/crrt_durations.sql"}, + 'adenosine_durations': {"path": "durations/adenosine_durations.sql"}, + 'dobutamine_durations': {"path": "durations/dobutamine_durations.sql"}, + 'dopamine_durations': {"path": "durations/dopamine_durations.sql"}, + 'epinephrine_durations': {"path": "durations/epinephrine_durations.sql"}, + 'isuprel_durations': {"path": "durations/isuprel_durations.sql"}, + 'milrinone_durations': {"path": "durations/milrinone_durations.sql"}, + 'norepinephrine_durations': {"path": "durations/norepinephrine_durations.sql"}, + 'phenylephrine_durations': {"path": "durations/phenylephrine_durations.sql"}, + 'vasopressin_durations': {"path": "durations/vasopressin_durations.sql"}, + 'vasopressor_durations': {"path": "durations/vasopressor_durations.sql"}, + + 'dobutamine_dose': {"path": "durations/dobutamine_dose.sql"}, + 'dopamine_dose': {"path": "durations/dopamine_dose.sql"}, + 'epinephrine_dose': {"path": "durations/epinephrine_dose.sql"}, + 'norepinephrine_dose': {"path": "durations/norepinephrine_dose.sql"}, + 'phenylephrine_dose': {"path": "durations/phenylephrine_dose.sql"}, + 'vasopressin_dose': {"path": "durations/vasopressin_dose.sql"}, + + 'pivoted_vital': {"path": "pivot/pivoted_vital.sql"}, + 'pivoted_uo': {"path": "pivot/pivoted_uo.sql"}, + 'pivoted_rrt': {"path": "pivot/pivoted_rrt.sql"}, + 'pivoted_lab': {"path": "pivot/pivoted_lab.sql"}, + 'pivoted_invasive_lines': {"path": "pivot/pivoted_invasive_lines.sql"}, + 'pivoted_icp': {"path": "pivot/pivoted_icp.sql"}, + 'pivoted_height': {"path": "pivot/pivoted_height.sql"}, + 'pivoted_gcs': {"path": "pivot/pivoted_gcs.sql"}, + 'pivoted_fio2': {"path": "pivot/pivoted_fio2.sql"}, + 'pivoted_bg': {"path": "pivot/pivoted_bg.sql"}, + # pivoted_bg_art must be run after pivoted_bg + 'pivoted_bg_art': {"path": "pivot/pivoted_bg_art.sql"}, + # Difficult error here, the original query seems to reference something non-existent... + # the `pivot` queries are omitted from the Postgres version... we may have to do the same? + # pivoted oasis depends on icustay_hours in demographics + #'pivoted_oasis': {"path": "pivot/pivoted_oasis.sql"}, + # Another puzzling error here, duckdb doesn't like something on the `WITH` line! 
+ # pivoted sofa depends on many above pivoted views, ventilation_durations, and dose queries + #'pivoted_sofa': {"path": "pivot/pivoted_sofa.sql"}, + + 'elixhauser_ahrq_v37': {"path": "comorbidity/elixhauser_ahrq_v37.sql"}, + 'elixhauser_ahrq_v37_no_drg': {"path": "comorbidity/elixhauser_ahrq_v37_no_drg.sql"}, + 'elixhauser_quan': {"path": "comorbidity/elixhauser_quan.sql"}, + 'elixhauser_score_ahrq': {"path": "comorbidity/elixhauser_score_ahrq.sql"}, + 'elixhauser_score_quan': {"path": "comorbidity/elixhauser_score_quan.sql"}, + + 'blood_gas_first_day': {"path": "firstday/blood_gas_first_day.sql"}, + 'blood_gas_first_day_arterial': {"path": "firstday/blood_gas_first_day_arterial.sql"}, + 'gcs_first_day': {"path": "firstday/gcs_first_day.sql"}, + 'labs_first_day': {"path": "firstday/labs_first_day.sql"}, + 'rrt_first_day': {"path": "firstday/rrt_first_day.sql"}, + 'urine_output_first_day': {"path": "firstday/urine_output_first_day.sql"}, + 'ventilation_first_day': {"path": "firstday/ventilation_first_day.sql"}, + 'vitals_first_day': {"path": "firstday/vitals_first_day.sql"}, + 'weight_first_day': {"path": "firstday/weight_first_day.sql"}, + + 'urine_output': {"path": "fluid_balance/urine_output.sql"}, + + 'angus': {"path": "sepsis/angus.sql"}, + 'martin': {"path": "sepsis/martin.sql"}, + 'explicit': {"path": "sepsis/explicit.sql"}, + + 'ccs_dx': {"path": "diagnosis/ccs_dx.sql"}, + + 'kdigo_creatinine': {"path": "organfailure/kdigo_creatinine.sql"}, + 'kdigo_uo': {"path": "organfailure/kdigo_uo.sql"}, + 'kdigo_stages': {"path": "organfailure/kdigo_stages.sql"}, + 'kdigo_stages_7day': {"path": "organfailure/kdigo_stages_7day.sql"}, + 'kdigo_stages_48hr': {"path": "organfailure/kdigo_stages_48hr.sql"}, + 'meld': {"path": "organfailure/meld.sql"}, + + 'oasis': {"path": "severityscores/oasis.sql"}, + 'sofa': {"path": "severityscores/sofa.sql"}, + 'saps': {"path": "severityscores/saps.sql"}, + 'sapsii': {"path": "severityscores/sapsii.sql"}, + 'apsiii': {"path": "severityscores/apsiii.sql"}, + 'lods': {"path": "severityscores/lods.sql"}, + 'sirs': {"path": "severityscores/sirs.sql"}, + +} + +# BigQuery monkey patches +sqlglot.dialects.bigquery.BigQuery.Parser.FUNCTIONS["PARSE_DATETIME"] = lambda args: exp.StrToTime( + this=seq_get(args, 1), format=seq_get(args, 0) +) +sqlglot.dialects.bigquery.BigQuery.Parser.FUNCTIONS["FORMAT_DATE"] = lambda args: exp.TimeToStr( + this=seq_get(args, 1), format=seq_get(args, 0) +) +sqlglot.dialects.bigquery.BigQuery.Parser.STRICT_CAST = False + +# DuckDB monkey patches +def duckdb_date_sub_sql(self, expression): + #print("CALLING duckdb._date_sub") + this = self.sql(expression, "this") + unit = self.sql(expression, "unit") or "DAY" # .strip("'") + return f"{this} - {self.sql(exp.Interval(this=expression.expression, unit=unit))}" +sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = duckdb_date_sub_sql +sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeAdd] = sqlglot.dialects.duckdb._date_add + +def duckdb_date_diff_sql(self, expression): + #print("CALLING duckdb._date_diff") + this = self.sql(expression, "this") + unit = self.sql(expression, "unit") or "DAY" + return f"DATE_DIFF('{unit}', {this}, {self.sql(expression.expression)})" +sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_sql +sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_sql + + +def _make_duckdb_query_bigquery(qname: str, qfile: str, conn): + _multischema_trunc_re = 
re.compile("\"physionet-data\.mimiciii_\w+\.") + + #TODO: better answer here? should only hit ccs_dx.sql! + _too_many_backslashes_re = re.compile("\\\\([\[\.\]])") + + with open(qfile, "r") as fp: + sql = fp.read() + sql = re.sub(_too_many_backslashes_re, '\\$1', sql) + try: + sql_list = sqlglot.transpile(sql, read="bigquery", write="duckdb", pretty=True) + except Exception as e: + print(sql) + raise e + print() + for st in sql_list: + sql = re.sub(_multischema_trunc_re, "\"", st) + + if concept_name_map[qname].get("nocreate", False): + cursor = conn.cursor() + try: + cursor.execute(sql) + except Exception as e: + print(sql) + print(repr(sqlglot.parse_one(sql))) + raise e + result = cursor.fetchone() + print(result) + cursor.close() + return sql + + conn.execute(f"DROP VIEW IF EXISTS {qname}") + try: + conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) + except Exception as e: + print(sql) + #print(repr(sqlglot.parse_one(sql))) + raise e + print(f"CREATED VIEW {qname}") + + #print() + + +def _make_duckdb_query_duckdb(qname: str, qfile: str, conn): + with open(qfile, "r") as fp: + sql = fp.read() + if concept_name_map[qname].get("nocreate", False): + cursor = conn.cursor() + try: + cursor.execute(sql) + except Exception as e: + print(sql) + raise e + result = cursor.fetchone() + print(result) + cursor.close() + return sql + try: + conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) + except Exception as e: + print(sql) + raise e + print(f"CREATED VIEW {qname}") + + +def main() -> int: + + parser = argparse.ArgumentParser( + prog='buildmimic_duckdb', + description='Creates the MIMIC-III database in DuckDB and optionally the concepts views.', + ) + parser.add_argument('mimic_data_dir', help="directory that contains csv.tar.gz or csv files") + parser.add_argument('output_db', help="filename for duckdb file (default: mimic3.db)", default="./mimic3.db") + parser.add_argument('--make-concepts', help="generate the concepts views", action="store_true") + parser.add_argument('--mimic-code-root', help="location of the mimic-code repo (used to find concepts SQL)", default='../../../') + args = parser.parse_args() + output_db = args.output_db + mimic_data_dir = args.mimic_data_dir + make_concepts = args.make_concepts + mimic_code_root = args.mimic_code_root + + if make_concepts: + connection = duckdb.connect(output_db) + print("Connected to duckdb...") + + #print("Defining macros...") + #for macro in macros: + # connection.execute(macro) + + print("Creating tables...") + + # ccs_dx is an outlier...this is adapted from the BigQuery version... + ccs_multi_dx_create = """ + DROP TABLE IF EXISTS ccs_multi_dx; + CREATE TABLE ccs_multi_dx + ( + icd9_code CHAR(5) NOT NULL, + -- CCS levels and names based on position in hierarchy + ccs_level1 VARCHAR(10), + ccs_group1 VARCHAR(100), + ccs_level2 VARCHAR(10), + ccs_group2 VARCHAR(100), + ccs_level3 VARCHAR(10), + ccs_group3 VARCHAR(100), + ccs_level4 VARCHAR(10), + ccs_group4 VARCHAR(100) + ); + """ + + print("Loading data...") + try: + #FIXME: Turn this line back on! + #connection.execute(ccs_multi_dx_create) + #connection.execute(...) + csvgz_path = os.path.join(mimic_code_root, 'mimic-iii','concepts_postgres','diagnosis','ccs_multi_dx.csv.gz') + #connection.from_csv_auto( + # name=data_path, + # header=True) + #FIXME: Turn this line back on! 
+ #connection.execute(f"COPY ccs_multi_dx from '{csvgz_path}' (FORMAT CSV, DELIMITER ',', HEADER);") + + print(connection.sql("SELECT * FROM ccs_multi_dx LIMIT 10;")) + except Exception as error: + print("Failed to setup ccs_multi_dx: ", error) + raise error + finally: + if connection: + connection.close() + print("duckdb connection is closed") + + connection = duckdb.connect(output_db) + + print("Creating views...") + try: + for key in concept_name_map: + #cProfile.run('...') + #print(f"Making view {key}...") + db = concept_name_map[key].get("db", "bigquery") + if db == "duckdb": + qpath = os.path.join(mimic_code_root, 'mimic-iii', 'buildmimic', 'duckdb', 'concepts', concept_name_map[key]['path']) + _make_duckdb_query_duckdb(key, qpath, connection) + elif db == "bigquery": + qpath = os.path.join(mimic_code_root, 'mimic-iii', 'concepts', concept_name_map[key]['path']) + _make_duckdb_query_bigquery(key, qpath, connection) + + except Exception as error: + print("Failed to execute translated SQL: ", error) + raise error + finally: + if connection: + connection.close() + print("duckdb connection is closed") + +if __name__ == '__main__': + sys.exit(main()) + + + + From 8274982d6d646d56e10dc1752bae83856e3a868e Mon Sep 17 00:00:00 2001 From: SphtKr Date: Wed, 26 Apr 2023 11:52:11 +0000 Subject: [PATCH 05/20] Add requirements.txt --- mimic-iii/buildmimic/duckdb/requirements.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 mimic-iii/buildmimic/duckdb/requirements.txt diff --git a/mimic-iii/buildmimic/duckdb/requirements.txt b/mimic-iii/buildmimic/duckdb/requirements.txt new file mode 100644 index 000000000..1a0bffa7e --- /dev/null +++ b/mimic-iii/buildmimic/duckdb/requirements.txt @@ -0,0 +1,2 @@ +duckdb>=0.7.1 +sqlglot>=11.5.7 \ No newline at end of file From 4b0ac76858e6808863dfd2716264e7540002f934 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Thu, 27 Apr 2023 00:26:49 +0000 Subject: [PATCH 06/20] Pulled sql out of .sh plus minor related mods --- mimic-iii/buildmimic/duckdb/import_duckdb.sh | 494 +----------------- .../duckdb/import_duckdb_tables.sql | 492 +++++++++++++++++ 2 files changed, 494 insertions(+), 492 deletions(-) create mode 100644 mimic-iii/buildmimic/duckdb/import_duckdb_tables.sql diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.sh b/mimic-iii/buildmimic/duckdb/import_duckdb.sh index 0a70e1e42..e7f6187bd 100755 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.sh +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.sh @@ -70,497 +70,7 @@ fi # create tables using DDL from postgres # minor changes: TIMESTAMP(nn) -> TIMESTAMP -try duckdb "$OUTFILE" < 0.7.1 is released? + -- See https://github.com/duckdb/duckdb/issues/6668#issuecomment-1474880266 + --,CONSTRAINT chartevents_rowid_pk PRIMARY KEY (ROW_ID) +); +-- Remove this index when the PK can be re-added... 
+CREATE UNIQUE INDEX chartevents_rowid_pk ON CHARTEVENTS (ROW_ID); + +DROP TABLE IF EXISTS CPTEVENTS CASCADE; +CREATE TABLE CPTEVENTS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + COSTCENTER VARCHAR(10) NOT NULL, + CHARTDATE TIMESTAMP, + CPT_CD VARCHAR(10) NOT NULL, + CPT_NUMBER INT, + CPT_SUFFIX VARCHAR(5), + TICKET_ID_SEQ INT, + SECTIONHEADER VARCHAR(50), + SUBSECTIONHEADER VARCHAR(255), + DESCRIPTION VARCHAR(200), + CONSTRAINT cpt_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS DATETIMEEVENTS CASCADE; +CREATE TABLE DATETIMEEVENTS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT, + ICUSTAY_ID INT, + ITEMID INT NOT NULL, + CHARTTIME TIMESTAMP NOT NULL, + STORETIME TIMESTAMP NOT NULL, + CGID INT NOT NULL, + VALUE TIMESTAMP, + VALUEUOM VARCHAR(50) NOT NULL, + WARNING SMALLINT, + ERROR SMALLINT, + RESULTSTATUS VARCHAR(50), + STOPPED VARCHAR(50), + CONSTRAINT datetime_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS DIAGNOSES_ICD CASCADE; +CREATE TABLE DIAGNOSES_ICD +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + SEQ_NUM INT, + ICD9_CODE VARCHAR(10), + CONSTRAINT diagnosesicd_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS DRGCODES CASCADE; +CREATE TABLE DRGCODES +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + DRG_TYPE VARCHAR(20) NOT NULL, + DRG_CODE VARCHAR(20) NOT NULL, + DESCRIPTION VARCHAR(255), + DRG_SEVERITY SMALLINT, + DRG_MORTALITY SMALLINT, + CONSTRAINT drg_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS D_CPT CASCADE; +CREATE TABLE D_CPT +( + ROW_ID INT NOT NULL, + CATEGORY SMALLINT NOT NULL, + SECTIONRANGE VARCHAR(100) NOT NULL, + SECTIONHEADER VARCHAR(50) NOT NULL, + SUBSECTIONRANGE VARCHAR(100) NOT NULL, + SUBSECTIONHEADER VARCHAR(255) NOT NULL, + CODESUFFIX VARCHAR(5), + MINCODEINSUBSECTION INT NOT NULL, + MAXCODEINSUBSECTION INT NOT NULL, + CONSTRAINT dcpt_ssrange_unique UNIQUE (SUBSECTIONRANGE), + CONSTRAINT dcpt_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS D_ICD_DIAGNOSES CASCADE; +CREATE TABLE D_ICD_DIAGNOSES +( + ROW_ID INT NOT NULL, + ICD9_CODE VARCHAR(10) NOT NULL, + SHORT_TITLE VARCHAR(50) NOT NULL, + LONG_TITLE VARCHAR(255) NOT NULL, + CONSTRAINT d_icd_diag_code_unique UNIQUE (ICD9_CODE), + CONSTRAINT d_icd_diag_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS D_ICD_PROCEDURES CASCADE; +CREATE TABLE D_ICD_PROCEDURES +( + ROW_ID INT NOT NULL, + ICD9_CODE VARCHAR(10) NOT NULL, + SHORT_TITLE VARCHAR(50) NOT NULL, + LONG_TITLE VARCHAR(255) NOT NULL, + CONSTRAINT d_icd_proc_code_unique UNIQUE (ICD9_CODE), + CONSTRAINT d_icd_proc_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS D_ITEMS CASCADE; +CREATE TABLE D_ITEMS +( + ROW_ID INT NOT NULL, + ITEMID INT NOT NULL, + LABEL VARCHAR(200), + ABBREVIATION VARCHAR(100), + DBSOURCE VARCHAR(20), + LINKSTO VARCHAR(50), + CATEGORY VARCHAR(100), + UNITNAME VARCHAR(100), + PARAM_TYPE VARCHAR(30), + CONCEPTID INT, + CONSTRAINT ditems_itemid_unique UNIQUE (ITEMID), + CONSTRAINT ditems_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS D_LABITEMS CASCADE; +CREATE TABLE D_LABITEMS +( + ROW_ID INT NOT NULL, + ITEMID INT NOT NULL, + LABEL VARCHAR(100) NOT NULL, + FLUID VARCHAR(100) NOT NULL, + CATEGORY VARCHAR(100) NOT NULL, + LOINC_CODE VARCHAR(100), + CONSTRAINT dlabitems_itemid_unique UNIQUE (ITEMID), + CONSTRAINT dlabitems_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS ICUSTAYS CASCADE; +CREATE TABLE ICUSTAYS +( + ROW_ID INT NOT 
NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + ICUSTAY_ID INT NOT NULL, + DBSOURCE VARCHAR(20) NOT NULL, + FIRST_CAREUNIT VARCHAR(20) NOT NULL, + LAST_CAREUNIT VARCHAR(20) NOT NULL, + FIRST_WARDID SMALLINT NOT NULL, + LAST_WARDID SMALLINT NOT NULL, + INTIME TIMESTAMP NOT NULL, + OUTTIME TIMESTAMP, + LOS DOUBLE PRECISION, + CONSTRAINT icustay_icustayid_unique UNIQUE (ICUSTAY_ID), + CONSTRAINT icustay_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS INPUTEVENTS_CV CASCADE; +CREATE TABLE INPUTEVENTS_CV +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT, + ICUSTAY_ID INT, + CHARTTIME TIMESTAMP, + ITEMID INT, + AMOUNT DOUBLE PRECISION, + AMOUNTUOM VARCHAR(30), + RATE DOUBLE PRECISION, + RATEUOM VARCHAR(30), + STORETIME TIMESTAMP, + CGID INT, + ORDERID INT, + LINKORDERID INT, + STOPPED VARCHAR(30), + NEWBOTTLE INT, + ORIGINALAMOUNT DOUBLE PRECISION, + ORIGINALAMOUNTUOM VARCHAR(30), + ORIGINALROUTE VARCHAR(30), + ORIGINALRATE DOUBLE PRECISION, + ORIGINALRATEUOM VARCHAR(30), + ORIGINALSITE VARCHAR(30), + CONSTRAINT inputevents_cv_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS INPUTEVENTS_MV CASCADE; +CREATE TABLE INPUTEVENTS_MV +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT, + ICUSTAY_ID INT, + STARTTIME TIMESTAMP, + ENDTIME TIMESTAMP, + ITEMID INT, + AMOUNT DOUBLE PRECISION, + AMOUNTUOM VARCHAR(30), + RATE DOUBLE PRECISION, + RATEUOM VARCHAR(30), + STORETIME TIMESTAMP, + CGID INT, + ORDERID INT, + LINKORDERID INT, + ORDERCATEGORYNAME VARCHAR(100), + SECONDARYORDERCATEGORYNAME VARCHAR(100), + ORDERCOMPONENTTYPEDESCRIPTION VARCHAR(200), + ORDERCATEGORYDESCRIPTION VARCHAR(50), + PATIENTWEIGHT DOUBLE PRECISION, + TOTALAMOUNT DOUBLE PRECISION, + TOTALAMOUNTUOM VARCHAR(50), + ISOPENBAG SMALLINT, + CONTINUEINNEXTDEPT SMALLINT, + CANCELREASON SMALLINT, + STATUSDESCRIPTION VARCHAR(30), + COMMENTS_EDITEDBY VARCHAR(30), + COMMENTS_CANCELEDBY VARCHAR(40), + COMMENTS_DATE TIMESTAMP, + ORIGINALAMOUNT DOUBLE PRECISION, + ORIGINALRATE DOUBLE PRECISION, + CONSTRAINT inputevents_mv_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS LABEVENTS CASCADE; +CREATE TABLE LABEVENTS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT, + ITEMID INT NOT NULL, + CHARTTIME TIMESTAMP, + VALUE VARCHAR(200), + VALUENUM DOUBLE PRECISION, + VALUEUOM VARCHAR(20), + FLAG VARCHAR(20), + CONSTRAINT labevents_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS MICROBIOLOGYEVENTS CASCADE; +CREATE TABLE MICROBIOLOGYEVENTS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT, + CHARTDATE TIMESTAMP, + CHARTTIME TIMESTAMP, + SPEC_ITEMID INT, + SPEC_TYPE_DESC VARCHAR(100), + ORG_ITEMID INT, + ORG_NAME VARCHAR(100), + ISOLATE_NUM SMALLINT, + AB_ITEMID INT, + AB_NAME VARCHAR(30), + DILUTION_TEXT VARCHAR(10), + DILUTION_COMPARISON VARCHAR(20), + DILUTION_VALUE DOUBLE PRECISION, + INTERPRETATION VARCHAR(5), + CONSTRAINT micro_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS NOTEEVENTS CASCADE; +CREATE TABLE NOTEEVENTS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT, + CHARTDATE TIMESTAMP, + CHARTTIME TIMESTAMP, + STORETIME TIMESTAMP, + CATEGORY VARCHAR(50), + DESCRIPTION VARCHAR(255), + CGID INT, + ISERROR CHAR(1), + TEXT TEXT, + CONSTRAINT noteevents_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS OUTPUTEVENTS CASCADE; +CREATE TABLE OUTPUTEVENTS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT, + ICUSTAY_ID INT, + CHARTTIME TIMESTAMP, + ITEMID INT, + VALUE DOUBLE PRECISION, + 
VALUEUOM VARCHAR(30), + STORETIME TIMESTAMP, + CGID INT, + STOPPED VARCHAR(30), + NEWBOTTLE CHAR(1), + ISERROR INT, + CONSTRAINT outputevents_cv_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS PATIENTS CASCADE; +CREATE TABLE PATIENTS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + GENDER VARCHAR(5) NOT NULL, + DOB TIMESTAMP NOT NULL, + DOD TIMESTAMP, + DOD_HOSP TIMESTAMP, + DOD_SSN TIMESTAMP, + EXPIRE_FLAG INT NOT NULL, + CONSTRAINT pat_subid_unique UNIQUE (SUBJECT_ID), + CONSTRAINT pat_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS PRESCRIPTIONS CASCADE; +CREATE TABLE PRESCRIPTIONS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + ICUSTAY_ID INT, + STARTDATE TIMESTAMP, + ENDDATE TIMESTAMP, + DRUG_TYPE VARCHAR(100) NOT NULL, + DRUG VARCHAR(100) NOT NULL, + DRUG_NAME_POE VARCHAR(100), + DRUG_NAME_GENERIC VARCHAR(100), + FORMULARY_DRUG_CD VARCHAR(120), + GSN VARCHAR(200), + NDC VARCHAR(120), + PROD_STRENGTH VARCHAR(120), + DOSE_VAL_RX VARCHAR(120), + DOSE_UNIT_RX VARCHAR(120), + FORM_VAL_DISP VARCHAR(120), + FORM_UNIT_DISP VARCHAR(120), + ROUTE VARCHAR(120), + CONSTRAINT prescription_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS PROCEDUREEVENTS_MV CASCADE; +CREATE TABLE PROCEDUREEVENTS_MV +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + ICUSTAY_ID INT, + STARTTIME TIMESTAMP, + ENDTIME TIMESTAMP, + ITEMID INT, + VALUE DOUBLE PRECISION, + VALUEUOM VARCHAR(30), + LOCATION VARCHAR(30), + LOCATIONCATEGORY VARCHAR(30), + STORETIME TIMESTAMP, + CGID INT, + ORDERID INT, + LINKORDERID INT, + ORDERCATEGORYNAME VARCHAR(100), + SECONDARYORDERCATEGORYNAME VARCHAR(100), + ORDERCATEGORYDESCRIPTION VARCHAR(50), + ISOPENBAG SMALLINT, + CONTINUEINNEXTDEPT SMALLINT, + CANCELREASON SMALLINT, + STATUSDESCRIPTION VARCHAR(30), + COMMENTS_EDITEDBY VARCHAR(30), + COMMENTS_CANCELEDBY VARCHAR(30), + COMMENTS_DATE TIMESTAMP, + CONSTRAINT procedureevents_mv_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS PROCEDURES_ICD CASCADE; +CREATE TABLE PROCEDURES_ICD +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + SEQ_NUM INT NOT NULL, + ICD9_CODE VARCHAR(10) NOT NULL, + CONSTRAINT proceduresicd_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS SERVICES CASCADE; +CREATE TABLE SERVICES +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + TRANSFERTIME TIMESTAMP NOT NULL, + PREV_SERVICE VARCHAR(20), + CURR_SERVICE VARCHAR(20), + CONSTRAINT services_rowid_pk PRIMARY KEY (ROW_ID) +) ; + +DROP TABLE IF EXISTS TRANSFERS CASCADE; +CREATE TABLE TRANSFERS +( + ROW_ID INT NOT NULL, + SUBJECT_ID INT NOT NULL, + HADM_ID INT NOT NULL, + ICUSTAY_ID INT, + DBSOURCE VARCHAR(20), + EVENTTYPE VARCHAR(20), + PREV_CAREUNIT VARCHAR(20), + CURR_CAREUNIT VARCHAR(20), + PREV_WARDID SMALLINT, + CURR_WARDID SMALLINT, + INTIME TIMESTAMP, + OUTTIME TIMESTAMP, + LOS DOUBLE PRECISION, + CONSTRAINT transfers_rowid_pk PRIMARY KEY (ROW_ID) +) ; \ No newline at end of file From a7d0597e928b8b25686030d03ef9b1d1dd2a01e9 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Thu, 27 Apr 2023 01:48:50 +0000 Subject: [PATCH 07/20] Add table creation/loading --- mimic-iii/buildmimic/duckdb/import_duckdb.py | 61 +++++++++++++++----- 1 file changed, 46 insertions(+), 15 deletions(-) diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py index 6c0635066..8aaf51bcc 100644 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.py +++ 
b/mimic-iii/buildmimic/duckdb/import_duckdb.py @@ -208,25 +208,62 @@ def main() -> int: description='Creates the MIMIC-III database in DuckDB and optionally the concepts views.', ) parser.add_argument('mimic_data_dir', help="directory that contains csv.tar.gz or csv files") - parser.add_argument('output_db', help="filename for duckdb file (default: mimic3.db)", default="./mimic3.db") - parser.add_argument('--make-concepts', help="generate the concepts views", action="store_true") + parser.add_argument('output_db', help="filename for duckdb file (default: mimic3.db)", nargs='?', default="./mimic3.db") parser.add_argument('--mimic-code-root', help="location of the mimic-code repo (used to find concepts SQL)", default='../../../') + parser.add_argument('--make-concepts', help="generate the concepts views", action="store_true") + parser.add_argument('--skip-tables', help="don't create schema or load data (they must already exist)", action="store_true") args = parser.parse_args() output_db = args.output_db mimic_data_dir = args.mimic_data_dir make_concepts = args.make_concepts mimic_code_root = args.mimic_code_root + skip_tables = args.skip_tables + + if not skip_tables: - if make_concepts: connection = duckdb.connect(output_db) print("Connected to duckdb...") - #print("Defining macros...") - #for macro in macros: - # connection.execute(macro) + try: + print("Creating tables...") + + with open(os.path.join(mimic_code_root, 'mimic-iii','buildmimic','duckdb','import_duckdb_tables.sql'), 'r') as fp: + sql = fp.read() + connection.execute(sql) + + print("Loading data...") + + for f in os.listdir(mimic_data_dir): + m = re.match(r'^(.*)\.csv(\.gz)*', f) + if m is not None: + print(f" {m.group(1)}") + connection.execute(f"COPY {m.group(1)} from '{os.path.join(mimic_data_dir,m.group(0))}' (FORMAT CSV, DELIMITER ',', HEADER);") + + connection.execute(ccs_multi_dx_create) + #connection.execute(...) + csvgz_path = os.path.join(mimic_code_root, 'mimic-iii','concepts_postgres','diagnosis','ccs_multi_dx.csv.gz') + #connection.from_csv_auto( + # name=data_path, + # header=True) + connection.execute(f"COPY ccs_multi_dx from '{csvgz_path}' (FORMAT CSV, DELIMITER ',', HEADER);") + + print(connection.sql("SELECT * FROM ccs_multi_dx LIMIT 10;")) + except Exception as error: + print("Failed to setup ccs_multi_dx: ", error) + raise error + finally: + if connection: + connection.close() + print("duckdb connection is closed") + + + + if make_concepts: + connection = duckdb.connect(output_db) + print("Connected to duckdb...") print("Creating tables...") - + # ccs_dx is an outlier...this is adapted from the BigQuery version... ccs_multi_dx_create = """ DROP TABLE IF EXISTS ccs_multi_dx; @@ -247,15 +284,9 @@ def main() -> int: print("Loading data...") try: - #FIXME: Turn this line back on! - #connection.execute(ccs_multi_dx_create) - #connection.execute(...) + connection.execute(ccs_multi_dx_create) csvgz_path = os.path.join(mimic_code_root, 'mimic-iii','concepts_postgres','diagnosis','ccs_multi_dx.csv.gz') - #connection.from_csv_auto( - # name=data_path, - # header=True) - #FIXME: Turn this line back on! 
- #connection.execute(f"COPY ccs_multi_dx from '{csvgz_path}' (FORMAT CSV, DELIMITER ',', HEADER);") + connection.execute(f"COPY ccs_multi_dx from '{csvgz_path}' (FORMAT CSV, DELIMITER ',', HEADER);") print(connection.sql("SELECT * FROM ccs_multi_dx LIMIT 10;")) except Exception as error: From 92c738823c4be9d09151564b5c0a390387836395 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Thu, 27 Apr 2023 22:57:12 +0000 Subject: [PATCH 08/20] Schema support (option) to mirror psql version --- mimic-iii/buildmimic/duckdb/import_duckdb.py | 99 +++++++++++++++----- 1 file changed, 77 insertions(+), 22 deletions(-) diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py index 8aaf51bcc..34a16c281 100644 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.py +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py @@ -90,7 +90,7 @@ 'martin': {"path": "sepsis/martin.sql"}, 'explicit': {"path": "sepsis/explicit.sql"}, - 'ccs_dx': {"path": "diagnosis/ccs_dx.sql"}, + 'ccs_dx': {"path": "diagnosis/ccs_dx.sql", "schema": None}, # explicit None means default schema not schema_name 'kdigo_creatinine': {"path": "organfailure/kdigo_creatinine.sql"}, 'kdigo_uo': {"path": "organfailure/kdigo_uo.sql"}, @@ -109,6 +109,9 @@ } +# This will contain all the table/view names to put in a namespace... +tables_in_schema = set() + # BigQuery monkey patches sqlglot.dialects.bigquery.BigQuery.Parser.FUNCTIONS["PARSE_DATETIME"] = lambda args: exp.StrToTime( this=seq_get(args, 1), format=seq_get(args, 0) @@ -135,8 +138,36 @@ def duckdb_date_diff_sql(self, expression): sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_sql sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_sql - -def _make_duckdb_query_bigquery(qname: str, qfile: str, conn): +# This may not be strictly necessary because the views work without +# it IF you `use` the schema first... but making them fully qualified +# makes them work regardless of the current schema. +def _duckdb_rewrite_schema(sql: str, schema: str): + parsed = sqlglot.parse_one(sql, read=sqlglot.dialects.DuckDB) + for table in parsed.find_all(exp.Table): + for identifier in table.find_all(exp.Identifier): + if identifier.this.lower() in tables_in_schema: + sql = sql.replace('"'+identifier.this+'"', schema+'.'+identifier.this.lower()) + print(sql) + print(identifier) + # The below (unfinished) causes problems because some munging of functions + # occurs in the output. The above approach is kludgy, but works and limits + # the blast radius of potential problems regexping SQL. + """ + def transformer(node): + if isinstance(node, exp.Table): #and node.name == "a": + for id in node.find_all(exp.Identifier): + if id.this.lower() in tables_in_schema: + id.this = schema + '.' + id.this.lower() + #print(id) + return node + return node + transformed_tree = parsed.transform(transformer) + sql = transformed_tree.sql(dialect=sqlglot.dialects.DuckDB) + """ + return sql + + +def _make_duckdb_query_bigquery(qname: str, qfile: str, conn, schema: str = None): _multischema_trunc_re = re.compile("\"physionet-data\.mimiciii_\w+\.") #TODO: better answer here? should only hit ccs_dx.sql! 
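A note on the backslash cleanup guarded by the TODO above: Python's re.sub uses \1 or \g<1> for group references, so the replacement string '\\$1' used in this function emits a literal \$1 rather than the captured character. The pattern is also compiled from a non-raw string, so "\\\\" matches a single literal backslash. Assuming the intent is to collapse the doubled backslashes in ccs_dx.sql's regex literals down to single ones, a corrected sketch (helper name hypothetical) would be:

    import re

    # two literal backslashes before '[', '.' or ']' -> keep one backslash
    _too_many_backslashes_re = re.compile(r"\\\\([\[.\]])")

    def fix_backslashes(sql: str) -> str:
        # \g<1> is Python's group reference; '$1' would be copied literally
        return _too_many_backslashes_re.sub(r"\\\g<1>", sql)

    assert fix_backslashes(r"REGEXP_CONTAINS(s, '\\.')") == r"REGEXP_CONTAINS(s, '\.')"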
@@ -154,6 +185,9 @@ def _make_duckdb_query_bigquery(qname: str, qfile: str, conn): for st in sql_list: sql = re.sub(_multischema_trunc_re, "\"", st) + if schema is not None: + sql = _duckdb_rewrite_schema(sql, schema) + if concept_name_map[qname].get("nocreate", False): cursor = conn.cursor() try: @@ -169,7 +203,7 @@ def _make_duckdb_query_bigquery(qname: str, qfile: str, conn): conn.execute(f"DROP VIEW IF EXISTS {qname}") try: - conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) + conn.execute(f"CREATE VIEW {qname} AS " + sql) except Exception as e: print(sql) #print(repr(sqlglot.parse_one(sql))) @@ -179,9 +213,13 @@ def _make_duckdb_query_bigquery(qname: str, qfile: str, conn): #print() -def _make_duckdb_query_duckdb(qname: str, qfile: str, conn): +def _make_duckdb_query_duckdb(qname: str, qfile: str, conn, schema: str = None): with open(qfile, "r") as fp: sql = fp.read() + + if schema is not None: + sql = _duckdb_rewrite_schema(sql, schema) + if concept_name_map[qname].get("nocreate", False): cursor = conn.cursor() try: @@ -194,7 +232,7 @@ def _make_duckdb_query_duckdb(qname: str, qfile: str, conn): cursor.close() return sql try: - conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) + conn.execute(f"CREATE VIEW {qname} AS " + sql) except Exception as e: print(sql) raise e @@ -212,12 +250,15 @@ def main() -> int: parser.add_argument('--mimic-code-root', help="location of the mimic-code repo (used to find concepts SQL)", default='../../../') parser.add_argument('--make-concepts', help="generate the concepts views", action="store_true") parser.add_argument('--skip-tables', help="don't create schema or load data (they must already exist)", action="store_true") + parser.add_argument('--schema-name', help="put all object (except ccs_dx) into a schema (like the PostgreSQL version)", default=None) args = parser.parse_args() output_db = args.output_db mimic_data_dir = args.mimic_data_dir make_concepts = args.make_concepts mimic_code_root = args.mimic_code_root skip_tables = args.skip_tables + #TODO: validate schema_name is valid identifier + schema_name = args.schema_name if not skip_tables: @@ -225,6 +266,12 @@ def main() -> int: print("Connected to duckdb...") try: + schema_prequel = "" + if schema_name is not None: + connection.execute(f"CREATE SCHEMA IF NOT EXISTS {schema_name};") + connection.execute(f"USE {schema_name};") + schema_prequel = f"USE {schema_name};" + print("Creating tables...") with open(os.path.join(mimic_code_root, 'mimic-iii','buildmimic','duckdb','import_duckdb_tables.sql'), 'r') as fp: @@ -236,27 +283,24 @@ def main() -> int: for f in os.listdir(mimic_data_dir): m = re.match(r'^(.*)\.csv(\.gz)*', f) if m is not None: - print(f" {m.group(1)}") - connection.execute(f"COPY {m.group(1)} from '{os.path.join(mimic_data_dir,m.group(0))}' (FORMAT CSV, DELIMITER ',', HEADER);") - - connection.execute(ccs_multi_dx_create) - #connection.execute(...) 
- csvgz_path = os.path.join(mimic_code_root, 'mimic-iii','concepts_postgres','diagnosis','ccs_multi_dx.csv.gz') - #connection.from_csv_auto( - # name=data_path, - # header=True) - connection.execute(f"COPY ccs_multi_dx from '{csvgz_path}' (FORMAT CSV, DELIMITER ',', HEADER);") + tablename = m.group(1).lower() + tables_in_schema.add(tablename) + tablename = tablename if schema_name is None else schema_name+'.'+tablename + print(f" {tablename}") + connection.execute(f"COPY {tablename} from '{os.path.join(mimic_data_dir,m.group(0))}' (FORMAT CSV, DELIMITER ',', HEADER);") - print(connection.sql("SELECT * FROM ccs_multi_dx LIMIT 10;")) except Exception as error: - print("Failed to setup ccs_multi_dx: ", error) + print("Failed setting up database: ", error) raise error finally: if connection: connection.close() print("duckdb connection is closed") - + #TODO: If both --schema-name and --skip-tables are specified, we won't have + # populated tables_in_schema with the data table names... so the views won't + # work... So, here, read the tables already in the destination schema from + # the DB and add those tablenames to tables_in_schema? if make_concepts: connection = duckdb.connect(output_db) @@ -284,11 +328,13 @@ def main() -> int: print("Loading data...") try: + + connection.execute(f"USE main;") + connection.execute(ccs_multi_dx_create) csvgz_path = os.path.join(mimic_code_root, 'mimic-iii','concepts_postgres','diagnosis','ccs_multi_dx.csv.gz') connection.execute(f"COPY ccs_multi_dx from '{csvgz_path}' (FORMAT CSV, DELIMITER ',', HEADER);") - print(connection.sql("SELECT * FROM ccs_multi_dx LIMIT 10;")) except Exception as error: print("Failed to setup ccs_multi_dx: ", error) raise error @@ -301,16 +347,25 @@ def main() -> int: print("Creating views...") try: + + if schema_name is not None: + connection.execute(f"CREATE SCHEMA IF NOT EXISTS {schema_name};") + connection.execute(f"USE {schema_name}") + for key in concept_name_map: + if schema_name is not None: + if "schema" not in concept_name_map[key]: + tables_in_schema.add(key.lower()) + #cProfile.run('...') #print(f"Making view {key}...") db = concept_name_map[key].get("db", "bigquery") if db == "duckdb": qpath = os.path.join(mimic_code_root, 'mimic-iii', 'buildmimic', 'duckdb', 'concepts', concept_name_map[key]['path']) - _make_duckdb_query_duckdb(key, qpath, connection) + _make_duckdb_query_duckdb(key, qpath, connection, schema=concept_name_map[key].get('schema', schema_name)) elif db == "bigquery": qpath = os.path.join(mimic_code_root, 'mimic-iii', 'concepts', concept_name_map[key]['path']) - _make_duckdb_query_bigquery(key, qpath, connection) + _make_duckdb_query_bigquery(key, qpath, connection, schema=schema_name) except Exception as error: print("Failed to execute translated SQL: ", error) From be785890016826c95e5d18a67c00988e2f6a138d Mon Sep 17 00:00:00 2001 From: SphtKr Date: Thu, 27 Apr 2023 22:58:54 +0000 Subject: [PATCH 09/20] Another chunk of dead code --- mimic-iii/buildmimic/duckdb/import_duckdb.py | 24 -------------------- 1 file changed, 24 deletions(-) diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py index 34a16c281..c6b9b53e3 100644 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.py +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py @@ -188,19 +188,6 @@ def _make_duckdb_query_bigquery(qname: str, qfile: str, conn, schema: str = None if schema is not None: sql = _duckdb_rewrite_schema(sql, schema) - if concept_name_map[qname].get("nocreate", False): - cursor = 
conn.cursor() - try: - cursor.execute(sql) - except Exception as e: - print(sql) - print(repr(sqlglot.parse_one(sql))) - raise e - result = cursor.fetchone() - print(result) - cursor.close() - return sql - conn.execute(f"DROP VIEW IF EXISTS {qname}") try: conn.execute(f"CREATE VIEW {qname} AS " + sql) @@ -220,17 +207,6 @@ def _make_duckdb_query_duckdb(qname: str, qfile: str, conn, schema: str = None): if schema is not None: sql = _duckdb_rewrite_schema(sql, schema) - if concept_name_map[qname].get("nocreate", False): - cursor = conn.cursor() - try: - cursor.execute(sql) - except Exception as e: - print(sql) - raise e - result = cursor.fetchone() - print(result) - cursor.close() - return sql try: conn.execute(f"CREATE VIEW {qname} AS " + sql) except Exception as e: From 43a67ad1a772ff3d7798002cff8df1efae09afda Mon Sep 17 00:00:00 2001 From: SphtKr Date: Fri, 28 Apr 2023 16:19:50 +0000 Subject: [PATCH 10/20] Big bug in DatetimeDiff implementation! --- mimic-iii/buildmimic/duckdb/import_duckdb.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py index c6b9b53e3..4b250938d 100644 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.py +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py @@ -134,7 +134,8 @@ def duckdb_date_diff_sql(self, expression): #print("CALLING duckdb._date_diff") this = self.sql(expression, "this") unit = self.sql(expression, "unit") or "DAY" - return f"DATE_DIFF('{unit}', {this}, {self.sql(expression.expression)})" + # DuckDB DATE_DIFF operand order is start_time, end_time--not like end_time - start_time! + return f"DATE_DIFF('{unit}', {self.sql(expression.expression)}, {this})" sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_sql sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_sql @@ -181,7 +182,6 @@ def _make_duckdb_query_bigquery(qname: str, qfile: str, conn, schema: str = None except Exception as e: print(sql) raise e - print() for st in sql_list: sql = re.sub(_multischema_trunc_re, "\"", st) From 382c7ff686bd14984483bdee7fe91dd29ee5ff74 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Fri, 28 Apr 2023 17:45:32 +0000 Subject: [PATCH 11/20] Adding missing fluid_balance views --- mimic-iii/buildmimic/duckdb/import_duckdb.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py index 4b250938d..4172032af 100644 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.py +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py @@ -85,6 +85,10 @@ 'weight_first_day': {"path": "firstday/weight_first_day.sql"}, 'urine_output': {"path": "fluid_balance/urine_output.sql"}, + 'colloid_bolus': {"path": "fluid_balance/colloid_bolus.sql"}, + 'crystalloid_bolus': {"path": "fluid_balance/crystalloid_bolus.sql"}, + 'ffp_transfusion': {"path": "fluid_balance/ffp_transfusion.sql"}, + 'rbc_transfusion': {"path": "fluid_balance/rbc_transfusion.sql"}, 'angus': {"path": "sepsis/angus.sql"}, 'martin': {"path": "sepsis/martin.sql"}, From 99ef1f33ee21c5337108a5ca82bf1926d76463cb Mon Sep 17 00:00:00 2001 From: SphtKr Date: Fri, 28 Apr 2023 17:46:00 +0000 Subject: [PATCH 12/20] missed deletion --- .../buildmimic/duckdb/duckdb_concepts.py | 297 ------------------ 1 file changed, 297 deletions(-) delete mode 100644 mimic-iii/buildmimic/duckdb/duckdb_concepts.py diff --git a/mimic-iii/buildmimic/duckdb/duckdb_concepts.py 
b/mimic-iii/buildmimic/duckdb/duckdb_concepts.py deleted file mode 100644 index 014c88ea7..000000000 --- a/mimic-iii/buildmimic/duckdb/duckdb_concepts.py +++ /dev/null @@ -1,297 +0,0 @@ -import cProfile - -import sys -import os -import re -import argparse - -import duckdb -import datetime - -#import sqlparse -import sqlglot -import sqlglot.dialects.bigquery -import sqlglot.dialects.duckdb -from sqlglot import exp, generator, parser, tokens, transforms -from sqlglot.helper import seq_get - -from pprint import pprint - -concept_name_map = { - 'icustay_times': {"path": "demographics/icustay_times.sql"}, - 'icustay_hours': {"path": "icustay_hours.sql", "db": "duckdb"}, - 'echo_data': {"path": "echo_data.sql"}, - 'code_status': {"path": "code_status.sql"}, - 'weight_durations': {"path": "durations/weight_durations.sql"}, - 'rrt': {"path": "rrt.sql"}, - 'heightweight': {"path": "demographics/heightweight.sql"}, - 'icustay_detail': {"path": "demographics/icustay_detail.sql"}, - - 'ventilation_classification': {"path": "durations/ventilation_classification.sql"}, - 'ventilation_durations': {"path": "durations/ventilation_durations.sql"}, - 'crrt_durations': {"path": "durations/crrt_durations.sql"}, - 'adenosine_durations': {"path": "durations/adenosine_durations.sql"}, - 'dobutamine_durations': {"path": "durations/dobutamine_durations.sql"}, - 'dopamine_durations': {"path": "durations/dopamine_durations.sql"}, - 'epinephrine_durations': {"path": "durations/epinephrine_durations.sql"}, - 'isuprel_durations': {"path": "durations/isuprel_durations.sql"}, - 'milrinone_durations': {"path": "durations/milrinone_durations.sql"}, - 'norepinephrine_durations': {"path": "durations/norepinephrine_durations.sql"}, - 'phenylephrine_durations': {"path": "durations/phenylephrine_durations.sql"}, - 'vasopressin_durations': {"path": "durations/vasopressin_durations.sql"}, - 'vasopressor_durations': {"path": "durations/vasopressor_durations.sql"}, - - 'dobutamine_dose': {"path": "durations/dobutamine_dose.sql"}, - 'dopamine_dose': {"path": "durations/dopamine_dose.sql"}, - 'epinephrine_dose': {"path": "durations/epinephrine_dose.sql"}, - 'norepinephrine_dose': {"path": "durations/norepinephrine_dose.sql"}, - 'phenylephrine_dose': {"path": "durations/phenylephrine_dose.sql"}, - 'vasopressin_dose': {"path": "durations/vasopressin_dose.sql"}, - - 'pivoted_vital': {"path": "pivot/pivoted_vital.sql"}, - 'pivoted_uo': {"path": "pivot/pivoted_uo.sql"}, - 'pivoted_rrt': {"path": "pivot/pivoted_rrt.sql"}, - 'pivoted_lab': {"path": "pivot/pivoted_lab.sql"}, - 'pivoted_invasive_lines': {"path": "pivot/pivoted_invasive_lines.sql"}, - 'pivoted_icp': {"path": "pivot/pivoted_icp.sql"}, - 'pivoted_height': {"path": "pivot/pivoted_height.sql"}, - 'pivoted_gcs': {"path": "pivot/pivoted_gcs.sql"}, - 'pivoted_fio2': {"path": "pivot/pivoted_fio2.sql"}, - 'pivoted_bg': {"path": "pivot/pivoted_bg.sql"}, - # pivoted_bg_art must be run after pivoted_bg - 'pivoted_bg_art': {"path": "pivot/pivoted_bg_art.sql"}, - # Difficult error here, the original query seems to reference something non-existent... - # the `pivot` queries are omitted from the Postgres version... we may have to do the same? - # pivoted oasis depends on icustay_hours in demographics - #'pivoted_oasis': {"path": "pivot/pivoted_oasis.sql"}, - # Another puzzling error here, duckdb doesn't like something on the `WITH` line! 
- # pivoted sofa depends on many above pivoted views, ventilation_durations, and dose queries - #'pivoted_sofa': {"path": "pivot/pivoted_sofa.sql"}, - - 'elixhauser_ahrq_v37': {"path": "comorbidity/elixhauser_ahrq_v37.sql"}, - 'elixhauser_ahrq_v37_no_drg': {"path": "comorbidity/elixhauser_ahrq_v37_no_drg.sql"}, - 'elixhauser_quan': {"path": "comorbidity/elixhauser_quan.sql"}, - 'elixhauser_score_ahrq': {"path": "comorbidity/elixhauser_score_ahrq.sql"}, - 'elixhauser_score_quan': {"path": "comorbidity/elixhauser_score_quan.sql"}, - - 'blood_gas_first_day': {"path": "firstday/blood_gas_first_day.sql"}, - 'blood_gas_first_day_arterial': {"path": "firstday/blood_gas_first_day_arterial.sql"}, - 'gcs_first_day': {"path": "firstday/gcs_first_day.sql"}, - 'labs_first_day': {"path": "firstday/labs_first_day.sql"}, - 'rrt_first_day': {"path": "firstday/rrt_first_day.sql"}, - 'urine_output_first_day': {"path": "firstday/urine_output_first_day.sql"}, - 'ventilation_first_day': {"path": "firstday/ventilation_first_day.sql"}, - 'vitals_first_day': {"path": "firstday/vitals_first_day.sql"}, - 'weight_first_day': {"path": "firstday/weight_first_day.sql"}, - - 'urine_output': {"path": "fluid_balance/urine_output.sql"}, - - 'angus': {"path": "sepsis/angus.sql"}, - 'martin': {"path": "sepsis/martin.sql"}, - 'explicit': {"path": "sepsis/explicit.sql"}, - - 'ccs_dx': {"path": "diagnosis/ccs_dx.sql"}, - - 'kdigo_creatinine': {"path": "organfailure/kdigo_creatinine.sql"}, - 'kdigo_uo': {"path": "organfailure/kdigo_uo.sql"}, - 'kdigo_stages': {"path": "organfailure/kdigo_stages.sql"}, - 'kdigo_stages_7day': {"path": "organfailure/kdigo_stages_7day.sql"}, - 'kdigo_stages_48hr': {"path": "organfailure/kdigo_stages_48hr.sql"}, - 'meld': {"path": "organfailure/meld.sql"}, - - 'oasis': {"path": "severityscores/oasis.sql"}, - 'sofa': {"path": "severityscores/sofa.sql"}, - 'saps': {"path": "severityscores/saps.sql"}, - 'sapsii': {"path": "severityscores/sapsii.sql"}, - 'apsiii': {"path": "severityscores/apsiii.sql"}, - 'lods': {"path": "severityscores/lods.sql"}, - 'sirs': {"path": "severityscores/sirs.sql"}, - -} - -# BigQuery monkey patches -sqlglot.dialects.bigquery.BigQuery.Parser.FUNCTIONS["PARSE_DATETIME"] = lambda args: exp.StrToTime( - this=seq_get(args, 1), format=seq_get(args, 0) -) -sqlglot.dialects.bigquery.BigQuery.Parser.FUNCTIONS["FORMAT_DATE"] = lambda args: exp.TimeToStr( - this=seq_get(args, 1), format=seq_get(args, 0) -) -sqlglot.dialects.bigquery.BigQuery.Parser.STRICT_CAST = False - -# DuckDB monkey patches -def duckdb_date_sub_sql(self, expression): - #print("CALLING duckdb._date_sub") - this = self.sql(expression, "this") - unit = self.sql(expression, "unit") or "DAY" # .strip("'") - return f"{this} - {self.sql(exp.Interval(this=expression.expression, unit=unit))}" -sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = duckdb_date_sub_sql -sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeAdd] = sqlglot.dialects.duckdb._date_add - -def duckdb_date_diff_sql(self, expression): - #print("CALLING duckdb._date_diff") - this = self.sql(expression, "this") - unit = self.sql(expression, "unit") or "DAY" - return f"DATE_DIFF('{unit}', {this}, {self.sql(expression.expression)})" -sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_sql -sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_sql - - -def _make_duckdb_query_bigquery(qname: str, qfile: str, conn): - _multischema_trunc_re = 
re.compile("\"physionet-data\.mimiciii_\w+\.") - - #TODO: better answer here? should only hit ccs_dx.sql! - _too_many_backslashes_re = re.compile("\\\\([\[\.\]])") - - with open(qfile, "r") as fp: - sql = fp.read() - sql = re.sub(_too_many_backslashes_re, '\\$1', sql) - try: - sql_list = sqlglot.transpile(sql, read="bigquery", write="duckdb", pretty=True) - except Exception as e: - print(sql) - raise e - print() - for st in sql_list: - sql = re.sub(_multischema_trunc_re, "\"", st) - - if concept_name_map[qname].get("nocreate", False): - cursor = conn.cursor() - try: - cursor.execute(sql) - except Exception as e: - print(sql) - print(repr(sqlglot.parse_one(sql))) - raise e - result = cursor.fetchone() - print(result) - cursor.close() - return sql - - conn.execute(f"DROP VIEW IF EXISTS {qname}") - try: - conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) - except Exception as e: - print(sql) - #print(repr(sqlglot.parse_one(sql))) - raise e - print(f"CREATED VIEW {qname}") - - #print() - - -def _make_duckdb_query_duckdb(qname: str, qfile: str, conn): - with open(qfile, "r") as fp: - sql = fp.read() - if concept_name_map[qname].get("nocreate", False): - cursor = conn.cursor() - try: - cursor.execute(sql) - except Exception as e: - print(sql) - raise e - result = cursor.fetchone() - print(result) - cursor.close() - return sql - try: - conn.execute(f"CREATE TEMP VIEW {qname} AS " + sql) - except Exception as e: - print(sql) - raise e - print(f"CREATED VIEW {qname}") - - -def main() -> int: - - parser = argparse.ArgumentParser( - prog='buildmimic_duckdb', - description='Creates the MIMIC-III database in DuckDB and optionally the concepts views.', - ) - parser.add_argument('output_db_file', help="The destination DuckDB file to be written", default="./mimiciii.db") - parser.add_argument('--data-path', required=True) - parser.add_argument('--make-concepts', action="store_true") - parser.add_argument('--mimic-code-root', default='../../../') - args = parser.parse_args() - output_db_file = args.output_db_file - data_path = args.data_path - make_concepts = args.make_concepts - mimic_code_root = args.mimic_code_root - - if make_concepts: - connection = duckdb.connect(output_db_file) - print("Connected to duckdb...") - - #print("Defining macros...") - #for macro in macros: - # connection.execute(macro) - - print("Creating tables...") - - # ccs_dx is an outlier...this is adapted from the BigQuery version... - ccs_multi_dx_create = """ - DROP TABLE IF EXISTS ccs_multi_dx; - CREATE TABLE ccs_multi_dx - ( - icd9_code CHAR(5) NOT NULL, - -- CCS levels and names based on position in hierarchy - ccs_level1 VARCHAR(10), - ccs_group1 VARCHAR(100), - ccs_level2 VARCHAR(10), - ccs_group2 VARCHAR(100), - ccs_level3 VARCHAR(10), - ccs_group3 VARCHAR(100), - ccs_level4 VARCHAR(10), - ccs_group4 VARCHAR(100) - ); - """ - - print("Loading data...") - try: - #FIXME: Turn this line back on! - #connection.execute(ccs_multi_dx_create) - #connection.execute(...) - data_path = os.path.join(mimic_code_root, 'mimic-iii','concepts_postgres','diagnosis','ccs_multi_dx.csv.gz') - #connection.from_csv_auto( - # name=data_path, - # header=True) - #FIXME: Turn this line back on! 
- #connection.execute(f"COPY ccs_multi_dx from '{data_path}' (FORMAT CSV, DELIMITER ',', HEADER);") - - print(connection.sql("SELECT * FROM ccs_multi_dx LIMIT 10;")) - except Exception as error: - print("Failed to setup ccs_multi_dx: ", error) - raise error - finally: - if connection: - connection.close() - print("duckdb connection is closed") - - connection = duckdb.connect(output_db_file) - - print("Creating views...") - try: - for key in concept_name_map: - #cProfile.run('...') - #print(f"Making view {key}...") - db = concept_name_map[key].get("db", "bigquery") - if db == "duckdb": - qpath = os.path.join(mimic_code_root, 'mimic-iii', 'buildmimic', 'duckdb', 'concepts', concept_name_map[key]['path']) - _make_duckdb_query_duckdb(key, qpath, connection) - elif db == "bigquery": - qpath = os.path.join(mimic_code_root, 'mimic-iii', 'concepts', concept_name_map[key]['path']) - _make_duckdb_query_bigquery(key, qpath, connection) - - except Exception as error: - print("Failed to execute translated SQL: ", error) - raise error - finally: - if connection: - connection.close() - print("duckdb connection is closed") - -if __name__ == '__main__': - sys.exit(main()) - - - - From 2b877a83b48e49bbedca43a411c5525c8179aca8 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Fri, 28 Apr 2023 18:00:46 +0000 Subject: [PATCH 13/20] Rename in line with Postgres --- .../{import_duckdb_tables.sql => duckdb_add_tables.sql} | 0 mimic-iii/buildmimic/duckdb/import_duckdb.py | 4 +--- 2 files changed, 1 insertion(+), 3 deletions(-) rename mimic-iii/buildmimic/duckdb/{import_duckdb_tables.sql => duckdb_add_tables.sql} (100%) diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb_tables.sql b/mimic-iii/buildmimic/duckdb/duckdb_add_tables.sql similarity index 100% rename from mimic-iii/buildmimic/duckdb/import_duckdb_tables.sql rename to mimic-iii/buildmimic/duckdb/duckdb_add_tables.sql diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py index 4172032af..bcc3d1355 100644 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.py +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py @@ -152,8 +152,6 @@ def _duckdb_rewrite_schema(sql: str, schema: str): for identifier in table.find_all(exp.Identifier): if identifier.this.lower() in tables_in_schema: sql = sql.replace('"'+identifier.this+'"', schema+'.'+identifier.this.lower()) - print(sql) - print(identifier) # The below (unfinished) causes problems because some munging of functions # occurs in the output. The above approach is kludgy, but works and limits # the blast radius of potential problems regexping SQL. 
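For reference, the unfinished AST route described in the comment above would look roughly like the following with sqlglot. This is a sketch only: the comment notes that the generator munged some function output with the sqlglot version in use, which is why the string-replace approach was kept.

    import sqlglot
    from sqlglot import exp

    def _qualify(node, schema, tables_in_schema):
        # rewrite bare table references in the AST to schema-qualified ones
        if isinstance(node, exp.Table) and node.name.lower() in tables_in_schema:
            node.set("this", exp.to_identifier(node.name.lower()))
            node.set("db", exp.to_identifier(schema))
        return node

    tree = sqlglot.parse_one(sql, read="duckdb")
    sql = tree.transform(lambda n: _qualify(n, schema, tables_in_schema)).sql(dialect="duckdb")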
@@ -254,7 +252,7 @@ def main() -> int: print("Creating tables...") - with open(os.path.join(mimic_code_root, 'mimic-iii','buildmimic','duckdb','import_duckdb_tables.sql'), 'r') as fp: + with open(os.path.join(mimic_code_root, 'mimic-iii','buildmimic','duckdb','duckdb_add_tables.sql'), 'r') as fp: sql = fp.read() connection.execute(sql) From f74a4eba5964cd685b7423e9a993532cce4958fa Mon Sep 17 00:00:00 2001 From: SphtKr Date: Fri, 28 Apr 2023 18:08:11 +0000 Subject: [PATCH 14/20] Adding indexes --- .../buildmimic/duckdb/duckdb_add_indexes.sql | 546 ++++++++++++++++++ mimic-iii/buildmimic/duckdb/import_duckdb.py | 9 + 2 files changed, 555 insertions(+) create mode 100644 mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql diff --git a/mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql b/mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql new file mode 100644 index 000000000..b350f7ca0 --- /dev/null +++ b/mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql @@ -0,0 +1,546 @@ +-- ---------------------------------------------------------------- +-- +-- This is a script to add the MIMIC-III indexes for DuckDB. +-- +-- ---------------------------------------------------------------- + +-- This is based on the PostgreSQL version but removes the partitioning +-- for the CHARTEVENTS table. + +------------- +-- ADMISSIONS +------------- + +DROP INDEX IF EXISTS ADMISSIONS_idx01; +CREATE INDEX ADMISSIONS_IDX01 + ON ADMISSIONS (SUBJECT_ID); + +DROP INDEX IF EXISTS ADMISSIONS_idx02; +CREATE INDEX ADMISSIONS_IDX02 + ON ADMISSIONS (HADM_ID); + +-- DROP INDEX IF EXISTS ADMISSIONS_idx03; +-- CREATE INDEX ADMISSIONS_IDX03 +-- ON ADMISSIONS (ADMISSION_TYPE); + + +----------- +--CALLOUT-- +----------- + +DROP INDEX IF EXISTS CALLOUT_idx01; +CREATE INDEX CALLOUT_IDX01 + ON CALLOUT (SUBJECT_ID); + +DROP INDEX IF EXISTS CALLOUT_idx02; +CREATE INDEX CALLOUT_IDX02 + ON CALLOUT (HADM_ID); + +-- DROP INDEX IF EXISTS CALLOUT_idx03; +-- CREATE INDEX CALLOUT_IDX03 +-- ON CALLOUT (CALLOUT_SERVICE); + +-- DROP INDEX IF EXISTS CALLOUT_idx04; +-- CREATE INDEX CALLOUT_IDX04 +-- ON CALLOUT (CURR_WARDID, CALLOUT_WARDID, +-- DISCHARGE_WARDID); + +-- DROP INDEX IF EXISTS CALLOUT_idx05; +-- CREATE INDEX CALLOUT_IDX05 +-- ON CALLOUT (CALLOUT_STATUS, +-- CALLOUT_OUTCOME); + +-- DROP INDEX IF EXISTS CALLOUT_idx06; +-- CREATE INDEX CALLOUT_IDX06 +-- ON CALLOUT (CREATETIME, UPDATETIME, +-- ACKNOWLEDGETIME, OUTCOMETIME); + +--------------- +-- CAREGIVERS +--------------- + +-- DROP INDEX IF EXISTS CAREGIVERS_idx01; +-- CREATE INDEX CAREGIVERS_IDX01 +-- ON CAREGIVERS (CGID, LABEL); + +--------------- +-- CHARTEVENTS +--------------- + +DROP INDEX IF EXISTS chartevents_idx01; +CREATE INDEX chartevents_idx01 ON chartevents (itemid); + +--------------- +-- CPTEVENTS +--------------- + +DROP INDEX IF EXISTS CPTEVENTS_idx01; +CREATE INDEX CPTEVENTS_idx01 + ON CPTEVENTS (SUBJECT_ID); + +DROP INDEX IF EXISTS CPTEVENTS_idx02; +CREATE INDEX CPTEVENTS_idx02 + ON CPTEVENTS (CPT_CD); + +----------- +-- D_CPT +----------- + +-- Table is 134 rows - doesn't need an index. 
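+-- (To confirm DuckDB actually uses one of these indexes, EXPLAIN a point
+-- lookup, for example:
+--   EXPLAIN SELECT * FROM ADMISSIONS WHERE SUBJECT_ID = 10006;
+-- the plan should show an index scan rather than a full sequential scan.)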
+ +-------------------- +-- D_ICD_DIAGNOSES +-------------------- + +DROP INDEX IF EXISTS D_ICD_DIAG_idx01; +CREATE INDEX D_ICD_DIAG_idx01 + ON D_ICD_DIAGNOSES (ICD9_CODE); + +DROP INDEX IF EXISTS D_ICD_DIAG_idx02; +CREATE INDEX D_ICD_DIAG_idx02 + ON D_ICD_DIAGNOSES (LONG_TITLE); + +-------------------- +-- D_ICD_PROCEDURES +-------------------- + +DROP INDEX IF EXISTS D_ICD_PROC_idx01; +CREATE INDEX D_ICD_PROC_idx01 + ON D_ICD_PROCEDURES (ICD9_CODE); + +DROP INDEX IF EXISTS D_ICD_PROC_idx02; +CREATE INDEX D_ICD_PROC_idx02 + ON D_ICD_PROCEDURES (LONG_TITLE); + +----------- +-- D_ITEMS +----------- + +DROP INDEX IF EXISTS D_ITEMS_idx01; +CREATE INDEX D_ITEMS_idx01 + ON D_ITEMS (ITEMID); + +DROP INDEX IF EXISTS D_ITEMS_idx02; +CREATE INDEX D_ITEMS_idx02 + ON D_ITEMS (LABEL); + +-- DROP INDEX IF EXISTS D_ITEMS_idx03; +-- CREATE INDEX D_ITEMS_idx03 +-- ON D_ITEMS (CATEGORY); + +--------------- +-- D_LABITEMS +--------------- + +DROP INDEX IF EXISTS D_LABITEMS_idx01; +CREATE INDEX D_LABITEMS_idx01 + ON D_LABITEMS (ITEMID); + +DROP INDEX IF EXISTS D_LABITEMS_idx02; +CREATE INDEX D_LABITEMS_idx02 + ON D_LABITEMS (LABEL); + +DROP INDEX IF EXISTS D_LABITEMS_idx03; +CREATE INDEX D_LABITEMS_idx03 + ON D_LABITEMS (LOINC_CODE); + +------------------- +-- DATETIMEEVENTS +------------------- + +DROP INDEX IF EXISTS DATETIMEEVENTS_idx01; +CREATE INDEX DATETIMEEVENTS_idx01 + ON DATETIMEEVENTS (SUBJECT_ID); + +DROP INDEX IF EXISTS DATETIMEEVENTS_idx02; +CREATE INDEX DATETIMEEVENTS_idx02 + ON DATETIMEEVENTS (ITEMID); + +DROP INDEX IF EXISTS DATETIMEEVENTS_idx03; +CREATE INDEX DATETIMEEVENTS_idx03 + ON DATETIMEEVENTS (ICUSTAY_ID); + +DROP INDEX IF EXISTS DATETIMEEVENTS_idx04; +CREATE INDEX DATETIMEEVENTS_idx04 + ON DATETIMEEVENTS (HADM_ID); + +-- DROP INDEX IF EXISTS DATETIMEEVENTS_idx05; +-- CREATE INDEX DATETIMEEVENTS_idx05 +-- ON DATETIMEEVENTS (VALUE); + +------------------ +-- DIAGNOSES_ICD +------------------ + +DROP INDEX IF EXISTS DIAGNOSES_ICD_idx01; +CREATE INDEX DIAGNOSES_ICD_idx01 + ON DIAGNOSES_ICD (SUBJECT_ID); + +DROP INDEX IF EXISTS DIAGNOSES_ICD_idx02; +CREATE INDEX DIAGNOSES_ICD_idx02 + ON DIAGNOSES_ICD (ICD9_CODE); + +DROP INDEX IF EXISTS DIAGNOSES_ICD_idx03; +CREATE INDEX DIAGNOSES_ICD_idx03 + ON DIAGNOSES_ICD (HADM_ID); + +-------------- +-- DRGCODES +-------------- + +DROP INDEX IF EXISTS DRGCODES_idx01; +CREATE INDEX DRGCODES_idx01 + ON DRGCODES (SUBJECT_ID); + +DROP INDEX IF EXISTS DRGCODES_idx02; +CREATE INDEX DRGCODES_idx02 + ON DRGCODES (DRG_CODE); + +DROP INDEX IF EXISTS DRGCODES_idx03; +CREATE INDEX DRGCODES_idx03 + ON DRGCODES (DESCRIPTION); + +-- HADM_ID + +------------------ +-- ICUSTAYS +------------------ + +DROP INDEX IF EXISTS ICUSTAYS_idx01; +CREATE INDEX ICUSTAYS_idx01 + ON ICUSTAYS (SUBJECT_ID); + +DROP INDEX IF EXISTS ICUSTAYS_idx02; +CREATE INDEX ICUSTAYS_idx02 + ON ICUSTAYS (ICUSTAY_ID); + +-- DROP INDEX IF EXISTS ICUSTAYS_idx03; +-- CREATE INDEX ICUSTAYS_idx03 +-- ON ICUSTAYS (LOS); + +-- DROP INDEX IF EXISTS ICUSTAYS_idx04; +-- CREATE INDEX ICUSTAYS_idx04 +-- ON ICUSTAYS (FIRST_CAREUNIT); + +-- DROP INDEX IF EXISTS ICUSTAYS_idx05; +-- CREATE INDEX ICUSTAYS_idx05 +-- ON ICUSTAYS (LAST_CAREUNIT); + +DROP INDEX IF EXISTS ICUSTAYS_idx06; +CREATE INDEX ICUSTAYS_IDX06 + ON ICUSTAYS (HADM_ID); + +------------- +-- INPUTEVENTS_CV +------------- + +DROP INDEX IF EXISTS INPUTEVENTS_CV_idx01; +CREATE INDEX INPUTEVENTS_CV_idx01 + ON INPUTEVENTS_CV (SUBJECT_ID); + +DROP INDEX IF EXISTS INPUTEVENTS_CV_idx02; +CREATE INDEX INPUTEVENTS_CV_idx02 + ON INPUTEVENTS_CV (HADM_ID); + 
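+-- (These point-lookup indexes mirror the PostgreSQL build. Independently of
+-- them, DuckDB prunes row groups using min/max zone maps, so range predicates
+-- on roughly ordered columns such as CHARTTIME can stay fast even where no
+-- explicit index is created.)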
+DROP INDEX IF EXISTS INPUTEVENTS_CV_idx03; +CREATE INDEX INPUTEVENTS_CV_idx03 + ON INPUTEVENTS_CV (ICUSTAY_ID); + +DROP INDEX IF EXISTS INPUTEVENTS_CV_idx04; +CREATE INDEX INPUTEVENTS_CV_idx04 + ON INPUTEVENTS_CV (CHARTTIME); + +DROP INDEX IF EXISTS INPUTEVENTS_CV_idx05; +CREATE INDEX INPUTEVENTS_CV_idx05 + ON INPUTEVENTS_CV (ITEMID); + +-- DROP INDEX IF EXISTS INPUTEVENTS_CV_idx06; +-- CREATE INDEX INPUTEVENTS_CV_idx06 +-- ON INPUTEVENTS_CV (RATE); + +-- DROP INDEX IF EXISTS INPUTEVENTS_CV_idx07; +-- CREATE INDEX INPUTEVENTS_CV_idx07 +-- ON INPUTEVENTS_CV (AMOUNT); + +-- DROP INDEX IF EXISTS INPUTEVENTS_CV_idx08; +-- CREATE INDEX INPUTEVENTS_CV_idx08 +-- ON INPUTEVENTS_CV (CGID); + +-- DROP INDEX IF EXISTS INPUTEVENTS_CV_idx09; +-- CREATE INDEX INPUTEVENTS_CV_idx09 +-- ON INPUTEVENTS_CV (LINKORDERID, ORDERID); + +------------- +-- INPUTEVENTS_MV +------------- + +DROP INDEX IF EXISTS INPUTEVENTS_MV_idx01; +CREATE INDEX INPUTEVENTS_MV_idx01 + ON INPUTEVENTS_MV (SUBJECT_ID); + +DROP INDEX IF EXISTS INPUTEVENTS_MV_idx02; +CREATE INDEX INPUTEVENTS_MV_idx02 + ON INPUTEVENTS_MV (HADM_ID); + +DROP INDEX IF EXISTS INPUTEVENTS_MV_idx03; +CREATE INDEX INPUTEVENTS_MV_idx03 + ON INPUTEVENTS_MV (ICUSTAY_ID); + +-- DROP INDEX IF EXISTS INPUTEVENTS_MV_idx04; +-- CREATE INDEX INPUTEVENTS_MV_idx04 +-- ON INPUTEVENTS_MV (ENDTIME, STARTTIME); + +DROP INDEX IF EXISTS INPUTEVENTS_MV_idx05; +CREATE INDEX INPUTEVENTS_MV_idx05 + ON INPUTEVENTS_MV (ITEMID); + +-- DROP INDEX IF EXISTS INPUTEVENTS_MV_idx06; +-- CREATE INDEX INPUTEVENTS_MV_idx06 +-- ON INPUTEVENTS_MV (RATE); + +-- DROP INDEX IF EXISTS INPUTEVENTS_MV_idx07; +-- CREATE INDEX INPUTEVENTS_MV_idx07 +-- ON INPUTEVENTS_MV (VOLUME); + +-- DROP INDEX IF EXISTS INPUTEVENTS_MV_idx08; +-- CREATE INDEX INPUTEVENTS_MV_idx08 +-- ON INPUTEVENTS_MV (CGID); + +-- DROP INDEX IF EXISTS INPUTEVENTS_MV_idx09; +-- CREATE INDEX INPUTEVENTS_MV_idx09 +-- ON INPUTEVENTS_MV (LINKORDERID, ORDERID); + +-- DROP INDEX IF EXISTS INPUTEVENTS_MV_idx10; +-- CREATE INDEX INPUTEVENTS_MV_idx10 +-- ON INPUTEVENTS_MV (ORDERCATEGORYDESCRIPTION, +-- ORDERCATEGORYNAME, SECONDARYORDERCATEGORYNAME); + +-- DROP INDEX IF EXISTS INPUTEVENTS_MV_idx11; +-- CREATE INDEX INPUTEVENTS_MV_idx11 +-- ON INPUTEVENTS_MV (ORDERCOMPONENTTYPEDESCRIPTION, +-- ORDERCATEGORYDESCRIPTION); + + +-------------- +-- LABEVENTS +-------------- + +DROP INDEX IF EXISTS LABEVENTS_idx01; +CREATE INDEX LABEVENTS_idx01 + ON LABEVENTS (SUBJECT_ID); + +DROP INDEX IF EXISTS LABEVENTS_idx02; +CREATE INDEX LABEVENTS_idx02 + ON LABEVENTS (HADM_ID); + +DROP INDEX IF EXISTS LABEVENTS_idx03; +CREATE INDEX LABEVENTS_idx03 + ON LABEVENTS (ITEMID); + +-- DROP INDEX IF EXISTS LABEVENTS_idx04; +-- CREATE INDEX LABEVENTS_idx04 +-- ON LABEVENTS (VALUE, VALUENUM); + +---------------------- +-- MICROBIOLOGYEVENTS +---------------------- + +DROP INDEX IF EXISTS MICROBIOLOGYEVENTS_idx01; +CREATE INDEX MICROBIOLOGYEVENTS_idx01 + ON MICROBIOLOGYEVENTS (SUBJECT_ID); + +DROP INDEX IF EXISTS MICROBIOLOGYEVENTS_idx02; +CREATE INDEX MICROBIOLOGYEVENTS_idx02 + ON MICROBIOLOGYEVENTS (HADM_ID); + +-- DROP INDEX IF EXISTS MICROBIOLOGYEVENTS_idx03; +-- CREATE INDEX MICROBIOLOGYEVENTS_idx03 +-- ON MICROBIOLOGYEVENTS (SPEC_ITEMID, +-- ORG_ITEMID, AB_ITEMID); + +--------------- +-- NOTEEVENTS +--------------- + +DROP INDEX IF EXISTS NOTEEVENTS_idx01; +CREATE INDEX NOTEEVENTS_idx01 + ON NOTEEVENTS (SUBJECT_ID); + +DROP INDEX IF EXISTS NOTEEVENTS_idx02; +CREATE INDEX NOTEEVENTS_idx02 + ON NOTEEVENTS (HADM_ID); + +-- DROP INDEX IF EXISTS NOTEEVENTS_idx03; +-- 
CREATE INDEX NOTEEVENTS_idx03 +-- ON NOTEEVENTS (CGID); + +-- DROP INDEX IF EXISTS NOTEEVENTS_idx04; +-- CREATE INDEX NOTEEVENTS_idx04 +-- ON NOTEEVENTS (RECORD_ID); + +DROP INDEX IF EXISTS NOTEEVENTS_idx05; +CREATE INDEX NOTEEVENTS_idx05 + ON NOTEEVENTS (CATEGORY); + + +--------------- +-- OUTPUTEVENTS +--------------- +DROP INDEX IF EXISTS OUTPUTEVENTS_idx01; +CREATE INDEX OUTPUTEVENTS_idx01 + ON OUTPUTEVENTS (SUBJECT_ID); + + +DROP INDEX IF EXISTS OUTPUTEVENTS_idx02; +CREATE INDEX OUTPUTEVENTS_idx02 + ON OUTPUTEVENTS (ITEMID); + + +DROP INDEX IF EXISTS OUTPUTEVENTS_idx03; +CREATE INDEX OUTPUTEVENTS_idx03 + ON OUTPUTEVENTS (ICUSTAY_ID); + + +DROP INDEX IF EXISTS OUTPUTEVENTS_idx04; +CREATE INDEX OUTPUTEVENTS_idx04 + ON OUTPUTEVENTS (HADM_ID); + +-- Perhaps not useful to index on just value? Index just for popular subset? +-- DROP INDEX IF EXISTS OUTPUTEVENTS_idx05; +-- CREATE INDEX OUTPUTEVENTS_idx05 +-- ON OUTPUTEVENTS (VALUE); + + +------------- +-- PATIENTS +------------- + +-- Note that SUBJECT_ID is already indexed as it is unique + +-- DROP INDEX IF EXISTS PATIENTS_idx01; +-- CREATE INDEX PATIENTS_idx01 +-- ON PATIENTS (EXPIRE_FLAG); + + +------------------ +-- PRESCRIPTIONS +------------------ + +DROP INDEX IF EXISTS PRESCRIPTIONS_idx01; +CREATE INDEX PRESCRIPTIONS_idx01 + ON PRESCRIPTIONS (SUBJECT_ID); + +DROP INDEX IF EXISTS PRESCRIPTIONS_idx02; +CREATE INDEX PRESCRIPTIONS_idx02 + ON PRESCRIPTIONS (ICUSTAY_ID); + +DROP INDEX IF EXISTS PRESCRIPTIONS_idx03; +CREATE INDEX PRESCRIPTIONS_idx03 + ON PRESCRIPTIONS (DRUG_TYPE); + +DROP INDEX IF EXISTS PRESCRIPTIONS_idx04; +CREATE INDEX PRESCRIPTIONS_idx04 + ON PRESCRIPTIONS (DRUG); + +DROP INDEX IF EXISTS PRESCRIPTIONS_idx05; +CREATE INDEX PRESCRIPTIONS_idx05 + ON PRESCRIPTIONS (HADM_ID); + + +--------------------- +-- PROCEDUREEVENTS_MV +--------------------- + +DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx01; +CREATE INDEX PROCEDUREEVENTS_MV_idx01 + ON PROCEDUREEVENTS_MV (SUBJECT_ID); + +DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx02; +CREATE INDEX PROCEDUREEVENTS_MV_idx02 + ON PROCEDUREEVENTS_MV (HADM_ID); + +DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx03; +CREATE INDEX PROCEDUREEVENTS_MV_idx03 + ON PROCEDUREEVENTS_MV (ICUSTAY_ID); + +-- DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx04; +-- CREATE INDEX PROCEDUREEVENTS_MV_idx04 +-- ON PROCEDUREEVENTS_MV (ENDTIME, STARTTIME); + +DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx05; +CREATE INDEX PROCEDUREEVENTS_MV_idx05 + ON PROCEDUREEVENTS_MV (ITEMID); + +-- DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx06; +-- CREATE INDEX PROCEDUREEVENTS_MV_idx06 +-- ON PROCEDUREEVENTS_MV (VALUE); + +-- DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx07; +-- CREATE INDEX PROCEDUREEVENTS_MV_idx07 +-- ON PROCEDUREEVENTS_MV (CGID); + +-- DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx08; +-- CREATE INDEX PROCEDUREEVENTS_MV_idx08 +-- ON PROCEDUREEVENTS_MV (LINKORDERID, ORDERID); + +-- DROP INDEX IF EXISTS PROCEDUREEVENTS_MV_idx09; +-- CREATE INDEX PROCEDUREEVENTS_MV_idx09 +-- ON PROCEDUREEVENTS_MV (ORDERCATEGORYDESCRIPTION, +-- ORDERCATEGORYNAME, SECONDARYORDERCATEGORYNAME); + +------------------- +-- PROCEDURES_ICD +------------------- + +DROP INDEX IF EXISTS PROCEDURES_ICD_idx01; +CREATE INDEX PROCEDURES_ICD_idx01 + ON PROCEDURES_ICD (SUBJECT_ID); + +DROP INDEX IF EXISTS PROCEDURES_ICD_idx02; +CREATE INDEX PROCEDURES_ICD_idx02 + ON PROCEDURES_ICD (ICD9_CODE); + +DROP INDEX IF EXISTS PROCEDURES_ICD_idx03; +CREATE INDEX PROCEDURES_ICD_idx03 + ON PROCEDURES_ICD (HADM_ID); + + +------------- +-- SERVICES +------------- + +DROP 
INDEX IF EXISTS SERVICES_idx01; +CREATE INDEX SERVICES_idx01 + ON SERVICES (SUBJECT_ID); + +DROP INDEX IF EXISTS SERVICES_idx02; +CREATE INDEX SERVICES_idx02 + ON SERVICES (HADM_ID); + +-- DROP INDEX IF EXISTS SERVICES_idx03; +-- CREATE INDEX SERVICES_idx03 +-- ON SERVICES (CURR_SERVICE, PREV_SERVICE); + +------------- +-- TRANSFERS +------------- + +DROP INDEX IF EXISTS TRANSFERS_idx01; +CREATE INDEX TRANSFERS_idx01 + ON TRANSFERS (SUBJECT_ID); + +DROP INDEX IF EXISTS TRANSFERS_idx02; +CREATE INDEX TRANSFERS_idx02 + ON TRANSFERS (ICUSTAY_ID); + +DROP INDEX IF EXISTS TRANSFERS_idx03; +CREATE INDEX TRANSFERS_idx03 + ON TRANSFERS (HADM_ID); + +-- DROP INDEX IF EXISTS TRANSFERS_idx04; +-- CREATE INDEX TRANSFERS_idx04 +-- ON TRANSFERS (INTIME, OUTTIME); + +-- DROP INDEX IF EXISTS TRANSFERS_idx05; +-- CREATE INDEX TRANSFERS_idx05 +-- ON TRANSFERS (LOS); diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py index bcc3d1355..9c804b1a3 100644 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.py +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py @@ -228,6 +228,7 @@ def main() -> int: parser.add_argument('--mimic-code-root', help="location of the mimic-code repo (used to find concepts SQL)", default='../../../') parser.add_argument('--make-concepts', help="generate the concepts views", action="store_true") parser.add_argument('--skip-tables', help="don't create schema or load data (they must already exist)", action="store_true") + parser.add_argument('--skip-indexes', help="don't create indexes (implied by --skip-tables)", action="store_true") parser.add_argument('--schema-name', help="put all object (except ccs_dx) into a schema (like the PostgreSQL version)", default=None) args = parser.parse_args() output_db = args.output_db @@ -235,6 +236,7 @@ def main() -> int: make_concepts = args.make_concepts mimic_code_root = args.mimic_code_root skip_tables = args.skip_tables + skip_indexes = args.skip_indexes #TODO: validate schema_name is valid identifier schema_name = args.schema_name @@ -267,6 +269,13 @@ def main() -> int: print(f" {tablename}") connection.execute(f"COPY {tablename} from '{os.path.join(mimic_data_dir,m.group(0))}' (FORMAT CSV, DELIMITER ',', HEADER);") + if not skip_indexes: + print("Adding indexes...") + + with open(os.path.join(mimic_code_root, 'mimic-iii','buildmimic','duckdb','duckdb_add_indexes.sql'), 'r') as fp: + sql = fp.read() + connection.execute(sql) + except Exception as error: print("Failed setting up database: ", error) raise error From 91fed32fe9be6ea9c3d358e7e8fe49ffad0fbc11 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Fri, 28 Apr 2023 18:13:46 +0000 Subject: [PATCH 15/20] Added checks --- mimic-iii/buildmimic/duckdb/duckdb_checks.sql | 77 +++++++++++++++++++ mimic-iii/buildmimic/duckdb/import_duckdb.py | 8 ++ 2 files changed, 85 insertions(+) create mode 100644 mimic-iii/buildmimic/duckdb/duckdb_checks.sql diff --git a/mimic-iii/buildmimic/duckdb/duckdb_checks.sql b/mimic-iii/buildmimic/duckdb/duckdb_checks.sql new file mode 100644 index 000000000..2b149ccb7 --- /dev/null +++ b/mimic-iii/buildmimic/duckdb/duckdb_checks.sql @@ -0,0 +1,77 @@ +-- this query runs a few simple checks to make sure the database has loaded in OK +-- These checks are designed for MIMIC-III v1.4 + +-- If running scripts individually, you can set the schema where all tables are created as follows: +-- SET search_path TO mimiciii; + +with expected as +( +select 'admissions' as tbl, 58976 as row_count UNION ALL +select 'callout' as tbl, 34499 as 
row_count UNION ALL +select 'caregivers' as tbl, 7567 as row_count UNION ALL +select 'chartevents' as tbl, 330712483 as row_count UNION ALL +select 'cptevents' as tbl, 573146 as row_count UNION ALL +select 'd_cpt' as tbl, 134 as row_count UNION ALL +select 'd_icd_diagnoses' as tbl, 14567 as row_count UNION ALL +select 'd_icd_procedures' as tbl, 3882 as row_count UNION ALL +select 'd_items' as tbl, 12487 as row_count UNION ALL +select 'd_labitems' as tbl, 753 as row_count UNION ALL +select 'datetimeevents' as tbl, 4485937 as row_count UNION ALL +select 'diagnoses_icd' as tbl, 651047 as row_count UNION ALL +select 'drgcodes' as tbl, 125557 as row_count UNION ALL +select 'icustays' as tbl, 61532 as row_count UNION ALL +select 'inputevents_cv' as tbl, 17527935 as row_count UNION ALL +select 'inputevents_mv' as tbl, 3618991 as row_count UNION ALL +select 'labevents' as tbl, 27854055 as row_count UNION ALL +select 'microbiologyevents' as tbl, 631726 as row_count UNION ALL +select 'noteevents' as tbl, 2083180 as row_count UNION ALL +select 'outputevents' as tbl, 4349218 as row_count UNION ALL +select 'patients' as tbl, 46520 as row_count UNION ALL +select 'prescriptions' as tbl, 4156450 as row_count UNION ALL +select 'procedureevents_mv' as tbl, 258066 as row_count UNION ALL +select 'procedures_icd' as tbl, 240095 as row_count UNION ALL +select 'services' as tbl, 73343 as row_count UNION ALL +select 'transfers' as tbl, 261897 as row_count +) +, observed as +( + select 'admissions' as tbl, count(*) as row_count from admissions UNION ALL + select 'callout' as tbl, count(*) as row_count from callout UNION ALL + select 'caregivers' as tbl, count(*) as row_count from caregivers UNION ALL + select 'chartevents' as tbl, count(*) as row_count from chartevents UNION ALL + select 'cptevents' as tbl, count(*) as row_count from cptevents UNION ALL + select 'd_cpt' as tbl, count(*) as row_count from d_cpt UNION ALL + select 'd_icd_diagnoses' as tbl, count(*) as row_count from d_icd_diagnoses UNION ALL + select 'd_icd_procedures' as tbl, count(*) as row_count from d_icd_procedures UNION ALL + select 'd_items' as tbl, count(*) as row_count from d_items UNION ALL + select 'd_labitems' as tbl, count(*) as row_count from d_labitems UNION ALL + select 'datetimeevents' as tbl, count(*) as row_count from datetimeevents UNION ALL + select 'diagnoses_icd' as tbl, count(*) as row_count from diagnoses_icd UNION ALL + select 'drgcodes' as tbl, count(*) as row_count from drgcodes UNION ALL + select 'icustays' as tbl, count(*) as row_count from icustays UNION ALL + select 'inputevents_cv' as tbl, count(*) as row_count from inputevents_cv UNION ALL + select 'inputevents_mv' as tbl, count(*) as row_count from inputevents_mv UNION ALL + select 'labevents' as tbl, count(*) as row_count from labevents UNION ALL + select 'microbiologyevents' as tbl, count(*) as row_count from microbiologyevents UNION ALL + select 'noteevents' as tbl, count(*) as row_count from noteevents UNION ALL + select 'outputevents' as tbl, count(*) as row_count from outputevents UNION ALL + select 'patients' as tbl, count(*) as row_count from patients UNION ALL + select 'prescriptions' as tbl, count(*) as row_count from prescriptions UNION ALL + select 'procedureevents_mv' as tbl, count(*) as row_count from procedureevents_mv UNION ALL + select 'procedures_icd' as tbl, count(*) as row_count from procedures_icd UNION ALL + select 'services' as tbl, count(*) as row_count from services UNION ALL + select 'transfers' as tbl, count(*) as row_count from transfers 
+)
+select
+  exp.tbl
+  , exp.row_count as expected_count
+  , obs.row_count as observed_count
+  , case
+    when exp.row_count = obs.row_count
+    then 'PASSED'
+    else 'FAILED'
+  end as ROW_COUNT_CHECK
+from expected exp
+inner join observed obs
+  on exp.tbl = obs.tbl
+order by exp.tbl;
diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py
index 9c804b1a3..344c931c9 100644
--- a/mimic-iii/buildmimic/duckdb/import_duckdb.py
+++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py
@@ -276,6 +276,14 @@ def main() -> int:
                 sql = fp.read()
                 connection.execute(sql)
 
+        print("Running checks...")
+
+        with open(os.path.join(mimic_code_root, 'mimic-iii','buildmimic','duckdb','duckdb_checks.sql'), 'r') as fp:
+            sql = fp.read()
+            result = connection.execute(sql).fetchall()
+            for row in result:
+                print(f"{row[0]}: {row[2]} records ({row[1]} expected) - {row[3]}")
+
     except Exception as error:
         print("Failed setting up database: ", error)
         raise error

From 9dea2876b317eb9722ade0ef2cabd904af768ed8 Mon Sep 17 00:00:00 2001
From: SphtKr
Date: Fri, 28 Apr 2023 23:57:51 +0000
Subject: [PATCH 16/20] Updating README.md

---
 mimic-iii/buildmimic/duckdb/README.md | 147 ++++++++++++++++++--------
 1 file changed, 105 insertions(+), 42 deletions(-)

diff --git a/mimic-iii/buildmimic/duckdb/README.md b/mimic-iii/buildmimic/duckdb/README.md
index 17787e2ec..6fc6ef4c9 100644
--- a/mimic-iii/buildmimic/duckdb/README.md
+++ b/mimic-iii/buildmimic/duckdb/README.md
@@ -1,75 +1,78 @@
-# DuckDB
+# MIMIC-III in DuckDB
 
-The script in this folder creates the schema for MIMIC-IV and
+The scripts in this folder create the schema for MIMIC-III and
-loads the data into the appropriate tables for [DuckDB](https://duckdb.org/).
+load the data into the appropriate tables for [DuckDB](https://duckdb.org/).
+
+The Python script (`import_duckdb.py`) also includes the option to
+add the [concepts views](../../concepts/README.md) to the database.
+This makes it much easier to use the concepts views as you do not
+have to install and set up PostgreSQL or use BigQuery.
+
 DuckDB, like SQLite, is serverless and stores all
 information in a single file.
 Unlike SQLite, an OLTP database, DuckDB is an OLAP database, and
 therefore optimized for analytical queries.
-This will result in faster queries for researchers using MIMIC-IV
+This will result in faster queries for researchers using MIMIC-III
 with DuckDB compared to SQLite.
 To learn more, please read their ["why duckdb"](https://duckdb.org/docs/why_duckdb) page.
 
-The instructions to load MIMIC-III into a DuckDB
-only require:
-1. DuckDB to be installed and
-2. Your computer to have a POSIX-compliant terminal shell,
-   which is already found by default on any Mac OSX, Linux, or BSD installation.
+## Download MIMIC-III files
 
-To use these instructions on Windows,
-you need a Unix command line environment,
-which you can obtain by either installing
-[Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10)
-or [Cygwin](https://www.cygwin.com/).
-
-## Set-up
-
-### Quick overview
-
-1. [Install](https://duckdb.org/docs/installation/) the CLI version of DuckDB
-2. [Download](https://physionet.org/content/mimiciii/1.4/) the MIMIC-III files
-3. Create DuckDB database and load data
+[Download](https://physionet.org/content/mimiciii/1.4/)
+the CSV files for MIMIC-III by any method you wish.
+(These scripts should also work with the much smaller
+[demo version](https://physionet.org/content/mimiciii-demo/1.4/#files-panel)
+of the dataset.)
-Follow instructions on their website to
-[install](https://duckdb.org/docs/installation/)
-the CLI version of DuckDB.
+The easiest way to download them is to open a terminal then run:
-You will need to place the `duckdb` binary in a folder on your environment path,
-e.g. `/usr/local/bin`.
+```
+wget -r -N -c -np -nH --cut-dirs=1 --user YOURUSERNAME --ask-password https://physionet.org/files/mimiciii/1.4/
+```
-### Download MIMIC-III files
+Replace `YOURUSERNAME` with your physionet username.
-[Download](https://physionet.org/content/mimiciii/1.4/)
-the CSV files for MIMIC-III by any method you wish.
+This will make your `mimic_data_dir` be `mimiciii/1.4`.
-The intructions assume the CSV files are in the folder structure as follows:
+The rest of these instructions assume the CSV files are in the folder structure as follows:
 
 ```
-mimic_data_dir
+mimic_data_dir/
   ADMISSIONS.csv.gz
+  CALLOUT.csv.gz
   ...
 ```
 
 The CSV files can be uncompressed (end in `.csv`) or compressed (end in `.csv.gz`).
 
-The easiest way to download them is to open a terminal then run:
-```
-wget -r -N -c -np -nH --cut-dirs=1 --user YOURUSERNAME --ask-password https://physionet.org/files/mimiciii/1.4/
-```
+## Shell script method (`import_duckdb.sh`)
 
-Replace `YOURUSERNAME` with your physionet username.
+Using this script to load MIMIC-III into a DuckDB
+only requires:
+1. DuckDB to be installed (the `duckdb` executable must be in your PATH)
+2. Your computer to have a POSIX-compliant terminal shell,
+   which is already found by default on any Mac OSX, Linux, or BSD installation.
 
-This will make you `mimic_data_dir` be `mimiciii/1.4`.
+To use these instructions on Windows,
+you need a Unix command line environment,
+which you can obtain by either installing
+[Windows Subsystem for Linux](https://docs.microsoft.com/en-us/windows/wsl/install-win10)
+or [Cygwin](https://www.cygwin.com/).
+
+### Install DuckDB
+
+Follow instructions on their website to
+[install](https://duckdb.org/docs/installation/)
+the CLI version of DuckDB.
+
+You will need to place the `duckdb` binary in a folder on your environment path,
+e.g. `/usr/local/bin`.
 
-# Create DuckDB database and load data
-The last step requires creating a DuckDB database and
-loading the data into it.
+### Create DuckDB database and load data
 
-You can do all of this will one shell script,
+You can do all of this with one shell script,
 `import_duckdb.sh`, located in this repository.
@@ -102,6 +105,66 @@
 The script will print out progress as it goes.
 Be patient, this can take minutes to hours to load
 depending on your computer's configuration.
 
+## Python script method (`import_duckdb.py`)
+
+This method does not require the DuckDB executable, the DuckDB Python
+module, and the [sqlglot](#build-and-modify-sql), both of which can be
+easily installed with `pip`.
+
+### Install dependencies
+
+Install the dependencies by using the included `requirements.txt` file:
+
+```sh
+python3 -m pip install -r ./requirements.txt
+```
+
+### Create DuckDB database and load data
+
+Create the MIMIC-III database with `import_duckdb.py` like so:
+
+```sh
+python ./import_duckdb.py /path/to/mimic_data_dir ./mimic3.db
+```
+
+...where `/path/to/mimic_data_dir` is the path containing the .csv or .csv.gz
+data files downloaded above.
+
+This command will create the `mimic3.db` file in the current directory. Be aware that
+for the full MIMIC-III v1.4 dataset the resulting file will be about 34GB in size.
+This process will take some time, as with the shell script version.
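+
+When the load completes, the script also runs the row-count checks in
+`duckdb_checks.sql` and prints one line per table with a PASSED/FAILED result.
+For a complete v1.4 load, the first lines of that output should look like
+this (illustrative):
+
+```
+admissions: 58976 records (58976 expected) - PASSED
+callout: 34499 records (34499 expected) - PASSED
+caregivers: 7567 records (7567 expected) - PASSED
+```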
+ +The default options will create only the tables and load the data, and assume +that you are running the script from the same directory where this README.md +is located. See the full options below if the defaults are insufficient. + +### Create the concepts views + +In most cases you will want to create the concepts views at the same time as +the database. To do this, add the `--make-concepts` option: + +```sh +python ./import_duckdb.py /path/to/mimic_data_dir ./mimic3.db --make-concepts +``` + +If you want to add the concepts to a database already created without this +option (or created with the shell script version), you can add the +`--skip-tables` option as well: + +```sh +python ./import_duckdb.py /path/to/mimic_data_dir ./mimic3.db --make-concepts --skip-tables +``` + +### Additional options + +There are a few additional options for special situations: + +| Option | Description +| - | - +| `--skip-indexes` | Don't create additional indexes when creating tables and loading data. This may be useful in memory-constrained systems or to save a little time. +| `--mimic-code-root [path]` | This argument specifies the location of the mimic-code repository files. This is needed to find the concepts SQL files. This is useful if you are running the script from a different directory than the one where this README.md file is located (the default is `../../../`) +| `--schema-name [name]` | This puts the tables and concepts views into a named schema in the database. This is mainly useful to mirror the behavior of the PostgreSQL version of the database, which places objects in a schema named `mimiciii` by default--if you have existing code designed for the PostgreSQL version, this may make migration easier. Note that--like the PostgreSQL version--the `ccs_dx` view is *not* placed in the specified schema, but in the default schema (which is `main` in DuckDB, not `public` as in PostgreSQL). + # Help Please see the [issues page](https://github.com/MIT-LCP/mimic-iii/issues) to discuss other issues you may be having. 
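+
+As a combined example, a hypothetical invocation using several of these
+options together (assuming the default repository layout) might be:
+
+```sh
+python ./import_duckdb.py /path/to/mimic_data_dir ./mimic3.db --make-concepts --skip-indexes --schema-name mimiciii
+```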
From 060cc71d3cf222c93726c87609014efc9d22e9d7 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Sat, 29 Apr 2023 00:18:09 +0000 Subject: [PATCH 17/20] Missed file rename --- mimic-iii/buildmimic/duckdb/import_duckdb.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.sh b/mimic-iii/buildmimic/duckdb/import_duckdb.sh index e7f6187bd..96c34cddc 100755 --- a/mimic-iii/buildmimic/duckdb/import_duckdb.sh +++ b/mimic-iii/buildmimic/duckdb/import_duckdb.sh @@ -70,7 +70,7 @@ fi # create tables using DDL from postgres # minor changes: TIMESTAMP(nn) -> TIMESTAMP -try duckdb "$OUTFILE" < import_duckdb_tables.sql +try duckdb "$OUTFILE" < duckdb_add_tables.sql # goal: get path from find, e.g., ./1.0/icu/d_items # and return database table name for it, e.g., mimic_icu.d_items From e36f49cfbc9d91c9c490d8e3b5c096c453b1e0d4 Mon Sep 17 00:00:00 2001 From: S'pht'Kr Date: Sun, 30 Apr 2023 21:07:08 -0500 Subject: [PATCH 18/20] Move fake CHARTEVENTS PK to indexes script -- this may fail on machines without a large amount of RAM, and this way it can be skipped with `--skip-indexes` --- mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql | 8 ++++++++ mimic-iii/buildmimic/duckdb/duckdb_add_tables.sql | 2 -- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql b/mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql index b350f7ca0..fabd7bd8f 100644 --- a/mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql +++ b/mimic-iii/buildmimic/duckdb/duckdb_add_indexes.sql @@ -544,3 +544,11 @@ CREATE INDEX TRANSFERS_idx03 -- DROP INDEX IF EXISTS TRANSFERS_idx05; -- CREATE INDEX TRANSFERS_idx05 -- ON TRANSFERS (LOS); + +-------------------------------------- +-- CHARTEVENTS PRIMARY KEY SUBSTITUTE +-------------------------------------- + +-- FIXME: Remove this index when the PK can be re-added... +CREATE UNIQUE INDEX chartevents_rowid_pk ON CHARTEVENTS (ROW_ID); + diff --git a/mimic-iii/buildmimic/duckdb/duckdb_add_tables.sql b/mimic-iii/buildmimic/duckdb/duckdb_add_tables.sql index e945ef0d1..9b8072b22 100644 --- a/mimic-iii/buildmimic/duckdb/duckdb_add_tables.sql +++ b/mimic-iii/buildmimic/duckdb/duckdb_add_tables.sql @@ -87,8 +87,6 @@ CREATE TABLE CHARTEVENTS -- See https://github.com/duckdb/duckdb/issues/6668#issuecomment-1474880266 --,CONSTRAINT chartevents_rowid_pk PRIMARY KEY (ROW_ID) ); --- Remove this index when the PK can be re-added... -CREATE UNIQUE INDEX chartevents_rowid_pk ON CHARTEVENTS (ROW_ID); DROP TABLE IF EXISTS CPTEVENTS CASCADE; CREATE TABLE CPTEVENTS From 6ba9ce73ea447a7bf1d3295d1cbde389896f78f5 Mon Sep 17 00:00:00 2001 From: SphtKr Date: Tue, 2 May 2023 23:46:51 +0000 Subject: [PATCH 19/20] Fixed outright errors --- mimic-iii/buildmimic/duckdb/README.md | 6 +++--- mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql | 3 +-- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/mimic-iii/buildmimic/duckdb/README.md b/mimic-iii/buildmimic/duckdb/README.md index 6fc6ef4c9..7980183b6 100644 --- a/mimic-iii/buildmimic/duckdb/README.md +++ b/mimic-iii/buildmimic/duckdb/README.md @@ -107,9 +107,9 @@ depending on your computer's configuration. ## Python script method (`import_duckdb.py`) -This method does not require the DuckDB executable, the DuckDB Python -module, and the [sqlglot](#build-and-modify-sql), both of which can be -easily installed with `pip`. 
+This method does not require the DuckDB executable; it only requires the DuckDB Python
+module and the [SQLGlot](https://github.com/tobymao/sqlglot) Python module, both of which can be
+easily installed with `pip`.
 
 ### Install dependencies
 
diff --git a/mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql b/mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql
index 71ecfc1c4..139ad20b8 100644
--- a/mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql
+++ b/mimic-iii/buildmimic/duckdb/concepts/icustay_hours.sql
@@ -17,5 +17,4 @@ SELECT
   --ah.endtime+ hr*INTERVAL 1 hour as endtime
 FROM all_hours AS ah
 ORDER BY
-  ah.icustay_id NULLS LAST
-limit 20
\ No newline at end of file
+  ah.icustay_id NULLS LAST
\ No newline at end of file

From 206c0f7375e181e3928fdfd9559fcf6eb2cabb4e Mon Sep 17 00:00:00 2001
From: SphtKr
Date: Mon, 8 May 2023 23:27:51 +0000
Subject: [PATCH 20/20] Experimental option to use integer or fractional DATETIME_DIFF function

---
 mimic-iii/buildmimic/duckdb/import_duckdb.py | 30 +++++++++++++++++---
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/mimic-iii/buildmimic/duckdb/import_duckdb.py b/mimic-iii/buildmimic/duckdb/import_duckdb.py
index 344c931c9..7a797b90d 100644
--- a/mimic-iii/buildmimic/duckdb/import_duckdb.py
+++ b/mimic-iii/buildmimic/duckdb/import_duckdb.py
@@ -134,14 +134,25 @@ def duckdb_date_sub_sql(self, expression):
 sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeSub] = duckdb_date_sub_sql
 sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeAdd] = sqlglot.dialects.duckdb._date_add
 
-def duckdb_date_diff_sql(self, expression):
+_unit_ms_conversion_factor_map = {
+    'SECOND': 1e6,
+    'MINUTE': 60.0*1e6,
+    'HOUR': 3600.0*1e6,
+    'DAY': 24*3600.0*1e6,
+    'YEAR': 365.242*24*3600.0*1e6,
+}
+def duckdb_date_diff_whole_sql(self, expression):
    #print("CALLING duckdb._date_diff")
     this = self.sql(expression, "this")
     unit = self.sql(expression, "unit") or "DAY"
     # DuckDB DATE_DIFF operand order is start_time, end_time--not like end_time - start_time!
     return f"DATE_DIFF('{unit}', {self.sql(expression.expression)}, {this})"
-sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_sql
-sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_sql
+def duckdb_date_diff_frac_sql(self, expression):
+    this = self.sql(expression, "this")
+    mfactor = _unit_ms_conversion_factor_map[self.sql(expression, "unit").upper() or "DAY"]
+    # DuckDB DATE_DIFF operand order is start_time, end_time--not like end_time - start_time!
+    return f"DATE_DIFF('microseconds', {self.sql(expression.expression)}, {this})/{mfactor:.1f}"
+# only one of these will be used, set later based on arguments!
 
 # This may not be strictly necessary because the views work without
 # it IF you `use` the schema first...
but making them fully qualified @@ -210,7 +221,7 @@ def _make_duckdb_query_duckdb(qname: str, qfile: str, conn, schema: str = None): sql = _duckdb_rewrite_schema(sql, schema) try: - conn.execute(f"CREATE VIEW {qname} AS " + sql) + conn.execute(f"CREATE OR REPLACE VIEW {qname} AS " + sql) except Exception as e: print(sql) raise e @@ -230,6 +241,7 @@ def main() -> int: parser.add_argument('--skip-tables', help="don't create schema or load data (they must already exist)", action="store_true") parser.add_argument('--skip-indexes', help="don't create indexes (implied by --skip-tables)", action="store_true") parser.add_argument('--schema-name', help="put all object (except ccs_dx) into a schema (like the PostgreSQL version)", default=None) + parser.add_argument('--integer-datetime-diff', help="EXPERIMENTAL: calculate integer DATETIME_DIFF results (like BigQuery) for e.g. icustay_detail.los_icu", action="store_true") args = parser.parse_args() output_db = args.output_db mimic_data_dir = args.mimic_data_dir @@ -237,9 +249,19 @@ def main() -> int: mimic_code_root = args.mimic_code_root skip_tables = args.skip_tables skip_indexes = args.skip_indexes + integer_datetime_diff = args.integer_datetime_diff #TODO: validate schema_name is valid identifier schema_name = args.schema_name + #EXPERIMENTAL! May be removed. + if integer_datetime_diff: + sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_whole_sql + sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_whole_sql + else: + sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DatetimeDiff] = duckdb_date_diff_frac_sql + sqlglot.dialects.duckdb.DuckDB.Generator.TRANSFORMS[exp.DateDiff] = duckdb_date_diff_frac_sql + + if not skip_tables: connection = duckdb.connect(output_db)
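+    # Illustrative example (assumed values, not executed by the build): for a
+    # BigQuery concepts expression DATETIME_DIFF(outtime, intime, DAY), with
+    # intime at midnight and outtime at noon the next day, the transform
+    # selected above emits:
+    #   whole: DATE_DIFF('DAY', intime, outtime)                         -> 1
+    #   frac:  DATE_DIFF('microseconds', intime, outtime)/86400000000.0  -> 1.5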