diff --git a/app/minds.egg-info/PKG-INFO b/app/minds.egg-info/PKG-INFO index a1f6702..9e075fd 100644 --- a/app/minds.egg-info/PKG-INFO +++ b/app/minds.egg-info/PKG-INFO @@ -1,6 +1,6 @@ Metadata-Version: 2.1 Name: minds -Version: 0.0.4 +Version: 0.0.5 Summary: A package for downloading and processing data from the MINDS database Home-page: https://github.com/lab-rasool/MINDS Author: Aakash Tripathi @@ -75,7 +75,7 @@ tables = minds.get_tables() columns = minds.get_columns("clinical") # Query the database directly -query = "SELECT * FROM nihnci.clinical WHERE project_id = 'TCGA-LUAD' LIMIT 10" +query = "SELECT * FROM minds.clinical WHERE project_id = 'TCGA-LUAD' LIMIT 10" df = minds.query(query) ``` @@ -89,11 +89,11 @@ query_cohort = minds.build_cohort(query=query, output_dir="./data") gdc_cohort = minds.build_cohort(gdc_cohort="cohort_Unsaved_Cohort.2024-02-12.tsv", output_dir="./data") # to get the cohort details -cohort.stats() +gdc_cohort.stats() # to download the data from the cohort to the output directory specified # you can also specify the number of threads to use and the modalities to exclude or include -cohort.download(threads=12, exclude=["Slide Image"]) +gdc_cohort.download(threads=12, exclude=["Slide Image"]) ``` ## Please cite our work diff --git a/build/lib/minds/database.py b/build/lib/minds/database.py index acc1af8..fefc10a 100644 --- a/build/lib/minds/database.py +++ b/build/lib/minds/database.py @@ -6,6 +6,8 @@ import pandas as pd from dotenv import load_dotenv from sqlalchemy import create_engine +from sqlalchemy.exc import PendingRollbackError, SQLAlchemyError +from sqlalchemy.orm import sessionmaker class DatabaseManager: @@ -25,9 +27,15 @@ def __init__(self, dotenv_path=".env"): f"mysql+pymysql://{user}:{password}@{host}:{port}/{self.database}" ) self.engine = create_engine(database_url) + self.Session = sessionmaker(bind=self.engine) def execute(self, query): - return pd.read_sql(query, self.engine) + try: + with self.engine.connect() as connection: + return pd.read_sql(query, connection) + except SQLAlchemyError as e: + logging.error(f"Error executing query: {e}") + raise def get_minds_cohort(self, query): df = self.execute(query) @@ -36,9 +44,12 @@ def get_minds_cohort(self, query): def get_gdc_cohort(self, gdc_cohort): cohort = pd.read_csv(gdc_cohort, sep="\t", dtype=str) - df = self.execute( - f"SELECT case_id, case_submitter_id FROM {self.database}.clinical WHERE case_id IN {tuple(cohort['id'])}" - ) + query = f""" + SELECT case_id, case_submitter_id + FROM {self.database}.clinical + WHERE case_id IN ({','.join([f"'{i}'" for i in cohort['id']])}) + """ + df = self.execute(query) cohort = df.groupby("case_id")["case_submitter_id"].unique() return cohort @@ -55,29 +66,41 @@ def get_columns(self, table): return columns["Field"] def update(self, temp_folder): - # make sure the temp folder exists if not os.path.exists(temp_folder): os.makedirs(temp_folder) - # upload all the files to the database as a table + logging.info("Uploading new data to the database") - for file in os.listdir(temp_folder): - table_name = file.split(".")[0] - df = pd.read_csv(f"{temp_folder}/{file}", sep="\t", dtype=str) - df.replace("'--", np.nan, inplace=True) - # if table already exists, append the new data - if table_name in self.get_tables().tolist(): - logging.info(f"Updating {table_name}") - df.to_sql( - name=table_name, - con=self.engine, - if_exists="append", - index=False, - chunksize=1000, - ) - else: - logging.info(f"Creating {table_name}") - df.to_sql( - name=table_name, con=self.engine, if_exists="replace", index=False - ) - logging.info("Finished uploading to the database") - shutil.rmtree(temp_folder) + + session = self.Session() + try: + for file in os.listdir(temp_folder): + table_name = file.split(".")[0] + df = pd.read_csv(f"{temp_folder}/{file}", sep="\t", dtype=str) + df.replace("'--", np.nan, inplace=True) + + if table_name in self.get_tables().tolist(): + logging.info(f"Updating {table_name}") + df.to_sql( + name=table_name, + con=self.engine, + if_exists="append", + index=False, + chunksize=1000, + ) + else: + logging.info(f"Creating {table_name}") + df.to_sql( + name=table_name, + con=self.engine, + if_exists="replace", + index=False, + ) + session.commit() + logging.info("Finished uploading to the database") + except (SQLAlchemyError, PendingRollbackError) as e: + session.rollback() + logging.error(f"Error during update: {e}") + raise + finally: + session.close() + shutil.rmtree(temp_folder) diff --git a/dist/minds-0.0.5-py3-none-any.whl b/dist/minds-0.0.5-py3-none-any.whl new file mode 100644 index 0000000..62e4dfe Binary files /dev/null and b/dist/minds-0.0.5-py3-none-any.whl differ diff --git a/dist/minds-0.0.5.tar.gz b/dist/minds-0.0.5.tar.gz new file mode 100644 index 0000000..6454df8 Binary files /dev/null and b/dist/minds-0.0.5.tar.gz differ