lab-rasool
diff --git a/app/minds.egg-info/PKG-INFO b/app/minds.egg-info/PKG-INFO
Lines changed: 4 additions & 4 deletions
diff --git a/build/lib/minds/database.py b/build/lib/minds/database.py
Lines changed: 50 additions & 27 deletions
diff --git a/dist/minds-0.0.5-py3-none-any.whl b/dist/minds-0.0.5-py3-none-any.whl
15.8 KB
diff --git a/dist/minds-0.0.5.tar.gz b/dist/minds-0.0.5.tar.gz
15.6 KB
@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: minds
-Version: 0.0.4
+Version: 0.0.5
 Summary: A package for downloading and processing data from the MINDS database
 Home-page: https://github.com/lab-rasool/MINDS
 Author: Aakash Tripathi
@@ -75,7 +75,7 @@ tables = minds.get_tables()
 columns = minds.get_columns("clinical")
 
 # Query the database directly
-query = "SELECT * FROM nihnci.clinical WHERE project_id = 'TCGA-LUAD' LIMIT 10"
+query = "SELECT * FROM minds.clinical WHERE project_id = 'TCGA-LUAD' LIMIT 10"
 df = minds.query(query)
 ```
 
@@ -89,11 +89,11 @@ query_cohort = minds.build_cohort(query=query, output_dir="./data")
 gdc_cohort = minds.build_cohort(gdc_cohort="cohort_Unsaved_Cohort.2024-02-12.tsv", output_dir="./data")
 
 # to get the cohort details
-cohort.stats()
+gdc_cohort.stats()
 
 # to download the data from the cohort to the output directory specified
 # you can also specify the number of threads to use and the modalities to exclude or include
-cohort.download(threads=12, exclude=["Slide Image"])
+gdc_cohort.download(threads=12, exclude=["Slide Image"])
 ```
 
 ## Please cite our work
 
@@ -6,6 +6,8 @@
 import pandas as pd
 from dotenv import load_dotenv
 from sqlalchemy import create_engine
+from sqlalchemy.exc import PendingRollbackError, SQLAlchemyError
+from sqlalchemy.orm import sessionmaker
 
 
 class DatabaseManager:
@@ -25,9 +27,15 @@ def __init__(self, dotenv_path=".env"):
             f"mysql+pymysql://{user}:{password}@{host}:{port}/{self.database}"
         )
         self.engine = create_engine(database_url)
+        self.Session = sessionmaker(bind=self.engine)
 
     def execute(self, query):
-        return pd.read_sql(query, self.engine)
+        try:
+            with self.engine.connect() as connection:
+                return pd.read_sql(query, connection)
+        except SQLAlchemyError as e:
+            logging.error(f"Error executing query: {e}")
+            raise
 
     def get_minds_cohort(self, query):
         df = self.execute(query)
@@ -36,9 +44,12 @@ def get_minds_cohort(self, query):
 
     def get_gdc_cohort(self, gdc_cohort):
         cohort = pd.read_csv(gdc_cohort, sep="\t", dtype=str)
-        df = self.execute(
-            f"SELECT case_id, case_submitter_id FROM {self.database}.clinical WHERE case_id IN {tuple(cohort['id'])}"
-        )
+        query = f"""
+            SELECT case_id, case_submitter_id 
+            FROM {self.database}.clinical 
+            WHERE case_id IN ({','.join([f"'{i}'" for i in cohort['id']])})
+        """
+        df = self.execute(query)
         cohort = df.groupby("case_id")["case_submitter_id"].unique()
         return cohort
 
@@ -55,29 +66,41 @@ def get_columns(self, table):
         return columns["Field"]
 
     def update(self, temp_folder):
-        # make sure the temp folder exists
         if not os.path.exists(temp_folder):
             os.makedirs(temp_folder)
-        # upload all the files to the database as a table
+
         logging.info("Uploading new data to the database")
-        for file in os.listdir(temp_folder):
-            table_name = file.split(".")[0]
-            df = pd.read_csv(f"{temp_folder}/{file}", sep="\t", dtype=str)
-            df.replace("'--", np.nan, inplace=True)
-            # if table already exists, append the new data
-            if table_name in self.get_tables().tolist():
-                logging.info(f"Updating {table_name}")
-                df.to_sql(
-                    name=table_name,
-                    con=self.engine,
-                    if_exists="append",
-                    index=False,
-                    chunksize=1000,
-                )
-            else:
-                logging.info(f"Creating {table_name}")
-                df.to_sql(
-                    name=table_name, con=self.engine, if_exists="replace", index=False
-                )
-        logging.info("Finished uploading to the database")
-        shutil.rmtree(temp_folder)
+
+        session = self.Session()
+        try:
+            for file in os.listdir(temp_folder):
+                table_name = file.split(".")[0]
+                df = pd.read_csv(f"{temp_folder}/{file}", sep="\t", dtype=str)
+                df.replace("'--", np.nan, inplace=True)
+
+                if table_name in self.get_tables().tolist():
+                    logging.info(f"Updating {table_name}")
+                    df.to_sql(
+                        name=table_name,
+                        con=self.engine,
+                        if_exists="append",
+                        index=False,
+                        chunksize=1000,
+                    )
+                else:
+                    logging.info(f"Creating {table_name}")
+                    df.to_sql(
+                        name=table_name,
+                        con=self.engine,
+                        if_exists="replace",
+                        index=False,
+                    )
+            session.commit()
+            logging.info("Finished uploading to the database")
+        except (SQLAlchemyError, PendingRollbackError) as e:
+            session.rollback()
+            logging.error(f"Error during update: {e}")
+            raise
+        finally:
+            session.close()
+            shutil.rmtree(temp_folder)