Skip to content

Commit 132cc9e

Browse files
v0.0.5
1 parent d43cacd commit 132cc9e

File tree

4 files changed

+54
-31
lines changed

4 files changed

+54
-31
lines changed

app/minds.egg-info/PKG-INFO

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,6 @@
11
Metadata-Version: 2.1
22
Name: minds
3-
Version: 0.0.4
3+
Version: 0.0.5
44
Summary: A package for downloading and processing data from the MINDS database
55
Home-page: https://github.com/lab-rasool/MINDS
66
Author: Aakash Tripathi
@@ -75,7 +75,7 @@ tables = minds.get_tables()
7575
columns = minds.get_columns("clinical")
7676

7777
# Query the database directly
78-
query = "SELECT * FROM nihnci.clinical WHERE project_id = 'TCGA-LUAD' LIMIT 10"
78+
query = "SELECT * FROM minds.clinical WHERE project_id = 'TCGA-LUAD' LIMIT 10"
7979
df = minds.query(query)
8080
```
8181

@@ -89,11 +89,11 @@ query_cohort = minds.build_cohort(query=query, output_dir="./data")
8989
gdc_cohort = minds.build_cohort(gdc_cohort="cohort_Unsaved_Cohort.2024-02-12.tsv", output_dir="./data")
9090

9191
# to get the cohort details
92-
cohort.stats()
92+
gdc_cohort.stats()
9393

9494
# to download the data from the cohort to the output directory specified
9595
# you can also specify the number of threads to use and the modalities to exclude or include
96-
cohort.download(threads=12, exclude=["Slide Image"])
96+
gdc_cohort.download(threads=12, exclude=["Slide Image"])
9797
```
9898

9999
## Please cite our work

build/lib/minds/database.py

Lines changed: 50 additions & 27 deletions
Original file line number | Diff line number | Diff line change
@@ -6,6 +6,8 @@
66
import pandas as pd
77
from dotenv import load_dotenv
88
from sqlalchemy import create_engine
9+
from sqlalchemy.exc import PendingRollbackError, SQLAlchemyError
10+
from sqlalchemy.orm import sessionmaker
911

1012

1113
class DatabaseManager:
@@ -25,9 +27,15 @@ def __init__(self, dotenv_path=".env"):
2527
f"mysql+pymysql://{user}:{password}@{host}:{port}/{self.database}"
2628
)
2729
self.engine = create_engine(database_url)
30+
self.Session = sessionmaker(bind=self.engine)
2831

2932
def execute(self, query):
30-
return pd.read_sql(query, self.engine)
33+
try:
34+
with self.engine.connect() as connection:
35+
return pd.read_sql(query, connection)
36+
except SQLAlchemyError as e:
37+
logging.error(f"Error executing query: {e}")
38+
raise
3139

3240
def get_minds_cohort(self, query):
3341
df = self.execute(query)
@@ -36,9 +44,12 @@ def get_minds_cohort(self, query):
3644

3745
def get_gdc_cohort(self, gdc_cohort):
3846
cohort = pd.read_csv(gdc_cohort, sep="\t", dtype=str)
39-
df = self.execute(
40-
f"SELECT case_id, case_submitter_id FROM {self.database}.clinical WHERE case_id IN {tuple(cohort['id'])}"
41-
)
47+
query = f"""
48+
SELECT case_id, case_submitter_id
49+
FROM {self.database}.clinical
50+
WHERE case_id IN ({','.join([f"'{i}'" for i in cohort['id']])})
51+
"""
52+
df = self.execute(query)
4253
cohort = df.groupby("case_id")["case_submitter_id"].unique()
4354
return cohort
4455

@@ -55,29 +66,41 @@ def get_columns(self, table):
5566
return columns["Field"]
5667

5768
def update(self, temp_folder):
58-
# make sure the temp folder exists
5969
if not os.path.exists(temp_folder):
6070
os.makedirs(temp_folder)
61-
# upload all the files to the database as a table
71+
6272
logging.info("Uploading new data to the database")
63-
for file in os.listdir(temp_folder):
64-
table_name = file.split(".")[0]
65-
df = pd.read_csv(f"{temp_folder}/{file}", sep="\t", dtype=str)
66-
df.replace("'--", np.nan, inplace=True)
67-
# if table already exists, append the new data
68-
if table_name in self.get_tables().tolist():
69-
logging.info(f"Updating {table_name}")
70-
df.to_sql(
71-
name=table_name,
72-
con=self.engine,
73-
if_exists="append",
74-
index=False,
75-
chunksize=1000,
76-
)
77-
else:
78-
logging.info(f"Creating {table_name}")
79-
df.to_sql(
80-
name=table_name, con=self.engine, if_exists="replace", index=False
81-
)
82-
logging.info("Finished uploading to the database")
83-
shutil.rmtree(temp_folder)
73+
74+
session = self.Session()
75+
try:
76+
for file in os.listdir(temp_folder):
77+
table_name = file.split(".")[0]
78+
df = pd.read_csv(f"{temp_folder}/{file}", sep="\t", dtype=str)
79+
df.replace("'--", np.nan, inplace=True)
80+
81+
if table_name in self.get_tables().tolist():
82+
logging.info(f"Updating {table_name}")
83+
df.to_sql(
84+
name=table_name,
85+
con=self.engine,
86+
if_exists="append",
87+
index=False,
88+
chunksize=1000,
89+
)
90+
else:
91+
logging.info(f"Creating {table_name}")
92+
df.to_sql(
93+
name=table_name,
94+
con=self.engine,
95+
if_exists="replace",
96+
index=False,
97+
)
98+
session.commit()
99+
logging.info("Finished uploading to the database")
100+
except (SQLAlchemyError, PendingRollbackError) as e:
101+
session.rollback()
102+
logging.error(f"Error during update: {e}")
103+
raise
104+
finally:
105+
session.close()
106+
shutil.rmtree(temp_folder)

dist/minds-0.0.5-py3-none-any.whl

15.8 KB
Binary file not shown.

dist/minds-0.0.5.tar.gz

15.6 KB
Binary file not shown.

0 commit comments

Comments
 (0)