Skip to content

Commit

Permalink
v0.0.5
Browse files: browse the repository at this point in the history
  • Loading branch information
Aakash-Tripathi committed May 31, 2024
1 parent d43cacd commit 132cc9e
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 31 deletions.
8 changes: 4 additions & 4 deletions app/minds.egg-info/PKG-INFO
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
Metadata-Version: 2.1
Name: minds
Version: 0.0.4
Version: 0.0.5
Summary: A package for downloading and processing data from the MINDS database
Home-page: https://github.com/lab-rasool/MINDS
Author: Aakash Tripathi
Expand Down Expand Up @@ -75,7 +75,7 @@ tables = minds.get_tables()
columns = minds.get_columns("clinical")

# Query the database directly
query = "SELECT * FROM nihnci.clinical WHERE project_id = 'TCGA-LUAD' LIMIT 10"
query = "SELECT * FROM minds.clinical WHERE project_id = 'TCGA-LUAD' LIMIT 10"
df = minds.query(query)
```

Expand All @@ -89,11 +89,11 @@ query_cohort = minds.build_cohort(query=query, output_dir="./data")
gdc_cohort = minds.build_cohort(gdc_cohort="cohort_Unsaved_Cohort.2024-02-12.tsv", output_dir="./data")

# to get the cohort details
cohort.stats()
gdc_cohort.stats()

# Download the cohort's data to the output directory specified in build_cohort.
# You can also set the number of threads and the modalities to include or exclude.
cohort.download(threads=12, exclude=["Slide Image"])
gdc_cohort.download(threads=12, exclude=["Slide Image"])
```

## Please cite our work
Expand Down
77 changes: 50 additions & 27 deletions build/lib/minds/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@
import pandas as pd
from dotenv import load_dotenv
from sqlalchemy import create_engine
from sqlalchemy.exc import PendingRollbackError, SQLAlchemyError
from sqlalchemy.orm import sessionmaker


class DatabaseManager:
Expand All @@ -25,9 +27,15 @@ def __init__(self, dotenv_path=".env"):
f"mysql+pymysql://{user}:{password}@{host}:{port}/{self.database}"
)
self.engine = create_engine(database_url)
self.Session = sessionmaker(bind=self.engine)

def execute(self, query):
return pd.read_sql(query, self.engine)
try:
with self.engine.connect() as connection:
return pd.read_sql(query, connection)
except SQLAlchemyError as e:
logging.error(f"Error executing query: {e}")
raise

def get_minds_cohort(self, query):
df = self.execute(query)
Expand All @@ -36,9 +44,12 @@ def get_minds_cohort(self, query):

def get_gdc_cohort(self, gdc_cohort):
cohort = pd.read_csv(gdc_cohort, sep="\t", dtype=str)
df = self.execute(
f"SELECT case_id, case_submitter_id FROM {self.database}.clinical WHERE case_id IN {tuple(cohort['id'])}"
)
query = f"""
SELECT case_id, case_submitter_id
FROM {self.database}.clinical
WHERE case_id IN ({','.join([f"'{i}'" for i in cohort['id']])})
"""
df = self.execute(query)
cohort = df.groupby("case_id")["case_submitter_id"].unique()
return cohort

Expand All @@ -55,29 +66,41 @@ def get_columns(self, table):
return columns["Field"]

def update(self, temp_folder):
# make sure the temp folder exists
if not os.path.exists(temp_folder):
os.makedirs(temp_folder)
# upload all the files to the database as a table

logging.info("Uploading new data to the database")
for file in os.listdir(temp_folder):
table_name = file.split(".")[0]
df = pd.read_csv(f"{temp_folder}/{file}", sep="\t", dtype=str)
df.replace("'--", np.nan, inplace=True)
# if table already exists, append the new data
if table_name in self.get_tables().tolist():
logging.info(f"Updating {table_name}")
df.to_sql(
name=table_name,
con=self.engine,
if_exists="append",
index=False,
chunksize=1000,
)
else:
logging.info(f"Creating {table_name}")
df.to_sql(
name=table_name, con=self.engine, if_exists="replace", index=False
)
logging.info("Finished uploading to the database")
shutil.rmtree(temp_folder)

session = self.Session()
try:
for file in os.listdir(temp_folder):
table_name = file.split(".")[0]
df = pd.read_csv(f"{temp_folder}/{file}", sep="\t", dtype=str)
df.replace("'--", np.nan, inplace=True)

if table_name in self.get_tables().tolist():
logging.info(f"Updating {table_name}")
df.to_sql(
name=table_name,
con=self.engine,
if_exists="append",
index=False,
chunksize=1000,
)
else:
logging.info(f"Creating {table_name}")
df.to_sql(
name=table_name,
con=self.engine,
if_exists="replace",
index=False,
)
session.commit()
logging.info("Finished uploading to the database")
except (SQLAlchemyError, PendingRollbackError) as e:
session.rollback()
logging.error(f"Error during update: {e}")
raise
finally:
session.close()
shutil.rmtree(temp_folder)
Binary file added dist/minds-0.0.5-py3-none-any.whl
Binary file not shown.
Binary file added dist/minds-0.0.5.tar.gz
Binary file not shown.

0 comments on commit 132cc9e

Please sign in to comment.