cd: create a gha that generate an index for each query in the src/sql folder #5
@@ -22,12 +22,22 @@ jobs:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Authorize Google Cloud
        uses: google-github-actions/auth@v1
        with:
          credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
          create_credentials_file: true
          export_environment_variables: true

      - uses: actions/setup-python@v5
        with:
          python-version: "3.x"

      - uses: pre-commit/[email protected]
        with:
          extra_args: --hook-stage manual --all-files

      - name: Run PyLint
        run: |
          echo "::add-matcher::$GITHUB_WORKSPACE/.github/matchers/pylint.json"
@@ -43,15 +53,23 @@ jobs:
        python-version: ["3.8", "3.12"]
        runs-on: [ubuntu-latest, macos-latest, windows-latest]

        include:
          - python-version: pypy-3.10
            runs-on: ubuntu-latest
        # currently not working on pypy-3.10
        # include:
        #   - python-version: pypy-3.10
        #     runs-on: ubuntu-latest

    steps:
      - uses: actions/checkout@v4
        with:
          fetch-depth: 0

      - name: Authorize Google Cloud
        uses: google-github-actions/auth@v1
        with:
          credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
          create_credentials_file: true
          export_environment_variables: true

      - uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python-version }}
@@ -156,3 +156,6 @@ Thumbs.db
# Common editor files
*~
*.swp

# gcp service account keys
gha-creds-**.json
@@ -1,20 +1,37 @@
cmake_minimum_required(VERSION 3.15...3.26)
project(${SKBUILD_PROJECT_NAME} LANGUAGES NONE)

find_package(
  Python
  COMPONENTS Interpreter
  REQUIRED)

set(idc_index_release_version "0.3.2")
set(idc_index_data_url "https://github.com/ImagingDataCommons/idc-index/releases/download/${idc_index_release_version}/idc_index.csv.zip")
set(idc_index_data_sha256 "70ec9f915686a27bee3098163b8695c69c8696c05bfb7bd76943a24024cdeeb9")
if(NOT DEFINED ENV{GCP_PROJECT})
  message(FATAL_ERROR "GCP_PROJECT env. variable is not set")
endif()

option(IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE "Generate idc_index.csv.zip file" ON)
option(IDC_INDEX_DATA_GENERATE_PARQUET "Generate idc_index.parquet file" OFF)

#
# Download and install index
#
set(download_dir "${PROJECT_BINARY_DIR}")
include(FetchContent)
FetchContent_Populate(s5cmd
  URL ${idc_index_data_url}
  URL_HASH SHA256=${idc_index_data_sha256}
  DOWNLOAD_DIR ${download_dir}
  DOWNLOAD_NO_EXTRACT TRUE
)
install(FILES "${download_dir}/idc_index.csv.zip" DESTINATION "idc_index_data")

add_custom_command(
  OUTPUT
    $<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
@jcfr if we want to build this for growth from the start, we cannot hard-code the file names. The generator code will now produce CSV/Parquet for each of the queries in the sql folder, but any change to the SQL query file names, or the addition of new queries, will break the code at the moment. It would make more sense to use wildcards to package all CSV/Parquet files depending on what is configured with the flags. Vamsi is going to give it a try to replace with the wildcard, unless you have other thoughts.

Using a wildcard as such will not work, because the target checks the timestamp of the output (aka the filename) to decide whether it should "re-build" or not. Once this is integrated, I suggest you create an issue describing the type, name and size of the index files, how these relate to the queries generating them, and which domain they relate to (e.g. microscopy, ...). Let's also note that with working CI and CD pipelines, refactoring will be straightforward.
    $<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
  COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
    --generate-csv-archive
Should this line instead be …?

Also, I do not know if the boolean value for parquet is passed along to the index manager properly, as it seems to generate both CSV and Parquet even though the latter is disabled in the CMakeLists.
    $<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:--generate-parquet>
)
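A possible explanation for the behavior questioned in the comment above: `$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>` tests the literal option name, which is a non-empty string and therefore always true, so both flags end up being passed regardless of how the options are set. Below is a minimal sketch of the intended conditional behavior, expanding the option variables before the BOOL test; the option names, paths, and script location are taken from this diff, and the rest is illustrative rather than part of the PR:

    # Sketch: pass each flag only when the corresponding option is ON by
    # expanding the option variable inside the BOOL generator expression.
    add_custom_command(
      OUTPUT
        $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:${download_dir}/idc_index.csv.zip>
        $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:${download_dir}/idc_index.parquet>
      COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
        $<$<BOOL:${IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE}>:--generate-csv-archive>
        $<$<BOOL:${IDC_INDEX_DATA_GENERATE_PARQUET}>:--generate-parquet>
    )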

add_custom_target(run_idc_index_data_manager ALL
  DEPENDS
    $<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
    $<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
)

install(
  FILES
    $<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
    $<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
  DESTINATION "idc_index_data")
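On the earlier thread about hard-coded file names versus wildcards: one way to keep explicit outputs (so the timestamp-based rebuild check still works) without hard-coding names is to enumerate the .sql files at configure time and derive one output name per query, mirroring the basename convention used by idc_index_data_manager.py. This is only a sketch under that assumption; the variable names are illustrative, and CMake would need to be re-run whenever queries are added or removed:

    # Sketch: derive one expected output per SQL query found at configure time.
    file(GLOB sql_queries "${CMAKE_CURRENT_SOURCE_DIR}/scripts/sql/*.sql")

    set(index_data_outputs "")
    foreach(sql_query IN LISTS sql_queries)
      # idc_index_data_manager.py names its outputs after the query file basename.
      get_filename_component(query_name "${sql_query}" NAME_WE)
      if(IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE)
        list(APPEND index_data_outputs "${download_dir}/${query_name}.csv.zip")
      endif()
      if(IDC_INDEX_DATA_GENERATE_PARQUET)
        list(APPEND index_data_outputs "${download_dir}/${query_name}.parquet")
      endif()
    endforeach()

    # ${index_data_outputs} could then replace the hard-coded names in the
    # OUTPUT, DEPENDS, and install(FILES ...) lists above.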
@@ -0,0 +1,114 @@
from __future__ import annotations

import logging
import os
from pathlib import Path

import pandas as pd
from google.cloud import bigquery

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


class IDCIndexDataManager:
    def __init__(self, project_id: str):
        """
        Initializes the IDCIndexDataManager using the Google Cloud Platform project ID.
        """
        self.project_id = project_id
        self.client = bigquery.Client(project=project_id)
        logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)

    def execute_sql_query(self, file_path: str) -> tuple[pd.DataFrame, str]:
        """
        Executes the SQL query in the specified file.

        Returns:
            Tuple[pd.DataFrame, str]: A tuple containing the DataFrame with the
            query results and the output basename.
        """
        with Path(file_path).open("r") as file:
            sql_query = file.read()
        index_df = self.client.query(sql_query).to_dataframe()
        output_basename = Path(file_path).name.split(".")[0]
        logger.debug("Executed SQL query from file: %s", file_path)
        return index_df, output_basename

    def generate_index_data_files(
        self, generate_compressed_csv: bool = True, generate_parquet: bool = False
    ) -> None:
        """
        Executes SQL queries in the specified folder and creates a
        compressed CSV file and/or Parquet file from a pandas DataFrame.

        This method iterates over all .sql files in the 'scripts/sql' directory,
        executes each query using the 'execute_sql_query' method, and generates
        a DataFrame 'index_df'. The DataFrame is then saved as a compressed CSV
        and/or a Parquet file, depending on the method arguments.
        """

        scripts_dir = Path(__file__).parent.parent
        sql_dir = scripts_dir / "sql"

        for file_name in os.listdir(sql_dir):
            if file_name.endswith(".sql"):
                file_path = Path(sql_dir) / file_name
                index_df, output_basename = self.execute_sql_query(file_path)
                logger.debug(
                    "Executed and processed SQL queries from file: %s", file_path
                )
                if generate_compressed_csv:
                    csv_file_name = f"{output_basename}.csv.zip"
                    index_df.to_csv(
                        csv_file_name, compression={"method": "zip"}, escapechar="\\"
                    )
                    logger.debug("Created CSV zip file: %s", csv_file_name)

                if generate_parquet:
                    parquet_file_name = f"{output_basename}.parquet"
                    index_df.to_parquet(parquet_file_name)
                    logger.debug("Created Parquet file: %s", parquet_file_name)

    def run(
        self, generate_compressed_csv: bool = True, generate_parquet: bool = False
    ) -> None:
        """
        Runs the IDCIndexDataManager to locally generate index-data files by
        running queries against the Google Cloud Platform IDC project tables.
        """
        self.generate_index_data_files(
            generate_compressed_csv=generate_compressed_csv,
            generate_parquet=generate_parquet,
        )


if __name__ == "__main__":
    import argparse

    project_id = os.environ["GCP_PROJECT"]

    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--generate-csv-archive",
        action="store_true",
        help="Generate idc_index.csv.zip file",
    )
    parser.add_argument(
        "--generate-parquet",
        action="store_true",
        help="Generate idc_index.parquet file",
    )

    args = parser.parse_args()

    if not any([args.generate_csv_archive, args.generate_parquet]):
        parser.error(
            "At least --generate-csv-archive or --generate-parquet must be specified"
        )

    manager = IDCIndexDataManager(project_id)
    manager.run(
        generate_compressed_csv=args.generate_csv_archive,
        generate_parquet=args.generate_parquet,
    )
@@ -0,0 +1,34 @@
SELECT
  # collection level attributes
  ANY_VALUE(collection_id) AS collection_id,
  ANY_VALUE(PatientID) AS PatientID,
  SeriesInstanceUID,
  ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
  ANY_VALUE(source_DOI) AS source_DOI,
  # patient level attributes
  ANY_VALUE(PatientAge) AS PatientAge,
  ANY_VALUE(PatientSex) AS PatientSex,
  # study level attributes
  ANY_VALUE(StudyDate) AS StudyDate,
  ANY_VALUE(StudyDescription) AS StudyDescription,
  ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
  # series level attributes
  ANY_VALUE(Modality) AS Modality,
  ANY_VALUE(Manufacturer) AS Manufacturer,
  ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
  ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
  ANY_VALUE(SeriesDescription) AS SeriesDescription,
  ANY_VALUE(SeriesNumber) AS SeriesNumber,
  COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
  ANY_VALUE(license_short_name) as license_short_name,
  # download related attributes
  ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/*")) AS series_aws_url,
  ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
FROM
  `bigquery-public-data.idc_current.dicom_all` AS dicom_all
JOIN
  `bigquery-public-data.idc_current.dicom_metadata_curated` AS dicom_curated
ON
  dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
GROUP BY
  SeriesInstanceUID