cd: create a gha that generate an index for each query in the src/sql folder #5

Closed · wants to merge 13 commits

Changes from all commits
7 changes: 7 additions & 0 deletions .github/workflows/cd.yml
@@ -27,6 +27,13 @@ jobs:
with:
fetch-depth: 0

- name: Authorize Google Cloud
uses: google-github-actions/auth@v1
with:
credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
create_credentials_file: true
export_environment_variables: true

- uses: hynek/build-and-inspect-python-package@v2

publish:
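The auth step above writes a service-account credentials file and exports environment variables such as GOOGLE_APPLICATION_CREDENTIALS, so later steps can resolve Application Default Credentials. A minimal sanity-check sketch (hypothetical, not part of this PR) of what a subsequent step could run, assuming google-auth and google-cloud-bigquery are installed:

# check_auth.py -- hypothetical helper, not part of this PR.
# Assumes google-github-actions/auth already exported
# GOOGLE_APPLICATION_CREDENTIALS for this job.
import google.auth
from google.cloud import bigquery

credentials, project = google.auth.default()  # resolves Application Default Credentials
print(f"Authenticated, default project: {project}")

# A trivial round-trip to confirm BigQuery access end to end.
client = bigquery.Client(project=project, credentials=credentials)
rows = list(client.query("SELECT 1 AS ok").result())
print(rows)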
24 changes: 21 additions & 3 deletions .github/workflows/ci.yml
@@ -22,12 +22,22 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Authorize Google Cloud
uses: google-github-actions/auth@v1
with:
credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
create_credentials_file: true
export_environment_variables: true

- uses: actions/setup-python@v5
with:
python-version: "3.x"

- uses: pre-commit/[email protected]
with:
extra_args: --hook-stage manual --all-files

- name: Run PyLint
run: |
echo "::add-matcher::$GITHUB_WORKSPACE/.github/matchers/pylint.json"
@@ -43,15 +53,23 @@
python-version: ["3.8", "3.12"]
runs-on: [ubuntu-latest, macos-latest, windows-latest]

include:
- python-version: pypy-3.10
runs-on: ubuntu-latest
# currently not working on pypy-3.10
# include:
# - python-version: pypy-3.10
# runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Authorize Google Cloud
uses: google-github-actions/auth@v1
with:
credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
create_credentials_file: true
export_environment_variables: true

- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
3 changes: 3 additions & 0 deletions .gitignore
@@ -156,3 +156,6 @@ Thumbs.db
# Common editor files
*~
*.swp

# gcp service account keys
gha-creds-**.json
45 changes: 31 additions & 14 deletions CMakeLists.txt
@@ -1,20 +1,37 @@
cmake_minimum_required(VERSION 3.15...3.26)
project(${SKBUILD_PROJECT_NAME} LANGUAGES NONE)

find_package(
Python
COMPONENTS Interpreter
REQUIRED)

set(idc_index_release_version "0.3.2")
set(idc_index_data_url "https://github.com/ImagingDataCommons/idc-index/releases/download/${idc_index_release_version}/idc_index.csv.zip")
set(idc_index_data_sha256 "70ec9f915686a27bee3098163b8695c69c8696c05bfb7bd76943a24024cdeeb9")
if(NOT DEFINED ENV{GCP_PROJECT})
message(FATAL_ERROR "GCP_PROJECT env. variable is not set")
endif()

option(IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE "Generate idc_index.csv.zip file" ON)
option(IDC_INDEX_DATA_GENERATE_PARQUET "Generate idc_index.parquet file" OFF)

#
# Download and install index
#
set(download_dir "${PROJECT_BINARY_DIR}")
include(FetchContent)
FetchContent_Populate(s5cmd
URL ${idc_index_data_url}
URL_HASH SHA256=${idc_index_data_sha256}
DOWNLOAD_DIR ${download_dir}
DOWNLOAD_NO_EXTRACT TRUE
)
install(FILES "${download_dir}/idc_index.csv.zip" DESTINATION "idc_index_data")

add_custom_command(
OUTPUT
$<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
Member:

@jcfr if we want to build this for growth from the start, we cannot hard-code the file names. The generator code will now produce CSV/Parquet for each of the queries in the sql folder, but any change to the SQL query file names, or the addition of new queries, will break the code at the moment.

It would make more sense to use wildcards to package all CSV/Parquet files depending on what is configured with the flags. Vamsi is going to give it a try to replace with the wildcard, unless you have other thoughts.

Collaborator:

> Vamsi is going to give it a try to replace with the wildcard

Using a wildcard as such will not work, because the target checks the timestamp of the output (i.e., the named file) to decide whether it should "re-build" or not.

Once this is integrated, I suggest you create an issue describing the type, name, and size of the index files, how these relate to the queries generating them, and which domain they relate to (e.g., microscopy, ...).

Let's also note that with working CI and CD pipelines, refactoring will be straightforward.

$<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
--generate-csv-archive
Member:

Should this line instead be

$<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:--generate-csv-archive>

Collaborator (Author):

Also, I do not know if the boolean value for Parquet is passed along to the index manager properly, as it seems to generate both CSV and Parquet even though the latter is disabled in CMakeLists.txt. This was from the latest CD run:
https://github.com/ImagingDataCommons/idc-index-data/actions/runs/8425241913

$<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:--generate-parquet>
)

add_custom_target(run_idc_index_data_manager ALL
DEPENDS
$<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
$<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
)

install(
FILES
$<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
$<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
DESTINATION "idc_index_data")
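On the wildcard question discussed above: a custom command's OUTPUT list must name concrete files, since CMake compares their timestamps to decide whether to re-run the command. One alternative is to enumerate the queries at configure time and derive the expected output names from them. A hypothetical Python sketch of that mapping, assuming the one-output-per-.sql-file convention this PR introduces:

# derive_outputs.py -- hypothetical sketch, not part of this PR.
# Derives the concrete index file names a build would produce from
# the queries in scripts/sql, so a build system can depend on real
# file names instead of wildcards.
from pathlib import Path

def expected_outputs(sql_dir: str, csv: bool = True, parquet: bool = False) -> list[str]:
    outputs: list[str] = []
    for sql_file in sorted(Path(sql_dir).glob("*.sql")):
        basename = sql_file.stem  # e.g. "idc_index"
        if csv:
            outputs.append(f"{basename}.csv.zip")
        if parquet:
            outputs.append(f"{basename}.parquet")
    return outputs

# Printed as a semicolon-separated list, e.g. for consumption by CMake.
print(";".join(expected_outputs("scripts/sql")))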
12 changes: 10 additions & 2 deletions pyproject.toml
@@ -1,5 +1,13 @@
[build-system]
requires = ["scikit-build-core"]
requires = [
"scikit-build-core",
"db-dtypes",
"google-cloud-bigquery",
"pandas",
"pyarrow",
"pygithub",
"requests"
]
build-backend = "scikit_build_core.build"


@@ -108,7 +116,7 @@ disallow_incomplete_defs = true


[tool.ruff]
src = ["src"]
src = ["src", "scripts"]

[tool.ruff.lint]
extend-select = [
114 changes: 114 additions & 0 deletions scripts/python/idc_index_data_manager.py
@@ -0,0 +1,114 @@
from __future__ import annotations

import logging
import os
from pathlib import Path

import pandas as pd
from google.cloud import bigquery

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


class IDCIndexDataManager:
def __init__(self, project_id: str):
"""
Initializes the IDCIndexDataManager using the Google Cloud Platform project ID.
"""
self.project_id = project_id
self.client = bigquery.Client(project=project_id)
logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)

def execute_sql_query(self, file_path: str) -> tuple[pd.DataFrame, str]:
"""
Executes the SQL query in the specified file.

Returns:
Tuple[pd.DataFrame, str]: A tuple containing the DataFrame with the query results and the output basename.
"""
with Path(file_path).open("r") as file:
sql_query = file.read()
index_df = self.client.query(sql_query).to_dataframe()
output_basename = Path(file_path).name.split(".")[0]
logger.debug("Executed SQL query from file: %s", file_path)
return index_df, output_basename

def generate_index_data_files(
self, generate_compressed_csv: bool = True, generate_parquet: bool = False
) -> None:
"""
Executes SQL queries in the specified folder and creates a
compressed CSV file and/or Parquet file from a pandas DataFrame.

This method iterates over all .sql files in the 'scripts/sql' directory,
executes each query using the 'execute_sql_query' method, and generates
a DataFrame 'index_df'. The DataFrame is then saved as a compressed CSV
and/or a Parquet file, depending on the method arguments.
"""

scripts_dir = Path(__file__).parent.parent
sql_dir = scripts_dir / "sql"

for file_name in os.listdir(sql_dir):
if file_name.endswith(".sql"):
file_path = Path(sql_dir) / file_name
index_df, output_basename = self.execute_sql_query(file_path)
logger.debug(
"Executed and processed SQL queries from file: %s", file_path
)
if generate_compressed_csv:
csv_file_name = f"{output_basename}.csv.zip"
index_df.to_csv(
csv_file_name, compression={"method": "zip"}, escapechar="\\"
)
logger.debug("Created CSV zip file: %s", csv_file_name)

if generate_parquet:
parquet_file_name = f"{output_basename}.parquet"
index_df.to_parquet(parquet_file_name)
logger.debug("Created Parquet file: %s", parquet_file_name)

def run(
self, generate_compressed_csv: bool = True, generate_parquet: bool = False
) -> None:
"""
Runs the IDCIndexDataManager to locally generate index-data files by
running queries against the Google Cloud Platform IDC project tables.
"""
self.generate_index_data_files(
generate_compressed_csv=generate_compressed_csv,
generate_parquet=generate_parquet,
)


if __name__ == "__main__":
import argparse

project_id = os.environ["GCP_PROJECT"]

parser = argparse.ArgumentParser()
parser.add_argument(
"--generate-csv-archive",
action="store_true",
help="Generate idc_index.csv.zip file",
)
parser.add_argument(
"--generate-parquet",
action="store_true",
help="Generate idc_index.parquet file",
)

args = parser.parse_args()

if not any([args.generate_csv_archive, args.generate_parquet]):
parser.error(
"At least --generate-csv-archive or --generate-parquet must be specified"
)

manager = IDCIndexDataManager(project_id)
manager.run(
generate_compressed_csv=args.generate_csv_archive,
generate_parquet=args.generate_parquet,
)
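For context, a hedged usage sketch of the manager above, assuming BigQuery credentials are already available (e.g. via GOOGLE_APPLICATION_CREDENTIALS) and that GCP_PROJECT names a project with access to the public IDC tables:

# run_manager.py -- usage sketch, not part of this PR.
import os

from idc_index_data_manager import IDCIndexDataManager

manager = IDCIndexDataManager(project_id=os.environ["GCP_PROJECT"])
# Equivalent to: python scripts/python/idc_index_data_manager.py --generate-csv-archive
manager.run(generate_compressed_csv=True, generate_parquet=False)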
34 changes: 34 additions & 0 deletions scripts/sql/idc_index.sql
@@ -0,0 +1,34 @@
SELECT
# collection level attributes
ANY_VALUE(collection_id) AS collection_id,
ANY_VALUE(PatientID) AS PatientID,
SeriesInstanceUID,
ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
ANY_VALUE(source_DOI) AS source_DOI,
# patient level attributes
ANY_VALUE(PatientAge) AS PatientAge,
ANY_VALUE(PatientSex) AS PatientSex,
# study level attributes
ANY_VALUE(StudyDate) AS StudyDate,
ANY_VALUE(StudyDescription) AS StudyDescription,
ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
# series level attributes
ANY_VALUE(Modality) AS Modality,
ANY_VALUE(Manufacturer) AS Manufacturer,
ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
ANY_VALUE(SeriesDescription) AS SeriesDescription,
ANY_VALUE(SeriesNumber) AS SeriesNumber,
COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
ANY_VALUE(license_short_name) as license_short_name,
# download related attributes
ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/*")) AS series_aws_url,
ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
FROM
`bigquery-public-data.idc_current.dicom_all` AS dicom_all
JOIN
`bigquery-public-data.idc_current.dicom_metadata_curated` AS dicom_curated
ON
dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
GROUP BY
SeriesInstanceUID
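The query above collapses dicom_all down to one row per SeriesInstanceUID. Once the manager has materialized it, the archive can be spot-checked with pandas; a small sketch, assuming idc_index.csv.zip was generated into the working directory:

# inspect_index.py -- sketch for eyeballing the generated index,
# assuming idc_index.csv.zip exists in the working directory.
import pandas as pd

df = pd.read_csv("idc_index.csv.zip")  # pandas reads the zip archive transparently
print(df.shape)  # one row per SeriesInstanceUID
print(df[["collection_id", "Modality", "series_size_MB"]].head())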
18 changes: 15 additions & 3 deletions src/idc_index_data/__init__.py
@@ -16,18 +16,30 @@

from ._version import version as __version__

__all__ = ["__version__", "IDC_INDEX_CSV_ARCHIVE_FILEPATH"]
__all__ = [
"__version__",
"IDC_INDEX_CSV_ARCHIVE_FILEPATH",
# "IDC_INDEX_PARQUET_FILEPATH",
]


def _lookup(path: str) -> Path:
def _lookup(path: str, optional: bool = False) -> Path | None:
"""Support editable installation by looking up path using distribution API."""
files = distribution("idc_index_data").files
if files is not None:
for _file in files:
if str(_file) == path:
return Path(str(_file.locate())).resolve(strict=True)
if optional:
return None

msg = f"Failed to lookup '{path}'."
raise FileNotFoundError(msg)


IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path = _lookup("idc_index_data/idc_index.csv.zip")
IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path | None = _lookup(
"idc_index_data/idc_index.csv.zip"
)
IDC_INDEX_PARQUET_FILEPATH: Path | None = _lookup(
"idc_index_data/idc_index.parquet", optional=True
)
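Because the Parquet artifact is optional, IDC_INDEX_PARQUET_FILEPATH may be None at runtime and downstream code needs to guard for it. A minimal consumer sketch, assuming the package is installed with the CSV archive present (the default):

# consume_paths.py -- sketch of the None-guard downstream code needs.
import pandas as pd

import idc_index_data

if idc_index_data.IDC_INDEX_PARQUET_FILEPATH is not None:
    df = pd.read_parquet(idc_index_data.IDC_INDEX_PARQUET_FILEPATH)
else:
    df = pd.read_csv(idc_index_data.IDC_INDEX_CSV_ARCHIVE_FILEPATH)
print(f"Loaded index with {len(df)} series")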
9 changes: 7 additions & 2 deletions tests/test_package.py
@@ -10,5 +10,10 @@ def test_version():


def test_filepath():
assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.is_file()
assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.name == "idc_index.csv.zip"
if m.IDC_INDEX_CSV_ARCHIVE_FILEPATH is not None:
assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.is_file()
assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.name == "idc_index.csv.zip"

if m.IDC_INDEX_PARQUET_FILEPATH is not None:
assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
assert m.IDC_INDEX_PARQUET_FILEPATH.name == "idc_index.parquet"