cd: create a gha that generate an index for each query in the src/sql folder #5

Closed · wants to merge 13 commits

Changes from all commits
7 changes: 7 additions & 0 deletions .github/workflows/cd.yml
@@ -27,6 +27,13 @@ jobs:
with:
fetch-depth: 0

- name: Authorize Google Cloud
uses: google-github-actions/auth@v1
with:
credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
create_credentials_file: true
export_environment_variables: true

- uses: hynek/build-and-inspect-python-package@v2

publish:
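The auth step above writes a service-account credentials file and exports environment variables such as GOOGLE_APPLICATION_CREDENTIALS, so later steps can resolve Application Default Credentials. A minimal sanity-check sketch (hypothetical, not part of this PR) of what a subsequent step could run, assuming google-auth and google-cloud-bigquery are installed:

# check_auth.py -- hypothetical helper, not part of this PR.
# Assumes google-github-actions/auth already exported
# GOOGLE_APPLICATION_CREDENTIALS for this job.
import google.auth
from google.cloud import bigquery

credentials, project = google.auth.default()  # resolves Application Default Credentials
print(f"Authenticated, default project: {project}")

# A trivial round-trip to confirm BigQuery access end to end.
client = bigquery.Client(project=project, credentials=credentials)
rows = list(client.query("SELECT 1 AS ok").result())
print(rows)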
24 changes: 21 additions & 3 deletions .github/workflows/ci.yml
@@ -22,12 +22,22 @@ jobs:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Authorize Google Cloud
uses: google-github-actions/auth@v1
with:
credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
create_credentials_file: true
export_environment_variables: true

- uses: actions/setup-python@v5
with:
python-version: "3.x"

- uses: pre-commit/[email protected]
with:
extra_args: --hook-stage manual --all-files

- name: Run PyLint
run: |
echo "::add-matcher::$GITHUB_WORKSPACE/.github/matchers/pylint.json"
@@ -43,15 +53,23 @@
python-version: ["3.8", "3.12"]
runs-on: [ubuntu-latest, macos-latest, windows-latest]

include:
- python-version: pypy-3.10
runs-on: ubuntu-latest
# currently not working on pypy-3.10
# include:
# - python-version: pypy-3.10
# runs-on: ubuntu-latest

steps:
- uses: actions/checkout@v4
with:
fetch-depth: 0

- name: Authorize Google Cloud
uses: google-github-actions/auth@v1
with:
credentials_json: "${{ secrets.SERVICE_ACCOUNT_KEY }}"
create_credentials_file: true
export_environment_variables: true

- uses: actions/setup-python@v5
with:
python-version: ${{ matrix.python-version }}
3 changes: 3 additions & 0 deletions .gitignore
@@ -156,3 +156,6 @@ Thumbs.db
# Common editor files
*~
*.swp

# gcp service account keys
gha-creds-**.json
45 changes: 31 additions & 14 deletions CMakeLists.txt
@@ -1,20 +1,37 @@
cmake_minimum_required(VERSION 3.15...3.26)
project(${SKBUILD_PROJECT_NAME} LANGUAGES NONE)

find_package(
Python
COMPONENTS Interpreter
REQUIRED)

set(idc_index_release_version "0.3.2")
set(idc_index_data_url "https://github.com/ImagingDataCommons/idc-index/releases/download/${idc_index_release_version}/idc_index.csv.zip")
set(idc_index_data_sha256 "70ec9f915686a27bee3098163b8695c69c8696c05bfb7bd76943a24024cdeeb9")
if(NOT DEFINED ENV{GCP_PROJECT})
message(FATAL_ERROR "GCP_PROJECT env. variable is not set")
endif()

option(IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE "Generate idc_index.csv.zip file" ON)
option(IDC_INDEX_DATA_GENERATE_PARQUET "Generate idc_index.parquet file" OFF)

#
# Download and install index
#
set(download_dir "${PROJECT_BINARY_DIR}")
include(FetchContent)
FetchContent_Populate(s5cmd
URL ${idc_index_data_url}
URL_HASH SHA256=${idc_index_data_sha256}
DOWNLOAD_DIR ${download_dir}
DOWNLOAD_NO_EXTRACT TRUE
)
install(FILES "${download_dir}/idc_index.csv.zip" DESTINATION "idc_index_data")

add_custom_command(
OUTPUT
$<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
Member:

@jcfr if we want to build this for growth from the start, we cannot hard-code the file names. The generator code will now produce CSV/Parquet for each of the queries in the sql folder, but any change to the SQL query file names, or the addition of new queries, will break the code at the moment.

It would make more sense to use wildcards to package all CSV/Parquet files depending on what is configured with the flags. Vamsi is going to give it a try to replace with the wildcard, unless you have other thoughts.

Collaborator:

> Vamsi is going to give it a try to replace with the wildcard

Using a wildcard as such will not work, because the target checks the timestamp of the output (i.e., the named file) to decide whether it should "re-build" or not.

Once this is integrated, I suggest you create an issue describing the type, name, and size of the index files, how these relate to the queries generating them, and which domain they relate to (e.g., microscopy, ...).

Let's also note that with working CI and CD pipelines, refactoring will be straightforward.

$<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
COMMAND python ${CMAKE_CURRENT_SOURCE_DIR}/scripts/python/idc_index_data_manager.py
--generate-csv-archive
Member:

Should this line instead be

$<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:--generate-csv-archive>

Collaborator (Author):

Also, I do not know if the boolean value for Parquet is passed along to the index manager properly, as it seems to generate both CSV and Parquet even though the latter is disabled in CMakeLists.txt. This was from the latest CD run:
https://github.com/ImagingDataCommons/idc-index-data/actions/runs/8425241913

$<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:--generate-parquet>
)

add_custom_target(run_idc_index_data_manager ALL
DEPENDS
$<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
$<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
)

install(
FILES
$<$<BOOL:IDC_INDEX_DATA_GENERATE_CSV_ARCHIVE>:${download_dir}/idc_index.csv.zip>
$<$<BOOL:IDC_INDEX_DATA_GENERATE_PARQUET>:${download_dir}/idc_index.parquet>
DESTINATION "idc_index_data")
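On the wildcard question discussed above: a custom command's OUTPUT list must name concrete files, since CMake compares their timestamps to decide whether to re-run the command. One alternative is to enumerate the queries at configure time and derive the expected output names from them. A hypothetical Python sketch of that mapping, assuming the one-output-per-.sql-file convention this PR introduces:

# derive_outputs.py -- hypothetical sketch, not part of this PR.
# Derives the concrete index file names a build would produce from
# the queries in scripts/sql, so a build system can depend on real
# file names instead of wildcards.
from pathlib import Path

def expected_outputs(sql_dir: str, csv: bool = True, parquet: bool = False) -> list[str]:
    outputs: list[str] = []
    for sql_file in sorted(Path(sql_dir).glob("*.sql")):
        basename = sql_file.stem  # e.g. "idc_index"
        if csv:
            outputs.append(f"{basename}.csv.zip")
        if parquet:
            outputs.append(f"{basename}.parquet")
    return outputs

# Printed as a semicolon-separated list, e.g. for consumption by CMake.
print(";".join(expected_outputs("scripts/sql")))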
12 changes: 10 additions & 2 deletions pyproject.toml
@@ -1,5 +1,13 @@
[build-system]
requires = ["scikit-build-core"]
requires = [
"scikit-build-core",
"db-dtypes",
"google-cloud-bigquery",
"pandas",
"pyarrow",
"pygithub",
"requests"
]
build-backend = "scikit_build_core.build"


@@ -108,7 +116,7 @@ disallow_incomplete_defs = true


[tool.ruff]
src = ["src"]
src = ["src", "scripts"]

[tool.ruff.lint]
extend-select = [
114 changes: 114 additions & 0 deletions scripts/python/idc_index_data_manager.py
@@ -0,0 +1,114 @@
from __future__ import annotations

import logging
import os
from pathlib import Path

import pandas as pd
from google.cloud import bigquery

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)


class IDCIndexDataManager:
def __init__(self, project_id: str):
"""
Initializes the IDCIndexDataManager using the Google Cloud Platform project ID.
"""
self.project_id = project_id
self.client = bigquery.Client(project=project_id)
logger.debug("IDCIndexDataManager initialized with project ID: %s", project_id)

def execute_sql_query(self, file_path: str) -> tuple[pd.DataFrame, str]:
"""
Executes the SQL query in the specified file.

Returns:
Tuple[pd.DataFrame, str]: A tuple containing the DataFrame with the query results and the output basename.
"""
with Path(file_path).open("r") as file:
sql_query = file.read()
index_df = self.client.query(sql_query).to_dataframe()
output_basename = Path(file_path).name.split(".")[0]
logger.debug("Executed SQL query from file: %s", file_path)
return index_df, output_basename

def generate_index_data_files(
self, generate_compressed_csv: bool = True, generate_parquet: bool = False
) -> None:
"""
Executes SQL queries in the specified folder and creates a
compressed CSV file and/or Parquet file from a pandas DataFrame.

This method iterates over all .sql files in the 'scripts/sql' directory,
executes each query using the 'execute_sql_query' method, and generates
a DataFrame 'index_df'. The DataFrame is then saved as a compressed CSV
and/or a Parquet file, depending on the method arguments.
"""

scripts_dir = Path(__file__).parent.parent
sql_dir = scripts_dir / "sql"

for file_name in os.listdir(sql_dir):
if file_name.endswith(".sql"):
file_path = Path(sql_dir) / file_name
index_df, output_basename = self.execute_sql_query(file_path)
logger.debug(
"Executed and processed SQL queries from file: %s", file_path
)
if generate_compressed_csv:
csv_file_name = f"{output_basename}.csv.zip"
index_df.to_csv(
csv_file_name, compression={"method": "zip"}, escapechar="\\"
)
logger.debug("Created CSV zip file: %s", csv_file_name)

if generate_parquet:
parquet_file_name = f"{output_basename}.parquet"
index_df.to_parquet(parquet_file_name)
logger.debug("Created Parquet file: %s", parquet_file_name)

def run(
self, generate_compressed_csv: bool = True, generate_parquet: bool = False
) -> None:
"""
Runs the IDCIndexDataManager to locally generate index-data files by
running queries against the Google Cloud Platform IDC project tables.
"""
self.generate_index_data_files(
generate_compressed_csv=generate_compressed_csv,
generate_parquet=generate_parquet,
)


if __name__ == "__main__":
import argparse

project_id = os.environ["GCP_PROJECT"]

parser = argparse.ArgumentParser()
parser.add_argument(
"--generate-csv-archive",
action="store_true",
help="Generate idc_index.csv.zip file",
)
parser.add_argument(
"--generate-parquet",
action="store_true",
help="Generate idc_index.parquet file",
)

args = parser.parse_args()

if not any([args.generate_csv_archive, args.generate_parquet]):
parser.error(
"At least --generate-csv-archive or --generate-parquet must be specified"
)

manager = IDCIndexDataManager(project_id)
manager.run(
generate_compressed_csv=args.generate_csv_archive,
generate_parquet=args.generate_parquet,
)
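For context, a hedged usage sketch of the manager above, assuming BigQuery credentials are already available (e.g. via GOOGLE_APPLICATION_CREDENTIALS) and that GCP_PROJECT names a project with access to the public IDC tables:

# run_manager.py -- usage sketch, not part of this PR.
import os

from idc_index_data_manager import IDCIndexDataManager

manager = IDCIndexDataManager(project_id=os.environ["GCP_PROJECT"])
# Equivalent to: python scripts/python/idc_index_data_manager.py --generate-csv-archive
manager.run(generate_compressed_csv=True, generate_parquet=False)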
34 changes: 34 additions & 0 deletions scripts/sql/idc_index.sql
@@ -0,0 +1,34 @@
SELECT
# collection level attributes
ANY_VALUE(collection_id) AS collection_id,
ANY_VALUE(PatientID) AS PatientID,
SeriesInstanceUID,
ANY_VALUE(StudyInstanceUID) AS StudyInstanceUID,
ANY_VALUE(source_DOI) AS source_DOI,
# patient level attributes
ANY_VALUE(PatientAge) AS PatientAge,
ANY_VALUE(PatientSex) AS PatientSex,
# study level attributes
ANY_VALUE(StudyDate) AS StudyDate,
ANY_VALUE(StudyDescription) AS StudyDescription,
ANY_VALUE(dicom_curated.BodyPartExamined) AS BodyPartExamined,
# series level attributes
ANY_VALUE(Modality) AS Modality,
ANY_VALUE(Manufacturer) AS Manufacturer,
ANY_VALUE(ManufacturerModelName) AS ManufacturerModelName,
ANY_VALUE(SAFE_CAST(SeriesDate AS STRING)) AS SeriesDate,
ANY_VALUE(SeriesDescription) AS SeriesDescription,
ANY_VALUE(SeriesNumber) AS SeriesNumber,
COUNT(dicom_all.SOPInstanceUID) AS instanceCount,
ANY_VALUE(license_short_name) as license_short_name,
# download related attributes
ANY_VALUE(CONCAT("s3://", SPLIT(aws_url,"/")[SAFE_OFFSET(2)], "/", crdc_series_uuid, "/*")) AS series_aws_url,
ROUND(SUM(SAFE_CAST(instance_size AS float64))/1000000, 2) AS series_size_MB,
FROM
`bigquery-public-data.idc_current.dicom_all` AS dicom_all
JOIN
`bigquery-public-data.idc_current.dicom_metadata_curated` AS dicom_curated
ON
dicom_all.SOPInstanceUID = dicom_curated.SOPInstanceUID
GROUP BY
SeriesInstanceUID
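The query above collapses dicom_all down to one row per SeriesInstanceUID. Once the manager has materialized it, the archive can be spot-checked with pandas; a small sketch, assuming idc_index.csv.zip was generated into the working directory:

# inspect_index.py -- sketch for eyeballing the generated index,
# assuming idc_index.csv.zip exists in the working directory.
import pandas as pd

df = pd.read_csv("idc_index.csv.zip")  # pandas reads the zip archive transparently
print(df.shape)  # one row per SeriesInstanceUID
print(df[["collection_id", "Modality", "series_size_MB"]].head())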
18 changes: 15 additions & 3 deletions src/idc_index_data/__init__.py
@@ -16,18 +16,30 @@

from ._version import version as __version__

__all__ = ["__version__", "IDC_INDEX_CSV_ARCHIVE_FILEPATH"]
__all__ = [
"__version__",
"IDC_INDEX_CSV_ARCHIVE_FILEPATH",
# "IDC_INDEX_PARQUET_FILEPATH",
]


def _lookup(path: str) -> Path:
def _lookup(path: str, optional: bool = False) -> Path | None:
"""Support editable installation by looking up path using distribution API."""
files = distribution("idc_index_data").files
if files is not None:
for _file in files:
if str(_file) == path:
return Path(str(_file.locate())).resolve(strict=True)
if optional:
return None

msg = f"Failed to lookup '{path}'."
raise FileNotFoundError(msg)


IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path = _lookup("idc_index_data/idc_index.csv.zip")
IDC_INDEX_CSV_ARCHIVE_FILEPATH: Path | None = _lookup(
"idc_index_data/idc_index.csv.zip"
)
IDC_INDEX_PARQUET_FILEPATH: Path | None = _lookup(
"idc_index_data/idc_index.parquet", optional=True
)
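Because the Parquet artifact is optional, IDC_INDEX_PARQUET_FILEPATH may be None at runtime and downstream code needs to guard for it. A minimal consumer sketch, assuming the package is installed with the CSV archive present (the default):

# consume_paths.py -- sketch of the None-guard downstream code needs.
import pandas as pd

import idc_index_data

if idc_index_data.IDC_INDEX_PARQUET_FILEPATH is not None:
    df = pd.read_parquet(idc_index_data.IDC_INDEX_PARQUET_FILEPATH)
else:
    df = pd.read_csv(idc_index_data.IDC_INDEX_CSV_ARCHIVE_FILEPATH)
print(f"Loaded index with {len(df)} series")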
9 changes: 7 additions & 2 deletions tests/test_package.py
@@ -10,5 +10,10 @@ def test_version():


def test_filepath():
assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.is_file()
assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.name == "idc_index.csv.zip"
if m.IDC_INDEX_CSV_ARCHIVE_FILEPATH is not None:
assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.is_file()
assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.name == "idc_index.csv.zip"

if m.IDC_INDEX_PARQUET_FILEPATH is not None:
assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
assert m.IDC_INDEX_PARQUET_FILEPATH.name == "idc_index.parquet"