bug(fix): change date column to str prior to writing to parquet (#22)

ImagingDataCommons · Apr 9, 2024
commit 5278a96 · 1 parent a678877
Show file tree

Hide file tree

Showing 4 changed files with 19 additions and 1 deletion.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -54,6 +54,7 @@ repos:
         args: []
         additional_dependencies:
           - pytest
+          - pandas-stubs
 
   - repo: https://github.com/codespell-project/codespell
     rev: "v2.2.6"

diff --git a/pyproject.toml b/pyproject.toml
@@ -13,7 +13,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "idc-index-data"
-version = "17.0.1"
+version = "17.0.2"
 authors = [
   { name = "Andrey Fedorov", email = "[email protected]" },
   { name = "Vamsi Thiriveedhi", email = "[email protected]" },
@@ -44,6 +44,8 @@ dependencies = []
 
 [project.optional-dependencies]
 test = [
+  "pandas",
+  "pyarrow",
   "pytest >=6",
   "pytest-cov >=3",
 ]

diff --git a/scripts/python/idc_index_data_manager.py b/scripts/python/idc_index_data_manager.py
@@ -31,6 +31,8 @@ def execute_sql_query(self, file_path: str) -> tuple[pd.DataFrame, str]:
         with Path(file_path).open("r") as file:
             sql_query = file.read()
         index_df = self.client.query(sql_query).to_dataframe()
+        if "StudyDate" in index_df.columns:
+            index_df["StudyDate"] = index_df["StudyDate"].astype(str)
         output_basename = Path(file_path).name.split(".")[0]
         logger.debug("Executed SQL query from file: %s", file_path)
         return index_df, output_basename

diff --git a/tests/test_package.py b/tests/test_package.py
@@ -2,6 +2,7 @@
 
 import importlib.metadata
 
+import pandas as pd
 from packaging.version import Version
 
 import idc_index_data as m
@@ -25,3 +26,15 @@ def test_filepath():
     if m.IDC_INDEX_PARQUET_FILEPATH is not None:
         assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
         assert m.IDC_INDEX_PARQUET_FILEPATH.name == "idc_index.parquet"
+
+
+def test_reading_index():
+    if m.IDC_INDEX_CSV_ARCHIVE_FILEPATH is not None:
+        assert m.IDC_INDEX_CSV_ARCHIVE_FILEPATH.is_file()
+        df_csv = pd.read_csv(m.IDC_INDEX_CSV_ARCHIVE_FILEPATH)
+        assert not df_csv.empty
+
+    if m.IDC_INDEX_PARQUET_FILEPATH is not None:
+        assert m.IDC_INDEX_PARQUET_FILEPATH.is_file()
+        df_parquet = pd.read_parquet(m.IDC_INDEX_PARQUET_FILEPATH)
+        assert not df_parquet.empty