Add IN Carta preset and related compatibility changes + testing (#145)
* add initial in-carta preset

* add test dir fixture for in carta

* add in carta test

* modify source path processing for no targets

* raise errors for special no target sources case

* update pre-commit versions

* set explicit order for column metadata extraction

* further test revision

* add comment about uncertain version

* add casting for mypy

* add docs for in carta usage

* add in-carta version specification

* distinct dest dir for test

* move to errno reference for dir not empty

* Update docs/source/overview.md

Co-authored-by: Gregory Way <[email protected]>

* add in-carta as data source in main readme

* add logging, comments for no target circumstances

* linting

---------

Co-authored-by: Gregory Way <[email protected]>
d33bs and gwaybio authored Jan 19, 2024
1 parent d5f6b46 commit 562d493
Showing 11 changed files with 171 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
hooks:
- id: yamllint
- repo: https://github.com/psf/black
- rev: 23.12.0
+ rev: 23.12.1
hooks:
- id: black
- repo: https://github.com/asottile/blacken-docs
@@ -55,7 +55,7 @@ repos:
hooks:
- id: vulture
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.7.1
+ rev: v1.8.0
hooks:
- id: mypy
- repo: https://github.com/PyCQA/pylint
22 changes: 19 additions & 3 deletions cytotable/convert.py
@@ -75,7 +75,9 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
segment_type as column_dtype
FROM pragma_storage_info('column_details')
/* avoid duplicate entries in the form of VALIDITY segment_types */
- WHERE segment_type != 'VALIDITY';
+ WHERE segment_type != 'VALIDITY'
+ /* explicitly order the columns by their id to avoid inconsistent results */
+ ORDER BY column_id ASC;
"""

# attempt to read the data to parquet from duckdb
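As a sketch of the ordering behavior this hunk addresses (the table and data here are hypothetical), DuckDB's `pragma_storage_info` can return rows in an inconsistent order unless sorted explicitly:

```python
import duckdb  # assumes the duckdb Python package is available

with duckdb.connect() as ddb:
    # hypothetical table standing in for the 'column_details' reference above
    ddb.execute("CREATE TABLE column_details AS SELECT 1 AS id, 'a' AS name")
    columns = ddb.execute(
        """
        SELECT DISTINCT column_id, column_name, segment_type AS column_dtype
        FROM pragma_storage_info('column_details')
        /* avoid duplicate entries in the form of VALIDITY segment_types */
        WHERE segment_type != 'VALIDITY'
        /* order explicitly so results are deterministic across calls */
        ORDER BY column_id ASC;
        """
    ).fetchall()
    print(columns)  # e.g. [(0, 'id', 'INTEGER'), (1, 'name', 'VARCHAR')]
```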
@@ -319,7 +321,7 @@ def _source_chunk_to_parquet(
select_columns = ",".join(
[
# here we cast the column to the specified type to ensure the column name remains the same
- f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}"
+ f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
for column in source["columns"]
]
)
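The added double quotes matter for sources such as IN Carta, whose headers (for example `OBJECT ID` and `WELL LABEL` in the preset below) contain spaces; a minimal sketch with a hypothetical table:

```python
import duckdb

with duckdb.connect() as ddb:
    # hypothetical table with a space in its column name, as IN Carta emits
    ddb.execute('CREATE TABLE demo ("OBJECT ID" INTEGER)')
    ddb.execute("INSERT INTO demo VALUES (1)")
    # without double quotes the identifier fails to parse; quoting also
    # preserves the exact column name through the CAST alias
    result = ddb.execute(
        'SELECT CAST("OBJECT ID" AS BIGINT) AS "OBJECT ID" FROM demo'
    ).fetchall()
    print(result)  # [(1,)]
```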
@@ -414,15 +416,28 @@ def _prepend_column_name(
Path to the modified file.
"""

import logging
import pathlib

import pyarrow.parquet as parquet

from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
from cytotable.utils import _write_parquet_table_with_metadata

logger = logging.getLogger(__name__)

targets = tuple(metadata) + tuple(compartments)

# if we have no targets or metadata to work from, return the table unchanged
if len(targets) == 0:
logger.warning(
msg=(
"Skipping column name prepend operations"
"because no compartments or metadata were provided."
)
)
return table_path

table = parquet.read_table(
source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
)
@@ -569,6 +584,7 @@ def _concat_source_group(
Updated dictionary containing concatenated sources.
"""

import errno
import pathlib

import pyarrow as pa
@@ -649,7 +665,7 @@ def _concat_source_group(
pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
except OSError as os_err:
# raise only if we don't have a dir not empty errno
- if os_err.errno != 66:
+ if os_err.errno != errno.ENOTEMPTY:
raise

# return the concatted parquet filename
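A minimal sketch of the portable pattern adopted here (the helper name is hypothetical): symbolic `errno` constants avoid hard-coding values that differ by platform:

```python
import errno
import pathlib


def rmdir_ignore_not_empty(path: str) -> None:
    """Remove a directory, tolerating only the 'directory not empty' case."""
    try:
        pathlib.Path(path).rmdir()
    except OSError as os_err:
        # ENOTEMPTY is 39 on Linux but 66 on macOS/BSD; the symbolic
        # constant stays correct on every platform
        if os_err.errno != errno.ENOTEMPTY:
            raise
```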
28 changes: 28 additions & 0 deletions cytotable/presets.py
@@ -204,6 +204,34 @@
AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
""",
},
"in-carta": {
# version specifications using related references
"CONFIG_SOURCE_VERSION": {
"in-carta": "v1.17.0412545",
},
# names of source table compartments (for ex. cells.csv, etc.)
"CONFIG_NAMES_COMPARTMENTS": tuple(),
# names of source table metadata (for ex. image.csv, etc.)
"CONFIG_NAMES_METADATA": tuple(),
# column names in any compartment or metadata tables which contain
# unique names to avoid renaming
"CONFIG_IDENTIFYING_COLUMNS": (
"OBJECT ID",
"Row",
"Column",
"FOV",
"WELL LABEL",
"Z",
"T",
),
# chunk size to use for join operations to help with possible performance issues
# note: this number is an estimate and may need changes contingent on the data
# and system used by this library.
"CONFIG_CHUNK_SIZE": 1000,
# compartment and metadata joins performed using DuckDB SQL
# and modified at runtime as needed
"CONFIG_JOINS": "",
},
}
"""
Configuration presets for CytoTable
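A quick way to inspect the new preset, assuming the dictionary above is the module-level `config` mapping in `cytotable.presets` (a sketch, not verified against the module's public API):

```python
# assumes the dictionary shown above is exposed as `config` in cytotable.presets
from cytotable.presets import config

in_carta = config["in-carta"]
print(in_carta["CONFIG_SOURCE_VERSION"])       # {'in-carta': 'v1.17.0412545'}
print(in_carta["CONFIG_NAMES_COMPARTMENTS"])   # () - no compartments defined
print(in_carta["CONFIG_IDENTIFYING_COLUMNS"])  # ('OBJECT ID', 'Row', ...)
```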
65 changes: 47 additions & 18 deletions cytotable/sources.py
@@ -47,6 +47,7 @@ def _build_path(
def _get_source_filepaths(
path: Union[pathlib.Path, AnyPath],
targets: List[str],
source_datatype: Optional[str] = None,
) -> Dict[str, List[Dict[str, Any]]]:
"""
Gather dataset of filepaths from a provided directory path.
@@ -56,19 +57,27 @@ def _get_source_filepaths(
Either a directory path to seek filepaths within or a path directly to a file.
targets: List[str]:
Compartment and metadata names to seek within the provided path.
source_datatype: Optional[str]: (Default value = None)
The source datatype (extension) to use for reading the tables.
Returns:
Dict[str, List[Dict[str, Any]]]
Data structure which groups related files based on the compartments.
"""

import os
import pathlib

from cloudpathlib import AnyPath

- from cytotable.exceptions import NoInputDataException
+ from cytotable.exceptions import DatatypeException, NoInputDataException
from cytotable.utils import _cache_cloudpath_to_local, _duckdb_reader

if (targets is None or targets == []) and source_datatype is None:
raise DatatypeException(
f"A source_datatype must be specified when using undefined compartments and metadata names."
)

# gathers files from provided path using compartments + metadata as a filter
sources = [
# build source_paths for all files
@@ -85,6 +94,7 @@
# ensure the subpaths meet certain specifications
if (
targets is None
or targets == []
# checks for name of the file from targets (compartment + metadata names)
or str(subpath.stem).lower() in [target.lower() for target in targets]
# checks for sqlite extension (which may include compartment + metadata names)
@@ -134,21 +144,38 @@ def _get_source_filepaths(

# group files together by similar filename for later data operations
grouped_sources = {}
- for unique_source in set(source["source_path"].name for source in sources):
- grouped_sources[unique_source.capitalize()] = [
- # case for files besides sqlite
- source if source["source_path"].suffix.lower() != ".sqlite"
- # if we have sqlite entries, update the source_path to the parent
- # (the parent table database file) as grouped key name will now
- # encapsulate the table name details.
- else {
- "source_path": source["source_path"].parent,
- "table_name": source["table_name"],
- }
- for source in sources
- # focus only on entries which include the unique_source name
- if source["source_path"].name == unique_source
- ]

+ # if we have no targets, create a single group inferred from a common prefix and suffix
+ # note: this may apply for scenarios where no compartments or metadata are
+ # provided as input to CytoTable operations.
+ if targets is None or targets == []:
+ # gather a common prefix to use for the group
+ common_prefix = os.path.commonprefix(
+ [
+ source["source_path"].stem
+ for source in sources
+ if source["source_path"].suffix == f".{source_datatype}"
+ ]
+ )
+ grouped_sources[f"{common_prefix}.{source_datatype}"] = sources
+
+ # otherwise, use the unique names in the paths to determine source grouping
+ else:
+ for unique_source in set(source["source_path"].name for source in sources):
+ grouped_sources[unique_source.capitalize()] = [
+ # case for files besides sqlite
+ source if source["source_path"].suffix.lower() != ".sqlite"
+ # if we have sqlite entries, update the source_path to the parent
+ # (the parent table database file) as grouped key name will now
+ # encapsulate the table name details.
+ else {
+ "source_path": source["source_path"].parent,
+ "table_name": source["table_name"],
+ }
+ for source in sources
+ # focus only on entries which include the unique_source name
+ if source["source_path"].name == unique_source
+ ]

return grouped_sources
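As a small illustration of the grouping key built above (file names are hypothetical), `os.path.commonprefix` compares strings character by character rather than by path component:

```python
import os

# hypothetical IN Carta-style export stems from a single scan
stems = ["plate1-scan-well-A01", "plate1-scan-well-B02"]
print(os.path.commonprefix(stems))  # 'plate1-scan-well-'
```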

@@ -190,7 +217,7 @@ def _infer_source_datatype(
raise DatatypeException(
(
f"Unable to find source datatype {source_datatype} "
"within files. Detected datatypes: {suffixes}"
f"within files. Detected datatypes: {suffixes}"
)
)

@@ -270,7 +297,9 @@ def _gather_sources(
source_path = _build_path(path=source_path, **kwargs)

# gather filepaths which will be used as the basis for this work
- sources = _get_source_filepaths(path=source_path, targets=targets)
+ sources = _get_source_filepaths(
+ path=source_path, targets=targets, source_datatype=source_datatype
+ )

# infer or validate the source datatype based on source filepaths
source_datatype = _infer_source_datatype(
9 changes: 7 additions & 2 deletions cytotable/utils.py
@@ -202,13 +202,18 @@ def _sqlite_mixed_type_query_to_parquet(
with sqlite3.connect(source_path) as conn:
cursor = conn.cursor()

- # gather table column details including datatype
+ # Gather table column details including datatype.
+ # Note: uses SQLite pragma for table information.
+ # See the following for more information:
+ # https://sqlite.org/pragma.html#pragma_table_info
cursor.execute(
f"""
SELECT :table_name as table_name,
name as column_name,
type as column_type
- FROM pragma_table_info(:table_name);
+ FROM pragma_table_info(:table_name)
+ /* explicit column ordering by 'cid' */
+ ORDER BY cid ASC;
""",
{"table_name": table_name},
)
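For reference, a self-contained sketch of the pragma usage above against an in-memory SQLite database (the table is hypothetical):

```python
import sqlite3

with sqlite3.connect(":memory:") as conn:
    conn.execute("CREATE TABLE cells (id INTEGER, area REAL)")
    cursor = conn.cursor()
    # pragma_table_info exposes column metadata; ordering by 'cid'
    # (the column id) keeps the result ordering deterministic
    cursor.execute(
        """
        SELECT name AS column_name, type AS column_type
        FROM pragma_table_info(:table_name)
        ORDER BY cid ASC;
        """,
        {"table_name": "cells"},
    )
    print(cursor.fetchall())  # [('id', 'INTEGER'), ('area', 'REAL')]
```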
9 changes: 8 additions & 1 deletion docs/source/_static/dataflow.mmd
@@ -6,20 +6,27 @@ flowchart LR
DeepProfiler
npz[(NPZ Files)]
cytominer-database
- sqlite[(SQLite File)]
+ sqlite[(SQLite Files)]
cp_sqlite[(SQLite File)]
in_carta[IN Carta]
ic_csv[(CSV files)]
pycytominer
CytoTable

images --> CellProfiler
images --> DeepProfiler
images --> in_carta
CellProfiler --> csv
CellProfiler --> cp_sqlite
DeepProfiler --> npz
csv --> cytominer-database
cytominer-database --> sqlite
in_carta --> ic_csv
csv --> CytoTable
npz --> CytoTable
sqlite --> CytoTable
cp_sqlite --> CytoTable
ic_csv --> CytoTable
CytoTable --> pycytominer

style CytoTable fill:#FDCA88,stroke:#D96026;
2 changes: 1 addition & 1 deletion docs/source/_static/dataflow.svg
(Binary diff for the regenerated SVG is not rendered in this view.)
9 changes: 9 additions & 0 deletions docs/source/overview.md
@@ -100,6 +100,15 @@ Data source compatibility for CytoTable is focused (but not explicitly limited t
* **Preset specification:** SQLite data sources from CellProfiler may use the configuration preset :code:`convert(..., preset="cellprofiler_sqlite", ...)` (:mod:`convert() <cytotable.convert.convert>`).
```

#### IN Carta Data Sources

- __Comma-separated values (.csv)__: [Molecular Devices IN Carta](https://www.moleculardevices.com/products/cellular-imaging-systems/high-content-analysis/in-carta-image-analysis-software) software provides output data in CSV format.

```{eval-rst}
* **Manual specification:** CSV data source types may be manually specified by using :code:`convert(..., source_datatype="csv", ...)` (:mod:`convert() <cytotable.convert.convert>`).
* **Preset specification:** CSV data sources from IN Carta Image Analysis Software may use the configuration preset :code:`convert(..., preset="in-carta", ...)` (:mod:`convert() <cytotable.convert.convert>`).
```
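For example, a minimal conversion sketch mirroring the options above (paths are illustrative):

```python
import cytotable

result = cytotable.convert(
    source_path="./in-carta-exports",  # directory of IN Carta .csv files
    dest_path="./in-carta-output",
    dest_datatype="parquet",
    # the in-carta preset defines no compartment or metadata names,
    # so the source datatype must be given explicitly
    source_datatype="csv",
    preset="in-carta",
    join=False,  # no joins are defined for this preset
)
```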

## Data Destinations

### Data Destination Locations
2 changes: 1 addition & 1 deletion readme.md
@@ -7,7 +7,7 @@ _Diagram showing data flow relative to this project._

## Summary

- CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+ CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), DeepProfiler (`.npz`), and other sources such as IN Carta (`.csv`) output data at scale.
CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).

9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -134,6 +134,15 @@ def fixture_data_dirs_cytominerdatabase(data_dir_cytominerdatabase: str) -> List
]


@pytest.fixture(name="data_dirs_in_carta")
def fixture_data_dirs_in_carta() -> List[str]:
"""
Provide data directories for IN Carta test data
"""

return [f"{pathlib.Path(__file__).parent}/data/in-carta/colas-lab"]


@pytest.fixture(name="cytominerdatabase_sqlite")
def fixture_cytominerdatabase_sqlite(
fx_tempdir: str,
40 changes: 40 additions & 0 deletions tests/test_convert.py
@@ -1071,3 +1071,43 @@ def test_cell_health_cellprofiler_to_cytominer_database_legacy(
]
)
)


def test_in_carta_to_parquet(
load_parsl_default: None, fx_tempdir: str, data_dirs_in_carta: List[str]
):
"""
Testing IN Carta preset with CytoTable convert to parquet output.
"""

for data_dir in data_dirs_in_carta:
# read the directory of data with wildcard
with duckdb.connect() as ddb:
ddb_result = ddb.execute(
f"""
SELECT *
FROM read_csv_auto('{data_dir}/*.csv')
"""
).arrow()

# process the data with cytotable using in-carta preset
cytotable_result = convert(
source_path=data_dir,
dest_path=f"{fx_tempdir}/{pathlib.Path(data_dir).name}",
dest_datatype="parquet",
source_datatype="csv",
preset="in-carta",
join=False,
)

# read the result from CytoTable as a table
cytotable_result_table = parquet.read_table(
# note: we use cast here to explicitly tell mypy about the types involved
cast(list, cytotable_result[list(cast(dict, cytotable_result).keys())[0]])[
0
]["table"][0]
)

# check the data against one another
assert cytotable_result_table.schema.equals(ddb_result.schema)
assert cytotable_result_table.shape == ddb_result.shape
