From 562d4935d7c95d52136c63abea7accc7cc400dc9 Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Fri, 19 Jan 2024 15:46:25 -0700 Subject: [PATCH] Add IN Carta preset and related compatibility changes + testing (#145) * add initial in-carta preset * add test dir fixture for in carta * add in carta test * modify source path processing for no targets * raise errors for special no target sources case * update pre-commit versions * set explicit order for column metadata extraction * further test revision * add comment about uncertain version * add casting for mypy * add docs for in carta usage * add in-carta version specification * distinct dest dir for test * move to errno reference for dir not empty * Update docs/source/overview.md Co-authored-by: Gregory Way * add in-carta as data source in main readme * add logging, comments for no target circumstances * linting --------- Co-authored-by: Gregory Way --- .pre-commit-config.yaml | 4 +- cytotable/convert.py | 22 +++++++++-- cytotable/presets.py | 28 ++++++++++++++ cytotable/sources.py | 65 +++++++++++++++++++++++--------- cytotable/utils.py | 9 ++++- docs/source/_static/dataflow.mmd | 9 ++++- docs/source/_static/dataflow.svg | 2 +- docs/source/overview.md | 9 +++++ readme.md | 2 +- tests/conftest.py | 9 +++++ tests/test_convert.py | 40 ++++++++++++++++++++ 11 files changed, 171 insertions(+), 28 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c25e6787..48c64bae 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -33,7 +33,7 @@ repos: hooks: - id: yamllint - repo: https://github.com/psf/black - rev: 23.12.0 + rev: 23.12.1 hooks: - id: black - repo: https://github.com/asottile/blacken-docs @@ -55,7 +55,7 @@ repos: hooks: - id: vulture - repo: https://github.com/pre-commit/mirrors-mypy - rev: v1.7.1 + rev: v1.8.0 hooks: - id: mypy - repo: https://github.com/PyCQA/pylint diff --git a/cytotable/convert.py b/cytotable/convert.py index ad083b80..098f040a 100644 --- 
a/cytotable/convert.py +++ b/cytotable/convert.py @@ -75,7 +75,9 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]] segment_type as column_dtype FROM pragma_storage_info('column_details') /* avoid duplicate entries in the form of VALIDITY segment_types */ - WHERE segment_type != 'VALIDITY'; + WHERE segment_type != 'VALIDITY' + /* explicitly order the columns by their id to avoid inconsistent results */ + ORDER BY column_id ASC; """ # attempt to read the data to parquet from duckdb @@ -319,7 +321,7 @@ def _source_chunk_to_parquet( select_columns = ",".join( [ # here we cast the column to the specified type ensure the colname remains the same - f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}" + f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\"" for column in source["columns"] ] ) @@ -414,6 +416,7 @@ def _prepend_column_name( Path to the modified file. """ + import logging import pathlib import pyarrow.parquet as parquet @@ -421,8 +424,20 @@ def _prepend_column_name( from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING from cytotable.utils import _write_parquet_table_with_metadata + logger = logging.getLogger(__name__) + targets = tuple(metadata) + tuple(compartments) + # if we have no targets or metadata to work from, return the table unchanged + if len(targets) == 0: + logger.warning( + msg=( + "Skipping column name prepend operations" + "because no compartments or metadata were provided." + ) + ) + return table_path + table = parquet.read_table( source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING ) @@ -569,6 +584,7 @@ def _concat_source_group( Updated dictionary containing concatenated sources. 
""" + import errno import pathlib import pyarrow as pa @@ -649,7 +665,7 @@ def _concat_source_group( pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir() except OSError as os_err: # raise only if we don't have a dir not empty errno - if os_err.errno != 66: + if os_err.errno != errno.ENOTEMPTY: raise # return the concatted parquet filename diff --git a/cytotable/presets.py b/cytotable/presets.py index 488fe6b1..4d8c7095 100644 --- a/cytotable/presets.py +++ b/cytotable/presets.py @@ -204,6 +204,34 @@ AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei """, }, + "in-carta": { + # version specifications using related references + "CONFIG_SOURCE_VERSION": { + "in-carta": "v1.17.0412545", + }, + # names of source table compartments (for ex. cells.csv, etc.) + "CONFIG_NAMES_COMPARTMENTS": tuple(), + # names of source table metadata (for ex. image.csv, etc.) + "CONFIG_NAMES_METADATA": tuple(), + # column names in any compartment or metadata tables which contain + # unique names to avoid renaming + "CONFIG_IDENTIFYING_COLUMNS": ( + "OBJECT ID", + "Row", + "Column", + "FOV", + "WELL LABEL", + "Z", + "T", + ), + # chunk size to use for join operations to help with possible performance issues + # note: this number is an estimate and is may need changes contingent on data + # and system used by this library. + "CONFIG_CHUNK_SIZE": 1000, + # compartment and metadata joins performed using DuckDB SQL + # and modified at runtime as needed + "CONFIG_JOINS": "", + }, } """ Configuration presets for CytoTable diff --git a/cytotable/sources.py b/cytotable/sources.py index 2e81955e..87ec9578 100644 --- a/cytotable/sources.py +++ b/cytotable/sources.py @@ -47,6 +47,7 @@ def _build_path( def _get_source_filepaths( path: Union[pathlib.Path, AnyPath], targets: List[str], + source_datatype: Optional[str] = None, ) -> Dict[str, List[Dict[str, Any]]]: """ Gather dataset of filepaths from a provided directory path. 
@@ -56,19 +57,27 @@ def _get_source_filepaths( Either a directory path to seek filepaths within or a path directly to a file. targets: List[str]: Compartment and metadata names to seek within the provided path. + source_datatype: Optional[str]: (Default value = None) + The source datatype (extension) to use for reading the tables. Returns: Dict[str, List[Dict[str, Any]]] Data structure which groups related files based on the compartments. """ + import os import pathlib from cloudpathlib import AnyPath - from cytotable.exceptions import NoInputDataException + from cytotable.exceptions import DatatypeException, NoInputDataException from cytotable.utils import _cache_cloudpath_to_local, _duckdb_reader + if (targets is None or targets == []) and source_datatype is None: + raise DatatypeException( + f"A source_datatype must be specified when using undefined compartments and metadata names." + ) + # gathers files from provided path using compartments + metadata as a filter sources = [ # build source_paths for all files @@ -85,6 +94,7 @@ def _get_source_filepaths( # ensure the subpaths meet certain specifications if ( targets is None + or targets == [] # checks for name of the file from targets (compartment + metadata names) or str(subpath.stem).lower() in [target.lower() for target in targets] # checks for sqlite extension (which may include compartment + metadata names) @@ -134,21 +144,38 @@ def _get_source_filepaths( # group files together by similar filename for later data operations grouped_sources = {} - for unique_source in set(source["source_path"].name for source in sources): - grouped_sources[unique_source.capitalize()] = [ - # case for files besides sqlite - source if source["source_path"].suffix.lower() != ".sqlite" - # if we have sqlite entries, update the source_path to the parent - # (the parent table database file) as grouped key name will now - # encapsulate the table name details. 
- else { - "source_path": source["source_path"].parent, - "table_name": source["table_name"], - } - for source in sources - # focus only on entries which include the unique_source name - if source["source_path"].name == unique_source - ] + + # if we have no targets, create a single group inferred from a common prefix and suffix + # note: this may apply for scenarios where no compartments or metadata are + # provided as input to CytoTable operations. + if targets is None or targets == []: + # gather a common prefix to use for the group + common_prefix = os.path.commonprefix( + [ + source["source_path"].stem + for source in sources + if source["source_path"].suffix == f".{source_datatype}" + ] + ) + grouped_sources[f"{common_prefix}.{source_datatype}"] = sources + + # otherwise, use the unique names in the paths to determine source grouping + else: + for unique_source in set(source["source_path"].name for source in sources): + grouped_sources[unique_source.capitalize()] = [ + # case for files besides sqlite + source if source["source_path"].suffix.lower() != ".sqlite" + # if we have sqlite entries, update the source_path to the parent + # (the parent table database file) as grouped key name will now + # encapsulate the table name details. + else { + "source_path": source["source_path"].parent, + "table_name": source["table_name"], + } + for source in sources + # focus only on entries which include the unique_source name + if source["source_path"].name == unique_source + ] return grouped_sources @@ -190,7 +217,7 @@ def _infer_source_datatype( raise DatatypeException( ( f"Unable to find source datatype {source_datatype} " - "within files. Detected datatypes: {suffixes}" + f"within files. 
Detected datatypes: {suffixes}" ) ) @@ -270,7 +297,9 @@ def _gather_sources( source_path = _build_path(path=source_path, **kwargs) # gather filepaths which will be used as the basis for this work - sources = _get_source_filepaths(path=source_path, targets=targets) + sources = _get_source_filepaths( + path=source_path, targets=targets, source_datatype=source_datatype + ) # infer or validate the source datatype based on source filepaths source_datatype = _infer_source_datatype( diff --git a/cytotable/utils.py b/cytotable/utils.py index 8f317b0c..9789f42e 100644 --- a/cytotable/utils.py +++ b/cytotable/utils.py @@ -202,13 +202,18 @@ def _sqlite_mixed_type_query_to_parquet( with sqlite3.connect(source_path) as conn: cursor = conn.cursor() - # gather table column details including datatype + # Gather table column details including datatype. + # Note: uses SQLite pragma for table information. + # See the following for more information: + # https://sqlite.org/pragma.html#pragma_table_info cursor.execute( f""" SELECT :table_name as table_name, name as column_name, type as column_type - FROM pragma_table_info(:table_name); + FROM pragma_table_info(:table_name) + /* explicit column ordering by 'cid' */ + ORDER BY cid ASC; """, {"table_name": table_name}, ) diff --git a/docs/source/_static/dataflow.mmd b/docs/source/_static/dataflow.mmd index bdd06f58..eb0159e9 100644 --- a/docs/source/_static/dataflow.mmd +++ b/docs/source/_static/dataflow.mmd @@ -6,20 +6,27 @@ flowchart LR DeepProfiler npz[(NPZ Files)] cytominer-database - sqlite[(SQLite File)] + sqlite[(SQLite Files)] cp_sqlite[(SQLite File)] + in_carta[IN Carta] + ic_csv[(CSV files)] pycytominer CytoTable images --> CellProfiler images --> DeepProfiler + images --> in_carta CellProfiler --> csv CellProfiler --> cp_sqlite DeepProfiler --> npz csv --> cytominer-database cytominer-database --> sqlite + in_carta --> ic_csv csv --> CytoTable npz --> CytoTable sqlite --> CytoTable cp_sqlite --> CytoTable + ic_csv --> CytoTable 
CytoTable --> pycytominer + + style CytoTable fill:#FDCA88,stroke:#D96026; diff --git a/docs/source/_static/dataflow.svg b/docs/source/_static/dataflow.svg index 387b37ac..ac86ca67 100644 --- a/docs/source/_static/dataflow.svg +++ b/docs/source/_static/dataflow.svg @@ -1 +1 @@ -
Images
CellProfiler
CSV
DeepProfiler
NPZ
cytominer-database
SQLite
SQLite
pycytominer
CytoTable
Parquet
+
Cell Images
CellProfiler
CSV Files
DeepProfiler
NPZ Files
cytominer-database
SQLite Files
SQLite File
IN Carta
CSV files
pycytominer
CytoTable
diff --git a/docs/source/overview.md b/docs/source/overview.md index f7ca3862..5aacc2e5 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -100,6 +100,15 @@ Data source compatibility for CytoTable is focused (but not explicitly limited t * **Preset specification:** SQLite data sources from CellProfiler may use the configuration preset :code:`convert(..., preset="cellprofiler_sqlite", ...)` (:mod:`convert() `). ``` +#### IN Carta Data Sources + +- __Comma-separated values (.csv)__: [Molecular Devices IN Carta](https://www.moleculardevices.com/products/cellular-imaging-systems/high-content-analysis/in-carta-image-analysis-software) software provides output data in CSV format. + +```{eval-rst} + * **Manual specification:** CSV data source types may be manually specified by using :code:`convert(..., source_datatype="csv", ...)` (:mod:`convert() `). + * **Preset specification:** CSV data sources from IN Carta Image Analysis Software may use the configuration preset :code:`convert(..., preset="in-carta", ...)` (:mod:`convert() `). +``` + ## Data Destinations ### Data Destination Locations diff --git a/readme.md b/readme.md index dff87fa2..4cb161c9 100644 --- a/readme.md +++ b/readme.md @@ -7,7 +7,7 @@ _Diagram showing data flow relative to this project._ ## Summary -CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale. +CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), DeepProfiler (`.npz`), and other sources such as IN Carta output data at scale. CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer). 
The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects). diff --git a/tests/conftest.py b/tests/conftest.py index b71290a8..0350ff99 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -134,6 +134,15 @@ def fixture_data_dirs_cytominerdatabase(data_dir_cytominerdatabase: str) -> List ] +@pytest.fixture(name="data_dirs_in_carta") +def fixture_data_dir_in_carta() -> List[str]: + """ + Provide data directories for IN Carta test data + """ + + return [f"{pathlib.Path(__file__).parent}/data/in-carta/colas-lab"] + + @pytest.fixture(name="cytominerdatabase_sqlite") def fixture_cytominerdatabase_sqlite( fx_tempdir: str, diff --git a/tests/test_convert.py b/tests/test_convert.py index 8572fd29..8b877dfc 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -1071,3 +1071,43 @@ def test_cell_health_cellprofiler_to_cytominer_database_legacy( ] ) ) + + +def test_in_carta_to_parquet( + load_parsl_default: None, fx_tempdir: str, data_dirs_in_carta: List[str] +): + """ + Testing IN Carta preset with CytoTable convert to parquet output. 
+ """ + + for data_dir in data_dirs_in_carta: + # read the directory of data with wildcard + with duckdb.connect() as ddb: + ddb_result = ddb.execute( + f""" + SELECT * + FROM read_csv_auto('{data_dir}/*.csv') + """ + ).arrow() + + # process the data with cytotable using in-carta preset + cytotable_result = convert( + source_path=data_dir, + dest_path=f"{fx_tempdir}/{pathlib.Path(data_dir).name}", + dest_datatype="parquet", + source_datatype="csv", + preset="in-carta", + join=False, + ) + + # read the result from CytoTable as a table + cytotable_result_table = parquet.read_table( + # note: we use cast here to explicitly tell mypy about the types involved + cast(list, cytotable_result[list(cast(dict, cytotable_result).keys())[0]])[ + 0 + ]["table"][0] + ) + + # check the data against one another + assert cytotable_result_table.schema.equals(ddb_result.schema) + assert cytotable_result_table.shape == ddb_result.shape