Add IN Carta preset and related compatibility changes + testing (#145)
* add initial in-carta preset

* add test dir fixture for in carta

* add in carta test

* modify source path processing for no targets

* raise errors for special no target sources case

* update pre-commit versions

* set explicit order for column metadata extraction

* further test revision

* add comment about uncertain version

* add casting for mypy

* add docs for in carta usage

* add in-carta version specification

* distinct dest dir for test

* move to errno reference for dir not empty

* Update docs/source/overview.md

Co-authored-by: Gregory Way <[email protected]>

* add in-carta as data source in main readme

* add logging, comments for no target circumstances

* linting

---------

Co-authored-by: Gregory Way <[email protected]>
d33bs and gwaybio authored Jan 19, 2024
1 parent d5f6b46 commit 562d493
Showing 11 changed files with 171 additions and 28 deletions.
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
hooks:
- id: yamllint
- repo: https://github.com/psf/black
- rev: 23.12.0
+ rev: 23.12.1
hooks:
- id: black
- repo: https://github.com/asottile/blacken-docs
@@ -55,7 +55,7 @@ repos:
hooks:
- id: vulture
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.7.1
+ rev: v1.8.0
hooks:
- id: mypy
- repo: https://github.com/PyCQA/pylint
22 changes: 19 additions & 3 deletions cytotable/convert.py
@@ -75,7 +75,9 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
segment_type as column_dtype
FROM pragma_storage_info('column_details')
/* avoid duplicate entries in the form of VALIDITY segment_types */
- WHERE segment_type != 'VALIDITY';
+ WHERE segment_type != 'VALIDITY'
+ /* explicitly order the columns by their id to avoid inconsistent results */
+ ORDER BY column_id ASC;
"""

# attempt to read the data to parquet from duckdb
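As a sketch of the ordering behavior this hunk addresses (the table and data here are hypothetical), DuckDB's `pragma_storage_info` can return rows in an inconsistent order unless sorted explicitly:

```python
import duckdb  # assumes the duckdb Python package is available

with duckdb.connect() as ddb:
    # hypothetical table standing in for the 'column_details' reference above
    ddb.execute("CREATE TABLE column_details AS SELECT 1 AS id, 'a' AS name")
    columns = ddb.execute(
        """
        SELECT DISTINCT column_id, column_name, segment_type AS column_dtype
        FROM pragma_storage_info('column_details')
        /* avoid duplicate entries in the form of VALIDITY segment_types */
        WHERE segment_type != 'VALIDITY'
        /* order explicitly so results are deterministic across calls */
        ORDER BY column_id ASC;
        """
    ).fetchall()
    print(columns)  # e.g. [(0, 'id', 'INTEGER'), (1, 'name', 'VARCHAR')]
```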
@@ -319,7 +321,7 @@ def _source_chunk_to_parquet(
select_columns = ",".join(
[
# here we cast the column to the specified type to ensure the column name remains the same
- f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}"
+ f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
for column in source["columns"]
]
)
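The added double quotes matter for sources such as IN Carta, whose headers (for example `OBJECT ID` and `WELL LABEL` in the preset below) contain spaces; a minimal sketch with a hypothetical table:

```python
import duckdb

with duckdb.connect() as ddb:
    # hypothetical table with a space in its column name, as IN Carta emits
    ddb.execute('CREATE TABLE demo ("OBJECT ID" INTEGER)')
    ddb.execute("INSERT INTO demo VALUES (1)")
    # without double quotes the identifier fails to parse; quoting also
    # preserves the exact column name through the CAST alias
    result = ddb.execute(
        'SELECT CAST("OBJECT ID" AS BIGINT) AS "OBJECT ID" FROM demo'
    ).fetchall()
    print(result)  # [(1,)]
```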
@@ -414,15 +416,28 @@ def _prepend_column_name(
Path to the modified file.
"""

import logging
import pathlib

import pyarrow.parquet as parquet

from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
from cytotable.utils import _write_parquet_table_with_metadata

logger = logging.getLogger(__name__)

targets = tuple(metadata) + tuple(compartments)

# if we have no targets or metadata to work from, return the table unchanged
if len(targets) == 0:
logger.warning(
msg=(
"Skipping column name prepend operations"
"because no compartments or metadata were provided."
)
)
return table_path

table = parquet.read_table(
source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
)
@@ -569,6 +584,7 @@ def _concat_source_group(
Updated dictionary containing concatenated sources.
"""

import errno
import pathlib

import pyarrow as pa
@@ -649,7 +665,7 @@ def _concat_source_group(
pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
except OSError as os_err:
# raise only if we don't have a dir not empty errno
- if os_err.errno != 66:
+ if os_err.errno != errno.ENOTEMPTY:
raise

# return the concatted parquet filename
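A minimal sketch of the portable pattern adopted here (the helper name is hypothetical): symbolic `errno` constants avoid hard-coding values that differ by platform:

```python
import errno
import pathlib


def rmdir_ignore_not_empty(path: str) -> None:
    """Remove a directory, tolerating only the 'directory not empty' case."""
    try:
        pathlib.Path(path).rmdir()
    except OSError as os_err:
        # ENOTEMPTY is 39 on Linux but 66 on macOS/BSD; the symbolic
        # constant stays correct on every platform
        if os_err.errno != errno.ENOTEMPTY:
            raise
```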
28 changes: 28 additions & 0 deletions cytotable/presets.py
@@ -204,6 +204,34 @@
AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
""",
},
"in-carta": {
# version specifications using related references
"CONFIG_SOURCE_VERSION": {
"in-carta": "v1.17.0412545",
},
# names of source table compartments (for ex. cells.csv, etc.)
"CONFIG_NAMES_COMPARTMENTS": tuple(),
# names of source table metadata (for ex. image.csv, etc.)
"CONFIG_NAMES_METADATA": tuple(),
# column names in any compartment or metadata tables which contain
# unique names to avoid renaming
"CONFIG_IDENTIFYING_COLUMNS": (
"OBJECT ID",
"Row",
"Column",
"FOV",
"WELL LABEL",
"Z",
"T",
),
# chunk size to use for join operations to help with possible performance issues
# note: this number is an estimate and may need changes contingent on the data
# and system used by this library.
"CONFIG_CHUNK_SIZE": 1000,
# compartment and metadata joins performed using DuckDB SQL
# and modified at runtime as needed
"CONFIG_JOINS": "",
},
}
"""
Configuration presets for CytoTable
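A quick way to inspect the new preset, assuming the dictionary above is the module-level `config` mapping in `cytotable.presets` (a sketch, not verified against the module's public API):

```python
# assumes the dictionary shown above is exposed as `config` in cytotable.presets
from cytotable.presets import config

in_carta = config["in-carta"]
print(in_carta["CONFIG_SOURCE_VERSION"])       # {'in-carta': 'v1.17.0412545'}
print(in_carta["CONFIG_NAMES_COMPARTMENTS"])   # () - no compartments defined
print(in_carta["CONFIG_IDENTIFYING_COLUMNS"])  # ('OBJECT ID', 'Row', ...)
```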
65 changes: 47 additions & 18 deletions cytotable/sources.py
@@ -47,6 +47,7 @@ def _build_path(
def _get_source_filepaths(
path: Union[pathlib.Path, AnyPath],
targets: List[str],
source_datatype: Optional[str] = None,
) -> Dict[str, List[Dict[str, Any]]]:
"""
Gather dataset of filepaths from a provided directory path.
@@ -56,19 +57,27 @@ def _get_source_filepaths(
Either a directory path to seek filepaths within or a path directly to a file.
targets: List[str]:
Compartment and metadata names to seek within the provided path.
source_datatype: Optional[str]: (Default value = None)
The source datatype (extension) to use for reading the tables.
Returns:
Dict[str, List[Dict[str, Any]]]
Data structure which groups related files based on the compartments.
"""

import os
import pathlib

from cloudpathlib import AnyPath

- from cytotable.exceptions import NoInputDataException
+ from cytotable.exceptions import DatatypeException, NoInputDataException
from cytotable.utils import _cache_cloudpath_to_local, _duckdb_reader

if (targets is None or targets == []) and source_datatype is None:
raise DatatypeException(
f"A source_datatype must be specified when using undefined compartments and metadata names."
)

# gathers files from provided path using compartments + metadata as a filter
sources = [
# build source_paths for all files
@@ -85,6 +94,7 @@
# ensure the subpaths meet certain specifications
if (
targets is None
or targets == []
# checks for name of the file from targets (compartment + metadata names)
or str(subpath.stem).lower() in [target.lower() for target in targets]
# checks for sqlite extension (which may include compartment + metadata names)
@@ -134,21 +144,38 @@ def _get_source_filepaths(

# group files together by similar filename for later data operations
grouped_sources = {}
- for unique_source in set(source["source_path"].name for source in sources):
- grouped_sources[unique_source.capitalize()] = [
- # case for files besides sqlite
- source if source["source_path"].suffix.lower() != ".sqlite"
- # if we have sqlite entries, update the source_path to the parent
- # (the parent table database file) as grouped key name will now
- # encapsulate the table name details.
- else {
- "source_path": source["source_path"].parent,
- "table_name": source["table_name"],
- }
- for source in sources
- # focus only on entries which include the unique_source name
- if source["source_path"].name == unique_source
- ]

+ # if we have no targets, create a single group inferred from a common prefix and suffix
+ # note: this may apply for scenarios where no compartments or metadata are
+ # provided as input to CytoTable operations.
+ if targets is None or targets == []:
+ # gather a common prefix to use for the group
+ common_prefix = os.path.commonprefix(
+ [
+ source["source_path"].stem
+ for source in sources
+ if source["source_path"].suffix == f".{source_datatype}"
+ ]
+ )
+ grouped_sources[f"{common_prefix}.{source_datatype}"] = sources
+
+ # otherwise, use the unique names in the paths to determine source grouping
+ else:
+ for unique_source in set(source["source_path"].name for source in sources):
+ grouped_sources[unique_source.capitalize()] = [
+ # case for files besides sqlite
+ source if source["source_path"].suffix.lower() != ".sqlite"
+ # if we have sqlite entries, update the source_path to the parent
+ # (the parent table database file) as grouped key name will now
+ # encapsulate the table name details.
+ else {
+ "source_path": source["source_path"].parent,
+ "table_name": source["table_name"],
+ }
+ for source in sources
+ # focus only on entries which include the unique_source name
+ if source["source_path"].name == unique_source
+ ]

return grouped_sources
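As a small illustration of the grouping key built above (file names are hypothetical), `os.path.commonprefix` compares strings character by character rather than by path component:

```python
import os

# hypothetical IN Carta-style export stems from a single scan
stems = ["plate1-scan-well-A01", "plate1-scan-well-B02"]
print(os.path.commonprefix(stems))  # 'plate1-scan-well-'
```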

@@ -190,7 +217,7 @@ def _infer_source_datatype(
raise DatatypeException(
(
f"Unable to find source datatype {source_datatype} "
"within files. Detected datatypes: {suffixes}"
f"within files. Detected datatypes: {suffixes}"
)
)

@@ -270,7 +297,9 @@ def _gather_sources(
source_path = _build_path(path=source_path, **kwargs)

# gather filepaths which will be used as the basis for this work
- sources = _get_source_filepaths(path=source_path, targets=targets)
+ sources = _get_source_filepaths(
+ path=source_path, targets=targets, source_datatype=source_datatype
+ )

# infer or validate the source datatype based on source filepaths
source_datatype = _infer_source_datatype(
9 changes: 7 additions & 2 deletions cytotable/utils.py
@@ -202,13 +202,18 @@ def _sqlite_mixed_type_query_to_parquet(
with sqlite3.connect(source_path) as conn:
cursor = conn.cursor()

- # gather table column details including datatype
+ # Gather table column details including datatype.
+ # Note: uses SQLite pragma for table information.
+ # See the following for more information:
+ # https://sqlite.org/pragma.html#pragma_table_info
cursor.execute(
f"""
SELECT :table_name as table_name,
name as column_name,
type as column_type
- FROM pragma_table_info(:table_name);
+ FROM pragma_table_info(:table_name)
+ /* explicit column ordering by 'cid' */
+ ORDER BY cid ASC;
""",
{"table_name": table_name},
)
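For reference, a self-contained sketch of the pragma usage above against an in-memory SQLite database (the table is hypothetical):

```python
import sqlite3

with sqlite3.connect(":memory:") as conn:
    conn.execute("CREATE TABLE cells (id INTEGER, area REAL)")
    cursor = conn.cursor()
    # pragma_table_info exposes column metadata; ordering by 'cid'
    # (the column id) keeps the result ordering deterministic
    cursor.execute(
        """
        SELECT name AS column_name, type AS column_type
        FROM pragma_table_info(:table_name)
        ORDER BY cid ASC;
        """,
        {"table_name": "cells"},
    )
    print(cursor.fetchall())  # [('id', 'INTEGER'), ('area', 'REAL')]
```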
9 changes: 8 additions & 1 deletion docs/source/_static/dataflow.mmd
@@ -6,20 +6,27 @@ flowchart LR
DeepProfiler
npz[(NPZ Files)]
cytominer-database
- sqlite[(SQLite File)]
+ sqlite[(SQLite Files)]
cp_sqlite[(SQLite File)]
in_carta[IN Carta]
ic_csv[(CSV files)]
pycytominer
CytoTable

images --> CellProfiler
images --> DeepProfiler
images --> in_carta
CellProfiler --> csv
CellProfiler --> cp_sqlite
DeepProfiler --> npz
csv --> cytominer-database
cytominer-database --> sqlite
in_carta --> ic_csv
csv --> CytoTable
npz --> CytoTable
sqlite --> CytoTable
cp_sqlite --> CytoTable
ic_csv --> CytoTable
CytoTable --> pycytominer

style CytoTable fill:#FDCA88,stroke:#D96026;
2 changes: 1 addition & 1 deletion docs/source/_static/dataflow.svg
(Binary diff for the regenerated SVG is not rendered in this view.)
9 changes: 9 additions & 0 deletions docs/source/overview.md
@@ -100,6 +100,15 @@ Data source compatibility for CytoTable is focused (but not explicitly limited t
* **Preset specification:** SQLite data sources from CellProfiler may use the configuration preset :code:`convert(..., preset="cellprofiler_sqlite", ...)` (:mod:`convert() <cytotable.convert.convert>`).
```

#### IN Carta Data Sources

- __Comma-separated values (.csv)__: [Molecular Devices IN Carta](https://www.moleculardevices.com/products/cellular-imaging-systems/high-content-analysis/in-carta-image-analysis-software) software provides output data in CSV format.

```{eval-rst}
* **Manual specification:** CSV data source types may be manually specified by using :code:`convert(..., source_datatype="csv", ...)` (:mod:`convert() <cytotable.convert.convert>`).
* **Preset specification:** CSV data sources from IN Carta Image Analysis Software may use the configuration preset :code:`convert(..., preset="in-carta", ...)` (:mod:`convert() <cytotable.convert.convert>`).
```
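For example, a minimal conversion sketch mirroring the options above (paths are illustrative):

```python
import cytotable

result = cytotable.convert(
    source_path="./in-carta-exports",  # directory of IN Carta .csv files
    dest_path="./in-carta-output",
    dest_datatype="parquet",
    # the in-carta preset defines no compartment or metadata names,
    # so the source datatype must be given explicitly
    source_datatype="csv",
    preset="in-carta",
    join=False,  # no joins are defined for this preset
)
```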

## Data Destinations

### Data Destination Locations
2 changes: 1 addition & 1 deletion readme.md
@@ -7,7 +7,7 @@ _Diagram showing data flow relative to this project._

## Summary

- CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+ CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), DeepProfiler (`.npz`), and other sources such as IN Carta (`.csv`) output data at scale.
CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).

9 changes: 9 additions & 0 deletions tests/conftest.py
@@ -134,6 +134,15 @@ def fixture_data_dirs_cytominerdatabase(data_dir_cytominerdatabase: str) -> List
]


@pytest.fixture(name="data_dirs_in_carta")
def fixture_data_dirs_in_carta() -> List[str]:
"""
Provide data directories for IN Carta test data
"""

return [f"{pathlib.Path(__file__).parent}/data/in-carta/colas-lab"]


@pytest.fixture(name="cytominerdatabase_sqlite")
def fixture_cytominerdatabase_sqlite(
fx_tempdir: str,
40 changes: 40 additions & 0 deletions tests/test_convert.py
@@ -1071,3 +1071,43 @@ def test_cell_health_cellprofiler_to_cytominer_database_legacy(
]
)
)


def test_in_carta_to_parquet(
load_parsl_default: None, fx_tempdir: str, data_dirs_in_carta: List[str]
):
"""
Testing IN Carta preset with CytoTable convert to parquet output.
"""

for data_dir in data_dirs_in_carta:
# read the directory of data with wildcard
with duckdb.connect() as ddb:
ddb_result = ddb.execute(
f"""
SELECT *
FROM read_csv_auto('{data_dir}/*.csv')
"""
).arrow()

# process the data with cytotable using in-carta preset
cytotable_result = convert(
source_path=data_dir,
dest_path=f"{fx_tempdir}/{pathlib.Path(data_dir).name}",
dest_datatype="parquet",
source_datatype="csv",
preset="in-carta",
join=False,
)

# read the result from CytoTable as a table
cytotable_result_table = parquet.read_table(
# note: we use cast here to explicitly tell mypy about the types involved
cast(list, cytotable_result[list(cast(dict, cytotable_result).keys())[0]])[
0
]["table"][0]
)

# check the data against one another
assert cytotable_result_table.schema.equals(ddb_result.schema)
assert cytotable_result_table.shape == ddb_result.shape
