diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c25e6787..48c64bae 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -33,7 +33,7 @@ repos:
hooks:
- id: yamllint
- repo: https://github.com/psf/black
- rev: 23.12.0
+ rev: 23.12.1
hooks:
- id: black
- repo: https://github.com/asottile/blacken-docs
@@ -55,7 +55,7 @@ repos:
hooks:
- id: vulture
- repo: https://github.com/pre-commit/mirrors-mypy
- rev: v1.7.1
+ rev: v1.8.0
hooks:
- id: mypy
- repo: https://github.com/PyCQA/pylint
diff --git a/cytotable/convert.py b/cytotable/convert.py
index ad083b80..098f040a 100644
--- a/cytotable/convert.py
+++ b/cytotable/convert.py
@@ -75,7 +75,9 @@ def _get_table_columns_and_types(source: Dict[str, Any]) -> List[Dict[str, str]]
segment_type as column_dtype
FROM pragma_storage_info('column_details')
/* avoid duplicate entries in the form of VALIDITY segment_types */
- WHERE segment_type != 'VALIDITY';
+ WHERE segment_type != 'VALIDITY'
+ /* explicitly order the columns by their id to avoid inconsistent results */
+ ORDER BY column_id ASC;
"""
# attempt to read the data to parquet from duckdb
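A minimal standalone sketch of the ordering fix above, assuming an in-memory DuckDB table standing in for `column_details`: `pragma_storage_info` emits one row per column segment, so without the `ORDER BY column_id` the rows may arrive in any order.

```python
import duckdb

# sketch: deterministic column listing via pragma_storage_info (hypothetical table)
with duckdb.connect() as ddb:
    ddb.execute("CREATE TABLE column_details AS SELECT 1 AS id, 'a' AS name")
    columns = ddb.execute(
        """
        SELECT column_name, segment_type AS column_dtype
        FROM pragma_storage_info('column_details')
        /* VALIDITY segments would duplicate each column */
        WHERE segment_type != 'VALIDITY'
        /* explicit ordering keeps results consistent across runs */
        ORDER BY column_id ASC;
        """
    ).fetchall()
```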
@@ -319,7 +321,7 @@ def _source_chunk_to_parquet(
select_columns = ",".join(
[
# here we cast the column to the specified type to ensure the column name remains the same
- f"CAST({column['column_name']} AS {column['column_dtype']}) AS {column['column_name']}"
+ f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\""
for column in source["columns"]
]
)
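Quoting matters here because IN Carta (added below in this changeset) emits column names containing spaces, such as `OBJECT ID`, which an unquoted `CAST` expression fails to parse. A small DuckDB sketch with a hypothetical table:

```python
import duckdb

with duckdb.connect() as ddb:
    # hypothetical IN Carta-style column name containing a space
    ddb.execute('CREATE TABLE t ("OBJECT ID" VARCHAR)')
    ddb.execute("INSERT INTO t VALUES ('42')")
    # double quotes keep the identifier intact while the type is cast
    rows = ddb.execute(
        'SELECT CAST("OBJECT ID" AS BIGINT) AS "OBJECT ID" FROM t'
    ).fetchall()
    assert rows == [(42,)]
```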
@@ -414,6 +416,7 @@ def _prepend_column_name(
Path to the modified file.
"""
+ import logging
import pathlib
import pyarrow.parquet as parquet
@@ -421,8 +424,20 @@ def _prepend_column_name(
from cytotable.constants import CYTOTABLE_ARROW_USE_MEMORY_MAPPING
from cytotable.utils import _write_parquet_table_with_metadata
+ logger = logging.getLogger(__name__)
+
targets = tuple(metadata) + tuple(compartments)
+ # if we have no compartments or metadata to work from, return the table path unchanged
+ if len(targets) == 0:
+ logger.warning(
+ msg=(
+ "Skipping column name prepend operations"
+ "because no compartments or metadata were provided."
+ )
+ )
+ return table_path
+
table = parquet.read_table(
source=table_path, memory_map=CYTOTABLE_ARROW_USE_MEMORY_MAPPING
)
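One subtlety in the warning above: adjacent Python string literals concatenate with no implicit separator, so the first literal needs a trailing space (fixed above). A quick illustration:

```python
# adjacent string literals join with no separator between them
broken = (
    "Skipping column name prepend operations"
    "because no compartments or metadata were provided."
)
fixed = (
    "Skipping column name prepend operations "
    "because no compartments or metadata were provided."
)
assert "operationsbecause" in broken
assert "operations because" in fixed
```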
@@ -569,6 +584,7 @@ def _concat_source_group(
Updated dictionary containing concatenated sources.
"""
+ import errno
import pathlib
import pyarrow as pa
@@ -649,7 +665,7 @@ def _concat_source_group(
pathlib.Path(pathlib.Path(source["table"][0]).parent).rmdir()
except OSError as os_err:
# raise only if the error is not a "directory not empty" errno
- if os_err.errno != 66:
+ if os_err.errno != errno.ENOTEMPTY:
raise
# return the concatted parquet filename
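The `errno` change matters for portability: the hard-coded 66 is the BSD/macOS value of `ENOTEMPTY`, while Linux uses 39. A minimal sketch of the portable pattern:

```python
import errno
import pathlib

def remove_dir_if_empty(path: pathlib.Path) -> None:
    # rmdir raises OSError when entries remain; compare against
    # errno.ENOTEMPTY (39 on Linux, 66 on macOS) rather than a
    # hard-coded value so the check works on every platform
    try:
        path.rmdir()
    except OSError as os_err:
        if os_err.errno != errno.ENOTEMPTY:
            raise
```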
diff --git a/cytotable/presets.py b/cytotable/presets.py
index 488fe6b1..4d8c7095 100644
--- a/cytotable/presets.py
+++ b/cytotable/presets.py
@@ -204,6 +204,34 @@
AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei
""",
},
+ "in-carta": {
+ # version specifications using related references
+ "CONFIG_SOURCE_VERSION": {
+ "in-carta": "v1.17.0412545",
+ },
+ # names of source table compartments (for ex. cells.csv, etc.)
+ "CONFIG_NAMES_COMPARTMENTS": tuple(),
+ # names of source table metadata (for ex. image.csv, etc.)
+ "CONFIG_NAMES_METADATA": tuple(),
+ # column names in any compartment or metadata tables which contain
+ # unique names to avoid renaming
+ "CONFIG_IDENTIFYING_COLUMNS": (
+ "OBJECT ID",
+ "Row",
+ "Column",
+ "FOV",
+ "WELL LABEL",
+ "Z",
+ "T",
+ ),
+ # chunk size to use for join operations to help with possible performance issues
+ # note: this number is an estimate and may need changes contingent on the data
+ # and system used by this library.
+ "CONFIG_CHUNK_SIZE": 1000,
+ # compartment and metadata joins performed using DuckDB SQL
+ # and modified at runtime as needed
+ "CONFIG_JOINS": "",
+ },
}
"""
Configuration presets for CytoTable
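Because the preset declares empty compartment and metadata tuples, conversions must pass an explicit `source_datatype` (see the sources.py guard below). A usage sketch mirroring the test added at the end of this changeset, with hypothetical paths:

```python
from cytotable import convert

result = convert(
    source_path="./in-carta-output",        # hypothetical input directory
    dest_path="./in-carta-output.parquet",  # hypothetical output path
    dest_datatype="parquet",
    source_datatype="csv",  # required: the preset names no compartments
    preset="in-carta",
    join=False,  # the preset ships an empty CONFIG_JOINS
)
```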
diff --git a/cytotable/sources.py b/cytotable/sources.py
index 2e81955e..87ec9578 100644
--- a/cytotable/sources.py
+++ b/cytotable/sources.py
@@ -47,6 +47,7 @@ def _build_path(
def _get_source_filepaths(
path: Union[pathlib.Path, AnyPath],
targets: List[str],
+ source_datatype: Optional[str] = None,
) -> Dict[str, List[Dict[str, Any]]]:
"""
Gather dataset of filepaths from a provided directory path.
@@ -56,19 +57,27 @@ def _get_source_filepaths(
Either a directory path to seek filepaths within or a path directly to a file.
targets: List[str]:
Compartment and metadata names to seek within the provided path.
+ source_datatype: Optional[str]: (Default value = None)
+ The source datatype (extension) to use for reading the tables.
Returns:
Dict[str, List[Dict[str, Any]]]
Data structure which groups related files based on the compartments.
"""
+ import os
import pathlib
from cloudpathlib import AnyPath
- from cytotable.exceptions import NoInputDataException
+ from cytotable.exceptions import DatatypeException, NoInputDataException
from cytotable.utils import _cache_cloudpath_to_local, _duckdb_reader
+ if (targets is None or targets == []) and source_datatype is None:
+ raise DatatypeException(
+ f"A source_datatype must be specified when using undefined compartments and metadata names."
+ )
+
# gathers files from provided path using compartments + metadata as a filter
sources = [
# build source_paths for all files
@@ -85,6 +94,7 @@ def _get_source_filepaths(
# ensure the subpaths meet certain specifications
if (
targets is None
+ or targets == []
# checks for name of the file from targets (compartment + metadata names)
or str(subpath.stem).lower() in [target.lower() for target in targets]
# checks for sqlite extension (which may include compartment + metadata names)
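The guard exists because the filename filter below passes every file through when `targets` is empty, so the extension is the only remaining way to scope the search. A condensed sketch of the check, reusing CytoTable's `DatatypeException`:

```python
from typing import List, Optional

from cytotable.exceptions import DatatypeException

def check_source_args(
    targets: Optional[List[str]], source_datatype: Optional[str]
) -> None:
    # with no compartment/metadata names to filter on, an explicit
    # extension is required to scope the file search
    if not targets and source_datatype is None:
        raise DatatypeException(
            "A source_datatype must be specified when no "
            "compartment or metadata names are provided."
        )
```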
@@ -134,21 +144,38 @@ def _get_source_filepaths(
# group files together by similar filename for later data operations
grouped_sources = {}
- for unique_source in set(source["source_path"].name for source in sources):
- grouped_sources[unique_source.capitalize()] = [
- # case for files besides sqlite
- source if source["source_path"].suffix.lower() != ".sqlite"
- # if we have sqlite entries, update the source_path to the parent
- # (the parent table database file) as grouped key name will now
- # encapsulate the table name details.
- else {
- "source_path": source["source_path"].parent,
- "table_name": source["table_name"],
- }
- for source in sources
- # focus only on entries which include the unique_source name
- if source["source_path"].name == unique_source
- ]
+
+ # if we have no targets, create a single group inferred from a common prefix and suffix
+ # note: this may apply for scenarios where no compartments or metadata are
+ # provided as input to CytoTable operations.
+ if targets is None or targets == []:
+ # gather a common prefix to use for the group
+ common_prefix = os.path.commonprefix(
+ [
+ source["source_path"].stem
+ for source in sources
+ if source["source_path"].suffix == f".{source_datatype}"
+ ]
+ )
+ grouped_sources[f"{common_prefix}.{source_datatype}"] = sources
+
+ # otherwise, use the unique names in the paths to determine source grouping
+ else:
+ for unique_source in set(source["source_path"].name for source in sources):
+ grouped_sources[unique_source.capitalize()] = [
+ # case for files besides sqlite
+ source if source["source_path"].suffix.lower() != ".sqlite"
+ # if we have sqlite entries, update the source_path to the parent
+ # (the parent table database file) as grouped key name will now
+ # encapsulate the table name details.
+ else {
+ "source_path": source["source_path"].parent,
+ "table_name": source["table_name"],
+ }
+ for source in sources
+ # focus only on entries which include the unique_source name
+ if source["source_path"].name == unique_source
+ ]
return grouped_sources
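A minimal sketch of the no-targets grouping: `os.path.commonprefix` compares the stems character by character, so the single group key becomes the longest shared filename prefix plus the requested extension (hypothetical stems below):

```python
import os

# hypothetical CSV stems from one IN Carta export
stems = ["scan_plate1_wellA", "scan_plate1_wellB", "scan_plate1_wellC"]
group_key = f"{os.path.commonprefix(stems)}.csv"  # -> "scan_plate1_well.csv"
```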
@@ -190,7 +217,7 @@ def _infer_source_datatype(
raise DatatypeException(
(
f"Unable to find source datatype {source_datatype} "
- "within files. Detected datatypes: {suffixes}"
+ f"within files. Detected datatypes: {suffixes}"
)
)
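The one-character fix above matters because only literals carrying the `f` prefix are interpolated; without it the braces are emitted verbatim. A quick illustration with hypothetical values:

```python
suffixes = [".csv", ".sqlite"]

plain = "Detected datatypes: {suffixes}"
interpolated = f"Detected datatypes: {suffixes}"

assert plain == "Detected datatypes: {suffixes}"
assert interpolated == "Detected datatypes: ['.csv', '.sqlite']"
```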
@@ -270,7 +297,9 @@ def _gather_sources(
source_path = _build_path(path=source_path, **kwargs)
# gather filepaths which will be used as the basis for this work
- sources = _get_source_filepaths(path=source_path, targets=targets)
+ sources = _get_source_filepaths(
+ path=source_path, targets=targets, source_datatype=source_datatype
+ )
# infer or validate the source datatype based on source filepaths
source_datatype = _infer_source_datatype(
diff --git a/cytotable/utils.py b/cytotable/utils.py
index 8f317b0c..9789f42e 100644
--- a/cytotable/utils.py
+++ b/cytotable/utils.py
@@ -202,13 +202,18 @@ def _sqlite_mixed_type_query_to_parquet(
with sqlite3.connect(source_path) as conn:
cursor = conn.cursor()
- # gather table column details including datatype
+ # Gather table column details including datatype.
+ # Note: uses SQLite pragma for table information.
+ # See the following for more information:
+ # https://sqlite.org/pragma.html#pragma_table_info
cursor.execute(
f"""
SELECT :table_name as table_name,
name as column_name,
type as column_type
- FROM pragma_table_info(:table_name);
+ FROM pragma_table_info(:table_name)
+ /* explicit column ordering by 'cid' */
+ ORDER BY cid ASC;
""",
{"table_name": table_name},
)
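A standalone sketch of the same ordering fix for SQLite: `pragma_table_info` exposes each column's declared position as `cid`, so ordering by it pins the column sequence (hypothetical table):

```python
import sqlite3

with sqlite3.connect(":memory:") as conn:
    conn.execute("CREATE TABLE cells (id INTEGER, name TEXT)")
    cols = conn.execute(
        """
        SELECT name AS column_name, type AS column_type
        FROM pragma_table_info('cells')
        /* explicit column ordering by 'cid' */
        ORDER BY cid ASC;
        """
    ).fetchall()
    assert cols == [("id", "INTEGER"), ("name", "TEXT")]
```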
diff --git a/docs/source/_static/dataflow.mmd b/docs/source/_static/dataflow.mmd
index bdd06f58..eb0159e9 100644
--- a/docs/source/_static/dataflow.mmd
+++ b/docs/source/_static/dataflow.mmd
@@ -6,20 +6,27 @@ flowchart LR
DeepProfiler
npz[(NPZ Files)]
cytominer-database
- sqlite[(SQLite File)]
+ sqlite[(SQLite Files)]
cp_sqlite[(SQLite File)]
+ in_carta[IN Carta]
+ ic_csv[(CSV files)]
pycytominer
CytoTable
images --> CellProfiler
images --> DeepProfiler
+ images --> in_carta
CellProfiler --> csv
CellProfiler --> cp_sqlite
DeepProfiler --> npz
csv --> cytominer-database
cytominer-database --> sqlite
+ in_carta --> ic_csv
csv --> CytoTable
npz --> CytoTable
sqlite --> CytoTable
cp_sqlite --> CytoTable
+ ic_csv --> CytoTable
CytoTable --> pycytominer
+
+ style CytoTable fill:#FDCA88,stroke:#D96026;
diff --git a/docs/source/_static/dataflow.svg b/docs/source/_static/dataflow.svg
index 387b37ac..ac86ca67 100644
--- a/docs/source/_static/dataflow.svg
+++ b/docs/source/_static/dataflow.svg
@@ -1 +1 @@
-
+
diff --git a/docs/source/overview.md b/docs/source/overview.md
index f7ca3862..5aacc2e5 100644
--- a/docs/source/overview.md
+++ b/docs/source/overview.md
@@ -100,6 +100,15 @@ Data source compatibility for CytoTable is focused (but not explicitly limited t
* **Preset specification:** SQLite data sources from CellProfiler may use the configuration preset :code:`convert(..., preset="cellprofiler_sqlite", ...)` (:mod:`convert() `).
```
+#### IN Carta Data Sources
+
+- __Comma-separated values (.csv)__: [Molecular Devices IN Carta](https://www.moleculardevices.com/products/cellular-imaging-systems/high-content-analysis/in-carta-image-analysis-software) software provides output data in CSV format.
+
+```{eval-rst}
+ * **Manual specification:** CSV data source types may be manually specified by using :code:`convert(..., source_datatype="csv", ...)` (:mod:`convert() `).
+ * **Preset specification:** CSV data sources from IN Carta Image Analysis Software may use the configuration preset :code:`convert(..., preset="in-carta", ...)` (:mod:`convert() `).
+```
+
## Data Destinations
### Data Destination Locations
diff --git a/readme.md b/readme.md
index dff87fa2..4cb161c9 100644
--- a/readme.md
+++ b/readme.md
@@ -7,7 +7,7 @@ _Diagram showing data flow relative to this project._
## Summary
-CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), and DeepProfiler (`.npz`) output data at scale.
+CytoTable enables single-cell morphology data analysis by cleaning and transforming CellProfiler (`.csv` or `.sqlite`), cytominer-database (`.sqlite`), DeepProfiler (`.npz`), and other sources such as IN Carta (`.csv`) output data at scale.
CytoTable creates parquet files for both independent analysis and for input into [Pycytominer](https://github.com/cytomining/pycytominer).
The Parquet files will have a unified and documented data model, including referenceable schema where appropriate (for validation within Pycytominer or other projects).
diff --git a/tests/conftest.py b/tests/conftest.py
index b71290a8..0350ff99 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -134,6 +134,15 @@ def fixture_data_dirs_cytominerdatabase(data_dir_cytominerdatabase: str) -> List
]
+@pytest.fixture(name="data_dirs_in_carta")
+def fixture_data_dirs_in_carta() -> List[str]:
+ """
+ Provide data directories for IN Carta test data
+ """
+
+ return [f"{pathlib.Path(__file__).parent}/data/in-carta/colas-lab"]
+
+
@pytest.fixture(name="cytominerdatabase_sqlite")
def fixture_cytominerdatabase_sqlite(
fx_tempdir: str,
diff --git a/tests/test_convert.py b/tests/test_convert.py
index 8572fd29..8b877dfc 100644
--- a/tests/test_convert.py
+++ b/tests/test_convert.py
@@ -1071,3 +1071,43 @@ def test_cell_health_cellprofiler_to_cytominer_database_legacy(
]
)
)
+
+
+def test_in_carta_to_parquet(
+ load_parsl_default: None, fx_tempdir: str, data_dirs_in_carta: List[str]
+):
+ """
+ Testing IN Carta preset with CytoTable convert to parquet output.
+ """
+
+ for data_dir in data_dirs_in_carta:
+ # read the directory of data with a wildcard
+ with duckdb.connect() as ddb:
+ ddb_result = ddb.execute(
+ f"""
+ SELECT *
+ FROM read_csv_auto('{data_dir}/*.csv')
+ """
+ ).arrow()
+
+ # process the data with cytotable using in-carta preset
+ cytotable_result = convert(
+ source_path=data_dir,
+ dest_path=f"{fx_tempdir}/{pathlib.Path(data_dir).name}",
+ dest_datatype="parquet",
+ source_datatype="csv",
+ preset="in-carta",
+ join=False,
+ )
+
+ # read the result from CytoTable as a table
+ cytotable_result_table = parquet.read_table(
+ # note: we use cast here to explicitly tell mypy about the types involved
+ cast(list, cytotable_result[list(cast(dict, cytotable_result).keys())[0]])[
+ 0
+ ]["table"][0]
+ )
+
+ # check the data against one another
+ assert cytotable_result_table.schema.equals(ddb_result.schema)
+ assert cytotable_result_table.shape == ddb_result.shape