From 768d79e5a42f3fff3638c7e379bfbe1f9f809228 Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 12 Apr 2024 09:53:14 -0600 Subject: [PATCH 01/12] initial rework of tablenumber efforts --- cytotable/convert.py | 104 ++++++++++++++++++++++++++++++++++++++++++- cytotable/utils.py | 63 +++++++++++++++++++++++--- 2 files changed, 160 insertions(+), 7 deletions(-) diff --git a/cytotable/convert.py b/cytotable/convert.py index 4064526a..e73f0d62 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -174,6 +174,86 @@ def _prep_cast_column_data_types( return columns +@python_app +def _set_tablenumber( + sources: Dict[str, List[Dict[str, Any]]], + add_tablenumber: Optional[bool] = None, +) -> Dict[str, List[Dict[str, Any]]]: + """ + Gathers a "TableNumber" for the table which is a unique identifier intended + to help differentiate between imagenumbers to create distinct results. + + Note: + - If using CSV data sources, the image.csv table is used for checksum. + - If using SQLite data sources, the entire SQLite database is used for checksum. + + Args: + sources: Dict[str, List[Dict[str, Any]]] + Contains metadata about data tables and related contents. + add_tablenumber: Optional[bool] + Whether to add a calculated tablenumber. + Note: when False, adds None as the tablenumber + + Returns: + List[Dict[str, Any]] + New source group with added TableNumber details. 
+ """ + + from cloudpathlib import AnyPath + + from cytotable.utils import _gather_tablenumber_checksum + + print(sources) + + # determine if we need to add tablenumber data + if add_tablenumber is None: + pass + + # if we're configured not to add tablenumber, add None + if not add_tablenumber: + return { + source_group_name: [ + dict( + source, + **{ + "tablenumber": None, + }, + ) + for source in source_group_vals + ] + for source_group_name, source_group_vals in sources.items() + } + + # gather the image table from the source_group + tablenumber_table = { + # create a data structure with the common parent for each dataset + # and the calculated checksum from the image table + str(source["source_path"].parent): _gather_tablenumber_checksum( + source["source_path"] + ) + for source_group_name, source_group_vals in sources.items() + # use the image tables references only for the basis of the + # these calculations. + if any( + value in str(AnyPath(source_group_name).stem).lower() + for value in ["image", "per_image"] + ) + for source in source_group_vals + } + + # return a modified sources data structure with the table number added + return { + source_group_name: [ + dict( + source, + **{"tablenumber": tablenumber_table[str(source["source_path"].parent)]}, + ) + for source in source_group_vals + ] + for source_group_name, source_group_vals in sources.items() + } + + @python_app def _get_table_chunk_offsets( chunk_size: int, @@ -316,8 +396,19 @@ def _source_chunk_to_parquet( ) pathlib.Path(source_dest_path).mkdir(parents=True, exist_ok=True) + # build tablenumber segment addition (if necessary) + tablenumber_sql = ( + # to become tablenumber in sql select later with bigint (8-byte integer) + # we cast here to bigint to avoid concat or join conflicts later due to + # misaligned automatic data typing. 
+ f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, " + if source["tablenumber"] is not None + # if we don't have a tablenumber value, don't introduce the column + else "" + ) + # build the column selection block of query - select_columns = ",".join( + select_columns = tablenumber_sql + ",".join( [ # here we cast the column to the specified type ensure the colname remains the same f"CAST(\"{column['column_name']}\" AS {column['column_dtype']}) AS \"{column['column_name']}\"" @@ -372,6 +463,7 @@ def _source_chunk_to_parquet( table_name=str(source["table_name"]), chunk_size=chunk_size, offset=offset, + tablenumber=source["tablenumber"], ), where=result_filepath, ) @@ -992,6 +1084,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals infer_common_schema: bool, drop_null: bool, data_type_cast_map: Optional[Dict[str, str]] = None, + add_tablenumber: Optional[bool] = None, **kwargs, ) -> Union[Dict[str, List[Dict[str, Any]]], str]: """ @@ -1118,6 +1211,11 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals for source_group_name, source_group_vals in invalid_files_dropped.items() } + # add tablenumber details, appending None if not add_tablenumber + tablenumber_prepared = _set_tablenumber( + sources=column_names_and_types_gathered, add_tablenumber=add_tablenumber + ).result() + results = { source_group_name: [ dict( @@ -1145,7 +1243,7 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals ) for source in source_group_vals ] - for source_group_name, source_group_vals in column_names_and_types_gathered.items() + for source_group_name, source_group_vals in tablenumber_prepared.items() } # if we're concatting or joining and need to infer the common schema @@ -1233,6 +1331,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals infer_common_schema: bool = True, drop_null: bool = False, data_type_cast_map: Optional[Dict[str, str]] = None, + add_tablenumber: Optional[bool] = None, preset: 
Optional[str] = "cellprofiler_csv", parsl_config: Optional[parsl.Config] = None, **kwargs, @@ -1390,6 +1489,7 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals infer_common_schema=infer_common_schema, drop_null=drop_null, data_type_cast_map=data_type_cast_map, + add_tablenumber=add_tablenumber, **kwargs, ).result() diff --git a/cytotable/utils.py b/cytotable/utils.py index 9789f42e..98106a90 100644 --- a/cytotable/utils.py +++ b/cytotable/utils.py @@ -171,6 +171,7 @@ def _sqlite_mixed_type_query_to_parquet( table_name: str, chunk_size: int, offset: int, + tablenumber: Optional[int] = None, ) -> str: """ Performs SQLite table data extraction where one or many @@ -241,9 +242,19 @@ def _sqlite_affinity_data_type_lookup(col_type: str) -> str: # return the translated type for use in SQLite return translated_type[0] + # build tablenumber segment addition (if necessary) + tablenumber_sql = ( + # to become tablenumber in sql select later with integer + f"CAST({tablenumber} AS INTEGER) as TableNumber, " + if tablenumber is not None + # if we don't have a tablenumber value, don't introduce the column + else "" + ) + # create cases for mixed-type handling in each column discovered above - query_parts = [ - f""" + query_parts = tablenumber_sql + ", ".join( + [ + f""" CASE /* when the storage class type doesn't match the column, return nulltype */ WHEN typeof({col['column_name']}) != @@ -252,12 +263,13 @@ def _sqlite_affinity_data_type_lookup(col_type: str) -> str: ELSE {col['column_name']} END AS {col['column_name']} """ - for col in column_info - ] + for col in column_info + ] + ) # perform the select using the cases built above and using chunksize + offset cursor.execute( - f'SELECT {", ".join(query_parts)} FROM {table_name} LIMIT {chunk_size} OFFSET {offset};' + f"SELECT {query_parts} FROM {table_name} LIMIT {chunk_size} OFFSET {offset};" ) # collect the results and include the column name with values results = [ @@ -457,3 +469,44 @@ def 
_write_parquet_table_with_metadata(table: pa.Table, **kwargs) -> None: ), **kwargs, ) + + +def _gather_tablenumber_checksum(pathname: str, buffer_size: int = 1048576) -> int: + """ + Build and return a checksum for use as a unique identifier across datasets + referenced from cytominer-database: + https://github.com/cytomining/cytominer-database/blob/master/cytominer_database/ingest_variable_engine.py#L129 + + Args: + pathname: str: + A path to a file with which to generate the checksum on. + buffer_size: int: + Buffer size to use for reading data. + + Returns: + int + an integer representing the checksum of the pathname file. + """ + + import os + import zlib + + # check whether the buffer size is larger than the file_size + file_size = os.path.getsize(pathname) + if file_size < buffer_size: + buffer_size = file_size + + # open file + with open(str(pathname), "rb") as stream: + # begin result formation + result = zlib.crc32(bytes(0)) + while True: + # read data from stream using buffer size + buffer = stream.read(buffer_size) + if not buffer: + # if we have no more data to use, break while loop + break + # use buffer read data to form checksum + result = zlib.crc32(buffer, result) + + return result & 0xFFFFFFFF From a32f12c1a9502140f1d2863da22251f8a37d94ee Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 12 Apr 2024 13:32:22 -0600 Subject: [PATCH 02/12] enhance tablenumber work --- cytotable/convert.py | 49 +++++++++++++++++++++++++++++--------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/cytotable/convert.py b/cytotable/convert.py index e73f0d62..d3919fc4 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -203,14 +203,37 @@ def _set_tablenumber( from cytotable.utils import _gather_tablenumber_checksum - print(sources) + image_table_groups = { + # create a data structure with the common parent for each dataset + # and the calculated checksum from the image table. 
+ # note: the source_path parent is used for non-SQLite files + # whereas the direct source path is used for SQLite files. + ( + str(source["source_path"].parent) + if source["source_path"].suffix != "sqlite" + else source["source_path"] + ): source["source_path"] + for source_group_name, source_group_vals in sources.items() + # use the image tables references only for the basis of the + # these calculations. + if any( + value in str(AnyPath(source_group_name).stem).lower() + for value in ["image", "per_image"] + ) + for source in source_group_vals + } # determine if we need to add tablenumber data - if add_tablenumber is None: - pass + if ( + # case for detecting multiple image tables which need to be differentiated + add_tablenumber is None + and (len(image_table_groups) <= 1) + ) or ( + # case for explicitly set no tablenumbers + add_tablenumber + is False + ): - # if we're configured not to add tablenumber, add None - if not add_tablenumber: return { source_group_name: [ dict( @@ -228,20 +251,11 @@ def _set_tablenumber( tablenumber_table = { # create a data structure with the common parent for each dataset # and the calculated checksum from the image table - str(source["source_path"].parent): _gather_tablenumber_checksum( - source["source_path"] - ) - for source_group_name, source_group_vals in sources.items() - # use the image tables references only for the basis of the - # these calculations. 
- if any( - value in str(AnyPath(source_group_name).stem).lower() - for value in ["image", "per_image"] - ) - for source in source_group_vals + group: _gather_tablenumber_checksum(path) + for group, path in image_table_groups.items() } - # return a modified sources data structure with the table number added + # return a modified sources data structure with the tablenumber added return { source_group_name: [ dict( @@ -249,6 +263,7 @@ def _set_tablenumber( **{"tablenumber": tablenumber_table[str(source["source_path"].parent)]}, ) for source in source_group_vals + if str(source["source_path"].parent) in list(tablenumber_table.keys()) ] for source_group_name, source_group_vals in sources.items() } From 52c3544f16e2da822412f638f4faa1550e393b5f Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 12 Apr 2024 13:32:41 -0600 Subject: [PATCH 03/12] add manual join to compare during tests --- tests/conftest.py | 118 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 111 insertions(+), 7 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index b11ceeb8..c96922a4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -208,6 +208,98 @@ def cytominerdatabase_to_pycytominer_merge_single_cells_parquet( return output_paths +@pytest.fixture() +def cytominerdatabase_to_manual_join_parquet( + fx_tempdir: str, + cytominerdatabase_sqlite: List[str], +) -> List[str]: + """ + Processed cytominer-database test sqlite data as + pycytominer merged single cell parquet files + """ + + output_paths = [] + for sqlite_file in cytominerdatabase_sqlite: + destination_path = ( + f"{fx_tempdir}/manual_join.{pathlib.Path(sqlite_file).name}.parquet" + ) + df_cytominerdatabase = ( + pd.read_sql( + sql=""" + WITH Image_Filtered AS ( + SELECT + TableNumber, + ImageNumber + FROM + Image + ), + /* gather unique objectnumber column names from each + compartment so as to retain differentiation */ + Cytoplasm_renamed AS ( + SELECT + ObjectNumber AS Cytoplasm_ObjectNumber, + * + FROM 
Cytoplasm + ), + Cells_renamed AS ( + SELECT + ObjectNumber AS Cells_ObjectNumber, + * + FROM Cells + ), + Nuclei_renamed AS ( + SELECT + ObjectNumber AS Nuclei_ObjectNumber, + * + FROM Nuclei + ) + SELECT * + FROM Cytoplasm_renamed cytoplasm + LEFT JOIN Cells_renamed cells ON + cells.ImageNumber = cytoplasm.ImageNumber + AND cells.TableNumber = cytoplasm.TableNumber + AND cells.Cells_Number_Object_Number = cytoplasm.Cytoplasm_Parent_Cells + LEFT JOIN Nuclei_renamed nuclei ON + nuclei.ImageNumber = cytoplasm.ImageNumber + AND nuclei.TableNumber = cytoplasm.TableNumber + AND nuclei.Nuclei_Number_Object_Number = cytoplasm.Cytoplasm_Parent_Nuclei + LEFT JOIN Image_Filtered image ON + image.ImageNumber = cytoplasm.ImageNumber + AND image.TableNumber = cytoplasm.TableNumber + """, + con=sqlite_file, + ) + # replacing 'nan' strings with None + .replace(to_replace="nan", value=None) + # renaming columns as appropriate + .rename( + columns={ + "ImageNumber": "Metadata_ImageNumber", + "TableNumber": "Metadata_TableNumber", + "Cytoplasm_Parent_Cells": "Metadata_Cytoplasm_Parent_Cells", + "Cytoplasm_Parent_Nuclei": "Metadata_Cytoplasm_Parent_Nuclei", + "Cells_Parent_Nuclei": "Metadata_Cells_Parent_Nuclei", + } + # drop generic objectnumber column gathered from each compartment + # (we'll rely on the compartment prefixed name instead for comparisons) + ).drop(columns="ObjectNumber") + ) + + # drop duplicate column names + df_cytominerdatabase = df_cytominerdatabase.loc[ + :, ~df_cytominerdatabase.columns.duplicated() + ].copy() + + # sort the columns and export to parquet + df_cytominerdatabase[ + sorted(sorted(df_cytominerdatabase.columns.tolist()), key=_column_sort) + ].to_parquet(destination_path) + + output_paths.append(destination_path) + + return output_paths + + @pytest.fixture(name="example_tables") def fixture_example_tables() -> Tuple[pa.Table, ...]: """ @@ -356,13 +448,25 @@ def col_renames(name: str, table: pa.Table): """ return table.rename_columns( [ - 
f"Metadata_{colname}" - if colname in ["ImageNumber", "ObjectNumber"] - else f"Metadata_{name}_{colname}" - if any(name in colname for name in ["Parent_Cells", "Parent_Nuclei"]) - else f"{name}_{colname}" - if not (colname.startswith(name) or colname.startswith("Metadata_")) - else colname + ( + f"Metadata_{colname}" + if colname in ["ImageNumber", "ObjectNumber"] + else ( + f"Metadata_{name}_{colname}" + if any( + name in colname + for name in ["Parent_Cells", "Parent_Nuclei"] + ) + else ( + f"{name}_{colname}" + if not ( + colname.startswith(name) + or colname.startswith("Metadata_") + ) + else colname + ) + ) + ) for colname in table.column_names ] ) From 4cb0927f195ac34cb4b5aafa5406695174f1e4a9 Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 12 Apr 2024 14:50:49 -0600 Subject: [PATCH 04/12] linting and threaded test --- cytotable/convert.py | 1 - tests/test_convert_threaded.py | 75 ++++++++++++++++++++++++++++++++++ 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/cytotable/convert.py b/cytotable/convert.py index d3919fc4..4d99cae5 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -233,7 +233,6 @@ def _set_tablenumber( add_tablenumber is False ): - return { source_group_name: [ dict( diff --git a/tests/test_convert_threaded.py b/tests/test_convert_threaded.py index b62d1370..93d7e3cc 100644 --- a/tests/test_convert_threaded.py +++ b/tests/test_convert_threaded.py @@ -10,6 +10,7 @@ import parsl import pyarrow as pa +import pyarrow.compute as pc import pytest from parsl.config import Config from parsl.executors import ThreadPoolExecutor @@ -246,3 +247,77 @@ def test_get_source_filepaths( ).result() # test that the single dir structure includes 4 unique keys assert len(set(single_dir_result.keys())) == 4 + + +def test_gather_tablenumber( + load_parsl_threaded: None, + fx_tempdir: str, + data_dirs_cytominerdatabase: List[str], + cytominerdatabase_to_manual_join_parquet: List[str], +): + """ + Tests _gather_tablenumber + """ + + for 
unprocessed_cytominerdatabase, processed_cytominerdatabase in zip( + data_dirs_cytominerdatabase, cytominerdatabase_to_manual_join_parquet + ): + test_table = parquet.read_table( + source=convert( + source_path=unprocessed_cytominerdatabase, + dest_path=( + f"{fx_tempdir}/{pathlib.Path(unprocessed_cytominerdatabase).name}.test_table.parquet" + ), + dest_datatype="parquet", + source_datatype="csv", + join=True, + joins=""" + WITH Image_Filtered AS ( + SELECT + Metadata_TableNumber, + Metadata_ImageNumber + FROM + read_parquet('image.parquet') + ) + SELECT + * + FROM + read_parquet('cytoplasm.parquet') AS cytoplasm + LEFT JOIN read_parquet('cells.parquet') AS cells ON + cells.Metadata_TableNumber = cytoplasm.Metadata_TableNumber + AND cells.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber + AND cells.Cells_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Cells + LEFT JOIN read_parquet('nuclei.parquet') AS nuclei ON + nuclei.Metadata_TableNumber = cytoplasm.Metadata_TableNumber + AND nuclei.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber + AND nuclei.Nuclei_ObjectNumber = cytoplasm.Metadata_Cytoplasm_Parent_Nuclei + LEFT JOIN Image_Filtered AS image ON + image.Metadata_TableNumber = cytoplasm.Metadata_TableNumber + AND image.Metadata_ImageNumber = cytoplasm.Metadata_ImageNumber + """, + preset="cell-health-cellprofiler-to-cytominer-database", + ) + ) + control_table = parquet.read_table(source=processed_cytominerdatabase) + + # test_unique_tablenumbers = pc.unique(test_table["Metadata_TableNumber"]) + control_unique_tablenumbers = pc.unique(control_table["Metadata_TableNumber"]) + + assert ( + test_table.filter( + # we use only those tablenumbers which appear in cytominer-database related results + # to help compare. 
CytoTable only removes datasets which have no image table whereas + # cytominer-database removes any dataset which has no image table or problematic + # compartment tables (any compartment table with errors triggers the entire dataset + # being removed). + pc.field("Metadata_TableNumber").isin(control_unique_tablenumbers) + ) + .sort_by([(name, "ascending") for name in test_table.column_names]) + .equals( + control_table.sort_by( + [(name, "ascending") for name in control_table.column_names] + ) + ) + ) + + assert False From f3f171ebb2c1c8231678094cb4675b059f0334b9 Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 18 Oct 2024 16:11:18 -0600 Subject: [PATCH 05/12] add list dep --- tests/test_convert_threaded.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/test_convert_threaded.py b/tests/test_convert_threaded.py index 638e10d6..0676184f 100644 --- a/tests/test_convert_threaded.py +++ b/tests/test_convert_threaded.py @@ -6,6 +6,7 @@ import pathlib +from typing import List import pandas as pd import pyarrow as pa @@ -212,8 +213,12 @@ def test_gather_tablenumber( ) control_table = parquet.read_table(source=processed_cytominerdatabase) - # test_unique_tablenumbers = pc.unique(test_table["Metadata_TableNumber"]) + test_unique_tablenumbers = pc.unique(test_table["Metadata_TableNumber"]) + print(test_unique_tablenumbers) control_unique_tablenumbers = pc.unique(control_table["Metadata_TableNumber"]) + print(control_unique_tablenumbers) + print(test_table.column_names) + print(control_table.column_names) assert ( test_table.filter( From dbe77b0b166777eb6b253302dd7c6d5416a4e452 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 23 Oct 2024 16:41:02 -0600 Subject: [PATCH 06/12] fix htex tests --- cytotable/convert.py | 3 +- cytotable/utils.py | 8 ++- poetry.lock | 97 ++-------------------------------- tests/test_convert_threaded.py | 2 + 4 files changed, 16 insertions(+), 94 deletions(-) diff --git a/cytotable/convert.py b/cytotable/convert.py index 
59bf9746..ef37448a 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -1246,7 +1246,8 @@ def _to_parquet( # pylint: disable=too-many-arguments, too-many-locals # add tablenumber details, appending None if not add_tablenumber tablenumber_prepared = _set_tablenumber( - sources=column_names_and_types_gathered, add_tablenumber=add_tablenumber + sources=evaluate_futures(column_names_and_types_gathered), + add_tablenumber=add_tablenumber, ).result() results = { diff --git a/cytotable/utils.py b/cytotable/utils.py index c711cd52..377144bb 100644 --- a/cytotable/utils.py +++ b/cytotable/utils.py @@ -182,6 +182,7 @@ def _sqlite_mixed_type_query_to_parquet( page_key: str, pageset: Tuple[Union[int, float], Union[int, float]], sort_output: bool, + tablenumber: Optional[int] = None, ) -> str: """ Performs SQLite table data extraction where one or many @@ -201,6 +202,9 @@ def _sqlite_mixed_type_query_to_parquet( Specifies whether to sort cytotable output or not. add_cytotable_meta: bool, default=False: Whether to add CytoTable metadata fields or not + tablenumber: Optional[int], default=None: + An optional table number to append to the results. + Defaults to None. 
Returns: pyarrow.Table: @@ -284,7 +288,7 @@ def _sqlite_affinity_data_type_lookup(col_type: str) -> str: # perform the select using the cases built above and using chunksize + offset sql_stmt = f""" SELECT - {', '.join(query_parts)} + {query_parts} FROM {table_name} WHERE {page_key} BETWEEN {pageset[0]} AND {pageset[1]} {"ORDER BY " + page_key if sort_output else ""}; @@ -532,6 +536,8 @@ def _gather_tablenumber_checksum(pathname: str, buffer_size: int = 1048576) -> i result = zlib.crc32(buffer, result) return result & 0xFFFFFFFF + + def _unwrap_value(val: Union[parsl.dataflow.futures.AppFuture, Any]) -> Any: """ Helper function to unwrap futures from values or return values diff --git a/poetry.lock b/poetry.lock index 3b120058..9356592e 100644 --- a/poetry.lock +++ b/poetry.lock @@ -171,46 +171,6 @@ files = [ {file = "backports.weakref-1.0.post1.tar.gz", hash = "sha256:bc4170a29915f8b22c9e7c4939701859650f2eb84184aee80da329ac0b9825c2"}, ] -[[package]] -name = "bcrypt" -version = "4.1.2" -description = "Modern password hashing for your software and your servers" -optional = false -python-versions = ">=3.7" -files = [ - {file = "bcrypt-4.1.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:ac621c093edb28200728a9cca214d7e838529e557027ef0581685909acd28b5e"}, - {file = "bcrypt-4.1.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea505c97a5c465ab8c3ba75c0805a102ce526695cd6818c6de3b1a38f6f60da1"}, - {file = "bcrypt-4.1.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57fa9442758da926ed33a91644649d3e340a71e2d0a5a8de064fb621fd5a3326"}, - {file = "bcrypt-4.1.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:eb3bd3321517916696233b5e0c67fd7d6281f0ef48e66812db35fc963a422a1c"}, - {file = "bcrypt-4.1.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:6cad43d8c63f34b26aef462b6f5e44fdcf9860b723d2453b5d391258c4c8e966"}, - {file = "bcrypt-4.1.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = 
"sha256:44290ccc827d3a24604f2c8bcd00d0da349e336e6503656cb8192133e27335e2"}, - {file = "bcrypt-4.1.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:732b3920a08eacf12f93e6b04ea276c489f1c8fb49344f564cca2adb663b3e4c"}, - {file = "bcrypt-4.1.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:1c28973decf4e0e69cee78c68e30a523be441972c826703bb93099868a8ff5b5"}, - {file = "bcrypt-4.1.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:b8df79979c5bae07f1db22dcc49cc5bccf08a0380ca5c6f391cbb5790355c0b0"}, - {file = "bcrypt-4.1.2-cp37-abi3-win32.whl", hash = "sha256:fbe188b878313d01b7718390f31528be4010fed1faa798c5a1d0469c9c48c369"}, - {file = "bcrypt-4.1.2-cp37-abi3-win_amd64.whl", hash = "sha256:9800ae5bd5077b13725e2e3934aa3c9c37e49d3ea3d06318010aa40f54c63551"}, - {file = "bcrypt-4.1.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:71b8be82bc46cedd61a9f4ccb6c1a493211d031415a34adde3669ee1b0afbb63"}, - {file = "bcrypt-4.1.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:68e3c6642077b0c8092580c819c1684161262b2e30c4f45deb000c38947bf483"}, - {file = "bcrypt-4.1.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:387e7e1af9a4dd636b9505a465032f2f5cb8e61ba1120e79a0e1cd0b512f3dfc"}, - {file = "bcrypt-4.1.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:f70d9c61f9c4ca7d57f3bfe88a5ccf62546ffbadf3681bb1e268d9d2e41c91a7"}, - {file = "bcrypt-4.1.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2a298db2a8ab20056120b45e86c00a0a5eb50ec4075b6142db35f593b97cb3fb"}, - {file = "bcrypt-4.1.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ba55e40de38a24e2d78d34c2d36d6e864f93e0d79d0b6ce915e4335aa81d01b1"}, - {file = "bcrypt-4.1.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:3566a88234e8de2ccae31968127b0ecccbb4cddb629da744165db72b58d88ca4"}, - {file = "bcrypt-4.1.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b90e216dc36864ae7132cb151ffe95155a37a14e0de3a8f64b49655dd959ff9c"}, - {file = 
"bcrypt-4.1.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:69057b9fc5093ea1ab00dd24ede891f3e5e65bee040395fb1e66ee196f9c9b4a"}, - {file = "bcrypt-4.1.2-cp39-abi3-win32.whl", hash = "sha256:02d9ef8915f72dd6daaef40e0baeef8a017ce624369f09754baf32bb32dba25f"}, - {file = "bcrypt-4.1.2-cp39-abi3-win_amd64.whl", hash = "sha256:be3ab1071662f6065899fe08428e45c16aa36e28bc42921c4901a191fda6ee42"}, - {file = "bcrypt-4.1.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:d75fc8cd0ba23f97bae88a6ec04e9e5351ff3c6ad06f38fe32ba50cbd0d11946"}, - {file = "bcrypt-4.1.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:a97e07e83e3262599434816f631cc4c7ca2aa8e9c072c1b1a7fec2ae809a1d2d"}, - {file = "bcrypt-4.1.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e51c42750b7585cee7892c2614be0d14107fad9581d1738d954a262556dd1aab"}, - {file = "bcrypt-4.1.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:ba4e4cc26610581a6329b3937e02d319f5ad4b85b074846bf4fef8a8cf51e7bb"}, - {file = "bcrypt-4.1.2.tar.gz", hash = "sha256:33313a1200a3ae90b75587ceac502b048b840fc69e7f7a0905b5f87fac7a1258"}, -] - -[package.extras] -tests = ["pytest (>=3.2.1,!=3.3.0)"] -typecheck = ["mypy"] - [[package]] name = "boto3" version = "1.34.63" @@ -1549,27 +1509,6 @@ sql-other = ["SQLAlchemy (>=1.4.16)"] test = ["hypothesis (>=6.34.2)", "pytest (>=7.3.2)", "pytest-asyncio (>=0.17.0)", "pytest-xdist (>=2.2.0)"] xml = ["lxml (>=4.6.3)"] -[[package]] -name = "paramiko" -version = "3.4.0" -description = "SSH2 protocol library" -optional = false -python-versions = ">=3.6" -files = [ - {file = "paramiko-3.4.0-py3-none-any.whl", hash = "sha256:43f0b51115a896f9c00f59618023484cb3a14b98bbceab43394a39c6739b7ee7"}, - {file = "paramiko-3.4.0.tar.gz", hash = "sha256:aac08f26a31dc4dffd92821527d1682d99d52f9ef6851968114a8728f3c274d3"}, -] - -[package.dependencies] -bcrypt = ">=3.2" -cryptography = ">=3.3" -pynacl = ">=1.5" - -[package.extras] -all = ["gssapi (>=1.4.1)", "invoke (>=2.0)", 
"pyasn1 (>=0.1.7)", "pywin32 (>=2.1.8)"] -gssapi = ["gssapi (>=1.4.1)", "pyasn1 (>=0.1.7)", "pywin32 (>=2.1.8)"] -invoke = ["invoke (>=2.0)"] - [[package]] name = "parsedatetime" version = "2.6" @@ -1583,20 +1522,19 @@ files = [ [[package]] name = "parsl" -version = "2024.6.17" +version = "2024.10.7" description = "Simple data dependent workflows in Python" optional = false python-versions = ">=3.8.0" files = [ - {file = "parsl-2024.6.17-py3-none-any.whl", hash = "sha256:95cdbff3657efbe61d0b0dc501b10bd8d23b238d527e0a6327d58639d24f7e87"}, - {file = "parsl-2024.6.17.tar.gz", hash = "sha256:78e31ec46dcb9f665c0b5c50090dbf755b8cddb43b2bc39d467e50ce9c7eabfc"}, + {file = "parsl-2024.10.7-py3-none-any.whl", hash = "sha256:79b0f1e0e1854d261f548ddd79daca254fdd345610daf72d0866004093fdf93e"}, + {file = "parsl-2024.10.7.tar.gz", hash = "sha256:ab155f51b92f62bd2a2ca82abe70ea608d280d79b7beaa29fd8ce871d41ae59e"}, ] [package.dependencies] dill = "*" filelock = ">=3.13,<4" globus-sdk = "*" -paramiko = "*" psutil = ">=5.5.1" pyzmq = ">=17.1.2" requests = "*" @@ -1606,7 +1544,7 @@ typeguard = ">=2.10,<3.dev0 || ==4.*" typing-extensions = ">=4.6,<5" [package.extras] -all = ["Flask (>=1.0.2)", "azure (<=4)", "boto3", "cffi", "flask-sqlalchemy", "google-api-python-client", "google-auth", "ipython (<=8.6.0)", "jsonschema", "kubernetes", "msrestazure", "nbsphinx", "networkx (>=2.5,<2.6)", "oauth-ssh (>=0.9)", "pandas (<2.2)", "plotly", "proxystore", "pydot", "python-daemon", "python-gssapi", "pyyaml", "radical.pilot (==1.60)", "radical.utils (==1.60)", "sphinx (>=7.1,<7.2)", "sphinx-rtd-theme", "sqlalchemy (>=1.4,<2)", "work-queue"] +all = ["Flask (>=1.0.2)", "azure (<=4)", "boto3", "cffi", "flask-sqlalchemy", "google-api-python-client", "google-auth", "ipython (<=8.6.0)", "jsonschema", "kubernetes", "msrestazure", "nbsphinx", "networkx (>=2.5,<2.6)", "oauth-ssh (>=0.9)", "pandas (<2.2)", "paramiko", "plotly", "proxystore", "pydot", "python-daemon", "python-gssapi", "pyyaml", 
"radical.pilot (==1.60)", "radical.utils (==1.60)", "sphinx (>=7.1,<7.2)", "sphinx-rtd-theme", "sqlalchemy (>=1.4,<2)", "work-queue"] aws = ["boto3"] azure = ["azure (<=4)", "msrestazure"] docs = ["ipython (<=8.6.0)", "nbsphinx", "sphinx (>=7.1,<7.2)", "sphinx-rtd-theme"] @@ -1618,6 +1556,7 @@ monitoring = ["sqlalchemy (>=1.4,<2)"] oauth-ssh = ["oauth-ssh (>=0.9)"] proxystore = ["proxystore"] radical-pilot = ["radical.pilot (==1.60)", "radical.utils (==1.60)"] +ssh = ["paramiko"] visualization = ["Flask (>=1.0.2)", "flask-sqlalchemy", "networkx (>=2.5,<2.6)", "pandas (<2.2)", "plotly", "pydot", "python-daemon"] workqueue = ["work-queue"] @@ -1826,32 +1765,6 @@ dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pyte docs = ["sphinx (>=4.5.0,<5.0.0)", "sphinx-rtd-theme", "zope.interface"] tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] -[[package]] -name = "pynacl" -version = "1.5.0" -description = "Python binding to the Networking and Cryptography (NaCl) library" -optional = false -python-versions = ">=3.6" -files = [ - {file = "PyNaCl-1.5.0-cp36-abi3-macosx_10_10_universal2.whl", hash = "sha256:401002a4aaa07c9414132aaed7f6836ff98f59277a234704ff66878c2ee4a0d1"}, - {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.manylinux_2_24_aarch64.whl", hash = "sha256:52cb72a79269189d4e0dc537556f4740f7f0a9ec41c1322598799b0bdad4ef92"}, - {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a36d4a9dda1f19ce6e03c9a784a2921a4b726b02e1c736600ca9c22029474394"}, - {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl", hash = "sha256:0c84947a22519e013607c9be43706dd42513f9e6ae5d39d3613ca1e142fba44d"}, - {file = "PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:06b8f6fa7f5de8d5d2f7573fe8c863c051225a27b61e6860fd047b1775807858"}, - {file = "PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_aarch64.whl", 
hash = "sha256:a422368fc821589c228f4c49438a368831cb5bbc0eab5ebe1d7fac9dded6567b"}, - {file = "PyNaCl-1.5.0-cp36-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:61f642bf2378713e2c2e1de73444a3778e5f0a38be6fee0fe532fe30060282ff"}, - {file = "PyNaCl-1.5.0-cp36-abi3-win32.whl", hash = "sha256:e46dae94e34b085175f8abb3b0aaa7da40767865ac82c928eeb9e57e1ea8a543"}, - {file = "PyNaCl-1.5.0-cp36-abi3-win_amd64.whl", hash = "sha256:20f42270d27e1b6a29f54032090b972d97f0a1b0948cc52392041ef7831fee93"}, - {file = "PyNaCl-1.5.0.tar.gz", hash = "sha256:8ac7448f09ab85811607bdd21ec2464495ac8b7c66d146bf545b0f08fb9220ba"}, -] - -[package.dependencies] -cffi = ">=1.4.1" - -[package.extras] -docs = ["sphinx (>=1.6.5)", "sphinx-rtd-theme"] -tests = ["hypothesis (>=3.27.0)", "pytest (>=3.2.1,!=3.3.0)"] - [[package]] name = "pytest" version = "7.4.4" diff --git a/tests/test_convert_threaded.py b/tests/test_convert_threaded.py index 0676184f..78600d5e 100644 --- a/tests/test_convert_threaded.py +++ b/tests/test_convert_threaded.py @@ -238,6 +238,8 @@ def test_gather_tablenumber( ) assert False + + def test_avoid_na_row_output( load_parsl_threaded: None, fx_tempdir: str, data_dir_cellprofiler: str ): From ca01aeb08cd9e61130e4854941ecebb58979e4e3 Mon Sep 17 00:00:00 2001 From: d33bs Date: Wed, 23 Oct 2024 17:09:08 -0600 Subject: [PATCH 07/12] resolve legacy test issue with cytominer-database --- tests/conftest.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/conftest.py b/tests/conftest.py index 6ee98099..cf6fb8bd 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -217,7 +217,7 @@ def cytominerdatabase_to_pycytominer_merge_single_cells_parquet( @pytest.fixture() def cytominerdatabase_to_manual_join_parquet( fx_tempdir: str, - cytominerdatabase_sqlite: List[str], + cytominerdatabase_sqlite_static: List[str], ) -> List[str]: """ Processed cytominer-database test sqlite data as @@ -225,7 +225,7 @@ def cytominerdatabase_to_manual_join_parquet( """ output_paths = [] 
- for sqlite_file in cytominerdatabase_sqlite: + for sqlite_file in cytominerdatabase_sqlite_static: destination_path = ( f"{fx_tempdir}/manual_join.{pathlib.Path(sqlite_file).name}.parquet" ) From 81fe58dc5d454a94766ec4d8a6d7cd4420c1c32b Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 24 Oct 2024 08:53:25 -0600 Subject: [PATCH 08/12] fix tablenumber test --- tests/test_convert_threaded.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/tests/test_convert_threaded.py b/tests/test_convert_threaded.py index 78600d5e..f4340d76 100644 --- a/tests/test_convert_threaded.py +++ b/tests/test_convert_threaded.py @@ -193,7 +193,10 @@ def test_gather_tablenumber( read_parquet('image.parquet') ) SELECT - * + image.*, + cytoplasm.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber), + nuclei.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber), + cells.* EXCLUDE (Metadata_TableNumber, Metadata_ImageNumber) FROM read_parquet('cytoplasm.parquet') AS cytoplasm LEFT JOIN read_parquet('cells.parquet') AS cells ON @@ -213,14 +216,13 @@ def test_gather_tablenumber( ) control_table = parquet.read_table(source=processed_cytominerdatabase) - test_unique_tablenumbers = pc.unique(test_table["Metadata_TableNumber"]) - print(test_unique_tablenumbers) control_unique_tablenumbers = pc.unique(control_table["Metadata_TableNumber"]) - print(control_unique_tablenumbers) - print(test_table.column_names) - print(control_table.column_names) - assert ( + # use pandas to assert a test of equality to help with differences in how + # data may be rounded by CytoTable vs cytominer-database (which use different data parsers + # and related conversions). + # See here for more information: https://github.com/cytomining/CytoTable/issues/187 + pd.testing.assert_frame_equal( test_table.filter( # we use only those tablenumbers which appear in cytominer-database related results # to help compare. 
CytoTable only removes datasets which have no image table whereas @@ -230,15 +232,12 @@ def test_gather_tablenumber( pc.field("Metadata_TableNumber").isin(control_unique_tablenumbers) ) .sort_by([(name, "ascending") for name in test_table.column_names]) - .equals( - control_table.sort_by( - [(name, "ascending") for name in control_table.column_names] - ) - ) + .to_pandas(), + control_table.sort_by( + [(name, "ascending") for name in control_table.column_names] + ).to_pandas(), ) - assert False - def test_avoid_na_row_output( load_parsl_threaded: None, fx_tempdir: str, data_dir_cellprofiler: str From 3a9ffeb464ad74ca1fee52fbf372c5717802ab14 Mon Sep 17 00:00:00 2001 From: d33bs Date: Thu, 24 Oct 2024 09:31:57 -0600 Subject: [PATCH 09/12] add docs --- cytotable/convert.py | 2 ++ docs/source/architecture.data.md | 1 + docs/source/python-api.md | 4 ++++ 3 files changed, 7 insertions(+) diff --git a/cytotable/convert.py b/cytotable/convert.py index ef37448a..eaa8c6b5 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -1439,6 +1439,8 @@ def convert( # pylint: disable=too-many-arguments,too-many-locals add_tablenumber: Optional[bool] Whether to add a calculated tablenumber which helps differentiate various repeated values (such as ObjectNumber) within source data. + Useful for processing multiple SQLite or CSV data sources together + to retain distinction from each dataset. page_keys: str: The table and column names to be used for key pagination. Uses the form: {"table_name":"column_name"}. 
diff --git a/docs/source/architecture.data.md b/docs/source/architecture.data.md index 3591dd06..876b7066 100644 --- a/docs/source/architecture.data.md +++ b/docs/source/architecture.data.md @@ -25,6 +25,7 @@ Data are organized into tables of generally two categories: Identifying or key fields for image and compartment tables may include the following: +- __TableNumber__: Provides a unique number based on the file referenced to build CytoTable output to help distinguish from repeated values in ImageNumber, ObjectNumber or other metadata columns which are referenced. Typically useful when using multiple SQLite or CSV-based source datasets. - __ImageNumber__: Provides specificity on what image is being referenced (there may be many). - __ObjectNumber__: Provides specificity for a specific compartment object within an ImageNumber. - __Parent_Cells__: Provides a related Cell compartment ObjectNumber. This field is canonically referenced from the Cytoplasm compartment for joining Cytoplasm and Cell compartment data. (see [Cytoplasm Compartment Data Relationships](architecture.data.md#cytoplasm-compartment-data-relationships) below for greater detail) diff --git a/docs/source/python-api.md b/docs/source/python-api.md index ab1fc62d..0e9986ce 100644 --- a/docs/source/python-api.md +++ b/docs/source/python-api.md @@ -45,6 +45,10 @@ Convert | +.. autofunction:: _set_tablenumber + +| + .. 
autofunction:: _prepend_column_name | From 4560a29efda4c52854dbb8232a579bdf499be948 Mon Sep 17 00:00:00 2001 From: Dave Bunten Date: Fri, 25 Oct 2024 09:56:48 -0600 Subject: [PATCH 10/12] Update cytotable/convert.py Co-authored-by: Gregory Way --- cytotable/convert.py | 1 + 1 file changed, 1 insertion(+) diff --git a/cytotable/convert.py b/cytotable/convert.py index eaa8c6b5..c707c26d 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -212,6 +212,7 @@ def _set_tablenumber( if source["source_path"].suffix != "sqlite" else source["source_path"] ): source["source_path"] + for source_group_name, source_group_vals in sources.items() # use the image tables references only for the basis of the # these calculations. From 2660cb2010b9240c813977a57ff5ef5c511936f9 Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 25 Oct 2024 10:12:32 -0600 Subject: [PATCH 11/12] update docs --- cytotable/convert.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/cytotable/convert.py b/cytotable/convert.py index c707c26d..91a274ba 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -179,8 +179,14 @@ def _set_tablenumber( add_tablenumber: Optional[bool] = None, ) -> Dict[str, List[Dict[str, Any]]]: """ - Gathers a "TableNumber" for the table which is a unique identifier intended - to help differentiate between imagenumbers to create distinct results. + Gathers a "TableNumber" from the image table (if CSV) or + SQLite file (if SQLite source) which is a unique identifier + intended to help differentiate between imagenumbers + to create distinct records for single-cell profiles + referenced across multiple source data exports. + For example, ImageNumber column values from CellProfiler + will repeat across exports, meaning we may lose distinction + when combining multiple export files together through CytoTable. Note: - If using CSV data sources, the image.csv table is used for checksum. 
@@ -412,7 +418,8 @@ def _source_pageset_to_parquet( # misaligned automatic data typing. f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, " if source["tablenumber"] is not None - # if we don't have a tablenumber value, don't introduce the column + # don't introduce the column if we aren't supposed to add tablenumber + # as per parameter. else "" ) From 22c5ee708b682588b16cbeb1eae82c8d13d2e7ca Mon Sep 17 00:00:00 2001 From: d33bs Date: Fri, 25 Oct 2024 10:13:09 -0600 Subject: [PATCH 12/12] linting --- cytotable/convert.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/cytotable/convert.py b/cytotable/convert.py index 91a274ba..4653e745 100644 --- a/cytotable/convert.py +++ b/cytotable/convert.py @@ -179,9 +179,9 @@ def _set_tablenumber( add_tablenumber: Optional[bool] = None, ) -> Dict[str, List[Dict[str, Any]]]: """ - Gathers a "TableNumber" from the image table (if CSV) or + Gathers a "TableNumber" from the image table (if CSV) or SQLite file (if SQLite source) which is a unique identifier - intended to help differentiate between imagenumbers + intended to help differentiate between imagenumbers to create distinct records for single-cell profiles referenced across multiple source data exports. For example, ImageNumber column values from CellProfiler @@ -218,7 +218,6 @@ def _set_tablenumber( if source["source_path"].suffix != "sqlite" else source["source_path"] ): source["source_path"] - for source_group_name, source_group_vals in sources.items() # use the image tables references only for the basis of the # these calculations. @@ -418,7 +417,7 @@ def _source_pageset_to_parquet( # misaligned automatic data typing. f"CAST({source['tablenumber']} AS BIGINT) as TableNumber, " if source["tablenumber"] is not None - # don't introduce the column if we aren't supposed to add tablenumber + # don't introduce the column if we aren't supposed to add tablenumber # as per parameter. else "" )