From 29446de869ce18e002d9686fb2e28685e4db2b77 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 2 Feb 2022 12:10:51 +0000 Subject: [PATCH 01/61] Added support for DatasetColumn, specifically dataset ID column --- src/omero_metadata/populate.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index b6b68a39..f3f1f136 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -416,8 +416,6 @@ def resolve(self, column, value, row): ) break elif column.name.lower() == "dataset name": - # DatasetColumn unimplemented at the momnet - # We can still access column names though images_by_id = self.wrapper.images_by_id[ self.wrapper.datasets_by_name[column_value].id.val ] @@ -427,8 +425,6 @@ def resolve(self, column, value, row): ) break elif column.name.lower() == "dataset": - # DatasetColumn unimplemented at the momnet - # We can still access column names though images_by_id = self.wrapper.images_by_id[ self.wrapper.datasets_by_id[ int(column_value)].id.val @@ -825,7 +821,10 @@ def get_image_name_by_id(self, iid, did=None): def resolve_dataset(self, column, row, value): try: - return self.datasets_by_name[value].id.val + if column.name.lower() == 'dataset': + return self.datasets_by_id[int(value)].id.val + else: + return self.datasets_by_name[value].id.val except KeyError: log.warn('Project is missing dataset: %s' % value) return Skip() @@ -1160,6 +1159,8 @@ def preprocess_data(self, reader): column.values.append(value) elif column.name.lower() == "plate": column.values.append(value) + elif column.name.lower() == "dataset": + column.values.append(value) except TypeError: log.error('Original value "%s" now "%s" of bad type!' 
% ( original_value, value)) From 30dd5a7b02bf13714caaca4e425dd0ef3305774d Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 2 Feb 2022 12:11:52 +0000 Subject: [PATCH 02/61] Added CLI argument and tool to automatically detect header type --- src/omero_metadata/cli.py | 51 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index bc788371..624331e1 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -32,6 +32,8 @@ from omero.grid import LongColumn from omero.model.enums import UnitsLength +import pandas as pd + HELP = """Metadata utilities Provides access to and editing of the metadata which @@ -242,6 +244,9 @@ def _configure(self, parser): populate.add_argument("--allow_nan", action="store_true", help=( "Allow empty values to become Nan in Long or Double columns")) + populate.add_argument("--detect_header", action="store_true", help=( + "Automatically detect header row to populate")) + populateroi.add_argument( "--measurement", type=int, default=None, help="Index of the measurement to populate. By default, all") @@ -483,6 +488,44 @@ def testtables(self, args): if not initialized: self.ctx.die(100, "Failed to initialize Table") + def detect_headers(self, csv_path): + ''' + Function to automatically detect headers from a CSV file. 
This function + loads the table to pandas to detects the column type and match headers + ''' + + conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi'] + headers = [] + table = pd.read_csv(csv_path) + col_types = table.dtypes.values.tolist() + cols = list(table.columns) + + for index, col_type in enumerate(col_types): + col = cols[index] + if col.lower() in conserved_headers: + headers.append(col.lower()) + elif col.lower() == 'image name' or col.lower() == 'imagename' or \ + col.lower() == 'image_name': + headers.append('image') + elif col.lower() == 'dataset name' or col.lower() == 'datasetname' or \ + col.lower() == 'dataset_name': + headers.append('dataset') + elif col.lower() == 'plate name' or col.lower() == 'platename' or \ + col.lower() == 'plate_name': + headers.append('plate') + elif col.lower() == 'well name' or col.lower() == 'wellname' or \ + col.lower() == 'well_name': + headers.append('well') + elif col_type.name == 'object': + headers.append('s') + elif col_type.name == 'float64': + headers.append('d') + elif col_type.name == 'int64': + headers.append('l') + elif col_type.name == 'bool': + headers.append('b') + return headers + # WRITE def populate(self, args): @@ -521,6 +564,12 @@ def populate(self, args): cfgid = cfgann.getFile().getId() md.linkAnnotation(cfgann) + header_type = None + if args.detect_header: + header_type = self.detect_headers(args.file) + if args.dry_run: + omero_metadata.populate.log.info(f"Header Types:{header_type}") + # add condition col_type = blarg, open arg.file, arg.detect_header loops = 0 ms = 0 wait = args.wait @@ -533,7 +582,7 @@ def populate(self, args): cfg=args.cfg, cfgid=cfgid, attach=args.attach, options=localcfg, batch_size=args.batch, loops=loops, ms=ms, dry_run=args.dry_run, - allow_nan=args.allow_nan) + allow_nan=args.allow_nan, column_types=header_type) ctx.parse() def rois(self, args): From 95d610632dbfd90b32e7c52df0c79bc502b6f840 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 2 Feb 2022 
12:24:59 +0000 Subject: [PATCH 03/61] Removed comment. --- src/omero_metadata/cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 624331e1..6937732d 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -569,7 +569,6 @@ def populate(self, args): header_type = self.detect_headers(args.file) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") - # add condition col_type = blarg, open arg.file, arg.detect_header loops = 0 ms = 0 wait = args.wait From 3f559af3095478f0118aa63bfe6642d313f8f5e0 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Thu, 3 Feb 2022 14:59:45 +0000 Subject: [PATCH 04/61] Fix flake8 --- src/omero_metadata/cli.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 6937732d..6c67c9e8 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -505,16 +505,17 @@ def detect_headers(self, csv_path): if col.lower() in conserved_headers: headers.append(col.lower()) elif col.lower() == 'image name' or col.lower() == 'imagename' or \ - col.lower() == 'image_name': + col.lower() == 'image_name': headers.append('image') - elif col.lower() == 'dataset name' or col.lower() == 'datasetname' or \ - col.lower() == 'dataset_name': + elif col.lower() == 'dataset name' or \ + col.lower() == 'datasetname' or \ + col.lower() == 'dataset_name': headers.append('dataset') elif col.lower() == 'plate name' or col.lower() == 'platename' or \ - col.lower() == 'plate_name': + col.lower() == 'plate_name': headers.append('plate') elif col.lower() == 'well name' or col.lower() == 'wellname' or \ - col.lower() == 'well_name': + col.lower() == 'well_name': headers.append('well') elif col_type.name == 'object': headers.append('s') From 0f764486fc88f3c46490dc0ab37193b8f762e336 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Thu, 3 Feb 2022 15:09:01 +0000 Subject: [PATCH 
05/61] Added pandas module requirement --- setup.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index cdc649f8..bde27697 100644 --- a/setup.py +++ b/setup.py @@ -127,7 +127,8 @@ def read(fname): 'future', 'omero-py>=5.6.0', 'PyYAML', - 'jinja2' + 'jinja2', + 'pandas' ], python_requires='>=3', tests_require=[ From cfe97c4f0ccbbc2c9004418da55e252a098afcf8 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 23 Feb 2022 09:41:05 +0000 Subject: [PATCH 06/61] Modified code to not detect dataset/image header type for dataset_name/image_name columns and only dataset_id/image_id --- src/omero_metadata/cli.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 6c67c9e8..9af0a6e6 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -504,12 +504,15 @@ def detect_headers(self, csv_path): col = cols[index] if col.lower() in conserved_headers: headers.append(col.lower()) - elif col.lower() == 'image name' or col.lower() == 'imagename' or \ - col.lower() == 'image_name': + elif col.lower() == 'image id' or col.lower() == 'imageid' or \ + col.lower() == 'image_id': headers.append('image') - elif col.lower() == 'dataset name' or \ - col.lower() == 'datasetname' or \ - col.lower() == 'dataset_name': + elif col.lower() == 'roi id' or col.lower() == 'roiid' or \ + col.lower() == 'roi_id': + headers.append('roi') + elif col.lower() == 'dataset id' or \ + col.lower() == 'datasetid' or \ + col.lower() == 'dataset_id': headers.append('dataset') elif col.lower() == 'plate name' or col.lower() == 'platename' or \ col.lower() == 'plate_name': From 9daa9f364f70dfc48409a8534aec8ea6cd86ec46 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Thu, 24 Feb 2022 11:20:44 +0000 Subject: [PATCH 07/61] Added 'Dataset Name' column to be consistent with other column types.
Ensured DatasetColumn is named 'Dataset' --- src/omero_metadata/populate.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index f3f1f136..0d329cc6 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -313,6 +313,11 @@ def _create_columns(self, klass): self.DEFAULT_COLUMN_SIZE, list())) # Ensure RoiColumn is named 'Roi' column.name = "Roi" + if column.__class__ is DatasetColumn: + append.append(StringColumn(DATASET_NAME_COLUMN, '', + self.DEFAULT_COLUMN_SIZE, list())) + # Ensure DatasetColumn is named 'Dataset' + column.name = "Dataset" # If image/roi name, then add ID column" if column.name == IMAGE_NAME_COLUMN: append.append(ImageColumn("Image", '', list())) From 9b158f539f766e9338f3742e5ed0cc68b1769e59 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Tue, 1 Mar 2022 11:41:01 +0000 Subject: [PATCH 08/61] Prevent other contexts from breaking when using --detect_header --- src/omero_metadata/populate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index 0d329cc6..88b483e0 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -1556,7 +1556,7 @@ class BulkToMapAnnotationContext(_QueryContext): def __init__(self, client, target_object, file=None, fileid=None, cfg=None, cfgid=None, attach=False, options=None, batch_size=1000, loops=10, ms=10, dry_run=False, - allow_nan=False): + allow_nan=False, **kwargs): """ :param client: OMERO client object :param target_object: The object to be annotated @@ -1889,7 +1889,7 @@ class DeleteMapAnnotationContext(_QueryContext): def __init__(self, client, target_object, file=None, fileid=None, cfg=None, cfgid=None, attach=False, options=None, batch_size=1000, loops=10, ms=500, dry_run=False, - allow_nan=False): + allow_nan=False, **kwargs): """ :param client: OMERO client object From 
33de3ba587ee2a8cef7fcda49e5c924d746fe473 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Mon, 28 Mar 2022 14:23:20 +0100 Subject: [PATCH 09/61] Changed the code's default behaviour to use the new header detection method. User can now either pass '--manual_header' or a csv with '# header' header to bypass the auto-detect header method. --- src/omero_metadata/cli.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 9af0a6e6..e0fa15a1 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -244,8 +244,8 @@ def _configure(self, parser): populate.add_argument("--allow_nan", action="store_true", help=( "Allow empty values to become Nan in Long or Double columns")) - populate.add_argument("--detect_header", action="store_true", help=( - "Automatically detect header row to populate")) + populate.add_argument("--manual_header", action="store_true", help=( + "Disable automatic header detection row to populate")) populateroi.add_argument( "--measurement", type=int, default=None, @@ -569,10 +569,18 @@ def populate(self, args): md.linkAnnotation(cfgann) header_type = None - if args.detect_header: + # To use auto detect header by default unless instructed not to + # AND + # Check if first row contains `# header` + first_row = pd.read_csv(args.file, nrows=1, header=None) + if not args.manual_header and \ + not first_row[0].str.contains('# header'): + omero_metadata.populate.log.info("Detecting header types") header_type = self.detect_headers(args.file) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") + else: + omero_metadata.populate.log.info("Using user defined header types") loops = 0 ms = 0 wait = args.wait From 3b8ff20c6203c380cc5b75335c98df740d807b39 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Tue, 29 Mar 2022 13:20:50 +0100 Subject: [PATCH 10/61] Removed the newly added 'Dataset Name' column as it wasn't fully implemented
later and caused a bug --- src/omero_metadata/populate.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index 88b483e0..dc7c5194 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -314,8 +314,10 @@ def _create_columns(self, klass): # Ensure RoiColumn is named 'Roi' column.name = "Roi" if column.__class__ is DatasetColumn: - append.append(StringColumn(DATASET_NAME_COLUMN, '', - self.DEFAULT_COLUMN_SIZE, list())) + # This breaks the code, as currently there is no implementation + # of a method to populate the 'Dataset Name' column + # append.append(StringColumn(DATASET_NAME_COLUMN, '', + # self.DEFAULT_COLUMN_SIZE, list())) # Ensure DatasetColumn is named 'Dataset' column.name = "Dataset" # If image/roi name, then add ID column" From 06ad0626e8c03b4888eb34679e75e3ed4d1924e0 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Tue, 29 Mar 2022 13:21:44 +0100 Subject: [PATCH 11/61] Improved '--manual_header' help description and fixed a bug with '# header' detection --- src/omero_metadata/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index e0fa15a1..b06c9109 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -245,7 +245,7 @@ def _configure(self, parser): "Allow empty values to become Nan in Long or Double columns")) populate.add_argument("--manual_header", action="store_true", help=( - "Disable automatic header detection row to populate")) + "Disable automatic header detection during population")) populateroi.add_argument( "--measurement", type=int, default=None, @@ -574,7 +574,7 @@ def populate(self, args): # Check if first row contains `# header` first_row = pd.read_csv(args.file, nrows=1, header=None) if not args.manual_header and \ - not first_row[0].str.contains('# header'): + not first_row[0].str.contains('# header').bool(): 
omero_metadata.populate.log.info("Detecting header types") header_type = self.detect_headers(args.file) if args.dry_run: From 99383af2e50af35f8669aad99dc1f683b2fd67f1 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Wed, 30 Mar 2022 13:45:33 +0100 Subject: [PATCH 12/61] Made detect_headers method a static method --- src/omero_metadata/cli.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index b06c9109..edce60b6 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -488,7 +488,8 @@ def testtables(self, args): if not initialized: self.ctx.die(100, "Failed to initialize Table") - def detect_headers(self, csv_path): + @staticmethod + def detect_headers(csv_path): ''' Function to automatically detect headers from a CSV file. This function loads the table to pandas to detects the column type and match headers @@ -576,7 +577,7 @@ def populate(self, args): if not args.manual_header and \ not first_row[0].str.contains('# header').bool(): omero_metadata.populate.log.info("Detecting header types") - header_type = self.detect_headers(args.file) + header_type = MetadataControl.detect_headers(args.file) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") else: From 4677348cf0581880f3b64a155160ba623ac06b33 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Tue, 5 Apr 2022 12:02:52 +0100 Subject: [PATCH 13/61] =?UTF-8?q?Bump=20version:=200.10.1.dev0=20=E2=86=92?= =?UTF-8?q?=200.11.0.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index ee1374ac..796606da 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.10.1.dev0 +current_version = 0.11.0.dev0 commit = True tag = True sign_tags = True diff --git a/setup.py 
b/setup.py index 0aacefa8..0c5b9766 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -version = '0.10.1.dev0' +version = '0.11.0.dev0' url = "https://github.com/ome/omero-metadata/" setup( From 3ca6f776628397b463583891a1153819cf483524 Mon Sep 17 00:00:00 2001 From: Dominik Lindner Date: Wed, 6 Apr 2022 13:28:44 +0100 Subject: [PATCH 14/61] Skip empty rows --- src/omero_metadata/populate.py | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index 166408a2..98005e43 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -1304,14 +1304,17 @@ def populate_from_reader(self, for (r, row) in enumerate(reader): log.debug('Row %d', r) if filter_function(row): - self.populate_row(row) - row_count = row_count + 1 - if row_count >= batch_size: - self.post_process() - table.addData(self.columns) - for column in self.columns: - column.values = [] - row_count = 0 + if row: + self.populate_row(row) + row_count = row_count + 1 + if row_count >= batch_size: + self.post_process() + table.addData(self.columns) + for column in self.columns: + column.values = [] + row_count = 0 + else: + log.info('Skip empty row %d', r + 1) if row_count != 0: log.debug("DATA TO ADD") log.debug(self.columns) @@ -1341,7 +1344,10 @@ def populate(self, rows): nrows = len(rows) for (r, row) in enumerate(rows): log.debug('Row %d/%d', r + 1, nrows) - self.populate_row(row) + if row: + self.populate_row(row) + else: + log.info('Skip empty row %d', r + 1) def post_process(self): target_class = self.target_object.__class__ From 43f3cc43594eaa09dd034f86404c596b8f6ef895 Mon Sep 17 00:00:00 2001 From: Dominik Lindner Date: Wed, 6 Apr 2022 14:23:01 +0100 Subject: [PATCH 15/61] Warn instead of info for skipping empty row --- src/omero_metadata/populate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 
deletions(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index 98005e43..f9b7adb2 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -1314,7 +1314,7 @@ def populate_from_reader(self, column.values = [] row_count = 0 else: - log.info('Skip empty row %d', r + 1) + log.warning('Skip empty row %d', r + 1) if row_count != 0: log.debug("DATA TO ADD") log.debug(self.columns) @@ -1347,7 +1347,7 @@ def populate(self, rows): if row: self.populate_row(row) else: - log.info('Skip empty row %d', r + 1) + log.warning('Skip empty row %d', r + 1) def post_process(self): target_class = self.target_object.__class__ From 4b0c1e8b70dc2551514a273a64057151029a1f6a Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Thu, 7 Apr 2022 13:14:20 +0100 Subject: [PATCH 16/61] Updating README Updating README.rst to include the changes to default behaviour --- README.rst | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 1e568745..513df1fb 100644 --- a/README.rst +++ b/README.rst @@ -70,8 +70,12 @@ object IDs in the ``OMERO.table``. The ``CSV`` file must be provided as local file with ``--file path/to/file.csv``. -If you wish to ensure that ``number`` columns are created for numerical data, this will -allow you to make numerical queries on the table. + + +Automatic header +^^^^^^^^^ + +**The default behaviour of the script is to automatically detect the column types and specific object types from an input ``CSV`` using the list below.** Column Types are: - ``d``: ``DoubleColumn``, for floating point numbers @@ -80,8 +84,14 @@ Column Types are: - ``b``: ``BoolColumn``, for true/false - ``plate``, ``well``, ``image``, ``dataset``, ``roi`` to specify objects -These can be specified in the first row of a ``CSV`` with a ``# header`` tag (see examples below). -The ``# header`` row is optional. Default column type is ``String``. 
+ + +Manual Header +^^^^^^^^^ + +However, it is possible to override the default behaviour, ignoring the automatic header detection, and manually assign the header to define the column type if a ``CSV`` with with a ``# header`` tag is passed (see examples below). + +Automatic header detection can also be ignored if using the ``--manual_headers`` flag. If the ``# header`` is not present and this flag is used, column types will default to ``String`` NB: Column names should not contain spaces if you want to be able to query by these columns. From 115dcbf77fca9a2c31da8286acadbd2fef05709a Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Thu, 7 Apr 2022 13:16:08 +0100 Subject: [PATCH 17/61] Clearer titles --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 513df1fb..c4347203 100644 --- a/README.rst +++ b/README.rst @@ -72,7 +72,7 @@ The ``CSV`` file must be provided as local file with ``--file path/to/file.csv`` -Automatic header +Automatic Column Types ^^^^^^^^^ **The default behaviour of the script is to automatically detect the column types and specific object types from an input ``CSV`` using the list below.** @@ -86,7 +86,7 @@ Column Types are: -Manual Header +Manual Column Types ^^^^^^^^^ However, it is possible to override the default behaviour, ignoring the automatic header detection, and manually assign the header to define the column type if a ``CSV`` with with a ``# header`` tag is passed (see examples below). From 0d291cc2263e0f5145c6a82c64b902b4bb13047e Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Fri, 8 Apr 2022 11:04:31 +0100 Subject: [PATCH 18/61] Fixed grammar. Added more examples. Fixed grammar. Added more examples for automatic header detection.
Added linkable titles to some headings --- README.rst | 64 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index c4347203..321747f6 100644 --- a/README.rst +++ b/README.rst @@ -71,10 +71,6 @@ object IDs in the ``OMERO.table``. The ``CSV`` file must be provided as local file with ``--file path/to/file.csv``. - -Automatic Column Types -^^^^^^^^^ - **The default behaviour of the script is to automatically detect the column types and specific object types from an input ``CSV`` using the list below.** Column Types are: @@ -85,25 +81,25 @@ Column Types are: - ``plate``, ``well``, ``image``, ``dataset``, ``roi`` to specify objects +However, it is possible to manually define the column types , ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed (see examples below). -Manual Column Types -^^^^^^^^^ - -However, it is possible to override the default behaviour, ignoring the automatic header detection, and manually assign the header to define the column type if a ``CSV`` with with a ``# header`` tag is passed (see examples below). - -Automatic header detection can also be ignored if using the ``--manual_headers`` flag. If the ``# header`` is not present and this flag is used, column types will default to ``String`` +Automatic header detection can also be ignored if using the ``--manual_headers`` flag. If the ``# header`` is not present and this flag is used, column types will default to ``String`` (unless the column names correspond to OMERO objects such as ``image`` or ``plate``). NB: Column names should not contain spaces if you want to be able to query by these columns. 
+Examples +^^^^^^^^^ + **Project / Dataset** +^^^^^^^^^ To add a table to a Project, the ``CSV`` file needs to specify ``Dataset Name`` and ``Image Name`` or ``Image ID``:: $ omero metadata populate Project:1 --file path/to/project.csv - -project.csv:: + +project.csv (manual column types definition):: # header s,s,d,l,s Image Name,Dataset Name,ROI_Area,Channel_Index,Channel_Name @@ -112,7 +108,15 @@ project.csv:: img-03.png,dataset01,0.093,3,TRITC img-04.png,dataset01,0.429,4,Cy5 -This will create an OMERO.table linked to the Project like this with +project.csv (automatic column types detection):: + + Image Name,Dataset Name,ROI_Area,Channel_Index,Channel_Name + img-01.png,dataset01,0.0469,1,DAPI + img-02.png,dataset01,0.142,2,GFP + img-03.png,dataset01,0.093,3,TRITC + img-04.png,dataset01,0.429,4,Cy5 + +Both manual definition or automatic detection of column types will create an OMERO.table linked to the Project as folows with a new ``Image`` column with IDs: ========== ============ ======== ============= ============ ===== @@ -128,11 +132,14 @@ If the target is a Dataset instead of a Project, the ``Dataset Name`` column is **Screen / Plate** +^^^^^^^^^ To add a table to a Screen, the ``CSV`` file needs to specify ``Plate`` name and ``Well``. -If a ``# header`` is specified, column types must be ``well`` and ``plate``. 
+If a ``# header`` is specified, column types must be ``well`` and ``plate``:: -screen.csv:: + $ omero metadata populate Screen:1 --file path/to/screen.csv + +screen.csv (manual column types definition):: # header well,plate,s,d,l,d Well,Plate,Drug,Concentration,Cell_Count,Percent_Mitotic @@ -141,7 +148,15 @@ screen.csv:: A3,plate01,DMSO,5.5,550,4 B1,plate01,DrugX,12.3,50,44.43 -This will create an OMERO.table linked to the Screen, with the +screen.csv (automatic column types detection):: + + Well,Plate,Drug,Concentration,Cell_Count,Percent_Mitotic + A1,plate01,DMSO,10.1,10,25.4 + A2,plate01,DMSO,0.1,1000,2.54 + A3,plate01,DMSO,5.5,550,4 + B1,plate01,DrugX,12.3,50,44.43 + +Similarly, this will create an OMERO.table linked to the Screen, with the ``Well Name`` and ``Plate Name`` columns added and the ``Well`` and ``Plate`` columns used for IDs: @@ -157,6 +172,7 @@ Well Plate Drug Concentration Cell_Count Percent_Mitotic Well Name Plat If the target is a Plate instead of a Screen, the ``Plate`` column is not needed. **ROIs** +^^^^^^^^^ If the target is an Image or a Dataset, a ``CSV`` with ROI-level or Shape-level data can be used to create an ``OMERO.table`` (bulk annotation) as a ``File Annotation`` linked to the target object. @@ -168,9 +184,11 @@ NB: Columns of type ``shape`` aren't yet supported on the OMERO.server. Alternatively, if the target is an Image, the ROI input column can be ``Roi Name`` (with type ``s``), and an ``roi`` type column will be appended containing ROI IDs. -In this case, it is required that ROIs on the Image in OMERO have the ``Name`` attribute set. 
+In this case, it is required that ROIs on the Image in OMERO have the ``Name`` attribute set:: -image.csv:: + $ omero metadata populate Image:1 --file path/to/image.csv + +image.csv (manual column types definition):: # header roi,l,l,d,l Roi,shape,object,probability,area @@ -179,6 +197,16 @@ image.csv:: 503,1068,3,0.2,25 503,1069,4,0.8,400 503,1070,5,0.5,200 + + +image.csv (automatic column types detection):: + + Roi,shape,object,probability,area + 501,1066,1,0.8,250 + 502,1067,2,0.9,500 + 503,1068,3,0.2,25 + 503,1069,4,0.8,400 + 503,1070,5,0.5,200 This will create an OMERO.table linked to the Image like this: From 338caee7358559d385babffc7ac0b8435729152f Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Fri, 8 Apr 2022 12:06:33 +0100 Subject: [PATCH 19/61] init --- .../detection/test_automatic_header.py | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 test/integration/detection/test_automatic_header.py diff --git a/test/integration/detection/test_automatic_header.py b/test/integration/detection/test_automatic_header.py new file mode 100644 index 00000000..59bdbd30 --- /dev/null +++ b/test/integration/detection/test_automatic_header.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# +# Copyright (c) 2022 Glencoe Software, Inc. All rights reserved. +# +# This software is distributed under the terms described by the LICENSE.txt +# file you can find at the root of the distribution bundle. 
If the file is +# missing please request a copy by contacting info@glencoesoftware.com + +""" + Test of the default automatic column type detection behaviour +""" + +from omero_metadata.cli import MetadataControl +import pandas as pd +import tempfile + + +def test_detect_headers(): + d = { + 'project_name': ['a', 'b', 'c'], + 'dataset_name': ['a', 'b', 'c'], + 'plate_name': ['a', 'b', 'c'], + 'well_name': ['a', 'b', 'c'], + 'image_name': ['a', 'b', 'c'], + 'roi_name': ['a', 'b', 'c'], + 'project_id': [1, 2, 3], + 'dataset_id': [1, 2, 3], + 'plate_id': [1, 2, 3], + 'well_id': [1, 2, 3], + 'image_id': [1, 2, 3], + 'roi_id': [1, 2, 3], + 'project': [1, 2, 3], + 'dataset': [1, 2, 3], + 'plate': [1, 2, 3], + 'well': [1, 2, 3], + 'image': [1, 2, 3], + 'roi': [1, 2, 3], + 'measurement 1': [11, 22, 33], + 'measurement 2': [0.1, 0.2, 0.3], + 'measurement 3': ['a', 'b', 'c'], + 'measurement 4': [True, True, False] + } + + df = pd.DataFrame(data=d) + tmp = tempfile.NamedTemporaryFile() + df.to_csv(tmp.name, index=False) + header = MetadataControl.detect_headers(tmp.name) + expected_header = [ + 's', 's', 'plate', 'well', 's', 's', + 'l', 'dataset', 'l', 'l', 'image', 'roi', + 'l', 'dataset', 'plate', 'well', 'image', 'roi', + 'l', 'd', 's', 'b' + ] + assert header == expected_header From 494314a04b5b1f38eb3076b03edd740fd99a5423 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Fri, 8 Apr 2022 14:46:58 +0100 Subject: [PATCH 20/61] Improved clarity --- README.rst | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 321747f6..370597cf 100644 --- a/README.rst +++ b/README.rst @@ -70,19 +70,20 @@ object IDs in the ``OMERO.table``. The ``CSV`` file must be provided as local file with ``--file path/to/file.csv``.
+OMERO.tables have defined column types to specify the data-type such as `double` or `long` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn`` -**The default behaviour of the script is to automatically detect the column types and specific object types from an input ``CSV`` using the list below.** -Column Types are: +**The default behaviour of the script is to automatically detect these column types from an input ``CSV``**. This behaviour works as folows: +* Columns named with a supported object-type (e.g. 'image', 'plate'...) or with object_id (e.g. 'image_id', 'dataset_id' ) will generate the corresponding column type in the OMERO.table. e.g (ImageColumn, PlateColumn, DatasetColumn, etc) +* Other column types will be detected based on the column's data using the pandas library (e.g. column of data type double). + +However, it is possible to manually define the column types , ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): - ``d``: ``DoubleColumn``, for floating point numbers - ``l``: ``LongColumn``, for integer numbers - ``s``: ``StringColumn``, for text - ``b``: ``BoolColumn``, for true/false - ``plate``, ``well``, ``image``, ``dataset``, ``roi`` to specify objects - -However, it is possible to manually define the column types , ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed (see examples below). - Automatic header detection can also be ignored if using the ``--manual_headers`` flag. If the ``# header`` is not present and this flag is used, column types will default to ``String`` (unless the column names correspond to OMERO objects such as ``image`` or ``plate``). 
NB: Column names should not contain spaces if you want to be able to query From 70a1c92a795759342cf4f98fe9d6c03ba85a0a19 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Fri, 8 Apr 2022 15:55:37 +0100 Subject: [PATCH 21/61] Fixed formatting --- README.rst | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 370597cf..02bb4455 100644 --- a/README.rst +++ b/README.rst @@ -73,8 +73,9 @@ The ``CSV`` file must be provided as local file with ``--file path/to/file.csv`` OMERO.tables have defined column types to specify the data-type such as `double` or `long` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn`` **The default behaviour of the script is to automatically detect these column types from an input ``CSV``**. This behaviour works as folows: -* Columns named with a supported object-type (e.g. 'image', 'plate'...) or with object_id (e.g. 'image_id', 'dataset_id' ) will generate the corresponding column type in the OMERO.table. e.g (ImageColumn, PlateColumn, DatasetColumn, etc) -* Other column types will be detected based on the column's data using the pandas library (e.g. column of data type double). + +* Columns named with a supported object-type (e.g. 'image', 'plate'...) or with object_id (e.g. 'image_id', 'dataset_id' ) will generate the corresponding column type in the OMERO.table. e.g (ImageColumn, PlateColumn, DatasetColumn, etc) +* Other column types will be detected based on the column's data using the pandas library (e.g. column of data type double). However, it is possible to manually define the column types , ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. 
The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): From 2bc1a06aafd76f143d0c246e70ba940720e0df5e Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Fri, 8 Apr 2022 15:57:41 +0100 Subject: [PATCH 22/61] Added full list for supported object-type --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 02bb4455..91830e7a 100644 --- a/README.rst +++ b/README.rst @@ -74,7 +74,7 @@ OMERO.tables have defined column types to specify the data-type such as `double` **The default behaviour of the script is to automatically detect these column types from an input ``CSV``**. This behaviour works as folows: -* Columns named with a supported object-type (e.g. 'image', 'plate'...) or with object_id (e.g. 'image_id', 'dataset_id' ) will generate the corresponding column type in the OMERO.table. e.g (ImageColumn, PlateColumn, DatasetColumn, etc) +* Columns named with a supported object-type ('plate', 'well', 'image', 'dataset', or 'roi') or with object_id (e.g. 'image_id', 'dataset_id') will generate the corresponding column type in the OMERO.table. e.g (ImageColumn, PlateColumn, DatasetColumn, etc) * Other column types will be detected based on the column's data using the pandas library (e.g. column of data type double). 
From f6610a20bbb6914c5843474f2e80d46b9bf8c92d Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Fri, 15 Apr 2022 14:21:03 +0100 Subject: [PATCH 23/61] Updating linting --- README.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/README.rst b/README.rst index 91830e7a..d8f33849 100644 --- a/README.rst +++ b/README.rst @@ -79,6 +79,7 @@ OMERO.tables have defined column types to specify the data-type such as `double` However, it is possible to manually define the column types , ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): + - ``d``: ``DoubleColumn``, for floating point numbers - ``l``: ``LongColumn``, for integer numbers - ``s``: ``StringColumn``, for text From 6c4e14ab1efe0bb3d78134df97c49ed00626d3be Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Fri, 15 Apr 2022 15:17:53 +0100 Subject: [PATCH 24/61] Moved test to unit tests directory --- test/{integration/detection => unit}/test_automatic_header.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename test/{integration/detection => unit}/test_automatic_header.py (100%) diff --git a/test/integration/detection/test_automatic_header.py b/test/unit/test_automatic_header.py similarity index 100% rename from test/integration/detection/test_automatic_header.py rename to test/unit/test_automatic_header.py From 654195b27aa87a336cb4d79d6462a7e2a5fd05fc Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Fri, 15 Apr 2022 15:37:10 +0100 Subject: [PATCH 25/61] Increase test coverage --- test/unit/test_automatic_header.py | 45 ++++++++++++++---------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py index 59bdbd30..95f71b1f 100644 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py 
@@ -18,38 +18,35 @@ def test_detect_headers(): d = { - 'project_name': ['a', 'b', 'c'], - 'dataset_name': ['a', 'b', 'c'], - 'plate_name': ['a', 'b', 'c'], - 'well_name': ['a', 'b', 'c'], - 'image_name': ['a', 'b', 'c'], - 'roi_name': ['a', 'b', 'c'], - 'project_id': [1, 2, 3], - 'dataset_id': [1, 2, 3], - 'plate_id': [1, 2, 3], - 'well_id': [1, 2, 3], - 'image_id': [1, 2, 3], - 'roi_id': [1, 2, 3], - 'project': [1, 2, 3], - 'dataset': [1, 2, 3], - 'plate': [1, 2, 3], - 'well': [1, 2, 3], - 'image': [1, 2, 3], - 'roi': [1, 2, 3], 'measurement 1': [11, 22, 33], 'measurement 2': [0.1, 0.2, 0.3], 'measurement 3': ['a', 'b', 'c'], - 'measurement 4': [True, True, False] + 'measurement 4': [True, True, False], + 'measurement 5': [11, 0.1, True] } + prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ] + # Create a dictionary with every combination of headers + # eg plate_name/platename/plate name/plate_id/plateid/plate id + for prefix in prefix_list: + d[f'{prefix}_name'] = ['a', 'b', 'c'] + d[f'{prefix} name'] = ['a', 'b', 'c'] + d[f'{prefix}name'] = ['a', 'b', 'c'] + d[f'{prefix}_id'] = [1, 2, 3] + d[f'{prefix} id'] = [1, 2, 3] + d[f'{prefix}id'] = [1, 2, 3] + d[f'{prefix}'] = [1, 2, 3] df = pd.DataFrame(data=d) tmp = tempfile.NamedTemporaryFile() df.to_csv(tmp.name, index=False) header = MetadataControl.detect_headers(tmp.name) expected_header = [ - 's', 's', 'plate', 'well', 's', 's', - 'l', 'dataset', 'l', 'l', 'image', 'roi', - 'l', 'dataset', 'plate', 'well', 'image', 'roi', - 'l', 'd', 's', 'b' - ] + 'l', 'd', 's', 'b', 's', + 's', 's', 's', 'l', 'l', 'l', 'l', + 's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset', + 'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate', + 'well', 'well', 'well', 'l', 'l', 'l', 'well', + 's', 's', 's', 'image', 'image', 'image', 'image', + 's', 's', 's', 'roi', 'roi', 'roi', 'roi' + ] assert header == expected_header From 0cc977f9c4d2309b8712f2ef20ad93cba68295f2 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra 
<86613209+muhanadz@users.noreply.github.com> Date: Fri, 15 Apr 2022 16:00:17 +0100 Subject: [PATCH 26/61] Fixed Title underline too short --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index d8f33849..9f4aa6a8 100644 --- a/README.rst +++ b/README.rst @@ -95,7 +95,7 @@ Examples ^^^^^^^^^ **Project / Dataset** -^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^^^^ To add a table to a Project, the ``CSV`` file needs to specify ``Dataset Name`` and ``Image Name`` or ``Image ID``:: @@ -135,7 +135,7 @@ If the target is a Dataset instead of a Project, the ``Dataset Name`` column is **Screen / Plate** -^^^^^^^^^ +^^^^^^^^^^^^^^^^^^^ To add a table to a Screen, the ``CSV`` file needs to specify ``Plate`` name and ``Well``. If a ``# header`` is specified, column types must be ``well`` and ``plate``:: From b3610bcad0ea9cf58543eec14be9b9fe0f2d5b89 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:05:05 +0100 Subject: [PATCH 27/61] removed double star Co-authored-by: pwalczysko --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 9f4aa6a8..c154563d 100644 --- a/README.rst +++ b/README.rst @@ -72,7 +72,7 @@ The ``CSV`` file must be provided as local file with ``--file path/to/file.csv`` OMERO.tables have defined column types to specify the data-type such as `double` or `long` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn`` -**The default behaviour of the script is to automatically detect these column types from an input ``CSV``**. This behaviour works as folows: +The default behaviour of the script is to ``automatically detect these column types`` from an input ``CSV``. This behaviour works as folows: * Columns named with a supported object-type ('plate', 'well', 'image', 'dataset', or 'roi') or with object_id (e.g. 
'image_id', 'dataset_id') will generate the corresponding column type in the OMERO.table. e.g (ImageColumn, PlateColumn, DatasetColumn, etc) * Other column types will be detected based on the column's data using the pandas library (e.g. column of data type double). From bf24a6b7c1ffabe231e9da83c0e3e0f5b30f35eb Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Mon, 18 Apr 2022 16:14:12 +0100 Subject: [PATCH 28/61] Removed extra backticks --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index c154563d..bfac6f80 100644 --- a/README.rst +++ b/README.rst @@ -70,9 +70,9 @@ object IDs in the ``OMERO.table``. The ``CSV`` file must be provided as local file with ``--file path/to/file.csv``. -OMERO.tables have defined column types to specify the data-type such as `double` or `long` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn`` +OMERO.tables have defined column types to specify the data-type such as ``double`` or ``long`` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn`` -The default behaviour of the script is to ``automatically detect these column types`` from an input ``CSV``. This behaviour works as folows: +The default behaviour of the script is to automatically detect the column types from an input ``CSV``. This behaviour works as folows: * Columns named with a supported object-type ('plate', 'well', 'image', 'dataset', or 'roi') or with object_id (e.g. 'image_id', 'dataset_id') will generate the corresponding column type in the OMERO.table. e.g (ImageColumn, PlateColumn, DatasetColumn, etc) * Other column types will be detected based on the column's data using the pandas library (e.g. column of data type double). 
From fcbc1c8a63fad20aec91560bbc93bc948b87816e Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Thu, 12 May 2022 11:12:27 +0100 Subject: [PATCH 29/61] Added dataset id example --- README.rst | 34 ++++++++++++++++++++++++++++++++-- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index bfac6f80..20fd176a 100644 --- a/README.rst +++ b/README.rst @@ -97,7 +97,7 @@ Examples **Project / Dataset** ^^^^^^^^^^^^^^^^^^^^^^ -To add a table to a Project, the ``CSV`` file needs to specify ``Dataset Name`` +To add a table to a Project, the ``CSV`` file needs to specify ``Dataset Name`` or ``Dataset ID`` and ``Image Name`` or ``Image ID``:: $ omero metadata populate Project:1 --file path/to/project.csv @@ -131,8 +131,38 @@ img-03.png dataset01 0.093 3 TRITC 36640 img-04.png dataset01 0.429 4 Cy5 36641 ========== ============ ======== ============= ============ ===== -If the target is a Dataset instead of a Project, the ``Dataset Name`` column is not needed. +Example using ``Image ID`` and ``Dataset ID``. 
+ +project.csv (manual column types definition):: + + # header image,dataset,d,l,s + image id,Dataset ID,ROI_Area,Channel_Index,Channel_Name + 36638,101,0.0469,1,DAPI + 36639,101,0.142,2,GFP + 36640,101,0.093,3,TRITC + 36641,101,0.429,4,Cy5 + +project.csv (automatic column types detection):: + image id,Dataset ID,ROI_Area,Channel_Index,Channel_Name + 36638,101,0.0469,1,DAPI + 36639,101,0.142,2,GFP + 36640,101,0.093,3,TRITC + 36641,101,0.429,4,Cy5 + +Both manual definition or automatic detection of column types will create an OMERO.table linked to the Project as folows with +a new ``Image`` column with Names: + +===== ======= ======== ============= ============ ========== +Image Dataset ROI_Area Channel_Index Channel_Name Image Name +===== ======= ======== ============= ============ ========== +36638 101 0.0469 1 DAPI img-01.png +36639 101 0.142 2 GFP img-02.png +36640 101 0.093 3 TRITC img-03.png +36641 101 0.429 4 Cy5 img-04.png +===== ======= ======== ============= ============ ========== + +If the target is a Dataset instead of a Project, the ``Dataset Name`` column is not needed. **Screen / Plate** ^^^^^^^^^^^^^^^^^^^ From 6c5500b60b648e3f3f7de86e01af53d18df22bb5 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Thu, 12 May 2022 11:38:28 +0100 Subject: [PATCH 30/61] fixing language --- README.rst | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index 20fd176a..c157d3b5 100644 --- a/README.rst +++ b/README.rst @@ -74,8 +74,8 @@ OMERO.tables have defined column types to specify the data-type such as ``double The default behaviour of the script is to automatically detect the column types from an input ``CSV``. This behaviour works as folows: -* Columns named with a supported object-type ('plate', 'well', 'image', 'dataset', or 'roi') or with object_id (e.g. 'image_id', 'dataset_id') will generate the corresponding column type in the OMERO.table. 
e.g (ImageColumn, PlateColumn, DatasetColumn, etc) -* Other column types will be detected based on the column's data using the pandas library (e.g. column of data type double). +* Columns named with a supported object-type (e.g. 'plate', 'well', 'image', 'dataset', or 'roi'), with _id (e.g. 'image_id', 'dataset_id') or with _name (e.g. 'plate_name', 'dataset_name') will generate the corresponding column type in the OMERO.table (e.g. ImageColumn, PlateColumn, DatasetColumn, etc). +* All other column types will be detected based on the column's data using the pandas library (e.g. columns of data type double will be detected as ``DoubleColumn``). However, it is possible to manually define the column types , ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): @@ -150,7 +150,7 @@ project.csv (automatic column types detection):: 36640,101,0.093,3,TRITC 36641,101,0.429,4,Cy5 -Both manual definition or automatic detection of column types will create an OMERO.table linked to the Project as folows with +The previous example will create an OMERO.table linked to the Project as folows with a new ``Image`` column with Names: ===== ======= ======== ============= ============ ========== @@ -189,7 +189,7 @@ screen.csv (automatic column types detection):: A3,plate01,DMSO,5.5,550,4 B1,plate01,DrugX,12.3,50,44.43 -Similarly, this will create an OMERO.table linked to the Screen, with the +This will create an OMERO.table linked to the Screen, with the ``Well Name`` and ``Plate Name`` columns added and the ``Well`` and ``Plate`` columns used for IDs: From 529445dea8eb5150ca74d7d18532cf5c747bc01e Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Tue, 17 May 2022 14:08:54 +0100 Subject: [PATCH 31/61] Added column type tests --- test/unit/test_automatic_header.py | 182 ++++++++++++++++++++++++++++- 1 file changed, 179 
insertions(+), 3 deletions(-) diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py index 95f71b1f..c01547c4 100644 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py @@ -7,16 +7,20 @@ # file you can find at the root of the distribution bundle. If the file is # missing please request a copy by contacting info@glencoesoftware.com -""" - Test of the default automatic column type detection behaviour -""" +from omero.model import ScreenI, ProjectI +from omero_metadata.populate import HeaderResolver from omero_metadata.cli import MetadataControl import pandas as pd import tempfile +from omero.grid import ImageColumn, LongColumn, PlateColumn, RoiColumn, \ + StringColumn, WellColumn, DoubleColumn, BoolColumn, DatasetColumn def test_detect_headers(): + ''' + Test of the default automatic column type detection behaviour + ''' d = { 'measurement 1': [11, 22, 33], 'measurement 2': [0.1, 0.2, 0.3], @@ -50,3 +54,175 @@ def test_detect_headers(): 's', 's', 's', 'roi', 'roi', 'roi', 'roi' ] assert header == expected_header + + +class TestColumnTypes: + ''' + To test resolved column types and column names. 
+ ''' + def assert_expected( + self, target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names + ): + header_resolver = HeaderResolver( + target_object, column_name, column_types=header_type) + resolved_column_types = header_resolver.create_columns() + for index, col in enumerate(resolved_column_types): + assert col.__class__ == expected_resolved_column_type[index] + assert col.name == expected_resolved_column_names[index] + + def test_plate_name_well_name(self): + column_name = [ + 'plate_name', 'well_name', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['plate', 'well', 'l', 'd', 's', 'b'] + + # We expect populate to append 'Plate Name' and 'Well Name' at the end + expected_resolved_column_names = [ + 'Plate', 'Well', 'measurement 1', 'measurement 2', 'measurement 3', + 'measurement 4', 'Plate Name', 'Well Name'] + + expected_resolved_column_type = [ + PlateColumn, WellColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, + StringColumn, StringColumn] + + target_object = ScreenI(0, None) # Target object is Screen + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_plate_id_well_id(self): + column_name = [ + 'plate_id', 'well_id', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + # plate_id = 'l' since 'plate' header type is not supported for plateid + header_type = ['l', 'l', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'plate_id', 'well_id', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4'] + + expected_resolved_column_type = [ + LongColumn, LongColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn] + + target_object = ScreenI(0, None) # Target object is Screen + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def 
test_plate_well(self): + column_name = [ + 'plate', 'well', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + # plate_id = 'l' since 'plate' header type is not supported for plateid + header_type = ['plate', 'well', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'Plate', 'Well', 'measurement 1', 'measurement 2', 'measurement 3', + 'measurement 4', 'Plate Name', 'Well Name'] + + expected_resolved_column_type = [ + PlateColumn, WellColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, + StringColumn, StringColumn] + + target_object = ScreenI(0, None) # Target object is Screen + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_dataset_name_image_name(self): + ''' + In the case column name is 'Image Name' (case sensitive), + specific behaviour is executed. + ''' + column_name = [ + 'dataset_name', 'Image Name', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['s', 's', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'dataset_name', 'Image Name', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4', 'Image'] + + expected_resolved_column_type = [ + StringColumn, StringColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, ImageColumn] + + target_object = ProjectI(0, None) # Target object is Project + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_dataset_id_image_id(self): + column_name = [ + 'dataset_id', 'image_id', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['dataset', 'image', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'Dataset', 'Image', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4', 'Image Name'] + + expected_resolved_column_type = [ + DatasetColumn, ImageColumn, + 
LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn] + + target_object = ProjectI(0, None) # Target object is Project + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_dataset_image(self): + column_name = [ + 'dataset', 'image', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['dataset', 'image', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'Dataset', 'Image', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4', 'Image Name', ] + + expected_resolved_column_type = [ + DatasetColumn, ImageColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn] + + target_object = ProjectI(0, None) # Target object is Project + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) + + def test_roi(self): + column_name = [ + 'image', 'roi', 'measurement 1', + 'measurement 2', 'measurement 3', 'measurement 4'] + + header_type = ['image', 'roi', 'l', 'd', 's', 'b'] + + expected_resolved_column_names = [ + 'Image', 'Roi', 'measurement 1', 'measurement 2', + 'measurement 3', 'measurement 4', 'Image Name', 'Roi Name'] + + expected_resolved_column_type = [ + ImageColumn, RoiColumn, + LongColumn, DoubleColumn, StringColumn, BoolColumn, + StringColumn, StringColumn] + + target_object = ProjectI(0, None) # Target object is Project + + self.assert_expected( + target_object, column_name, header_type, + expected_resolved_column_type, expected_resolved_column_names) From 30c22fe4afaddca7ab82162793a8d661b7bf57fb Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Tue, 17 May 2022 14:48:14 +0100 Subject: [PATCH 32/61] Updated wording. Added documented table. Removed some of the complex wording and replaced it with a table in a form of documentation. 
--- README.rst | 88 +++++++++++++++++++++++++++++------------------------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/README.rst b/README.rst index c157d3b5..945c9993 100644 --- a/README.rst +++ b/README.rst @@ -64,7 +64,7 @@ populate -------- This command creates an ``OMERO.table`` (bulk annotation) from a ``CSV`` file and links -the table as a ``File Annotation`` to a parent container such as Screen, Plate, Project +the table as a ``File Annotation`` to a parent container such as Screen, Plate, Project, Dataset or Image. It also attempts to convert Image, Well or ROI names from the ``CSV`` into object IDs in the ``OMERO.table``. @@ -72,13 +72,45 @@ The ``CSV`` file must be provided as local file with ``--file path/to/file.csv`` OMERO.tables have defined column types to specify the data-type such as ``double`` or ``long`` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn`` -The default behaviour of the script is to automatically detect the column types from an input ``CSV``. This behaviour works as folows: - -* Columns named with a supported object-type (e.g. 'plate', 'well', 'image', 'dataset', or 'roi'), with _id (e.g. 'image_id', 'dataset_id') or with _name (e.g. 'plate_name', 'dataset_name') will generate the corresponding column type in the OMERO.table (e.g. ImageColumn, PlateColumn, DatasetColumn, etc). -* All other column types will be detected based on the column's data using the pandas library (e.g. columns of data type double will be detected as ``DoubleColumn``). - - -However, it is possible to manually define the column types , ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): +The default behaviour of the script is to automatically detect the column types from an input ``CSV``. 
This behaviour works as follows: + +* Columns named with a supported object-type (e.g. ``plate``, ``well``, ``image``, ``dataset``, or ``roi``), with ``_id`` or ``_name`` will generate the corresponding column type in the OMERO.table. See table below for full list of supported column names. + +============ ================= ==================== ================================== +Column Name Column type Detected Header Type Notes +============ ================= ==================== ================================== +Image ``ImageColumn`` ``image`` Appends 'Image Name' column +Image Name ``StringColumn`` ``s`` Appends 'Image' column +Image ID ``ImageColumn`` ``image`` Appends 'Image Name' column +Dataset ``DatasetColumn`` ``dataset`` \- +Dataset Name ``StringColumn`` ``s`` \- +Dataset ID ``DatasetColumn`` ``dataset`` \- +Plate ``PlateColumn`` ``plate`` Adds 'Plate' column +Plate Name ``PlateColumn`` ``plate`` Adds 'Plate' column +Plate ID ``LongColumn`` ``l`` \- +Well ``WellColumn`` ``well`` Adds 'Well' column +Well Name ``WellColumn`` ``well`` Adds 'Well' column +Well ID ``LongColumn`` ``l`` \- +ROI ``RoiColumn`` ``roi`` Appends 'ROI Name' column +ROI Name ``StringColumn`` ``s`` \- +ROI ID ``RoiColumn`` ``roi`` Appends 'ROI Name' column +============ ================= ==================== ================================== + +Note: Column names are case insensitive. Space, nospace, and underscore are all accepted as seperaters for column names (i.e. `` name``/`` id```, ``name``/``id``, ``_name``/``_id`` are all accepted) + +* All other column types will be detected based on the column's data using the pandas library. See table below. 
+ +=============== ================= ==================== +Column Name Column type Detected Header Type +=============== ================= ==================== +Example String ``StringColumn`` ``s`` +Example Long ``LongColumn`` ``l`` +Example Float ``DoubleColumn`` ``d`` +Example boolean ``BoolColumn`` ``b`` +=============== ================= ==================== + + +However, it is possible to manually define the header types, ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): - ``d``: ``DoubleColumn``, for floating point numbers - ``l``: ``LongColumn``, for integer numbers @@ -110,16 +142,10 @@ project.csv (manual column types definition):: img-02.png,dataset01,0.142,2,GFP img-03.png,dataset01,0.093,3,TRITC img-04.png,dataset01,0.429,4,Cy5 + +Note: Remove ``# header`` row for automatic column types detection. -project.csv (automatic column types detection):: - - Image Name,Dataset Name,ROI_Area,Channel_Index,Channel_Name - img-01.png,dataset01,0.0469,1,DAPI - img-02.png,dataset01,0.142,2,GFP - img-03.png,dataset01,0.093,3,TRITC - img-04.png,dataset01,0.429,4,Cy5 - -Both manual definition or automatic detection of column types will create an OMERO.table linked to the Project as folows with +Both manual definition or automatic detection of column types will create an OMERO.table linked to the Project as follows with a new ``Image`` column with IDs: ========== ============ ======== ============= ============ ===== @@ -141,16 +167,10 @@ project.csv (manual column types definition):: 36639,101,0.142,2,GFP 36640,101,0.093,3,TRITC 36641,101,0.429,4,Cy5 - -project.csv (automatic column types detection):: - image id,Dataset ID,ROI_Area,Channel_Index,Channel_Name - 36638,101,0.0469,1,DAPI - 36639,101,0.142,2,GFP - 36640,101,0.093,3,TRITC - 36641,101,0.429,4,Cy5 +Note: Remove ``# header`` row for automatic 
column types detection. -The previous example will create an OMERO.table linked to the Project as folows with +The previous example will create an OMERO.table linked to the Project as follows with a new ``Image`` column with Names: ===== ======= ======== ============= ============ ========== @@ -181,13 +201,7 @@ screen.csv (manual column types definition):: A3,plate01,DMSO,5.5,550,4 B1,plate01,DrugX,12.3,50,44.43 -screen.csv (automatic column types detection):: - - Well,Plate,Drug,Concentration,Cell_Count,Percent_Mitotic - A1,plate01,DMSO,10.1,10,25.4 - A2,plate01,DMSO,0.1,1000,2.54 - A3,plate01,DMSO,5.5,550,4 - B1,plate01,DrugX,12.3,50,44.43 +Note: Remove ``# header`` row for automatic column types detection. This will create an OMERO.table linked to the Screen, with the ``Well Name`` and ``Plate Name`` columns added and the ``Well`` and @@ -231,15 +245,7 @@ image.csv (manual column types definition):: 503,1069,4,0.8,400 503,1070,5,0.5,200 - -image.csv (automatic column types detection):: - - Roi,shape,object,probability,area - 501,1066,1,0.8,250 - 502,1067,2,0.9,500 - 503,1068,3,0.2,25 - 503,1069,4,0.8,400 - 503,1070,5,0.5,200 +Note: Remove ``# header`` row for automatic column types detection. 
This will create an OMERO.table linked to the Image like this: From 397150ee156227ebf3b998313a3cb4083c6c99e3 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Tue, 17 May 2022 14:53:56 +0100 Subject: [PATCH 33/61] removed duplicate comment --- test/unit/test_automatic_header.py | 1 - 1 file changed, 1 deletion(-) diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py index c01547c4..39b92e88 100644 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py @@ -120,7 +120,6 @@ def test_plate_well(self): 'plate', 'well', 'measurement 1', 'measurement 2', 'measurement 3', 'measurement 4'] - # plate_id = 'l' since 'plate' header type is not supported for plateid header_type = ['plate', 'well', 'l', 'd', 's', 'b'] expected_resolved_column_names = [ From 9db1280527b460a30da26f6cca345a3d4cdd7677 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Wed, 25 May 2022 16:28:52 +0100 Subject: [PATCH 34/61] Fixed formatting and concise wording --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 945c9993..bd1816d8 100644 --- a/README.rst +++ b/README.rst @@ -36,7 +36,7 @@ conflicts when importing the Python module. 
Usage ===== -The plugin is called from the command-line using the `omero` command:: +The plugin is called from the command-line using the ``omero metadata`` command:: $ omero metadata From 2e6e7cf76f9223ba9d9eb1b935df92c277010c32 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Wed, 25 May 2022 16:30:33 +0100 Subject: [PATCH 35/61] Consistent wording --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index bd1816d8..4c1842f0 100644 --- a/README.rst +++ b/README.rst @@ -74,7 +74,7 @@ OMERO.tables have defined column types to specify the data-type such as ``double The default behaviour of the script is to automatically detect the column types from an input ``CSV``. This behaviour works as follows: -* Columns named with a supported object-type (e.g. ``plate``, ``well``, ``image``, ``dataset``, or ``roi``), with ``_id`` or ``_name`` will generate the corresponding column type in the OMERO.table. See table below for full list of supported column names. +* Columns named with a supported object-type (e.g. ``plate``, ``well``, ``image``, ``dataset``, or ``roi``), with `` id`` or `` name`` will generate the corresponding column type in the OMERO.table. See table below for full list of supported column names. ============ ================= ==================== ================================== Column Name Column type Detected Header Type Notes From 42ca428587413aa137fab3f2afe5252f675f6658 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Wed, 25 May 2022 17:11:56 +0100 Subject: [PATCH 36/61] Reversed the paradigm: the examples now show the default behavior instead of the manual selection. 
--- README.rst | 35 ++++++++++++++++++++--------------- 1 file changed, 20 insertions(+), 15 deletions(-) diff --git a/README.rst b/README.rst index 4c1842f0..4504bb6b 100644 --- a/README.rst +++ b/README.rst @@ -123,9 +123,12 @@ Automatic header detection can also be ignored if using the ``--manual_headers`` NB: Column names should not contain spaces if you want to be able to query by these columns. + Examples ^^^^^^^^^ +The examples below will use the default automatic column types detection behaviour. It is possible to achieve the same results (or a different desired result) by manually adding a custom ``# header`` row at the top of the CSV. + **Project / Dataset** ^^^^^^^^^^^^^^^^^^^^^^ @@ -134,18 +137,18 @@ and ``Image Name`` or ``Image ID``:: $ omero metadata populate Project:1 --file path/to/project.csv -project.csv (manual column types definition):: +Using ``Image Name`` and ``Dataset Name``: + +project.csv:: - # header s,s,d,l,s Image Name,Dataset Name,ROI_Area,Channel_Index,Channel_Name img-01.png,dataset01,0.0469,1,DAPI img-02.png,dataset01,0.142,2,GFP img-03.png,dataset01,0.093,3,TRITC img-04.png,dataset01,0.429,4,Cy5 -Note: Remove ``# header`` row for automatic column types detection. -Both manual definition or automatic detection of column types will create an OMERO.table linked to the Project as follows with +The previous example will create an OMERO.table linked to the Project as follows with a new ``Image`` column with IDs: ========== ============ ======== ============= ============ ===== @@ -157,19 +160,19 @@ img-03.png dataset01 0.093 3 TRITC 36640 img-04.png dataset01 0.429 4 Cy5 36641 ========== ============ ======== ============= ============ ===== -Example using ``Image ID`` and ``Dataset ID``. +Note: equivalent to adding ``# header s,s,d,l,s`` row to the top of the ``project.csv`` for manual definition. 
+ +Using ``Image ID`` and ``Dataset ID``: -project.csv (manual column types definition):: +project.csv:: - # header image,dataset,d,l,s image id,Dataset ID,ROI_Area,Channel_Index,Channel_Name 36638,101,0.0469,1,DAPI 36639,101,0.142,2,GFP 36640,101,0.093,3,TRITC 36641,101,0.429,4,Cy5 -Note: Remove ``# header`` row for automatic column types detection. - + The previous example will create an OMERO.table linked to the Project as follows with a new ``Image`` column with Names: @@ -184,6 +187,8 @@ Image Dataset ROI_Area Channel_Index Channel_Name Image Name If the target is a Dataset instead of a Project, the ``Dataset Name`` column is not needed. +Note: equivalent to adding ``# header image,dataset,d,l,s`` row to the top of the ``project.csv`` for manual definition. + **Screen / Plate** ^^^^^^^^^^^^^^^^^^^ @@ -192,16 +197,14 @@ If a ``# header`` is specified, column types must be ``well`` and ``plate``:: $ omero metadata populate Screen:1 --file path/to/screen.csv -screen.csv (manual column types definition):: +screen.csv:: - # header well,plate,s,d,l,d Well,Plate,Drug,Concentration,Cell_Count,Percent_Mitotic A1,plate01,DMSO,10.1,10,25.4 A2,plate01,DMSO,0.1,1000,2.54 A3,plate01,DMSO,5.5,550,4 B1,plate01,DrugX,12.3,50,44.43 -Note: Remove ``# header`` row for automatic column types detection. This will create an OMERO.table linked to the Screen, with the ``Well Name`` and ``Plate Name`` columns added and the ``Well`` and @@ -218,6 +221,8 @@ Well Plate Drug Concentration Cell_Count Percent_Mitotic Well Name Plat If the target is a Plate instead of a Screen, the ``Plate`` column is not needed. +Note: equivalent to adding ``# header well,plate,s,d,l,d`` row to the top of the ``screen.csv`` for manual definition. 
+ **ROIs** ^^^^^^^^^ @@ -235,9 +240,8 @@ In this case, it is required that ROIs on the Image in OMERO have the ``Name`` a $ omero metadata populate Image:1 --file path/to/image.csv -image.csv (manual column types definition):: +image.csv:: - # header roi,l,l,d,l Roi,shape,object,probability,area 501,1066,1,0.8,250 502,1067,2,0.9,500 @@ -245,7 +249,6 @@ image.csv (manual column types definition):: 503,1069,4,0.8,400 503,1070,5,0.5,200 -Note: Remove ``# header`` row for automatic column types detection. This will create an OMERO.table linked to the Image like this: @@ -259,6 +262,8 @@ Roi shape object probability area Roi Name 503 1070 5 0.5 200 Sample3 === ===== ====== =========== ==== ======== +Note: equivalent to adding ``# header roi,l,l,d,l`` row to the top of the ``image.csv`` for manual definition. + Note that the ROI-level data from an ``OMERO.table`` is not visible in the OMERO.web UI right-hand panel under the ``Tables`` tab, but the table can be visualized by clicking the "eye" on the bulk annotation attachment on the Image. 
From d74b318d710c7ed277679901c53c2ec00f9153fd Mon Sep 17 00:00:00 2001 From: Muhanad Zahra Date: Mon, 30 May 2022 11:53:34 +0100 Subject: [PATCH 37/61] updated comment in test for clarity --- test/unit/test_automatic_header.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py index 39b92e88..3e553525 100644 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py @@ -88,7 +88,7 @@ def test_plate_name_well_name(self): LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn, StringColumn] - target_object = ScreenI(0, None) # Target object is Screen + target_object = ScreenI(0, None) # Target is agnostic self.assert_expected( target_object, column_name, header_type, expected_resolved_column_type, expected_resolved_column_names) @@ -109,7 +109,7 @@ def test_plate_id_well_id(self): LongColumn, LongColumn, LongColumn, DoubleColumn, StringColumn, BoolColumn] - target_object = ScreenI(0, None) # Target object is Screen + target_object = ScreenI(0, None) # Target is agnostic self.assert_expected( target_object, column_name, header_type, @@ -131,7 +131,7 @@ def test_plate_well(self): LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn, StringColumn] - target_object = ScreenI(0, None) # Target object is Screen + target_object = ScreenI(0, None) # Target is agnostic self.assert_expected( target_object, column_name, header_type, @@ -156,7 +156,7 @@ def test_dataset_name_image_name(self): StringColumn, StringColumn, LongColumn, DoubleColumn, StringColumn, BoolColumn, ImageColumn] - target_object = ProjectI(0, None) # Target object is Project + target_object = ProjectI(0, None) # Target is agnostic self.assert_expected( target_object, column_name, header_type, @@ -177,7 +177,7 @@ def test_dataset_id_image_id(self): DatasetColumn, ImageColumn, LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn] - target_object = 
ProjectI(0, None) # Target object is Project + target_object = ProjectI(0, None) # Target is agnostic self.assert_expected( target_object, column_name, header_type, @@ -198,7 +198,7 @@ def test_dataset_image(self): DatasetColumn, ImageColumn, LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn] - target_object = ProjectI(0, None) # Target object is Project + target_object = ProjectI(0, None) # Target is agnostic self.assert_expected( target_object, column_name, header_type, @@ -220,7 +220,7 @@ def test_roi(self): LongColumn, DoubleColumn, StringColumn, BoolColumn, StringColumn, StringColumn] - target_object = ProjectI(0, None) # Target object is Project + target_object = ProjectI(0, None) # Target is agnostic self.assert_expected( target_object, column_name, header_type, From b36691eacad2463b813b017d551259cc99392458 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Wed, 1 Jun 2022 23:12:51 +0100 Subject: [PATCH 38/61] Moved column name space warning earlier --- README.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/README.rst b/README.rst index 4504bb6b..96fe7c0b 100644 --- a/README.rst +++ b/README.rst @@ -98,6 +98,8 @@ ROI ID ``RoiColumn`` ``roi`` Appends 'ROI Name' column Note: Column names are case insensitive. Space, nospace, and underscore are all accepted as seperaters for column names (i.e. `` name``/`` id```, ``name``/``id``, ``_name``/``_id`` are all accepted) +NB: Column names should not contain spaces if you want to be able to query by these columns. + * All other column types will be detected based on the column's data using the pandas library. See table below. =============== ================= ==================== @@ -120,9 +122,6 @@ However, it is possible to manually define the header types, ignoring the automa Automatic header detection can also be ignored if using the ``--manual_headers`` flag. 
If the ``# header`` is not present and this flag is used, column types will default to ``String`` (unless the column names correspond to OMERO objects such as ``image`` or ``plate``). -NB: Column names should not contain spaces if you want to be able to query -by these columns. - Examples ^^^^^^^^^ From 29a81030ddabbf628c9e2bcdefb4b5d80d47ea46 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Wed, 1 Jun 2022 23:20:48 +0100 Subject: [PATCH 39/61] Moved paragraph to avoid confusion --- README.rst | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.rst b/README.rst index 96fe7c0b..b3acdb38 100644 --- a/README.rst +++ b/README.rst @@ -231,11 +231,7 @@ If there is an ``roi`` column (header type ``roi``) containing ROI IDs, an ``Roi column will be appended automatically (see example below). If a column of Shape IDs named ``shape`` of type ``l`` is included, the Shape IDs will be validated (and set to -1 if invalid). Also if an ``image`` column of Image IDs is included, an ``Image Name`` column will be added. -NB: Columns of type ``shape`` aren't yet supported on the OMERO.server. - -Alternatively, if the target is an Image, the ROI input column can be -``Roi Name`` (with type ``s``), and an ``roi`` type column will be appended containing ROI IDs. -In this case, it is required that ROIs on the Image in OMERO have the ``Name`` attribute set:: +NB: Columns of type ``shape`` aren't yet supported on the OMERO.server:: $ omero metadata populate Image:1 --file path/to/image.csv @@ -263,6 +259,10 @@ Roi shape object probability area Roi Name Note: equivalent to adding ``# header roi,l,l,d,l`` row to the top of the ``image.csv`` for manual definition. +Alternatively, if the target is an Image, the ROI input column can be +``Roi Name`` (with type ``s``), and an ``roi`` type column will be appended containing ROI IDs. 
+In this case, it is required that ROIs on the Image in OMERO have the ``Name`` attribute set. + Note that the ROI-level data from an ``OMERO.table`` is not visible in the OMERO.web UI right-hand panel under the ``Tables`` tab, but the table can be visualized by clicking the "eye" on the bulk annotation attachment on the Image. From 08bd5d842cb5c027aa821f88cd306e9ee54b878c Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Wed, 1 Jun 2022 23:44:11 +0100 Subject: [PATCH 40/61] Updated README --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index b3acdb38..469324b1 100644 --- a/README.rst +++ b/README.rst @@ -173,7 +173,7 @@ project.csv:: The previous example will create an OMERO.table linked to the Project as follows with -a new ``Image`` column with Names: +a new ``Image Name`` column with Names: ===== ======= ======== ============= ============ ========== Image Dataset ROI_Area Channel_Index Channel_Name Image Name @@ -285,4 +285,4 @@ licensed under the terms of the GNU General Public License (GPL) v2 or later. 
Copyright --------- -2018-2021, The Open Microscopy Environment +2018-2022, The Open Microscopy Environment and Glencoe Software, Inc From f8bcffdb02e62f1a127193824f95b866a9178d99 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Wed, 1 Jun 2022 23:48:34 +0100 Subject: [PATCH 41/61] Updated supported object-type table --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 469324b1..a9415b00 100644 --- a/README.rst +++ b/README.rst @@ -92,7 +92,7 @@ Well ``WellColumn`` ``well`` Adds 'Well' column Well Name ``WellColumn`` ``well`` Adds 'Well' column Well ID ``LongColumn`` ``l`` \- ROI ``RoiColumn`` ``roi`` Appends 'ROI Name' column -ROI Name ``StringColumn`` ``s`` \- +ROI Name ``StringColumn`` ``s`` Appends 'ROI' column ROI ID ``RoiColumn`` ``roi`` Appends 'ROI Name' column ============ ================= ==================== ================================== From 9c346f5924e9a04effcf5a6b732cfb2bd52f5377 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Wed, 1 Jun 2022 23:51:23 +0100 Subject: [PATCH 42/61] Fixed spelling Co-authored-by: pwalczysko --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index a9415b00..b6a8be62 100644 --- a/README.rst +++ b/README.rst @@ -96,7 +96,7 @@ ROI Name ``StringColumn`` ``s`` Appends 'ROI' column ROI ID ``RoiColumn`` ``roi`` Appends 'ROI Name' column ============ ================= ==================== ================================== -Note: Column names are case insensitive. Space, nospace, and underscore are all accepted as seperaters for column names (i.e. `` name``/`` id```, ``name``/``id``, ``_name``/``_id`` are all accepted) +Note: Column names are case insensitive. Space, no space, and underscore are all accepted as separators for column names (i.e. 
`` name``/`` id```, ``name``/``id``, ``_name``/``_id`` are all accepted) NB: Column names should not contain spaces if you want to be able to query by these columns. From 12cac5b5f3b560443099d2b9a94dc8e5cffdf323 Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Thu, 2 Jun 2022 00:00:40 +0100 Subject: [PATCH 43/61] Updated alternative target object explanation --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index b6a8be62..bda9d143 100644 --- a/README.rst +++ b/README.rst @@ -184,10 +184,10 @@ Image Dataset ROI_Area Channel_Index Channel_Name Image Name 36641 101 0.429 4 Cy5 img-04.png ===== ======= ======== ============= ============ ========== -If the target is a Dataset instead of a Project, the ``Dataset Name`` column is not needed. - Note: equivalent to adding ``# header image,dataset,d,l,s`` row to the top of the ``project.csv`` for manual definition. +For both examples above, alternatively, if the target is a Dataset instead of a Project, the ``Dataset`` or ``Dataset Name`` column is not needed. + **Screen / Plate** ^^^^^^^^^^^^^^^^^^^ From 04eda4ef665681a3ee54b03bceed65d3291505af Mon Sep 17 00:00:00 2001 From: Muhanad Zahra <86613209+muhanadz@users.noreply.github.com> Date: Thu, 2 Jun 2022 00:29:28 +0100 Subject: [PATCH 44/61] Added more info to object-types table --- README.rst | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/README.rst b/README.rst index bda9d143..ad15a5df 100644 --- a/README.rst +++ b/README.rst @@ -76,25 +76,25 @@ The default behaviour of the script is to automatically detect the column types * Columns named with a supported object-type (e.g. ``plate``, ``well``, ``image``, ``dataset``, or ``roi``), with `` id`` or `` name`` will generate the corresponding column type in the OMERO.table. See table below for full list of supported column names. 
-============ ================= ==================== ================================== +============ ================= ==================== ==================================================================== Column Name Column type Detected Header Type Notes -============ ================= ==================== ================================== -Image ``ImageColumn`` ``image`` Appends 'Image Name' column -Image Name ``StringColumn`` ``s`` Appends 'Image' column -Image ID ``ImageColumn`` ``image`` Appends 'Image Name' column -Dataset ``DatasetColumn`` ``dataset`` \- -Dataset Name ``StringColumn`` ``s`` \- -Dataset ID ``DatasetColumn`` ``dataset`` \- -Plate ``PlateColumn`` ``plate`` Adds 'Plate' column -Plate Name ``PlateColumn`` ``plate`` Adds 'Plate' column -Plate ID ``LongColumn`` ``l`` \- -Well ``WellColumn`` ``well`` Adds 'Well' column -Well Name ``WellColumn`` ``well`` Adds 'Well' column -Well ID ``LongColumn`` ``l`` \- -ROI ``RoiColumn`` ``roi`` Appends 'ROI Name' column -ROI Name ``StringColumn`` ``s`` Appends 'ROI' column -ROI ID ``RoiColumn`` ``roi`` Appends 'ROI Name' column -============ ================= ==================== ================================== +============ ================= ==================== ==================================================================== +Image ``ImageColumn`` ``image`` Accepts image IDs. Appends new 'Image Name' column with image names. +Image Name ``StringColumn`` ``s`` Accepts image names. Appends new 'Image' column with image IDs. +Image ID ``ImageColumn`` ``image`` Accepts image IDs. Appends new 'Image Name' column with image names. +Dataset ``DatasetColumn`` ``dataset`` Accepts dataset IDs. +Dataset Name ``StringColumn`` ``s`` Accepts dataset names. +Dataset ID ``DatasetColumn`` ``dataset`` Accepts dataset IDs. +Plate ``PlateColumn`` ``plate`` Accepts plate names. Adds new 'Plate' column with plate IDs. +Plate Name ``PlateColumn`` ``plate`` Accepts plate names. Adds new 'Plate' column with plate IDs. 
+Plate ID ``LongColumn`` ``l`` Accepts plate IDs. +Well ``WellColumn`` ``well`` Accepts well names. Adds new 'Well' column with well IDs. +Well Name ``WellColumn`` ``well`` Accepts well names. Adds new 'Well' column with well IDs. +Well ID ``LongColumn`` ``l`` Accepts well IDs. +ROI ``RoiColumn`` ``roi`` Accepts ROI IDs. Appends new 'ROI Name' column with ROI names. +ROI Name ``StringColumn`` ``s`` Accepts ROI names. Appends new 'ROI' column with ROI IDs. +ROI ID ``RoiColumn`` ``roi`` Accepts ROI IDs. Appends new 'ROI Name' column with ROI names. +============ ================= ==================== ==================================================================== Note: Column names are case insensitive. Space, no space, and underscore are all accepted as separators for column names (i.e. `` name``/`` id```, ``name``/``id``, ``_name``/``_id`` are all accepted) From 03ebc18b4aa19696241f9d5af233d6fa7079fcb8 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 14:20:14 +0100 Subject: [PATCH 45/61] Convert detect_headers unit tests into class --- test/unit/test_automatic_header.py | 86 +++++++++++++++++------------- 1 file changed, 49 insertions(+), 37 deletions(-) mode change 100644 => 100755 test/unit/test_automatic_header.py diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py old mode 100644 new mode 100755 index 3e553525..67d14975 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py @@ -17,43 +17,55 @@ StringColumn, WellColumn, DoubleColumn, BoolColumn, DatasetColumn -def test_detect_headers(): - ''' - Test of the default automatic column type detection behaviour - ''' - d = { - 'measurement 1': [11, 22, 33], - 'measurement 2': [0.1, 0.2, 0.3], - 'measurement 3': ['a', 'b', 'c'], - 'measurement 4': [True, True, False], - 'measurement 5': [11, 0.1, True] - } - prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ] - # Create a dictionary with every combination of 
headers - # eg plate_name/platename/plate name/plate_id/plateid/plate id - for prefix in prefix_list: - d[f'{prefix}_name'] = ['a', 'b', 'c'] - d[f'{prefix} name'] = ['a', 'b', 'c'] - d[f'{prefix}name'] = ['a', 'b', 'c'] - d[f'{prefix}_id'] = [1, 2, 3] - d[f'{prefix} id'] = [1, 2, 3] - d[f'{prefix}id'] = [1, 2, 3] - d[f'{prefix}'] = [1, 2, 3] - - df = pd.DataFrame(data=d) - tmp = tempfile.NamedTemporaryFile() - df.to_csv(tmp.name, index=False) - header = MetadataControl.detect_headers(tmp.name) - expected_header = [ - 'l', 'd', 's', 'b', 's', - 's', 's', 's', 'l', 'l', 'l', 'l', - 's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset', - 'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate', - 'well', 'well', 'well', 'l', 'l', 'l', 'well', - 's', 's', 's', 'image', 'image', 'image', 'image', - 's', 's', 's', 'roi', 'roi', 'roi', 'roi' - ] - assert header == expected_header +class TestDetectHeaders: + """Test the MetadataControl.detect_headers API""" + def assert_detect_headers(self): + df = pd.DataFrame(data=self.d) + tmp = tempfile.NamedTemporaryFile() + df.to_csv(tmp.name, index=False) + header = MetadataControl.detect_headers(tmp.name) + assert header == self.expected_header + + def create_objects_dictionary(self): + # Create a dictionary with every combination of headers + # eg plate_name/platename/plate name/plate_id/plateid/plate id + self.d = {} + prefix_list = ['project', 'dataset', 'plate', 'well', 'image', 'roi', ] + for prefix in prefix_list: + self.d[f'{prefix}_name'] = ['a', 'b', 'c'] + self.d[f'{prefix} name'] = ['a', 'b', 'c'] + self.d[f'{prefix}name'] = ['a', 'b', 'c'] + self.d[f'{prefix}_id'] = [1, 2, 3] + self.d[f'{prefix} id'] = [1, 2, 3] + self.d[f'{prefix}id'] = [1, 2, 3] + self.d[f'{prefix}'] = [1, 2, 3] + self.expected_header = [ + 's', 's', 's', 'l', 'l', 'l', 'l', + 's', 's', 's', 'dataset', 'dataset', 'dataset', 'dataset', + 'plate', 'plate', 'plate', 'l', 'l', 'l', 'plate', + 'well', 'well', 'well', 'l', 'l', 'l', 'well', + 's', 's', 's', 
'image', 'image', 'image', 'image', + 's', 's', 's', 'roi', 'roi', 'roi', 'roi' + ] + + def test_objects_columns(self): + self.create_objects_dictionary() + self.assert_detect_headers() + + def test_dense_extra_columns(self): + ''' + Test of the default automatic column type detection behaviour + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, 22, 33], + 'measurement 2': [0.1, 0.2, 0.3], + 'measurement 3': ['a', 'b', 'c'], + 'measurement 4': [True, True, False], + 'measurement 5': [11, 0.1, True] + }) + self.expected_header.extend(['l', 'd', 's', 'b', 's']) + self.assert_detect_headers() class TestColumnTypes: From 7ff94a2a6ee3b0bea8629af9beb8cc3ff8a92d79 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 14:32:36 +0100 Subject: [PATCH 46/61] Allow MetadataControl.detect_header to control default NA --- src/omero_metadata/cli.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index edce60b6..54c57422 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -489,7 +489,7 @@ def testtables(self, args): self.ctx.die(100, "Failed to initialize Table") @staticmethod - def detect_headers(csv_path): + def detect_headers(csv_path, keep_default_na=True): ''' Function to automatically detect headers from a CSV file. 
This function loads the table to pandas to detects the column type and match headers @@ -497,7 +497,7 @@ def detect_headers(csv_path): conserved_headers = ['well', 'plate', 'image', 'dataset', 'roi'] headers = [] - table = pd.read_csv(csv_path) + table = pd.read_csv(csv_path, keep_default_na=keep_default_na) col_types = table.dtypes.values.tolist() cols = list(table.columns) From 31d562bd0f4e7d09ee3a5329438be6f061580a7e Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 14:49:18 +0100 Subject: [PATCH 47/61] Add unit tests for the detection of sparse columns --- test/unit/test_automatic_header.py | 34 +++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/test/unit/test_automatic_header.py b/test/unit/test_automatic_header.py index 67d14975..2fd0bcc7 100755 --- a/test/unit/test_automatic_header.py +++ b/test/unit/test_automatic_header.py @@ -19,11 +19,11 @@ class TestDetectHeaders: """Test the MetadataControl.detect_headers API""" - def assert_detect_headers(self): + def assert_detect_headers(self, **kwargs): df = pd.DataFrame(data=self.d) tmp = tempfile.NamedTemporaryFile() df.to_csv(tmp.name, index=False) - header = MetadataControl.detect_headers(tmp.name) + header = MetadataControl.detect_headers(tmp.name, **kwargs) assert header == self.expected_header def create_objects_dictionary(self): @@ -52,7 +52,7 @@ def test_objects_columns(self): self.create_objects_dictionary() self.assert_detect_headers() - def test_dense_extra_columns(self): + def test_dense_columns(self): ''' Test of the default automatic column type detection behaviour ''' @@ -67,6 +67,34 @@ def test_dense_extra_columns(self): self.expected_header.extend(['l', 'd', 's', 'b', 's']) self.assert_detect_headers() + def test_sparse_default_na(self): + ''' + Test default handling of missing values + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, None, 33], + 'measurement 2': [0.1, 0.2, None], + 'measurement 3': 
['a', 'b', None], + 'measurement 4': [True, None, False], + }) + self.expected_header.extend(['d', 'd', 's', 's']) + self.assert_detect_headers(keep_default_na=True) + + def test_sparse_no_default_na(self): + ''' + Test handling of missing values as string columns + ''' + self.create_objects_dictionary() + self.d.update({ + 'measurement 1': [11, None, 33], + 'measurement 2': [0.1, 0.2, None], + 'measurement 3': ['a', 'b', None], + 'measurement 4': [True, None, False], + }) + self.expected_header.extend(['s', 's', 's', 's']) + self.assert_detect_headers(keep_default_na=False) + class TestColumnTypes: ''' From 1f637bd0f73a2023112d4e87786b7c4432b1c298 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 15:05:47 +0100 Subject: [PATCH 48/61] Use --allow_nan flag in MetadataControl.detect_headers From 1ffac2ae5d4d9e8b12f3761ad486587b317ffa79 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 15:21:25 +0100 Subject: [PATCH 49/61] Allow fixtures to override the value assertion with their expectations --- test/integration/metadata/test_populate.py | 27 +++++++++++++--------- 1 file changed, 16 insertions(+), 11 deletions(-) mode change 100644 => 100755 test/integration/metadata/test_populate.py diff --git a/test/integration/metadata/test_populate.py b/test/integration/metadata/test_populate.py old mode 100644 new mode 100755 index 619bc968..42af38e7 --- a/test/integration/metadata/test_populate.py +++ b/test/integration/metadata/test_populate.py @@ -175,6 +175,13 @@ def assert_columns(self, columns): col_names = "Well,Well Type,Concentration,Well Name" assert col_names == ",".join([c.name for c in columns]) + def assert_values(self, row_values): + # Unsure where the lower-casing is happening + if "A1" in row_values or "a1" in row_values: + assert "Control" in row_values + elif "A2" in row_values or "a2" in row_values: + assert "Treatment" in row_values + def assert_child_annotations(self, oas): for ma, wid, wr, wc in oas: assert 
isinstance(ma, MapAnnotationI) @@ -767,6 +774,14 @@ def assert_columns(self, columns): def assert_row_count(self, rows): assert rows == len(self.roi_names) + def assert_values(self, row_values): + if "roi1" in row_values: + assert 0.5 in row_values + assert 100 in row_values + elif "roi2" in row_values: + assert 'nan' in [str(value) for value in row_values] + assert 200 in row_values + def get_target(self): if not self.image: image = self.test.make_image() @@ -1218,17 +1233,7 @@ def _assert_parsing_context_values(self, t, fixture): row_values = [col.values[0] for col in t.read( list(range(len(cols))), hit, hit+1).columns] assert len(row_values) == fixture.count - # Unsure where the lower-casing is happening - if "A1" in row_values or "a1" in row_values: - assert "Control" in row_values - elif "A2" in row_values or "a2" in row_values: - assert "Treatment" in row_values - elif "roi1" in row_values: - assert 0.5 in row_values - assert 100 in row_values - elif "roi2" in row_values: - assert 'nan' in [str(value) for value in row_values] - assert 200 in row_values + fixture.assert_values(row_values) def _test_bulk_to_map_annotation_context(self, fixture, batch_size): # self._testPopulateMetadataPlate() From fe73a17d2a71fd7d220c48891a2364110b59f4f1 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Thu, 2 Jun 2022 16:18:30 +0100 Subject: [PATCH 50/61] Add GNU-style hyphen-separated version of CLI arguments --- src/omero_metadata/cli.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 54c57422..874d085e 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -241,11 +241,13 @@ def _configure(self, parser): populate.add_argument("--localcfg", help=( "Local configuration file or a JSON object string")) - populate.add_argument("--allow_nan", action="store_true", help=( - "Allow empty values to become Nan in Long or Double columns")) + populate.add_argument( + 
"--allow-nan", "--allow_nan", action="store_true", help=( + "Allow empty values to become Nan in Long or Double columns")) - populate.add_argument("--manual_header", action="store_true", help=( - "Disable automatic header detection during population")) + populate.add_argument( + "--manual-header", "--manual_header", action="store_true", help=( + "Disable automatic header detection during population")) populateroi.add_argument( "--measurement", type=int, default=None, From 89b8084f7c142bf70d2ef9d0ce745067234ede99 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Sun, 5 Jun 2022 14:47:05 +0100 Subject: [PATCH 51/61] Use --allow-nan flag to determine NA behavior in detect_headers --- src/omero_metadata/cli.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/omero_metadata/cli.py b/src/omero_metadata/cli.py index 874d085e..3836051b 100755 --- a/src/omero_metadata/cli.py +++ b/src/omero_metadata/cli.py @@ -579,7 +579,8 @@ def populate(self, args): if not args.manual_header and \ not first_row[0].str.contains('# header').bool(): omero_metadata.populate.log.info("Detecting header types") - header_type = MetadataControl.detect_headers(args.file) + header_type = MetadataControl.detect_headers( + args.file, keep_default_na=args.allow_nan) if args.dry_run: omero_metadata.populate.log.info(f"Header Types:{header_type}") else: From c6573afb734c13cc98d491c33156e1e7e6095348 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?S=C3=A9bastien=20Besson?= Date: Tue, 21 Jun 2022 12:02:20 +0100 Subject: [PATCH 52/61] Update README.rst Co-authored-by: pwalczysko --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index ad15a5df..a4bd3241 100644 --- a/README.rst +++ b/README.rst @@ -70,7 +70,7 @@ object IDs in the ``OMERO.table``. The ``CSV`` file must be provided as local file with ``--file path/to/file.csv``. 
-OMERO.tables have defined column types to specify the data-type such as ``double`` or ``long`` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn`` +OMERO.tables have defined column types to specify the data-type such as ``double`` or ``long`` and special object-types of each column for storing OMERO object IDs such as ``ImageColumn`` or ``WellColumn``. The default behaviour of the script is to automatically detect the column types from an input ``CSV``. This behaviour works as follows: From 27050f2c84350bffcdbdec7610f7d56bf5dd913c Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Tue, 28 Jun 2022 09:23:39 +0100 Subject: [PATCH 53/61] Add note to README about the missing value handling --- README.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.rst b/README.rst index a4bd3241..89b1741c 100644 --- a/README.rst +++ b/README.rst @@ -111,6 +111,9 @@ Example Float ``DoubleColumn`` ``d`` Example boolean ``BoolColumn`` ``b`` =============== ================= ==================== +In the case of missing values, the column will be detected as ``StringColumn`` by default. If ``--allow-nan`` is passed to the +``omero metadata populate`` commands, missing values in floating-point columns will be detected as ``DoubleColumn`` and the +missing values will be stored as NaN. However, it is possible to manually define the header types, ignoring the automatic header detection, if a ``CSV`` with a ``# header`` row is passed. 
The ``# header`` row should be the first row of the CSV and defines columns according to the following list (see examples below): From 19830f33ddac5f6192ed64be80b37c755d95ce44 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Tue, 28 Jun 2022 11:39:36 +0100 Subject: [PATCH 54/61] Rename CHANGES.rst as CHANGELOG.md for unification with other plugins --- CHANGES.rst => CHANGELOG.md | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename CHANGES.rst => CHANGELOG.md (100%) diff --git a/CHANGES.rst b/CHANGELOG.md similarity index 100% rename from CHANGES.rst rename to CHANGELOG.md From e233a2c43d958878596ff1542d37f3271f91b670 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Tue, 28 Jun 2022 11:41:41 +0100 Subject: [PATCH 55/61] Add entry for 0.11.0 --- CHANGELOG.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index df2fc3bd..e8898a27 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,8 @@ -CHANGES -======= +0.11.0 +------ + +* Add support for column type auto-detection using pandas ([#67](https://github.com/ome/omero-metadata/pull/67), [#71](https://github.com/ome/omero-metadata/pull/71), [#72](https://github.com/ome/omero-metadata/pull/72), [#75](https://github.com/ome/omero-metadata/pull/75), [#77](https://github.com/ome/omero-metadata/pull/77)) +* Skip empty rows when reading CSV files ([#70](https://github.com/ome/omero-metadata/pull/70)) 0.10.0 ------ From 7fce3a5dbd5a9eafb6b00a33be0fa9f926831a7f Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Tue, 28 Jun 2022 11:42:28 +0100 Subject: [PATCH 56/61] =?UTF-8?q?Bump=20version:=200.11.0.dev0=20=E2=86=92?= =?UTF-8?q?=200.11.0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 796606da..350df8b5 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5
+1,5 @@ [bumpversion] -current_version = 0.11.0.dev0 +current_version = 0.11.0 commit = True tag = True sign_tags = True diff --git a/setup.py b/setup.py index 0c5b9766..6017ffec 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -version = '0.11.0.dev0' +version = '0.11.0' url = "https://github.com/ome/omero-metadata/" setup( From 27c10d01dfb5319944fc018e1f0a6c56e3a6b421 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Tue, 28 Jun 2022 11:42:40 +0100 Subject: [PATCH 57/61] =?UTF-8?q?Bump=20version:=200.11.0=20=E2=86=92=200.?= =?UTF-8?q?11.1.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 350df8b5..8aa69065 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.11.0 +current_version = 0.11.1.dev0 commit = True tag = True sign_tags = True diff --git a/setup.py b/setup.py index 6017ffec..00a561b2 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -version = '0.11.0' +version = '0.11.1.dev0' url = "https://github.com/ome/omero-metadata/" setup( From 70cce56de13c5850b9aac2b4a8b76c21e13d6615 Mon Sep 17 00:00:00 2001 From: Emil Rozbicki Date: Tue, 28 Jun 2022 21:43:23 +0200 Subject: [PATCH 58/61] info -> debug --- src/omero_metadata/populate.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/omero_metadata/populate.py b/src/omero_metadata/populate.py index f9b7adb2..c210161b 100644 --- a/src/omero_metadata/populate.py +++ b/src/omero_metadata/populate.py @@ -1401,7 +1401,7 @@ def post_process(self): if well_name_column is None and plate_name_column is None \ and image_name_column is None and roi_name_column is None \ and 
roi_column is None: - log.info('Nothing to do during post processing.') + log.debug('Nothing to do during post processing.') return sz = max([len(x.values) for x in self.columns]) From ab9ed9a800058b3a2745f459659964097241364d Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Wed, 29 Jun 2022 09:17:30 +0100 Subject: [PATCH 59/61] Add changelog entry --- CHANGELOG.md | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index e8898a27..b1bca662 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +0.11.1 +------ + +* Reduce logging level of post_process statement ([#78](https://github.com/ome/omero-metadata/pull/78)) + 0.11.0 ------ From caa5c953cd2b2f89f0adaeb77e98ab5f66854ad3 Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Wed, 29 Jun 2022 09:17:37 +0100 Subject: [PATCH 60/61] =?UTF-8?q?Bump=20version:=200.11.1.dev0=20=E2=86=92?= =?UTF-8?q?=200.11.1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 8aa69065..231d226f 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.11.1.dev0 +current_version = 0.11.1 commit = True tag = True sign_tags = True diff --git a/setup.py b/setup.py index 00a561b2..387acacd 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -version = '0.11.1.dev0' +version = '0.11.1' url = "https://github.com/ome/omero-metadata/" setup( From 298f02623cd90268162161a988575db0b8c3d1ab Mon Sep 17 00:00:00 2001 From: Sebastien Besson Date: Wed, 29 Jun 2022 09:17:44 +0100 Subject: [PATCH 61/61] =?UTF-8?q?Bump=20version:=200.11.1=20=E2=86=92=200.?= =?UTF-8?q?11.2.dev0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .bumpversion.cfg | 2 +- 
setup.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 231d226f..16ad673c 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.11.1 +current_version = 0.11.2.dev0 commit = True tag = True sign_tags = True diff --git a/setup.py b/setup.py index 387acacd..239e8d6c 100644 --- a/setup.py +++ b/setup.py @@ -92,7 +92,7 @@ def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() -version = '0.11.1' +version = '0.11.2.dev0' url = "https://github.com/ome/omero-metadata/" setup(