From 52bac3d0943e538736891a13a999eb4e8fc93936 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Wed, 2 Oct 2024 12:03:47 +0000 Subject: [PATCH 1/4] Bump spec_tests/hed-examples from `f117c4d` to `ca307b9` Bumps [spec_tests/hed-examples](https://github.com/hed-standard/hed-examples) from `f117c4d` to `ca307b9`. - [Release notes](https://github.com/hed-standard/hed-examples/releases) - [Commits](https://github.com/hed-standard/hed-examples/compare/f117c4d669ea034053011e6d3d152b0e906c1914...ca307b99fa12e75335bf7a8b1d3313d85ad5d9e5) --- updated-dependencies: - dependency-name: spec_tests/hed-examples dependency-type: direct:production ... Signed-off-by: dependabot[bot] --- spec_tests/hed-examples | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec_tests/hed-examples b/spec_tests/hed-examples index f117c4d6..ca307b99 160000 --- a/spec_tests/hed-examples +++ b/spec_tests/hed-examples @@ -1 +1 @@ -Subproject commit f117c4d669ea034053011e6d3d152b0e906c1914 +Subproject commit ca307b99fa12e75335bf7a8b1d3313d85ad5d9e5 From fd3a57288174d7f5516ff883ae378f231d5bc291 Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Tue, 8 Oct 2024 09:45:16 -0500 Subject: [PATCH 2/4] Changed ONSETS_OUT_OF_ORDER to ONSETS_UNORDERED --- hed/errors/error_messages.py | 6 +- hed/errors/error_reporter.py | 1372 ++++++++--------- hed/errors/error_types.py | 2 +- hed/models/base_input.py | 956 ++++++------ hed/models/column_mapper.py | 843 +++++----- hed/models/df_util.py | 4 +- hed/models/tabular_input.py | 176 +-- hed/validator/spreadsheet_validator.py | 4 +- tests/models/test_base_input.py | 6 +- tests/validator/test_spreadsheet_validator.py | 11 +- 10 files changed, 1690 insertions(+), 1690 deletions(-) diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index 06c9c651..94e2408a 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -77,9 +77,9 @@ def val_error_CURLY_BRACE_UNSUPPORTED_HERE(tag, problem_tag): f"Invalid character '{problem_tag}' in tag '{tag}'") -@hed_error(ValidationErrors.ONSETS_OUT_OF_ORDER, default_severity=ErrorSeverity.WARNING) -def val_error_ONSETS_OUT_OF_ORDER(): - return "Onsets need to be temporally increasing for most downstream tools to work." +@hed_error(ValidationErrors.ONSETS_UNORDERED, default_severity=ErrorSeverity.WARNING) +def val_error_ONSETS_UNORDERED(): + return "Onsets need to be temporally increasing and defined for many downstream tools to work." @hed_error(ValidationErrors.COMMA_MISSING) diff --git a/hed/errors/error_reporter.py b/hed/errors/error_reporter.py index 42b12842..b108c61b 100644 --- a/hed/errors/error_reporter.py +++ b/hed/errors/error_reporter.py @@ -1,686 +1,686 @@ -""" -Support functions for reporting validation errors. - -You can scope the formatted errors with calls to push_error_context and pop_error_context. -""" - -from functools import wraps -import xml.etree.ElementTree as ET - -from hed.errors.error_types import ErrorContext, ErrorSeverity -from hed.errors.known_error_codes import known_error_codes - -error_functions = {} - -# Controls if the default issue printing skips adding indentation for this context. -no_tab_context = {ErrorContext.HED_STRING, ErrorContext.SCHEMA_ATTRIBUTE} - -# Default sort ordering for issues list. 
-default_sort_list = [ - ErrorContext.CUSTOM_TITLE, - ErrorContext.FILE_NAME, - ErrorContext.SIDECAR_COLUMN_NAME, - ErrorContext.SIDECAR_KEY_NAME, - ErrorContext.ROW, - ErrorContext.COLUMN, - ErrorContext.LINE, - ErrorContext.SCHEMA_SECTION, - ErrorContext.SCHEMA_TAG, - ErrorContext.SCHEMA_ATTRIBUTE, -] - -# ErrorContext which is expected to be int based. -int_sort_list = [ - ErrorContext.ROW -] - - -def _register_error_function(error_type, wrapper_func): - if error_type in error_functions: - raise KeyError(f"{error_type} defined more than once.") - - error_functions[error_type] = wrapper_func - - -def hed_error(error_type, default_severity=ErrorSeverity.ERROR, actual_code=None): - """ Decorator for errors in error handler or inherited classes. - - Parameters: - error_type (str): A value from error_types or optionally another value. - default_severity (ErrorSeverity): The default severity for the decorated error. - actual_code (str): The actual error to report to the outside world. - - """ - if actual_code is None: - actual_code = error_type - - def inner_decorator(func): - @wraps(func) - def wrapper(*args, severity=default_severity, **kwargs): - """ Wrapper function for error handling non-tag errors. - - Parameters: - args (args): non keyword args. - severity (ErrorSeverity): Will override the default error value if passed. - kwargs (**kwargs): Any keyword args to be passed down to error message function. - - Returns: - list: A list of dict with the errors. - """ - base_message = func(*args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity) - return error_object - - _register_error_function(error_type, wrapper_func=wrapper) - return wrapper - - return inner_decorator - - -def hed_tag_error(error_type, default_severity=ErrorSeverity.ERROR, has_sub_tag=False, actual_code=None): - """ Decorator for errors in error handler or inherited classes. - - Parameters: - error_type (str): A value from error_types or optionally another value. - default_severity (ErrorSeverity): The default severity for the decorated error. - has_sub_tag (bool): If True, this error message also wants a sub_tag passed down. eg "This" in "This/Is/A/Tag" - actual_code (str): The actual error to report to the outside world. - - """ - if actual_code is None: - actual_code = error_type - - def inner_decorator(func): - if has_sub_tag: - @wraps(func) - def wrapper(tag, index_in_tag, index_in_tag_end, *args, severity=default_severity, **kwargs): - """ Wrapper function for error handling tag errors with sub tags. - - Parameters: - tag (HedTag): The HED tag object with the problem. - index_in_tag (int): The index into the tag with a problem(usually 0). - index_in_tag_end (int): The last index into the tag with a problem - usually len(tag). - args (args): Any other non keyword args. - severity (ErrorSeverity): Used to include warnings as well as errors. - kwargs (**kwargs): Any keyword args to be passed down to error message function. - - Returns: - list: A list of dict with the errors. 
- - """ - try: - tag_as_string = tag.tag - except AttributeError: - tag_as_string = str(tag) - - if index_in_tag_end is None: - index_in_tag_end = len(tag_as_string) - problem_sub_tag = tag_as_string[index_in_tag: index_in_tag_end] - try: - org_tag_text = tag.org_tag - except AttributeError: - org_tag_text = str(tag) - - base_message = func(org_tag_text, problem_sub_tag, *args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, - index_in_tag=index_in_tag, - index_in_tag_end=index_in_tag_end, source_tag=tag) - - return error_object - - _register_error_function(error_type, wrapper_func=wrapper) - return wrapper - else: - @wraps(func) - def wrapper(tag, *args, severity=default_severity, **kwargs): - """ Wrapper function for error handling tag errors. - - Parameters: - tag (HedTag or HedGroup): The HED tag object with the problem. - args (non keyword args): Any other non keyword args. - severity (ErrorSeverity): For including warnings. - kwargs (keyword args): Any keyword args to be passed down to error message function. - - Returns: - list: A list of dict with the errors. - - """ - from hed.models.hed_tag import HedTag - from hed.models.hed_group import HedGroup - if isinstance(tag, HedTag): - org_tag_text = tag.org_tag - elif isinstance(tag, HedGroup): - org_tag_text = tag.get_original_hed_string() - else: - org_tag_text = str(tag) - base_message = func(org_tag_text, *args, **kwargs) - error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, - source_tag=tag) - - return error_object - - _register_error_function(error_type, wrapper_func=wrapper) - return wrapper - - return inner_decorator - - -# Import after hed_error decorators are defined. -from hed.errors import error_messages # noqa:E402 -from hed.errors import schema_error_messages # noqa:E402 - -# Intentional to make sure tools don't think the import is unused -error_messages.mark_as_used = True -schema_error_messages.mark_as_used = True - - -class ErrorHandler: - """Class to hold error context and having general error functions.""" - def __init__(self, check_for_warnings=True): - # The current (ordered) dictionary of contexts. - self.error_context = [] - self._check_for_warnings = check_for_warnings - - def push_error_context(self, context_type, context): - """ Push a new error context to narrow down error scope. - - Parameters: - context_type (ErrorContext): A value from ErrorContext representing the type of scope. - context (str, int, or HedString): The main value for the context_type. - - Notes: - The context depends on the context_type. For ErrorContext.FILE_NAME this would be the actual filename. - - """ - if context is None: - if context_type in int_sort_list: - context = 0 - else: - context = "" - self.error_context.append((context_type, context)) - - def pop_error_context(self): - """ Remove the last scope from the error context. - - Notes: - Modifies the error context of this reporter. - - """ - - self.error_context.pop(-1) - - def reset_error_context(self): - """ Reset all error context information to defaults. - - Notes: - This function is mainly for testing and should not be needed with proper usage. 
- - """ - self.error_context = [] - - def format_error_with_context(self, *args, **kwargs): - error_object = ErrorHandler.format_error(*args, **kwargs) - if self is not None: - actual_error = error_object[0] - # # Filter out warning errors - if not self._check_for_warnings and actual_error['severity'] >= ErrorSeverity.WARNING: - return [] - self._add_context_to_errors(actual_error, self.error_context) - self._update_error_with_char_pos(actual_error) - - return error_object - - @staticmethod - def format_error(error_type, *args, actual_error=None, **kwargs): - """ Format an error based on the parameters, which vary based on what type of error this is. - - Parameters: - error_type (str): The type of error for this. Registered with @hed_error or @hed_tag_error. - args (args): Any remaining non keyword args after those required by the error type. - actual_error (str or None): Code to actually add to report out. - kwargs (kwargs): The other keyword args to pass down to the error handling func. - - Returns: - list: A list containing a single dictionary representing a single error. - - Notes: - The actual error is useful for errors that are shared like invalid character. - - """ - error_func = error_functions.get(error_type) - if not error_func: - error_object = ErrorHandler.val_error_unknown(*args, **kwargs) - error_object['code'] = error_type - else: - error_object = error_func(*args, **kwargs) - - if actual_error: - error_object['code'] = actual_error - - return [error_object] - - def add_context_and_filter(self, issues): - """ Filter out warnings if requested, while adding context to issues. - - issues(list): - list: A list containing a single dictionary representing a single error. - """ - if not self._check_for_warnings: - issues[:] = self.filter_issues_by_severity(issues, ErrorSeverity.ERROR) - - for error_object in issues: - self._add_context_to_errors(error_object, self.error_context) - self._update_error_with_char_pos(error_object) - - @staticmethod - def format_error_from_context(error_type, error_context, *args, actual_error=None, **kwargs): - """ Format an error based on the error type. - - Parameters: - error_type (str): The type of error. Registered with @hed_error or @hed_tag_error. - error_context (list): Contains the error context to use for this error. - args (args): Any remaining non keyword args. - actual_error (str or None): Error code to actually add to report out. - kwargs (kwargs): Keyword parameters to pass down to the error handling func. - - Returns: - list: A list containing a single dictionary. - - Notes: - - Generally the error_context is returned from _add_context_to_errors. - - The actual_error is useful for errors that are shared like invalid character. - - This can't filter out warnings like the other ones. - - """ - error_list = ErrorHandler.format_error(error_type, *args, actual_error=actual_error, **kwargs) - - ErrorHandler._add_context_to_errors(error_list[0], error_context) - ErrorHandler._update_error_with_char_pos(error_list[0]) - return error_list - - @staticmethod - def _add_context_to_errors(error_object, error_context_to_add): - """ Add relevant context such as row number or column name around an error object. - - Parameters: - error_object (dict): Generated error containing at least a code and message entry. - error_context_to_add (list): Source context to use. If none, the error handler context is used. - - Returns: - list: A list of dict with needed context strings added at the beginning of the list. 
- - """ - for (context_type, context) in error_context_to_add: - error_object[context_type] = context - - return error_object - - @staticmethod - def _create_error_object(error_type, base_message, severity, **kwargs): - error_object = {'code': error_type, - 'message': base_message, - 'severity': severity - } - - for key, value in kwargs.items(): - error_object.setdefault(key, value) - - return error_object - - @staticmethod - def _get_tag_span_to_error_object(error_object): - if ErrorContext.HED_STRING not in error_object: - return None, None - - if 'source_tag' in error_object: - source_tag = error_object['source_tag'] - if isinstance(source_tag, int): - return None, None - else: - return None, None - - hed_string = error_object[ErrorContext.HED_STRING] - span = hed_string._get_org_span(source_tag) - return span - - @staticmethod - def _update_error_with_char_pos(error_object): - # This part is optional as you can always generate these as needed. - start, end = ErrorHandler._get_tag_span_to_error_object(error_object) - if start is not None: - # silence warning in pycharm - start = int(start) - source_tag = error_object.get('source_tag', None) - # Todo: Move this functionality somewhere more centralized. - # If the tag has been modified from the original, don't try to use sub indexing. - if source_tag and source_tag._tag: - new_start, new_end = start, end - else: - new_start = start + error_object.get('index_in_tag', 0) - index_in_tag_end = end - if 'index_in_tag_end' in error_object: - index_in_tag_end = start + error_object['index_in_tag_end'] - new_end = index_in_tag_end - error_object['char_index'], error_object['char_index_end'] = new_start, new_end - error_object['message'] += f" Problem spans string indexes: {new_start}, {new_end}" - - @hed_error("Unknown") - def val_error_unknown(*args, **kwargs): - """ Default error handler if no error of this type was registered. - - Parameters: - args (args): List of non-keyword parameters (varies). - kwargs (kwargs): Keyword parameters (varies) - - Returns: - str: The error message. - - """ - return f"Unknown error. Args: {str(args), str(kwargs)}" - - @staticmethod - def filter_issues_by_severity(issues_list, severity): - """ Gather all issues matching or below a given severity. - - Parameters: - issues_list (list): A list of dictionaries containing the full issue list. - severity (int): The level of issues to keep. - - Returns: - list: A list of dictionaries containing the issue list after filtering by severity. - - """ - return [issue for issue in issues_list if issue['severity'] <= severity] - - -def sort_issues(issues, reverse=False): - """Sort a list of issues by the error context values. - - Parameters: - issues (list): A list of dictionaries representing the issues to be sorted. - reverse (bool, optional): If True, sorts the list in descending order. Default is False. - - Returns: - list: The sorted list of issues.""" - def _get_keys(d): - result = [] - for key in default_sort_list: - if key in int_sort_list: - result.append(d.get(key, -1)) - else: - result.append(d.get(key, "")) - return tuple(result) - - issues = sorted(issues, key=_get_keys, reverse=reverse) - - return issues - - -def check_for_any_errors(issues_list): - """ Return True if there are any errors with a severity of warning. 
""" - for issue in issues_list: - if issue['severity'] < ErrorSeverity.WARNING: - return True - - return False - - -def get_printable_issue_string(issues, title=None, severity=None, skip_filename=True, add_link=False): - """ Return a string with issues list flatted into single string, one per line. - - Parameters: - issues (list): Issues to print. - title (str): Optional title that will always show up first if present(even if there are no validation issues). - severity (int): Return only warnings >= severity. - skip_filename (bool): If True, don't add the filename context to the printable string. - add_link (bool): Add a link at the end of message to the appropriate error if True - Returns: - str: A string containing printable version of the issues or ''. - - """ - if severity is not None: - issues = ErrorHandler.filter_issues_by_severity(issues, severity) - - output_dict = _build_error_context_dict(issues, skip_filename) - issue_string = _error_dict_to_string(output_dict, add_link=add_link) - - if title: - issue_string = title + '\n' + issue_string - return issue_string - - -def get_printable_issue_string_html(issues, title=None, severity=None, skip_filename=True): - """ Return a string with issues list as an HTML tree. - - Parameters: - issues (list): Issues to print. - title (str): Optional title that will always show up first if present. - severity (int): Return only warnings >= severity. - skip_filename (bool): If True, don't add the filename context to the printable string. - - Returns: - str: An HTML string containing the issues or ''. - """ - if severity is not None: - issues = ErrorHandler.filter_issues_by_severity(issues, severity) - - output_dict = _build_error_context_dict(issues, skip_filename) - - root_element = _create_error_tree(output_dict) - if title: - title_element = ET.Element("h1") - title_element.text = title - root_element.insert(0, title_element) - return ET.tostring(root_element, encoding='unicode') - - -def create_doc_link(error_code): - """If error code is a known code, return a documentation url for it. - - Parameters: - error_code(str): A HED error code. - - Returns: - url(str or None): The URL if it's a valid code. - """ - if error_code in known_error_codes["hed_validation_errors"] \ - or error_code in known_error_codes["schema_validation_errors"]: - modified_error_code = error_code.replace("_", "-").lower() - return f"https://hed-specification.readthedocs.io/en/latest/Appendix_B.html#{modified_error_code}" - return None - - -def _build_error_context_dict(issues, skip_filename): - """Build the context -> error dictionary for an entire list of issues. - - Returns: - dict: A nested dictionary structure with a "children" key at each level for unrelated children. - """ - output_dict = None - for single_issue in issues: - single_issue_context = _get_context_from_issue(single_issue, skip_filename) - output_dict = _add_single_error_to_dict(single_issue_context, output_dict, single_issue) - - return output_dict - - -def _add_single_error_to_dict(items, root=None, issue_to_add=None): - """ Build a nested dictionary out of the context lists. - - Parameters: - items (list): A list of error contexts - root (dict, optional): An existing nested dictionary structure to update. - issue_to_add (dict, optional): The issue to add at this level of context. - - Returns: - dict: A nested dictionary structure with a "children" key at each level for unrelated children. 
- """ - if root is None: - root = {"children": []} - - current_dict = root - for item in items: - # Navigate to the next level if the item already exists, or create a new level - next_dict = current_dict.get(item, {"children": []}) - current_dict[item] = next_dict - current_dict = next_dict - - if issue_to_add: - current_dict["children"].append(issue_to_add) - - return root - - -def _error_dict_to_string(print_dict, add_link=True, level=0): - output = "" - if print_dict is None: - return output - for context, value in print_dict.items(): - if context == "children": - for child in value: - single_issue_message = child["message"] - issue_string = level * "\t" + _get_error_prefix(child) - issue_string += f"{single_issue_message}\n" - if add_link: - link_url = create_doc_link(child['code']) - if link_url: - single_issue_message += f" See... {link_url}" - output += issue_string - continue - output += _format_single_context_string(context[0], context[1], level) - output += _error_dict_to_string(value, add_link, level + 1) - - return output - - -def _get_context_from_issue(val_issue, skip_filename=True): - """ Extract all the context values from the given issue. - - Parameters: - val_issue (dict): A dictionary a representing a single error. - skip_filename (bool): If True, don't gather the filename context. - - Returns: - list: A list of tuples containing the context_type and context for the given issue. - - """ - single_issue_context = [] - for key, value in val_issue.items(): - if skip_filename and key == ErrorContext.FILE_NAME: - continue - if key == ErrorContext.HED_STRING: - value = value.get_original_hed_string() - if key.startswith("ec_"): - single_issue_context.append((key, str(value))) - - return single_issue_context - - -def _get_error_prefix(single_issue): - """Return the prefix for the error message based on severity and error code. - - Parameters: - single_issue(dict): A single issue object. - - Returns: - error_prefix(str): the prefix to use. - """ - severity = single_issue.get('severity', ErrorSeverity.ERROR) - error_code = single_issue['code'] - - if severity == ErrorSeverity.ERROR: - error_prefix = f"{error_code}: " - else: - error_prefix = f"{error_code}: (Warning) " - return error_prefix - - -def _format_single_context_string(context_type, context, tab_count=0): - """ Return the human-readable form of a single context tuple. - - Parameters: - context_type (str): The context type of this entry. - context (str or HedString): The value of this context. - tab_count (int): Number of tabs to name_prefix each line with. - - Returns: - str: A string containing the context, including tabs. 
-
-    """
-    tab_string = tab_count * '\t'
-    error_types = {
-        ErrorContext.FILE_NAME: f"\nErrors in file '{context}'",
-        ErrorContext.SIDECAR_COLUMN_NAME: f"Column '{context}':",
-        ErrorContext.SIDECAR_KEY_NAME: f"Key: {context}",
-        ErrorContext.ROW: f'Issues in row {context}:',
-        ErrorContext.COLUMN: f'Issues in column {context}:',
-        ErrorContext.CUSTOM_TITLE: context,
-        ErrorContext.LINE: f"Line: {context}",
-        ErrorContext.HED_STRING: f"hed string: {context}",
-        ErrorContext.SCHEMA_SECTION: f"Schema Section: {context}",
-        ErrorContext.SCHEMA_TAG: f"Source tag: {context}",
-        ErrorContext.SCHEMA_ATTRIBUTE: f"Source Attribute: {context}",
-    }
-    context_portion = error_types[context_type]
-    context_string = f"{tab_string}{context_portion}\n"
-    return context_string
-
-
-def _create_error_tree(error_dict, parent_element=None, add_link=True):
-    if parent_element is None:
-        parent_element = ET.Element("ul")
-
-    for context, value in error_dict.items():
-        if context == "children":
-            for child in value:
-                child_li = ET.SubElement(parent_element, "li")
-                error_prefix = _get_error_prefix(child)
-                single_issue_message = child["message"]
-
-                # Create a link for the error prefix if add_link is True.
-                if add_link:
-                    link_url = create_doc_link(child['code'])
-                    if link_url:
-                        a_element = ET.SubElement(child_li, "a", href=link_url)
-                        a_element.text = error_prefix
-                        a_element.tail = " " + single_issue_message
-                    else:
-                        child_li.text = error_prefix + " " + single_issue_message
-                else:
-                    child_li.text = error_prefix + " " + single_issue_message
-            continue
-
-        context_li = ET.SubElement(parent_element, "li")
-        context_li.text = _format_single_context_string(context[0], context[1])
-        context_ul = ET.SubElement(context_li, "ul")
-        _create_error_tree(value, context_ul, add_link)
-
-    return parent_element
-
-
-def replace_tag_references(list_or_dict):
-    """ Utility function to remove any references to tags, strings, etc. from any type of nested list or dict.
-
-    Use this if you want to save out issues to a file.
-
-    If you'd prefer a copy returned, use replace_tag_references(list_or_dict.copy()).
-
-    Parameters:
-        list_or_dict(list or dict): An arbitrarily nested list/dict structure
-    """
-    if isinstance(list_or_dict, dict):
-        for key, value in list_or_dict.items():
-            if isinstance(value, (dict, list)):
-                replace_tag_references(value)
-            elif isinstance(value, (bool, float, int)):
-                list_or_dict[key] = value
-            else:
-                list_or_dict[key] = str(value)
-    elif isinstance(list_or_dict, list):
-        for key, value in enumerate(list_or_dict):
-            if isinstance(value, (dict, list)):
-                replace_tag_references(value)
-            elif isinstance(value, (bool, float, int)):
-                list_or_dict[key] = value
-            else:
-                list_or_dict[key] = str(value)
+"""
+Support functions for reporting validation errors.
+
+You can scope the formatted errors with calls to push_error_context and pop_error_context.
+"""
+
+from functools import wraps
+import xml.etree.ElementTree as ET
+
+from hed.errors.error_types import ErrorContext, ErrorSeverity
+from hed.errors.known_error_codes import known_error_codes
+
+error_functions = {}
+
+# Controls if the default issue printing skips adding indentation for this context.
+no_tab_context = {ErrorContext.HED_STRING, ErrorContext.SCHEMA_ATTRIBUTE}
+
+# Default sort ordering for issues list.
+default_sort_list = [ + ErrorContext.CUSTOM_TITLE, + ErrorContext.FILE_NAME, + ErrorContext.SIDECAR_COLUMN_NAME, + ErrorContext.SIDECAR_KEY_NAME, + ErrorContext.ROW, + ErrorContext.COLUMN, + ErrorContext.LINE, + ErrorContext.SCHEMA_SECTION, + ErrorContext.SCHEMA_TAG, + ErrorContext.SCHEMA_ATTRIBUTE, +] + +# ErrorContext which is expected to be int based. +int_sort_list = [ + ErrorContext.ROW +] + + +def _register_error_function(error_type, wrapper_func): + if error_type in error_functions: + raise KeyError(f"{error_type} defined more than once.") + + error_functions[error_type] = wrapper_func + + +def hed_error(error_type, default_severity=ErrorSeverity.ERROR, actual_code=None): + """ Decorator for errors in error handler or inherited classes. + + Parameters: + error_type (str): A value from error_types or optionally another value. + default_severity (ErrorSeverity): The default severity for the decorated error. + actual_code (str): The actual error to report to the outside world. + + """ + if actual_code is None: + actual_code = error_type + + def inner_decorator(func): + @wraps(func) + def wrapper(*args, severity=default_severity, **kwargs): + """ Wrapper function for error handling non-tag errors. + + Parameters: + args (args): non keyword args. + severity (ErrorSeverity): Will override the default error value if passed. + kwargs (**kwargs): Any keyword args to be passed down to error message function. + + Returns: + list: A list of dict with the errors. + """ + base_message = func(*args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity) + return error_object + + _register_error_function(error_type, wrapper_func=wrapper) + return wrapper + + return inner_decorator + + +def hed_tag_error(error_type, default_severity=ErrorSeverity.ERROR, has_sub_tag=False, actual_code=None): + """ Decorator for errors in error handler or inherited classes. + + Parameters: + error_type (str): A value from error_types or optionally another value. + default_severity (ErrorSeverity): The default severity for the decorated error. + has_sub_tag (bool): If True, this error message also wants a sub_tag passed down. eg "This" in "This/Is/A/Tag" + actual_code (str): The actual error to report to the outside world. + + """ + if actual_code is None: + actual_code = error_type + + def inner_decorator(func): + if has_sub_tag: + @wraps(func) + def wrapper(tag, index_in_tag, index_in_tag_end, *args, severity=default_severity, **kwargs): + """ Wrapper function for error handling tag errors with sub tags. + + Parameters: + tag (HedTag): The HED tag object with the problem. + index_in_tag (int): The index into the tag with a problem(usually 0). + index_in_tag_end (int): The last index into the tag with a problem - usually len(tag). + args (args): Any other non keyword args. + severity (ErrorSeverity): Used to include warnings as well as errors. + kwargs (**kwargs): Any keyword args to be passed down to error message function. + + Returns: + list: A list of dict with the errors. 
+ + """ + try: + tag_as_string = tag.tag + except AttributeError: + tag_as_string = str(tag) + + if index_in_tag_end is None: + index_in_tag_end = len(tag_as_string) + problem_sub_tag = tag_as_string[index_in_tag: index_in_tag_end] + try: + org_tag_text = tag.org_tag + except AttributeError: + org_tag_text = str(tag) + + base_message = func(org_tag_text, problem_sub_tag, *args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, + index_in_tag=index_in_tag, + index_in_tag_end=index_in_tag_end, source_tag=tag) + + return error_object + + _register_error_function(error_type, wrapper_func=wrapper) + return wrapper + else: + @wraps(func) + def wrapper(tag, *args, severity=default_severity, **kwargs): + """ Wrapper function for error handling tag errors. + + Parameters: + tag (HedTag or HedGroup): The HED tag object with the problem. + args (non keyword args): Any other non keyword args. + severity (ErrorSeverity): For including warnings. + kwargs (keyword args): Any keyword args to be passed down to error message function. + + Returns: + list: A list of dict with the errors. + + """ + from hed.models.hed_tag import HedTag + from hed.models.hed_group import HedGroup + if isinstance(tag, HedTag): + org_tag_text = tag.org_tag + elif isinstance(tag, HedGroup): + org_tag_text = tag.get_original_hed_string() + else: + org_tag_text = str(tag) + base_message = func(org_tag_text, *args, **kwargs) + error_object = ErrorHandler._create_error_object(actual_code, base_message, severity, + source_tag=tag) + + return error_object + + _register_error_function(error_type, wrapper_func=wrapper) + return wrapper + + return inner_decorator + + +# Import after hed_error decorators are defined. +from hed.errors import error_messages # noqa:E402 +from hed.errors import schema_error_messages # noqa:E402 + +# Intentional to make sure tools don't think the import is unused +error_messages.mark_as_used = True +schema_error_messages.mark_as_used = True + + +class ErrorHandler: + """Class to hold error context and having general error functions.""" + def __init__(self, check_for_warnings=True): + # The current (ordered) dictionary of contexts. + self.error_context = [] + self._check_for_warnings = check_for_warnings + + def push_error_context(self, context_type, context): + """ Push a new error context to narrow down error scope. + + Parameters: + context_type (ErrorContext): A value from ErrorContext representing the type of scope. + context (str, int, or HedString): The main value for the context_type. + + Notes: + The context depends on the context_type. For ErrorContext.FILE_NAME this would be the actual filename. + + """ + if context is None: + if context_type in int_sort_list: + context = 0 + else: + context = "" + self.error_context.append((context_type, context)) + + def pop_error_context(self): + """ Remove the last scope from the error context. + + Notes: + Modifies the error context of this reporter. + + """ + + self.error_context.pop(-1) + + def reset_error_context(self): + """ Reset all error context information to defaults. + + Notes: + This function is mainly for testing and should not be needed with proper usage. 
+ + """ + self.error_context = [] + + def format_error_with_context(self, *args, **kwargs): + error_object = ErrorHandler.format_error(*args, **kwargs) + if self is not None: + actual_error = error_object[0] + # # Filter out warning errors + if not self._check_for_warnings and actual_error['severity'] >= ErrorSeverity.WARNING: + return [] + self._add_context_to_errors(actual_error, self.error_context) + self._update_error_with_char_pos(actual_error) + + return error_object + + @staticmethod + def format_error(error_type, *args, actual_error=None, **kwargs): + """ Format an error based on the parameters, which vary based on what type of error this is. + + Parameters: + error_type (str): The type of error for this. Registered with @hed_error or @hed_tag_error. + args (args): Any remaining non keyword args after those required by the error type. + actual_error (str or None): Code to actually add to report out. + kwargs (kwargs): The other keyword args to pass down to the error handling func. + + Returns: + list: A list containing a single dictionary representing a single error. + + Notes: + The actual error is useful for errors that are shared like invalid character. + + """ + error_func = error_functions.get(error_type) + if not error_func: + error_object = ErrorHandler.val_error_unknown(*args, **kwargs) + error_object['code'] = error_type + else: + error_object = error_func(*args, **kwargs) + + if actual_error: + error_object['code'] = actual_error + + return [error_object] + + def add_context_and_filter(self, issues): + """ Filter out warnings if requested, while adding context to issues. + + issues(list): + list: A list containing a single dictionary representing a single error. + """ + if not self._check_for_warnings: + issues[:] = self.filter_issues_by_severity(issues, ErrorSeverity.ERROR) + + for error_object in issues: + self._add_context_to_errors(error_object, self.error_context) + self._update_error_with_char_pos(error_object) + + @staticmethod + def format_error_from_context(error_type, error_context, *args, actual_error=None, **kwargs): + """ Format an error based on the error type. + + Parameters: + error_type (str): The type of error. Registered with @hed_error or @hed_tag_error. + error_context (list): Contains the error context to use for this error. + args (args): Any remaining non keyword args. + actual_error (str or None): Error code to actually add to report out. + kwargs (kwargs): Keyword parameters to pass down to the error handling func. + + Returns: + list: A list containing a single dictionary. + + Notes: + - Generally the error_context is returned from _add_context_to_errors. + - The actual_error is useful for errors that are shared like invalid character. + - This can't filter out warnings like the other ones. + + """ + error_list = ErrorHandler.format_error(error_type, *args, actual_error=actual_error, **kwargs) + + ErrorHandler._add_context_to_errors(error_list[0], error_context) + ErrorHandler._update_error_with_char_pos(error_list[0]) + return error_list + + @staticmethod + def _add_context_to_errors(error_object, error_context_to_add): + """ Add relevant context such as row number or column name around an error object. + + Parameters: + error_object (dict): Generated error containing at least a code and message entry. + error_context_to_add (list): Source context to use. If none, the error handler context is used. + + Returns: + list: A list of dict with needed context strings added at the beginning of the list. 
+
+        """
+        for (context_type, context) in error_context_to_add:
+            error_object[context_type] = context
+
+        return error_object
+
+    @staticmethod
+    def _create_error_object(error_type, base_message, severity, **kwargs):
+        error_object = {'code': error_type,
+                        'message': base_message,
+                        'severity': severity
+                        }
+
+        for key, value in kwargs.items():
+            error_object.setdefault(key, value)
+
+        return error_object
+
+    @staticmethod
+    def _get_tag_span_to_error_object(error_object):
+        if ErrorContext.HED_STRING not in error_object:
+            return None, None
+
+        if 'source_tag' in error_object:
+            source_tag = error_object['source_tag']
+            if isinstance(source_tag, int):
+                return None, None
+        else:
+            return None, None
+
+        hed_string = error_object[ErrorContext.HED_STRING]
+        span = hed_string._get_org_span(source_tag)
+        return span
+
+    @staticmethod
+    def _update_error_with_char_pos(error_object):
+        # This part is optional as you can always generate these as needed.
+        start, end = ErrorHandler._get_tag_span_to_error_object(error_object)
+        if start is not None:
+            # silence warning in pycharm
+            start = int(start)
+            source_tag = error_object.get('source_tag', None)
+            # Todo: Move this functionality somewhere more centralized.
+            # If the tag has been modified from the original, don't try to use sub indexing.
+            if source_tag and source_tag._tag:
+                new_start, new_end = start, end
+            else:
+                new_start = start + error_object.get('index_in_tag', 0)
+                index_in_tag_end = end
+                if 'index_in_tag_end' in error_object:
+                    index_in_tag_end = start + error_object['index_in_tag_end']
+                new_end = index_in_tag_end
+            error_object['char_index'], error_object['char_index_end'] = new_start, new_end
+            error_object['message'] += f" Problem spans string indexes: {new_start}, {new_end}"
+
+    @hed_error("Unknown")
+    def val_error_unknown(*args, **kwargs):
+        """ Default error handler if no error of this type was registered.
+
+        Parameters:
+            args (args): List of non-keyword parameters (varies).
+            kwargs (kwargs): Keyword parameters (varies).
+
+        Returns:
+            str: The error message.
+
+        """
+        return f"Unknown error. Args: {str(args), str(kwargs)}"
+
+    @staticmethod
+    def filter_issues_by_severity(issues_list, severity):
+        """ Gather all issues matching or below a given severity.
+
+        Parameters:
+            issues_list (list): A list of dictionaries containing the full issue list.
+            severity (int): The level of issues to keep.
+
+        Returns:
+            list: A list of dictionaries containing the issue list after filtering by severity.
+
+        """
+        return [issue for issue in issues_list if issue['severity'] <= severity]
+
+
+def sort_issues(issues, reverse=False):
+    """Sort a list of issues by the error context values.
+
+    Parameters:
+        issues (list): A list of dictionaries representing the issues to be sorted.
+        reverse (bool, optional): If True, sorts the list in descending order. Default is False.
+
+    Returns:
+        list: The sorted list of issues."""
+    def _get_keys(d):
+        result = []
+        for key in default_sort_list:
+            if key in int_sort_list:
+                result.append(d.get(key, -1))
+            else:
+                result.append(d.get(key, ""))
+        return tuple(result)
+
+    issues = sorted(issues, key=_get_keys, reverse=reverse)
+
+    return issues
+
+
+def check_for_any_errors(issues_list):
+    """ Return True if any issue is more severe than a warning, i.e., is an actual error. """
+    for issue in issues_list:
+        if issue['severity'] < ErrorSeverity.WARNING:
+            return True
+
+    return False
+
+
+def get_printable_issue_string(issues, title=None, severity=None, skip_filename=True, add_link=False):
+    """ Return a string with the issues list flattened into a single string, one issue per line.
+
+    Parameters:
+        issues (list): Issues to print.
+        title (str): Optional title that will always show up first if present (even if there are no validation issues).
+        severity (int): Return only warnings >= severity.
+        skip_filename (bool): If True, don't add the filename context to the printable string.
+        add_link (bool): If True, add a link to the appropriate error documentation at the end of the message.
+    Returns:
+        str: A string containing a printable version of the issues or ''.
+
+    """
+    if severity is not None:
+        issues = ErrorHandler.filter_issues_by_severity(issues, severity)
+
+    output_dict = _build_error_context_dict(issues, skip_filename)
+    issue_string = _error_dict_to_string(output_dict, add_link=add_link)
+
+    if title:
+        issue_string = title + '\n' + issue_string
+    return issue_string
+
+
+def get_printable_issue_string_html(issues, title=None, severity=None, skip_filename=True):
+    """ Return a string with the issues list rendered as an HTML tree.
+
+    Parameters:
+        issues (list): Issues to print.
+        title (str): Optional title that will always show up first if present.
+        severity (int): Return only warnings >= severity.
+        skip_filename (bool): If True, don't add the filename context to the printable string.
+
+    Returns:
+        str: An HTML string containing the issues or ''.
+    """
+    if severity is not None:
+        issues = ErrorHandler.filter_issues_by_severity(issues, severity)
+
+    output_dict = _build_error_context_dict(issues, skip_filename)
+
+    root_element = _create_error_tree(output_dict)
+    if title:
+        title_element = ET.Element("h1")
+        title_element.text = title
+        root_element.insert(0, title_element)
+    return ET.tostring(root_element, encoding='unicode')
+
+
+def create_doc_link(error_code):
+    """If error code is a known code, return a documentation url for it.
+
+    Parameters:
+        error_code(str): A HED error code.
+
+    Returns:
+        url(str or None): The URL if it's a valid code.
+    """
+    if error_code in known_error_codes["hed_validation_errors"] \
+            or error_code in known_error_codes["schema_validation_errors"]:
+        modified_error_code = error_code.replace("_", "-").lower()
+        return f"https://hed-specification.readthedocs.io/en/latest/Appendix_B.html#{modified_error_code}"
+    return None
+
+
+def _build_error_context_dict(issues, skip_filename):
+    """Build the context -> error dictionary for an entire list of issues.
+
+    Returns:
+        dict: A nested dictionary structure with a "children" key at each level for unrelated children.
+    """
+    output_dict = None
+    for single_issue in issues:
+        single_issue_context = _get_context_from_issue(single_issue, skip_filename)
+        output_dict = _add_single_error_to_dict(single_issue_context, output_dict, single_issue)
+
+    return output_dict
+
+
+def _add_single_error_to_dict(items, root=None, issue_to_add=None):
+    """ Build a nested dictionary out of the context lists.
+
+    Parameters:
+        items (list): A list of error contexts.
+        root (dict, optional): An existing nested dictionary structure to update.
+        issue_to_add (dict, optional): The issue to add at this level of context.
+
+    Returns:
+        dict: A nested dictionary structure with a "children" key at each level for unrelated children.
+ """ + if root is None: + root = {"children": []} + + current_dict = root + for item in items: + # Navigate to the next level if the item already exists, or create a new level + next_dict = current_dict.get(item, {"children": []}) + current_dict[item] = next_dict + current_dict = next_dict + + if issue_to_add: + current_dict["children"].append(issue_to_add) + + return root + + +def _error_dict_to_string(print_dict, add_link=True, level=0): + output = "" + if print_dict is None: + return output + for context, value in print_dict.items(): + if context == "children": + for child in value: + single_issue_message = child["message"] + issue_string = level * "\t" + _get_error_prefix(child) + issue_string += f"{single_issue_message}\n" + if add_link: + link_url = create_doc_link(child['code']) + if link_url: + single_issue_message += f" See... {link_url}" + output += issue_string + continue + output += _format_single_context_string(context[0], context[1], level) + output += _error_dict_to_string(value, add_link, level + 1) + + return output + + +def _get_context_from_issue(val_issue, skip_filename=True): + """ Extract all the context values from the given issue. + + Parameters: + val_issue (dict): A dictionary a representing a single error. + skip_filename (bool): If True, don't gather the filename context. + + Returns: + list: A list of tuples containing the context_type and context for the given issue. + + """ + single_issue_context = [] + for key, value in val_issue.items(): + if skip_filename and key == ErrorContext.FILE_NAME: + continue + if key == ErrorContext.HED_STRING: + value = value.get_original_hed_string() + if key.startswith("ec_"): + single_issue_context.append((key, str(value))) + + return single_issue_context + + +def _get_error_prefix(single_issue): + """Return the prefix for the error message based on severity and error code. + + Parameters: + single_issue(dict): A single issue object. + + Returns: + error_prefix(str): the prefix to use. + """ + severity = single_issue.get('severity', ErrorSeverity.ERROR) + error_code = single_issue['code'] + + if severity == ErrorSeverity.ERROR: + error_prefix = f"{error_code}: " + else: + error_prefix = f"{error_code}: (Warning) " + return error_prefix + + +def _format_single_context_string(context_type, context, tab_count=0): + """ Return the human-readable form of a single context tuple. + + Parameters: + context_type (str): The context type of this entry. + context (str or HedString): The value of this context. + tab_count (int): Number of tabs to name_prefix each line with. + + Returns: + str: A string containing the context, including tabs. 
+ + """ + tab_string = tab_count * '\t' + error_types = { + ErrorContext.FILE_NAME: f"\nErrors in file '{context}'", + ErrorContext.SIDECAR_COLUMN_NAME: f"Column '{context}':", + ErrorContext.SIDECAR_KEY_NAME: f"Key: {context}", + ErrorContext.ROW: f'Issues in row {context}:', + ErrorContext.COLUMN: f'Issues in column {context}:', + ErrorContext.CUSTOM_TITLE: context, + ErrorContext.LINE: f"Line: {context}", + ErrorContext.HED_STRING: f"hed string: {context}", + ErrorContext.SCHEMA_SECTION: f"Schema Section: {context}", + ErrorContext.SCHEMA_TAG: f"Source tag: {context}", + ErrorContext.SCHEMA_ATTRIBUTE: f"Source Attribute: {context}", + } + context_portion = error_types[context_type] + context_string = f"{tab_string}{context_portion}\n" + return context_string + + +def _create_error_tree(error_dict, parent_element=None, add_link=True): + if parent_element is None: + parent_element = ET.Element("ul") + + for context, value in error_dict.items(): + if context == "children": + for child in value: + child_li = ET.SubElement(parent_element, "li") + error_prefix = _get_error_prefix(child) + single_issue_message = child["message"] + + # Create a link for the error prefix if add_link is True. + if add_link: + link_url = create_doc_link(child['code']) + if link_url: + a_element = ET.SubElement(child_li, "a", href=link_url) + a_element.text = error_prefix + a_element.tail = " " + single_issue_message + else: + child_li.text = error_prefix + " " + single_issue_message + else: + child_li.text = error_prefix + " " + single_issue_message + continue + + context_li = ET.SubElement(parent_element, "li") + context_li.text = _format_single_context_string(context[0], context[1]) + context_ul = ET.SubElement(context_li, "ul") + _create_error_tree(value, context_ul, add_link) + + return parent_element + + +def replace_tag_references(list_or_dict): + """ Utility function to remove any references to tags, strings, etc. from any type of nested list or dict. + + Use this if you want to save out issues to a file. + + If you'd prefer a copy returned, use replace_tag_references(list_or_dict.copy()). + + Parameters: + list_or_dict(list or dict): An arbitrarily nested list/dict structure + """ + if isinstance(list_or_dict, dict): + for key, value in list_or_dict.items(): + if isinstance(value, (dict, list)): + replace_tag_references(value) + elif isinstance(value, (bool, float, int)): + list_or_dict[key] = value + else: + list_or_dict[key] = str(value) + elif isinstance(list_or_dict, list): + for key, value in enumerate(list_or_dict): + if isinstance(value, (dict, list)): + replace_tag_references(value) + elif isinstance(value, (bool, float, int)): + list_or_dict[key] = value + else: + list_or_dict[key] = str(value) diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index b07e3544..c53e7c6b 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -93,7 +93,7 @@ class ValidationErrors: INVALID_TAG_CHARACTER = 'invalidTagCharacter' CURLY_BRACE_UNSUPPORTED_HERE = "CURLY_BRACE_UNSUPPORTED_HERE" - ONSETS_OUT_OF_ORDER = "ONSETS_OUT_OF_ORDER" + ONSETS_UNORDERED = "ONSETS_UNORDERED" class SidecarErrors: diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 24dc033c..1419d8c5 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -1,477 +1,479 @@ -""" -Superclass representing a basic columnar file. 
-""" -import os - -import openpyxl -import pandas as pd - -from hed.models.column_mapper import ColumnMapper -from hed.errors.exceptions import HedFileError, HedExceptions - -from hed.models.df_util import _handle_curly_braces_refs, filter_series_by_onset - - -class BaseInput: - """ Superclass representing a basic columnar file. """ - - TEXT_EXTENSION = ['.tsv', '.txt'] - EXCEL_EXTENSION = ['.xlsx'] - - def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, name=None, - allow_blank_names=True): - """ Constructor for the BaseInput class. - - Parameters: - file (str or file-like or pd.Dataframe): An xlsx/tsv file to open. - file_type (str or None): ".xlsx" (Excel), ".tsv" or ".txt" (tab-separated text). - Derived from file if file is a filename. Ignored if pandas dataframe. - worksheet_name (str or None): Name of Excel workbook worksheet name to use. - (Not applicable to tsv files.) - has_column_names (bool): True if file has column names. - This value is ignored if you pass in a pandas dataframe. - mapper (ColumnMapper or None): Indicates which columns have HED tags. - See SpreadsheetInput or TabularInput for examples of how to use built-in a ColumnMapper. - name (str or None): Optional field for how this file will report errors. - allow_blank_names(bool): If True, column names can be blank - - :raises HedFileError: - - file is blank. - - An invalid dataframe was passed with size 0. - - An invalid extension was provided. - - A duplicate or empty column name appears. - - Cannot open the indicated file. - - The specified worksheet name does not exist. - - If the sidecar file or tabular file had invalid format and could not be read. - - """ - if mapper is None: - mapper = ColumnMapper() - self._mapper = mapper - self._has_column_names = has_column_names - self._name = name - # This is the loaded workbook if we loaded originally from an Excel file. - self._loaded_workbook = None - self._worksheet_name = worksheet_name - self._dataframe = None - - input_type = file_type - if isinstance(file, str): - if file_type is None: - _, input_type = os.path.splitext(file) - if self.name is None: - self._name = file - - self._open_dataframe_file(file, has_column_names, input_type) - - column_issues = ColumnMapper.check_for_blank_names(self.columns, allow_blank_names=allow_blank_names) - if column_issues: - raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.", - self.name, issues=column_issues) - - self.reset_mapper(mapper) - - def reset_mapper(self, new_mapper): - """ Set mapper to a different view of the file. - - Parameters: - new_mapper (ColumnMapper): A column mapper to be associated with this base input. - """ - self._mapper = new_mapper - if not self._mapper: - self._mapper = ColumnMapper() - - if self._dataframe is not None and self._has_column_names: - columns = self._dataframe.columns - self._mapper.set_column_map(columns) - - @property - def dataframe(self): - """ The underlying dataframe. """ - return self._dataframe - - @property - def dataframe_a(self): - """Return the assembled dataframe Probably a placeholder name. - - Returns: - Dataframe: the assembled dataframe""" - return self.assemble() - - @property - def series_a(self): - """Return the assembled dataframe as a series. - - Returns: - Series: the assembled dataframe with columns merged. 
- """ - return self.combine_dataframe(self.assemble()) - - @property - def series_filtered(self): - """Return the assembled dataframe as a series, with rows that have the same onset combined. - - Returns: - Series or None: the assembled dataframe with columns merged, and the rows filtered together. - """ - if self.onsets is not None: - return filter_series_by_onset(self.series_a, self.onsets) - - @property - def onsets(self): - """Return the onset column if it exists. """ - if "onset" in self.columns: - return self._dataframe["onset"] - - @property - def needs_sorting(self): - """Return True if this both has an onset column, and it needs sorting.""" - onsets = self.onsets - if onsets is not None: - onsets = pd.to_numeric(self.dataframe['onset'], errors='coerce') - return not onsets.is_monotonic_increasing - - @property - def name(self): - """ Name of the data. """ - return self._name - - @property - def has_column_names(self): - """ True if dataframe has column names. """ - return self._has_column_names - - @property - def loaded_workbook(self): - """ The underlying loaded workbooks. """ - return self._loaded_workbook - - @property - def worksheet_name(self): - """ The worksheet name. """ - return self._worksheet_name - - def convert_to_form(self, hed_schema, tag_form): - """ Convert all tags in underlying dataframe to the specified form. - - Parameters: - hed_schema (HedSchema): The schema to use to convert tags. - tag_form(str): HedTag property to convert tags to. - Most cases should use convert_to_short or convert_to_long below. - """ - from hed.models.df_util import convert_to_form - convert_to_form(self._dataframe, hed_schema, tag_form, self._mapper.get_tag_columns()) - - def convert_to_short(self, hed_schema): - """ Convert all tags in underlying dataframe to short form. - - Parameters: - hed_schema (HedSchema): The schema to use to convert tags. - """ - return self.convert_to_form(hed_schema, "short_tag") - - def convert_to_long(self, hed_schema): - """ Convert all tags in underlying dataframe to long form. - - Parameters: - hed_schema (HedSchema or None): The schema to use to convert tags. - """ - return self.convert_to_form(hed_schema, "long_tag") - - def shrink_defs(self, hed_schema): - """ Shrinks any def-expand found in the underlying dataframe. - - Parameters: - hed_schema (HedSchema or None): The schema to use to identify defs. - """ - from df_util import shrink_defs - shrink_defs(self._dataframe, hed_schema=hed_schema, columns=self._mapper.get_tag_columns()) - - def expand_defs(self, hed_schema, def_dict): - """ Shrinks any def-expand found in the underlying dataframe. - - Parameters: - hed_schema (HedSchema or None): The schema to use to identify defs. - def_dict (DefinitionDict): The definitions to expand. - """ - from df_util import expand_defs - expand_defs(self._dataframe, hed_schema=hed_schema, def_dict=def_dict, columns=self._mapper.get_tag_columns()) - - def to_excel(self, file): - """ Output to an Excel file. - - Parameters: - file (str or file-like): Location to save this base input. - - :raises ValueError: - - If empty file object was passed. - - :raises OSError: - - Cannot open the indicated file. 
- """ - if not file: - raise ValueError("Empty file name or object passed in to BaseInput.save.") - - dataframe = self._dataframe - if self._loaded_workbook: - old_worksheet = self.get_worksheet(self._worksheet_name) - # Excel spreadsheets are 1 based, then add another 1 for column names if present - adj_row_for_col_names = 1 - if self._has_column_names: - adj_row_for_col_names += 1 - adj_for_one_based_cols = 1 - for row_number, text_file_row in dataframe.iterrows(): - for column_number, column_text in enumerate(text_file_row): - cell_value = dataframe.iloc[row_number, column_number] - old_worksheet.cell(row_number + adj_row_for_col_names, - column_number + adj_for_one_based_cols).value = cell_value - - self._loaded_workbook.save(file) - else: - dataframe.to_excel(file, header=self._has_column_names) - - def to_csv(self, file=None): - """ Write to file or return as a string. - - Parameters: - file (str, file-like, or None): Location to save this file. If None, return as string. - Returns: - None or str: None if file is given or the contents as a str if file is None. - - :raises OSError: - - Cannot open the indicated file. - """ - dataframe = self._dataframe - csv_string_if_filename_none = dataframe.to_csv(file, sep='\t', index=False, header=self._has_column_names) - return csv_string_if_filename_none - - @property - def columns(self): - """ Returns a list of the column names. - - Empty if no column names. - - Returns: - columns(list): The column names. - """ - columns = [] - if self._dataframe is not None and self._has_column_names: - columns = list(self._dataframe.columns) - return columns - - def column_metadata(self): - """ Return the metadata for each column. - - Returns: - dict: Number/ColumnMeta pairs. - """ - if self._mapper: - return self._mapper._final_column_map - return {} - - def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_tag"): - """ Replace the specified cell with transformed text. - - Parameters: - row_number (int): The row number of the spreadsheet to set. - column_number (int): The column number of the spreadsheet to set. - new_string_obj (HedString): Object with text to put in the given cell. - tag_form (str): Version of the tags (short_tag, long_tag, base_tag, etc) - - Notes: - Any attribute of a HedTag that returns a string is a valid value of tag_form. - - :raises ValueError: - - There is not a loaded dataframe. - - :raises KeyError: - - The indicated row/column does not exist. - - :raises AttributeError: - - The indicated tag_form is not an attribute of HedTag. - """ - if self._dataframe is None: - raise ValueError("No data frame loaded") - - new_text = new_string_obj.get_as_form(tag_form) - self._dataframe.iloc[row_number, column_number] = new_text - - def get_worksheet(self, worksheet_name=None): - """ Get the requested worksheet. - - Parameters: - worksheet_name (str or None): The name of the requested worksheet by name or the first one if None. - - Returns: - openpyxl.workbook.Workbook: The workbook request. - - Notes: - If None, returns the first worksheet. - - :raises KeyError: - - The specified worksheet name does not exist. - """ - if worksheet_name and self._loaded_workbook: - # return self._loaded_workbook.get_sheet_by_name(worksheet_name) - return self._loaded_workbook[worksheet_name] - elif self._loaded_workbook: - return self._loaded_workbook.worksheets[0] - else: - return None - - @staticmethod - def _get_dataframe_from_worksheet(worksheet, has_headers): - """ Create a dataframe from the worksheet. 
- - Parameters: - worksheet (Worksheet): The loaded worksheet to convert. - has_headers (bool): True if this worksheet has column headers. - - Returns: - DataFrame: The converted data frame. - - """ - if has_headers: - data = worksheet.values - # first row is columns - cols = next(data) - data = list(data) - return pd.DataFrame(data, columns=cols, dtype=str) - else: - return pd.DataFrame(worksheet.values, dtype=str) - - def validate(self, hed_schema, extra_def_dicts=None, name=None, error_handler=None): - """Creates a SpreadsheetValidator and returns all issues with this file. - - Parameters: - hed_schema(HedSchema): The schema to use for validation. - extra_def_dicts(list of DefDict or DefDict): All definitions to use for validation. - name(str): The name to report errors from this file as. - error_handler (ErrorHandler): Error context to use. Creates a new one if None. - - Returns: - issues (list of dict): A list of issues for a HED string. - """ - from hed.validator.spreadsheet_validator import SpreadsheetValidator - if not name: - name = self.name - tab_validator = SpreadsheetValidator(hed_schema) - validation_issues = tab_validator.validate(self, self._mapper.get_def_dict(hed_schema, extra_def_dicts), name, - error_handler=error_handler) - return validation_issues - - @staticmethod - def _dataframe_has_names(dataframe): - for column in dataframe.columns: - if isinstance(column, str): - return True - return False - - def assemble(self, mapper=None, skip_curly_braces=False): - """ Assembles the HED strings. - - Parameters: - mapper(ColumnMapper or None): Generally pass none here unless you want special behavior. - skip_curly_braces (bool): If True, don't plug in curly brace values into columns. - Returns: - Dataframe: The assembled dataframe. - """ - if mapper is None: - mapper = self._mapper - - all_columns = self._handle_transforms(mapper) - if skip_curly_braces: - return all_columns - transformers, _ = mapper.get_transformers() - refs = self.get_column_refs() - column_names = list(transformers) - return _handle_curly_braces_refs(all_columns, refs, column_names) - - def _handle_transforms(self, mapper): - transformers, need_categorical = mapper.get_transformers() - if transformers: - all_columns = self._dataframe - if need_categorical: - all_columns[need_categorical] = all_columns[need_categorical].astype('category') - - all_columns = all_columns.transform(transformers) - - if need_categorical: - all_columns[need_categorical] = all_columns[need_categorical].astype('str') - else: - all_columns = self._dataframe - - return all_columns - - @staticmethod - def combine_dataframe(dataframe): - """ Combine all columns in the given dataframe into a single HED string series, - skipping empty columns and columns with empty strings. - - Parameters: - dataframe(Dataframe): The dataframe to combine - - Returns: - Series: The assembled series. - """ - dataframe = dataframe.apply( - lambda x: ', '.join(filter(lambda e: bool(e) and e != "n/a", map(str, x))), - axis=1 - ) - return dataframe - - def get_def_dict(self, hed_schema, extra_def_dicts=None): - """ Return the definition dict for this file. - - Note: Baseclass implementation returns just extra_def_dicts. - - Parameters: - hed_schema(HedSchema): Identifies tags to find definitions(if needed). - extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. - - Returns: - DefinitionDict: A single definition dict representing all the data(and extra def dicts). 
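combine_dataframe above merges each row into one HED string by dropping empty cells and "n/a" before joining. The same lambda, run standalone on a toy frame (pandas only):

import pandas as pd

# Row 0 keeps "Red" and "Square"; row 1 keeps only "Blue".
df = pd.DataFrame({"A": ["Red", "n/a"], "B": ["", "Blue"], "C": ["Square", "n/a"]})
merged = df.apply(
    lambda x: ', '.join(filter(lambda e: bool(e) and e != "n/a", map(str, x))),
    axis=1)
print(list(merged))  # ['Red, Square', 'Blue']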
-        """
-        from hed.models.definition_dict import DefinitionDict
-        return DefinitionDict(extra_def_dicts, hed_schema)
-
-    def get_column_refs(self):
-        """ Return a list of column refs for this file.
-
-        Default implementation returns none.
-
-        Returns:
-            column_refs(list): A list of unique column refs found.
-        """
-        return []
-
-    def _open_dataframe_file(self, file, has_column_names, input_type):
-        pandas_header = 0
-        if not has_column_names:
-            pandas_header = None
-
-        if isinstance(file, pd.DataFrame):
-            self._dataframe = file.astype(str)
-            self._has_column_names = self._dataframe_has_names(self._dataframe)
-        elif not file:
-            raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file)
-        elif input_type in self.TEXT_EXTENSION:
-            try:
-                self._dataframe = pd.read_csv(file, delimiter='\t', header=pandas_header,
-                                              dtype=str, keep_default_na=True, na_values=("", "null"))
-            except Exception as e:
-                raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, str(e), self.name) from e
-            # Convert nan values to a known value
-            self._dataframe = self._dataframe.fillna("n/a")
-        elif input_type in self.EXCEL_EXTENSION:
-            try:
-                self._loaded_workbook = openpyxl.load_workbook(file)
-                loaded_worksheet = self.get_worksheet(self._worksheet_name)
-                self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names)
-            except Exception as e:
-                raise HedFileError(HedExceptions.GENERIC_ERROR, str(e), self.name) from e
-        else:
-            raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file)
-
-        if self._dataframe.size == 0:
-            raise HedFileError(HedExceptions.INVALID_DATAFRAME, "Invalid dataframe(malformed datafile, etc)", file)
+"""
+Superclass representing a basic columnar file.
+"""
+import os
+
+import openpyxl
+import pandas as pd
+
+from hed.models.column_mapper import ColumnMapper
+from hed.errors.exceptions import HedFileError, HedExceptions
+
+from hed.models.df_util import _handle_curly_braces_refs, filter_series_by_onset
+
+
+class BaseInput:
+    """ Superclass representing a basic columnar file. """
+
+    TEXT_EXTENSION = ['.tsv', '.txt']
+    EXCEL_EXTENSION = ['.xlsx']
+
+    def __init__(self, file, file_type=None, worksheet_name=None, has_column_names=True, mapper=None, name=None,
+                 allow_blank_names=True):
+        """ Constructor for the BaseInput class.
+
+        Parameters:
+            file (str or file-like or pd.Dataframe): An xlsx/tsv file to open.
+            file_type (str or None): ".xlsx" (Excel), ".tsv" or ".txt" (tab-separated text).
+                Derived from file if file is a filename. Ignored if pandas dataframe.
+            worksheet_name (str or None): Name of the Excel worksheet to use.
+                (Not applicable to tsv files.)
+            has_column_names (bool): True if file has column names.
+                This value is ignored if you pass in a pandas dataframe.
+            mapper (ColumnMapper or None): Indicates which columns have HED tags.
+                See SpreadsheetInput or TabularInput for examples of how to use a built-in ColumnMapper.
+            name (str or None): Optional field for how this file will report errors.
+            allow_blank_names(bool): If True, column names can be blank.
+
+        :raises HedFileError:
+            - The file is blank.
+            - An invalid dataframe was passed with size 0.
+            - An invalid extension was provided.
+            - A duplicate or empty column name appears.
+            - Cannot open the indicated file.
+            - The specified worksheet name does not exist.
+            - If the sidecar file or tabular file had invalid format and could not be read.
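BaseInput is rarely constructed directly; the subclasses named in the docstring above are the usual entry points. A sketch using an in-memory dataframe (TabularInput appears later in this patch; the column values are made up):

import pandas as pd
from hed.models.tabular_input import TabularInput

events = pd.DataFrame({"onset": ["0.5", "1.0"],
                       "duration": ["n/a", "n/a"],
                       "HED": ["Age/1", "Age/2"]})
tabular = TabularInput(events, name="in_memory_events")
print(tabular.columns)        # ['onset', 'duration', 'HED']
print(tabular.needs_sorting)  # False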
+ + """ + if mapper is None: + mapper = ColumnMapper() + self._mapper = mapper + self._has_column_names = has_column_names + self._name = name + # This is the loaded workbook if we loaded originally from an Excel file. + self._loaded_workbook = None + self._worksheet_name = worksheet_name + self._dataframe = None + + input_type = file_type + if isinstance(file, str): + if file_type is None: + _, input_type = os.path.splitext(file) + if self.name is None: + self._name = file + + self._open_dataframe_file(file, has_column_names, input_type) + + column_issues = ColumnMapper.check_for_blank_names(self.columns, allow_blank_names=allow_blank_names) + if column_issues: + raise HedFileError(HedExceptions.BAD_COLUMN_NAMES, "Duplicate or blank columns found. See issues.", + self.name, issues=column_issues) + + self.reset_mapper(mapper) + + def reset_mapper(self, new_mapper): + """ Set mapper to a different view of the file. + + Parameters: + new_mapper (ColumnMapper): A column mapper to be associated with this base input. + """ + self._mapper = new_mapper + if not self._mapper: + self._mapper = ColumnMapper() + + if self._dataframe is not None and self._has_column_names: + columns = self._dataframe.columns + self._mapper.set_column_map(columns) + + @property + def dataframe(self): + """ The underlying dataframe. """ + return self._dataframe + + @property + def dataframe_a(self): + """Return the assembled dataframe Probably a placeholder name. + + Returns: + Dataframe: the assembled dataframe""" + return self.assemble() + + @property + def series_a(self): + """Return the assembled dataframe as a series. + + Returns: + Series: the assembled dataframe with columns merged. + """ + return self.combine_dataframe(self.assemble()) + + @property + def series_filtered(self): + """Return the assembled dataframe as a series, with rows that have the same onset combined. + + Returns: + Series or None: the assembled dataframe with columns merged, and the rows filtered together. + """ + if self.onsets is not None: + return filter_series_by_onset(self.series_a, self.onsets) + + @property + def onsets(self): + """Return the onset column if it exists. """ + if "onset" in self.columns: + return self._dataframe["onset"] + + @property + def needs_sorting(self): + """Return True if this both has an onset column, and it needs sorting.""" + onsets = self.onsets + if onsets is not None: + onsets = pd.to_numeric(self.dataframe['onset'], errors='coerce') + return not onsets.is_monotonic_increasing + else: + return False + + @property + def name(self): + """ Name of the data. """ + return self._name + + @property + def has_column_names(self): + """ True if dataframe has column names. """ + return self._has_column_names + + @property + def loaded_workbook(self): + """ The underlying loaded workbooks. """ + return self._loaded_workbook + + @property + def worksheet_name(self): + """ The worksheet name. """ + return self._worksheet_name + + def convert_to_form(self, hed_schema, tag_form): + """ Convert all tags in underlying dataframe to the specified form. + + Parameters: + hed_schema (HedSchema): The schema to use to convert tags. + tag_form(str): HedTag property to convert tags to. + Most cases should use convert_to_short or convert_to_long below. + """ + from hed.models.df_util import convert_to_form + convert_to_form(self._dataframe, hed_schema, tag_form, self._mapper.get_tag_columns()) + + def convert_to_short(self, hed_schema): + """ Convert all tags in underlying dataframe to short form. 
+
+        Parameters:
+            hed_schema (HedSchema): The schema to use to convert tags.
+        """
+        return self.convert_to_form(hed_schema, "short_tag")
+
+    def convert_to_long(self, hed_schema):
+        """ Convert all tags in underlying dataframe to long form.
+
+        Parameters:
+            hed_schema (HedSchema or None): The schema to use to convert tags.
+        """
+        return self.convert_to_form(hed_schema, "long_tag")
+
+    def shrink_defs(self, hed_schema):
+        """ Shrinks any def-expand found in the underlying dataframe.
+
+        Parameters:
+            hed_schema (HedSchema or None): The schema to use to identify defs.
+        """
+        from hed.models.df_util import shrink_defs
+        shrink_defs(self._dataframe, hed_schema=hed_schema, columns=self._mapper.get_tag_columns())
+
+    def expand_defs(self, hed_schema, def_dict):
+        """ Expands any defs found in the underlying dataframe.
+
+        Parameters:
+            hed_schema (HedSchema or None): The schema to use to identify defs.
+            def_dict (DefinitionDict): The definitions to expand.
+        """
+        from hed.models.df_util import expand_defs
+        expand_defs(self._dataframe, hed_schema=hed_schema, def_dict=def_dict, columns=self._mapper.get_tag_columns())
+
+    def to_excel(self, file):
+        """ Output to an Excel file.
+
+        Parameters:
+            file (str or file-like): Location to save this base input.
+
+        :raises ValueError:
+            - If empty file object was passed.
+
+        :raises OSError:
+            - Cannot open the indicated file.
+        """
+        if not file:
+            raise ValueError("Empty file name or object passed in to BaseInput.save.")
+
+        dataframe = self._dataframe
+        if self._loaded_workbook:
+            old_worksheet = self.get_worksheet(self._worksheet_name)
+            # Excel spreadsheets are 1 based, then add another 1 for column names if present
+            adj_row_for_col_names = 1
+            if self._has_column_names:
+                adj_row_for_col_names += 1
+            adj_for_one_based_cols = 1
+            for row_number, text_file_row in dataframe.iterrows():
+                for column_number, column_text in enumerate(text_file_row):
+                    cell_value = dataframe.iloc[row_number, column_number]
+                    old_worksheet.cell(row_number + adj_row_for_col_names,
+                                       column_number + adj_for_one_based_cols).value = cell_value
+
+            self._loaded_workbook.save(file)
+        else:
+            dataframe.to_excel(file, header=self._has_column_names)
+
+    def to_csv(self, file=None):
+        """ Write to file or return as a string.
+
+        Parameters:
+            file (str, file-like, or None): Location to save this file. If None, return as string.
+        Returns:
+            None or str: None if file is given or the contents as a str if file is None.
+
+        :raises OSError:
+            - Cannot open the indicated file.
+        """
+        dataframe = self._dataframe
+        csv_string_if_filename_none = dataframe.to_csv(file, sep='\t', index=False, header=self._has_column_names)
+        return csv_string_if_filename_none
+
+    @property
+    def columns(self):
+        """ Returns a list of the column names.
+
+            Empty if no column names.
+
+        Returns:
+            columns(list): The column names.
+        """
+        columns = []
+        if self._dataframe is not None and self._has_column_names:
+            columns = list(self._dataframe.columns)
+        return columns
+
+    def column_metadata(self):
+        """ Return the metadata for each column.
+
+        Returns:
+            dict: Number/ColumnMeta pairs.
+        """
+        if self._mapper:
+            return self._mapper._final_column_map
+        return {}
+
+    def set_cell(self, row_number, column_number, new_string_obj, tag_form="short_tag"):
+        """ Replace the specified cell with transformed text.
+
+        Parameters:
+            row_number (int): The row number of the spreadsheet to set.
+            column_number (int): The column number of the spreadsheet to set.
+            new_string_obj (HedString): Object with text to put in the given cell.
+ tag_form (str): Version of the tags (short_tag, long_tag, base_tag, etc.) + + Notes: + Any attribute of a HedTag that returns a string is a valid value of tag_form. + + :raises ValueError: + - There is not a loaded dataframe. + + :raises KeyError: + - The indicated row/column does not exist. + + :raises AttributeError: + - The indicated tag_form is not an attribute of HedTag. + """ + if self._dataframe is None: + raise ValueError("No data frame loaded") + + new_text = new_string_obj.get_as_form(tag_form) + self._dataframe.iloc[row_number, column_number] = new_text + + def get_worksheet(self, worksheet_name=None): + """ Get the requested worksheet. + + Parameters: + worksheet_name (str or None): The name of the requested worksheet by name or the first one if None. + + Returns: + openpyxl.workbook.Workbook: The workbook request. + + Notes: + If None, returns the first worksheet. + + :raises KeyError: + - The specified worksheet name does not exist. + """ + if worksheet_name and self._loaded_workbook: + # return self._loaded_workbook.get_sheet_by_name(worksheet_name) + return self._loaded_workbook[worksheet_name] + elif self._loaded_workbook: + return self._loaded_workbook.worksheets[0] + else: + return None + + @staticmethod + def _get_dataframe_from_worksheet(worksheet, has_headers): + """ Create a dataframe from the worksheet. + + Parameters: + worksheet (Worksheet): The loaded worksheet to convert. + has_headers (bool): True if this worksheet has column headers. + + Returns: + DataFrame: The converted data frame. + + """ + if has_headers: + data = worksheet.values + # first row is columns + cols = next(data) + data = list(data) + return pd.DataFrame(data, columns=cols, dtype=str) + else: + return pd.DataFrame(worksheet.values, dtype=str) + + def validate(self, hed_schema, extra_def_dicts=None, name=None, error_handler=None): + """Creates a SpreadsheetValidator and returns all issues with this file. + + Parameters: + hed_schema(HedSchema): The schema to use for validation. + extra_def_dicts(list of DefDict or DefDict): All definitions to use for validation. + name(str): The name to report errors from this file as. + error_handler (ErrorHandler): Error context to use. Creates a new one if None. + + Returns: + issues (list of dict): A list of issues for a HED string. + """ + from hed.validator.spreadsheet_validator import SpreadsheetValidator + if not name: + name = self.name + tab_validator = SpreadsheetValidator(hed_schema) + validation_issues = tab_validator.validate(self, self._mapper.get_def_dict(hed_schema, extra_def_dicts), name, + error_handler=error_handler) + return validation_issues + + @staticmethod + def _dataframe_has_names(dataframe): + for column in dataframe.columns: + if isinstance(column, str): + return True + return False + + def assemble(self, mapper=None, skip_curly_braces=False): + """ Assembles the HED strings. + + Parameters: + mapper(ColumnMapper or None): Generally pass none here unless you want special behavior. + skip_curly_braces (bool): If True, don't plug in curly brace values into columns. + Returns: + Dataframe: The assembled dataframe. 
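The validate method above is exercised by the updated tests later in this patch; issues come back as a list of dicts keyed by 'code' plus ErrorContext entries. A sketch of the round trip (assumes the 8.3.0 schema can be loaded; the data values are made up):

import pandas as pd
from hed.schema import load_schema_version
from hed.errors.error_types import ErrorContext
from hed.models.tabular_input import TabularInput

schema = load_schema_version("8.3.0")
tabular = TabularInput(pd.DataFrame({"onset": ["0.5", "1.0"],
                                     "HED": ["Age/1", "Age/2"]}))
issues = tabular.validate(schema)
for issue in issues:
    print(issue.get("code"), issue.get(ErrorContext.ROW))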
+ """ + if mapper is None: + mapper = self._mapper + + all_columns = self._handle_transforms(mapper) + if skip_curly_braces: + return all_columns + transformers, _ = mapper.get_transformers() + refs = self.get_column_refs() + column_names = list(transformers) + return _handle_curly_braces_refs(all_columns, refs, column_names) + + def _handle_transforms(self, mapper): + transformers, need_categorical = mapper.get_transformers() + if transformers: + all_columns = self._dataframe + if need_categorical: + all_columns[need_categorical] = all_columns[need_categorical].astype('category') + + all_columns = all_columns.transform(transformers) + + if need_categorical: + all_columns[need_categorical] = all_columns[need_categorical].astype('str') + else: + all_columns = self._dataframe + + return all_columns + + @staticmethod + def combine_dataframe(dataframe): + """ Combine all columns in the given dataframe into a single HED string series, + skipping empty columns and columns with empty strings. + + Parameters: + dataframe(Dataframe): The dataframe to combine + + Returns: + Series: The assembled series. + """ + dataframe = dataframe.apply( + lambda x: ', '.join(filter(lambda e: bool(e) and e != "n/a", map(str, x))), + axis=1 + ) + return dataframe + + def get_def_dict(self, hed_schema, extra_def_dicts=None): + """ Return the definition dict for this file. + + Note: Baseclass implementation returns just extra_def_dicts. + + Parameters: + hed_schema(HedSchema): Identifies tags to find definitions(if needed). + extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. + + Returns: + DefinitionDict: A single definition dict representing all the data(and extra def dicts). + """ + from hed.models.definition_dict import DefinitionDict + return DefinitionDict(extra_def_dicts, hed_schema) + + def get_column_refs(self): + """ Return a list of column refs for this file. + + Default implementation returns none. + + Returns: + column_refs(list): A list of unique column refs found. 
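series_filtered (defined near the top of this file) feeds series_a through filter_series_by_onset so that rows sharing an onset collapse into one entry. The real helper lives in df_util; a conceptual pandas-only stand-in for the merge idea, not the actual implementation:

import pandas as pd

# Two rows at onset 1.0 collapse into a single assembled string.
assembled = pd.Series(["Red", "Blue", "Green"])
onsets = pd.Series([1.0, 1.0, 2.0])
merged = assembled.groupby(onsets.values).agg(', '.join)
print(list(merged))  # ['Red, Blue', 'Green']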
+ """ + return [] + + def _open_dataframe_file(self, file, has_column_names, input_type): + pandas_header = 0 + if not has_column_names: + pandas_header = None + + if isinstance(file, pd.DataFrame): + self._dataframe = file.astype(str) + self._has_column_names = self._dataframe_has_names(self._dataframe) + elif not file: + raise HedFileError(HedExceptions.FILE_NOT_FOUND, "Empty file passed to BaseInput.", file) + elif input_type in self.TEXT_EXTENSION: + try: + self._dataframe = pd.read_csv(file, delimiter='\t', header=pandas_header, + dtype=str, keep_default_na=True, na_values=("", "null")) + except Exception as e: + raise HedFileError(HedExceptions.INVALID_FILE_FORMAT, str(e), self.name) from e + # Convert nan values to a known value + self._dataframe = self._dataframe.fillna("n/a") + elif input_type in self.EXCEL_EXTENSION: + try: + self._loaded_workbook = openpyxl.load_workbook(file) + loaded_worksheet = self.get_worksheet(self._worksheet_name) + self._dataframe = self._get_dataframe_from_worksheet(loaded_worksheet, has_column_names) + except Exception as e: + raise HedFileError(HedExceptions.GENERIC_ERROR, str(e), self.name) from e + else: + raise HedFileError(HedExceptions.INVALID_EXTENSION, "", file) + + if self._dataframe.size == 0: + raise HedFileError(HedExceptions.INVALID_DATAFRAME, "Invalid dataframe(malformed datafile, etc)", file) diff --git a/hed/models/column_mapper.py b/hed/models/column_mapper.py index d9fc51f8..203ee05f 100644 --- a/hed/models/column_mapper.py +++ b/hed/models/column_mapper.py @@ -1,421 +1,422 @@ -""" -Mapping of a base input file columns into HED tags. -""" -from hed.models.column_metadata import ColumnMetadata, ColumnType -from hed.errors.error_reporter import ErrorHandler -from hed.errors.error_types import ValidationErrors -from hed.models.definition_dict import DefinitionDict - -import copy -from collections import Counter - -PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " - - -class ColumnMapper: - """ Mapping of a base input file columns into HED tags. - - Notes: - - All column numbers are 0 based. - """ - - def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None, - optional_tag_columns=None, warn_on_missing_column=False): - """ Constructor for ColumnMapper. - - Parameters: - sidecar (Sidecar): A sidecar to gather column data from. - tag_columns: (list): A list of ints or strings containing the columns that contain the HED tags. - Sidecar column definitions will take precedent if there is a conflict with tag_columns. - column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag - prefixes to prepend to the tags in that column before processing. - optional_tag_columns (list): A list of ints or strings containing the columns that contain - the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags. - warn_on_missing_column (bool): If True, issue mapping warnings on column names that are missing from - the sidecar. - - Notes: - - All column numbers are 0 based. - - The column_prefix_dictionary may be deprecated/renamed in the future. - - These are no longer prefixes, but rather converted to value columns: - {"key": "Description", 1: "Label/"} will turn into value columns as - {"key": "Description/#", 1: "Label/#"} - It will be a validation issue if column 1 is called "key" in the above example. - This means it no longer accepts anything but the value portion only in the columns. - - """ - - # Maps column number to column_entry. 
This is what's actually used by most code. - self._final_column_map = {} - self._no_mapping_info = True - - self._column_map = {} - self._reverse_column_map = {} - self._warn_on_missing_column = warn_on_missing_column - if tag_columns is None: - tag_columns = [] - self._tag_columns = tag_columns - if optional_tag_columns is None: - optional_tag_columns = [] - self._optional_tag_columns = optional_tag_columns - if column_prefix_dictionary is None: - column_prefix_dictionary = {} - self._column_prefix_dictionary = column_prefix_dictionary - - self._na_patterns = ["n/a", "nan"] - self._sidecar = None - self._set_sidecar(sidecar) - - # finalize the column map based on initial settings with no header - self._finalize_mapping() - - @property - def tag_columns(self): - """ Return the known tag and optional tag columns with numbers as names when possible. - - Returns: - tag_columns(list of str or int): A list of all tag and optional tag columns as labels. - """ - joined_list = self._tag_columns + self._optional_tag_columns - return list(set(self._convert_to_names(self._column_map, joined_list))) - - @property - def column_prefix_dictionary(self): - """ Return the column_prefix_dictionary with numbers turned into names where possible. - - Returns: - column_prefix_dictionary(list of str or int): A column_prefix_dictionary with column labels as keys. - """ - return self._convert_to_names_dict(self._column_map, self._column_prefix_dictionary) - - def get_transformers(self): - """ Return the transformers to use on a dataframe. - - Returns: - tuple(dict, list): - dict({str or int: func}): The functions to use to transform each column. - need_categorical(list of int): A list of columns to treat as categorical. - """ - final_transformers = {} - need_categorical = [] - for column in self._final_column_map.values(): - assign_to_column = column.column_name - if isinstance(assign_to_column, int): - if self._column_map: - assign_to_column = self._column_map[assign_to_column] - else: - assign_to_column = assign_to_column - if column.column_type == ColumnType.Ignore: - continue - elif column.column_type == ColumnType.Value: - value_str = column.hed_dict - from functools import partial - final_transformers[assign_to_column] = partial(self._value_handler, value_str) - elif column.column_type == ColumnType.Categorical: - need_categorical.append(column.column_name) - category_values = column.hed_dict - from functools import partial - final_transformers[assign_to_column] = partial(self._category_handler, category_values) - else: - final_transformers[assign_to_column] = lambda x: x - - return final_transformers, need_categorical - - @staticmethod - def check_for_blank_names(column_map, allow_blank_names): - """ Validate there are no blank column names. - - Parameters: - column_map(iterable): A list of column names. - allow_blank_names(bool): Only find issues if True. - - Returns: - issues(list): A list of dicts, one per issue. - """ - # We don't have any checks right now if blank/duplicate is allowed - if allow_blank_names: - return [] - - issues = [] - - for column_number, name in enumerate(column_map): - if name is None or not name or name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): - issues += ErrorHandler.format_error(ValidationErrors.HED_BLANK_COLUMN, column_number) - continue - - return issues - - def _set_sidecar(self, sidecar): - """ Set the sidecar this column mapper uses. - - Parameters: - sidecar (Sidecar or None): The sidecar to use. - - :raises ValueError: - - A sidecar was previously set. 
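check_for_blank_names above special-cases the "Unnamed: " prefix because that is what pandas invents for blank header cells, so such a column signals a missing name rather than a real header. A standalone illustration (pandas only):

import io
import pandas as pd

# A trailing tab in the header row yields a blank column name,
# which pandas reports as 'Unnamed: 1'.
tsv = "onset\t\n1.0\tx\n"
df = pd.read_csv(io.StringIO(tsv), delimiter="\t")
print(list(df.columns))  # ['onset', 'Unnamed: 1']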
- """ - if self._sidecar: - raise ValueError("Trying to set a second sidecar on a column mapper.") - if not sidecar: - return None - - self._sidecar = sidecar - - @property - def sidecar_column_data(self): - """ Pass through to get the sidecar ColumnMetadata. - - Returns: - dict({str:ColumnMetadata}): The column metadata defined by this sidecar. - """ - if self._sidecar: - return self._sidecar.column_data - - return {} - - def get_tag_columns(self): - """ Return the column numbers or names that are mapped to be HedTags. - - Note: This is NOT the tag_columns or optional_tag_columns parameter, though they set it. - - Returns: - column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags. - 0-based if integer-based, otherwise column name. - """ - return [column_entry.column_name for number, column_entry in self._final_column_map.items() - if column_entry.column_type == ColumnType.HEDTags] - - def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_mapping=True): - """ Set tag columns and optional tag columns. - - Parameters: - tag_columns (list): A list of ints or strings containing the columns that contain the HED tags. - If None, clears existing tag_columns - optional_tag_columns (list): A list of ints or strings containing the columns that contain the HED tags, - but not an error if missing. - If None, clears existing tag_columns - finalize_mapping (bool): Re-generate the internal mapping if True, otherwise no effect until finalize. - """ - if tag_columns is None: - tag_columns = [] - if optional_tag_columns is None: - optional_tag_columns = [] - self._tag_columns = tag_columns - self._optional_tag_columns = optional_tag_columns - if finalize_mapping: - self._finalize_mapping() - - def set_column_map(self, new_column_map=None): - """ Set the column number to name mapping. - - Parameters: - new_column_map (list or dict): Either an ordered list of the column names or column_number:column name. - dictionary. In both cases, column numbers start at 0. - - Returns: - list: List of issues. Each issue is a dictionary. - - """ - if new_column_map is None: - new_column_map = {} - if isinstance(new_column_map, dict): - column_map = new_column_map - # List like - else: - column_map = {column_number: column_name for column_number, column_name in enumerate(new_column_map)} - self._column_map = column_map - self._reverse_column_map = {column_name: column_number for column_number, column_name in column_map.items()} - self._finalize_mapping() - - def set_column_prefix_dictionary(self, column_prefix_dictionary, finalize_mapping=True): - """Set the column prefix dictionary. 
""" - self._column_prefix_dictionary = column_prefix_dictionary - if finalize_mapping: - self._finalize_mapping() - - @staticmethod - def _get_sidecar_basic_map(column_map, column_data): - basic_final_map = {} - unhandled_cols = [] - if column_map: - for column_number, column_name in column_map.items(): - if column_name is None: - continue - if column_name in column_data: - column_entry = copy.deepcopy(column_data[column_name]) - column_entry.column_name = column_name - basic_final_map[column_name] = column_entry - continue - elif isinstance(column_name, str) and column_name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): - continue - unhandled_cols.append(column_name) - - return basic_final_map, unhandled_cols - - @staticmethod - def _convert_to_names(column_to_name_map, column_list): - converted_names = [] - for index in column_list: - if isinstance(index, int): - if not column_to_name_map: - converted_names.append(index) - elif index in column_to_name_map: - converted_names.append(column_to_name_map[index]) - else: - if index in column_to_name_map.values(): - converted_names.append(index) - return converted_names - - @staticmethod - def _convert_to_names_dict(column_to_name_map, column_dict): - converted_dict = {} - for index, column_data in column_dict.items(): - if isinstance(index, int): - if not column_to_name_map: - converted_dict[index] = column_data - elif index in column_to_name_map: - converted_dict[column_to_name_map[index]] = column_data - else: - if index in column_to_name_map.values(): - converted_dict[index] = column_data - return converted_dict - - @staticmethod - def _add_value_columns(final_map, column_prefix_dictionary): - for col, prefix in column_prefix_dictionary.items(): - if prefix.endswith("/"): - prefix = prefix + "#" - else: - prefix = prefix + "/#" - new_def = ColumnMetadata(ColumnType.Value, col, source=prefix) - final_map[col] = new_def - - @staticmethod - def _add_tag_columns(final_map, tag_columns): - for col in tag_columns: - new_def = ColumnMetadata(ColumnType.HEDTags, col) - final_map[col] = new_def - - def _get_column_lists(self): - column_lists = self._tag_columns, self._optional_tag_columns, self._column_prefix_dictionary - list_names = ["tag_columns", "optional_tag_columns", "column_prefix_dictionary"] - - if not any(column for column in column_lists): - return column_lists, list_names - # Filter out empty lists from the above - column_lists, list_names = zip(*[(col_list, list_name) for col_list, list_name in zip(column_lists, list_names) - if col_list]) - - return column_lists, list_names - - def _check_for_duplicates_and_required(self, list_names, column_lists): - issues = [] - for list_name, col_list in zip(list_names, column_lists): - # Convert all known strings to ints, then check for duplicates - converted_list = [item if isinstance(item, int) else self._reverse_column_map.get(item, item) - for item in col_list] - - if col_list != self._optional_tag_columns: - for test_col in converted_list: - if isinstance(test_col, str) and test_col not in self._reverse_column_map: - issues += ErrorHandler.format_error(ValidationErrors.HED_MISSING_REQUIRED_COLUMN, - test_col, list_name) - - issues += self._check_for_duplicates_between_lists(converted_list, list_name, - ValidationErrors.DUPLICATE_COLUMN_IN_LIST) - - return issues - - def _check_for_duplicates_between_lists(self, checking_list, list_names, error_type): - issues = [] - duplicates = [item for item, count in Counter(checking_list).items() if count > 1] - for duplicate in duplicates: - issues += 
ErrorHandler.format_error(error_type, duplicate, - self._column_map.get(duplicate), list_names) - return issues - - def check_for_mapping_issues(self, allow_blank_names=False): - """ Find all issues given the current column_map, tag_columns, etc. - - Parameters: - allow_blank_names(bool): Only flag blank names if False. - - Returns: - issue_list(list of dict): All issues found as a list of dicts. - """ - # 1. Get the lists with entries - column_lists, list_names = self._get_column_lists() - # 2. Verify column_prefix columns and tag columns are present, and check for duplicates - issues = self._check_for_duplicates_and_required(list_names, column_lists) - - combined_list = self.tag_columns + list(self.column_prefix_dictionary) - # 3. Verify prefix and tag columns do not conflict. - issues += self._check_for_duplicates_between_lists(combined_list, list_names, - ValidationErrors.DUPLICATE_COLUMN_BETWEEN_SOURCES) - - # 4. Verify we didn't get both a sidecar and a tag column list - if self._sidecar and combined_list and combined_list != ["HED"]: - issues += ErrorHandler.format_error(ValidationErrors.SIDECAR_AND_OTHER_COLUMNS, column_names=combined_list) - - # 5. Verify we handled all columns - if self._warn_on_missing_column: - fully_combined_list = list(self.sidecar_column_data) + combined_list - for column in self._column_map.values(): - if column not in fully_combined_list: - issues += ErrorHandler.format_error(ValidationErrors.HED_UNKNOWN_COLUMN, column) - - issues += self.check_for_blank_names(self._column_map.values(), allow_blank_names=allow_blank_names) - return issues - - def _finalize_mapping(self): - final_map, unhandled_cols = self._get_sidecar_basic_map(self._column_map, self.sidecar_column_data) - - self._add_tag_columns(final_map, self.tag_columns) - self._remove_from_list(unhandled_cols, self.tag_columns) - - self._add_value_columns(final_map, self.column_prefix_dictionary) - self._remove_from_list(unhandled_cols, self.column_prefix_dictionary) - - self._final_column_map = dict(sorted(final_map.items())) - - @staticmethod - def _remove_from_list(list_to_alter, to_remove): - return [item for item in list_to_alter if item not in to_remove] - - def get_def_dict(self, hed_schema, extra_def_dicts=None): - """ Return def dicts from every column description. - - Parameters: - hed_schema (Schema): A HED schema object to use for extracting definitions. - extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. - - Returns: - DefinitionDict: A single definition dict representing all the data(and extra def dicts). - """ - if self._sidecar: - return self._sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts) - - return DefinitionDict(extra_def_dicts, hed_schema=hed_schema) - - def get_column_mapping_issues(self): - """ Get all the issues with finalizing column mapping(duplicate columns, missing required, etc.). - - Notes: - - This is deprecated and now a wrapper for "check_for_mapping_issues()". - - Returns: - list: A list dictionaries of all issues found from mapping column names to numbers. - - """ - return self.check_for_mapping_issues() - - @staticmethod - def _category_handler(category_values, x): - return category_values.get(x, "") - - @staticmethod - def _value_handler(value_str, x): - if x == "n/a": - return "n/a" - - return value_str.replace("#", str(x)) +""" +Mapping of a base input file columns into HED tags. 
+""" +from hed.models.column_metadata import ColumnMetadata, ColumnType +from hed.errors.error_reporter import ErrorHandler +from hed.errors.error_types import ValidationErrors +from hed.models.definition_dict import DefinitionDict + +import copy +from collections import Counter + +PANDAS_COLUMN_PREFIX_TO_IGNORE = "Unnamed: " +NO_WARN_COLUMNS = ['onset', 'duration'] + + +class ColumnMapper: + """ Mapping of a base input file columns into HED tags. + + Notes: + - All column numbers are 0 based. + """ + + def __init__(self, sidecar=None, tag_columns=None, column_prefix_dictionary=None, + optional_tag_columns=None, warn_on_missing_column=False): + """ Constructor for ColumnMapper. + + Parameters: + sidecar (Sidecar): A sidecar to gather column data from. + tag_columns: (list): A list of ints or strings containing the columns that contain the HED tags. + Sidecar column definitions will take precedent if there is a conflict with tag_columns. + column_prefix_dictionary (dict): Dictionary with keys that are column numbers/names and values are HED tag + prefixes to prepend to the tags in that column before processing. + optional_tag_columns (list): A list of ints or strings containing the columns that contain + the HED tags. If the column is otherwise unspecified, convert this column type to HEDTags. + warn_on_missing_column (bool): If True, issue mapping warnings on column names that are missing from + the sidecar. + + Notes: + - All column numbers are 0 based. + - The column_prefix_dictionary may be deprecated/renamed in the future. + - These are no longer prefixes, but rather converted to value columns: + {"key": "Description", 1: "Label/"} will turn into value columns as + {"key": "Description/#", 1: "Label/#"} + It will be a validation issue if column 1 is called "key" in the above example. + This means it no longer accepts anything but the value portion only in the columns. + + """ + + # Maps column number to column_entry. This is what's actually used by most code. + self._final_column_map = {} + self._no_mapping_info = True + + self._column_map = {} + self._reverse_column_map = {} + self._warn_on_missing_column = warn_on_missing_column + if tag_columns is None: + tag_columns = [] + self._tag_columns = tag_columns + if optional_tag_columns is None: + optional_tag_columns = [] + self._optional_tag_columns = optional_tag_columns + if column_prefix_dictionary is None: + column_prefix_dictionary = {} + self._column_prefix_dictionary = column_prefix_dictionary + + self._na_patterns = ["n/a", "nan"] + self._sidecar = None + self._set_sidecar(sidecar) + + # finalize the column map based on initial settings with no header + self._finalize_mapping() + + @property + def tag_columns(self): + """ Return the known tag and optional tag columns with numbers as names when possible. + + Returns: + tag_columns(list of str or int): A list of all tag and optional tag columns as labels. + """ + joined_list = self._tag_columns + self._optional_tag_columns + return list(set(self._convert_to_names(self._column_map, joined_list))) + + @property + def column_prefix_dictionary(self): + """ Return the column_prefix_dictionary with numbers turned into names where possible. + + Returns: + column_prefix_dictionary(list of str or int): A column_prefix_dictionary with column labels as keys. + """ + return self._convert_to_names_dict(self._column_map, self._column_prefix_dictionary) + + def get_transformers(self): + """ Return the transformers to use on a dataframe. 
+ + Returns: + tuple(dict, list): + dict({str or int: func}): The functions to use to transform each column. + need_categorical(list of int): A list of columns to treat as categorical. + """ + final_transformers = {} + need_categorical = [] + for column in self._final_column_map.values(): + assign_to_column = column.column_name + if isinstance(assign_to_column, int): + if self._column_map: + assign_to_column = self._column_map[assign_to_column] + else: + assign_to_column = assign_to_column + if column.column_type == ColumnType.Ignore: + continue + elif column.column_type == ColumnType.Value: + value_str = column.hed_dict + from functools import partial + final_transformers[assign_to_column] = partial(self._value_handler, value_str) + elif column.column_type == ColumnType.Categorical: + need_categorical.append(column.column_name) + category_values = column.hed_dict + from functools import partial + final_transformers[assign_to_column] = partial(self._category_handler, category_values) + else: + final_transformers[assign_to_column] = lambda x: x + + return final_transformers, need_categorical + + @staticmethod + def check_for_blank_names(column_map, allow_blank_names): + """ Validate there are no blank column names. + + Parameters: + column_map(iterable): A list of column names. + allow_blank_names(bool): Only find issues if True. + + Returns: + issues(list): A list of dicts, one per issue. + """ + # We don't have any checks right now if blank/duplicate is allowed + if allow_blank_names: + return [] + + issues = [] + + for column_number, name in enumerate(column_map): + if name is None or not name or name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): + issues += ErrorHandler.format_error(ValidationErrors.HED_BLANK_COLUMN, column_number) + continue + + return issues + + def _set_sidecar(self, sidecar): + """ Set the sidecar this column mapper uses. + + Parameters: + sidecar (Sidecar or None): The sidecar to use. + + :raises ValueError: + - A sidecar was previously set. + """ + if self._sidecar: + raise ValueError("Trying to set a second sidecar on a column mapper.") + if not sidecar: + return None + + self._sidecar = sidecar + + @property + def sidecar_column_data(self): + """ Pass through to get the sidecar ColumnMetadata. + + Returns: + dict({str:ColumnMetadata}): The column metadata defined by this sidecar. + """ + if self._sidecar: + return self._sidecar.column_data + + return {} + + def get_tag_columns(self): + """ Return the column numbers or names that are mapped to be HedTags. + + Note: This is NOT the tag_columns or optional_tag_columns parameter, though they set it. + + Returns: + column_identifiers(list): A list of column numbers or names that are ColumnType.HedTags. + 0-based if integer-based, otherwise column name. + """ + return [column_entry.column_name for number, column_entry in self._final_column_map.items() + if column_entry.column_type == ColumnType.HEDTags] + + def set_tag_columns(self, tag_columns=None, optional_tag_columns=None, finalize_mapping=True): + """ Set tag columns and optional tag columns. + + Parameters: + tag_columns (list): A list of ints or strings containing the columns that contain the HED tags. + If None, clears existing tag_columns + optional_tag_columns (list): A list of ints or strings containing the columns that contain the HED tags, + but not an error if missing. + If None, clears existing tag_columns + finalize_mapping (bool): Re-generate the internal mapping if True, otherwise no effect until finalize. 
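get_transformers above (together with the _value_handler and _category_handler helpers at the bottom of this class) reduces to two simple string operations. A sketch of just those two transforms, with made-up sidecar values:

# A value column substitutes the cell into the '#' placeholder;
# a categorical column looks the cell up in its sidecar mapping.
value_template = "Label/#"
print(value_template.replace("#", str(3)))    # Label/3

category_values = {"go": "Action/Move", "stop": "Action/Stop"}
print(category_values.get("go", ""))          # Action/Move
print(category_values.get("walk", ""))        # '' for unmapped values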
+ """ + if tag_columns is None: + tag_columns = [] + if optional_tag_columns is None: + optional_tag_columns = [] + self._tag_columns = tag_columns + self._optional_tag_columns = optional_tag_columns + if finalize_mapping: + self._finalize_mapping() + + def set_column_map(self, new_column_map=None): + """ Set the column number to name mapping. + + Parameters: + new_column_map (list or dict): Either an ordered list of the column names or column_number:column name. + dictionary. In both cases, column numbers start at 0. + + Returns: + list: List of issues. Each issue is a dictionary. + + """ + if new_column_map is None: + new_column_map = {} + if isinstance(new_column_map, dict): + column_map = new_column_map + # List like + else: + column_map = {column_number: column_name for column_number, column_name in enumerate(new_column_map)} + self._column_map = column_map + self._reverse_column_map = {column_name: column_number for column_number, column_name in column_map.items()} + self._finalize_mapping() + + def set_column_prefix_dictionary(self, column_prefix_dictionary, finalize_mapping=True): + """Set the column prefix dictionary. """ + self._column_prefix_dictionary = column_prefix_dictionary + if finalize_mapping: + self._finalize_mapping() + + @staticmethod + def _get_sidecar_basic_map(column_map, column_data): + basic_final_map = {} + unhandled_cols = [] + if column_map: + for column_number, column_name in column_map.items(): + if column_name is None: + continue + if column_name in column_data: + column_entry = copy.deepcopy(column_data[column_name]) + column_entry.column_name = column_name + basic_final_map[column_name] = column_entry + continue + elif isinstance(column_name, str) and column_name.startswith(PANDAS_COLUMN_PREFIX_TO_IGNORE): + continue + unhandled_cols.append(column_name) + + return basic_final_map, unhandled_cols + + @staticmethod + def _convert_to_names(column_to_name_map, column_list): + converted_names = [] + for index in column_list: + if isinstance(index, int): + if not column_to_name_map: + converted_names.append(index) + elif index in column_to_name_map: + converted_names.append(column_to_name_map[index]) + else: + if index in column_to_name_map.values(): + converted_names.append(index) + return converted_names + + @staticmethod + def _convert_to_names_dict(column_to_name_map, column_dict): + converted_dict = {} + for index, column_data in column_dict.items(): + if isinstance(index, int): + if not column_to_name_map: + converted_dict[index] = column_data + elif index in column_to_name_map: + converted_dict[column_to_name_map[index]] = column_data + else: + if index in column_to_name_map.values(): + converted_dict[index] = column_data + return converted_dict + + @staticmethod + def _add_value_columns(final_map, column_prefix_dictionary): + for col, prefix in column_prefix_dictionary.items(): + if prefix.endswith("/"): + prefix = prefix + "#" + else: + prefix = prefix + "/#" + new_def = ColumnMetadata(ColumnType.Value, col, source=prefix) + final_map[col] = new_def + + @staticmethod + def _add_tag_columns(final_map, tag_columns): + for col in tag_columns: + new_def = ColumnMetadata(ColumnType.HEDTags, col) + final_map[col] = new_def + + def _get_column_lists(self): + column_lists = self._tag_columns, self._optional_tag_columns, self._column_prefix_dictionary + list_names = ["tag_columns", "optional_tag_columns", "column_prefix_dictionary"] + + if not any(column for column in column_lists): + return column_lists, list_names + # Filter out empty lists from the above + 
column_lists, list_names = zip(*[(col_list, list_name) for col_list, list_name in zip(column_lists, list_names) + if col_list]) + + return column_lists, list_names + + def _check_for_duplicates_and_required(self, list_names, column_lists): + issues = [] + for list_name, col_list in zip(list_names, column_lists): + # Convert all known strings to ints, then check for duplicates + converted_list = [item if isinstance(item, int) else self._reverse_column_map.get(item, item) + for item in col_list] + + if col_list != self._optional_tag_columns: + for test_col in converted_list: + if isinstance(test_col, str) and test_col not in self._reverse_column_map: + issues += ErrorHandler.format_error(ValidationErrors.HED_MISSING_REQUIRED_COLUMN, + test_col, list_name) + + issues += self._check_for_duplicates_between_lists(converted_list, list_name, + ValidationErrors.DUPLICATE_COLUMN_IN_LIST) + + return issues + + def _check_for_duplicates_between_lists(self, checking_list, list_names, error_type): + issues = [] + duplicates = [item for item, count in Counter(checking_list).items() if count > 1] + for duplicate in duplicates: + issues += ErrorHandler.format_error(error_type, duplicate, + self._column_map.get(duplicate), list_names) + return issues + + def check_for_mapping_issues(self, allow_blank_names=False): + """ Find all issues given the current column_map, tag_columns, etc. + + Parameters: + allow_blank_names(bool): Only flag blank names if False. + + Returns: + issue_list(list of dict): All issues found as a list of dicts. + """ + # 1. Get the lists with entries + column_lists, list_names = self._get_column_lists() + # 2. Verify column_prefix columns and tag columns are present, and check for duplicates + issues = self._check_for_duplicates_and_required(list_names, column_lists) + + combined_list = self.tag_columns + list(self.column_prefix_dictionary) + # 3. Verify prefix and tag columns do not conflict. + issues += self._check_for_duplicates_between_lists(combined_list, list_names, + ValidationErrors.DUPLICATE_COLUMN_BETWEEN_SOURCES) + + # 4. Verify we didn't get both a sidecar and a tag column list + if self._sidecar and combined_list and combined_list != ["HED"]: + issues += ErrorHandler.format_error(ValidationErrors.SIDECAR_AND_OTHER_COLUMNS, column_names=combined_list) + + # 5. Verify we handled all columns + if self._warn_on_missing_column: + fully_combined_list = list(self.sidecar_column_data) + combined_list + NO_WARN_COLUMNS + for column in self._column_map.values(): + if column not in fully_combined_list: + issues += ErrorHandler.format_error(ValidationErrors.HED_UNKNOWN_COLUMN, column) + + issues += self.check_for_blank_names(self._column_map.values(), allow_blank_names=allow_blank_names) + return issues + + def _finalize_mapping(self): + final_map, unhandled_cols = self._get_sidecar_basic_map(self._column_map, self.sidecar_column_data) + + self._add_tag_columns(final_map, self.tag_columns) + self._remove_from_list(unhandled_cols, self.tag_columns) + + self._add_value_columns(final_map, self.column_prefix_dictionary) + self._remove_from_list(unhandled_cols, self.column_prefix_dictionary) + + self._final_column_map = dict(sorted(final_map.items())) + + @staticmethod + def _remove_from_list(list_to_alter, to_remove): + return [item for item in list_to_alter if item not in to_remove] + + def get_def_dict(self, hed_schema, extra_def_dicts=None): + """ Return def dicts from every column description. 
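The NO_WARN_COLUMNS addition above means check_for_mapping_issues no longer emits HED_UNKNOWN_COLUMN for the BIDS 'onset' and 'duration' columns, matching the test changes later in this patch. The membership check in isolation (made-up column names):

NO_WARN_COLUMNS = ['onset', 'duration']
sidecar_columns = ['trial_type']
tag_columns = ['HED']
known = sidecar_columns + tag_columns + NO_WARN_COLUMNS
for column in ['onset', 'duration', 'response_time']:
    if column not in known:
        print(f"HED_UNKNOWN_COLUMN warning for: {column}")  # only response_time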
+ + Parameters: + hed_schema (Schema): A HED schema object to use for extracting definitions. + extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. + + Returns: + DefinitionDict: A single definition dict representing all the data(and extra def dicts). + """ + if self._sidecar: + return self._sidecar.get_def_dict(hed_schema=hed_schema, extra_def_dicts=extra_def_dicts) + + return DefinitionDict(extra_def_dicts, hed_schema=hed_schema) + + def get_column_mapping_issues(self): + """ Get all the issues with finalizing column mapping(duplicate columns, missing required, etc.). + + Notes: + - This is deprecated and now a wrapper for "check_for_mapping_issues()". + + Returns: + list: A list dictionaries of all issues found from mapping column names to numbers. + + """ + return self.check_for_mapping_issues() + + @staticmethod + def _category_handler(category_values, x): + return category_values.get(x, "") + + @staticmethod + def _value_handler(value_str, x): + if x == "n/a": + return "n/a" + + return value_str.replace("#", str(x)) diff --git a/hed/models/df_util.py b/hed/models/df_util.py index 39aa979b..8c4c7882 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -115,7 +115,7 @@ def sort_dataframe_by_onsets(df): if "onset" in df.columns: # Create a copy and sort by onsets as floats(if needed), but continue to keep the string version. df_copy = df.copy() - df_copy['_temp_onset_sort'] = df_copy['onset'].astype(float) + df_copy['_temp_onset_sort'] = pd.to_numeric(df_copy['onset'], errors='coerce') df_copy.sort_values(by='_temp_onset_sort', inplace=True) df_copy.drop(columns=['_temp_onset_sort'], inplace=True) @@ -251,7 +251,7 @@ def filter_series_by_onset(series, onsets): Returns: Series or Dataframe: the series with rows filtered together. """ - indexed_dict = _indexed_dict_from_onsets(onsets.astype(float)) + indexed_dict = _indexed_dict_from_onsets(pd.to_numeric(onsets, errors='coerce')) return _filter_by_index_list(series, indexed_dict=indexed_dict) diff --git a/hed/models/tabular_input.py b/hed/models/tabular_input.py index 61bf90ed..3fea5195 100644 --- a/hed/models/tabular_input.py +++ b/hed/models/tabular_input.py @@ -1,88 +1,88 @@ -""" A BIDS tabular file with sidecar. """ -from hed.models.column_mapper import ColumnMapper -from hed.models.base_input import BaseInput -from hed.models.sidecar import Sidecar - - -class TabularInput(BaseInput): - """ A BIDS tabular file with sidecar. """ - - HED_COLUMN_NAME = "HED" - - def __init__(self, file=None, sidecar=None, name=None): - - """ Constructor for the TabularInput class. - - Parameters: - file (str or FileLike): A tsv file to open. - sidecar (str or Sidecar or FileLike): A Sidecar or source file/filename. - name (str): The name to display for this file for error purposes. - - :raises HedFileError: - - The file is blank. - - An invalid dataframe was passed with size 0. - - An invalid extension was provided. - - A duplicate or empty column name appears. - - :raises OSError: - - Cannot open the indicated file. - - :raises ValueError: - - This file has no column names. 
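The df_util hunks above replace astype(float) with pd.to_numeric(..., errors='coerce'), which is what lets an "n/a" onset pass through as NaN instead of raising. The contrast in isolation:

import pandas as pd

onsets = pd.Series(["1.0", "n/a", "2.0"])
try:
    onsets.astype(float)                       # the old path
except ValueError as e:
    print("astype raises:", e)
print(pd.to_numeric(onsets, errors="coerce").tolist())  # [1.0, nan, 2.0]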
- """ - if sidecar and not isinstance(sidecar, Sidecar): - sidecar = Sidecar(sidecar) - new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME], - warn_on_missing_column=True) - - self._sidecar = sidecar - - super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=True, mapper=new_mapper, - name=name, allow_blank_names=False, ) - - if not self._has_column_names: - raise ValueError("You are attempting to open a bids_old style file with no column headers provided.\n" - "This is probably not intended.") - - def reset_column_mapper(self, sidecar=None): - """ Change the sidecars and settings. - - Parameters: - sidecar (str or [str] or Sidecar or [Sidecar]): A list of json filenames to pull sidecar info from. - - """ - new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME]) - self._sidecar = sidecar - - self.reset_mapper(new_mapper) - - def get_def_dict(self, hed_schema, extra_def_dicts=None): - """ Return the definition dict for this sidecar. - - Parameters: - hed_schema(HedSchema): Used to identify tags to find definitions. - extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. - - Returns: - DefinitionDict: A single definition dict representing all the data(and extra def dicts). - """ - if self._sidecar: - return self._sidecar.get_def_dict(hed_schema, extra_def_dicts) - else: - return super().get_def_dict(hed_schema, extra_def_dicts) - - def get_column_refs(self): - """ Return a list of column refs for this file. - - Default implementation returns none. - - Returns: - column_refs(list): A list of unique column refs found. - """ - if self._sidecar: - return self._sidecar.get_column_refs() - return [] - - def get_sidecar(self): - """Return the sidecar associated with this TabularInput.""" - return self._sidecar +""" A BIDS tabular file with sidecar. """ +from hed.models.column_mapper import ColumnMapper +from hed.models.base_input import BaseInput +from hed.models.sidecar import Sidecar + + +class TabularInput(BaseInput): + """ A BIDS tabular file with sidecar. """ + + HED_COLUMN_NAME = "HED" + + def __init__(self, file=None, sidecar=None, name=None): + + """ Constructor for the TabularInput class. + + Parameters: + file (str or FileLike or pd.Dataframe): A tsv file to open. + sidecar (str or Sidecar or FileLike): A Sidecar or source file/filename. + name (str): The name to display for this file for error purposes. + + :raises HedFileError: + - The file is blank. + - An invalid dataframe was passed with size 0. + - An invalid extension was provided. + - A duplicate or empty column name appears. + + :raises OSError: + - Cannot open the indicated file. + + :raises ValueError: + - This file has no column names. + """ + if sidecar and not isinstance(sidecar, Sidecar): + sidecar = Sidecar(sidecar) + new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME], + warn_on_missing_column=True) + + self._sidecar = sidecar + + super().__init__(file, file_type=".tsv", worksheet_name=None, has_column_names=True, mapper=new_mapper, + name=name, allow_blank_names=False, ) + + if not self._has_column_names: + raise ValueError("You are attempting to open a bids_old style file with no column headers provided.\n" + "This is probably not intended.") + + def reset_column_mapper(self, sidecar=None): + """ Change the sidecars and settings. + + Parameters: + sidecar (str or [str] or Sidecar or [Sidecar]): A list of json filenames to pull sidecar info from. 
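Per the constructor above, a plain path or file-like sidecar is wrapped into a Sidecar automatically, so both spellings below are equivalent. A sketch with hypothetical file paths (the names are made up and not part of this patch):

from hed.models.sidecar import Sidecar
from hed.models.tabular_input import TabularInput

tabular = TabularInput("events.tsv", sidecar="events.json")
same = TabularInput("events.tsv", sidecar=Sidecar("events.json"))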
+ + """ + new_mapper = ColumnMapper(sidecar=sidecar, optional_tag_columns=[self.HED_COLUMN_NAME]) + self._sidecar = sidecar + + self.reset_mapper(new_mapper) + + def get_def_dict(self, hed_schema, extra_def_dicts=None): + """ Return the definition dict for this sidecar. + + Parameters: + hed_schema(HedSchema): Used to identify tags to find definitions. + extra_def_dicts (list, DefinitionDict, or None): Extra dicts to add to the list. + + Returns: + DefinitionDict: A single definition dict representing all the data(and extra def dicts). + """ + if self._sidecar: + return self._sidecar.get_def_dict(hed_schema, extra_def_dicts) + else: + return super().get_def_dict(hed_schema, extra_def_dicts) + + def get_column_refs(self): + """ Return a list of column refs for this file. + + Default implementation returns none. + + Returns: + column_refs(list): A list of unique column refs found. + """ + if self._sidecar: + return self._sidecar.get_column_refs() + return [] + + def get_sidecar(self): + """Return the sidecar associated with this TabularInput.""" + return self._sidecar diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index 0e283b2a..b8ba1f32 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -61,7 +61,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): if data.needs_sorting: data_new = copy.deepcopy(data) data_new._dataframe = df_util.sort_dataframe_by_onsets(data.dataframe) - issues += error_handler.format_error_with_context(ValidationErrors.ONSETS_OUT_OF_ORDER) + issues += error_handler.format_error_with_context(ValidationErrors.ONSETS_UNORDERED) data = data_new onsets = df_util.split_delay_tags(data.series_a, self._schema, data.onsets) @@ -160,7 +160,7 @@ def _validate_column_structure(self, base_input, error_handler, row_adj): List of issues associated with each invalid value. Each issue is a dictionary. 
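The validator hunk above now reports ONSETS_UNORDERED as a warning and then continues validating against a sorted copy of the data, which is why the updated tests below see the warning first and any row-level issues after it. A sketch mirroring those tests (assumes the 8.3.0 schema loads and that warnings are reported by default):

import pandas as pd
from hed.schema import load_schema_version
from hed.errors.error_types import ValidationErrors
from hed.models.tabular_input import TabularInput

df = pd.DataFrame({"onset": ["1.0", "3.0", "2.0"],
                   "HED": ["Age/1", "Age/2", "Age/3"]})
issues = TabularInput(df).validate(load_schema_version("8.3.0"))
print(issues[0]["code"] == ValidationErrors.ONSETS_UNORDERED)  # expected True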
""" issues = [] - col_issues = base_input._mapper.check_for_mapping_issues(base_input) + col_issues = base_input._mapper.check_for_mapping_issues() error_handler.add_context_and_filter(col_issues) issues += col_issues for column in base_input.column_metadata().values(): diff --git a/tests/models/test_base_input.py b/tests/models/test_base_input.py index dba97140..54b75795 100644 --- a/tests/models/test_base_input.py +++ b/tests/models/test_base_input.py @@ -101,7 +101,7 @@ def test_needs_sort(self): self.assertFalse(opened_file.needs_sorting) issues = opened_file.validate(load_schema_version("8.3.0")) - self.assertEqual(issues[1][ErrorContext.ROW], 5) + self.assertEqual(issues[0][ErrorContext.ROW], 5) df.at[3, "onset"] = 1.5 opened_file = TabularInput(df) self.assertFalse(opened_file.needs_sorting) @@ -111,8 +111,8 @@ def test_needs_sort(self): self.assertTrue(opened_file.needs_sorting) issues = opened_file.validate(load_schema_version("8.3.0")) # Should still report the same issue row despite needing sorting for validation - self.assertEqual(issues[1]['code'], ValidationErrors.ONSETS_OUT_OF_ORDER) - self.assertEqual(issues[2][ErrorContext.ROW], 5) + self.assertEqual(issues[0]['code'], ValidationErrors.ONSETS_UNORDERED) + self.assertEqual(issues[1][ErrorContext.ROW], 5) def test_sort(self): from hed.models.df_util import sort_dataframe_by_onsets diff --git a/tests/validator/test_spreadsheet_validator.py b/tests/validator/test_spreadsheet_validator.py index 3b6e7e32..abc6c0fe 100644 --- a/tests/validator/test_spreadsheet_validator.py +++ b/tests/validator/test_spreadsheet_validator.py @@ -62,8 +62,7 @@ def test_invalid_onset_invalid_column(self): self.assertEqual(len(issues), 0) issues = self.validator.validate(TabularInput(self.df_with_onset), def_dicts=def_dict) - self.assertEqual(len(issues), 1) - self.assertEqual(issues[0]['code'], ValidationErrors.HED_UNKNOWN_COLUMN) + self.assertEqual(len(issues), 0) base_has_tags_df = pd.DataFrame({ 'HED': ["(Onset, Def/DefaultOnset)", "(Inset, Def/DefaultOnset), (Event, Age/2)", @@ -78,8 +77,7 @@ def test_invalid_onset_invalid_column(self): self.assertEqual(len(issues), 3) self.assertEqual(issues[0]['code'], ValidationErrors.TEMPORAL_TAG_ERROR) issues = self.validator.validate(TabularInput(self.df_with_onset_has_tags), def_dicts=def_dict) - self.assertEqual(len(issues), 1) - self.assertEqual(issues[0]['code'], ValidationErrors.HED_UNKNOWN_COLUMN) + self.assertEqual(len(issues), 0) base_has_tags_unordered_df = pd.DataFrame({ 'HED': ["(Onset, Def/DefaultOnset)", "(Offset, Def/DefaultOnset), (Age/4)", @@ -93,6 +91,5 @@ def test_invalid_onset_invalid_column(self): self.assertEqual(len(issues), 3) self.assertEqual(issues[0]['code'], ValidationErrors.TEMPORAL_TAG_ERROR) issues = self.validator.validate(TabularInput(self.df_with_onset_has_tags_unordered), def_dicts=def_dict) - self.assertEqual(len(issues), 2) - self.assertEqual(issues[0]['code'], ValidationErrors.HED_UNKNOWN_COLUMN) - self.assertEqual(issues[1]['code'], ValidationErrors.TEMPORAL_TAG_ERROR) + self.assertEqual(len(issues), 1) + self.assertEqual(issues[0]['code'], ValidationErrors.TEMPORAL_TAG_ERROR) From a5b99e3641ba8f2bdd8b05c5c2313066c995763b Mon Sep 17 00:00:00 2001 From: Kay Robbins <1189050+VisLab@users.noreply.github.com> Date: Fri, 11 Oct 2024 09:35:30 -0500 Subject: [PATCH 3/4] Preliminary handling of n/a onsets --- hed/errors/error_messages.py | 6 +- hed/errors/error_types.py | 2 +- hed/models/base_input.py | 1 + hed/models/df_util.py | 29 ++++++++-- 
hed/tools/analysis/event_manager.py | 57 ++++++++++++------- hed/tools/analysis/hed_tag_counts.py | 2 +- hed/tools/analysis/hed_tag_manager.py | 3 +- hed/tools/analysis/hed_type.py | 4 +- hed/tools/analysis/hed_type_manager.py | 1 + .../operations/factor_hed_tags_op.py | 3 +- .../operations/factor_hed_type_op.py | 3 +- .../operations/summarize_hed_tags_op.py | 5 +- .../operations/summarize_hed_type_op.py | 3 +- hed/validator/onset_validator.py | 2 +- hed/validator/spreadsheet_validator.py | 14 +++-- hed/validator/util/class_regex.json | 6 +- tests/tools/analysis/test_event_manager.py | 37 +++++++++++- tests/tools/analysis/test_hed_tag_counts.py | 18 +++--- tests/tools/analysis/test_hed_tag_manager.py | 2 +- tests/tools/analysis/test_hed_type.py | 36 +++++------- .../operations/test_summarize_hed_tags_op.py | 4 +- tests/validator/test_spreadsheet_validator.py | 43 +++++++++++++- 22 files changed, 195 insertions(+), 86 deletions(-) diff --git a/hed/errors/error_messages.py b/hed/errors/error_messages.py index 94e2408a..bef3a420 100644 --- a/hed/errors/error_messages.py +++ b/hed/errors/error_messages.py @@ -25,9 +25,9 @@ def val_error_empty_group(tag): return f"HED tags cannot be empty. Extra delimiters found: '{tag}'" -@hed_tag_error(TemporalErrors.HED_ONSET_WITH_NO_COLUMN, actual_code=ValidationErrors.TEMPORAL_TAG_ERROR) -def val_error_hed_onset_with_no_column(tag): - return f"Cannot have Temporal tags without an 'Onset' column. Found tag: '{tag}'" +@hed_tag_error(TemporalErrors.TEMPORAL_TAG_NO_TIME, actual_code=ValidationErrors.TEMPORAL_TAG_ERROR) +def val_error_temporal_tag_no_time(tag): + return f"Cannot have Temporal tags without an 'Onset' column and a time. Found tag: '{tag}'" @hed_tag_error(ValidationErrors.TAG_EXTENDED, has_sub_tag=True, default_severity=ErrorSeverity.WARNING) diff --git a/hed/errors/error_types.py b/hed/errors/error_types.py index c53e7c6b..1ca1e047 100644 --- a/hed/errors/error_types.py +++ b/hed/errors/error_types.py @@ -176,7 +176,7 @@ class TemporalErrors: ONSET_TAG_OUTSIDE_OF_GROUP = "ONSET_TAG_OUTSIDE_OF_GROUP" INSET_BEFORE_ONSET = "INSET_BEFORE_ONSET" ONSET_SAME_DEFS_ONE_ROW = "ONSET_SAME_DEFS_ONE_ROW" - HED_ONSET_WITH_NO_COLUMN = 'HED_ONSET_WITH_NO_COLUMN' + TEMPORAL_TAG_NO_TIME = 'TEMPORAL_TAG_NO_TIME' DURATION_HAS_OTHER_TAGS = "DURATION_HAS_OTHER_TAGS" DURATION_WRONG_NUMBER_GROUPS = "DURATION_WRONG_NUMBER_GROUPS" diff --git a/hed/models/base_input.py b/hed/models/base_input.py index 1419d8c5..f02ffe62 100644 --- a/hed/models/base_input.py +++ b/hed/models/base_input.py @@ -105,6 +105,7 @@ def series_a(self): Returns: Series: the assembled dataframe with columns merged. """ + return self.combine_dataframe(self.assemble()) @property diff --git a/hed/models/df_util.py b/hed/models/df_util.py index 8c4c7882..c7caf16a 100644 --- a/hed/models/df_util.py +++ b/hed/models/df_util.py @@ -1,5 +1,7 @@ """ Utilities for assembly and conversion of HED strings to different forms. """ import re +import math +from collections import defaultdict from functools import partial import pandas as pd from hed.models.hed_string import HedString @@ -215,7 +217,7 @@ def split_delay_tags(series, hed_schema, onsets): Note: This dataframe may be longer than the original series, but it will never be shorter. 
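+        Returns None if either series or onsets is None.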
""" if series is None or onsets is None: - return + return None split_df = pd.DataFrame({"onset": onsets, "HED": series, "original_index": series.index}) delay_strings = [(i, HedString(hed_string, hed_schema)) for (i, hed_string) in series.items() if "delay/" in hed_string.casefold()] @@ -251,23 +253,42 @@ def filter_series_by_onset(series, onsets): Returns: Series or Dataframe: the series with rows filtered together. """ + #indexed_dict = _indexed_dict_from_onsets(pd.to_numeric(onsets, errors='coerce')) + #return _filter_by_index_list(series, indexed_dict=indexed_dict) indexed_dict = _indexed_dict_from_onsets(pd.to_numeric(onsets, errors='coerce')) - return _filter_by_index_list(series, indexed_dict=indexed_dict) + y = _filter_by_index_list(series, indexed_dict=indexed_dict) + return y + # return _filter_by_index_list(series, indexed_dict=indexed_dict) def _indexed_dict_from_onsets(onsets): - """Finds series of consecutive lines with the same(or close enough) onset""" + """Finds series of consecutive lines with the same (or close enough) onset.""" current_onset = -1000000.0 tol = 1e-9 - from collections import defaultdict indexed_dict = defaultdict(list) + for i, onset in enumerate(onsets): + if math.isnan(onset): # Ignore NaNs + continue if abs(onset - current_onset) > tol: current_onset = onset indexed_dict[current_onset].append(i) return indexed_dict +# def _indexed_dict_from_onsets(onsets): +# """Finds series of consecutive lines with the same(or close enough) onset""" +# current_onset = -1000000.0 +# tol = 1e-9 +# from collections import defaultdict +# indexed_dict = defaultdict(list) +# for i, onset in enumerate(onsets): +# if abs(onset - current_onset) > tol: +# current_onset = onset +# indexed_dict[current_onset].append(i) +# +# return indexed_dict + def _filter_by_index_list(original_data, indexed_dict): """Filters a series or dataframe by the indexed_dict, joining lines as indicated""" diff --git a/hed/tools/analysis/event_manager.py b/hed/tools/analysis/event_manager.py index d898b1f9..2cbad4bf 100644 --- a/hed/tools/analysis/event_manager.py +++ b/hed/tools/analysis/event_manager.py @@ -29,13 +29,15 @@ def __init__(self, input_data, hed_schema, extra_defs=None): are separated from the rest of the annotations, which are contained in self.hed_strings. 
""" + if input_data.onsets is not None and input_data.needs_sorting: + raise HedFileError("OnsetsNotOrdered", "Events must have numeric non-decreasing onset values", "") self.hed_schema = hed_schema self.input_data = input_data self.def_dict = input_data.get_def_dict(hed_schema, extra_def_dicts=extra_defs) - if self.input_data.needs_sorting: - raise HedFileError("OnsetsNotOrdered", "The onset values must be non-decreasing", "") - self.onsets = None - self.hed_strings = None + self.onsets = None # list of onset times or None if not an events file + self.base = None # list of strings containing the starts of event processes + self.context = None # list of strings containing the contexts of event processes + self.hed_strings = None # list of HedString objects without the temporal events self.event_list = None self._create_event_list(input_data) @@ -53,6 +55,9 @@ def _create_event_list(self, input_data): """ hed_strings = input_data.series_a df_util.shrink_defs(hed_strings, self.hed_schema) + if input_data.onsets is None: + self.hed_strings = [HedString(hed_string, self.hed_schema) for hed_string in hed_strings] + return delay_df = df_util.split_delay_tags(hed_strings, self.hed_schema, input_data.onsets) hed_strings = [HedString(hed_string, self.hed_schema) for hed_string in delay_df.HED] @@ -66,6 +71,7 @@ def _create_event_list(self, input_data): for item in onset_dict.values(): item.set_end(len(self.onsets), None) self.hed_strings = hed_strings + self._extract_context() def _extract_duration_events(self, hed, event_index): groups = hed.find_top_level_tags(anchor_tags={DefTagNames.DURATION_KEY}) @@ -120,31 +126,42 @@ def unfold_context(self, remove_types=[]): Returns: list of str or HedString representing the information without the events of temporal extent. - list of str or HedString representing the onsets of the events of temporal extent. - list of str or HedString representing the ongoing context information. + list of str or HedString or None representing the onsets of the events of temporal extent. + list of str or HedString or None representing the ongoing context information. + If the """ - placeholder = "" + remove_defs = self.get_type_defs(remove_types) # definitions corresponding to remove types to be filtered out - new_hed = [placeholder for _ in range(len(self.hed_strings))] - new_base = [placeholder for _ in range(len(self.hed_strings))] - new_contexts = [placeholder for _ in range(len(self.hed_strings))] - base, contexts = self._expand_context() + new_hed = ["" for _ in range(len(self.hed_strings))] for index, item in enumerate(self.hed_strings): new_hed[index] = self._filter_hed(item, remove_types=remove_types, remove_defs=remove_defs, remove_group=False) - new_base[index] = self._filter_hed(base[index], remove_types=remove_types, + if self.onsets is None: + return new_hed, None, None + new_base, new_contexts = self._get_base_contexts(remove_types, remove_defs) + return new_hed, new_base, new_contexts + + def _get_base_contexts(self, remove_types, remove_defs): + """ Expand the context and filter to remove specified types. + + Parameters: + remove_types (list): List of types to remove. + remove_defs (list): List of definitions to remove. 
+ + """ + new_base = ["" for _ in range(len(self.hed_strings))] + new_contexts = ["" for _ in range(len(self.hed_strings))] + for index, item in enumerate(self.hed_strings): + new_base[index] = self._filter_hed(self.base[index], remove_types=remove_types, remove_defs=remove_defs, remove_group=True) - new_contexts[index] = self._filter_hed(contexts[index], remove_types=remove_types, + new_contexts[index] = self._filter_hed(self.contexts[index], remove_types=remove_types, remove_defs=remove_defs, remove_group=True) - return new_hed, new_base, new_contexts # these are each a list of strings + return new_base, new_contexts # these are each a list of strings - def _expand_context(self): + def _extract_context(self): """ Expand the onset and the ongoing context for additional processing. - Returns: - tuple of lists: (base list of str, context list of str). - Notes: For each event, the Onset goes in the base list and the remainder of the times go in the contexts list. """ @@ -156,8 +173,8 @@ def _expand_context(self): base[event.start_index].append(this_str) for i in range(event.start_index + 1, event.end_index): contexts[i].append(this_str) - - return self.compress_strings(base), self.compress_strings(contexts) + self.base = self.compress_strings(base) + self.contexts = self.compress_strings(contexts) def _filter_hed(self, hed, remove_types=[], remove_defs=[], remove_group=False): """ Remove types and definitions from a HED string. diff --git a/hed/tools/analysis/hed_tag_counts.py b/hed/tools/analysis/hed_tag_counts.py index 24133f42..542abc25 100644 --- a/hed/tools/analysis/hed_tag_counts.py +++ b/hed/tools/analysis/hed_tag_counts.py @@ -85,7 +85,7 @@ def __init__(self, name, total_events=0): self.files = {} self.total_events = total_events - def update_event_counts(self, hed_string_obj, file_name): + def update_tag_counts(self, hed_string_obj, file_name): """ Update the tag counts based on a HedString object. Parameters: diff --git a/hed/tools/analysis/hed_tag_manager.py b/hed/tools/analysis/hed_tag_manager.py index d8cd0529..d1553f3a 100644 --- a/hed/tools/analysis/hed_tag_manager.py +++ b/hed/tools/analysis/hed_tag_manager.py @@ -2,12 +2,13 @@ from hed.models.hed_string import HedString from hed.models import string_util +from hed.tools.analysis.event_manager import EventManager class HedTagManager: """ Manager for the HED tags from a columnar file. """ - def __init__(self, event_manager, remove_types=[]): + def __init__(self, event_manager, remove_types=[], extra_defs=None): """ Create a tag manager for one tabular file. Parameters: diff --git a/hed/tools/analysis/hed_type.py b/hed/tools/analysis/hed_type.py index d6c64943..96e25dc3 100644 --- a/hed/tools/analysis/hed_type.py +++ b/hed/tools/analysis/hed_type.py @@ -14,7 +14,7 @@ def __init__(self, event_manager, name, type_tag="condition-variable"): """ Create a variable manager for one type-variable for one tabular file. Parameters: - event_manager (EventManager): An event manager for the tabular file. + event_manager (EventManager): Event manager instance name (str): Name of the tabular file as a unique identifier. type_tag (str): Lowercase short form of the tag to be managed. 
@@ -25,7 +25,7 @@ def __init__(self, event_manager, name, type_tag="condition-variable"): self.name = name self.type_tag = type_tag.casefold() self.event_manager = event_manager - self.type_defs = HedTypeDefs(event_manager.def_dict, type_tag=type_tag) + self.type_defs = HedTypeDefs(self.event_manager.def_dict, type_tag=type_tag) self._type_map = {} # Dictionary of type tags versus dictionary with keys being definition names. self._extract_variables() diff --git a/hed/tools/analysis/hed_type_manager.py b/hed/tools/analysis/hed_type_manager.py index 402d45d1..6d510fd7 100644 --- a/hed/tools/analysis/hed_type_manager.py +++ b/hed/tools/analysis/hed_type_manager.py @@ -3,6 +3,7 @@ import pandas as pd import json from hed.tools.analysis.hed_type import HedType +from hed.tools.analysis.event_manager import EventManager class HedTypeManager: diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py index 69a1464d..1e60894c 100644 --- a/hed/tools/remodeling/operations/factor_hed_tags_op.py +++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py @@ -119,8 +119,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): raise ValueError("QueryNameAlreadyColumn", f"Query [{query_name}]: is already a column name of the data frame") df_list = [input_data.dataframe] - tag_man = HedTagManager(EventManager(input_data, dispatcher.hed_schema), - remove_types=self.remove_types) + tag_man = HedTagManager(input_data, dispatcher.hed_schema, remove_types=self.remove_types) hed_objs = tag_man.get_hed_objs(include_context=self.expand_context, replace_defs=self.replace_defs) df_factors = query_service.search_hed_objs(hed_objs, self.query_handlers, query_names=self.query_names) if len(df_factors.columns) > 0: diff --git a/hed/tools/remodeling/operations/factor_hed_type_op.py b/hed/tools/remodeling/operations/factor_hed_type_op.py index 3d6f523f..67b068d1 100644 --- a/hed/tools/remodeling/operations/factor_hed_type_op.py +++ b/hed/tools/remodeling/operations/factor_hed_type_op.py @@ -73,8 +73,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): input_data = TabularInput(df.copy().fillna('n/a'), sidecar=sidecar, name=name) df_list = [input_data.dataframe] - var_manager = HedTypeManager( - EventManager(input_data, dispatcher.hed_schema)) + var_manager = HedTypeManager(input_data, dispatcher.hed_schema) var_manager.add_type(self.type_tag.casefold()) df_factors = var_manager.get_factor_vectors( diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index 89b678ee..fe6241f8 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -249,12 +249,11 @@ def update_summary(self, new_info): new_info['name'], total_events=len(new_info['df'])) input_data = TabularInput( new_info['df'], sidecar=new_info['sidecar'], name=new_info['name']) - tag_man = HedTagManager(EventManager(input_data, new_info['schema']), - remove_types=self.sum_op.remove_types) + tag_man = HedTagManager(input_data, new_info['schema'], remove_types=self.sum_op.remove_types) hed_objs = tag_man.get_hed_objs(include_context=self.sum_op.include_context, replace_defs=self.sum_op.replace_defs) for hed in hed_objs: - counts.update_event_counts(hed, new_info['name']) + counts.update_tag_counts(hed, new_info['name']) self.summary_dict[new_info["name"]] = counts def get_details_dict(self, tag_counts): diff --git 
a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index dbd09e57..daa26440 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -130,8 +130,7 @@ def update_summary(self, new_info): sidecar = Sidecar(sidecar) input_data = TabularInput( new_info['df'], sidecar=sidecar, name=new_info['name']) - type_values = HedType(EventManager( - input_data, new_info['schema']), new_info['name'], type_tag=self.type_tag) + type_values = HedType(EventManager(input_data, new_info['schema']), new_info['name'], type_tag=self.type_tag) counts = HedTypeCounts(new_info['name'], self.type_tag) counts.update_summary(type_values.get_summary(), type_values.total_events, new_info['name']) diff --git a/hed/validator/onset_validator.py b/hed/validator/onset_validator.py index 1983f409..1d7a04dd 100644 --- a/hed/validator/onset_validator.py +++ b/hed/validator/onset_validator.py @@ -76,5 +76,5 @@ def check_for_banned_tags(hed_string): issues = [] for tag in hed_string.get_all_tags(): if tag.short_base_tag in banned_tag_list: - issues += ErrorHandler.format_error(TemporalErrors.HED_ONSET_WITH_NO_COLUMN, tag) + issues += ErrorHandler.format_error(TemporalErrors.TEMPORAL_TAG_NO_TIME, tag) return issues diff --git a/hed/validator/spreadsheet_validator.py b/hed/validator/spreadsheet_validator.py index b8ba1f32..9fd47443 100644 --- a/hed/validator/spreadsheet_validator.py +++ b/hed/validator/spreadsheet_validator.py @@ -1,6 +1,6 @@ """ Validates spreadsheet tabular data. """ import copy - +import pandas as pd from hed.models.base_input import BaseInput from hed.errors.error_types import ColumnErrors, ErrorContext, ValidationErrors from hed.errors.error_reporter import ErrorHandler @@ -70,12 +70,13 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): self._hed_validator = HedValidator(self._schema, def_dicts=def_dicts) if data.onsets is not None: self._onset_validator = OnsetValidator() + onset_mask = ~pd.isna(pd.to_numeric(onsets['onset'], errors='coerce')) else: self._onset_validator = None + onset_mask = None # Check the rows of the input data - issues += self._run_checks(df, error_handler=error_handler, row_adj=row_adj, - has_onsets=bool(self._onset_validator)) + issues += self._run_checks(df, error_handler=error_handler, row_adj=row_adj, onset_mask=onset_mask) if self._onset_validator: issues += self._run_onset_checks(onsets, error_handler=error_handler, row_adj=row_adj) error_handler.pop_error_context() @@ -83,7 +84,7 @@ def validate(self, data, def_dicts=None, name=None, error_handler=None): issues = sort_issues(issues) return issues - def _run_checks(self, hed_df, error_handler, row_adj, has_onsets): + def _run_checks(self, hed_df, error_handler, row_adj, onset_mask=None): issues = [] columns = list(hed_df.columns) self.invalid_original_rows = set() @@ -113,7 +114,7 @@ def _run_checks(self, hed_df, error_handler, row_adj, has_onsets): error_handler.pop_error_context() # Row continue - if has_onsets or not row_strings: + if not row_strings or (onset_mask is not None and onset_mask.iloc[row_number]): error_handler.pop_error_context() # Row continue @@ -148,6 +149,9 @@ def _run_onset_checks(self, onset_filtered, error_handler, row_adj): error_handler.pop_error_context() # Row return issues + def _run_onset_nan_checks(self, onsets, error_handler, row_adj): + return + def _validate_column_structure(self, base_input, error_handler, row_adj): """ Validate 
that each column in the input data has valid values. diff --git a/hed/validator/util/class_regex.json b/hed/validator/util/class_regex.json index 81f49f02..303983da 100644 --- a/hed/validator/util/class_regex.json +++ b/hed/validator/util/class_regex.json @@ -22,9 +22,9 @@ "less-than": "<", "letters": "[A-Za-z]", "lowercase": "[a-z]", - "name": "[\\w\\-\\u0080-\\uFFFF]", + "name": "[\\w\\-\\u00A0-\\uFFFF]", "newline": "\\n", - "nonascii": "[\\u0080-\\uFFFF]", + "nonascii": "[\\u00A0-\\uFFFF]", "number-sign": "#", "numeric": "[0-9.\\-+^Ee]", "percent-sign": "%", @@ -37,7 +37,7 @@ "single-quote": "'", "forward-slash": "/", "tab": "\\t", - "text": "[^\\x00-\\x1F\\x7F,{}]", + "text": "[^\\x00-\\x1F\\x7F-\\x9F,{}]", "tilde": "~", "underscore": "_", "uppercase": "[A-Z]", diff --git a/tests/tools/analysis/test_event_manager.py b/tests/tools/analysis/test_event_manager.py index 29e661e4..09474d6d 100644 --- a/tests/tools/analysis/test_event_manager.py +++ b/tests/tools/analysis/test_event_manager.py @@ -1,5 +1,7 @@ import os import unittest +import json +import io import numpy as np import pandas as pd @@ -8,6 +10,7 @@ from hed.models.tabular_input import TabularInput from hed.schema.hed_schema_io import load_schema_version from hed.tools.analysis.event_manager import EventManager +from hed.validator import SpreadsheetValidator class Test(unittest.TestCase): @@ -22,9 +25,21 @@ def setUpClass(cls): sidecar_path = os.path.realpath(os.path.join(bids_root_path, 'task-FacePerception_events.json')) sidecar1 = Sidecar(sidecar_path, name='face_sub1_json') cls.input_data = TabularInput(events_path, sidecar=sidecar1, name="face_sub1_events") - cls.events_path = events_path + sidecar_dict = { + "event_code": { + "HED": { + "show": "Sensory-event,Visual-presentation", + "respond": "Press", + "whatever": "Black", + "whatelse": "Purple" + } + } + } + + cls.sidecar2 = Sidecar(io.StringIO(json.dumps(sidecar_dict))) cls.sidecar = sidecar1 cls.schema = schema + cls.def_dict = "(Definition/Def1, (Event))" def test_constructor(self): manager1 = EventManager(self.input_data, self.schema) @@ -45,6 +60,26 @@ def test_constructor(self): if not event.end_time: self.assertEqual(event.end_index, len(manager1.input_data.dataframe)) + def test_no_onset_constructor(self): + # + tsv = { + "event_code": ["show", "respond", "show", "respond", "whatever", "show", "whatelse", "respond"], + "HED": ["Age/100", "n/a", "n/a", "n/a", "Green", "n/a", "Female", "n/a"], + } + + tab = TabularInput(pd.DataFrame(tsv), sidecar=self.sidecar2) + eman = EventManager(tab, self.schema) + self.assertIsNone(eman.onsets) + self.assertEqual(str(eman.hed_strings[0]), "Age/100,Sensory-event,Visual-presentation") + + # No onsets and an n/a entry + tsv["event_code"][2] = "n/a" + tab2 = TabularInput(pd.DataFrame(tsv), sidecar=self.sidecar2) + eman2 = EventManager(tab2, self.schema) + self.assertIsNone(eman2.onsets) + self.assertEqual(str(eman2.hed_strings[0]), "Age/100,Sensory-event,Visual-presentation") + self.assertFalse(str(eman2.hed_strings[2])) + def test_unfold_context_no_remove(self): manager1 = EventManager(self.input_data, self.schema) hed, base, context = manager1.unfold_context() diff --git a/tests/tools/analysis/test_hed_tag_counts.py b/tests/tools/analysis/test_hed_tag_counts.py index d70ba1d0..3bcbb0d3 100644 --- a/tests/tools/analysis/test_hed_tag_counts.py +++ b/tests/tools/analysis/test_hed_tag_counts.py @@ -44,8 +44,8 @@ def test_constructor(self): self.assertIsInstance(counts, HedTagCounts) self.assertFalse(counts.tag_dict) for k in 
range(6): - counts.update_event_counts(HedString(self.input_df.iloc[k]['HED_assembled'], self.hed_schema), - file_name='Base_name') + counts.update_tag_counts(HedString(self.input_df.iloc[k]['HED_assembled'], self.hed_schema), + file_name='Base_name') self.assertIsInstance(counts.tag_dict, dict) self.assertEqual(14, len(counts.tag_dict)) @@ -53,10 +53,10 @@ def test_merge_tag_dicts(self): counts1 = HedTagCounts('Base_name1', 50) counts2 = HedTagCounts('Base_name2', 100) for k in range(6): - counts1.update_event_counts(HedString(self.input_df.iloc[k]['HED_assembled'], self.hed_schema), - file_name='Base_name1') - counts2.update_event_counts(HedString(self.input_df.iloc[k]['HED_assembled'], self.hed_schema), - file_name='Base_name2') + counts1.update_tag_counts(HedString(self.input_df.iloc[k]['HED_assembled'], self.hed_schema), + file_name='Base_name1') + counts2.update_tag_counts(HedString(self.input_df.iloc[k]['HED_assembled'], self.hed_schema), + file_name='Base_name2') counts3 = HedTagCounts("All", 0) counts3.merge_tag_dicts(counts1.tag_dict) counts3.merge_tag_dicts(counts2.tag_dict) @@ -68,8 +68,8 @@ def test_merge_tag_dicts(self): def test_hed_tag_count(self): name = 'Base_name1' counts1 = HedTagCounts(name, 0) - counts1.update_event_counts(HedString(self.input_df.iloc[0]['HED_assembled'], self.hed_schema), - file_name=name) + counts1.update_tag_counts(HedString(self.input_df.iloc[0]['HED_assembled'], self.hed_schema), + file_name=name) self.assertIsInstance(counts1, HedTagCounts) def test_organize_tags(self): @@ -80,7 +80,7 @@ def test_organize_tags(self): # type_defs = input_data.get_definitions().gathered_defs for hed in df["HED_assembled"]: - counts.update_event_counts(HedString(hed, self.hed_schema), 'run-1') + counts.update_tag_counts(HedString(hed, self.hed_schema), 'run-1') self.assertIsInstance(counts.tag_dict, dict) self.assertEqual(46, len(counts.tag_dict)) org_tags, leftovers = counts.organize_tags(self.tag_template) diff --git a/tests/tools/analysis/test_hed_tag_manager.py b/tests/tools/analysis/test_hed_tag_manager.py index c7cb40ea..b539bc6b 100644 --- a/tests/tools/analysis/test_hed_tag_manager.py +++ b/tests/tools/analysis/test_hed_tag_manager.py @@ -109,7 +109,7 @@ def test_get_hed_objs(self): self.assertIsInstance(tag_man, HedTagManager) hed_objs = tag_man.get_hed_objs() self.assertIsInstance(hed_objs, list) - self.assertEqual(len(hed_objs), len(event_man.onsets)) + self.assertEqual(len(hed_objs), len(tag_man.event_manager.onsets)) # def test_constructor_variable_caps(self): # sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') diff --git a/tests/tools/analysis/test_hed_type.py b/tests/tools/analysis/test_hed_type.py index 662b82cc..08cede6c 100644 --- a/tests/tools/analysis/test_hed_type.py +++ b/tests/tools/analysis/test_hed_type.py @@ -7,8 +7,8 @@ from hed.models.sidecar import Sidecar from hed.models.tabular_input import TabularInput from hed.schema.hed_schema_io import load_schema_version -from hed.tools.analysis.event_manager import EventManager from hed.tools.analysis.hed_type import HedType +from hed.tools.analysis.event_manager import EventManager class Test(unittest.TestCase): @@ -42,8 +42,7 @@ def setUpClass(cls): test_onsets1 = [0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0] df1 = DataFrame(test_onsets1, columns=['onset']) df1['HED'] = test_strings1 - input_data1 = TabularInput(df1) - cls.event_man1 = EventManager(input_data1, schema, extra_defs=def_dict) + cls.input_data1 = TabularInput(df1) test_strings2 = ["Def/Cond2, (Def/Cond6/4, Onset), 
(Def/Cond6/7.8, Onset), Def/Cond6/Alpha", "Yellow", "Def/Cond2, (Def/Cond6/4, Onset)", @@ -52,8 +51,7 @@ def setUpClass(cls): test_onsets2 = [0.0, 1.0, 2.0, 3.0, 4.0] df2 = DataFrame(test_onsets2, columns=['onset']) df2['HED'] = test_strings2 - input_data2 = TabularInput(df2) - cls.event_man2 = EventManager(input_data2, schema, extra_defs=def_dict) + cls.input_data2 = TabularInput(df2) test_strings3 = ['(Def/Cond3, Offset)'] test_onsets3 = [0.0] df3 = DataFrame(test_onsets3, columns=['onset']) @@ -68,7 +66,7 @@ def setUpClass(cls): cls.def_dict = def_dict def test_constructor(self): - type_var = HedType(self.event_man1, 'test-it') + type_var = HedType(EventManager(self.input_data1, self.schema, extra_defs=self.def_dict), 'test-it') self.assertIsInstance(type_var, HedType, "Constructor should create a HedType from an event manager") self.assertEqual(len(type_var._type_map), 8, "Constructor ConditionVariables should have the right length") @@ -76,34 +74,30 @@ def test_constructor(self): def test_constructor_from_tabular_input(self): sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') input_data = TabularInput(self.events_path, sidecar=sidecar1, name="face_sub1_events") - event_man = EventManager(input_data, self.schema) - var_man = HedType(event_man, 'face') + var_man = HedType(EventManager(input_data, self.schema), 'face') self.assertIsInstance(var_man, HedType, "Constructor should create a HedTypeManager from a tabular input") def test_constructor_variable_caps(self): sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') input_data = TabularInput(self.events_path, sidecar1, name="face_sub1_events") - event_man = EventManager(input_data, self.schema) - var_manager = HedType(event_man, 'run-01') + var_manager = HedType(EventManager(input_data, self.schema), 'run-01') self.assertIsInstance(var_manager, HedType, "Constructor should create a HedTypeManager variable caps") def test_constructor_multiple_values(self): - type_var = HedType(self.event_man2, 'test-it') + type_var = HedType(EventManager(self.input_data2, self.schema, extra_defs=self.def_dict), 'test-it') self.assertIsInstance(type_var, HedType, "Constructor should create a HedType from an event manager") self.assertEqual(len(type_var._type_map), 3, "Constructor should have right number of type_variables if multiple") def test_constructor_unmatched(self): with self.assertRaises(KeyError) as context: - event_man = EventManager(self.input_data3, self.schema, extra_defs=self.def_dict) - HedType(event_man, 'run-01') + HedType(EventManager(self.input_data3, self.schema, extra_defs=self.def_dict), 'run-01') self.assertEqual(context.exception.args[0], 'cond3') def test_get_variable_factors(self): sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') input_data = TabularInput(self.events_path, sidecar1, name="face_sub1_events") - event_man = EventManager(input_data, self.schema) - var_manager = HedType(event_man, 'run-01') + var_manager = HedType(EventManager(input_data, self.schema), 'run-01') df_new1 = var_manager.get_type_factors() self.assertIsInstance(df_new1, DataFrame) self.assertEqual(len(df_new1), 200) @@ -117,23 +111,21 @@ def test_get_variable_factors(self): def test_str(self): sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') input_data = TabularInput(self.events_path, sidecar1, name="face_sub1_events") - event_man = EventManager(input_data, self.schema) - var_manager = HedType(event_man, 'run-01') + var_manager = HedType(EventManager(input_data, self.schema), 'run-01') new_str = str(var_manager) 
self.assertIsInstance(new_str, str) def test_summarize_variables(self): sidecar1 = Sidecar(self.sidecar_path, name='face_sub1_json') input_data = TabularInput(self.events_path, sidecar1, name="face_sub1_events") - event_man = EventManager(input_data, self.schema) - var_manager = HedType(event_man, 'run-01') + var_manager = HedType(EventManager(input_data, self.schema), 'run-01') summary = var_manager.get_summary() self.assertIsInstance(summary, dict, "get_summary produces a dictionary if not json") self.assertEqual(len(summary), 3, "Summarize_variables has right number of condition type_variables") self.assertIn("key-assignment", summary, "get_summary has a correct key") def test_extract_definition_variables(self): - var_manager = HedType(self.event_man1, 'run-01') + var_manager = HedType(EventManager(self.input_data1, self.schema, extra_defs=self.def_dict), 'run-01') var_levels = var_manager._type_map['var3'].levels self.assertNotIn('cond3/7', var_levels, "_extract_definition_variables before extraction def/cond3/7 not in levels") @@ -143,12 +135,12 @@ def test_extract_definition_variables(self): "_extract_definition_variables after extraction def/cond3/7 not in levels") def test_get_variable_names(self): - conditions1 = HedType(self.event_man1, 'run-01') + conditions1 = HedType(EventManager(self.input_data1, self.schema, extra_defs=self.def_dict), 'run-01') list1 = conditions1.get_type_value_names() self.assertEqual(len(list1), 8, "get_variable_tags list should have the right length") def test_get_variable_def_names(self): - conditions1 = HedType(self.event_man1, 'run-01') + conditions1 = HedType(EventManager(self.input_data1, self.schema, extra_defs=self.def_dict), 'run-01') list1 = conditions1.get_type_def_names() self.assertEqual(len(list1), 5, "get_type_def_names list should have the right length") diff --git a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py index 5303b576..e93e5e3d 100644 --- a/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py +++ b/tests/tools/remodeling/operations/test_summarize_hed_tags_op.py @@ -153,7 +153,7 @@ def test_quick3(self): self.assertIsInstance(tag_man, HedTagManager) # hed_objs = tag_man.get_hed_objs(include_context=include_context, replace_defs=replace_defs) # for hed in hed_objs: - # counts.update_event_counts(hed, 'myName') + # counts.update_tag_counts(hed, 'myName') # summary_dict['myName'] = counts def test_quick4(self): @@ -172,7 +172,7 @@ def test_quick4(self): # type_defs = input_data.get_definitions().gathered_defs for hed in df["HED_assembled"]: - counts.update_event_counts(HedString(hed, schema), 'myName') + counts.update_tag_counts(HedString(hed, schema), 'myName') summary_dict['myName'] = counts def test_get_summary_details(self): diff --git a/tests/validator/test_spreadsheet_validator.py b/tests/validator/test_spreadsheet_validator.py index abc6c0fe..6c9b08ac 100644 --- a/tests/validator/test_spreadsheet_validator.py +++ b/tests/validator/test_spreadsheet_validator.py @@ -1,11 +1,13 @@ import pandas as pd import os import shutil +import json +import io import unittest from hed import load_schema_version, load_schema from hed.validator import SpreadsheetValidator -from hed import TabularInput, SpreadsheetInput +from hed import TabularInput, SpreadsheetInput, Sidecar from hed.errors.error_types import ValidationErrors @@ -93,3 +95,42 @@ def test_invalid_onset_invalid_column(self): issues = 
self.validator.validate(TabularInput(self.df_with_onset_has_tags_unordered), def_dicts=def_dict)
         self.assertEqual(len(issues), 1)
         self.assertEqual(issues[0]['code'], ValidationErrors.TEMPORAL_TAG_ERROR)
+
+    def test_onset_na(self):
+        # Test with no sidecar
+        def_dict = "(Definition/Def1, (Event))"
+        tsv = {
+            "onset": [0.0, 1.2, 1.2, 3.0, "n/a", 3.5, "n/a", 6],
+            "duration": [0.5, "n/a", "n/a", "n/a", "n/a", "n/a", "n/a", "n/a"],
+            "event_code": ["show", "respond", "show", "respond", "whatever", "show", "whatelse", "respond"],
+            "HED": ["Age/100", "(Def/Def1, Onset)", "Red", "n/a", "Green", "(Def/Def1, Offset)",
+                    "Female,(Def/Def1,Onset)", "n/a"],
+        }
+        df_with_nans = pd.DataFrame(tsv)
+        issues = self.validator.validate(TabularInput(df_with_nans), def_dicts=def_dict)
+        self.assertEqual(len(issues), 3)
+        self.assertEqual(issues[2]['code'], ValidationErrors.TEMPORAL_TAG_ERROR)
+
+        # Test with sidecar
+        sidecar_dict = {
+            "event_code": {
+                "HED": {
+                    "show": "Sensory-event,Visual-presentation",
+                    "respond": "Press",
+                    "whatever": "Black",
+                    "whatelse": "Purple"
+                }
+            }
+        }
+
+        sidecar1 = Sidecar(io.StringIO(json.dumps(sidecar_dict)))
+        issues1 = self.validator.validate(TabularInput(df_with_nans, sidecar=sidecar1), def_dicts=def_dict)
+        self.assertEqual(len(issues1), 2)
+        self.assertEqual(issues1[1]['code'], ValidationErrors.TEMPORAL_TAG_ERROR)
+
+        # With "{HED}" in the 'whatever' annotation, the HED column is used only where referenced
+        # during assembly, so only the unordered-onset warning remains.
+        sidecar_dict["event_code"]["HED"]["whatever"] = "Black, {HED}"
+        sidecar2 = Sidecar(io.StringIO(json.dumps(sidecar_dict)))
+        issues2 = self.validator.validate(TabularInput(df_with_nans, sidecar=sidecar2), def_dicts=def_dict)
+        self.assertEqual(len(issues2), 1)
+        self.assertEqual(issues2[0]['code'], ValidationErrors.ONSETS_UNORDERED)

From 9bf1089f5482b17da60607713f207e5a627ae8f4 Mon Sep 17 00:00:00 2001
From: Kay Robbins <1189050+VisLab@users.noreply.github.com>
Date: Fri, 11 Oct 2024 13:24:55 -0500
Subject: [PATCH 4/4] Updated validator and EventManager to handle n/a onsets

---
 .../operations/factor_hed_tags_op.py          |  2 +-
 .../operations/factor_hed_type_op.py          |  2 +-
 .../operations/merge_consecutive_op.py        |  2 +-
 .../operations/summarize_column_names_op.py   |  2 +-
 .../operations/summarize_column_values_op.py  |  2 +-
 .../operations/summarize_definitions_op.py    |  2 +-
 .../operations/summarize_hed_tags_op.py       | 13 +++---
 .../operations/summarize_hed_type_op.py       |  2 +-
 .../operations/summarize_hed_validation_op.py |  5 +-
 tests/tools/analysis/test_event_manager.py    | 46 ++++++++++++++++++-
 10 files changed, 60 insertions(+), 18 deletions(-)

diff --git a/hed/tools/remodeling/operations/factor_hed_tags_op.py b/hed/tools/remodeling/operations/factor_hed_tags_op.py
index 1e60894c..7222c2a4 100644
--- a/hed/tools/remodeling/operations/factor_hed_tags_op.py
+++ b/hed/tools/remodeling/operations/factor_hed_tags_op.py
@@ -119,7 +119,7 @@ def do_op(self, dispatcher, df, name, sidecar=None):
             raise ValueError("QueryNameAlreadyColumn",
                              f"Query [{query_name}]: is already a column name of the data frame")
         df_list = [input_data.dataframe]
-        tag_man = HedTagManager(input_data, dispatcher.hed_schema, remove_types=self.remove_types)
+        tag_man = HedTagManager(EventManager(input_data, dispatcher.hed_schema), remove_types=self.remove_types)
         hed_objs = tag_man.get_hed_objs(include_context=self.expand_context, replace_defs=self.replace_defs)
         df_factors = query_service.search_hed_objs(hed_objs, self.query_handlers, query_names=self.query_names)
         if len(df_factors.columns) > 0:
diff --git a/hed/tools/remodeling/operations/factor_hed_type_op.py b/hed/tools/remodeling/operations/factor_hed_type_op.py index 67b068d1..8a3246c5 100644 --- a/hed/tools/remodeling/operations/factor_hed_type_op.py +++ b/hed/tools/remodeling/operations/factor_hed_type_op.py @@ -73,7 +73,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): input_data = TabularInput(df.copy().fillna('n/a'), sidecar=sidecar, name=name) df_list = [input_data.dataframe] - var_manager = HedTypeManager(input_data, dispatcher.hed_schema) + var_manager = HedTypeManager(EventManager(input_data, dispatcher.hed_schema)) var_manager.add_type(self.type_tag.casefold()) df_factors = var_manager.get_factor_vectors( diff --git a/hed/tools/remodeling/operations/merge_consecutive_op.py b/hed/tools/remodeling/operations/merge_consecutive_op.py index 8041864e..8d193db2 100644 --- a/hed/tools/remodeling/operations/merge_consecutive_op.py +++ b/hed/tools/remodeling/operations/merge_consecutive_op.py @@ -115,7 +115,7 @@ def do_op(self, dispatcher, df, name, sidecar=None): df_new = df.copy() code_mask = df_new[self.column_name] == self.event_code - if sum(code_mask.astype(int)) == 0: + if not code_mask.any(): return df_new match_columns.append(self.column_name) match_df = df_new.loc[:, match_columns] diff --git a/hed/tools/remodeling/operations/summarize_column_names_op.py b/hed/tools/remodeling/operations/summarize_column_names_op.py index 8dbdb15b..03fed614 100644 --- a/hed/tools/remodeling/operations/summarize_column_names_op.py +++ b/hed/tools/remodeling/operations/summarize_column_names_op.py @@ -94,7 +94,7 @@ def __init__(self, sum_op): """ Constructor for column name summary manager. Parameters: - sum_op (BaseOp): Operation associated with this summary. + sum_op (SummarizeColumnNamesOp): Operation associated with this summary. """ super().__init__(sum_op) diff --git a/hed/tools/remodeling/operations/summarize_column_values_op.py b/hed/tools/remodeling/operations/summarize_column_values_op.py index 612e4b5e..5fa5ddff 100644 --- a/hed/tools/remodeling/operations/summarize_column_values_op.py +++ b/hed/tools/remodeling/operations/summarize_column_values_op.py @@ -133,7 +133,7 @@ def __init__(self, sum_op): """ Constructor for column value summary manager. Parameters: - sum_op (BaseOp): Operation associated with this summary. + sum_op (SummarizeColumnValuesOp): Operation associated with this summary. """ super().__init__(sum_op) diff --git a/hed/tools/remodeling/operations/summarize_definitions_op.py b/hed/tools/remodeling/operations/summarize_definitions_op.py index f953d593..65503fce 100644 --- a/hed/tools/remodeling/operations/summarize_definitions_op.py +++ b/hed/tools/remodeling/operations/summarize_definitions_op.py @@ -94,7 +94,7 @@ def __init__(self, sum_op, hed_schema, known_defs=None): """ Constructor for the summary of definitions. Parameters: - sum_op (BaseOp): Summary operation class for gathering definitions. + sum_op (SummarizeDefinitionsOp): Summary operation class for gathering definitions. hed_schema (HedSchema or HedSchemaGroup): Schema used for the dataset. known_defs (str or list or DefinitionDict): Definitions already known to be used. 
diff --git a/hed/tools/remodeling/operations/summarize_hed_tags_op.py b/hed/tools/remodeling/operations/summarize_hed_tags_op.py index fe6241f8..5e426bae 100644 --- a/hed/tools/remodeling/operations/summarize_hed_tags_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_tags_op.py @@ -187,7 +187,7 @@ def __init__(self, parameters): } if self.word_cloud["use_mask"] and not self.word_cloud["mask_path"]: self.word_cloud["mask_path"] = os.path.realpath( - os.path.join(os.path.dirname(__file__), '../../../resources/word_cloud_brain_mask.png')) + os.path.join(os.path.dirname(__file__), '../../../resources/word_cloud_brain_mask.png')) if self.word_cloud["font_path"]: self.word_cloud["font_path"] = os.path.realpath(self.word_cloud["font_path"]) @@ -224,11 +224,12 @@ def validate_input_data(parameters): class HedTagSummary(BaseSummary): """ Manager of the HED tag summaries. """ + def __init__(self, sum_op): """ Constructor for HED tag summary manager. Parameters: - sum_op (BaseOp): Operation associated with this summary. + sum_op (SummarizeHedTagsOp): Operation associated with this summary. """ @@ -249,7 +250,7 @@ def update_summary(self, new_info): new_info['name'], total_events=len(new_info['df'])) input_data = TabularInput( new_info['df'], sidecar=new_info['sidecar'], name=new_info['name']) - tag_man = HedTagManager(input_data, new_info['schema'], remove_types=self.sum_op.remove_types) + tag_man = HedTagManager(EventManager(input_data, new_info['schema']), remove_types=self.sum_op.remove_types) hed_objs = tag_man.get_hed_objs(include_context=self.sum_op.include_context, replace_defs=self.sum_op.replace_defs) for hed in hed_objs: @@ -392,8 +393,7 @@ def _get_dataset_string(result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Dataset: Total events={result.get('Total events', 0)} " f"Total files={len(result.get('Files', []))}"] - sum_list = sum_list + \ - HedTagSummary._get_tag_list(result, indent=indent) + sum_list = sum_list + HedTagSummary._get_tag_list(result, indent=indent) return "\n".join(sum_list) @staticmethod @@ -409,8 +409,7 @@ def _get_individual_string(result, indent=BaseSummary.DISPLAY_INDENT): """ sum_list = [f"Total events={result.get('Total events', 0)}"] - sum_list = sum_list + \ - HedTagSummary._get_tag_list(result, indent=indent) + sum_list = sum_list + HedTagSummary._get_tag_list(result, indent=indent) return "\n".join(sum_list) @staticmethod diff --git a/hed/tools/remodeling/operations/summarize_hed_type_op.py b/hed/tools/remodeling/operations/summarize_hed_type_op.py index daa26440..36a9667f 100644 --- a/hed/tools/remodeling/operations/summarize_hed_type_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_type_op.py @@ -108,7 +108,7 @@ def __init__(self, sum_op): """ Constructor for HED type summary manager. Parameters: - sum_op (BaseOp): Operation associated with this summary. + sum_op (SummarizeHedTypeOp): Operation associated with this summary. """ super().__init__(sum_op) diff --git a/hed/tools/remodeling/operations/summarize_hed_validation_op.py b/hed/tools/remodeling/operations/summarize_hed_validation_op.py index 3f720740..6549c4c6 100644 --- a/hed/tools/remodeling/operations/summarize_hed_validation_op.py +++ b/hed/tools/remodeling/operations/summarize_hed_validation_op.py @@ -106,7 +106,7 @@ def __init__(self, sum_op): """ Constructor for validation issue manager. Parameters: - sum_op (BaseOp): Operation associated with this summary. + sum_op (SummarizeHedValidationOp): Operation associated with this summary. 
""" super().__init__(sum_op) @@ -153,8 +153,7 @@ def update_summary(self, new_info): sidecar = new_info.get('sidecar', None) if sidecar and not isinstance(sidecar, Sidecar): - sidecar = Sidecar( - files=new_info['sidecar'], name=os.path.basename(sidecar)) + sidecar = Sidecar(files=new_info['sidecar'], name=os.path.basename(sidecar)) results = self._get_sidecar_results( sidecar, new_info, self.sum_op.check_for_warnings) if not results['sidecar_had_issues']: diff --git a/tests/tools/analysis/test_event_manager.py b/tests/tools/analysis/test_event_manager.py index 09474d6d..8a4024d5 100644 --- a/tests/tools/analysis/test_event_manager.py +++ b/tests/tools/analysis/test_event_manager.py @@ -33,6 +33,11 @@ def setUpClass(cls): "whatever": "Black", "whatelse": "Purple" } + }, + "defs": { + "HED": { + "defs1": "(Definition/Con1, (Condition-variable/Cond-one)), (Definition/Con2, (Condition-variable/Cond-one))" + } } } @@ -61,7 +66,7 @@ def test_constructor(self): self.assertEqual(event.end_index, len(manager1.input_data.dataframe)) def test_no_onset_constructor(self): - # + # No onsets --- has an event manager tsv = { "event_code": ["show", "respond", "show", "respond", "whatever", "show", "whatelse", "respond"], "HED": ["Age/100", "n/a", "n/a", "n/a", "Green", "n/a", "Female", "n/a"], @@ -79,6 +84,43 @@ def test_no_onset_constructor(self): self.assertIsNone(eman2.onsets) self.assertEqual(str(eman2.hed_strings[0]), "Age/100,Sensory-event,Visual-presentation") self.assertFalse(str(eman2.hed_strings[2])) + self.assertIsNone(eman2.base) + self.assertIsNone(eman2.context) + + def test_bad_onset_constructor(self): + tsv = { + "onset": [0.0, 1.2, 1.2, 3.0, 5, 3.5, 4, 6], + "duration": [0.5, "n/a", "n/a", "n/a", "n/a", "n/a", "n/a", "n/a"], + "event_code": ["show", "respond", "show", "respond", "whatever", "show", "whatelse", "respond"], + "HED": ["Age/100", "n/a", "n/a", "n/a", "Green", "n/a", "Female", "n/a"], + } + + tab = TabularInput(pd.DataFrame(tsv), sidecar=self.sidecar2) + with self.assertRaises(HedFileError): + EventManager(tab, self.schema) + + tsv = { + "onset": [0.0, 1.2, 1.2, 3.0, "n/a", 3.5, "n/a", 6], + "duration": [0.5, "n/a", "n/a", "n/a", "n/a", "n/a", "n/a", "n/a"], + "event_code": ["show", "respond", "show", "respond", "whatever", "show", "whatelse", "respond"], + "HED": ["Age/100", "n/a", "n/a", "n/a", "Green", "n/a", "Female", "n/a"], + } + tab = TabularInput(pd.DataFrame(tsv), sidecar=self.sidecar2) + with self.assertRaises(HedFileError): + EventManager(tab, self.schema) + + def test_unfold_no_onset(self): + tsv = { + "event_code": ["show", "respond", "show", "respond", "whatever", "show", "whatelse", "respond"], + "HED": ["Age/100,Condition-variable/Temp", "Def/Con1", "Def/Con2", "n/a", "Green", "n/a", "Female", "n/a"], + } + tab = TabularInput(pd.DataFrame(tsv), sidecar=self.sidecar2) + defs = self.sidecar2.get_def_dict(self.schema) + manager1 = EventManager(tab, self.schema) + hed1, base1, context1 = manager1.unfold_context() + hed2, base2, context2 = manager1.unfold_context(remove_types=["Condition-variable"]) + self.assertEqual(hed1[1], "Def/Con1,Press") + self.assertEqual(hed2[1], "Press") def test_unfold_context_no_remove(self): manager1 = EventManager(self.input_data, self.schema) @@ -87,6 +129,8 @@ def test_unfold_context_no_remove(self): self.assertIsInstance(hed[index], str) self.assertIsInstance(base[index], str) + + def test_unfold_context_remove(self): manager1 = EventManager(self.input_data, self.schema) hed, base, context = 
manager1.unfold_context(remove_types=['Condition-variable', 'Task'])
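
For reference, a minimal usage sketch of the n/a-onset behavior these patches introduce. This is not part of the patch series; it assumes the module layout used in the tests above and HED schema version 8.3.0:

    import pandas as pd
    from hed import TabularInput
    from hed.schema.hed_schema_io import load_schema_version
    from hed.tools.analysis.event_manager import EventManager

    schema = load_schema_version("8.3.0")

    # Without an onset column the EventManager skips temporal expansion:
    # onsets, base, and contexts all remain None.
    tab = TabularInput(pd.DataFrame({"event_code": ["show", "respond"],
                                     "HED": ["Red", "Blue"]}))
    manager = EventManager(tab, schema)
    assert manager.onsets is None

    # With numeric, non-decreasing onsets the events are expanded in time
    # and unfold_context returns base and context lists as well.
    df = pd.DataFrame({"onset": [0.0, 1.0], "HED": ["Red", "Blue"]})
    manager2 = EventManager(TabularInput(df), schema)
    hed, base, context = manager2.unfold_context()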