diff --git a/pipestat/parsed_schema.py b/pipestat/parsed_schema.py
index fc1fcac7..0857b2fb 100644
--- a/pipestat/parsed_schema.py
+++ b/pipestat/parsed_schema.py
@@ -50,13 +50,27 @@ class Config:
         return BaseModel
 
 
-def _safe_pop_one_mapping(key: str, data: Dict[str, Any], info_name: str) -> Any:
-    value = data.pop(key, NULL_MAPPING_VALUE)
-    if isinstance(value, Mapping):
-        return value
-    raise SchemaError(
-        f"{info_name} info in schema definition has invalid type: {type(value).__name__}"
-    )
+def _safe_pop_one_mapping(
+    mappingkey: str, data: Dict[str, Any], info_name: str, subkeys: Optional[List[str]] = None
+) -> Any:
+    """
+    mappingkey: the dict key under which the sample, project, or status values are stored, e.g. data[mappingkey]
+    subkeys: if using a JSON schema, the dict is nested one level deeper, e.g. data["properties"]["samples"][mappingkey]
+    """
+    if subkeys:
+        try:
+            value = data[subkeys[0]].pop(mappingkey, NULL_MAPPING_VALUE)
+        except KeyError:
+            value = {}
+        if isinstance(value, Mapping):
+            return value
+    else:
+        value = data.pop(mappingkey, NULL_MAPPING_VALUE)
+        if isinstance(value, Mapping):
+            return value
+    raise SchemaError(
+        f"{info_name} info in schema definition has invalid type: {type(value).__name__}"
+    )
 
 
 class ParsedSchema(object):
@@ -81,46 +95,63 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None:
         # initial validation and parse
         if not isinstance(data, dict):
             _, data = read_yaml_data(data, "schema")
+        data = copy.deepcopy(data)
 
-        # pipeline identifier
-        self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None)
+        # Support the old output schema format (backwards compatibility) alongside the new JSON-schema format:
+        if "properties" in list(data.keys()):
+            # Assume a top-level "properties" key implies a proper JSON schema.
+            self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None)
+
+            sample_data = _safe_pop_one_mapping(
+                subkeys=["samples"],
+                data=data["properties"],
+                info_name="sample-level",
+                mappingkey="properties",
+            )
+
+            prj_data = _safe_pop_one_mapping(
+                subkeys=["project"],
+                data=data["properties"],
+                info_name="project-level",
+                mappingkey="properties",
+            )
+
+            self._status_data = _safe_pop_one_mapping(
+                subkeys=["status"],
+                data=data["properties"],
+                info_name="status",
+                mappingkey="properties",
+            )
+
+        else:
+            self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None)
+            sample_data = _safe_pop_one_mapping(
+                mappingkey=self._SAMPLES_KEY, data=data, info_name="sample-level"
+            )
+            prj_data = _safe_pop_one_mapping(
+                mappingkey=self._PROJECT_KEY, data=data, info_name="project-level"
+            )
+            # Parse custom status declaration if present.
+            self._status_data = _safe_pop_one_mapping(
+                mappingkey=self._STATUS_KEY, data=data, info_name="status"
+            )
+
         if not isinstance(self._pipeline_name, str):
             raise SchemaError(
                 f"Could not find valid pipeline identifier (key '{SCHEMA_PIPELINE_NAME_KEY}') in given schema data"
             )
 
-        # Parse sample-level data item declarations.
-        sample_data = _safe_pop_one_mapping(
-            key=self._SAMPLES_KEY, data=data, info_name="sample-level"
-        )
-
         self._sample_level_data = _recursively_replace_custom_types(sample_data)
 
-        # Parse project-level data item declarations.
-        prj_data = _safe_pop_one_mapping(
-            key=self._PROJECT_KEY, data=data, info_name="project-level"
-        )
         self._project_level_data = _recursively_replace_custom_types(prj_data)
 
         # Sample- and/or project-level data must be declared.
         if not self._sample_level_data and not self._project_level_data:
             raise SchemaError("Neither sample-level nor project-level data items are declared.")
 
-        # Parse custom status declaration if present.
-        self._status_data = _safe_pop_one_mapping(
-            key=self._STATUS_KEY, data=data, info_name="status"
-        )
-
-        if data:
-            _LOGGER.info(
-                "Top-Level arguments found in output schema. They will be assigned to project-level."
-            )
-            extra_project_data = _recursively_replace_custom_types(data)
-            self._project_level_data.update(extra_project_data)
-
         # Check that no reserved keywords were used as data items.
-        resv_kwds = {"id", SAMPLE_NAME}
+        resv_kwds = {"id", RECORD_IDENTIFIER}
         reserved_keywords_used = set()
         for data in [self.project_level_data, self.sample_level_data, self.status_data]:
             reserved_keywords_used |= set(data.keys()) & resv_kwds
@@ -151,7 +182,10 @@ def __str__(self):
         res += f"\n Sample Level Data:"
         for k, v in self._sample_level_data.items():
             res += f"\n - {k} : {v}"
-        # TODO: add status schema data
+        if self._status_data is not None:
+            res += f"\n Status Data:"
+            for k, v in self._status_data.items():
+                res += f"\n - {k} : {v}"
         return res
 
     @property
diff --git a/tests/conftest.py b/tests/conftest.py
index cf62a478..16422035 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -86,3 +86,8 @@ def custom_status_schema2():
 @pytest.fixture
 def output_schema_html_report():
     return get_data_file_path("output_schema_html_report.yaml")
+
+
+@pytest.fixture
+def output_schema_as_JSON_schema():
+    return get_data_file_path("output_schema_as_JSON_schema.yaml")
diff --git a/tests/data/output_schema_as_JSON_schema.yaml b/tests/data/output_schema_as_JSON_schema.yaml
new file mode 100644
index 00000000..e391fd6d
--- /dev/null
+++ b/tests/data/output_schema_as_JSON_schema.yaml
@@ -0,0 +1,43 @@
+title: An example Pipestat output schema
+description: A pipeline that uses pipestat to report sample and project level results.
+type: object
+properties:
+  pipeline_name: "default_pipeline_name"
+  samples:
+    type: object
+    properties:
+      number_of_things:
+        type: integer
+        description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
+      smooth_bw:
+        type: string
+        description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce nec cursus nulla."
+        path: "aligned_{genome}/{sample_name}_smooth.bw"
+      collection_of_images:
+        type: array
+        description: A collection of images.
+        items:
+          type: object
+          properties:
+            prop1:
+              type: file
+              description: An example file.
+      output_file_in_object:
+        type: object
+        description: An object containing output files.
+        properties:
+          example_property_1:
+            type: file
+            description: An example file.
+          example_property_2:
+            type: image
+            description: An example image.
+  project:
+    type: object
+    properties:
+      project_output_file:
+        type: file
+        description: The path to the output file.
+      protocol:
+        type: string
+        description: example protocol description
diff --git a/tests/test_parsed_schema.py b/tests/test_parsed_schema.py
index c097faa3..001b8a30 100644
--- a/tests/test_parsed_schema.py
+++ b/tests/test_parsed_schema.py
@@ -5,7 +5,7 @@ from typing import *
 
 import pytest
 import oyaml
-from pipestat.const import SAMPLE_NAME, STATUS
+from pipestat.const import SAMPLE_NAME, STATUS, RECORD_IDENTIFIER
 from pipestat.exceptions import SchemaError
 from pipestat.parsed_schema import (
     NULL_MAPPING_VALUE,
@@ -227,10 +227,10 @@ def test_insufficient_schema__raises_expected_error_and_message(schema_data, exp
     ]
     for extra in [
         [("id", {"type": "string", "description": "identifier"})],
-        [(SAMPLE_NAME, {"type": "string", "description": "identifier"})],
+        [(RECORD_IDENTIFIER, {"type": "string", "description": "identifier"})],
        [
             ("id", {"type": "string", "description": "identifier"}),
-            (SAMPLE_NAME, {"type": "string", "description": "identifier"}),
+            (RECORD_IDENTIFIER, {"type": "string", "description": "identifier"}),
         ],
     ]
 ],
@@ -257,3 +257,9 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag
     obs_msg = str(err_ctx.value)
     exp_msg = f"Overlap between project- and sample-level keys: {common_key}"
     assert obs_msg == exp_msg
+
+
+def test_JSON_schema_validation(output_schema_as_JSON_schema):
+    schema = ParsedSchema(output_schema_as_JSON_schema)
+    assert "number_of_things" in dict(schema.sample_level_data).keys()
+    assert "protocol" in dict(schema.project_level_data).keys()
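Note for reviewers: a minimal usage sketch of the new parsing path, mirroring the added test_JSON_schema_validation. The data-file path is assumed relative to the repository root; ParsedSchema accepts a dict, Path, or str per its signature in __init__.

    from pipestat.parsed_schema import ParsedSchema

    # A top-level "properties" key routes parsing through the new JSON-schema
    # branch, which pops pipeline_name, samples, project, and status out of
    # data["properties"] via _safe_pop_one_mapping(..., subkeys=[...]).
    schema = ParsedSchema("tests/data/output_schema_as_JSON_schema.yaml")

    assert "number_of_things" in schema.sample_level_data  # sample-level result
    assert "protocol" in schema.project_level_data  # project-level result

Schemas without a top-level "properties" key still take the original branch keyed on _SAMPLES_KEY / _PROJECT_KEY / _STATUS_KEY, so existing output schemas keep working unchanged.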