Skip to content

Commit

Permalink
Dev json schema (#87)
Browse files Browse the repository at this point in the history
* first pass to allow for output_schema as a true JSON schema #85

* remove unused import

* simplify logic, extend _safe_pop_one_mapping to handle multiple keys

* disambiguation key vs keys in parsed schema

* make samples and project objects instead of arrays, add more test assertions, clean up docstrings.

* add status data to string representation of ParsedSchema
  • Loading branch information
donaldcampbelljr authored Oct 4, 2023
1 parent 239d110 commit 5588510
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 35 deletions.
98 changes: 66 additions & 32 deletions pipestat/parsed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,27 @@ class Config:
return BaseModel


def _safe_pop_one_mapping(key: str, data: Dict[str, Any], info_name: str) -> Any:
    """Pop the mapping stored under *key* from *data*, validating its type.

    :param str key: dict key to remove from the schema data
    :param Dict[str, Any] data: schema data to pop from (mutated in place)
    :param str info_name: section name used in the error message
    :return Mapping: the popped mapping (NULL_MAPPING_VALUE default if absent)
    :raise SchemaError: if the stored value is not a mapping
    """
    popped = data.pop(key, NULL_MAPPING_VALUE)
    if not isinstance(popped, Mapping):
        raise SchemaError(
            f"{info_name} info in schema definition has invalid type: {type(popped).__name__}"
        )
    return popped
def _safe_pop_one_mapping(
    mappingkey: str, data: Dict[str, Any], info_name: str, subkeys: Optional[List[str]] = None
) -> Any:
    """
    Retrieve and remove the mapping stored under ``mappingkey``, validating its type.

    :param str mappingkey: the dict key where the sample, project, or status
        values are stored, e.g. data[mappingkey]
    :param Dict[str, Any] data: schema data to pop the mapping from (mutated in place)
    :param str info_name: human-readable section name, used in the error message
    :param Optional[List[str]] subkeys: if using JSON schema, the dict is nested
        further, e.g. data[subkeys[0]][mappingkey]; all subkeys are traversed in
        order, and a missing level yields an empty mapping rather than an error
    :return Mapping: the popped mapping (possibly empty)
    :raise SchemaError: if the value stored under the key is not a mapping
    """
    if subkeys:
        # Walk the nested structure level by level. Previously only
        # subkeys[0] was consulted; traversing the full list supports
        # arbitrarily nested JSON-schema sections while remaining
        # backward-compatible with single-element subkey lists.
        node = data
        try:
            for sk in subkeys:
                node = node[sk]
        except KeyError:
            # Section simply isn't declared in the schema; not an error.
            return {}
        value = node.pop(mappingkey, NULL_MAPPING_VALUE)
    else:
        value = data.pop(mappingkey, NULL_MAPPING_VALUE)
    if isinstance(value, Mapping):
        return value
    raise SchemaError(
        f"{info_name} info in schema definition has invalid type: {type(value).__name__}"
    )


class ParsedSchema(object):
Expand All @@ -81,46 +95,63 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None:
# initial validation and parse
if not isinstance(data, dict):
_, data = read_yaml_data(data, "schema")

data = copy.deepcopy(data)

# pipeline identifier
self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None)
# Currently supporting backwards compatibility with old output schema while now also supporting a JSON schema:
if "properties" in list(data.keys()):
# Assume top-level properties key implies proper JSON schema.
self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None)

sample_data = _safe_pop_one_mapping(
subkeys=["samples"],
data=data["properties"],
info_name="sample-level",
mappingkey="properties",
)

prj_data = _safe_pop_one_mapping(
subkeys=["project"],
data=data["properties"],
info_name="project-level",
mappingkey="properties",
)

self._status_data = _safe_pop_one_mapping(
subkeys=["status"],
data=data["properties"],
info_name="status",
mappingkey="properties",
)

else:
self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None)
sample_data = _safe_pop_one_mapping(
mappingkey=self._SAMPLES_KEY, data=data, info_name="sample-level"
)
prj_data = _safe_pop_one_mapping(
mappingkey=self._PROJECT_KEY, data=data, info_name="project-level"
)
# Parse custom status declaration if present.
self._status_data = _safe_pop_one_mapping(
mappingkey=self._STATUS_KEY, data=data, info_name="status"
)

if not isinstance(self._pipeline_name, str):
raise SchemaError(
f"Could not find valid pipeline identifier (key '{SCHEMA_PIPELINE_NAME_KEY}') in given schema data"
)

# Parse sample-level data item declarations.
sample_data = _safe_pop_one_mapping(
key=self._SAMPLES_KEY, data=data, info_name="sample-level"
)

self._sample_level_data = _recursively_replace_custom_types(sample_data)

# Parse project-level data item declarations.
prj_data = _safe_pop_one_mapping(
key=self._PROJECT_KEY, data=data, info_name="project-level"
)
self._project_level_data = _recursively_replace_custom_types(prj_data)

# Sample- and/or project-level data must be declared.
if not self._sample_level_data and not self._project_level_data:
raise SchemaError("Neither sample-level nor project-level data items are declared.")

# Parse custom status declaration if present.
self._status_data = _safe_pop_one_mapping(
key=self._STATUS_KEY, data=data, info_name="status"
)

if data:
_LOGGER.info(
"Top-Level arguments found in output schema. They will be assigned to project-level."
)
extra_project_data = _recursively_replace_custom_types(data)
self._project_level_data.update(extra_project_data)

# Check that no reserved keywords were used as data items.
resv_kwds = {"id", SAMPLE_NAME}
resv_kwds = {"id", RECORD_IDENTIFIER}
reserved_keywords_used = set()
for data in [self.project_level_data, self.sample_level_data, self.status_data]:
reserved_keywords_used |= set(data.keys()) & resv_kwds
Expand Down Expand Up @@ -151,7 +182,10 @@ def __str__(self):
res += f"\n Sample Level Data:"
for k, v in self._sample_level_data.items():
res += f"\n - {k} : {v}"
# TODO: add status schema data
if self._status_data is not None:
res += f"\n Status Data:"
for k, v in self._status_data.items():
res += f"\n - {k} : {v}"
return res

@property
Expand Down
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,8 @@ def custom_status_schema2():
@pytest.fixture
def output_schema_html_report():
    """Provide the path to the HTML-report output schema data file."""
    schema_path = get_data_file_path("output_schema_html_report.yaml")
    return schema_path


@pytest.fixture
def output_schema_as_JSON_schema():
    """Provide the path to the JSON-schema-style output schema data file."""
    schema_path = get_data_file_path("output_schema_as_JSON_schema.yaml")
    return schema_path
43 changes: 43 additions & 0 deletions tests/data/output_schema_as_JSON_schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
title: An example Pipestat output schema
description: A pipeline that uses pipestat to report sample and project level results.
type: object
properties:
pipeline_name: "default_pipeline_name"
samples:
type: object
properties:
number_of_things:
type: integer
description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
smooth_bw:
type: string
description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce nec cursus nulla."
path: "aligned_{genome}/{sample_name}_smooth.bw"
collection_of_images:
type: array
description: A collection of images.
items:
type: object
properties:
prop1:
type: file
description: An example file.
output_file_in_object:
type: object
description: An object containing output files.
properties:
example_property_1:
type: file
description: An example file.
example_property_2:
type: image
description: An example image.
project:
type: object
properties:
project_output_file:
type: file
description: The path to the output file.
protocol:
type: string
description: example protocol description
12 changes: 9 additions & 3 deletions tests/test_parsed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import *
import pytest
import oyaml
from pipestat.const import SAMPLE_NAME, STATUS
from pipestat.const import SAMPLE_NAME, STATUS, RECORD_IDENTIFIER
from pipestat.exceptions import SchemaError
from pipestat.parsed_schema import (
NULL_MAPPING_VALUE,
Expand Down Expand Up @@ -227,10 +227,10 @@ def test_insufficient_schema__raises_expected_error_and_message(schema_data, exp
]
for extra in [
[("id", {"type": "string", "description": "identifier"})],
[(SAMPLE_NAME, {"type": "string", "description": "identifier"})],
[(RECORD_IDENTIFIER, {"type": "string", "description": "identifier"})],
[
("id", {"type": "string", "description": "identifier"}),
(SAMPLE_NAME, {"type": "string", "description": "identifier"}),
(RECORD_IDENTIFIER, {"type": "string", "description": "identifier"}),
],
]
],
Expand All @@ -257,3 +257,9 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag
obs_msg = str(err_ctx.value)
exp_msg = f"Overlap between project- and sample-level keys: {common_key}"
assert obs_msg == exp_msg


def test_JSON_schema_validation(output_schema_as_JSON_schema):
    """Parsing a JSON-schema-style output schema yields sample- and project-level data."""
    schema = ParsedSchema(output_schema_as_JSON_schema)
    # Membership tests work directly on the mappings; wrapping in dict()
    # and calling .keys() is redundant.
    assert "number_of_things" in schema.sample_level_data
    assert "protocol" in schema.project_level_data

0 comments on commit 5588510

Please sign in to comment.