Skip to content

Commit

Permalink
Dev json schema (#87)
Browse files Browse the repository at this point in the history
* first pass to allow for output_schema as a true JSON schema #85

* remove unused import

* simplify logic, extend _safe_pop_one_mapping to handle multiple keys

* disambiguation key vs keys in parsed schema

* make samples and project objects instead of arrays, add more test assertions, clean up docstrings.

* add status data to string representation of ParsedSchema
  • Loading branch information
donaldcampbelljr authored Oct 4, 2023
1 parent 239d110 commit 5588510
Show file tree
Hide file tree
Showing 4 changed files with 123 additions and 35 deletions.
98 changes: 66 additions & 32 deletions pipestat/parsed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,27 @@ class Config:
return BaseModel


def _safe_pop_one_mapping(key: str, data: Dict[str, Any], info_name: str) -> Any:
    """Pop the mapping stored under *key* from *data*, validating its type.

    :param str key: dict key to remove from the schema data
    :param Dict[str, Any] data: schema data to pop from (mutated in place)
    :param str info_name: section name used in the error message
    :return Mapping: the popped mapping (NULL_MAPPING_VALUE default if absent)
    :raise SchemaError: if the stored value is not a mapping
    """
    popped = data.pop(key, NULL_MAPPING_VALUE)
    if not isinstance(popped, Mapping):
        raise SchemaError(
            f"{info_name} info in schema definition has invalid type: {type(popped).__name__}"
        )
    return popped
def _safe_pop_one_mapping(
    mappingkey: str, data: Dict[str, Any], info_name: str, subkeys: Optional[List[str]] = None
) -> Any:
    """
    Retrieve and remove the mapping stored under ``mappingkey``, validating its type.

    :param str mappingkey: the dict key where the sample, project, or status
        values are stored, e.g. data[mappingkey]
    :param Dict[str, Any] data: schema data to pop the mapping from (mutated in place)
    :param str info_name: human-readable section name, used in the error message
    :param Optional[List[str]] subkeys: if using JSON schema, the dict is nested
        further, e.g. data[subkeys[0]][mappingkey]; all subkeys are traversed in
        order, and a missing level yields an empty mapping rather than an error
    :return Mapping: the popped mapping (possibly empty)
    :raise SchemaError: if the value stored under the key is not a mapping
    """
    if subkeys:
        # Walk the nested structure level by level. Previously only
        # subkeys[0] was consulted; traversing the full list supports
        # arbitrarily nested JSON-schema sections while remaining
        # backward-compatible with single-element subkey lists.
        node = data
        try:
            for sk in subkeys:
                node = node[sk]
        except KeyError:
            # Section simply isn't declared in the schema; not an error.
            return {}
        value = node.pop(mappingkey, NULL_MAPPING_VALUE)
    else:
        value = data.pop(mappingkey, NULL_MAPPING_VALUE)
    if isinstance(value, Mapping):
        return value
    raise SchemaError(
        f"{info_name} info in schema definition has invalid type: {type(value).__name__}"
    )


class ParsedSchema(object):
Expand All @@ -81,46 +95,63 @@ def __init__(self, data: Union[Dict[str, Any], Path, str]) -> None:
# initial validation and parse
if not isinstance(data, dict):
_, data = read_yaml_data(data, "schema")

data = copy.deepcopy(data)

# pipeline identifier
self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None)
# Currently supporting backwards compatibility with old output schema while now also supporting a JSON schema:
if "properties" in list(data.keys()):
# Assume top-level properties key implies proper JSON schema.
self._pipeline_name = data["properties"].pop(SCHEMA_PIPELINE_NAME_KEY, None)

sample_data = _safe_pop_one_mapping(
subkeys=["samples"],
data=data["properties"],
info_name="sample-level",
mappingkey="properties",
)

prj_data = _safe_pop_one_mapping(
subkeys=["project"],
data=data["properties"],
info_name="project-level",
mappingkey="properties",
)

self._status_data = _safe_pop_one_mapping(
subkeys=["status"],
data=data["properties"],
info_name="status",
mappingkey="properties",
)

else:
self._pipeline_name = data.pop(SCHEMA_PIPELINE_NAME_KEY, None)
sample_data = _safe_pop_one_mapping(
mappingkey=self._SAMPLES_KEY, data=data, info_name="sample-level"
)
prj_data = _safe_pop_one_mapping(
mappingkey=self._PROJECT_KEY, data=data, info_name="project-level"
)
# Parse custom status declaration if present.
self._status_data = _safe_pop_one_mapping(
mappingkey=self._STATUS_KEY, data=data, info_name="status"
)

if not isinstance(self._pipeline_name, str):
raise SchemaError(
f"Could not find valid pipeline identifier (key '{SCHEMA_PIPELINE_NAME_KEY}') in given schema data"
)

# Parse sample-level data item declarations.
sample_data = _safe_pop_one_mapping(
key=self._SAMPLES_KEY, data=data, info_name="sample-level"
)

self._sample_level_data = _recursively_replace_custom_types(sample_data)

# Parse project-level data item declarations.
prj_data = _safe_pop_one_mapping(
key=self._PROJECT_KEY, data=data, info_name="project-level"
)
self._project_level_data = _recursively_replace_custom_types(prj_data)

# Sample- and/or project-level data must be declared.
if not self._sample_level_data and not self._project_level_data:
raise SchemaError("Neither sample-level nor project-level data items are declared.")

# Parse custom status declaration if present.
self._status_data = _safe_pop_one_mapping(
key=self._STATUS_KEY, data=data, info_name="status"
)

if data:
_LOGGER.info(
"Top-Level arguments found in output schema. They will be assigned to project-level."
)
extra_project_data = _recursively_replace_custom_types(data)
self._project_level_data.update(extra_project_data)

# Check that no reserved keywords were used as data items.
resv_kwds = {"id", SAMPLE_NAME}
resv_kwds = {"id", RECORD_IDENTIFIER}
reserved_keywords_used = set()
for data in [self.project_level_data, self.sample_level_data, self.status_data]:
reserved_keywords_used |= set(data.keys()) & resv_kwds
Expand Down Expand Up @@ -151,7 +182,10 @@ def __str__(self):
res += f"\n Sample Level Data:"
for k, v in self._sample_level_data.items():
res += f"\n - {k} : {v}"
# TODO: add status schema data
if self._status_data is not None:
res += f"\n Status Data:"
for k, v in self._status_data.items():
res += f"\n - {k} : {v}"
return res

@property
Expand Down
5 changes: 5 additions & 0 deletions tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,3 +86,8 @@ def custom_status_schema2():
@pytest.fixture
def output_schema_html_report():
    """Provide the path to the HTML-report output schema data file."""
    schema_path = get_data_file_path("output_schema_html_report.yaml")
    return schema_path


@pytest.fixture
def output_schema_as_JSON_schema():
    """Provide the path to the JSON-schema-style output schema data file."""
    schema_path = get_data_file_path("output_schema_as_JSON_schema.yaml")
    return schema_path
43 changes: 43 additions & 0 deletions tests/data/output_schema_as_JSON_schema.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
title: An example Pipestat output schema
description: A pipeline that uses pipestat to report sample and project level results.
type: object
properties:
pipeline_name: "default_pipeline_name"
samples:
type: object
properties:
number_of_things:
type: integer
description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit."
smooth_bw:
type: string
description: "Lorem ipsum dolor sit amet, consectetur adipiscing elit. Fusce nec cursus nulla."
path: "aligned_{genome}/{sample_name}_smooth.bw"
collection_of_images:
type: array
description: A collection of images.
items:
type: object
properties:
prop1:
type: file
description: An example file.
output_file_in_object:
type: object
description: An object containing output files.
properties:
example_property_1:
type: file
description: An example file.
example_property_2:
type: image
description: An example image.
project:
type: object
properties:
project_output_file:
type: file
description: The path to the output file.
protocol:
type: string
description: example protocol description
12 changes: 9 additions & 3 deletions tests/test_parsed_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from typing import *
import pytest
import oyaml
from pipestat.const import SAMPLE_NAME, STATUS
from pipestat.const import SAMPLE_NAME, STATUS, RECORD_IDENTIFIER
from pipestat.exceptions import SchemaError
from pipestat.parsed_schema import (
NULL_MAPPING_VALUE,
Expand Down Expand Up @@ -227,10 +227,10 @@ def test_insufficient_schema__raises_expected_error_and_message(schema_data, exp
]
for extra in [
[("id", {"type": "string", "description": "identifier"})],
[(SAMPLE_NAME, {"type": "string", "description": "identifier"})],
[(RECORD_IDENTIFIER, {"type": "string", "description": "identifier"})],
[
("id", {"type": "string", "description": "identifier"}),
(SAMPLE_NAME, {"type": "string", "description": "identifier"}),
(RECORD_IDENTIFIER, {"type": "string", "description": "identifier"}),
],
]
],
Expand All @@ -257,3 +257,9 @@ def test_sample_project_data_item_name_overlap__raises_expected_error_and_messag
obs_msg = str(err_ctx.value)
exp_msg = f"Overlap between project- and sample-level keys: {common_key}"
assert obs_msg == exp_msg


def test_JSON_schema_validation(output_schema_as_JSON_schema):
    """Parsing a JSON-schema-style output schema yields sample- and project-level data."""
    schema = ParsedSchema(output_schema_as_JSON_schema)
    # Membership tests work directly on the mappings; wrapping in dict()
    # and calling .keys() is redundant.
    assert "number_of_things" in schema.sample_level_data
    assert "protocol" in schema.project_level_data

0 comments on commit 5588510

Please sign in to comment.