Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for validating SDMX-ML v2.1 messages #153

Closed
wants to merge 17 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions doc/howto.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ On other pages:
:maxdepth: 1

howto/create
howto/validate

Access other SDMX data sources
------------------------------
Expand Down
44 changes: 44 additions & 0 deletions doc/howto/validate.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
Validate SDMX-ML against official schemas
*****************************************

:mod:`sdmx` can generate XML for all kinds of SDMX components. When communicating with remote services,
however, only valid SDMX-ML messages can be sent. To help ensure that your generated XML complies with the standard,
you can call :func:`sdmx.validate_xml`.

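Validation requires a local copy of the `official schema <https://github.com/sdmx-twg/sdmx-ml-v2_1>`_ files.
To make this easier, you can use :func:`sdmx.install_schemas`, which downloads the files and caches a local copy for
use in validation. Using the default cache location requires the optional ``platformdirs`` package, which is installed
by the ``validate`` extra (``pip install sdmx1[validate]``).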

Cache schema files
==================

.. note:: This only needs to be run once.

.. ipython:: python

import sdmx
sdmx.install_schemas()

The schema files will be downloaded and placed in your local cache directory.

Validate SDMX-ML messages
=========================

Generate an SDMX-ML message, perhaps by following :doc:`create`. Once you have a file on disk containing an SDMX-ML
message, you can validate it by calling :func:`sdmx.validate_xml`. These instructions use the samples provided by
the `SDMX technical working group <https://github.com/sdmx-twg/sdmx-ml-v2_1>`_.

.. code-block:: python

import sdmx

sdmx.validate_xml("samples/common/common.xml")
True

sdmx.validate_xml("samples/demography/demography.xml")
True

If a message is invalid, :func:`sdmx.validate_xml` returns ``False`` and logs a message to help trace the problem::

Element '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}Annotations': This element is not expected.
Expected is one of ( {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}Description,
{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}Structure )., line 17
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ tests = [
"pytest-xdist",
"requests-mock >= 1.4",
]
validate = ["platformdirs"]

[project.urls]
homepage = "https://github.com/khaeru/sdmx"
Expand Down
4 changes: 3 additions & 1 deletion sdmx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,22 @@
from sdmx.reader import read_sdmx
from sdmx.rest import Resource
from sdmx.source import add_source, list_sources
from sdmx.writer import to_csv, to_pandas, to_xml
from sdmx.writer import install_schemas, to_csv, to_pandas, to_xml, validate_xml

# Names exported by ``from sdmx import *``. Entries are listed in sorted order
# (capitalized names first, then lower-case names).
__all__ = [
"Client",
"Request",
"Resource",
"add_source",
"install_schemas",
"list_sources",
"log",
"read_sdmx",
"read_url",
"to_csv",
"to_pandas",
"to_xml",
"validate_xml",
]


Expand Down
55 changes: 54 additions & 1 deletion sdmx/tests/writer/test_writer_xml.py
Original file line number Diff line number Diff line change
@@ -1,14 +1,16 @@
import io
import logging
import zipfile

import pytest
from lxml import etree
import requests

import sdmx
import sdmx.writer.xml
from sdmx import message
from sdmx.model import v21 as m
from sdmx.model.v21 import DataSet, DataStructureDefinition, Dimension, Key, Observation
from sdmx.writer.xml import etree
from sdmx.writer.xml import writer as XMLWriter

log = logging.getLogger(__name__)
Expand Down Expand Up @@ -287,3 +289,54 @@ def test_structure_roundtrip(specimen, specimen_id, strict, tmp_path):
path.write_bytes(data.getbuffer())
log.error(f"compare() = False; see {path}")
raise


@pytest.mark.network
def test_install_schemas(tmp_path):
    """Check that install_schemas() places XSD files in the requested directory."""
    sdmx.install_schemas(schema_dir=tmp_path)

    # Spot-check two of the schema documents that should have been downloaded
    for xsd_name in ("SDMXCommon.xsd", "SDMXMessage.xsd"):
        assert (tmp_path / xsd_name).exists()


@pytest.mark.network
def test_validate_xml_from_samples(tmp_path):
    """Use official samples to ensure validation works correctly.

    The latest release of the official SDMX-ML v2.1 repository is downloaded and
    unpacked into `tmp_path`; a selection of its sample messages is then validated
    against the schemas shipped in the same archive.
    """
    # Grab the latest v2.1 schema release to get the URL to the zip
    release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest"
    gh_headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    resp = requests.get(url=release_url, headers=gh_headers)
    # Fail here, with the HTTP status, rather than later with an invalid URL
    resp.raise_for_status()
    zipball_url = resp.json().get("zipball_url")

    # Download the zipped content and find the schemas within
    resp = requests.get(url=zipball_url, headers=gh_headers)
    resp.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(resp.content)) as zipped:
        zipped.extractall(path=tmp_path)
    extracted_content = next(tmp_path.glob("sdmx-twg-sdmx-ml*"))

    # Schemas are just in a flat directory
    schema_dir = extracted_content.joinpath("schemas")

    # Samples are somewhat spread out, and some are known broken so we pick a bunch
    samples_dir = extracted_content.joinpath("samples")
    samples = [
        samples_dir / "common" / "common.xml",
        samples_dir / "demography" / "demography.xml",
        samples_dir / "demography" / "esms.xml",
        samples_dir / "exr" / "common" / "exr_common.xml",
        samples_dir / "exr" / "ecb_exr_ng" / "ecb_exr_ng_full.xml",
        samples_dir / "exr" / "ecb_exr_ng" / "ecb_exr_ng.xml",
        samples_dir / "query" / "query_cl_all.xml",
        samples_dir / "query" / "response_cl_all.xml",
        samples_dir / "query" / "query_esms_children.xml",
        samples_dir / "query" / "response_esms_children.xml",
    ]

    for sample in samples:
        # Name the offending file so a failure can be traced to one sample
        assert sdmx.validate_xml(sample, schema_dir), f"{sample} failed validation"
4 changes: 3 additions & 1 deletion sdmx/writer/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from .csv import to_csv
from .pandas import to_pandas
from .xml import to_xml
from .xml import install_schemas, to_xml, validate_xml

# Names exported by ``from sdmx.writer import *``; each one is imported above from
# the .csv, .pandas or .xml submodule.
__all__ = [
"install_schemas",
"to_csv",
"to_pandas",
"to_xml",
"validate_xml",
]
114 changes: 112 additions & 2 deletions sdmx/writer/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,9 @@
# - writer functions for sdmx.message classes, in the same order as message.py
# - writer functions for sdmx.model classes, in the same order as model.py

from typing import Iterable, List, Literal, cast
import logging
from pathlib import Path
from typing import IO, Iterable, List, Literal, Optional, Union, cast

from lxml import etree
from lxml.builder import ElementMaker
Expand All @@ -17,6 +19,8 @@
from sdmx.model import v21 as model
from sdmx.writer.base import BaseWriter

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Element factory whose namespace map is taken from NS, leaving out any entry
# whose value is None.
_element_maker = ElementMaker(nsmap={k: v for k, v in NS.items() if v is not None})

# Shared BaseWriter instance; to_xml() calls writer.recurse() on it.
writer = BaseWriter("XML")
Expand Down Expand Up @@ -47,10 +51,116 @@ def to_xml(obj, **kwargs):
return etree.tostring(writer.recurse(obj), **kwargs)


def validate_xml(msg: Union[Path, IO], schema_dir: Optional[Path] = None) -> bool:
    """Validate an SDMX-ML message against the XML Schema (XSD) documents.

    The XML Schemas must first be installed or validation will fail. See
    :func:`sdmx.install_schemas` to download the schema files.

    Parameters
    ----------
    msg
        Path to, or open file containing, an SDMX-ML message.
    schema_dir
        The directory containing the XSD schemas used to validate the message. If
        not given, the "sdmx" user cache directory reported by :mod:`platformdirs`
        is used.

    Returns
    -------
    bool
        True if validation passed. False otherwise; in this case the validation
        error is also logged.

    Raises
    ------
    ModuleNotFoundError
        If `schema_dir` is not given and :mod:`platformdirs` is not installed.
    NotImplementedError
        If the root element of `msg` is not one of the supported message types.
    ValueError
        If ``SDMXMessage.xsd`` does not exist in `schema_dir`.
    """
    # If the user has no preference, get the schemas from the local cache directory.
    # platformdirs is optional, so only require it when it is actually needed.
    if not schema_dir:
        try:
            import platformdirs
        except ModuleNotFoundError:
            log.error(
                "Missing platformdirs. Re-install sdmx with pip install sdmx1[validate]"
            )
            raise
        schema_dir = platformdirs.user_cache_path("sdmx")
    else:
        schema_dir = Path(schema_dir)

    msg_doc = etree.parse(msg)

    # Make sure the message is a supported type
    supported_elements = (
        "CodelistQuery",
        "DataStructureQuery",
        "GenericData",
        "GenericMetadata",
        "GenericTimeSeriesData",
        "MetadataStructureQuery",
        "Structure",
        "StructureSpecificData",
        "StructureSpecificMetadata",
        "StructureSpecificTimeSeriesData",
    )
    root_elem_name = msg_doc.docinfo.root_name
    if root_elem_name not in supported_elements:
        raise NotImplementedError(
            f"Validation of <{root_elem_name}> messages is not supported"
        )

    message_xsd = schema_dir.joinpath("SDMXMessage.xsd")
    if not message_xsd.exists():
        raise ValueError(
            f"Schema file {message_xsd} not found; run sdmx.install_schemas() first"
        )

    # Turn the XSD into a schema object
    xml_schema = etree.XMLSchema(etree.parse(message_xsd))

    # Validate once. Only DocumentInvalid means "invalid message"; any other
    # exception propagates to the caller instead of being swallowed.
    try:
        xml_schema.assertValid(msg_doc)
    except etree.DocumentInvalid as err:
        log.error(err)
        return False
    return True


def install_schemas(schema_dir: Optional[Path] = None) -> None:
    """Cache XML Schema documents locally for use during message validation.

    The latest release of the official SDMX-ML v2.1 repository is downloaded from
    GitHub, and every ``.xsd`` file under a "schemas" path in the archive is
    written, without its directory structure, into `schema_dir`.

    Parameters
    ----------
    schema_dir
        The directory where XSD schemas will be downloaded to. If not given, the
        "sdmx" user cache directory reported by :mod:`platformdirs` is used. The
        directory is created if it does not exist.

    Raises
    ------
    requests.HTTPError
        If GitHub responds with an error status, for instance when rate-limited.
    """
    import io
    import zipfile

    import requests

    # If the user has no preference, download the schemas to the local cache
    # directory. platformdirs is optional, so only import it when it is needed.
    if not schema_dir:
        import platformdirs

        schema_dir = platformdirs.user_cache_path("sdmx")
    schema_dir = Path(schema_dir)
    schema_dir.mkdir(exist_ok=True, parents=True)

    # Seconds to wait for GitHub before giving up, instead of hanging indefinitely
    timeout = 30

    # Check the latest release to get the URL to the schema zip
    release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest"
    gh_headers = {
        "Accept": "application/vnd.github+json",
        "X-GitHub-Api-Version": "2022-11-28",
    }
    resp = requests.get(url=release_url, headers=gh_headers, timeout=timeout)
    # An error response carries no "zipball_url"; report the HTTP failure itself
    resp.raise_for_status()
    zipball_url = resp.json().get("zipball_url")

    # Download the zipped content and find the schemas within
    resp = requests.get(url=zipball_url, headers=gh_headers, timeout=timeout)
    resp.raise_for_status()

    with zipfile.ZipFile(io.BytesIO(resp.content)) as zipped:
        schemas = [
            n for n in zipped.namelist() if "schemas" in n and n.endswith(".xsd")
        ]

        # Extract the schemas to the destination directory
        # We can't use ZipFile.extract here because it will keep the directory
        # structure
        for xsd in schemas:
            xsd_path = zipfile.Path(zipped, at=xsd)
            target = schema_dir.joinpath(xsd_path.name)
            # Copy raw bytes: decoding and re-encoding with the platform default
            # encoding could corrupt the files
            target.write_bytes(xsd_path.read_bytes())


# Type alias restricting a value to one of the two strings "Ref" or "URN".
RefStyle = Literal["Ref", "URN"]


def reference(obj, parent=None, tag=None, *, style: RefStyle):
def reference(obj, parent=None, tag=None, style=None):
"""Write a reference to `obj`.

.. todo:: Currently other functions in :mod:`.writer.xml` all pass the `style`
Expand Down