From 7f1217e298736302557ca3f6a798bc0d8cecf923 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Thu, 21 Dec 2023 13:49:59 -0500 Subject: [PATCH 01/15] Test validation against sdmx-twg resources --- sdmx/tests/writer/test_writer_xml.py | 55 ++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/sdmx/tests/writer/test_writer_xml.py b/sdmx/tests/writer/test_writer_xml.py index 6f2737f78..7a57e1842 100644 --- a/sdmx/tests/writer/test_writer_xml.py +++ b/sdmx/tests/writer/test_writer_xml.py @@ -1,11 +1,15 @@ +import io import logging +import zipfile import pytest +import requests import sdmx from sdmx import message from sdmx.model import v21 as m from sdmx.model.v21 import DataSet, DataStructureDefinition, Dimension, Key, Observation +from sdmx.writer.xml import etree from sdmx.writer.xml import writer as XMLWriter log = logging.getLogger(__name__) @@ -261,3 +265,54 @@ def test_structure_roundtrip(pytestconfig, specimen, specimen_id, strict, tmp_pa assert msg0.compare(msg1, strict), ( path.read_text() if pytestconfig.getoption("verbose") else path ) + + +@pytest.mark.network +def test_install_schemas(tmp_path): + """Test that XSD files are downloaded and ready for use in validation.""" + sdmx.install_schemas(schema_dir=tmp_path) + + # Look for a couple of the expected files + files = ["SDMXCommon.xsd", "SDMXMessage.xsd"] + for schema_doc in files: + doc = tmp_path.joinpath(schema_doc) + assert doc.exists() + + +@pytest.mark.network +def test_validate_xml_from_samples(tmp_path): + """Use official samples to ensure validation work correctly.""" + # Grab the latest v2.1 schema release to get the URL to the zip + release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" + gh_headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28" + } + resp = requests.get(url=release_url, headers=gh_headers) + zipball_url = resp.json().get("zipball_url") + # Download the zipped content and find the 
schemas within + resp = requests.get(url=zipball_url, headers=gh_headers) + zipped = zipfile.ZipFile(io.BytesIO(resp.content)) + zipped.extractall(path=tmp_path) + extracted_content = list(tmp_path.glob("sdmx-twg-sdmx-ml*"))[0] + + # Schemas as just in a flat directory + schema_dir = extracted_content.joinpath("schemas") + + # Samples are somewhat spread out, and some are known broken so we pick a bunch + samples_dir = extracted_content.joinpath("samples") + samples = [ + samples_dir / "common" / "common.xml", + samples_dir / "demography" / "demography.xml", + samples_dir / "demography" / "esms.xml", + samples_dir / "exr" / "common" / "exr_common.xml", + samples_dir / "exr" / "ecb_exr_ng" / "ecb_exr_ng_full.xml", + samples_dir / "exr" / "ecb_exr_ng" / "ecb_exr_ng.xml", + samples_dir / "query" / "query_cl_all.xml", + samples_dir / "query" / "response_cl_all.xml", + samples_dir / "query" / "query_esms_children.xml", + samples_dir / "query" / "response_esms_children.xml", + ] + + for sample in samples: + assert sdmx.validate_xml(sample, schema_dir) From bf189dc1b5556a74783055a2777dac150786e3cf Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Thu, 21 Dec 2023 13:52:05 -0500 Subject: [PATCH 02/15] Add validate_xml and install_schemas methods --- pyproject.toml | 1 + sdmx/__init__.py | 4 +- sdmx/writer/__init__.py | 4 +- sdmx/writer/xml.py | 108 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 114 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5f2ee008f..2e95e225e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ tests = [ "pytest-xdist", "requests-mock >= 1.4", ] +validate = ["platformdirs"] [project.urls] homepage = "https://github.com/khaeru/sdmx" diff --git a/sdmx/__init__.py b/sdmx/__init__.py index f0cab717f..bcbbbb35f 100644 --- a/sdmx/__init__.py +++ b/sdmx/__init__.py @@ -5,13 +5,14 @@ from sdmx.reader import read_sdmx from sdmx.rest import Resource from sdmx.source import add_source, list_sources 
-from sdmx.writer import to_csv, to_pandas, to_xml +from sdmx.writer import install_schemas, to_csv, to_pandas, to_xml, validate_xml __all__ = [ "Client", "Request", "Resource", "add_source", + "install_schemas", "list_sources", "log", "read_sdmx", @@ -19,6 +20,7 @@ "to_csv", "to_pandas", "to_xml", + "validate_xml", ] diff --git a/sdmx/writer/__init__.py b/sdmx/writer/__init__.py index 4affa7c8a..c56614136 100644 --- a/sdmx/writer/__init__.py +++ b/sdmx/writer/__init__.py @@ -1,9 +1,11 @@ from .csv import to_csv from .pandas import to_pandas -from .xml import to_xml +from .xml import install_schemas, to_xml, validate_xml __all__ = [ + "install_schemas", "to_csv", "to_pandas", "to_xml", + "validate_xml", ] diff --git a/sdmx/writer/xml.py b/sdmx/writer/xml.py index 51eb9a07a..3accadf15 100644 --- a/sdmx/writer/xml.py +++ b/sdmx/writer/xml.py @@ -5,7 +5,9 @@ # - writer functions for sdmx.message classes, in the same order as message.py # - writer functions for sdmx.model classes, in the same order as model.py -from typing import Iterable, List, cast +import logging +from pathlib import Path +from typing import Iterable, IO, List, Optional, cast from lxml import etree from lxml.builder import ElementMaker @@ -17,6 +19,8 @@ from sdmx.model import v21 as model from sdmx.writer.base import BaseWriter +log = logging.getLogger(__name__) + _element_maker = ElementMaker(nsmap={k: v for k, v in NS.items() if v is not None}) writer = BaseWriter("XML") @@ -45,6 +49,108 @@ def to_xml(obj, **kwargs): return etree.tostring(writer.recurse(obj), **kwargs) +def validate_xml(msg: Path | IO, schema_dir: Optional[Path] = None) -> bool: + """Validate and SDMX message against the XML Schema (XSD) documents. + + The XML Schemas must first be installed or validation will fail. See + :func:`sdmx.install_schemas` to download the schema files. + + Parameters + ---------- + msg + A SDMX-ML Message formatted XML file. + schema_dir + The directory to XSD schemas used to validate the message. 
+ + Returns + ------- + bool + True if validation passed. False otherwise. + """ + try: + import platformdirs + except ModuleNotFoundError as err: + log.error( + "Missing platformdirs. Re-install sdmx with pip install sdmx1[validation]" + ) + raise err + + # If the user has no preference, get the schemas from the local cache directory + if not schema_dir: + schema_dir = platformdirs.user_cache_path("sdmx") + + msg_doc = etree.parse(msg) + + # Make sure the message is a supported type + supported_elements = [ + "CodelistQuery", + "DataStructureQuery", + "GenericData", + "GenericMetadata", + "GenericTimeSeriesData", + "MetadataStructureQuery", + "Structure", + "StructureSpecificData", + "StructureSpecificMetadata", + "StructureSpecificTimeSeriesData", + ] + root_elem_name = msg_doc.docinfo.root_name + if root_elem_name not in supported_elements: + raise NotImplementedError + + message_xsd = schema_dir.joinpath("SDMXMessage.xsd") + if not message_xsd.exists(): + raise ValueError + + # Turn the XSD into a schema object + xml_schema_doc = etree.parse(message_xsd) + xml_schema = etree.XMLSchema(xml_schema_doc) + + return xml_schema.validate(msg_doc) + # return xml_schema.assertValid(msg_doc) + + +def install_schemas(schema_dir: Optional[Path] = None) -> None: + """Cache XML Schema documents locally for use during message validation. + + Parameters + ---------- + schema_dir + The directory where XSD schemas will be downloaded to. 
+ """ + import io + import zipfile + + import platformdirs + import requests + + # If the user has no preference, download the schemas to the local cache directory + if not schema_dir: + schema_dir = platformdirs.user_cache_path("sdmx") + schema_dir.mkdir(exist_ok=True, parents=True) + + # Check the latest release to get the URL to the schema zip + release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" + gh_headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28" + } + resp = requests.get(url=release_url, headers=gh_headers) + zipball_url = resp.json().get("zipball_url") + + # Download the zipped content and find the schemas within + resp = requests.get(url=zipball_url, headers=gh_headers) + zipped = zipfile.ZipFile(io.BytesIO(resp.content)) + schemas = [n for n in zipped.namelist() if "schemas" in n and n.endswith(".xsd")] + + # Extract the schemas to the destination directory + # We can't use ZipFile.extract here because it will keep the directory structure + for xsd in schemas: + xsd_path = zipfile.Path(zipped, at=xsd) + target = schema_dir.joinpath(xsd_path.name) + target.write_text(xsd_path.read_text()) + + def reference(obj, parent=None, tag=None, style=None): """Write a reference to `obj`. 
From 80e19e4c2fe759f70227f4e80ab2c08cee5fe65c Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 08:37:12 -0500 Subject: [PATCH 03/15] Restore xml validation after conflict resolution --- sdmx/tests/writer/test_writer_xml.py | 51 ++++++++++++++ sdmx/writer/xml.py | 102 +++++++++++++++++++++++++++ 2 files changed, 153 insertions(+) diff --git a/sdmx/tests/writer/test_writer_xml.py b/sdmx/tests/writer/test_writer_xml.py index f6a959a96..9d8d601a1 100644 --- a/sdmx/tests/writer/test_writer_xml.py +++ b/sdmx/tests/writer/test_writer_xml.py @@ -290,3 +290,54 @@ def test_structure_roundtrip(specimen, specimen_id, strict, tmp_path): path.write_bytes(data.getbuffer()) log.error(f"compare() = False; see {path}") raise + + +@pytest.mark.network +def test_install_schemas(tmp_path): + """Test that XSD files are downloaded and ready for use in validation.""" + sdmx.install_schemas(schema_dir=tmp_path) + + # Look for a couple of the expected files + files = ["SDMXCommon.xsd", "SDMXMessage.xsd"] + for schema_doc in files: + doc = tmp_path.joinpath(schema_doc) + assert doc.exists() + + +@pytest.mark.network +def test_validate_xml_from_samples(tmp_path): + """Use official samples to ensure validation work correctly.""" + # Grab the latest v2.1 schema release to get the URL to the zip + release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" + gh_headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28" + } + resp = requests.get(url=release_url, headers=gh_headers) + zipball_url = resp.json().get("zipball_url") + # Download the zipped content and find the schemas within + resp = requests.get(url=zipball_url, headers=gh_headers) + zipped = zipfile.ZipFile(io.BytesIO(resp.content)) + zipped.extractall(path=tmp_path) + extracted_content = list(tmp_path.glob("sdmx-twg-sdmx-ml*"))[0] + + # Schemas as just in a flat directory + schema_dir = extracted_content.joinpath("schemas") + + # Samples are somewhat 
spread out, and some are known broken so we pick a bunch + samples_dir = extracted_content.joinpath("samples") + samples = [ + samples_dir / "common" / "common.xml", + samples_dir / "demography" / "demography.xml", + samples_dir / "demography" / "esms.xml", + samples_dir / "exr" / "common" / "exr_common.xml", + samples_dir / "exr" / "ecb_exr_ng" / "ecb_exr_ng_full.xml", + samples_dir / "exr" / "ecb_exr_ng" / "ecb_exr_ng.xml", + samples_dir / "query" / "query_cl_all.xml", + samples_dir / "query" / "response_cl_all.xml", + samples_dir / "query" / "query_esms_children.xml", + samples_dir / "query" / "response_esms_children.xml", + ] + + for sample in samples: + assert sdmx.validate_xml(sample, schema_dir) diff --git a/sdmx/writer/xml.py b/sdmx/writer/xml.py index 7f1815264..c5b9ecd73 100644 --- a/sdmx/writer/xml.py +++ b/sdmx/writer/xml.py @@ -51,6 +51,108 @@ def to_xml(obj, **kwargs): return etree.tostring(writer.recurse(obj), **kwargs) +def validate_xml(msg: Path | IO, schema_dir: Optional[Path] = None) -> bool: + """Validate and SDMX message against the XML Schema (XSD) documents. + + The XML Schemas must first be installed or validation will fail. See + :func:`sdmx.install_schemas` to download the schema files. + + Parameters + ---------- + msg + A SDMX-ML Message formatted XML file. + schema_dir + The directory to XSD schemas used to validate the message. + + Returns + ------- + bool + True if validation passed. False otherwise. + """ + try: + import platformdirs + except ModuleNotFoundError as err: + log.error( + "Missing platformdirs. 
Re-install sdmx with pip install sdmx1[validation]" + ) + raise err + + # If the user has no preference, get the schemas from the local cache directory + if not schema_dir: + schema_dir = platformdirs.user_cache_path("sdmx") + + msg_doc = etree.parse(msg) + + # Make sure the message is a supported type + supported_elements = [ + "CodelistQuery", + "DataStructureQuery", + "GenericData", + "GenericMetadata", + "GenericTimeSeriesData", + "MetadataStructureQuery", + "Structure", + "StructureSpecificData", + "StructureSpecificMetadata", + "StructureSpecificTimeSeriesData", + ] + root_elem_name = msg_doc.docinfo.root_name + if root_elem_name not in supported_elements: + raise NotImplementedError + + message_xsd = schema_dir.joinpath("SDMXMessage.xsd") + if not message_xsd.exists(): + raise ValueError + + # Turn the XSD into a schema object + xml_schema_doc = etree.parse(message_xsd) + xml_schema = etree.XMLSchema(xml_schema_doc) + + return xml_schema.validate(msg_doc) + # return xml_schema.assertValid(msg_doc) + + +def install_schemas(schema_dir: Optional[Path] = None) -> None: + """Cache XML Schema documents locally for use during message validation. + + Parameters + ---------- + schema_dir + The directory where XSD schemas will be downloaded to. 
+ """ + import io + import zipfile + + import platformdirs + import requests + + # If the user has no preference, download the schemas to the local cache directory + if not schema_dir: + schema_dir = platformdirs.user_cache_path("sdmx") + schema_dir.mkdir(exist_ok=True, parents=True) + + # Check the latest release to get the URL to the schema zip + release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" + gh_headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28" + } + resp = requests.get(url=release_url, headers=gh_headers) + zipball_url = resp.json().get("zipball_url") + + # Download the zipped content and find the schemas within + resp = requests.get(url=zipball_url, headers=gh_headers) + zipped = zipfile.ZipFile(io.BytesIO(resp.content)) + schemas = [n for n in zipped.namelist() if "schemas" in n and n.endswith(".xsd")] + + # Extract the schemas to the destination directory + # We can't use ZipFile.extract here because it will keep the directory structure + for xsd in schemas: + xsd_path = zipfile.Path(zipped, at=xsd) + target = schema_dir.joinpath(xsd_path.name) + target.write_text(xsd_path.read_text()) + + def reference(obj, parent=None, tag=None, style=None): """Write a reference to `obj`. 
From b09bc5573be637f4a4282cb57eabb7a7d8a3d892 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 08:42:37 -0500 Subject: [PATCH 04/15] Remove duplicated etree declaration --- sdmx/tests/writer/test_writer_xml.py | 1 - 1 file changed, 1 deletion(-) diff --git a/sdmx/tests/writer/test_writer_xml.py b/sdmx/tests/writer/test_writer_xml.py index 9d8d601a1..5ed580a37 100644 --- a/sdmx/tests/writer/test_writer_xml.py +++ b/sdmx/tests/writer/test_writer_xml.py @@ -4,7 +4,6 @@ import pytest import requests -from lxml import etree import sdmx import sdmx.writer.xml From 38cabf1451efad33f12c0e7e9aaef919b340997b Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 08:43:20 -0500 Subject: [PATCH 05/15] Fix overwritten RefStyle --- sdmx/writer/xml.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/sdmx/writer/xml.py b/sdmx/writer/xml.py index c5b9ecd73..6f2152918 100644 --- a/sdmx/writer/xml.py +++ b/sdmx/writer/xml.py @@ -7,7 +7,7 @@ import logging from pathlib import Path -from typing import Iterable, IO, Literal, List, Optional, cast +from typing import IO, Iterable, List, Literal, Optional, cast from lxml import etree from lxml.builder import ElementMaker @@ -153,6 +153,9 @@ def install_schemas(schema_dir: Optional[Path] = None) -> None: target.write_text(xsd_path.read_text()) +RefStyle = Literal["Ref", "URN"] + + def reference(obj, parent=None, tag=None, style=None): """Write a reference to `obj`. 
From e34c18cf2d0783c6a32d8217c555e17b725b1cfb Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 09:52:36 -0500 Subject: [PATCH 06/15] Log XML validation errors --- sdmx/writer/xml.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/sdmx/writer/xml.py b/sdmx/writer/xml.py index 6f2152918..cd65481d4 100644 --- a/sdmx/writer/xml.py +++ b/sdmx/writer/xml.py @@ -108,8 +108,12 @@ def validate_xml(msg: Path | IO, schema_dir: Optional[Path] = None) -> bool: xml_schema_doc = etree.parse(message_xsd) xml_schema = etree.XMLSchema(xml_schema_doc) - return xml_schema.validate(msg_doc) - # return xml_schema.assertValid(msg_doc) + try: + xml_schema.assertValid(msg_doc) + except etree.DocumentInvalid as err: + log.error(err) + finally: + return xml_schema.validate(msg_doc) def install_schemas(schema_dir: Optional[Path] = None) -> None: @@ -135,7 +139,7 @@ def install_schemas(schema_dir: Optional[Path] = None) -> None: release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" gh_headers = { "Accept": "application/vnd.github+json", - "X-GitHub-Api-Version": "2022-11-28" + "X-GitHub-Api-Version": "2022-11-28", } resp = requests.get(url=release_url, headers=gh_headers) zipball_url = resp.json().get("zipball_url") From 3d1b2ec555ff45f069900846e158d275b2a36e09 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 09:52:58 -0500 Subject: [PATCH 07/15] Provide SDMX-ML validation docs --- doc/howto.rst | 1 + doc/howto/validate.rst | 44 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 doc/howto/validate.rst diff --git a/doc/howto.rst b/doc/howto.rst index 83912559a..0db4d4347 100644 --- a/doc/howto.rst +++ b/doc/howto.rst @@ -7,6 +7,7 @@ On other pages: :maxdepth: 1 howto/create + howto/validate Access other SDMX data sources ------------------------------ diff --git a/doc/howto/validate.rst b/doc/howto/validate.rst new file mode 100644 index 
000000000..77d5e05ed --- /dev/null +++ b/doc/howto/validate.rst @@ -0,0 +1,44 @@ +Validate SDMX-ML against official schemas +***************************************** + +:mod:`sdmx` is capable of generating XML for all kinds of SDMX components. When communicating with remote services +though, only valid SDMX-ML messages can be sent. To help ensure your generated XML complies with the standard you can +call :func:`sdmx.validate_xml`. + +Validation requires having a copy of the `official schema <https://github.com/sdmx-twg/sdmx-ml-v2_1>`_ files available. +To help make this easier, you can use :func:`sdmx.install_schemas`, which will cache a local copy for use in validation. + +Cache schema files +================== + +.. note:: This only needs to be run once. + +.. ipython:: python + + import sdmx + sdmx.install_schemas() + +The schema files will be downloaded and placed in your local cache directory. + +Validate SDMX-ML messages +========================= + +Generate an SDMX-ML message, perhaps by following :doc:`create`. Once you have a file on disk that has an SDMX-ML +message it can be validated by running :func:`sdmx.validate_xml`. These instructions will use the samples provided by +the `SDMX technical working group <https://github.com/sdmx-twg/sdmx-ml-v2_1>`_. + +.. code-block:: python + + import sdmx + + sdmx.validate_xml("samples/common/common.xml") + True + + sdmx.validate_xml("samples/demography/demography.xml") + True + +Invalid messages will return ``False``. You will also see a log message to help in tracing the problem:: + + Element '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}Annotations': This element is not expected. 
+ Expected is one of ( {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}Description, + {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}Structure )., line 17 From 2886ac0cf5bedce07935264222a2fa7944455b60 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 15:42:36 -0500 Subject: [PATCH 08/15] Fix formatting in GH request headers --- sdmx/tests/writer/test_writer_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdmx/tests/writer/test_writer_xml.py b/sdmx/tests/writer/test_writer_xml.py index 5ed580a37..d3d291595 100644 --- a/sdmx/tests/writer/test_writer_xml.py +++ b/sdmx/tests/writer/test_writer_xml.py @@ -310,7 +310,7 @@ def test_validate_xml_from_samples(tmp_path): release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" gh_headers = { "Accept": "application/vnd.github+json", - "X-GitHub-Api-Version": "2022-11-28" + "X-GitHub-Api-Version": "2022-11-28", } resp = requests.get(url=release_url, headers=gh_headers) zipball_url = resp.json().get("zipball_url") From a2aa9345db7f40a1386b193760ad0edd7beef811 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 15:43:30 -0500 Subject: [PATCH 09/15] Use Union syntax in params with multiple types --- sdmx/writer/xml.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sdmx/writer/xml.py b/sdmx/writer/xml.py index cd65481d4..8d534e278 100644 --- a/sdmx/writer/xml.py +++ b/sdmx/writer/xml.py @@ -7,7 +7,7 @@ import logging from pathlib import Path -from typing import IO, Iterable, List, Literal, Optional, cast +from typing import IO, Iterable, List, Literal, Optional, Union, cast from lxml import etree from lxml.builder import ElementMaker @@ -51,7 +51,7 @@ def to_xml(obj, **kwargs): return etree.tostring(writer.recurse(obj), **kwargs) -def validate_xml(msg: Path | IO, schema_dir: Optional[Path] = None) -> bool: +def validate_xml(msg: Union[Path, IO], schema_dir: Optional[Path] = None) -> bool: 
"""Validate and SDMX message against the XML Schema (XSD) documents. The XML Schemas must first be installed or validation will fail. See From fb29857292c23f214668be171a6d90fae75d7a17 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Thu, 21 Dec 2023 13:49:59 -0500 Subject: [PATCH 10/15] Test validation against sdmx-twg resources --- sdmx/tests/writer/test_writer_xml.py | 55 +++++++++++++++++++++++++++- 1 file changed, 54 insertions(+), 1 deletion(-) diff --git a/sdmx/tests/writer/test_writer_xml.py b/sdmx/tests/writer/test_writer_xml.py index 867ce2699..5ed580a37 100644 --- a/sdmx/tests/writer/test_writer_xml.py +++ b/sdmx/tests/writer/test_writer_xml.py @@ -1,14 +1,16 @@ import io import logging +import zipfile import pytest -from lxml import etree +import requests import sdmx import sdmx.writer.xml from sdmx import message from sdmx.model import v21 as m from sdmx.model.v21 import DataSet, DataStructureDefinition, Dimension, Key, Observation +from sdmx.writer.xml import etree from sdmx.writer.xml import writer as XMLWriter log = logging.getLogger(__name__) @@ -287,3 +289,54 @@ def test_structure_roundtrip(specimen, specimen_id, strict, tmp_path): path.write_bytes(data.getbuffer()) log.error(f"compare() = False; see {path}") raise + + +@pytest.mark.network +def test_install_schemas(tmp_path): + """Test that XSD files are downloaded and ready for use in validation.""" + sdmx.install_schemas(schema_dir=tmp_path) + + # Look for a couple of the expected files + files = ["SDMXCommon.xsd", "SDMXMessage.xsd"] + for schema_doc in files: + doc = tmp_path.joinpath(schema_doc) + assert doc.exists() + + +@pytest.mark.network +def test_validate_xml_from_samples(tmp_path): + """Use official samples to ensure validation work correctly.""" + # Grab the latest v2.1 schema release to get the URL to the zip + release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" + gh_headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": 
"2022-11-28" + } + resp = requests.get(url=release_url, headers=gh_headers) + zipball_url = resp.json().get("zipball_url") + # Download the zipped content and find the schemas within + resp = requests.get(url=zipball_url, headers=gh_headers) + zipped = zipfile.ZipFile(io.BytesIO(resp.content)) + zipped.extractall(path=tmp_path) + extracted_content = list(tmp_path.glob("sdmx-twg-sdmx-ml*"))[0] + + # Schemas as just in a flat directory + schema_dir = extracted_content.joinpath("schemas") + + # Samples are somewhat spread out, and some are known broken so we pick a bunch + samples_dir = extracted_content.joinpath("samples") + samples = [ + samples_dir / "common" / "common.xml", + samples_dir / "demography" / "demography.xml", + samples_dir / "demography" / "esms.xml", + samples_dir / "exr" / "common" / "exr_common.xml", + samples_dir / "exr" / "ecb_exr_ng" / "ecb_exr_ng_full.xml", + samples_dir / "exr" / "ecb_exr_ng" / "ecb_exr_ng.xml", + samples_dir / "query" / "query_cl_all.xml", + samples_dir / "query" / "response_cl_all.xml", + samples_dir / "query" / "query_esms_children.xml", + samples_dir / "query" / "response_esms_children.xml", + ] + + for sample in samples: + assert sdmx.validate_xml(sample, schema_dir) From 4b81d324b1818280b482abb9de2e8e69e598f221 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Thu, 21 Dec 2023 13:52:05 -0500 Subject: [PATCH 11/15] Add validate_xml and install_schemas methods --- pyproject.toml | 1 + sdmx/__init__.py | 4 +- sdmx/writer/__init__.py | 4 +- sdmx/writer/xml.py | 108 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 114 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 5f2ee008f..2e95e225e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -46,6 +46,7 @@ tests = [ "pytest-xdist", "requests-mock >= 1.4", ] +validate = ["platformdirs"] [project.urls] homepage = "https://github.com/khaeru/sdmx" diff --git a/sdmx/__init__.py b/sdmx/__init__.py index f0cab717f..bcbbbb35f 100644 --- 
a/sdmx/__init__.py +++ b/sdmx/__init__.py @@ -5,13 +5,14 @@ from sdmx.reader import read_sdmx from sdmx.rest import Resource from sdmx.source import add_source, list_sources -from sdmx.writer import to_csv, to_pandas, to_xml +from sdmx.writer import install_schemas, to_csv, to_pandas, to_xml, validate_xml __all__ = [ "Client", "Request", "Resource", "add_source", + "install_schemas", "list_sources", "log", "read_sdmx", @@ -19,6 +20,7 @@ "to_csv", "to_pandas", "to_xml", + "validate_xml", ] diff --git a/sdmx/writer/__init__.py b/sdmx/writer/__init__.py index 4affa7c8a..c56614136 100644 --- a/sdmx/writer/__init__.py +++ b/sdmx/writer/__init__.py @@ -1,9 +1,11 @@ from .csv import to_csv from .pandas import to_pandas -from .xml import to_xml +from .xml import install_schemas, to_xml, validate_xml __all__ = [ + "install_schemas", "to_csv", "to_pandas", "to_xml", + "validate_xml", ] diff --git a/sdmx/writer/xml.py b/sdmx/writer/xml.py index 4f2ed371a..a9eaed059 100644 --- a/sdmx/writer/xml.py +++ b/sdmx/writer/xml.py @@ -5,7 +5,9 @@ # - writer functions for sdmx.message classes, in the same order as message.py # - writer functions for sdmx.model classes, in the same order as model.py -from typing import Iterable, List, Literal, cast +import logging +from pathlib import Path +from typing import Iterable, IO, List, Optional, Literal, cast from lxml import etree from lxml.builder import ElementMaker @@ -17,6 +19,8 @@ from sdmx.model import v21 as model from sdmx.writer.base import BaseWriter +log = logging.getLogger(__name__) + _element_maker = ElementMaker(nsmap={k: v for k, v in NS.items() if v is not None}) writer = BaseWriter("XML") @@ -47,6 +51,108 @@ def to_xml(obj, **kwargs): return etree.tostring(writer.recurse(obj), **kwargs) +def validate_xml(msg: Path | IO, schema_dir: Optional[Path] = None) -> bool: + """Validate and SDMX message against the XML Schema (XSD) documents. + + The XML Schemas must first be installed or validation will fail. 
See + :func:`sdmx.install_schemas` to download the schema files. + + Parameters + ---------- + msg + A SDMX-ML Message formatted XML file. + schema_dir + The directory to XSD schemas used to validate the message. + + Returns + ------- + bool + True if validation passed. False otherwise. + """ + try: + import platformdirs + except ModuleNotFoundError as err: + log.error( + "Missing platformdirs. Re-install sdmx with pip install sdmx1[validation]" + ) + raise err + + # If the user has no preference, get the schemas from the local cache directory + if not schema_dir: + schema_dir = platformdirs.user_cache_path("sdmx") + + msg_doc = etree.parse(msg) + + # Make sure the message is a supported type + supported_elements = [ + "CodelistQuery", + "DataStructureQuery", + "GenericData", + "GenericMetadata", + "GenericTimeSeriesData", + "MetadataStructureQuery", + "Structure", + "StructureSpecificData", + "StructureSpecificMetadata", + "StructureSpecificTimeSeriesData", + ] + root_elem_name = msg_doc.docinfo.root_name + if root_elem_name not in supported_elements: + raise NotImplementedError + + message_xsd = schema_dir.joinpath("SDMXMessage.xsd") + if not message_xsd.exists(): + raise ValueError + + # Turn the XSD into a schema object + xml_schema_doc = etree.parse(message_xsd) + xml_schema = etree.XMLSchema(xml_schema_doc) + + return xml_schema.validate(msg_doc) + # return xml_schema.assertValid(msg_doc) + + +def install_schemas(schema_dir: Optional[Path] = None) -> None: + """Cache XML Schema documents locally for use during message validation. + + Parameters + ---------- + schema_dir + The directory where XSD schemas will be downloaded to. 
+ """ + import io + import zipfile + + import platformdirs + import requests + + # If the user has no preference, download the schemas to the local cache directory + if not schema_dir: + schema_dir = platformdirs.user_cache_path("sdmx") + schema_dir.mkdir(exist_ok=True, parents=True) + + # Check the latest release to get the URL to the schema zip + release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" + gh_headers = { + "Accept": "application/vnd.github+json", + "X-GitHub-Api-Version": "2022-11-28" + } + resp = requests.get(url=release_url, headers=gh_headers) + zipball_url = resp.json().get("zipball_url") + + # Download the zipped content and find the schemas within + resp = requests.get(url=zipball_url, headers=gh_headers) + zipped = zipfile.ZipFile(io.BytesIO(resp.content)) + schemas = [n for n in zipped.namelist() if "schemas" in n and n.endswith(".xsd")] + + # Extract the schemas to the destination directory + # We can't use ZipFile.extract here because it will keep the directory structure + for xsd in schemas: + xsd_path = zipfile.Path(zipped, at=xsd) + target = schema_dir.joinpath(xsd_path.name) + target.write_text(xsd_path.read_text()) + + RefStyle = Literal["Ref", "URN"] From 718ddba21b87ab7daf29263687ac25a2f6733bc8 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 08:37:12 -0500 Subject: [PATCH 12/15] Restore xml validation after conflict resolution --- sdmx/writer/xml.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/sdmx/writer/xml.py b/sdmx/writer/xml.py index a9eaed059..b8735dd0d 100644 --- a/sdmx/writer/xml.py +++ b/sdmx/writer/xml.py @@ -51,19 +51,16 @@ def to_xml(obj, **kwargs): return etree.tostring(writer.recurse(obj), **kwargs) -def validate_xml(msg: Path | IO, schema_dir: Optional[Path] = None) -> bool: +def validate_xml(msg: Union[Path, IO], schema_dir: Optional[Path] = None) -> bool: """Validate and SDMX message against the XML Schema (XSD) documents. 
- The XML Schemas must first be installed or validation will fail. See :func:`sdmx.install_schemas` to download the schema files. - Parameters ---------- msg A SDMX-ML Message formatted XML file. schema_dir The directory to XSD schemas used to validate the message. - Returns ------- bool @@ -108,13 +105,16 @@ def validate_xml(msg: Path | IO, schema_dir: Optional[Path] = None) -> bool: xml_schema_doc = etree.parse(message_xsd) xml_schema = etree.XMLSchema(xml_schema_doc) - return xml_schema.validate(msg_doc) - # return xml_schema.assertValid(msg_doc) + try: + xml_schema.assertValid(msg_doc) + except etree.DocumentInvalid as err: + log.error(err) + finally: + return xml_schema.validate(msg_doc) def install_schemas(schema_dir: Optional[Path] = None) -> None: """Cache XML Schema documents locally for use during message validation. - Parameters ---------- schema_dir @@ -135,7 +135,7 @@ def install_schemas(schema_dir: Optional[Path] = None) -> None: release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" gh_headers = { "Accept": "application/vnd.github+json", - "X-GitHub-Api-Version": "2022-11-28" + "X-GitHub-Api-Version": "2022-11-28", } resp = requests.get(url=release_url, headers=gh_headers) zipball_url = resp.json().get("zipball_url") From f1e823f039e54e1d667494c0b0e2ca9d5e947a7b Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 08:43:20 -0500 Subject: [PATCH 13/15] Fix overwritten RefStyle --- sdmx/writer/xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdmx/writer/xml.py b/sdmx/writer/xml.py index b8735dd0d..443292114 100644 --- a/sdmx/writer/xml.py +++ b/sdmx/writer/xml.py @@ -7,7 +7,7 @@ import logging from pathlib import Path -from typing import Iterable, IO, List, Optional, Literal, cast +from typing import IO, Iterable, List, Literal, Optional, Union, cast from lxml import etree from lxml.builder import ElementMaker From f781f993b73f1852cbfa5b818f07b3c173093f96 Mon Sep 17 00:00:00 2001 
From: Reginald Maltais Date: Wed, 3 Jan 2024 09:52:58 -0500 Subject: [PATCH 14/15] Provide SDMX-ML validation docs --- doc/howto.rst | 1 + doc/howto/validate.rst | 44 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 doc/howto/validate.rst diff --git a/doc/howto.rst b/doc/howto.rst index 83912559a..0db4d4347 100644 --- a/doc/howto.rst +++ b/doc/howto.rst @@ -7,6 +7,7 @@ On other pages: :maxdepth: 1 howto/create + howto/validate Access other SDMX data sources ------------------------------ diff --git a/doc/howto/validate.rst b/doc/howto/validate.rst new file mode 100644 index 000000000..77d5e05ed --- /dev/null +++ b/doc/howto/validate.rst @@ -0,0 +1,44 @@ +Validate SDMX-ML against official schemas +***************************************** + +:mod:`sdmx` is capable of generating XML for all kinds of SDMX components. When communicating with remote services +though, only valid SDMX-ML messages can be sent. To help ensure your generated XML complies with the standard you can +call :func:`sdmx.validate_xml`. + +Validation requires having a copy of the `official schema <https://github.com/sdmx-twg/sdmx-ml-v2_1>`_ files available. +To help make this easier, you can use :func:`sdmx.install_schemas`, which will cache a local copy for use in validation. + +Cache schema files +================== + +.. note:: This only needs to be run once. + +.. ipython:: python + + import sdmx + sdmx.install_schemas() + +The schema files will be downloaded and placed in your local cache directory. + +Validate SDMX-ML messages +========================= + +Generate an SDMX-ML message, perhaps by following :doc:`create`. Once you have a file on disk that has an SDMX-ML +message it can be validated by running :func:`sdmx.validate_xml`. These instructions will use the samples provided by +the `SDMX technical working group <https://github.com/sdmx-twg/sdmx-ml-v2_1>`_. + +.. 
code-block:: python + + import sdmx + + sdmx.validate_xml("samples/common/common.xml") + True + + sdmx.validate_xml("samples/demography/demography.xml") + True + +Invalid messages will return ``False``. You will also see a log message to help in tracing the problem:: + + Element '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}Annotations': This element is not expected. + Expected is one of ( {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common}Description, + {http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure}Structure )., line 17 From dd33753a9b1e3d72d7f020e420a60eedd585e644 Mon Sep 17 00:00:00 2001 From: Reginald Maltais Date: Wed, 3 Jan 2024 15:42:36 -0500 Subject: [PATCH 15/15] Fix formatting in GH request headers --- sdmx/tests/writer/test_writer_xml.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sdmx/tests/writer/test_writer_xml.py b/sdmx/tests/writer/test_writer_xml.py index 5ed580a37..d3d291595 100644 --- a/sdmx/tests/writer/test_writer_xml.py +++ b/sdmx/tests/writer/test_writer_xml.py @@ -310,7 +310,7 @@ def test_validate_xml_from_samples(tmp_path): release_url = "https://api.github.com/repos/sdmx-twg/sdmx-ml-v2_1/releases/latest" gh_headers = { "Accept": "application/vnd.github+json", - "X-GitHub-Api-Version": "2022-11-28" + "X-GitHub-Api-Version": "2022-11-28", } resp = requests.get(url=release_url, headers=gh_headers) zipball_url = resp.json().get("zipball_url")