Skip to content

Commit

Permalink
Merge pull request #200 from khaeru/issue/199
Browse files: browse the repository at this point in the history
Fix XML namespace handling in read of structure-specific SDMX-ML
  • Loading branch information
khaeru authored Oct 23, 2024
2 parents e88f8d3 + 293b0c6 commit 70cf79c
Show file tree
Hide file tree
Showing 10 changed files with 89 additions and 58 deletions.
1 change: 1 addition & 0 deletions doc/whatsnew.rst
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ Next release
============

- Bug fix for writing :class:`.VersionableArtefact` to SDMX-ML 2.1: :class:`KeyError` was raised if :attr:`.VersionableArtefact.version` was an instance of :class:`.Version` (:pull:`198`).
- Bug fix for reading data from structure-specific SDMX-ML: :class:`.XMLParseError` / :class:`NotImplementedError` was raised when reading 2 messages in sequence that define different XML namespaces (:pull:`200`, thanks :gh-user:`mephinet` for :issue:`199`).

v2.18.0 (2024-10-15)
====================
Expand Down
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -95,10 +95,11 @@ select = ["C9", "E", "F", "I", "W"]
ignore = ["E501", "W191"]
# Exceptions:
# - .client._handle_get_kwargs: 12
# - .reader.xml.v21.read_message: 15
# - .reader.xml.v21._component_end: 12
# - .testing.generate_endpoint_tests: 11
# - .writer.pandas._maybe_convert_datetime: 23
# - .writer.pandas.write_dataset: 12
mccabe.max-complexity = 11
mccabe.max-complexity = 10

[tool.setuptools.packages]
find = {}
Expand Down
2 changes: 1 addition & 1 deletion sdmx/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,7 @@ def _request_from_url(self, kwargs):

return requests.Request("get", url, params=parameters, headers=headers)

def _handle_get_kwargs(self, kwargs): # noqa: C901 TODO reduce complexity 12 → ≤11
def _handle_get_kwargs(self, kwargs): # noqa: C901 TODO reduce complexity 12 → ≤10
if kwargs.pop("validate", None) is not None:
warn("validate= keyword argument to Client.get()", DeprecationWarning)

Expand Down
38 changes: 22 additions & 16 deletions sdmx/format/xml/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
# To be formatted
"com": "{}/common",
"md": "{}/metadata/generic",
"md_ss": "{}/metadata/structurespecific",
"data": "{}/data/structurespecific",
"str": "{}/structure",
"mes": "{}/message",
Expand Down Expand Up @@ -273,6 +274,8 @@ def install_schemas(


class XMLFormat:
"""Information about an SDMX-ML format."""

NS: Mapping[str, Optional[str]]
_class_tag: list

Expand Down Expand Up @@ -306,29 +309,32 @@ def ns_prefix(self, url) -> str:
return prefix
raise ValueError(url)

_NS_PATTERN = re.compile(r"(\{(?P<ns>.*)\}|(?P<ns_prefix>.*):)?(?P<localname>.*)")

@lru_cache()
def qname(self, ns_or_name, name=None) -> QName:
def qname(self, ns_or_name: str, name: Optional[str] = None) -> QName:
"""Return a fully-qualified tag `name` in namespace `ns`."""
if isinstance(ns_or_name, QName):
# Already a QName; do nothing
return ns_or_name
else:
if name is None:
match = re.fullmatch(
r"(\{(?P<ns_full>.*)\}|(?P<ns_key>.*):)?(?P<name>.*)", ns_or_name
)
assert match
name = match.group("name")
if ns_key := match.group("ns_key"):
ns = self.NS[ns_key]
elif ns := match.group("ns_full"):
pass
else:
ns = None

if name is None:
# `ns_or_name` contains the local name ("tag") and possibly a namespace
# prefix ("ns:tag") or full namespace name ("{foo}tag")
match = self._NS_PATTERN.fullmatch(ns_or_name)
assert match
name = match.group("localname")
if prefix := match.group("ns_prefix"):
ns = self.NS[prefix]
elif ns := match.group("ns"):
pass
else:
ns = self.NS[ns_or_name]
ns = None # Tag without namespace
else:
# `ns_or_name` is the namespace prefix; `name` is the local name
ns = self.NS[ns_or_name]

return QName(ns, name)
return QName(ns, name)

@lru_cache()
def class_for_tag(self, tag) -> Optional[type]:
Expand Down
2 changes: 1 addition & 1 deletion sdmx/reader/json.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ class Reader(BaseReader):
def detect(cls, content):
return content.startswith(b"{")

def read_message(self, source, structure=None, **kwargs): # noqa: C901 TODO reduce complexity 15 → ≤11
def read_message(self, source, structure=None, **kwargs): # noqa: C901 TODO reduce complexity 15 → ≤10
# Initialize message instance
msg = DataMessage()

Expand Down
32 changes: 11 additions & 21 deletions sdmx/reader/xml/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -129,9 +129,9 @@ class XMLEventReader(BaseReader):
#: :class:`.BaseReference` subclass used by this reader.
Reference: ClassVar[type[BaseReference]]

# Mapping from (QName, ["start", "end"]) to a function that parses the element/event
# or else None
parser: ClassVar[Mapping[tuple[QName, str], Callable]]
#: Mapping from (QName, ["start", "end"]) to a function that parses the
#: element/event or else None (no parsing).
parser: ClassVar[dict[tuple[QName, str], Callable]]

# One-way counter for use in stacks
_count: Iterator[int]
Expand All @@ -151,7 +151,7 @@ def __init__(self):

# BaseReader methods

def read_message( # noqa: C901 TODO reduce complexity 12 → ≤11
def read_message(
self,
source,
structure=None,
Expand Down Expand Up @@ -196,20 +196,14 @@ def read_message( # noqa: C901 TODO reduce complexity 12 → ≤11
# Don't know what to do for this (element, event)
raise NotImplementedError(element.tag, event) from None

try:
# Parse the element
result = func(self, element)
except TypeError:
if func is None: # Explicitly no parser for this (element, event)
continue # Skip
else: # pragma: no cover
raise
else:
# Store the result
self.push(result)
if func is None:
continue # Explicitly no parser for this (element, event) → skip

result = func(self, element) # Parse the element
self.push(result) # Store the result

if event == "end":
element.clear() # Free memory
if event == "end":
element.clear() # Free memory

except Exception as exc:
# Parsing failed; display some diagnostic information
Expand Down Expand Up @@ -368,10 +362,6 @@ def unstash(self):
self.stack[s].update(values)

# Delegate to version-specific module
@classmethod
def NS(cls):
return cls.format.NS

@classmethod
def class_for_tag(cls, tag: str) -> type:
return cls.format.class_for_tag(tag)
Expand Down
26 changes: 12 additions & 14 deletions sdmx/reader/xml/v21.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
import logging
import re
from copy import copy
from itertools import chain
from itertools import chain, filterfalse
from sys import maxsize
from typing import Any, MutableMapping, Optional, cast

Expand All @@ -34,7 +34,6 @@
)

log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)


class _NoText:
Expand Down Expand Up @@ -151,14 +150,11 @@ def _message(reader: Reader, elem):
reader.push("DataSetClass", model.get_class(f"{QName(elem).localname}Set"))

# Handle namespaces mapped on `elem` but not part of the standard set
for key, value in filter(
lambda kv: kv[1] not in set(reader.NS().values()), elem.nsmap.items()
):
# Register the namespace
reader.NS().update({key: value})
# Use _ds_start() and _ds_end() to handle <{key}:DataSet> elements
reader.start(f"{key}:DataSet", only=False)(_ds_start)
reader.end(f"{key}:DataSet", only=False)(_ds_end)
existing_ns = set(reader.format.NS.values())
for namespace in filterfalse(existing_ns.__contains__, elem.nsmap.values()):
# Use _ds_start() and _ds_end() to handle <{namespace}DataSet> elements
reader.parser[QName(namespace, "DataSet"), "start"] = _ds_start
reader.parser[QName(namespace, "DataSet"), "end"] = _ds_end

# Instantiate the message object
return reader.class_for_tag(elem.tag)()
Expand Down Expand Up @@ -602,7 +598,7 @@ def _maybe_unbounded(value: str) -> Optional[int]:
return None if value == "unbounded" else int(value)


# TODO Reduce complexity from 12 → 11, by adding separate parsers for certain COMPONENTs
# TODO Reduce complexity from 12 → ≤10, by adding separate parsers for some COMPONENTs
@end(COMPONENT, only=False)
@possible_reference(unstash=True)
def _component_end(reader: Reader, elem): # noqa: C901
Expand Down Expand Up @@ -1160,7 +1156,7 @@ def _obs_ss(reader, elem):
except KeyError:
pass
else:
elem.attrib[dim_at_obs.id] = reader.qname(tmp).localname
_, elem.attrib[dim_at_obs.id] = tmp.split(":", maxsplit=2)

if ss_without_structure and dim_at_obs is not model.AllDimensions:
# Create the observation key
Expand Down Expand Up @@ -1241,8 +1237,10 @@ def _mds_start(reader, elem):
mds = reader.class_for_tag(elem.tag)()

# Retrieve the (message-local) ID referencing a data structure definition
id = elem.attrib.get("structureRef", None) or elem.attrib.get(
reader.qname("metadata:structureRef"), None
id = (
elem.attrib.get("structureRef", None)
or elem.attrib.get(reader.qname("md:structureRef"), None)
or elem.attrib.get(reader.qname("md_ss:structureRef"), None)
)

# Get a reference to the MSD that structures the data set
Expand Down
2 changes: 1 addition & 1 deletion sdmx/testing/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ def parametrize_specimens(metafunc):
}


def generate_endpoint_tests(metafunc):
def generate_endpoint_tests(metafunc): # noqa: C901 TODO reduce complexity 11 → ≤10
"""pytest hook for parametrizing tests that need an "endpoint" fixture.
This function relies on the :class:`.DataSourceTest` base class defined in
Expand Down
35 changes: 35 additions & 0 deletions sdmx/tests/reader/test_reader_xml_v21.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,41 @@ def test_gh_164(specimen):
assert isinstance(da.related_to, v21.NoSpecifiedRelationship)


def test_gh_199():
    """Regression test for https://github.com/khaeru/sdmx/issues/199.

    Two structure-specific data messages are read one after the other. Both use the
    "u:" XML namespace prefix, but each maps it to a different namespace derived from
    its own DSD URN.
    """
    import sdmx.format.xml.v21

    # Template for DSD URN
    urn_template = "urn:sdmx:org.sdmx.infomodel.datastructure.DataStructure=FOO:BAR({})"

    # Template for SDMX-ML data message
    message_template = """<?xml version="1.0" encoding="UTF-8"?>
<mes:StructureSpecificData
xmlns:mes="http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message"
xmlns:u="{}:ObsLevelDim:TIME_PERIOD">
<u:DataSet>
...
</u:DataSet>
</mes:StructureSpecificData>"""

    # Build (message, DSD) pairs for 2 *different* URNs; each message is stored as
    # BytesIO and maps its own URN to the "u:" XML namespace prefix
    cases = []
    for version in ("1", "2"):
        urn = urn_template.format(version)
        dsd = v21.DataStructureDefinition(urn=urn)
        cases.append((BytesIO(message_template.format(urn).encode()), dsd))

    (first_source, first_dsd), (second_source, second_dsd) = cases

    # First message can be parsed
    sdmx.read_sdmx(first_source, structure=first_dsd)

    # #199: raises XMLParseError/NotImplementedError
    sdmx.read_sdmx(second_source, structure=second_dsd)


# Each entry is a tuple with 2 elements:
# 1. an instance of lxml.etree.Element to be parsed.
# 2. Either:
Expand Down
4 changes: 2 additions & 2 deletions sdmx/writer/pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,7 +209,7 @@ def _rp(obj: model.RangePeriod, **kwargs):


@writer
def write_dataset( # noqa: C901 TODO reduce complexity 12 → ≤11
def write_dataset( # noqa: C901 TODO reduce complexity 12 → ≤10
obj: model.DataSet,
attributes="",
dtype=np.float64,
Expand Down Expand Up @@ -377,7 +377,7 @@ def _dataset_compat(df, datetime, kwargs):
return df, datetime, kwargs


def _maybe_convert_datetime(df, arg, obj, dsd=None): # noqa: C901 TODO reduce complexity 23 → ≤11
def _maybe_convert_datetime(df, arg, obj, dsd=None): # noqa: C901 TODO reduce complexity 23 → ≤10
"""Helper for :meth:`.write_dataset` to handle datetime indices.
Parameters
Expand Down

0 comments on commit 70cf79c

Please sign in to comment.