Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Explore better XML to Thing transformation { WIP *NOK* } #895

Draft
wants to merge 1 commit into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions java/dev/enola/format/xml/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ junit_tests(
":xml",
"//java/dev/enola/common/context",
"//java/dev/enola/common/io",
"//java/dev/enola/common/xml",
"//java/dev/enola/thing:thing_java",
"//java/dev/enola/thing/testlib",
"//test",
Expand Down
119 changes: 119 additions & 0 deletions java/dev/enola/format/xml/XMLToMultimapHandler.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2024 The Enola <https://enola.dev> Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.enola.format.xml;

import com.google.common.base.Strings;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ListMultimap;

import dev.enola.common.io.iri.namespace.NamespaceRepository;
import dev.enola.common.io.iri.namespace.NamespaceRepositoryBuilder;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

import java.util.ArrayDeque;
import java.util.Deque;
import java.util.Optional;

/**
 * SAX handler which converts an XML document into nested {@link ListMultimap}s keyed by IRI.
 *
 * <p>For every element a new multimap is created. Each attribute of the element is stored in it
 * under the attribute's IRI, any non-blank text content is stored (trimmed) under {@link
 * #TEXT_PROPERTY_IRI}, and every non-empty child element is stored as a nested multimap under the
 * child element's IRI. Elements which have no attributes, no text and no non-empty children are
 * omitted from their parent.
 *
 * <p>IRIs are built from the namespace URI and the local name; names without a namespace URI use
 * the default namespace IRI given to the constructor.
 */
public class XMLToMultimapHandler extends DefaultHandler {

    // TODO Ditch Multimap approach, and just try a DOM-based take!
    // because Multimap does NOT "preserve order" (only "by IRI")

    // TODO Move from dev.enola.format.xml to dev.enola.common.xml ?

    // TODO Make this "streaming" by calling out to a JSON-B/P/?? Handler

    private static final Logger LOG = LoggerFactory.getLogger(XMLToMultimapHandler.class);

    public static final String TEXT_PROPERTY_IRI = "https://enola.dev/text";
    public static final String NODES_PROPERTY_IRI = "https://enola.dev/nodes";

    private final String defaultNamespaceIRI;
    private final NamespaceRepositoryBuilder nrb = new NamespaceRepositoryBuilder();

    /**
     * Stack of the multimaps of the currently open elements; the tail is the innermost one. The
     * bottom entry is a holder created by the constructor, into which the document element is put.
     */
    private final Deque<ListMultimap<String, Object>> multimaps = new ArrayDeque<>();

    /**
     * Text received via {@link #characters(char[], int, int)} which has not yet been stored. A SAX
     * parser is free to report one contiguous run of text in several chunks, so the text is
     * collected here and only trimmed and stored once the run is known to be complete.
     */
    private final StringBuilder pendingText = new StringBuilder();

    /**
     * Creates a new handler.
     *
     * @param defaultNamespaceIRI the IRI used as namespace for elements and attributes for which
     *     the parser reports no (or an empty) namespace URI
     */
    public XMLToMultimapHandler(String defaultNamespaceIRI) {
        this.defaultNamespaceIRI = defaultNamespaceIRI;
        multimaps.addLast(ArrayListMultimap.create());
    }

    // TODO This actually isn't used anywhere - yet?
    /**
     * Returns the namespaces built from the prefix mappings reported so far.
     *
     * @return a repository built from all prefixes seen via {@link #startPrefixMapping}
     */
    public NamespaceRepository getNamespaces() {
        return nrb.build();
    }

    /**
     * Returns the holder multimap, which contains the document element under its IRI.
     *
     * @return the (always present) holder multimap
     * @throws IllegalStateException if elements are still open, i.e. parsing has not completed
     */
    public Optional<ListMultimap<String, Object>> getRoot() {
        if (multimaps.size() != 1) throw new IllegalStateException(multimaps.toString());
        return Optional.of(multimaps.getLast());
    }

    @Override
    public void startPrefixMapping(String prefix, String uri) {
        nrb.store(prefix, uri);
    }

    /**
     * Builds the IRI of an element or attribute.
     *
     * @param uri the namespace URI; if null or empty, the default namespace IRI is used instead
     * @param localName the local name; must not be null or empty
     * @param qName the qualified name, only used in the exception message
     * @return the namespace followed by the local name, separated by a "/" unless the namespace
     *     already ends with "/" or "#"
     * @throws IllegalStateException if the local name is null or empty
     */
    private String iri(String uri, String localName, String qName) {
        if (Strings.isNullOrEmpty(uri)) uri = defaultNamespaceIRI;
        if (Strings.isNullOrEmpty(localName)) throw new IllegalStateException(uri + " " + qName);
        if (uri.endsWith("/") || uri.endsWith("#")) return uri + localName;
        return uri + "/" + localName;
    }

    /**
     * Stores the text collected so far (if it is not blank) in the multimap of the innermost open
     * element, and resets the buffer.
     */
    private void flushPendingText() {
        if (pendingText.length() == 0) return;
        var text = pendingText.toString().trim();
        pendingText.setLength(0);
        if (!text.isEmpty()) multimaps.getLast().put(TEXT_PROPERTY_IRI, text);
    }

    @Override
    public void startElement(String uri, String localName, String qName, Attributes attributes) {
        // Text seen before this start tag belongs to the enclosing element.
        flushPendingText();

        ListMultimap<String, Object> multimap = ArrayListMultimap.create();
        multimaps.addLast(multimap);

        for (int i = 0; i < attributes.getLength(); i++) {
            var attributeURI = attributes.getURI(i);
            var attributeLocalName = attributes.getLocalName(i);
            var attributeQName = attributes.getQName(i);
            var attributeValue = attributes.getValue(i);
            // TODO ? var attributeType = attributes.getType(i);
            var attributeIRI = iri(attributeURI, attributeLocalName, attributeQName);
            LOG.trace(
                    "attribute #{}: uri={}, localName={}, qName={}; value={}",
                    i,
                    attributeURI,
                    attributeLocalName,
                    attributeQName,
                    attributeValue);
            multimap.put(attributeIRI, attributeValue);
        }
    }

    @Override
    public void endElement(String uri, String localName, String qName) {
        // Text seen before this end tag belongs to the element which is being closed.
        flushPendingText();

        var nested = multimaps.removeLast();
        // Elements without any attributes, text or (non-empty) children are not recorded.
        if (!nested.isEmpty()) multimaps.getLast().put(iri(uri, localName, qName), nested);
    }

    @Override
    public void characters(char[] ch, int start, int length) {
        // Only buffer here; see pendingText for why this cannot be trimmed & stored right away.
        pendingText.append(ch, start, length);
    }

    @Override
    public void endDocument() {
        // Do not lose text which was reported after the last element boundary.
        flushPendingText();
    }
}
42 changes: 42 additions & 0 deletions java/dev/enola/format/xml/XMLToMultimapHandlerTest.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
/*
* SPDX-License-Identifier: Apache-2.0
*
* Copyright 2024 The Enola <https://enola.dev> Authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package dev.enola.format.xml;

import dev.enola.common.io.resource.ClasspathResource;
import dev.enola.common.io.resource.ResourceProvider;
import dev.enola.common.xml.XmlResourceParser;

import org.junit.Test;

import java.io.IOException;
import java.net.URI;

/**
 * Exploratory test for {@link XMLToMultimapHandler}.
 *
 * <p>This is still work in progress: instead of asserting anything, the test currently always
 * fails with an exception whose message is the parsed result, so that it can be inspected.
 */
public class XMLToMultimapHandlerTest {

    private final ResourceProvider resources = new ClasspathResource.Provider();
    private final XmlResourceParser parser = new XmlResourceParser();

    @Test
    public void greeting1nesteds() throws IOException {
        // TODO "classpath:/test.html.xml"
        var parsed = parse("classpath:/greeting1-nesteds.xml");

        // Intentionally dump the result by failing, until there are real assertions.
        var dump = parsed.getRoot().toString();
        throw new IllegalStateException(dump);
    }

    /** Parses the XML resource at the given URI with a new handler, and returns that handler. */
    private XMLToMultimapHandler parse(String uri) throws IOException {
        var multimapHandler = new XMLToMultimapHandler("https://example.org/");
        var xml = resources.getReadableResource(URI.create(uri));
        parser.convertInto(xml, multimapHandler);
        return multimapHandler;
    }
}
50 changes: 43 additions & 7 deletions java/dev/enola/format/xml/XMLToThingHandler.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,9 @@
package dev.enola.format.xml;

import com.google.common.base.Strings;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.ListMultimap;

import dev.enola.common.io.iri.namespace.NamespaceRepository;
import dev.enola.common.io.iri.namespace.NamespaceRepositoryBuilder;
Expand All @@ -44,12 +47,16 @@
*/
public class XMLToThingHandler extends DefaultHandler {

// TODO Simplify the (historical) thingBuilders & multimaps into 1 single data structure?

// TODO Consider implementing this via & through the existing JSON[-LD, ctx?] support instead?

// TODO Support <xsd:boolean>true</xsd:boolean> or <dateValue type="date">2023-12-31</dateValue>

private static final Logger LOG = LoggerFactory.getLogger(XMLToThingHandler.class);

public static final String TEXT_PROPERTY_IRI = "https://enola.dev/text";
public static final String NODES_PROPERTY_IRI = "https://enola.dev/nodes";

private final String defaultNamespaceIRI;
private final NamespaceRepositoryBuilder nrb = new NamespaceRepositoryBuilder();
Expand All @@ -58,6 +65,9 @@ public class XMLToThingHandler extends DefaultHandler {
private final Deque<IImmutablePredicatesObjects.Builder<IImmutablePredicatesObjects>>
thingBuilders = new ArrayDeque<>();

private final Deque<ListMultimap<String, IImmutablePredicatesObjects>> multimaps =
new ArrayDeque<>();

@SuppressWarnings("unchecked")
public XMLToThingHandler(String baseIRI, Thing.Builder<?> thingBuilder) {
this(
Expand All @@ -72,6 +82,7 @@ public XMLToThingHandler(
this.defaultNamespaceIRI = defaultNamespaceIRI;
}

// TODO This actually isn't used anywhere - yet?
public NamespaceRepository getNamespaces() {
return nrb.build();
}
Expand Down Expand Up @@ -105,6 +116,7 @@ public void startElement(String uri, String localName, String qName, Attributes

var nested = ImmutablePredicatesObjects.builder();
thingBuilders.add(nested);
multimaps.add(ArrayListMultimap.create());

for (int i = 0; i < attributes.getLength(); i++) {
var attributeURI = attributes.getURI(i);
Expand Down Expand Up @@ -133,18 +145,42 @@ public void endElement(String uri, String localName, String qName) {

} else {
// End same level element
// NOOP.
}

var nested = thingBuilders.removeLast().build();
if (!nested.predicateIRIs().isEmpty())
if (nested.predicateIRIs().size() > 1
|| !nested.predicateIRIs().iterator().next().equals(TEXT_PROPERTY_IRI))
thingBuilders.getLast().set(iri(uri, localName, qName), nested);
else {
var multimap = multimaps.removeLast();
if (!multimap.keys().isEmpty()) {
var multimapKeysSize = multimap.keys().size();
if (multimapKeysSize > 1
|| !multimap.keys().iterator().next().equals(TEXT_PROPERTY_IRI)) {
// Are there duplicate keys of the same predicate IRI repeated?
if (multimapKeysSize == multimap.keySet().size()) {
var nested = thingBuilders.removeLast().build();
thingBuilders.getLast().set(iri(uri, localName, qName), nested);
multimap.put(iri(uri, localName, qName), nested);

} else {
// Remove, but do not build() - as it would fail!
thingBuilders.removeLast();
ImmutableList.Builder<ImmutablePredicatesObjects> list =
ImmutableList.builderWithExpectedSize(multimapKeysSize);
// TODO Can this be simplified?
multimap.forEach(
(predicateIRI, predicatesObjects) ->
list.add(
ImmutablePredicatesObjects.builderWithExpectedSize(1)
.set(predicateIRI, predicatesObjects)
.build()));
var builtList = list.build();
thingBuilders.getLast().set(NODES_PROPERTY_IRI, builtList);
// TODO ? multimap.put(iri(uri, localName, qName), builtList);
}

} else {
var nested = thingBuilders.removeLast().build();
var text = nested.get(TEXT_PROPERTY_IRI, String.class);
thingBuilders.getLast().set(iri(uri, localName, qName), text);
}
}
}

@Override
Expand Down
6 changes: 2 additions & 4 deletions java/dev/enola/format/xml/XMLToThingsHandlerTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@
import dev.enola.thing.repo.ThingsBuilder;
import dev.enola.thing.testlib.ThingsSubject;

import org.junit.Ignore;
import org.junit.Test;

import java.io.IOException;
Expand Down Expand Up @@ -100,11 +99,10 @@ public void greeting1nesteds() throws IOException {
}

@Test
@Ignore // TODO FIXME
public void xhtml() throws IOException {
assertThat(loader.convertInto(URI.create("classpath:/test.html.xml"), thingsBuilder))
.isTrue();
throw new IllegalStateException(thingsBuilder.toString());
// TODO ThingsSubject.assertThat(thingsBuilder).isEqualTo("classpath:/test.html.xml.ttl");
// TODO rm? throw new IllegalStateException(thingsBuilder.toString());
ThingsSubject.assertThat(thingsBuilder).isEqualTo("classpath:/test.html.xml.ttl");
}
}
3 changes: 2 additions & 1 deletion test/greeting1-nesteds.xml
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
<greeting1 lang="en">
<message> hello, world </message>
<message>hello</message>
<from style="email">[email protected]</from>
<message>world</message>
</greeting1>
20 changes: 12 additions & 8 deletions test/greeting1-nesteds.xml.ttl
Original file line number Diff line number Diff line change
@@ -1,10 +1,14 @@
@prefix enola: <https://enola.dev/>.
@prefix ex: <https://example.org/>.

<classpath:/greeting1-nesteds.xml>
<https://example.org/greeting1> [
<https://example.org/lang> "en";
<https://example.org/message> "hello, world";
<https://example.org/from> [
<https://example.org/style> "email";
<https://enola.dev/text> "[email protected]";
]
].
ex:greeting1 [
ex:lang "en";
enola:nodes (
ex:message "hello"
ex:from [
ex:style "email";
enola:text "[email protected]";
]
ex:message "world" )
].
1 change: 1 addition & 0 deletions test/test.html.xml
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
<a href="test.html">Recursive Self Link</a>
<a href="https://www.vorburger.ch">Vorburger</a>
<schema:dateCreated datatype="xsd:date">2024-09-28</schema:dateCreated>
.
</p>
<math xmlns="http://www.w3.org/1998/Math/MathML">
<apply>
Expand Down