From a53aae581235dfcfbaf92b2869ab1ab001a64d8d Mon Sep 17 00:00:00 2001 From: Ken Wenzel Date: Mon, 15 Apr 2024 09:48:12 +0200 Subject: [PATCH] GH-4950 LMDB: extensible ID scheme Implement an extensible ID scheme for the LmdbStore that also allows embedding values into IDs. --- .../eclipse/rdf4j/sail/lmdb/TripleStore.java | 15 +- .../org/eclipse/rdf4j/sail/lmdb/ValueIds.java | 83 +++++++ .../eclipse/rdf4j/sail/lmdb/ValueStore.java | 229 +++++++++++------- .../rdf4j/sail/lmdb/ValueStoreTest.java | 7 +- 4 files changed, 239 insertions(+), 95 deletions(-) create mode 100644 core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/ValueIds.java diff --git a/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/TripleStore.java b/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/TripleStore.java index b1ea7ad759..0f2040028e 100644 --- a/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/TripleStore.java +++ b/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/TripleStore.java @@ -602,9 +602,18 @@ protected void filterUsedIds(Collection ids) throws IOException { it.remove(); continue; } - if (component != 2 && (id & 1) == 1) { - // id is a literal and can only appear in object position - continue; + if (component != 2) { + // optimization: ensure that literals are only tested if they appear in object + // position + switch (ValueIds.getIdType(id)) { + case ValueIds.T_URI: + case ValueIds.T_BNODE: + case ValueIds.T_TRIPLE: + // fall through + default: + // id is a literal, do not test it + continue; + } } long subj = c == 0 ? id : -1, pred = c == 1 ?
id : -1, diff --git a/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/ValueIds.java b/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/ValueIds.java new file mode 100644 index 0000000000..7bf001be77 --- /dev/null +++ b/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/ValueIds.java @@ -0,0 +1,83 @@ +/******************************************************************************* + * Copyright (c) 2024 Eclipse RDF4J contributors. + * + * All rights reserved. This program and the accompanying materials + * are made available under the terms of the Eclipse Distribution License v1.0 + * which accompanies this distribution, and is available at + * http://www.eclipse.org/org/documents/edl-v10.php. + * + * SPDX-License-Identifier: BSD-3-Clause + *******************************************************************************/ +package org.eclipse.rdf4j.sail.lmdb; + +/** + * Constants and functions for working with ids encoded into long values. + */ +public class ValueIds { + /** + * Pointer to an arbitrary value in the value store. This is not used as RDF value. 
+ */ + public static final int T_PTR = 0; + + /** Reference to a URI */ + public static final int T_URI = 1; + /** Reference to a literal */ + public static final int T_LITERAL = 2; + /** Reference to a blank node */ + public static final int T_BNODE = 3; + /** Reference to a triple */ + public static final int T_TRIPLE = 4; + + // inlined values + public static final int T_INTEGER = 16; + public static final int T_DECIMAL = 17; + public static final int T_FLOAT = 18; + public static final int T_DATETIME = 19; + public static final int T_DATETIMESTAMP = 20; + public static final int T_DATE = 21; + public static final int T_BOOLEAN = 22; + public static final int T_SHORTSTRING = 23; + public static final int T_POSITIVE_INTEGER = 24; + public static final int T_NEGATIVE_INTEGER = 25; + public static final int T_NON_NEGATIVE_INTEGER = 26; + public static final int T_NON_POSITIVE_INTEGER = 27; + public static final int T_LONG = 28; + public static final int T_INT = 29; + public static final int T_SHORT = 30; + public static final int T_BYTE = 31; + public static final int T_UNSIGNEDLONG = 32; + public static final int T_UNSIGNEDINT = 33; + public static final int T_UNSIGNEDSHORT = 34; + public static final int T_UNSIGNEDBYTE = 35; + + /** + * Returns the type section of the given id. + * + * @param id The id of which the type should be extracted. + * @return The id's type. + */ + public static int getIdType(long id) { + return (int) ((id >> 1) & 0x3F); + } + + /** + * Returns the value section of the given id. + * + * @param id The id of which the value should be extracted. + * @return The id's value. + */ + public static long getValue(long id) { + return id >> 7; + } + + /** + * Combines an id type and a value into a single long id. + * + * @param idType The id's type. + * @param value The id's value. + * @return A composite id. 
+ */ + public static long createId(int idType, long value) { + return value << 7 | idType << 1; + } +} diff --git a/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/ValueStore.java b/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/ValueStore.java index a54338ebdf..832469a2ff 100644 --- a/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/ValueStore.java +++ b/core/sail/lmdb/src/main/java/org/eclipse/rdf4j/sail/lmdb/ValueStore.java @@ -49,11 +49,12 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.HashMap; import java.util.HashSet; +import java.util.Map; import java.util.Optional; import java.util.Set; import java.util.concurrent.locks.ReadWriteLock; @@ -95,24 +96,28 @@ class ValueStore extends AbstractValueFactory { private static final long VALUE_EVICTION_INTERVAL = 60000; // 60 seconds - private static final byte URI_VALUE = 0x0; // 00 + private static final byte URI_VALUE = 0; - private static final byte LITERAL_VALUE = 0x1; // 01 + private static final byte LITERAL_VALUE = 1; - private static final byte BNODE_VALUE = 0x2; // 10 + private static final byte BNODE_VALUE = 2; - private static final byte NAMESPACE_VALUE = 0x3; // 11 + private static final byte TRIPLE_VALUE = 3; - private static final byte ID_KEY = 0x4; + private static final byte NAMESPACE_VALUE = 4; - private static final byte HASH_KEY = 0x5; + private static final byte ID_KEY = 5; - private static final byte HASHID_KEY = 0x6; + private static final byte HASH_KEY = 6; + + private static final byte HASHID_KEY = 7; /*** * Maximum size of keys before hashing is used (size of two long values) */ private static final int MAX_KEY_SIZE = 16; + // package-protected for testing + final Set unusedRevisionIds = new HashSet<>(); /** * Used to do the actual storage of values, once they're translated to byte 
arrays. */ @@ -138,6 +143,13 @@ class ValueStore extends AbstractValueFactory { * namespace. */ private final ConcurrentCache namespaceIDCache; + private final boolean forceSync; + private final boolean autoGrow; + /** + * This lock is required to block transactions while auto-growing the map size. + */ + private final ReadWriteLock txnLock = new ReentrantReadWriteLock(); + private final ConcurrentCleaner cleaner = new ConcurrentCleaner(); /** * Used to do the actual storage of values, once they're translated to byte arrays. */ @@ -153,14 +165,7 @@ class ValueStore extends AbstractValueFactory { // database with internal reference counts for IRIs and namespaces private int refCountsDbi; private long writeTxn; - private final boolean forceSync; - private final boolean autoGrow; private boolean invalidateRevisionOnCommit = false; - /** - * This lock is required to block transactions while auto-growing the map size. - */ - private final ReadWriteLock txnLock = new ReentrantReadWriteLock(); - /** * An object that indicates the revision of the value store, which is used to check if cached value IDs are still * valid. In order to be valid, the ValueStoreRevision object of a LmdbValue needs to be equal to this object. @@ -171,19 +176,13 @@ class ValueStore extends AbstractValueFactory { * object is GCed then it is safe to finally remove the ID-value associations and to reuse IDs. 
*/ private volatile ValueStoreRevision.Lazy lazyRevision; - /** * The next ID that is associated with a stored value */ private long nextId = 1; private boolean freeIdsAvailable; - private volatile long nextValueEvictionTime = 0; - - // package-protected for testing - final Set unusedRevisionIds = new HashSet<>(); - - private final ConcurrentCleaner cleaner = new ConcurrentCleaner(); + private final Map refCounts = new HashMap<>(); ValueStore(File dir, LmdbStoreConfig config) throws IOException { this.dir = dir; @@ -353,7 +352,25 @@ private void open() throws IOException { }); } - private long nextId(byte type) throws IOException { + private long nextId(byte valueType) throws IOException { + int idType; + switch (valueType) { + case URI_VALUE: + idType = ValueIds.T_URI; + break; + case BNODE_VALUE: + idType = ValueIds.T_BNODE; + break; + case LITERAL_VALUE: + idType = ValueIds.T_LITERAL; + break; + case NAMESPACE_VALUE: + idType = ValueIds.T_PTR; + break; + default: + throw new IllegalArgumentException("Unexpected value type: " + valueType); + + } if (freeIdsAvailable) { // next id from store Long reusedId = writeTransaction((stack, txn) -> { @@ -366,8 +383,8 @@ private long nextId(byte type) throws IOException { MDBVal keyData = MDBVal.calloc(stack); MDBVal valueData = MDBVal.calloc(stack); if (mdb_cursor_get(cursor, keyData, valueData, MDB_FIRST) == 0) { - // remove lower 2 type bits - long value = data2id(keyData.mv_data()) >> 2; + // unpack value from compound id + long value = ValueIds.getValue(data2id(keyData.mv_data())); // delete entry E(mdb_cursor_del(cursor, 0)); return value; @@ -381,17 +398,12 @@ private long nextId(byte type) throws IOException { } }); if (reusedId != null) { - long result = reusedId; - // encode type in lower 2 bits of id - result = (result << 2) | type; - return result; + return ValueIds.createId(idType, reusedId); } } long result = nextId; nextId++; - // encode type in lower 2 bits of id - result = (result << 2) | type; - return 
result; + return ValueIds.createId(idType, result); } protected ByteBuffer idBuffer(MemoryStack stack) { @@ -481,18 +493,19 @@ public LmdbValue getLazyValue(long id) throws IOException { LmdbValue resultValue = cachedValue(cacheID); if (resultValue == null) { - switch ((byte) (id & 0x3)) { - case URI_VALUE: + int idType = ValueIds.getIdType(id); + switch (idType) { + case ValueIds.T_URI: resultValue = new LmdbIRI(lazyRevision, id); break; - case LITERAL_VALUE: + case ValueIds.T_LITERAL: resultValue = new LmdbLiteral(lazyRevision, id); break; - case BNODE_VALUE: + case ValueIds.T_BNODE: resultValue = new LmdbBNode(lazyRevision, id); break; default: - throw new IOException("Unsupported value with type id " + (id & 0x3)); + throw new IOException("Unsupported value with id type: " + idType); } // Store value in cache cacheValue(cacheID, resultValue); @@ -568,7 +581,7 @@ private void resizeMap(long txn, long requiredSize) throws IOException { mdb_txn_reset(txn); } if (activeWriteTxn) { - endTransaction(true); + endTransaction(true, true); } mapSize = LmdbUtil.autoGrowMapSize(mapSize, pageSize, requiredSize); E(mdb_env_set_mapsize(env, mapSize)); @@ -586,55 +599,84 @@ private void resizeMap(long txn, long requiredSize) throws IOException { } } - private void incrementRefCount(MemoryStack stack, long writeTxn, byte[] data) throws IOException { + private void incrementRefCount(MemoryStack stack, long writeTxn, byte[] data) { // literals have a datatype id and URIs have a namespace id if (data[0] == LITERAL_VALUE || data[0] == URI_VALUE) { - try { - stack.push(); - ByteBuffer bb = ByteBuffer.wrap(data); - // skip type marker - int idLength = Varint.firstToLength(bb.get(1)); - MDBVal idVal = MDBVal.calloc(stack); - MDBVal dataVal = MDBVal.calloc(stack); - idVal.mv_data(idBuffer(stack).put(ID_KEY).put(data, 1, idLength).flip()); - long newCount = 1; - if (mdb_get(writeTxn, refCountsDbi, idVal, dataVal) == 0) { - // update count - newCount = 
Varint.readUnsigned(dataVal.mv_data()) + 1; + // skip type marker + long id = Varint.readUnsigned(ByteBuffer.wrap(data, 1, data.length - 1)); + refCounts.compute(id, (k, v) -> { + if (v == null) { + try { + stack.push(); + MDBVal idVal = MDBVal.calloc(stack); + MDBVal dataVal = MDBVal.calloc(stack); + idVal.mv_data(idBuffer(stack).put(data, 1, Varint.calcLengthUnsigned(id)).flip()); + long newCount = 1; + if (mdb_get(writeTxn, refCountsDbi, idVal, dataVal) == 0) { + // update count + newCount = Varint.readUnsigned(dataVal.mv_data()) + 1; + } + return newCount; + } finally { + stack.pop(); + } + } else { + return v + 1; } - // write count - ByteBuffer countBb = stack.malloc(Varint.calcLengthUnsigned(newCount)); - Varint.writeUnsigned(countBb, newCount); - dataVal.mv_data(countBb.flip()); - E(mdb_put(writeTxn, refCountsDbi, idVal, dataVal, 0)); - } finally { - stack.pop(); - } + }); } } - private boolean decrementRefCount(MemoryStack stack, long writeTxn, ByteBuffer idBb) throws IOException { + private boolean decrementRefCount(MemoryStack stack, long writeTxn, long id) { + return refCounts.compute(id, (k, v) -> { + if (v == null) { + try { + stack.push(); + MDBVal idVal = MDBVal.calloc(stack); + MDBVal dataVal = MDBVal.calloc(stack); + ByteBuffer idBb = idBuffer(stack).put(ID_KEY); + Varint.writeUnsigned(idBb, id); + idVal.mv_data(idBb.flip()); + long newCount = 0; + if (mdb_get(writeTxn, refCountsDbi, idVal, dataVal) == 0) { + // update count + newCount = Varint.readUnsigned(dataVal.mv_data()) - 1; + } + return newCount; + } finally { + stack.pop(); + } + } else { + return v - 1; + } + }) == 0; + } + + private void updateRefCounts(MemoryStack stack, long writeTxn) throws IOException { try { stack.push(); - MDBVal idVal = MDBVal.calloc(stack); - idVal.mv_data(idBb); + ByteBuffer idBb = idBuffer(stack); + ByteBuffer countBb = stack.malloc(Long.BYTES + 1); MDBVal dataVal = MDBVal.calloc(stack); - if (mdb_get(writeTxn, refCountsDbi, idVal, dataVal) == 0) { - // update 
count - long newCount = Varint.readUnsigned(dataVal.mv_data()) - 1; - if (newCount <= 0) { + + for (Map.Entry entry : refCounts.entrySet()) { + long count = entry.getValue(); + idBb.clear(); + idBb.put(ID_KEY); + Varint.writeUnsigned(idBb, entry.getKey()); + idVal.mv_data(idBb.flip()); + if (count <= 0) { + // delete count entry E(mdb_del(writeTxn, refCountsDbi, idVal, null)); - return true; } else { - // write count - ByteBuffer countBb = stack.malloc(Varint.calcLengthUnsigned(newCount)); - Varint.writeUnsigned(countBb, newCount); + // update count + countBb.clear(); + Varint.writeUnsigned(countBb, count); dataVal.mv_data(countBb.flip()); mdb_put(writeTxn, refCountsDbi, idVal, dataVal, 0); } } - return false; } finally { stack.pop(); } @@ -918,8 +960,15 @@ public void gcIds(Collection ids, Collection nextIds) throws IOExcep revIdVal.mv_data(id2data(revIdBb, id).flip()); // check if id has internal references and therefore cannot be deleted idVal.mv_data(revIdBb.slice().position(revLength)); - if (mdb_get(writeTxn, refCountsDbi, idVal, dataVal) == 0) { - continue; + Long refCount = refCounts.get(id); + if (refCount == null) { + if (mdb_get(writeTxn, refCountsDbi, idVal, dataVal) == 0) { + continue; + } + } else { + if (refCount > 0) { + continue; + } } // mark id as unused E(mdb_put(writeTxn, unusedDbi, revIdVal, dataVal, 0)); @@ -946,27 +995,22 @@ protected void deleteValueToIdMappings(MemoryStack stack, long txn, Collection datatypeIds = new LinkedList<>(); - for (int i = 1; i < types.length; i++) { + for (int i = 0; i < types.length; i++) { datatypeIds.add(valueStore.storeValue(types[i])); } valueStore.commit(); valueStore.startTransaction(); valueStore.gcIds(Collections.singleton(values[0].getInternalID()), new HashSet<>()); - valueStore.gcIds(datatypeIds, new HashSet<>()); + valueStore.gcIds(datatypeIds.subList(1, datatypeIds.size() - 1), new HashSet<>()); valueStore.commit(); // close and recreate store valueStore.close(); valueStore = createValueStore(); + 
// the first value is directly GCed assertNull(valueStore.getValue(values[0].getInternalID())); - // the first datatype is not directly garbage collected and must not be + // the first datatype is not directly GCed and must not be // removed from the store if the related literal is removed assertNotNull(valueStore.getValue(datatypeIds.remove(0)));