From b414f212c9c2ee506031dc59a969e3bc6458500f Mon Sep 17 00:00:00 2001 From: sii62122 Date: Tue, 13 Aug 2024 13:15:21 +0200 Subject: [PATCH 1/4] [OAK-10953] Substitute Elastiknn plugin usage with native Elasticsearch knn search feature --- .../index/elastic/ElasticIndexDefinition.java | 3 - .../elastic/ElasticPropertyDefinition.java | 194 +++++++----------- .../index/elastic/index/ElasticDocument.java | 4 +- .../elastic/index/ElasticDocumentMaker.java | 2 +- .../elastic/index/ElasticIndexHelper.java | 42 ++-- .../index/elastic/query/ElasticIndex.java | 6 +- .../elastic/query/ElasticRequestHandler.java | 175 ++++++++-------- .../async/ElasticResultRowAsyncIterator.java | 27 +-- .../ElasticStatisticalFacetAsyncProvider.java | 18 +- .../index/elastic/util/ElasticIndexUtils.java | 34 +-- .../elastic/util/TermQueryBuilderFactory.java | 7 +- .../elastic/ElasticSimilarQueryTest.java | 58 +++--- .../index/elastic/ElasticTestServer.java | 88 +------- .../elastic/index/ElasticIndexHelperTest.java | 6 - .../test/resources/elasticsearch-plugins.yml | 19 -- .../index/search/PropertyDefinition.java | 22 +- 16 files changed, 265 insertions(+), 440 deletions(-) delete mode 100644 oak-search-elastic/src/test/resources/elasticsearch-plugins.yml diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java index 284e4d7e019..3b410eebd86 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticIndexDefinition.java @@ -107,9 +107,6 @@ public class ElasticIndexDefinition extends IndexDefinition { public static final String SPLIT_ON_CASE_CHANGE = "splitOnCaseChange"; public static final String SPLIT_ON_NUMERICS = "splitOnNumerics"; - - public static final String 
ELASTIKNN = "elastiknn"; - private static final String SIMILARITY_TAGS_ENABLED = "similarityTagsEnabled"; private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true; diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java index 59bbe1fdd56..a5d24076b02 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticPropertyDefinition.java @@ -16,141 +16,93 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic; +import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue; + import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition; import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition; import org.apache.jackrabbit.oak.spi.state.NodeState; -import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue; - public class ElasticPropertyDefinition extends PropertyDefinition { - SimilaritySearchParameters similaritySearchParameters; - - public static final String PROP_QUERY_MODEL = "queryModel"; - public static final String PROP_NUMBER_OF_HASH_TABLES = "L"; - public static final String PROP_NUMBER_OF_HASH_FUNCTIONS = "k"; - public static final String PROP_NUMBER_OF_BUCKETS = "w"; - public static final String PROP_INDEX_SIMILARITY = "indexSimilarity"; - public static final String PROP_QUERY_SIMILARITY = "querySimilarity"; - public static final String PROP_CANDIDATES = "candidates"; - public static final String PROP_PROBES = "probes"; - - private static final int DEFAULT_NUMBER_OF_HASH_TABLES = 20; - private static final int DEFAULT_NO_OF_HASH_FUNCTIONS = 15; - private static final int DEFAULT_BUCKET_WIDTH = 500; - private static 
final String DEFAULT_SIMILARITY_QUERY_MODEL = "lsh"; - private static final String DEFAULT_SIMILARITY_INDEX_FUNCTION = "l2"; - private static final String DEFAULT_SIMILARITY_QUERY_FUNCTION = "l2"; - private static final int DEFAULT_QUERY_CANDIDATES = 500; - private static final int DEFAULT_QUERY_PROBES = 3; - - - public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) { - super(idxDefn, nodeName, defn); - if (this.useInSimilarity) { - similaritySearchParameters = new SimilaritySearchParameters( - getOptionalValue(defn, PROP_NUMBER_OF_HASH_TABLES, DEFAULT_NUMBER_OF_HASH_TABLES), - getOptionalValue(defn, PROP_NUMBER_OF_HASH_FUNCTIONS, DEFAULT_NO_OF_HASH_FUNCTIONS), - getOptionalValue(defn, PROP_NUMBER_OF_BUCKETS, DEFAULT_BUCKET_WIDTH), - getOptionalValue(defn, PROP_QUERY_MODEL, DEFAULT_SIMILARITY_QUERY_MODEL), - getOptionalValue(defn, PROP_INDEX_SIMILARITY, DEFAULT_SIMILARITY_INDEX_FUNCTION), - getOptionalValue(defn, PROP_QUERY_SIMILARITY, DEFAULT_SIMILARITY_QUERY_FUNCTION), - getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_QUERY_CANDIDATES), - getOptionalValue(defn, PROP_PROBES, DEFAULT_QUERY_PROBES)); - } + public static final String DEFAULT_SIMILARITY_METRIC = "l2_norm"; + static final String PROP_SIMILARITY_METRIC = "similarityMetric"; + private static final String PROP_SIMILARITY = "similarity"; + private static final String PROP_K = "k"; + private static final String PROP_CANDIDATES = "candidates"; + private static final float DEFAULT_SIMILARITY = 0.95f; + private static final int DEFAULT_K = 10; + private static final int DEFAULT_CANDIDATES = 500; + private KnnSearchParameters knnSearchParameters; + + public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) { + super(idxDefn, nodeName, defn); + if (this.useInSimilarity) { + knnSearchParameters = new KnnSearchParameters( + getOptionalValue(defn, PROP_SIMILARITY_METRIC, DEFAULT_SIMILARITY_METRIC), + 
getOptionalValue(defn, PROP_SIMILARITY, DEFAULT_SIMILARITY), + getOptionalValue(defn, PROP_K, DEFAULT_K), + getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_CANDIDATES)); + } + } + + public KnnSearchParameters getKnnSearchParameters() { + return knnSearchParameters; + } + + /** + * Class for defining parameters of approximate knn search on dense_vector fields + * ... and + * ... + */ + public static class KnnSearchParameters { + + public KnnSearchParameters(String similarityMetric, float similarity, int k, int candidates) { + this.similarityMetric = similarityMetric; + this.similarity = similarity; + this.k = k; + this.candidates = candidates; } /** - * Class for defining parameters for similarity search based on https://elastiknn.com/api. - * For all possible models and query combinations, see https://elastiknn.com/api/#model-and-query-compatibility + * Similarity metric used to compare query and document vectors. Possible values are l2_norm (default), cosine, + * dot_product, max_inner_product */ - public static class SimilaritySearchParameters { - - /** - * Number of hash tables. Generally, increasing this value increases recall. - */ - private final int L; - /** - * Number of hash functions combined to form a single hash value. Generally, increasing this value increases precision. - */ - private final int k; - /** - * Integer bucket width. - */ - private final int w; - /** - * Possible values - lsh, exact - */ - private final String queryModel; - /** - * Possible values l2 (with lsh or exact model), l1 (with exact model), A (angular distance - with exact model) - */ - private final String queryTimeSimilarityFunction; - /** - * Possible values l2 (with lsh or exact model), l1 (with exact model), A (angular distance - with exact model) - */ - private final String indexTimeSimilarityFunction; - /** - * Take the top vectors with the most matching hashes and compute their exact similarity to the query vector. 
The candidates parameter - * controls the number of exact similarity computations. Specifically, we compute exact similarity for the top candidates candidate vectors - * in each segment. As a reminder, each Elasticsearch index has >= 1 shards, and each shard has >= 1 segments. That means if you set - * "candidates": 200 for an index with 2 shards, each with 3 segments, then you’ll compute the exact similarity for 2 * 3 * 200 = 1200 vectors. - * candidates must be set to a number greater or equal to the number of Elasticsearch results you want to get. Higher values generally mean - * higher recall and higher latency. - */ - private final int candidates; - /** - * Number of probes for using the multiprobe search technique. Default value is zero. Max value is 3^k. Generally, increasing probes will - * increase recall, will allow you to use a smaller value for L with comparable recall, but introduces some additional computation at query time. - */ - private final int probes; - - public SimilaritySearchParameters(int l, int k, int w, String queryModel, String indexTimeSimilarityFunction, - String queryTimeSimilarityFunction, int candidates, int probes) { - L = l; - this.k = k; - this.w = w; - this.queryModel = queryModel; - this.indexTimeSimilarityFunction = indexTimeSimilarityFunction; - this.queryTimeSimilarityFunction = queryTimeSimilarityFunction; - this.candidates = candidates; - this.probes = probes; - } - - public int getL() { - return L; - } - - public int getK() { - return k; - } - - public int getW() { - return w; - } - - public String getQueryModel() { - return queryModel; - } - - public String getQueryTimeSimilarityFunction() { - return queryTimeSimilarityFunction; - } + private final String similarityMetric; + /** + * Minimum similarity for the document vector to be considered as a match. 
Required when cosine, dot_product + * or max_inner_product is set as similarityMetric + */ + private final float similarity; + /** + * Number of nearest neighbours to return. Must be <= candidates + * (the value sent to Elasticsearch as num_candidates) + */ + private final int k; - public String getIndexTimeSimilarityFunction() { - return indexTimeSimilarityFunction; - } + /** + * Number of nearest neighbour candidates to consider on each shard; it is sent to Elasticsearch as the + * num_candidates parameter of the knn query. Elasticsearch collects this many approximate candidates from every + * shard and then merges them to find the best matches. As a reminder, each Elasticsearch index has >= 1 shards. + * That means if you set "candidates": 200 for an index with 2 shards, up to 2 * 200 = 400 candidate vectors + * are collected before merging. No hash matching is involved (that was specific to the Elastiknn plugin). candidates + * must be set to a number greater or equal to the number of Elasticsearch results you want to get. Higher values + * generally mean higher recall and higher latency. 
+ */ + private final int candidates; - public int getCandidates() { - return candidates; - } + public String getSimilarityMetric() { + return similarityMetric; + } + public float getSimilarity() { + return similarity; + } - public int getProbes() { - return probes; - } + public int getK() { + return k; } - public SimilaritySearchParameters getSimilaritySearchParameters() { - return similaritySearchParameters; + public int getCandidates() { + return candidates; } + } } diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java index aa9d414786e..98ce77ba7dc 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocument.java @@ -33,7 +33,7 @@ import java.util.Set; import java.util.LinkedHashSet; -import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles; +import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toFloats; @JsonInclude(JsonInclude.Include.NON_EMPTY) public class ElasticDocument { @@ -130,7 +130,7 @@ void addProperty(String fieldName, Object value) { void addSimilarityField(String name, Blob value) throws IOException { byte[] bytes = new BlobByteSource(value).read(); - addProperty(FieldNames.createSimilarityFieldName(name), toDoubles(bytes)); + addProperty(FieldNames.createSimilarityFieldName(name), toFloats(bytes)); } void indexAncestors(String path) { diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java index 6f739a86827..eeb62ba3f4c 100644 --- 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java @@ -217,7 +217,7 @@ protected boolean indexSimilarityTag(ElasticDocument doc, PropertyState property @Override protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException { // without this check, if the vector size is not correct, the entire document will be skipped - if (pd.getSimilaritySearchDenseVectorSize() == blob.length() / 8) { + if (pd.getSimilaritySearchDenseVectorSize() == blob.length() / 4) { // see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch // see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html doc.addSimilarityField(pd.name, blob); diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java index 20fea312023..da65a83b24d 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelper.java @@ -16,7 +16,17 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic.index; +import static org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition.DEFAULT_SIMILARITY_METRIC; + +import org.apache.jackrabbit.oak.api.Type; +import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition; +import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition; +import org.apache.jackrabbit.oak.plugins.index.search.FieldNames; +import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition; +import 
org.jetbrains.annotations.NotNull; + import co.elastic.clients.elasticsearch._types.Time; +import co.elastic.clients.elasticsearch._types.mapping.DenseVectorProperty; import co.elastic.clients.elasticsearch._types.mapping.DynamicMapping; import co.elastic.clients.elasticsearch._types.mapping.Property; import co.elastic.clients.elasticsearch._types.mapping.TypeMapping; @@ -24,17 +34,8 @@ import co.elastic.clients.elasticsearch.indices.IndexSettings; import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis; import co.elastic.clients.elasticsearch.indices.PutIndicesSettingsRequest; -import co.elastic.clients.json.JsonData; import co.elastic.clients.util.ObjectBuilder; -import org.apache.jackrabbit.oak.api.Type; -import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition; -import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition; -import org.apache.jackrabbit.oak.plugins.index.search.FieldNames; -import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition; -import org.jetbrains.annotations.NotNull; -import java.io.Reader; -import java.io.StringReader; import java.util.Arrays; import java.util.List; import java.util.Map; @@ -156,10 +157,6 @@ public static PutIndicesSettingsRequest enableIndexRequest(String remoteIndexNam private static ObjectBuilder loadSettings(@NotNull IndexSettings.Builder builder, @NotNull ElasticIndexDefinition indexDefinition) { - if (!indexDefinition.getSimilarityProperties().isEmpty()) { - builder.otherSettings(ElasticIndexDefinition.ELASTIKNN, JsonData.of(true)); - } - // collect analyzer settings IndexSettingsAnalysis.Builder analyzerBuilder = ElasticCustomAnalyzer.buildCustomAnalyzers(indexDefinition.getAnalyzersNodeState(), "oak_analyzer"); @@ -263,20 +260,13 @@ private static void mapIndexRules(@NotNull TypeMapping.Builder builder, ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition; int denseVectorSize = pd.getSimilaritySearchDenseVectorSize(); - 
Reader eknnConfig = new StringReader( - "{" + - " \"type\": \"elastiknn_dense_float_vector\"," + - " \"elastiknn\": {" + - " \"dims\": " + denseVectorSize + "," + - " \"model\": \"lsh\"," + - " \"similarity\": \"" + pd.getSimilaritySearchParameters().getIndexTimeSimilarityFunction() + "\"," + - " \"L\": " + pd.getSimilaritySearchParameters().getL() + "," + - " \"k\": " + pd.getSimilaritySearchParameters().getK() + "," + - " \"w\": " + pd.getSimilaritySearchParameters().getW() + - " }" + - "}"); + DenseVectorProperty denseVectorProperty = new DenseVectorProperty.Builder() + .index(true) + .dims(denseVectorSize) + .similarity(DEFAULT_SIMILARITY_METRIC) + .build(); - builder.properties(FieldNames.createSimilarityFieldName(pd.name), b1 -> b1.withJson(eknnConfig)); + builder.properties(FieldNames.createSimilarityFieldName(pd.name), b1 -> b1.denseVector(denseVectorProperty)); } builder.properties(ElasticIndexDefinition.SIMILARITY_TAGS, diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java index ace07625f1c..8c3d466ed57 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticIndex.java @@ -16,6 +16,9 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic.query; +import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME; +import static org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition.TYPE_ELASTICSEARCH; + import org.apache.jackrabbit.oak.commons.PathUtils; import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexNode; import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexTracker; @@ -33,9 +36,6 @@ import java.util.function.BiFunction; import 
java.util.function.Predicate; -import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME; -import static org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition.TYPE_ELASTICSEARCH; - class ElasticIndex extends FulltextIndex { private static final Predicate ELASTICSEARCH_INDEX_DEFINITION_PREDICATE = state -> TYPE_ELASTICSEARCH.equals(state.getString(TYPE_PROPERTY_NAME)); diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java index e0062cc255f..47be877bbf0 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/ElasticRequestHandler.java @@ -18,7 +18,8 @@ import static org.apache.jackrabbit.JcrConstants.JCR_MIXINTYPES; import static org.apache.jackrabbit.JcrConstants.JCR_PRIMARYTYPE; -import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles; +import static org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition.DEFAULT_SIMILARITY_METRIC; +import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toFloats; import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newAncestorQuery; import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newDepthQuery; import static org.apache.jackrabbit.oak.plugins.index.elastic.util.TermQueryBuilderFactory.newPathQuery; @@ -32,31 +33,6 @@ import static org.apache.jackrabbit.oak.spi.query.QueryConstants.JCR_SCORE; import static org.apache.jackrabbit.util.ISO8601.parse; -import java.io.IOException; -import java.io.StringReader; -import java.nio.charset.StandardCharsets; -import java.util.ArrayList; 
-import java.util.Base64; -import java.util.HashMap; -import java.util.List; -import java.util.Map; -import java.util.Objects; -import java.util.Optional; -import java.util.concurrent.atomic.AtomicReference; -import java.util.function.BiConsumer; -import java.util.function.BiPredicate; -import java.util.function.Consumer; -import java.util.function.Function; -import java.util.stream.Collectors; -import java.util.stream.Stream; -import java.util.stream.StreamSupport; - -import javax.jcr.PropertyType; - -import co.elastic.clients.elasticsearch._types.query_dsl.TermQuery; -import co.elastic.clients.elasticsearch.core.search.Highlight; -import co.elastic.clients.elasticsearch.core.search.HighlightField; -import co.elastic.clients.json.JsonpUtils; import org.apache.jackrabbit.oak.api.Blob; import org.apache.jackrabbit.oak.api.PropertyState; import org.apache.jackrabbit.oak.api.Type; @@ -94,6 +70,7 @@ import org.slf4j.LoggerFactory; import co.elastic.clients.elasticsearch._types.FieldValue; +import co.elastic.clients.elasticsearch._types.KnnQuery; import co.elastic.clients.elasticsearch._types.SortOptions; import co.elastic.clients.elasticsearch._types.SortOrder; import co.elastic.clients.elasticsearch._types.SuggestMode; @@ -105,9 +82,31 @@ import co.elastic.clients.elasticsearch._types.query_dsl.Operator; import co.elastic.clients.elasticsearch._types.query_dsl.Query; import co.elastic.clients.elasticsearch._types.query_dsl.QueryStringQuery; +import co.elastic.clients.elasticsearch._types.query_dsl.TermQuery; import co.elastic.clients.elasticsearch._types.query_dsl.TextQueryType; +import co.elastic.clients.elasticsearch.core.search.Highlight; +import co.elastic.clients.elasticsearch.core.search.HighlightField; import co.elastic.clients.elasticsearch.core.search.InnerHits; import co.elastic.clients.elasticsearch.core.search.PhraseSuggester; +import co.elastic.clients.json.JsonpUtils; + +import java.io.IOException; +import java.io.StringReader; +import 
java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.concurrent.atomic.AtomicReference; +import java.util.function.BiConsumer; +import java.util.function.BiPredicate; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; + +import javax.jcr.PropertyType; /** * Class to map query plans into Elastic request objects. @@ -192,7 +191,10 @@ public BoolQuery.Builder baseQueryBuilder() { bqb.must(m -> m.moreLikeThis(mltQuery(mltParams))); } } else { - bqb.must(m -> m.bool(similarityQuery(queryNodePath, sp))); + similarityQuery(queryNodePath, sp).ifPresent(similarityQuery -> + bqb.filter(fb -> fb.exists(ef -> ef.field(similarityQuery.field()))) + .should(s -> s.knn(similarityQuery)) + ); } // Add should clause to improve relevance using similarity tags only when similarity is @@ -224,6 +226,61 @@ public BoolQuery.Builder baseQueryBuilder() { return bqb; } + public Optional similarityQuery(@NotNull String text, List sp) { + if (!sp.isEmpty()) { + LOG.debug("generating similarity query for {}", text); + NodeState targetNodeState = rootState; + for (String token : PathUtils.elements(text)) { + targetNodeState = targetNodeState.getChildNode(token); + } + if (!targetNodeState.exists()) { + throw new IllegalArgumentException("Could not find node " + text); + } + for (PropertyDefinition propertyDefinition : sp) { + ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition; + String propertyPath = PathUtils.getParentPath(pd.name); + String propertyName = PathUtils.getName(pd.name); + NodeState tempState = targetNodeState; + for (String token : PathUtils.elements(propertyPath)) { + if (token.isEmpty()) { + break; + } + tempState = tempState.getChildNode(token); + } + PropertyState ps = tempState.getProperty(propertyName); + Blob property = ps != 
null ? ps.getValue(Type.BINARY) : null; + if (property == null) { + LOG.warn("Couldn't find property {} on {}", pd.name, text); + continue; + } + byte[] bytes; + try { + bytes = new BlobByteSource(property).read(); + } catch (IOException e) { + LOG.error("Error reading bytes from property {} on {}", pd.name, text, e); + continue; + } + + String similarityPropFieldName = FieldNames.createSimilarityFieldName(pd.name); + KnnQuery knnQuery = baseKnnQueryBuilder(similarityPropFieldName, bytes, pd).build(); + return Optional.of(knnQuery); + } + } + return Optional.empty(); + } + + @NotNull + private KnnQuery.Builder baseKnnQueryBuilder(String similarityPropFieldName, byte[] bytes, ElasticPropertyDefinition pd) { + KnnQuery.Builder knnQueryBuilder = new KnnQuery.Builder() + .field(similarityPropFieldName) + .queryVector(toFloats(bytes)) + .numCandidates(pd.getKnnSearchParameters().getCandidates()); + if (!pd.getKnnSearchParameters().getSimilarityMetric().equals(DEFAULT_SIMILARITY_METRIC)) { + knnQueryBuilder.similarity(pd.getKnnSearchParameters().getSimilarity()); + } + return knnQueryBuilder; + } + public @NotNull List baseSorts() { List sortOrder = indexPlan.getSortOrder(); if (sortOrder == null || sortOrder.isEmpty()) { @@ -303,68 +360,6 @@ public Stream facetFields() { .map(pr -> FulltextIndex.parseFacetField(pr.first.getValue(Type.STRING))); } - private BoolQuery similarityQuery(@NotNull String text, List sp) { - BoolQuery.Builder query = new BoolQuery.Builder(); - if (!sp.isEmpty()) { - LOG.debug("generating similarity query for {}", text); - NodeState targetNodeState = rootState; - for (String token : PathUtils.elements(text)) { - targetNodeState = targetNodeState.getChildNode(token); - } - if (!targetNodeState.exists()) { - throw new IllegalArgumentException("Could not find node " + text); - } - for (PropertyDefinition propertyDefinition : sp) { - ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition; - String propertyPath = 
PathUtils.getParentPath(pd.name); - String propertyName = PathUtils.getName(pd.name); - NodeState tempState = targetNodeState; - for (String token : PathUtils.elements(propertyPath)) { - if (token.isEmpty()) { - break; - } - tempState = tempState.getChildNode(token); - } - PropertyState ps = tempState.getProperty(propertyName); - Blob property = ps != null ? ps.getValue(Type.BINARY) : null; - if (property == null) { - LOG.warn("Couldn't find property {} on {}", pd.name, text); - continue; - } - byte[] bytes; - try { - bytes = new BlobByteSource(property).read(); - } catch (IOException e) { - LOG.error("Error reading bytes from property {} on {}", pd.name, text, e); - continue; - } - - String similarityPropFieldName = FieldNames.createSimilarityFieldName(pd.name); - String knnQuery = "{" + - " \"elastiknn_nearest_neighbors\": {" + - " \"field\": \"" + similarityPropFieldName + "\"," + - " \"model\": \"" + pd.getSimilaritySearchParameters().getQueryModel() + "\"," + - " \"similarity\": \"" + pd.getSimilaritySearchParameters().getQueryTimeSimilarityFunction() + "\"," + - " \"candidates\": " + pd.getSimilaritySearchParameters().getCandidates() + "," + - " \"probes\": " + pd.getSimilaritySearchParameters().getProbes() + "," + - " \"vec\": {" + - " \"values\": [" + - toDoubles(bytes).stream().map(Objects::toString).collect(Collectors.joining(",")) + - " ]" + - " }" + - " }" + - "}"; - - query - .filter(fb -> fb.exists(ef -> ef.field(similarityPropFieldName))) - .should(s -> s - .wrapper(w -> w.query(Base64.getEncoder().encodeToString(knnQuery.getBytes(StandardCharsets.UTF_8)))) - ); - } - } - return query.build(); - } - /* * Generates mlt query builder from the given mltQueryString There could be 2 * cases here - 1) select [jcr:path] from [nt:base] where similar(., '/test/a') @@ -578,7 +573,7 @@ private boolean visitTerm(String propertyName, String text, String boost, boolea return true; } }); - + return Query.of(q -> q.bool(result.get())); } diff --git 
a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/async/ElasticResultRowAsyncIterator.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/async/ElasticResultRowAsyncIterator.java index 896c8a8fd56..a039d9bde30 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/async/ElasticResultRowAsyncIterator.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/async/ElasticResultRowAsyncIterator.java @@ -16,20 +16,10 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic.query.async; -import co.elastic.clients.elasticsearch._types.ElasticsearchException; -import co.elastic.clients.elasticsearch._types.FieldValue; -import co.elastic.clients.elasticsearch._types.query_dsl.Query; -import co.elastic.clients.elasticsearch._types.SortOptions; -import co.elastic.clients.elasticsearch.core.SearchRequest; -import co.elastic.clients.elasticsearch.core.SearchResponse; -import co.elastic.clients.elasticsearch.core.search.Highlight; -import co.elastic.clients.elasticsearch.core.search.Hit; -import co.elastic.clients.elasticsearch.core.search.SourceConfig; -import co.elastic.clients.elasticsearch.core.search.TotalHitsRelation; -import co.elastic.clients.json.JsonpUtils; import com.fasterxml.jackson.databind.node.ObjectNode; -import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticMetricHandler; + import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexNode; +import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticMetricHandler; import org.apache.jackrabbit.oak.plugins.index.elastic.query.ElasticQueryIterator; import org.apache.jackrabbit.oak.plugins.index.elastic.query.ElasticRequestHandler; import org.apache.jackrabbit.oak.plugins.index.elastic.query.ElasticResponseHandler; @@ -41,6 +31,18 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import 
co.elastic.clients.elasticsearch._types.ElasticsearchException; +import co.elastic.clients.elasticsearch._types.FieldValue; +import co.elastic.clients.elasticsearch._types.SortOptions; +import co.elastic.clients.elasticsearch._types.query_dsl.Query; +import co.elastic.clients.elasticsearch.core.SearchRequest; +import co.elastic.clients.elasticsearch.core.SearchResponse; +import co.elastic.clients.elasticsearch.core.search.Highlight; +import co.elastic.clients.elasticsearch.core.search.Hit; +import co.elastic.clients.elasticsearch.core.search.SourceConfig; +import co.elastic.clients.elasticsearch.core.search.TotalHitsRelation; +import co.elastic.clients.json.JsonpUtils; + import java.util.ArrayList; import java.util.BitSet; import java.util.Collections; @@ -274,7 +276,6 @@ class ElasticQueryScanner { // use a smaller size when the query contains aggregations. This improves performance // when the client is only interested in insecure facets .size(needsAggregations.get() ? Math.min(SMALL_RESULT_SET_SIZE, getFetchSize(requests)) : getFetchSize(requests)); - if (needsAggregations.get()) { builder.aggregations(elasticRequestHandler.aggregations()); } diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/async/facets/ElasticStatisticalFacetAsyncProvider.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/async/facets/ElasticStatisticalFacetAsyncProvider.java index bc793853087..c26139b93df 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/async/facets/ElasticStatisticalFacetAsyncProvider.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/query/async/facets/ElasticStatisticalFacetAsyncProvider.java @@ -16,16 +16,9 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic.query.async.facets; -import co.elastic.clients.elasticsearch._types.aggregations.Aggregate; -import 
co.elastic.clients.elasticsearch._types.aggregations.StringTermsBucket; -import co.elastic.clients.elasticsearch._types.query_dsl.BoolQuery; -import co.elastic.clients.elasticsearch._types.query_dsl.Query; -import co.elastic.clients.elasticsearch.core.SearchRequest; -import co.elastic.clients.elasticsearch.core.SearchResponse; -import co.elastic.clients.elasticsearch.core.search.Hit; -import co.elastic.clients.elasticsearch.core.search.SourceConfig; import com.fasterxml.jackson.databind.JsonNode; import com.fasterxml.jackson.databind.node.ObjectNode; + import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticConnection; import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition; import org.apache.jackrabbit.oak.plugins.index.elastic.query.ElasticRequestHandler; @@ -35,6 +28,15 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import co.elastic.clients.elasticsearch._types.aggregations.Aggregate; +import co.elastic.clients.elasticsearch._types.aggregations.StringTermsBucket; +import co.elastic.clients.elasticsearch._types.query_dsl.BoolQuery; +import co.elastic.clients.elasticsearch._types.query_dsl.Query; +import co.elastic.clients.elasticsearch.core.SearchRequest; +import co.elastic.clients.elasticsearch.core.SearchResponse; +import co.elastic.clients.elasticsearch.core.search.Hit; +import co.elastic.clients.elasticsearch.core.search.SourceConfig; + import java.util.ArrayList; import java.util.HashMap; import java.util.List; diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java index 1a2f5883aea..5332b1c2450 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/ElasticIndexUtils.java @@ -16,6 
+16,10 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic.util; +import org.jetbrains.annotations.NotNull; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + import java.nio.ByteBuffer; import java.nio.charset.StandardCharsets; import java.security.MessageDigest; @@ -23,10 +27,6 @@ import java.util.ArrayList; import java.util.List; -import org.jetbrains.annotations.NotNull; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - public class ElasticIndexUtils { private static final Logger LOG = LoggerFactory.getLogger(ElasticIndexUtils.class); @@ -53,36 +53,36 @@ public static String idFromPath(@NotNull String path) { } /** - * Converts a given byte array (of doubles) to a list of doubles + * Converts a given byte array (of floats) to a list of floats * @param array given byte array - * @return list of doubles + * @return list of floats */ - public static List toDoubles(byte[] array) { - int blockSize = Double.SIZE / Byte.SIZE; + public static List toFloats(byte[] array) { + int blockSize = Float.SIZE / Byte.SIZE; ByteBuffer wrap = ByteBuffer.wrap(array); if (array.length % blockSize != 0) { LOG.warn("Unexpected byte array length {}", array.length); } int capacity = array.length / blockSize; - List doubles = new ArrayList<>(capacity); + List floats = new ArrayList<>(capacity); for (int i = 0; i < capacity; i++) { - double e = wrap.getDouble(i * blockSize); - doubles.add(e); + float e = wrap.getFloat(i * blockSize); + floats.add(e); } - return doubles; + return floats; } /** - * Converts a given list of double values into a byte array - * @param values given list of doubles + * Converts a given list of float values into a byte array + * @param values given list of floats * @return byte array */ - public static byte[] toByteArray(List values) { - int blockSize = Double.SIZE / Byte.SIZE; + public static byte[] toByteArray(List values) { + int blockSize = Float.SIZE / Byte.SIZE; byte[] bytes = new byte[values.size() * blockSize]; ByteBuffer
wrap = ByteBuffer.wrap(bytes); for (int i = 0, j = 0; i < values.size(); i++, j += blockSize) { - wrap.putDouble(values.get(i)); + wrap.putFloat(values.get(i)); } return bytes; } diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/TermQueryBuilderFactory.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/TermQueryBuilderFactory.java index 33743639a45..04ded9c7f8c 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/TermQueryBuilderFactory.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/util/TermQueryBuilderFactory.java @@ -16,6 +16,9 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic.util; +import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.PATH; +import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.PATH_DEPTH; + import org.apache.jackrabbit.oak.api.PropertyValue; import org.apache.jackrabbit.oak.commons.PathUtils; import org.apache.jackrabbit.oak.plugins.index.search.FieldNames; @@ -25,15 +28,11 @@ import co.elastic.clients.elasticsearch._types.FieldValue; import co.elastic.clients.elasticsearch._types.query_dsl.Query; -import co.elastic.clients.json.JsonData; import java.util.List; import java.util.function.Function; import java.util.stream.Collectors; -import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.PATH; -import static org.apache.jackrabbit.oak.plugins.index.search.FieldNames.PATH_DEPTH; - public class TermQueryBuilderFactory { /** * Private constructor. 
diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java index dd1ea6839d5..6195152a2eb 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticSimilarQueryTest.java @@ -16,11 +16,11 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic; -import co.elastic.clients.elasticsearch._types.mapping.FieldMapping; -import co.elastic.clients.elasticsearch._types.mapping.Property; -import co.elastic.clients.elasticsearch.indices.GetFieldMappingResponse; -import co.elastic.clients.elasticsearch.indices.get_field_mapping.TypeFieldMappings; -import jakarta.json.JsonObject; +import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray; +import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toFloats; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; + import org.apache.commons.io.IOUtils; import org.apache.jackrabbit.oak.api.Blob; import org.apache.jackrabbit.oak.api.Tree; @@ -30,6 +30,12 @@ import org.apache.jackrabbit.oak.plugins.index.search.util.IndexDefinitionBuilder; import org.junit.Test; +import co.elastic.clients.elasticsearch._types.mapping.DenseVectorProperty; +import co.elastic.clients.elasticsearch._types.mapping.FieldMapping; +import co.elastic.clients.elasticsearch._types.mapping.Property; +import co.elastic.clients.elasticsearch.indices.GetFieldMappingResponse; +import co.elastic.clients.elasticsearch.indices.get_field_mapping.TypeFieldMappings; + import java.io.ByteArrayInputStream; import java.io.File; import java.net.URI; @@ -47,11 +53,6 @@ import java.util.stream.Collectors; import java.util.stream.Stream; 
-import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toByteArray; -import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotEquals; - public class ElasticSimilarQueryTest extends ElasticAbstractQueryTest { @Test @@ -171,16 +172,14 @@ public void similarityTagsAffectRelevance() throws Exception { } @Test - public void vectorSimilarityElastiknnIndexConfiguration() throws Exception { + public void vectorSimilarityIndexConfiguration() throws Exception { final String indexName = "test1"; final String fieldName1 = "fv1"; final String similarityFieldName1 = FieldNames.createSimilarityFieldName(fieldName1); IndexDefinitionBuilder builder = createIndex(fieldName1); Tree tree = builder.indexRule("nt:base").property(fieldName1).useInSimilarity(true).nodeScopeIndex() .similaritySearchDenseVectorSize(2048).getBuilderTree(); - tree.setProperty(ElasticPropertyDefinition.PROP_INDEX_SIMILARITY, "cosine"); - tree.setProperty(ElasticPropertyDefinition.PROP_NUMBER_OF_HASH_TABLES, 10); - tree.setProperty(ElasticPropertyDefinition.PROP_NUMBER_OF_HASH_FUNCTIONS, 12); + tree.setProperty(ElasticPropertyDefinition.PROP_SIMILARITY_METRIC, "cosine"); Tree index = setIndex(indexName, builder); root.commit(); @@ -196,11 +195,9 @@ public void vectorSimilarityElastiknnIndexConfiguration() throws Exception { assertEquals("More than one index found", 1, mappings.size()); Map typeFieldMappings = mappings.entrySet().iterator().next().getValue().mappings(); Property v = typeFieldMappings.get(similarityFieldName1).mapping().get(similarityFieldName1); - JsonObject map1 = v._custom().toJson().asJsonObject().get("elastiknn").asJsonObject(); - assertEquals("Dense vector size doesn't match", 2048, map1.getInt("dims")); - assertEquals("Similarity doesn't match", "cosine", map1.getString("similarity")); - assertEquals("Similarity doesn't match", 10, 
map1.getInt("L")); - assertEquals("Similarity doesn't match", 12, map1.getInt("k")); + DenseVectorProperty denseVector = v.denseVector(); + assertEquals("Dense vector size doesn't match", 2048, denseVector.dims().intValue()); + assertEquals("Similarity metric doesn't match", "l2_norm", denseVector.similarity()); } @Test @@ -217,9 +214,9 @@ public void vectorSimilarityWithWrongVectorSizes() throws Exception { for (String line : IOUtils.readLines(Files.newInputStream(file.toPath()), Charset.defaultCharset())) { String[] split = line.split(","); - List values = Stream.of(split).skip(1).map(Double::parseDouble).collect(Collectors.toList()); + List values = Stream.of(split).skip(1).map(Float::parseFloat).collect(Collectors.toList()); byte[] bytes = toByteArray(values); - List actual = toDoubles(bytes); + List actual = toFloats(bytes); assertEquals(values, actual); Blob blob = root.createBlob(new ByteArrayInputStream(bytes)); @@ -247,9 +244,9 @@ public void vectorSimilarity() throws Exception { List children = new LinkedList<>(); for (String line : IOUtils.readLines(Files.newInputStream(file.toPath()), Charset.defaultCharset())) { String[] split = line.split(","); - List values = Stream.of(split).skip(1).map(Double::parseDouble).collect(Collectors.toList()); + List values = Stream.of(split).skip(1).map(Float::parseFloat).collect(Collectors.toList()); byte[] bytes = toByteArray(values); - List actual = toDoubles(bytes); + List actual = toFloats(bytes); assertEquals(values, actual); Blob blob = root.createBlob(new ByteArrayInputStream(bytes)); @@ -259,11 +256,6 @@ public void vectorSimilarity() throws Exception { children.add(child.getPath()); } - // add a node without FV, the plugin cannot handle it directly - Tree child = test.addChild("nofv"); - child.setProperty("nofv", "test"); - children.add(child.getPath()); - root.commit(); // check that similarity changes across different feature vectors @@ -325,17 +317,17 @@ public void vectorSimilarityLargeData() throws 
Exception { for (int i = 0; i < 2000; i++) { String imageName = "img" + i; imageNameList.add(imageName); - List values = new ArrayList<>(); + List values = new ArrayList<>(); float[] imageData = new float[featureVectorLength]; for (int j = 0; j < featureVectorLength; j++) { - double x = r.nextDouble() * 0.5; - double g = 30 * Math.pow(x, 3); + float x = r.nextFloat() * 0.5f; + float g = (float) (30 * Math.pow(x, 3)); values.add(g); - imageData[j] = (float) g; + imageData[j] = g; } imageDataList.add(imageData); byte[] bytes = toByteArray(values); - List actual = toDoubles(bytes); + List actual = toFloats(bytes); assertEquals(values, actual); Blob blob = root.createBlob(new ByteArrayInputStream(bytes)); Tree child = test.addChild(imageName); diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java index 1d42f8cfa9b..9153e92958e 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/ElasticTestServer.java @@ -16,9 +16,10 @@ */ package org.apache.jackrabbit.oak.plugins.index.elastic; -import co.elastic.clients.transport.Version; +import static org.junit.Assume.assumeNotNull; + import com.github.dockerjava.api.DockerClient; -import org.apache.jackrabbit.oak.commons.IOUtils; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.testcontainers.DockerClientFactory; @@ -27,31 +28,12 @@ import org.testcontainers.elasticsearch.ElasticsearchContainer; import org.testcontainers.utility.MountableFile; -import java.io.File; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.net.URL; -import java.security.DigestInputStream; -import java.security.MessageDigest; -import 
java.security.NoSuchAlgorithmException; -import java.util.Map; +import co.elastic.clients.transport.Version; -import static org.junit.Assume.assumeNotNull; +import java.io.IOException; public class ElasticTestServer implements AutoCloseable { private static final Logger LOG = LoggerFactory.getLogger(ElasticTestServer.class); - private static final Map PLUGIN_OFFICIAL_RELEASES_DIGEST_MAP = Map.of( - "7.17.7.0", "4252eb55cc7775f1b889d624ac335abfa2e357931c40d0accb4d520144246b8b", - "8.3.3.0", "14d3223456f4b9f00f86628ec8400cb46513935e618ae0f5d0d1088739ccc233", - "8.4.3.0", "5c00d43cdd56c5c5d8e9032ad507acea482fb5ca9445861c5cc12ad63af66425", - "8.5.3.0", "d4c13f68650f9df5ff8c74ec83abc2e416de9c45f991d459326e0e2baf7b0e3f", - "8.7.1.0", "80c8d34334b0cf4def79835ea6dab78b59ba9ee54c8f5f3cba0bde53123d7820", - "8.10.4.0", "b2ae8faf1e272319594b4d47a72580fa4f61a5c11cbc8d3f13453fd34b153441", - "8.11.0.0", "8d4d80b850c4da4da6dfe2d675b2e2355d2014307f8bdc54cc1b34323c81c7ae", - "8.11.3.0", "1f14b496baf973fb5c64e77fc458d9814dd6905170d7b15350f9f1a009824f41", - "8.13.2.0", "586f553b109266d7996265f3f34a20914b569d494b49da2c0534428770e551f0", - "8.15.0.0", "6cbb54d68d654a3476df0b730856cfa3194bce5c6e1050a35e7a86ffec8a3e20"); private static final ElasticTestServer SERVER = new ElasticTestServer(); private static volatile ElasticsearchContainer CONTAINER; @@ -83,11 +65,7 @@ private synchronized void setup() { if (esDockerImageVersion == null) { esDockerImageVersion = Version.VERSION.toString(); } - final String pluginVersion = esDockerImageVersion + ".0"; - final String pluginFileName = "elastiknn-" + pluginVersion + ".zip"; - final String localPluginPath = "target/" + pluginFileName; LOG.info("Elasticsearch test Docker image version: {}.", esDockerImageVersion); - downloadSimilaritySearchPluginIfNotExists(localPluginPath, pluginVersion); checkIfDockerClientAvailable(); Network network = Network.newNetwork(); CONTAINER = new 
ElasticsearchContainer("docker.elastic.co/elasticsearch/elasticsearch:" + esDockerImageVersion) @@ -95,13 +73,6 @@ private synchronized void setup() { .withCopyFileToContainer( MountableFile.forClasspathResource("elasticsearch.yml"), "/usr/share/elasticsearch/config/elasticsearch.yml") - // https://www.elastic.co/guide/en/elasticsearch/plugins/8.4/manage-plugins-using-configuration-file.html - .withCopyFileToContainer( - MountableFile.forClasspathResource("elasticsearch-plugins.yml"), - "/usr/share/elasticsearch/config/elasticsearch-plugins.yml") - .withCopyFileToContainer( - MountableFile.forHostPath(localPluginPath), - "/tmp/plugins/elastiknn.zip") .withNetwork(network) .withNetworkAliases("elasticsearch") .withStartupAttempts(3); @@ -124,55 +95,6 @@ public void close() { CONTAINER = null; } - private void downloadSimilaritySearchPluginIfNotExists(String localPluginPath, String pluginVersion) { - File pluginFile = new File(localPluginPath); - if (!pluginFile.exists()) { - LOG.info("Plugin file {} doesn't exist. Trying to download.", localPluginPath); - String pluginUri; - String pluginDigest; - if (PLUGIN_OFFICIAL_RELEASES_DIGEST_MAP.containsKey(pluginVersion)) { - pluginDigest = PLUGIN_OFFICIAL_RELEASES_DIGEST_MAP.get(pluginVersion); - pluginUri = "https://github.com/alexklibisz/elastiknn/releases/download/" + pluginVersion - + "/elastiknn-" + pluginVersion + ".zip"; - } else { - pluginDigest = null; // Skip validation - pluginUri = ElasticTestUtils.ELASTIC_KNN_PLUGIN_URI; - if (pluginUri == null) { - throw new RuntimeException("Elastiknn " + pluginVersion + " is not a known official release, so it cannot be downloaded from the official GitHub repo. 
Please provide the download URI in system property \"" + ElasticTestUtils.ELASTIC_KNN_PLUGIN_URI_KEY + "\"."); - } - } - LOG.info("Downloading Elastiknn plugin from {}.", pluginUri); - try { - try (InputStream inputStream = new URL(pluginUri).openStream(); - FileOutputStream outputStream = new FileOutputStream(pluginFile) - ) { - if (pluginDigest != null) { - MessageDigest messageDigest = MessageDigest.getInstance("SHA-256"); - DigestInputStream dis = new DigestInputStream(inputStream, messageDigest); - IOUtils.copy(dis, outputStream); - messageDigest = dis.getMessageDigest(); - // bytes to hex - StringBuilder result = new StringBuilder(); - for (byte b : messageDigest.digest()) { - result.append(String.format("%02x", b)); - } - if (!pluginDigest.equals(result.toString())) { - String deleteString = "Downloaded plugin file deleted."; - if (!pluginFile.delete()) { - deleteString = "Could not delete downloaded plugin file."; - } - throw new RuntimeException("Plugin digest unequal. Found " + result + ". Expected " + pluginDigest + ". 
" + deleteString); - } - } else { - IOUtils.copy(inputStream, outputStream); - } - } - } catch (IOException | NoSuchAlgorithmException e) { - throw new RuntimeException("Could not download similarity search plugin", e); - } - } - } - private void checkIfDockerClientAvailable() { DockerClient client = null; try { diff --git a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java index d4dcddb2de6..131681c2b94 100644 --- a/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java +++ b/oak-search-elastic/src/test/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticIndexHelperTest.java @@ -25,7 +25,6 @@ import co.elastic.clients.elasticsearch.indices.CreateIndexRequest; import co.elastic.clients.elasticsearch.indices.IndexSettings; import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis; -import co.elastic.clients.json.JsonData; import org.apache.jackrabbit.oak.api.Tree; import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition; import org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexDefinitionBuilder; @@ -35,8 +34,6 @@ import org.jetbrains.annotations.NotNull; import org.junit.Test; -import java.util.Map; - import static org.hamcrest.CoreMatchers.notNullValue; import static org.hamcrest.CoreMatchers.is; import static org.hamcrest.MatcherAssert.assertThat; @@ -119,9 +116,6 @@ public void indexSettingsAreCorrectlySet() { assertThat(wdgfDef.preserveOriginal(), is(expectedIndexOriginalTerm)); assertThat(wdgfDef.splitOnCaseChange(), is(expectedSplitOnCaseChange)); assertThat(wdgfDef.splitOnNumerics(), is(expectedSplitOnNumerics)); - - Map otherSettings = req.settings().otherSettings(); - assertThat(otherSettings.get(ElasticIndexDefinition.ELASTIKNN).to(Boolean.class), is(true)); 
} @Test diff --git a/oak-search-elastic/src/test/resources/elasticsearch-plugins.yml b/oak-search-elastic/src/test/resources/elasticsearch-plugins.yml deleted file mode 100644 index 96841ed0a3a..00000000000 --- a/oak-search-elastic/src/test/resources/elasticsearch-plugins.yml +++ /dev/null @@ -1,19 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-# -plugins: - - id: elastiknn - location: file:///tmp/plugins/elastiknn.zip \ No newline at end of file diff --git a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java index 3c25d2ed8bc..9310058a135 100644 --- a/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java +++ b/oak-search/src/main/java/org/apache/jackrabbit/oak/plugins/index/search/PropertyDefinition.java @@ -18,7 +18,16 @@ */ package org.apache.jackrabbit.oak.plugins.index.search; -import javax.jcr.PropertyType; +import static org.apache.jackrabbit.guava.common.collect.ImmutableList.copyOf; +import static org.apache.jackrabbit.guava.common.collect.Iterables.toArray; +import static org.apache.jackrabbit.oak.commons.PathUtils.elements; +import static org.apache.jackrabbit.oak.commons.PathUtils.isAbsolute; +import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.FIELD_BOOST; +import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_IS_REGEX; +import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_SIMILARITY_SEARCH_DENSE_VECTOR_SIZE; +import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_WEIGHT; +import static org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner.DEFAULT_PROPERTY_WEIGHT; +import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue; import org.apache.jackrabbit.oak.api.PropertyState; import org.apache.jackrabbit.oak.api.Type; @@ -32,16 +41,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import static org.apache.jackrabbit.guava.common.collect.ImmutableList.copyOf; -import static org.apache.jackrabbit.guava.common.collect.Iterables.toArray; -import static org.apache.jackrabbit.oak.commons.PathUtils.elements; 
-import static org.apache.jackrabbit.oak.commons.PathUtils.isAbsolute; -import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.FIELD_BOOST; -import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_IS_REGEX; -import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_WEIGHT; -import static org.apache.jackrabbit.oak.plugins.index.search.FulltextIndexConstants.PROP_SIMILARITY_SEARCH_DENSE_VECTOR_SIZE; -import static org.apache.jackrabbit.oak.plugins.index.search.spi.query.FulltextIndexPlanner.DEFAULT_PROPERTY_WEIGHT; -import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue; +import javax.jcr.PropertyType; public class PropertyDefinition { private static final Logger log = LoggerFactory.getLogger(PropertyDefinition.class); From 3e169dfdc7ad9af95781b456cba2cc790201cc61 Mon Sep 17 00:00:00 2001 From: Julian Reschke Date: Thu, 22 Aug 2024 16:17:05 +0100 Subject: [PATCH 2/4] Revert "OAK-10992: rdbmk: update to Derby 15.x (#1613)" This reverts commit 8646da6c21eb057375704aba46748969b3831ecd. 
--- oak-parent/pom.xml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/oak-parent/pom.xml b/oak-parent/pom.xml index 3181bb768b7..7fdae2f59db 100644 --- a/oak-parent/pom.xml +++ b/oak-parent/pom.xml @@ -64,7 +64,7 @@ 1.2.13 2.1.214 1.28.5 - 10.15.2.0 + 10.14.2.0 2.17.1 1.19.7 4.13.1 @@ -1002,11 +1002,6 @@ derby ${derby.version} - - org.apache.derby - derbyshared - ${derby.version} - From e634b739aa5161e341baed487ddb6d29f8c3b5dc Mon Sep 17 00:00:00 2001 From: Julian Reschke Date: Sat, 24 Aug 2024 13:34:28 +0200 Subject: [PATCH 3/4] OAK-11058: Update (shaded) Guava to 33.3.0 (#1671) --- oak-shaded-guava/pom.xml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/oak-shaded-guava/pom.xml b/oak-shaded-guava/pom.xml index dc616dce0e2..59513e1afb8 100644 --- a/oak-shaded-guava/pom.xml +++ b/oak-shaded-guava/pom.xml @@ -28,8 +28,8 @@ bundle - 33.2.1-jre - 33.2.1 + 33.3.0-jre + 33.3.0 org.apache.jackrabbit.guava From a45f542c7578f6afb52b679d6a761ca4483a7d8f Mon Sep 17 00:00:00 2001 From: sii62122 Date: Mon, 26 Aug 2024 08:33:31 +0200 Subject: [PATCH 4/4] [OAK-10953] Update log statement and extract BLOB_LENGTH_DIVISOR constant --- .../plugins/index/elastic/index/ElasticDocumentMaker.java | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java index eeb62ba3f4c..5b92161da8f 100644 --- a/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java +++ b/oak-search-elastic/src/main/java/org/apache/jackrabbit/oak/plugins/index/elastic/index/ElasticDocumentMaker.java @@ -39,6 +39,7 @@ public class ElasticDocumentMaker extends FulltextDocumentMaker { private static final Logger LOG = LoggerFactory.getLogger(ElasticDocumentMaker.class); + private 
static final int BLOB_LENGTH_DIVISOR = 4; public ElasticDocumentMaker(@Nullable FulltextBinaryTextExtractor textExtractor, @NotNull IndexDefinition definition, @@ -217,13 +218,13 @@ protected boolean indexSimilarityTag(ElasticDocument doc, PropertyState property @Override protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException { // without this check, if the vector size is not correct, the entire document will be skipped - if (pd.getSimilaritySearchDenseVectorSize() == blob.length() / 4) { + if (pd.getSimilaritySearchDenseVectorSize() == blob.length() / BLOB_LENGTH_DIVISOR) { // see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch // see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html doc.addSimilarityField(pd.name, blob); } else { LOG.warn("[{}] Ignoring binary property {} for path {}. Expected dimension is {} but got {}", - getIndexName(), pd.name, this.path, pd.getSimilaritySearchDenseVectorSize(), blob.length() / 8); + getIndexName(), pd.name, this.path, pd.getSimilaritySearchDenseVectorSize(), blob.length() / BLOB_LENGTH_DIVISOR); } }