Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[OAK-10953] #1646

Open
wants to merge 4 commits into
base: trunk
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -107,9 +107,6 @@ public class ElasticIndexDefinition extends IndexDefinition {

public static final String SPLIT_ON_CASE_CHANGE = "splitOnCaseChange";
public static final String SPLIT_ON_NUMERICS = "splitOnNumerics";

public static final String ELASTIKNN = "elastiknn";

private static final String SIMILARITY_TAGS_ENABLED = "similarityTagsEnabled";
private static final boolean SIMILARITY_TAGS_ENABLED_DEFAULT = true;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,141 +16,93 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic;

import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue;

import org.apache.jackrabbit.oak.plugins.index.search.IndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.apache.jackrabbit.oak.spi.state.NodeState;

import static org.apache.jackrabbit.oak.plugins.index.search.util.ConfigUtil.getOptionalValue;

public class ElasticPropertyDefinition extends PropertyDefinition {

SimilaritySearchParameters similaritySearchParameters;

public static final String PROP_QUERY_MODEL = "queryModel";
public static final String PROP_NUMBER_OF_HASH_TABLES = "L";
public static final String PROP_NUMBER_OF_HASH_FUNCTIONS = "k";
public static final String PROP_NUMBER_OF_BUCKETS = "w";
public static final String PROP_INDEX_SIMILARITY = "indexSimilarity";
public static final String PROP_QUERY_SIMILARITY = "querySimilarity";
public static final String PROP_CANDIDATES = "candidates";
public static final String PROP_PROBES = "probes";

private static final int DEFAULT_NUMBER_OF_HASH_TABLES = 20;
private static final int DEFAULT_NO_OF_HASH_FUNCTIONS = 15;
private static final int DEFAULT_BUCKET_WIDTH = 500;
private static final String DEFAULT_SIMILARITY_QUERY_MODEL = "lsh";
private static final String DEFAULT_SIMILARITY_INDEX_FUNCTION = "l2";
private static final String DEFAULT_SIMILARITY_QUERY_FUNCTION = "l2";
private static final int DEFAULT_QUERY_CANDIDATES = 500;
private static final int DEFAULT_QUERY_PROBES = 3;


public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) {
super(idxDefn, nodeName, defn);
if (this.useInSimilarity) {
similaritySearchParameters = new SimilaritySearchParameters(
getOptionalValue(defn, PROP_NUMBER_OF_HASH_TABLES, DEFAULT_NUMBER_OF_HASH_TABLES),
getOptionalValue(defn, PROP_NUMBER_OF_HASH_FUNCTIONS, DEFAULT_NO_OF_HASH_FUNCTIONS),
getOptionalValue(defn, PROP_NUMBER_OF_BUCKETS, DEFAULT_BUCKET_WIDTH),
getOptionalValue(defn, PROP_QUERY_MODEL, DEFAULT_SIMILARITY_QUERY_MODEL),
getOptionalValue(defn, PROP_INDEX_SIMILARITY, DEFAULT_SIMILARITY_INDEX_FUNCTION),
getOptionalValue(defn, PROP_QUERY_SIMILARITY, DEFAULT_SIMILARITY_QUERY_FUNCTION),
getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_QUERY_CANDIDATES),
getOptionalValue(defn, PROP_PROBES, DEFAULT_QUERY_PROBES));
}
public static final String DEFAULT_SIMILARITY_METRIC = "l2_norm";
static final String PROP_SIMILARITY_METRIC = "similarityMetric";
private static final String PROP_SIMILARITY = "similarity";
private static final String PROP_K = "k";
private static final String PROP_CANDIDATES = "candidates";
private static final float DEFAULT_SIMILARITY = 0.95f;
private static final int DEFAULT_K = 10;
private static final int DEFAULT_CANDIDATES = 500;
private KnnSearchParameters knnSearchParameters;

public ElasticPropertyDefinition(IndexDefinition.IndexingRule idxDefn, String nodeName, NodeState defn) {
super(idxDefn, nodeName, defn);
if (this.useInSimilarity) {
knnSearchParameters = new KnnSearchParameters(
getOptionalValue(defn, PROP_SIMILARITY_METRIC, DEFAULT_SIMILARITY_METRIC),
getOptionalValue(defn, PROP_SIMILARITY, DEFAULT_SIMILARITY),
getOptionalValue(defn, PROP_K, DEFAULT_K),
getOptionalValue(defn, PROP_CANDIDATES, DEFAULT_CANDIDATES));
}
}

/**
 * Returns the knn search parameters of this property.
 *
 * @return the parameters created by the constructor; the constructor only assigns them when
 *         {@code useInSimilarity} is set, so callers should expect {@code null} for other properties
 */
public KnnSearchParameters getKnnSearchParameters() {
    return this.knnSearchParameters;
}

/**
* Class for defining parameters of approximate knn search on dense_vector fields
* <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html">...</a> and
* <a href="https://www.elastic.co/guide/en/elasticsearch/reference/current/knn-search.html">...</a>
*/
public static class KnnSearchParameters {

/**
 * Stores the given knn search settings as-is; no validation is performed here.
 *
 * @param similarityMetric metric used to compare query and document vectors
 * @param similarity       minimum similarity for a document vector to be considered a match
 * @param k                number of nearest neighbours to return
 * @param candidates       number of candidates to consider
 */
public KnnSearchParameters(String similarityMetric, float similarity, int k, int candidates) {
    this.candidates = candidates;
    this.k = k;
    this.similarity = similarity;
    this.similarityMetric = similarityMetric;
}

/**
* Class for defining parameters for similarity search based on https://elastiknn.com/api.
* For all possible models and query combinations, see https://elastiknn.com/api/#model-and-query-compatibility
* Similarity metric used to compare query and document vectors. Possible values are l2_norm (default), cosine,
* dot_product, max_inner_product
*/
public static class SimilaritySearchParameters {

/**
* Number of hash tables. Generally, increasing this value increases recall.
*/
private final int L;
/**
* Number of hash functions combined to form a single hash value. Generally, increasing this value increases precision.
*/
private final int k;
/**
* Integer bucket width.
*/
private final int w;
/**
* Possible values - lsh, exact
*/
private final String queryModel;
/**
* Possible values l2 (with lsh or exact model), l1 (with exact model), A (angular distance - with exact model)
*/
private final String queryTimeSimilarityFunction;
/**
* Possible values l2 (with lsh or exact model), l1 (with exact model), A (angular distance - with exact model)
*/
private final String indexTimeSimilarityFunction;
/**
* Take the top vectors with the most matching hashes and compute their exact similarity to the query vector. The candidates parameter
* controls the number of exact similarity computations. Specifically, we compute exact similarity for the top candidates candidate vectors
* in each segment. As a reminder, each Elasticsearch index has >= 1 shards, and each shard has >= 1 segments. That means if you set
* "candidates": 200 for an index with 2 shards, each with 3 segments, then you’ll compute the exact similarity for 2 * 3 * 200 = 1200 vectors.
* candidates must be set to a number greater or equal to the number of Elasticsearch results you want to get. Higher values generally mean
* higher recall and higher latency.
*/
private final int candidates;
/**
* Number of probes for using the multiprobe search technique. Default value is zero. Max value is 3^k. Generally, increasing probes will
* increase recall, will allow you to use a smaller value for L with comparable recall, but introduces some additional computation at query time.
*/
private final int probes;

public SimilaritySearchParameters(int l, int k, int w, String queryModel, String indexTimeSimilarityFunction,
String queryTimeSimilarityFunction, int candidates, int probes) {
L = l;
this.k = k;
this.w = w;
this.queryModel = queryModel;
this.indexTimeSimilarityFunction = indexTimeSimilarityFunction;
this.queryTimeSimilarityFunction = queryTimeSimilarityFunction;
this.candidates = candidates;
this.probes = probes;
}

public int getL() {
return L;
}

public int getK() {
return k;
}

public int getW() {
return w;
}

public String getQueryModel() {
return queryModel;
}

public String getQueryTimeSimilarityFunction() {
return queryTimeSimilarityFunction;
}
private final String similarityMetric;
/**
* Minimum similarity for the document vector to be considered as a match. Required when cosine, dot_product
* or max_inner_product is set as similarityMetric
*/
private final float similarity;
/**
* Number of nearest neighbours to return. Must be less than or equal to candidates.
*/
private final int k;

public String getIndexTimeSimilarityFunction() {
return indexTimeSimilarityFunction;
}
/**
* Take the top vectors with the most matching hashes and compute their exact similarity to the query vector. The
* candidates parameter controls the number of exact similarity computations. Specifically, we compute exact
* similarity for the top "candidates" vectors in each segment. As a reminder, each Elasticsearch index has
* >= 1 shards, and each shard has >= 1 segments. That means if you set "candidates": 200 for an index with 2
* shards, each with 3 segments, then you’ll compute the exact similarity for 2 * 3 * 200 = 1200 vectors. candidates
* must be set to a number greater than or equal to the number of Elasticsearch results you want to get. Higher
* values generally mean higher recall and higher latency.
* <p>
* NOTE(review): this description is worded the same as the one on the elastiknn-based parameters; confirm it
* still describes how candidates is applied to approximate knn search on dense_vector fields.
*/
private final int candidates;

public int getCandidates() {
return candidates;
}
/**
 * @return the similarity metric passed to the constructor
 */
public String getSimilarityMetric() {
    return this.similarityMetric;
}
/**
 * @return the minimum similarity passed to the constructor
 */
public float getSimilarity() {
    return this.similarity;
}

public int getProbes() {
return probes;
}
/**
 * @return the number of nearest neighbours to return, as passed to the constructor
 */
public int getK() {
    return this.k;
}

public SimilaritySearchParameters getSimilaritySearchParameters() {
return similaritySearchParameters;
/**
 * @return the number of candidates, as passed to the constructor
 */
public int getCandidates() {
    return this.candidates;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
import java.util.Set;
import java.util.LinkedHashSet;

import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toDoubles;
import static org.apache.jackrabbit.oak.plugins.index.elastic.util.ElasticIndexUtils.toFloats;

@JsonInclude(JsonInclude.Include.NON_EMPTY)
public class ElasticDocument {
Expand Down Expand Up @@ -130,7 +130,7 @@ void addProperty(String fieldName, Object value) {

void addSimilarityField(String name, Blob value) throws IOException {
byte[] bytes = new BlobByteSource(value).read();
addProperty(FieldNames.createSimilarityFieldName(name), toDoubles(bytes));
addProperty(FieldNames.createSimilarityFieldName(name), toFloats(bytes));
}

void indexAncestors(String path) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,7 @@ protected boolean indexSimilarityTag(ElasticDocument doc, PropertyState property
@Override
protected void indexSimilarityBinaries(ElasticDocument doc, PropertyDefinition pd, Blob blob) throws IOException {
// without this check, if the vector size is not correct, the entire document will be skipped
if (pd.getSimilaritySearchDenseVectorSize() == blob.length() / 8) {
if (pd.getSimilaritySearchDenseVectorSize() == blob.length() / 4) {
fabriziofortino marked this conversation as resolved.
Show resolved Hide resolved
// see https://www.elastic.co/blog/text-similarity-search-with-vectors-in-elasticsearch
// see https://www.elastic.co/guide/en/elasticsearch/reference/current/dense-vector.html
doc.addSimilarityField(pd.name, blob);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,25 +16,26 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic.index;

import static org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition.DEFAULT_SIMILARITY_METRIC;

import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.jetbrains.annotations.NotNull;

import co.elastic.clients.elasticsearch._types.Time;
import co.elastic.clients.elasticsearch._types.mapping.DenseVectorProperty;
import co.elastic.clients.elasticsearch._types.mapping.DynamicMapping;
import co.elastic.clients.elasticsearch._types.mapping.Property;
import co.elastic.clients.elasticsearch._types.mapping.TypeMapping;
import co.elastic.clients.elasticsearch.indices.CreateIndexRequest;
import co.elastic.clients.elasticsearch.indices.IndexSettings;
import co.elastic.clients.elasticsearch.indices.IndexSettingsAnalysis;
import co.elastic.clients.elasticsearch.indices.PutIndicesSettingsRequest;
import co.elastic.clients.json.JsonData;
import co.elastic.clients.util.ObjectBuilder;
import org.apache.jackrabbit.oak.api.Type;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticPropertyDefinition;
import org.apache.jackrabbit.oak.plugins.index.search.FieldNames;
import org.apache.jackrabbit.oak.plugins.index.search.PropertyDefinition;
import org.jetbrains.annotations.NotNull;

import java.io.Reader;
import java.io.StringReader;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -156,10 +157,6 @@ public static PutIndicesSettingsRequest enableIndexRequest(String remoteIndexNam

private static ObjectBuilder<IndexSettings> loadSettings(@NotNull IndexSettings.Builder builder,
@NotNull ElasticIndexDefinition indexDefinition) {
if (!indexDefinition.getSimilarityProperties().isEmpty()) {
builder.otherSettings(ElasticIndexDefinition.ELASTIKNN, JsonData.of(true));
}

// collect analyzer settings
IndexSettingsAnalysis.Builder analyzerBuilder =
ElasticCustomAnalyzer.buildCustomAnalyzers(indexDefinition.getAnalyzersNodeState(), "oak_analyzer");
Expand Down Expand Up @@ -263,20 +260,13 @@ private static void mapIndexRules(@NotNull TypeMapping.Builder builder,
ElasticPropertyDefinition pd = (ElasticPropertyDefinition) propertyDefinition;
int denseVectorSize = pd.getSimilaritySearchDenseVectorSize();

Reader eknnConfig = new StringReader(
"{" +
" \"type\": \"elastiknn_dense_float_vector\"," +
" \"elastiknn\": {" +
" \"dims\": " + denseVectorSize + "," +
" \"model\": \"lsh\"," +
" \"similarity\": \"" + pd.getSimilaritySearchParameters().getIndexTimeSimilarityFunction() + "\"," +
" \"L\": " + pd.getSimilaritySearchParameters().getL() + "," +
" \"k\": " + pd.getSimilaritySearchParameters().getK() + "," +
" \"w\": " + pd.getSimilaritySearchParameters().getW() +
" }" +
"}");
DenseVectorProperty denseVectorProperty = new DenseVectorProperty.Builder()
.index(true)
.dims(denseVectorSize)
.similarity(DEFAULT_SIMILARITY_METRIC)
.build();

builder.properties(FieldNames.createSimilarityFieldName(pd.name), b1 -> b1.withJson(eknnConfig));
builder.properties(FieldNames.createSimilarityFieldName(pd.name), b1 -> b1.denseVector(denseVectorProperty));
}

builder.properties(ElasticIndexDefinition.SIMILARITY_TAGS,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,9 @@
*/
package org.apache.jackrabbit.oak.plugins.index.elastic.query;

import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
import static org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition.TYPE_ELASTICSEARCH;

import org.apache.jackrabbit.oak.commons.PathUtils;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexNode;
import org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexTracker;
Expand All @@ -33,9 +36,6 @@
import java.util.function.BiFunction;
import java.util.function.Predicate;

import static org.apache.jackrabbit.oak.plugins.index.IndexConstants.TYPE_PROPERTY_NAME;
import static org.apache.jackrabbit.oak.plugins.index.elastic.ElasticIndexDefinition.TYPE_ELASTICSEARCH;

class ElasticIndex extends FulltextIndex {
private static final Predicate<NodeState> ELASTICSEARCH_INDEX_DEFINITION_PREDICATE =
state -> TYPE_ELASTICSEARCH.equals(state.getString(TYPE_PROPERTY_NAME));
Expand Down
Loading
Loading