From 33624d65cf0b6d0d54eb106aaaa02b736ca2b1de Mon Sep 17 00:00:00 2001 From: Vijayan Balasubramanian Date: Thu, 26 Dec 2024 11:13:20 -0800 Subject: [PATCH 1/3] Have one score definition for cosinesimilarity Currently we have different score calculations for cosine similarity; for example, script score, approximate search, and exact search each have a different formula to convert distance to cosine similarity that is aligned with OpenSearch score. To keep it consistent, we will be using one definition, which is used by Lucene as the standard definition for cosine similarity, for all search types. Signed-off-by: Vijayan Balasubramanian --- CHANGELOG.md | 1 + .../org/opensearch/knn/index/SpaceType.java | 14 ++++- .../knn/plugin/script/KNNScoringSpace.java | 7 ++- .../org/opensearch/knn/index/NmslibIT.java | 58 +++++++++++++++++++ .../plugin/script/KNNScoringSpaceTests.java | 7 ++- 5 files changed, 84 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2d87b1f3..104220866 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -24,6 +24,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), - Allow method parameter override for training based indices (#2290) https://github.com/opensearch-project/k-NN/pull/2290] - Optimizes lucene query execution to prevent unnecessary rewrites (#2305)[https://github.com/opensearch-project/k-NN/pull/2305] - Add check to directly use ANN Search when filters match all docs. 
(#2320)[https://github.com/opensearch-project/k-NN/pull/2320] +- Use one formula to calculate cosine similarity (#2357)[https://github.com/opensearch-project/k-NN/pull/2357] ### Bug Fixes * Fixing the bug when a segment has no vector field present for disk based vector search (#2282)[https://github.com/opensearch-project/k-NN/pull/2282] * Allow validation for non knn index only after 2.17.0 (#2315)[https://github.com/opensearch-project/k-NN/pull/2315] diff --git a/src/main/java/org/opensearch/knn/index/SpaceType.java b/src/main/java/org/opensearch/knn/index/SpaceType.java index abe265a01..5d90071e8 100644 --- a/src/main/java/org/opensearch/knn/index/SpaceType.java +++ b/src/main/java/org/opensearch/knn/index/SpaceType.java @@ -60,9 +60,21 @@ public float scoreToDistanceTranslation(float score) { } }, COSINESIMIL("cosinesimil") { + /** + * Cosine similarity has range of [-1, 1] where -1 represents vectors are at diametrically opposite, and 1 is where + * they are identical in direction and perfectly similar. In Lucene, scores have to be in the range of [0, Float.MAX_VALUE]. 
+ * Hence, to move the range from [-1, 1] to [ 0, Float.MAX_VALUE], we convert using following formula which is adopted + * by Lucene as mentioned here + * https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73 + * We expect raw score = 1 - cosine(x,y), if underlying library returns different range or other than expected raw score, + * they should override this method to either provide valid range or convert raw score to the format as 1 - cosine and call this method + * + * @param rawScore score returned from underlying library + * @return Lucene scaled score + */ @Override public float scoreTranslation(float rawScore) { - return 1 / (1 + rawScore); + return Math.max((2.0F - rawScore) / 2.0F, 0.0F); } @Override diff --git a/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java b/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java index 71616c9fd..9744796c6 100644 --- a/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java +++ b/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java @@ -144,7 +144,12 @@ public CosineSimilarity(Object query, MappedFieldType fieldType) { protected BiFunction getScoringMethod(final float[] processedQuery) { SpaceType.COSINESIMIL.validateVector(processedQuery); float qVectorSquaredMagnitude = getVectorMagnitudeSquared(processedQuery); - return (float[] q, float[] v) -> 1 + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude); + // To be consistent, we will be using same formula used by lucene as mentioned below + // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73 + return (float[] q, float[] v) -> Math.max( + (1.0F + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude)) / 2.0F, + 0.0F + ); } } diff --git 
a/src/test/java/org/opensearch/knn/index/NmslibIT.java b/src/test/java/org/opensearch/knn/index/NmslibIT.java index 8ca436bf4..e2e7613a2 100644 --- a/src/test/java/org/opensearch/knn/index/NmslibIT.java +++ b/src/test/java/org/opensearch/knn/index/NmslibIT.java @@ -195,6 +195,64 @@ public void testEndToEnd() throws Exception { fail("Graphs are not getting evicted"); } + public void testEndToEnd_withApproxAndExactSearch_inSameIndex_ForCosineSpaceType() throws Exception { + String indexName = "test-index-1"; + String fieldName = "test-field-1"; + SpaceType spaceType = SpaceType.COSINESIMIL; + Integer dimension = testData.indexData.vectors[0].length; + + // Create an index + XContentBuilder builder = XContentFactory.jsonBuilder() + .startObject() + .startObject("properties") + .startObject(fieldName) + .field("type", "knn_vector") + .field("dimension", dimension) + .field(KNNConstants.METHOD_PARAMETER_SPACE_TYPE, spaceType.getValue()) + .startObject(KNNConstants.KNN_METHOD) + .field(KNNConstants.NAME, KNNConstants.METHOD_HNSW) + .field(KNNConstants.KNN_ENGINE, KNNEngine.NMSLIB.getName()) + .endObject() + .endObject() + .endObject() + .endObject(); + + Map mappingMap = xContentBuilderToMap(builder); + String mapping = builder.toString(); + + createKnnIndex(indexName, buildKNNIndexSettings(0), mapping); + + // Index one document + addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray()); + + // Assert we have the right number of documents in the index + refreshAllIndices(); + assertEquals(1, getDocCount(indexName)); + // update threshold setting to skip building graph + updateIndexSettings(indexName, Settings.builder().put(KNNSettings.INDEX_KNN_ADVANCED_APPROXIMATE_THRESHOLD, -1)); + // add duplicate document with different id + addKnnDoc(indexName, randomAlphaOfLength(5), fieldName, Floats.asList(testData.indexData.vectors[0]).toArray()); + assertEquals(2, getDocCount(indexName)); + final int k = 2; + // search 
index + Response response = searchKNNIndex( + indexName, + KNNQueryBuilder.builder().fieldName(fieldName).vector(testData.queries[0]).k(k).build(), + k + ); + String responseBody = EntityUtils.toString(response.getEntity()); + List knnResults = parseSearchResponse(responseBody, fieldName); + assertEquals(k, knnResults.size()); + + List actualScores = parseSearchResponseScore(responseBody, fieldName); + + // both document should have identical score + assertEquals(actualScores.get(0), actualScores.get(1), 0.001); + + // Delete index + deleteKNNIndex(indexName); + } + @SneakyThrows private void validateSearch( final String indexName, diff --git a/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java b/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java index 4fc549d6b..99e847eea 100644 --- a/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java +++ b/src/test/java/org/opensearch/knn/plugin/script/KNNScoringSpaceTests.java @@ -10,6 +10,7 @@ import java.util.Locale; import lombok.SneakyThrows; +import org.apache.lucene.index.VectorSimilarityFunction; import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.knn.KNNTestCase; import org.opensearch.knn.index.engine.KNNMethodContext; @@ -86,7 +87,11 @@ public void testCosineSimilarity_whenValid_thenSucceed() { getMappingConfigForMethodMapping(knnMethodContext, 3) ); KNNScoringSpace.CosineSimilarity cosineSimilarity = new KNNScoringSpace.CosineSimilarity(arrayListQueryObject, fieldType); - assertEquals(2F, cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), 0.1F); + assertEquals( + VectorSimilarityFunction.COSINE.compare(arrayFloat2, arrayFloat), + cosineSimilarity.getScoringMethod().apply(arrayFloat2, arrayFloat), + 0.1F + ); // invalid zero vector final List queryZeroVector = List.of(0.0f, 0.0f, 0.0f); From e1aabc9db735bcfa82bc126f3c11607b968394d3 Mon Sep 17 00:00:00 2001 From: Vijayan Balasubramanian Date: Mon, 30 Dec 2024 13:16:57 
-0800 Subject: [PATCH 2/3] update test Signed-off-by: Vijayan Balasubramanian --- src/test/java/org/opensearch/knn/index/NmslibIT.java | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/test/java/org/opensearch/knn/index/NmslibIT.java b/src/test/java/org/opensearch/knn/index/NmslibIT.java index e2e7613a2..e0ba58eb1 100644 --- a/src/test/java/org/opensearch/knn/index/NmslibIT.java +++ b/src/test/java/org/opensearch/knn/index/NmslibIT.java @@ -248,9 +248,6 @@ public void testEndToEnd_withApproxAndExactSearch_inSameIndex_ForCosineSpaceType // both document should have identical score assertEquals(actualScores.get(0), actualScores.get(1), 0.001); - - // Delete index - deleteKNNIndex(indexName); } @SneakyThrows From c2f5540389e6e3905a3c2c66fdaa2005f8c7d366 Mon Sep 17 00:00:00 2001 From: Vijayan Balasubramanian Date: Thu, 2 Jan 2025 11:20:47 -0800 Subject: [PATCH 3/3] add version check Signed-off-by: Vijayan Balasubramanian --- .../knn/index/mapper/KNNMappingConfig.java | 9 +++++ .../knn/index/mapper/LuceneFieldMapper.java | 6 ++++ .../knn/index/mapper/MethodFieldMapper.java | 6 ++++ .../knn/index/mapper/ModelFieldMapper.java | 5 +++ .../knn/plugin/script/KNNScoringSpace.java | 36 ++++++++++++++----- 5 files changed, 54 insertions(+), 8 deletions(-) diff --git a/src/main/java/org/opensearch/knn/index/mapper/KNNMappingConfig.java b/src/main/java/org/opensearch/knn/index/mapper/KNNMappingConfig.java index cd77ebd9a..eeaef9847 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/KNNMappingConfig.java +++ b/src/main/java/org/opensearch/knn/index/mapper/KNNMappingConfig.java @@ -5,6 +5,7 @@ package org.opensearch.knn.index.mapper; +import org.opensearch.Version; import org.opensearch.knn.index.engine.KNNMethodContext; import org.opensearch.knn.index.engine.qframe.QuantizationConfig; @@ -62,4 +63,12 @@ default QuantizationConfig getQuantizationConfig() { * @return the dimension of the index; for model based indices, it will be null */ int getDimension(); + + /** + * 
Returns index created Version + * @return Version + */ + default Version getIndexCreatedVersion() { + return Version.CURRENT; + } } diff --git a/src/main/java/org/opensearch/knn/index/mapper/LuceneFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/LuceneFieldMapper.java index 7990fdcab..4ceb9b4b2 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/LuceneFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/LuceneFieldMapper.java @@ -17,6 +17,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.document.KnnByteVectorField; import org.apache.lucene.document.KnnFloatVectorField; +import org.opensearch.Version; import org.opensearch.common.Explicit; import org.opensearch.knn.index.KNNVectorSimilarityFunction; import org.opensearch.knn.index.VectorDataType; @@ -73,6 +74,11 @@ public Mode getMode() { public CompressionLevel getCompressionLevel() { return knnMethodConfigContext.getCompressionLevel(); } + + @Override + public Version getIndexCreatedVersion() { + return knnMethodConfigContext.getVersionCreated(); + } } ); diff --git a/src/main/java/org/opensearch/knn/index/mapper/MethodFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/MethodFieldMapper.java index bf5bc2b51..755439ce6 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/MethodFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/MethodFieldMapper.java @@ -8,6 +8,7 @@ import org.apache.lucene.document.FieldType; import org.apache.lucene.index.DocValuesType; import org.apache.lucene.index.VectorEncoding; +import org.opensearch.Version; import org.opensearch.common.Explicit; import org.opensearch.common.xcontent.XContentFactory; import org.opensearch.knn.index.SpaceType; @@ -86,6 +87,11 @@ public CompressionLevel getCompressionLevel() { public QuantizationConfig getQuantizationConfig() { return quantizationConfig; } + + @Override + public Version getIndexCreatedVersion() { + return 
knnMethodConfigContext.getVersionCreated(); + } } ); return new MethodFieldMapper( diff --git a/src/main/java/org/opensearch/knn/index/mapper/ModelFieldMapper.java b/src/main/java/org/opensearch/knn/index/mapper/ModelFieldMapper.java index 013cb0c53..cbc7520cf 100644 --- a/src/main/java/org/opensearch/knn/index/mapper/ModelFieldMapper.java +++ b/src/main/java/org/opensearch/knn/index/mapper/ModelFieldMapper.java @@ -107,6 +107,11 @@ public QuantizationConfig getQuantizationConfig() { return quantizationConfig; } + @Override + public Version getIndexCreatedVersion() { + return indexCreatedVersion; + } + // ModelMetadata relies on cluster state which may not be available during field mapper creation. Thus, // we lazily initialize it. private void initFromModelMetadata() { diff --git a/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java b/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java index 9744796c6..b613efab2 100644 --- a/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java +++ b/src/main/java/org/opensearch/knn/plugin/script/KNNScoringSpace.java @@ -8,6 +8,7 @@ import lombok.Getter; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.search.IndexSearcher; +import org.opensearch.Version; import org.opensearch.index.mapper.MappedFieldType; import org.opensearch.knn.index.SpaceType; import org.opensearch.knn.index.VectorDataType; @@ -69,7 +70,7 @@ public KNNFieldSpace( ) { KNNVectorFieldType knnVectorFieldType = toKNNVectorFieldType(fieldType, spaceName, supportingVectorDataTypes); this.processedQuery = getProcessedQuery(query, knnVectorFieldType); - this.scoringMethod = getScoringMethod(this.processedQuery); + this.scoringMethod = getScoringMethod(this.processedQuery, knnVectorFieldType.getKnnMappingConfig().getIndexCreatedVersion()); } public ScoreScript getScoreScript( @@ -122,6 +123,10 @@ protected float[] getProcessedQuery(final Object query, final KNNVectorFieldType protected abstract 
BiFunction getScoringMethod(final float[] processedQuery); + protected BiFunction getScoringMethod(final float[] processedQuery, Version indexCreatedVersion) { + return getScoringMethod(processedQuery); + } + } class L2 extends KNNFieldSpace { @@ -141,15 +146,30 @@ public CosineSimilarity(Object query, MappedFieldType fieldType) { } @Override - protected BiFunction getScoringMethod(final float[] processedQuery) { + protected BiFunction getScoringMethod(float[] processedQuery) { + return getScoringMethod(processedQuery, Version.CURRENT); + } + + @Override + protected BiFunction getScoringMethod(final float[] processedQuery, Version indexCreatedVersion) { SpaceType.COSINESIMIL.validateVector(processedQuery); float qVectorSquaredMagnitude = getVectorMagnitudeSquared(processedQuery); - // To be consistent, we will be using same formula used by lucene as mentioned below - // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73 - return (float[] q, float[] v) -> Math.max( - (1.0F + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude)) / 2.0F, - 0.0F - ); + if (indexCreatedVersion.onOrAfter(Version.V_2_19_0)) { + // To be consistent, we will be using same formula used by lucene as mentioned below + // https://github.com/apache/lucene/blob/0494c824e0ac8049b757582f60d085932a890800/lucene/core/src/java/org/apache/lucene/index/VectorSimilarityFunction.java#L73 + // for indices that are created on or after 2.19.0 + // + // OS Score = ( 2 − cosineSimil) / 2 + // However cosineSimil = 1 - cos θ, after applying this to above formula, + // OS Score = ( 2 − ( 1 − cos θ ) ) / 2 + // which simplifies to + // OS Score = ( 1 + cos θ ) / 2 + return (float[] q, float[] v) -> Math.max( + ((1 + KNNScoringUtil.cosinesimilOptimized(q, v, qVectorSquaredMagnitude)) / 2.0F), + 0 + ); + } + return (float[] q, float[] v) -> 1 + KNNScoringUtil.cosinesimilOptimized(q, v, 
qVectorSquaredMagnitude); } }