feat: adds plugin-qps (#818)
allevo authored Oct 14, 2024
1 parent 328b1db commit 1737903
Showing 11 changed files with 1,087 additions and 1 deletion.
2 changes: 1 addition & 1 deletion packages/orama/src/types.ts
@@ -1316,7 +1316,7 @@ export type OramaPluginSync<T = unknown> = {
  beforeUpdateMultiple?: <T extends AnyOrama>(orama: T, docs: AnyDocument[]) => SyncOrAsyncValue
  afterUpdateMultiple?: <T extends AnyOrama>(orama: T, docs: AnyDocument[]) => SyncOrAsyncValue
  afterCreate?: <T extends AnyOrama>(orama: T) => SyncOrAsyncValue
- getComponents?: <TIndex extends IIndex<AnyIndexStore>, TDocumentStore, TSorter>(schema: AnySchema) => SyncOrAsyncValue<Partial<ObjectComponents<TIndex, TDocumentStore, TSorter>>>
+ getComponents?: <IndexStore extends AnyIndexStore, TDocumentStore, TSorter>(schema: AnySchema) => SyncOrAsyncValue<Partial<ObjectComponents<IIndex<IndexStore>, TDocumentStore, TSorter>>>
}

export type OramaPluginAsync<T = unknown> = Promise<OramaPluginSync<T>>
13 changes: 13 additions & 0 deletions packages/plugin-qps/LICENSE.md
@@ -0,0 +1,13 @@
Copyright 2024 OramaSearch Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
29 changes: 29 additions & 0 deletions packages/plugin-qps/README.md
@@ -0,0 +1,29 @@
# Orama Plugin Quantum Proximity Scoring

**Orama Plugin Quantum Proximity Scoring** ranks search results based on the proximity of query tokens in the document.

## Installation

To get started with **Orama Plugin QPS**, just install it with npm:

```sh
npm i @orama/plugin-qps
```

## Usage

```js
import { create } from '@orama/orama'
import { pluginQPS } from '@orama/plugin-qps'

const db = await create({
  schema: {
    description: 'string',
  },
  plugins: [ pluginQPS() ],
})
```
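
Once the database above is created, documents are inserted and searched with the standard Orama APIs; the plugin only changes how hits are scored. A minimal sketch (the sample document and query below are illustrative):

```js
import { insert, search } from '@orama/orama'

await insert(db, { description: 'The quick brown fox jumps over the lazy dog' })

// Documents where the query tokens appear close together rank higher
const results = await search(db, { term: 'quick fox' })
console.log(results.hits)
```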

# License

[Apache 2.0](/LICENSE.md)
53 changes: 53 additions & 0 deletions packages/plugin-qps/package.json
@@ -0,0 +1,53 @@
{
  "name": "@orama/plugin-qps",
  "version": "3.0.0-rc-2",
  "description": "Performant search algorithm optimized for descriptive texts",
  "keywords": [
    "orama",
    "embeddings",
    "secure proxy",
    "vector search"
  ],
  "license": "Apache-2.0",
  "main": "./dist/index.js",
  "type": "module",
  "exports": {
    ".": {
      "require": "./dist/index.cjs",
      "import": "./dist/index.js",
      "types": "./dist/index.d.ts",
      "browser": "./dist/index.global.js"
    }
  },
  "bugs": {
    "url": "https://github.com/askorama/orama/issues"
  },
  "homepage": "https://github.com/askorama/orama#readme",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/askorama/orama.git"
  },
  "sideEffects": false,
  "types": "./dist/index.d.ts",
  "files": [
    "dist"
  ],
  "scripts": {
    "build": "tsup --config tsup.lib.js",
    "lint": "exit 0",
    "test": "node --test --import tsx test/*.test.ts"
  },
  "publishConfig": {
    "access": "public"
  },
  "devDependencies": {
    "@types/node": "^20.9.0",
    "tap": "^21.0.1",
    "tsup": "^7.2.0",
    "tsx": "^4.19.1",
    "typescript": "^5.0.0"
  },
  "dependencies": {
    "@orama/orama": "workspace:*"
  }
}
246 changes: 246 additions & 0 deletions packages/plugin-qps/src/algorithm.ts
@@ -0,0 +1,246 @@
import { AnyIndexStore, AnyOrama, SearchableType, Tokenizer, VectorIndex } from "@orama/orama"
import { avl, bkd, flat, radix, bool } from '@orama/orama/trees'
import { getVectorSize, index as Index, internalDocumentIDStore, isVectorType } from '@orama/orama/components'

type InternalDocumentID = internalDocumentIDStore.InternalDocumentID;

export interface QPSIndex extends AnyIndexStore {
  indexes: Record<string, Index.Tree>
  vectorIndexes: Record<string, VectorIndex>
  searchableProperties: string[]
  searchablePropertiesWithTypes: Record<string, SearchableType>
  stats: Record<string, {
    tokenQuantums: Record<InternalDocumentID, Record<string, number>>
    tokensLength: Map<InternalDocumentID, number>
  }>
}

export function recursiveCreate<T extends AnyOrama>(indexDatastore: QPSIndex, schema: T['schema'], prefix: string) {
  for (const entry of Object.entries<SearchableType>(schema)) {
    const prop = entry[0]
    const type = entry[1]
    const path = `${prefix}${prefix ? '.' : ''}${prop}`

    if (typeof type === 'object' && !Array.isArray(type)) {
      // Nested
      recursiveCreate(indexDatastore, type, path)
      continue
    }

    if (isVectorType(type)) {
      indexDatastore.searchableProperties.push(path)
      indexDatastore.searchablePropertiesWithTypes[path] = type
      indexDatastore.vectorIndexes[path] = {
        size: getVectorSize(type),
        vectors: {}
      }
    } else {
      const isArray = /\[/.test(type as string)
      switch (type) {
        case 'boolean':
        case 'boolean[]':
          indexDatastore.indexes[path] = { type: 'Bool', node: new bool.BoolNode(), isArray }
          break
        case 'number':
        case 'number[]':
          indexDatastore.indexes[path] = { type: 'AVL', node: new avl.AVLTree<number, InternalDocumentID[]>(0, []), isArray }
          break
        case 'string':
        case 'string[]':
          indexDatastore.indexes[path] = { type: 'Radix', node: new radix.RadixTree(), isArray }
          break
        case 'enum':
        case 'enum[]':
          indexDatastore.indexes[path] = { type: 'Flat', node: new flat.FlatTree(), isArray }
          break
        case 'geopoint':
          indexDatastore.indexes[path] = { type: 'BKD', node: new bkd.BKDTree(), isArray }
          break
        default:
          throw new Error('INVALID_SCHEMA_TYPE: ' + path)
      }

      indexDatastore.searchableProperties.push(path)
      indexDatastore.searchablePropertiesWithTypes[path] = type
    }
  }
}
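
// Example (illustrative): for a schema such as
//   { title: 'string', meta: { rating: 'number' } }
// recursiveCreate registers a Radix index under the path "title" and an AVL
// index under "meta.rating", and records both paths in searchableProperties
// and searchablePropertiesWithTypes.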


const BIT_MASK_20 = 0b11111111111111111111

export function calculateTokenQuantum(prevValue: number, bit: number) {
  // if (prevValue < 0) {
  //   throw new Error("Overflow")
  // }
  // if (bit < 0 || bit > 20) {
  //   throw new Error("Invalid bit")
  // }

  const currentCount = count(prevValue)
  const currentSentenceMask = bitmask_20(prevValue)
  const newSentenceMask = currentSentenceMask | (1 << bit)
  return ((currentCount + 1) << 20) | newSentenceMask
}
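
// Worked example (illustrative): each packed value keeps an occurrence count in
// the bits above position 20 and a 20-bit "sentence" bitmask in the low bits:
//
//   let q = 0
//   q = calculateTokenQuantum(q, 0) // count(q) === 1, bitmask_20(q) === 0b001
//   q = calculateTokenQuantum(q, 2) // count(q) === 2, bitmask_20(q) === 0b101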

export function insertString(
  value: string,
  radixTree: radix.RadixTree,
  stats: QPSIndex['stats'][string],
  prop: string,
  internalId: InternalDocumentID,
  language: string | undefined,
  tokenizer: Tokenizer,
) {
  const sentences = value.split(/\.|\?|!/)

  let quantumIndex = 0
  let tokenNumber = 0
  for (const sentence of sentences) {
    const tokens = tokenizer.tokenize(sentence, language, prop)

    for (const token of tokens) {
      tokenNumber++

      // Lazily create the per-document token map before it is updated below
      if (!stats.tokenQuantums[internalId]) {
        stats.tokenQuantums[internalId] = {}
      }

      const tokenBitIndex = Math.min(
        quantumIndex,
        20
      )

      stats.tokenQuantums[internalId][token] = calculateTokenQuantum(
        stats.tokenQuantums[internalId][token],
        tokenBitIndex
      )

      radixTree.insert(token, internalId)
    }

    // Don't increment the quantum index if the sentence is too short
    if (tokens.length > 1) {
      quantumIndex++
    }
  }

  stats.tokensLength.set(internalId, tokenNumber)
}
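
// Illustrative example (assuming a plain whitespace tokenizer): for the value
// "Red fox. Quick dog.", the two sentences get quantum indexes 0 and 1, so
// "fox" is recorded with bit 0 set and "dog" with bit 1 set, and tokensLength
// for the document becomes 4. Because of Math.min(quantumIndex, 20), the 21st
// and later sentences all share bit 20.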

export function searchString(prop: {
  tokens: string[],
  radixNode: radix.RadixNode,
  exact: boolean,
  tolerance: number,
  stats: {
    tokensLength: Map<number, number>,
    tokenQuantums: Record<number, Record<string, number>>,
  },
  boostPerProp: number,
  resultMap: Map<number, [number, number]>,
}) {
  const tokens = prop.tokens
  const radixNode = prop.radixNode
  const exact = prop.exact
  const tolerance = prop.tolerance
  const stats = prop.stats
  const boostPerProp = prop.boostPerProp
  const resultMap = prop.resultMap
  const tokensLength = stats.tokensLength
  const tokenQuantums = stats.tokenQuantums

  const findParam = {
    term: '',
    exact,
    tolerance,
  }

  let foundWords = {} as Record<string, number[]>
  const tokenLength = tokens.length
  for (let i = 0; i < tokenLength; i++) {
    const term = tokens[i]
    findParam.term = term
    const results = radixNode.find(findParam)
    foundWords = {
      ...foundWords,
      ...results
    }
  }

  const foundKeys = Object.keys(foundWords)
  const foundKeysLength = foundKeys.length
  for (let i = 0; i < foundKeysLength; i++) {
    const key = foundKeys[i]
    const matchedDocs = foundWords[key]
    const matchedDocsLength = matchedDocs.length
    const isExactMatch = tokens.includes(key)

    for (let j = 0; j < matchedDocsLength; j++) {
      const docId = matchedDocs[j]

      const numberOfQuantums = tokensLength.get(docId)!
      const tokenQuantumDescriptor = tokenQuantums[docId][key]

      const occurrence = count(tokenQuantumDescriptor)
      const bitMask = bitmask_20(tokenQuantumDescriptor)
      const score = (occurrence * occurrence / numberOfQuantums + (isExactMatch ? 1 : 0)) * boostPerProp

      if (!resultMap.has(docId)) {
        resultMap.set(docId, [score, bitMask])
        continue
      }

      const current = resultMap.get(docId)!

      const totalScore = current[0]
        + numberOfOnes(current[1] & bitMask) * 2
        + score

      current[0] = totalScore
      current[1] = current[1] | bitMask
    }
  }
}
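
// Scoring sketch (informal restatement of the loops above): every matched word
// contributes (occurrences^2 / docTokenCount + exactMatchBonus) * boostPerProp,
// where docTokenCount is the tokensLength recorded at insert time and
// exactMatchBonus is 1 when the matched word equals a query token verbatim.
// If the document was already scored by an earlier word, an extra
// numberOfOnes(previousMask & bitMask) * 2 is added, rewarding query tokens
// that occur in the same sentences (overlapping bits of the 20-bit masks).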

export function bitmask_20(n: number) {
  return n & BIT_MASK_20
}

export function count(n: number) {
  return n >> 20
}

export function numberOfOnes(n: number) {
  let i = 0;
  do {
    if (n & 1) { ++i }
    // eslint-disable-next-line no-cond-assign
  } while (n >>= 1)
  return i
}

export function removeString(
  value: string,
  radixTree: radix.RadixTree,
  prop: string,
  internalId: InternalDocumentID,
  tokenizer: Tokenizer,
  language: string | undefined,
  stats: {
    tokensLength: Map<number, number>,
    tokenQuantums: Record<number, Record<string, number>>,
  },
) {
  const tokensLength = stats.tokensLength
  const tokenQuantums = stats.tokenQuantums

  const tokens = tokenizer.tokenize(value, language, prop)

  for (const token of tokens) {
    radixTree.removeDocumentByWord(token, internalId, true)
  }

  tokensLength.delete(internalId)
  delete tokenQuantums[internalId]
}