feat: adds plugin-qps (#818)
allevo authored Oct 14, 2024
1 parent 328b1db commit 1737903
Showing 11 changed files with 1,087 additions and 1 deletion.
2 changes: 1 addition & 1 deletion packages/orama/src/types.ts
@@ -1316,7 +1316,7 @@ export type OramaPluginSync<T = unknown> = {
  beforeUpdateMultiple?: <T extends AnyOrama>(orama: T, docs: AnyDocument[]) => SyncOrAsyncValue
  afterUpdateMultiple?: <T extends AnyOrama>(orama: T, docs: AnyDocument[]) => SyncOrAsyncValue
  afterCreate?: <T extends AnyOrama>(orama: T) => SyncOrAsyncValue
- getComponents?: <TIndex extends IIndex<AnyIndexStore>, TDocumentStore, TSorter>(schema: AnySchema) => SyncOrAsyncValue<Partial<ObjectComponents<TIndex, TDocumentStore, TSorter>>>
+ getComponents?: <IndexStore extends AnyIndexStore, TDocumentStore, TSorter>(schema: AnySchema) => SyncOrAsyncValue<Partial<ObjectComponents<IIndex<IndexStore>, TDocumentStore, TSorter>>>
}

export type OramaPluginAsync<T = unknown> = Promise<OramaPluginSync<T>>
13 changes: 13 additions & 0 deletions packages/plugin-qps/LICENSE.md
@@ -0,0 +1,13 @@
Copyright 2024 OramaSearch Inc

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
29 changes: 29 additions & 0 deletions packages/plugin-qps/README.md
@@ -0,0 +1,29 @@
# Orama Plugin Quantum Proximity Scoring

**Orama Plugin Quantum Proximity Scoring** ranks search results based on the proximity of query tokens in the document.

## Installation

To get started with **Orama Plugin QPS**, just install it with npm:

```sh
npm i @orama/plugin-qps
```

## Usage

```js
import { create } from '@orama/orama'
import { pluginQPS } from '@orama/plugin-qps'

const db = await create({
  schema: {
    description: 'string',
  },
  plugins: [ pluginQPS() ],
})
```
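
Once the database above is created, documents are inserted and searched with the standard Orama APIs; the plugin only changes how hits are scored. A minimal sketch (the sample document and query below are illustrative):

```js
import { insert, search } from '@orama/orama'

await insert(db, { description: 'The quick brown fox jumps over the lazy dog' })

// Documents where the query tokens appear close together rank higher
const results = await search(db, { term: 'quick fox' })
console.log(results.hits)
```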

# License

[Apache 2.0](/LICENSE.md)
53 changes: 53 additions & 0 deletions packages/plugin-qps/package.json
@@ -0,0 +1,53 @@
{
  "name": "@orama/plugin-qps",
  "version": "3.0.0-rc-2",
  "description": "Performant search algorithm optimized for descriptive texts",
  "keywords": [
    "orama",
    "embeddings",
    "secure proxy",
    "vector search"
  ],
  "license": "Apache-2.0",
  "main": "./dist/index.js",
  "type": "module",
  "exports": {
    ".": {
      "require": "./dist/index.cjs",
      "import": "./dist/index.js",
      "types": "./dist/index.d.ts",
      "browser": "./dist/index.global.js"
    }
  },
  "bugs": {
    "url": "https://github.com/askorama/orama/issues"
  },
  "homepage": "https://github.com/askorama/orama#readme",
  "repository": {
    "type": "git",
    "url": "git+https://github.com/askorama/orama.git"
  },
  "sideEffects": false,
  "types": "./dist/index.d.ts",
  "files": [
    "dist"
  ],
  "scripts": {
    "build": "tsup --config tsup.lib.js",
    "lint": "exit 0",
    "test": "node --test --import tsx test/*.test.ts"
  },
  "publishConfig": {
    "access": "public"
  },
  "devDependencies": {
    "@types/node": "^20.9.0",
    "tap": "^21.0.1",
    "tsup": "^7.2.0",
    "tsx": "^4.19.1",
    "typescript": "^5.0.0"
  },
  "dependencies": {
    "@orama/orama": "workspace:*"
  }
}
246 changes: 246 additions & 0 deletions packages/plugin-qps/src/algorithm.ts
@@ -0,0 +1,246 @@
import { AnyIndexStore, AnyOrama, SearchableType, Tokenizer, VectorIndex } from "@orama/orama"
import { avl, bkd, flat, radix, bool } from '@orama/orama/trees'
import { getVectorSize, index as Index, internalDocumentIDStore, isVectorType } from '@orama/orama/components'

type InternalDocumentID = internalDocumentIDStore.InternalDocumentID;

export interface QPSIndex extends AnyIndexStore {
  indexes: Record<string, Index.Tree>
  vectorIndexes: Record<string, VectorIndex>
  searchableProperties: string[]
  searchablePropertiesWithTypes: Record<string, SearchableType>
  stats: Record<string, {
    tokenQuantums: Record<InternalDocumentID, Record<string, number>>
    tokensLength: Map<InternalDocumentID, number>
  }>
}

export function recursiveCreate<T extends AnyOrama>(indexDatastore: QPSIndex, schema: T['schema'], prefix: string) {
  for (const entry of Object.entries<SearchableType>(schema)) {
    const prop = entry[0]
    const type = entry[1]
    const path = `${prefix}${prefix ? '.' : ''}${prop}`

    if (typeof type === 'object' && !Array.isArray(type)) {
      // Nested
      recursiveCreate(indexDatastore, type, path)
      continue
    }

    if (isVectorType(type)) {
      indexDatastore.searchableProperties.push(path)
      indexDatastore.searchablePropertiesWithTypes[path] = type
      indexDatastore.vectorIndexes[path] = {
        size: getVectorSize(type),
        vectors: {}
      }
    } else {
      const isArray = /\[/.test(type as string)
      switch (type) {
        case 'boolean':
        case 'boolean[]':
          indexDatastore.indexes[path] = { type: 'Bool', node: new bool.BoolNode(), isArray }
          break
        case 'number':
        case 'number[]':
          indexDatastore.indexes[path] = { type: 'AVL', node: new avl.AVLTree<number, InternalDocumentID[]>(0, []), isArray }
          break
        case 'string':
        case 'string[]':
          indexDatastore.indexes[path] = { type: 'Radix', node: new radix.RadixTree(), isArray }
          break
        case 'enum':
        case 'enum[]':
          indexDatastore.indexes[path] = { type: 'Flat', node: new flat.FlatTree(), isArray }
          break
        case 'geopoint':
          indexDatastore.indexes[path] = { type: 'BKD', node: new bkd.BKDTree(), isArray }
          break
        default:
          throw new Error('INVALID_SCHEMA_TYPE: ' + path)
      }

      indexDatastore.searchableProperties.push(path)
      indexDatastore.searchablePropertiesWithTypes[path] = type
    }
  }
}
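
// Example (illustrative): for a schema such as
//   { title: 'string', meta: { rating: 'number' } }
// recursiveCreate registers a Radix index under the path "title" and an AVL
// index under "meta.rating", and records both paths in searchableProperties
// and searchablePropertiesWithTypes.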


const BIT_MASK_20 = 0b11111111111111111111

export function calculateTokenQuantum(prevValue: number, bit: number) {
  // if (prevValue < 0) {
  //   throw new Error("Overflow")
  // }
  // if (bit < 0 || bit > 20) {
  //   throw new Error("Invalid bit")
  // }

  const currentCount = count(prevValue)
  const currentSentenceMask = bitmask_20(prevValue)
  const newSentenceMask = currentSentenceMask | (1 << bit)
  return ((currentCount + 1) << 20) | newSentenceMask
}
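
// Worked example (illustrative): each packed value keeps an occurrence count in
// the bits above position 20 and a 20-bit "sentence" bitmask in the low bits:
//
//   let q = 0
//   q = calculateTokenQuantum(q, 0) // count(q) === 1, bitmask_20(q) === 0b001
//   q = calculateTokenQuantum(q, 2) // count(q) === 2, bitmask_20(q) === 0b101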

export function insertString(
  value: string,
  radixTree: radix.RadixTree,
  stats: QPSIndex['stats'][string],
  prop: string,
  internalId: InternalDocumentID,
  language: string | undefined,
  tokenizer: Tokenizer,
) {
  const sentences = value.split(/\.|\?|!/)

  let quantumIndex = 0
  let tokenNumber = 0
  for (const sentence of sentences) {
    const tokens = tokenizer.tokenize(sentence, language, prop)

    for (const token of tokens) {
      tokenNumber++

      // Lazily create the per-document token map before it is updated below
      if (!stats.tokenQuantums[internalId]) {
        stats.tokenQuantums[internalId] = {}
      }

      const tokenBitIndex = Math.min(
        quantumIndex,
        20
      )

      stats.tokenQuantums[internalId][token] = calculateTokenQuantum(
        stats.tokenQuantums[internalId][token],
        tokenBitIndex
      )

      radixTree.insert(token, internalId)
    }

    // Don't increment the quantum index if the sentence is too short
    if (tokens.length > 1) {
      quantumIndex++
    }
  }

  stats.tokensLength.set(internalId, tokenNumber)
}
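
// Illustrative example (assuming a plain whitespace tokenizer): for the value
// "Red fox. Quick dog.", the two sentences get quantum indexes 0 and 1, so
// "fox" is recorded with bit 0 set and "dog" with bit 1 set, and tokensLength
// for the document becomes 4. Because of Math.min(quantumIndex, 20), the 21st
// and later sentences all share bit 20.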

export function searchString(prop: {
  tokens: string[],
  radixNode: radix.RadixNode,
  exact: boolean,
  tolerance: number,
  stats: {
    tokensLength: Map<number, number>,
    tokenQuantums: Record<number, Record<string, number>>,
  },
  boostPerProp: number,
  resultMap: Map<number, [number, number]>,
}) {
  const tokens = prop.tokens
  const radixNode = prop.radixNode
  const exact = prop.exact
  const tolerance = prop.tolerance
  const stats = prop.stats
  const boostPerProp = prop.boostPerProp
  const resultMap = prop.resultMap
  const tokensLength = stats.tokensLength
  const tokenQuantums = stats.tokenQuantums

  const findParam = {
    term: '',
    exact,
    tolerance,
  }

  let foundWords = {} as Record<string, number[]>
  const tokenLength = tokens.length
  for (let i = 0; i < tokenLength; i++) {
    const term = tokens[i]
    findParam.term = term
    const results = radixNode.find(findParam)
    foundWords = {
      ...foundWords,
      ...results
    }
  }

  const foundKeys = Object.keys(foundWords)
  const foundKeysLength = foundKeys.length
  for (let i = 0; i < foundKeysLength; i++) {
    const key = foundKeys[i]
    const matchedDocs = foundWords[key]
    const matchedDocsLength = matchedDocs.length
    const isExactMatch = tokens.includes(key)

    for (let j = 0; j < matchedDocsLength; j++) {
      const docId = matchedDocs[j]

      const numberOfQuantums = tokensLength.get(docId)!
      const tokenQuantumDescriptor = tokenQuantums[docId][key]

      const occurrence = count(tokenQuantumDescriptor)
      const bitMask = bitmask_20(tokenQuantumDescriptor)
      const score = (occurrence * occurrence / numberOfQuantums + (isExactMatch ? 1 : 0)) * boostPerProp

      if (!resultMap.has(docId)) {
        resultMap.set(docId, [score, bitMask])
        continue
      }

      const current = resultMap.get(docId)!

      const totalScore = current[0]
        + numberOfOnes(current[1] & bitMask) * 2
        + score

      current[0] = totalScore
      current[1] = current[1] | bitMask
    }
  }
}
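
// Scoring sketch (informal restatement of the loops above): every matched word
// contributes (occurrences^2 / docTokenCount + exactMatchBonus) * boostPerProp,
// where docTokenCount is the tokensLength recorded at insert time and
// exactMatchBonus is 1 when the matched word equals a query token verbatim.
// If the document was already scored by an earlier word, an extra
// numberOfOnes(previousMask & bitMask) * 2 is added, rewarding query tokens
// that occur in the same sentences (overlapping bits of the 20-bit masks).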

export function bitmask_20(n: number) {
  return n & BIT_MASK_20
}

export function count(n: number) {
  return n >> 20
}

export function numberOfOnes(n: number) {
  let i = 0;
  do {
    if (n & 1) { ++i }
    // eslint-disable-next-line no-cond-assign
  } while (n >>= 1)
  return i
}

export function removeString(
  value: string,
  radixTree: radix.RadixTree,
  prop: string,
  internalId: InternalDocumentID,
  tokenizer: Tokenizer,
  language: string | undefined,
  stats: {
    tokensLength: Map<number, number>,
    tokenQuantums: Record<number, Record<string, number>>,
  },
) {
  const tokensLength = stats.tokensLength
  const tokenQuantums = stats.tokenQuantums

  const tokens = tokenizer.tokenize(value, language, prop)

  for (const token of tokens) {
    radixTree.removeDocumentByWord(token, internalId, true)
  }

  tokensLength.delete(internalId)
  delete tokenQuantums[internalId]
}