diff --git a/benchmarks/algorithms.js b/benchmarks/algorithms.js new file mode 100644 index 00000000..cfe80cbd --- /dev/null +++ b/benchmarks/algorithms.js @@ -0,0 +1,42 @@ +import b from 'benny' +import { create, insertMultiple, search } from 'orama_latest' +import { pluginPT15 } from '@orama/plugin-pt15' +import dataset from './src/dataset.json' assert { type: 'json' } +import {stopwords} from '@orama/stopwords/english' + +const dbBM25 = create({ + schema: { + description: 'string' + }, + components: { + tokenizer: { + stopWords: stopwords, + }, + } +}) +const dbWithPT15 = create({ + schema: { + description: 'string' + }, + plugins: [pluginPT15()], + components: { + tokenizer: { + stopWords: stopwords, + }, + } +}) +await insertMultiple(dbBM25, dataset) +await insertMultiple(dbWithPT15, dataset) + +b.suite('search-algorithms', + b.add('search bm25', () => { + search(dbBM25, { term: 'L' }) + }), + b.add('search pt15', () => { + search(dbWithPT15, { term: 'L' }) + }), + b.cycle(), + b.complete(), + b.save({ file: 'insert', version: '1.0.0' }), + b.save({ file: 'search-algorithms', format: 'chart.html' }), +) \ No newline at end of file diff --git a/benchmarks/index.js b/benchmarks/index.js index bd59b201..fd8ab279 100644 --- a/benchmarks/index.js +++ b/benchmarks/index.js @@ -12,6 +12,9 @@ function benchmarkInsert() { b.add('insert in Orama latest', () => { insert.oramaLatest() }), + b.add('insert in Orama latest with PT15', () => { + insert.oramaLatestPT15() + }), b.cycle(), b.complete(), b.save({ file: 'insert', version: '1.0.0' }), @@ -30,6 +33,9 @@ function benchmarkInsertMultiple() { b.add('insert multiple in Orama latest', () => { insertMultiple.oramaLatest() }), + b.add('insert multiple in Orama latest with PT15', () => { + insertMultiple.oramaLatestPT15() + }), b.cycle(), b.complete(), b.save({ file: 'insert multiple', version: '1.0.0' }), @@ -48,6 +54,9 @@ function benchmarkSearch() { b.add('plain search in Orama latest', () => { searchPlain.oramaLatest() 
}), + b.add('plain search in Orama latest with PT15', () => { + searchPlain.oramaLatestPT15() + }), b.cycle(), b.complete(), b.save({ file: 'plain search', version: '1.0.0' }), @@ -66,6 +75,9 @@ function benchmarkSearchWithFilters() { b.add('search with filters in Orama latest', () => { searchWithFilters.oramaLatest() }), + b.add('search with filters in Orama latest with PT15', () => { + searchWithFilters.oramaLatestPT15() + }), b.cycle(), b.complete(), b.save({ file: 'search with filters', version: '1.0.0' }), @@ -84,6 +96,9 @@ function benchmarkSearchWithLongTextAndComplexFilters() { b.add('search with long text and complex filters in Orama latest', () => { searchWithLongTextAndComplexFilters.oramaLatest() }), + b.add('search with long text and complex filters in Orama latest with PT15', () => { + searchWithLongTextAndComplexFilters.oramaLatestPT15() + }), b.cycle(), b.complete(), b.save({ file: 'search with long text and complex filters', version: '1.0.0' }), diff --git a/benchmarks/package-lock.json b/benchmarks/package-lock.json index 6277788f..24ab0025 100644 --- a/benchmarks/package-lock.json +++ b/benchmarks/package-lock.json @@ -9,6 +9,9 @@ "version": "1.0.0", "license": "ISC", "dependencies": { + "@orama/plugin-pt15": "file:../packages/plugin-pt15", + "@orama/plugin-qps": "file:../packages/plugin-qps", + "@orama/stopwords": "file:../packages/stopwords", "orama_211": "npm:@orama/orama@2.1.1", "orama_300_rc_2": "npm:@orama/orama@3.0.0-rc-2", "orama_latest": "file:../packages/orama" @@ -35,7 +38,7 @@ "commitizen": "^4.2.6", "glob": "^9.2.3", "prettier": "^2.8.1", - "tap": "^18.6.1", + "tap": "^21.0.1", "tap-mocha-reporter": "^5.0.3", "tape": "^5.6.1", "tcompare": "^6.0.0", @@ -48,6 +51,62 @@ "node": ">= 16.0.0" } }, + "../packages/plugin-foo": { + "version": "3.0.0-rc-2", + "extraneous": true, + "license": "Apache-2.0", + "dependencies": { + "@orama/orama": "workspace:*" + }, + "devDependencies": { + "@types/node": "^20.9.0", + "tap": "^21.0.1", + "tsup": 
"^7.2.0", + "tsx": "^4.19.1", + "typescript": "^5.0.0" + } + }, + "../packages/plugin-pt15": { + "name": "@orama/plugin-pt15", + "version": "3.0.0-rc-2", + "license": "Apache-2.0", + "dependencies": { + "@orama/orama": "workspace:*" + }, + "devDependencies": { + "@types/node": "^20.9.0", + "tap": "^21.0.1", + "tsup": "^7.2.0", + "tsx": "^4.19.1", + "typescript": "^5.0.0" + } + }, + "../packages/plugin-qps": { + "name": "@orama/plugin-qps", + "version": "3.0.0-rc-2", + "license": "Apache-2.0", + "dependencies": { + "@orama/orama": "workspace:*" + }, + "devDependencies": { + "@types/node": "^20.9.0", + "tap": "^21.0.1", + "tsup": "^7.2.0", + "tsx": "^4.19.1", + "typescript": "^5.0.0" + } + }, + "../packages/stopwords": { + "name": "@orama/stopwords", + "version": "3.0.0-rc-2", + "license": "Apache-2.0", + "devDependencies": { + "@swc/core": "^1.3.27" + }, + "engines": { + "node": ">= 16.0.0" + } + }, "node_modules/@arrows/array": { "version": "1.4.1", "resolved": "https://registry.npmjs.org/@arrows/array/-/array-1.4.1.tgz", @@ -90,6 +149,18 @@ "fast-deep-equal": "^3.1.3" } }, + "node_modules/@orama/plugin-pt15": { + "resolved": "../packages/plugin-pt15", + "link": true + }, + "node_modules/@orama/plugin-qps": { + "resolved": "../packages/plugin-qps", + "link": true + }, + "node_modules/@orama/stopwords": { + "resolved": "../packages/stopwords", + "link": true + }, "node_modules/ansi-escapes": { "version": "4.3.2", "resolved": "https://registry.npmjs.org/ansi-escapes/-/ansi-escapes-4.3.2.tgz", diff --git a/benchmarks/package.json b/benchmarks/package.json index b6370ff6..ef396429 100644 --- a/benchmarks/package.json +++ b/benchmarks/package.json @@ -13,7 +13,9 @@ "dependencies": { "orama_211": "npm:@orama/orama@2.1.1", "orama_300_rc_2": "npm:@orama/orama@3.0.0-rc-2", - "orama_latest": "file:../packages/orama" + "orama_latest": "file:../packages/orama", + "@orama/stopwords": "file:../packages/stopwords", + "@orama/plugin-pt15": "file:../packages/plugin-pt15" }, 
"devDependencies": { "benny": "^3.7.1" diff --git a/benchmarks/src/get-orama.js b/benchmarks/src/get-orama.js index 14dd5880..d897b8e2 100644 --- a/benchmarks/src/get-orama.js +++ b/benchmarks/src/get-orama.js @@ -1,6 +1,7 @@ import * as orama211 from 'orama_211' import * as orama300rc2 from 'orama_300_rc_2' import * as oramaLatest from 'orama_latest' +import { pluginPT15 } from '@orama/plugin-pt15' import dataset from './dataset.json' assert { type: 'json' } export const schema = { @@ -13,12 +14,14 @@ export const schema = { const create = { orama211: () => orama211.create({ schema }), orama300rc2: () => orama300rc2.create({ schema }), - oramaLatest: () => oramaLatest.create({ schema }) + oramaLatest: () => oramaLatest.create({ schema }), + oramaLatestPT15: () => oramaLatest.create({ schema, plugins: [pluginPT15()] }) } const db211 = await create.orama211() const db300rc2 = create.orama300rc2() const dbLatest = create.oramaLatest() +const dbLatestPT15 = create.oramaLatestPT15() export const insert = { orama211: async () => { @@ -38,6 +41,12 @@ export const insert = { for (const record of dataset) { oramaLatest.insert(db, record) } + }, + oramaLatestPT15: () => { + const db = create.oramaLatestPT15() + for (const record of dataset) { + oramaLatest.insert(db, record) + } } } @@ -50,6 +59,9 @@ export const insertMultiple = { }, oramaLatest: () => { oramaLatest.insertMultiple(dbLatest, dataset, 50) + }, + oramaLatestPT15: () => { + oramaLatest.insertMultiple(dbLatestPT15, dataset, 50) } } @@ -62,6 +74,9 @@ export const searchPlain = { }, oramaLatest: () => { oramaLatest.search(dbLatest, { term: 'Legend of Zelda' }) + }, + oramaLatestPT15: () => { + oramaLatest.search(dbLatestPT15, { term: 'Legend of Zelda' }) } } @@ -74,6 +89,9 @@ export const searchWithFilters = { }, oramaLatest: () => { oramaLatest.search(dbLatest, { term: 'Super Hero', where: { rating: { gte: 4 } } }) + }, + oramaLatestPT15: () => { + oramaLatest.search(dbLatestPT15, { term: 'Super Hero', where: { 
rating: { gte: 4 } } }) } } @@ -86,5 +104,8 @@ export const searchWithLongTextAndComplexFilters = { }, oramaLatest: () => { oramaLatest.search(dbLatest, { term: 'classic run gun, action game focused on boss battles', where: { rating: { gte: 4 }, genres: { containsAll: ['Shooter'] } } }) + }, + oramaLatestPT15: () => { + oramaLatest.search(dbLatestPT15, { term: 'classic run gun, action game focused on boss battles', where: { rating: { gte: 4 }, genres: { containsAll: ['Shooter'] } } }) } } \ No newline at end of file diff --git a/packages/orama/src/components/documents-store.ts b/packages/orama/src/components/documents-store.ts index 8e339bc9..9d6849bf 100644 --- a/packages/orama/src/components/documents-store.ts +++ b/packages/orama/src/components/documents-store.ts @@ -51,9 +51,7 @@ export function getAll const elementsLength = elements.length for (let i = 0; i < elementsLength; i++) { - removeScalar(implementation, index, prop, id, elements[i], innerSchemaType, language, tokenizer, docsCount) + removeScalar(implementation, index, prop, id, internalId, elements[i], innerSchemaType, language, tokenizer, docsCount) } return true diff --git a/packages/orama/src/errors.ts b/packages/orama/src/errors.ts index 093bf6cf..f24d6b83 100644 --- a/packages/orama/src/errors.ts +++ b/packages/orama/src/errors.ts @@ -42,7 +42,8 @@ const errors = { PLUGIN_CRASHED: `A plugin crashed during initialization. 
Please check the error message for more information:`, PLUGIN_SECURE_PROXY_NOT_FOUND: `Could not find '@orama/secure-proxy-plugin' installed in your Orama instance.\nPlease install it before proceeding with creating an answer session.\nRead more at https://docs.orama.com/open-source/plugins/plugin-secure-proxy\n`, PLUGIN_SECURE_PROXY_MISSING_CHAT_MODEL: `Could not find a chat model defined in the secure proxy plugin configuration.\nPlease provide a chat model before proceeding with creating an answer session.\nRead more at https://docs.orama.com/open-source/plugins/plugin-secure-proxy\n`, - ANSWER_SESSION_LAST_MESSAGE_IS_NOT_ASSISTANT: `The last message in the session is not an assistant message. Cannot regenerate non-assistant messages.` + ANSWER_SESSION_LAST_MESSAGE_IS_NOT_ASSISTANT: `The last message in the session is not an assistant message. Cannot regenerate non-assistant messages.`, + PLUGIN_COMPONENT_CONFLICT: `The component "%s" is already defined. The plugin "%s" is trying to redefine it.`, } export type ErrorCode = keyof typeof errors diff --git a/packages/orama/src/methods/create.ts b/packages/orama/src/methods/create.ts index 60f8e9ce..cee687c1 100644 --- a/packages/orama/src/methods/create.ts +++ b/packages/orama/src/methods/create.ts @@ -8,11 +8,13 @@ import { Sorter, createSorter } from '../components/sorter.js' import { createTokenizer } from '../components/tokenizer/index.js' import { createError } from '../errors.js' import { + AnySchema, Components, FunctionComponents, IDocumentsStore, IIndex, ISorter, + ObjectComponents, Orama, OramaPlugin, SorterConfig, @@ -70,7 +72,7 @@ function validateComponents< } export function create< - OramaSchema, + OramaSchema extends AnySchema, TIndex = IIndex, TDocumentStore = IDocumentsStore, TSorter = ISorter @@ -86,6 +88,28 @@ export function create< components = {} } + for (const plugin of plugins ?? 
[]) { + if (!('getComponents' in plugin)) { + continue + } + if (typeof plugin.getComponents !== 'function') { + continue; + } + + const pluginComponents = plugin.getComponents(schema) as Partial>; + + const keys = Object.keys(pluginComponents) + for (const key of keys) { + if (components![key]) { + throw createError('PLUGIN_COMPONENT_CONFLICT', key, plugin.name) + } + } + components = { + ...components, + ...pluginComponents + } + } + if (!id) { id = uniqueId() } diff --git a/packages/orama/src/methods/insert.ts b/packages/orama/src/methods/insert.ts index dc3c4bdd..1c69763b 100644 --- a/packages/orama/src/methods/insert.ts +++ b/packages/orama/src/methods/insert.ts @@ -52,7 +52,8 @@ async function innerInsertAsync( throw createError('DOCUMENT_ID_MUST_BE_STRING', typeof id) } - if (!orama.documentsStore.store(docs, id, doc)) { + const internalId = getInternalDocumentId(orama.internalDocumentIDStore, id) + if (!orama.documentsStore.store(docs, id, internalId, doc)) { throw createError('DOCUMENT_ALREADY_EXISTS', id) } @@ -100,7 +101,8 @@ function innerInsertSync( throw createError('DOCUMENT_ID_MUST_BE_STRING', typeof id) } - if (!orama.documentsStore.store(docs, id, doc)) { + const internalId = getInternalDocumentId(orama.internalDocumentIDStore, id) + if (!orama.documentsStore.store(docs, id, internalId, doc)) { throw createError('DOCUMENT_ALREADY_EXISTS', id) } diff --git a/packages/orama/src/methods/remove.ts b/packages/orama/src/methods/remove.ts index c5a33a50..e858f238 100644 --- a/packages/orama/src/methods/remove.ts +++ b/packages/orama/src/methods/remove.ts @@ -3,7 +3,7 @@ import { runMultipleHook, runSingleHook } from '../components/hooks.js' import { DocumentID, getDocumentIdFromInternalId, - getInternalDocumentId + getInternalDocumentId, } from '../components/internal-document-id-store.js' import { trackRemoval } from '../components/sync-blocking-checker.js' import { isAsyncFunction } from '../utils.js' @@ -40,9 +40,10 @@ async function removeAsync( 
return false } + const internalId = getInternalDocumentId(orama.internalDocumentIDStore, id) const docId = getDocumentIdFromInternalId( orama.internalDocumentIDStore, - getInternalDocumentId(orama.internalDocumentIDStore, id) + internalId ) const docsCount = orama.documentsStore.count(docs) @@ -79,6 +80,7 @@ async function removeAsync( orama.data.index, prop, id, + internalId, value, schemaType, language, @@ -115,7 +117,7 @@ async function removeAsync( await runSingleHook(orama.afterRemove, orama, docId) } - orama.documentsStore.remove(orama.data.docs, id) + orama.documentsStore.remove(orama.data.docs, id, internalId) trackRemoval(orama) return result @@ -130,9 +132,10 @@ function removeSync(orama: T, id: DocumentID, language?: str return false } + const internalId = getInternalDocumentId(orama.internalDocumentIDStore, id) const docId = getDocumentIdFromInternalId( orama.internalDocumentIDStore, - getInternalDocumentId(orama.internalDocumentIDStore, id) + internalId ) const docsCount = orama.documentsStore.count(docs) @@ -160,6 +163,7 @@ function removeSync(orama: T, id: DocumentID, language?: str orama.data.index, prop, id, + internalId, value, schemaType, language, @@ -187,7 +191,7 @@ function removeSync(orama: T, id: DocumentID, language?: str runSingleHook(orama.afterRemove, orama, docId) } - orama.documentsStore.remove(orama.data.docs, id) + orama.documentsStore.remove(orama.data.docs, id, internalId) trackRemoval(orama) return result diff --git a/packages/orama/src/trees.ts b/packages/orama/src/trees.ts index 81a2c206..9de8b05f 100644 --- a/packages/orama/src/trees.ts +++ b/packages/orama/src/trees.ts @@ -3,3 +3,4 @@ export * as avl from './trees/avl.js' export * as zip from './trees/zip.js' export * as bkd from './trees/bkd.js' export * as flat from './trees/flat.js' +export * as bool from './trees/bool.js' diff --git a/packages/orama/src/types.ts b/packages/orama/src/types.ts index c728c94d..b8e09289 100644 --- a/packages/orama/src/types.ts +++ 
b/packages/orama/src/types.ts @@ -951,6 +951,7 @@ export interface IIndex { index: T, prop: string, id: DocumentID, + internalId: InternalDocumentID, value: SearchableValue, schemaType: SearchableType, language: string | undefined, @@ -1010,8 +1011,8 @@ export interface IDocumentsStore get(store: D, id: DocumentID): Optional getMultiple(store: D, ids: DocumentID[]): Optional[] getAll(store: D): SyncOrAsyncValue> - store(store: D, id: DocumentID, doc: AnyDocument): boolean - remove(store: D, id: DocumentID): SyncOrAsyncValue + store(store: D, id: DocumentID, internalId: InternalDocumentID, doc: AnyDocument): boolean + remove(store: D, id: DocumentID, internalId: InternalDocumentID): SyncOrAsyncValue count(store: D): number load(sharedInternalDocumentStore: InternalDocumentIDStore, raw: R): D @@ -1315,6 +1316,7 @@ export type OramaPluginSync = { beforeUpdateMultiple?: (orama: T, docs: AnyDocument[]) => SyncOrAsyncValue afterUpdateMultiple?: (orama: T, docs: AnyDocument[]) => SyncOrAsyncValue afterCreate?: (orama: T) => SyncOrAsyncValue + getComponents?: , TDocumentStore, TSorter>(schema: AnySchema) => SyncOrAsyncValue>> } export type OramaPluginAsync = Promise> diff --git a/packages/orama/tests/remove.test.ts b/packages/orama/tests/remove.test.ts index 1277e5da..130be8a0 100644 --- a/packages/orama/tests/remove.test.ts +++ b/packages/orama/tests/remove.test.ts @@ -298,6 +298,26 @@ t.test('should correctly remove documents with vector properties', async (t) => t.ok(await getByID(db, id2)) }) +t.test( + 'test case for #766: Zero division when computing scores after removing all documents from an index.', + async (t) => { + const db = create({ + schema: { + name: 'string' + } as const + }) + + const id = insert(db, { name: 'test' }) + + const success = remove(db, id as string) + + insert(db, { name: 'foo' }) + insert(db, { name: 'bar' }) + + t.ok(success) + } +) + function createSimpleDB() { let i = 0 const db = create({ @@ -354,23 +374,3 @@ function createSimpleDB() { 
return [db, id1, id2, id3, id4] as const } - -t.test( - 'test case for #766: Zero division when computing scores after removing all documents from an index.', - async (t) => { - const db = create({ - schema: { - name: 'string' - } as const - }) - - const id = insert(db, { name: 'test' }) - - const success = remove(db, id as string) - - insert(db, { name: 'foo' }) - insert(db, { name: 'bar' }) - - t.ok(success) - } -) diff --git a/packages/plugin-docusaurus-v3/src/index.ts b/packages/plugin-docusaurus-v3/src/index.ts index 0bc8572b..3c6e4e9e 100644 --- a/packages/plugin-docusaurus-v3/src/index.ts +++ b/packages/plugin-docusaurus-v3/src/index.ts @@ -3,12 +3,11 @@ import type { Plugin } from '@docusaurus/types' import { cp } from 'node:fs/promises' import { gzip } from 'pako' import { resolve } from 'node:path' -// @ts-ignore import { create, insertMultiple, save } from '@orama/orama' import { JSDOM } from 'jsdom' import MarkdownIt from 'markdown-it' import matter from 'gray-matter' -import { createSnapshot, deployIndex, DOCS_PRESET_SCHEMA, fetchEndpointConfig } from "./utils" +import { createSnapshot, deployIndex, DOCS_PRESET_SCHEMA, fetchEndpointConfig } from "./utils.js" import { parseMarkdownHeadingId, writeMarkdownHeadingId } from '@docusaurus/utils' enum DeployType { diff --git a/packages/plugin-docusaurus-v3/src/utils.ts b/packages/plugin-docusaurus-v3/src/utils.ts index f398582e..884375ea 100644 --- a/packages/plugin-docusaurus-v3/src/utils.ts +++ b/packages/plugin-docusaurus-v3/src/utils.ts @@ -1,3 +1,5 @@ +import { AnySchema } from "@orama/orama"; + export const restFetcher = async (url: string, options?: any): Promise => { const response = await fetch(url, options) @@ -83,7 +85,7 @@ export async function deployIndex(baseUrl: string, APIKey: string, indexId: stri ) } -export const DOCS_PRESET_SCHEMA = { +export const DOCS_PRESET_SCHEMA: AnySchema = { title: 'string', content: 'string', path: 'string', diff --git a/packages/plugin-docusaurus/package.json 
b/packages/plugin-docusaurus/package.json index 815a65ec..9529bf5f 100644 --- a/packages/plugin-docusaurus/package.json +++ b/packages/plugin-docusaurus/package.json @@ -18,6 +18,11 @@ "scripts": { "build": "rm -rf dist && tsc", "postbuild": "sh scripts/postbuild.sh", + "pretest": "node ../../scripts/test-pack.mjs plugin-docusaurus", + "test": "echo \"Docusaurus v2 will become unmaintained soon. Use Docusaurus v3.\" # node --loader=tsx --no-warnings=loader --test ./test/integration.ts | tap-mocha-reporter spec", + "format": "prettier -w src", + "lint": "eslint src --ext .js,.ts,.tsx,.cts", + "changelog": "auto-changelog -p", "watch": "tsc --watch" }, "dependencies": { diff --git a/packages/plugin-docusaurus/src/index.ts b/packages/plugin-docusaurus/src/index.ts index a0f524ee..a67a2e94 100644 --- a/packages/plugin-docusaurus/src/index.ts +++ b/packages/plugin-docusaurus/src/index.ts @@ -3,12 +3,11 @@ import type { Plugin } from '@docusaurus/types' import { cp } from 'node:fs/promises' import { gzip } from 'pako' import { resolve } from 'node:path' -// @ts-ignore import { create, insertMultiple, save } from '@orama/orama' import { JSDOM } from 'jsdom' import MarkdownIt from 'markdown-it' import matter from 'gray-matter' -import { createSnapshot, deployIndex, DOCS_PRESET_SCHEMA, fetchEndpointConfig } from "./utils" +import { createSnapshot, deployIndex, DOCS_PRESET_SCHEMA, fetchEndpointConfig } from "./utils.js" import { parseMarkdownHeadingId, writeMarkdownHeadingId } from '@docusaurus/utils' enum DeployType { diff --git a/packages/plugin-docusaurus/src/utils.ts b/packages/plugin-docusaurus/src/utils.ts index f398582e..884375ea 100644 --- a/packages/plugin-docusaurus/src/utils.ts +++ b/packages/plugin-docusaurus/src/utils.ts @@ -1,3 +1,5 @@ +import { AnySchema } from "@orama/orama"; + export const restFetcher = async (url: string, options?: any): Promise => { const response = await fetch(url, options) @@ -83,7 +85,7 @@ export async function deployIndex(baseUrl: 
string, APIKey: string, indexId: stri ) } -export const DOCS_PRESET_SCHEMA = { +export const DOCS_PRESET_SCHEMA: AnySchema = { title: 'string', content: 'string', path: 'string', diff --git a/packages/plugin-pt15/LICENSE.md b/packages/plugin-pt15/LICENSE.md new file mode 100644 index 00000000..1b02b293 --- /dev/null +++ b/packages/plugin-pt15/LICENSE.md @@ -0,0 +1,13 @@ +Copyright 2024 OramaSearch Inc + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. diff --git a/packages/plugin-pt15/README.md b/packages/plugin-pt15/README.md new file mode 100644 index 00000000..b816c68a --- /dev/null +++ b/packages/plugin-pt15/README.md @@ -0,0 +1,29 @@ +# Orama Plugin PT15 + +Fast ranking algorithm based on token position. 
+ +## Installation + +To get started with **Orama Plugin PT15**, just install it with npm: + +```sh +npm i @orama/plugin-pt15 +``` + +## Usage + +```js +import { create } from '@orama/orama' +import { pluginPT15 } from '@orama/plugin-pt15' + +const db = await create({ + schema: { + description: 'string', + }, + plugins: [ pluginPT15() ], +}) +``` + +# License + +[Apache 2.0](/LICENSE.md) \ No newline at end of file diff --git a/packages/plugin-pt15/package.json b/packages/plugin-pt15/package.json new file mode 100644 index 00000000..d919d7b0 --- /dev/null +++ b/packages/plugin-pt15/package.json @@ -0,0 +1,53 @@ +{ + "name": "@orama/plugin-pt15", + "version": "3.0.0-rc-2", + "description": "Performant search algorithm optimized for descriptive texts", + "keywords": [ + "orama", + "embeddings", + "secure proxy", + "vector search" + ], + "license": "Apache-2.0", + "main": "./dist/index.js", + "type": "module", + "exports": { + ".": { + "require": "./dist/index.cjs", + "import": "./dist/index.js", + "types": "./dist/index.d.ts", + "browser": "./dist/index.global.js" + } + }, + "bugs": { + "url": "https://github.com/askorama/orama/issues" + }, + "homepage": "https://github.com/askorama/orama#readme", + "repository": { + "type": "git", + "url": "git+https://github.com/askorama/orama.git" + }, + "sideEffects": false, + "types": "./dist/index.d.ts", + "files": [ + "dist" + ], + "scripts": { + "build": "tsup --config tsup.lib.js", + "lint": "exit 0", + "test": "node --test --import tsx test/*.test.ts" + }, + "publishConfig": { + "access": "public" + }, + "devDependencies": { + "@types/node": "^20.9.0", + "tap": "^21.0.1", + "tsup": "^7.2.0", + "tsx": "^4.19.1", + "typescript": "^5.0.0" + }, + "dependencies": { + "@orama/orama": "workspace:*" + } +} diff --git a/packages/plugin-pt15/src/algorithm.ts b/packages/plugin-pt15/src/algorithm.ts new file mode 100644 index 00000000..e6881e6c --- /dev/null +++ b/packages/plugin-pt15/src/algorithm.ts @@ -0,0 +1,214 @@ +import { 
AnyIndexStore, AnyOrama, SearchableType, Tokenizer, VectorIndex } from "@orama/orama" +import { avl, bkd, flat, bool } from '@orama/orama/trees' +import { + getVectorSize, internalDocumentIDStore, isVectorType } from '@orama/orama/components' + +type InternalDocumentID = internalDocumentIDStore.InternalDocumentID; + +export type TreeType = 'AVL' | 'Radix' | 'Bool' | 'Flat' | 'BKD'; +export type TTree = { + type: T; + node: N; + isArray: boolean; +}; +export type Tree = + // We don't store strings inside a tree + // | TTree<'Radix', radix.RadixNode> + | TTree<'Position', PositionsStorage> + | TTree<'AVL', avl.AVLTree> + | TTree<'Bool', bool.BoolNode> + | TTree<'Flat', flat.FlatTree> + | TTree<'BKD', bkd.BKDTree>; + +const MAX_POSITION = 15 +export type PositionStorage = Record +// 15 `PositionStorage`s +export type PositionsStorage = [ + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, + PositionStorage, +] + +export interface PT15IndexStore extends AnyIndexStore { + indexes: Record + vectorIndexes: Record + searchableProperties: string[] + searchablePropertiesWithTypes: Record +} + +function create_obj() { + // object with empty prototype to cheap objects + return Object.create(null) +} + +export function recursiveCreate(indexDatastore: PT15IndexStore, schema: T['schema'], prefix: string) { + for (const [prop, type] of Object.entries(schema)) { + const path = `${prefix}${prefix ? '.' 
: ''}${prop}` + + if (typeof type === 'object' && !Array.isArray(type)) { + // Nested + recursiveCreate(indexDatastore, type, path) + continue + } + + if (isVectorType(type)) { + indexDatastore.searchableProperties.push(path) + indexDatastore.searchablePropertiesWithTypes[path] = type + indexDatastore.vectorIndexes[path] = { + size: getVectorSize(type), + vectors: {} + } + } else { + const isArray = /\[/.test(type as string) + switch (type) { + case 'boolean': + case 'boolean[]': + indexDatastore.indexes[path] = { type: 'Bool', node: new bool.BoolNode(), isArray } + break + case 'number': + case 'number[]': + indexDatastore.indexes[path] = { type: 'AVL', node: new avl.AVLTree(0, []), isArray } + break + case 'string': + case 'string[]': + indexDatastore.indexes[path] = { type: 'Position', node: [ + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + create_obj(), + ], isArray } + break + case 'enum': + case 'enum[]': + indexDatastore.indexes[path] = { type: 'Flat', node: new flat.FlatTree(), isArray } + break + case 'geopoint': + indexDatastore.indexes[path] = { type: 'BKD', node: new bkd.BKDTree(), isArray } + break + default: + throw new Error('INVALID_SCHEMA_TYPE: ' + path) + } + + indexDatastore.searchableProperties.push(path) + indexDatastore.searchablePropertiesWithTypes[path] = type + } + } +} + +export function insertString( + value: string, + positionsStorage: PositionsStorage, + prop: string, + internalId: InternalDocumentID, + language: string | undefined, + tokenizer: Tokenizer, +) { + const tokens = tokenizer.tokenize(value, language, prop) + const tokensLength = tokens.length + for (let i = 0; i < tokensLength; i++) { + const token = tokens[i] + const position = MAX_POSITION - get_position(i, tokensLength) - 1 + + const positionStorage = positionsStorage[position] + + const 
tokenLength = token.length + for (let j = tokenLength; j > 0; j--) { + const tokenPart = token.slice(0, j) + positionStorage[tokenPart] = positionStorage[tokenPart] || [] + positionStorage[tokenPart].push(internalId) + } + } +} + +export function get_position(n: number, totalLength: number) { + if (totalLength < MAX_POSITION) { + return n + } + // Scale + return Math.floor(n * MAX_POSITION / totalLength) +} + +export function searchString( + tokenizer: Tokenizer, + term: string, + positionsStorage: PositionsStorage, + boostPerProp: number, +) { + const tokens = tokenizer.tokenize(term) + + const ret: Map = new Map() + for (const token of tokens) { + for (let i = 0; i < MAX_POSITION; i++) { + const positionStorage = positionsStorage[i] + if (positionStorage[token]) { + const a = positionStorage[token] + const aLength = a.length + + for (let j = 0; j < aLength; j++) { + const id = a[j] + if (ret.has(id)) { + ret.set(id, ret.get(id)! + i * boostPerProp) + } else { + ret.set(id, i * boostPerProp) + } + } + } + } + } + + return ret +} + +export function removeString( + value: string, + positionsStorage: PositionsStorage, + prop: string, + internalId: InternalDocumentID, + tokenizer: Tokenizer, + language: string | undefined, +) { + const tokens = tokenizer.tokenize(value, language, prop) + const tokensLength = tokens.length + for (let i = 0; i < tokensLength; i++) { + const token = tokens[i] + const position = MAX_POSITION - get_position(i, tokensLength) - 1 + + const positionStorage = positionsStorage[position] + + const tokenLength = token.length + for (let j = tokenLength; j > 0; j--) { + const tokenPart = token.slice(0, j) + const a = positionStorage[tokenPart] + if (a) { + const index = a.indexOf(internalId) + if (index !== -1) { + a.splice(index, 1) + } + } + } + } + +} \ No newline at end of file diff --git a/packages/plugin-pt15/src/index.ts b/packages/plugin-pt15/src/index.ts new file mode 100644 index 00000000..5f6cf2fd --- /dev/null +++ 
import type {
  AnyOrama,
  SearchableType,
  IIndex,
  AnyIndexStore,
  SearchableValue,
  Tokenizer,
  OnlyStrings,
  FlattenSchemaProperty,
  TokenScore,
  WhereCondition,
  OramaPluginSync,
  AnySchema,
  ObjectComponents
} from '@orama/orama'
import { index as Index, internalDocumentIDStore } from '@orama/orama/components'
import {
  PT15IndexStore,
  insertString,
  recursiveCreate,
  PositionsStorage,
  searchString,
  removeString
} from './algorithm.js'

type InternalDocumentID = internalDocumentIDStore.InternalDocumentID
type InternalDocumentIDStore = internalDocumentIDStore.InternalDocumentIDStore
type DocumentID = internalDocumentIDStore.DocumentID

/**
 * Creates the PT15 plugin.
 *
 * The plugin contributes a custom `index` component (see `createComponents`) that handles
 * `string` and `string[]` properties itself and delegates every other type to Orama's default index.
 */
export function pluginPT15(): OramaPluginSync {
  return {
    name: 'orama-plugin-pt15',

    getComponents: function getComponents(schema: AnySchema) {
      return createComponents(schema)
    }
  }
}

/** Builds a hook that always throws, for index hooks this implementation does not provide. */
function notImplemented(hook: string): () => never {
  return () => {
    throw new Error(`orama-plugin-pt15: ${hook} is not implemented`)
  }
}

/** Normalizes a `string` / `string[]` property value to the list of strings to index. */
function toStringItems(value: SearchableValue): string[] {
  return Array.isArray(value) ? (value as string[]) : [value as string]
}

/** True for the schema types handled by the position storage instead of the default index. */
function isStringType(schemaType: SearchableType): boolean {
  return schemaType === 'string' || schemaType === 'string[]'
}

/**
 * Builds the component overrides for the given schema.
 *
 * @param schema - Schema of the database the plugin is attached to.
 * @returns An object containing only the `index` component.
 */
function createComponents(schema: AnySchema): Partial<ObjectComponents<any, any, any>> {
  return {
    index: {
      /** Creates an empty index store and lets `recursiveCreate` populate it from the schema. */
      create: function create() {
        const indexDatastore: PT15IndexStore = {
          indexes: {},
          vectorIndexes: {},
          searchableProperties: [],
          searchablePropertiesWithTypes: {}
        }

        recursiveCreate(indexDatastore, schema, '')

        return indexDatastore
      },

      /**
       * Indexes one property value. Non-string types are delegated to the default index;
       * `string` / `string[]` values are written into the property's position storage.
       */
      insert: function insert(
        implementation: IIndex<PT15IndexStore>,
        indexDatastorage: PT15IndexStore,
        prop: string,
        id: DocumentID,
        internalId: InternalDocumentID,
        value: SearchableValue,
        schemaType: SearchableType,
        language: string | undefined,
        tokenizer: Tokenizer,
        docsCount: number
      ) {
        if (!isStringType(schemaType)) {
          return Index.insert(
            implementation as unknown as IIndex<Index.Index>,
            indexDatastorage as unknown as Index.Index,
            prop,
            id,
            internalId,
            value,
            schemaType,
            language,
            tokenizer,
            docsCount
          )
        }

        const storage = indexDatastorage.indexes[prop].node as PositionsStorage
        for (const item of toStringItems(value)) {
          insertString(item, storage, prop, internalId, language, tokenizer)
        }
      },

      /**
       * Removes one property value. Non-string types are delegated to the default index;
       * `string` / `string[]` values are removed from the property's position storage.
       *
       * @returns The default index's result for delegated types, `true` otherwise.
       */
      remove: function remove(
        implementation: IIndex<PT15IndexStore>,
        indexDatastorage: PT15IndexStore,
        prop: string,
        id: DocumentID,
        internalId: InternalDocumentID,
        value: SearchableValue,
        schemaType: SearchableType,
        language: string | undefined,
        tokenizer: Tokenizer,
        docsCount: number
      ) {
        if (!isStringType(schemaType)) {
          return Index.remove(
            implementation as unknown as IIndex<Index.Index>,
            indexDatastorage as unknown as Index.Index,
            prop,
            id,
            internalId,
            value,
            schemaType,
            language,
            tokenizer,
            docsCount
          )
        }

        const storage = indexDatastorage.indexes[prop].node as PositionsStorage
        // Note the argument order: removeString takes (tokenizer, language), unlike insertString.
        for (const item of toStringItems(value)) {
          removeString(item, storage, prop, internalId, tokenizer, language)
        }

        return true
      },

      // Scoring-parameter hooks are not provided by this index; calling them is an error.
      insertDocumentScoreParameters: notImplemented('insertDocumentScoreParameters'),
      insertTokenScoreParameters: notImplemented('insertTokenScoreParameters'),
      removeDocumentScoreParameters: notImplemented('removeDocumentScoreParameters'),
      removeTokenScoreParameters: notImplemented('removeTokenScoreParameters'),
      calculateResultScores: notImplemented('calculateResultScores'),

      /**
       * Searches `term` in every requested property and sums the per-property scores per document.
       *
       * @throws Error when `tolerance` is not `0` or `exact` is `true` (neither is implemented).
       * @returns `[internalId, score]` pairs; an empty array when no property is searched.
       */
      search: function search<T extends AnyOrama>(
        index: PT15IndexStore,
        term: string,
        tokenizer: Tokenizer,
        language: string | undefined,
        propertiesToSearch: string[],
        exact: boolean,
        tolerance: number,
        boost: Partial<Record<OnlyStrings<FlattenSchemaProperty<T>[]>, number>>
      ): TokenScore[] {
        if (tolerance !== 0) {
          throw new Error('Tolerance not implemented yet')
        }
        if (exact === true) {
          throw new Error('Exact not implemented yet')
        }

        const maps: Map<InternalDocumentID, number>[] = []
        // Track the largest per-property result so the smaller ones are merged into it.
        let largestAt = -1
        let largestSize = -Infinity
        for (let i = 0; i < propertiesToSearch.length; i++) {
          const property = propertiesToSearch[i]
          const storage = index.indexes[property].node as PositionsStorage
          const boostPerProp = boost[property] ?? 1
          const map = searchString(tokenizer, term, storage, boostPerProp)
          if (map.size > largestSize) {
            largestSize = map.size
            largestAt = i
          }
          maps.push(map)
        }

        // Nothing was searched: there is no base map to merge into.
        if (maps.length === 0) {
          return []
        }
        if (maps.length === 1) {
          return Array.from(maps[0])
        }

        const base = maps[largestAt]
        for (let i = 0; i < maps.length; i++) {
          if (i === largestAt) {
            continue
          }

          for (const [id, score] of maps[i]) {
            const previous = base.get(id)
            base.set(id, previous === undefined ? score : previous + score)
          }
        }

        return Array.from(base)
      },

      /** Filtering is delegated unchanged to the default index. */
      searchByWhereClause: function searchByWhereClause<T extends AnyOrama>(
        index: AnyIndexStore,
        tokenizer: Tokenizer,
        filters: Partial<WhereCondition<T['schema']>>,
        language: string | undefined
      ): InternalDocumentID[] {
        return Index.searchByWhereClause(index as Index.Index, tokenizer, filters, language)
      },

      getSearchableProperties: function getSearchableProperties(index: PT15IndexStore): string[] {
        return index.searchableProperties
      },

      getSearchablePropertiesWithTypes: function (index: PT15IndexStore) {
        return index.searchablePropertiesWithTypes
      },

      /**
       * Restores an index from the two-element dump produced by `save`: element 0 is loaded
       * through the default index, element 1 holds the raw `[property, index]` entries of the
       * position storages. On a key collision the default-index entry wins.
       */
      load: function load<R = unknown>(sharedInternalDocumentStore: InternalDocumentIDStore, raw: R): PT15IndexStore {
        const dump1 = Index.load(sharedInternalDocumentStore, raw[0])
        const dump2 = raw[1]
        return {
          ...dump1,
          indexes: {
            ...Object.fromEntries(dump2),
            ...dump1.indexes
          } as PT15IndexStore['indexes']
        }
      },

      /**
       * Serializes the index as `[defaultDump, positionEntries]`: indexes whose `type` is not
       * `'Position'` go through the default index's `save`, the `'Position'` ones are stored as-is.
       */
      save: function save<R = unknown>(index: PT15IndexStore): R {
        const baseIndex = index as unknown as Index.Index
        const entries = Object.entries(index.indexes)
        const nonStringIndexes = entries.filter(([, { type }]) => type !== 'Position')
        const stringIndexes = entries.filter(([, { type }]) => type === 'Position')

        const dump1 = Index.save({
          ...baseIndex,
          indexes: Object.fromEntries(nonStringIndexes) as Index.Index['indexes']
        })

        return [dump1, stringIndexes] as unknown as R
      }
    }
  }
}
+ + t.equal(get_position(46, 50), 13) + + t.equal(get_position(47, 50), 14) + t.equal(get_position(48, 50), 14) + t.equal(get_position(49, 50), 14) +}) + +t.test('plugin-pt15', async t => { + const db = create({ + schema: { + name: 'string', + age: 'number', + isCool: 'boolean', + algo: 'string[]', + preferredNumbers: 'number[]', + } as const, + plugins: [pluginPT15()] + }) + + await insertMultiple(db, [ + { id: '1', name: 'The pen is on the table', age: 33, isCool: true, algo: ['algo1', 'algo2'], preferredNumbers: [20] }, + { id: '2', name: 'The can is near the table', age: 32, isCool: true, algo: ['algo3'], preferredNumbers: [55] }, + { id: '3', name: 'My table is cool', age: 22, isCool: false, algo: ['algo4'], preferredNumbers: [22] } + ]) + + const result = await search(db, { + term: 't' + }) + + t.equal(result.count, 3) + + const dump = await save(db) + const restored = JSON.parse(JSON.stringify(dump)) + + const db2 = create({ + schema: { + name: 'string', + age: 'number', + isCool: 'boolean', + algo: 'string[]', + preferredNumbers: 'number[]', + } as const, + plugins: [pluginPT15()] + }) + await load(db2, restored) + + const result2 = await search(db2, { + term: 't' + }) + t.equal(result2.count, 3) + + await remove(db2, '1') + + const result3 = await search(db2, { + term: 't' + }) + t.equal(result3.count, 2) +}) diff --git a/packages/plugin-pt15/tsconfig.json b/packages/plugin-pt15/tsconfig.json new file mode 100644 index 00000000..a60b3643 --- /dev/null +++ b/packages/plugin-pt15/tsconfig.json @@ -0,0 +1,20 @@ +{ + "compilerOptions": { + "allowJs": true, + "target": "ES2020", + "module": "NodeNext", + "outDir": "dist", + "jsx": "react", + "noImplicitAny": false, + "lib": ["ESNext", "DOM", "DOM.Iterable"], + "esModuleInterop": true, + "declaration": true, + "forceConsistentCasingInFileNames": true, + "strict": true, + "skipLibCheck": true, + "resolveJsonModule": true, + "sourceMap": true, + "moduleResolution": "nodenext" + }, + "include": ["src/*.ts", 
"src/**/*.ts", "src/*.tsx", "src/**/*.tsx"] +} diff --git a/packages/plugin-pt15/tsup.lib.js b/packages/plugin-pt15/tsup.lib.js new file mode 100644 index 00000000..69c0b9d7 --- /dev/null +++ b/packages/plugin-pt15/tsup.lib.js @@ -0,0 +1,17 @@ +import { defineConfig } from 'tsup' + +const entry = new URL('src/index.ts', import.meta.url).pathname +const outDir = new URL('dist', import.meta.url).pathname + +export default defineConfig({ + entry: [entry], + splitting: false, + sourcemap: true, + minify: true, + format: ['cjs', 'esm', 'iife'], + globalName: 'orama.plugin.pt15', + dts: true, + clean: true, + bundle: true, + outDir +}) diff --git a/packages/plugin-vitepress/src/index.ts b/packages/plugin-vitepress/src/index.ts index b273bf6b..1ea84bdf 100644 --- a/packages/plugin-vitepress/src/index.ts +++ b/packages/plugin-vitepress/src/index.ts @@ -3,7 +3,7 @@ import type { SiteConfig } from 'vitepress' import MarkdownIt from 'markdown-it' import { JSDOM } from 'jsdom' import { presets } from '@orama/searchbox' -import { create, insertMultiple } from '@orama/orama' +import { AnySchema, create, insertMultiple } from '@orama/orama' import { persist } from '@orama/plugin-data-persistence' import slugify from 'slugify' import { readFileSync } from 'fs' @@ -45,10 +45,9 @@ async function createOramaContentLoader(paths: string[], root: string, base: str .flatMap((data) => formatForOrama(data, base)) const db = await create({ - schema: presets.docs.schema + schema: presets.docs.schema as AnySchema }) - // @ts-expect-error - can't strongly type contents here await insertMultiple(db, contents) return persist(db, 'json', 'browser') diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 48033f0a..b4b9121a 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -652,6 +652,28 @@ importers: specifier: ^5.0.0 version: 5.6.2 + packages/plugin-pt15: + dependencies: + '@orama/orama': + specifier: workspace:* + version: link:../orama + devDependencies: + '@types/node': + specifier: ^20.9.0 + 
version: 20.16.9 + tap: + specifier: ^21.0.1 + version: 21.0.1(@swc/core@1.7.28(@swc/helpers@0.5.5))(@types/node@20.16.9)(@types/react@18.3.9)(react-dom@18.3.1(react@18.3.1))(react@18.3.1)(typescript@5.6.2) + tsup: + specifier: ^7.2.0 + version: 7.3.0(@swc/core@1.7.28(@swc/helpers@0.5.5))(postcss@8.4.47)(ts-node@10.9.2(@swc/core@1.7.28(@swc/helpers@0.5.5))(@types/node@20.16.9)(typescript@5.6.2))(typescript@5.6.2) + tsx: + specifier: ^4.19.1 + version: 4.19.1 + typescript: + specifier: ^5.0.0 + version: 5.6.2 + packages/plugin-secure-proxy: dependencies: '@oramacloud/client': diff --git a/turbo.json b/turbo.json index 33cc9fe8..2a074019 100644 --- a/turbo.json +++ b/turbo.json @@ -50,6 +50,10 @@ "dependsOn": ["@orama/orama#build"], "outputs": ["dist/**"] }, + "@orama/plugin-pt15#build": { + "dependsOn": ["@orama/orama#build"], + "outputs": ["dist/**"] + }, "@orama/tokenizers#build": { "dependsOn": ["@orama/orama#build"] },