From cd644de03963b3b8b371d6d98484128a16f5346d Mon Sep 17 00:00:00 2001 From: Pau Ramon Revilla Date: Mon, 17 Jun 2024 11:20:50 +0200 Subject: [PATCH] Several optimizations I've been profiling orama, since it's taking 4s to ingest around 4k documents on my project and I would like to bring this down. I've noticed several things: It's ingesting a large number of empty strings, which wastes CPU time, and it's also recalculating IDs redundantly. This commit tries to address these two issues. Additionally, I've noticed that due to the async APIs, the code is spending most of its time waiting for "run microtasks". I have no idea if it would be possible to compile those away, because right now it makes the default implementation much worse (performance-wise) in order for people to be able to provide their own storage solution. Lastly, I've also noticed that providing the ID of the document causes the ID to be stored as part of the document properties. I thought it would only be used to replace the default orama ID. I will fix this myself in userland by using `getDocumentProperties`, but perhaps it would be good to either change this default or document it. --- packages/orama/src/components/index.ts | 7 ++----- packages/orama/src/methods/insert.ts | 5 +++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/packages/orama/src/components/index.ts b/packages/orama/src/components/index.ts index 509994c56..544764f8c 100644 --- a/packages/orama/src/components/index.ts +++ b/packages/orama/src/components/index.ts @@ -114,12 +114,10 @@ export interface Index extends AnyIndexStore { export async function insertDocumentScoreParameters( index: Index, prop: string, - id: DocumentID, + internalId: DocumentID, tokens: string[], docsCount: number ): Promise { - const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id) - index.avgFieldLength[prop] = ((index.avgFieldLength[prop] ?? 
0) * (docsCount - 1) + tokens.length) / docsCount index.fieldLengths[prop][internalId] = tokens.length index.frequencies[prop][internalId] = {} @@ -128,7 +126,7 @@ export async function insertDocumentScoreParameters( export async function insertTokenScoreParameters( index: Index, prop: string, - id: DocumentID, + internalId: DocumentID, tokens: string[], token: string ): Promise { @@ -140,7 +138,6 @@ export async function insertTokenScoreParameters( } } - const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id) const tf = tokenFrequency / tokens.length index.frequencies[prop][internalId]![token] = tf diff --git a/packages/orama/src/methods/insert.ts b/packages/orama/src/methods/insert.ts index 446893935..5a399f4f6 100644 --- a/packages/orama/src/methods/insert.ts +++ b/packages/orama/src/methods/insert.ts @@ -87,10 +87,15 @@ async function innerInsert( for (const prop of indexableProperties) { const value = indexableValues[prop] + if (typeof value === 'undefined') { continue } + if (typeof value === 'string' && value.length === 0) { + continue + } + const expectedType = indexablePropertiesWithTypes[prop] await orama.index.beforeInsert?.( orama.data.index,