Skip to content

Commit

Permalink
Better normalization cache
Browse files Browse the repository at this point in the history
The key seems to be too specific, especially by using the prop, which
basically makes it redundant to cache tokens that are found in different
props. The goal of that cache seems to be to trade memory for time, but
right now it seems to be storing equal computations under different keys,
which is simply inefficient. The only thing the prop is needed for is
the `stemmerSkipProperties`.
  • Loading branch information
masylum committed Jun 17, 2024
1 parent 19df111 commit 6303b59
Show file tree
Hide file tree
Showing 3 changed files with 7 additions and 11 deletions.
4 changes: 1 addition & 3 deletions packages/orama/src/components/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -156,11 +156,9 @@ export async function insertTokenScoreParameters(
export async function removeDocumentScoreParameters(
index: Index,
prop: string,
id: DocumentID,
internalId: DocumentID,
docsCount: number
): Promise<void> {
const internalId = getInternalDocumentId(index.sharedInternalDocumentStore, id)

index.avgFieldLength[prop] =
(index.avgFieldLength[prop] * docsCount - index.fieldLengths[prop][internalId]!) / (docsCount - 1)
index.fieldLengths[prop][internalId] = undefined
Expand Down
12 changes: 5 additions & 7 deletions packages/orama/src/components/tokenizer/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,15 +16,13 @@ export interface DefaultTokenizer extends Tokenizer {
}

export function normalizeToken(this: DefaultTokenizer, prop: string, token: string): string {
const key = `${this.language}:${prop}:${token}`

if (this.normalizationCache.has(key)) {
return this.normalizationCache.get(key)!
if (this.normalizationCache.has(token)) {
return this.normalizationCache.get(token)!
}

// Remove stopwords if enabled
if (this.stopWords?.includes(token)) {
this.normalizationCache.set(key, '')
if (this.stopWords?.has(token)) {
this.normalizationCache.set(token, '')
return ''
}

Expand All @@ -34,7 +32,7 @@ export function normalizeToken(this: DefaultTokenizer, prop: string, token: stri
}

token = replaceDiacritics(token)
this.normalizationCache.set(key, token)
this.normalizationCache.set(token, token)
return token
}

Expand Down
2 changes: 1 addition & 1 deletion packages/orama/tests/search.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -725,7 +725,7 @@ t.test('search method', (t) => {
t.test('with custom tokenizer', async (t) => {
t.plan(4)

const normalizationCache = new Map([['english:foo:dogs', 'Dogs']])
const normalizationCache = new Map([['dogs', 'Dogs']])

const db = await create({
schema: {
Expand Down

0 comments on commit 6303b59

Please sign in to comment.