Skip to content

Commit

Permalink
feat(dsa): adds numbers indexing support (#279)
Browse files Browse the repository at this point in the history
  • Loading branch information
micheleriva authored Feb 14, 2023
1 parent 0757bdb commit 63b27c3
Show file tree
Hide file tree
Showing 27 changed files with 1,133 additions and 82 deletions.
2 changes: 1 addition & 1 deletion src/algorithms.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { BM25Params, TokenScore } from "./types.js";
import type { BM25Params, TokenScore } from "./types/index.js";
import * as ERRORS from "./errors.js";

// Adapted from https://github.com/lovasoa/fast_array_intersect
Expand Down
4 changes: 4 additions & 0 deletions src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -78,4 +78,8 @@ export function INVALID_TOKENIZER_FUNCTION(): string {

/**
 * Error message describing the constraint on boost values.
 *
 * @returns The fixed, human-readable error text.
 */
export function INVALID_BOOST_VALUE(): string {
  const message = "Boost value must be a number greater than, or less than 0.";
  return message;
}

/**
 * Error message for a filter that declares more than one operation.
 *
 * @param found - The operation names that were found on the offending filter.
 * @returns A message containing how many operations were found and their names, comma-separated.
 */
export function INVALID_FILTER_OPERATION(found: string[]): string {
  const operationCount = found.length;
  const operationList = found.join(", ");

  return "You can only use one operation per filter. Found " + operationCount + ": " + operationList;
}
2 changes: 1 addition & 1 deletion src/facets.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { FacetSorting, FacetsSearch, PropertiesSchema, ResolveSchema, TokenScore } from "./types.js";
import type { FacetSorting, FacetsSearch, PropertiesSchema, ResolveSchema, TokenScore } from "./types/index.js";
import { getNested } from './utils.js';

export type FacetReturningValue = {
Expand Down
95 changes: 95 additions & 0 deletions src/filters.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
import type { WhereFilter, FilterOperation, PropertiesSchema, Lyra } from "./types/index.js";
import type { AVLNode } from "./trees/avl/node.js";
import { greaterThan, lessThan, rangeSearch, find } from "./trees/avl/index.js";
import { intersect } from './utils.js'
import * as ERRORS from "./errors.js";

/**
 * Resolves a `where` clause into the list of document IDs that satisfy every filter in it.
 *
 * Each key of `filters` names an indexed property and maps to an object holding exactly one
 * operation (`gt`, `gte`, `lt`, `lte`, `eq` or `between`). The operation is evaluated against
 * the index stored under `lyra.index[<property>]`, and the per-property results are combined
 * with an AND (set intersection).
 *
 * @param filters - The `where` clause: one single-operation object per property.
 * @param lyra - The database instance whose `index` holds the per-property trees.
 * @returns The IDs present in the result of every filter.
 * @throws Error with `INVALID_FILTER_OPERATION` when a filter declares more than one operation.
 */
export function getWhereFiltersIDs<S extends PropertiesSchema>(filters: WhereFilter<S>, lyra: Lyra<S>): string[] {
  const filterKeys = Object.keys(filters);

  // Pre-seed one (empty) bucket per filtered property. The key order produced here is kept
  // untouched below, so `intersect` receives the buckets in the same order as before.
  const filtersMap: Record<string, string[]> = filterKeys.reduce((acc, key) => ({
    [key]: [],
    ...acc,
  }), {});

  for (const param of filterKeys) {
    const operation = filters[param as keyof WhereFilter<S>]
    const operationKeys = Object.keys(operation as unknown as FilterOperation[])

    if (operationKeys.length > 1) {
      throw new Error(ERRORS.INVALID_FILTER_OPERATION(operationKeys))
    }

    const operationOpt = operationKeys[0] as FilterOperation
    const operationValue = operation[operationOpt as keyof typeof operation];

    // NOTE(review): assumes `param` refers to a property indexed with an AVL tree — confirm
    // against the schema handling in the caller.
    const node = lyra.index[param] as AVLNode<number, string[]>;

    // IDs matched by this property's single operation. Stays empty when no operation matches.
    let matchedIDs: string[] = [];

    switch (operationOpt) {
      case "gt": {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore - this is a bug in the typescript compiler
        matchedIDs = greaterThan(node, operationValue, false);
        break;
      }
      case "gte": {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore - this is a bug in the typescript compiler
        matchedIDs = greaterThan(node, operationValue, true);
        break;
      }
      case "lt": {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore - this is a bug in the typescript compiler
        matchedIDs = lessThan(node, operationValue, false);
        break;
      }
      case "lte": {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore - this is a bug in the typescript compiler
        matchedIDs = lessThan(node, operationValue, true);
        break;
      }
      case "eq": {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore - this is a bug in the typescript compiler
        const exactMatch = find(node, operationValue);
        // A lookup miss must mean "no documents", not a crash: spreading a nullish result
        // (which the previous non-null assertion did not prevent at runtime) throws a TypeError.
        matchedIDs = exactMatch ?? [];
        break;
      }
      case "between": {
        // eslint-disable-next-line @typescript-eslint/ban-ts-comment
        // @ts-ignore - this is a bug in the typescript compiler
        matchedIDs = rangeSearch(node, operationValue[0], operationValue[1]);
        break;
      }
    }

    // Copy instead of `push(...ids)`: spreading a very large array into call arguments can
    // exceed the engine's argument limit (RangeError). The copy also keeps the bucket from
    // aliasing an array owned by the index.
    filtersMap[param] = Array.from(matchedIDs);
  }

  // AND operation: calculate the intersection between all the IDs in filterMap
  const result = intersect(Object.values(filtersMap)) as unknown as string[];

  return result;
}

/**
 * Keeps only the scored lookups whose document ID is part of the filtered ID list.
 *
 * The order of `lookedUp` is preserved. Each allowed ID is emitted at most once: after its
 * first match it is consumed, so later entries with the same ID are dropped.
 *
 * @param filtered - Document IDs that passed the filters.
 * @param lookedUp - `[id, score]` pairs produced by the search.
 * @returns New `[id, score]` pairs for the entries of `lookedUp` whose ID is in `filtered`.
 */
export function intersectFilteredIDs(filtered: string[], lookedUp: [string, number][]): [string, number][] {
  const remaining = new Set<string>(filtered);
  const kept: [string, number][] = [];

  for (const entry of lookedUp) {
    const docID = entry[0];

    // `delete` reports whether the ID was still pending, and consumes it in the same step.
    if (remaining.delete(docID)) {
      kept.push([docID, entry[1]]);
    }
  }

  return kept;
}
2 changes: 1 addition & 1 deletion src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ export { remove } from "./methods/remove.js";
export { save } from "./methods/save.js";
export { search } from "./methods/search.js";

export * from "./types.js";
export * from "./types/index.js";
export type { Language } from "./tokenizer/languages.js";
export type { InsertConfig, InsertBatchConfig } from "./methods/insert.js";
export type { RetrievedDoc, SearchParams, SearchResult } from "./methods/search.js";
Expand Down
2 changes: 1 addition & 1 deletion src/methods/common.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import * as ERRORS from "../errors.js";
import type { Lyra, PropertiesSchema, ResolveSchema } from "../types.js";
import type { Lyra, PropertiesSchema, ResolveSchema } from "../types/index.js";
import type { SearchParams } from "./search.js";

export function assertDocSchema<S extends PropertiesSchema>(doc: ResolveSchema<S>, lyraSchema: PropertiesSchema) {
Expand Down
17 changes: 13 additions & 4 deletions src/methods/create.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import type { Configuration, Lyra, PropertiesSchema } from "../types.js";
import type { Configuration, Lyra, PropertiesSchema } from "../types/index.js";
import { defaultTokenizerConfig, Language } from "../tokenizer/index.js";
import * as ERRORS from "../errors.js";
import { create as createNode } from "../radix-tree/node.js";
import { create as createNode } from "../trees/radix/node.js";
import { create as createAVLNode } from "../trees/avl/index.js";
import { validateHooks } from "./hooks.js";
import { intersectTokenScores } from "../algorithms.js";

Expand Down Expand Up @@ -65,8 +66,16 @@ function buildIndex<S extends PropertiesSchema>(lyra: Lyra<S>, schema: S, prefix
if (isNested) {
buildIndex(lyra, schema[prop] as S, `${propName}.`);
} else {
lyra.index[propName] = createNode();
lyra.avgFieldLength[propName] = 0;
if (schema[prop] === "string") {
lyra.index[propName] = createNode();
lyra.avgFieldLength[propName] = 0;
continue;
}

if (schema[prop] === "number") {
lyra.index[propName] = createAVLNode<number, string[]>(0, []);
continue;
}
}
}
}
2 changes: 1 addition & 1 deletion src/methods/hooks.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import * as ERRORS from "../errors.js";
import type { Lyra, PropertiesSchema } from "../types.js";
import type { Lyra, PropertiesSchema } from "../types/index.js";

export interface AfterInsertHook {
<S extends PropertiesSchema = PropertiesSchema>(this: Lyra<S>, id: string): Promise<void> | void;
Expand Down
18 changes: 13 additions & 5 deletions src/methods/insert.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
import { Language, TokenizerConfigExec } from "../tokenizer/index.js";
import * as ERRORS from "../errors.js";
import type { Lyra, PropertiesSchema, ResolveSchema } from "../types/index.js";
import type { Language, TokenizerConfigExec } from "../tokenizer/index.js";
import type { AVLNode } from "../../src/trees/avl/node.js";
import type { RadixNode } from "../trees/radix/node.js";
import { trackInsertion } from "../insertion-checker.js";
import { insert as radixInsert } from "../radix-tree/radix.js";
import type { Lyra, PropertiesSchema, ResolveSchema } from "../types.js";
import { insert as radixInsert } from "../trees/radix/index.js";
import { insert as AVLInsert } from "../trees/avl/index.js";
import { uniqueId } from "../utils.js";
import { assertDocSchema } from "./common.js";
import { hookRunner } from "./hooks.js";
import * as ERRORS from "../errors.js";

export type InsertConfig<S extends PropertiesSchema> = {
language?: Language;
Expand Down Expand Up @@ -165,6 +168,11 @@ function recursiveradixInsertion<S extends PropertiesSchema>(
);
}


if (typeof doc[key] === "number" && key in schema && !isSchemaNested) {
AVLInsert(lyra.index[propName] as AVLNode<number, string[]>, doc[key] as number, [id]);
}

if (typeof doc[key] === "string" && key in schema && !isSchemaNested) {
// Use propName here because if doc is a nested object
// We will get the wrong index
Expand Down Expand Up @@ -210,7 +218,7 @@ function recursiveradixInsertion<S extends PropertiesSchema>(
// increase a token counter that may not yet exist
tokenOccurrencies[propName][token] = (tokenOccurrencies[propName][token] ?? 0) + 1;

radixInsert(requestedTrie, token, id);
radixInsert(requestedTrie as RadixNode, token, id);
}
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/methods/load.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import * as ERRORS from "../errors.js";
import type { Data, Lyra, PropertiesSchema } from "../types.js";
import type { Data, Lyra, PropertiesSchema } from "../types/index.js";

export async function load<S extends PropertiesSchema>(
lyra: Lyra<S>,
Expand Down
32 changes: 29 additions & 3 deletions src/methods/remove.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
import type { Lyra, PropertiesSchema, ResolveSchema } from "../types.js";
import type { RadixNode } from "../trees/radix/node.js";
import type { Lyra, PropertiesSchema, ResolveSchema } from "../types/index.js";
import { defaultTokenizerConfig } from "../tokenizer/index.js";
import { removeDocumentByWord } from "../radix-tree/radix.js";
import { removeDocumentByWord } from "../trees/radix/index.js";
import { flattenObject, getNested } from "../utils.js";
import { getNodeByKey as getAVLNodeByKey } from "../trees/avl/index.js";
import * as ERRORS from "../errors.js";
import { AVLNode } from "../trees/avl/node.js";

/**
* Removes a document from a database.
Expand Down Expand Up @@ -48,15 +52,37 @@ export async function remove<S extends PropertiesSchema>(lyra: Lyra<S>, docID: s
const token = tokens[k];
delete lyra.frequencies[key][docID];
lyra.tokenOccurrencies[key][token]--;
if (token && !removeDocumentByWord(idx, token, docID)) {
if (token && !removeDocumentByWord(idx as RadixNode, token, docID)) {
throw new Error(ERRORS.CANT_DELETE_DOCUMENT(docID, key, token));
}
}
}
}

removeNumericValue(lyra, docID);

lyra.docs[docID] = undefined;
lyra.docsCount--;

return true;
}

/**
 * Detaches a document ID from the numeric indexes that reference it.
 *
 * The stored document is flattened, the properties whose schema type is `"number"` are
 * selected, and for each of them the index node keyed by the document's value (if one is
 * found) gets its ID list replaced by a copy without `docID`.
 *
 * @param lyra - The database instance holding the documents, schema and indexes.
 * @param docID - ID of the document being removed.
 */
function removeNumericValue<S extends PropertiesSchema>(lyra: Lyra<S>, docID: string) {
  const storedDoc = lyra.docs[docID] as Record<string, ResolveSchema<S>>;
  const flattened = flattenObject(storedDoc);

  // First pass: collect the flattened properties that the schema declares as "number".
  const numericEntries: [string, number][] = [];
  for (const path of Object.keys(flattened)) {
    if (getNested(lyra.schema, path) !== "number") {
      continue;
    }
    numericEntries.push([path, (flattened as any)[path]]);
  }

  // Second pass: drop the document ID from the node matching each numeric value.
  for (const [path, numericValue] of numericEntries) {
    const tree = lyra.index[path] as AVLNode<number, string[]>;
    const entry = getAVLNodeByKey(tree, numericValue);

    if (!entry) {
      continue;
    }

    entry.value = entry.value.filter((storedID) => storedID !== docID);
  }
}
2 changes: 1 addition & 1 deletion src/methods/save.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import type { Data, Lyra, PropertiesSchema } from "../types.js";
import type { Data, Lyra, PropertiesSchema } from "../types/index.js";

export async function save<S extends PropertiesSchema>(lyra: Lyra<S>): Promise<Data<S>> {
return {
Expand Down
47 changes: 42 additions & 5 deletions src/methods/search.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,13 @@
import type { Lyra, PropertiesSchema, ResolveSchema, SearchProperties, TokenMap, TokenScore, BM25Params, BM25OptionalParams, PropertiesBoost, FacetsSearch } from "../types.js";
import type { RadixNode } from "../trees/radix/node.js";
import type { Lyra, PropertiesSchema, ResolveSchema, SearchProperties, TokenMap, TokenScore, BM25Params, BM25OptionalParams, PropertiesBoost, FacetsSearch } from "../types/index.js";
import type { WhereFilter } from "../types/filters.js";
import { defaultTokenizerConfig, Language } from "../tokenizer/index.js";
import { find as radixFind } from "../radix-tree/radix.js";
import { find as radixFind } from "../trees/radix/index.js";
import { formatNanoseconds, getNanosecondsTime, sortTokenScorePredicate } from "../utils.js";
import { getIndices } from "./common.js";
import { prioritizeTokenScores, BM25 } from "../algorithms.js";
import { FacetReturningValue, getFacets } from "../facets.js";
import { getWhereFiltersIDs, intersectFilteredIDs } from "../filters.js";

type IndexMap = Record<string, TokenMap>;

Expand Down Expand Up @@ -104,6 +107,25 @@ export type SearchParams<S extends PropertiesSchema> = {
* });
*/
facets?: FacetsSearch<S>;

/**
* Filter the search results.
*
* @example
* // Search for documents that contain 'Headphones' in the 'description' and 'title' fields and
* // have a price less than 100.
*
* const result = await search(db, {
* term: 'Headphones',
* properties: ['description', 'title'],
* where: {
* price: {
* lt: 100
* }
* }
* });
*/
where?: WhereFilter<S>;
};

export type SearchResult<S extends PropertiesSchema> = {
Expand Down Expand Up @@ -165,6 +187,15 @@ export async function search<S extends PropertiesSchema>(
const N = lyra.docsCount;

const timeStart = getNanosecondsTime();

// If filters are enabled, we need to get the IDs of the documents that match the filters.
const hasFilters = Object.keys(params.where ?? {}).length > 0;
let whereFiltersIDs: string[] = [];

if (hasFilters) {
whereFiltersIDs = getWhereFiltersIDs(params.where!, lyra);
}

// uniqueDocsIDs contains unique document IDs for all the tokens in all the indices.
const uniqueDocsIDs: Record<string, number> = {};

Expand Down Expand Up @@ -270,7 +301,13 @@ export async function search<S extends PropertiesSchema>(
}

// Get unique doc IDs from uniqueDocsIDs map, sorted by value.
const uniqueDocsArray = Object.entries(uniqueDocsIDs).sort(sortTokenScorePredicate);
let uniqueDocsArray = Object.entries(uniqueDocsIDs).sort(sortTokenScorePredicate);

// If filters are enabled, we need to remove the IDs of the documents that don't match the filters.
if (hasFilters) {
uniqueDocsArray = intersectFilteredIDs(whereFiltersIDs, uniqueDocsArray);
}

const resultIDs: Set<string> = new Set();
// Populate facets if needed
const facets = shouldCalculateFacets ? getFacets(lyra.schema, lyra.docs, uniqueDocsArray, params.facets!) : {};
Expand Down Expand Up @@ -306,7 +343,7 @@ export async function search<S extends PropertiesSchema>(
const searchResult: SearchResult<S> = {
elapsed,
hits: results.filter(Boolean),
count: Object.keys(uniqueDocsIDs).length,
count: uniqueDocsArray.length,
};

if (shouldCalculateFacets) {
Expand All @@ -321,7 +358,7 @@ function getDocumentIDsFromSearch<S extends PropertiesSchema>(
params: SearchParams<S> & { index: string },
): string[] {
const idx = lyra.index[params.index];
const searchResult = radixFind(idx, {
const searchResult = radixFind(idx as RadixNode, {
term: params.term,
exact: params.exact,
tolerance: params.tolerance,
Expand Down
Loading

0 comments on commit 63b27c3

Please sign in to comment.