Skip to content

Commit

Permalink
Feat/remove natural (#60)
Browse files Browse the repository at this point in the history
* refactor(lyra): removes natural as dependency
  • Loading branch information
micheleriva authored Jul 30, 2022
1 parent 0c142d0 commit d472a46
Show file tree
Hide file tree
Showing 12 changed files with 116 additions and 395 deletions.
3 changes: 2 additions & 1 deletion .eslintignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,4 +11,5 @@ jest.config.js
*.html
*.svg
*.dpack
get-imdb-dataset.mjs
get-imdb-dataset.mjs
**/tap-snapshots/**
4 changes: 1 addition & 3 deletions packages/lyra/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,9 @@
"module": "./dist/esm/lyra.js",
"types": "./dist/esm/lyra.d.ts",
"dependencies": {
"fastq": "^1.13.0",
"natural": "^5.2.3"
"fastq": "^1.13.0"
},
"devDependencies": {
"@types/natural": "^5.1.1",
"@types/node": "^18.6.2",
"@types/tap": "^15.0.7",
"c8": "^7.12.0",
Expand Down
2 changes: 1 addition & 1 deletion packages/lyra/src/errors.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { SUPPORTED_LANGUAGES } from "./stemmer";
import { SUPPORTED_LANGUAGES } from "./tokenizer/languages";

function formatJSON(input: object) {
return JSON.stringify(input, null, 2);
Expand Down
10 changes: 3 additions & 7 deletions packages/lyra/src/lyra.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import * as ERRORS from "./errors";
import toFastProperties, { insertWithFastProperties } from "./fast-properties";
import { tokenize } from "./tokenizer";
import { getNanosecondsTime, uniqueId } from "./utils";
import { Language, SUPPORTED_LANGUAGES } from "./stemmer";
import { Language, SUPPORTED_LANGUAGES } from "./tokenizer/languages";
import type { ResolveSchema, SearchProperties } from "./types";
import { create as createNode, Node } from "./prefix-tree/node";
import { find as trieFind, insert as trieInsert, removeDocumentByWord, Nodes } from "./prefix-tree/trie";
Expand All @@ -19,7 +19,6 @@ export type PropertiesSchema = {
export type LyraProperties<T extends PropertiesSchema> = {
schema: T;
defaultLanguage?: Language;
stemming?: boolean;
edge?: boolean;
};

Expand All @@ -36,7 +35,6 @@ export type SearchParams<T extends PropertiesSchema> = {

export type InsertConfig = {
language: Language;
stemming: boolean;
};

export type LyraData<T extends PropertiesSchema> = {
Expand Down Expand Up @@ -69,7 +67,6 @@ export interface Lyra<T extends PropertiesSchema> {
docs: LyraDocs<T>;
nodes: Nodes;
index: LyraIndex;
enableStemming: boolean;
edge: boolean;
queue?: fastq.queue<QueueDocParams<T>, void>;
}
Expand Down Expand Up @@ -160,7 +157,7 @@ function _insert<T extends PropertiesSchema>(
// Use propName here because if doc is a nested object
// We will get the wrong index
const requestedTrie = index[propName];
const tokens = tokenize(doc[key] as string, config.language, config.stemming);
const tokens = tokenize(doc[key] as string, config.language);

for (const token of tokens) {
trieInsert(nodes, requestedTrie, token, id);
Expand Down Expand Up @@ -212,7 +209,6 @@ export function create<T extends PropertiesSchema>(properties: LyraProperties<T>
docs: {},
nodes: {},
index: {},
enableStemming: properties.stemming ?? true,
edge: properties.edge ?? false,
};

Expand All @@ -227,7 +223,7 @@ export function insert<T extends PropertiesSchema>(
doc: ResolveSchema<T>,
config?: InsertConfig,
): { id: string } {
config = { language: lyra.defaultLanguage, stemming: lyra.enableStemming, ...config };
config = { language: lyra.defaultLanguage, ...config };
const id = uniqueId();

if (!SUPPORTED_LANGUAGES.includes(config.language)) {
Expand Down
65 changes: 0 additions & 65 deletions packages/lyra/src/stemmer.ts

This file was deleted.

14 changes: 0 additions & 14 deletions packages/lyra/src/tokenizer.ts

This file was deleted.

29 changes: 29 additions & 0 deletions packages/lyra/src/tokenizer/index.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import { Language } from "./languages";

/**
 * Per-language separator patterns used to split raw text into tokens.
 *
 * Each pattern is a negated character class: any run of characters that is
 * NOT considered part of a word in that language acts as a separator.
 * Every rule carries a `+` quantifier so that a run of adjacent separators
 * (for example ", ") is consumed as a single split point instead of yielding
 * empty strings between the individual separator characters.
 *
 * NOTE(review): the portugese, russian and spanish classes do not include
 * digits, so numbers are treated as separators there — confirm that this is
 * intended.
 */
const splitRegex: Record<Language, RegExp> = {
  dutch: /[^a-z0-9_'-]+/gim,
  english: /[^a-z0-9_'-]+/gim,
  french: /[^a-z0-9äâàéèëêïîöôùüûœç-]+/gim,
  italian: /[^a-z0-9_'-]+/gim,
  norwegian: /[^a-z0-9_æøåÆØÅäÄöÖüÜ]+/gim,
  // Quantifier added: without it every single separator character was its own
  // split point, producing "" tokens between consecutive separators.
  portugese: /[^a-zà-úÀ-Ú]+/gim,
  russian: /[^a-zа-яА-ЯёЁ]+/gim,
  spanish: /[^a-zA-Zá-úÁ-ÚñÑüÜ]+/gim,
  swedish: /[^a-z0-9_åÅäÄöÖüÜ-]+/gim,
};

/**
 * Splits `input` into unique, lower-cased tokens using the separator rule of
 * the given language.
 *
 * @param input - Raw text to tokenize.
 * @param language - Language whose split rule is applied; defaults to "english".
 * @returns The distinct non-empty tokens, in order of first appearance.
 */
export function tokenize(input: string, language: Language = "english"): string[] {
  const splitRule = splitRegex[language];
  const tokens = trim(input.toLowerCase().split(splitRule));
  // `trim` only strips empty strings at the two ends of the array. A split
  // rule that matches one separator character at a time still leaves ""
  // entries between adjacent separators, so drop every empty token before
  // de-duplicating; an empty token must never be handed to the index.
  const nonEmptyTokens = tokens.filter((token) => token !== "");
  return Array.from(new Set(nonEmptyTokens));
}

/**
 * Removes empty-string entries from both ends of `text`.
 *
 * The array is modified in place and the same array instance is returned.
 * Empty strings located between non-empty entries are left untouched.
 *
 * @param text - List of string fragments to strip.
 * @returns `text` itself, without leading or trailing "" entries.
 */
function trim(text: string[]): string[] {
  // Find where the trailing run of "" entries begins and cut it off.
  let keepUntil = text.length;
  while (keepUntil > 0 && text[keepUntil - 1] === "") {
    keepUntil -= 1;
  }
  text.splice(keepUntil);

  // Count the leading run of "" entries and remove it in one go.
  let leadingEmpty = 0;
  while (leadingEmpty < text.length && text[leadingEmpty] === "") {
    leadingEmpty += 1;
  }
  text.splice(0, leadingEmpty);

  return text;
}
13 changes: 13 additions & 0 deletions packages/lyra/src/tokenizer/languages.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
/**
 * Union of the string literals listed in `SUPPORTED_LANGUAGES`.
 * Derived from the array's element type, so adding an entry to the array
 * automatically extends this type.
 */
export type Language = typeof SUPPORTED_LANGUAGES[number];

/**
 * Names of the languages this package recognises.
 * Declared `as const` so every entry keeps its literal type (required for the
 * `Language` union above) and the tuple is read-only.
 */
export const SUPPORTED_LANGUAGES = [
"dutch",
"english",
"french",
"italian",
"norwegian",
// NOTE(review): spelled "portugese" (sic). The literal is part of the
// `Language` type, so correcting it would be a breaking change.
"portugese",
"russian",
"spanish",
"swedish",
] as const;
Loading

0 comments on commit d472a46

Please sign in to comment.