Add built-in embedding engine into AnythingLLM (#411)

* Implement use of native embedder (all-MiniLM-L6-v2)
* stop showing prisma queries during dev
* Add native embedder as an available embedder selection
* wrap model loader in try/catch
* print progress on download
* Update to progress output for embedder
* move embedder selection options to component
* forgot import
* add Data privacy alert updates for local embedder
Mintplex-Labs · Dec 6, 2023 · 88cdd8c · 88cdd8c
1 parent 48764d6
commit 88cdd8c
Show file tree

Hide file tree

Showing 14 changed files with 517 additions and 27 deletions.
diff --git a/frontend/src/components/EmbeddingSelection/NativeEmbeddingOptions/index.jsx b/frontend/src/components/EmbeddingSelection/NativeEmbeddingOptions/index.jsx
@@ -0,0 +1,10 @@
+/**
+ * Options panel rendered when the "native" embedding engine is selected.
+ * It shows a static notice only; the component takes no props and holds no
+ * state.
+ */
+export default function NativeEmbeddingOptions() {
+  const notice = (
+    <p className="text-sm font-base text-white text-opacity-60">
+      There is no set up required when using AnythingLLM's native embedding
+      engine.
+    </p>
+  );
+
+  return (
+    <div className="w-full h-20 items-center justify-center flex">{notice}</div>
+  );
+}
diff --git a/frontend/src/media/logo/anything-llm-icon.png b/frontend/src/media/logo/anything-llm-icon.png
diff --git a/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx b/frontend/src/pages/GeneralSettings/EmbeddingPreference/index.jsx
@@ -5,6 +5,7 @@ import Sidebar, {
 import { isMobile } from "react-device-detect";
 import System from "../../../models/system";
 import showToast from "../../../utils/toast";
+import AnythingLLMIcon from "../../../media/logo/anything-llm-icon.png";
 import OpenAiLogo from "../../../media/llmprovider/openai.png";
 import AzureOpenAiLogo from "../../../media/llmprovider/azure.png";
 import LocalAiLogo from "../../../media/llmprovider/localai.png";
@@ -14,6 +15,7 @@ import ChangeWarningModal from "../../../components/ChangeWarning";
 import OpenAiOptions from "../../../components/EmbeddingSelection/OpenAiOptions";
 import AzureAiOptions from "../../../components/EmbeddingSelection/AzureAiOptions";
 import LocalAiOptions from "../../../components/EmbeddingSelection/LocalAiOptions";
+import NativeEmbeddingOptions from "../../../components/EmbeddingSelection/NativeEmbeddingOptions";
 
 export default function GeneralEmbeddingPreference() {
   const [saving, setSaving] = useState(false);
@@ -138,6 +140,14 @@ export default function GeneralEmbeddingPreference() {
                     name="EmbeddingEngine"
                     value={embeddingChoice}
                   />
+                  <LLMProviderOption
+                    name="AnythingLLM Embedder"
+                    value="native"
+                    description="Use the built-in embedding engine for AnythingLLM. Zero setup!"
+                    checked={embeddingChoice === "native"}
+                    image={AnythingLLMIcon}
+                    onClick={updateChoice}
+                  />
                   <LLMProviderOption
                     name="OpenAI"
                     value="openai"
@@ -167,6 +177,7 @@ export default function GeneralEmbeddingPreference() {
                   />
                 </div>
                 <div className="mt-10 flex flex-wrap gap-4 max-w-[800px]">
+                  {embeddingChoice === "native" && <NativeEmbeddingOptions />}
                   {embeddingChoice === "openai" && (
                     <OpenAiOptions settings={settings} />
                   )}

diff --git a/frontend/src/pages/OnboardingFlow/OnboardingModal/Steps/DataHandling/index.jsx b/frontend/src/pages/OnboardingFlow/OnboardingModal/Steps/DataHandling/index.jsx
@@ -1,5 +1,6 @@
 import React, { memo, useEffect, useState } from "react";
 import System from "../../../../../models/system";
+import AnythingLLMIcon from "../../../../../media/logo/anything-llm-icon.png";
 import OpenAiLogo from "../../../../../media/llmprovider/openai.png";
 import AzureOpenAiLogo from "../../../../../media/llmprovider/azure.png";
 import AnthropicLogo from "../../../../../media/llmprovider/anthropic.png";
@@ -57,67 +58,70 @@ const VECTOR_DB_PRIVACY = {
   chroma: {
     name: "Chroma",
     description: [
-      "Your embedded text not visible outside of your Chroma instance",
+      "Your vectors and document text are stored on your Chroma instance",
       "Access to your instance is managed by you",
     ],
     logo: ChromaLogo,
   },
   pinecone: {
     name: "Pinecone",
     description: [
-      "Your embedded text and vectors are visible to Pinecone, but is not accessed",
-      "They manage your data and access to their servers",
+      "Your vectors and document text are stored on Pinecone's servers",
+      "Access to your data is managed by Pinecone",
     ],
     logo: PineconeLogo,
   },
   qdrant: {
     name: "Qdrant",
     description: [
-      "Your embedded text is visible to Qdrant if using a hosted instance",
-      "Your embedded text is not visible to Qdrant if using a self-hosted instance",
-      "Your data is stored on your Qdrant instance",
+      "Your vectors and document text are stored on your Qdrant instance (cloud or self-hosted)",
     ],
     logo: QDrantLogo,
   },
   weaviate: {
     name: "Weaviate",
     description: [
-      "Your embedded text is visible to Weaviate, if using a hosted instance",
-      "Your embedded text is not visible to Weaviate, if using a self-hosted instance",
-      "Your data is stored on your Weaviate instance",
+      "Your vectors and document text are stored on your Weaviate instance (cloud or self-hosted)",
     ],
     logo: WeaviateLogo,
   },
   lancedb: {
     name: "LanceDB",
     description: [
-      "Your embedded text and vectors are only accessible by this AnythingLLM instance",
+      "Your vectors and document text are stored privately on this instance of AnythingLLM",
     ],
     logo: LanceDbLogo,
   },
 };
 
 const EMBEDDING_ENGINE_PRIVACY = {
+  native: {
+    name: "AnythingLLM Embedder",
+    description: [
+      "Your document text is embedded privately on this instance of AnythingLLM",
+    ],
+    logo: AnythingLLMIcon,
+  },
   openai: {
     name: "OpenAI",
     description: [
-      "Your documents are visible to OpenAI",
+      "Your document text is sent to OpenAI servers",
       "Your documents are not used for training",
     ],
     logo: OpenAiLogo,
   },
   azure: {
     name: "Azure OpenAI",
     description: [
-      "Your documents are not visible to OpenAI or Microsoft",
-      "Your documents not used for training",
+      "Your document text is sent to your Microsoft Azure service",
+      "Your documents are not used for training",
     ],
     logo: AzureOpenAiLogo,
   },
   localai: {
     name: "LocalAI",
     description: [
-      "Your documents are only accessible on the server running LocalAI",
+      "Your document text is embedded privately on the server running LocalAI",
     ],
     logo: LocalAiLogo,
   },

diff --git a/frontend/src/pages/OnboardingFlow/OnboardingModal/Steps/EmbeddingSelection/index.jsx b/frontend/src/pages/OnboardingFlow/OnboardingModal/Steps/EmbeddingSelection/index.jsx
@@ -1,4 +1,5 @@
 import React, { memo, useEffect, useState } from "react";
+import AnythingLLMIcon from "../../../../../media/logo/anything-llm-icon.png";
 import OpenAiLogo from "../../../../../media/llmprovider/openai.png";
 import AzureOpenAiLogo from "../../../../../media/llmprovider/azure.png";
 import LocalAiLogo from "../../../../../media/llmprovider/localai.png";
@@ -8,9 +9,10 @@ import LLMProviderOption from "../../../../../components/LLMSelection/LLMProvide
 import OpenAiOptions from "../../../../../components/EmbeddingSelection/OpenAiOptions";
 import AzureAiOptions from "../../../../../components/EmbeddingSelection/AzureAiOptions";
 import LocalAiOptions from "../../../../../components/EmbeddingSelection/LocalAiOptions";
+import NativeEmbeddingOptions from "../../../../../components/EmbeddingSelection/NativeEmbeddingOptions";
 
 function EmbeddingSelection({ nextStep, prevStep, currentStep }) {
-  const [embeddingChoice, setEmbeddingChoice] = useState("openai");
+  const [embeddingChoice, setEmbeddingChoice] = useState("native");
   const [settings, setSettings] = useState(null);
   const [loading, setLoading] = useState(true);
   const updateChoice = (selection) => {
@@ -21,7 +23,7 @@ function EmbeddingSelection({ nextStep, prevStep, currentStep }) {
     async function fetchKeys() {
       const _settings = await System.keys();
       setSettings(_settings);
-      setEmbeddingChoice(_settings?.EmbeddingEngine || "openai");
+      setEmbeddingChoice(_settings?.EmbeddingEngine || "native");
       setLoading(false);
     }
     fetchKeys();
@@ -62,6 +64,14 @@ function EmbeddingSelection({ nextStep, prevStep, currentStep }) {
               name="EmbeddingEngine"
               value={embeddingChoice}
             />
+            <LLMProviderOption
+              name="AnythingLLM Embedder"
+              value="native"
+              description="Use the built-in embedding engine for AnythingLLM. Zero setup!"
+              checked={embeddingChoice === "native"}
+              image={AnythingLLMIcon}
+              onClick={updateChoice}
+            />
             <LLMProviderOption
               name="OpenAI"
               value="openai"
@@ -91,6 +101,7 @@ function EmbeddingSelection({ nextStep, prevStep, currentStep }) {
             />
           </div>
           <div className="mt-4 flex flex-wrap gap-4 max-w-[752px]">
+            {embeddingChoice === "native" && <NativeEmbeddingOptions />}
             {embeddingChoice === "openai" && (
               <OpenAiOptions settings={settings} />
             )}

diff --git a/server/models/systemSettings.js b/server/models/systemSettings.js
@@ -15,7 +15,7 @@ const SystemSettings = {
   ],
   currentSettings: async function () {
     const llmProvider = process.env.LLM_PROVIDER || "openai";
-    const vectorDB = process.env.VECTOR_DB || "pinecone";
+    const vectorDB = process.env.VECTOR_DB || "lancedb";
     return {
       CanDebug: !!!process.env.NO_DEBUG,
       RequiresAuth: !!process.env.AUTH_TOKEN,

diff --git a/server/package.json b/server/package.json
@@ -26,6 +26,7 @@
     "@pinecone-database/pinecone": "^0.1.6",
     "@prisma/client": "5.3.0",
     "@qdrant/js-client-rest": "^1.4.0",
+    "@xenova/transformers": "^2.10.0",
     "archiver": "^5.3.1",
     "bcrypt": "^5.1.0",
     "body-parser": "^1.20.2",

diff --git a/server/storage/models/.gitignore b/server/storage/models/.gitignore
@@ -0,0 +1,2 @@
+Xenova
+downloaded/*
diff --git a/server/storage/models/README.md b/server/storage/models/README.md
@@ -0,0 +1,13 @@
+## Native models used by AnythingLLM
+
+This folder is specifically created as a local cache and storage folder that is used for native models that can run on a CPU.
+
+Currently, AnythingLLM uses this folder for the following parts of the application.
+
+### Embedding
+When your embedding engine preference is `native` we will use the ONNX **all-MiniLM-L6-v2** model built by [Xenova on HuggingFace.co](https://huggingface.co/Xenova/all-MiniLM-L6-v2). This model is a quantized and WASM version of the popular [all-MiniLM-L6-v2](https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2) which produces a 384-dimension vector.
+
+If you are using the `native` embedding engine, your vector database should be configured to accept 384-dimension vectors if that parameter is directly editable (Pinecone only).
+
+### Text generation (LLM selection)
+_in progress_
diff --git a/server/utils/EmbeddingEngines/native/index.js b/server/utils/EmbeddingEngines/native/index.js
@@ -0,0 +1,80 @@
+const path = require("path");
+const fs = require("fs");
+const { toChunks } = require("../../helpers");
+
+/**
+ * Embedding engine that runs the Xenova/all-MiniLM-L6-v2 model in-process
+ * through the `@xenova/transformers` feature-extraction pipeline. Model files
+ * are cached in the `models` folder of the storage directory.
+ */
+class NativeEmbedder {
+  constructor() {
+    this.model = "Xenova/all-MiniLM-L6-v2";
+    // Use the configured storage directory when set; otherwise fall back to
+    // the storage/models folder resolved relative to this file.
+    this.cacheDir = process.env.STORAGE_DIR
+      ? path.resolve(process.env.STORAGE_DIR, `models`)
+      : path.resolve(__dirname, `../../../storage/models`);
+    this.modelPath = path.resolve(this.cacheDir, "Xenova", "all-MiniLM-L6-v2");
+
+    // Limit the number of chunks to send per loop to not overload compute.
+    this.embeddingChunkLimit = 16;
+
+    // Make directory when it does not exist in existing installations.
+    // `recursive` also creates missing parent folders, so a storage directory
+    // that has not been created yet does not make the constructor throw.
+    if (!fs.existsSync(this.cacheDir))
+      fs.mkdirSync(this.cacheDir, { recursive: true });
+  }
+
+  /**
+   * Load the feature-extraction pipeline for `this.model`, using
+   * `this.cacheDir` as the model cache. When the model folder is not present
+   * yet, a notice is printed and download progress is logged to the console.
+   * @returns {Promise<Function>} the pipeline callable used to embed text.
+   * @throws Rethrows any error raised while importing the library or loading
+   *   the model, after logging it.
+   */
+  async embedderClient() {
+    // Check the cache once so the notice and the progress callback agree.
+    const needsDownload = !fs.existsSync(this.modelPath);
+    if (needsDownload) {
+      console.log(
+        "\x1b[34m[INFO]\x1b[0m The native embedding model has never been run and will be downloaded right now. Subsequent runs will be faster. (~23MB)\n\n"
+      );
+    }
+
+    try {
+      // The library is ESM; a dynamic import() lets this CommonJS module
+      // load it.
+      const { pipeline } = await import("@xenova/transformers");
+      return await pipeline("feature-extraction", this.model, {
+        cache_dir: this.cacheDir,
+        ...(needsDownload
+          ? {
+              // Show download progress if we need to download any files
+              progress_callback: (data) => {
+                // Only events that carry a `progress` field are reported.
+                if (!Object.prototype.hasOwnProperty.call(data, "progress"))
+                  return;
+                // `~~` truncates the value to a whole percent.
+                const percent = ~~data.progress;
+                console.log(
+                  `\x1b[34m[Embedding - Downloading Model Files]\x1b[0m ${data.file} ${percent}%`
+                );
+              },
+            }
+          : {}),
+      });
+    } catch (error) {
+      console.error("Failed to load the native embedding model:", error);
+      throw error;
+    }
+  }
+
+  /**
+   * Embed a single piece of text.
+   * @param {string|string[]} textInput - text to embed; an array is passed
+   *   through unchanged and only its first result is returned.
+   * @returns {Promise<Array>} the first embedding produced, or an empty array
+   *   when nothing was embedded.
+   */
+  async embedTextInput(textInput) {
+    // embedChunks works on an array of chunks. A bare string must be wrapped,
+    // otherwise the string itself is handed to the batching helper instead of
+    // being embedded as one text.
+    const result = await this.embedChunks(
+      Array.isArray(textInput) ? textInput : [textInput]
+    );
+    return result?.[0] || [];
+  }
+
+  /**
+   * Embed many text chunks, processing at most `this.embeddingChunkLimit`
+   * chunks per pipeline call.
+   * @param {string[]} textChunks - chunks of text to embed.
+   * @returns {Promise<Array|null>} the per-batch `tolist()` results flattened
+   *   one level, or null when no batch produced any output.
+   */
+  async embedChunks(textChunks = []) {
+    const Embedder = await this.embedderClient();
+    const embeddingResults = [];
+    for (const chunk of toChunks(textChunks, this.embeddingChunkLimit)) {
+      const output = await Embedder(chunk, {
+        pooling: "mean",
+        normalize: true,
+      });
+      if (output.length === 0) continue;
+      embeddingResults.push(output.tolist());
+    }
+
+    return embeddingResults.length > 0 ? embeddingResults.flat() : null;
+  }
+}
+
+// The embedder class is the only export of this module.
+module.exports = { NativeEmbedder };
diff --git a/server/utils/helpers/index.js b/server/utils/helpers/index.js
@@ -59,6 +59,9 @@ function getEmbeddingEngineSelection() {
     case "localai":
       const { LocalAiEmbedder } = require("../EmbeddingEngines/localAi");
       return new LocalAiEmbedder();
+    case "native":
+      const { NativeEmbedder } = require("../EmbeddingEngines/native");
+      return new NativeEmbedder();
     default:
       return null;
   }

diff --git a/server/utils/helpers/updateENV.js b/server/utils/helpers/updateENV.js
@@ -203,7 +203,7 @@ function validAnthropicModel(input = "") {
 }
 
 function supportedEmbeddingModel(input = "") {
-  const supported = ["openai", "azure", "localai"];
+  const supported = ["openai", "azure", "localai", "native"];
   return supported.includes(input)
     ? null
     : `Invalid Embedding model type. Must be one of ${supported.join(", ")}.`;

diff --git a/server/utils/prisma/index.js b/server/utils/prisma/index.js
@@ -5,10 +5,7 @@ const { PrismaClient } = require("@prisma/client");
 // npx prisma migrate dev --name init -> ensures that db is in sync with schema
 // npx prisma migrate reset -> resets the db
 
-const isProd = process.env.NODE_ENV === "production";
-const logLevels = isProd
-  ? ["error", "info", "warn"]
-  : ["query", "info", "warn", "error"];
+const logLevels = ["error", "info", "warn"]; // add "query" to debug query logs
 const prisma = new PrismaClient({
   log: logLevels,
 });