Add split by token.
lgrammel committed Apr 28, 2023
1 parent 48a3b79 commit cb9d151
Showing 18 changed files with 156 additions and 57 deletions.
10 changes: 6 additions & 4 deletions README.md
@@ -44,7 +44,7 @@ Features used: function composition (no agent), pdf loading, split-extract-rewri

Splits a text into chunks and generates embeddings.

- Features used: direct function calls (no agent), split text, generate embeddings
+ Features used: direct function calls (no agent), split text (gpt3-tokenizer), generate embeddings

## Features

@@ -80,7 +80,8 @@ Features used: direct function calls (no agent), split text, generate embeddings
- Utility functions to combine and convert prompts
- Text functions
- Extract information (extract & rewrite; extract recursively)
- - Split text into chunks
+ - Splitters: split text into chunks
+   - By character, by token (GPT3-tokenizer)
- Helpers: load, generate
- Data sources
- Webpage as HTML text
@@ -150,8 +151,9 @@ export async function runWikipediaAgent({
},
execute: $.tool.executeExtractInformationFromWebpage({
extract: $.text.extractRecursively.asExtractFunction({
- split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
- maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+ split: $.text.splitRecursivelyAtToken.asSplitFunction({
+ tokenizer: $.provider.openai.gptTokenizer(),
+ maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
}),
extract: $.text.generateText.asFunction({
prompt: $.prompt.extractChatPrompt(),
7 changes: 4 additions & 3 deletions docs/concepts/index.md
@@ -13,9 +13,10 @@ You can use all almost all helper functions in JS Agent directly. This includes
Here is an example of splitting a text into chunks and using the OpenAI embedding API directly to get the embedding of each chunk ([full example](https://github.com/lgrammel/js-agent/tree/main/examples/split-and-embed-text)):

```typescript
- const chunks = $.text.splitRecursivelyAtCharacter({
+ const chunks = await $.text.splitRecursivelyAtToken({
text,
- maxCharactersPerChunk: 1024 * 4,
+ tokenizer: $.provider.openai.gptTokenizer(),
+ maxChunkSize: 128,
});

const embeddings = [];
@@ -44,7 +45,7 @@ Here is the example that creates a Twitter thread on a topic using the content o
```typescript
const rewriteAsTwitterThread = $.text.splitExtractRewrite.asExtractFunction({
split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
- maxCharactersPerChunk: 1024 * 4,
+ maxChunkSize: 1024 * 4,
}),
extract: $.text.generateText.asFunction({
model: gpt4,
5 changes: 3 additions & 2 deletions docs/docs/tutorial-wikipedia-agent/complete-agent.md
@@ -60,8 +60,9 @@ async function runWikipediaAgent({
},
execute: $.tool.executeExtractInformationFromWebpage({
extract: $.text.extractRecursively.asExtractFunction({
- split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
- maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+ split: $.text.splitRecursivelyAtToken.asSplitFunction({
+ tokenizer: $.provider.openai.gptTokenizer(),
+ maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
}),
extract: $.text.generateText.asFunction({
prompt: $.prompt.extractChatPrompt(),
@@ -18,8 +18,9 @@ const readWikipediaArticleAction = $.tool.extractInformationFromWebpage({
},
execute: $.tool.executeExtractInformationFromWebpage({
extract: $.text.extractRecursively.asExtractFunction({
- split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
- maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+ split: $.text.splitRecursivelyAtToken.asSplitFunction({
+ tokenizer: $.provider.openai.gptTokenizer(),
+ maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
}),
extract: $.text.generateText.asFunction({
prompt: $.prompt.extractChatPrompt(),
@@ -18,16 +18,18 @@ export async function createTwitterThreadFromPdf({

const rewriteAsTwitterThread = $.text.splitExtractRewrite.asExtractFunction({
split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
- maxCharactersPerChunk: 1024 * 4,
+ maxChunkSize: 1024 * 4,
}),
extract: $.text.generateText.asFunction({
+ id: "extract",
model: gpt4,
prompt: $.prompt.extractAndExcludeChatPrompt({
excludeKeyword: "IRRELEVANT",
}),
}),
include: (text) => text !== "IRRELEVANT",
rewrite: $.text.generateText.asFunction({
+ id: "rewrite",
model: gpt4,
prompt: async ({ text, topic }) => [
{
2 changes: 1 addition & 1 deletion examples/pdf-to-twitter-thread/src/main.ts
@@ -26,7 +26,7 @@ createTwitterThreadFromPdf({
openAiApiKey,
context: {
recordCall: (call) => {
- console.log(`...${call.metadata.id ?? "unknown"}...`);
+ console.log(`${call.metadata.id ?? "unknown"}...`);
},
},
})
4 changes: 2 additions & 2 deletions examples/split-and-embed-text/src/main.ts
@@ -1,6 +1,6 @@
import { Command } from "commander";
import dotenv from "dotenv";
- import { splitAndEmbed } from "./splitAndEmbed";
+ import { splitAndEmbedText } from "./splitAndEmbedText";

dotenv.config();

@@ -19,7 +19,7 @@ if (!openAiApiKey) {
throw new Error("OPENAI_API_KEY is not set");
}

- splitAndEmbed({
+ splitAndEmbedText({
textFilePath: file,
openAiApiKey,
})
@@ -1,7 +1,7 @@
import * as $ from "js-agent";
import fs from "node:fs/promises";

- export async function splitAndEmbed({
+ export async function splitAndEmbedText({
textFilePath,
openAiApiKey,
}: {
@@ -10,9 +10,10 @@ export async function splitAndEmbed({
}) {
const text = await fs.readFile(textFilePath, "utf8");

- const chunks = $.text.splitRecursivelyAtCharacter({
+ const chunks = await $.text.splitRecursivelyAtToken({
text,
- maxCharactersPerChunk: 1024 * 4,
+ tokenizer: $.provider.openai.gptTokenizer(),
+ maxChunkSize: 128,
});

const embeddings = [];
5 changes: 3 additions & 2 deletions examples/wikipedia/src/runWikipediaAgent.ts
@@ -36,8 +36,9 @@ export async function runWikipediaAgent({
},
execute: $.tool.executeExtractInformationFromWebpage({
extract: $.text.extractRecursively.asExtractFunction({
- split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
- maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+ split: $.text.splitRecursivelyAtToken.asSplitFunction({
+ tokenizer: $.provider.openai.gptTokenizer(),
+ maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
}),
extract: $.text.generateText.asFunction({
prompt: $.prompt.extractChatPrompt(),
10 changes: 6 additions & 4 deletions packages/agent/README.md
@@ -42,7 +42,7 @@ Features used: function composition (no agent), pdf loading, split-extract-rewri

Splits a text into chunks and generates embeddings.

- Features used: direct function calls (no agent), split text, generate embeddings
+ Features used: direct function calls (no agent), split text (gpt3-tokenizer), generate embeddings

## Features

@@ -78,7 +78,8 @@ Features used: direct function calls (no agent), split text, generate embeddings
- Utility functions to combine and convert prompts
- Text functions
- Extract information (extract & rewrite; extract recursively)
- - Split text into chunks
+ - Splitters: split text into chunks
+   - By character, by token (GPT3-tokenizer)
- Helpers: load, generate
- Data sources
- Webpage as HTML text
@@ -148,8 +149,9 @@ export async function runWikipediaAgent({
},
execute: $.tool.executeExtractInformationFromWebpage({
extract: $.text.extractRecursively.asExtractFunction({
- split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
- maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+ split: $.text.splitRecursivelyAtToken.asSplitFunction({
+ tokenizer: $.provider.openai.gptTokenizer(),
+ maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
}),
extract: $.text.generateText.asFunction({
prompt: $.prompt.extractChatPrompt(),
1 change: 1 addition & 0 deletions packages/agent/package.json
@@ -42,6 +42,7 @@
"fastify": "4.14.1",
"fastify-type-provider-zod": "1.1.9",
"html-to-text": "9.0.5",
+ "gpt3-tokenizer": "1.1.5",
"hyperid": "3.1.1",
"pdfjs-dist": "3.5.141",
"pino": "8.11.0",
21 changes: 21 additions & 0 deletions packages/agent/src/provider/openai/GPTTokenizer.ts
@@ -0,0 +1,21 @@
import GPT3Tokenizer from "gpt3-tokenizer";
import { Tokenizer } from "../../tokenizer/Tokenizer";

export const gptTokenizer = ({
type = "gpt3",
}: {
type?: "gpt3" | "codex";
} = {}): Tokenizer => {
const gptTokenizer = new GPT3Tokenizer({ type });

return Object.freeze({
encode: async (text: string) => {
const encodeResult = gptTokenizer.encode(text);
return {
tokens: encodeResult.bpe,
texts: encodeResult.text,
};
},
decode: async (tokens: Array<number>) => gptTokenizer.decode(tokens),
});
};
1 change: 1 addition & 0 deletions packages/agent/src/provider/openai/index.ts
@@ -1,3 +1,4 @@
+ export * from "./GPTTokenizer.js";
export * from "./OpenAIChatCompletion.js";
export * from "./OpenAIEmbedding.js";
export * from "./OpenAITextCompletion.js";
2 changes: 1 addition & 1 deletion packages/agent/src/text/split/index.ts
@@ -1,2 +1,2 @@
- export * from "./splitRecursivelyAtCharacter";
+ export * from "./splitRecursively";
export * from "./SplitFunction";
75 changes: 75 additions & 0 deletions packages/agent/src/text/split/splitRecursively.ts
@@ -0,0 +1,75 @@
import { Tokenizer } from "../../tokenizer/Tokenizer";
import { SplitFunction } from "./SplitFunction";

function splitRecursivelyImplementation({
maxChunkSize,
segments,
}: {
maxChunkSize: number;
segments: string | Array<string>;
}): Array<string> {
if (segments.length < maxChunkSize) {
return Array.isArray(segments) ? [segments.join("")] : [segments];
}

const half = Math.ceil(segments.length / 2);
const left = segments.slice(0, half);
const right = segments.slice(half);

return [
...splitRecursivelyImplementation({
segments: left,
maxChunkSize,
}),
...splitRecursivelyImplementation({
segments: right,
maxChunkSize,
}),
];
}

export const splitRecursivelyAtCharacter = async ({
maxChunkSize,
text,
}: {
maxChunkSize: number;
text: string;
}) =>
splitRecursivelyImplementation({
maxChunkSize,
segments: text,
});

splitRecursivelyAtCharacter.asSplitFunction =
({ maxChunkSize }: { maxChunkSize: number }): SplitFunction =>
async ({ text }: { text: string }) =>
splitRecursivelyAtCharacter({ maxChunkSize, text });

export const splitRecursivelyAtToken = async ({
tokenizer,
maxChunkSize,
text,
}: {
tokenizer: Tokenizer;
maxChunkSize: number;
text: string;
}) =>
splitRecursivelyImplementation({
maxChunkSize,
segments: (await tokenizer.encode(text)).texts,
});

splitRecursivelyAtToken.asSplitFunction =
({
tokenizer,
maxChunkSize,
}: {
tokenizer: Tokenizer;
maxChunkSize: number;
}): SplitFunction =>
async ({ text }: { text: string }) =>
splitRecursivelyAtToken({
tokenizer,
maxChunkSize,
text,
});
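The halving strategy in the new `splitRecursively.ts` is easy to see in isolation. Below is a minimal, self-contained sketch of the same recursion (the `splitHalving` name is hypothetical and not part of this commit): when the segment collection is shorter than `maxChunkSize` it becomes a single chunk; otherwise it is halved and each half is split recursively.

```typescript
// Minimal sketch of the recursive-halving split (hypothetical name; the
// commit's splitRecursivelyImplementation follows the same shape).
function splitHalving(
  segments: string | string[],
  maxChunkSize: number
): string[] {
  // Small enough: emit the whole segment run as one chunk.
  if (segments.length < maxChunkSize) {
    return Array.isArray(segments) ? [segments.join("")] : [segments];
  }
  // Too large: halve and recurse on each half.
  const half = Math.ceil(segments.length / 2);
  return [
    ...splitHalving(segments.slice(0, half), maxChunkSize),
    ...splitHalving(segments.slice(half), maxChunkSize),
  ];
}

// "abcdefghij" (10 chars) with maxChunkSize 4 halves twice:
// → ["abc", "de", "fgh", "ij"]
const chunks = splitHalving("abcdefghij", 4);
```

The character variant counts characters while the token variant counts tokenizer segments; both guarantee every chunk stays under `maxChunkSize`, though chunks may come out smaller than a greedy packer would produce.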
31 changes: 0 additions & 31 deletions packages/agent/src/text/split/splitRecursivelyAtCharacter.ts

This file was deleted.

7 changes: 7 additions & 0 deletions packages/agent/src/tokenizer/Tokenizer.ts
@@ -0,0 +1,7 @@
export type Tokenizer = {
encode: (text: string) => PromiseLike<{
tokens: Array<number>;
texts: Array<string>;
}>;
decode: (tokens: Array<number>) => PromiseLike<string>;
};
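Any object with async `encode`/`decode` satisfies this contract, not only the gpt3-tokenizer wrapper added in this commit. As an illustration only (the `makeWordTokenizer` below is hypothetical and not in the commit), here is a toy word-level tokenizer that round-trips text through the interface:

```typescript
type Tokenizer = {
  encode: (text: string) => PromiseLike<{
    tokens: Array<number>;
    texts: Array<string>;
  }>;
  decode: (tokens: Array<number>) => PromiseLike<string>;
};

// Toy word-level tokenizer (illustration only): each distinct word or
// whitespace run gets a numeric id; decode maps ids back and rejoins.
const makeWordTokenizer = (): Tokenizer => {
  const idToText: string[] = [];
  const textToId = new Map<string, number>();
  return Object.freeze({
    encode: async (text: string) => {
      // Split on whitespace but keep the separators so decoding is lossless.
      const texts = text.split(/(\s+)/).filter((s) => s.length > 0);
      const tokens = texts.map((t) => {
        if (!textToId.has(t)) textToId.set(t, idToText.push(t) - 1);
        return textToId.get(t)!;
      });
      return { tokens, texts };
    },
    decode: async (tokens: Array<number>) =>
      tokens.map((id) => idToText[id]).join(""),
  });
};
```

Because `texts` concatenates back to the original input, a token-based splitter can join token texts into chunk strings without loss; the gpt3-tokenizer wrapper relies on the same property of its BPE segments.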
16 changes: 15 additions & 1 deletion pnpm-lock.yaml

