diff --git a/README.md b/README.md
index 77394f2..0733237 100644
--- a/README.md
+++ b/README.md
@@ -44,7 +44,7 @@ Features used: function composition (no agent), pdf loading, split-extract-rewri
 
 Splits a text into chunks and generates embeddings.
 
-Features used: direct function calls (no agent), split text, generate embeddings
+Features used: direct function calls (no agent), split text (gpt3-tokenizer), generate embeddings
 
 ## Features
 
@@ -80,7 +80,8 @@ Features used: direct function calls (no agent), split text, generate embeddings
   - Utility functions to combine and convert prompts
 - Text functions
   - Extract information (extract & rewrite; extract recursively)
-  - Split text into chunks
+  - Splitters: split text into chunks
+    - By character, by token (GPT3-tokenizer)
   - Helpers: load, generate
 - Data sources
   - Webpage as HTML text
@@ -150,8 +151,9 @@ export async function runWikipediaAgent({
     },
     execute: $.tool.executeExtractInformationFromWebpage({
       extract: $.text.extractRecursively.asExtractFunction({
-        split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
-          maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+        split: $.text.splitRecursivelyAtToken.asSplitFunction({
+          tokenizer: $.provider.openai.gptTokenizer(),
+          maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
         }),
         extract: $.text.generateText.asFunction({
           prompt: $.prompt.extractChatPrompt(),
diff --git a/docs/concepts/index.md b/docs/concepts/index.md
index 8989fb2..137ce21 100644
--- a/docs/concepts/index.md
+++ b/docs/concepts/index.md
@@ -13,9 +13,10 @@ You can use all almost all helper functions in JS Agent directly. This includes
 Here is an example of splitting a text into chunks and using the OpenAI embedding API directly to get the embedding of each chunk ([full example](https://github.com/lgrammel/js-agent/tree/main/examples/split-and-embed-text)):
 
 ```typescript
-const chunks = $.text.splitRecursivelyAtCharacter({
+const chunks = await $.text.splitRecursivelyAtToken({
   text,
-  maxCharactersPerChunk: 1024 * 4,
+  tokenizer: $.provider.openai.gptTokenizer(),
+  maxChunkSize: 128,
 });
 
 const embeddings = [];
@@ -44,7 +45,7 @@ Here is the example that creates a Twitter thread on a topic using the content o
 ```typescript
 const rewriteAsTwitterThread = $.text.splitExtractRewrite.asExtractFunction({
   split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
-    maxCharactersPerChunk: 1024 * 4,
+    maxChunkSize: 1024 * 4,
   }),
   extract: $.text.generateText.asFunction({
     model: gpt4,
diff --git a/docs/docs/tutorial-wikipedia-agent/complete-agent.md b/docs/docs/tutorial-wikipedia-agent/complete-agent.md
index b621d60..a18ebaa 100644
--- a/docs/docs/tutorial-wikipedia-agent/complete-agent.md
+++ b/docs/docs/tutorial-wikipedia-agent/complete-agent.md
@@ -60,8 +60,9 @@ async function runWikipediaAgent({
     },
     execute: $.tool.executeExtractInformationFromWebpage({
       extract: $.text.extractRecursively.asExtractFunction({
-        split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
-          maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+        split: $.text.splitRecursivelyAtToken.asSplitFunction({
+          tokenizer: $.provider.openai.gptTokenizer(),
+          maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
         }),
         extract: $.text.generateText.asFunction({
           prompt: $.prompt.extractChatPrompt(),
diff --git a/docs/docs/tutorial-wikipedia-agent/create-read-article-tool.md b/docs/docs/tutorial-wikipedia-agent/create-read-article-tool.md
index d79e34b..12c0e0c 100644
--- a/docs/docs/tutorial-wikipedia-agent/create-read-article-tool.md
+++ b/docs/docs/tutorial-wikipedia-agent/create-read-article-tool.md
@@ -18,8 +18,9 @@ const readWikipediaArticleAction = $.tool.extractInformationFromWebpage({
   },
   execute: $.tool.executeExtractInformationFromWebpage({
     extract: $.text.extractRecursively.asExtractFunction({
-      split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
-        maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+      split: $.text.splitRecursivelyAtToken.asSplitFunction({
+        tokenizer: $.provider.openai.gptTokenizer(),
+        maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
       }),
       extract: $.text.generateText.asFunction({
         prompt: $.prompt.extractChatPrompt(),
diff --git a/examples/pdf-to-twitter-thread/src/createTwitterThreadFromPdf.ts b/examples/pdf-to-twitter-thread/src/createTwitterThreadFromPdf.ts
index d31f502..400f5e7 100644
--- a/examples/pdf-to-twitter-thread/src/createTwitterThreadFromPdf.ts
+++ b/examples/pdf-to-twitter-thread/src/createTwitterThreadFromPdf.ts
@@ -18,9 +18,10 @@ export async function createTwitterThreadFromPdf({
 
   const rewriteAsTwitterThread = $.text.splitExtractRewrite.asExtractFunction({
     split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
-      maxCharactersPerChunk: 1024 * 4,
+      maxChunkSize: 1024 * 4,
     }),
     extract: $.text.generateText.asFunction({
+      id: "extract",
       model: gpt4,
       prompt: $.prompt.extractAndExcludeChatPrompt({
         excludeKeyword: "IRRELEVANT",
@@ -28,6 +29,7 @@ export async function createTwitterThreadFromPdf({
     }),
     include: (text) => text !== "IRRELEVANT",
     rewrite: $.text.generateText.asFunction({
+      id: "rewrite",
       model: gpt4,
       prompt: async ({ text, topic }) => [
         {
diff --git a/examples/pdf-to-twitter-thread/src/main.ts b/examples/pdf-to-twitter-thread/src/main.ts
index 53aca1f..cd455ea 100644
--- a/examples/pdf-to-twitter-thread/src/main.ts
+++ b/examples/pdf-to-twitter-thread/src/main.ts
@@ -26,7 +26,7 @@ createTwitterThreadFromPdf({
   openAiApiKey,
   context: {
     recordCall: (call) => {
-      console.log(`...${call.metadata.id ?? "unknown"}...`);
+      console.log(`${call.metadata.id ?? "unknown"}...`);
     },
   },
 })
diff --git a/examples/split-and-embed-text/src/main.ts b/examples/split-and-embed-text/src/main.ts
index a741657..f9e3aac 100644
--- a/examples/split-and-embed-text/src/main.ts
+++ b/examples/split-and-embed-text/src/main.ts
@@ -1,6 +1,6 @@
 import { Command } from "commander";
 import dotenv from "dotenv";
-import { splitAndEmbed } from "./splitAndEmbed";
+import { splitAndEmbedText } from "./splitAndEmbedText";
 
 dotenv.config();
 
@@ -19,7 +19,7 @@ if (!openAiApiKey) {
   throw new Error("OPENAI_API_KEY is not set");
 }
 
-splitAndEmbed({
+splitAndEmbedText({
   textFilePath: file,
   openAiApiKey,
 })
diff --git a/examples/split-and-embed-text/src/splitAndEmbed.ts b/examples/split-and-embed-text/src/splitAndEmbedText.ts
similarity index 77%
rename from examples/split-and-embed-text/src/splitAndEmbed.ts
rename to examples/split-and-embed-text/src/splitAndEmbedText.ts
index 50c6ef5..c4f63eb 100644
--- a/examples/split-and-embed-text/src/splitAndEmbed.ts
+++ b/examples/split-and-embed-text/src/splitAndEmbedText.ts
@@ -1,7 +1,7 @@
 import * as $ from "js-agent";
 import fs from "node:fs/promises";
 
-export async function splitAndEmbed({
+export async function splitAndEmbedText({
   textFilePath,
   openAiApiKey,
 }: {
@@ -10,9 +10,10 @@ export async function splitAndEmbed({
 }) {
   const text = await fs.readFile(textFilePath, "utf8");
 
-  const chunks = $.text.splitRecursivelyAtCharacter({
+  const chunks = await $.text.splitRecursivelyAtToken({
     text,
-    maxCharactersPerChunk: 1024 * 4,
+    tokenizer: $.provider.openai.gptTokenizer(),
+    maxChunkSize: 128,
   });
 
   const embeddings = [];
diff --git a/examples/wikipedia/src/runWikipediaAgent.ts b/examples/wikipedia/src/runWikipediaAgent.ts
index c8ecd0b..0efe444 100644
--- a/examples/wikipedia/src/runWikipediaAgent.ts
+++ b/examples/wikipedia/src/runWikipediaAgent.ts
@@ -36,8 +36,9 @@ export async function runWikipediaAgent({
     },
     execute: $.tool.executeExtractInformationFromWebpage({
       extract: $.text.extractRecursively.asExtractFunction({
-        split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
-          maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+        split: $.text.splitRecursivelyAtToken.asSplitFunction({
+          tokenizer: $.provider.openai.gptTokenizer(),
+          maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
         }),
         extract: $.text.generateText.asFunction({
           prompt: $.prompt.extractChatPrompt(),
diff --git a/packages/agent/README.md b/packages/agent/README.md
index b1b49c4..2b58f4a 100644
--- a/packages/agent/README.md
+++ b/packages/agent/README.md
@@ -42,7 +42,7 @@ Features used: function composition (no agent), pdf loading, split-extract-rewri
 
 Splits a text into chunks and generates embeddings.
 
-Features used: direct function calls (no agent), split text, generate embeddings
+Features used: direct function calls (no agent), split text (gpt3-tokenizer), generate embeddings
 
 ## Features
 
@@ -78,7 +78,8 @@ Features used: direct function calls (no agent), split text, generate embeddings
   - Utility functions to combine and convert prompts
 - Text functions
   - Extract information (extract & rewrite; extract recursively)
-  - Split text into chunks
+  - Splitters: split text into chunks
+    - By character, by token (GPT3-tokenizer)
   - Helpers: load, generate
 - Data sources
   - Webpage as HTML text
@@ -148,8 +149,9 @@ export async function runWikipediaAgent({
     },
     execute: $.tool.executeExtractInformationFromWebpage({
       extract: $.text.extractRecursively.asExtractFunction({
-        split: $.text.splitRecursivelyAtCharacter.asSplitFunction({
-          maxCharactersPerChunk: 2048 * 4, // needs to fit into a gpt-3.5-turbo prompt
+        split: $.text.splitRecursivelyAtToken.asSplitFunction({
+          tokenizer: $.provider.openai.gptTokenizer(),
+          maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
         }),
         extract: $.text.generateText.asFunction({
           prompt: $.prompt.extractChatPrompt(),
diff --git a/packages/agent/package.json b/packages/agent/package.json
index 204807e..768eeaa 100644
--- a/packages/agent/package.json
+++ b/packages/agent/package.json
@@ -42,6 +42,7 @@
     "fastify": "4.14.1",
     "fastify-type-provider-zod": "1.1.9",
     "html-to-text": "9.0.5",
+    "gpt3-tokenizer": "1.1.5",
     "hyperid": "3.1.1",
     "pdfjs-dist": "3.5.141",
     "pino": "8.11.0",
diff --git a/packages/agent/src/provider/openai/GPTTokenizer.ts b/packages/agent/src/provider/openai/GPTTokenizer.ts
new file mode 100644
index 0000000..df5918d
--- /dev/null
+++ b/packages/agent/src/provider/openai/GPTTokenizer.ts
@@ -0,0 +1,21 @@
+import GPT3Tokenizer from "gpt3-tokenizer";
+import { Tokenizer } from "../../tokenizer/Tokenizer";
+
+export const gptTokenizer = ({
+  type = "gpt3",
+}: {
+  type?: "gpt3" | "codex";
+} = {}): Tokenizer => {
+  const gptTokenizer = new GPT3Tokenizer({ type });
+
+  return Object.freeze({
+    encode: async (text: string) => {
+      const encodeResult = gptTokenizer.encode(text);
+      return {
+        tokens: encodeResult.bpe,
+        texts: encodeResult.text,
+      };
+    },
+    decode: async (tokens: Array<number>) => gptTokenizer.decode(tokens),
+  });
+};
diff --git a/packages/agent/src/provider/openai/index.ts b/packages/agent/src/provider/openai/index.ts
index c6bdfb6..e45a032 100644
--- a/packages/agent/src/provider/openai/index.ts
+++ b/packages/agent/src/provider/openai/index.ts
@@ -1,3 +1,4 @@
+export * from "./GPTTokenizer.js";
 export * from "./OpenAIChatCompletion.js";
 export * from "./OpenAIEmbedding.js";
 export * from "./OpenAITextCompletion.js";
diff --git a/packages/agent/src/text/split/index.ts b/packages/agent/src/text/split/index.ts
index 0d007d4..30f9c16 100644
--- a/packages/agent/src/text/split/index.ts
+++ b/packages/agent/src/text/split/index.ts
@@ -1,2 +1,2 @@
-export * from "./splitRecursivelyAtCharacter";
+export * from "./splitRecursively";
 export * from "./SplitFunction";
diff --git a/packages/agent/src/text/split/splitRecursively.ts b/packages/agent/src/text/split/splitRecursively.ts
new file mode 100644
index 0000000..791d1a2
--- /dev/null
+++ b/packages/agent/src/text/split/splitRecursively.ts
@@ -0,0 +1,75 @@
+import { Tokenizer } from "../../tokenizer/Tokenizer";
+import { SplitFunction } from "./SplitFunction";
+
+function splitRecursivelyImplementation({
+  maxChunkSize,
+  segments,
+}: {
+  maxChunkSize: number;
+  segments: string | Array<string>;
+}): Array<string> {
+  if (segments.length < maxChunkSize) {
+    return Array.isArray(segments) ? [segments.join("")] : [segments];
+  }
+
+  const half = Math.ceil(segments.length / 2);
+  const left = segments.slice(0, half);
+  const right = segments.slice(half);
+
+  return [
+    ...splitRecursivelyImplementation({
+      segments: left,
+      maxChunkSize,
+    }),
+    ...splitRecursivelyImplementation({
+      segments: right,
+      maxChunkSize,
+    }),
+  ];
+}
+
+export const splitRecursivelyAtCharacter = async ({
+  maxChunkSize,
+  text,
+}: {
+  maxChunkSize: number;
+  text: string;
+}) =>
+  splitRecursivelyImplementation({
+    maxChunkSize,
+    segments: text,
+  });
+
+splitRecursivelyAtCharacter.asSplitFunction =
+  ({ maxChunkSize }: { maxChunkSize: number }): SplitFunction =>
+  async ({ text }: { text: string }) =>
+    splitRecursivelyAtCharacter({ maxChunkSize, text });
+
+export const splitRecursivelyAtToken = async ({
+  tokenizer,
+  maxChunkSize,
+  text,
+}: {
+  tokenizer: Tokenizer;
+  maxChunkSize: number;
+  text: string;
+}) =>
+  splitRecursivelyImplementation({
+    maxChunkSize,
+    segments: (await tokenizer.encode(text)).texts,
+  });
+
+splitRecursivelyAtToken.asSplitFunction =
+  ({
+    tokenizer,
+    maxChunkSize,
+  }: {
+    tokenizer: Tokenizer;
+    maxChunkSize: number;
+  }): SplitFunction =>
+  async ({ text }: { text: string }) =>
+    splitRecursivelyAtToken({
+      tokenizer,
+      maxChunkSize,
+      text,
+    });
diff --git a/packages/agent/src/text/split/splitRecursivelyAtCharacter.ts b/packages/agent/src/text/split/splitRecursivelyAtCharacter.ts
deleted file mode 100644
index 89f1f73..0000000
--- a/packages/agent/src/text/split/splitRecursivelyAtCharacter.ts
+++ /dev/null
@@ -1,31 +0,0 @@
-import { SplitFunction } from "./SplitFunction";
-
-export function splitRecursivelyAtCharacter({
-  maxCharactersPerChunk,
-  text,
-}: {
-  maxCharactersPerChunk: number;
-  text: string;
-}): Array<string> {
-  if (text.length < maxCharactersPerChunk) {
-    return [text];
-  }
-
-  const half = Math.ceil(text.length / 2);
-  const left = text.substring(0, half);
-  const right = text.substring(half);
-
-  return [
-    ...splitRecursivelyAtCharacter({ text: left, maxCharactersPerChunk }),
-    ...splitRecursivelyAtCharacter({ text: right, maxCharactersPerChunk }),
-  ];
-}
-
-splitRecursivelyAtCharacter.asSplitFunction =
-  ({
-    maxCharactersPerChunk,
-  }: {
-    maxCharactersPerChunk: number;
-  }): SplitFunction =>
-  async ({ text }: { text: string }) =>
-    splitRecursivelyAtCharacter({ maxCharactersPerChunk, text });
diff --git a/packages/agent/src/tokenizer/Tokenizer.ts b/packages/agent/src/tokenizer/Tokenizer.ts
new file mode 100644
index 0000000..b5b5ffc
--- /dev/null
+++ b/packages/agent/src/tokenizer/Tokenizer.ts
@@ -0,0 +1,7 @@
+export type Tokenizer = {
+  encode: (text: string) => PromiseLike<{
+    tokens: Array<number>;
+    texts: Array<string>;
+  }>;
+  decode: (tokens: Array<number>) => PromiseLike<string>;
+};
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
index cee6020..e8ec1ef 100644
--- a/pnpm-lock.yaml
+++ b/pnpm-lock.yaml
@@ -63,7 +63,7 @@ importers:
         specifier: '*'
         version: link:../../packages/agent
 
-  examples/split-and-embed:
+  examples/split-and-embed-text:
     dependencies:
       commander:
         specifier: 10.0.1
@@ -110,6 +110,9 @@ importers:
       fastify-type-provider-zod:
        specifier: 1.1.9
        version: 1.1.9(fastify@4.14.1)(zod@3.21.4)
+      gpt3-tokenizer:
+        specifier: 1.1.5
+        version: 1.1.5
       html-to-text:
         specifier: 9.0.5
         version: 9.0.5
@@ -1439,6 +1442,10 @@ packages:
     resolution: {integrity: sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==}
     dev: true
 
+  /array-keyed-map@2.1.3:
+    resolution: {integrity: sha512-JIUwuFakO+jHjxyp4YgSiKXSZeC0U+R1jR94bXWBcVlFRBycqXlb+kH9JHxBGcxnVuSqx5bnn0Qz9xtSeKOjiA==}
+    dev: false
+
   /asynckit@0.4.0:
     resolution: {integrity: sha512-Oei9OH4tRh0YqU3GxhX79dM/mwVgvbZJaSNaRk+bshkj0S5cfHcgYakreBjrHwatXKbz+IoIdYLxrKim2MjW0Q==}
 
@@ -2300,6 +2307,13 @@ packages:
     engines: {node: '>=4'}
     dev: true
 
+  /gpt3-tokenizer@1.1.5:
+    resolution: {integrity: sha512-O9iCL8MqGR0Oe9wTh0YftzIbysypNQmS5a5JG3cB3M4LMYjlAVvNnf8LUzVY9MrI7tj+YLY356uHtO2lLX2HpA==}
+    engines: {node: '>=12'}
+    dependencies:
+      array-keyed-map: 2.1.3
+    dev: false
+
   /graceful-fs@4.2.11:
     resolution: {integrity: sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==}
    dev: true
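
Below the patch, a minimal usage sketch (not part of the diff itself) of the tokenizer and token-based splitter it introduces, following the updated split-and-embed-text example; `splitDemo` and `demoText` are placeholder names.

```typescript
import * as $ from "js-agent";

// Sketch: tokenize a text with the GPT tokenizer added in this patch,
// then split it into chunks of fewer than 128 tokens each.
async function splitDemo(demoText: string) {
  const tokenizer = $.provider.openai.gptTokenizer();

  // encode() resolves to { tokens, texts } as declared in Tokenizer.ts.
  const { tokens } = await tokenizer.encode(demoText);
  console.log(`token count: ${tokens.length}`);
  console.log(`round trip: ${await tokenizer.decode(tokens)}`);

  // splitRecursivelyAtToken halves the token sequence recursively until
  // each piece is under maxChunkSize tokens, then re-joins the token texts.
  const chunks = await $.text.splitRecursivelyAtToken({
    text: demoText,
    tokenizer,
    maxChunkSize: 128,
  });
  console.log(`chunks: ${chunks.length}`);
}
```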
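A second sketch for the split-function form used in the Wikipedia agent hunks: the old `maxCharactersPerChunk: 2048 * 4` sized chunks in characters (presumably assuming roughly four characters per token), whereas `maxChunkSize: 2048` counts tokens directly, so the chunk budget maps one-to-one onto the gpt-3.5-turbo prompt limit mentioned in the comments. The call site shown is hypothetical; `articleText` is a placeholder.

```typescript
import * as $ from "js-agent";

// Sketch: the token-based splitter wrapped as a reusable SplitFunction,
// the same wrapping used inside extractRecursively in the hunks above.
const split = $.text.splitRecursivelyAtToken.asSplitFunction({
  tokenizer: $.provider.openai.gptTokenizer(),
  maxChunkSize: 2048, // needs to fit into a gpt-3.5-turbo prompt
});

// Hypothetical call site: resolves to chunks of fewer than 2048 tokens each.
// const chunks = await split({ text: articleText });
```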