From f7f96dcb4827ca1276a96e2a024ff9e9a416a4da Mon Sep 17 00:00:00 2001 From: Julien Vignoud Date: Tue, 12 Nov 2024 16:45:45 +0100 Subject: [PATCH] *: replace line by line text loaders by chunk by chunk text loaders. Loaders now yield token sequences of length blockSize --- cli/src/benchmark_gpt.ts | 12 +- cli/src/train_gpt.ts | 47 ++++--- discojs-node/src/loaders.spec.ts | 80 +++++++++++- discojs-node/src/loaders/text.ts | 116 ++++++++++++++++-- discojs-web/src/loaders.spec.ts | 89 ++++++++++++-- discojs-web/src/loaders/text.ts | 81 +++++++++--- discojs/src/dataset/types.ts | 2 +- discojs/src/default_tasks/wikitext.ts | 4 +- discojs/src/models/gpt/gpt.spec.ts | 14 +-- discojs/src/models/gpt/model.ts | 8 +- discojs/src/processing/index.ts | 36 ++---- discojs/src/processing/text.spec.ts | 72 ++++++++--- discojs/src/processing/text.ts | 75 ++++++----- docs/examples/wikitext.ts | 20 ++- server/tests/e2e/federated.spec.ts | 12 +- webapp/cypress/e2e/testing.cy.ts | 17 ++- webapp/cypress/support/e2e.ts | 2 + .../dataset_input/FileSelection.vue | 24 ++-- .../dataset_input/LabeledDatasetInput.vue | 1 + .../dataset_input/TextDatasetInput.vue | 27 ++-- .../dataset_input/UnlabeledDatasetInput.vue | 2 +- 21 files changed, 547 insertions(+), 194 deletions(-) diff --git a/cli/src/benchmark_gpt.ts b/cli/src/benchmark_gpt.ts index 9e21a44d4..0a1c5aa96 100644 --- a/cli/src/benchmark_gpt.ts +++ b/cli/src/benchmark_gpt.ts @@ -76,11 +76,12 @@ async function main(args: Required): Promise { // to make sure the dataset is batched and tokenized correctly task.trainingInformation.batchSize = batchSize task.trainingInformation.maxSequenceLength = contextLength - const dataset = loadText('../datasets/wikitext/wiki.train.tokens') + const dataset = loadText( + '../datasets/wikitext/wiki.train.tokens', + tokenizer, config.blockSize, batchSize + ) - const maxLength = task.trainingInformation.maxSequenceLength ?? 
(tokenizer.model_max_length as number) + 1 const preprocessedDataset = dataset - .map((line) => processing.tokenizeAndLeftPad(line, tokenizer, maxLength)) .map((tokens) => [tokens.pop(), tokens.last()] as [List, number]) .batch(batchSize); @@ -111,10 +112,7 @@ async function main(args: Required): Promise { const iterations = 10 console.log("Generating", maxNewTokens, "new tokens") - let tokens = List( - (tokenizer(prompt, { return_tensor: false }) as { input_ids: number[] }) - .input_ids, - ); + let tokens = processing.tokenize(tokenizer, prompt); let inferenceTime = 0 for (let i = 0; i < iterations; i++) { diff --git a/cli/src/train_gpt.ts b/cli/src/train_gpt.ts index 60466bde0..75c151e6f 100644 --- a/cli/src/train_gpt.ts +++ b/cli/src/train_gpt.ts @@ -1,38 +1,49 @@ -import * as tf from "@tensorflow/tfjs-node" import { AutoTokenizer } from "@xenova/transformers"; import { models, processing } from "@epfml/discojs"; +import { loadText } from '@epfml/discojs-node' +import { List } from "immutable"; -async function main(): Promise { - const data = "Lorem ipsum dolor sit amet, consectetur adipis" - const datasetSource = new tf.data.FileDataSource(Buffer.from(data)) - const textDataset = new tf.data.TextLineDataset(datasetSource) +async function main(): Promise { + const config: models.GPTConfig = { modelType: 'gpt-nano', lr: 0.01, - maxIter: 50, + maxIter: 10, evaluateEvery:50, maxEvalBatches: 10, blockSize: 16, - vocabSize: 50257, debug: false } - + + const batchSize = 8 const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2') - const tokenDataset = textDataset.map((text: string) => { - const tokens = processing.tokenizeAndLeftPad(text, tokenizer, config.blockSize + 1) - const ys = tf.oneHot(tokens.slice(1), tokenizer.model.vocab.length) - const xs = tf.tensor(tokens.slice(0, config.blockSize), undefined, 'int32') - return {xs, ys} - }).repeat().batch(16) as tf.data.Dataset<{ xs: tf.Tensor2D, ys: tf.Tensor3D }> + const dataset = loadText( + '../datasets/wikitext/wiki.train.tokens', + tokenizer, config.blockSize, batchSize + ) + const tokenDataset = dataset + .map((tokens) => [tokens.pop(), tokens.last()] as [List, number]) + .batch(batchSize); const model = new models.GPT(config) - - for await (const logs of model.train(tokenDataset, undefined)) { - console.log(logs) + for (let i = 0; i < 6; i++) { + console.log(`Epoch ${i}`) + for await (const logs of model.train(tokenDataset, undefined)) { + console.log(logs) + } } - const generation = await model.generate("Lorem", tokenizer, { maxNewTokens: 10, doSample: false, topk: 5, temperature:0.1 }) + let tokens = processing.tokenize(tokenizer, "First"); + + const maxNewTokens = 10 + for (let n = 0; n < maxNewTokens; n++) { + const next: number = (await model.predict(List.of(tokens), + { doSample: false, topk: 5, temperature: 0.1 })) + .first(); + tokens = tokens.push(next) + } + const generation = tokenizer.decode(tokens.toArray(), { skip_special_tokens: true }) console.log(generation) } diff --git a/discojs-node/src/loaders.spec.ts b/discojs-node/src/loaders.spec.ts index c1f94d5a9..3df2679a7 100644 --- a/discojs-node/src/loaders.spec.ts +++ b/discojs-node/src/loaders.spec.ts @@ -2,6 +2,9 @@ import * as fs from "node:fs/promises"; import { withFile } from "tmp-promise"; import { describe, it } from "mocha"; import { expect } from "chai"; +import { Dataset, processing, Text } from "@epfml/discojs"; +import { AutoTokenizer } from "@xenova/transformers"; +import { List } from "immutable"; import { loadCSV, @@ -50,13 +53,84 @@ 
describe("image directory parser", () => { }); describe("text parser", () => { + it("parses basic file", async () => { + const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2') + const text = ["a", "b", "c"].join("\n") await withFile(async ({ path }) => { - await fs.writeFile(path, ["a", "b", "c"].join("\n")); + await fs.writeFile(path, text); + // set block size to 4 to get 1 sequence of 4 tokens + 1 label token + const parsed = loadText(path, tokenizer, 4, 1); + const expectedTokens = processing.tokenize(tokenizer, text) + // should return 2 sequences: one with 4 tokens + 1 label token + // and the other with some padding and the label token + const sequences = await arrayFromAsync(parsed) + expect(sequences.length).to.equal(2); + expect(sequences[0]).to.deep.equal(expectedTokens); + }); + }); + + async function checkEachSequence(parsed: Dataset, + expectedTokens: number[], blockSize: number) { + // ceiling because the remaining tokens in the last chunk are padded instead of dropped + // expect the number of sequences to be the total number of tokens divided by blockSize + expect(await parsed.size()).to.equal(Math.ceil(expectedTokens.length / blockSize)); + + let i = 0 + // exclude the last sequence because it has been padded + let sequences = List(await arrayFromAsync(parsed)) + // we expect the last sequence to have blockSize + 1 tokens via padding + expect(sequences.last()?.size).to.equal(blockSize + 1) + sequences = sequences.pop() + for await (const tokens of sequences) { + // each sequence has length blockSize + 1 (for the label) + expect(tokens.toArray()).to.deep.equal(expectedTokens.slice(i, i + blockSize + 1)); + // but the window should move by blockSize only + i += blockSize + } + } + + it("yields the correct block size", async () => { + const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2') + const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit." + const expectedTokens = processing.tokenize(tokenizer, text).toArray() + + await withFile(async ({ path }) => { + await fs.writeFile(path, text); + + // set block size to 4 to get 1 sequence of 4 tokens + 1 label token + // so we expect 5 tokens per read + const blockSize = 4 + const parsed = loadText(path, tokenizer, blockSize, 1); + await checkEachSequence(parsed, expectedTokens, blockSize) + }) + }); - const parsed = loadText(path); + it("reads multiple chunks", async () => { + const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2') + const text = [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Donec sed risus maximus, ultricies ex sed, dictum elit.", + "Curabitur faucibus egestas enim et auctor. Quisque vel dignissim turpis.", + "Curabitur justo tellus, elementum sit amet erat eget, auctor ornare nisi.", + "Nunc tortor odio, ultrices id leo vitae, euismod congue ex. Curabitur arcu leo,", + "sagittis quis felis nec, imperdiet aliquet tellus.Integer a mollis nulla.", + "Quisque pulvinar lectus eget nisi pharetra, non molestie magna ullamcorper.", + "Sed porttitor diam non blandit molestie.Duis tristique arcu ut efficitur efficitur.", + "Fusce et ullamcorper tortor.Pellentesque a accumsan lacus, nec mollis risus.", + "Nunc quis eros a orci ultricies cursus. Maecenas sodales ipsum a magna ", + "malesuada efficitur.Maecenas at sapien blandit, egestas nisi eu, mollis elit." 
+ ].join(" ") + + const expectedTokens = processing.tokenize(tokenizer, text).toArray() + await withFile(async ({ path }) => { + await fs.writeFile(path, text); - expect(await parsed.size()).to.equal(3); + // set block size to 4 to get 1 sequence of 4 tokens + 1 label token + // so we expect 5 tokens per read + const blockSize = 4 + const parsed = loadText(path, tokenizer, blockSize, 1, 1); // set the min chunk size allowed to 1 bit + await checkEachSequence(parsed, expectedTokens, blockSize) }); }); }); diff --git a/discojs-node/src/loaders/text.ts b/discojs-node/src/loaders/text.ts index c1ae840a2..8dfcddb8b 100644 --- a/discojs-node/src/loaders/text.ts +++ b/discojs-node/src/loaders/text.ts @@ -1,14 +1,114 @@ -import * as fs from "node:fs/promises"; -import * as readline from "node:readline/promises"; +import createDebug from "debug"; +import { createReadStream } from 'node:fs'; +import { PreTrainedTokenizer } from '@xenova/transformers'; +import { Dataset, Text, processing } from "@epfml/discojs"; -import { Dataset, Text } from "@epfml/discojs"; +const debug = createDebug("discojs-node:loaders:text"); -export function load(path: string): Dataset { +/** + * Returns a Dataset that streams and tokenizes text to yield tokenized sequences + * one at a time. + * The sequences returned are going to be split into input and label sequences of size `blockSize` + * The label sequences are the input sequences shifted by one token. + * Since the last token of the input sequence needs a label, + * we include one more token (`blockSize` + 1 total) in the sequences returned. + * * Thus, each sequence yielded has size `blockSize` + 1, where the last token + * is included only to be the label of the last input token: + * xs = tokens[0:blockSize] + * ys = tokens[1:blockSize+1] + * + * Because the `blockSize+1`nth token is only used as label and not as input, + * the next sequence will be shifted by `blockSize` (and not `blockSize + 1`) + * In other words, the dataset yields sequences of size `blockSize` + 1 + * with an overlap of 1 token between each sequence. + * + * @param path path to the text file to read + * @param tokenizer the tokenizer to use, should match the model that will be trained + * @param blockSize the context length, the maximum number of tokens of input sequences + * @param batchSize default to 1, the number of input sequences (of `blockSize` tokens) in each batch. + * The batch size is only used to configure the chunk size of the file stream such that each chunk is + * big enough to contain at least one batch. 
+ * @param minChunkSize defaults to 16 KiB, the minimum size of each chunk in bytes
+ * @returns a dataset of tokenized input and label sequences
+ */
+export function load(path: string, tokenizer: PreTrainedTokenizer,
+  blockSize: number, batchSize: number = 1, minChunkSize = 16384): Dataset<Text> {
   return new Dataset(async function* () {
-    const input = (await fs.open(path)).createReadStream({ encoding: "utf8" });
+    if (batchSize < 1 || !Number.isInteger(batchSize) ||
+      blockSize < 1 || !Number.isInteger(blockSize) ||
+      minChunkSize < 1 || !Number.isInteger(minChunkSize))
+      throw new Error("batchSize, blockSize and minChunkSize must be positive integers");
+    const sequenceLength = blockSize + 1 // + 1 to include the label of the last input token
+    // we want each chunk to be at least bigger than the block size (each chunk corresponds to a block)
+    // (or even bigger than batch size * block size so that each chunk corresponds to a batch)
+    const chunkTokenSize = batchSize * (sequenceLength)
+    // We read 8 * 8 = 64 bytes per expected token to ensure we have enough tokens
+    // For reference, the GPT-2 tokenizer encodes 3 to 4 bytes per token on average
+    const chunkByteSize = Math.max(minChunkSize, chunkTokenSize * 8 * 8);
+    debug("Setting the chunk size to %o bytes", chunkByteSize)
+    // Create a stream to read the text file chunk by chunk
+    const stream = createReadStream(path, {
+      encoding: "utf8",
+      highWaterMark: chunkByteSize
+    });
-    // `readline` is a bit overkill but seems standard
-    // https://nodejs.org/api/readline.html#example-read-file-stream-line-by-line
-    yield* readline.createInterface({ input, crlfDelay: Infinity });
+    // iterate over the chunks
+    let endOfPreviousChunk = ""
+    let alreadyAppliedPadding = false
+    for await (const chunk of stream) {
+      if (typeof chunk !== 'string') throw new Error('Expected file stream to yield string')
+      debug("Reading chunk of size %o", chunk.length)
+      // tokenize the whole chunk at once,
+      // concatenated with potential leftovers from the previous chunk
+      let tokens = processing.tokenize(tokenizer, endOfPreviousChunk + chunk)
+      if (tokens.size < sequenceLength) {
+        // throw if we need to apply padding more than once:
+        // padding is only valid if the whole text is smaller than the block size or
+        // if the very last chunk is smaller than the block size
+        if (alreadyAppliedPadding)
+          throw new Error(`the chunk (${tokens.size} tokens) is too small ` +
+            `to get a sequence of length blockSize (${sequenceLength} tokens). ` +
+            `Either the text file or the chunk size (${chunkByteSize} bytes) is too small.`);
+        // otherwise pad the sequence up to sequenceLength; only the very last chunk
+        // (or a text smaller than the block size) is expected to need this
+        debug("chunk smaller than block size, padding to blockSize")
+        yield processing.tokenize(tokenizer, endOfPreviousChunk + chunk, {
+          padding: true, max_length: sequenceLength
+        })
+        alreadyAppliedPadding = true
+        continue
+      }
+      debug("batch per chunk: %o", tokens.size / (batchSize * blockSize))
+      // yield one block of tokens at a time
+      while (tokens.size >= sequenceLength) {
+        yield tokens.take(sequenceLength);
+        tokens = tokens.slice(blockSize); // only shift by blockSize rather than sequenceLength
+      }
+      // keep the remaining tokens for the next chunk
+      // (leftovers from the very last chunk are padded and yielded after the loop)
+      if (tokens.size > 0) {
+        // We actually need to decode the tokens to get the leftover text
+        // instead of simply keeping the remaining tokens.
+        // This is because the tokens may be different once prepended to the next chunk,
+        // e.g.
if the remaining text is ". A" and the next chunk starts with "nother" + // the tokenization will be different than if we simply concatenate the remaining tokens + endOfPreviousChunk = tokenizer.decode( + tokens.toArray(), + { skip_special_tokens: true } + ) + debug("End of chunk, remaining text: '%s'", endOfPreviousChunk) + } else { + // Note that the difference between tokenizing and then concatenating + // vs concatenating and then tokenizing can happen if their is no + // remaining text. We consider this difference negligible + endOfPreviousChunk = ""; + } + } + if (endOfPreviousChunk.length === 0) return + + // flush the remaining text after the last chunk + yield processing.tokenize(tokenizer, endOfPreviousChunk, { + padding: true, max_length: sequenceLength + }) }); } diff --git a/discojs-web/src/loaders.spec.ts b/discojs-web/src/loaders.spec.ts index 603eb292c..5e1f4c86d 100644 --- a/discojs-web/src/loaders.spec.ts +++ b/discojs-web/src/loaders.spec.ts @@ -1,5 +1,7 @@ +import { AutoTokenizer } from "@xenova/transformers"; import { describe, it, expect } from "vitest"; - +import { List } from "immutable"; +import { processing, Dataset, Text } from "@epfml/discojs"; import { loadCSV, loadText } from "./loaders/index.js"; async function arrayFromAsync(iter: AsyncIterable): Promise { @@ -22,22 +24,83 @@ describe("csv parser", () => { }); describe("text parser", () => { - it("loads", async () => { + it("loads a simple sequence", async () => { + const text = ["first", "second", "third"].join("\n") + + const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2') + const expectedTokens = processing.tokenize(tokenizer, text) + // jsdom doesn't implement .text on File/Blob // trick from https://github.com/jsdom/jsdom/issues/2555 - const text = await ( - await fetch( - // data URL content need to be url-encoded - ["data:,first", "second", "third"].join("%0A"), - ) + const file = await ( + await fetch( "data:," + encodeURIComponent(text)) ).blob(); + const parsed = loadText(file, tokenizer, 4); - const parsed = loadText(text); + // should return 2 sequences: one with 4 tokens + 1 label token + // and the other with some padding and the label token + expect(await parsed.size()).to.equal(2); + expect((await arrayFromAsync(parsed))[0]).to.deep.equal(expectedTokens); + }); - expect(await arrayFromAsync(parsed)).to.have.ordered.members([ - "first", - "second", - "third", - ]); + async function checkEachSequence(parsed: Dataset, + expectedTokens: number[], blockSize: number) { + // ceiling because the remaining tokens in the last chunk are padded instead of dropped + // expect the number of sequences to be the total number of tokens divided by blockSize + expect(await parsed.size()).to.equal(Math.ceil(expectedTokens.length / blockSize)); + + let i = 0 + // exclude the last sequence because it has been padded + let sequences = List(await arrayFromAsync(parsed)) + // we expect the last sequence to have blockSize + 1 tokens via padding + expect(sequences.last()?.size).to.equal(blockSize + 1) + sequences = sequences.pop() + for await (const tokens of sequences) { + // each sequence has length blockSize + 1 (for the label) + expect(tokens.toArray()).to.deep.equal(expectedTokens.slice(i, i + blockSize + 1)); + // but the window should move by blockSize only + i += blockSize + } + } + + it("yields the correct block size", async () => { + const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2') + const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit. 
Sed quis faucibus ipsum." + const expectedTokens = processing.tokenize(tokenizer, text) + + const file = await ( + await fetch("data:," + encodeURIComponent(text)) + ).blob(); + + const blockSize = 4 + const parsed = loadText(file, tokenizer, blockSize); + await checkEachSequence(parsed, expectedTokens.toArray(), blockSize) + }); + it("reads multiple chunks", async () => { + const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2') + const text = [ + "Lorem ipsum dolor sit amet, consectetur adipiscing elit.", + "Donec sed risus maximus, ultricies ex sed, dictum elit.", + "Curabitur faucibus egestas enim et auctor. Quisque vel dignissim turpis.", + "Curabitur justo tellus, elementum sit amet erat eget, auctor ornare nisi.", + "Nunc tortor odio, ultrices id leo vitae, euismod congue ex. Curabitur arcu leo,", + "sagittis quis felis nec, imperdiet aliquet tellus.Integer a mollis nulla.", + "Quisque pulvinar lectus eget nisi pharetra, non molestie magna ullamcorper.", + "Sed porttitor diam non blandit molestie.Duis tristique arcu ut efficitur efficitur.", + "Fusce et ullamcorper tortor.Pellentesque a accumsan lacus, nec mollis risus.", + "Nunc quis eros a orci ultricies cursus. Maecenas sodales ipsum a magna ", + "malesuada efficitur.Maecenas at sapien blandit, egestas nisi eu, mollis elit." + ].join(" ") + + const expectedTokens = processing.tokenize(tokenizer, text).toArray() + const file = await ( + await fetch("data:," + encodeURIComponent(text)) + ).blob(); + + // set block size to 4 to get 1 sequence of 4 tokens + 1 label token + // so we expect 5 tokens per read + const blockSize = 4 + const parsed = loadText(file, tokenizer, blockSize); + await checkEachSequence(parsed, expectedTokens, blockSize) }); }); diff --git a/discojs-web/src/loaders/text.ts b/discojs-web/src/loaders/text.ts index 0aee95d74..500bf2e83 100644 --- a/discojs-web/src/loaders/text.ts +++ b/discojs-web/src/loaders/text.ts @@ -1,35 +1,80 @@ -import { Dataset, Text } from "@epfml/discojs"; +import createDebug from "debug"; +import { Dataset, Text, processing } from "@epfml/discojs"; +import { PreTrainedTokenizer } from '@xenova/transformers'; -class LineStream extends TransformStream { - constructor() { - let current_line = ""; +const debug = createDebug("discojs-web:loaders:text"); +/** + * Stream and tokenize text to yield tokenized sequences + * one at a time. Each sequence has size `blockSize` + 1, where the first `blockSize` + * tokens are the input and the last token is the label. The following sequence + * starts with the last token of the previous sequence (so the previous label is now the + * first input token). + * In other words, the stream yields sequences of size `blockSize` + 1 but with an overlap + * of 1 token between each sequence. 
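+ *
+ * For instance, assuming blockSize = 4 and a chunk that tokenizes to [t0, t1, ..., t9],
+ * the stream yields [t0, t1, t2, t3, t4], then [t4, t5, t6, t7, t8]; the leftover
+ * tokens t8, t9 are prepended to the next chunk, or padded into a final sequence of
+ * blockSize + 1 tokens once the stream ends.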
+ *
+ * @param file the file to read
+ * @param tokenizer the tokenizer to use, should match the model that will be trained
+ * @param blockSize the context length, the maximum number of tokens of input sequences
+ */
+class TokenizerStream extends TransformStream<string, Text> {
+  constructor(tokenizer: PreTrainedTokenizer, blockSize: number) {
+    const sequenceLength = blockSize + 1
+    let endOfPreviousChunk = ""
+    let alreadyAppliedPadding = false
     super({
       transform: (chunk, controller) => {
-        const [head, ...lines] = chunk.split(/\r\n|\r|\n/);
-        const first_line = current_line + head;
-
-        if (lines.length === 0) {
-          current_line = first_line;
-          return;
+        debug("yield TokenizerStream chunk of length: %o", chunk.length);
+        // tokenize the whole chunk at once
+        let tokens = processing.tokenize(tokenizer, endOfPreviousChunk + chunk);
+        if (tokens.size < sequenceLength) {
+          // throw if we need to apply padding more than once
+          if (alreadyAppliedPadding)
+            throw new Error(`the chunk (${tokens.size} tokens) is too small ` +
+              `to get a sequence of length blockSize (${sequenceLength} tokens). ` +
+              `Either the text file or the chunk size is too small.`);
+          // otherwise pad the sequence up to sequenceLength; only the very last chunk
+          // (or a text smaller than the block size) is expected to need this
+          debug("chunk smaller than block size, padding to blockSize")
+          controller.enqueue(processing.tokenize(tokenizer, endOfPreviousChunk + chunk, {
+            padding: true, max_length: sequenceLength
+          }));
+          alreadyAppliedPadding = true;
+          return
         }
-
-        controller.enqueue(first_line);
-        for (const line of lines.slice(0, -1)) controller.enqueue(line);
-
-        current_line = lines[lines.length - 1];
+        // yield one block of tokens at a time
+        // add 1 to include the next token for the prediction label
+        while (tokens.size >= sequenceLength) {
+          controller.enqueue(tokens.take(sequenceLength))
+          tokens = tokens.slice(blockSize); // only shift by blockSize rather than sequenceLength
+        }
+        // keep the remaining tokens for the next chunk
+        // (the flush callback pads and emits whatever remains after the final chunk)
+        endOfPreviousChunk = tokens.size ?
+ tokenizer.decode(tokens.toArray(), { skip_special_tokens: true }) + : endOfPreviousChunk = ""; }, - flush: (controller) => controller.enqueue(current_line), + flush: (controller) => { + if (endOfPreviousChunk.length === 0) return + // flush the remaining text after the last chunk + controller.enqueue(processing.tokenize(tokenizer, endOfPreviousChunk, { + padding: true, max_length: sequenceLength + })); + } }); } } -export function load(file: Blob): Dataset { +export function load(file: Blob, tokenizer: PreTrainedTokenizer, + blockSize: number): Dataset { return new Dataset(async function* () { + if (blockSize < 1 || !Number.isInteger(blockSize)) + throw new Error("blockSize must be a positive integer"); + const reader = file .stream() .pipeThrough(new TextDecoderStream()) - .pipeThrough(new LineStream()) + .pipeThrough(new TokenizerStream(tokenizer, blockSize)) .getReader(); while (true) { diff --git a/discojs/src/dataset/types.ts b/discojs/src/dataset/types.ts index 86700a28e..87428f62f 100644 --- a/discojs/src/dataset/types.ts +++ b/discojs/src/dataset/types.ts @@ -6,4 +6,4 @@ export type Batched = List; export { Image }; export type Tabular = Partial>; -export type Text = string; +export type Text = List; diff --git a/discojs/src/default_tasks/wikitext.ts b/discojs/src/default_tasks/wikitext.ts index 3d7760b09..6041025cb 100644 --- a/discojs/src/default_tasks/wikitext.ts +++ b/discojs/src/default_tasks/wikitext.ts @@ -33,9 +33,9 @@ export const wikitext: TaskProvider<'text'> = { // But if set to 0 then the webapp doesn't display the validation metrics validationSplit: 0.1, roundDuration: 2, - batchSize: 1, // If set too high (e.g. 16) firefox raises a WebGL error + batchSize: 8, // If set too high firefox raises a WebGL error tokenizer: 'Xenova/gpt2', - maxSequenceLength: 128, + maxSequenceLength: 64, tensorBackend: 'gpt' } } diff --git a/discojs/src/models/gpt/gpt.spec.ts b/discojs/src/models/gpt/gpt.spec.ts index 600cb9d89..8dd89bb7b 100644 --- a/discojs/src/models/gpt/gpt.spec.ts +++ b/discojs/src/models/gpt/gpt.spec.ts @@ -2,7 +2,7 @@ import { expect } from "chai"; import "@tensorflow/tfjs-node"; // speed up import { AutoTokenizer } from "@xenova/transformers"; -import { Dataset, DataFormat } from "../../index.js"; +import { Dataset, DataFormat, processing } from "../../index.js"; import { GPT } from "./index.js"; import { List, Repeat } from "immutable"; @@ -13,10 +13,8 @@ describe("gpt-tfjs", function () { const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2"); const data = "Lorem ipsum dolor sit"; - const dataTokens = List( - (tokenizer(data, { return_tensor: false }) as { input_ids: number[] }) - .input_ids, - ); + const dataTokens = processing.tokenize(tokenizer, data); + const dataset = new Dataset( Repeat([dataTokens.pop(), dataTokens.last()]), ).batch(64); @@ -33,10 +31,8 @@ describe("gpt-tfjs", function () { for await (const _ of model.train(dataset, undefined)); const input = "Lorem ipsum dolor"; - const inputTokens = List( - (tokenizer(input, { return_tensor: false }) as { input_ids: number[] }) - .input_ids, - ); + const inputTokens = processing.tokenize(tokenizer, data); + const outputToken: number = ( await model.predict(List.of(inputTokens)) ).first(); diff --git a/discojs/src/models/gpt/model.ts b/discojs/src/models/gpt/model.ts index 92a27e38d..a073be2b1 100644 --- a/discojs/src/models/gpt/model.ts +++ b/discojs/src/models/gpt/model.ts @@ -59,7 +59,7 @@ export class GPTModel extends tf.LayersModel { const callbacks = trainingArgs.callbacks as 
tf.CustomCallbackArgs const evalDataset = trainingArgs.validationData as tf.data.Dataset<{ xs: tf.Tensor2D, ys: tf.Tensor3D }> await callbacks.onTrainBegin?.() - + for (let epoch = 1; epoch <= trainingArgs.epochs; epoch++) { let accuracyFraction: [number, number] = [0, 0]; let averageLoss = 0 @@ -75,7 +75,7 @@ export class GPTModel extends tf.LayersModel { let preprocessingTime = performance.now() await Promise.all([xs.data(), ys.data()]) preprocessingTime = performance.now() - preprocessingTime - + // TODO include as a tensor inside the model const accTensor = tf.tidy(() => { const logits = this.apply(xs) @@ -92,7 +92,7 @@ export class GPTModel extends tf.LayersModel { if (typeof accSum !== 'number') throw new Error('got multiple accuracy sum') accuracyFraction = [accuracyFraction[0] + accSum, accuracyFraction[1] + accSize]; - tf.dispose([accTensor]) + tf.dispose([accTensor]) const lossTensor = tf.tidy(() => { const { grads, value: lossTensor } = this.optimizer.computeGradients(() => { @@ -141,7 +141,7 @@ export class GPTModel extends tf.LayersModel { tf.dispose([xs, ys]) } let logs: tf.Logs = { - 'loss': averageLoss / iteration, + 'loss': averageLoss / (iteration - 1), // -1 because iteration got incremented at the end of the loop 'acc': accuracyFraction[0] / accuracyFraction[1], } if (evalDataset !== undefined) { diff --git a/discojs/src/processing/index.ts b/discojs/src/processing/index.ts index 2824a63b3..cb9e8703b 100644 --- a/discojs/src/processing/index.ts +++ b/discojs/src/processing/index.ts @@ -18,10 +18,10 @@ export * from "./image.js"; export * from "./tabular.js"; export * from "./text.js"; -export async function preprocess( +export function preprocess( task: Task, dataset: Dataset, -): Promise> { +): Dataset { switch (task.trainingInformation.dataType) { case "image": { // cast as typescript doesn't reduce generic type @@ -55,28 +55,16 @@ export async function preprocess( case "text": { // cast as typescript doesn't reduce generic type const d = dataset as Dataset; - const t = task as Task<"text">; - - const tokenizer = await models.getTaskTokenizer(t); - const totalTokenCount = - task.trainingInformation.maxSequenceLength ?? - (tokenizer.model_max_length as number); - - return d - .map((line) => - processing.tokenizeAndLeftPad(line, tokenizer, totalTokenCount), - ) - .map((tokens) => [tokens.pop(), tokens.last()]) as Dataset< - DataFormat.ModelEncoded[D] - >; + return d.map((tokens) => [tokens.pop(), tokens.last()]) as + Dataset; } } } -export async function preprocessWithoutLabel( +export function preprocessWithoutLabel( task: Task, dataset: Dataset, -): Promise> { +): Dataset { switch (task.trainingInformation.dataType) { case "image": { // cast as typescript doesn't reduce generic type @@ -101,18 +89,8 @@ export async function preprocessWithoutLabel( case "text": { // cast as typescript doesn't reduce generic type const d = dataset as Dataset; - const t = task as Task<"text">; - const tokenizer = await models.getTaskTokenizer(t); - const totalTokenCount = - t.trainingInformation.maxSequenceLength ?? 
- (tokenizer.model_max_length as number); - - return d - .map((line) => - processing.tokenizeAndLeftPad(line, tokenizer, totalTokenCount), - ) - .map((tokens) => tokens.pop()); + return d.map((tokens) => tokens.pop()) } } } diff --git a/discojs/src/processing/text.spec.ts b/discojs/src/processing/text.spec.ts index 992cf1163..d7029c92c 100644 --- a/discojs/src/processing/text.spec.ts +++ b/discojs/src/processing/text.spec.ts @@ -1,12 +1,19 @@ import { expect } from "chai"; -import { tokenizeAndLeftPad } from "./text.js"; +import { tokenize } from "./text.js"; import { AutoTokenizer } from "@xenova/transformers"; import { Repeat } from "immutable"; describe("text processing", () => { - const text = - "Hello world, a bc 1 2345, '? 976. Wikipedia is a free content online encyclopedia written and maintained by a community \n of volunteers, known as Wikipedians. Founded by Jimmy Wales and Larry Sanger on January 15, 2001, Wikipedia is hosted by the Wikimedia Foundation, an American nonprofit organization that employs a staff of over 700 people.[7]"; + const text = [ + "Hello world, a bc 1 2345, '? 976. Wikipedia is a free content online encyclopedia", + "written and maintained by a community \n of volunteers, known as Wikipedians.", + "Founded by Jimmy Wales and Larry Sanger on January 15, 2001, Wikipedia is hosted by the", + "Wikimedia Foundation, an American nonprofit organization that employs a staff of over 700 people.[7]" + ].join(" "); + + "Hello world, a bc 1 2345, '? 976. Wikipedia is a free content online encyclopedia written and maintained by a community \n of volunteers, known as Wikipedians. Founded by Jimmy Wales and Larry Sanger on January 15, 2001, Wikipedia is hosted by the Wikimedia Foundation, an American nonprofit organization that employs a staff of over 700 people.[7]" + const expectedTokens = [ 15496, 995, 11, 257, 47125, 352, 2242, 2231, 11, 705, 30, 860, 4304, 13, 15312, 318, 257, 1479, 2695, 2691, 45352, 3194, 290, 9456, 416, 257, 2055, @@ -16,29 +23,64 @@ describe("text processing", () => { 257, 3085, 286, 625, 13037, 661, 3693, 22, 60, ]; - it("tokenizes text", async () => { - const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2"); - - const tokens = tokenizeAndLeftPad(text, tokenizer, expectedTokens.length); + const shortText = 'import { AutoTokenizer } from "@xenova/transformers";' + // with GPT 2 tokenizer + const shortExpectedTokens = [ + 11748, 1391, 11160, 30642, 7509, 1782, 422, + 44212, 87, 268, 10071, 14, 35636, 364, 8172 + ] + it("can tokenize text with the Llama 3 tokenizer", async () => { + const tokenizer = await AutoTokenizer.from_pretrained("Xenova/llama-3-tokenizer"); + // Tokenizer playgrounds aren't consistent: https://github.com/huggingface/transformers.js/issues/1019 + // Tokenization with python: + // from transformers import AutoTokenizer + // tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B") + // tokenizer.encode(text, add_special_tokens=False) + const expectedTokens = [ + 9906, 1917, 11, 264, 18399, 220, 16, 220, 11727, 20, 11, 32167, + 220, 25208, 13, 27685, 374, 264, 1949, 2262, 2930, 83708, 5439, 323, 18908, + 555, 264, 4029, 720, 315, 23872, 11, 3967, 439, 119234, 291, 5493, 13, 78811, + 555, 28933, 23782, 323, 30390, 328, 4091, 389, 6186, 220, 868, 11, 220, 1049, + 16, 11, 27685, 374, 21685, 555, 279, 90940, 5114, 11, 459, 3778, 33184, 7471, + 430, 51242, 264, 5687, 315, 927, 220, 7007, 1274, 8032, 22, 60 + ] + const tokens = tokenize(tokenizer, text); 
expect(tokens.toArray()).to.be.deep.equal(expectedTokens); }); - it("tokenizes until wanted size", async () => { + it("can tokenize text with the GPT2 tokenizer", async () => { const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2"); - const tokens = tokenizeAndLeftPad(text, tokenizer, 10); + const tokens = tokenize(tokenizer, text); + expect(tokens.toArray()).to.be.deep.equal(expectedTokens); + }); + + it("truncates until expected length", async () => { + const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2"); + const tokens = tokenize(tokenizer, text, {truncation: true, max_length: 10}); expect(tokens.toArray()).to.be.deep.equal(expectedTokens.slice(0, 10)); }); - it("pads until enough token are generated", async () => { + it("pads sequence until enough token are generated", async () => { const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2"); + const max_length = 20 - const tokens = tokenizeAndLeftPad("", tokenizer, 10); - - expect(tokens.toArray()).to.be.deep.equal( - Repeat(tokenizer.pad_token_id, 10).toArray(), + const tokens = tokenize(tokenizer, shortText, {padding: true, max_length}); + const paddedSequence = Repeat(tokenizer.pad_token_id, max_length - shortExpectedTokens.length) + .concat(shortExpectedTokens).toArray(); + expect(tokens.toArray()).to.be.deep.equal(paddedSequence); + }); + + it("can pad on right side", async () => { + const tokenizer = await AutoTokenizer.from_pretrained("Xenova/gpt2"); + const max_length = 20 + + const tokens = tokenize(tokenizer, shortText, {padding: true, padding_side: 'right', max_length}); + const paddedSequence = shortExpectedTokens.concat( + Repeat(tokenizer.pad_token_id, max_length - shortExpectedTokens.length).toArray() ); + expect(tokens.toArray()).to.be.deep.equal(paddedSequence); }); -}); +}); \ No newline at end of file diff --git a/discojs/src/processing/text.ts b/discojs/src/processing/text.ts index 393101bb5..e28dfba3e 100644 --- a/discojs/src/processing/text.ts +++ b/discojs/src/processing/text.ts @@ -1,47 +1,56 @@ -import { List, Repeat } from "immutable"; +import { List } from "immutable"; import { PreTrainedTokenizer } from "@xenova/transformers"; +import type { Text } from '../index.js' function isArrayOfNumber(raw: unknown): raw is number[] { return Array.isArray(raw) && raw.every((e) => typeof e === "number"); } -type Token = number; +interface TokenizingConfig { + // default to false, if true pads to max_length + padding?: boolean, + padding_side?: 'left' | 'right', // default to left + truncation?: boolean, + max_length?: number, // the max sequence length used if padding or truncation is enabled + text_pair?: string | null, + add_special_tokens?: boolean, + return_token_type_ids?: boolean, +} /** - * Tokenize and truncates input strings - * - * @param length number of tokens - * @returns encoded string in an array of token, size of max_length + * Tokenize one line of text. + * Wrapper around Transformers.js tokenizer to handle type checking and format the output. + * Note that Transformers.js's tokenizer can tokenize multiple lines of text at once + * but we are currently not making use of it. 
Can be useful when padding a batch + * + * @param tokenizer the tokenizer object + * @param text the text to tokenize + * @param config TokenizingConfig, the tokenizing parameters when using `tokenizer` + * @returns number[] the tokenized text */ -export function tokenizeAndLeftPad( - line: string, - tokenizer: PreTrainedTokenizer, - length: number, -): List { - if (!Number.isInteger(length)) throw new Error("length should be an integer"); +export function tokenize(tokenizer: PreTrainedTokenizer, text: string, config?: TokenizingConfig): Text { + config = { ...config }; // create a config if undefined + + if (config.padding || config.truncation) { + if (config.max_length === undefined) throw new Error("max_length needs to be specified to use padding or truncation"); + if (!Number.isInteger(config.max_length)) throw new Error("max_length should be an integer"); + } + + if (config.padding) { + // The padding side is set as an attribute, not in the config + tokenizer.padding_side = config.padding_side !== undefined ? config.padding_side : 'left' + config.truncation = true // for a single sequence, padding implies truncation to max_length + } - // Transformers.js currently only supports right padding while we need left for text generation - // Right padding should be supported in the future, once it is, we can directly pad while tokenizing - // https://github.com/xenova/transformers.js/blob/8804c36591d11d8456788d1bb4b16489121b3be2/src/tokenizers.js#L2517 - const tokenized: unknown = tokenizer(line, { - padding: false, - truncation: true, - return_tensor: false, - max_length: length, - }); + const tokenizerResult: unknown = tokenizer(text, {...config, return_tensor: false}); if ( - typeof tokenized !== "object" || - tokenized === null || - !("input_ids" in tokenized) || - !isArrayOfNumber(tokenized.input_ids) + typeof tokenizerResult !== "object" || + tokenizerResult === null || + !("input_ids" in tokenizerResult) || + !isArrayOfNumber(tokenizerResult.input_ids) ) - throw new Error("tokenizer returns unexpected type"); - const tokens: Token[] = tokenized.input_ids; - - const paddingSize = length - tokens.length; - if (paddingSize < 0) - throw new Error("tokenized returned more token than expected"); - - return Repeat(tokenizer.pad_token_id, paddingSize).concat(tokens).toList(); + throw new Error("tokenizer returned unexpected type"); + + return List(tokenizerResult.input_ids) } diff --git a/docs/examples/wikitext.ts b/docs/examples/wikitext.ts index a5f5a79cd..5816df962 100644 --- a/docs/examples/wikitext.ts +++ b/docs/examples/wikitext.ts @@ -1,6 +1,6 @@ import "@tensorflow/tfjs-node" -import { Disco, fetchTasks, models, Task } from '@epfml/discojs' +import { Disco, fetchTasks, models, processing, Task } from '@epfml/discojs' import { saveModelToDisk, loadModelFromDisk, loadText } from '@epfml/discojs-node' import { List } from "immutable" @@ -19,11 +19,15 @@ async function main(): Promise { // Toggle TRAIN_MODEL to either train and save a new model from scratch or load an existing model const TRAIN_MODEL = true + + // Retrieve the tokenizer + const tokenizer = await models.getTaskTokenizer(task) if (TRAIN_MODEL) { + const blockSize = task.trainingInformation.maxSequenceLength ?? 
128 + const batchSize = task.trainingInformation.batchSize // Load the wikitext dataset from the `datasets` folder - const dataset = loadText("../../datasets/wikitext/wiki.train.tokens").chain( - loadText("../../datasets/wikitext/wiki.valid.tokens"), - ); + const dataset = loadText("../../datasets/wikitext/wiki.train.tokens", tokenizer, blockSize, batchSize) + .chain(loadText("../../datasets/wikitext/wiki.valid.tokens", tokenizer, blockSize, batchSize)); // Initialize a Disco instance and start training a language model const disco = new Disco(task, url, { scheme: 'federated' }) @@ -37,14 +41,8 @@ async function main(): Promise { // Load the trained model model = await loadModelFromDisk(`${modelFolder}/${modelFileName}`) as models.GPT } - - // Tokenize as in training - const tokenizer = await models.getTaskTokenizer(task) const prompt = 'The game began development in 2010 , carrying over a large portion' - let tokens = List( - (tokenizer(prompt, { return_tensor: false }) as { input_ids: number[] }) - .input_ids, - ); + let tokens = processing.tokenize(tokenizer, prompt) // Predict a few tokens const numberOfTokens = 10; diff --git a/server/tests/e2e/federated.spec.ts b/server/tests/e2e/federated.spec.ts index 31b1de9fb..4bf96e123 100644 --- a/server/tests/e2e/federated.spec.ts +++ b/server/tests/e2e/federated.spec.ts @@ -4,7 +4,7 @@ import type * as http from "node:http"; import path from "node:path"; import type { RoundStatus, WeightsContainer } from "@epfml/discojs"; -import { Disco, defaultTasks } from "@epfml/discojs"; +import { Disco, defaultTasks, models } from "@epfml/discojs"; import { loadCSV, loadImagesInDir, loadText } from "@epfml/discojs-node"; import { Server } from "../../src/index.js"; @@ -92,10 +92,16 @@ describe("end-to-end federated", () => { async function wikitextUser(): Promise { const task = defaultTasks.wikitext.getTask(); task.trainingInformation.epochs = 2; - + const tokenizer = await models.getTaskTokenizer(task) + const blockSize = task.trainingInformation.maxSequenceLength ?? 
8 + const batchSize = task.trainingInformation.batchSize const dataset = loadText( path.join(DATASET_DIR, "wikitext", "wiki.train.tokens"), - ).chain(loadText(path.join(DATASET_DIR, "wikitext", "wiki.valid.tokens"))); + tokenizer, blockSize, batchSize + ).chain(loadText( + path.join(DATASET_DIR, "wikitext", "wiki.valid.tokens"), + tokenizer, blockSize, batchSize + )); const disco = new Disco(task, url, { scheme: "federated" }); diff --git a/webapp/cypress/e2e/testing.cy.ts b/webapp/cypress/e2e/testing.cy.ts index 0647d0675..70bdfb57f 100644 --- a/webapp/cypress/e2e/testing.cy.ts +++ b/webapp/cypress/e2e/testing.cy.ts @@ -46,20 +46,29 @@ it("can test lus_covid", () => { it("can start and stop testing of wikitext", () => { setupServerWith(defaultTasks.wikitext); - cy.visit("/#/evaluate"); cy.contains("button", "download").click(); cy.contains("button", "test").click(); - + + // input the dataset cy.contains("label", "select text").selectFile( "../datasets/wikitext/wiki.test.tokens", ); - cy.contains("button", "next").click(); + + // NOTE: internet connection needed + // wait for the tokenizer to load and the filename to display + // otherwise the training starts before the dataset is ready + cy.contains("Connect your data") + .parent() + .parent() + .contains("wiki.test.tokens", { timeout: 20_000 }); + cy.contains("button", "next").click(); + cy.contains("Test & validate") .parent() .parent() .contains("button", "test") .click(); - cy.contains("button", "stop testing").click(); + cy.contains("button", "stop testing").click({ waitForAnimations: false }); }); diff --git a/webapp/cypress/support/e2e.ts b/webapp/cypress/support/e2e.ts index 7d36f290e..60897183c 100644 --- a/webapp/cypress/support/e2e.ts +++ b/webapp/cypress/support/e2e.ts @@ -82,3 +82,5 @@ beforeEach(() => .getDirectory() .then((root) => root.removeEntry("models", { recursive: true })), ); + +beforeEach(() => { localStorage.debug = "discojs*,webapp*" }); diff --git a/webapp/src/components/dataset_input/FileSelection.vue b/webapp/src/components/dataset_input/FileSelection.vue index fb521ef15..250412411 100644 --- a/webapp/src/components/dataset_input/FileSelection.vue +++ b/webapp/src/components/dataset_input/FileSelection.vue @@ -67,14 +67,21 @@ v-if="files !== undefined" class="pt-4 flex flex-col items-center pb-5" > -
+ + Number of selected files: + {{ files.size }} - Number of selected files: - {{ files.size }} - {{ files.first()?.name ?? "none" }} + + {{ files.first()?.name ?? "none" }}
@@ -89,6 +96,7 @@ diff --git a/webapp/src/components/dataset_input/UnlabeledDatasetInput.vue b/webapp/src/components/dataset_input/UnlabeledDatasetInput.vue index bc139061d..da5dbf9d8 100644 --- a/webapp/src/components/dataset_input/UnlabeledDatasetInput.vue +++ b/webapp/src/components/dataset_input/UnlabeledDatasetInput.vue @@ -3,7 +3,7 @@ - +
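
A minimal end-to-end sketch of the new chunk-based loader, mirroring the updated cli/src/train_gpt.ts above (the wikitext path and the Xenova/gpt2 tokenizer come from this patch; adjust them to your own setup):

import { AutoTokenizer } from "@xenova/transformers";
import { List } from "immutable";
import { models } from "@epfml/discojs";
import { loadText } from "@epfml/discojs-node";

async function main(): Promise<void> {
  const config: models.GPTConfig = {
    modelType: 'gpt-nano',
    lr: 0.01,
    maxIter: 10,
    blockSize: 16,
  };
  const batchSize = 8;
  const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2');

  // loadText now streams the file chunk by chunk and yields token sequences of
  // length blockSize + 1: blockSize input tokens plus the label of the last one
  const dataset = loadText(
    '../datasets/wikitext/wiki.train.tokens',
    tokenizer, config.blockSize, batchSize,
  );

  // split each sequence into (input tokens, label of the last input token) and batch it
  const preprocessed = dataset
    .map((tokens) => [tokens.pop(), tokens.last()] as [List<number>, number])
    .batch(batchSize);

  const model = new models.GPT(config);
  for await (const logs of model.train(preprocessed, undefined)) console.log(logs);
}

main().catch(console.error);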
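
The reworked processing.tokenize helper wraps the Transformers.js tokenizer and returns an immutable List of token ids; padding and truncation are opt-in through its config, as exercised in discojs/src/processing/text.spec.ts. A short usage sketch (the sample strings are illustrative only):

import { AutoTokenizer } from "@xenova/transformers";
import { processing } from "@epfml/discojs";

async function tokenizeExamples(): Promise<void> {
  const tokenizer = await AutoTokenizer.from_pretrained('Xenova/gpt2');
  const text = "Lorem ipsum dolor sit amet, consectetur adipiscing elit.";

  // plain tokenization: returns a List<number> of token ids
  const tokens = processing.tokenize(tokenizer, text);

  // truncation: keep at most 10 tokens
  const truncated = processing.tokenize(tokenizer, text, { truncation: true, max_length: 10 });

  // padding: left-pad (the default side) up to 20 tokens;
  // pass padding_side: 'right' to pad on the right instead
  const padded = processing.tokenize(tokenizer, "Hello world", { padding: true, max_length: 20 });

  console.log(tokens.size, truncated.size, padded.size);
}

tokenizeExamples().catch(console.error);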