diff --git a/docs/llm-example/.eslintrc.json b/docs/llm-example/.eslintrc.json
new file mode 100644
index 000000000..274494505
--- /dev/null
+++ b/docs/llm-example/.eslintrc.json
@@ -0,0 +1,15 @@
+{
+  "extends": "standard-with-typescript",
+  "parserOptions": {
+    "project": "./tsconfig.eslint.json"
+  },
+  "env": {
+    "node": true,
+    "es6": true,
+    "mocha": true
+  },
+  "rules": {
+    // TODO need strict
+    "@typescript-eslint/strict-boolean-expressions": "off"
+  }
+}
diff --git a/docs/llm-example/README.md b/docs/llm-example/README.md
new file mode 100644
index 000000000..58acc2d2b
--- /dev/null
+++ b/docs/llm-example/README.md
@@ -0,0 +1,191 @@
+# =================================
+
+### MOVE THE LLM-EXAMPLE/ FOLDER TO THE ROOT PROJECT FOLDER IN ORDER FOR THIS TO WORK
+
+# =================================
+
+# Prerequisites
+
+- nvm: https://github.com/nvm-sh/nvm#installing-and-updating
+- Bun: https://github.com/oven-sh/bun
+
+# Installation
+
+### Required
+
+The following steps are always needed:
+
+```sh
+# install dependencies in discojs-core
+cd discojs/discojs-core/
+bun install
+
+# install dependencies in discojs-node
+cd ../discojs-node/
+bun install
+
+# install dependencies in server
+cd ../../server/
+bun install
+
+# install dependencies in experiment and download + preprocess dataset
+cd ../experiment
+bun install
+
+# Dataset installation: install one dataset or both; you can later choose which one to preprocess / train on
+./install-wikitext.sh # Installs wikitext-103 dataset
+./install-shakespeare.sh # Installs tiny-shakespeare dataset
+bun preprocess.ts [wikitext-103 | tiny-shakespeare]
+```
+
+### Running on Node
+
+```sh
+cd experiment/
+bun main.ts [wikitext-103 | tiny-shakespeare]
+```
+
+### Running on a browser
+
+```sh
+# install dependencies in discojs-web
+cd ../discojs/discojs-web/
+bun install
+
+# install dependencies for the browser server and run it
+cd ../../browser/server
+bun install
+bun run dev
+
+# [in a separate terminal] install dependencies for the browser client and run it
+cd ../client
+nvm use 18 # Node version 18.x or later is required for Next.js
+bun install
+bun run dev
+
+# Navigate to http://localhost:3000 on your browser of choice and click on "train"
+# If you would like to use WebGPU, Firefox won't work; run the following command to launch Chrome with WebGPU enabled
+# (I advise running this command in a separate terminal tab as well, because it keeps logging even in detached mode)
+google-chrome --enable-unsafe-webgpu --enable-features=Vulkan,UseSkiaRenderer &
+# Or from the browser/client/ directory
+./chrome-webgpu.sh # equivalent to the command above
+```
+
+# Running tests
+
+To run tests, you first need to follow the "Required" section of the installation instructions.
+
+### Testing on Node
+
+```sh
+# Follow the instructions under "Running on Node" before proceeding
+cd discojs/discojs-node/
+bun --bun test text-loader.spec.ts
+```
+
+### Testing on a "simulated" browser
+
+```sh
+# Since the following will test the web version,
+# the websocket server needs to be running.
+# Follow the first 2 steps of the installation instructions
+# under "Running on a browser" before proceeding
+cd browser/server/
+bun --bun socket.ts
+
+# In a new terminal tab
+cd discojs/discojs-web/
+bun --bun test text_loader.spec.ts
+```
+
+# Benchmarks
+
+## Text Loader
+
+The text loader is benchmarked by iterating 1000 times over the dataset and taking the average time in ms. The vocabulary size is set to 50257.
+We vary the batch and block sizes and report the results below.
+Tests were run on an AMD Ryzen 5 7600 CPU.
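+
+For reference, the timing loop is roughly the following (a sketch; `datasplit`
+stands for the object returned by `loadData`, assuming a tf.data-style async
+iterator):
+
+```ts
+// hypothetical: datasplit comes from loadData(task, 'wikitext-103')
+const iterations = 1000
+const iter = await datasplit.train.dataset.iterator()
+const start = performance.now()
+for (let i = 0; i < iterations; i++) await iter.next()
+console.log(`${((performance.now() - start) / iterations).toFixed(3)} ms / iter`)
+```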
+
+### Node
+
+```py
+# (batch, block) = time / iter
+- (4, 64) = 1.481 ms
+- (4, 128) = 2.564 ms
+- (4, 256) = 2.213 ms
+- (4, 512) = 3.284 ms
+- (16, 64) = 1.912 ms
+- (16, 128) = 3.323 ms
+- (16, 256) = 6.499 ms
+- (16, 512) = 12.131 ms
+- (32, 64) = 3.299 ms
+- (32, 128) = 6.579 ms
+- (32, 256) = 12.325 ms
+- (32, 512) = 23.752 ms
+```
+
+### Web (simulated)
+
+```py
+# (batch, block) = time / iter
+- (4, 64) = 1.617 ms
+- (4, 128) = 2.725 ms
+- (4, 256) = 2.162 ms
+- (4, 512) = 3.603 ms
+- (16, 64) = 2.120 ms
+- (16, 128) = 3.751 ms
+- (16, 256) = 6.796 ms
+- (16, 512) = 12.837 ms
+- (32, 64) = 3.598 ms
+- (32, 128) = 6.883 ms
+- (32, 256) = 12.718 ms
+- (32, 512) = 25.475 ms
+```
+
+### Web (actual browser)
+
+## Training on GPT
+
+TODO: put wandb url
+
+### Node
+
+### Web (actual browser)
+
+# TODO
+
+1. Benchmark all
+2. Try new dataset
+3. Try new model
+4. Investigate whether Node v18 is required everywhere now
+
+# Future work
+
+1. Disco support for various backends (WebGPU especially) using `tf.setBackend`, and benchmarking on them (see the sketch below)
+2. Support for a dedicated tfjs model, allowing a custom training loop, e.g. `GPTModel extends Model`. This is partially implemented but not complete (issues in Trainer / TrainerBuilder?)
+3. Refactor Task, add generic types
+4. QLoRA in disco core, or at least for GPT
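+
+For item 1, switching backends in tfjs is a one-liner; a sketch (assuming the
+WebGPU backend package `@tensorflow/tfjs-backend-webgpu` is installed):
+
+```ts
+import * as tf from '@tensorflow/tfjs'
+import '@tensorflow/tfjs-backend-webgpu'
+
+await tf.setBackend('webgpu')
+await tf.ready()
+console.log('Active backend:', tf.getBackend())
+```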
diff --git a/docs/llm-example/bun.lockb b/docs/llm-example/bun.lockb
new file mode 100755
index 000000000..148f0cae1
Binary files /dev/null and b/docs/llm-example/bun.lockb differ
diff --git a/docs/llm-example/config.ts b/docs/llm-example/config.ts
new file mode 100755
index 000000000..ffe9963c4
--- /dev/null
+++ b/docs/llm-example/config.ts
@@ -0,0 +1,80 @@
+import { Config, Models as Model } from './tfjs-types'
+
+export const configModels = {
+    gpt2: {
+        nLayer: 12,
+        nHead: 12,
+        nEmbd: 768,
+        vocabSize: 50257,
+        blockSize: 1024,
+    },
+    'gpt2-medium': {
+        nLayer: 24,
+        nHead: 16,
+        nEmbd: 1024,
+        vocabSize: 50257,
+        blockSize: 1024,
+    },
+    'gpt2-large': {
+        nLayer: 36,
+        nHead: 20,
+        nEmbd: 1280,
+        vocabSize: 50257,
+        blockSize: 1024,
+    },
+    'gpt2-xl': {
+        nLayer: 48,
+        nHead: 25,
+        nEmbd: 1600,
+        vocabSize: 50257,
+        blockSize: 1024,
+    },
+    'gpt-mini': { nLayer: 6, nHead: 6, nEmbd: 192 },
+    'gpt-micro': { nLayer: 4, nHead: 4, nEmbd: 128 },
+    'gpt-nano': { nLayer: 3, nHead: 3, nEmbd: 48 },
+} as const
+
+const modelType: Model = 'gpt-nano'
+const model = configModels[modelType]
+const dataset = 'wikitext-103'
+const batchSize = 8
+const blockSize = 128 // = sequence length
+const lr = 0.001
+const maxIter = 10
+
+const baseConfig = {
+    debug: false,
+    verbose: false,
+
+    modelType,
+    ...model,
+
+    dataset,
+    batchSize,
+    blockSize,
+    lr,
+    maxIter,
+    shuffle: NaN,
+    weightDecay: false, // If set, the wasm backend won't work because of the custom AdamW optimizer
+    optimizer: 'adamw',
+    gradClip: 1,
+    scheduler: null,
+    embdDrop: 0.2,
+    bias: true,
+    numWorkers: 0,
+    vocabSize: 50257,
+
+    wandbProject: 'disco-gpt-benchmark',
+
+    evalFreq: 25,
+    evalSeqPrefix: 'none',
+    maxEvalBatches: 24,
+} as const
+
+const config: Config = {
+    ...baseConfig,
+    residDrop: baseConfig.embdDrop,
+    wandbName: `${modelType}_${dataset}_bs=${batchSize}_seq=${blockSize}_lr=${lr}_iter=${maxIter}`,
+} as const
+
+export default config
diff --git a/docs/llm-example/data.ts b/docs/llm-example/data.ts
new file mode 100644
index 000000000..7c8625111
--- /dev/null
+++ b/docs/llm-example/data.ts
@@ -0,0 +1,101 @@
+import fs from 'fs'
+import { readdir } from 'fs/promises'
+import path from 'path'
+import Rand from 'rand-seed'
+import { dataset, Task, node } from '@epfml/discojs-node'
+import { TOKENIZED_FILE_EXTENSION } from './preprocess'
+
+async function getDatasetSource(
+    root: string,
+    splits: (keyof dataset.loader.TextSource)[]
+): Promise<dataset.loader.TextSource> {
+    console.log('Preprocessed dataset located at:', root)
+    const files = await readdir(root)
+    return Object.fromEntries(
+        splits.map((split) => {
+            const splitFiles = files.filter(
+                (f) => f.endsWith(TOKENIZED_FILE_EXTENSION) && f.includes(split)
+            )
+
+            console.log(
+                'Found',
+                splitFiles.length,
+                'files in dataset for the',
+                split,
+                'split.'
+            )
+
+            const splitFilesPath = splitFiles.map((f) => path.join(root, f))
+            return [split, splitFilesPath]
+        })
+    ) as dataset.loader.TextSource
+}
+
+export async function loadData(
+    task: Task,
+    name: string,
+    config?: Partial<dataset.loader.TextConfig>
+): Promise<dataset.DataSplit> {
+    // TODO: Make this even more generic so that it works for any dataset / any task
+    // 1) move getDatasetSource to core so that the web version can use it as well
+    /* @ts-ignore - for import.meta.dir */
+    const root = path.join(import.meta.dir, 'datasets', name)
+    const source = await getDatasetSource(root, ['train', 'validation'])
+    return await new node.dataset.loader.NodeTextLoader(task).loadAll(
+        source,
+        config
+    )
+}
+
+const rand = new Rand('1234')
+
+// Deterministic Fisher-Yates shuffle of two parallel arrays (seeded above)
+function shuffle<T, U>(array: T[], arrayTwo: U[]): void {
+    for (let i = array.length - 1; i > 0; i--) {
+        const j = Math.floor(rand.next() * (i + 1))
+        const temp = array[i]
+        array[i] = array[j]
+        array[j] = temp
+
+        const tempTwo = arrayTwo[i]
+        arrayTwo[i] = arrayTwo[j]
+        arrayTwo[j] = tempTwo
+    }
+}
+
+function filesFromFolder(dir: string, folder: string): string[] {
+    const f = fs.readdirSync(dir + folder)
+    return f.map((file) => dir + folder + '/' + file)
+}
+
+export async function loadDataFace(task: Task): Promise<dataset.DataSplit> {
+    const dir = '../example_training_data/simple_face/'
+    const youngFolders = ['child']
+    const oldFolders = ['adult']
+
+    const youngFiles = youngFolders.flatMap((folder) => {
+        return filesFromFolder(dir, folder)
+    })
+
+    const oldFiles = oldFolders.flatMap((folder) => {
+        return filesFromFolder(dir, folder)
+    })
+
+    const filesPerFolder = [youngFiles, oldFiles]
+
+    const labels = filesPerFolder.flatMap((files, index) =>
+        Array(files.length).fill(index)
+    )
+    const files = filesPerFolder.flat()
+
+    shuffle(files, labels)
+
+    return await new node.dataset.loader.NodeImageLoader(task).loadAll(files, {
+        labels: labels,
+    })
+}
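+
+// Usage sketch (hypothetical values, mirroring main.ts):
+//
+//   const task = tasks.get('wikitext-103')
+//   const split = await loadData(task, 'wikitext-103', { blockSize: 128, batchSize: 8 })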
diff --git a/docs/llm-example/install-shakespeare.sh b/docs/llm-example/install-shakespeare.sh
new file mode 100755
index 000000000..c21d948e4
--- /dev/null
+++ b/docs/llm-example/install-shakespeare.sh
@@ -0,0 +1,3 @@
+mkdir -p ./datasets/tiny-shakespeare
+curl https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt --output ./datasets/tiny-shakespeare/train
+
diff --git a/docs/llm-example/install-wikitext.sh b/docs/llm-example/install-wikitext.sh
new file mode 100755
index 000000000..cf209b899
--- /dev/null
+++ b/docs/llm-example/install-wikitext.sh
@@ -0,0 +1,8 @@
+mkdir -p ./datasets/wikitext
+mkdir -p ./datasets/wikitext-103
+curl https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip --output ./datasets/wikitext/raw.zip
+unzip ./datasets/wikitext/raw.zip -d ./datasets/wikitext-103
+mv ./datasets/wikitext-103/wikitext-103-raw/wiki.train.raw ./datasets/wikitext-103/train
+mv ./datasets/wikitext-103/wikitext-103-raw/wiki.test.raw ./datasets/wikitext-103/test
+mv ./datasets/wikitext-103/wikitext-103-raw/wiki.valid.raw ./datasets/wikitext-103/validation
+rmdir ./datasets/wikitext-103/wikitext-103-raw/
\ No newline at end of file
diff --git a/docs/llm-example/main.ts b/docs/llm-example/main.ts
new file mode 100644
index 000000000..ef077fd22
--- /dev/null
+++ b/docs/llm-example/main.ts
@@ -0,0 +1,75 @@
+import { dataset, Disco, fetchTasks, Task } from '@epfml/discojs-node'
+
+import { startDisco } from '@epfml/disco-server'
+import { loadData, loadDataFace } from './data'
+
+// Monkey-patch Bun.serve so that websocket connections accept payloads of up
+// to 1 GB: exchanged model weights easily exceed Bun's default maxPayloadLength.
+{
+    const serve = Bun.serve
+    Bun.serve = (x: any) =>
+        serve({
+            ...x,
+            websocket: x.websocket
+                ? {
+                      ...x.websocket,
+                      maxPayloadLength: 1_000_000_000,
+                  }
+                : undefined,
+        })
+}
+
+/**
+ * Example of the discojs API: we load the data, build the appropriate loggers
+ * and the disco object, and finally start training.
+ */
+async function runUser(
+    url: URL,
+    task: Task,
+    dataset: dataset.DataSplit
+): Promise<void> {
+    // Start federated training
+    const disco = new Disco(task, { url })
+    await disco.fit(dataset)
+
+    // Stop training and disconnect from the remote server
+    await disco.close()
+}
+
+async function main(): Promise<void> {
+    if (process.argv.length < 3)
+        throw new Error(
+            'Please provide the dataset name you would like to train on (wikitext-103 | tiny-shakespeare)'
+        )
+
+    const name = process.argv[2]
+
+    const [server, serverUrl] = await startDisco()
+
+    const tasks = await fetchTasks(serverUrl)
+
+    // Choose the task to train on
+    // TODO: rename this task to just llm or gpt (?), since it is the same task for many datasets
+    const task = tasks.get('wikitext-103') // no matter the dataset picked, the task is the same
+    // const task = tasks.get('simple_face') // alternative image task, loaded via loadDataFace
+
+    if (task === undefined) {
+        throw new Error('task not found')
+    }
+
+    const dataset = await loadData(task, name)
+    // const dataset = await loadDataFace(task)
+
+    // Add more users to the list to simulate more clients
+    await Promise.all([
+        runUser(serverUrl, task, dataset),
+        // runUser(serverUrl, task, dataset),
+    ])
+
+    await new Promise((resolve, reject) => {
+        server.once('close', resolve)
+        server.close(reject)
+    })
+}
+
+main().catch(console.error)
diff --git a/docs/llm-example/package-lock.json b/docs/llm-example/package-lock.json
new file mode 100644
index 000000000..47c74297a
--- /dev/null
+++ b/docs/llm-example/package-lock.json
@@ -0,0 +1,443 @@
+{
+  "name": "@epfml/disco-node-example",
+  "lockfileVersion": 2,
+  "requires": true,
+  "packages": {
+    "": {
+      "name": "@epfml/disco-node-example",
+      "license": "ISC",
+      "dependencies": {
+        "es-main": "^1.3.0",
+        "gpt-tokenizer": "^2.1.2",
+        "rand-seed": "1",
+        "tslib": "2"
+      },
+      "devDependencies": {
+        "@epfml/discojs-node": "file:../discojs/discojs-node",
+        "bun-types": "^1.0.15",
+        "ts-node": "10",
+        "tsconfig-paths": "^4.2.0"
+      }
+    },
+    "../discojs/discojs-node": {
+      "version": "2.1.1",
+      "dev": true,
+      "dependencies": {
+        "@tensorflow/tfjs-node": "^4.14.0"
+      },
+      "devDependencies": {
+        "bun-types": "^1.0.16"
+      }
+    },
+    "node_modules/@cspotcode/source-map-support": {
+      "version": "0.8.1",
+      "dev": true,
+      "license": "MIT",
+      "dependencies": {
+        "@jridgewell/trace-mapping": "0.3.9"
+      },
+      "engines": {
+        "node": ">=12"
+      }
+    },
+    "node_modules/@epfml/discojs-node": {
+      "resolved": "../discojs/discojs-node",
+      "link": true
+    },
+    "node_modules/@jridgewell/resolve-uri": {
+      "version": "3.1.1",
+      "dev": true,
+      "license":
"MIT", + "engines": { + "node": ">=6.0.0" + } + }, + "node_modules/@jridgewell/sourcemap-codec": { + "version": "1.4.15", + "dev": true, + "license": "MIT" + }, + "node_modules/@jridgewell/trace-mapping": { + "version": "0.3.9", + "dev": true, + "license": "MIT", + "dependencies": { + "@jridgewell/resolve-uri": "^3.0.3", + "@jridgewell/sourcemap-codec": "^1.4.10" + } + }, + "node_modules/@tsconfig/node10": { + "version": "1.0.9", + "dev": true, + "license": "MIT" + }, + "node_modules/@tsconfig/node12": { + "version": "1.0.11", + "dev": true, + "license": "MIT" + }, + "node_modules/@tsconfig/node14": { + "version": "1.0.3", + "dev": true, + "license": "MIT" + }, + "node_modules/@tsconfig/node16": { + "version": "1.0.4", + "dev": true, + "license": "MIT" + }, + "node_modules/@types/node": { + "version": "20.10.4", + "dev": true, + "license": "MIT", + "peer": true, + "dependencies": { + "undici-types": "~5.26.4" + } + }, + "node_modules/acorn": { + "version": "8.11.2", + "dev": true, + "license": "MIT", + "bin": { + "acorn": "bin/acorn" + }, + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/acorn-walk": { + "version": "8.3.1", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=0.4.0" + } + }, + "node_modules/arg": { + "version": "4.1.3", + "dev": true, + "license": "MIT" + }, + "node_modules/bun-types": { + "version": "1.0.17", + "dev": true, + "license": "MIT" + }, + "node_modules/create-require": { + "version": "1.1.1", + "dev": true, + "license": "MIT" + }, + "node_modules/diff": { + "version": "4.0.2", + "dev": true, + "license": "BSD-3-Clause", + "engines": { + "node": ">=0.3.1" + } + }, + "node_modules/es-main": { + "version": "1.3.0", + "license": "MIT" + }, + "node_modules/gpt-tokenizer": { + "version": "2.1.2", + "license": "MIT", + "dependencies": { + "rfc4648": "^1.5.2" + } + }, + "node_modules/json5": { + "version": "2.2.3", + "dev": true, + "license": "MIT", + "bin": { + "json5": "lib/cli.js" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/make-error": { + "version": "1.3.6", + "dev": true, + "license": "ISC" + }, + "node_modules/minimist": { + "version": "1.2.8", + "dev": true, + "license": "MIT", + "funding": { + "url": "https://github.com/sponsors/ljharb" + } + }, + "node_modules/rand-seed": { + "version": "1.0.2", + "license": "MIT" + }, + "node_modules/rfc4648": { + "version": "1.5.3", + "license": "MIT" + }, + "node_modules/strip-bom": { + "version": "3.0.0", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=4" + } + }, + "node_modules/ts-node": { + "version": "10.9.2", + "dev": true, + "license": "MIT", + "dependencies": { + "@cspotcode/source-map-support": "^0.8.0", + "@tsconfig/node10": "^1.0.7", + "@tsconfig/node12": "^1.0.7", + "@tsconfig/node14": "^1.0.0", + "@tsconfig/node16": "^1.0.2", + "acorn": "^8.4.1", + "acorn-walk": "^8.1.1", + "arg": "^4.1.0", + "create-require": "^1.1.0", + "diff": "^4.0.1", + "make-error": "^1.1.1", + "v8-compile-cache-lib": "^3.0.1", + "yn": "3.1.1" + }, + "bin": { + "ts-node": "dist/bin.js", + "ts-node-cwd": "dist/bin-cwd.js", + "ts-node-esm": "dist/bin-esm.js", + "ts-node-script": "dist/bin-script.js", + "ts-node-transpile-only": "dist/bin-transpile.js", + "ts-script": "dist/bin-script-deprecated.js" + }, + "peerDependencies": { + "@swc/core": ">=1.2.50", + "@swc/wasm": ">=1.2.50", + "@types/node": "*", + "typescript": ">=2.7" + }, + "peerDependenciesMeta": { + "@swc/core": { + "optional": true + }, + "@swc/wasm": { + "optional": true + } + } + }, + "node_modules/tsconfig-paths": { + 
"version": "4.2.0", + "dev": true, + "license": "MIT", + "dependencies": { + "json5": "^2.2.2", + "minimist": "^1.2.6", + "strip-bom": "^3.0.0" + }, + "engines": { + "node": ">=6" + } + }, + "node_modules/tslib": { + "version": "2.6.2", + "license": "0BSD" + }, + "node_modules/typescript": { + "version": "5.3.3", + "dev": true, + "license": "Apache-2.0", + "peer": true, + "bin": { + "tsc": "bin/tsc", + "tsserver": "bin/tsserver" + }, + "engines": { + "node": ">=14.17" + } + }, + "node_modules/undici-types": { + "version": "5.26.5", + "dev": true, + "license": "MIT", + "peer": true + }, + "node_modules/v8-compile-cache-lib": { + "version": "3.0.1", + "dev": true, + "license": "MIT" + }, + "node_modules/yn": { + "version": "3.1.1", + "dev": true, + "license": "MIT", + "engines": { + "node": ">=6" + } + } + }, + "dependencies": { + "@cspotcode/source-map-support": { + "version": "0.8.1", + "dev": true, + "requires": { + "@jridgewell/trace-mapping": "0.3.9" + } + }, + "@epfml/discojs-node": { + "version": "file:../discojs/discojs-node", + "requires": { + "@tensorflow/tfjs-node": "^4.14.0", + "bun-types": "^1.0.16" + } + }, + "@jridgewell/resolve-uri": { + "version": "3.1.1", + "dev": true + }, + "@jridgewell/sourcemap-codec": { + "version": "1.4.15", + "dev": true + }, + "@jridgewell/trace-mapping": { + "version": "0.3.9", + "dev": true, + "requires": { + "@jridgewell/resolve-uri": "^3.0.3", + "@jridgewell/sourcemap-codec": "^1.4.10" + } + }, + "@tsconfig/node10": { + "version": "1.0.9", + "dev": true + }, + "@tsconfig/node12": { + "version": "1.0.11", + "dev": true + }, + "@tsconfig/node14": { + "version": "1.0.3", + "dev": true + }, + "@tsconfig/node16": { + "version": "1.0.4", + "dev": true + }, + "@types/node": { + "version": "20.10.4", + "dev": true, + "peer": true, + "requires": { + "undici-types": "~5.26.4" + } + }, + "acorn": { + "version": "8.11.2", + "dev": true + }, + "acorn-walk": { + "version": "8.3.1", + "dev": true + }, + "arg": { + "version": "4.1.3", + "dev": true + }, + "bun-types": { + "version": "1.0.17", + "dev": true + }, + "create-require": { + "version": "1.1.1", + "dev": true + }, + "diff": { + "version": "4.0.2", + "dev": true + }, + "es-main": { + "version": "1.3.0" + }, + "gpt-tokenizer": { + "version": "2.1.2", + "requires": { + "rfc4648": "^1.5.2" + } + }, + "json5": { + "version": "2.2.3", + "dev": true + }, + "make-error": { + "version": "1.3.6", + "dev": true + }, + "minimist": { + "version": "1.2.8", + "dev": true + }, + "rand-seed": { + "version": "1.0.2" + }, + "rfc4648": { + "version": "1.5.3" + }, + "strip-bom": { + "version": "3.0.0", + "dev": true + }, + "ts-node": { + "version": "10.9.2", + "dev": true, + "requires": { + "@cspotcode/source-map-support": "^0.8.0", + "@tsconfig/node10": "^1.0.7", + "@tsconfig/node12": "^1.0.7", + "@tsconfig/node14": "^1.0.0", + "@tsconfig/node16": "^1.0.2", + "acorn": "^8.4.1", + "acorn-walk": "^8.1.1", + "arg": "^4.1.0", + "create-require": "^1.1.0", + "diff": "^4.0.1", + "make-error": "^1.1.1", + "v8-compile-cache-lib": "^3.0.1", + "yn": "3.1.1" + } + }, + "tsconfig-paths": { + "version": "4.2.0", + "dev": true, + "requires": { + "json5": "^2.2.2", + "minimist": "^1.2.6", + "strip-bom": "^3.0.0" + } + }, + "tslib": { + "version": "2.6.2" + }, + "typescript": { + "version": "5.3.3", + "dev": true, + "peer": true + }, + "undici-types": { + "version": "5.26.5", + "dev": true, + "peer": true + }, + "v8-compile-cache-lib": { + "version": "3.0.1", + "dev": true + }, + "yn": { + "version": "3.1.1", + "dev": true + } + } +} 
diff --git a/docs/llm-example/package.json b/docs/llm-example/package.json
new file mode 100644
index 000000000..450f195ba
--- /dev/null
+++ b/docs/llm-example/package.json
@@ -0,0 +1,23 @@
+{
+  "name": "@epfml/disco-node-example",
+  "private": true,
+  "type": "module",
+  "main": "main.ts",
+  "scripts": {
+    "dev": "rm -rf ./models/wikitext-103 && bun --bun main.ts wikitext-103",
+    "test": "echo \"Error: no test specified\" && exit 1"
+  },
+  "license": "ISC",
+  "dependencies": {
+    "es-main": "^1.3.0",
+    "gpt-tokenizer": "^2.1.2",
+    "rand-seed": "1",
+    "tslib": "2"
+  },
+  "devDependencies": {
+    "@types/bun": "^1.0.2",
+    "bun-types": "^1.0.23",
+    "ts-node": "10",
+    "tsconfig-paths": "^4.2.0"
+  }
+}
diff --git a/docs/llm-example/preprocess.ts b/docs/llm-example/preprocess.ts
new file mode 100755
index 000000000..42262a260
--- /dev/null
+++ b/docs/llm-example/preprocess.ts
@@ -0,0 +1,125 @@
+import fs from 'fs'
+import path from 'path'
+import esMain from 'es-main'
+import { readdir } from 'fs/promises'
+import { encode } from 'gpt-tokenizer/model/text-davinci-003'
+
+// For ts-node-esm
+import { fileURLToPath } from 'url'
+const __filename = fileURLToPath(import.meta.url)
+const __dirname = path.dirname(__filename)
+
+const BATCH_LENGTH = 4096
+export const TOKENIZED_FILE_EXTENSION = 'tokens'
+
+// TODO: support for multiple files being tokenized together into a single file
+
+async function getFileStreams(datasetDir: string) {
+    let files: string[]
+    try {
+        files = await readdir(datasetDir)
+    } catch (err) {
+        console.error(
+            'Could not find dataset directory:',
+            datasetDir,
+            'are you sure you downloaded the dataset?'
+        )
+        throw err
+    }
+
+    const preprocessFiles = files.filter(
+        (file) =>
+            !file.endsWith('zip') && !file.endsWith(TOKENIZED_FILE_EXTENSION)
+    )
+    console.log(
+        'Found',
+        preprocessFiles.length,
+        'files to preprocess:',
+        preprocessFiles
+    )
+    const streams = preprocessFiles.map((file) => ({
+        file,
+        getStream: async () =>
+            new Promise<fs.ReadStream>((resolve) => {
+                const stream = fs.createReadStream(
+                    path.join(datasetDir, file),
+                    {
+                        encoding: 'utf8',
+                        highWaterMark: 1,
+                        fd: undefined,
+                    }
+                )
+                stream.on('readable', () => resolve(stream))
+            }),
+    }))
+    return streams
+}
+
+const preprocessStream = async (
+    datasetDir: string,
+    file: string,
+    getStream: () => Promise<fs.ReadStream>
+) => {
+    const stream = await getStream()
+
+    const writeFilePath = path.join(
+        datasetDir,
+        file + '.' + TOKENIZED_FILE_EXTENSION
+    )
+    console.log('Writing to', writeFilePath)
+    const writeFileStream = fs.createWriteStream(writeFilePath)
+
+    let accumulator: string[] = []
+    let char: string | null
+
+    while (null !== (char = stream.read(1) as string | null)) {
+        accumulator.push(char)
+        if (accumulator.length >= BATCH_LENGTH && char === ' ') {
+            const chunk = accumulator.join('')
+            const tokens = encode(chunk)
+            const array = new Uint16Array(tokens)
+            const buffer = Buffer.from(array.buffer)
+            writeFileStream.write(buffer)
+            accumulator = []
+        }
+    }
+
+    // Flush the remainder so that trailing text shorter than BATCH_LENGTH
+    // is not silently dropped
+    if (accumulator.length > 0) {
+        const tokens = encode(accumulator.join(''))
+        writeFileStream.write(Buffer.from(new Uint16Array(tokens).buffer))
+    }
+
+    writeFileStream.end()
+}
+
+export default async function preprocess(name: string) {
+    const datasetDir = path.join(__dirname, 'datasets', name)
+    console.log('Preprocessing dataset located at:', datasetDir)
+    const streams = await getFileStreams(datasetDir)
+
+    for await (const { file, getStream } of streams) {
+        const label = `Preprocessing ${file}`
+        console.time(label)
+        await preprocessStream(datasetDir, file, getStream)
+        console.timeEnd(label)
+    }
+}
+
+if (esMain(import.meta)) {
+    if (process.argv.length < 3)
+        throw new Error(
+            'Please provide the dataset name you would like to preprocess (wikitext-103 | tiny-shakespeare)'
+        )
+
+    const name = process.argv[2]
+    await preprocess(name)
+}
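+
+// Read-back sketch (hypothetical): each .tokens file is a raw stream of
+// little-endian uint16 token ids, so it can be decoded with gpt-tokenizer:
+//
+//   import { decode } from 'gpt-tokenizer/model/text-davinci-003'
+//   const buf = fs.readFileSync(path.join(__dirname, 'datasets', 'wikitext-103', 'train.tokens'))
+//   const ids = Array.from(new Uint16Array(buf.buffer, buf.byteOffset, buf.byteLength / 2))
+//   console.log(decode(ids).slice(0, 200))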
diff --git a/docs/llm-example/tsconfig.eslint.json b/docs/llm-example/tsconfig.eslint.json
new file mode 100644
index 000000000..512d79fca
--- /dev/null
+++ b/docs/llm-example/tsconfig.eslint.json
@@ -0,0 +1,5 @@
+{
+  "extends": "../tsconfig.json",
+  "include": ["*.ts"],
+  "exclude": []
+}
diff --git a/docs/llm-example/tsconfig.json b/docs/llm-example/tsconfig.json
new file mode 100644
index 000000000..b3636c951
--- /dev/null
+++ b/docs/llm-example/tsconfig.json
@@ -0,0 +1,41 @@
+{
+  "compilerOptions": {
+    "rootDir": "..",
+    "rootDirs": [".", "../discojs/discojs-node", "../server"],
+    "lib": ["ESNext"],
+    "module": "esnext",
+    "target": "esnext",
+    "esModuleInterop": true,
+    "moduleResolution": "bundler",
+    "moduleDetection": "force",
+    "outDir": "./dist",
+    "importHelpers": true,
+    "allowImportingTsExtensions": true,
+    "noEmit": true,
+    "composite": true,
+    "strict": true,
+    "downlevelIteration": true,
+    "skipLibCheck": true,
+    "allowSyntheticDefaultImports": true,
+    "forceConsistentCasingInFileNames": true,
+    "allowJs": true,
+    "declaration": true,
+    "emitDeclarationOnly": false,
+    "resolveJsonModule": true,
+    "isolatedModules": true,
+    "incremental": true,
+    "checkJs": false,
+    "paths": {
+      "@/*": ["./*"],
+      "@epfml/disco-server": ["../server/src"],
+      "@epfml/discojs-node": ["../discojs/discojs-node/src/"]
+    }
+  },
+  "include": [
+    "**/*.ts",
+    "decs.d.ts",
+    "../discojs/discojs-node/**/*.ts",
+    "../server/**/*.ts"
+  ],
+  "exclude": ["./node_modules/**/*", "./dist/**/*"]
+}