From bd8c6045be3e108f1c221a53012631643b85e873 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 13 Sep 2024 19:04:08 +0000 Subject: [PATCH 1/5] change default params: use tinyshakespeare and decrease LR --- train_llama3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_llama3.py b/train_llama3.py index 2c64039fc..f9daafde0 100644 --- a/train_llama3.py +++ b/train_llama3.py @@ -943,7 +943,7 @@ def print0(*args, **kwargs): parser.add_argument("--ckpt_dir", type=str, default=None, help="path to llama3 model checkpoint (needed if use_hf=0)") parser.add_argument("--tokenizer_path", type=str, default=None, help="path to llama3 tokenizer (needed if use_hf=0)") # file system input / output - parser.add_argument("--input_bin", type=str, default="dev/data/tinystories/TinyStories_val.bin", help="input .bin to train on") + parser.add_argument("--input_bin", type=str, default="dev/data/tinyshakespeare/tiny_shakespeare_val.bin", help="input .bin to train on") parser.add_argument("--input_val_bin", type=str, default="", help="input .bin to eval validation loss on") parser.add_argument("--output_dir", type=str, default="", help="output directory to which to write logs and checkpoints") parser.add_argument("--model", type=str, default="meta-llama/Meta-Llama-3.1-8B", help="chose the llama model") @@ -955,7 +955,7 @@ def print0(*args, **kwargs): parser.add_argument("--num_iterations", type=int, default=10, help="number of iterations to run") parser.add_argument("--inference_only", type=int, default=0, help="only run inference") # optimization - parser.add_argument("--learning_rate", type=float, default=1e-4, help="learning rate warmup iterations") + parser.add_argument("--learning_rate", type=float, default=1e-5, help="learning rate warmup iterations") parser.add_argument("--warmup_iters", type=int, default=0, help="learning rate warmup iterations") parser.add_argument("--learning_rate_decay_frac", type=float, default=1.0, help="learning rate warmup iterations") parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay") From 5b2e3180fbb5668a0d3c8cecd07b0732ebad330a Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 23 Sep 2024 17:09:45 -0700 Subject: [PATCH 2/5] cuda mode -> gpu mode This is a documentation only change. Hoping this is OK to merge. See this tweet for more context on why we made this change https://x.com/jeremyphoward/status/1838341110344880637 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 02cfc13d6..2f8b9194b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # llm.c -LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPython. Current focus is on pretraining, in particular reproducing the [GPT-2](https://github.com/openai/gpt-2) and [GPT-3](https://arxiv.org/abs/2005.14165) miniseries, along with a parallel PyTorch reference implementation in [train_gpt2.py](train_gpt2.py). You'll recognize this file as a slightly tweaked [nanoGPT](https://github.com/karpathy/nanoGPT), an earlier project of mine. Currently, llm.c is a bit faster than PyTorch Nightly (by about 7%). In addition to the bleeding edge mainline code in [train_gpt2.cu](train_gpt2.cu), we have a simple reference CPU fp32 implementation in ~1,000 lines of clean code in one file [train_gpt2.c](train_gpt2.c). I'd like this repo to only maintain C and CUDA code. 
Ports to other languages or repos are very welcome, but should be done in separate repos, and I am happy to link to them below in the "notable forks" section. Developer coordination happens in the [Discussions](https://github.com/karpathy/llm.c/discussions) and on Discord, either the `#llmc` channel on the [Zero to Hero](https://discord.gg/3zy8kqD9Cp) channel, or on `#llmdotc` on CUDA MODE Discord. +LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPython. Current focus is on pretraining, in particular reproducing the [GPT-2](https://github.com/openai/gpt-2) and [GPT-3](https://arxiv.org/abs/2005.14165) miniseries, along with a parallel PyTorch reference implementation in [train_gpt2.py](train_gpt2.py). You'll recognize this file as a slightly tweaked [nanoGPT](https://github.com/karpathy/nanoGPT), an earlier project of mine. Currently, llm.c is a bit faster than PyTorch Nightly (by about 7%). In addition to the bleeding edge mainline code in [train_gpt2.cu](train_gpt2.cu), we have a simple reference CPU fp32 implementation in ~1,000 lines of clean code in one file [train_gpt2.c](train_gpt2.c). I'd like this repo to only maintain C and CUDA code. Ports to other languages or repos are very welcome, but should be done in separate repos, and I am happy to link to them below in the "notable forks" section. Developer coordination happens in the [Discussions](https://github.com/karpathy/llm.c/discussions) and on Discord, either the `#llmc` channel on the [Zero to Hero](https://discord.gg/3zy8kqD9Cp) channel, or on `#llmdotc` on GPU MODE Discord. ## quick start @@ -211,7 +211,7 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - CUDA C++ - [llm.cpp](https://github.com/gevtushenko/llm.c) by @[gevtushenko](https://github.com/gevtushenko): a port of this project using the [CUDA C++ Core Libraries](https://github.com/NVIDIA/cccl) - - A presentation this fork was covered in [this lecture](https://www.youtube.com/watch?v=WiB_3Csfj_Q) in the [CUDA MODE Discord Server](https://discord.gg/cudamode) + - A presentation this fork was covered in [this lecture](https://www.youtube.com/watch?v=WiB_3Csfj_Q) in the [GPU MODE Discord Server](https://discord.gg/cudamode) - C++/CUDA - [llm.cpp](https://github.com/zhangpiu/llm.cpp/tree/master/llmcpp) by @[zhangpiu](https://github.com/zhangpiu): a port of this project using the [Eigen](https://gitlab.com/libeigen/eigen), supporting CPU/CUDA. From 315b8d1f626885db31a895510cb299cb282616cc Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 24 Sep 2024 13:08:11 -0700 Subject: [PATCH 3/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f8b9194b..d2d107821 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # llm.c -LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPython. Current focus is on pretraining, in particular reproducing the [GPT-2](https://github.com/openai/gpt-2) and [GPT-3](https://arxiv.org/abs/2005.14165) miniseries, along with a parallel PyTorch reference implementation in [train_gpt2.py](train_gpt2.py). You'll recognize this file as a slightly tweaked [nanoGPT](https://github.com/karpathy/nanoGPT), an earlier project of mine. Currently, llm.c is a bit faster than PyTorch Nightly (by about 7%). 
In addition to the bleeding edge mainline code in [train_gpt2.cu](train_gpt2.cu), we have a simple reference CPU fp32 implementation in ~1,000 lines of clean code in one file [train_gpt2.c](train_gpt2.c). I'd like this repo to only maintain C and CUDA code. Ports to other languages or repos are very welcome, but should be done in separate repos, and I am happy to link to them below in the "notable forks" section. Developer coordination happens in the [Discussions](https://github.com/karpathy/llm.c/discussions) and on Discord, either the `#llmc` channel on the [Zero to Hero](https://discord.gg/3zy8kqD9Cp) channel, or on `#llmdotc` on GPU MODE Discord. +LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPython. Current focus is on pretraining, in particular reproducing the [GPT-2](https://github.com/openai/gpt-2) and [GPT-3](https://arxiv.org/abs/2005.14165) miniseries, along with a parallel PyTorch reference implementation in [train_gpt2.py](train_gpt2.py). You'll recognize this file as a slightly tweaked [nanoGPT](https://github.com/karpathy/nanoGPT), an earlier project of mine. Currently, llm.c is a bit faster than PyTorch Nightly (by about 7%). In addition to the bleeding edge mainline code in [train_gpt2.cu](train_gpt2.cu), we have a simple reference CPU fp32 implementation in ~1,000 lines of clean code in one file [train_gpt2.c](train_gpt2.c). I'd like this repo to only maintain C and CUDA code. Ports to other languages or repos are very welcome, but should be done in separate repos, and I am happy to link to them below in the "notable forks" section. Developer coordination happens in the [Discussions](https://github.com/karpathy/llm.c/discussions) and on Discord, either the `#llmc` channel on the [Zero to Hero](https://discord.gg/3zy8kqD9Cp) channel, or on `#llmdotc` on [GPU MODE](https://discord.gg/gpumode) Discord. ## quick start From ba59fe9435552dbafe90dc4146a9d63af4862af3 Mon Sep 17 00:00:00 2001 From: Gregory Kielian Date: Sat, 12 Oct 2024 12:55:10 -0700 Subject: [PATCH 4/5] Add options to use different softmax alternatives Adding Softplus, ReLU, and trying a SparseReLU. --- train_gpt2_fp32.cu | 92 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index df412ea5e..0e8a15581 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -1,3 +1,4 @@ +#define USE_SOFTPLUS /* GPT-2 Transformer Neural Net trained in raw CUDA Non-trivial notes to be aware of: @@ -238,6 +239,38 @@ __device__ float vec_at(const float4& vec, int index) { return reinterpret_cast(&vec)[index]; } + +/// RELU 1 + +__global__ void softplus_forward_kernel(float* out, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float beta = 1.0f; + float threshold = 20.0f; + if (idx < N * T) { + float x = inp[idx]; + // If the input is large enough, avoid computing log(1 + exp(beta * x)) to prevent overflow. 
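+        // (For beta * x above the threshold, log1pf(expf(beta * x)) equals beta * x to within
+        // float precision, and expf would overflow for arguments beyond ~88, so returning x
+        // directly is both accurate and safe.)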
+ if (x * beta > threshold) { + out[idx] = x; + } else { + out[idx] = (1.0f / beta) * log1pf(expf(beta * x)); // Softplus function + } + } +} + +__global__ void relu_forward_kernel(float* out, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N * T) { + out[idx] = fmaxf(0.0f, inp[idx]); // ReLU activation + } +} + +__global__ void sparse_relu_forward_kernel(float* out, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N * T && inp[idx] != 0.0f) { + out[idx] = fmaxf(0.0f, inp[idx]); // Only calculate for non-zero entries + } +} + __global__ void softmax_forward_kernel5(float* out, float inv_temperature, const float* inp, int N, int T) { // inp, out shape: (N, T, T), where N = B * NH // fuses the multiplication by scale inside attention @@ -443,6 +476,35 @@ __global__ void layernorm_backward_kernel2(float* dinp, float* dweight, float* d } } +// Relu 2 + +__global__ void softplus_backward_kernel(float* dinp, const float* dout, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float beta = 1.0; + if (idx < N * T) { + float x = inp[idx]; + float sigmoid = 1.0f / (1.0f + expf(-beta * x)); // Sigmoid function + dinp[idx] = dout[idx] * sigmoid; + } +} + + +__global__ void relu_backward_kernel(float* dinp, const float* dout, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N * T) { + // Gradient for ReLU: pass through the gradient where inp > 0, else zero + dinp[idx] = (inp[idx] > 0.0f) ? dout[idx] : 0.0f; + } +} + + +__global__ void sparse_relu_backward_kernel(float* dinp, const float* dout, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N * T && inp[idx] != 0.0f) { + dinp[idx] = (inp[idx] > 0.0f) ? 
dout[idx] : 0.0f; // Backpropagate only for non-zero entries + } +} + __global__ void softmax_autoregressive_backward_kernel(float* dpreatt, const float* datt, const float* att, int B, int T, int C, float scale) { constexpr const int BlockSize = 256; @@ -764,10 +826,19 @@ void attention_forward(float* out, float* qkvr, float* att, float* preatt = inp; cublasCheck(cublasSgemmStridedBatched(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, &alpha, k, HS, T * HS, q, HS, T * HS, &beta, preatt, T, T * T, B * NH)); + int grid_size = CEIL_DIV(B * NH * T * 32, softmax_block_size); +#ifdef USE_RELU + relu_forward_kernel<<>>(att, preatt, B * NH, T); +#elif defined(USE_RELU_SPARSE) + sparse_relu_forward_kernel<<>>(att, preatt, B * NH, T); +#elif defined(USE_SOFTPLUS) + softplus_forward_kernel<<>>(att, preatt, B * NH, T); +#else // multiply all elements of preatt elementwise by scale float scale = 1.0 / sqrtf(HS); - int grid_size = CEIL_DIV(B * NH * T * 32, softmax_block_size); softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); +#endif + cudaCheck(cudaGetLastError()); // new approach: first cuBLAS another batched matmul @@ -860,9 +931,17 @@ void attention_backward(float* dinp, float* dqkvr, float* dpreatt, float* datt, // backward into dv cublasCheck(cublasSgemmStridedBatched(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, &one, scratch, HS, T * HS, att, T, T * T, &zero, dv, HS, T * HS, B * NH)); // backward into preatt +#ifdef USE_RELU + relu_backward_kernel<<>>(dpreatt, datt, att, B * NH, T); +#elif defined(USE_RELU_SPARSE) + sparse_relu_backward_kernel <<>>(dpreatt, datt, att, B * NH, T); +#elif defined(USE_SOFTPLUS) + softplus_backward_kernel <<>>(dpreatt, datt, att, B * NH, T); +#else int hs = C / NH; // head size float scale = 1.0f / sqrtf(hs); softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, scale); +#endif cudaCheck(cudaGetLastError()); // backward into q cublasCheck(cublasSgemmStridedBatched(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, &one, k, HS, T * HS, dpreatt, T, T * T, &zero, dq, HS, T * HS, B * NH)); @@ -1592,6 +1671,15 @@ int main(int argc, char *argv[]) { printf("+-----------------------+----------------------------------------------------+\n"); printf("| Parameter | Value |\n"); printf("+-----------------------+----------------------------------------------------+\n"); +#ifdef USE_RELU + printf("| softmax activation | %-50s |\n", "ReLU"); +#elif defined(USE_RELU_SPARSE) + printf("| softmax activation | %-50s |\n", "SparseReLU"); +#elif defined(USE_SOFTPLUS) + printf("| softmax activation | %-50s |\n", "SoftPlus"); +#else + printf("| softmax activation | %-50s |\n", "Softmax"); +#endif printf("| train data pattern | %-50s |\n", train_data_pattern); printf("| val data pattern | %-50s |\n", val_data_pattern); printf("| output log file | %-50s |\n", output_log_file == NULL ? "NULL" : output_log_file); @@ -1752,4 +1840,4 @@ int main(int argc, char *argv[]) { return 0; } -#endif \ No newline at end of file +#endif From 7c278a0cb8b19ad3312a615685b4f69fe9a7f874 Mon Sep 17 00:00:00 2001 From: Gregory Kielian Date: Mon, 21 Oct 2024 23:21:18 -0700 Subject: [PATCH 5/5] Add option for ReaLLMASIC heads and embd_dim We'll create an array of different checkpoints and load them to HF. 
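
For reference, a rough parameter-count estimate for the new gpt2:d32 shape (depth 32,
channels 512, 8 heads). This is only a back-of-the-envelope sketch in plain C, assuming
GPT-2's vocabulary of 50257, context length of 1024, and tied input/output embeddings;
the padded vocabulary used on the CUDA path will shift the exact number slightly:

#include <stdio.h>

int main(void) {
    // assumed GPT-2 defaults: vocab 50257, max sequence length 1024
    long L = 32, C = 512, V = 50257, T = 1024;
    long embeddings = V * C + T * C;                       // wte + wpe (lm head tied to wte)
    long per_layer  = 12 * C * C + 13 * C;                 // attn qkv/proj + mlp fc/proj + 2 layernorms
    long total      = embeddings + L * per_layer + 2 * C;  // plus the final layernorm
    printf("gpt2:d32 approx params: %.1fM\n", total / 1e6); // ~127M under these assumptions
    return 0;
}

That is roughly the same budget as gpt2 (124M, depth 12, channels 768), spent on depth
rather than width.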
--- train_gpt2.cu | 1 + train_tiny_stories.sh | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 train_tiny_stories.sh diff --git a/train_gpt2.cu b/train_gpt2.cu index 70d8d0c5a..f3f91db3e 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -521,6 +521,7 @@ void gpt2_set_hyperparameters(GPT2Config* config, const char* depth_str) { assert(depth > 0); // atoi returns 0 if not a number int channels, num_heads; if (depth == 6) { channels = 384; num_heads = 6; } // (unofficial) gpt2-tiny (30M) + else if (depth == 32) { channels = 512; num_heads = 8; } // ReaLLMASIC else if (depth == 12) { channels = 768; num_heads = 12; } // gpt2 (124M) else if (depth == 24) { channels = 1024; num_heads = 16; } // gpt2-medium (350M) else if (depth == 36) { channels = 1280; num_heads = 20; } // gpt2-large (774M) diff --git a/train_tiny_stories.sh b/train_tiny_stories.sh new file mode 100644 index 000000000..cd20ca73d --- /dev/null +++ b/train_tiny_stories.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -x + +make clean +make train_gpt2cu + +./train_gpt2cu \ + -e gpt2:d32 \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" + # + # -i dev/data/tinystories/TinyStories_train.bin \ + # -j dev/data/tinystories/TinyStories_val.bin +# make clean + +# make train_gpt2fp32cu + +# ./train_gpt2fp32cu \ +# -i dev/data/tinystories/TinyStories_train.bin \ +# -j dev/data/tinystories/TinyStories_val.bin \ +# -l "1e-3f" + +# ./train_gpt2fp32cu \ + # -i dev/data/tinystories/TinyStories_train.bin \ + # -j dev/data/tinystories/TinyStories_val.bin \ + # -l "1e-3f" \ + # -v "1000" \ + # -b "12" \ + # -s "2000" + + + # -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + # -j "dev/data/fineweb10B/fineweb_val_*.bin" \
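
As a sanity check for the softmax alternatives introduced in PATCH 4/5, here is a minimal
standalone CPU sketch of the softplus forward/backward math the new kernels implement
(beta = 1 and threshold = 20, mirroring the kernel constants); it is a reference for
eyeballing values under those assumptions, not code taken from the diff:

#include <math.h>
#include <stdio.h>

// CPU reference of the softplus attention activation added in train_gpt2_fp32.cu
// (beta = 1, threshold = 20, matching the kernel constants).
static float softplus_forward(float x) {
    float beta = 1.0f, threshold = 20.0f;
    if (x * beta > threshold) return x;                // linear regime; avoids overflow in expf
    return (1.0f / beta) * log1pf(expf(beta * x));
}

static float softplus_backward(float dout, float x) {
    float beta = 1.0f;
    float sigmoid = 1.0f / (1.0f + expf(-beta * x));   // d/dx softplus(x) = sigmoid(beta * x)
    return dout * sigmoid;
}

int main(void) {
    for (float x = -4.0f; x <= 4.0f; x += 2.0f) {
        printf("x = %+5.1f  softplus = %8.4f  d/dx = %6.4f\n",
               x, softplus_forward(x), softplus_backward(1.0f, x));
    }
    return 0;
}

Which activation a build uses is selected by the define at the top of train_gpt2_fp32.cu:
USE_SOFTPLUS (as committed), USE_RELU, or USE_RELU_SPARSE; with none of them defined, the
original scaled softmax_forward_kernel5 / softmax_autoregressive_backward_kernel path runs.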