From bd8c6045be3e108f1c221a53012631643b85e873 Mon Sep 17 00:00:00 2001 From: Andrej Karpathy Date: Fri, 13 Sep 2024 19:04:08 +0000 Subject: [PATCH 1/5] change default params: use tinyshakespeare and decrease LR --- train_llama3.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/train_llama3.py b/train_llama3.py index 2c64039fc..f9daafde0 100644 --- a/train_llama3.py +++ b/train_llama3.py @@ -943,7 +943,7 @@ def print0(*args, **kwargs): parser.add_argument("--ckpt_dir", type=str, default=None, help="path to llama3 model checkpoint (needed if use_hf=0)") parser.add_argument("--tokenizer_path", type=str, default=None, help="path to llama3 tokenizer (needed if use_hf=0)") # file system input / output - parser.add_argument("--input_bin", type=str, default="dev/data/tinystories/TinyStories_val.bin", help="input .bin to train on") + parser.add_argument("--input_bin", type=str, default="dev/data/tinyshakespeare/tiny_shakespeare_val.bin", help="input .bin to train on") parser.add_argument("--input_val_bin", type=str, default="", help="input .bin to eval validation loss on") parser.add_argument("--output_dir", type=str, default="", help="output directory to which to write logs and checkpoints") parser.add_argument("--model", type=str, default="meta-llama/Meta-Llama-3.1-8B", help="chose the llama model") @@ -955,7 +955,7 @@ def print0(*args, **kwargs): parser.add_argument("--num_iterations", type=int, default=10, help="number of iterations to run") parser.add_argument("--inference_only", type=int, default=0, help="only run inference") # optimization - parser.add_argument("--learning_rate", type=float, default=1e-4, help="learning rate warmup iterations") + parser.add_argument("--learning_rate", type=float, default=1e-5, help="learning rate warmup iterations") parser.add_argument("--warmup_iters", type=int, default=0, help="learning rate warmup iterations") parser.add_argument("--learning_rate_decay_frac", type=float, default=1.0, help="learning rate warmup iterations") parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay") From 5b2e3180fbb5668a0d3c8cecd07b0732ebad330a Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Mon, 23 Sep 2024 17:09:45 -0700 Subject: [PATCH 2/5] cuda mode -> gpu mode This is a documentation only change. Hoping this is OK to merge. See this tweet for more context on why we made this change https://x.com/jeremyphoward/status/1838341110344880637 --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 02cfc13d6..2f8b9194b 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # llm.c -LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPython. Current focus is on pretraining, in particular reproducing the [GPT-2](https://github.com/openai/gpt-2) and [GPT-3](https://arxiv.org/abs/2005.14165) miniseries, along with a parallel PyTorch reference implementation in [train_gpt2.py](train_gpt2.py). You'll recognize this file as a slightly tweaked [nanoGPT](https://github.com/karpathy/nanoGPT), an earlier project of mine. Currently, llm.c is a bit faster than PyTorch Nightly (by about 7%). In addition to the bleeding edge mainline code in [train_gpt2.cu](train_gpt2.cu), we have a simple reference CPU fp32 implementation in ~1,000 lines of clean code in one file [train_gpt2.c](train_gpt2.c). I'd like this repo to only maintain C and CUDA code. 
Ports to other languages or repos are very welcome, but should be done in separate repos, and I am happy to link to them below in the "notable forks" section. Developer coordination happens in the [Discussions](https://github.com/karpathy/llm.c/discussions) and on Discord, either the `#llmc` channel on the [Zero to Hero](https://discord.gg/3zy8kqD9Cp) channel, or on `#llmdotc` on CUDA MODE Discord. +LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPython. Current focus is on pretraining, in particular reproducing the [GPT-2](https://github.com/openai/gpt-2) and [GPT-3](https://arxiv.org/abs/2005.14165) miniseries, along with a parallel PyTorch reference implementation in [train_gpt2.py](train_gpt2.py). You'll recognize this file as a slightly tweaked [nanoGPT](https://github.com/karpathy/nanoGPT), an earlier project of mine. Currently, llm.c is a bit faster than PyTorch Nightly (by about 7%). In addition to the bleeding edge mainline code in [train_gpt2.cu](train_gpt2.cu), we have a simple reference CPU fp32 implementation in ~1,000 lines of clean code in one file [train_gpt2.c](train_gpt2.c). I'd like this repo to only maintain C and CUDA code. Ports to other languages or repos are very welcome, but should be done in separate repos, and I am happy to link to them below in the "notable forks" section. Developer coordination happens in the [Discussions](https://github.com/karpathy/llm.c/discussions) and on Discord, either the `#llmc` channel on the [Zero to Hero](https://discord.gg/3zy8kqD9Cp) channel, or on `#llmdotc` on GPU MODE Discord. ## quick start @@ -211,7 +211,7 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - CUDA C++ - [llm.cpp](https://github.com/gevtushenko/llm.c) by @[gevtushenko](https://github.com/gevtushenko): a port of this project using the [CUDA C++ Core Libraries](https://github.com/NVIDIA/cccl) - - A presentation this fork was covered in [this lecture](https://www.youtube.com/watch?v=WiB_3Csfj_Q) in the [CUDA MODE Discord Server](https://discord.gg/cudamode) + - A presentation this fork was covered in [this lecture](https://www.youtube.com/watch?v=WiB_3Csfj_Q) in the [GPU MODE Discord Server](https://discord.gg/cudamode) - C++/CUDA - [llm.cpp](https://github.com/zhangpiu/llm.cpp/tree/master/llmcpp) by @[zhangpiu](https://github.com/zhangpiu): a port of this project using the [Eigen](https://gitlab.com/libeigen/eigen), supporting CPU/CUDA. From 315b8d1f626885db31a895510cb299cb282616cc Mon Sep 17 00:00:00 2001 From: Mark Saroufim Date: Tue, 24 Sep 2024 13:08:11 -0700 Subject: [PATCH 3/5] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f8b9194b..d2d107821 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # llm.c -LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPython. Current focus is on pretraining, in particular reproducing the [GPT-2](https://github.com/openai/gpt-2) and [GPT-3](https://arxiv.org/abs/2005.14165) miniseries, along with a parallel PyTorch reference implementation in [train_gpt2.py](train_gpt2.py). You'll recognize this file as a slightly tweaked [nanoGPT](https://github.com/karpathy/nanoGPT), an earlier project of mine. Currently, llm.c is a bit faster than PyTorch Nightly (by about 7%). 
In addition to the bleeding edge mainline code in [train_gpt2.cu](train_gpt2.cu), we have a simple reference CPU fp32 implementation in ~1,000 lines of clean code in one file [train_gpt2.c](train_gpt2.c). I'd like this repo to only maintain C and CUDA code. Ports to other languages or repos are very welcome, but should be done in separate repos, and I am happy to link to them below in the "notable forks" section. Developer coordination happens in the [Discussions](https://github.com/karpathy/llm.c/discussions) and on Discord, either the `#llmc` channel on the [Zero to Hero](https://discord.gg/3zy8kqD9Cp) channel, or on `#llmdotc` on GPU MODE Discord. +LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPython. Current focus is on pretraining, in particular reproducing the [GPT-2](https://github.com/openai/gpt-2) and [GPT-3](https://arxiv.org/abs/2005.14165) miniseries, along with a parallel PyTorch reference implementation in [train_gpt2.py](train_gpt2.py). You'll recognize this file as a slightly tweaked [nanoGPT](https://github.com/karpathy/nanoGPT), an earlier project of mine. Currently, llm.c is a bit faster than PyTorch Nightly (by about 7%). In addition to the bleeding edge mainline code in [train_gpt2.cu](train_gpt2.cu), we have a simple reference CPU fp32 implementation in ~1,000 lines of clean code in one file [train_gpt2.c](train_gpt2.c). I'd like this repo to only maintain C and CUDA code. Ports to other languages or repos are very welcome, but should be done in separate repos, and I am happy to link to them below in the "notable forks" section. Developer coordination happens in the [Discussions](https://github.com/karpathy/llm.c/discussions) and on Discord, either the `#llmc` channel on the [Zero to Hero](https://discord.gg/3zy8kqD9Cp) channel, or on `#llmdotc` on [GPU MODE](https://discord.gg/gpumode) Discord. ## quick start From ba59fe9435552dbafe90dc4146a9d63af4862af3 Mon Sep 17 00:00:00 2001 From: Gregory Kielian Date: Sat, 12 Oct 2024 12:55:10 -0700 Subject: [PATCH 4/5] Add options to use different softmax alternatives Adding Softplus, ReLU, and trying a SparseReLU. --- train_gpt2_fp32.cu | 92 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 90 insertions(+), 2 deletions(-) diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index df412ea5e..0e8a15581 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -1,3 +1,4 @@ +#define USE_SOFTPLUS /* GPT-2 Transformer Neural Net trained in raw CUDA Non-trivial notes to be aware of: @@ -238,6 +239,38 @@ __device__ float vec_at(const float4& vec, int index) { return reinterpret_cast(&vec)[index]; } + +/// RELU 1 + +__global__ void softplus_forward_kernel(float* out, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float beta = 1.0f; + float threshold = 20.0f; + if (idx < N * T) { + float x = inp[idx]; + // If the input is large enough, avoid computing log(1 + exp(beta * x)) to prevent overflow. 
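+        // (For beta * x above the threshold, log1pf(expf(beta * x)) equals beta * x to within
+        // float precision, and expf would overflow for arguments beyond ~88, so returning x
+        // directly is both accurate and safe.)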
+ if (x * beta > threshold) { + out[idx] = x; + } else { + out[idx] = (1.0f / beta) * log1pf(expf(beta * x)); // Softplus function + } + } +} + +__global__ void relu_forward_kernel(float* out, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N * T) { + out[idx] = fmaxf(0.0f, inp[idx]); // ReLU activation + } +} + +__global__ void sparse_relu_forward_kernel(float* out, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N * T && inp[idx] != 0.0f) { + out[idx] = fmaxf(0.0f, inp[idx]); // Only calculate for non-zero entries + } +} + __global__ void softmax_forward_kernel5(float* out, float inv_temperature, const float* inp, int N, int T) { // inp, out shape: (N, T, T), where N = B * NH // fuses the multiplication by scale inside attention @@ -443,6 +476,35 @@ __global__ void layernorm_backward_kernel2(float* dinp, float* dweight, float* d } } +// Relu 2 + +__global__ void softplus_backward_kernel(float* dinp, const float* dout, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + float beta = 1.0; + if (idx < N * T) { + float x = inp[idx]; + float sigmoid = 1.0f / (1.0f + expf(-beta * x)); // Sigmoid function + dinp[idx] = dout[idx] * sigmoid; + } +} + + +__global__ void relu_backward_kernel(float* dinp, const float* dout, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N * T) { + // Gradient for ReLU: pass through the gradient where inp > 0, else zero + dinp[idx] = (inp[idx] > 0.0f) ? dout[idx] : 0.0f; + } +} + + +__global__ void sparse_relu_backward_kernel(float* dinp, const float* dout, const float* inp, int N, int T) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx < N * T && inp[idx] != 0.0f) { + dinp[idx] = (inp[idx] > 0.0f) ? 
dout[idx] : 0.0f; // Backpropagate only for non-zero entries + } +} + __global__ void softmax_autoregressive_backward_kernel(float* dpreatt, const float* datt, const float* att, int B, int T, int C, float scale) { constexpr const int BlockSize = 256; @@ -764,10 +826,19 @@ void attention_forward(float* out, float* qkvr, float* att, float* preatt = inp; cublasCheck(cublasSgemmStridedBatched(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, &alpha, k, HS, T * HS, q, HS, T * HS, &beta, preatt, T, T * T, B * NH)); + int grid_size = CEIL_DIV(B * NH * T * 32, softmax_block_size); +#ifdef USE_RELU + relu_forward_kernel<<>>(att, preatt, B * NH, T); +#elif defined(USE_RELU_SPARSE) + sparse_relu_forward_kernel<<>>(att, preatt, B * NH, T); +#elif defined(USE_SOFTPLUS) + softplus_forward_kernel<<>>(att, preatt, B * NH, T); +#else // multiply all elements of preatt elementwise by scale float scale = 1.0 / sqrtf(HS); - int grid_size = CEIL_DIV(B * NH * T * 32, softmax_block_size); softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); +#endif + cudaCheck(cudaGetLastError()); // new approach: first cuBLAS another batched matmul @@ -860,9 +931,17 @@ void attention_backward(float* dinp, float* dqkvr, float* dpreatt, float* datt, // backward into dv cublasCheck(cublasSgemmStridedBatched(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, &one, scratch, HS, T * HS, att, T, T * T, &zero, dv, HS, T * HS, B * NH)); // backward into preatt +#ifdef USE_RELU + relu_backward_kernel<<>>(dpreatt, datt, att, B * NH, T); +#elif defined(USE_RELU_SPARSE) + sparse_relu_backward_kernel <<>>(dpreatt, datt, att, B * NH, T); +#elif defined(USE_SOFTPLUS) + softplus_backward_kernel <<>>(dpreatt, datt, att, B * NH, T); +#else int hs = C / NH; // head size float scale = 1.0f / sqrtf(hs); softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, scale); +#endif cudaCheck(cudaGetLastError()); // backward into q cublasCheck(cublasSgemmStridedBatched(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, &one, k, HS, T * HS, dpreatt, T, T * T, &zero, dq, HS, T * HS, B * NH)); @@ -1592,6 +1671,15 @@ int main(int argc, char *argv[]) { printf("+-----------------------+----------------------------------------------------+\n"); printf("| Parameter | Value |\n"); printf("+-----------------------+----------------------------------------------------+\n"); +#ifdef USE_RELU + printf("| softmax activation | %-50s |\n", "ReLU"); +#elif defined(USE_RELU_SPARSE) + printf("| softmax activation | %-50s |\n", "SparseReLU"); +#elif defined(USE_SOFTPLUS) + printf("| softmax activation | %-50s |\n", "SoftPlus"); +#else + printf("| softmax activation | %-50s |\n", "Softmax"); +#endif printf("| train data pattern | %-50s |\n", train_data_pattern); printf("| val data pattern | %-50s |\n", val_data_pattern); printf("| output log file | %-50s |\n", output_log_file == NULL ? "NULL" : output_log_file); @@ -1752,4 +1840,4 @@ int main(int argc, char *argv[]) { return 0; } -#endif \ No newline at end of file +#endif From 7c278a0cb8b19ad3312a615685b4f69fe9a7f874 Mon Sep 17 00:00:00 2001 From: Gregory Kielian Date: Mon, 21 Oct 2024 23:21:18 -0700 Subject: [PATCH 5/5] Add option for ReaLLMASIC heads and embd_dim We'll create an array of different checkpoints and load them to HF. 
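
For reference, a rough parameter-count estimate for the new gpt2:d32 shape (depth 32,
channels 512, 8 heads). This is only a back-of-the-envelope sketch in plain C, assuming
GPT-2's vocabulary of 50257, context length of 1024, and tied input/output embeddings;
the padded vocabulary used on the CUDA path will shift the exact number slightly:

#include <stdio.h>

int main(void) {
    // assumed GPT-2 defaults: vocab 50257, max sequence length 1024
    long L = 32, C = 512, V = 50257, T = 1024;
    long embeddings = V * C + T * C;                       // wte + wpe (lm head tied to wte)
    long per_layer  = 12 * C * C + 13 * C;                 // attn qkv/proj + mlp fc/proj + 2 layernorms
    long total      = embeddings + L * per_layer + 2 * C;  // plus the final layernorm
    printf("gpt2:d32 approx params: %.1fM\n", total / 1e6); // ~127M under these assumptions
    return 0;
}

That is roughly the same budget as gpt2 (124M, depth 12, channels 768), spent on depth
rather than width.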
--- train_gpt2.cu | 1 + train_tiny_stories.sh | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 train_tiny_stories.sh diff --git a/train_gpt2.cu b/train_gpt2.cu index 70d8d0c5a..f3f91db3e 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -521,6 +521,7 @@ void gpt2_set_hyperparameters(GPT2Config* config, const char* depth_str) { assert(depth > 0); // atoi returns 0 if not a number int channels, num_heads; if (depth == 6) { channels = 384; num_heads = 6; } // (unofficial) gpt2-tiny (30M) + else if (depth == 32) { channels = 512; num_heads = 8; } // ReaLLMASIC else if (depth == 12) { channels = 768; num_heads = 12; } // gpt2 (124M) else if (depth == 24) { channels = 1024; num_heads = 16; } // gpt2-medium (350M) else if (depth == 36) { channels = 1280; num_heads = 20; } // gpt2-large (774M) diff --git a/train_tiny_stories.sh b/train_tiny_stories.sh new file mode 100644 index 000000000..cd20ca73d --- /dev/null +++ b/train_tiny_stories.sh @@ -0,0 +1,33 @@ +#!/bin/bash +set -x + +make clean +make train_gpt2cu + +./train_gpt2cu \ + -e gpt2:d32 \ + -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + -j "dev/data/fineweb10B/fineweb_val_*.bin" + # + # -i dev/data/tinystories/TinyStories_train.bin \ + # -j dev/data/tinystories/TinyStories_val.bin +# make clean + +# make train_gpt2fp32cu + +# ./train_gpt2fp32cu \ +# -i dev/data/tinystories/TinyStories_train.bin \ +# -j dev/data/tinystories/TinyStories_val.bin \ +# -l "1e-3f" + +# ./train_gpt2fp32cu \ + # -i dev/data/tinystories/TinyStories_train.bin \ + # -j dev/data/tinystories/TinyStories_val.bin \ + # -l "1e-3f" \ + # -v "1000" \ + # -b "12" \ + # -s "2000" + + + # -i "dev/data/fineweb10B/fineweb_train_*.bin" \ + # -j "dev/data/fineweb10B/fineweb_val_*.bin" \
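
As a sanity check for the softmax alternatives introduced in PATCH 4/5, here is a minimal
standalone CPU sketch of the softplus forward/backward math the new kernels implement
(beta = 1 and threshold = 20, mirroring the kernel constants); it is a reference for
eyeballing values under those assumptions, not code taken from the diff:

#include <math.h>
#include <stdio.h>

// CPU reference of the softplus attention activation added in train_gpt2_fp32.cu
// (beta = 1, threshold = 20, matching the kernel constants).
static float softplus_forward(float x) {
    float beta = 1.0f, threshold = 20.0f;
    if (x * beta > threshold) return x;                // linear regime; avoids overflow in expf
    return (1.0f / beta) * log1pf(expf(beta * x));
}

static float softplus_backward(float dout, float x) {
    float beta = 1.0f;
    float sigmoid = 1.0f / (1.0f + expf(-beta * x));   // d/dx softplus(x) = sigmoid(beta * x)
    return dout * sigmoid;
}

int main(void) {
    for (float x = -4.0f; x <= 4.0f; x += 2.0f) {
        printf("x = %+5.1f  softplus = %8.4f  d/dx = %6.4f\n",
               x, softplus_forward(x), softplus_backward(1.0f, x));
    }
    return 0;
}

Which activation a build uses is selected by the define at the top of train_gpt2_fp32.cu:
USE_SOFTPLUS (as committed), USE_RELU, or USE_RELU_SPARSE; with none of them defined, the
original scaled softmax_forward_kernel5 / softmax_autoregressive_backward_kernel path runs.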