diff --git a/call_for_contributions.md b/call_for_contributions.md index 3bce1073f9..f8bda21729 100644 --- a/call_for_contributions.md +++ b/call_for_contributions.md @@ -21,7 +21,7 @@ mixing numerical, categorical, and text features, doing feature engineering with ## Text-to-image -A text-to-image diffusion model in the style of Imagen, using a frozen BERT encoder from KerasNLP +A text-to-image diffusion model in the style of Imagen, using a frozen BERT encoder from KerasHub and a multi-stage diffusion model. diff --git a/examples/generative/gpt2_text_generation_with_kerasnlp.py b/examples/generative/gpt2_text_generation_with_keras_hub.py similarity index 83% rename from examples/generative/gpt2_text_generation_with_kerasnlp.py rename to examples/generative/gpt2_text_generation_with_keras_hub.py index 5cab89e19e..8076053470 100644 --- a/examples/generative/gpt2_text_generation_with_kerasnlp.py +++ b/examples/generative/gpt2_text_generation_with_keras_hub.py @@ -1,14 +1,14 @@ """ -Title: GPT2 Text Generation with KerasNLP +Title: GPT2 Text Generation with KerasHub Author: Chen Qian Date created: 2023/04/17 Last modified: 2024/04/12 -Description: Use KerasNLP GPT2 model and `samplers` to do text generation. +Description: Use KerasHub GPT2 model and `samplers` to do text generation. Accelerator: GPU """ """ -In this tutorial, you will learn to use [KerasNLP](https://keras.io/keras_nlp/) to load a +In this tutorial, you will learn to use [KerasHub](https://keras.io/keras_hub/) to load a pre-trained Large Language Model (LLM) - [GPT-2 model](https://openai.com/research/better-language-models) (originally invented by OpenAI), finetune it to a specific text style, and generate text based on users' input (also known as prompt). You will also learn @@ -25,23 +25,23 @@ """ """ -## Install KerasNLP, Choose Backend and Import Dependencies +## Install KerasHub, Choose Backend and Import Dependencies This examples uses [Keras 3](https://keras.io/keras_3/) to work in any of `"tensorflow"`, `"jax"` or `"torch"`. Support for Keras 3 is baked into -KerasNLP, simply change the `"KERAS_BACKEND"` environment variable to select +KerasHub, simply change the `"KERAS_BACKEND"` environment variable to select the backend of your choice. We select the JAX backend below. """ """shell -pip install git+https://github.com/keras-team/keras-nlp.git -q +pip install git+https://github.com/keras-team/keras-hub.git -q """ import os os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" -import keras_nlp +import keras_hub import keras import tensorflow as tf import time @@ -70,22 +70,22 @@ """ """ -## Introduction to KerasNLP +## Introduction to KerasHub Large Language Models are complex to build and expensive to train from scratch. -Luckily there are pretrained LLMs available for use right away. [KerasNLP](https://keras.io/keras_nlp/) +Luckily there are pretrained LLMs available for use right away. [KerasHub](https://keras.io/keras_hub/) provides a large number of pre-trained checkpoints that allow you to experiment with SOTA models without needing to train them yourself. -KerasNLP is a natural language processing library that supports users through -their entire development cycle. KerasNLP offers both pretrained models and +KerasHub is a natural language processing library that supports users through +their entire development cycle. KerasHub offers both pretrained models and modularized building blocks, so developers could easily reuse pretrained models or stack their own LLM. 
-In a nutshell, for generative LLM, KerasNLP offers: +In a nutshell, for generative LLM, KerasHub offers: - Pretrained models with `generate()` method, e.g., - `keras_nlp.models.GPT2CausalLM` and `keras_nlp.models.OPTCausalLM`. + `keras_hub.models.GPT2CausalLM` and `keras_hub.models.OPTCausalLM`. - Sampler class that implements generation algorithms such as Top-K, Beam and contrastive search. These samplers can be used to generate text with custom models. @@ -94,21 +94,21 @@ """ ## Load a pre-trained GPT-2 model and generate some text -KerasNLP provides a number of pre-trained models, such as [Google +KerasHub provides a number of pre-trained models, such as [Google Bert](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html) and [GPT-2](https://openai.com/research/better-language-models). You can see -the list of models available in the [KerasNLP repository](https://github.com/keras-team/keras-nlp/tree/master/keras_nlp/models). +the list of models available in the [KerasHub repository](https://github.com/keras-team/keras-hub/tree/master/keras_hub/models). It's very easy to load the GPT-2 model as you can see below: """ # To speed up training and generation, we use preprocessor of length 128 # instead of full length 1024. -preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( "gpt2_base_en", sequence_length=128, ) -gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset( +gpt2_lm = keras_hub.models.GPT2CausalLM.from_preset( "gpt2_base_en", preprocessor=preprocessor ) @@ -150,27 +150,27 @@ """ """ -## More on the GPT-2 model from KerasNLP +## More on the GPT-2 model from KerasHub Next up, we will actually fine-tune the model to update its parameters, but before we do, let's take a look at the full set of tools we have to for working with for GPT2. The code of GPT2 can be found -[here](https://github.com/keras-team/keras-nlp/blob/master/keras_nlp/models/gpt2/). +[here](https://github.com/keras-team/keras-hub/blob/master/keras_hub/models/gpt2/). Conceptually the `GPT2CausalLM` can be hierarchically broken down into several -modules in KerasNLP, all of which have a *from_preset()* function that loads a +modules in KerasHub, all of which have a *from_preset()* function that loads a pretrained model: -- `keras_nlp.models.GPT2Tokenizer`: The tokenizer used by GPT2 model, which is a +- `keras_hub.models.GPT2Tokenizer`: The tokenizer used by GPT2 model, which is a [byte-pair encoder](https://huggingface.co/course/chapter6/5?fw=pt). -- `keras_nlp.models.GPT2CausalLMPreprocessor`: the preprocessor used by GPT2 +- `keras_hub.models.GPT2CausalLMPreprocessor`: the preprocessor used by GPT2 causal LM training. It does the tokenization along with other preprocessing works such as creating the label and appending the end token. -- `keras_nlp.models.GPT2Backbone`: the GPT2 model, which is a stack of - `keras_nlp.layers.TransformerDecoder`. This is usually just referred as +- `keras_hub.models.GPT2Backbone`: the GPT2 model, which is a stack of + `keras_hub.layers.TransformerDecoder`. This is usually just referred as `GPT2`. -- `keras_nlp.models.GPT2CausalLM`: wraps `GPT2Backbone`, it multiplies the +- `keras_hub.models.GPT2CausalLM`: wraps `GPT2Backbone`, it multiplies the output of `GPT2Backbone` by embedding matrix to generate logits over vocab tokens. 
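For example, the following minimal sketch (illustrative only, reusing the `"gpt2_base_en"` preset loaded earlier) pulls two of these modules out individually; in practice, `GPT2CausalLM.from_preset()` assembles all of them for you:

```python
import keras_hub  # already imported at the top of this guide

# Illustrative sketch: load the tokenizer and the backbone separately
# from the same preset.
gpt2_tokenizer = keras_hub.models.GPT2Tokenizer.from_preset("gpt2_base_en")
gpt2_backbone = keras_hub.models.GPT2Backbone.from_preset("gpt2_base_en")

# The tokenizer maps raw text to the token ids the backbone consumes.
token_ids = gpt2_tokenizer("I like basketball")
```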
""" @@ -178,7 +178,7 @@ """ ## Finetune on Reddit dataset -Now you have the knowledge of the GPT-2 model from KerasNLP, you can take one +Now you have the knowledge of the GPT-2 model from KerasHub, you can take one step further to finetune the model so that it generates text in a specific style, short or long, strict or casual. In this tutorial, we will use reddit dataset for example. @@ -217,7 +217,7 @@ """ Now you can finetune the model using the familiar *fit()* function. Note that `preprocessor` will be automatically called inside `fit` method since -`GPT2CausalLM` is a `keras_nlp.models.Task` instance. +`GPT2CausalLM` is a `keras_hub.models.Task` instance. This step takes quite a bit of GPU memory and a long time if we were to train it all the way to a fully trained state. Here we just use part of the dataset @@ -261,7 +261,7 @@ """ ## Into the Sampling Method -In KerasNLP, we offer a few sampling methods, e.g., contrastive search, +In KerasHub, we offer a few sampling methods, e.g., contrastive search, Top-K and beam sampling. By default, our `GPT2CausalLM` uses Top-k search, but you can choose your own sampling method. @@ -270,7 +270,7 @@ - Use a string identifier, such as "greedy", you are using the default configuration via this way. -- Pass a `keras_nlp.samplers.Sampler` instance, you can use custom configuration +- Pass a `keras_hub.samplers.Sampler` instance, you can use custom configuration via this way. """ @@ -281,7 +281,7 @@ print(output) # Use a `Sampler` instance. `GreedySampler` tends to repeat itself, -greedy_sampler = keras_nlp.samplers.GreedySampler() +greedy_sampler = keras_hub.samplers.GreedySampler() gpt2_lm.compile(sampler=greedy_sampler) output = gpt2_lm.generate("I like basketball", max_length=200) @@ -289,8 +289,8 @@ print(output) """ -For more details on KerasNLP `Sampler` class, you can check the code -[here](https://github.com/keras-team/keras-nlp/tree/master/keras_nlp/samplers). +For more details on KerasHub `Sampler` class, you can check the code +[here](https://github.com/keras-team/keras-hub/tree/master/keras_hub/samplers). """ """ diff --git a/examples/generative/ipynb/gpt2_text_generation_with_kerasnlp.ipynb b/examples/generative/ipynb/gpt2_text_generation_with_keras_hub.ipynb similarity index 89% rename from examples/generative/ipynb/gpt2_text_generation_with_kerasnlp.ipynb rename to examples/generative/ipynb/gpt2_text_generation_with_keras_hub.ipynb index c5fc202075..e41d8e013a 100644 --- a/examples/generative/ipynb/gpt2_text_generation_with_kerasnlp.ipynb +++ b/examples/generative/ipynb/gpt2_text_generation_with_keras_hub.ipynb @@ -6,12 +6,12 @@ "colab_type": "text" }, "source": [ - "# GPT2 Text Generation with KerasNLP\n", + "# GPT2 Text Generation with KerasHub\n", "\n", "**Author:** Chen Qian
\n", "**Date created:** 2023/04/17
\n", "**Last modified:** 2024/04/12
\n", - "**Description:** Use KerasNLP GPT2 model and `samplers` to do text generation." + "**Description:** Use KerasHub GPT2 model and `samplers` to do text generation." ] }, { @@ -20,7 +20,7 @@ "colab_type": "text" }, "source": [ - "In this tutorial, you will learn to use [KerasNLP](https://keras.io/keras_nlp/) to load a\n", + "In this tutorial, you will learn to use [KerasHub](https://keras.io/keras_hub/) to load a\n", "pre-trained Large Language Model (LLM) - [GPT-2 model](https://openai.com/research/better-language-models)\n", "(originally invented by OpenAI), finetune it to a specific text style, and\n", "generate text based on users' input (also known as prompt). You will also learn\n", @@ -47,11 +47,11 @@ "colab_type": "text" }, "source": [ - "## Install KerasNLP, Choose Backend and Import Dependencies\n", + "## Install KerasHub, Choose Backend and Import Dependencies\n", "\n", "This examples uses [Keras 3](https://keras.io/keras_3/) to work in any of\n", "`\"tensorflow\"`, `\"jax\"` or `\"torch\"`. Support for Keras 3 is baked into\n", - "KerasNLP, simply change the `\"KERAS_BACKEND\"` environment variable to select\n", + "KerasHub, simply change the `\"KERAS_BACKEND\"` environment variable to select\n", "the backend of your choice. We select the JAX backend below." ] }, @@ -63,7 +63,7 @@ }, "outputs": [], "source": [ - "!pip install git+https://github.com/keras-team/keras-nlp.git -q" + "!pip install git+https://github.com/keras-team/keras-hub.git -q" ] }, { @@ -78,7 +78,7 @@ "\n", "os.environ[\"KERAS_BACKEND\"] = \"jax\" # or \"tensorflow\" or \"torch\"\n", "\n", - "import keras_nlp\n", + "import keras_hub\n", "import keras\n", "import tensorflow as tf\n", "import time\n", @@ -118,22 +118,22 @@ "colab_type": "text" }, "source": [ - "## Introduction to KerasNLP\n", + "## Introduction to KerasHub\n", "\n", "Large Language Models are complex to build and expensive to train from scratch.\n", - "Luckily there are pretrained LLMs available for use right away. [KerasNLP](https://keras.io/keras_nlp/)\n", + "Luckily there are pretrained LLMs available for use right away. [KerasHub](https://keras.io/keras_hub/)\n", "provides a large number of pre-trained checkpoints that allow you to experiment\n", "with SOTA models without needing to train them yourself.\n", "\n", - "KerasNLP is a natural language processing library that supports users through\n", - "their entire development cycle. KerasNLP offers both pretrained models and\n", + "KerasHub is a natural language processing library that supports users through\n", + "their entire development cycle. KerasHub offers both pretrained models and\n", "modularized building blocks, so developers could easily reuse pretrained models\n", "or stack their own LLM.\n", "\n", - "In a nutshell, for generative LLM, KerasNLP offers:\n", + "In a nutshell, for generative LLM, KerasHub offers:\n", "\n", "- Pretrained models with `generate()` method, e.g.,\n", - " `keras_nlp.models.GPT2CausalLM` and `keras_nlp.models.OPTCausalLM`.\n", + " `keras_hub.models.GPT2CausalLM` and `keras_hub.models.OPTCausalLM`.\n", "- Sampler class that implements generation algorithms such as Top-K, Beam and\n", " contrastive search. These samplers can be used to generate text with\n", " custom models." 
@@ -147,10 +147,10 @@ "source": [ "## Load a pre-trained GPT-2 model and generate some text\n", "\n", - "KerasNLP provides a number of pre-trained models, such as [Google\n", + "KerasHub provides a number of pre-trained models, such as [Google\n", "Bert](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html)\n", "and [GPT-2](https://openai.com/research/better-language-models). You can see\n", - "the list of models available in the [KerasNLP repository](https://github.com/keras-team/keras-nlp/tree/master/keras_nlp/models).\n", + "the list of models available in the [KerasHub repository](https://github.com/keras-team/keras-hub/tree/master/keras_hub/models).\n", "\n", "It's very easy to load the GPT-2 model as you can see below:" ] @@ -165,11 +165,11 @@ "source": [ "# To speed up training and generation, we use preprocessor of length 128\n", "# instead of full length 1024.\n", - "preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(\n", + "preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset(\n", " \"gpt2_base_en\",\n", " sequence_length=128,\n", ")\n", - "gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(\n", + "gpt2_lm = keras_hub.models.GPT2CausalLM.from_preset(\n", " \"gpt2_base_en\", preprocessor=preprocessor\n", ")" ] @@ -250,27 +250,27 @@ "colab_type": "text" }, "source": [ - "## More on the GPT-2 model from KerasNLP\n", + "## More on the GPT-2 model from KerasHub\n", "\n", "Next up, we will actually fine-tune the model to update its parameters, but\n", "before we do, let's take a look at the full set of tools we have to for working\n", "with for GPT2.\n", "\n", "The code of GPT2 can be found\n", - "[here](https://github.com/keras-team/keras-nlp/blob/master/keras_nlp/models/gpt2/).\n", + "[here](https://github.com/keras-team/keras-hub/blob/master/keras_hub/models/gpt2/).\n", "Conceptually the `GPT2CausalLM` can be hierarchically broken down into several\n", - "modules in KerasNLP, all of which have a *from_preset()* function that loads a\n", + "modules in KerasHub, all of which have a *from_preset()* function that loads a\n", "pretrained model:\n", "\n", - "- `keras_nlp.models.GPT2Tokenizer`: The tokenizer used by GPT2 model, which is a\n", + "- `keras_hub.models.GPT2Tokenizer`: The tokenizer used by GPT2 model, which is a\n", " [byte-pair encoder](https://huggingface.co/course/chapter6/5?fw=pt).\n", - "- `keras_nlp.models.GPT2CausalLMPreprocessor`: the preprocessor used by GPT2\n", + "- `keras_hub.models.GPT2CausalLMPreprocessor`: the preprocessor used by GPT2\n", " causal LM training. It does the tokenization along with other preprocessing\n", " works such as creating the label and appending the end token.\n", - "- `keras_nlp.models.GPT2Backbone`: the GPT2 model, which is a stack of\n", - " `keras_nlp.layers.TransformerDecoder`. This is usually just referred as\n", + "- `keras_hub.models.GPT2Backbone`: the GPT2 model, which is a stack of\n", + " `keras_hub.layers.TransformerDecoder`. This is usually just referred as\n", " `GPT2`.\n", - "- `keras_nlp.models.GPT2CausalLM`: wraps `GPT2Backbone`, it multiplies the\n", + "- `keras_hub.models.GPT2CausalLM`: wraps `GPT2Backbone`, it multiplies the\n", " output of `GPT2Backbone` by embedding matrix to generate logits over\n", " vocab tokens." 
] @@ -283,7 +283,7 @@ "source": [ "## Finetune on Reddit dataset\n", "\n", - "Now you have the knowledge of the GPT-2 model from KerasNLP, you can take one\n", + "Now you have the knowledge of the GPT-2 model from KerasHub, you can take one\n", "step further to finetune the model so that it generates text in a specific\n", "style, short or long, strict or casual. In this tutorial, we will use reddit\n", "dataset for example." @@ -363,7 +363,7 @@ "source": [ "Now you can finetune the model using the familiar *fit()* function. Note that\n", "`preprocessor` will be automatically called inside `fit` method since\n", - "`GPT2CausalLM` is a `keras_nlp.models.Task` instance.\n", + "`GPT2CausalLM` is a `keras_hub.models.Task` instance.\n", "\n", "This step takes quite a bit of GPU memory and a long time if we were to train\n", "it all the way to a fully trained state. Here we just use part of the dataset\n", @@ -435,7 +435,7 @@ "source": [ "## Into the Sampling Method\n", "\n", - "In KerasNLP, we offer a few sampling methods, e.g., contrastive search,\n", + "In KerasHub, we offer a few sampling methods, e.g., contrastive search,\n", "Top-K and beam sampling. By default, our `GPT2CausalLM` uses Top-k search, but\n", "you can choose your own sampling method.\n", "\n", @@ -444,7 +444,7 @@ "\n", "- Use a string identifier, such as \"greedy\", you are using the default\n", "configuration via this way.\n", - "- Pass a `keras_nlp.samplers.Sampler` instance, you can use custom configuration\n", + "- Pass a `keras_hub.samplers.Sampler` instance, you can use custom configuration\n", "via this way." ] }, @@ -463,7 +463,7 @@ "print(output)\n", "\n", "# Use a `Sampler` instance. `GreedySampler` tends to repeat itself,\n", - "greedy_sampler = keras_nlp.samplers.GreedySampler()\n", + "greedy_sampler = keras_hub.samplers.GreedySampler()\n", "gpt2_lm.compile(sampler=greedy_sampler)\n", "\n", "output = gpt2_lm.generate(\"I like basketball\", max_length=200)\n", @@ -477,8 +477,8 @@ "colab_type": "text" }, "source": [ - "For more details on KerasNLP `Sampler` class, you can check the code\n", - "[here](https://github.com/keras-team/keras-nlp/tree/master/keras_nlp/samplers)." + "For more details on KerasHub `Sampler` class, you can check the code\n", + "[here](https://github.com/keras-team/keras-hub/tree/master/keras_hub/samplers)." ] }, { @@ -642,7 +642,7 @@ "accelerator": "GPU", "colab": { "collapsed_sections": [], - "name": "gpt2_text_generation_with_kerasnlp", + "name": "gpt2_text_generation_with_kerashub", "private_outputs": false, "provenance": [], "toc_visible": true diff --git a/examples/generative/ipynb/text_generation_gpt.ipynb b/examples/generative/ipynb/text_generation_gpt.ipynb index 9e044a4ae1..d7cd9c2a46 100644 --- a/examples/generative/ipynb/text_generation_gpt.ipynb +++ b/examples/generative/ipynb/text_generation_gpt.ipynb @@ -6,12 +6,12 @@ "colab_type": "text" }, "source": [ - "# GPT text generation from scratch with KerasNLP\n", + "# GPT text generation from scratch with KerasHub\n", "\n", "**Author:** [Jesse Chan](https://github.com/jessechancy)
\n", "**Date created:** 2022/07/25
\n", "**Last modified:** 2022/07/25
\n", - "**Description:** Using KerasNLP to train a mini-GPT model for text generation." + "**Description:** Using KerasHub to train a mini-GPT model for text generation." ] }, { @@ -22,7 +22,7 @@ "source": [ "## Introduction\n", "\n", - "In this example, we will use KerasNLP to build a scaled down Generative\n", + "In this example, we will use KerasHub to build a scaled down Generative\n", "Pre-Trained (GPT) model. GPT is a Transformer-based model that allows you to generate\n", "sophisticated text from a prompt.\n", "\n", @@ -33,15 +33,15 @@ "\n", "This example combines concepts from\n", "[Text generation with a miniature GPT](https://keras.io/examples/generative/text_generation_with_miniature_gpt/)\n", - "with KerasNLP abstractions. We will demonstrate how KerasNLP tokenization, layers and\n", + "with KerasHub abstractions. We will demonstrate how KerasHub tokenization, layers and\n", "metrics simplify the training\n", - "process, and then show how to generate output text using the KerasNLP sampling utilities.\n", + "process, and then show how to generate output text using the KerasHub sampling utilities.\n", "\n", "Note: If you are running this example on a Colab,\n", "make sure to enable GPU runtime for faster training.\n", "\n", - "This example requires KerasNLP. You can install it via the following command:\n", - "`pip install keras-nlp`" + "This example requires KerasHub. You can install it via the following command:\n", + "`pip install keras-hub`" ] }, { @@ -61,7 +61,7 @@ }, "outputs": [], "source": [ - "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras-hub\n", "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, @@ -74,7 +74,7 @@ "outputs": [], "source": [ "import os\n", - "import keras_nlp\n", + "import keras_hub\n", "import keras\n", "\n", "import tensorflow.data as tf_data\n", @@ -193,7 +193,7 @@ "outputs": [], "source": [ "# Train tokenizer vocabulary\n", - "vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(\n", + "vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(\n", " raw_train_ds,\n", " vocabulary_size=VOCAB_SIZE,\n", " lowercase=True,\n", @@ -210,7 +210,7 @@ "## Load tokenizer\n", "\n", "We use the vocabulary data to initialize\n", - "`keras_nlp.tokenizers.WordPieceTokenizer`. WordPieceTokenizer is an efficient\n", + "`keras_hub.tokenizers.WordPieceTokenizer`. WordPieceTokenizer is an efficient\n", "implementation of the WordPiece algorithm used by BERT and other models. It will strip,\n", "lower-case and do other irreversible preprocessing operations." 
] @@ -223,7 +223,7 @@ }, "outputs": [], "source": [ - "tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(\n", + "tokenizer = keras_hub.tokenizers.WordPieceTokenizer(\n", " vocabulary=vocab,\n", " sequence_length=SEQ_LEN,\n", " lowercase=True,\n", @@ -250,7 +250,7 @@ "outputs": [], "source": [ "# packer adds a start token\n", - "start_packer = keras_nlp.layers.StartEndPacker(\n", + "start_packer = keras_hub.layers.StartEndPacker(\n", " sequence_length=SEQ_LEN,\n", " start_value=tokenizer.token_to_id(\"[BOS]\"),\n", ")\n", @@ -282,9 +282,9 @@ "\n", "We create our scaled down GPT model with the following layers:\n", "\n", - "- One `keras_nlp.layers.TokenAndPositionEmbedding` layer, which combines the embedding\n", + "- One `keras_hub.layers.TokenAndPositionEmbedding` layer, which combines the embedding\n", "for the token and its position.\n", - "- Multiple `keras_nlp.layers.TransformerDecoder` layers, with the default causal masking.\n", + "- Multiple `keras_hub.layers.TransformerDecoder` layers, with the default causal masking.\n", "The layer has no cross-attention when run with decoder sequence only.\n", "- One final dense linear layer" ] @@ -299,7 +299,7 @@ "source": [ "inputs = keras.layers.Input(shape=(None,), dtype=\"int32\")\n", "# Embedding.\n", - "embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(\n", + "embedding_layer = keras_hub.layers.TokenAndPositionEmbedding(\n", " vocabulary_size=VOCAB_SIZE,\n", " sequence_length=SEQ_LEN,\n", " embedding_dim=EMBED_DIM,\n", @@ -308,7 +308,7 @@ "x = embedding_layer(inputs)\n", "# Transformer decoders.\n", "for _ in range(NUM_LAYERS):\n", - " decoder_layer = keras_nlp.layers.TransformerDecoder(\n", + " decoder_layer = keras_hub.layers.TransformerDecoder(\n", " num_heads=NUM_HEADS,\n", " intermediate_dim=FEED_FORWARD_DIM,\n", " )\n", @@ -317,7 +317,7 @@ "outputs = keras.layers.Dense(VOCAB_SIZE)(x)\n", "model = keras.Model(inputs=inputs, outputs=outputs)\n", "loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True)\n", - "perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0)\n", + "perplexity = keras_hub.metrics.Perplexity(from_logits=True, mask_token_id=0)\n", "model.compile(optimizer=\"adam\", loss=loss_fn, metrics=[perplexity])" ] }, @@ -402,7 +402,7 @@ "colab_type": "text" }, "source": [ - "We will use the `keras_nlp.samplers` module for inference, which requires a\n", + "We will use the `keras_hub.samplers` module for inference, which requires a\n", "callback function wrapping the model we just trained. This wrapper calls\n", "the model and returns the logit predictions for the current token we are\n", "generating.\n", @@ -411,7 +411,7 @@ "defining your callback. The first is the ability to take in a `cache` of states\n", "computed in previous generation steps, which can be used to speed up generation.\n", "The second is the ability to output the final dense \"hidden state\" of each\n", - "generated token. This is used by `keras_nlp.samplers.ContrastiveSampler`, which\n", + "generated token. This is used by `keras_hub.samplers.ContrastiveSampler`, which\n", "avoids repetition by penalizing repeated hidden states. Both are optional, and\n", "we will ignore them for now." 
] @@ -463,7 +463,7 @@ }, "outputs": [], "source": [ - "sampler = keras_nlp.samplers.GreedySampler()\n", + "sampler = keras_hub.samplers.GreedySampler()\n", "output_tokens = sampler(\n", " next=next,\n", " prompt=prompt_tokens,\n", @@ -508,7 +508,7 @@ }, "outputs": [], "source": [ - "sampler = keras_nlp.samplers.BeamSampler(num_beams=10)\n", + "sampler = keras_hub.samplers.BeamSampler(num_beams=10)\n", "output_tokens = sampler(\n", " next=next,\n", " prompt=prompt_tokens,\n", @@ -548,7 +548,7 @@ }, "outputs": [], "source": [ - "sampler = keras_nlp.samplers.RandomSampler()\n", + "sampler = keras_hub.samplers.RandomSampler()\n", "output_tokens = sampler(\n", " next=next,\n", " prompt=prompt_tokens,\n", @@ -592,7 +592,7 @@ }, "outputs": [], "source": [ - "sampler = keras_nlp.samplers.TopKSampler(k=10)\n", + "sampler = keras_hub.samplers.TopKSampler(k=10)\n", "output_tokens = sampler(\n", " next=next,\n", " prompt=prompt_tokens,\n", @@ -632,7 +632,7 @@ }, "outputs": [], "source": [ - "sampler = keras_nlp.samplers.TopPSampler(p=0.5)\n", + "sampler = keras_hub.samplers.TopPSampler(p=0.5)\n", "output_tokens = sampler(\n", " next=next,\n", " prompt=prompt_tokens,\n", @@ -667,7 +667,7 @@ " \"\"\"A callback to generate text from a trained model using top-k.\"\"\"\n", "\n", " def __init__(self, k):\n", - " self.sampler = keras_nlp.samplers.TopKSampler(k)\n", + " self.sampler = keras_hub.samplers.TopKSampler(k)\n", "\n", " def on_epoch_end(self, epoch, logs=None):\n", " output_tokens = self.sampler(\n", @@ -692,7 +692,7 @@ "source": [ "## Conclusion\n", "\n", - "To recap, in this example, we use KerasNLP layers to train a sub-word vocabulary,\n", + "To recap, in this example, we use KerasHub layers to train a sub-word vocabulary,\n", "tokenize training data, create a miniature GPT model, and perform inference with the\n", "text generation library.\n", "\n", diff --git a/examples/generative/md/gpt2_text_generation_with_kerasnlp.md b/examples/generative/md/gpt2_text_generation_with_keras_hub.md similarity index 90% rename from examples/generative/md/gpt2_text_generation_with_kerasnlp.md rename to examples/generative/md/gpt2_text_generation_with_keras_hub.md index 12873e9c12..4ea0f14421 100644 --- a/examples/generative/md/gpt2_text_generation_with_kerasnlp.md +++ b/examples/generative/md/gpt2_text_generation_with_keras_hub.md @@ -1,16 +1,16 @@ -# GPT2 Text Generation with KerasNLP +# GPT2 Text Generation with KerasHub **Author:** Chen Qian
**Date created:** 2023/04/17
**Last modified:** 2024/04/12
-**Description:** Use KerasNLP GPT2 model and `samplers` to do text generation. +**Description:** Use KerasHub GPT2 model and `samplers` to do text generation. - [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/generative/ipynb/gpt2_text_generation_with_kerasnlp.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/generative/gpt2_text_generation_with_kerasnlp.py) + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/generative/ipynb/gpt2_text_generation_with_kerashub.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/generative/gpt2_text_generation_with_kerashub.py) -In this tutorial, you will learn to use [KerasNLP](https://keras.io/keras_nlp/) to load a +In this tutorial, you will learn to use [KerasHub](https://keras.io/keras_hub/) to load a pre-trained Large Language Model (LLM) - [GPT-2 model](https://openai.com/research/better-language-models) (originally invented by OpenAI), finetune it to a specific text style, and generate text based on users' input (also known as prompt). You will also learn @@ -25,16 +25,16 @@ Change runtime type** and choose the GPU Hardware Accelerator runtime GPT-2 model. Running this tutorial on CPU runtime will take hours. --- -## Install KerasNLP, Choose Backend and Import Dependencies +## Install KerasHub, Choose Backend and Import Dependencies This examples uses [Keras 3](https://keras.io/keras_3/) to work in any of `"tensorflow"`, `"jax"` or `"torch"`. Support for Keras 3 is baked into -KerasNLP, simply change the `"KERAS_BACKEND"` environment variable to select +KerasHub, simply change the `"KERAS_BACKEND"` environment variable to select the backend of your choice. We select the JAX backend below. ```python -!pip install git+https://github.com/keras-team/keras-nlp.git -q +!pip install git+https://github.com/keras-team/keras-hub.git -q ``` ```python @@ -42,7 +42,7 @@ import os os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" -import keras_nlp +import keras_hub import keras import tensorflow as tf import time @@ -75,22 +75,22 @@ pedagogical discussion on language models, you can refer to the [Stanford CS324 LLM class](https://stanford-cs324.github.io/winter2022/lectures/introduction/). --- -## Introduction to KerasNLP +## Introduction to KerasHub Large Language Models are complex to build and expensive to train from scratch. -Luckily there are pretrained LLMs available for use right away. [KerasNLP](https://keras.io/keras_nlp/) +Luckily there are pretrained LLMs available for use right away. [KerasHub](https://keras.io/keras_hub/) provides a large number of pre-trained checkpoints that allow you to experiment with SOTA models without needing to train them yourself. -KerasNLP is a natural language processing library that supports users through -their entire development cycle. KerasNLP offers both pretrained models and +KerasHub is a natural language processing library that supports users through +their entire development cycle. KerasHub offers both pretrained models and modularized building blocks, so developers could easily reuse pretrained models or stack their own LLM. -In a nutshell, for generative LLM, KerasNLP offers: +In a nutshell, for generative LLM, KerasHub offers: - Pretrained models with `generate()` method, e.g., - `keras_nlp.models.GPT2CausalLM` and `keras_nlp.models.OPTCausalLM`. + `keras_hub.models.GPT2CausalLM` and `keras_hub.models.OPTCausalLM`. 
- Sampler class that implements generation algorithms such as Top-K, Beam and contrastive search. These samplers can be used to generate text with custom models. @@ -98,10 +98,10 @@ In a nutshell, for generative LLM, KerasNLP offers: --- ## Load a pre-trained GPT-2 model and generate some text -KerasNLP provides a number of pre-trained models, such as [Google +KerasHub provides a number of pre-trained models, such as [Google Bert](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html) and [GPT-2](https://openai.com/research/better-language-models). You can see -the list of models available in the [KerasNLP repository](https://github.com/keras-team/keras-nlp/tree/master/keras_nlp/models). +the list of models available in the [KerasHub repository](https://github.com/keras-team/keras-hub/tree/master/keras_hub/models). It's very easy to load the GPT-2 model as you can see below: @@ -109,11 +109,11 @@ It's very easy to load the GPT-2 model as you can see below: ```python # To speed up training and generation, we use preprocessor of length 128 # instead of full length 1024. -preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( "gpt2_base_en", sequence_length=128, ) -gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset( +gpt2_lm = keras_hub.models.GPT2CausalLM.from_preset( "gpt2_base_en", preprocessor=preprocessor ) ``` @@ -258,34 +258,34 @@ The quality of the generated text looks OK, but we can improve it via fine-tuning. --- -## More on the GPT-2 model from KerasNLP +## More on the GPT-2 model from KerasHub Next up, we will actually fine-tune the model to update its parameters, but before we do, let's take a look at the full set of tools we have to for working with for GPT2. The code of GPT2 can be found -[here](https://github.com/keras-team/keras-nlp/blob/master/keras_nlp/models/gpt2/). +[here](https://github.com/keras-team/keras-hub/blob/master/keras_hub/models/gpt2/). Conceptually the `GPT2CausalLM` can be hierarchically broken down into several -modules in KerasNLP, all of which have a *from_preset()* function that loads a +modules in KerasHub, all of which have a *from_preset()* function that loads a pretrained model: -- `keras_nlp.models.GPT2Tokenizer`: The tokenizer used by GPT2 model, which is a +- `keras_hub.models.GPT2Tokenizer`: The tokenizer used by GPT2 model, which is a [byte-pair encoder](https://huggingface.co/course/chapter6/5?fw=pt). -- `keras_nlp.models.GPT2CausalLMPreprocessor`: the preprocessor used by GPT2 +- `keras_hub.models.GPT2CausalLMPreprocessor`: the preprocessor used by GPT2 causal LM training. It does the tokenization along with other preprocessing works such as creating the label and appending the end token. -- `keras_nlp.models.GPT2Backbone`: the GPT2 model, which is a stack of - `keras_nlp.layers.TransformerDecoder`. This is usually just referred as +- `keras_hub.models.GPT2Backbone`: the GPT2 model, which is a stack of + `keras_hub.layers.TransformerDecoder`. This is usually just referred as `GPT2`. -- `keras_nlp.models.GPT2CausalLM`: wraps `GPT2Backbone`, it multiplies the +- `keras_hub.models.GPT2CausalLM`: wraps `GPT2Backbone`, it multiplies the output of `GPT2Backbone` by embedding matrix to generate logits over vocab tokens. 
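For example, the following minimal sketch (illustrative only, reusing the `"gpt2_base_en"` preset loaded above) pulls two of these modules out individually; in practice, `GPT2CausalLM.from_preset()` assembles all of them for you:

```python
import keras_hub  # already imported at the top of this guide

# Illustrative sketch: load the tokenizer and the backbone separately
# from the same preset.
gpt2_tokenizer = keras_hub.models.GPT2Tokenizer.from_preset("gpt2_base_en")
gpt2_backbone = keras_hub.models.GPT2Backbone.from_preset("gpt2_base_en")

# The tokenizer maps raw text to the token ids the backbone consumes.
token_ids = gpt2_tokenizer("I like basketball")
```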
--- ## Finetune on Reddit dataset -Now you have the knowledge of the GPT-2 model from KerasNLP, you can take one +Now you have the knowledge of the GPT-2 model from KerasHub, you can take one step further to finetune the model so that it generates text in a specific style, short or long, strict or casual. In this tutorial, we will use reddit dataset for example. @@ -333,7 +333,7 @@ train_ds = ( Now you can finetune the model using the familiar *fit()* function. Note that `preprocessor` will be automatically called inside `fit` method since -`GPT2CausalLM` is a `keras_nlp.models.Task` instance. +`GPT2CausalLM` is a `keras_hub.models.Task` instance. This step takes quite a bit of GPU memory and a long time if we were to train it all the way to a fully trained state. Here we just use part of the dataset @@ -415,7 +415,7 @@ TOTAL TIME ELAPSED: 21.13s --- ## Into the Sampling Method -In KerasNLP, we offer a few sampling methods, e.g., contrastive search, +In KerasHub, we offer a few sampling methods, e.g., contrastive search, Top-K and beam sampling. By default, our `GPT2CausalLM` uses Top-k search, but you can choose your own sampling method. @@ -424,7 +424,7 @@ sampler: - Use a string identifier, such as "greedy", you are using the default configuration via this way. -- Pass a `keras_nlp.samplers.Sampler` instance, you can use custom configuration +- Pass a `keras_hub.samplers.Sampler` instance, you can use custom configuration via this way. @@ -436,7 +436,7 @@ print("\nGPT-2 output:") print(output) # Use a `Sampler` instance. `GreedySampler` tends to repeat itself, -greedy_sampler = keras_nlp.samplers.GreedySampler() +greedy_sampler = keras_hub.samplers.GreedySampler() gpt2_lm.compile(sampler=greedy_sampler) output = gpt2_lm.generate("I like basketball", max_length=200) @@ -531,8 +531,8 @@ so i was playing with my brother, and he was playing with his brother ``` -For more details on KerasNLP `Sampler` class, you can check the code -[here](https://github.com/keras-team/keras-nlp/tree/master/keras_nlp/samplers). +For more details on KerasHub `Sampler` class, you can check the code +[here](https://github.com/keras-team/keras-hub/tree/master/keras_hub/samplers). --- ## Finetune on Chinese Poem Dataset diff --git a/examples/generative/md/text_generation_gpt.md b/examples/generative/md/text_generation_gpt.md index 4adea7a97c..fbecd5656b 100644 --- a/examples/generative/md/text_generation_gpt.md +++ b/examples/generative/md/text_generation_gpt.md @@ -1,9 +1,9 @@ -# GPT text generation from scratch with KerasNLP +# GPT text generation from scratch with KerasHub **Author:** [Jesse Chan](https://github.com/jessechancy)
**Date created:** 2022/07/25
**Last modified:** 2022/07/25
-**Description:** Using KerasNLP to train a mini-GPT model for text generation. +**Description:** Using KerasHub to train a mini-GPT model for text generation. [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/generative/ipynb/text_generation_gpt.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/generative/text_generation_gpt.py) @@ -13,7 +13,7 @@ --- ## Introduction -In this example, we will use KerasNLP to build a scaled down Generative +In this example, we will use KerasHub to build a scaled down Generative Pre-Trained (GPT) model. GPT is a Transformer-based model that allows you to generate sophisticated text from a prompt. @@ -24,28 +24,28 @@ model with few parameters. This example combines concepts from [Text generation with a miniature GPT](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) -with KerasNLP abstractions. We will demonstrate how KerasNLP tokenization, layers and +with KerasHub abstractions. We will demonstrate how KerasHub tokenization, layers and metrics simplify the training -process, and then show how to generate output text using the KerasNLP sampling utilities. +process, and then show how to generate output text using the KerasHub sampling utilities. Note: If you are running this example on a Colab, make sure to enable GPU runtime for faster training. -This example requires KerasNLP. You can install it via the following command: -`pip install keras-nlp` +This example requires KerasHub. You can install it via the following command: +`pip install keras-hub` --- ## Setup ```python -!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras-hub !pip install -q --upgrade keras # Upgrade to Keras 3. ``` ```python import os -import keras_nlp +import keras_hub import keras import tensorflow.data as tf_data @@ -135,7 +135,7 @@ representing the beginning of each line of training data. ```python # Train tokenizer vocabulary -vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( +vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( raw_train_ds, vocabulary_size=VOCAB_SIZE, lowercase=True, @@ -147,13 +147,13 @@ vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( ## Load tokenizer We use the vocabulary data to initialize -`keras_nlp.tokenizers.WordPieceTokenizer`. WordPieceTokenizer is an efficient +`keras_hub.tokenizers.WordPieceTokenizer`. WordPieceTokenizer is an efficient implementation of the WordPiece algorithm used by BERT and other models. It will strip, lower-case and do other irreversible preprocessing operations. ```python -tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=vocab, sequence_length=SEQ_LEN, lowercase=True, @@ -168,7 +168,7 @@ We preprocess the dataset by tokenizing and splitting it into `features` and `la ```python # packer adds a start token -start_packer = keras_nlp.layers.StartEndPacker( +start_packer = keras_hub.layers.StartEndPacker( sequence_length=SEQ_LEN, start_value=tokenizer.token_to_id("[BOS]"), ) @@ -195,9 +195,9 @@ val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetc We create our scaled down GPT model with the following layers: -- One `keras_nlp.layers.TokenAndPositionEmbedding` layer, which combines the embedding +- One `keras_hub.layers.TokenAndPositionEmbedding` layer, which combines the embedding for the token and its position. 
-- Multiple `keras_nlp.layers.TransformerDecoder` layers, with the default causal masking. +- Multiple `keras_hub.layers.TransformerDecoder` layers, with the default causal masking. The layer has no cross-attention when run with decoder sequence only. - One final dense linear layer @@ -205,7 +205,7 @@ The layer has no cross-attention when run with decoder sequence only. ```python inputs = keras.layers.Input(shape=(None,), dtype="int32") # Embedding. -embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding( +embedding_layer = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=VOCAB_SIZE, sequence_length=SEQ_LEN, embedding_dim=EMBED_DIM, @@ -214,7 +214,7 @@ embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding( x = embedding_layer(inputs) # Transformer decoders. for _ in range(NUM_LAYERS): - decoder_layer = keras_nlp.layers.TransformerDecoder( + decoder_layer = keras_hub.layers.TransformerDecoder( num_heads=NUM_HEADS, intermediate_dim=FEED_FORWARD_DIM, ) @@ -223,7 +223,7 @@ for _ in range(NUM_LAYERS): outputs = keras.layers.Dense(VOCAB_SIZE)(x) model = keras.Model(inputs=inputs, outputs=outputs) loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) -perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0) +perplexity = keras_hub.metrics.Perplexity(from_logits=True, mask_token_id=0) model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity]) ``` @@ -343,7 +343,7 @@ array([[2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ``` -We will use the `keras_nlp.samplers` module for inference, which requires a +We will use the `keras_hub.samplers` module for inference, which requires a callback function wrapping the model we just trained. This wrapper calls the model and returns the logit predictions for the current token we are generating. @@ -352,7 +352,7 @@ Note: There are two pieces of more advanced functionality available when defining your callback. The first is the ability to take in a `cache` of states computed in previous generation steps, which can be used to speed up generation. The second is the ability to output the final dense "hidden state" of each -generated token. This is used by `keras_nlp.samplers.ContrastiveSampler`, which +generated token. This is used by `keras_hub.samplers.ContrastiveSampler`, which avoids repetition by penalizing repeated hidden states. Both are optional, and we will ignore them for now. @@ -377,7 +377,7 @@ argmax of the model output. ```python -sampler = keras_nlp.samplers.GreedySampler() +sampler = keras_hub.samplers.GreedySampler() output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -411,7 +411,7 @@ greedy search since it has to compute and store multiple potential sequences. ```python -sampler = keras_nlp.samplers.BeamSampler(num_beams=10) +sampler = keras_hub.samplers.BeamSampler(num_beams=10) output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -440,7 +440,7 @@ token using the softmax probabilities provided by the model. ```python -sampler = keras_nlp.samplers.RandomSampler() +sampler = keras_hub.samplers.RandomSampler() output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -473,7 +473,7 @@ nonsensical words! ```python -sampler = keras_nlp.samplers.TopKSampler(k=10) +sampler = keras_hub.samplers.TopKSampler(k=10) output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -509,7 +509,7 @@ similarly filter out the top 10 tokens to sample from. 
```python -sampler = keras_nlp.samplers.TopPSampler(p=0.5) +sampler = keras_hub.samplers.TopPSampler(p=0.5) output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -540,7 +540,7 @@ class TopKTextGenerator(keras.callbacks.Callback): """A callback to generate text from a trained model using top-k.""" def __init__(self, k): - self.sampler = keras_nlp.samplers.TopKSampler(k) + self.sampler = keras_hub.samplers.TopKSampler(k) def on_epoch_end(self, epoch, logs=None): output_tokens = self.sampler( @@ -585,7 +585,7 @@ Top-K search generated text: --- ## Conclusion -To recap, in this example, we use KerasNLP layers to train a sub-word vocabulary, +To recap, in this example, we use KerasHub layers to train a sub-word vocabulary, tokenize training data, create a miniature GPT model, and perform inference with the text generation library. diff --git a/examples/generative/text_generation_gpt.py b/examples/generative/text_generation_gpt.py index eab77dde96..3b63ab760c 100644 --- a/examples/generative/text_generation_gpt.py +++ b/examples/generative/text_generation_gpt.py @@ -1,16 +1,16 @@ """ -Title: GPT text generation from scratch with KerasNLP +Title: GPT text generation from scratch with KerasHub Author: [Jesse Chan](https://github.com/jessechancy) Date created: 2022/07/25 Last modified: 2022/07/25 -Description: Using KerasNLP to train a mini-GPT model for text generation. +Description: Using KerasHub to train a mini-GPT model for text generation. Accelerator: GPU """ """ ## Introduction -In this example, we will use KerasNLP to build a scaled down Generative +In this example, we will use KerasHub to build a scaled down Generative Pre-Trained (GPT) model. GPT is a Transformer-based model that allows you to generate sophisticated text from a prompt. @@ -21,15 +21,15 @@ This example combines concepts from [Text generation with a miniature GPT](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) -with KerasNLP abstractions. We will demonstrate how KerasNLP tokenization, layers and +with KerasHub abstractions. We will demonstrate how KerasHub tokenization, layers and metrics simplify the training -process, and then show how to generate output text using the KerasNLP sampling utilities. +process, and then show how to generate output text using the KerasHub sampling utilities. Note: If you are running this example on a Colab, make sure to enable GPU runtime for faster training. -This example requires KerasNLP. You can install it via the following command: -`pip install keras-nlp` +This example requires KerasHub. You can install it via the following command: +`pip install keras-hub` """ """ @@ -37,12 +37,12 @@ """ """shell -pip install -q --upgrade keras-nlp +pip install -q --upgrade keras-hub pip install -q --upgrade keras # Upgrade to Keras 3. """ import os -import keras_nlp +import keras_hub import keras import tensorflow.data as tf_data @@ -119,7 +119,7 @@ """ # Train tokenizer vocabulary -vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( +vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( raw_train_ds, vocabulary_size=VOCAB_SIZE, lowercase=True, @@ -130,12 +130,12 @@ ## Load tokenizer We use the vocabulary data to initialize -`keras_nlp.tokenizers.WordPieceTokenizer`. WordPieceTokenizer is an efficient +`keras_hub.tokenizers.WordPieceTokenizer`. WordPieceTokenizer is an efficient implementation of the WordPiece algorithm used by BERT and other models. It will strip, lower-case and do other irreversible preprocessing operations. 
""" -tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=vocab, sequence_length=SEQ_LEN, lowercase=True, @@ -148,7 +148,7 @@ """ # packer adds a start token -start_packer = keras_nlp.layers.StartEndPacker( +start_packer = keras_hub.layers.StartEndPacker( sequence_length=SEQ_LEN, start_value=tokenizer.token_to_id("[BOS]"), ) @@ -174,16 +174,16 @@ def preprocess(inputs): We create our scaled down GPT model with the following layers: -- One `keras_nlp.layers.TokenAndPositionEmbedding` layer, which combines the embedding +- One `keras_hub.layers.TokenAndPositionEmbedding` layer, which combines the embedding for the token and its position. -- Multiple `keras_nlp.layers.TransformerDecoder` layers, with the default causal masking. +- Multiple `keras_hub.layers.TransformerDecoder` layers, with the default causal masking. The layer has no cross-attention when run with decoder sequence only. - One final dense linear layer """ inputs = keras.layers.Input(shape=(None,), dtype="int32") # Embedding. -embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding( +embedding_layer = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=VOCAB_SIZE, sequence_length=SEQ_LEN, embedding_dim=EMBED_DIM, @@ -192,7 +192,7 @@ def preprocess(inputs): x = embedding_layer(inputs) # Transformer decoders. for _ in range(NUM_LAYERS): - decoder_layer = keras_nlp.layers.TransformerDecoder( + decoder_layer = keras_hub.layers.TransformerDecoder( num_heads=NUM_HEADS, intermediate_dim=FEED_FORWARD_DIM, ) @@ -201,7 +201,7 @@ def preprocess(inputs): outputs = keras.layers.Dense(VOCAB_SIZE)(x) model = keras.Model(inputs=inputs, outputs=outputs) loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) -perplexity = keras_nlp.metrics.Perplexity(from_logits=True, mask_token_id=0) +perplexity = keras_hub.metrics.Perplexity(from_logits=True, mask_token_id=0) model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity]) """ @@ -238,7 +238,7 @@ def preprocess(inputs): prompt_tokens """ -We will use the `keras_nlp.samplers` module for inference, which requires a +We will use the `keras_hub.samplers` module for inference, which requires a callback function wrapping the model we just trained. This wrapper calls the model and returns the logit predictions for the current token we are generating. @@ -247,7 +247,7 @@ def preprocess(inputs): defining your callback. The first is the ability to take in a `cache` of states computed in previous generation steps, which can be used to speed up generation. The second is the ability to output the final dense "hidden state" of each -generated token. This is used by `keras_nlp.samplers.ContrastiveSampler`, which +generated token. This is used by `keras_hub.samplers.ContrastiveSampler`, which avoids repetition by penalizing repeated hidden states. Both are optional, and we will ignore them for now. """ @@ -272,7 +272,7 @@ def next(prompt, cache, index): argmax of the model output. """ -sampler = keras_nlp.samplers.GreedySampler() +sampler = keras_hub.samplers.GreedySampler() output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -298,7 +298,7 @@ def next(prompt, cache, index): **Note:** beam search with `num_beams=1` is identical to greedy search. 
""" -sampler = keras_nlp.samplers.BeamSampler(num_beams=10) +sampler = keras_hub.samplers.BeamSampler(num_beams=10) output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -319,7 +319,7 @@ def next(prompt, cache, index): token using the softmax probabilities provided by the model. """ -sampler = keras_nlp.samplers.RandomSampler() +sampler = keras_hub.samplers.RandomSampler() output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -344,7 +344,7 @@ def next(prompt, cache, index): nonsensical words! """ -sampler = keras_nlp.samplers.TopKSampler(k=10) +sampler = keras_hub.samplers.TopKSampler(k=10) output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -370,7 +370,7 @@ def next(prompt, cache, index): similarly filter out the top 10 tokens to sample from. """ -sampler = keras_nlp.samplers.TopPSampler(p=0.5) +sampler = keras_hub.samplers.TopPSampler(p=0.5) output_tokens = sampler( next=next, prompt=prompt_tokens, @@ -391,7 +391,7 @@ class TopKTextGenerator(keras.callbacks.Callback): """A callback to generate text from a trained model using top-k.""" def __init__(self, k): - self.sampler = keras_nlp.samplers.TopKSampler(k) + self.sampler = keras_hub.samplers.TopKSampler(k) def on_epoch_end(self, epoch, logs=None): output_tokens = self.sampler( @@ -410,7 +410,7 @@ def on_epoch_end(self, epoch, logs=None): """ ## Conclusion -To recap, in this example, we use KerasNLP layers to train a sub-word vocabulary, +To recap, in this example, we use KerasHub layers to train a sub-word vocabulary, tokenize training data, create a miniature GPT model, and perform inference with the text generation library. diff --git a/examples/keras_recipes/float8_training_and_inference_with_transformer.py b/examples/keras_recipes/float8_training_and_inference_with_transformer.py index 6313961c54..13490cb880 100644 --- a/examples/keras_recipes/float8_training_and_inference_with_transformer.py +++ b/examples/keras_recipes/float8_training_and_inference_with_transformer.py @@ -46,14 +46,14 @@ """ ## Setup -We will use KerasNLP library to simplify the model implementation. Additionally, +We will use KerasHub library to simplify the model implementation. Additionally, use mixed precision training to reduce the training time. Note: The dependency on TensorFlow is only required for data processing. """ """shell -pip install -q --upgrade keras-nlp +pip install -q --upgrade keras-hub pip install -q --upgrade keras # Upgrade to Keras 3. """ @@ -65,7 +65,7 @@ import re import keras -import keras_nlp +import keras_hub import tensorflow as tf keras.config.set_dtype_policy("mixed_bfloat16") @@ -144,8 +144,8 @@ """ ### Tokenizing the data -We'll be using the `keras_nlp.tokenizers.WordPieceTokenizer` layer to tokenize -the text. `keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary +We'll be using the `keras_hub.tokenizers.WordPieceTokenizer` layer to tokenize +the text. `keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary and has functions for tokenizing the text, and detokenizing sequences of tokens. Before we define the tokenizer, we first need to train it on the dataset @@ -153,15 +153,15 @@ algorithm; training it on a corpus gives us a vocabulary of subwords. A subword tokenizer is a compromise between word tokenizers (word tokenizers need very large vocabularies for good coverage of input words), and character tokenizers -(characters don't really encode meaning like words do). Luckily, KerasNLP +(characters don't really encode meaning like words do). 
Luckily, KerasHub makes it very simple to train WordPiece on a corpus with the -`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility. +`keras_hub.tokenizers.compute_word_piece_vocabulary` utility. """ def train_word_piece(ds, vocab_size, reserved_tokens): word_piece_ds = ds.unbatch().map(lambda x, y: x) - vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( + vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( word_piece_ds.batch(1000).prefetch(2), vocabulary_size=vocab_size, reserved_tokens=reserved_tokens, @@ -195,7 +195,7 @@ def train_word_piece(ds, vocab_size, reserved_tokens): less than the specified sequence length. Otherwise, the sequence is truncated. """ -tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=vocab, lowercase=False, sequence_length=MAX_SEQUENCE_LENGTH, @@ -241,7 +241,7 @@ def make_dataset(dataset): ## Model Let's build a simple Transformer model. We will use `TokenAndPositionEmbedding` -and `TransformerDecoder` from KerasNLP library. `TokenAndPositionEmbedding` +and `TransformerDecoder` from KerasHub library. `TokenAndPositionEmbedding` represents words and their order in a sentence, while `TransformerDecoder` outputs one vector for each time step of our input sequence. Here, we take the mean across all time steps and use a feedforward network on top of it to @@ -258,13 +258,13 @@ def build_model( dropout=0.1, ): token_id_input = keras.layers.Input(shape=(None,), dtype="int32", name="input_ids") - x = keras_nlp.layers.TokenAndPositionEmbedding( + x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=vocabulary_size, sequence_length=max_sequence_length, embedding_dim=hidden_dim, )(token_id_input) x = keras.layers.Dropout(rate=dropout)(x) - x = keras_nlp.layers.TransformerDecoder( + x = keras_hub.layers.TransformerDecoder( intermediate_dim=intermediate_dim, num_heads=num_heads, dropout=dropout, diff --git a/examples/keras_recipes/ipynb/float8_training_and_inference_with_transformer.ipynb b/examples/keras_recipes/ipynb/float8_training_and_inference_with_transformer.ipynb index e70c77ab3b..b1736a91e0 100644 --- a/examples/keras_recipes/ipynb/float8_training_and_inference_with_transformer.ipynb +++ b/examples/keras_recipes/ipynb/float8_training_and_inference_with_transformer.ipynb @@ -63,7 +63,7 @@ "source": [ "## Setup\n", "\n", - "We will use KerasNLP library to simplify the model implementation. Additionally,\n", + "We will use KerasHub library to simplify the model implementation. Additionally,\n", "use mixed precision training to reduce the training time.\n", "\n", "Note: The dependency on TensorFlow is only required for data processing." @@ -77,7 +77,7 @@ }, "outputs": [], "source": [ - "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras-hub\n", "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, @@ -97,7 +97,7 @@ "import re\n", "\n", "import keras\n", - "import keras_nlp\n", + "import keras_hub\n", "import tensorflow as tf\n", "\n", "keras.config.set_dtype_policy(\"mixed_bfloat16\")" @@ -250,8 +250,8 @@ "source": [ "### Tokenizing the data\n", "\n", - "We'll be using the `keras_nlp.tokenizers.WordPieceTokenizer` layer to tokenize\n", - "the text. `keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary\n", + "We'll be using the `keras_hub.tokenizers.WordPieceTokenizer` layer to tokenize\n", + "the text. 
`keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary\n", "and has functions for tokenizing the text, and detokenizing sequences of tokens.\n", "\n", "Before we define the tokenizer, we first need to train it on the dataset\n", @@ -259,9 +259,9 @@ "algorithm; training it on a corpus gives us a vocabulary of subwords. A subword\n", "tokenizer is a compromise between word tokenizers (word tokenizers need very\n", "large vocabularies for good coverage of input words), and character tokenizers\n", - "(characters don't really encode meaning like words do). Luckily, KerasNLP\n", + "(characters don't really encode meaning like words do). Luckily, KerasHub\n", "makes it very simple to train WordPiece on a corpus with the\n", - "`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility." + "`keras_hub.tokenizers.compute_word_piece_vocabulary` utility." ] }, { @@ -275,7 +275,7 @@ "\n", "def train_word_piece(ds, vocab_size, reserved_tokens):\n", " word_piece_ds = ds.unbatch().map(lambda x, y: x)\n", - " vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(\n", + " vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(\n", " word_piece_ds.batch(1000).prefetch(2),\n", " vocabulary_size=vocab_size,\n", " reserved_tokens=reserved_tokens,\n", @@ -350,7 +350,7 @@ }, "outputs": [], "source": [ - "tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(\n", + "tokenizer = keras_hub.tokenizers.WordPieceTokenizer(\n", " vocabulary=vocab,\n", " lowercase=False,\n", " sequence_length=MAX_SEQUENCE_LENGTH,\n", @@ -429,7 +429,7 @@ "## Model\n", "\n", "Let's build a simple Transformer model. We will use `TokenAndPositionEmbedding`\n", - "and `TransformerDecoder` from KerasNLP library. `TokenAndPositionEmbedding`\n", + "and `TransformerDecoder` from KerasHub library. `TokenAndPositionEmbedding`\n", "represents words and their order in a sentence, while `TransformerDecoder`\n", "outputs one vector for each time step of our input sequence. Here, we take the\n", "mean across all time steps and use a feedforward network on top of it to\n", @@ -454,13 +454,13 @@ " dropout=0.1,\n", "):\n", " token_id_input = keras.layers.Input(shape=(None,), dtype=\"int32\", name=\"input_ids\")\n", - " x = keras_nlp.layers.TokenAndPositionEmbedding(\n", + " x = keras_hub.layers.TokenAndPositionEmbedding(\n", " vocabulary_size=vocabulary_size,\n", " sequence_length=max_sequence_length,\n", " embedding_dim=hidden_dim,\n", " )(token_id_input)\n", " x = keras.layers.Dropout(rate=dropout)(x)\n", - " x = keras_nlp.layers.TransformerDecoder(\n", + " x = keras_hub.layers.TransformerDecoder(\n", " intermediate_dim=intermediate_dim,\n", " num_heads=num_heads,\n", " dropout=dropout,\n", diff --git a/examples/keras_recipes/ipynb/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.ipynb b/examples/keras_recipes/ipynb/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.ipynb index f6eff4fa99..497bf88f16 100644 --- a/examples/keras_recipes/ipynb/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.ipynb +++ b/examples/keras_recipes/ipynb/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.ipynb @@ -11,7 +11,7 @@ "**Authors:** [Hongyu Chiu](https://github.com/james77777778), [Abheesht Sharma](https://github.com/abheesht17/), [Matthew Watson](https://github.com/mattdangerw/)
\n", "**Date created:** 2024/08/06
\n", "**Last modified:** 2024/08/06
\n", - "**Description:** Use KerasNLP to fine-tune a Gemma LLM with LoRA and QLoRA." + "**Description:** Use KerasHub to fine-tune a Gemma LLM with LoRA and QLoRA." ] }, { @@ -42,8 +42,8 @@ "extends LoRA to enhance efficiency through quantization techniques without\n", "performance degradation.\n", "\n", - "In this example, we will fine-tune KerasNLP's\n", - "[Gemma model](https://keras.io/api/keras_nlp/models/gemma/) on the next token\n", + "In this example, we will fine-tune KerasHub's\n", + "[Gemma model](https://keras.io/api/keras_hub/models/gemma/) on the next token\n", "prediction task using LoRA and QLoRA.\n", "\n", "Note that this example runs on all backends supported by Keras. TensorFlow is\n", @@ -59,7 +59,7 @@ "## Setup\n", "\n", "Before we start implementing the pipeline, let's install and import all the\n", - "libraries we need. We'll be using the KerasNLP library.\n", + "libraries we need. We'll be using the KerasHub library.\n", "\n", "Secondly, let's set the precision to bfloat16. This will help us reduce the\n", "memory usage and training time.\n", @@ -76,8 +76,8 @@ }, "outputs": [], "source": [ - "# We might need the latest code from Keras and KerasNLP\n", - "!pip install -q git+https://github.com/keras-team/keras.git git+https://github.com/keras-team/keras-nlp.git" + "# We might need the latest code from Keras and KerasHub\n", + "!pip install -q git+https://github.com/keras-team/keras.git git+https://github.com/keras-team/keras-hub.git" ] }, { @@ -98,7 +98,7 @@ "# os.environ[\"KAGGLE_KEY\"] = \"...\"\n", "\n", "import keras\n", - "import keras_nlp\n", + "import keras_hub\n", "import tensorflow as tf\n", "import tensorflow_datasets as tfds\n", "\n", @@ -241,7 +241,7 @@ "source": [ "## Model\n", "\n", - "KerasNLP provides implementations of many popular model architectures.\n", + "KerasHub provides implementations of many popular model architectures.\n", "In this example, we will use `GemmaCausalLM`, an end-to-end Gemma model for\n", "causal language modeling. A causal language model predicts the next token based\n", "on previous tokens.\n", @@ -257,10 +257,10 @@ }, "outputs": [], "source": [ - "preprocessor = keras_nlp.models.GemmaCausalLMPreprocessor.from_preset(\n", + "preprocessor = keras_hub.models.GemmaCausalLMPreprocessor.from_preset(\n", " \"gemma_1.1_instruct_2b_en\", sequence_length=256\n", ")\n", - "gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(\n", + "gemma_lm = keras_hub.models.GemmaCausalLM.from_preset(\n", " \"gemma_1.1_instruct_2b_en\", preprocessor=preprocessor\n", ")\n", "gemma_lm.summary()" @@ -340,7 +340,7 @@ "colab_type": "text" }, "source": [ - "When using KerasNLP, we can enable LoRA with an one-line API:\n", + "When using KerasHub, we can enable LoRA with an one-line API:\n", "`enable_lora(rank=4)`\n", "\n", "From `gemma_lm.summary()`, we can see enabling LoRA reduces the number of\n", @@ -461,7 +461,7 @@ "- No double quantization.\n", "- No Paged optimizer.\n", "\n", - "To enable QLoRA in KerasNLP, follow these steps:\n", + "To enable QLoRA in KerasHub, follow these steps:\n", "\n", "1. Instantiate the model.\n", "2. 
Quantize the weights using dynamic int8 quantization.\n", @@ -481,10 +481,10 @@ }, "outputs": [], "source": [ - "preprocessor = keras_nlp.models.GemmaCausalLMPreprocessor.from_preset(\n", + "preprocessor = keras_hub.models.GemmaCausalLMPreprocessor.from_preset(\n", " \"gemma_1.1_instruct_2b_en\", sequence_length=256\n", ")\n", - "gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset(\n", + "gemma_lm = keras_hub.models.GemmaCausalLM.from_preset(\n", " \"gemma_1.1_instruct_2b_en\", preprocessor=preprocessor\n", ")\n", "gemma_lm.quantize(\"int8\")\n", diff --git a/examples/keras_recipes/md/float8_training_and_inference_with_transformer.md b/examples/keras_recipes/md/float8_training_and_inference_with_transformer.md index 2885f43f80..57b7c551cc 100644 --- a/examples/keras_recipes/md/float8_training_and_inference_with_transformer.md +++ b/examples/keras_recipes/md/float8_training_and_inference_with_transformer.md @@ -48,14 +48,14 @@ performance improvement. --- ## Setup -We will use KerasNLP library to simplify the model implementation. Additionally, +We will use KerasHub library to simplify the model implementation. Additionally, use mixed precision training to reduce the training time. Note: The dependency on TensorFlow is only required for data processing. ```python -!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras-hub !pip install -q --upgrade keras # Upgrade to Keras 3. ``` @@ -69,7 +69,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" import re import keras -import keras_nlp +import keras_hub import tensorflow as tf keras.config.set_dtype_policy("mixed_bfloat16") @@ -177,8 +177,8 @@ Label: 1 ### Tokenizing the data -We'll be using the `keras_nlp.tokenizers.WordPieceTokenizer` layer to tokenize -the text. `keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary +We'll be using the `keras_hub.tokenizers.WordPieceTokenizer` layer to tokenize +the text. `keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary and has functions for tokenizing the text, and detokenizing sequences of tokens. Before we define the tokenizer, we first need to train it on the dataset @@ -186,16 +186,16 @@ we have. The WordPiece tokenization algorithm is a subword tokenization algorithm; training it on a corpus gives us a vocabulary of subwords. A subword tokenizer is a compromise between word tokenizers (word tokenizers need very large vocabularies for good coverage of input words), and character tokenizers -(characters don't really encode meaning like words do). Luckily, KerasNLP +(characters don't really encode meaning like words do). Luckily, KerasHub makes it very simple to train WordPiece on a corpus with the -`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility. +`keras_hub.tokenizers.compute_word_piece_vocabulary` utility. ```python def train_word_piece(ds, vocab_size, reserved_tokens): word_piece_ds = ds.unbatch().map(lambda x, y: x) - vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( + vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( word_piece_ds.batch(1000).prefetch(2), vocabulary_size=vocab_size, reserved_tokens=reserved_tokens, @@ -238,7 +238,7 @@ less than the specified sequence length. Otherwise, the sequence is truncated. ```python -tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=vocab, lowercase=False, sequence_length=MAX_SEQUENCE_LENGTH, @@ -314,7 +314,7 @@ test_ds = make_dataset(test_ds) ## Model Let's build a simple Transformer model. 
We will use `TokenAndPositionEmbedding` -and `TransformerDecoder` from KerasNLP library. `TokenAndPositionEmbedding` +and `TransformerDecoder` from KerasHub library. `TokenAndPositionEmbedding` represents words and their order in a sentence, while `TransformerDecoder` outputs one vector for each time step of our input sequence. Here, we take the mean across all time steps and use a feedforward network on top of it to @@ -332,13 +332,13 @@ def build_model( dropout=0.1, ): token_id_input = keras.layers.Input(shape=(None,), dtype="int32", name="input_ids") - x = keras_nlp.layers.TokenAndPositionEmbedding( + x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=vocabulary_size, sequence_length=max_sequence_length, embedding_dim=hidden_dim, )(token_id_input) x = keras.layers.Dropout(rate=dropout)(x) - x = keras_nlp.layers.TransformerDecoder( + x = keras_hub.layers.TransformerDecoder( intermediate_dim=intermediate_dim, num_heads=num_heads, dropout=dropout, diff --git a/examples/keras_recipes/md/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.md b/examples/keras_recipes/md/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.md index 0136ad4d96..e5ad34fb60 100644 --- a/examples/keras_recipes/md/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.md +++ b/examples/keras_recipes/md/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.md @@ -3,7 +3,7 @@ **Authors:** [Hongyu Chiu](https://github.com/james77777778), [Abheesht Sharma](https://github.com/abheesht17/), [Matthew Watson](https://github.com/mattdangerw/)
**Date created:** 2024/08/06
**Last modified:** 2024/08/06
-**Description:** Use KerasNLP to fine-tune a Gemma LLM with LoRA and QLoRA. +**Description:** Use KerasHub to fine-tune a Gemma LLM with LoRA and QLoRA. [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/keras_recipes/ipynb/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/keras_recipes/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.py) @@ -33,8 +33,8 @@ Furthermore, extends LoRA to enhance efficiency through quantization techniques without performance degradation. -In this example, we will fine-tune KerasNLP's -[Gemma model](https://keras.io/api/keras_nlp/models/gemma/) on the next token +In this example, we will fine-tune KerasHub's +[Gemma model](https://keras.io/api/keras_hub/models/gemma/) on the next token prediction task using LoRA and QLoRA. Note that this example runs on all backends supported by Keras. TensorFlow is @@ -44,7 +44,7 @@ only used for data preprocessing. ## Setup Before we start implementing the pipeline, let's install and import all the -libraries we need. We'll be using the KerasNLP library. +libraries we need. We'll be using the KerasHub library. Secondly, let's set the precision to bfloat16. This will help us reduce the memory usage and training time. @@ -54,8 +54,8 @@ configured to access the Gemma model. ```python -# We might need the latest code from Keras and KerasNLP -!pip install -q git+https://github.com/keras-team/keras.git git+https://github.com/keras-team/keras-nlp.git +# We might need the latest code from Keras and KerasHub +!pip install -q git+https://github.com/keras-team/keras.git git+https://github.com/keras-team/keras-hub.git ``` @@ -70,7 +70,7 @@ os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" # Suppress verbose logging from TF # os.environ["KAGGLE_KEY"] = "..." import keras -import keras_nlp +import keras_hub import tensorflow as tf import tensorflow_datasets as tfds @@ -206,7 +206,7 @@ train_ds = train_ds.batch(1).take(100) --- ## Model -KerasNLP provides implementations of many popular model architectures. +KerasHub provides implementations of many popular model architectures. In this example, we will use `GemmaCausalLM`, an end-to-end Gemma model for causal language modeling. A causal language model predicts the next token based on previous tokens. @@ -215,10 +215,10 @@ Note that `sequence_length` is set to `256` to speed up the fitting. ```python -preprocessor = keras_nlp.models.GemmaCausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GemmaCausalLMPreprocessor.from_preset( "gemma_1.1_instruct_2b_en", sequence_length=256 ) -gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset( +gemma_lm = keras_hub.models.GemmaCausalLM.from_preset( "gemma_1.1_instruct_2b_en", preprocessor=preprocessor ) gemma_lm.summary() @@ -344,7 +344,7 @@ savings happen. - Faster training; and - No additional inference latency. -When using KerasNLP, we can enable LoRA with an one-line API: +When using KerasHub, we can enable LoRA with an one-line API: `enable_lora(rank=4)` From `gemma_lm.summary()`, we can see enabling LoRA reduces the number of @@ -481,7 +481,7 @@ original. The differences are: - No double quantization. - No Paged optimizer. -To enable QLoRA in KerasNLP, follow these steps: +To enable QLoRA in KerasHub, follow these steps: 1. Instantiate the model. 2. Quantize the weights using dynamic int8 quantization. 
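Put together, the recipe condenses to just a few lines. The sketch below is a rough outline rather than a drop-in replacement for the code that follows: it assumes Kaggle credentials are configured for the `gemma_1.1_instruct_2b_en` preset, and that `enable_lora(rank=4)` is exposed on the model's backbone, as in the rest of this example.

```python
# Condensed sketch of the QLoRA-style steps with the renamed keras_hub API.
# Assumes Kaggle access to the Gemma preset; calling `enable_lora` on the
# backbone is an assumption that mirrors the rest of this example.
import keras_hub

preprocessor = keras_hub.models.GemmaCausalLMPreprocessor.from_preset(
    "gemma_1.1_instruct_2b_en", sequence_length=256
)
gemma_lm = keras_hub.models.GemmaCausalLM.from_preset(
    "gemma_1.1_instruct_2b_en", preprocessor=preprocessor
)
gemma_lm.quantize("int8")              # step 2: dynamic int8 quantization
gemma_lm.backbone.enable_lora(rank=4)  # step 3: attach low-rank adapters
```

Training then proceeds with the usual `compile()` and `fit()` calls.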
@@ -494,10 +494,10 @@ Steps 2 and 3 are achieved with one-line APIs: ```python -preprocessor = keras_nlp.models.GemmaCausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GemmaCausalLMPreprocessor.from_preset( "gemma_1.1_instruct_2b_en", sequence_length=256 ) -gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset( +gemma_lm = keras_hub.models.GemmaCausalLM.from_preset( "gemma_1.1_instruct_2b_en", preprocessor=preprocessor ) gemma_lm.quantize("int8") diff --git a/examples/keras_recipes/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.py b/examples/keras_recipes/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.py index a292ffb40b..f56c50f43f 100644 --- a/examples/keras_recipes/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.py +++ b/examples/keras_recipes/parameter_efficient_finetuning_of_gemma_with_lora_and_qlora.py @@ -3,7 +3,7 @@ Authors: [Hongyu Chiu](https://github.com/james77777778), [Abheesht Sharma](https://github.com/abheesht17/), [Matthew Watson](https://github.com/mattdangerw/) Date created: 2024/08/06 Last modified: 2024/08/06 -Description: Use KerasNLP to fine-tune a Gemma LLM with LoRA and QLoRA. +Description: Use KerasHub to fine-tune a Gemma LLM with LoRA and QLoRA. Accelerator: GPU """ @@ -30,8 +30,8 @@ extends LoRA to enhance efficiency through quantization techniques without performance degradation. -In this example, we will fine-tune KerasNLP's -[Gemma model](https://keras.io/api/keras_nlp/models/gemma/) on the next token +In this example, we will fine-tune KerasHub's +[Gemma model](https://keras.io/api/keras_hub/models/gemma/) on the next token prediction task using LoRA and QLoRA. Note that this example runs on all backends supported by Keras. TensorFlow is @@ -42,7 +42,7 @@ ## Setup Before we start implementing the pipeline, let's install and import all the -libraries we need. We'll be using the KerasNLP library. +libraries we need. We'll be using the KerasHub library. Secondly, let's set the precision to bfloat16. This will help us reduce the memory usage and training time. @@ -52,8 +52,8 @@ """ """shell -# We might need the latest code from Keras and KerasNLP -pip install -q git+https://github.com/keras-team/keras.git git+https://github.com/keras-team/keras-nlp.git +# We might need the latest code from Keras and KerasHub +pip install -q git+https://github.com/keras-team/keras.git git+https://github.com/keras-team/keras-hub.git """ import gc @@ -66,7 +66,7 @@ # os.environ["KAGGLE_KEY"] = "..." import keras -import keras_nlp +import keras_hub import tensorflow as tf import tensorflow_datasets as tfds @@ -147,7 +147,7 @@ """ ## Model -KerasNLP provides implementations of many popular model architectures. +KerasHub provides implementations of many popular model architectures. In this example, we will use `GemmaCausalLM`, an end-to-end Gemma model for causal language modeling. A causal language model predicts the next token based on previous tokens. @@ -155,10 +155,10 @@ Note that `sequence_length` is set to `256` to speed up the fitting. 
""" -preprocessor = keras_nlp.models.GemmaCausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GemmaCausalLMPreprocessor.from_preset( "gemma_1.1_instruct_2b_en", sequence_length=256 ) -gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset( +gemma_lm = keras_hub.models.GemmaCausalLM.from_preset( "gemma_1.1_instruct_2b_en", preprocessor=preprocessor ) gemma_lm.summary() @@ -227,7 +227,7 @@ """ """ -When using KerasNLP, we can enable LoRA with an one-line API: +When using KerasHub, we can enable LoRA with an one-line API: `enable_lora(rank=4)` From `gemma_lm.summary()`, we can see enabling LoRA reduces the number of @@ -292,7 +292,7 @@ - No double quantization. - No Paged optimizer. -To enable QLoRA in KerasNLP, follow these steps: +To enable QLoRA in KerasHub, follow these steps: 1. Instantiate the model. 2. Quantize the weights using dynamic int8 quantization. @@ -304,10 +304,10 @@ - `enable_lora(...)` """ -preprocessor = keras_nlp.models.GemmaCausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GemmaCausalLMPreprocessor.from_preset( "gemma_1.1_instruct_2b_en", sequence_length=256 ) -gemma_lm = keras_nlp.models.GemmaCausalLM.from_preset( +gemma_lm = keras_hub.models.GemmaCausalLM.from_preset( "gemma_1.1_instruct_2b_en", preprocessor=preprocessor ) gemma_lm.quantize("int8") diff --git a/examples/nlp/abstractive_summarization_with_bart.py b/examples/nlp/abstractive_summarization_with_bart.py index dfe97ddc3c..b1c2d495a6 100644 --- a/examples/nlp/abstractive_summarization_with_bart.py +++ b/examples/nlp/abstractive_summarization_with_bart.py @@ -3,7 +3,7 @@ Author: [Abheesht Sharma](https://github.com/abheesht17/) Date created: 2023/07/08 Last modified: 2024/03/20 -Description: Use KerasNLP to fine-tune BART on the abstractive summarization task. +Description: Use KerasHub to fine-tune BART on the abstractive summarization task. Accelerator: GPU Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) """ @@ -26,7 +26,7 @@ and train BART to fix the order), etc. In this example, we will demonstrate how to fine-tune BART on the abstractive -summarization task (on conversations!) using KerasNLP, and generate summaries +summarization task (on conversations!) using KerasHub, and generate summaries using the fine-tuned model. """ @@ -34,18 +34,18 @@ ## Setup Before we start implementing the pipeline, let's install and import all the -libraries we need. We'll be using the KerasNLP library. We will also need a +libraries we need. We'll be using the KerasHub library. We will also need a couple of utility libraries. """ """shell -pip install git+https://github.com/keras-team/keras-nlp.git py7zr -q +pip install git+https://github.com/keras-team/keras-hub.git py7zr -q """ """ This examples uses [Keras 3](https://keras.io/keras_3/) to work in any of `"tensorflow"`, `"jax"` or `"torch"`. Support for Keras 3 is baked into -KerasNLP, simply change the `"KERAS_BACKEND"` environment variable to select +KerasHub, simply change the `"KERAS_BACKEND"` environment variable to select the backend of your choice. We select the JAX backend below. """ @@ -60,7 +60,7 @@ import py7zr import time -import keras_nlp +import keras_hub import keras import tensorflow as tf import tensorflow_datasets as tfds @@ -109,7 +109,7 @@ purpose of this example. The dialogue is fed to the encoder, and the corresponding summary serves as input to the decoder. 
We will, therefore, change the format of the dataset to a dictionary having two keys: `"encoder_text"` and -`"decoder_text"`.This is how `keras_nlp.models.BartSeq2SeqLMPreprocessor` +`"decoder_text"`.This is how `keras_hub.models.BartSeq2SeqLMPreprocessor` expects the input format to be. """ @@ -139,12 +139,12 @@ trained to predict the next token. """ -preprocessor = keras_nlp.models.BartSeq2SeqLMPreprocessor.from_preset( +preprocessor = keras_hub.models.BartSeq2SeqLMPreprocessor.from_preset( "bart_base_en", encoder_sequence_length=MAX_ENCODER_SEQUENCE_LENGTH, decoder_sequence_length=MAX_DECODER_SEQUENCE_LENGTH, ) -bart_lm = keras_nlp.models.BartSeq2SeqLM.from_preset( +bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset( "bart_base_en", preprocessor=preprocessor ) @@ -188,7 +188,7 @@ and generate summaries for them. We will use the default decoding strategy, i.e., greedy search. -Generation in KerasNLP is highly optimized. It is backed by the power of XLA. +Generation in KerasHub is highly optimized. It is backed by the power of XLA. Secondly, key/value tensors in the self-attention layer and cross-attention layer in the decoder are cached to avoid recomputation at every timestep. """ diff --git a/examples/nlp/data_parallel_training_with_keras_nlp.py b/examples/nlp/data_parallel_training_with_keras_hub.py similarity index 95% rename from examples/nlp/data_parallel_training_with_keras_nlp.py rename to examples/nlp/data_parallel_training_with_keras_hub.py index bc54e9d92c..88912968e4 100644 --- a/examples/nlp/data_parallel_training_with_keras_nlp.py +++ b/examples/nlp/data_parallel_training_with_keras_hub.py @@ -1,9 +1,9 @@ """ -Title: Data Parallel Training with KerasNLP and tf.distribute +Title: Data Parallel Training with KerasHub and tf.distribute Author: Anshuman Mishra Date created: 2023/07/07 Last modified: 2023/07/07 -Description: Data Parallel training with KerasNLP and tf.distribute. +Description: Data Parallel training with KerasHub and tf.distribute. Accelerator: GPU """ @@ -12,10 +12,10 @@ Distributed training is a technique used to train deep learning models on multiple devices or machines simultaneously. It helps to reduce training time and allows for training larger -models with more data. KerasNLP is a library that provides tools and utilities for natural +models with more data. KerasHub is a library that provides tools and utilities for natural language processing tasks, including distributed training. -In this tutorial, we will use KerasNLP to train a BERT-based masked language model (MLM) +In this tutorial, we will use KerasHub to train a BERT-based masked language model (MLM) on the wikitext-2 dataset (a 2 million word dataset of wikipedia articles). The MLM task involves predicting the masked words in a sentence, which helps the model learn contextual representations of words. @@ -37,7 +37,7 @@ """ """shell -pip install -q --upgrade keras-nlp +pip install -q --upgrade keras-hub pip install -q --upgrade keras # Upgrade to Keras 3. """ @@ -51,7 +51,7 @@ import tensorflow as tf import keras -import keras_nlp +import keras_hub """ Before we start any training, let's configure our single GPU to show up as two logical @@ -197,7 +197,7 @@ def on_epoch_end(self, epoch, logs=None): with strategy.scope(): # Everything that creates variables should be under the strategy scope. # In general this is only model construction & `compile()`. 
- model_dist = keras_nlp.models.BertMaskedLM.from_preset("bert_tiny_en_uncased") + model_dist = keras_hub.models.BertMaskedLM.from_preset("bert_tiny_en_uncased") # This line just sets pooled_dense layer as non-trainiable, we do this to avoid # warnings of this layer being unused diff --git a/examples/nlp/fnet_classification_with_keras_nlp.py b/examples/nlp/fnet_classification_with_keras_hub.py similarity index 91% rename from examples/nlp/fnet_classification_with_keras_nlp.py rename to examples/nlp/fnet_classification_with_keras_hub.py index fb3c2cbf2d..8ca026cdb1 100644 --- a/examples/nlp/fnet_classification_with_keras_nlp.py +++ b/examples/nlp/fnet_classification_with_keras_hub.py @@ -3,7 +3,7 @@ Author: [Abheesht Sharma](https://github.com/abheesht17/) Date created: 2022/06/01 Last modified: 2022/12/21 -Description: Text Classification on the IMDb Dataset using `keras_nlp.layers.FNetEncoder` layer. +Description: Text Classification on the IMDb Dataset using `keras_hub.layers.FNetEncoder` layer. Accelerator: GPU """ @@ -17,7 +17,7 @@ analysis). To build the tokenizer, model, etc., we will use components from -[KerasNLP](https://github.com/keras-team/keras-nlp). KerasNLP makes life easier +[KerasHub](https://github.com/keras-team/keras-hub). KerasHub makes life easier for people who want to build NLP pipelines! :) ### Model @@ -51,11 +51,11 @@ """ """shell -pip install -q --upgrade keras-nlp +pip install -q --upgrade keras-hub pip install -q --upgrade keras # Upgrade to Keras 3. """ -import keras_nlp +import keras_hub import keras import tensorflow as tf import os @@ -144,8 +144,8 @@ """ ### Tokenizing the data -We'll be using the `keras_nlp.tokenizers.WordPieceTokenizer` layer to tokenize -the text. `keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary +We'll be using the `keras_hub.tokenizers.WordPieceTokenizer` layer to tokenize +the text. `keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary and has functions for tokenizing the text, and detokenizing sequences of tokens. Before we define the tokenizer, we first need to train it on the dataset @@ -153,9 +153,9 @@ training it on a corpus gives us a vocabulary of subwords. A subword tokenizer is a compromise between word tokenizers (word tokenizers need very large vocabularies for good coverage of input words), and character tokenizers -(characters don't really encode meaning like words do). Luckily, KerasNLP +(characters don't really encode meaning like words do). Luckily, KerasHub makes it very simple to train WordPiece on a corpus with the -`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility. +`keras_hub.tokenizers.compute_word_piece_vocabulary` utility. Note: The official implementation of FNet uses the SentencePiece Tokenizer. """ @@ -163,7 +163,7 @@ def train_word_piece(ds, vocab_size, reserved_tokens): word_piece_ds = ds.unbatch().map(lambda x, y: x) - vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( + vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( word_piece_ds.batch(1000).prefetch(2), vocabulary_size=vocab_size, reserved_tokens=reserved_tokens, @@ -193,7 +193,7 @@ def train_word_piece(ds, vocab_size, reserved_tokens): all sequences are padded to the same length, if the length of the sequence is less than the specified sequence length. Otherwise, the sequence is truncated. 
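To make the padding and truncation behaviour concrete, here is a tiny, self-contained sketch with a made-up vocabulary and sentence; since `"[PAD]"` sits at index 0, the trailing zeros in the output are padding.

```python
# Toy illustration of `sequence_length` padding. The vocabulary and sentence
# are made up; "[PAD]" is index 0, so the trailing zeros below are padding.
import keras_hub

toy_vocab = ["[PAD]", "[UNK]", "the", "quick", "brown", "fox"]
toy_tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
    vocabulary=toy_vocab,
    lowercase=True,
    sequence_length=8,
)
print(toy_tokenizer(["the quick brown fox"]))  # e.g. [[2, 3, 4, 5, 0, 0, 0, 0]]
```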
""" -tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=vocab, lowercase=False, sequence_length=MAX_SEQUENCE_LENGTH, @@ -241,11 +241,11 @@ def make_dataset(dataset): We first need an embedding layer, i.e., a layer that maps every token in the input sequence to a vector. This embedding layer can be initialised randomly. We also need a positional embedding layer which encodes the word order in the sequence. -The convention is to add, i.e., sum, these two embeddings. KerasNLP has a -`keras_nlp.layers.TokenAndPositionEmbedding ` layer which does all of the above +The convention is to add, i.e., sum, these two embeddings. KerasHub has a +`keras_hub.layers.TokenAndPositionEmbedding ` layer which does all of the above steps for us. -Our FNet classification model consists of three `keras_nlp.layers.FNetEncoder` +Our FNet classification model consists of three `keras_hub.layers.FNetEncoder` layers with a `keras.layers.Dense` layer on top. Note: For FNet, masking the padding tokens has a minimal effect on results. In the @@ -254,16 +254,16 @@ def make_dataset(dataset): input_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids") -x = keras_nlp.layers.TokenAndPositionEmbedding( +x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM, mask_zero=True, )(input_ids) -x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) -x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) -x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) x = keras.layers.GlobalAveragePooling1D()(x) @@ -309,20 +309,20 @@ def make_dataset(dataset): input_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids") -x = keras_nlp.layers.TokenAndPositionEmbedding( +x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM, mask_zero=True, )(input_ids) -x = keras_nlp.layers.TransformerEncoder( +x = keras_hub.layers.TransformerEncoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(inputs=x) -x = keras_nlp.layers.TransformerEncoder( +x = keras_hub.layers.TransformerEncoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(inputs=x) -x = keras_nlp.layers.TransformerEncoder( +x = keras_hub.layers.TransformerEncoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(inputs=x) diff --git a/examples/nlp/ipynb/abstractive_summarization_with_bart.ipynb b/examples/nlp/ipynb/abstractive_summarization_with_bart.ipynb index 1d6a6ecc27..0c70e70df5 100644 --- a/examples/nlp/ipynb/abstractive_summarization_with_bart.ipynb +++ b/examples/nlp/ipynb/abstractive_summarization_with_bart.ipynb @@ -11,7 +11,7 @@ "**Author:** [Abheesht Sharma](https://github.com/abheesht17/)
\n", "**Date created:** 2023/07/08
\n", "**Last modified:** 2024/03/20
\n", - "**Description:** Use KerasNLP to fine-tune BART on the abstractive summarization task." + "**Description:** Use KerasHub to fine-tune BART on the abstractive summarization task." ] }, { @@ -37,7 +37,7 @@ "and train BART to fix the order), etc.\n", "\n", "In this example, we will demonstrate how to fine-tune BART on the abstractive\n", - "summarization task (on conversations!) using KerasNLP, and generate summaries\n", + "summarization task (on conversations!) using KerasHub, and generate summaries\n", "using the fine-tuned model." ] }, @@ -50,7 +50,7 @@ "## Setup\n", "\n", "Before we start implementing the pipeline, let's install and import all the\n", - "libraries we need. We'll be using the KerasNLP library. We will also need a\n", + "libraries we need. We'll be using the KerasHub library. We will also need a\n", "couple of utility libraries." ] }, @@ -62,7 +62,7 @@ }, "outputs": [], "source": [ - "!pip install git+https://github.com/keras-team/keras-nlp.git py7zr -q" + "!pip install git+https://github.com/keras-team/keras-hub.git py7zr -q" ] }, { @@ -73,7 +73,7 @@ "source": [ "This examples uses [Keras 3](https://keras.io/keras_3/) to work in any of\n", "`\"tensorflow\"`, `\"jax\"` or `\"torch\"`. Support for Keras 3 is baked into\n", - "KerasNLP, simply change the `\"KERAS_BACKEND\"` environment variable to select\n", + "KerasHub, simply change the `\"KERAS_BACKEND\"` environment variable to select\n", "the backend of your choice. We select the JAX backend below." ] }, @@ -110,7 +110,7 @@ "import py7zr\n", "import time\n", "\n", - "import keras_nlp\n", + "import keras_hub\n", "import keras\n", "import tensorflow as tf\n", "import tensorflow_datasets as tfds" @@ -208,7 +208,7 @@ "purpose of this example. The dialogue is fed to the encoder, and the\n", "corresponding summary serves as input to the decoder. We will, therefore, change\n", "the format of the dataset to a dictionary having two keys: `\"encoder_text\"` and\n", - "`\"decoder_text\"`.This is how `keras_nlp.models.BartSeq2SeqLMPreprocessor`\n", + "`\"decoder_text\"`.This is how `keras_hub.models.BartSeq2SeqLMPreprocessor`\n", "expects the input format to be." ] }, @@ -260,12 +260,12 @@ }, "outputs": [], "source": [ - "preprocessor = keras_nlp.models.BartSeq2SeqLMPreprocessor.from_preset(\n", + "preprocessor = keras_hub.models.BartSeq2SeqLMPreprocessor.from_preset(\n", " \"bart_base_en\",\n", " encoder_sequence_length=MAX_ENCODER_SEQUENCE_LENGTH,\n", " decoder_sequence_length=MAX_DECODER_SEQUENCE_LENGTH,\n", ")\n", - "bart_lm = keras_nlp.models.BartSeq2SeqLM.from_preset(\n", + "bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset(\n", " \"bart_base_en\", preprocessor=preprocessor\n", ")\n", "\n", @@ -343,7 +343,7 @@ "and generate summaries for them. We will use the default decoding strategy, i.e.,\n", "greedy search.\n", "\n", - "Generation in KerasNLP is highly optimized. It is backed by the power of XLA.\n", + "Generation in KerasHub is highly optimized. It is backed by the power of XLA.\n", "Secondly, key/value tensors in the self-attention layer and cross-attention layer\n", "in the decoder are cached to avoid recomputation at every timestep." 
] diff --git a/examples/nlp/ipynb/data_parallel_training_with_keras_nlp.ipynb b/examples/nlp/ipynb/data_parallel_training_with_keras_hub.ipynb similarity index 96% rename from examples/nlp/ipynb/data_parallel_training_with_keras_nlp.ipynb rename to examples/nlp/ipynb/data_parallel_training_with_keras_hub.ipynb index 7b81df49e9..9faa8a2b6c 100644 --- a/examples/nlp/ipynb/data_parallel_training_with_keras_nlp.ipynb +++ b/examples/nlp/ipynb/data_parallel_training_with_keras_hub.ipynb @@ -6,12 +6,12 @@ "colab_type": "text" }, "source": [ - "# Data Parallel Training with KerasNLP and tf.distribute\n", + "# Data Parallel Training with KerasHub and tf.distribute\n", "\n", "**Author:** Anshuman Mishra
\n", "**Date created:** 2023/07/07
\n", "**Last modified:** 2023/07/07
\n", - "**Description:** Data Parallel training with KerasNLP and tf.distribute." + "**Description:** Data Parallel training with KerasHub and tf.distribute." ] }, { @@ -24,10 +24,10 @@ "\n", "Distributed training is a technique used to train deep learning models on multiple devices\n", "or machines simultaneously. It helps to reduce training time and allows for training larger\n", - "models with more data. KerasNLP is a library that provides tools and utilities for natural\n", + "models with more data. KerasHub is a library that provides tools and utilities for natural\n", "language processing tasks, including distributed training.\n", "\n", - "In this tutorial, we will use KerasNLP to train a BERT-based masked language model (MLM)\n", + "In this tutorial, we will use KerasHub to train a BERT-based masked language model (MLM)\n", "on the wikitext-2 dataset (a 2 million word dataset of wikipedia articles). The MLM task\n", "involves predicting the masked words in a sentence, which helps the model learn contextual\n", "representations of words.\n", @@ -56,7 +56,7 @@ }, "outputs": [], "source": [ - "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras-hub\n", "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, @@ -83,7 +83,7 @@ "\n", "import tensorflow as tf\n", "import keras\n", - "import keras_nlp" + "import keras_hub" ] }, { @@ -352,7 +352,7 @@ "with strategy.scope():\n", " # Everything that creates variables should be under the strategy scope.\n", " # In general this is only model construction & `compile()`.\n", - " model_dist = keras_nlp.models.BertMaskedLM.from_preset(\"bert_tiny_en_uncased\")\n", + " model_dist = keras_hub.models.BertMaskedLM.from_preset(\"bert_tiny_en_uncased\")\n", "\n", " # This line just sets pooled_dense layer as non-trainiable, we do this to avoid\n", " # warnings of this layer being unused\n", @@ -421,7 +421,7 @@ "accelerator": "GPU", "colab": { "collapsed_sections": [], - "name": "data_parallel_training_with_keras_nlp", + "name": "data_parallel_training_with_keras_hub", "private_outputs": false, "provenance": [], "toc_visible": true diff --git a/examples/nlp/ipynb/fnet_classification_with_keras_nlp.ipynb b/examples/nlp/ipynb/fnet_classification_with_keras_hub.ipynb similarity index 94% rename from examples/nlp/ipynb/fnet_classification_with_keras_nlp.ipynb rename to examples/nlp/ipynb/fnet_classification_with_keras_hub.ipynb index ca757d5dbd..465b94bc17 100644 --- a/examples/nlp/ipynb/fnet_classification_with_keras_nlp.ipynb +++ b/examples/nlp/ipynb/fnet_classification_with_keras_hub.ipynb @@ -11,7 +11,7 @@ "**Author:** [Abheesht Sharma](https://github.com/abheesht17/)
\n", "**Date created:** 2022/06/01
\n", "**Last modified:** 2022/12/21
\n", - "**Description:** Text Classification on the IMDb Dataset using `keras_nlp.layers.FNetEncoder` layer." + "**Description:** Text Classification on the IMDb Dataset using `keras_hub.layers.FNetEncoder` layer." ] }, { @@ -29,7 +29,7 @@ "analysis).\n", "\n", "To build the tokenizer, model, etc., we will use components from\n", - "[KerasNLP](https://github.com/keras-team/keras-nlp). KerasNLP makes life easier\n", + "[KerasHub](https://github.com/keras-team/keras-hub). KerasHub makes life easier\n", "for people who want to build NLP pipelines! :)\n", "\n", "### Model\n", @@ -75,7 +75,7 @@ }, "outputs": [], "source": [ - "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras-hub\n", "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, @@ -87,7 +87,7 @@ }, "outputs": [], "source": [ - "import keras_nlp\n", + "import keras_hub\n", "import keras\n", "import tensorflow as tf\n", "import os\n", @@ -279,8 +279,8 @@ "source": [ "### Tokenizing the data\n", "\n", - "We'll be using the `keras_nlp.tokenizers.WordPieceTokenizer` layer to tokenize\n", - "the text. `keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary\n", + "We'll be using the `keras_hub.tokenizers.WordPieceTokenizer` layer to tokenize\n", + "the text. `keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary\n", "and has functions for tokenizing the text, and detokenizing sequences of tokens.\n", "\n", "Before we define the tokenizer, we first need to train it on the dataset\n", @@ -288,9 +288,9 @@ "training it on a corpus gives us a vocabulary of subwords. A subword tokenizer\n", "is a compromise between word tokenizers (word tokenizers need very large\n", "vocabularies for good coverage of input words), and character tokenizers\n", - "(characters don't really encode meaning like words do). Luckily, KerasNLP\n", + "(characters don't really encode meaning like words do). Luckily, KerasHub\n", "makes it very simple to train WordPiece on a corpus with the\n", - "`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility.\n", + "`keras_hub.tokenizers.compute_word_piece_vocabulary` utility.\n", "\n", "Note: The official implementation of FNet uses the SentencePiece Tokenizer." ] @@ -306,7 +306,7 @@ "\n", "def train_word_piece(ds, vocab_size, reserved_tokens):\n", " word_piece_ds = ds.unbatch().map(lambda x, y: x)\n", - " vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(\n", + " vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(\n", " word_piece_ds.batch(1000).prefetch(2),\n", " vocabulary_size=vocab_size,\n", " reserved_tokens=reserved_tokens,\n", @@ -381,7 +381,7 @@ }, "outputs": [], "source": [ - "tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(\n", + "tokenizer = keras_hub.tokenizers.WordPieceTokenizer(\n", " vocabulary=vocab,\n", " lowercase=False,\n", " sequence_length=MAX_SEQUENCE_LENGTH,\n", @@ -464,11 +464,11 @@ "We first need an embedding layer, i.e., a layer that maps every token in the input\n", "sequence to a vector. This embedding layer can be initialised randomly. We also\n", "need a positional embedding layer which encodes the word order in the sequence.\n", - "The convention is to add, i.e., sum, these two embeddings. KerasNLP has a\n", - "`keras_nlp.layers.TokenAndPositionEmbedding ` layer which does all of the above\n", + "The convention is to add, i.e., sum, these two embeddings. 
KerasHub has a\n", + "`keras_hub.layers.TokenAndPositionEmbedding ` layer which does all of the above\n", "steps for us.\n", "\n", - "Our FNet classification model consists of three `keras_nlp.layers.FNetEncoder`\n", + "Our FNet classification model consists of three `keras_hub.layers.FNetEncoder`\n", "layers with a `keras.layers.Dense` layer on top.\n", "\n", "Note: For FNet, masking the padding tokens has a minimal effect on results. In the\n", @@ -485,16 +485,16 @@ "source": [ "input_ids = keras.Input(shape=(None,), dtype=\"int64\", name=\"input_ids\")\n", "\n", - "x = keras_nlp.layers.TokenAndPositionEmbedding(\n", + "x = keras_hub.layers.TokenAndPositionEmbedding(\n", " vocabulary_size=VOCAB_SIZE,\n", " sequence_length=MAX_SEQUENCE_LENGTH,\n", " embedding_dim=EMBED_DIM,\n", " mask_zero=True,\n", ")(input_ids)\n", "\n", - "x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)\n", - "x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)\n", - "x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)\n", + "x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)\n", + "x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)\n", + "x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x)\n", "\n", "\n", "x = keras.layers.GlobalAveragePooling1D()(x)\n", @@ -585,20 +585,20 @@ "input_ids = keras.Input(shape=(None,), dtype=\"int64\", name=\"input_ids\")\n", "\n", "\n", - "x = keras_nlp.layers.TokenAndPositionEmbedding(\n", + "x = keras_hub.layers.TokenAndPositionEmbedding(\n", " vocabulary_size=VOCAB_SIZE,\n", " sequence_length=MAX_SEQUENCE_LENGTH,\n", " embedding_dim=EMBED_DIM,\n", " mask_zero=True,\n", ")(input_ids)\n", "\n", - "x = keras_nlp.layers.TransformerEncoder(\n", + "x = keras_hub.layers.TransformerEncoder(\n", " intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS\n", ")(inputs=x)\n", - "x = keras_nlp.layers.TransformerEncoder(\n", + "x = keras_hub.layers.TransformerEncoder(\n", " intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS\n", ")(inputs=x)\n", - "x = keras_nlp.layers.TransformerEncoder(\n", + "x = keras_hub.layers.TransformerEncoder(\n", " intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS\n", ")(inputs=x)\n", "\n", @@ -667,7 +667,7 @@ "accelerator": "GPU", "colab": { "collapsed_sections": [], - "name": "fnet_classification_with_keras_nlp", + "name": "fnet_classification_with_keras_hub", "private_outputs": false, "provenance": [], "toc_visible": true diff --git a/examples/nlp/ipynb/masked_language_modeling.ipynb b/examples/nlp/ipynb/masked_language_modeling.ipynb index ae87e9ee99..977f9f16c0 100644 --- a/examples/nlp/ipynb/masked_language_modeling.ipynb +++ b/examples/nlp/ipynb/masked_language_modeling.ipynb @@ -71,7 +71,7 @@ "import os\n", "\n", "os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n", - "import keras_nlp\n", + "import keras_hub\n", "import keras\n", "import tensorflow as tf\n", "from keras import layers\n", @@ -429,7 +429,7 @@ " word_embeddings = layers.Embedding(\n", " config.VOCAB_SIZE, config.EMBED_DIM, name=\"word_embedding\"\n", " )(inputs)\n", - " position_embeddings = keras_nlp.layers.PositionEmbedding(\n", + " position_embeddings = keras_hub.layers.PositionEmbedding(\n", " sequence_length=config.MAX_LEN\n", " )(word_embeddings)\n", " embeddings = word_embeddings + position_embeddings\n", diff --git a/examples/nlp/ipynb/multiple_choice_task_with_transfer_learning.ipynb 
b/examples/nlp/ipynb/multiple_choice_task_with_transfer_learning.ipynb index 1cb23824be..b97d00434c 100644 --- a/examples/nlp/ipynb/multiple_choice_task_with_transfer_learning.ipynb +++ b/examples/nlp/ipynb/multiple_choice_task_with_transfer_learning.ipynb @@ -45,7 +45,7 @@ }, "outputs": [], "source": [ - "import keras_nlp\n", + "import keras_hub\n", "import keras\n", "import tensorflow as tf # For tf.data only.\n", "\n", @@ -257,9 +257,9 @@ "making subsequent steps smoother.\n", "\n", "Explore the following pages to access the available preprocessing and tokenizer layers in\n", - "**KerasNLP**:\n", - "- [Preprocessing](https://keras.io/api/keras_nlp/preprocessing_layers/)\n", - "- [Tokenizers](https://keras.io/api/keras_nlp/tokenizers/)" + "**KerasHub**:\n", + "- [Preprocessing](https://keras.io/api/keras_hub/preprocessing_layers/)\n", + "- [Tokenizers](https://keras.io/api/keras_hub/tokenizers/)" ] }, { @@ -270,7 +270,7 @@ }, "outputs": [], "source": [ - "preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset(\n", + "preprocessor = keras_hub.models.DebertaV3Preprocessor.from_preset(\n", " preset=CFG.preset, # Name of the model\n", " sequence_length=CFG.sequence_length, # Max sequence length, will be padded if shorter\n", ")" @@ -615,15 +615,15 @@ "source": [ "### Pre-trained Models\n", "\n", - "The `KerasNLP` library provides comprehensive, ready-to-use implementations of popular\n", + "The `KerasHub` library provides comprehensive, ready-to-use implementations of popular\n", "NLP model architectures. It features a variety of pre-trained models including `Bert`,\n", "`Roberta`, `DebertaV3`, and more. In this notebook, we'll showcase the usage of\n", - "`DistillBert`. However, feel free to explore all available models in the [KerasNLP\n", - "documentation](https://keras.io/api/keras_nlp/models/). Also for a deeper understanding\n", - "of `KerasNLP`, refer to the informative [getting started\n", - "guide](https://keras.io/guides/keras_nlp/getting_started/).\n", + "`DistillBert`. However, feel free to explore all available models in the [KerasHub\n", + "documentation](https://keras.io/api/keras_hub/models/). Also for a deeper understanding\n", + "of `KerasHub`, refer to the informative [getting started\n", + "guide](https://keras.io/guides/keras_hub/getting_started/).\n", "\n", - "Our approach involves using `keras_nlp.models.XXClassifier` to process each question and\n", + "Our approach involves using `keras_hub.models.XXClassifier` to process each question and\n", "option pari (e.g. (Q+A), (Q+B), etc.), generating logits. These logits are then combined\n", "and passed through a softmax function to produce the final output." 
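As a concrete picture of that last step, the snippet below stacks five dummy per-option logits and normalizes them with a softmax; the tensors and shapes are illustrative stand-ins rather than real classifier outputs.

```python
# Illustrative only: five (batch, 1) logit tensors, one per answer option,
# concatenated into shape (batch, 5) and turned into a probability distribution.
from keras import ops

option_logits = [ops.ones((8, 1)) for _ in range(5)]  # stand-ins for classifier outputs
stacked = ops.concatenate(option_logits, axis=-1)     # shape: (8, 5)
probabilities = ops.softmax(stacked, axis=-1)         # each row sums to 1
print(probabilities.shape)
```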
] @@ -708,7 +708,7 @@ " ),\n", " }\n", " # Create a DebertaV3Classifier model\n", - " classifier = keras_nlp.models.DebertaV3Classifier.from_preset(\n", + " classifier = keras_hub.models.DebertaV3Classifier.from_preset(\n", " CFG.preset,\n", " preprocessor=None,\n", " num_classes=1, # one output per one option, for five options total 5 outputs\n", @@ -862,7 +862,7 @@ "## Reference\n", "* [Multiple Choice with\n", "HF](https://twitter.com/johnowhitaker/status/1689790373454041089?s=20)\n", - "* [Keras NLP](https://keras.io/api/keras_nlp/)\n", + "* [Keras NLP](https://keras.io/api/keras_hub/)\n", "* [BirdCLEF23: Pretraining is All you Need\n", "[Train]](https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train)\n", "[Train]](https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train)\n", diff --git a/examples/nlp/ipynb/neural_machine_translation_with_keras_nlp.ipynb b/examples/nlp/ipynb/neural_machine_translation_with_keras_hub.ipynb similarity index 90% rename from examples/nlp/ipynb/neural_machine_translation_with_keras_nlp.ipynb rename to examples/nlp/ipynb/neural_machine_translation_with_keras_hub.ipynb index 326bbab9d9..3c5c4fa2a9 100644 --- a/examples/nlp/ipynb/neural_machine_translation_with_keras_nlp.ipynb +++ b/examples/nlp/ipynb/neural_machine_translation_with_keras_hub.ipynb @@ -6,12 +6,12 @@ "colab_type": "text" }, "source": [ - "# English-to-Spanish translation with KerasNLP\n", + "# English-to-Spanish translation with KerasHub\n", "\n", "**Author:** [Abheesht Sharma](https://github.com/abheesht17/)
\n", "**Date created:** 2022/05/26
\n", "**Last modified:** 2024/04/30
\n", - "**Description:** Use KerasNLP to train a sequence-to-sequence Transformer model on the machine translation task." + "**Description:** Use KerasHub to train a sequence-to-sequence Transformer model on the machine translation task." ] }, { @@ -22,30 +22,30 @@ "source": [ "## Introduction\n", "\n", - "KerasNLP provides building blocks for NLP (model layers, tokenizers, metrics, etc.) and\n", + "KerasHub provides building blocks for NLP (model layers, tokenizers, metrics, etc.) and\n", "makes it convenient to construct NLP pipelines.\n", "\n", - "In this example, we'll use KerasNLP layers to build an encoder-decoder Transformer\n", + "In this example, we'll use KerasHub layers to build an encoder-decoder Transformer\n", "model, and train it on the English-to-Spanish machine translation task.\n", "\n", "This example is based on the\n", "[English-to-Spanish NMT\n", "example](https://keras.io/examples/nlp/neural_machine_translation_with_transformer/)\n", "by [fchollet](https://twitter.com/fchollet). The original example is more low-level\n", - "and implements layers from scratch, whereas this example uses KerasNLP to show\n", + "and implements layers from scratch, whereas this example uses KerasHub to show\n", "some more advanced approaches, such as subword tokenization and using metrics\n", "to compute the quality of generated translations.\n", "\n", "You'll learn how to:\n", "\n", - "- Tokenize text using `keras_nlp.tokenizers.WordPieceTokenizer`.\n", - "- Implement a sequence-to-sequence Transformer model using KerasNLP's\n", - "`keras_nlp.layers.TransformerEncoder`, `keras_nlp.layers.TransformerDecoder` and\n", - "`keras_nlp.layers.TokenAndPositionEmbedding` layers, and train it.\n", - "- Use `keras_nlp.samplers` to generate translations of unseen input sentences\n", + "- Tokenize text using `keras_hub.tokenizers.WordPieceTokenizer`.\n", + "- Implement a sequence-to-sequence Transformer model using KerasHub's\n", + "`keras_hub.layers.TransformerEncoder`, `keras_hub.layers.TransformerDecoder` and\n", + "`keras_hub.layers.TokenAndPositionEmbedding` layers, and train it.\n", + "- Use `keras_hub.samplers` to generate translations of unseen input sentences\n", " using the top-p decoding strategy!\n", "\n", - "Don't worry if you aren't familiar with KerasNLP. This tutorial will start with\n", + "Don't worry if you aren't familiar with KerasHub. This tutorial will start with\n", "the basics. Let's dive right in!" ] }, @@ -69,7 +69,7 @@ "outputs": [], "source": [ "!pip install -q --upgrade rouge-score\n", - "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras-hub\n", "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, @@ -81,7 +81,7 @@ }, "outputs": [], "source": [ - "import keras_nlp\n", + "import keras_hub\n", "import pathlib\n", "import random\n", "\n", @@ -244,8 +244,8 @@ "\n", "We'll define two tokenizers - one for the source language (English), and the other\n", "for the target language (Spanish). 
We'll be using\n", - "`keras_nlp.tokenizers.WordPieceTokenizer` to tokenize the text.\n", - "`keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary\n", + "`keras_hub.tokenizers.WordPieceTokenizer` to tokenize the text.\n", + "`keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary\n", "and has functions for tokenizing the text, and detokenizing sequences of tokens.\n", "\n", "Before we define the two tokenizers, we first need to train them on the dataset\n", @@ -253,9 +253,9 @@ "training it on a corpus gives us a vocabulary of subwords. A subword tokenizer\n", "is a compromise between word tokenizers (word tokenizers need very large\n", "vocabularies for good coverage of input words), and character tokenizers\n", - "(characters don't really encode meaning like words do). Luckily, KerasNLP\n", + "(characters don't really encode meaning like words do). Luckily, KerasHub\n", "makes it very simple to train WordPiece on a corpus with the\n", - "`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility." + "`keras_hub.tokenizers.compute_word_piece_vocabulary` utility." ] }, { @@ -269,7 +269,7 @@ "\n", "def train_word_piece(text_samples, vocab_size, reserved_tokens):\n", " word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples)\n", - " vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(\n", + " vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(\n", " word_piece_ds.batch(1000).prefetch(2),\n", " vocabulary_size=vocab_size,\n", " reserved_tokens=reserved_tokens,\n", @@ -349,10 +349,10 @@ }, "outputs": [], "source": [ - "eng_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(\n", + "eng_tokenizer = keras_hub.tokenizers.WordPieceTokenizer(\n", " vocabulary=eng_vocab, lowercase=False\n", ")\n", - "spa_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(\n", + "spa_tokenizer = keras_hub.tokenizers.WordPieceTokenizer(\n", " vocabulary=spa_vocab, lowercase=False\n", ")" ] @@ -422,7 +422,7 @@ "\n", "We will add special tokens, `\"[START]\"` and `\"[END]\"`, to the input Spanish\n", "sentence after tokenizing the text. We will also pad the input to a fixed length.\n", - "This can be easily done using `keras_nlp.layers.StartEndPacker`." + "This can be easily done using `keras_hub.layers.StartEndPacker`." ] }, { @@ -441,14 +441,14 @@ " spa = spa_tokenizer(spa)\n", "\n", " # Pad `eng` to `MAX_SEQUENCE_LENGTH`.\n", - " eng_start_end_packer = keras_nlp.layers.StartEndPacker(\n", + " eng_start_end_packer = keras_hub.layers.StartEndPacker(\n", " sequence_length=MAX_SEQUENCE_LENGTH,\n", " pad_value=eng_tokenizer.token_to_id(\"[PAD]\"),\n", " )\n", " eng = eng_start_end_packer(eng)\n", "\n", " # Add special tokens (`\"[START]\"` and `\"[END]\"`) to `spa` and pad it as well.\n", - " spa_start_end_packer = keras_nlp.layers.StartEndPacker(\n", + " spa_start_end_packer = keras_hub.layers.StartEndPacker(\n", " sequence_length=MAX_SEQUENCE_LENGTH + 1,\n", " start_value=spa_tokenizer.token_to_id(\"[START]\"),\n", " end_value=spa_tokenizer.token_to_id(\"[END]\"),\n", @@ -516,27 +516,27 @@ "We first need an embedding layer, i.e., a vector for every token in our input sequence.\n", "This embedding layer can be initialised randomly. We also need a positional\n", "embedding layer which encodes the word order in the sequence. The convention is\n", - "to add these two embeddings. KerasNLP has a `keras_nlp.layers.TokenAndPositionEmbedding `\n", + "to add these two embeddings. 
KerasHub has a `keras_hub.layers.TokenAndPositionEmbedding `\n", "layer which does all of the above steps for us.\n", "\n", - "Our sequence-to-sequence Transformer consists of a `keras_nlp.layers.TransformerEncoder`\n", - "layer and a `keras_nlp.layers.TransformerDecoder` layer chained together.\n", + "Our sequence-to-sequence Transformer consists of a `keras_hub.layers.TransformerEncoder`\n", + "layer and a `keras_hub.layers.TransformerDecoder` layer chained together.\n", "\n", - "The source sequence will be passed to `keras_nlp.layers.TransformerEncoder`, which\n", + "The source sequence will be passed to `keras_hub.layers.TransformerEncoder`, which\n", "will produce a new representation of it. This new representation will then be passed\n", - "to the `keras_nlp.layers.TransformerDecoder`, together with the target sequence\n", - "so far (target words 0 to N). The `keras_nlp.layers.TransformerDecoder` will\n", + "to the `keras_hub.layers.TransformerDecoder`, together with the target sequence\n", + "so far (target words 0 to N). The `keras_hub.layers.TransformerDecoder` will\n", "then seek to predict the next words in the target sequence (N+1 and beyond).\n", "\n", "A key detail that makes this possible is causal masking.\n", - "The `keras_nlp.layers.TransformerDecoder` sees the entire sequence at once, and\n", + "The `keras_hub.layers.TransformerDecoder` sees the entire sequence at once, and\n", "thus we must make sure that it only uses information from target tokens 0 to N\n", "when predicting token N+1 (otherwise, it could use information from the future,\n", "which would result in a model that cannot be used at inference time). Causal masking\n", - "is enabled by default in `keras_nlp.layers.TransformerDecoder`.\n", + "is enabled by default in `keras_hub.layers.TransformerDecoder`.\n", "\n", "We also need to mask the padding tokens (`\"[PAD]\"`). For this, we can set the\n", - "`mask_zero` argument of the `keras_nlp.layers.TokenAndPositionEmbedding` layer\n", + "`mask_zero` argument of the `keras_hub.layers.TokenAndPositionEmbedding` layer\n", "to True. This will then be propagated to all subsequent layers." ] }, @@ -551,13 +551,13 @@ "# Encoder\n", "encoder_inputs = keras.Input(shape=(None,), name=\"encoder_inputs\")\n", "\n", - "x = keras_nlp.layers.TokenAndPositionEmbedding(\n", + "x = keras_hub.layers.TokenAndPositionEmbedding(\n", " vocabulary_size=ENG_VOCAB_SIZE,\n", " sequence_length=MAX_SEQUENCE_LENGTH,\n", " embedding_dim=EMBED_DIM,\n", ")(encoder_inputs)\n", "\n", - "encoder_outputs = keras_nlp.layers.TransformerEncoder(\n", + "encoder_outputs = keras_hub.layers.TransformerEncoder(\n", " intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS\n", ")(inputs=x)\n", "encoder = keras.Model(encoder_inputs, encoder_outputs)\n", @@ -567,13 +567,13 @@ "decoder_inputs = keras.Input(shape=(None,), name=\"decoder_inputs\")\n", "encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name=\"decoder_state_inputs\")\n", "\n", - "x = keras_nlp.layers.TokenAndPositionEmbedding(\n", + "x = keras_hub.layers.TokenAndPositionEmbedding(\n", " vocabulary_size=SPA_VOCAB_SIZE,\n", " sequence_length=MAX_SEQUENCE_LENGTH,\n", " embedding_dim=EMBED_DIM,\n", ")(decoder_inputs)\n", "\n", - "x = keras_nlp.layers.TransformerDecoder(\n", + "x = keras_hub.layers.TransformerDecoder(\n", " intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS\n", ")(decoder_sequence=x, encoder_sequence=encoded_seq_inputs)\n", "x = keras.layers.Dropout(0.5)(x)\n", @@ -641,8 +641,8 @@ "next token. 
We then we repeatedly generated the next token conditioned on the\n", "tokens generated so far, until we hit the token `\"[END]\"`.\n", "\n", - "For decoding, we will use the `keras_nlp.samplers` module from\n", - "KerasNLP. Greedy Decoding is a text decoding method which outputs the most\n", + "For decoding, we will use the `keras_hub.samplers` module from\n", + "KerasHub. Greedy Decoding is a text decoding method which outputs the most\n", "likely next token at each time step, i.e., the token with the highest probability." ] }, @@ -680,7 +680,7 @@ " pad = ops.full((batch_size, length - 1), spa_tokenizer.token_to_id(\"[PAD]\"))\n", " prompt = ops.concatenate((start, pad), axis=-1)\n", "\n", - " generated_tokens = keras_nlp.samplers.GreedySampler()(\n", + " generated_tokens = keras_hub.samplers.GreedySampler()(\n", " next,\n", " prompt,\n", " stop_token_ids=[spa_tokenizer.token_to_id(\"[END]\")],\n", @@ -733,8 +733,8 @@ }, "outputs": [], "source": [ - "rouge_1 = keras_nlp.metrics.RougeN(order=1)\n", - "rouge_2 = keras_nlp.metrics.RougeN(order=2)\n", + "rouge_1 = keras_hub.metrics.RougeN(order=1)\n", + "rouge_2 = keras_hub.metrics.RougeN(order=2)\n", "\n", "for test_pair in test_pairs[:30]:\n", " input_sentence = test_pair[0]\n", @@ -776,7 +776,7 @@ "accelerator": "GPU", "colab": { "collapsed_sections": [], - "name": "neural_machine_translation_with_keras_nlp", + "name": "neural_machine_translation_with_keras_hub", "private_outputs": false, "provenance": [], "toc_visible": true diff --git a/examples/nlp/ipynb/parameter_efficient_finetuning_of_gpt2_with_lora.ipynb b/examples/nlp/ipynb/parameter_efficient_finetuning_of_gpt2_with_lora.ipynb index 79255dc83e..e186f45e76 100644 --- a/examples/nlp/ipynb/parameter_efficient_finetuning_of_gpt2_with_lora.ipynb +++ b/examples/nlp/ipynb/parameter_efficient_finetuning_of_gpt2_with_lora.ipynb @@ -11,7 +11,7 @@ "**Author:** [Abheesht Sharma](https://github.com/abheesht17/), [Matthew Watson](https://github.com/mattdangerw/)
\n", "**Date created:** 2023/05/27
\n", "**Last modified:** 2023/05/27
\n", - "**Description:** Use KerasNLP to fine-tune a GPT-2 LLM with LoRA." + "**Description:** Use KerasHub to fine-tune a GPT-2 LLM with LoRA." ] }, { @@ -38,8 +38,8 @@ "of the outputs.\n", "\n", "In this example, we will explain LoRA in technical terms, show how the technical\n", - "explanation translates to code, hack KerasNLP's\n", - "[GPT-2 model](https://keras.io/api/keras_nlp/models/gpt2/) and fine-tune\n", + "explanation translates to code, hack KerasHub's\n", + "[GPT-2 model](https://keras.io/api/keras_hub/models/gpt2/) and fine-tune\n", "it on the next token prediction task using LoRA. We will compare LoRA GPT-2\n", "with a fully fine-tuned GPT-2 in terms of the quality of the generated text,\n", "training time and GPU memory usage.\n", @@ -59,7 +59,7 @@ "## Setup\n", "\n", "Before we start implementing the pipeline, let's install and import all the\n", - "libraries we need. We'll be using the KerasNLP library.\n", + "libraries we need. We'll be using the KerasHub library.\n", "\n", "Secondly, let's enable mixed precision training. This will help us reduce the\n", "training time." @@ -73,7 +73,7 @@ }, "outputs": [], "source": [ - "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras-hub\n", "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, @@ -89,7 +89,7 @@ "\n", "os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n", "\n", - "import keras_nlp\n", + "import keras_hub\n", "import keras\n", "import matplotlib.pyplot as plt\n", "import tensorflow as tf\n", @@ -368,11 +368,11 @@ }, "outputs": [], "source": [ - "preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(\n", + "preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset(\n", " \"gpt2_base_en\",\n", " sequence_length=MAX_SEQUENCE_LENGTH,\n", ")\n", - "gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset(\n", + "gpt2_lm = keras_hub.models.GPT2CausalLM.from_preset(\n", " \"gpt2_base_en\", preprocessor=preprocessor\n", ")\n", "\n", @@ -657,11 +657,11 @@ "tf.config.experimental.reset_memory_stats(\"GPU:0\")\n", "\n", "# Load the original model.\n", - "preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset(\n", + "preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset(\n", " \"gpt2_base_en\",\n", " sequence_length=128,\n", ")\n", - "lora_model = keras_nlp.models.GPT2CausalLM.from_preset(\n", + "lora_model = keras_hub.models.GPT2CausalLM.from_preset(\n", " \"gpt2_base_en\",\n", " preprocessor=preprocessor,\n", ")" diff --git a/examples/nlp/ipynb/semantic_similarity_with_keras_nlp.ipynb b/examples/nlp/ipynb/semantic_similarity_with_keras_hub.ipynb similarity index 91% rename from examples/nlp/ipynb/semantic_similarity_with_keras_nlp.ipynb rename to examples/nlp/ipynb/semantic_similarity_with_keras_hub.ipynb index 38ab04afa4..ba6e649bfa 100644 --- a/examples/nlp/ipynb/semantic_similarity_with_keras_nlp.ipynb +++ b/examples/nlp/ipynb/semantic_similarity_with_keras_hub.ipynb @@ -6,12 +6,12 @@ "colab_type": "text" }, "source": [ - "# Semantic Similarity with KerasNLP\n", + "# Semantic Similarity with KerasHub\n", "\n", "**Author:** [Anshuman Mishra](https://github.com/shivance/)
\n", "**Date created:** 2023/02/25
\n", "**Last modified:** 2023/02/25
\n", - "**Description:** Use pretrained models from KerasNLP for the Semantic Similarity Task." + "**Description:** Use pretrained models from KerasHub for the Semantic Similarity Task." ] }, { @@ -26,10 +26,10 @@ "sentences in terms of their meaning. We already saw in [this](https://keras.io/examples/nlp/semantic_similarity_with_bert/)\n", "example how to use SNLI (Stanford Natural Language Inference) corpus to predict sentence\n", "semantic similarity with the HuggingFace Transformers library. In this tutorial we will\n", - "learn how to use [KerasNLP](https://keras.io/keras_nlp/), an extension of the core Keras API,\n", - "for the same task. Furthermore, we will discover how KerasNLP effectively reduces boilerplate\n", - "code and simplifies the process of building and utilizing models. For more information on KerasNLP,\n", - "please refer to [KerasNLP's official documentation](https://keras.io/keras_nlp/).\n", + "learn how to use [KerasHub](https://keras.io/keras_hub/), an extension of the core Keras API,\n", + "for the same task. Furthermore, we will discover how KerasHub effectively reduces boilerplate\n", + "code and simplifies the process of building and utilizing models. For more information on KerasHub,\n", + "please refer to [KerasHub's official documentation](https://keras.io/keras_hub/).\n", "\n", "This guide is broken down into the following parts:\n", "\n", @@ -43,7 +43,7 @@ "\n", "The following guide uses [Keras Core](https://keras.io/keras_core/) to work in\n", "any of `tensorflow`, `jax` or `torch`. Support for Keras Core is baked into\n", - "KerasNLP, simply change the `KERAS_BACKEND` environment variable below to change\n", + "KerasHub, simply change the `KERAS_BACKEND` environment variable below to change\n", "the backend you would like to use. We select the `jax` backend below, which will\n", "give us a particularly fast train step below." ] @@ -56,7 +56,7 @@ }, "outputs": [], "source": [ - "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras-hub\n", "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, @@ -71,7 +71,7 @@ "import numpy as np\n", "import tensorflow as tf\n", "import keras\n", - "import keras_nlp\n", + "import keras_hub\n", "import tensorflow_datasets as tfds" ] }, @@ -151,7 +151,7 @@ }, "source": [ "Here's a utility function that splits the example into an `(x, y)` tuple that is suitable\n", - "for `model.fit()`. By default, `keras_nlp.models.BertClassifier` will tokenize and pack\n", + "for `model.fit()`. By default, `keras_hub.models.BertClassifier` will tokenize and pack\n", "together raw strings using a `\"[SEP]\"` token during training. Therefore, this label\n", "splitting is all the data preparation that we need to perform." ] @@ -197,12 +197,12 @@ "source": [ "## Establishing baseline with BERT.\n", "\n", - "We use the BERT model from KerasNLP to establish a baseline for our semantic similarity\n", - "task. The `keras_nlp.models.BertClassifier` class attaches a classification head to the BERT\n", + "We use the BERT model from KerasHub to establish a baseline for our semantic similarity\n", + "task. 
The `keras_hub.models.BertClassifier` class attaches a classification head to the BERT\n", "Backbone, mapping the backbone outputs to a logit output suitable for a classification task.\n", "This significantly reduces the need for custom code.\n", "\n", - "KerasNLP models have built-in tokenization capabilities that handle tokenization by default\n", + "KerasHub models have built-in tokenization capabilities that handle tokenization by default\n", "based on the selected model. However, users can also use custom preprocessing techniques\n", "as per their specific needs. If we pass a tuple as input, the model will tokenize all the\n", "strings and concatenate them with a `\"[SEP]\"` separator.\n", @@ -219,7 +219,7 @@ }, "outputs": [], "source": [ - "bert_classifier = keras_nlp.models.BertClassifier.from_preset(\n", + "bert_classifier = keras_hub.models.BertClassifier.from_preset(\n", " \"bert_tiny_en_uncased\", num_classes=3\n", ")" ] @@ -232,7 +232,7 @@ "source": [ "Please note that the BERT Tiny model has only 4,386,307 trainable parameters.\n", "\n", - "KerasNLP task models come with compilation defaults. We can now train the model we just\n", + "KerasHub task models come with compilation defaults. We can now train the model we just\n", "instantiated by calling the `fit()` method." ] }, @@ -289,7 +289,7 @@ }, "outputs": [], "source": [ - "bert_classifier = keras_nlp.models.BertClassifier.from_preset(\n", + "bert_classifier = keras_hub.models.BertClassifier.from_preset(\n", " \"bert_tiny_en_uncased\", num_classes=3\n", ")\n", "bert_classifier.compile(\n", @@ -346,7 +346,7 @@ " return keras.ops.maximum(triangular_rate, 0.0)\n", "\n", "\n", - "bert_classifier = keras_nlp.models.BertClassifier.from_preset(\n", + "bert_classifier = keras_hub.models.BertClassifier.from_preset(\n", " \"bert_tiny_en_uncased\", num_classes=3\n", ")\n", "\n", @@ -429,7 +429,7 @@ "source": [ "## Performing inference with the model.\n", "\n", - "Let's see how to perform inference with KerasNLP models" + "Let's see how to perform inference with KerasHub models" ] }, { @@ -451,7 +451,7 @@ "colab_type": "text" }, "source": [ - "The default preprocessor in KerasNLP models handles input tokenization automatically,\n", + "The default preprocessor in KerasHub models handles input tokenization automatically,\n", "so we don't need to perform tokenization explicitly." ] }, @@ -483,7 +483,7 @@ "## Improving accuracy with RoBERTa\n", "\n", "Now that we have established a baseline, we can attempt to improve our results\n", - "by experimenting with different models. Thanks to KerasNLP, fine-tuning a RoBERTa\n", + "by experimenting with different models. Thanks to KerasHub, fine-tuning a RoBERTa\n", "checkpoint on the same dataset is easy with just a few lines of code." 
] }, @@ -496,7 +496,7 @@ "outputs": [], "source": [ "# Inittializing a RoBERTa from preset\n", - "roberta_classifier = keras_nlp.models.RobertaClassifier.from_preset(\n", + "roberta_classifier = keras_hub.models.RobertaClassifier.from_preset(\n", " \"roberta_base_en\", num_classes=3\n", ")\n", "\n", @@ -541,13 +541,13 @@ }, "source": [ "We hope this tutorial has been helpful in demonstrating the ease and effectiveness\n", - "of using KerasNLP and BERT for semantic similarity tasks.\n", + "of using KerasHub and BERT for semantic similarity tasks.\n", "\n", "Throughout this tutorial, we demonstrated how to use a pretrained BERT model to\n", "establish a baseline and improve performance by training a larger RoBERTa model\n", "using just a few lines of code.\n", "\n", - "The KerasNLP toolbox provides a range of modular building blocks for preprocessing\n", + "The KerasHub toolbox provides a range of modular building blocks for preprocessing\n", "text, including pretrained state-of-the-art models and low-level Transformer Encoder\n", "layers. We believe that this makes experimenting with natural language solutions\n", "more accessible and efficient." @@ -558,7 +558,7 @@ "accelerator": "GPU", "colab": { "collapsed_sections": [], - "name": "semantic_similarity_with_keras_nlp", + "name": "semantic_similarity_with_keras_hub", "private_outputs": false, "provenance": [], "toc_visible": true diff --git a/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb b/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb index 4d0d1841d9..12d4c6e5d9 100644 --- a/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb +++ b/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb @@ -11,7 +11,7 @@ "**Author:** [Mohammed Abu El-Nasr](https://github.com/abuelnasr0)
\n", "**Date created:** 2023/07/14
\n", "**Last modified:** 2023/07/14
\n", - "**Description:** Fine-tune a RoBERTa model to generate sentence embeddings using KerasNLP." + "**Description:** Fine-tune a RoBERTa model to generate sentence embeddings using KerasHub." ] }, { @@ -61,7 +61,7 @@ "source": [ "## Setup\n", "\n", - "Let's install and import the libraries we need. We'll be using the KerasNLP library in\n", + "Let's install and import the libraries we need. We'll be using the KerasHub library in\n", "this example.\n", "\n", "We will also enable [mixed precision](https://www.tensorflow.org/guide/mixed_precision)\n", @@ -76,7 +76,7 @@ }, "outputs": [], "source": [ - "!pip install -q --upgrade keras-nlp\n", + "!pip install -q --upgrade keras-hub\n", "!pip install -q --upgrade keras # Upgrade to Keras 3." ] }, @@ -93,7 +93,7 @@ "os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n", "\n", "import keras\n", - "import keras_nlp\n", + "import keras_hub\n", "import tensorflow as tf\n", "import tensorflow_datasets as tfds\n", "import sklearn.cluster as cluster\n", @@ -255,8 +255,8 @@ }, "outputs": [], "source": [ - "preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(\"roberta_base_en\")\n", - "backbone = keras_nlp.models.RobertaBackbone.from_preset(\"roberta_base_en\")\n", + "preprocessor = keras_hub.models.RobertaPreprocessor.from_preset(\"roberta_base_en\")\n", + "backbone = keras_hub.models.RobertaBackbone.from_preset(\"roberta_base_en\")\n", "inputs = keras.Input(shape=(1,), dtype=\"string\", name=\"sentence\")\n", "x = preprocessor(inputs)\n", "h = backbone(x)\n", @@ -527,8 +527,8 @@ }, "outputs": [], "source": [ - "preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset(\"roberta_base_en\")\n", - "backbone = keras_nlp.models.RobertaBackbone.from_preset(\"roberta_base_en\")\n", + "preprocessor = keras_hub.models.RobertaPreprocessor.from_preset(\"roberta_base_en\")\n", + "backbone = keras_hub.models.RobertaBackbone.from_preset(\"roberta_base_en\")\n", "input = keras.Input(shape=(1,), dtype=\"string\", name=\"sentence\")\n", "\n", "x = preprocessor(input)\n", diff --git a/examples/nlp/ipynb/t5_hf_summarization.ipynb b/examples/nlp/ipynb/t5_hf_summarization.ipynb index bd9185156c..70d99a7d9c 100644 --- a/examples/nlp/ipynb/t5_hf_summarization.ipynb +++ b/examples/nlp/ipynb/t5_hf_summarization.ipynb @@ -69,7 +69,7 @@ "outputs": [], "source": [ "!pip install transformers==4.20.0\n", - "!pip install keras_nlp==0.3.0\n", + "!pip install keras_hub==0.3.0\n", "!pip install datasets\n", "!pip install huggingface-hub\n", "!pip install nltk\n", @@ -512,9 +512,9 @@ }, "outputs": [], "source": [ - "import keras_nlp\n", + "import keras_hub\n", "\n", - "rouge_l = keras_nlp.metrics.RougeL()\n", + "rouge_l = keras_hub.metrics.RougeL()\n", "\n", "\n", "def metric_fn(eval_predictions):\n", diff --git a/examples/nlp/masked_language_modeling.py b/examples/nlp/masked_language_modeling.py index 6701f0745e..d1c65efe08 100644 --- a/examples/nlp/masked_language_modeling.py +++ b/examples/nlp/masked_language_modeling.py @@ -47,7 +47,7 @@ import os os.environ["KERAS_BACKEND"] = "tensorflow" -import keras_nlp +import keras_hub import keras import tensorflow as tf from keras import layers @@ -341,7 +341,7 @@ def create_masked_language_bert_model(): word_embeddings = layers.Embedding( config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding" )(inputs) - position_embeddings = keras_nlp.layers.PositionEmbedding( + position_embeddings = keras_hub.layers.PositionEmbedding( sequence_length=config.MAX_LEN )(word_embeddings) embeddings = word_embeddings + position_embeddings 
diff --git a/examples/nlp/md/abstractive_summarization_with_bart.md b/examples/nlp/md/abstractive_summarization_with_bart.md index d019cc7ec1..709431c476 100644 --- a/examples/nlp/md/abstractive_summarization_with_bart.md +++ b/examples/nlp/md/abstractive_summarization_with_bart.md @@ -3,7 +3,7 @@ **Author:** [Abheesht Sharma](https://github.com/abheesht17/)
**Date created:** 2023/07/08
**Last modified:** 2024/03/20
-**Description:** Use KerasNLP to fine-tune BART on the abstractive summarization task. +**Description:** Use KerasHub to fine-tune BART on the abstractive summarization task. [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/abstractive_summarization_with_bart.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/abstractive_summarization_with_bart.py) @@ -28,19 +28,19 @@ include token masking, token deletion, sentence permutation (shuffle sentences and train BART to fix the order), etc. In this example, we will demonstrate how to fine-tune BART on the abstractive -summarization task (on conversations!) using KerasNLP, and generate summaries +summarization task (on conversations!) using KerasHub, and generate summaries using the fine-tuned model. --- ## Setup Before we start implementing the pipeline, let's install and import all the -libraries we need. We'll be using the KerasNLP library. We will also need a +libraries we need. We'll be using the KerasHub library. We will also need a couple of utility libraries. ```python -!pip install git+https://github.com/keras-team/keras-nlp.git py7zr -q +!pip install git+https://github.com/keras-team/keras-hub.git py7zr -q ```
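As a concrete picture of the corruption schemes mentioned above (token masking, token deletion, sentence permutation), here is a purely illustrative toy sketch of two of them on made-up text; it is not code from the example, just a sketch of what the denoising objective asks the model to undo:

```python
import random

random.seed(0)

# Token masking: a random subset of tokens is replaced with a mask symbol, and
# the model is trained to reconstruct the original text.
tokens = "the quick brown fox jumps over the lazy dog".split()
masked = [tok if random.random() > 0.3 else "<mask>" for tok in tokens]
print(" ".join(masked))

# Sentence permutation: the sentences of a document are shuffled, and the model
# learns to restore their original order.
document = ["First sentence.", "Second sentence.", "Third sentence."]
random.shuffle(document)
print(" ".join(document))
```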
@@ -55,13 +55,13 @@ couple of utility libraries.  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 49.8/49.8 kB 5.8 MB/s eta 0:00:00  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 2.7/2.7 MB 61.4 MB/s eta 0:00:00  ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 93.1/93.1 kB 10.1 MB/s eta 0:00:00 -[?25h Building wheel for keras-nlp (pyproject.toml) ... [?25l[?25hdone +[?25h Building wheel for keras-hub (pyproject.toml) ... [?25l[?25hdone ```
This examples uses [Keras 3](https://keras.io/keras_3) to work in any of `"tensorflow"`, `"jax"` or `"torch"`. Support for Keras 3 is baked into -KerasNLP, simply change the `"KERAS_BACKEND"` environment variable to select +KerasHub, simply change the `"KERAS_BACKEND"` environment variable to select the backend of your choice. We select the JAX backend below. @@ -78,7 +78,7 @@ Import all necessary libraries. import py7zr import time -import keras_nlp +import keras_hub import keras import tensorflow as tf import tensorflow_datasets as tfds @@ -169,7 +169,7 @@ We'll now batch the dataset and retain only a subset of the dataset for the purpose of this example. The dialogue is fed to the encoder, and the corresponding summary serves as input to the decoder. We will, therefore, change the format of the dataset to a dictionary having two keys: `"encoder_text"` and -`"decoder_text"`.This is how `keras_nlp.models.BartSeq2SeqLMPreprocessor` +`"decoder_text"`.This is how `keras_hub.models.BartSeq2SeqLMPreprocessor` expects the input format to be. @@ -202,12 +202,12 @@ trained to predict the next token. ```python -preprocessor = keras_nlp.models.BartSeq2SeqLMPreprocessor.from_preset( +preprocessor = keras_hub.models.BartSeq2SeqLMPreprocessor.from_preset( "bart_base_en", encoder_sequence_length=MAX_ENCODER_SEQUENCE_LENGTH, decoder_sequence_length=MAX_DECODER_SEQUENCE_LENGTH, ) -bart_lm = keras_nlp.models.BartSeq2SeqLM.from_preset( +bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset( "bart_base_en", preprocessor=preprocessor ) @@ -216,11 +216,11 @@ bart_lm.summary()
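Before fine-tuning, the freshly loaded model can be smoke-tested with a single raw string. This is only a sketch: it assumes the `bart_lm` object from the block above, the dialogue is invented, and `max_length` is assumed to be accepted by `generate()` as in current KerasHub releases.

```python
# Illustrative only: run a made-up dialogue through the pretrained (not yet
# fine-tuned) model to confirm preprocessing and generation are wired up.
dialogue = (
    "Ann: Are we still on for lunch tomorrow? "
    "Ben: Yes, 12:30 at the usual place. "
    "Ann: Perfect, see you there!"
)
summary = bart_lm.generate(dialogue, max_length=40)
print(summary)
```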
``` -Downloading data from https://storage.googleapis.com/keras-nlp/models/bart_base_en/v1/vocab.json +Downloading data from https://storage.googleapis.com/keras-hub/models/bart_base_en/v1/vocab.json 898823/898823 ━━━━━━━━━━━━━━━━━━━━ 1s 1us/step -Downloading data from https://storage.googleapis.com/keras-nlp/models/bart_base_en/v1/merges.txt +Downloading data from https://storage.googleapis.com/keras-hub/models/bart_base_en/v1/merges.txt 456318/456318 ━━━━━━━━━━━━━━━━━━━━ 1s 1us/step -Downloading data from https://storage.googleapis.com/keras-nlp/models/bart_base_en/v1/model.h5 +Downloading data from https://storage.googleapis.com/keras-hub/models/bart_base_en/v1/model.h5 557969120/557969120 ━━━━━━━━━━━━━━━━━━━━ 29s 0us/step ``` @@ -340,7 +340,7 @@ generating summaries! Let's pick the first 100 samples from the validation set and generate summaries for them. We will use the default decoding strategy, i.e., greedy search. -Generation in KerasNLP is highly optimized. It is backed by the power of XLA. +Generation in KerasHub is highly optimized. It is backed by the power of XLA. Secondly, key/value tensors in the self-attention layer and cross-attention layer in the decoder are cached to avoid recomputation at every timestep. diff --git a/examples/nlp/md/data_parallel_training_with_keras_nlp.md b/examples/nlp/md/data_parallel_training_with_keras_hub.md similarity index 95% rename from examples/nlp/md/data_parallel_training_with_keras_nlp.md rename to examples/nlp/md/data_parallel_training_with_keras_hub.md index 8cd0327978..489ee4302d 100644 --- a/examples/nlp/md/data_parallel_training_with_keras_nlp.md +++ b/examples/nlp/md/data_parallel_training_with_keras_hub.md @@ -1,12 +1,12 @@ -# Data Parallel Training with KerasNLP and tf.distribute +# Data Parallel Training with KerasHub and tf.distribute **Author:** Anshuman Mishra
**Date created:** 2023/07/07
**Last modified:** 2023/07/07
-**Description:** Data Parallel training with KerasNLP and tf.distribute. +**Description:** Data Parallel training with KerasHub and tf.distribute. - [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/data_parallel_training_with_keras_nlp.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/data_parallel_training_with_keras_nlp.py) + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/data_parallel_training_with_keras_hub.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/data_parallel_training_with_keras_hub.py) @@ -15,10 +15,10 @@ Distributed training is a technique used to train deep learning models on multiple devices or machines simultaneously. It helps to reduce training time and allows for training larger -models with more data. KerasNLP is a library that provides tools and utilities for natural +models with more data. KerasHub is a library that provides tools and utilities for natural language processing tasks, including distributed training. -In this tutorial, we will use KerasNLP to train a BERT-based masked language model (MLM) +In this tutorial, we will use KerasHub to train a BERT-based masked language model (MLM) on the wikitext-2 dataset (a 2 million word dataset of wikipedia articles). The MLM task involves predicting the masked words in a sentence, which helps the model learn contextual representations of words. @@ -40,7 +40,7 @@ training high-resolution text summarization models on billion word datasets on 2 ```python -!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras-hub !pip install -q --upgrade keras # Upgrade to Keras 3. ``` @@ -55,7 +55,7 @@ os.environ["KERAS_BACKEND"] = "tensorflow" import tensorflow as tf import keras -import keras_nlp +import keras_hub ``` Before we start any training, let's configure our single GPU to show up as two logical @@ -227,7 +227,7 @@ the `strategy.scope()`: with strategy.scope(): # Everything that creates variables should be under the strategy scope. # In general this is only model construction & `compile()`. - model_dist = keras_nlp.models.BertMaskedLM.from_preset("bert_tiny_en_uncased") + model_dist = keras_hub.models.BertMaskedLM.from_preset("bert_tiny_en_uncased") # This line just sets pooled_dense layer as non-trainiable, we do this to avoid # warnings of this layer being unused diff --git a/examples/nlp/md/fnet_classification_with_keras_nlp.md b/examples/nlp/md/fnet_classification_with_keras_hub.md similarity index 97% rename from examples/nlp/md/fnet_classification_with_keras_nlp.md rename to examples/nlp/md/fnet_classification_with_keras_hub.md index a65dda4af3..99f66f0645 100644 --- a/examples/nlp/md/fnet_classification_with_keras_nlp.md +++ b/examples/nlp/md/fnet_classification_with_keras_hub.md @@ -3,10 +3,10 @@ **Author:** [Abheesht Sharma](https://github.com/abheesht17/)
**Date created:** 2022/06/01
**Last modified:** 2022/12/21
-**Description:** Text Classification on the IMDb Dataset using `keras_nlp.layers.FNetEncoder` layer. +**Description:** Text Classification on the IMDb Dataset using `keras_hub.layers.FNetEncoder` layer. - [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/fnet_classification_with_keras_nlp.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/fnet_classification_with_keras_nlp.py) + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/fnet_classification_with_keras_hub.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/fnet_classification_with_keras_hub.py) @@ -20,7 +20,7 @@ collection of movie reviews labelled either positive or negative (sentiment analysis). To build the tokenizer, model, etc., we will use components from -[KerasNLP](https://github.com/keras-team/keras-nlp). KerasNLP makes life easier +[KerasHub](https://github.com/keras-team/keras-hub). KerasHub makes life easier for people who want to build NLP pipelines! :) ### Model @@ -53,12 +53,12 @@ Before we start with the implementation, let's import all the necessary packages ```python -!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras-hub !pip install -q --upgrade keras # Upgrade to Keras 3. ``` ```python -import keras_nlp +import keras_hub import keras import tensorflow as tf import os @@ -212,8 +212,8 @@ b'"hollywood hotel" has relationships to many films like "ella cinders" and "mer
### Tokenizing the data -We'll be using the `keras_nlp.tokenizers.WordPieceTokenizer` layer to tokenize -the text. `keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary +We'll be using the `keras_hub.tokenizers.WordPieceTokenizer` layer to tokenize +the text. `keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary and has functions for tokenizing the text, and detokenizing sequences of tokens. Before we define the tokenizer, we first need to train it on the dataset @@ -221,9 +221,9 @@ we have. The WordPiece tokenization algorithm is a subword tokenization algorith training it on a corpus gives us a vocabulary of subwords. A subword tokenizer is a compromise between word tokenizers (word tokenizers need very large vocabularies for good coverage of input words), and character tokenizers -(characters don't really encode meaning like words do). Luckily, KerasNLP +(characters don't really encode meaning like words do). Luckily, KerasHub makes it very simple to train WordPiece on a corpus with the -`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility. +`keras_hub.tokenizers.compute_word_piece_vocabulary` utility. Note: The official implementation of FNet uses the SentencePiece Tokenizer. @@ -232,7 +232,7 @@ Note: The official implementation of FNet uses the SentencePiece Tokenizer. def train_word_piece(ds, vocab_size, reserved_tokens): word_piece_ds = ds.unbatch().map(lambda x, y: x) - vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( + vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( word_piece_ds.batch(1000).prefetch(2), vocabulary_size=vocab_size, reserved_tokens=reserved_tokens, @@ -274,7 +274,7 @@ less than the specified sequence length. Otherwise, the sequence is truncated. ```python -tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=vocab, lowercase=False, sequence_length=MAX_SEQUENCE_LENGTH, @@ -377,11 +377,11 @@ Now, let's move on to the exciting part - defining our model! We first need an embedding layer, i.e., a layer that maps every token in the input sequence to a vector. This embedding layer can be initialised randomly. We also need a positional embedding layer which encodes the word order in the sequence. -The convention is to add, i.e., sum, these two embeddings. KerasNLP has a -`keras_nlp.layers.TokenAndPositionEmbedding ` layer which does all of the above +The convention is to add, i.e., sum, these two embeddings. KerasHub has a +`keras_hub.layers.TokenAndPositionEmbedding ` layer which does all of the above steps for us. -Our FNet classification model consists of three `keras_nlp.layers.FNetEncoder` +Our FNet classification model consists of three `keras_hub.layers.FNetEncoder` layers with a `keras.layers.Dense` layer on top. Note: For FNet, masking the padding tokens has a minimal effect on results. In the @@ -391,16 +391,16 @@ official implementation, the padding tokens are not masked. 
```python input_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids") -x = keras_nlp.layers.TokenAndPositionEmbedding( +x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM, mask_zero=True, )(input_ids) -x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) -x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) -x = keras_nlp.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) x = keras.layers.GlobalAveragePooling1D()(x) @@ -536,20 +536,20 @@ NUM_HEADS = 2 input_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids") -x = keras_nlp.layers.TokenAndPositionEmbedding( +x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM, mask_zero=True, )(input_ids) -x = keras_nlp.layers.TransformerEncoder( +x = keras_hub.layers.TransformerEncoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(inputs=x) -x = keras_nlp.layers.TransformerEncoder( +x = keras_hub.layers.TransformerEncoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(inputs=x) -x = keras_nlp.layers.TransformerEncoder( +x = keras_hub.layers.TransformerEncoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(inputs=x) diff --git a/examples/nlp/md/masked_language_modeling.md b/examples/nlp/md/masked_language_modeling.md index 76a4b0bb76..5a44beb8ba 100644 --- a/examples/nlp/md/masked_language_modeling.md +++ b/examples/nlp/md/masked_language_modeling.md @@ -49,7 +49,7 @@ Install `tf-nightly` via `pip install tf-nightly`. import os os.environ["KERAS_BACKEND"] = "tensorflow" -import keras_nlp +import keras_hub import keras import tensorflow as tf from keras import layers @@ -359,7 +359,7 @@ def create_masked_language_bert_model(): word_embeddings = layers.Embedding( config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding" )(inputs) - position_embeddings = keras_nlp.layers.PositionEmbedding( + position_embeddings = keras_hub.layers.PositionEmbedding( sequence_length=config.MAX_LEN )(word_embeddings) embeddings = word_embeddings + position_embeddings diff --git a/examples/nlp/md/multiple_choice_task_with_transfer_learning.md b/examples/nlp/md/multiple_choice_task_with_transfer_learning.md index 03514bfdd7..abd113ddb3 100644 --- a/examples/nlp/md/multiple_choice_task_with_transfer_learning.md +++ b/examples/nlp/md/multiple_choice_task_with_transfer_learning.md @@ -23,7 +23,7 @@ unlike question answering. We will use SWAG dataset to demonstrate this example. ```python -import keras_nlp +import keras_hub import keras import tensorflow as tf # For tf.data only. @@ -197,13 +197,13 @@ Making all sequences the same length through padding boosts computational effici making subsequent steps smoother. 
Explore the following pages to access the available preprocessing and tokenizer layers in -**KerasNLP**: -- [Preprocessing](https://keras.io/api/keras_nlp/preprocessing_layers/) -- [Tokenizers](https://keras.io/api/keras_nlp/tokenizers/) +**KerasHub**: +- [Preprocessing](https://keras.io/api/keras_hub/preprocessing_layers/) +- [Tokenizers](https://keras.io/api/keras_hub/tokenizers/) ```python -preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset( +preprocessor = keras_hub.models.DebertaV3Preprocessor.from_preset( preset=CFG.preset, # Name of the model sequence_length=CFG.sequence_length, # Max sequence length, will be padded if shorter ) @@ -457,15 +457,15 @@ callbacks = get_callbacks() ### Pre-trained Models -The `KerasNLP` library provides comprehensive, ready-to-use implementations of popular +The `KerasHub` library provides comprehensive, ready-to-use implementations of popular NLP model architectures. It features a variety of pre-trained models including `Bert`, `Roberta`, `DebertaV3`, and more. In this notebook, we'll showcase the usage of -`DistillBert`. However, feel free to explore all available models in the [KerasNLP -documentation](https://keras.io/api/keras_nlp/models/). Also for a deeper understanding -of `KerasNLP`, refer to the informative [getting started -guide](https://keras.io/guides/keras_nlp/getting_started/). +`DistillBert`. However, feel free to explore all available models in the [KerasHub +documentation](https://keras.io/api/keras_hub/models/). Also for a deeper understanding +of `KerasHub`, refer to the informative [getting started +guide](https://keras.io/guides/keras_hub/getting_started/). -Our approach involves using `keras_nlp.models.XXClassifier` to process each question and +Our approach involves using `keras_hub.models.XXClassifier` to process each question and option pari (e.g. (Q+A), (Q+B), etc.), generating logits. These logits are then combined and passed through a softmax function to produce the final output. @@ -536,7 +536,7 @@ def build_model(): ), } # Create a DebertaV3Classifier model - classifier = keras_nlp.models.DebertaV3Classifier.from_preset( + classifier = keras_hub.models.DebertaV3Classifier.from_preset( CFG.preset, preprocessor=None, num_classes=1, # one output per one option, for five options total 5 outputs @@ -881,7 +881,7 @@ People are standing on sand wearing red shirts. They ## Reference * [Multiple Choice with HF](https://twitter.com/johnowhitaker/status/1689790373454041089?s=20) -* [Keras NLP](https://keras.io/api/keras_nlp/) +* [Keras NLP](https://keras.io/api/keras_hub/) * [BirdCLEF23: Pretraining is All you Need [Train]](https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train) [Train]](https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train) diff --git a/examples/nlp/md/neural_machine_translation_with_keras_nlp.md b/examples/nlp/md/neural_machine_translation_with_keras_hub.md similarity index 91% rename from examples/nlp/md/neural_machine_translation_with_keras_nlp.md rename to examples/nlp/md/neural_machine_translation_with_keras_hub.md index 52958ae317..0e454c63b3 100644 --- a/examples/nlp/md/neural_machine_translation_with_keras_nlp.md +++ b/examples/nlp/md/neural_machine_translation_with_keras_hub.md @@ -1,42 +1,42 @@ -# English-to-Spanish translation with KerasNLP +# English-to-Spanish translation with KerasHub **Author:** [Abheesht Sharma](https://github.com/abheesht17/)
**Date created:** 2022/05/26
**Last modified:** 2024/04/30
-**Description:** Use KerasNLP to train a sequence-to-sequence Transformer model on the machine translation task. +**Description:** Use KerasHub to train a sequence-to-sequence Transformer model on the machine translation task. - [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/neural_machine_translation_with_keras_nlp.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/neural_machine_translation_with_keras_nlp.py) + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/neural_machine_translation_with_keras_hub.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/neural_machine_translation_with_keras_hub.py) --- ## Introduction -KerasNLP provides building blocks for NLP (model layers, tokenizers, metrics, etc.) and +KerasHub provides building blocks for NLP (model layers, tokenizers, metrics, etc.) and makes it convenient to construct NLP pipelines. -In this example, we'll use KerasNLP layers to build an encoder-decoder Transformer +In this example, we'll use KerasHub layers to build an encoder-decoder Transformer model, and train it on the English-to-Spanish machine translation task. This example is based on the [English-to-Spanish NMT example](https://keras.io/examples/nlp/neural_machine_translation_with_transformer/) by [fchollet](https://twitter.com/fchollet). The original example is more low-level -and implements layers from scratch, whereas this example uses KerasNLP to show +and implements layers from scratch, whereas this example uses KerasHub to show some more advanced approaches, such as subword tokenization and using metrics to compute the quality of generated translations. You'll learn how to: -- Tokenize text using `keras_nlp.tokenizers.WordPieceTokenizer`. -- Implement a sequence-to-sequence Transformer model using KerasNLP's -`keras_nlp.layers.TransformerEncoder`, `keras_nlp.layers.TransformerDecoder` and -`keras_nlp.layers.TokenAndPositionEmbedding` layers, and train it. -- Use `keras_nlp.samplers` to generate translations of unseen input sentences +- Tokenize text using `keras_hub.tokenizers.WordPieceTokenizer`. +- Implement a sequence-to-sequence Transformer model using KerasHub's +`keras_hub.layers.TransformerEncoder`, `keras_hub.layers.TransformerDecoder` and +`keras_hub.layers.TokenAndPositionEmbedding` layers, and train it. +- Use `keras_hub.samplers` to generate translations of unseen input sentences using the top-p decoding strategy! -Don't worry if you aren't familiar with KerasNLP. This tutorial will start with +Don't worry if you aren't familiar with KerasHub. This tutorial will start with the basics. Let's dive right in! --- @@ -47,12 +47,12 @@ Before we start implementing the pipeline, let's import all the libraries we nee ```python !pip install -q --upgrade rouge-score -!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras-hub !pip install -q --upgrade keras # Upgrade to Keras 3. ``` ```python -import keras_nlp +import keras_hub import pathlib import random @@ -180,8 +180,8 @@ print(f"{len(test_pairs)} test pairs") We'll define two tokenizers - one for the source language (English), and the other for the target language (Spanish). We'll be using -`keras_nlp.tokenizers.WordPieceTokenizer` to tokenize the text. -`keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary +`keras_hub.tokenizers.WordPieceTokenizer` to tokenize the text. 
+`keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary and has functions for tokenizing the text, and detokenizing sequences of tokens. Before we define the two tokenizers, we first need to train them on the dataset @@ -189,16 +189,16 @@ we have. The WordPiece tokenization algorithm is a subword tokenization algorith training it on a corpus gives us a vocabulary of subwords. A subword tokenizer is a compromise between word tokenizers (word tokenizers need very large vocabularies for good coverage of input words), and character tokenizers -(characters don't really encode meaning like words do). Luckily, KerasNLP +(characters don't really encode meaning like words do). Luckily, KerasHub makes it very simple to train WordPiece on a corpus with the -`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility. +`keras_hub.tokenizers.compute_word_piece_vocabulary` utility. ```python def train_word_piece(text_samples, vocab_size, reserved_tokens): word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples) - vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( + vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( word_piece_ds.batch(1000).prefetch(2), vocabulary_size=vocab_size, reserved_tokens=reserved_tokens, @@ -246,10 +246,10 @@ the vocabularies trained above. ```python -eng_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +eng_tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=eng_vocab, lowercase=False ) -spa_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +spa_tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=spa_vocab, lowercase=False ) ``` @@ -317,7 +317,7 @@ it provides the next words in the target sentence -- what the model will try to We will add special tokens, `"[START]"` and `"[END]"`, to the input Spanish sentence after tokenizing the text. We will also pad the input to a fixed length. -This can be easily done using `keras_nlp.layers.StartEndPacker`. +This can be easily done using `keras_hub.layers.StartEndPacker`. ```python @@ -329,14 +329,14 @@ def preprocess_batch(eng, spa): spa = spa_tokenizer(spa) # Pad `eng` to `MAX_SEQUENCE_LENGTH`. - eng_start_end_packer = keras_nlp.layers.StartEndPacker( + eng_start_end_packer = keras_hub.layers.StartEndPacker( sequence_length=MAX_SEQUENCE_LENGTH, pad_value=eng_tokenizer.token_to_id("[PAD]"), ) eng = eng_start_end_packer(eng) # Add special tokens (`"[START]"` and `"[END]"`) to `spa` and pad it as well. - spa_start_end_packer = keras_nlp.layers.StartEndPacker( + spa_start_end_packer = keras_hub.layers.StartEndPacker( sequence_length=MAX_SEQUENCE_LENGTH + 1, start_value=spa_tokenizer.token_to_id("[START]"), end_value=spa_tokenizer.token_to_id("[END]"), @@ -394,27 +394,27 @@ Now, let's move on to the exciting part - defining our model! We first need an embedding layer, i.e., a vector for every token in our input sequence. This embedding layer can be initialised randomly. We also need a positional embedding layer which encodes the word order in the sequence. The convention is -to add these two embeddings. KerasNLP has a `keras_nlp.layers.TokenAndPositionEmbedding ` +to add these two embeddings. KerasHub has a `keras_hub.layers.TokenAndPositionEmbedding ` layer which does all of the above steps for us. -Our sequence-to-sequence Transformer consists of a `keras_nlp.layers.TransformerEncoder` -layer and a `keras_nlp.layers.TransformerDecoder` layer chained together. 
+Our sequence-to-sequence Transformer consists of a `keras_hub.layers.TransformerEncoder` +layer and a `keras_hub.layers.TransformerDecoder` layer chained together. -The source sequence will be passed to `keras_nlp.layers.TransformerEncoder`, which +The source sequence will be passed to `keras_hub.layers.TransformerEncoder`, which will produce a new representation of it. This new representation will then be passed -to the `keras_nlp.layers.TransformerDecoder`, together with the target sequence -so far (target words 0 to N). The `keras_nlp.layers.TransformerDecoder` will +to the `keras_hub.layers.TransformerDecoder`, together with the target sequence +so far (target words 0 to N). The `keras_hub.layers.TransformerDecoder` will then seek to predict the next words in the target sequence (N+1 and beyond). A key detail that makes this possible is causal masking. -The `keras_nlp.layers.TransformerDecoder` sees the entire sequence at once, and +The `keras_hub.layers.TransformerDecoder` sees the entire sequence at once, and thus we must make sure that it only uses information from target tokens 0 to N when predicting token N+1 (otherwise, it could use information from the future, which would result in a model that cannot be used at inference time). Causal masking -is enabled by default in `keras_nlp.layers.TransformerDecoder`. +is enabled by default in `keras_hub.layers.TransformerDecoder`. We also need to mask the padding tokens (`"[PAD]"`). For this, we can set the -`mask_zero` argument of the `keras_nlp.layers.TokenAndPositionEmbedding` layer +`mask_zero` argument of the `keras_hub.layers.TokenAndPositionEmbedding` layer to True. This will then be propagated to all subsequent layers. @@ -422,13 +422,13 @@ to True. This will then be propagated to all subsequent layers. # Encoder encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs") -x = keras_nlp.layers.TokenAndPositionEmbedding( +x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=ENG_VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM, )(encoder_inputs) -encoder_outputs = keras_nlp.layers.TransformerEncoder( +encoder_outputs = keras_hub.layers.TransformerEncoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(inputs=x) encoder = keras.Model(encoder_inputs, encoder_outputs) @@ -438,13 +438,13 @@ encoder = keras.Model(encoder_inputs, encoder_outputs) decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs") encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs") -x = keras_nlp.layers.TokenAndPositionEmbedding( +x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=SPA_VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM, )(decoder_inputs) -x = keras_nlp.layers.TransformerDecoder( +x = keras_hub.layers.TransformerDecoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(decoder_sequence=x, encoder_sequence=encoded_seq_inputs) x = keras.layers.Dropout(0.5)(x) @@ -550,8 +550,8 @@ as well as the target token `"[START]"`. The model outputs probabilities of the next token. We then we repeatedly generated the next token conditioned on the tokens generated so far, until we hit the token `"[END]"`. -For decoding, we will use the `keras_nlp.samplers` module from -KerasNLP. Greedy Decoding is a text decoding method which outputs the most +For decoding, we will use the `keras_hub.samplers` module from +KerasHub. 
Greedy Decoding is a text decoding method which outputs the most likely next token at each time step, i.e., the token with the highest probability. @@ -582,7 +582,7 @@ def decode_sequences(input_sentences): pad = ops.full((batch_size, length - 1), spa_tokenizer.token_to_id("[PAD]")) prompt = ops.concatenate((start, pad), axis=-1) - generated_tokens = keras_nlp.samplers.GreedySampler()( + generated_tokens = keras_hub.samplers.GreedySampler()( next, prompt, stop_token_ids=[spa_tokenizer.token_to_id("[END]")], @@ -644,8 +644,8 @@ expensive process). ```python -rouge_1 = keras_nlp.metrics.RougeN(order=1) -rouge_2 = keras_nlp.metrics.RougeN(order=2) +rouge_1 = keras_hub.metrics.RougeN(order=1) +rouge_2 = keras_hub.metrics.RougeN(order=2) for test_pair in test_pairs[:30]: input_sentence = test_pair[0] diff --git a/examples/nlp/md/parameter_efficient_finetuning_of_gpt2_with_lora.md b/examples/nlp/md/parameter_efficient_finetuning_of_gpt2_with_lora.md index cf8c2c83d4..728cdaf105 100644 --- a/examples/nlp/md/parameter_efficient_finetuning_of_gpt2_with_lora.md +++ b/examples/nlp/md/parameter_efficient_finetuning_of_gpt2_with_lora.md @@ -3,7 +3,7 @@ **Author:** [Abheesht Sharma](https://github.com/abheesht17/), [Matthew Watson](https://github.com/mattdangerw/)
**Date created:** 2023/05/27
**Last modified:** 2023/05/27
-**Description:** Use KerasNLP to fine-tune a GPT-2 LLM with LoRA. +**Description:** Use KerasHub to fine-tune a GPT-2 LLM with LoRA. [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/parameter_efficient_finetuning_of_gpt2_with_lora.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora.py) @@ -29,8 +29,8 @@ decrease in training time and GPU memory usage, while maintaining the quality of the outputs. In this example, we will explain LoRA in technical terms, show how the technical -explanation translates to code, hack KerasNLP's -[GPT-2 model](https://keras.io/api/keras_nlp/models/gpt2/) and fine-tune +explanation translates to code, hack KerasHub's +[GPT-2 model](https://keras.io/api/keras_hub/models/gpt2/) and fine-tune it on the next token prediction task using LoRA. We will compare LoRA GPT-2 with a fully fine-tuned GPT-2 in terms of the quality of the generated text, training time and GPU memory usage. @@ -44,14 +44,14 @@ backends. ## Setup Before we start implementing the pipeline, let's install and import all the -libraries we need. We'll be using the KerasNLP library. +libraries we need. We'll be using the KerasHub library. Secondly, let's enable mixed precision training. This will help us reduce the training time. ```python -!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras-hub !pip install -q --upgrade keras # Upgrade to Keras 3. ``` @@ -61,7 +61,7 @@ import os os.environ["KERAS_BACKEND"] = "tensorflow" -import keras_nlp +import keras_hub import keras import matplotlib.pyplot as plt import tensorflow as tf @@ -239,11 +239,11 @@ on Colab. ```python -preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( "gpt2_base_en", sequence_length=MAX_SEQUENCE_LENGTH, ) -gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset( +gpt2_lm = keras_hub.models.GPT2CausalLM.from_preset( "gpt2_base_en", preprocessor=preprocessor ) @@ -575,11 +575,11 @@ del loss tf.config.experimental.reset_memory_stats("GPU:0") # Load the original model. -preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( "gpt2_base_en", sequence_length=128, ) -lora_model = keras_nlp.models.GPT2CausalLM.from_preset( +lora_model = keras_hub.models.GPT2CausalLM.from_preset( "gpt2_base_en", preprocessor=preprocessor, ) diff --git a/examples/nlp/md/semantic_similarity_with_keras_nlp.md b/examples/nlp/md/semantic_similarity_with_keras_hub.md similarity index 91% rename from examples/nlp/md/semantic_similarity_with_keras_nlp.md rename to examples/nlp/md/semantic_similarity_with_keras_hub.md index 8cf554fc9b..58f1d8dadc 100644 --- a/examples/nlp/md/semantic_similarity_with_keras_nlp.md +++ b/examples/nlp/md/semantic_similarity_with_keras_hub.md @@ -1,12 +1,12 @@ -# Semantic Similarity with KerasNLP +# Semantic Similarity with KerasHub **Author:** [Anshuman Mishra](https://github.com/shivance/)
**Date created:** 2023/02/25
**Last modified:** 2023/02/25
-**Description:** Use pretrained models from KerasNLP for the Semantic Similarity Task. +**Description:** Use pretrained models from KerasHub for the Semantic Similarity Task. - [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/semantic_similarity_with_keras_nlp.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/semantic_similarity_with_keras_nlp.py) + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/semantic_similarity_with_keras_hub.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/semantic_similarity_with_keras_hub.py) @@ -17,10 +17,10 @@ Semantic similarity refers to the task of determining the degree of similarity b sentences in terms of their meaning. We already saw in [this](https://keras.io/examples/nlp/semantic_similarity_with_bert/) example how to use SNLI (Stanford Natural Language Inference) corpus to predict sentence semantic similarity with the HuggingFace Transformers library. In this tutorial we will -learn how to use [KerasNLP](https://keras.io/keras_nlp/), an extension of the core Keras API, -for the same task. Furthermore, we will discover how KerasNLP effectively reduces boilerplate -code and simplifies the process of building and utilizing models. For more information on KerasNLP, -please refer to [KerasNLP's official documentation](https://keras.io/keras_nlp/). +learn how to use [KerasHub](https://keras.io/keras_hub/), an extension of the core Keras API, +for the same task. Furthermore, we will discover how KerasHub effectively reduces boilerplate +code and simplifies the process of building and utilizing models. For more information on KerasHub, +please refer to [KerasHub's official documentation](https://keras.io/keras_hub/). This guide is broken down into the following parts: @@ -35,13 +35,13 @@ This guide is broken down into the following parts: The following guide uses [Keras Core](https://keras.io/keras_core/) to work in any of `tensorflow`, `jax` or `torch`. Support for Keras Core is baked into -KerasNLP, simply change the `KERAS_BACKEND` environment variable below to change +KerasHub, simply change the `KERAS_BACKEND` environment variable below to change the backend you would like to use. We select the `jax` backend below, which will give us a particularly fast train step below. ```python -!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras-hub !pip install -q --upgrade keras # Upgrade to Keras 3. ``` @@ -49,7 +49,7 @@ give us a particularly fast train step below. import numpy as np import tensorflow as tf import keras -import keras_nlp +import keras_hub import tensorflow_datasets as tfds ```
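For context on the fields that the `filter_labels` hunk below touches: each record in the TFDS `snli` dataset is a dictionary of string `premise`/`hypothesis` features plus an integer `label`, with `-1` marking pairs that have no gold label. A minimal sketch (the split choice here is arbitrary, not the one used in the example):

```python
import tensorflow_datasets as tfds

# Peek at a single SNLI record to see the structure the preprocessing relies on.
sample_ds = tfds.load("snli", split="validation")
for record in sample_ds.take(1):
    print(record["premise"])
    print(record["hypothesis"])
    print(record["label"])  # -1 means "no gold label" and gets filtered out
```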
@@ -122,7 +122,7 @@ def filter_labels(sample): ``` Here's a utility function that splits the example into an `(x, y)` tuple that is suitable -for `model.fit()`. By default, `keras_nlp.models.BertClassifier` will tokenize and pack +for `model.fit()`. By default, `keras_hub.models.BertClassifier` will tokenize and pack together raw strings using a `"[SEP]"` token during training. Therefore, this label splitting is all the data preparation that we need to perform. @@ -156,12 +156,12 @@ test_ds = ( --- ## Establishing baseline with BERT. -We use the BERT model from KerasNLP to establish a baseline for our semantic similarity -task. The `keras_nlp.models.BertClassifier` class attaches a classification head to the BERT +We use the BERT model from KerasHub to establish a baseline for our semantic similarity +task. The `keras_hub.models.BertClassifier` class attaches a classification head to the BERT Backbone, mapping the backbone outputs to a logit output suitable for a classification task. This significantly reduces the need for custom code. -KerasNLP models have built-in tokenization capabilities that handle tokenization by default +KerasHub models have built-in tokenization capabilities that handle tokenization by default based on the selected model. However, users can also use custom preprocessing techniques as per their specific needs. If we pass a tuple as input, the model will tokenize all the strings and concatenate them with a `"[SEP]"` separator. @@ -171,14 +171,14 @@ to use our own preprocessor. For the SNLI dataset, we set `num_classes` to 3. ```python -bert_classifier = keras_nlp.models.BertClassifier.from_preset( +bert_classifier = keras_hub.models.BertClassifier.from_preset( "bert_tiny_en_uncased", num_classes=3 ) ``` Please note that the BERT Tiny model has only 4,386,307 trainable parameters. -KerasNLP task models come with compilation defaults. We can now train the model we just +KerasHub task models come with compilation defaults. We can now train the model we just instantiated by calling the `fit()` method. @@ -218,7 +218,7 @@ higher learning rate. ```python -bert_classifier = keras_nlp.models.BertClassifier.from_preset( +bert_classifier = keras_hub.models.BertClassifier.from_preset( "bert_tiny_en_uncased", num_classes=3 ) bert_classifier.compile( @@ -271,7 +271,7 @@ class TriangularSchedule(keras.optimizers.schedules.LearningRateSchedule): return keras.ops.maximum(triangular_rate, 0.0) -bert_classifier = keras_nlp.models.BertClassifier.from_preset( +bert_classifier = keras_hub.models.BertClassifier.from_preset( "bert_tiny_en_uncased", num_classes=3 ) @@ -353,7 +353,7 @@ restored_model.evaluate(test_ds) --- ## Performing inference with the model. -Let's see how to perform inference with KerasNLP models +Let's see how to perform inference with KerasHub models ```python @@ -381,7 +381,7 @@ sample ```
-The default preprocessor in KerasNLP models handles input tokenization automatically, +The default preprocessor in KerasHub models handles input tokenization automatically, so we don't need to perform tokenization explicitly. @@ -407,13 +407,13 @@ predictions = softmax(predictions) ## Improving accuracy with RoBERTa Now that we have established a baseline, we can attempt to improve our results -by experimenting with different models. Thanks to KerasNLP, fine-tuning a RoBERTa +by experimenting with different models. Thanks to KerasHub, fine-tuning a RoBERTa checkpoint on the same dataset is easy with just a few lines of code. ```python # Inittializing a RoBERTa from preset -roberta_classifier = keras_nlp.models.RobertaClassifier.from_preset( +roberta_classifier = keras_hub.models.RobertaClassifier.from_preset( "roberta_base_en", num_classes=3 ) @@ -455,13 +455,13 @@ print(tf.math.argmax(predictions, axis=1).numpy()) ``` We hope this tutorial has been helpful in demonstrating the ease and effectiveness -of using KerasNLP and BERT for semantic similarity tasks. +of using KerasHub and BERT for semantic similarity tasks. Throughout this tutorial, we demonstrated how to use a pretrained BERT model to establish a baseline and improve performance by training a larger RoBERTa model using just a few lines of code. -The KerasNLP toolbox provides a range of modular building blocks for preprocessing +The KerasHub toolbox provides a range of modular building blocks for preprocessing text, including pretrained state-of-the-art models and low-level Transformer Encoder layers. We believe that this makes experimenting with natural language solutions more accessible and efficient. diff --git a/examples/nlp/md/sentence_embeddings_with_sbert.md b/examples/nlp/md/sentence_embeddings_with_sbert.md index b40a19b62a..62799c04f9 100644 --- a/examples/nlp/md/sentence_embeddings_with_sbert.md +++ b/examples/nlp/md/sentence_embeddings_with_sbert.md @@ -3,7 +3,7 @@ **Author:** [Mohammed Abu El-Nasr](https://github.com/abuelnasr0)
**Date created:** 2023/07/14
**Last modified:** 2023/07/14
-**Description:** Fine-tune a RoBERTa model to generate sentence embeddings using KerasNLP. +**Description:** Fine-tune a RoBERTa model to generate sentence embeddings using KerasHub. [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/examples/nlp/ipynb/sentence_embeddings_with_sbert.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/examples/nlp/sentence_embeddings_with_sbert.py) @@ -46,7 +46,7 @@ This method of fine-tuning was introduced in --- ## Setup -Let's install and import the libraries we need. We'll be using the KerasNLP library in +Let's install and import the libraries we need. We'll be using the KerasHub library in this example. We will also enable [mixed precision](https://www.tensorflow.org/guide/mixed_precision) @@ -54,7 +54,7 @@ training. This will help us reduce the training time. ```python -!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras-hub !pip install -q --upgrade keras # Upgrade to Keras 3. ``` @@ -65,7 +65,7 @@ import os os.environ["KERAS_BACKEND"] = "tensorflow" import keras -import keras_nlp +import keras_hub import tensorflow as tf import tensorflow_datasets as tfds import sklearn.cluster as cluster @@ -225,8 +225,8 @@ layer to exclude padded tokens from being averaged. ```python -preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") -backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en") +preprocessor = keras_hub.models.RobertaPreprocessor.from_preset("roberta_base_en") +backbone = keras_hub.models.RobertaBackbone.from_preset("roberta_base_en") inputs = keras.Input(shape=(1,), dtype="string", name="sentence") x = preprocessor(inputs) h = backbone(x) @@ -485,8 +485,8 @@ sentence. ```python -preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") -backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en") +preprocessor = keras_hub.models.RobertaPreprocessor.from_preset("roberta_base_en") +backbone = keras_hub.models.RobertaBackbone.from_preset("roberta_base_en") input = keras.Input(shape=(1,), dtype="string", name="sentence") x = preprocessor(input) diff --git a/examples/nlp/md/t5_hf_summarization.md b/examples/nlp/md/t5_hf_summarization.md index 707521ae7b..48f60a9537 100644 --- a/examples/nlp/md/t5_hf_summarization.md +++ b/examples/nlp/md/t5_hf_summarization.md @@ -42,7 +42,7 @@ task using Hugging Face Transformers on the `XSum` dataset loaded from Hugging F ```python !pip install transformers==4.20.0 -!pip install keras_nlp==0.3.0 +!pip install keras_hub==0.3.0 !pip install datasets !pip install huggingface-hub !pip install nltk @@ -333,9 +333,9 @@ calculate the `ROUGE` score between the groud-truth and predictions. ```python -import keras_nlp +import keras_hub -rouge_l = keras_nlp.metrics.RougeL() +rouge_l = keras_hub.metrics.RougeL() def metric_fn(eval_predictions): diff --git a/examples/nlp/multiple_choice_task_with_transfer_learning.py b/examples/nlp/multiple_choice_task_with_transfer_learning.py index 7c812c3389..c83c3eb852 100644 --- a/examples/nlp/multiple_choice_task_with_transfer_learning.py +++ b/examples/nlp/multiple_choice_task_with_transfer_learning.py @@ -23,7 +23,7 @@ """shell """ -import keras_nlp +import keras_hub import keras import tensorflow as tf # For tf.data only. @@ -143,12 +143,12 @@ def make_options(row): making subsequent steps smoother. 
Explore the following pages to access the available preprocessing and tokenizer layers in -**KerasNLP**: -- [Preprocessing](https://keras.io/api/keras_nlp/preprocessing_layers/) -- [Tokenizers](https://keras.io/api/keras_nlp/tokenizers/) +**KerasHub**: +- [Preprocessing](https://keras.io/api/keras_hub/preprocessing_layers/) +- [Tokenizers](https://keras.io/api/keras_hub/tokenizers/) """ -preprocessor = keras_nlp.models.DebertaV3Preprocessor.from_preset( +preprocessor = keras_hub.models.DebertaV3Preprocessor.from_preset( preset=CFG.preset, # Name of the model sequence_length=CFG.sequence_length, # Max sequence length, will be padded if shorter ) @@ -378,15 +378,15 @@ def get_callbacks(): ### Pre-trained Models -The `KerasNLP` library provides comprehensive, ready-to-use implementations of popular +The `KerasHub` library provides comprehensive, ready-to-use implementations of popular NLP model architectures. It features a variety of pre-trained models including `Bert`, `Roberta`, `DebertaV3`, and more. In this notebook, we'll showcase the usage of -`DistillBert`. However, feel free to explore all available models in the [KerasNLP -documentation](https://keras.io/api/keras_nlp/models/). Also for a deeper understanding -of `KerasNLP`, refer to the informative [getting started -guide](https://keras.io/guides/keras_nlp/getting_started/). +`DistillBert`. However, feel free to explore all available models in the [KerasHub +documentation](https://keras.io/api/keras_hub/models/). Also for a deeper understanding +of `KerasHub`, refer to the informative [getting started +guide](https://keras.io/guides/keras_hub/getting_started/). -Our approach involves using `keras_nlp.models.XXClassifier` to process each question and +Our approach involves using `keras_hub.models.XXClassifier` to process each question and option pari (e.g. (Q+A), (Q+B), etc.), generating logits. These logits are then combined and passed through a softmax function to produce the final output. 
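To make the combination step concrete, here is a toy sketch of the wiring (illustrative only; `shared_scorer`, the dummy input shape, and the layer sizes are stand-ins, not the actual `DebertaV3Classifier`-based `build_model()` shown below): each option is scored by the same shared model to a single logit, and the five logits are concatenated and softmaxed so the options compete with each other.

```python
import keras

num_options = 5  # options A..E

# Stand-in for the shared classifier that emits one logit per (question, option) pair.
shared_scorer = keras.Sequential(
    [keras.layers.Dense(16, activation="relu"), keras.layers.Dense(1)]
)

# One dummy input per option; in the real model these are token ids and masks.
option_inputs = [keras.Input(shape=(32,), name=f"option_{i}") for i in range(num_options)]
option_logits = [shared_scorer(x) for x in option_inputs]   # five (batch, 1) tensors
logits = keras.layers.Concatenate(axis=-1)(option_logits)   # (batch, 5)
outputs = keras.layers.Softmax(axis=-1)(logits)             # one probability per option

toy_model = keras.Model(option_inputs, outputs)
toy_model.summary()
```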
""" @@ -459,7 +459,7 @@ def build_model(): ), } # Create a DebertaV3Classifier model - classifier = keras_nlp.models.DebertaV3Classifier.from_preset( + classifier = keras_hub.models.DebertaV3Classifier.from_preset( CFG.preset, preprocessor=None, num_classes=1, # one output per one option, for five options total 5 outputs @@ -551,7 +551,7 @@ def build_model(): ## Reference * [Multiple Choice with HF](https://twitter.com/johnowhitaker/status/1689790373454041089?s=20) -* [Keras NLP](https://keras.io/api/keras_nlp/) +* [Keras NLP](https://keras.io/api/keras_hub/) * [BirdCLEF23: Pretraining is All you Need [Train]](https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train) [Train]](https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train) diff --git a/examples/nlp/neural_machine_translation_with_keras_nlp.py b/examples/nlp/neural_machine_translation_with_keras_hub.py similarity index 87% rename from examples/nlp/neural_machine_translation_with_keras_nlp.py rename to examples/nlp/neural_machine_translation_with_keras_hub.py index 7ff2a9c4fa..dcd260c5c4 100644 --- a/examples/nlp/neural_machine_translation_with_keras_nlp.py +++ b/examples/nlp/neural_machine_translation_with_keras_hub.py @@ -1,39 +1,39 @@ """ -Title: English-to-Spanish translation with KerasNLP +Title: English-to-Spanish translation with KerasHub Author: [Abheesht Sharma](https://github.com/abheesht17/) Date created: 2022/05/26 Last modified: 2024/04/30 -Description: Use KerasNLP to train a sequence-to-sequence Transformer model on the machine translation task. +Description: Use KerasHub to train a sequence-to-sequence Transformer model on the machine translation task. Accelerator: GPU """ """ ## Introduction -KerasNLP provides building blocks for NLP (model layers, tokenizers, metrics, etc.) and +KerasHub provides building blocks for NLP (model layers, tokenizers, metrics, etc.) and makes it convenient to construct NLP pipelines. -In this example, we'll use KerasNLP layers to build an encoder-decoder Transformer +In this example, we'll use KerasHub layers to build an encoder-decoder Transformer model, and train it on the English-to-Spanish machine translation task. This example is based on the [English-to-Spanish NMT example](https://keras.io/examples/nlp/neural_machine_translation_with_transformer/) by [fchollet](https://twitter.com/fchollet). The original example is more low-level -and implements layers from scratch, whereas this example uses KerasNLP to show +and implements layers from scratch, whereas this example uses KerasHub to show some more advanced approaches, such as subword tokenization and using metrics to compute the quality of generated translations. You'll learn how to: -- Tokenize text using `keras_nlp.tokenizers.WordPieceTokenizer`. -- Implement a sequence-to-sequence Transformer model using KerasNLP's -`keras_nlp.layers.TransformerEncoder`, `keras_nlp.layers.TransformerDecoder` and -`keras_nlp.layers.TokenAndPositionEmbedding` layers, and train it. -- Use `keras_nlp.samplers` to generate translations of unseen input sentences +- Tokenize text using `keras_hub.tokenizers.WordPieceTokenizer`. +- Implement a sequence-to-sequence Transformer model using KerasHub's +`keras_hub.layers.TransformerEncoder`, `keras_hub.layers.TransformerDecoder` and +`keras_hub.layers.TokenAndPositionEmbedding` layers, and train it. +- Use `keras_hub.samplers` to generate translations of unseen input sentences using the top-p decoding strategy! 
-Don't worry if you aren't familiar with KerasNLP. This tutorial will start with +Don't worry if you aren't familiar with KerasHub. This tutorial will start with the basics. Let's dive right in! """ @@ -45,11 +45,11 @@ """shell pip install -q --upgrade rouge-score -pip install -q --upgrade keras-nlp +pip install -q --upgrade keras-hub pip install -q --upgrade keras # Upgrade to Keras 3. """ -import keras_nlp +import keras_hub import pathlib import random @@ -136,8 +136,8 @@ We'll define two tokenizers - one for the source language (English), and the other for the target language (Spanish). We'll be using -`keras_nlp.tokenizers.WordPieceTokenizer` to tokenize the text. -`keras_nlp.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary +`keras_hub.tokenizers.WordPieceTokenizer` to tokenize the text. +`keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary and has functions for tokenizing the text, and detokenizing sequences of tokens. Before we define the two tokenizers, we first need to train them on the dataset @@ -145,15 +145,15 @@ training it on a corpus gives us a vocabulary of subwords. A subword tokenizer is a compromise between word tokenizers (word tokenizers need very large vocabularies for good coverage of input words), and character tokenizers -(characters don't really encode meaning like words do). Luckily, KerasNLP +(characters don't really encode meaning like words do). Luckily, KerasHub makes it very simple to train WordPiece on a corpus with the -`keras_nlp.tokenizers.compute_word_piece_vocabulary` utility. +`keras_hub.tokenizers.compute_word_piece_vocabulary` utility. """ def train_word_piece(text_samples, vocab_size, reserved_tokens): word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples) - vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary( + vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( word_piece_ds.batch(1000).prefetch(2), vocabulary_size=vocab_size, reserved_tokens=reserved_tokens, @@ -191,10 +191,10 @@ def train_word_piece(text_samples, vocab_size, reserved_tokens): the vocabularies trained above. """ -eng_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +eng_tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=eng_vocab, lowercase=False ) -spa_tokenizer = keras_nlp.tokenizers.WordPieceTokenizer( +spa_tokenizer = keras_hub.tokenizers.WordPieceTokenizer( vocabulary=spa_vocab, lowercase=False ) @@ -244,7 +244,7 @@ def train_word_piece(text_samples, vocab_size, reserved_tokens): We will add special tokens, `"[START]"` and `"[END]"`, to the input Spanish sentence after tokenizing the text. We will also pad the input to a fixed length. -This can be easily done using `keras_nlp.layers.StartEndPacker`. +This can be easily done using `keras_hub.layers.StartEndPacker`. """ @@ -255,14 +255,14 @@ def preprocess_batch(eng, spa): spa = spa_tokenizer(spa) # Pad `eng` to `MAX_SEQUENCE_LENGTH`. - eng_start_end_packer = keras_nlp.layers.StartEndPacker( + eng_start_end_packer = keras_hub.layers.StartEndPacker( sequence_length=MAX_SEQUENCE_LENGTH, pad_value=eng_tokenizer.token_to_id("[PAD]"), ) eng = eng_start_end_packer(eng) # Add special tokens (`"[START]"` and `"[END]"`) to `spa` and pad it as well. 
- spa_start_end_packer = keras_nlp.layers.StartEndPacker( + spa_start_end_packer = keras_hub.layers.StartEndPacker( sequence_length=MAX_SEQUENCE_LENGTH + 1, start_value=spa_tokenizer.token_to_id("[START]"), end_value=spa_tokenizer.token_to_id("[END]"), @@ -310,40 +310,40 @@ def make_dataset(pairs): We first need an embedding layer, i.e., a vector for every token in our input sequence. This embedding layer can be initialised randomly. We also need a positional embedding layer which encodes the word order in the sequence. The convention is -to add these two embeddings. KerasNLP has a `keras_nlp.layers.TokenAndPositionEmbedding ` +to add these two embeddings. KerasHub has a `keras_hub.layers.TokenAndPositionEmbedding ` layer which does all of the above steps for us. -Our sequence-to-sequence Transformer consists of a `keras_nlp.layers.TransformerEncoder` -layer and a `keras_nlp.layers.TransformerDecoder` layer chained together. +Our sequence-to-sequence Transformer consists of a `keras_hub.layers.TransformerEncoder` +layer and a `keras_hub.layers.TransformerDecoder` layer chained together. -The source sequence will be passed to `keras_nlp.layers.TransformerEncoder`, which +The source sequence will be passed to `keras_hub.layers.TransformerEncoder`, which will produce a new representation of it. This new representation will then be passed -to the `keras_nlp.layers.TransformerDecoder`, together with the target sequence -so far (target words 0 to N). The `keras_nlp.layers.TransformerDecoder` will +to the `keras_hub.layers.TransformerDecoder`, together with the target sequence +so far (target words 0 to N). The `keras_hub.layers.TransformerDecoder` will then seek to predict the next words in the target sequence (N+1 and beyond). A key detail that makes this possible is causal masking. -The `keras_nlp.layers.TransformerDecoder` sees the entire sequence at once, and +The `keras_hub.layers.TransformerDecoder` sees the entire sequence at once, and thus we must make sure that it only uses information from target tokens 0 to N when predicting token N+1 (otherwise, it could use information from the future, which would result in a model that cannot be used at inference time). Causal masking -is enabled by default in `keras_nlp.layers.TransformerDecoder`. +is enabled by default in `keras_hub.layers.TransformerDecoder`. We also need to mask the padding tokens (`"[PAD]"`). For this, we can set the -`mask_zero` argument of the `keras_nlp.layers.TokenAndPositionEmbedding` layer +`mask_zero` argument of the `keras_hub.layers.TokenAndPositionEmbedding` layer to True. This will then be propagated to all subsequent layers. 
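For intuition, this is what a causal mask looks like for a short target sequence (purely illustrative; the mask is constructed internally by `keras_hub.layers.TransformerDecoder`, so you never build it yourself in this example):

```python
import numpy as np

seq_len = 5
# Position i may only attend to positions <= i, so everything above the
# diagonal is masked out.
causal_mask = np.tril(np.ones((seq_len, seq_len), dtype="int32"))
print(causal_mask)
# [[1 0 0 0 0]
#  [1 1 0 0 0]
#  [1 1 1 0 0]
#  [1 1 1 1 0]
#  [1 1 1 1 1]]
```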
""" # Encoder encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs") -x = keras_nlp.layers.TokenAndPositionEmbedding( +x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=ENG_VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM, )(encoder_inputs) -encoder_outputs = keras_nlp.layers.TransformerEncoder( +encoder_outputs = keras_hub.layers.TransformerEncoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(inputs=x) encoder = keras.Model(encoder_inputs, encoder_outputs) @@ -353,13 +353,13 @@ def make_dataset(pairs): decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs") encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs") -x = keras_nlp.layers.TokenAndPositionEmbedding( +x = keras_hub.layers.TokenAndPositionEmbedding( vocabulary_size=SPA_VOCAB_SIZE, sequence_length=MAX_SEQUENCE_LENGTH, embedding_dim=EMBED_DIM, )(decoder_inputs) -x = keras_nlp.layers.TransformerDecoder( +x = keras_hub.layers.TransformerDecoder( intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS )(decoder_sequence=x, encoder_sequence=encoded_seq_inputs) x = keras.layers.Dropout(0.5)(x) @@ -407,8 +407,8 @@ def make_dataset(pairs): next token. We then we repeatedly generated the next token conditioned on the tokens generated so far, until we hit the token `"[END]"`. -For decoding, we will use the `keras_nlp.samplers` module from -KerasNLP. Greedy Decoding is a text decoding method which outputs the most +For decoding, we will use the `keras_hub.samplers` module from +KerasHub. Greedy Decoding is a text decoding method which outputs the most likely next token at each time step, i.e., the token with the highest probability. """ @@ -438,7 +438,7 @@ def next(prompt, cache, index): pad = ops.full((batch_size, length - 1), spa_tokenizer.token_to_id("[PAD]")) prompt = ops.concatenate((start, pad), axis=-1) - generated_tokens = keras_nlp.samplers.GreedySampler()( + generated_tokens = keras_hub.samplers.GreedySampler()( next, prompt, stop_token_ids=[spa_tokenizer.token_to_id("[END]")], @@ -477,8 +477,8 @@ def next(prompt, cache, index): expensive process). """ -rouge_1 = keras_nlp.metrics.RougeN(order=1) -rouge_2 = keras_nlp.metrics.RougeN(order=2) +rouge_1 = keras_hub.metrics.RougeN(order=1) +rouge_2 = keras_hub.metrics.RougeN(order=2) for test_pair in test_pairs[:30]: input_sentence = test_pair[0] diff --git a/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora.py b/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora.py index bc0440f56d..830cdc09c6 100644 --- a/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora.py +++ b/examples/nlp/parameter_efficient_finetuning_of_gpt2_with_lora.py @@ -3,7 +3,7 @@ Author: [Abheesht Sharma](https://github.com/abheesht17/), [Matthew Watson](https://github.com/mattdangerw/) Date created: 2023/05/27 Last modified: 2023/05/27 -Description: Use KerasNLP to fine-tune a GPT-2 LLM with LoRA. +Description: Use KerasHub to fine-tune a GPT-2 LLM with LoRA. Accelerator: GPU """ @@ -26,8 +26,8 @@ of the outputs. In this example, we will explain LoRA in technical terms, show how the technical -explanation translates to code, hack KerasNLP's -[GPT-2 model](https://keras.io/api/keras_nlp/models/gpt2/) and fine-tune +explanation translates to code, hack KerasHub's +[GPT-2 model](https://keras.io/api/keras_hub/models/gpt2/) and fine-tune it on the next token prediction task using LoRA. 
We will compare LoRA GPT-2 with a fully fine-tuned GPT-2 in terms of the quality of the generated text, training time and GPU memory usage. @@ -42,14 +42,14 @@ ## Setup Before we start implementing the pipeline, let's install and import all the -libraries we need. We'll be using the KerasNLP library. +libraries we need. We'll be using the KerasHub library. Secondly, let's enable mixed precision training. This will help us reduce the training time. """ """shell -pip install -q --upgrade keras-nlp +pip install -q --upgrade keras-hub pip install -q --upgrade keras # Upgrade to Keras 3. """ @@ -57,7 +57,7 @@ os.environ["KERAS_BACKEND"] = "tensorflow" -import keras_nlp +import keras_hub import keras import matplotlib.pyplot as plt import tensorflow as tf @@ -219,11 +219,11 @@ def get_optimizer_and_loss(): on Colab. """ -preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( "gpt2_base_en", sequence_length=MAX_SEQUENCE_LENGTH, ) -gpt2_lm = keras_nlp.models.GPT2CausalLM.from_preset( +gpt2_lm = keras_hub.models.GPT2CausalLM.from_preset( "gpt2_base_en", preprocessor=preprocessor ) @@ -438,11 +438,11 @@ def call(self, inputs): tf.config.experimental.reset_memory_stats("GPU:0") # Load the original model. -preprocessor = keras_nlp.models.GPT2CausalLMPreprocessor.from_preset( +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( "gpt2_base_en", sequence_length=128, ) -lora_model = keras_nlp.models.GPT2CausalLM.from_preset( +lora_model = keras_hub.models.GPT2CausalLM.from_preset( "gpt2_base_en", preprocessor=preprocessor, ) diff --git a/examples/nlp/semantic_similarity_with_keras_nlp.py b/examples/nlp/semantic_similarity_with_keras_hub.py similarity index 88% rename from examples/nlp/semantic_similarity_with_keras_nlp.py rename to examples/nlp/semantic_similarity_with_keras_hub.py index 60eb0203f3..409177406e 100644 --- a/examples/nlp/semantic_similarity_with_keras_nlp.py +++ b/examples/nlp/semantic_similarity_with_keras_hub.py @@ -1,9 +1,9 @@ """ -Title: Semantic Similarity with KerasNLP +Title: Semantic Similarity with KerasHub Author: [Anshuman Mishra](https://github.com/shivance/) Date created: 2023/02/25 Last modified: 2023/02/25 -Description: Use pretrained models from KerasNLP for the Semantic Similarity Task. +Description: Use pretrained models from KerasHub for the Semantic Similarity Task. Accelerator: GPU """ @@ -14,10 +14,10 @@ sentences in terms of their meaning. We already saw in [this](https://keras.io/examples/nlp/semantic_similarity_with_bert/) example how to use SNLI (Stanford Natural Language Inference) corpus to predict sentence semantic similarity with the HuggingFace Transformers library. In this tutorial we will -learn how to use [KerasNLP](https://keras.io/keras_nlp/), an extension of the core Keras API, -for the same task. Furthermore, we will discover how KerasNLP effectively reduces boilerplate -code and simplifies the process of building and utilizing models. For more information on KerasNLP, -please refer to [KerasNLP's official documentation](https://keras.io/keras_nlp/). +learn how to use [KerasHub](https://keras.io/keras_hub/), an extension of the core Keras API, +for the same task. Furthermore, we will discover how KerasHub effectively reduces boilerplate +code and simplifies the process of building and utilizing models. For more information on KerasHub, +please refer to [KerasHub's official documentation](https://keras.io/keras_hub/). 
This guide is broken down into the following parts: @@ -31,20 +31,20 @@ The following guide uses [Keras Core](https://keras.io/keras_core/) to work in any of `tensorflow`, `jax` or `torch`. Support for Keras Core is baked into -KerasNLP, simply change the `KERAS_BACKEND` environment variable below to change +KerasHub, simply change the `KERAS_BACKEND` environment variable below to change the backend you would like to use. We select the `jax` backend below, which will give us a particularly fast train step below. """ """shell -pip install -q --upgrade keras-nlp +pip install -q --upgrade keras-hub pip install -q --upgrade keras # Upgrade to Keras 3. """ import numpy as np import tensorflow as tf import keras -import keras_nlp +import keras_hub import tensorflow_datasets as tfds """ @@ -90,7 +90,7 @@ def filter_labels(sample): """ Here's a utility function that splits the example into an `(x, y)` tuple that is suitable -for `model.fit()`. By default, `keras_nlp.models.BertClassifier` will tokenize and pack +for `model.fit()`. By default, `keras_hub.models.BertClassifier` will tokenize and pack together raw strings using a `"[SEP]"` token during training. Therefore, this label splitting is all the data preparation that we need to perform. """ @@ -122,12 +122,12 @@ def split_labels(sample): """ ## Establishing baseline with BERT. -We use the BERT model from KerasNLP to establish a baseline for our semantic similarity -task. The `keras_nlp.models.BertClassifier` class attaches a classification head to the BERT +We use the BERT model from KerasHub to establish a baseline for our semantic similarity +task. The `keras_hub.models.BertClassifier` class attaches a classification head to the BERT Backbone, mapping the backbone outputs to a logit output suitable for a classification task. This significantly reduces the need for custom code. -KerasNLP models have built-in tokenization capabilities that handle tokenization by default +KerasHub models have built-in tokenization capabilities that handle tokenization by default based on the selected model. However, users can also use custom preprocessing techniques as per their specific needs. If we pass a tuple as input, the model will tokenize all the strings and concatenate them with a `"[SEP]"` separator. @@ -136,14 +136,14 @@ def split_labels(sample): to use our own preprocessor. For the SNLI dataset, we set `num_classes` to 3. """ -bert_classifier = keras_nlp.models.BertClassifier.from_preset( +bert_classifier = keras_hub.models.BertClassifier.from_preset( "bert_tiny_en_uncased", num_classes=3 ) """ Please note that the BERT Tiny model has only 4,386,307 trainable parameters. -KerasNLP task models come with compilation defaults. We can now train the model we just +KerasHub task models come with compilation defaults. We can now train the model we just instantiated by calling the `fit()` method. """ @@ -164,7 +164,7 @@ def split_labels(sample): higher learning rate. """ -bert_classifier = keras_nlp.models.BertClassifier.from_preset( +bert_classifier = keras_hub.models.BertClassifier.from_preset( "bert_tiny_en_uncased", num_classes=3 ) bert_classifier.compile( @@ -207,7 +207,7 @@ def __call__(self, step): return keras.ops.maximum(triangular_rate, 0.0) -bert_classifier = keras_nlp.models.BertClassifier.from_preset( +bert_classifier = keras_hub.models.BertClassifier.from_preset( "bert_tiny_en_uncased", num_classes=3 ) @@ -255,7 +255,7 @@ def __call__(self, step): """ ## Performing inference with the model. 
-Let's see how to perform inference with KerasNLP models +Let's see how to perform inference with KerasHub models """ # Convert to Hypothesis-Premise pair, for forward pass through model @@ -263,7 +263,7 @@ def __call__(self, step): sample """ -The default preprocessor in KerasNLP models handles input tokenization automatically, +The default preprocessor in KerasHub models handles input tokenization automatically, so we don't need to perform tokenization explicitly. """ predictions = bert_classifier.predict(sample) @@ -280,12 +280,12 @@ def softmax(x): ## Improving accuracy with RoBERTa Now that we have established a baseline, we can attempt to improve our results -by experimenting with different models. Thanks to KerasNLP, fine-tuning a RoBERTa +by experimenting with different models. Thanks to KerasHub, fine-tuning a RoBERTa checkpoint on the same dataset is easy with just a few lines of code. """ # Inittializing a RoBERTa from preset -roberta_classifier = keras_nlp.models.RobertaClassifier.from_preset( +roberta_classifier = keras_hub.models.RobertaClassifier.from_preset( "roberta_base_en", num_classes=3 ) @@ -310,13 +310,13 @@ def softmax(x): """ We hope this tutorial has been helpful in demonstrating the ease and effectiveness -of using KerasNLP and BERT for semantic similarity tasks. +of using KerasHub and BERT for semantic similarity tasks. Throughout this tutorial, we demonstrated how to use a pretrained BERT model to establish a baseline and improve performance by training a larger RoBERTa model using just a few lines of code. -The KerasNLP toolbox provides a range of modular building blocks for preprocessing +The KerasHub toolbox provides a range of modular building blocks for preprocessing text, including pretrained state-of-the-art models and low-level Transformer Encoder layers. We believe that this makes experimenting with natural language solutions more accessible and efficient. diff --git a/examples/nlp/sentence_embeddings_with_sbert.py b/examples/nlp/sentence_embeddings_with_sbert.py index 2e3158ed82..4867eb28bc 100644 --- a/examples/nlp/sentence_embeddings_with_sbert.py +++ b/examples/nlp/sentence_embeddings_with_sbert.py @@ -3,7 +3,7 @@ Author: [Mohammed Abu El-Nasr](https://github.com/abuelnasr0) Date created: 2023/07/14 Last modified: 2023/07/14 -Description: Fine-tune a RoBERTa model to generate sentence embeddings using KerasNLP. +Description: Fine-tune a RoBERTa model to generate sentence embeddings using KerasHub. Accelerator: GPU """ @@ -44,7 +44,7 @@ """ ## Setup -Let's install and import the libraries we need. We'll be using the KerasNLP library in +Let's install and import the libraries we need. We'll be using the KerasHub library in this example. We will also enable [mixed precision](https://www.tensorflow.org/guide/mixed_precision) @@ -52,7 +52,7 @@ """ """shell -pip install -q --upgrade keras-nlp +pip install -q --upgrade keras-hub pip install -q --upgrade keras # Upgrade to Keras 3. """ @@ -61,7 +61,7 @@ os.environ["KERAS_BACKEND"] = "tensorflow" import keras -import keras_nlp +import keras_hub import tensorflow as tf import tensorflow_datasets as tfds import sklearn.cluster as cluster @@ -171,8 +171,8 @@ def prepare_dataset(dataset, num_batches, batch_size): - A normalization layer to normalize the embeddings as we are using the cosine similarity. 
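The normalization step is what lets cosine similarity be computed as a plain dot product between embeddings. A small numerical illustration, independent of the model defined next:

```python
import numpy as np

a = np.array([3.0, 4.0])
b = np.array([1.0, 2.0])

cosine = np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))

a_unit = a / np.linalg.norm(a)
b_unit = b / np.linalg.norm(b)

print(np.isclose(cosine, np.dot(a_unit, b_unit)))  # True
```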
""" -preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") -backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en") +preprocessor = keras_hub.models.RobertaPreprocessor.from_preset("roberta_base_en") +backbone = keras_hub.models.RobertaBackbone.from_preset("roberta_base_en") inputs = keras.Input(shape=(1,), dtype="string", name="sentence") x = preprocessor(inputs) h = backbone(x) @@ -347,8 +347,8 @@ def prepare_wiki_data(dataset, num_batches): - A mean pooling layer to produce the embeddings. """ -preprocessor = keras_nlp.models.RobertaPreprocessor.from_preset("roberta_base_en") -backbone = keras_nlp.models.RobertaBackbone.from_preset("roberta_base_en") +preprocessor = keras_hub.models.RobertaPreprocessor.from_preset("roberta_base_en") +backbone = keras_hub.models.RobertaBackbone.from_preset("roberta_base_en") input = keras.Input(shape=(1,), dtype="string", name="sentence") x = preprocessor(input) diff --git a/examples/nlp/t5_hf_summarization.py b/examples/nlp/t5_hf_summarization.py index d3073a7815..a9b1fc7249 100644 --- a/examples/nlp/t5_hf_summarization.py +++ b/examples/nlp/t5_hf_summarization.py @@ -42,7 +42,7 @@ """shell !pip install transformers==4.20.0 -!pip install keras_nlp==0.3.0 +!pip install keras_hub==0.3.0 !pip install datasets !pip install huggingface-hub !pip install nltk @@ -276,9 +276,9 @@ def preprocess_function(examples): calculate the `ROUGE` score between the groud-truth and predictions. """ -import keras_nlp +import keras_hub -rouge_l = keras_nlp.metrics.RougeL() +rouge_l = keras_hub.metrics.RougeL() def metric_fn(eval_predictions): diff --git a/guides/ipynb/keras_cv/classification_with_keras_cv.ipynb b/guides/ipynb/keras_cv/classification_with_keras_cv.ipynb index 944fa51617..be969ff5ac 100644 --- a/guides/ipynb/keras_cv/classification_with_keras_cv.ipynb +++ b/guides/ipynb/keras_cv/classification_with_keras_cv.ipynb @@ -41,7 +41,7 @@ "We use Professor Keras, the official Keras mascot, as a\n", "visual reference for the complexity of the material:\n", "\n", - "![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_evolution.png)" + "![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_evolution.png)" ] }, { @@ -94,7 +94,7 @@ "source": [ "## Inference with a pretrained classifier\n", "\n", - "![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_beginner.png)\n", + "![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_beginner.png)\n", "\n", "Let's get started with the simplest KerasCV API: a pretrained classifier.\n", "In this example, we will construct a classifier that was\n", @@ -264,7 +264,7 @@ "\n", "## Fine tuning a pretrained classifier\n", "\n", - "![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_intermediate.png)\n", + "![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_intermediate.png)\n", "\n", "When labeled images specific to our task are available, fine-tuning a custom\n", "classifier can improve performance.\n", @@ -414,7 +414,7 @@ "source": [ "## Train a Classifier from Scratch\n", "\n", - "![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_advanced.png)\n", + "![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_advanced.png)\n", "\n", "Now that we've gotten our hands dirty with classification, let's take on one\n", "last task: training a classification model from scratch!\n", diff --git 
a/guides/ipynb/keras_cv/object_detection_keras_cv.ipynb b/guides/ipynb/keras_cv/object_detection_keras_cv.ipynb index 37c8b3b784..cfe12b2314 100644 --- a/guides/ipynb/keras_cv/object_detection_keras_cv.ipynb +++ b/guides/ipynb/keras_cv/object_detection_keras_cv.ipynb @@ -130,7 +130,7 @@ "source": [ "## Perform detections with a pretrained model\n", "\n", - "![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_beginner.png)\n", + "![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_beginner.png)\n", "\n", "The highest level API in the KerasCV Object Detection API is the `keras_cv.models` API.\n", "This API includes fully pretrained object detection models, such as\n", @@ -484,7 +484,7 @@ "\n", "## Train a custom object detection model\n", "\n", - "![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_advanced.png)\n", + "![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_advanced.png)\n", "\n", "Whether you're an object detection amateur or a well seasoned veteran, assembling\n", "an object detection pipeline from scratch is a massive undertaking.\n", diff --git a/guides/ipynb/keras_cv/semantic_segmentation_deeplab_v3_plus.ipynb b/guides/ipynb/keras_cv/semantic_segmentation_deeplab_v3_plus.ipynb index 496df6e071..f47d002a11 100644 --- a/guides/ipynb/keras_cv/semantic_segmentation_deeplab_v3_plus.ipynb +++ b/guides/ipynb/keras_cv/semantic_segmentation_deeplab_v3_plus.ipynb @@ -20,7 +20,7 @@ "colab_type": "text" }, "source": [ - "![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_intermediate.png)\n", + "![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_intermediate.png)\n", "\n", "## Background\n", "Semantic segmentation is a type of computer vision task that involves assigning a\n", diff --git a/guides/ipynb/keras_hub/getting_started.ipynb b/guides/ipynb/keras_hub/getting_started.ipynb new file mode 100644 index 0000000000..e585a8a7e6 --- /dev/null +++ b/guides/ipynb/keras_hub/getting_started.ipynb @@ -0,0 +1,931 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Getting Started with KerasHub\n", + "\n", + "**Author:** [Jonathan Bischof](https://github.com/jbischof)
\n", + "**Date created:** 2022/12/15
\n", + "**Last modified:** 2023/07/01
\n", + "**Description:** An introduction to the KerasHub API." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Introduction\n", + "\n", + "KerasHub is a natural language processing library that supports users through\n", + "their entire development cycle. Our workflows are built from modular components\n", + "that have state-of-the-art preset weights and architectures when used\n", + "out-of-the-box and are easily customizable when more control is needed.\n", + "\n", + "This library is an extension of the core Keras API; all high-level modules are\n", + "[`Layers`](/api/layers/) or [`Models`](/api/models/). If you are familiar with Keras,\n", + "congratulations! You already understand most of KerasHub.\n", + "\n", + "KerasHub uses Keras 3 to work with any of TensorFlow, Pytorch and Jax. In the\n", + "guide below, we will use the `jax` backend for training our models, and\n", + "[tf.data](https://www.tensorflow.org/guide/data) for efficiently running our\n", + "input preprocessing. But feel free to mix things up! This guide runs in\n", + "TensorFlow or PyTorch backends with zero changes, simply update the\n", + "`KERAS_BACKEND` below.\n", + "\n", + "This guide demonstrates our modular approach using a sentiment analysis example at six\n", + "levels of complexity:\n", + "\n", + "* Inference with a pretrained classifier\n", + "* Fine tuning a pretrained backbone\n", + "* Fine tuning with user-controlled preprocessing\n", + "* Fine tuning a custom model\n", + "* Pretraining a backbone model\n", + "* Build and train your own transformer from scratch\n", + "\n", + "Throughout our guide, we use Professor Keras, the official Keras mascot, as a visual\n", + "reference for the complexity of the material:\n", + "\n", + "\"drawing\"" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "!pip install -q --upgrade keras-hub\n", + "!pip install -q --upgrade keras # Upgrade to Keras 3." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"jax\" # or \"tensorflow\" or \"torch\"\n", + "\n", + "import keras_hub\n", + "import keras\n", + "\n", + "# Use mixed precision to speed up all training in this guide.\n", + "keras.mixed_precision.set_global_policy(\"mixed_float16\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## API quickstart\n", + "\n", + "Our highest level API is `keras_hub.models`. These symbols cover the complete user\n", + "journey of converting strings to tokens, tokens to dense features, and dense features to\n", + "task-specific output. 
For each `XX` architecture (e.g., `Bert`), we offer the following\n", + "modules:\n", + "\n", + "* **Tokenizer**: `keras_hub.models.XXTokenizer`\n", + " * **What it does**: Converts strings to sequences of token ids.\n", + " * **Why it's important**: The raw bytes of a string are too high dimensional to be useful\n", + " features so we first map them to a small number of tokens, for example `\"The quick brown\n", + " fox\"` to `[\"the\", \"qu\", \"##ick\", \"br\", \"##own\", \"fox\"]`.\n", + " * **Inherits from**: `keras.layers.Layer`.\n", + "* **Preprocessor**: `keras_hub.models.XXPreprocessor`\n", + " * **What it does**: Converts strings to a dictionary of preprocessed tensors consumed by\n", + " the backbone, starting with tokenization.\n", + " * **Why it's important**: Each model uses special tokens and extra tensors to understand\n", + " the input such as delimiting input segments and identifying padding tokens. Padding each\n", + " sequence to the same length improves computational efficiency.\n", + " * **Has a**: `XXTokenizer`.\n", + " * **Inherits from**: `keras.layers.Layer`.\n", + "* **Backbone**: `keras_hub.models.XXBackbone`\n", + " * **What it does**: Converts preprocessed tensors to dense features. *Does not handle\n", + " strings; call the preprocessor first.*\n", + " * **Why it's important**: The backbone distills the input tokens into dense features that\n", + " can be used in downstream tasks. It is generally pretrained on a language modeling task\n", + " using massive amounts of unlabeled data. Transferring this information to a new task is a\n", + " major breakthrough in modern NLP.\n", + " * **Inherits from**: `keras.Model`.\n", + "* **Task**: e.g., `keras_hub.models.XXClassifier`\n", + " * **What it does**: Converts strings to task-specific output (e.g., classification\n", + " probabilities).\n", + " * **Why it's important**: Task models combine string preprocessing and the backbone model\n", + " with task-specific `Layers` to solve a problem such as sentence classification, token\n", + " classification, or text generation. The additional `Layers` must be fine-tuned on labeled\n", + " data.\n", + " * **Has a**: `XXBackbone` and `XXPreprocessor`.\n", + " * **Inherits from**: `keras.Model`.\n", + "\n", + "Here is the modular hierarchy for `BertClassifier` (all relationships are compositional):\n", + "\n", + "\"drawing\"\n", + "\n", + "All modules can be used independently and have a `from_preset()` method in addition to\n", + "the standard constructor that instantiates the class with **preset** architecture and\n", + "weights (see examples below)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Data\n", + "\n", + "We will use a running example of sentiment analysis of IMDB movie reviews. In this task,\n", + "we use the text to predict whether the review was positive (`label = 1`) or negative\n", + "(`label = 0`).\n", + "\n", + "We load the data using `keras.utils.text_dataset_from_directory`, which utilizes the\n", + "powerful `tf.data.Dataset` format for examples." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz\n", + "!tar -xf aclImdb_v1.tar.gz\n", + "!# Remove unsupervised examples\n", + "!rm -r aclImdb/train/unsup" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "BATCH_SIZE = 16\n", + "imdb_train = keras.utils.text_dataset_from_directory(\n", + " \"aclImdb/train\",\n", + " batch_size=BATCH_SIZE,\n", + ")\n", + "imdb_test = keras.utils.text_dataset_from_directory(\n", + " \"aclImdb/test\",\n", + " batch_size=BATCH_SIZE,\n", + ")\n", + "\n", + "# Inspect first review\n", + "# Format is (review text tensor, label tensor)\n", + "print(imdb_train.unbatch().take(1).get_single_element())\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Inference with a pretrained classifier\n", + "\n", + "\"drawing\"\n", + "\n", + "The highest level module in KerasHub is a **task**. A **task** is a `keras.Model`\n", + "consisting of a (generally pretrained) **backbone** model and task-specific layers.\n", + "Here's an example using `keras_hub.models.BertClassifier`.\n", + "\n", + "**Note**: Outputs are the logits per class (e.g., `[0, 0]` is 50% chance of positive). The output is\n", + "[negative, positive] for binary classification." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "classifier = keras_hub.models.BertClassifier.from_preset(\"bert_tiny_en_uncased_sst2\")\n", + "# Note: batched inputs expected so must wrap string in iterable\n", + "classifier.predict([\"I love modular workflows in keras-hub!\"])" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "All **tasks** have a `from_preset` method that constructs a `keras.Model` instance with\n", + "preset preprocessing, architecture and weights. This means that we can pass raw strings\n", + "in any format accepted by a `keras.Model` and get output specific to our task.\n", + "\n", + "This particular **preset** is a `\"bert_tiny_uncased_en\"` **backbone** fine-tuned on\n", + "`sst2`, another movie review sentiment analysis (this time from Rotten Tomatoes). We use\n", + "the `tiny` architecture for demo purposes, but larger models are recommended for SoTA\n", + "performance. For all the task-specific presets available for `BertClassifier`, see\n", + "our keras.io [models page](https://keras.io/api/keras_hub/models/).\n", + "\n", + "Let's evaluate our classifier on the IMDB dataset. You will note we don't need to\n", + "call `keras.Model.compile` here. All **task** models like `BertClassifier` ship with\n", + "compilation defaults, meaning we can just call `keras.Model.evaluate` directly. You\n", + "can always call compile as normal to override these defaults (e.g. to add new metrics).\n", + "\n", + "The output below is [loss, accuracy]," + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "classifier.evaluate(imdb_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "Our result is 78% accuracy without training anything. Not bad!" 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Fine tuning a pretrained BERT backbone\n", + "\n", + "\"drawing\"\n", + "\n", + "When labeled text specific to our task is available, fine-tuning a custom classifier can\n", + "improve performance. If we want to predict IMDB review sentiment, using IMDB data should\n", + "perform better than Rotten Tomatoes data! And for many tasks, no relevant pretrained model\n", + "will be available (e.g., categorizing customer reviews).\n", + "\n", + "The workflow for fine-tuning is almost identical to above, except that we request a\n", + "**preset** for the **backbone**-only model rather than the entire classifier. When passed\n", + "a **backbone** **preset**, a **task** `Model` will randomly initialize all task-specific\n", + "layers in preparation for training. For all the **backbone** presets available for\n", + "`BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_hub/models/).\n", + "\n", + "To train your classifier, use `keras.Model.fit` as with any other\n", + "`keras.Model`. As with our inference example, we can rely on the compilation\n", + "defaults for the **task** and skip `keras.Model.compile`. As preprocessing is\n", + "included, we again pass the raw data." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "classifier = keras_hub.models.BertClassifier.from_preset(\n", + " \"bert_tiny_en_uncased\",\n", + " num_classes=2,\n", + ")\n", + "classifier.fit(\n", + " imdb_train,\n", + " validation_data=imdb_test,\n", + " epochs=1,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "Here we see a significant lift in validation accuracy (0.78 -> 0.87) with a single epoch of\n", + "training even though the IMDB dataset is much smaller than `sst2`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Fine tuning with user-controlled preprocessing\n", + "\"drawing\"\n", + "\n", + "For some advanced training scenarios, users might prefer direct control over\n", + "preprocessing. For large datasets, examples can be preprocessed in advance and saved to\n", + "disk or preprocessed by a separate worker pool using `tf.data.experimental.service`. In\n", + "other cases, custom preprocessing is needed to handle the inputs.\n", + "\n", + "Pass `preprocessor=None` to the constructor of a **task** `Model` to skip automatic\n", + "preprocessing or pass a custom `BertPreprocessor` instead." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Separate preprocessing from the same preset\n", + "\n", + "Each model architecture has a parallel **preprocessor** `Layer` with its own\n", + "`from_preset` constructor. Using the same **preset** for this `Layer` will return the\n", + "matching **preprocessor** as the **task**.\n", + "\n", + "In this workflow we train the model over three epochs using `tf.data.Dataset.cache()`,\n", + "which computes the preprocessing once and caches the result before fitting begins.\n", + "\n", + "**Note:** we can use `tf.data` for preprocessing while running on the\n", + "Jax or PyTorch backend. The input dataset will automatically be converted to\n", + "backend native tensor types during fit. In fact, given the efficiency of `tf.data`\n", + "for running preprocessing, this is good practice on all backends." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "import tensorflow as tf\n", + "\n", + "preprocessor = keras_hub.models.BertPreprocessor.from_preset(\n", + " \"bert_tiny_en_uncased\",\n", + " sequence_length=512,\n", + ")\n", + "\n", + "# Apply the preprocessor to every sample of train and test data using `map()`.\n", + "# `tf.data.AUTOTUNE` and `prefetch()` are options to tune performance, see\n", + "# https://www.tensorflow.org/guide/data_performance for details.\n", + "\n", + "# Note: only call `cache()` if you training data fits in CPU memory!\n", + "imdb_train_cached = (\n", + " imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)\n", + ")\n", + "imdb_test_cached = (\n", + " imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)\n", + ")\n", + "\n", + "classifier = keras_hub.models.BertClassifier.from_preset(\n", + " \"bert_tiny_en_uncased\", preprocessor=None, num_classes=2\n", + ")\n", + "classifier.fit(\n", + " imdb_train_cached,\n", + " validation_data=imdb_test_cached,\n", + " epochs=3,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "After three epochs, our validation accuracy has only increased to 0.88. This is both a\n", + "function of the small size of our dataset and our model. To exceed 90% accuracy, try\n", + "larger **presets** such as `\"bert_base_en_uncased\"`. For all the **backbone** presets\n", + "available for `BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_hub/models/)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Custom preprocessing\n", + "\n", + "In cases where custom preprocessing is required, we offer direct access to the\n", + "`Tokenizer` class that maps raw strings to tokens. It also has a `from_preset()`\n", + "constructor to get the vocabulary matching pretraining.\n", + "\n", + "**Note:** `BertTokenizer` does not pad sequences by default, so the output is\n", + "ragged (each sequence has varying length). The `MultiSegmentPacker` below\n", + "handles padding these ragged sequences to dense tensor types (e.g. `tf.Tensor`\n", + "or `torch.Tensor`)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "tokenizer = keras_hub.models.BertTokenizer.from_preset(\"bert_tiny_en_uncased\")\n", + "tokenizer([\"I love modular workflows!\", \"Libraries over frameworks!\"])\n", + "\n", + "# Write your own packer or use one of our `Layers`\n", + "packer = keras_hub.layers.MultiSegmentPacker(\n", + " start_value=tokenizer.cls_token_id,\n", + " end_value=tokenizer.sep_token_id,\n", + " # Note: This cannot be longer than the preset's `sequence_length`, and there\n", + " # is no check for a custom preprocessor!\n", + " sequence_length=64,\n", + ")\n", + "\n", + "\n", + "# This function that takes a text sample `x` and its\n", + "# corresponding label `y` as input and converts the\n", + "# text into a format suitable for input into a BERT model.\n", + "def preprocessor(x, y):\n", + " token_ids, segment_ids = packer(tokenizer(x))\n", + " x = {\n", + " \"token_ids\": token_ids,\n", + " \"segment_ids\": segment_ids,\n", + " \"padding_mask\": token_ids != 0,\n", + " }\n", + " return x, y\n", + "\n", + "\n", + "imdb_train_preprocessed = imdb_train.map(preprocessor, tf.data.AUTOTUNE).prefetch(\n", + " tf.data.AUTOTUNE\n", + ")\n", + "imdb_test_preprocessed = imdb_test.map(preprocessor, tf.data.AUTOTUNE).prefetch(\n", + " tf.data.AUTOTUNE\n", + ")\n", + "\n", + "# Preprocessed example\n", + "print(imdb_train_preprocessed.unbatch().take(1).get_single_element())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Fine tuning with a custom model\n", + "\"drawing\"\n", + "\n", + "For more advanced applications, an appropriate **task** `Model` may not be available. In\n", + "this case, we provide direct access to the **backbone** `Model`, which has its own\n", + "`from_preset` constructor and can be composed with custom `Layer`s. Detailed examples can\n", + "be found at our [transfer learning guide](https://keras.io/guides/transfer_learning/).\n", + "\n", + "A **backbone** `Model` does not include automatic preprocessing but can be paired with a\n", + "matching **preprocessor** using the same **preset** as shown in the previous workflow.\n", + "\n", + "In this workflow, we experiment with freezing our backbone model and adding two trainable\n", + "transformer layers to adapt to the new input.\n", + "\n", + "**Note**: We can ignore the warning about gradients for the `pooled_dense` layer because\n", + "we are using BERT's sequence output." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "preprocessor = keras_hub.models.BertPreprocessor.from_preset(\"bert_tiny_en_uncased\")\n", + "backbone = keras_hub.models.BertBackbone.from_preset(\"bert_tiny_en_uncased\")\n", + "\n", + "imdb_train_preprocessed = (\n", + " imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)\n", + ")\n", + "imdb_test_preprocessed = (\n", + " imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)\n", + ")\n", + "\n", + "backbone.trainable = False\n", + "inputs = backbone.input\n", + "sequence = backbone(inputs)[\"sequence_output\"]\n", + "for _ in range(2):\n", + " sequence = keras_hub.layers.TransformerEncoder(\n", + " num_heads=2,\n", + " intermediate_dim=512,\n", + " dropout=0.1,\n", + " )(sequence)\n", + "# Use [CLS] token output to classify\n", + "outputs = keras.layers.Dense(2)(sequence[:, backbone.cls_token_index, :])\n", + "\n", + "model = keras.Model(inputs, outputs)\n", + "model.compile(\n", + " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", + " optimizer=keras.optimizers.AdamW(5e-5),\n", + " metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", + " jit_compile=True,\n", + ")\n", + "model.summary()\n", + "model.fit(\n", + " imdb_train_preprocessed,\n", + " validation_data=imdb_test_preprocessed,\n", + " epochs=3,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "This model achieves reasonable accuracy despite having only 10% of the trainable parameters\n", + "of our `BertClassifier` model. Each training step takes about 1/3 of the time---even\n", + "accounting for cached preprocessing." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Pretraining a backbone model\n", + "\"drawing\"\n", + "\n", + "Do you have access to large unlabeled datasets in your domain? Are they around the\n", + "same size as used to train popular backbones such as BERT, RoBERTa, or GPT2 (XX+ GiB)? If\n", + "so, you might benefit from domain-specific pretraining of your own backbone models.\n", + "\n", + "NLP models are generally pretrained on a language modeling task, predicting masked words\n", + "given the visible words in an input sentence. For example, given the input\n", + "`\"The fox [MASK] over the [MASK] dog\"`, the model might be asked to predict `[\"jumped\", \"lazy\"]`.\n", + "The lower layers of this model are then packaged as a **backbone** to be combined with\n", + "layers relating to a new task.\n", + "\n", + "The KerasHub library offers SoTA **backbones** and **tokenizers** to be trained from\n", + "scratch without presets.\n", + "\n", + "In this workflow, we pretrain a BERT **backbone** using our IMDB review text. We skip the\n", + "\"next sentence prediction\" (NSP) loss because it adds significant complexity to the data\n", + "processing and was dropped by later models like RoBERTa. See our e2e\n", + "[Transformer pretraining](https://keras.io/guides/keras_hub/transformer_pretraining/#pretraining)\n", + "for step-by-step details on how to replicate the original paper." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Preprocessing" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# All BERT `en` models have the same vocabulary, so reuse preprocessor from\n", + "# \"bert_tiny_en_uncased\"\n", + "preprocessor = keras_hub.models.BertPreprocessor.from_preset(\n", + " \"bert_tiny_en_uncased\",\n", + " sequence_length=256,\n", + ")\n", + "packer = preprocessor.packer\n", + "tokenizer = preprocessor.tokenizer\n", + "\n", + "# keras.Layer to replace some input tokens with the \"[MASK]\" token\n", + "masker = keras_hub.layers.MaskedLMMaskGenerator(\n", + " vocabulary_size=tokenizer.vocabulary_size(),\n", + " mask_selection_rate=0.25,\n", + " mask_selection_length=64,\n", + " mask_token_id=tokenizer.token_to_id(\"[MASK]\"),\n", + " unselectable_token_ids=[\n", + " tokenizer.token_to_id(x) for x in [\"[CLS]\", \"[PAD]\", \"[SEP]\"]\n", + " ],\n", + ")\n", + "\n", + "\n", + "def preprocess(inputs, label):\n", + " inputs = preprocessor(inputs)\n", + " masked_inputs = masker(inputs[\"token_ids\"])\n", + " # Split the masking layer outputs into a (features, labels, and weights)\n", + " # tuple that we can use with keras.Model.fit().\n", + " features = {\n", + " \"token_ids\": masked_inputs[\"token_ids\"],\n", + " \"segment_ids\": inputs[\"segment_ids\"],\n", + " \"padding_mask\": inputs[\"padding_mask\"],\n", + " \"mask_positions\": masked_inputs[\"mask_positions\"],\n", + " }\n", + " labels = masked_inputs[\"mask_ids\"]\n", + " weights = masked_inputs[\"mask_weights\"]\n", + " return features, labels, weights\n", + "\n", + "\n", + "pretrain_ds = imdb_train.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch(\n", + " tf.data.AUTOTUNE\n", + ")\n", + "pretrain_val_ds = imdb_test.map(\n", + " preprocess, num_parallel_calls=tf.data.AUTOTUNE\n", + ").prefetch(tf.data.AUTOTUNE)\n", + "\n", + "# Tokens with ID 103 are \"masked\"\n", + "print(pretrain_ds.unbatch().take(1).get_single_element())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Pretraining model" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# BERT backbone\n", + "backbone = keras_hub.models.BertBackbone(\n", + " vocabulary_size=tokenizer.vocabulary_size(),\n", + " num_layers=2,\n", + " num_heads=2,\n", + " hidden_dim=128,\n", + " intermediate_dim=512,\n", + ")\n", + "\n", + "# Language modeling head\n", + "mlm_head = keras_hub.layers.MaskedLMHead(\n", + " token_embedding=backbone.token_embedding,\n", + ")\n", + "\n", + "inputs = {\n", + " \"token_ids\": keras.Input(shape=(None,), dtype=tf.int32, name=\"token_ids\"),\n", + " \"segment_ids\": keras.Input(shape=(None,), dtype=tf.int32, name=\"segment_ids\"),\n", + " \"padding_mask\": keras.Input(shape=(None,), dtype=tf.int32, name=\"padding_mask\"),\n", + " \"mask_positions\": keras.Input(shape=(None,), dtype=tf.int32, name=\"mask_positions\"),\n", + "}\n", + "\n", + "# Encoded token sequence\n", + "sequence = backbone(inputs)[\"sequence_output\"]\n", + "\n", + "# Predict an output word for each masked input token.\n", + "# We use the input token embedding to project from our encoded vectors to\n", + "# vocabulary logits, which has been shown to improve training efficiency.\n", + "outputs = mlm_head(sequence, mask_positions=inputs[\"mask_positions\"])\n", + "\n", + 
"# Define and compile our pretraining model.\n", + "pretraining_model = keras.Model(inputs, outputs)\n", + "pretraining_model.summary()\n", + "pretraining_model.compile(\n", + " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", + " optimizer=keras.optimizers.AdamW(learning_rate=5e-4),\n", + " weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", + " jit_compile=True,\n", + ")\n", + "\n", + "# Pretrain on IMDB dataset\n", + "pretraining_model.fit(\n", + " pretrain_ds,\n", + " validation_data=pretrain_val_ds,\n", + " epochs=3, # Increase to 6 for higher accuracy\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "After pretraining save your `backbone` submodel to use in a new task!" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Build and train your own transformer from scratch\n", + "\"drawing\"\n", + "\n", + "Want to implement a novel transformer architecture? The KerasHub library offers all the\n", + "low-level modules used to build SoTA architectures in our `models` API. This includes the\n", + "`keras_hub.tokenizers` API which allows you to train your own subword tokenizer using\n", + "`WordPieceTokenizer`, `BytePairTokenizer`, or `SentencePieceTokenizer`.\n", + "\n", + "In this workflow, we train a custom tokenizer on the IMDB data and design a backbone with\n", + "custom transformer architecture. For simplicity, we then train directly on the\n", + "classification task. Interested in more details? We wrote an entire guide to pretraining\n", + "and finetuning a custom transformer on\n", + "[keras.io](https://keras.io/guides/keras_hub/transformer_pretraining/)," + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Train custom vocabulary from IMDB data" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(\n", + " imdb_train.map(lambda x, y: x),\n", + " vocabulary_size=20_000,\n", + " lowercase=True,\n", + " strip_accents=True,\n", + " reserved_tokens=[\"[PAD]\", \"[START]\", \"[END]\", \"[MASK]\", \"[UNK]\"],\n", + ")\n", + "tokenizer = keras_hub.tokenizers.WordPieceTokenizer(\n", + " vocabulary=vocab,\n", + " lowercase=True,\n", + " strip_accents=True,\n", + " oov_token=\"[UNK]\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Preprocess data with a custom tokenizer" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "packer = keras_hub.layers.StartEndPacker(\n", + " start_value=tokenizer.token_to_id(\"[START]\"),\n", + " end_value=tokenizer.token_to_id(\"[END]\"),\n", + " pad_value=tokenizer.token_to_id(\"[PAD]\"),\n", + " sequence_length=512,\n", + ")\n", + "\n", + "\n", + "def preprocess(x, y):\n", + " token_ids = packer(tokenizer(x))\n", + " return token_ids, y\n", + "\n", + "\n", + "imdb_preproc_train_ds = imdb_train.map(\n", + " preprocess, num_parallel_calls=tf.data.AUTOTUNE\n", + ").prefetch(tf.data.AUTOTUNE)\n", + "imdb_preproc_val_ds = imdb_test.map(\n", + " preprocess, num_parallel_calls=tf.data.AUTOTUNE\n", + ").prefetch(tf.data.AUTOTUNE)\n", + "\n", + "print(imdb_preproc_train_ds.unbatch().take(1).get_single_element())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + 
"colab_type": "text" + }, + "source": [ + "### Design a tiny transformer" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "token_id_input = keras.Input(\n", + " shape=(None,),\n", + " dtype=\"int32\",\n", + " name=\"token_ids\",\n", + ")\n", + "outputs = keras_hub.layers.TokenAndPositionEmbedding(\n", + " vocabulary_size=len(vocab),\n", + " sequence_length=packer.sequence_length,\n", + " embedding_dim=64,\n", + ")(token_id_input)\n", + "outputs = keras_hub.layers.TransformerEncoder(\n", + " num_heads=2,\n", + " intermediate_dim=128,\n", + " dropout=0.1,\n", + ")(outputs)\n", + "# Use \"[START]\" token to classify\n", + "outputs = keras.layers.Dense(2)(outputs[:, 0, :])\n", + "model = keras.Model(\n", + " inputs=token_id_input,\n", + " outputs=outputs,\n", + ")\n", + "\n", + "model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Train the transformer directly on the classification objective" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "model.compile(\n", + " loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n", + " optimizer=keras.optimizers.AdamW(5e-5),\n", + " metrics=[keras.metrics.SparseCategoricalAccuracy()],\n", + " jit_compile=True,\n", + ")\n", + "model.fit(\n", + " imdb_preproc_train_ds,\n", + " validation_data=imdb_preproc_val_ds,\n", + " epochs=3,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "Excitingly, our custom classifier is similar to the performance of fine-tuning\n", + "`\"bert_tiny_en_uncased\"`! To see the advantages of pretraining and exceed 90% accuracy we\n", + "would need to use larger **presets** such as `\"bert_base_en_uncased\"`." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "getting_started", + "private_outputs": false, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/guides/ipynb/keras_hub/transformer_pretraining.ipynb b/guides/ipynb/keras_hub/transformer_pretraining.ipynb new file mode 100644 index 0000000000..feb9d9f6c7 --- /dev/null +++ b/guides/ipynb/keras_hub/transformer_pretraining.ipynb @@ -0,0 +1,690 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Pretraining a Transformer from scratch with KerasHub\n", + "\n", + "**Author:** [Matthew Watson](https://github.com/mattdangerw/)
\n", + "**Date created:** 2022/04/18
\n", + "**Last modified:** 2023/07/15
\n", + "**Description:** Use KerasHub to train a Transformer model from scratch." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "KerasHub aims to make it easy to build state-of-the-art text processing models. In this\n", + "guide, we will show how library components simplify pretraining and fine-tuning a\n", + "Transformer model from scratch.\n", + "\n", + "This guide is broken into three parts:\n", + "\n", + "1. *Setup*, task definition, and establishing a baseline.\n", + "2. *Pretraining* a Transformer model.\n", + "3. *Fine-tuning* the Transformer model on our classification task." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Setup\n", + "\n", + "The following guide uses Keras 3 to work in any of `tensorflow`, `jax` or\n", + "`torch`. We select the `jax` backend below, which will give us a particularly\n", + "fast train step below, but feel free to mix it up." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "!pip install -q --upgrade keras-hub\n", + "!pip install -q --upgrade keras # Upgrade to Keras 3." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"jax\" # or \"tensorflow\" or \"torch\"\n", + "\n", + "\n", + "import keras_hub\n", + "import tensorflow as tf\n", + "import keras" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "Next up, we can download two datasets.\n", + "\n", + "- [SST-2](https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary) a text\n", + "classification dataset and our \"end goal\". This dataset is often used to benchmark\n", + "language models.\n", + "- [WikiText-103](https://paperswithcode.com/dataset/wikitext-103): A medium sized\n", + "collection of featured articles from English Wikipedia, which we will use for\n", + "pretraining.\n", + "\n", + "Finally, we will download a WordPiece vocabulary, to do sub-word tokenization later on in\n", + "this guide." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Download pretraining data.\n", + "keras.utils.get_file(\n", + " origin=\"https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip\",\n", + " extract=True,\n", + ")\n", + "wiki_dir = os.path.expanduser(\"~/.keras/datasets/wikitext-103-raw/\")\n", + "\n", + "# Download finetuning data.\n", + "keras.utils.get_file(\n", + " origin=\"https://dl.fbaipublicfiles.com/glue/data/SST-2.zip\",\n", + " extract=True,\n", + ")\n", + "sst_dir = os.path.expanduser(\"~/.keras/datasets/SST-2/\")\n", + "\n", + "# Download vocabulary data.\n", + "vocab_file = keras.utils.get_file(\n", + " origin=\"https://storage.googleapis.com/tensorflow/keras-hub/examples/bert/bert_vocab_uncased.txt\",\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "Next, we define some hyperparameters we will use during training." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Preprocessing params.\n", + "PRETRAINING_BATCH_SIZE = 128\n", + "FINETUNING_BATCH_SIZE = 32\n", + "SEQ_LENGTH = 128\n", + "MASK_RATE = 0.25\n", + "PREDICTIONS_PER_SEQ = 32\n", + "\n", + "# Model params.\n", + "NUM_LAYERS = 3\n", + "MODEL_DIM = 256\n", + "INTERMEDIATE_DIM = 512\n", + "NUM_HEADS = 4\n", + "DROPOUT = 0.1\n", + "NORM_EPSILON = 1e-5\n", + "\n", + "# Training params.\n", + "PRETRAINING_LEARNING_RATE = 5e-4\n", + "PRETRAINING_EPOCHS = 8\n", + "FINETUNING_LEARNING_RATE = 5e-5\n", + "FINETUNING_EPOCHS = 3" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Load data\n", + "\n", + "We load our data with [tf.data](https://www.tensorflow.org/guide/data), which will allow\n", + "us to define input pipelines for tokenizing and preprocessing text." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Load SST-2.\n", + "sst_train_ds = tf.data.experimental.CsvDataset(\n", + " sst_dir + \"train.tsv\", [tf.string, tf.int32], header=True, field_delim=\"\\t\"\n", + ").batch(FINETUNING_BATCH_SIZE)\n", + "sst_val_ds = tf.data.experimental.CsvDataset(\n", + " sst_dir + \"dev.tsv\", [tf.string, tf.int32], header=True, field_delim=\"\\t\"\n", + ").batch(FINETUNING_BATCH_SIZE)\n", + "\n", + "# Load wikitext-103 and filter out short lines.\n", + "wiki_train_ds = (\n", + " tf.data.TextLineDataset(wiki_dir + \"wiki.train.raw\")\n", + " .filter(lambda x: tf.strings.length(x) > 100)\n", + " .batch(PRETRAINING_BATCH_SIZE)\n", + ")\n", + "wiki_val_ds = (\n", + " tf.data.TextLineDataset(wiki_dir + \"wiki.valid.raw\")\n", + " .filter(lambda x: tf.strings.length(x) > 100)\n", + " .batch(PRETRAINING_BATCH_SIZE)\n", + ")\n", + "\n", + "# Take a peak at the sst-2 dataset.\n", + "print(sst_train_ds.unbatch().batch(4).take(1).get_single_element())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "You can see that our `SST-2` dataset contains relatively short snippets of movie review\n", + "text. Our goal is to predict the sentiment of the snippet. A label of 1 indicates\n", + "positive sentiment, and a label of 0 negative sentiment." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Establish a baseline\n", + "\n", + "As a first step, we will establish a baseline of good performance. We don't actually need\n", + "KerasHub for this, we can just use core Keras layers.\n", + "\n", + "We will train a simple bag-of-words model, where we learn a positive or negative weight\n", + "for each word in our vocabulary. A sample's score is simply the sum of the weights of all\n", + "words that are present in the sample." 
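As a toy illustration of that scoring rule (an aside with made-up weights, not part of the guide), the prediction reduces to summing the learned weight of each vocabulary word that appears in the review:

```python
# Illustrative only: bag-of-words scoring with hypothetical learned weights.
word_weights = {"great": 1.2, "dull": -0.9, "plot": 0.1}
review = ["great", "plot", "dull", "dull"]

# A multi-hot encoding records presence, not counts, so repeats contribute once.
score = sum(word_weights[w] for w in set(review) if w in word_weights)
print(score)  # 0.4 -> leans positive
```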
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# This layer will turn our input sentence into a list of 1s and 0s the same size\n", + "# our vocabulary, indicating whether a word is present in absent.\n", + "multi_hot_layer = keras.layers.TextVectorization(\n", + " max_tokens=4000, output_mode=\"multi_hot\"\n", + ")\n", + "multi_hot_layer.adapt(sst_train_ds.map(lambda x, y: x))\n", + "multi_hot_ds = sst_train_ds.map(lambda x, y: (multi_hot_layer(x), y))\n", + "multi_hot_val_ds = sst_val_ds.map(lambda x, y: (multi_hot_layer(x), y))\n", + "\n", + "# We then learn a linear regression over that layer, and that's our entire\n", + "# baseline model!\n", + "\n", + "inputs = keras.Input(shape=(4000,), dtype=\"int32\")\n", + "outputs = keras.layers.Dense(1, activation=\"sigmoid\")(inputs)\n", + "baseline_model = keras.Model(inputs, outputs)\n", + "baseline_model.compile(loss=\"binary_crossentropy\", metrics=[\"accuracy\"])\n", + "baseline_model.fit(multi_hot_ds, validation_data=multi_hot_val_ds, epochs=5)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "A bag-of-words approach can be a fast and surprisingly powerful, especially when input\n", + "examples contain a large number of words. With shorter sequences, it can hit a\n", + "performance ceiling.\n", + "\n", + "To do better, we would like to build a model that can evaluate words *in context*. Instead\n", + "of evaluating each word in a void, we need to use the information contained in the\n", + "*entire ordered sequence* of our input.\n", + "\n", + "This runs us into a problem. `SST-2` is very small dataset, and there's simply not enough\n", + "example text to attempt to build a larger, more parameterized model that can learn on a\n", + "sequence. We would quickly start to overfit and memorize our training set, without any\n", + "increase in our ability to generalize to unseen examples.\n", + "\n", + "Enter **pretraining**, which will allow us to learn on a larger corpus, and transfer our\n", + "knowledge to the `SST-2` task. And enter **KerasHub**, which will allow us to pretrain a\n", + "particularly powerful model, the Transformer, with ease." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Pretraining\n", + "\n", + "To beat our baseline, we will leverage the `WikiText103` dataset, an unlabeled\n", + "collection of Wikipedia articles that is much bigger than `SST-2`.\n", + "\n", + "We are going to train a *transformer*, a highly expressive model which will learn\n", + "to embed each word in our input as a low dimensional vector. Our wikipedia dataset has no\n", + "labels, so we will use an unsupervised training objective called the *Masked Language\n", + "Modeling* (MaskedLM) objective.\n", + "\n", + "Essentially, we will be playing a big game of \"guess the missing word\". For each input\n", + "sample we will obscure 25% of our input data, and train our model to predict the parts we\n", + "covered up." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Preprocess data for the MaskedLM task\n", + "\n", + "Our text preprocessing for the MaskedLM task will occur in two stages.\n", + "\n", + "1. Tokenize input text into integer sequences of token ids.\n", + "2. 
Mask certain positions in our input to predict on.\n", + "\n", + "To tokenize, we can use a `keras_hub.tokenizers.Tokenizer` -- the KerasHub building block\n", + "for transforming text into sequences of integer token ids.\n", + "\n", + "In particular, we will use `keras_hub.tokenizers.WordPieceTokenizer` which does\n", + "*sub-word* tokenization. Sub-word tokenization is popular when training models on large\n", + "text corpora. Essentially, it allows our model to learn from uncommon words, while not\n", + "requiring a massive vocabulary of every word in our training set.\n", + "\n", + "The second thing we need to do is mask our input for the MaskedLM task. To do this, we can use\n", + "`keras_hub.layers.MaskedLMMaskGenerator`, which will randomly select a set of tokens in each\n", + "input and mask them out.\n", + "\n", + "The tokenizer and the masking layer can both be used inside a call to\n", + "[tf.data.Dataset.map](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map).\n", + "We can use `tf.data` to efficiently pre-compute each batch on the CPU, while our GPU or TPU\n", + "works on training with the batch that came before. Because our masking layer will\n", + "choose new words to mask each time, each epoch over our dataset will give us a totally\n", + "new set of labels to train on." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Setting sequence_length will trim or pad the token outputs to shape\n", + "# (batch_size, SEQ_LENGTH).\n", + "tokenizer = keras_hub.tokenizers.WordPieceTokenizer(\n", + " vocabulary=vocab_file,\n", + " sequence_length=SEQ_LENGTH,\n", + " lowercase=True,\n", + " strip_accents=True,\n", + ")\n", + "# Setting mask_selection_length will trim or pad the mask outputs to shape\n", + "# (batch_size, PREDICTIONS_PER_SEQ).\n", + "masker = keras_hub.layers.MaskedLMMaskGenerator(\n", + " vocabulary_size=tokenizer.vocabulary_size(),\n", + " mask_selection_rate=MASK_RATE,\n", + " mask_selection_length=PREDICTIONS_PER_SEQ,\n", + " mask_token_id=tokenizer.token_to_id(\"[MASK]\"),\n", + ")\n", + "\n", + "\n", + "def preprocess(inputs):\n", + " inputs = tokenizer(inputs)\n", + " outputs = masker(inputs)\n", + " # Split the masking layer outputs into a (features, labels, and weights)\n", + " # tuple that we can use with keras.Model.fit().\n", + " features = {\n", + " \"token_ids\": outputs[\"token_ids\"],\n", + " \"mask_positions\": outputs[\"mask_positions\"],\n", + " }\n", + " labels = outputs[\"mask_ids\"]\n", + " weights = outputs[\"mask_weights\"]\n", + " return features, labels, weights\n", + "\n", + "\n", + "# We use prefetch() to pre-compute preprocessed batches on the fly on the CPU.\n", + "pretrain_ds = wiki_train_ds.map(\n", + " preprocess, num_parallel_calls=tf.data.AUTOTUNE\n", + ").prefetch(tf.data.AUTOTUNE)\n", + "pretrain_val_ds = wiki_val_ds.map(\n", + " preprocess, num_parallel_calls=tf.data.AUTOTUNE\n", + ").prefetch(tf.data.AUTOTUNE)\n", + "\n", + "# Preview a single input example.\n", + "# The masks will change each time you run the cell.\n", + "print(pretrain_val_ds.take(1).get_single_element())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "The above block sorts our dataset into a `(features, labels, weights)` tuple, which can be\n", + "passed directly to `keras.Model.fit()`.\n", + "\n", + "We have two features:\n", + "\n", + "1. 
`\"token_ids\"`, where some tokens have been replaced with our mask token id.\n", + "2. `\"mask_positions\"`, which keeps track of which tokens we masked out.\n", + "\n", + "Our labels are simply the ids we masked out.\n", + "\n", + "Because not all sequences will have the same number of masks, we also keep a\n", + "`sample_weight` tensor, which removes padded labels from our loss function by giving them\n", + "zero weight." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Create the Transformer encoder\n", + "\n", + "KerasHub provides all the building blocks to quickly build a Transformer encoder.\n", + "\n", + "We use `keras_hub.layers.TokenAndPositionEmbedding` to first embed our input token ids.\n", + "This layer simultaneously learns two embeddings -- one for words in a sentence and another\n", + "for integer positions in a sentence. The output embedding is simply the sum of the two.\n", + "\n", + "Then we can add a series of `keras_hub.layers.TransformerEncoder` layers. These are the\n", + "bread and butter of the Transformer model, using an attention mechanism to attend to\n", + "different parts of the input sentence, followed by a multi-layer perceptron block.\n", + "\n", + "The output of this model will be a encoded vector per input token id. Unlike the\n", + "bag-of-words model we used as a baseline, this model will embed each token accounting for\n", + "the context in which it appeared." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=\"int32\")\n", + "\n", + "# Embed our tokens with a positional embedding.\n", + "embedding_layer = keras_hub.layers.TokenAndPositionEmbedding(\n", + " vocabulary_size=tokenizer.vocabulary_size(),\n", + " sequence_length=SEQ_LENGTH,\n", + " embedding_dim=MODEL_DIM,\n", + ")\n", + "outputs = embedding_layer(inputs)\n", + "\n", + "# Apply layer normalization and dropout to the embedding.\n", + "outputs = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs)\n", + "outputs = keras.layers.Dropout(rate=DROPOUT)(outputs)\n", + "\n", + "# Add a number of encoder blocks\n", + "for i in range(NUM_LAYERS):\n", + " outputs = keras_hub.layers.TransformerEncoder(\n", + " intermediate_dim=INTERMEDIATE_DIM,\n", + " num_heads=NUM_HEADS,\n", + " dropout=DROPOUT,\n", + " layer_norm_epsilon=NORM_EPSILON,\n", + " )(outputs)\n", + "\n", + "encoder_model = keras.Model(inputs, outputs)\n", + "encoder_model.summary()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Pretrain the Transformer\n", + "\n", + "You can think of the `encoder_model` as it's own modular unit, it is the piece of our\n", + "model that we are really interested in for our downstream task. However we still need to\n", + "set up the encoder to train on the MaskedLM task; to do that we attach a\n", + "`keras_hub.layers.MaskedLMHead`.\n", + "\n", + "This layer will take as one input the token encodings, and as another the positions we\n", + "masked out in the original input. It will gather the token encodings we masked, and\n", + "transform them back in predictions over our entire vocabulary.\n", + "\n", + "With that, we are ready to compile and run pretraining. If you are running this in a\n", + "Colab, note that this will take about an hour. 
Training Transformer is famously compute\n", + "intensive, so even this relatively small Transformer will take some time." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Create the pretraining model by attaching a masked language model head.\n", + "inputs = {\n", + " \"token_ids\": keras.Input(shape=(SEQ_LENGTH,), dtype=\"int32\", name=\"token_ids\"),\n", + " \"mask_positions\": keras.Input(\n", + " shape=(PREDICTIONS_PER_SEQ,), dtype=\"int32\", name=\"mask_positions\"\n", + " ),\n", + "}\n", + "\n", + "# Encode the tokens.\n", + "encoded_tokens = encoder_model(inputs[\"token_ids\"])\n", + "\n", + "# Predict an output word for each masked input token.\n", + "# We use the input token embedding to project from our encoded vectors to\n", + "# vocabulary logits, which has been shown to improve training efficiency.\n", + "outputs = keras_hub.layers.MaskedLMHead(\n", + " token_embedding=embedding_layer.token_embedding,\n", + " activation=\"softmax\",\n", + ")(encoded_tokens, mask_positions=inputs[\"mask_positions\"])\n", + "\n", + "# Define and compile our pretraining model.\n", + "pretraining_model = keras.Model(inputs, outputs)\n", + "pretraining_model.compile(\n", + " loss=\"sparse_categorical_crossentropy\",\n", + " optimizer=keras.optimizers.AdamW(PRETRAINING_LEARNING_RATE),\n", + " weighted_metrics=[\"sparse_categorical_accuracy\"],\n", + " jit_compile=True,\n", + ")\n", + "\n", + "# Pretrain the model on our wiki text dataset.\n", + "pretraining_model.fit(\n", + " pretrain_ds,\n", + " validation_data=pretrain_val_ds,\n", + " epochs=PRETRAINING_EPOCHS,\n", + " steps_per_epoch=2,\n", + ")\n", + "\n", + "# Save this base model for further finetuning.\n", + "encoder_model.save(\"encoder_model.keras\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Fine-tuning\n", + "\n", + "After pretraining, we can now fine-tune our model on the `SST-2` dataset. We can\n", + "leverage the ability of the encoder we build to predict on words in context to boost\n", + "our performance on the downstream task." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Preprocess data for classification\n", + "\n", + "Preprocessing for fine-tuning is much simpler than for our pretraining MaskedLM task. We just\n", + "tokenize our input sentences and we are ready for training!" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "\n", + "def preprocess(sentences, labels):\n", + " return tokenizer(sentences), labels\n", + "\n", + "\n", + "# We use prefetch() to pre-compute preprocessed batches on the fly on our CPU.\n", + "finetune_ds = sst_train_ds.map(\n", + " preprocess, num_parallel_calls=tf.data.AUTOTUNE\n", + ").prefetch(tf.data.AUTOTUNE)\n", + "finetune_val_ds = sst_val_ds.map(\n", + " preprocess, num_parallel_calls=tf.data.AUTOTUNE\n", + ").prefetch(tf.data.AUTOTUNE)\n", + "\n", + "# Preview a single input example.\n", + "print(finetune_val_ds.take(1).get_single_element())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Fine-tune the Transformer\n", + "\n", + "To go from our encoded token output to a classification prediction, we need to attach\n", + "another \"head\" to our Transformer model. We can afford to be simple here. 
We pool\n", + "the encoded tokens together, and use a single dense layer to make a prediction." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Reload the encoder model from disk so we can restart fine-tuning from scratch.\n", + "encoder_model = keras.models.load_model(\"encoder_model.keras\", compile=False)\n", + "\n", + "# Take as input the tokenized input.\n", + "inputs = keras.Input(shape=(SEQ_LENGTH,), dtype=\"int32\")\n", + "\n", + "# Encode and pool the tokens.\n", + "encoded_tokens = encoder_model(inputs)\n", + "pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens[0])\n", + "\n", + "# Predict an output label.\n", + "outputs = keras.layers.Dense(1, activation=\"sigmoid\")(pooled_tokens)\n", + "\n", + "# Define and compile our fine-tuning model.\n", + "finetuning_model = keras.Model(inputs, outputs)\n", + "finetuning_model.compile(\n", + " loss=\"binary_crossentropy\",\n", + " optimizer=keras.optimizers.AdamW(FINETUNING_LEARNING_RATE),\n", + " metrics=[\"accuracy\"],\n", + ")\n", + "\n", + "# Finetune the model for the SST-2 task.\n", + "finetuning_model.fit(\n", + " finetune_ds,\n", + " validation_data=finetune_val_ds,\n", + " epochs=FINETUNING_EPOCHS,\n", + " steps_per_epoch=2,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "Pretraining was enough to boost our performance to 84%, and this is hardly the ceiling\n", + "for Transformer models. You may have noticed during pretraining that our validation\n", + "performance was still steadily increasing. Our model is still significantly undertrained.\n", + "Training for more epochs, training a large Transformer, and training on more unlabeled\n", + "text would all continue to boost performance significantly.\n", + "\n", + "One of the key goals of KerasHub is to provide a modular approach to NLP model building.\n", + "We have shown one approach to building a Transformer here, but KerasHub supports an ever\n", + "growing array of components for preprocessing text and building models. We hope it makes\n", + "it easier to experiment on solutions to your natural language problems." + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "transformer_pretraining", + "private_outputs": false, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/guides/ipynb/keras_hub/upload.ipynb b/guides/ipynb/keras_hub/upload.ipynb new file mode 100644 index 0000000000..b9824069d6 --- /dev/null +++ b/guides/ipynb/keras_hub/upload.ipynb @@ -0,0 +1,521 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Uploading Models with KerasHub\n", + "\n", + "**Author:** [Samaneh Saadat](https://github.com/SamanehSaadat/), [Matthew Watson](https://github.com/mattdangerw/)
\n", + "**Date created:** 2024/04/29
\n", + "**Last modified:** 2024/04/29
\n", + "**Description:** An introduction on how to upload a fine-tuned KerasHub model to model hubs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Introduction\n", + "\n", + "Fine-tuning a machine learning model can yield impressive results for specific tasks.\n", + "Uploading your fine-tuned model to a model hub allows you to share it with the broader community.\n", + "By sharing your models, you'll enhance accessibility for other researchers and developers,\n", + "making your contributions an integral part of the machine learning landscape.\n", + "This can also streamline the integration of your model into real-world applications.\n", + "\n", + "This guide walks you through how to upload your fine-tuned models to popular model hubs such as\n", + "[Kaggle Models](https://www.kaggle.com/models) and [Hugging Face Hub](https://huggingface.co/models)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Setup\n", + "\n", + "Let's start by installing and importing all the libraries we need. We use KerasHub for this guide." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "!pip install -q --upgrade keras-hub huggingface-hub kagglehub" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "os.environ[\"KERAS_BACKEND\"] = \"jax\"\n", + "\n", + "import keras_hub\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Data\n", + "\n", + "We can use the IMDB reviews dataset for this guide. Let's load the dataset from `tensorflow_dataset`." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "import tensorflow_datasets as tfds\n", + "\n", + "imdb_train, imdb_test = tfds.load(\n", + " \"imdb_reviews\",\n", + " split=[\"train\", \"test\"],\n", + " as_supervised=True,\n", + " batch_size=4,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "We only use a small subset of the training samples to make the guide run faster.\n", + "However, if you need a higher quality model, consider using a larger number of training samples." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "imdb_train = imdb_train.take(100)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Task Upload\n", + "\n", + "A `keras_hub.models.Task`, wraps a `keras_hub.models.Backbone` and a `keras_hub.models.Preprocessor` to create\n", + "a model that can be directly used for training, fine-tuning, and prediction for a given text problem.\n", + "In this section, we explain how to create a `Task`, fine-tune and upload it to a model hub." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Load Model\n", + "\n", + "If you want to build a Causal LM based on a base model, simply call `keras_hub.models.CausalLM.from_preset`\n", + "and pass a built-in preset identifier." 
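As a quick aside (not part of the original guide), a loaded causal LM can be sanity-checked by sampling a continuation before any fine-tuning. The sketch below loads the same preset as the next cell, relies on the setup imports above, and uses an arbitrary prompt:

```python
# Illustrative sanity check: load the preset and sample from the
# un-finetuned model. The prompt string is an arbitrary example.
causal_lm = keras_hub.models.CausalLM.from_preset("gpt2_base_en")
print(causal_lm.generate("The best movie I have ever watched is", max_length=64))
```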
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "causal_lm = keras_hub.models.CausalLM.from_preset(\"gpt2_base_en\")\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Fine-tune Model\n", + "\n", + "After loading the model, you can call `.fit()` on the model to fine-tune it.\n", + "Here, we fine-tune the model on the IMDB reviews which makes the model movie domain-specific." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Drop labels and keep the review text only for the Causal LM.\n", + "imdb_train_reviews = imdb_train.map(lambda x, y: x)\n", + "\n", + "# Fine-tune the Causal LM.\n", + "causal_lm.fit(imdb_train_reviews)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Save the Model Locally\n", + "\n", + "To upload a model, you need to first save the model locally using `save_to_preset`." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "preset_dir = \"./gpt2_imdb\"\n", + "causal_lm.save_to_preset(preset_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "Let's see the saved files." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "os.listdir(preset_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Load a Locally Saved Model\n", + "\n", + "A model that is saved to a local preset can be loaded using `from_preset`.\n", + "What you save in, is what you get back out." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "causal_lm = keras_hub.models.CausalLM.from_preset(preset_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "You can also load the `keras_hub.models.Backbone` and `keras_hub.models.Tokenizer` objects from this preset directory.\n", + "Note that these objects are equivalent to `causal_lm.backbone` and `causal_lm.preprocessor.tokenizer` above." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "backbone = keras_hub.models.Backbone.from_preset(preset_dir)\n", + "tokenizer = keras_hub.models.Tokenizer.from_preset(preset_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Upload the Model to a Model Hub\n", + "\n", + "After saving a preset to a directory, this directory can be uploaded to a model hub such as Kaggle or Hugging Face directly from the KerasHub library.\n", + "To upload the model to Kaggle, the URI must start with `kaggle://` and to upload to Hugging Face, it should start with `hf://`." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Upload to Kaggle" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "To upload a model to Kaggle, first, we need to authenticate with Kaggle.\n", + "This can in one of the following ways:\n", + "1. Set environment variables `KAGGLE_USERNAME` and `KAGGLE_KEY`.\n", + "2. 
Provide a local `~/.kaggle/kaggle.json`.\n", + "3. Call `kagglehub.login()`.\n", + "\n", + "Let's make sure we are logged in before continuing." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "import kagglehub\n", + "\n", + "if \"KAGGLE_USERNAME\" not in os.environ or \"KAGGLE_KEY\" not in os.environ:\n", + " kagglehub.login()\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "To upload a model we can use `keras_hub.upload_preset(uri, preset_dir)` API where `uri` has the format of\n", + "`kaggle:////Keras/` for uploading to Kaggle and `preset_dir` is the directory that the model is saved in.\n", + "\n", + "Running the following uploads the model that is saved in `preset_dir` to Kaggle:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "kaggle_username = kagglehub.whoami()[\"username\"]\n", + "kaggle_uri = f\"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb\"\n", + "keras_hub.upload_preset(kaggle_uri, preset_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "### Upload to Hugging Face" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "To upload a model to Hugging Face, first, we need to authenticate with Hugging Face.\n", + "This can in one of the following ways:\n", + "1. Set environment variables `HF_USERNAME` and `HF_TOKEN`.\n", + "2. Call `huggingface_hub.notebook_login()`.\n", + "\n", + "Let's make sure we are logged in before coninuing." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "import huggingface_hub\n", + "\n", + "if \"HF_USERNAME\" not in os.environ or \"HF_TOKEN\" not in os.environ:\n", + " huggingface_hub.notebook_login()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "`keras_hub.upload_preset(uri, preset_dir)` can be used to upload a model to Hugging Face if `uri` has the format of\n", + "`kaggle:///`.\n", + "\n", + "Running the following uploads the model that is saved in `preset_dir` to Hugging Face:" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "hf_username = huggingface_hub.whoami()[\"name\"]\n", + "hf_uri = f\"hf://{hf_username}/gpt2_imdb\"\n", + "keras_hub.upload_preset(hf_uri, preset_dir)\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "## Load a User Uploaded Model\n", + "\n", + "After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`.\n", + "\n", + "```python\n", + "causal_lm = keras_hub.models.CausalLM.from_preset(\n", + " f\"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb\"\n", + ")\n", + "```\n", + "\n", + "We can also load the model uploaded to Hugging Face by calling `from_preset`.\n", + "\n", + "```python\n", + "causal_lm = keras_hub.models.CausalLM.from_preset(f\"hf://{hf_username}/gpt2_imdb\")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "# Classifier Upload\n", + "\n", + "Uploading a classifier model is similar to Causal LM upload.\n", + "To upload the fine-tuned model, first, the model should be saved to a local 
directory using `save_to_preset`\n", + "API and then it can be uploaded via `keras_hub.upload_preset`." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "colab_type": "code" + }, + "outputs": [], + "source": [ + "# Load the base model.\n", + "classifier = keras_hub.models.Classifier.from_preset(\n", + " \"bert_tiny_en_uncased\", num_classes=2\n", + ")\n", + "\n", + "# Fine-tune the classifier.\n", + "classifier.fit(imdb_train)\n", + "\n", + "# Save the model to a local preset directory.\n", + "preset_dir = \"./bert_tiny_imdb\"\n", + "classifier.save_to_preset(preset_dir)\n", + "\n", + "# Upload to Kaggle.\n", + "keras_hub.upload_preset(\n", + " f\"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb\", preset_dir\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "colab_type": "text" + }, + "source": [ + "After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`.\n", + "\n", + "```python\n", + "classifier = keras_hub.models.Classifier.from_preset(\n", + " f\"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb\"\n", + ")\n", + "```" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "collapsed_sections": [], + "name": "upload", + "private_outputs": false, + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.0" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/guides/keras_cv/classification_with_keras_cv.py b/guides/keras_cv/classification_with_keras_cv.py index e7ebe45853..4f57923b05 100644 --- a/guides/keras_cv/classification_with_keras_cv.py +++ b/guides/keras_cv/classification_with_keras_cv.py @@ -29,7 +29,7 @@ We use Professor Keras, the official Keras mascot, as a visual reference for the complexity of the material: -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_evolution.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_evolution.png) """ """shell @@ -62,7 +62,7 @@ """ ## Inference with a pretrained classifier -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_beginner.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_beginner.png) Let's get started with the simplest KerasCV API: a pretrained classifier. In this example, we will construct a classifier that was @@ -143,7 +143,7 @@ ## Fine tuning a pretrained classifier -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_intermediate.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_intermediate.png) When labeled images specific to our task are available, fine-tuning a custom classifier can improve performance. @@ -233,7 +233,7 @@ def preprocess_inputs(image, label): """ ## Train a Classifier from Scratch -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_advanced.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_advanced.png) Now that we've gotten our hands dirty with classification, let's take on one last task: training a classification model from scratch! 
diff --git a/guides/keras_cv/object_detection_keras_cv.py b/guides/keras_cv/object_detection_keras_cv.py index 474db57076..3149f8eaa9 100644 --- a/guides/keras_cv/object_detection_keras_cv.py +++ b/guides/keras_cv/object_detection_keras_cv.py @@ -92,7 +92,7 @@ """ ## Perform detections with a pretrained model -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_beginner.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_beginner.png) The highest level API in the KerasCV Object Detection API is the `keras_cv.models` API. This API includes fully pretrained object detection models, such as @@ -334,7 +334,7 @@ class mapping now. ## Train a custom object detection model -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_advanced.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_advanced.png) Whether you're an object detection amateur or a well seasoned veteran, assembling an object detection pipeline from scratch is a massive undertaking. diff --git a/guides/keras_cv/semantic_segmentation_deeplab_v3_plus.py b/guides/keras_cv/semantic_segmentation_deeplab_v3_plus.py index 2cc4527816..bc5edc21fc 100644 --- a/guides/keras_cv/semantic_segmentation_deeplab_v3_plus.py +++ b/guides/keras_cv/semantic_segmentation_deeplab_v3_plus.py @@ -8,7 +8,7 @@ """ """ -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_intermediate.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_intermediate.png) ## Background Semantic segmentation is a type of computer vision task that involves assigning a diff --git a/guides/keras_hub/getting_started.py b/guides/keras_hub/getting_started.py new file mode 100644 index 0000000000..6aaaeb0510 --- /dev/null +++ b/guides/keras_hub/getting_started.py @@ -0,0 +1,633 @@ +""" +Title: Getting Started with KerasHub +Author: [Jonathan Bischof](https://github.com/jbischof) +Date created: 2022/12/15 +Last modified: 2023/07/01 +Description: An introduction to the KerasHub API. +Accelerator: GPU +""" + +""" +## Introduction + +KerasHub is a natural language processing library that supports users through +their entire development cycle. Our workflows are built from modular components +that have state-of-the-art preset weights and architectures when used +out-of-the-box and are easily customizable when more control is needed. + +This library is an extension of the core Keras API; all high-level modules are +[`Layers`](/api/layers/) or [`Models`](/api/models/). If you are familiar with Keras, +congratulations! You already understand most of KerasHub. + +KerasHub uses Keras 3 to work with any of TensorFlow, Pytorch and Jax. In the +guide below, we will use the `jax` backend for training our models, and +[tf.data](https://www.tensorflow.org/guide/data) for efficiently running our +input preprocessing. But feel free to mix things up! This guide runs in +TensorFlow or PyTorch backends with zero changes, simply update the +`KERAS_BACKEND` below. 
+ +This guide demonstrates our modular approach using a sentiment analysis example at six +levels of complexity: + +* Inference with a pretrained classifier +* Fine tuning a pretrained backbone +* Fine tuning with user-controlled preprocessing +* Fine tuning a custom model +* Pretraining a backbone model +* Build and train your own transformer from scratch + +Throughout our guide, we use Professor Keras, the official Keras mascot, as a visual +reference for the complexity of the material: + +drawing +""" + +"""shell +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" + +import keras_hub +import keras + +# Use mixed precision to speed up all training in this guide. +keras.mixed_precision.set_global_policy("mixed_float16") + +""" +## API quickstart + +Our highest level API is `keras_hub.models`. These symbols cover the complete user +journey of converting strings to tokens, tokens to dense features, and dense features to +task-specific output. For each `XX` architecture (e.g., `Bert`), we offer the following +modules: + +* **Tokenizer**: `keras_hub.models.XXTokenizer` + * **What it does**: Converts strings to sequences of token ids. + * **Why it's important**: The raw bytes of a string are too high dimensional to be useful + features so we first map them to a small number of tokens, for example `"The quick brown + fox"` to `["the", "qu", "##ick", "br", "##own", "fox"]`. + * **Inherits from**: `keras.layers.Layer`. +* **Preprocessor**: `keras_hub.models.XXPreprocessor` + * **What it does**: Converts strings to a dictionary of preprocessed tensors consumed by + the backbone, starting with tokenization. + * **Why it's important**: Each model uses special tokens and extra tensors to understand + the input such as delimiting input segments and identifying padding tokens. Padding each + sequence to the same length improves computational efficiency. + * **Has a**: `XXTokenizer`. + * **Inherits from**: `keras.layers.Layer`. +* **Backbone**: `keras_hub.models.XXBackbone` + * **What it does**: Converts preprocessed tensors to dense features. *Does not handle + strings; call the preprocessor first.* + * **Why it's important**: The backbone distills the input tokens into dense features that + can be used in downstream tasks. It is generally pretrained on a language modeling task + using massive amounts of unlabeled data. Transferring this information to a new task is a + major breakthrough in modern NLP. + * **Inherits from**: `keras.Model`. +* **Task**: e.g., `keras_hub.models.XXClassifier` + * **What it does**: Converts strings to task-specific output (e.g., classification + probabilities). + * **Why it's important**: Task models combine string preprocessing and the backbone model + with task-specific `Layers` to solve a problem such as sentence classification, token + classification, or text generation. The additional `Layers` must be fine-tuned on labeled + data. + * **Has a**: `XXBackbone` and `XXPreprocessor`. + * **Inherits from**: `keras.Model`. + +Here is the modular hierarchy for `BertClassifier` (all relationships are compositional): + +drawing + +All modules can be used independently and have a `from_preset()` method in addition to +the standard constructor that instantiates the class with **preset** architecture and +weights (see examples below). +""" + +""" +## Data + +We will use a running example of sentiment analysis of IMDB movie reviews. 
In this task, +we use the text to predict whether the review was positive (`label = 1`) or negative +(`label = 0`). + +We load the data using `keras.utils.text_dataset_from_directory`, which utilizes the +powerful `tf.data.Dataset` format for examples. +""" + +"""shell +curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +tar -xf aclImdb_v1.tar.gz +# Remove unsupervised examples +rm -r aclImdb/train/unsup +""" + +BATCH_SIZE = 16 +imdb_train = keras.utils.text_dataset_from_directory( + "aclImdb/train", + batch_size=BATCH_SIZE, +) +imdb_test = keras.utils.text_dataset_from_directory( + "aclImdb/test", + batch_size=BATCH_SIZE, +) + +# Inspect first review +# Format is (review text tensor, label tensor) +print(imdb_train.unbatch().take(1).get_single_element()) + + +""" +## Inference with a pretrained classifier + +drawing + +The highest level module in KerasHub is a **task**. A **task** is a `keras.Model` +consisting of a (generally pretrained) **backbone** model and task-specific layers. +Here's an example using `keras_hub.models.BertClassifier`. + +**Note**: Outputs are the logits per class (e.g., `[0, 0]` is 50% chance of positive). The output is +[negative, positive] for binary classification. +""" + +classifier = keras_hub.models.BertClassifier.from_preset("bert_tiny_en_uncased_sst2") +# Note: batched inputs expected so must wrap string in iterable +classifier.predict(["I love modular workflows in keras-hub!"]) + +""" +All **tasks** have a `from_preset` method that constructs a `keras.Model` instance with +preset preprocessing, architecture and weights. This means that we can pass raw strings +in any format accepted by a `keras.Model` and get output specific to our task. + +This particular **preset** is a `"bert_tiny_uncased_en"` **backbone** fine-tuned on +`sst2`, another movie review sentiment analysis (this time from Rotten Tomatoes). We use +the `tiny` architecture for demo purposes, but larger models are recommended for SoTA +performance. For all the task-specific presets available for `BertClassifier`, see +our keras.io [models page](https://keras.io/api/keras_hub/models/). + +Let's evaluate our classifier on the IMDB dataset. You will note we don't need to +call `keras.Model.compile` here. All **task** models like `BertClassifier` ship with +compilation defaults, meaning we can just call `keras.Model.evaluate` directly. You +can always call compile as normal to override these defaults (e.g. to add new metrics). + +The output below is [loss, accuracy], +""" + +classifier.evaluate(imdb_test) + +""" +Our result is 78% accuracy without training anything. Not bad! +""" + +""" +## Fine tuning a pretrained BERT backbone + +drawing + +When labeled text specific to our task is available, fine-tuning a custom classifier can +improve performance. If we want to predict IMDB review sentiment, using IMDB data should +perform better than Rotten Tomatoes data! And for many tasks, no relevant pretrained model +will be available (e.g., categorizing customer reviews). + +The workflow for fine-tuning is almost identical to above, except that we request a +**preset** for the **backbone**-only model rather than the entire classifier. When passed +a **backbone** **preset**, a **task** `Model` will randomly initialize all task-specific +layers in preparation for training. For all the **backbone** presets available for +`BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_hub/models/). 
+ +To train your classifier, use `keras.Model.fit` as with any other +`keras.Model`. As with our inference example, we can rely on the compilation +defaults for the **task** and skip `keras.Model.compile`. As preprocessing is +included, we again pass the raw data. +""" + +classifier = keras_hub.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", + num_classes=2, +) +classifier.fit( + imdb_train, + validation_data=imdb_test, + epochs=1, +) + +""" +Here we see a significant lift in validation accuracy (0.78 -> 0.87) with a single epoch of +training even though the IMDB dataset is much smaller than `sst2`. +""" + +""" +## Fine tuning with user-controlled preprocessing +drawing + +For some advanced training scenarios, users might prefer direct control over +preprocessing. For large datasets, examples can be preprocessed in advance and saved to +disk or preprocessed by a separate worker pool using `tf.data.experimental.service`. In +other cases, custom preprocessing is needed to handle the inputs. + +Pass `preprocessor=None` to the constructor of a **task** `Model` to skip automatic +preprocessing or pass a custom `BertPreprocessor` instead. +""" + +""" +### Separate preprocessing from the same preset + +Each model architecture has a parallel **preprocessor** `Layer` with its own +`from_preset` constructor. Using the same **preset** for this `Layer` will return the +matching **preprocessor** as the **task**. + +In this workflow we train the model over three epochs using `tf.data.Dataset.cache()`, +which computes the preprocessing once and caches the result before fitting begins. + +**Note:** we can use `tf.data` for preprocessing while running on the +Jax or PyTorch backend. The input dataset will automatically be converted to +backend native tensor types during fit. In fact, given the efficiency of `tf.data` +for running preprocessing, this is good practice on all backends. +""" + +import tensorflow as tf + +preprocessor = keras_hub.models.BertPreprocessor.from_preset( + "bert_tiny_en_uncased", + sequence_length=512, +) + +# Apply the preprocessor to every sample of train and test data using `map()`. +# `tf.data.AUTOTUNE` and `prefetch()` are options to tune performance, see +# https://www.tensorflow.org/guide/data_performance for details. + +# Note: only call `cache()` if you training data fits in CPU memory! +imdb_train_cached = ( + imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) +imdb_test_cached = ( + imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) + +classifier = keras_hub.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", preprocessor=None, num_classes=2 +) +classifier.fit( + imdb_train_cached, + validation_data=imdb_test_cached, + epochs=3, +) + +""" +After three epochs, our validation accuracy has only increased to 0.88. This is both a +function of the small size of our dataset and our model. To exceed 90% accuracy, try +larger **presets** such as `"bert_base_en_uncased"`. For all the **backbone** presets +available for `BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_hub/models/). +""" + +""" +### Custom preprocessing + +In cases where custom preprocessing is required, we offer direct access to the +`Tokenizer` class that maps raw strings to tokens. It also has a `from_preset()` +constructor to get the vocabulary matching pretraining. + +**Note:** `BertTokenizer` does not pad sequences by default, so the output is +ragged (each sequence has varying length). 
The `MultiSegmentPacker` below +handles padding these ragged sequences to dense tensor types (e.g. `tf.Tensor` +or `torch.Tensor`). +""" + +tokenizer = keras_hub.models.BertTokenizer.from_preset("bert_tiny_en_uncased") +tokenizer(["I love modular workflows!", "Libraries over frameworks!"]) + +# Write your own packer or use one of our `Layers` +packer = keras_hub.layers.MultiSegmentPacker( + start_value=tokenizer.cls_token_id, + end_value=tokenizer.sep_token_id, + # Note: This cannot be longer than the preset's `sequence_length`, and there + # is no check for a custom preprocessor! + sequence_length=64, +) + + +# This function that takes a text sample `x` and its +# corresponding label `y` as input and converts the +# text into a format suitable for input into a BERT model. +def preprocessor(x, y): + token_ids, segment_ids = packer(tokenizer(x)) + x = { + "token_ids": token_ids, + "segment_ids": segment_ids, + "padding_mask": token_ids != 0, + } + return x, y + + +imdb_train_preprocessed = imdb_train.map(preprocessor, tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) +imdb_test_preprocessed = imdb_test.map(preprocessor, tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) + +# Preprocessed example +print(imdb_train_preprocessed.unbatch().take(1).get_single_element()) + +""" +## Fine tuning with a custom model +drawing + +For more advanced applications, an appropriate **task** `Model` may not be available. In +this case, we provide direct access to the **backbone** `Model`, which has its own +`from_preset` constructor and can be composed with custom `Layer`s. Detailed examples can +be found at our [transfer learning guide](https://keras.io/guides/transfer_learning/). + +A **backbone** `Model` does not include automatic preprocessing but can be paired with a +matching **preprocessor** using the same **preset** as shown in the previous workflow. + +In this workflow, we experiment with freezing our backbone model and adding two trainable +transformer layers to adapt to the new input. + +**Note**: We can ignore the warning about gradients for the `pooled_dense` layer because +we are using BERT's sequence output. +""" + +preprocessor = keras_hub.models.BertPreprocessor.from_preset("bert_tiny_en_uncased") +backbone = keras_hub.models.BertBackbone.from_preset("bert_tiny_en_uncased") + +imdb_train_preprocessed = ( + imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) +imdb_test_preprocessed = ( + imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) + +backbone.trainable = False +inputs = backbone.input +sequence = backbone(inputs)["sequence_output"] +for _ in range(2): + sequence = keras_hub.layers.TransformerEncoder( + num_heads=2, + intermediate_dim=512, + dropout=0.1, + )(sequence) +# Use [CLS] token output to classify +outputs = keras.layers.Dense(2)(sequence[:, backbone.cls_token_index, :]) + +model = keras.Model(inputs, outputs) +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(5e-5), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) +model.summary() +model.fit( + imdb_train_preprocessed, + validation_data=imdb_test_preprocessed, + epochs=3, +) + +""" +This model achieves reasonable accuracy despite having only 10% of the trainable parameters +of our `BertClassifier` model. Each training step takes about 1/3 of the time---even +accounting for cached preprocessing. 
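+
+Note that this custom model has no built-in preprocessing, so running it on raw text
+means calling the matching **preprocessor** yourself and predicting on the resulting
+feature dictionary. A minimal sketch (the review strings here are made-up examples):
+
+```python
+raw_samples = [
+    "A heartfelt and beautifully shot film.",
+    "Two hours I will never get back.",
+]
+# `preprocessor` was loaded from the same preset above; it returns the
+# token_ids / segment_ids / padding_mask dictionary the backbone expects.
+features = preprocessor(raw_samples)
+logits = model.predict(features)
+```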
+""" + +""" +## Pretraining a backbone model +drawing + +Do you have access to large unlabeled datasets in your domain? Are they around the +same size as used to train popular backbones such as BERT, RoBERTa, or GPT2 (XX+ GiB)? If +so, you might benefit from domain-specific pretraining of your own backbone models. + +NLP models are generally pretrained on a language modeling task, predicting masked words +given the visible words in an input sentence. For example, given the input +`"The fox [MASK] over the [MASK] dog"`, the model might be asked to predict `["jumped", "lazy"]`. +The lower layers of this model are then packaged as a **backbone** to be combined with +layers relating to a new task. + +The KerasHub library offers SoTA **backbones** and **tokenizers** to be trained from +scratch without presets. + +In this workflow, we pretrain a BERT **backbone** using our IMDB review text. We skip the +"next sentence prediction" (NSP) loss because it adds significant complexity to the data +processing and was dropped by later models like RoBERTa. See our e2e +[Transformer pretraining](https://keras.io/guides/keras_hub/transformer_pretraining/#pretraining) +for step-by-step details on how to replicate the original paper. +""" + +""" +### Preprocessing +""" + +# All BERT `en` models have the same vocabulary, so reuse preprocessor from +# "bert_tiny_en_uncased" +preprocessor = keras_hub.models.BertPreprocessor.from_preset( + "bert_tiny_en_uncased", + sequence_length=256, +) +packer = preprocessor.packer +tokenizer = preprocessor.tokenizer + +# keras.Layer to replace some input tokens with the "[MASK]" token +masker = keras_hub.layers.MaskedLMMaskGenerator( + vocabulary_size=tokenizer.vocabulary_size(), + mask_selection_rate=0.25, + mask_selection_length=64, + mask_token_id=tokenizer.token_to_id("[MASK]"), + unselectable_token_ids=[ + tokenizer.token_to_id(x) for x in ["[CLS]", "[PAD]", "[SEP]"] + ], +) + + +def preprocess(inputs, label): + inputs = preprocessor(inputs) + masked_inputs = masker(inputs["token_ids"]) + # Split the masking layer outputs into a (features, labels, and weights) + # tuple that we can use with keras.Model.fit(). 
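+    # `masked_inputs` holds the masked "token_ids", the chosen "mask_positions",
+    # the original ids at those positions ("mask_ids"), and "mask_weights" that
+    # zero out unused mask slots when fewer tokens than the limit were selected.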
+ features = { + "token_ids": masked_inputs["token_ids"], + "segment_ids": inputs["segment_ids"], + "padding_mask": inputs["padding_mask"], + "mask_positions": masked_inputs["mask_positions"], + } + labels = masked_inputs["mask_ids"] + weights = masked_inputs["mask_weights"] + return features, labels, weights + + +pretrain_ds = imdb_train.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) +pretrain_val_ds = imdb_test.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Tokens with ID 103 are "masked" +print(pretrain_ds.unbatch().take(1).get_single_element()) + +""" +### Pretraining model +""" + +# BERT backbone +backbone = keras_hub.models.BertBackbone( + vocabulary_size=tokenizer.vocabulary_size(), + num_layers=2, + num_heads=2, + hidden_dim=128, + intermediate_dim=512, +) + +# Language modeling head +mlm_head = keras_hub.layers.MaskedLMHead( + token_embedding=backbone.token_embedding, +) + +inputs = { + "token_ids": keras.Input(shape=(None,), dtype=tf.int32, name="token_ids"), + "segment_ids": keras.Input(shape=(None,), dtype=tf.int32, name="segment_ids"), + "padding_mask": keras.Input(shape=(None,), dtype=tf.int32, name="padding_mask"), + "mask_positions": keras.Input(shape=(None,), dtype=tf.int32, name="mask_positions"), +} + +# Encoded token sequence +sequence = backbone(inputs)["sequence_output"] + +# Predict an output word for each masked input token. +# We use the input token embedding to project from our encoded vectors to +# vocabulary logits, which has been shown to improve training efficiency. +outputs = mlm_head(sequence, mask_positions=inputs["mask_positions"]) + +# Define and compile our pretraining model. +pretraining_model = keras.Model(inputs, outputs) +pretraining_model.summary() +pretraining_model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(learning_rate=5e-4), + weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) + +# Pretrain on IMDB dataset +pretraining_model.fit( + pretrain_ds, + validation_data=pretrain_val_ds, + epochs=3, # Increase to 6 for higher accuracy +) + +""" +After pretraining save your `backbone` submodel to use in a new task! +""" + +""" +## Build and train your own transformer from scratch +drawing + +Want to implement a novel transformer architecture? The KerasHub library offers all the +low-level modules used to build SoTA architectures in our `models` API. This includes the +`keras_hub.tokenizers` API which allows you to train your own subword tokenizer using +`WordPieceTokenizer`, `BytePairTokenizer`, or `SentencePieceTokenizer`. + +In this workflow, we train a custom tokenizer on the IMDB data and design a backbone with +custom transformer architecture. For simplicity, we then train directly on the +classification task. Interested in more details? 
We wrote an entire guide to pretraining +and finetuning a custom transformer on +[keras.io](https://keras.io/guides/keras_hub/transformer_pretraining/), +""" + +""" +### Train custom vocabulary from IMDB data +""" + +vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( + imdb_train.map(lambda x, y: x), + vocabulary_size=20_000, + lowercase=True, + strip_accents=True, + reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"], +) +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( + vocabulary=vocab, + lowercase=True, + strip_accents=True, + oov_token="[UNK]", +) + +""" +### Preprocess data with a custom tokenizer +""" + +packer = keras_hub.layers.StartEndPacker( + start_value=tokenizer.token_to_id("[START]"), + end_value=tokenizer.token_to_id("[END]"), + pad_value=tokenizer.token_to_id("[PAD]"), + sequence_length=512, +) + + +def preprocess(x, y): + token_ids = packer(tokenizer(x)) + return token_ids, y + + +imdb_preproc_train_ds = imdb_train.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +imdb_preproc_val_ds = imdb_test.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +print(imdb_preproc_train_ds.unbatch().take(1).get_single_element()) + +""" + +### Design a tiny transformer +""" + +token_id_input = keras.Input( + shape=(None,), + dtype="int32", + name="token_ids", +) +outputs = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=len(vocab), + sequence_length=packer.sequence_length, + embedding_dim=64, +)(token_id_input) +outputs = keras_hub.layers.TransformerEncoder( + num_heads=2, + intermediate_dim=128, + dropout=0.1, +)(outputs) +# Use "[START]" token to classify +outputs = keras.layers.Dense(2)(outputs[:, 0, :]) +model = keras.Model( + inputs=token_id_input, + outputs=outputs, +) + +model.summary() + +""" +### Train the transformer directly on the classification objective +""" + +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(5e-5), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) +model.fit( + imdb_preproc_train_ds, + validation_data=imdb_preproc_val_ds, + epochs=3, +) + +""" +Excitingly, our custom classifier is similar to the performance of fine-tuning +`"bert_tiny_en_uncased"`! To see the advantages of pretraining and exceed 90% accuracy we +would need to use larger **presets** such as `"bert_base_en_uncased"`. +""" diff --git a/guides/keras_hub/transformer_pretraining.py b/guides/keras_hub/transformer_pretraining.py new file mode 100644 index 0000000000..bf625ee18d --- /dev/null +++ b/guides/keras_hub/transformer_pretraining.py @@ -0,0 +1,468 @@ +""" +Title: Pretraining a Transformer from scratch with KerasHub +Author: [Matthew Watson](https://github.com/mattdangerw/) +Date created: 2022/04/18 +Last modified: 2023/07/15 +Description: Use KerasHub to train a Transformer model from scratch. +Accelerator: GPU +Converted to Keras 3 by: [Anshuman Mishra](https://github.com/shivance) +""" + +""" +KerasHub aims to make it easy to build state-of-the-art text processing models. In this +guide, we will show how library components simplify pretraining and fine-tuning a +Transformer model from scratch. + +This guide is broken into three parts: + +1. *Setup*, task definition, and establishing a baseline. +2. *Pretraining* a Transformer model. +3. *Fine-tuning* the Transformer model on our classification task. 
+""" + +""" +## Setup + +The following guide uses Keras 3 to work in any of `tensorflow`, `jax` or +`torch`. We select the `jax` backend below, which will give us a particularly +fast train step below, but feel free to mix it up. +""" + +"""shell +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" + + +import keras_hub +import tensorflow as tf +import keras + +""" +Next up, we can download two datasets. + +- [SST-2](https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary) a text +classification dataset and our "end goal". This dataset is often used to benchmark +language models. +- [WikiText-103](https://paperswithcode.com/dataset/wikitext-103): A medium sized +collection of featured articles from English Wikipedia, which we will use for +pretraining. + +Finally, we will download a WordPiece vocabulary, to do sub-word tokenization later on in +this guide. +""" + +# Download pretraining data. +keras.utils.get_file( + origin="https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip", + extract=True, +) +wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-103-raw/") + +# Download finetuning data. +keras.utils.get_file( + origin="https://dl.fbaipublicfiles.com/glue/data/SST-2.zip", + extract=True, +) +sst_dir = os.path.expanduser("~/.keras/datasets/SST-2/") + +# Download vocabulary data. +vocab_file = keras.utils.get_file( + origin="https://storage.googleapis.com/tensorflow/keras-hub/examples/bert/bert_vocab_uncased.txt", +) + +""" +Next, we define some hyperparameters we will use during training. +""" + +# Preprocessing params. +PRETRAINING_BATCH_SIZE = 128 +FINETUNING_BATCH_SIZE = 32 +SEQ_LENGTH = 128 +MASK_RATE = 0.25 +PREDICTIONS_PER_SEQ = 32 + +# Model params. +NUM_LAYERS = 3 +MODEL_DIM = 256 +INTERMEDIATE_DIM = 512 +NUM_HEADS = 4 +DROPOUT = 0.1 +NORM_EPSILON = 1e-5 + +# Training params. +PRETRAINING_LEARNING_RATE = 5e-4 +PRETRAINING_EPOCHS = 8 +FINETUNING_LEARNING_RATE = 5e-5 +FINETUNING_EPOCHS = 3 + +""" +### Load data + +We load our data with [tf.data](https://www.tensorflow.org/guide/data), which will allow +us to define input pipelines for tokenizing and preprocessing text. +""" + +# Load SST-2. +sst_train_ds = tf.data.experimental.CsvDataset( + sst_dir + "train.tsv", [tf.string, tf.int32], header=True, field_delim="\t" +).batch(FINETUNING_BATCH_SIZE) +sst_val_ds = tf.data.experimental.CsvDataset( + sst_dir + "dev.tsv", [tf.string, tf.int32], header=True, field_delim="\t" +).batch(FINETUNING_BATCH_SIZE) + +# Load wikitext-103 and filter out short lines. +wiki_train_ds = ( + tf.data.TextLineDataset(wiki_dir + "wiki.train.raw") + .filter(lambda x: tf.strings.length(x) > 100) + .batch(PRETRAINING_BATCH_SIZE) +) +wiki_val_ds = ( + tf.data.TextLineDataset(wiki_dir + "wiki.valid.raw") + .filter(lambda x: tf.strings.length(x) > 100) + .batch(PRETRAINING_BATCH_SIZE) +) + +# Take a peak at the sst-2 dataset. +print(sst_train_ds.unbatch().batch(4).take(1).get_single_element()) + +""" +You can see that our `SST-2` dataset contains relatively short snippets of movie review +text. Our goal is to predict the sentiment of the snippet. A label of 1 indicates +positive sentiment, and a label of 0 negative sentiment. +""" + +""" +### Establish a baseline + +As a first step, we will establish a baseline of good performance. We don't actually need +KerasHub for this, we can just use core Keras layers. 
+
+We will train a simple bag-of-words model, where we learn a positive or negative weight
+for each word in our vocabulary. A sample's score is simply the sum of the weights of all
+words that are present in the sample.
+"""
+
+# This layer will turn our input sentence into a list of 1s and 0s the same size as
+# our vocabulary, indicating whether a word is present or absent.
+multi_hot_layer = keras.layers.TextVectorization(
+    max_tokens=4000, output_mode="multi_hot"
+)
+multi_hot_layer.adapt(sst_train_ds.map(lambda x, y: x))
+multi_hot_ds = sst_train_ds.map(lambda x, y: (multi_hot_layer(x), y))
+multi_hot_val_ds = sst_val_ds.map(lambda x, y: (multi_hot_layer(x), y))
+
+# We then learn a logistic regression over that layer, and that's our entire
+# baseline model!
+
+inputs = keras.Input(shape=(4000,), dtype="int32")
+outputs = keras.layers.Dense(1, activation="sigmoid")(inputs)
+baseline_model = keras.Model(inputs, outputs)
+baseline_model.compile(loss="binary_crossentropy", metrics=["accuracy"])
+baseline_model.fit(multi_hot_ds, validation_data=multi_hot_val_ds, epochs=5)
+
+"""
+A bag-of-words approach can be fast and surprisingly powerful, especially when input
+examples contain a large number of words. With shorter sequences, it can hit a
+performance ceiling.
+
+To do better, we would like to build a model that can evaluate words *in context*. Instead
+of evaluating each word in a void, we need to use the information contained in the
+*entire ordered sequence* of our input.
+
+This runs us into a problem. `SST-2` is a very small dataset, and there's simply not enough
+example text to attempt to build a larger, more parameterized model that can learn on a
+sequence. We would quickly start to overfit and memorize our training set, without any
+increase in our ability to generalize to unseen examples.
+
+Enter **pretraining**, which will allow us to learn on a larger corpus, and transfer our
+knowledge to the `SST-2` task. And enter **KerasHub**, which will allow us to pretrain a
+particularly powerful model, the Transformer, with ease.
+"""
+
+"""
+## Pretraining
+
+To beat our baseline, we will leverage the `WikiText103` dataset, an unlabeled
+collection of Wikipedia articles that is much bigger than `SST-2`.
+
+We are going to train a *transformer*, a highly expressive model which will learn
+to embed each word in our input as a low dimensional vector. Our Wikipedia dataset has no
+labels, so we will use an unsupervised training objective called the *Masked Language
+Modeling* (MaskedLM) objective.
+
+Essentially, we will be playing a big game of "guess the missing word". For each input
+sample we will obscure 25% of our input data, and train our model to predict the parts we
+covered up.
+"""
+
+"""
+### Preprocess data for the MaskedLM task
+
+Our text preprocessing for the MaskedLM task will occur in two stages.
+
+1. Tokenize input text into integer sequences of token ids.
+2. Mask certain positions in our input to predict on.
+
+To tokenize, we can use a `keras_hub.tokenizers.Tokenizer` -- the KerasHub building block
+for transforming text into sequences of integer token ids.
+
+In particular, we will use `keras_hub.tokenizers.WordPieceTokenizer` which does
+*sub-word* tokenization. Sub-word tokenization is popular when training models on large
+text corpora. Essentially, it allows our model to learn from uncommon words, while not
+requiring a massive vocabulary of every word in our training set.
+
+The second thing we need to do is mask our input for the MaskedLM task. 
To do this, we can use +`keras_hub.layers.MaskedLMMaskGenerator`, which will randomly select a set of tokens in each +input and mask them out. + +The tokenizer and the masking layer can both be used inside a call to +[tf.data.Dataset.map](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map). +We can use `tf.data` to efficiently pre-compute each batch on the CPU, while our GPU or TPU +works on training with the batch that came before. Because our masking layer will +choose new words to mask each time, each epoch over our dataset will give us a totally +new set of labels to train on. +""" + +# Setting sequence_length will trim or pad the token outputs to shape +# (batch_size, SEQ_LENGTH). +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( + vocabulary=vocab_file, + sequence_length=SEQ_LENGTH, + lowercase=True, + strip_accents=True, +) +# Setting mask_selection_length will trim or pad the mask outputs to shape +# (batch_size, PREDICTIONS_PER_SEQ). +masker = keras_hub.layers.MaskedLMMaskGenerator( + vocabulary_size=tokenizer.vocabulary_size(), + mask_selection_rate=MASK_RATE, + mask_selection_length=PREDICTIONS_PER_SEQ, + mask_token_id=tokenizer.token_to_id("[MASK]"), +) + + +def preprocess(inputs): + inputs = tokenizer(inputs) + outputs = masker(inputs) + # Split the masking layer outputs into a (features, labels, and weights) + # tuple that we can use with keras.Model.fit(). + features = { + "token_ids": outputs["token_ids"], + "mask_positions": outputs["mask_positions"], + } + labels = outputs["mask_ids"] + weights = outputs["mask_weights"] + return features, labels, weights + + +# We use prefetch() to pre-compute preprocessed batches on the fly on the CPU. +pretrain_ds = wiki_train_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +pretrain_val_ds = wiki_val_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Preview a single input example. +# The masks will change each time you run the cell. +print(pretrain_val_ds.take(1).get_single_element()) + +""" +The above block sorts our dataset into a `(features, labels, weights)` tuple, which can be +passed directly to `keras.Model.fit()`. + +We have two features: + +1. `"token_ids"`, where some tokens have been replaced with our mask token id. +2. `"mask_positions"`, which keeps track of which tokens we masked out. + +Our labels are simply the ids we masked out. + +Because not all sequences will have the same number of masks, we also keep a +`sample_weight` tensor, which removes padded labels from our loss function by giving them +zero weight. +""" + +""" +### Create the Transformer encoder + +KerasHub provides all the building blocks to quickly build a Transformer encoder. + +We use `keras_hub.layers.TokenAndPositionEmbedding` to first embed our input token ids. +This layer simultaneously learns two embeddings -- one for words in a sentence and another +for integer positions in a sentence. The output embedding is simply the sum of the two. + +Then we can add a series of `keras_hub.layers.TransformerEncoder` layers. These are the +bread and butter of the Transformer model, using an attention mechanism to attend to +different parts of the input sentence, followed by a multi-layer perceptron block. + +The output of this model will be a encoded vector per input token id. Unlike the +bag-of-words model we used as a baseline, this model will embed each token accounting for +the context in which it appeared. 
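+
+Conceptually, that first embedding layer behaves like the following sketch (purely
+illustrative, not the library internals): one table for token ids, one for positions,
+and the two lookups are added together.
+
+```python
+# Illustrative sketch of a token-plus-position embedding.
+token_embedding = keras.layers.Embedding(tokenizer.vocabulary_size(), MODEL_DIM)
+position_embedding = keras.layers.Embedding(SEQ_LENGTH, MODEL_DIM)
+
+
+def token_and_position_embed(token_ids):
+    positions = keras.ops.arange(SEQ_LENGTH)
+    # (batch, SEQ_LENGTH, MODEL_DIM) + (SEQ_LENGTH, MODEL_DIM) broadcasts over the batch.
+    return token_embedding(token_ids) + position_embedding(positions)
+```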
+""" + +inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32") + +# Embed our tokens with a positional embedding. +embedding_layer = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=tokenizer.vocabulary_size(), + sequence_length=SEQ_LENGTH, + embedding_dim=MODEL_DIM, +) +outputs = embedding_layer(inputs) + +# Apply layer normalization and dropout to the embedding. +outputs = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs) +outputs = keras.layers.Dropout(rate=DROPOUT)(outputs) + +# Add a number of encoder blocks +for i in range(NUM_LAYERS): + outputs = keras_hub.layers.TransformerEncoder( + intermediate_dim=INTERMEDIATE_DIM, + num_heads=NUM_HEADS, + dropout=DROPOUT, + layer_norm_epsilon=NORM_EPSILON, + )(outputs) + +encoder_model = keras.Model(inputs, outputs) +encoder_model.summary() + +""" +### Pretrain the Transformer + +You can think of the `encoder_model` as it's own modular unit, it is the piece of our +model that we are really interested in for our downstream task. However we still need to +set up the encoder to train on the MaskedLM task; to do that we attach a +`keras_hub.layers.MaskedLMHead`. + +This layer will take as one input the token encodings, and as another the positions we +masked out in the original input. It will gather the token encodings we masked, and +transform them back in predictions over our entire vocabulary. + +With that, we are ready to compile and run pretraining. If you are running this in a +Colab, note that this will take about an hour. Training Transformer is famously compute +intensive, so even this relatively small Transformer will take some time. +""" + +# Create the pretraining model by attaching a masked language model head. +inputs = { + "token_ids": keras.Input(shape=(SEQ_LENGTH,), dtype="int32", name="token_ids"), + "mask_positions": keras.Input( + shape=(PREDICTIONS_PER_SEQ,), dtype="int32", name="mask_positions" + ), +} + +# Encode the tokens. +encoded_tokens = encoder_model(inputs["token_ids"]) + +# Predict an output word for each masked input token. +# We use the input token embedding to project from our encoded vectors to +# vocabulary logits, which has been shown to improve training efficiency. +outputs = keras_hub.layers.MaskedLMHead( + token_embedding=embedding_layer.token_embedding, + activation="softmax", +)(encoded_tokens, mask_positions=inputs["mask_positions"]) + +# Define and compile our pretraining model. +pretraining_model = keras.Model(inputs, outputs) +pretraining_model.compile( + loss="sparse_categorical_crossentropy", + optimizer=keras.optimizers.AdamW(PRETRAINING_LEARNING_RATE), + weighted_metrics=["sparse_categorical_accuracy"], + jit_compile=True, +) + +# Pretrain the model on our wiki text dataset. +pretraining_model.fit( + pretrain_ds, + validation_data=pretrain_val_ds, + epochs=PRETRAINING_EPOCHS, +) + +# Save this base model for further finetuning. +encoder_model.save("encoder_model.keras") + +""" +## Fine-tuning + +After pretraining, we can now fine-tune our model on the `SST-2` dataset. We can +leverage the ability of the encoder we build to predict on words in context to boost +our performance on the downstream task. +""" + +""" +### Preprocess data for classification + +Preprocessing for fine-tuning is much simpler than for our pretraining MaskedLM task. We just +tokenize our input sentences and we are ready for training! +""" + + +def preprocess(sentences, labels): + return tokenizer(sentences), labels + + +# We use prefetch() to pre-compute preprocessed batches on the fly on our CPU. 
+finetune_ds = sst_train_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +finetune_val_ds = sst_val_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Preview a single input example. +print(finetune_val_ds.take(1).get_single_element()) + +""" +### Fine-tune the Transformer + +To go from our encoded token output to a classification prediction, we need to attach +another "head" to our Transformer model. We can afford to be simple here. We pool +the encoded tokens together, and use a single dense layer to make a prediction. +""" + +# Reload the encoder model from disk so we can restart fine-tuning from scratch. +encoder_model = keras.models.load_model("encoder_model.keras", compile=False) + +# Take as input the tokenized input. +inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32") + +# Encode and pool the tokens. +encoded_tokens = encoder_model(inputs) +pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens[0]) + +# Predict an output label. +outputs = keras.layers.Dense(1, activation="sigmoid")(pooled_tokens) + +# Define and compile our fine-tuning model. +finetuning_model = keras.Model(inputs, outputs) +finetuning_model.compile( + loss="binary_crossentropy", + optimizer=keras.optimizers.AdamW(FINETUNING_LEARNING_RATE), + metrics=["accuracy"], +) + +# Finetune the model for the SST-2 task. +finetuning_model.fit( + finetune_ds, + validation_data=finetune_val_ds, + epochs=FINETUNING_EPOCHS, +) + +""" +Pretraining was enough to boost our performance to 84%, and this is hardly the ceiling +for Transformer models. You may have noticed during pretraining that our validation +performance was still steadily increasing. Our model is still significantly undertrained. +Training for more epochs, training a large Transformer, and training on more unlabeled +text would all continue to boost performance significantly. + +One of the key goals of KerasHub is to provide a modular approach to NLP model building. +We have shown one approach to building a Transformer here, but KerasHub supports an ever +growing array of components for preprocessing text and building models. We hope it makes +it easier to experiment on solutions to your natural language problems. +""" diff --git a/guides/keras_hub/upload.py b/guides/keras_hub/upload.py new file mode 100644 index 0000000000..0a859d823e --- /dev/null +++ b/guides/keras_hub/upload.py @@ -0,0 +1,245 @@ +""" +Title: Uploading Models with KerasHub +Author: [Samaneh Saadat](https://github.com/SamanehSaadat/), [Matthew Watson](https://github.com/mattdangerw/) +Date created: 2024/04/29 +Last modified: 2024/04/29 +Description: An introduction on how to upload a fine-tuned KerasHub model to model hubs. +Accelerator: GPU +""" + +""" +# Introduction + +Fine-tuning a machine learning model can yield impressive results for specific tasks. +Uploading your fine-tuned model to a model hub allows you to share it with the broader community. +By sharing your models, you'll enhance accessibility for other researchers and developers, +making your contributions an integral part of the machine learning landscape. +This can also streamline the integration of your model into real-world applications. + +This guide walks you through how to upload your fine-tuned models to popular model hubs such as +[Kaggle Models](https://www.kaggle.com/models) and [Hugging Face Hub](https://huggingface.co/models). +""" + +""" +# Setup + +Let's start by installing and importing all the libraries we need. 
We use KerasHub for this guide. +""" + +"""shell +pip install -q --upgrade keras-hub huggingface-hub kagglehub +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" + +import keras_hub + + +""" +# Data + +We can use the IMDB reviews dataset for this guide. Let's load the dataset from `tensorflow_dataset`. +""" + +import tensorflow_datasets as tfds + +imdb_train, imdb_test = tfds.load( + "imdb_reviews", + split=["train", "test"], + as_supervised=True, + batch_size=4, +) + +""" +We only use a small subset of the training samples to make the guide run faster. +However, if you need a higher quality model, consider using a larger number of training samples. +""" + +imdb_train = imdb_train.take(100) + +""" +# Task Upload + +A `keras_hub.models.Task`, wraps a `keras_hub.models.Backbone` and a `keras_hub.models.Preprocessor` to create +a model that can be directly used for training, fine-tuning, and prediction for a given text problem. +In this section, we explain how to create a `Task`, fine-tune and upload it to a model hub. +""" + +""" +## Load Model + +If you want to build a Causal LM based on a base model, simply call `keras_hub.models.CausalLM.from_preset` +and pass a built-in preset identifier. +""" + +causal_lm = keras_hub.models.CausalLM.from_preset("gpt2_base_en") + + +""" +## Fine-tune Model + +After loading the model, you can call `.fit()` on the model to fine-tune it. +Here, we fine-tune the model on the IMDB reviews which makes the model movie domain-specific. +""" + +# Drop labels and keep the review text only for the Causal LM. +imdb_train_reviews = imdb_train.map(lambda x, y: x) + +# Fine-tune the Causal LM. +causal_lm.fit(imdb_train_reviews) + +""" +## Save the Model Locally + +To upload a model, you need to first save the model locally using `save_to_preset`. +""" + +preset_dir = "./gpt2_imdb" +causal_lm.save_to_preset(preset_dir) + +""" +Let's see the saved files. +""" + +os.listdir(preset_dir) + +""" +### Load a Locally Saved Model + +A model that is saved to a local preset can be loaded using `from_preset`. +What you save in, is what you get back out. +""" + +causal_lm = keras_hub.models.CausalLM.from_preset(preset_dir) + +""" +You can also load the `keras_hub.models.Backbone` and `keras_hub.models.Tokenizer` objects from this preset directory. +Note that these objects are equivalent to `causal_lm.backbone` and `causal_lm.preprocessor.tokenizer` above. +""" + +backbone = keras_hub.models.Backbone.from_preset(preset_dir) +tokenizer = keras_hub.models.Tokenizer.from_preset(preset_dir) + +""" +## Upload the Model to a Model Hub + +After saving a preset to a directory, this directory can be uploaded to a model hub such as Kaggle or Hugging Face directly from the KerasHub library. +To upload the model to Kaggle, the URI must start with `kaggle://` and to upload to Hugging Face, it should start with `hf://`. +""" +""" +### Upload to Kaggle +""" + +""" +To upload a model to Kaggle, first, we need to authenticate with Kaggle. +This can in one of the following ways: +1. Set environment variables `KAGGLE_USERNAME` and `KAGGLE_KEY`. +2. Provide a local `~/.kaggle/kaggle.json`. +3. Call `kagglehub.login()`. + +Let's make sure we are logged in before continuing. 
+""" + +import kagglehub + +if "KAGGLE_USERNAME" not in os.environ or "KAGGLE_KEY" not in os.environ: + kagglehub.login() + + +""" + +To upload a model we can use `keras_hub.upload_preset(uri, preset_dir)` API where `uri` has the format of +`kaggle:////Keras/` for uploading to Kaggle and `preset_dir` is the directory that the model is saved in. + +Running the following uploads the model that is saved in `preset_dir` to Kaggle: +""" +kaggle_username = kagglehub.whoami()["username"] +kaggle_uri = f"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb" +keras_hub.upload_preset(kaggle_uri, preset_dir) + +""" +### Upload to Hugging Face +""" + +""" +To upload a model to Hugging Face, first, we need to authenticate with Hugging Face. +This can in one of the following ways: +1. Set environment variables `HF_USERNAME` and `HF_TOKEN`. +2. Call `huggingface_hub.notebook_login()`. + +Let's make sure we are logged in before coninuing. +""" + +import huggingface_hub + +if "HF_USERNAME" not in os.environ or "HF_TOKEN" not in os.environ: + huggingface_hub.notebook_login() + +""" + +`keras_hub.upload_preset(uri, preset_dir)` can be used to upload a model to Hugging Face if `uri` has the format of +`kaggle:///`. + +Running the following uploads the model that is saved in `preset_dir` to Hugging Face: +""" + +hf_username = huggingface_hub.whoami()["name"] +hf_uri = f"hf://{hf_username}/gpt2_imdb" +keras_hub.upload_preset(hf_uri, preset_dir) + + +""" +## Load a User Uploaded Model + +After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`. + +```python +causal_lm = keras_hub.models.CausalLM.from_preset( + f"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb" +) +``` + +We can also load the model uploaded to Hugging Face by calling `from_preset`. + +```python +causal_lm = keras_hub.models.CausalLM.from_preset(f"hf://{hf_username}/gpt2_imdb") +``` +""" + + +""" +# Classifier Upload + +Uploading a classifier model is similar to Causal LM upload. +To upload the fine-tuned model, first, the model should be saved to a local directory using `save_to_preset` +API and then it can be uploaded via `keras_hub.upload_preset`. +""" + +# Load the base model. +classifier = keras_hub.models.Classifier.from_preset( + "bert_tiny_en_uncased", num_classes=2 +) + +# Fine-tune the classifier. +classifier.fit(imdb_train) + +# Save the model to a local preset directory. +preset_dir = "./bert_tiny_imdb" +classifier.save_to_preset(preset_dir) + +# Upload to Kaggle. +keras_hub.upload_preset( + f"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb", preset_dir +) + +""" +After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`. 
+ +```python +classifier = keras_hub.models.Classifier.from_preset( + f"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb" +) +``` +""" diff --git a/guides/md/keras_cv/classification_with_keras_cv.md b/guides/md/keras_cv/classification_with_keras_cv.md index 992839357b..34718b0be3 100644 --- a/guides/md/keras_cv/classification_with_keras_cv.md +++ b/guides/md/keras_cv/classification_with_keras_cv.md @@ -31,7 +31,7 @@ TensorFlow or PyTorch backends with zero changes, simply update the We use Professor Keras, the official Keras mascot, as a visual reference for the complexity of the material: -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_evolution.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_evolution.png) ```python @@ -67,7 +67,7 @@ import tensorflow_datasets as tfds --- ## Inference with a pretrained classifier -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_beginner.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_beginner.png) Let's get started with the simplest KerasCV API: a pretrained classifier. In this example, we will construct a classifier that was @@ -185,7 +185,7 @@ This can be solved by fine tuning our own classifier. ## Fine tuning a pretrained classifier -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_intermediate.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_intermediate.png) When labeled images specific to our task are available, fine-tuning a custom classifier can improve performance. @@ -363,7 +363,7 @@ Awesome - looks like the model correctly classified the image. ## Train a Classifier from Scratch -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_advanced.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_advanced.png) Now that we've gotten our hands dirty with classification, let's take on one last task: training a classification model from scratch! diff --git a/guides/md/keras_cv/object_detection_keras_cv.md b/guides/md/keras_cv/object_detection_keras_cv.md index 5c92d5cad8..3bc62a16ac 100644 --- a/guides/md/keras_cv/object_detection_keras_cv.md +++ b/guides/md/keras_cv/object_detection_keras_cv.md @@ -96,7 +96,7 @@ Let's do this! --- ## Perform detections with a pretrained model -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_beginner.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_beginner.png) The highest level API in the KerasCV Object Detection API is the `keras_cv.models` API. This API includes fully pretrained object detection models, such as @@ -416,7 +416,7 @@ That looks a lot better! --- ## Train a custom object detection model -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_advanced.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_advanced.png) Whether you're an object detection amateur or a well seasoned veteran, assembling an object detection pipeline from scratch is a massive undertaking. 
diff --git a/guides/md/keras_cv/semantic_segmentation_deeplab_v3_plus.md b/guides/md/keras_cv/semantic_segmentation_deeplab_v3_plus.md index c630a3fb95..46b0868d9e 100644 --- a/guides/md/keras_cv/semantic_segmentation_deeplab_v3_plus.md +++ b/guides/md/keras_cv/semantic_segmentation_deeplab_v3_plus.md @@ -10,7 +10,7 @@ -![](https://storage.googleapis.com/keras-nlp/getting_started_guide/prof_keras_intermediate.png) +![](https://storage.googleapis.com/keras-hub/getting_started_guide/prof_keras_intermediate.png) --- ## Background diff --git a/guides/md/keras_hub/getting_started.md b/guides/md/keras_hub/getting_started.md new file mode 100644 index 0000000000..51afbb54ac --- /dev/null +++ b/guides/md/keras_hub/getting_started.md @@ -0,0 +1,1065 @@ +# Getting Started with KerasHub + +**Author:** [Jonathan Bischof](https://github.com/jbischof)
+**Date created:** 2022/12/15
+**Last modified:** 2023/07/01
+**Description:** An introduction to the KerasHub API. + + + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_hub/getting_started.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_hub/getting_started.py) + + + +--- +## Introduction + +KerasHub is a natural language processing library that supports users through +their entire development cycle. Our workflows are built from modular components +that have state-of-the-art preset weights and architectures when used +out-of-the-box and are easily customizable when more control is needed. + +This library is an extension of the core Keras API; all high-level modules are +[`Layers`](/api/layers/) or [`Models`](/api/models/). If you are familiar with Keras, +congratulations! You already understand most of KerasHub. + +KerasHub uses Keras 3 to work with any of TensorFlow, Pytorch and Jax. In the +guide below, we will use the `jax` backend for training our models, and +[tf.data](https://www.tensorflow.org/guide/data) for efficiently running our +input preprocessing. But feel free to mix things up! This guide runs in +TensorFlow or PyTorch backends with zero changes, simply update the +`KERAS_BACKEND` below. + +This guide demonstrates our modular approach using a sentiment analysis example at six +levels of complexity: + +* Inference with a pretrained classifier +* Fine tuning a pretrained backbone +* Fine tuning with user-controlled preprocessing +* Fine tuning a custom model +* Pretraining a backbone model +* Build and train your own transformer from scratch + +Throughout our guide, we use Professor Keras, the official Keras mascot, as a visual +reference for the complexity of the material: + +drawing + + +```python +!pip install -q --upgrade keras-hub +!pip install -q --upgrade keras # Upgrade to Keras 3. +``` + +```python +import os + +os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" + +import keras_hub +import keras + +# Use mixed precision to speed up all training in this guide. +keras.mixed_precision.set_global_policy("mixed_float16") +``` +
+``` + + +``` +
+--- +## API quickstart + +Our highest level API is `keras_hub.models`. These symbols cover the complete user +journey of converting strings to tokens, tokens to dense features, and dense features to +task-specific output. For each `XX` architecture (e.g., `Bert`), we offer the following +modules: + +* **Tokenizer**: `keras_hub.models.XXTokenizer` + * **What it does**: Converts strings to sequences of token ids. + * **Why it's important**: The raw bytes of a string are too high dimensional to be useful + features so we first map them to a small number of tokens, for example `"The quick brown + fox"` to `["the", "qu", "##ick", "br", "##own", "fox"]`. + * **Inherits from**: `keras.layers.Layer`. +* **Preprocessor**: `keras_hub.models.XXPreprocessor` + * **What it does**: Converts strings to a dictionary of preprocessed tensors consumed by + the backbone, starting with tokenization. + * **Why it's important**: Each model uses special tokens and extra tensors to understand + the input such as delimiting input segments and identifying padding tokens. Padding each + sequence to the same length improves computational efficiency. + * **Has a**: `XXTokenizer`. + * **Inherits from**: `keras.layers.Layer`. +* **Backbone**: `keras_hub.models.XXBackbone` + * **What it does**: Converts preprocessed tensors to dense features. *Does not handle + strings; call the preprocessor first.* + * **Why it's important**: The backbone distills the input tokens into dense features that + can be used in downstream tasks. It is generally pretrained on a language modeling task + using massive amounts of unlabeled data. Transferring this information to a new task is a + major breakthrough in modern NLP. + * **Inherits from**: `keras.Model`. +* **Task**: e.g., `keras_hub.models.XXClassifier` + * **What it does**: Converts strings to task-specific output (e.g., classification + probabilities). + * **Why it's important**: Task models combine string preprocessing and the backbone model + with task-specific `Layers` to solve a problem such as sentence classification, token + classification, or text generation. The additional `Layers` must be fine-tuned on labeled + data. + * **Has a**: `XXBackbone` and `XXPreprocessor`. + * **Inherits from**: `keras.Model`. + +Here is the modular hierarchy for `BertClassifier` (all relationships are compositional): + +drawing + +All modules can be used independently and have a `from_preset()` method in addition to +the standard constructor that instantiates the class with **preset** architecture and +weights (see examples below). + +--- +## Data + +We will use a running example of sentiment analysis of IMDB movie reviews. In this task, +we use the text to predict whether the review was positive (`label = 1`) or negative +(`label = 0`). + +We load the data using `keras.utils.text_dataset_from_directory`, which utilizes the +powerful `tf.data.Dataset` format for examples. + + +```python +!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +!tar -xf aclImdb_v1.tar.gz +!# Remove unsupervised examples +!rm -r aclImdb/train/unsup +``` + +```python +BATCH_SIZE = 16 +imdb_train = keras.utils.text_dataset_from_directory( + "aclImdb/train", + batch_size=BATCH_SIZE, +) +imdb_test = keras.utils.text_dataset_from_directory( + "aclImdb/test", + batch_size=BATCH_SIZE, +) + +# Inspect first review +# Format is (review text tensor, label tensor) +print(imdb_train.unbatch().take(1).get_single_element()) + +``` +
+``` + % Total % Received % Xferd Average Speed Time Time Time Current + Dload Upload Total Spent Left Speed +100 80.2M 100 80.2M 0 0 88.0M 0 --:--:-- --:--:-- --:--:-- 87.9M + +Found 25000 files belonging to 2 classes. +Found 25000 files belonging to 2 classes. +(, ) + +``` +
+--- +## Inference with a pretrained classifier + +drawing + +The highest level module in KerasHub is a **task**. A **task** is a `keras.Model` +consisting of a (generally pretrained) **backbone** model and task-specific layers. +Here's an example using `keras_hub.models.BertClassifier`. + +**Note**: Outputs are the logits per class (e.g., `[0, 0]` is 50% chance of positive). The output is +[negative, positive] for binary classification. + + +```python +classifier = keras_hub.models.BertClassifier.from_preset("bert_tiny_en_uncased_sst2") +# Note: batched inputs expected so must wrap string in iterable +classifier.predict(["I love modular workflows in keras-hub!"]) +``` + +
+``` + 1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 689ms/step + +array([[-1.539, 1.543]], dtype=float16) + +``` +
+All **tasks** have a `from_preset` method that constructs a `keras.Model` instance with +preset preprocessing, architecture and weights. This means that we can pass raw strings +in any format accepted by a `keras.Model` and get output specific to our task. + +This particular **preset** is a `"bert_tiny_uncased_en"` **backbone** fine-tuned on +`sst2`, another movie review sentiment analysis (this time from Rotten Tomatoes). We use +the `tiny` architecture for demo purposes, but larger models are recommended for SoTA +performance. For all the task-specific presets available for `BertClassifier`, see +our keras.io [models page](https://keras.io/api/keras_hub/models/). + +Let's evaluate our classifier on the IMDB dataset. You will note we don't need to +call `keras.Model.compile` here. All **task** models like `BertClassifier` ship with +compilation defaults, meaning we can just call `keras.Model.evaluate` directly. You +can always call compile as normal to override these defaults (e.g. to add new metrics). + +The output below is [loss, accuracy], + + +```python +classifier.evaluate(imdb_test) +``` + +
+``` + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 0.4610 - sparse_categorical_accuracy: 0.7882 + +[0.4630218744277954, 0.783519983291626] + +``` +
+Our result is 78% accuracy without training anything. Not bad! + +--- +## Fine tuning a pretrained BERT backbone + +drawing + +When labeled text specific to our task is available, fine-tuning a custom classifier can +improve performance. If we want to predict IMDB review sentiment, using IMDB data should +perform better than Rotten Tomatoes data! And for many tasks, no relevant pretrained model +will be available (e.g., categorizing customer reviews). + +The workflow for fine-tuning is almost identical to above, except that we request a +**preset** for the **backbone**-only model rather than the entire classifier. When passed +a **backbone** **preset**, a **task** `Model` will randomly initialize all task-specific +layers in preparation for training. For all the **backbone** presets available for +`BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_hub/models/). + +To train your classifier, use `keras.Model.fit` as with any other +`keras.Model`. As with our inference example, we can rely on the compilation +defaults for the **task** and skip `keras.Model.compile`. As preprocessing is +included, we again pass the raw data. + + +```python +classifier = keras_hub.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", + num_classes=2, +) +classifier.fit( + imdb_train, + validation_data=imdb_test, + epochs=1, +) +``` + +
+``` + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 16s 9ms/step - loss: 0.5202 - sparse_categorical_accuracy: 0.7281 - val_loss: 0.3254 - val_sparse_categorical_accuracy: 0.8621 + + + +``` +
+Here we see a significant lift in validation accuracy (0.78 -> 0.87) with a single epoch of +training even though the IMDB dataset is much smaller than `sst2`. + +--- +## Fine tuning with user-controlled preprocessing +drawing + +For some advanced training scenarios, users might prefer direct control over +preprocessing. For large datasets, examples can be preprocessed in advance and saved to +disk or preprocessed by a separate worker pool using `tf.data.experimental.service`. In +other cases, custom preprocessing is needed to handle the inputs. + +Pass `preprocessor=None` to the constructor of a **task** `Model` to skip automatic +preprocessing or pass a custom `BertPreprocessor` instead. + +### Separate preprocessing from the same preset + +Each model architecture has a parallel **preprocessor** `Layer` with its own +`from_preset` constructor. Using the same **preset** for this `Layer` will return the +matching **preprocessor** as the **task**. + +In this workflow we train the model over three epochs using `tf.data.Dataset.cache()`, +which computes the preprocessing once and caches the result before fitting begins. + +**Note:** we can use `tf.data` for preprocessing while running on the +Jax or PyTorch backend. The input dataset will automatically be converted to +backend native tensor types during fit. In fact, given the efficiency of `tf.data` +for running preprocessing, this is good practice on all backends. + + +```python +import tensorflow as tf + +preprocessor = keras_hub.models.BertPreprocessor.from_preset( + "bert_tiny_en_uncased", + sequence_length=512, +) + +# Apply the preprocessor to every sample of train and test data using `map()`. +# `tf.data.AUTOTUNE` and `prefetch()` are options to tune performance, see +# https://www.tensorflow.org/guide/data_performance for details. + +# Note: only call `cache()` if you training data fits in CPU memory! +imdb_train_cached = ( + imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) +imdb_test_cached = ( + imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) + +classifier = keras_hub.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", preprocessor=None, num_classes=2 +) +classifier.fit( + imdb_train_cached, + validation_data=imdb_test_cached, + epochs=3, +) +``` + +
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 15s 8ms/step - loss: 0.5194 - sparse_categorical_accuracy: 0.7272 - val_loss: 0.3032 - val_sparse_categorical_accuracy: 0.8728 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 10s 7ms/step - loss: 0.2871 - sparse_categorical_accuracy: 0.8805 - val_loss: 0.2809 - val_sparse_categorical_accuracy: 0.8818 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 10s 7ms/step - loss: 0.2134 - sparse_categorical_accuracy: 0.9178 - val_loss: 0.3043 - val_sparse_categorical_accuracy: 0.8790 + + + +``` +
+After three epochs, our validation accuracy has only increased to 0.88. This is both a +function of the small size of our dataset and our model. To exceed 90% accuracy, try +larger **presets** such as `"bert_base_en_uncased"`. For all the **backbone** presets +available for `BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_hub/models/). + +### Custom preprocessing + +In cases where custom preprocessing is required, we offer direct access to the +`Tokenizer` class that maps raw strings to tokens. It also has a `from_preset()` +constructor to get the vocabulary matching pretraining. + +**Note:** `BertTokenizer` does not pad sequences by default, so the output is +ragged (each sequence has varying length). The `MultiSegmentPacker` below +handles padding these ragged sequences to dense tensor types (e.g. `tf.Tensor` +or `torch.Tensor`). + + +```python +tokenizer = keras_hub.models.BertTokenizer.from_preset("bert_tiny_en_uncased") +tokenizer(["I love modular workflows!", "Libraries over frameworks!"]) + +# Write your own packer or use one of our `Layers` +packer = keras_hub.layers.MultiSegmentPacker( + start_value=tokenizer.cls_token_id, + end_value=tokenizer.sep_token_id, + # Note: This cannot be longer than the preset's `sequence_length`, and there + # is no check for a custom preprocessor! + sequence_length=64, +) + + +# This function that takes a text sample `x` and its +# corresponding label `y` as input and converts the +# text into a format suitable for input into a BERT model. +def preprocessor(x, y): + token_ids, segment_ids = packer(tokenizer(x)) + x = { + "token_ids": token_ids, + "segment_ids": segment_ids, + "padding_mask": token_ids != 0, + } + return x, y + + +imdb_train_preprocessed = imdb_train.map(preprocessor, tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) +imdb_test_preprocessed = imdb_test.map(preprocessor, tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) + +# Preprocessed example +print(imdb_train_preprocessed.unbatch().take(1).get_single_element()) +``` + +
+``` +({'token_ids': , 'segment_ids': , 'padding_mask': }, ) + +``` +
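+These hand-rolled datasets can be fed straight to a **task** `Model` constructed with
+`preprocessor=None`, just like in the cached workflow above. The sketch below assumes the
+`imdb_train_preprocessed` and `imdb_test_preprocessed` datasets defined above; the epoch
+count is arbitrary.
+
+
+```python
+# Pair the custom preprocessing with a task model that skips its own
+# preprocessing (`preprocessor=None`).
+classifier = keras_hub.models.BertClassifier.from_preset(
+    "bert_tiny_en_uncased",
+    preprocessor=None,
+    num_classes=2,
+)
+classifier.fit(
+    imdb_train_preprocessed,
+    validation_data=imdb_test_preprocessed,
+    epochs=1,
+)
+```
+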
+--- +## Fine tuning with a custom model +drawing + +For more advanced applications, an appropriate **task** `Model` may not be available. In +this case, we provide direct access to the **backbone** `Model`, which has its own +`from_preset` constructor and can be composed with custom `Layer`s. Detailed examples can +be found at our [transfer learning guide](https://keras.io/guides/transfer_learning/). + +A **backbone** `Model` does not include automatic preprocessing but can be paired with a +matching **preprocessor** using the same **preset** as shown in the previous workflow. + +In this workflow, we experiment with freezing our backbone model and adding two trainable +transformer layers to adapt to the new input. + +**Note**: We can ignore the warning about gradients for the `pooled_dense` layer because +we are using BERT's sequence output. + + +```python +preprocessor = keras_hub.models.BertPreprocessor.from_preset("bert_tiny_en_uncased") +backbone = keras_hub.models.BertBackbone.from_preset("bert_tiny_en_uncased") + +imdb_train_preprocessed = ( + imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) +imdb_test_preprocessed = ( + imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) + +backbone.trainable = False +inputs = backbone.input +sequence = backbone(inputs)["sequence_output"] +for _ in range(2): + sequence = keras_hub.layers.TransformerEncoder( + num_heads=2, + intermediate_dim=512, + dropout=0.1, + )(sequence) +# Use [CLS] token output to classify +outputs = keras.layers.Dense(2)(sequence[:, backbone.cls_token_index, :]) + +model = keras.Model(inputs, outputs) +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(5e-5), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) +model.summary() +model.fit( + imdb_train_preprocessed, + validation_data=imdb_test_preprocessed, + epochs=3, +) +``` + + +
Model: "functional_1"
+
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)         Output Shape       Param #  Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ padding_mask        │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ segment_ids         │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ token_ids           │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ bert_backbone_3     │ [(None, 128),     │ 4,385,… │ padding_mask[0][0],  │
+│ (BertBackbone)      │ (None, None,      │         │ segment_ids[0][0],   │
+│                     │ 128)]             │         │ token_ids[0][0]      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ transformer_encoder │ (None, None, 128) │ 198,272 │ bert_backbone_3[0][ │
+│ (TransformerEncode… │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ transformer_encode… │ (None, None, 128) │ 198,272 │ transformer_encoder… │
+│ (TransformerEncode… │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ get_item_4          │ (None, 128)       │       0 │ transformer_encoder… │
+│ (GetItem)           │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ dense (Dense)       │ (None, 2)         │     258 │ get_item_4[0][0]     │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
 Total params: 4,782,722 (18.24 MB)
+
 Trainable params: 396,802 (1.51 MB)
+
 Non-trainable params: 4,385,920 (16.73 MB)
+
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 17s 10ms/step - loss: 0.6208 - sparse_categorical_accuracy: 0.6612 - val_loss: 0.6119 - val_sparse_categorical_accuracy: 0.6758 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 12s 8ms/step - loss: 0.5324 - sparse_categorical_accuracy: 0.7347 - val_loss: 0.5484 - val_sparse_categorical_accuracy: 0.7320 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 12s 8ms/step - loss: 0.4735 - sparse_categorical_accuracy: 0.7723 - val_loss: 0.4874 - val_sparse_categorical_accuracy: 0.7742 + + + +``` +
+This model achieves reasonable accuracy despite having only 10% of the trainable parameters +of our `BertClassifier` model. Each training step takes about 1/3 of the time---even +accounting for cached preprocessing. + +--- +## Pretraining a backbone model +drawing + +Do you have access to large unlabeled datasets in your domain? Are they around the +same size as used to train popular backbones such as BERT, RoBERTa, or GPT2 (XX+ GiB)? If +so, you might benefit from domain-specific pretraining of your own backbone models. + +NLP models are generally pretrained on a language modeling task, predicting masked words +given the visible words in an input sentence. For example, given the input +`"The fox [MASK] over the [MASK] dog"`, the model might be asked to predict `["jumped", "lazy"]`. +The lower layers of this model are then packaged as a **backbone** to be combined with +layers relating to a new task. + +The KerasHub library offers SoTA **backbones** and **tokenizers** to be trained from +scratch without presets. + +In this workflow, we pretrain a BERT **backbone** using our IMDB review text. We skip the +"next sentence prediction" (NSP) loss because it adds significant complexity to the data +processing and was dropped by later models like RoBERTa. See our e2e +[Transformer pretraining](https://keras.io/guides/keras_hub/transformer_pretraining/#pretraining) +for step-by-step details on how to replicate the original paper. + +### Preprocessing + + +```python +# All BERT `en` models have the same vocabulary, so reuse preprocessor from +# "bert_tiny_en_uncased" +preprocessor = keras_hub.models.BertPreprocessor.from_preset( + "bert_tiny_en_uncased", + sequence_length=256, +) +packer = preprocessor.packer +tokenizer = preprocessor.tokenizer + +# keras.Layer to replace some input tokens with the "[MASK]" token +masker = keras_hub.layers.MaskedLMMaskGenerator( + vocabulary_size=tokenizer.vocabulary_size(), + mask_selection_rate=0.25, + mask_selection_length=64, + mask_token_id=tokenizer.token_to_id("[MASK]"), + unselectable_token_ids=[ + tokenizer.token_to_id(x) for x in ["[CLS]", "[PAD]", "[SEP]"] + ], +) + + +def preprocess(inputs, label): + inputs = preprocessor(inputs) + masked_inputs = masker(inputs["token_ids"]) + # Split the masking layer outputs into a (features, labels, and weights) + # tuple that we can use with keras.Model.fit(). + features = { + "token_ids": masked_inputs["token_ids"], + "segment_ids": inputs["segment_ids"], + "padding_mask": inputs["padding_mask"], + "mask_positions": masked_inputs["mask_positions"], + } + labels = masked_inputs["mask_ids"] + weights = masked_inputs["mask_weights"] + return features, labels, weights + + +pretrain_ds = imdb_train.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) +pretrain_val_ds = imdb_test.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Tokens with ID 103 are "masked" +print(pretrain_ds.unbatch().take(1).get_single_element()) +``` + +
+``` +({'token_ids': , 'segment_ids': , 'padding_mask': , 'mask_positions': }, , ) + +``` +
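+To eyeball what the mask generator is doing, you can decode one preprocessed example back
+to text. This is just a quick sanity check using the `tokenizer` defined above, not part
+of the training pipeline.
+
+
+```python
+# Decode a single masked example to see where "[MASK]" tokens were inserted,
+# and which words the model will be asked to predict.
+features, labels, weights = pretrain_ds.unbatch().take(1).get_single_element()
+print(tokenizer.detokenize(features["token_ids"]))
+print(tokenizer.detokenize(labels))  # the masked-out words (padded with "[PAD]")
+```
+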
+### Pretraining model + + +```python +# BERT backbone +backbone = keras_hub.models.BertBackbone( + vocabulary_size=tokenizer.vocabulary_size(), + num_layers=2, + num_heads=2, + hidden_dim=128, + intermediate_dim=512, +) + +# Language modeling head +mlm_head = keras_hub.layers.MaskedLMHead( + token_embedding=backbone.token_embedding, +) + +inputs = { + "token_ids": keras.Input(shape=(None,), dtype=tf.int32, name="token_ids"), + "segment_ids": keras.Input(shape=(None,), dtype=tf.int32, name="segment_ids"), + "padding_mask": keras.Input(shape=(None,), dtype=tf.int32, name="padding_mask"), + "mask_positions": keras.Input(shape=(None,), dtype=tf.int32, name="mask_positions"), +} + +# Encoded token sequence +sequence = backbone(inputs)["sequence_output"] + +# Predict an output word for each masked input token. +# We use the input token embedding to project from our encoded vectors to +# vocabulary logits, which has been shown to improve training efficiency. +outputs = mlm_head(sequence, mask_positions=inputs["mask_positions"]) + +# Define and compile our pretraining model. +pretraining_model = keras.Model(inputs, outputs) +pretraining_model.summary() +pretraining_model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(learning_rate=5e-4), + weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) + +# Pretrain on IMDB dataset +pretraining_model.fit( + pretrain_ds, + validation_data=pretrain_val_ds, + epochs=3, # Increase to 6 for higher accuracy +) +``` + + +
Model: "functional_3"
+
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)         Output Shape       Param #  Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ mask_positions      │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ padding_mask        │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ segment_ids         │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ token_ids           │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ bert_backbone_4     │ [(None, 128),     │ 4,385,… │ mask_positions[0][0… │
+│ (BertBackbone)      │ (None, None,      │         │ padding_mask[0][0],  │
+│                     │ 128)]             │         │ segment_ids[0][0],   │
+│                     │                   │         │ token_ids[0][0]      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ masked_lm_head      │ (None, None,      │ 3,954,… │ bert_backbone_4[0][ │
+│ (MaskedLMHead)      │ 30522)            │         │ mask_positions[0][0] │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
 Total params: 4,433,210 (16.91 MB)
+
 Trainable params: 4,433,210 (16.91 MB)
+
 Non-trainable params: 0 (0.00 B)
+
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 22s 12ms/step - loss: 5.7032 - sparse_categorical_accuracy: 0.0566 - val_loss: 5.0685 - val_sparse_categorical_accuracy: 0.1044 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 13s 8ms/step - loss: 5.0701 - sparse_categorical_accuracy: 0.1096 - val_loss: 4.9363 - val_sparse_categorical_accuracy: 0.1239 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 13s 8ms/step - loss: 4.9607 - sparse_categorical_accuracy: 0.1240 - val_loss: 4.7913 - val_sparse_categorical_accuracy: 0.1417 + + + +``` +
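+Once the loss looks reasonable, the **backbone** can be saved as a local preset and
+reloaded later; the directory name below is just an example.
+
+
+```python
+# Save the pretrained backbone as a local preset...
+backbone.save_to_preset("./bert_tiny_imdb_pretrained")
+
+# ...and reload it later to attach a new task-specific head.
+restored_backbone = keras_hub.models.BertBackbone.from_preset(
+    "./bert_tiny_imdb_pretrained"
+)
+```
+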
+After pretraining, save your `backbone` submodel (as sketched above) to reuse it in a new task!
+
+---
+## Build and train your own transformer from scratch
+drawing
+
+Want to implement a novel transformer architecture? The KerasHub library offers all the
+low-level modules used to build SoTA architectures in our `models` API. This includes the
+`keras_hub.tokenizers` API which allows you to train your own subword tokenizer using
+`WordPieceTokenizer`, `BytePairTokenizer`, or `SentencePieceTokenizer`.
+
+In this workflow, we train a custom tokenizer on the IMDB data and design a backbone with
+a custom transformer architecture. For simplicity, we then train directly on the
+classification task. Interested in more details? We wrote an entire guide to pretraining
+and finetuning a custom transformer on
+[keras.io](https://keras.io/guides/keras_hub/transformer_pretraining/).
+
+### Train custom vocabulary from IMDB data
+
+
+```python
+vocab = keras_hub.tokenizers.compute_word_piece_vocabulary(
+    imdb_train.map(lambda x, y: x),
+    vocabulary_size=20_000,
+    lowercase=True,
+    strip_accents=True,
+    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
+)
+tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
+    vocabulary=vocab,
+    lowercase=True,
+    strip_accents=True,
+    oov_token="[UNK]",
+)
+```
+
+### Preprocess data with a custom tokenizer
+
+
+```python
+packer = keras_hub.layers.StartEndPacker(
+    start_value=tokenizer.token_to_id("[START]"),
+    end_value=tokenizer.token_to_id("[END]"),
+    pad_value=tokenizer.token_to_id("[PAD]"),
+    sequence_length=512,
+)
+
+
+def preprocess(x, y):
+    token_ids = packer(tokenizer(x))
+    return token_ids, y
+
+
+imdb_preproc_train_ds = imdb_train.map(
+    preprocess, num_parallel_calls=tf.data.AUTOTUNE
+).prefetch(tf.data.AUTOTUNE)
+imdb_preproc_val_ds = imdb_test.map(
+    preprocess, num_parallel_calls=tf.data.AUTOTUNE
+).prefetch(tf.data.AUTOTUNE)
+
+print(imdb_preproc_train_ds.unbatch().take(1).get_single_element())
+```
+
+``` +(, ) + +``` +
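+As a quick check that the learned vocabulary covers IMDB-style text, you can round-trip a
+sentence through the new tokenizer. The example sentence is arbitrary.
+
+
+```python
+sample = tf.constant(["This movie was a complete waste of two hours."])
+token_ids = tokenizer(sample)
+print(token_ids)
+print(tokenizer.detokenize(token_ids))
+```
+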
+### Design a tiny transformer + + +```python +token_id_input = keras.Input( + shape=(None,), + dtype="int32", + name="token_ids", +) +outputs = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=len(vocab), + sequence_length=packer.sequence_length, + embedding_dim=64, +)(token_id_input) +outputs = keras_hub.layers.TransformerEncoder( + num_heads=2, + intermediate_dim=128, + dropout=0.1, +)(outputs) +# Use "[START]" token to classify +outputs = keras.layers.Dense(2)(outputs[:, 0, :]) +model = keras.Model( + inputs=token_id_input, + outputs=outputs, +) + +model.summary() +``` + + +
Model: "functional_5"
+
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Layer (type)                     Output Shape                  Param # ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ token_ids (InputLayer)          │ (None, None)              │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ token_and_position_embedding    │ (None, None, 64)          │  1,259,648 │
+│ (TokenAndPositionEmbedding)     │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_2           │ (None, None, 64)          │     33,472 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ get_item_6 (GetItem)            │ (None, 64)                │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dense_1 (Dense)                 │ (None, 2)                 │        130 │
+└─────────────────────────────────┴───────────────────────────┴────────────┘
+
 Total params: 1,293,250 (4.93 MB)
+
 Trainable params: 1,293,250 (4.93 MB)
+
 Non-trainable params: 0 (0.00 B)
+
+ + + +### Train the transformer directly on the classification objective + + +```python +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(5e-5), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) +model.fit( + imdb_preproc_train_ds, + validation_data=imdb_preproc_val_ds, + epochs=3, +) +``` + +
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 8s 4ms/step - loss: 0.7790 - sparse_categorical_accuracy: 0.5367 - val_loss: 0.4420 - val_sparse_categorical_accuracy: 0.8120 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 0.3654 - sparse_categorical_accuracy: 0.8443 - val_loss: 0.3046 - val_sparse_categorical_accuracy: 0.8752 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 0.2471 - sparse_categorical_accuracy: 0.9019 - val_loss: 0.3060 - val_sparse_categorical_accuracy: 0.8748 + + + +``` +
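+As a quick smoke test, the trained model can score a raw review by reusing the same
+`tokenizer` and `packer` defined above. The review text is arbitrary.
+
+
+```python
+sample = tf.constant(["What an incredible movie, I loved every minute of it!"])
+logits = model.predict(packer(tokenizer(sample)))
+print(keras.ops.softmax(logits))  # probabilities for [negative, positive]
+```
+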
+Excitingly, our custom classifier performs about as well as fine-tuning
+`"bert_tiny_en_uncased"`! To see the advantages of pretraining and exceed 90% accuracy, we
+would need to use larger **presets** such as `"bert_base_en_uncased"`.

diff --git a/guides/md/keras_hub/transformer_pretraining.md b/guides/md/keras_hub/transformer_pretraining.md
new file mode 100644
index 0000000000..15e94ea486
--- /dev/null
+++ b/guides/md/keras_hub/transformer_pretraining.md
@@ -0,0 +1,635 @@
+# Pretraining a Transformer from scratch with KerasHub
+
+**Author:** [Matthew Watson](https://github.com/mattdangerw/)
+**Date created:** 2022/04/18
+**Last modified:** 2023/07/15
+**Description:** Use KerasHub to train a Transformer model from scratch.
+
+
+ [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_hub/transformer_pretraining.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_hub/transformer_pretraining.py)
+
+
+
+KerasHub aims to make it easy to build state-of-the-art text processing models. In this
+guide, we will show how library components simplify pretraining and fine-tuning a
+Transformer model from scratch.
+
+This guide is broken into three parts:
+
+1. *Setup*, task definition, and establishing a baseline.
+2. *Pretraining* a Transformer model.
+3. *Fine-tuning* the Transformer model on our classification task.
+
+---
+## Setup
+
+The following guide uses Keras 3 to work in any of `tensorflow`, `jax` or
+`torch`. We select the `jax` backend below, which will give us a particularly
+fast train step, but feel free to mix it up.
+
+
+```python
+!pip install -q --upgrade keras-hub
+!pip install -q --upgrade keras  # Upgrade to Keras 3.
+```
+
+```python
+import os
+
+os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"
+
+
+import keras_hub
+import tensorflow as tf
+import keras
+```
+``` + +``` +
+Next up, we can download two datasets.
+
+- [SST-2](https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary): a text
+classification dataset and our "end goal". This dataset is often used to benchmark
+language models.
+- [WikiText-103](https://paperswithcode.com/dataset/wikitext-103): A medium-sized
+collection of featured articles from English Wikipedia, which we will use for
+pretraining.
+
+Finally, we will download a WordPiece vocabulary to do sub-word tokenization later on in
+this guide.
+
+
+```python
+# Download pretraining data.
+keras.utils.get_file(
+    origin="https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
+    extract=True,
+)
+wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-103-raw/")
+
+# Download finetuning data.
+keras.utils.get_file(
+    origin="https://dl.fbaipublicfiles.com/glue/data/SST-2.zip",
+    extract=True,
+)
+sst_dir = os.path.expanduser("~/.keras/datasets/SST-2/")
+
+# Download vocabulary data.
+vocab_file = keras.utils.get_file(
+    origin="https://storage.googleapis.com/tensorflow/keras-hub/examples/bert/bert_vocab_uncased.txt",
+)
+```
+
+Next, we define some hyperparameters we will use during training.
+
+
+```python
+# Preprocessing params.
+PRETRAINING_BATCH_SIZE = 128
+FINETUNING_BATCH_SIZE = 32
+SEQ_LENGTH = 128
+MASK_RATE = 0.25
+PREDICTIONS_PER_SEQ = 32
+
+# Model params.
+NUM_LAYERS = 3
+MODEL_DIM = 256
+INTERMEDIATE_DIM = 512
+NUM_HEADS = 4
+DROPOUT = 0.1
+NORM_EPSILON = 1e-5
+
+# Training params.
+PRETRAINING_LEARNING_RATE = 5e-4
+PRETRAINING_EPOCHS = 8
+FINETUNING_LEARNING_RATE = 5e-5
+FINETUNING_EPOCHS = 3
+```
+
+### Load data
+
+We load our data with [tf.data](https://www.tensorflow.org/guide/data), which will allow
+us to define input pipelines for tokenizing and preprocessing text.
+
+
+```python
+# Load SST-2.
+sst_train_ds = tf.data.experimental.CsvDataset(
+    sst_dir + "train.tsv", [tf.string, tf.int32], header=True, field_delim="\t"
+).batch(FINETUNING_BATCH_SIZE)
+sst_val_ds = tf.data.experimental.CsvDataset(
+    sst_dir + "dev.tsv", [tf.string, tf.int32], header=True, field_delim="\t"
+).batch(FINETUNING_BATCH_SIZE)
+
+# Load wikitext-103 and filter out short lines.
+wiki_train_ds = (
+    tf.data.TextLineDataset(wiki_dir + "wiki.train.raw")
+    .filter(lambda x: tf.strings.length(x) > 100)
+    .batch(PRETRAINING_BATCH_SIZE)
+)
+wiki_val_ds = (
+    tf.data.TextLineDataset(wiki_dir + "wiki.valid.raw")
+    .filter(lambda x: tf.strings.length(x) > 100)
+    .batch(PRETRAINING_BATCH_SIZE)
+)
+
+# Take a peek at the sst-2 dataset.
+print(sst_train_ds.unbatch().batch(4).take(1).get_single_element())
+```
+
+``` +(, ) + +``` +
+You can see that our `SST-2` dataset contains relatively short snippets of movie review
+text. Our goal is to predict the sentiment of the snippet. A label of 1 indicates
+positive sentiment, and a label of 0 negative sentiment.
+
+### Establish a baseline
+
+As a first step, we will establish a baseline of good performance. We don't actually need
+KerasHub for this; we can just use core Keras layers.
+
+We will train a simple bag-of-words model, where we learn a positive or negative weight
+for each word in our vocabulary. A sample's score is simply the sum of the weights of all
+words that are present in the sample.
+
+
+```python
+# This layer will turn our input sentence into a list of 1s and 0s the same size
+# as our vocabulary, indicating whether a word is present or absent.
+multi_hot_layer = keras.layers.TextVectorization(
+    max_tokens=4000, output_mode="multi_hot"
+)
+multi_hot_layer.adapt(sst_train_ds.map(lambda x, y: x))
+multi_hot_ds = sst_train_ds.map(lambda x, y: (multi_hot_layer(x), y))
+multi_hot_val_ds = sst_val_ds.map(lambda x, y: (multi_hot_layer(x), y))
+
+# We then learn a logistic regression over that layer, and that's our entire
+# baseline model!
+
+inputs = keras.Input(shape=(4000,), dtype="int32")
+outputs = keras.layers.Dense(1, activation="sigmoid")(inputs)
+baseline_model = keras.Model(inputs, outputs)
+baseline_model.compile(loss="binary_crossentropy", metrics=["accuracy"])
+baseline_model.fit(multi_hot_ds, validation_data=multi_hot_val_ds, epochs=5)
+```
+
+``` +Epoch 1/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 2s 698us/step - accuracy: 0.6421 - loss: 0.6469 - val_accuracy: 0.7567 - val_loss: 0.5391 +Epoch 2/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 493us/step - accuracy: 0.7524 - loss: 0.5392 - val_accuracy: 0.7868 - val_loss: 0.4891 +Epoch 3/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 513us/step - accuracy: 0.7832 - loss: 0.4871 - val_accuracy: 0.7991 - val_loss: 0.4671 +Epoch 4/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 475us/step - accuracy: 0.7991 - loss: 0.4543 - val_accuracy: 0.8069 - val_loss: 0.4569 +Epoch 5/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 476us/step - accuracy: 0.8100 - loss: 0.4313 - val_accuracy: 0.8036 - val_loss: 0.4530 + + + +``` +
+A bag-of-words approach can be fast and surprisingly powerful, especially when input
+examples contain a large number of words. With shorter sequences, it can hit a
+performance ceiling.
+
+To do better, we would like to build a model that can evaluate words *in context*. Instead
+of evaluating each word in a void, we need to use the information contained in the
+*entire ordered sequence* of our input.
+
+This runs us into a problem. `SST-2` is a very small dataset, and there's simply not enough
+example text to attempt to build a larger, more parameterized model that can learn on a
+sequence. We would quickly start to overfit and memorize our training set, without any
+increase in our ability to generalize to unseen examples.
+
+Enter **pretraining**, which will allow us to learn on a larger corpus, and transfer our
+knowledge to the `SST-2` task. And enter **KerasHub**, which will allow us to pretrain a
+particularly powerful model, the Transformer, with ease.
+
+---
+## Pretraining
+
+To beat our baseline, we will leverage the `WikiText103` dataset, an unlabeled
+collection of Wikipedia articles that is much bigger than `SST-2`.
+
+We are going to train a *transformer*, a highly expressive model which will learn
+to embed each word in our input as a low-dimensional vector. Our Wikipedia dataset has no
+labels, so we will use an unsupervised training objective called *Masked Language
+Modeling* (MaskedLM).
+
+Essentially, we will be playing a big game of "guess the missing word". For each input
+sample, we will obscure 25% of our input data, and train our model to predict the parts we
+covered up.
+
+### Preprocess data for the MaskedLM task
+
+Our text preprocessing for the MaskedLM task will occur in two stages.
+
+1. Tokenize input text into integer sequences of token ids.
+2. Mask certain positions in our input to predict on.
+
+To tokenize, we can use a `keras_hub.tokenizers.Tokenizer` -- the KerasHub building block
+for transforming text into sequences of integer token ids.
+
+In particular, we will use `keras_hub.tokenizers.WordPieceTokenizer` which does
+*sub-word* tokenization. Sub-word tokenization is popular when training models on large
+text corpora. Essentially, it allows our model to learn from uncommon words, while not
+requiring a massive vocabulary of every word in our training set.
+
+The second thing we need to do is mask our input for the MaskedLM task. To do this, we can use
+`keras_hub.layers.MaskedLMMaskGenerator`, which will randomly select a set of tokens in each
+input and mask them out.
+
+The tokenizer and the masking layer can both be used inside a call to
+[tf.data.Dataset.map](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map).
+We can use `tf.data` to efficiently pre-compute each batch on the CPU, while our GPU or TPU
+works on training with the batch that came before. Because our masking layer will
+choose new words to mask each time, each epoch over our dataset will give us a totally
+new set of labels to train on.
+
+
+```python
+# Setting sequence_length will trim or pad the token outputs to shape
+# (batch_size, SEQ_LENGTH).
+tokenizer = keras_hub.tokenizers.WordPieceTokenizer(
+    vocabulary=vocab_file,
+    sequence_length=SEQ_LENGTH,
+    lowercase=True,
+    strip_accents=True,
+)
+# Setting mask_selection_length will trim or pad the mask outputs to shape
+# (batch_size, PREDICTIONS_PER_SEQ).
+masker = keras_hub.layers.MaskedLMMaskGenerator( + vocabulary_size=tokenizer.vocabulary_size(), + mask_selection_rate=MASK_RATE, + mask_selection_length=PREDICTIONS_PER_SEQ, + mask_token_id=tokenizer.token_to_id("[MASK]"), +) + + +def preprocess(inputs): + inputs = tokenizer(inputs) + outputs = masker(inputs) + # Split the masking layer outputs into a (features, labels, and weights) + # tuple that we can use with keras.Model.fit(). + features = { + "token_ids": outputs["token_ids"], + "mask_positions": outputs["mask_positions"], + } + labels = outputs["mask_ids"] + weights = outputs["mask_weights"] + return features, labels, weights + + +# We use prefetch() to pre-compute preprocessed batches on the fly on the CPU. +pretrain_ds = wiki_train_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +pretrain_val_ds = wiki_val_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Preview a single input example. +# The masks will change each time you run the cell. +print(pretrain_val_ds.take(1).get_single_element()) +``` + +
+``` +({'token_ids': , 'mask_positions': }, , ) + +``` +
+The above block sorts our dataset into a `(features, labels, weights)` tuple, which can be
+passed directly to `keras.Model.fit()`.
+
+We have two features:
+
+1. `"token_ids"`, where some tokens have been replaced with our mask token id.
+2. `"mask_positions"`, which keeps track of which tokens we masked out.
+
+Our labels are simply the ids we masked out.
+
+Because not all sequences will have the same number of masks, we also keep a
+`sample_weight` tensor, which removes padded labels from our loss function by giving them
+zero weight.
+
+### Create the Transformer encoder
+
+KerasHub provides all the building blocks to quickly build a Transformer encoder.
+
+We use `keras_hub.layers.TokenAndPositionEmbedding` to first embed our input token ids.
+This layer simultaneously learns two embeddings -- one for words in a sentence and another
+for integer positions in a sentence. The output embedding is simply the sum of the two.
+
+Then we can add a series of `keras_hub.layers.TransformerEncoder` layers. These are the
+bread and butter of the Transformer model, using an attention mechanism to attend to
+different parts of the input sentence, followed by a multi-layer perceptron block.
+
+The output of this model will be an encoded vector per input token id. Unlike the
+bag-of-words model we used as a baseline, this model will embed each token accounting for
+the context in which it appeared.
+
+
+```python
+inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32")
+
+# Embed our tokens with a positional embedding.
+embedding_layer = keras_hub.layers.TokenAndPositionEmbedding(
+    vocabulary_size=tokenizer.vocabulary_size(),
+    sequence_length=SEQ_LENGTH,
+    embedding_dim=MODEL_DIM,
+)
+outputs = embedding_layer(inputs)
+
+# Apply layer normalization and dropout to the embedding.
+outputs = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs)
+outputs = keras.layers.Dropout(rate=DROPOUT)(outputs)
+
+# Add a number of encoder blocks
+for i in range(NUM_LAYERS):
+    outputs = keras_hub.layers.TransformerEncoder(
+        intermediate_dim=INTERMEDIATE_DIM,
+        num_heads=NUM_HEADS,
+        dropout=DROPOUT,
+        layer_norm_epsilon=NORM_EPSILON,
+    )(outputs)
+
+encoder_model = keras.Model(inputs, outputs)
+encoder_model.summary()
+```
+
Model: "functional_3"
+
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Layer (type)                     Output Shape                  Param # ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ input_layer_1 (InputLayer)      │ (None, 128)               │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ token_and_position_embedding    │ (None, 128, 256)          │  7,846,400 │
+│ (TokenAndPositionEmbedding)     │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ layer_normalization             │ (None, 128, 256)          │        512 │
+│ (LayerNormalization)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dropout (Dropout)               │ (None, 128, 256)          │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder             │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_1           │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_2           │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+└─────────────────────────────────┴───────────────────────────┴────────────┘
+
 Total params: 9,428,224 (287.73 MB)
+
 Trainable params: 9,428,224 (287.73 MB)
+
 Non-trainable params: 0 (0.00 B)
+
+### Pretrain the Transformer
+
+You can think of the `encoder_model` as its own modular unit; it is the piece of our
+model that we are really interested in for our downstream task. However, we still need to
+set up the encoder to train on the MaskedLM task; to do that we attach a
+`keras_hub.layers.MaskedLMHead`.
+
+This layer will take as one input the token encodings, and as another the positions we
+masked out in the original input. It will gather the token encodings we masked, and
+transform them back into predictions over our entire vocabulary.
+
+With that, we are ready to compile and run pretraining. If you are running this in a
+Colab, note that this will take about an hour. Training a Transformer is famously
+compute-intensive, so even this relatively small Transformer will take some time.
+
+
+```python
+# Create the pretraining model by attaching a masked language model head.
+inputs = {
+    "token_ids": keras.Input(shape=(SEQ_LENGTH,), dtype="int32", name="token_ids"),
+    "mask_positions": keras.Input(
+        shape=(PREDICTIONS_PER_SEQ,), dtype="int32", name="mask_positions"
+    ),
+}
+
+# Encode the tokens.
+encoded_tokens = encoder_model(inputs["token_ids"])
+
+# Predict an output word for each masked input token.
+# We use the input token embedding to project from our encoded vectors to
+# vocabulary logits, which has been shown to improve training efficiency.
+outputs = keras_hub.layers.MaskedLMHead(
+    token_embedding=embedding_layer.token_embedding,
+    activation="softmax",
+)(encoded_tokens, mask_positions=inputs["mask_positions"])
+
+# Define and compile our pretraining model.
+pretraining_model = keras.Model(inputs, outputs)
+pretraining_model.compile(
+    loss="sparse_categorical_crossentropy",
+    optimizer=keras.optimizers.AdamW(PRETRAINING_LEARNING_RATE),
+    weighted_metrics=["sparse_categorical_accuracy"],
+    jit_compile=True,
+)
+
+# Pretrain the model on our wiki text dataset.
+pretraining_model.fit(
+    pretrain_ds,
+    validation_data=pretrain_val_ds,
+    epochs=PRETRAINING_EPOCHS,
+)
+
+# Save this base model for further finetuning.
+encoder_model.save("encoder_model.keras")
+```
+
+``` +Epoch 1/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 242s 41ms/step - loss: 5.4679 - sparse_categorical_accuracy: 0.1353 - val_loss: 3.4570 - val_sparse_categorical_accuracy: 0.3522 +Epoch 2/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 234s 40ms/step - loss: 3.6031 - sparse_categorical_accuracy: 0.3396 - val_loss: 3.0514 - val_sparse_categorical_accuracy: 0.4032 +Epoch 3/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 232s 40ms/step - loss: 3.2609 - sparse_categorical_accuracy: 0.3802 - val_loss: 2.8858 - val_sparse_categorical_accuracy: 0.4240 +Epoch 4/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 233s 40ms/step - loss: 3.1099 - sparse_categorical_accuracy: 0.3978 - val_loss: 2.7897 - val_sparse_categorical_accuracy: 0.4375 +Epoch 5/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 235s 40ms/step - loss: 3.0145 - sparse_categorical_accuracy: 0.4090 - val_loss: 2.7504 - val_sparse_categorical_accuracy: 0.4419 +Epoch 6/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 252s 43ms/step - loss: 2.9530 - sparse_categorical_accuracy: 0.4157 - val_loss: 2.6925 - val_sparse_categorical_accuracy: 0.4474 +Epoch 7/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 232s 40ms/step - loss: 2.9088 - sparse_categorical_accuracy: 0.4210 - val_loss: 2.6554 - val_sparse_categorical_accuracy: 0.4513 +Epoch 8/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 236s 40ms/step - loss: 2.8721 - sparse_categorical_accuracy: 0.4250 - val_loss: 2.6389 - val_sparse_categorical_accuracy: 0.4548 + +``` +
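+Before moving on, it can be fun to peek at a few of the model's masked-word predictions.
+This is only a quick inspection, not a proper evaluation.
+
+
+```python
+features, labels, weights = pretrain_val_ds.take(1).get_single_element()
+predictions = pretraining_model.predict(features)
+predicted_ids = predictions.argmax(axis=-1)
+# Compare the first sequence's predicted tokens against the true masked tokens.
+print("predicted:", tokenizer.detokenize(predicted_ids[0]))
+print("actual:   ", tokenizer.detokenize(labels[0]))
+```
+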
+---
+## Fine-tuning
+
+After pretraining, we can now fine-tune our model on the `SST-2` dataset. We can
+leverage the ability of the encoder we built to predict on words in context to boost
+our performance on the downstream task.
+
+### Preprocess data for classification
+
+Preprocessing for fine-tuning is much simpler than for our pretraining MaskedLM task. We just
+tokenize our input sentences, and we are ready for training!
+
+
+```python
+
+def preprocess(sentences, labels):
+    return tokenizer(sentences), labels
+
+
+# We use prefetch() to pre-compute preprocessed batches on the fly on our CPU.
+finetune_ds = sst_train_ds.map(
+    preprocess, num_parallel_calls=tf.data.AUTOTUNE
+).prefetch(tf.data.AUTOTUNE)
+finetune_val_ds = sst_val_ds.map(
+    preprocess, num_parallel_calls=tf.data.AUTOTUNE
+).prefetch(tf.data.AUTOTUNE)
+
+# Preview a single input example.
+print(finetune_val_ds.take(1).get_single_element())
+```
+
+``` +(, ) + +``` +
+### Fine-tune the Transformer + +To go from our encoded token output to a classification prediction, we need to attach +another "head" to our Transformer model. We can afford to be simple here. We pool +the encoded tokens together, and use a single dense layer to make a prediction. + + +```python +# Reload the encoder model from disk so we can restart fine-tuning from scratch. +encoder_model = keras.models.load_model("encoder_model.keras", compile=False) + +# Take as input the tokenized input. +inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32") + +# Encode and pool the tokens. +encoded_tokens = encoder_model(inputs) +pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens[0]) + +# Predict an output label. +outputs = keras.layers.Dense(1, activation="sigmoid")(pooled_tokens) + +# Define and compile our fine-tuning model. +finetuning_model = keras.Model(inputs, outputs) +finetuning_model.compile( + loss="binary_crossentropy", + optimizer=keras.optimizers.AdamW(FINETUNING_LEARNING_RATE), + metrics=["accuracy"], +) + +# Finetune the model for the SST-2 task. +finetuning_model.fit( + finetune_ds, + validation_data=finetune_val_ds, + epochs=FINETUNING_EPOCHS, +) +``` + +
+``` +Epoch 1/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 21s 9ms/step - accuracy: 0.7500 - loss: 0.4891 - val_accuracy: 0.8036 - val_loss: 0.4099 +Epoch 2/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 16s 8ms/step - accuracy: 0.8826 - loss: 0.2779 - val_accuracy: 0.8482 - val_loss: 0.3964 +Epoch 3/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 16s 8ms/step - accuracy: 0.9176 - loss: 0.2066 - val_accuracy: 0.8549 - val_loss: 0.4142 + + + +``` +
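+As a last sanity check, the fine-tuned model can score raw text directly, since the
+`tokenizer` already pads and truncates to `SEQ_LENGTH`. The example sentence is arbitrary.
+
+
+```python
+sample = tf.constant(["an unexpectedly moving and beautifully acted film"])
+# Outputs the probability of positive sentiment.
+print(finetuning_model.predict(tokenizer(sample)))
+```
+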
+Pretraining was enough to boost our performance to 84%, and this is hardly the ceiling
+for Transformer models. You may have noticed during pretraining that our validation
+performance was still steadily increasing. Our model is still significantly undertrained.
+Training for more epochs, training a larger Transformer, and training on more unlabeled
+text would all continue to boost performance significantly.
+
+One of the key goals of KerasHub is to provide a modular approach to NLP model building.
+We have shown one approach to building a Transformer here, but KerasHub supports an
+ever-growing array of components for preprocessing text and building models. We hope it
+makes it easier to experiment with solutions to your natural language problems.

diff --git a/guides/md/keras_hub/upload.md b/guides/md/keras_hub/upload.md
new file mode 100644
index 0000000000..3817d354e9
--- /dev/null
+++ b/guides/md/keras_hub/upload.md
@@ -0,0 +1,308 @@
+# Uploading Models with KerasHub
+
+**Author:** [Samaneh Saadat](https://github.com/SamanehSaadat/), [Matthew Watson](https://github.com/mattdangerw/)
+**Date created:** 2024/04/29
+**Last modified:** 2024/04/29
+**Description:** An introduction on how to upload a fine-tuned KerasHub model to model hubs.
+
+
+ [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_hub/upload.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_hub/upload.py)
+
+
+
+# Introduction
+
+Fine-tuning a machine learning model can yield impressive results for specific tasks.
+Uploading your fine-tuned model to a model hub allows you to share it with the broader community.
+By sharing your models, you'll enhance accessibility for other researchers and developers,
+making your contributions an integral part of the machine learning landscape.
+This can also streamline the integration of your model into real-world applications.
+
+This guide walks you through how to upload your fine-tuned models to popular model hubs such as
+[Kaggle Models](https://www.kaggle.com/models) and [Hugging Face Hub](https://huggingface.co/models).
+
+# Setup
+
+Let's start by installing and importing all the libraries we need. We use KerasHub for this guide.
+
+
+```python
+!pip install -q --upgrade keras-hub huggingface-hub kagglehub
+```
+
+
+```python
+import os
+
+os.environ["KERAS_BACKEND"] = "jax"
+
+import keras_hub
+
+```
+
+# Data
+
+We can use the IMDB reviews dataset for this guide. Let's load the dataset from `tensorflow_datasets`.
+
+
+```python
+import tensorflow_datasets as tfds
+
+imdb_train, imdb_test = tfds.load(
+    "imdb_reviews",
+    split=["train", "test"],
+    as_supervised=True,
+    batch_size=4,
+)
+```
+
+We only use a small subset of the training samples to make the guide run faster.
+However, if you need a higher quality model, consider using a larger number of training samples.
+
+
+```python
+imdb_train = imdb_train.take(100)
+```
+
+# Task Upload
+
+A `keras_hub.models.Task` wraps a `keras_hub.models.Backbone` and a `keras_hub.models.Preprocessor` to create
+a model that can be directly used for training, fine-tuning, and prediction for a given text problem.
+In this section, we explain how to create a `Task`, fine-tune it, and upload it to a model hub.
+
+---
+## Load Model
+
+If you want to build a Causal LM based on a base model, simply call `keras_hub.models.CausalLM.from_preset`
+and pass a built-in preset identifier.
+
+
+```python
+causal_lm = keras_hub.models.CausalLM.from_preset("gpt2_base_en")
+
+```
+
+``` +Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/task.json... + +Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/preprocessor.json... + +``` +
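+Before fine-tuning, you can quickly confirm that the loaded model works by generating a
+little text. The prompt below is arbitrary.
+
+
+```python
+output = causal_lm.generate("I really enjoyed this movie because", max_length=64)
+print(output)
+```
+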
+--- +## Fine-tune Model + +After loading the model, you can call `.fit()` on the model to fine-tune it. +Here, we fine-tune the model on the IMDB reviews which makes the model movie domain-specific. + + +```python +# Drop labels and keep the review text only for the Causal LM. +imdb_train_reviews = imdb_train.map(lambda x, y: x) + +# Fine-tune the Causal LM. +causal_lm.fit(imdb_train_reviews) +``` + 100/100 ━━━━━━━━━━━━━━━━━━━━ 151s 1s/step - loss: 1.0198 - sparse_categorical_accuracy: 0.3271 + +--- +## Save the Model Locally + +To upload a model, you need to first save the model locally using `save_to_preset`. + + +```python +preset_dir = "./gpt2_imdb" +causal_lm.save_to_preset(preset_dir) +``` + +Let's see the saved files. + + +```python +os.listdir(preset_dir) +``` + + + + +
+``` +['preprocessor.json', + 'tokenizer.json', + 'task.json', + 'model.weights.h5', + 'config.json', + 'metadata.json', + 'assets'] + +``` +
+### Load a Locally Saved Model
+
+A model that is saved to a local preset can be loaded using `from_preset`.
+What you save is what you get back out.
+
+
+```python
+causal_lm = keras_hub.models.CausalLM.from_preset(preset_dir)
+```
+
+You can also load the `keras_hub.models.Backbone` and `keras_hub.models.Tokenizer` objects from this preset directory.
+Note that these objects are equivalent to `causal_lm.backbone` and `causal_lm.preprocessor.tokenizer` above.
+
+
+```python
+backbone = keras_hub.models.Backbone.from_preset(preset_dir)
+tokenizer = keras_hub.models.Tokenizer.from_preset(preset_dir)
+```
+
+---
+## Upload the Model to a Model Hub
+
+After saving a preset to a directory, this directory can be uploaded to a model hub such as Kaggle or Hugging Face directly from the KerasHub library.
+To upload the model to Kaggle, the URI must start with `kaggle://`, and to upload to Hugging Face, it should start with `hf://`.
+
+### Upload to Kaggle
+
+To upload a model to Kaggle, first, we need to authenticate with Kaggle.
+This can be done in one of the following ways:
+1. Set environment variables `KAGGLE_USERNAME` and `KAGGLE_KEY`.
+2. Provide a local `~/.kaggle/kaggle.json`.
+3. Call `kagglehub.login()`.
+
+Let's make sure we are logged in before continuing.
+
+
+```python
+import kagglehub
+
+if "KAGGLE_USERNAME" not in os.environ or "KAGGLE_KEY" not in os.environ:
+    kagglehub.login()
+
+```
+
+To upload a model, we can use the `keras_hub.upload_preset(uri, preset_dir)` API, where `uri` has the format of
+`kaggle://<KAGGLE_USERNAME>/<MODEL>/Keras/<VARIATION>` for uploading to Kaggle and `preset_dir` is the directory that the model is saved in.
+
+Running the following uploads the model that is saved in `preset_dir` to Kaggle:
+
+
+```python
+kaggle_username = kagglehub.whoami()["username"]
+kaggle_uri = f"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb"
+keras_hub.upload_preset(kaggle_uri, preset_dir)
+```
+
+``` +Upload successful: preprocessor.json (834B) +Upload successful: tokenizer.json (322B) +Upload successful: task.json (2KB) +Upload successful: model.weights.h5 (475MB) +Upload successful: config.json (431B) +Upload successful: metadata.json (142B) +Upload successful: merges.txt (446KB) +Upload successful: vocabulary.json (1018KB) + +Your model instance version has been created. + +``` +
+### Upload to Hugging Face
+
+To upload a model to Hugging Face, first, we need to authenticate with Hugging Face.
+This can be done in one of the following ways:
+1. Set environment variables `HF_USERNAME` and `HF_TOKEN`.
+2. Call `huggingface_hub.notebook_login()`.
+
+Let's make sure we are logged in before continuing.
+
+
+```python
+import huggingface_hub
+
+if "HF_USERNAME" not in os.environ or "HF_TOKEN" not in os.environ:
+    huggingface_hub.notebook_login()
+```
+
+`keras_hub.upload_preset(uri, preset_dir)` can be used to upload a model to Hugging Face if `uri` has the format of
+`hf://<HF_USERNAME>/<MODEL>`.
+
+Running the following uploads the model that is saved in `preset_dir` to Hugging Face:
+
+
+```python
+hf_username = huggingface_hub.whoami()["name"]
+hf_uri = f"hf://{hf_username}/gpt2_imdb"
+keras_hub.upload_preset(hf_uri, preset_dir)
+
+```
+
+---
+## Load a User Uploaded Model
+
+After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`.
+
+```python
+causal_lm = keras_hub.models.CausalLM.from_preset(
+    f"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb"
+)
+```
+
+We can also load the model uploaded to Hugging Face by calling `from_preset`.
+
+```python
+causal_lm = keras_hub.models.CausalLM.from_preset(f"hf://{hf_username}/gpt2_imdb")
+```
+
+# Classifier Upload
+
+Uploading a classifier model is similar to Causal LM upload.
+To upload the fine-tuned model, first, the model should be saved to a local directory using the `save_to_preset`
+API, and then it can be uploaded via `keras_hub.upload_preset`.
+
+
+```python
+# Load the base model.
+classifier = keras_hub.models.Classifier.from_preset(
+    "bert_tiny_en_uncased", num_classes=2
+)
+
+# Fine-tune the classifier.
+classifier.fit(imdb_train)
+
+# Save the model to a local preset directory.
+preset_dir = "./bert_tiny_imdb"
+classifier.save_to_preset(preset_dir)
+
+# Upload to Kaggle.
+keras_hub.upload_preset(
+    f"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb", preset_dir
+)
+```
+ 100/100 ━━━━━━━━━━━━━━━━━━━━ 7s 31ms/step - loss: 0.6975 - sparse_categorical_accuracy: 0.5164
+
+
+``` +Upload successful: preprocessor.json (947B) +Upload successful: tokenizer.json (461B) +Upload successful: task.json (2KB) +Upload successful: task.weights.h5 (50MB) +Upload successful: model.weights.h5 (17MB) +Upload successful: config.json (454B) +Upload successful: metadata.json (140B) +Upload successful: vocabulary.txt (226KB) + +Your model instance version has been created. +``` +
+After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`. + +```python +classifier = keras_hub.models.Classifier.from_preset( + f"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb" +) +``` \ No newline at end of file diff --git a/redirects/api/keras_nlp/layers/fnet_encoder/index.html b/redirects/api/keras_nlp/layers/fnet_encoder/index.html index cf952224ad..3a5100b022 100644 --- a/redirects/api/keras_nlp/layers/fnet_encoder/index.html +++ b/redirects/api/keras_nlp/layers/fnet_encoder/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/index.html b/redirects/api/keras_nlp/layers/index.html index 939076da8d..6a18f1255f 100644 --- a/redirects/api/keras_nlp/layers/index.html +++ b/redirects/api/keras_nlp/layers/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/mlm_head/index.html b/redirects/api/keras_nlp/layers/mlm_head/index.html index e7b3d201be..62d08476e8 100644 --- a/redirects/api/keras_nlp/layers/mlm_head/index.html +++ b/redirects/api/keras_nlp/layers/mlm_head/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/mlm_mask_generator/index.html b/redirects/api/keras_nlp/layers/mlm_mask_generator/index.html index 15cbbf4724..e2d69ca00c 100644 --- a/redirects/api/keras_nlp/layers/mlm_mask_generator/index.html +++ b/redirects/api/keras_nlp/layers/mlm_mask_generator/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/multi_segment_packer/index.html b/redirects/api/keras_nlp/layers/multi_segment_packer/index.html index 46476327cb..e4f1342f53 100644 --- a/redirects/api/keras_nlp/layers/multi_segment_packer/index.html +++ b/redirects/api/keras_nlp/layers/multi_segment_packer/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/position_embedding/index.html b/redirects/api/keras_nlp/layers/position_embedding/index.html index facafa7c4b..b1289a7284 100644 --- a/redirects/api/keras_nlp/layers/position_embedding/index.html +++ b/redirects/api/keras_nlp/layers/position_embedding/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/sine_position_encoding/index.html b/redirects/api/keras_nlp/layers/sine_position_encoding/index.html index f741133224..f4ba245015 100644 --- a/redirects/api/keras_nlp/layers/sine_position_encoding/index.html +++ b/redirects/api/keras_nlp/layers/sine_position_encoding/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/start_end_packer/index.html b/redirects/api/keras_nlp/layers/start_end_packer/index.html index 310b463196..b061c27a8c 100644 --- a/redirects/api/keras_nlp/layers/start_end_packer/index.html +++ b/redirects/api/keras_nlp/layers/start_end_packer/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/token_and_position_embedding/index.html b/redirects/api/keras_nlp/layers/token_and_position_embedding/index.html index 6d3b71e9a4..e7ea6564eb 100644 --- a/redirects/api/keras_nlp/layers/token_and_position_embedding/index.html +++ b/redirects/api/keras_nlp/layers/token_and_position_embedding/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/transformer_decoder/index.html b/redirects/api/keras_nlp/layers/transformer_decoder/index.html index 463349ec84..c97613b370 100644 --- a/redirects/api/keras_nlp/layers/transformer_decoder/index.html +++ b/redirects/api/keras_nlp/layers/transformer_decoder/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/api/keras_nlp/layers/transformer_encoder/index.html 
b/redirects/api/keras_nlp/layers/transformer_encoder/index.html index 50ff134876..adc2424450 100644 --- a/redirects/api/keras_nlp/layers/transformer_encoder/index.html +++ b/redirects/api/keras_nlp/layers/transformer_encoder/index.html @@ -1,2 +1,2 @@ - + diff --git a/redirects/examples/nlp/gpt2_text_generation_with_kerasnlp/index.html b/redirects/examples/nlp/gpt2_text_generation_with_kerasnlp/index.html index c057bb3ab4..17cdb19bdc 100644 --- a/redirects/examples/nlp/gpt2_text_generation_with_kerasnlp/index.html +++ b/redirects/examples/nlp/gpt2_text_generation_with_kerasnlp/index.html @@ -1,2 +1,2 @@ - + diff --git a/requirements.txt b/requirements.txt index 4b40a0e06f..106f41e643 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,7 +11,8 @@ jupyter pydot boto3 tensorflow -keras-tuner keras-cv -tf_keras keras-nlp +keras-tuner +tf_keras +keras-hub-nightly # TODO: update to keras-hub. diff --git a/scripts/api_master.py b/scripts/api_master.py index 49d2b32538..045013a5c6 100644 --- a/scripts/api_master.py +++ b/scripts/api_master.py @@ -1,5 +1,6 @@ from kt_api_master import KT_API_MASTER from cv_api_master import CV_API_MASTER +from hub_api_master import HUB_API_MASTER from nlp_api_master import NLP_API_MASTER API_MASTER = { @@ -1824,5 +1825,6 @@ KT_API_MASTER, CV_API_MASTER, NLP_API_MASTER, + HUB_API_MASTER, ], } diff --git a/scripts/autogen.py b/scripts/autogen.py index 50517df940..3c190d49aa 100644 --- a/scripts/autogen.py +++ b/scripts/autogen.py @@ -28,10 +28,10 @@ import render_tags try: - import keras_nlp + import keras_hub except Exception as e: - print(f"Could not import Keras NLP. Exception: {e}") - keras_nlp = None + print(f"Could not import KerasHub. Exception: {e}") + keras_hub = None try: import keras_cv @@ -39,6 +39,11 @@ print(f"Could not import Keras CV. Exception: {e}") keras_cv = None +try: + import keras_nlp +except Exception as e: + print(f"Could not import Keras NLP. 
Exception: {e}") + keras_nlp = None EXAMPLES_GH_LOCATION = Path("keras-team") / "keras-io" / "blob" / "master" / "examples" GUIDES_GH_LOCATION = Path("keras-team") / "keras-io" / "blob" / "master" / "guides" @@ -48,6 +53,7 @@ "keras_tuner": f"{KERAS_TEAM_GH}/keras-tuner/tree/v1.4.7/", "keras_cv": f"{KERAS_TEAM_GH}/keras-cv/tree/v0.9.0/", "keras_nlp": f"{KERAS_TEAM_GH}/keras-nlp/tree/v0.15.1/", + "keras_hub": f"{KERAS_TEAM_GH}/keras-hub/tree/v0.16.1.dev202409290341/", "tf_keras": f"{KERAS_TEAM_GH}/tf-keras/tree/v2.17.0/", } USE_MULTIPROCESSING = False @@ -543,6 +549,8 @@ def make_md_source_for_entry(self, entry, path_stack, title_stack): template = render_tags.render_tags(template, keras_nlp) if "keras_cv/" in path_stack and "models/" in path_stack: template = render_tags.render_tags(template, keras_cv) + if "keras_hub/" in path_stack and "models/" in path_stack: + template = render_tags.render_tags(template, keras_hub) source_path = Path(self.md_sources_dir) / Path(*path_stack) if path.endswith("/"): md_source_path = source_path / "index.md" diff --git a/scripts/docstrings.py b/scripts/docstrings.py index d33124a0eb..9a705f69cb 100644 --- a/scripts/docstrings.py +++ b/scripts/docstrings.py @@ -94,7 +94,7 @@ def render_from_object(self, object_, signature_override: str, element): if docstring: docstring = self.process_docstring(docstring) subblocks.append(docstring) - # Render preset table for KerasCV and KerasNLP + # Render preset table for KerasCV and KerasHub if element.endswith("from_preset"): table = render_tags.render_table(import_object(element.rsplit(".", 1)[0])) if table is not None: diff --git a/scripts/examples_master.py b/scripts/examples_master.py index 6477e994a4..eb2e6d96c7 100644 --- a/scripts/examples_master.py +++ b/scripts/examples_master.py @@ -433,7 +433,7 @@ "subcategory": "Text classification", }, { - "path": "fnet_classification_with_keras_nlp", + "path": "fnet_classification_with_keras_hub", "title": "Text Classification using FNet", "subcategory": "Text classification", "keras_3": True, @@ -473,15 +473,15 @@ "keras_3": True, }, { - "path": "data_parallel_training_with_keras_nlp", - "title": "Data Parallel Training with KerasNLP and tf.distribute", + "path": "data_parallel_training_with_keras_hub", + "title": "Data Parallel Training with KerasHub and tf.distribute", "subcategory": "Text classification", "keras_3": True, }, # Machine translation { - "path": "neural_machine_translation_with_keras_nlp", - "title": "English-to-Spanish translation with KerasNLP", + "path": "neural_machine_translation_with_keras_hub", + "title": "English-to-Spanish translation with KerasHub", "subcategory": "Machine translation", "keras_3": True, }, @@ -525,8 +525,8 @@ }, # Text similarity search { - "path": "semantic_similarity_with_keras_nlp", - "title": "Semantic Similarity with KerasNLP", + "path": "semantic_similarity_with_keras_hub", + "title": "Semantic Similarity with KerasHub", "subcategory": "Text similarity search", "keras_3": True, }, @@ -807,15 +807,15 @@ }, # Text generation { - "path": "gpt2_text_generation_with_kerasnlp", - "title": "GPT2 Text Generation with KerasNLP", + "path": "gpt2_text_generation_with_keras_hub", + "title": "GPT2 Text Generation with KerasHub", "subcategory": "Text generation", "highlight": True, "keras_3": True, }, { "path": "text_generation_gpt", - "title": "GPT text generation from scratch with KerasNLP", + "title": "GPT text generation from scratch with KerasHub", "subcategory": "Text generation", "keras_3": True, }, diff --git 
a/scripts/guides_master.py b/scripts/guides_master.py index 7505d000d4..4912a80149 100644 --- a/scripts/guides_master.py +++ b/scripts/guides_master.py @@ -54,6 +54,26 @@ ], } +HUB_GUIDES_MASTER = { + "path": "keras_hub/", + "title": "KerasHub", + "toc": True, + "children": [ + { + "path": "getting_started", + "title": "Getting Started with KerasHub", + }, + { + "path": "transformer_pretraining", + "title": "Pretraining a Transformer from scratch with KerasHub", + }, + { + "path": "upload", + "title": "Uploading Models with KerasHub", + }, + ], +} + KT_GUIDES_MASTER = { "path": "keras_tuner/", "title": "Hyperparameter Tuning", @@ -202,5 +222,6 @@ KT_GUIDES_MASTER, CV_GUIDES_MASTER, NLP_GUIDES_MASTER, + HUB_GUIDES_MASTER, ], } diff --git a/scripts/hub_api_master.py b/scripts/hub_api_master.py new file mode 100644 index 0000000000..18787ee0fd --- /dev/null +++ b/scripts/hub_api_master.py @@ -0,0 +1,1426 @@ +BASE_CLASSES = { + "path": "base_classes/", + "title": "Models API", + "toc": True, + "children": [ + { + "path": "backbone", + "title": "Backbone", + "generate": [ + "keras_hub.models.Backbone", + "keras_hub.models.Backbone.from_preset", + "keras_hub.models.Backbone.token_embedding", + "keras_hub.models.Backbone.enable_lora", + "keras_hub.models.Backbone.save_lora_weights", + "keras_hub.models.Backbone.load_lora_weights", + "keras_hub.models.Backbone.save_to_preset", + ], + }, + { + "path": "task", + "title": "Task", + "generate": [ + "keras_hub.models.Task", + "keras_hub.models.Task.from_preset", + "keras_hub.models.Task.save_to_preset", + "keras_hub.models.Task.preprocessor", + "keras_hub.models.Task.backbone", + ], + }, + { + "path": "preprocessor", + "title": "Preprocessor", + "generate": [ + "keras_hub.models.Preprocessor", + "keras_hub.models.Preprocessor.from_preset", + "keras_hub.models.Preprocessor.save_to_preset", + "keras_hub.models.Preprocessor.tokenizer", + ], + }, + { + "path": "causal_lm", + "title": "CausalLM", + "generate": [ + "keras_hub.models.CausalLM", + "keras_hub.models.CausalLM.from_preset", + "keras_hub.models.CausalLM.compile", + "keras_hub.models.CausalLM.generate", + "keras_hub.models.CausalLM.save_to_preset", + "keras_hub.models.CausalLM.preprocessor", + "keras_hub.models.CausalLM.backbone", + ], + }, + { + "path": "causal_lm_preprocessor", + "title": "CausalLMPreprocessor", + "generate": [ + "keras_hub.models.CausalLMPreprocessor", + "keras_hub.models.CausalLMPreprocessor.from_preset", + "keras_hub.models.CausalLMPreprocessor.save_to_preset", + "keras_hub.models.CausalLMPreprocessor.tokenizer", + ], + }, + { + "path": "seq_2_seq_lm", + "title": "Seq2SeqLM", + "generate": [ + "keras_hub.models.Seq2SeqLM", + "keras_hub.models.Seq2SeqLM.from_preset", + "keras_hub.models.Seq2SeqLM.compile", + "keras_hub.models.Seq2SeqLM.generate", + "keras_hub.models.Seq2SeqLM.save_to_preset", + "keras_hub.models.Seq2SeqLM.preprocessor", + "keras_hub.models.Seq2SeqLM.backbone", + ], + }, + { + "path": "seq_2_seq_lm_preprocessor", + "title": "Seq2SeqLMPreprocessor", + "generate": [ + "keras_hub.models.Seq2SeqLMPreprocessor", + "keras_hub.models.Seq2SeqLMPreprocessor.from_preset", + "keras_hub.models.Seq2SeqLMPreprocessor.save_to_preset", + "keras_hub.models.Seq2SeqLMPreprocessor.tokenizer", + ], + }, + { + "path": "text_classifier", + "title": "TextClassifier", + "generate": [ + "keras_hub.models.TextClassifier", + "keras_hub.models.TextClassifier.from_preset", + "keras_hub.models.TextClassifier.compile", + "keras_hub.models.TextClassifier.save_to_preset", + 
"keras_hub.models.TextClassifier.preprocessor", + "keras_hub.models.TextClassifier.backbone", + ], + }, + { + "path": "text_classifier_preprocessor", + "title": "TextClassifierPreprocessor", + "generate": [ + "keras_hub.models.TextClassifierPreprocessor", + "keras_hub.models.TextClassifierPreprocessor.from_preset", + "keras_hub.models.TextClassifierPreprocessor.save_to_preset", + "keras_hub.models.TextClassifierPreprocessor.tokenizer", + ], + }, + { + "path": "masked_lm", + "title": "MaskedLM", + "generate": [ + "keras_hub.models.MaskedLM", + "keras_hub.models.MaskedLM.from_preset", + "keras_hub.models.MaskedLM.compile", + "keras_hub.models.MaskedLM.save_to_preset", + "keras_hub.models.MaskedLM.preprocessor", + "keras_hub.models.MaskedLM.backbone", + ], + }, + { + "path": "masked_lm_preprocessor", + "title": "MaskedLMPreprocessor", + "generate": [ + "keras_hub.models.MaskedLMPreprocessor", + "keras_hub.models.MaskedLMPreprocessor.from_preset", + "keras_hub.models.MaskedLMPreprocessor.save_to_preset", + "keras_hub.models.MaskedLMPreprocessor.tokenizer", + ], + }, + { + "path": "upload_preset", + "title": "upload_preset", + "generate": ["keras_hub.upload_preset"], + }, + ], +} + +MODELS_MASTER = { + "path": "models/", + "title": "Pretrained Models", + "toc": True, + "children": [ + { + "path": "albert/", + "title": "Albert", + "toc": True, + "children": [ + { + "path": "albert_tokenizer", + "title": "AlbertTokenizer", + "generate": [ + "keras_hub.tokenizers.AlbertTokenizer", + "keras_hub.tokenizers.AlbertTokenizer.from_preset", + ], + }, + { + "path": "albert_backbone", + "title": "AlbertBackbone model", + "generate": [ + "keras_hub.models.AlbertBackbone", + "keras_hub.models.AlbertBackbone.from_preset", + "keras_hub.models.AlbertBackbone.token_embedding", + ], + }, + { + "path": "albert_text_classifier", + "title": "AlbertTextClassifier model", + "generate": [ + "keras_hub.models.AlbertTextClassifier", + "keras_hub.models.AlbertTextClassifier.from_preset", + "keras_hub.models.AlbertTextClassifier.backbone", + "keras_hub.models.AlbertTextClassifier.preprocessor", + ], + }, + { + "path": "albert_text_classifier_preprocessor", + "title": "AlbertTextClassifierPreprocessor layer", + "generate": [ + "keras_hub.models.AlbertTextClassifierPreprocessor", + "keras_hub.models.AlbertTextClassifierPreprocessor.from_preset", + "keras_hub.models.AlbertTextClassifierPreprocessor.tokenizer", + ], + }, + { + "path": "albert_masked_lm", + "title": "AlbertMaskedLM model", + "generate": [ + "keras_hub.models.AlbertMaskedLM", + "keras_hub.models.AlbertMaskedLM.from_preset", + "keras_hub.models.AlbertMaskedLM.backbone", + "keras_hub.models.AlbertMaskedLM.preprocessor", + ], + }, + { + "path": "albert_masked_lm_preprocessor", + "title": "AlbertMaskedLMPreprocessor layer", + "generate": [ + "keras_hub.models.AlbertMaskedLMPreprocessor", + "keras_hub.models.AlbertMaskedLMPreprocessor.from_preset", + "keras_hub.models.AlbertMaskedLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "bart/", + "title": "Bart", + "toc": True, + "children": [ + { + "path": "bart_tokenizer", + "title": "BertTokenizer", + "generate": [ + "keras_hub.tokenizers.BertTokenizer", + "keras_hub.tokenizers.BertTokenizer.from_preset", + ], + }, + { + "path": "bart_backbone", + "title": "BertBackbone model", + "generate": [ + "keras_hub.models.BertBackbone", + "keras_hub.models.BertBackbone.from_preset", + "keras_hub.models.BertBackbone.token_embedding", + ], + }, + { + "path": "bart_seq_2_seq_lm", + "title": "BartSeq2SeqLM model", + 
"generate": [ + "keras_hub.models.BartSeq2SeqLM", + "keras_hub.models.BartSeq2SeqLM.from_preset", + "keras_hub.models.BartSeq2SeqLM.generate", + "keras_hub.models.BartSeq2SeqLM.backbone", + "keras_hub.models.BartSeq2SeqLM.preprocessor", + ], + }, + { + "path": "bart_seq_2_seq_lm_preprocessor", + "title": "BartSeq2SeqLMPreprocessor layer", + "generate": [ + "keras_hub.models.BartSeq2SeqLMPreprocessor", + "keras_hub.models.BartSeq2SeqLMPreprocessor.from_preset", + "keras_hub.models.BartSeq2SeqLMPreprocessor.generate_preprocess", + "keras_hub.models.BartSeq2SeqLMPreprocessor.generate_postprocess", + "keras_hub.models.BartSeq2SeqLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "bert/", + "title": "Bert", + "toc": True, + "children": [ + { + "path": "bert_tokenizer", + "title": "BertTokenizer", + "generate": [ + "keras_hub.tokenizers.BertTokenizer", + "keras_hub.tokenizers.BertTokenizer.from_preset", + ], + }, + { + "path": "bert_backbone", + "title": "BertBackbone model", + "generate": [ + "keras_hub.models.BertBackbone", + "keras_hub.models.BertBackbone.from_preset", + "keras_hub.models.BertBackbone.token_embedding", + ], + }, + { + "path": "bert_text_classifier", + "title": "BertTextClassifier model", + "generate": [ + "keras_hub.models.BertTextClassifier", + "keras_hub.models.BertTextClassifier.from_preset", + "keras_hub.models.BertTextClassifier.backbone", + "keras_hub.models.BertTextClassifier.preprocessor", + ], + }, + { + "path": "bert_text_classifier_preprocessor", + "title": "BertTextClassifierPreprocessor layer", + "generate": [ + "keras_hub.models.BertTextClassifierPreprocessor", + "keras_hub.models.BertTextClassifierPreprocessor.from_preset", + "keras_hub.models.BertTextClassifierPreprocessor.tokenizer", + ], + }, + { + "path": "bert_masked_lm", + "title": "BertMaskedLM model", + "generate": [ + "keras_hub.models.BertMaskedLM", + "keras_hub.models.BertMaskedLM.from_preset", + "keras_hub.models.BertMaskedLM.backbone", + "keras_hub.models.BertMaskedLM.preprocessor", + ], + }, + { + "path": "bert_masked_lm_preprocessor", + "title": "BertMaskedLMPreprocessor layer", + "generate": [ + "keras_hub.models.BertMaskedLMPreprocessor", + "keras_hub.models.BertMaskedLMPreprocessor.from_preset", + "keras_hub.models.BertMaskedLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "bloom/", + "title": "Bloom", + "toc": True, + "children": [ + { + "path": "bloom_tokenizer", + "title": "BloomTokenizer", + "generate": [ + "keras_hub.tokenizers.BloomTokenizer", + "keras_hub.tokenizers.BloomTokenizer.from_preset", + ], + }, + { + "path": "bloom_backbone", + "title": "BloomBackbone model", + "generate": [ + "keras_hub.models.BloomBackbone", + "keras_hub.models.BloomBackbone.from_preset", + "keras_hub.models.BloomBackbone.token_embedding", + "keras_hub.models.BloomBackbone.enable_lora", + ], + }, + { + "path": "bloom_causal_lm", + "title": "BloomCausalLM model", + "generate": [ + "keras_hub.models.BloomCausalLM", + "keras_hub.models.BloomCausalLM.from_preset", + "keras_hub.models.BloomCausalLM.generate", + "keras_hub.models.BloomCausalLM.backbone", + "keras_hub.models.BloomCausalLM.preprocessor", + ], + }, + { + "path": "bloom_causal_lm_preprocessor", + "title": "BloomCausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.BloomCausalLMPreprocessor", + "keras_hub.models.BloomCausalLMPreprocessor.from_preset", + "keras_hub.models.BloomCausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "deberta_v3/", + "title": "DebertaV3", + "toc": True, + "children": [ + { + 
"path": "deberta_v3_tokenizer", + "title": "DebertaV3Tokenizer", + "generate": [ + "keras_hub.tokenizers.DebertaV3Tokenizer", + "keras_hub.tokenizers.DebertaV3Tokenizer.from_preset", + ], + }, + { + "path": "deberta_v3_backbone", + "title": "DebertaV3Backbone model", + "generate": [ + "keras_hub.models.DebertaV3Backbone", + "keras_hub.models.DebertaV3Backbone.from_preset", + "keras_hub.models.DebertaV3Backbone.token_embedding", + ], + }, + { + "path": "deberta_v3_text_classifier", + "title": "DebertaV3TextClassifier model", + "generate": [ + "keras_hub.models.DebertaV3TextClassifier", + "keras_hub.models.DebertaV3TextClassifier.from_preset", + "keras_hub.models.DebertaV3TextClassifier.backbone", + "keras_hub.models.DebertaV3TextClassifier.preprocessor", + ], + }, + { + "path": "deberta_v3_text_classifier_preprocessor", + "title": "DebertaV3TextClassifierPreprocessor layer", + "generate": [ + "keras_hub.models.DebertaV3TextClassifierPreprocessor", + "keras_hub.models.DebertaV3TextClassifierPreprocessor.from_preset", + "keras_hub.models.DebertaV3TextClassifierPreprocessor.tokenizer", + ], + }, + { + "path": "deberta_v3_masked_lm", + "title": "DebertaV3MaskedLM model", + "generate": [ + "keras_hub.models.DebertaV3MaskedLM", + "keras_hub.models.DebertaV3MaskedLM.from_preset", + "keras_hub.models.DebertaV3MaskedLM.backbone", + "keras_hub.models.DebertaV3MaskedLM.preprocessor", + ], + }, + { + "path": "deberta_v3_masked_lm_preprocessor", + "title": "DebertaV3MaskedLMPreprocessor layer", + "generate": [ + "keras_hub.models.DebertaV3MaskedLMPreprocessor", + "keras_hub.models.DebertaV3MaskedLMPreprocessor.from_preset", + "keras_hub.models.DebertaV3MaskedLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "distil_bert/", + "title": "DistilBert", + "toc": True, + "children": [ + { + "path": "distil_bert_tokenizer", + "title": "DistilBertTokenizer", + "generate": [ + "keras_hub.tokenizers.DistilBertTokenizer", + "keras_hub.tokenizers.DistilBertTokenizer.from_preset", + ], + }, + { + "path": "distil_bert_backbone", + "title": "DistilBertBackbone model", + "generate": [ + "keras_hub.models.DistilBertBackbone", + "keras_hub.models.DistilBertBackbone.from_preset", + "keras_hub.models.DistilBertBackbone.token_embedding", + ], + }, + { + "path": "distil_bert_text_classifier", + "title": "DistilBertTextClassifier model", + "generate": [ + "keras_hub.models.DistilBertTextClassifier", + "keras_hub.models.DistilBertTextClassifier.from_preset", + "keras_hub.models.DistilBertTextClassifier.backbone", + "keras_hub.models.DistilBertTextClassifier.preprocessor", + ], + }, + { + "path": "distil_bert_text_classifier_preprocessor", + "title": "DistilBertTextClassifierPreprocessor layer", + "generate": [ + "keras_hub.models.DistilBertTextClassifierPreprocessor", + "keras_hub.models.DistilBertTextClassifierPreprocessor.from_preset", + "keras_hub.models.DistilBertTextClassifierPreprocessor.tokenizer", + ], + }, + { + "path": "distil_bert_masked_lm", + "title": "DistilBertMaskedLM model", + "generate": [ + "keras_hub.models.DistilBertMaskedLM", + "keras_hub.models.DistilBertMaskedLM.from_preset", + "keras_hub.models.DistilBertMaskedLM.backbone", + "keras_hub.models.DistilBertMaskedLM.preprocessor", + ], + }, + { + "path": "distil_bert_masked_lm_preprocessor", + "title": "DistilBertMaskedLMPreprocessor layer", + "generate": [ + "keras_hub.models.DistilBertMaskedLMPreprocessor", + "keras_hub.models.DistilBertMaskedLMPreprocessor.from_preset", + "keras_hub.models.DistilBertMaskedLMPreprocessor.tokenizer", + ], + }, + 
], + }, + { + "path": "gemma/", + "title": "Gemma", + "toc": True, + "children": [ + { + "path": "gemma_tokenizer", + "title": "GemmaTokenizer", + "generate": [ + "keras_hub.tokenizers.GemmaTokenizer", + "keras_hub.tokenizers.GemmaTokenizer.from_preset", + ], + }, + { + "path": "gemma_backbone", + "title": "GemmaBackbone model", + "generate": [ + "keras_hub.models.GemmaBackbone", + "keras_hub.models.GemmaBackbone.from_preset", + "keras_hub.models.GemmaBackbone.token_embedding", + "keras_hub.models.GemmaBackbone.enable_lora", + "keras_hub.models.GemmaBackbone.get_layout_map", + ], + }, + { + "path": "gemma_causal_lm", + "title": "GemmaCausalLM model", + "generate": [ + "keras_hub.models.GemmaCausalLM", + "keras_hub.models.GemmaCausalLM.from_preset", + "keras_hub.models.GemmaCausalLM.generate", + "keras_hub.models.GemmaCausalLM.backbone", + "keras_hub.models.GemmaCausalLM.preprocessor", + "keras_hub.models.GemmaCausalLM.score", + ], + }, + { + "path": "gemma_causal_lm_preprocessor", + "title": "GemmaCausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.GemmaCausalLMPreprocessor", + "keras_hub.models.GemmaCausalLMPreprocessor.from_preset", + "keras_hub.models.GemmaCausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "electra/", + "title": "Electra", + "toc": True, + "children": [ + { + "path": "electra_tokenizer", + "title": "ElectraTokenizer", + "generate": [ + "keras_hub.tokenizers.ElectraTokenizer", + "keras_hub.tokenizers.ElectraTokenizer.from_preset", + ], + }, + { + "path": "electra_backbone", + "title": "ElectraBackbone model", + "generate": [ + "keras_hub.models.ElectraBackbone", + "keras_hub.models.ElectraBackbone.from_preset", + "keras_hub.models.ElectraBackbone.token_embedding", + ], + }, + ], + }, + { + "path": "falcon/", + "title": "Falcon", + "toc": True, + "children": [ + { + "path": "falcon_tokenizer", + "title": "FalconTokenizer", + "generate": [ + "keras_hub.tokenizers.FalconTokenizer", + "keras_hub.tokenizers.FalconTokenizer.from_preset", + ], + }, + { + "path": "falcon_backbone", + "title": "FalconBackbone model", + "generate": [ + "keras_hub.models.FalconBackbone", + "keras_hub.models.FalconBackbone.from_preset", + "keras_hub.models.FalconBackbone.token_embedding", + ], + }, + { + "path": "falcon_causal_lm", + "title": "FalconCausalLM model", + "generate": [ + "keras_hub.models.FalconCausalLM", + "keras_hub.models.FalconCausalLM.from_preset", + "keras_hub.models.FalconCausalLM.generate", + "keras_hub.models.FalconCausalLM.backbone", + "keras_hub.models.FalconCausalLM.preprocessor", + ], + }, + { + "path": "falcon_causal_lm_preprocessor", + "title": "FalconCausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.FalconCausalLMPreprocessor", + "keras_hub.models.FalconCausalLMPreprocessor.from_preset", + "keras_hub.models.FalconCausalLMPreprocessor.generate_preprocess", + "keras_hub.models.FalconCausalLMPreprocessor.generate_postprocess", + "keras_hub.models.FalconCausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "f_net/", + "title": "FNet", + "toc": True, + "children": [ + { + "path": "f_net_tokenizer", + "title": "FNetTokenizer", + "generate": [ + "keras_hub.tokenizers.FNetTokenizer", + "keras_hub.tokenizers.FNetTokenizer.from_preset", + ], + }, + { + "path": "f_net_backbone", + "title": "FNetBackbone model", + "generate": [ + "keras_hub.models.FNetBackbone", + "keras_hub.models.FNetBackbone.from_preset", + "keras_hub.models.FNetBackbone.token_embedding", + ], + }, + { + "path": "f_net_text_classifier", + "title": 
"FNetTextClassifier model", + "generate": [ + "keras_hub.models.FNetTextClassifier", + "keras_hub.models.FNetTextClassifier.from_preset", + "keras_hub.models.FNetTextClassifier.backbone", + "keras_hub.models.FNetTextClassifier.preprocessor", + ], + }, + { + "path": "f_net_text_classifier_preprocessor", + "title": "FNetTextClassifierPreprocessor layer", + "generate": [ + "keras_hub.models.FNetTextClassifierPreprocessor", + "keras_hub.models.FNetTextClassifierPreprocessor.from_preset", + "keras_hub.models.FNetTextClassifierPreprocessor.tokenizer", + ], + }, + { + "path": "f_net_masked_lm", + "title": "FNetMaskedLM model", + "generate": [ + "keras_hub.models.FNetMaskedLM", + "keras_hub.models.FNetMaskedLM.from_preset", + "keras_hub.models.FNetMaskedLM.backbone", + "keras_hub.models.FNetMaskedLM.preprocessor", + ], + }, + { + "path": "f_net_masked_lm_preprocessor", + "title": "FNetMaskedLMPreprocessor layer", + "generate": [ + "keras_hub.models.FNetMaskedLMPreprocessor", + "keras_hub.models.FNetMaskedLMPreprocessor.from_preset", + "keras_hub.models.FNetMaskedLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "gpt2/", + "title": "GPT2", + "toc": True, + "children": [ + { + "path": "gpt2_tokenizer", + "title": "GPT2Tokenizer", + "generate": [ + "keras_hub.tokenizers.GPT2Tokenizer", + "keras_hub.tokenizers.GPT2Tokenizer.from_preset", + ], + }, + { + "path": "gpt2_backbone", + "title": "GPT2Backbone model", + "generate": [ + "keras_hub.models.GPT2Backbone", + "keras_hub.models.GPT2Backbone.from_preset", + "keras_hub.models.GPT2Backbone.token_embedding", + ], + }, + { + "path": "gpt2_causal_lm", + "title": "GPT2CausalLM model", + "generate": [ + "keras_hub.models.GPT2CausalLM", + "keras_hub.models.GPT2CausalLM.from_preset", + "keras_hub.models.GPT2CausalLM.generate", + "keras_hub.models.GPT2CausalLM.backbone", + "keras_hub.models.GPT2CausalLM.preprocessor", + ], + }, + { + "path": "gpt2_causal_lm_preprocessor", + "title": "GPT2CausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.GPT2CausalLMPreprocessor", + "keras_hub.models.GPT2CausalLMPreprocessor.from_preset", + "keras_hub.models.GPT2CausalLMPreprocessor.generate_preprocess", + "keras_hub.models.GPT2CausalLMPreprocessor.generate_postprocess", + "keras_hub.models.GPT2CausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "llama/", + "title": "Llama", + "toc": True, + "children": [ + { + "path": "llama_tokenizer", + "title": "LlamaTokenizer", + "generate": [ + "keras_hub.tokenizers.LlamaTokenizer", + "keras_hub.tokenizers.LlamaTokenizer.from_preset", + ], + }, + { + "path": "llama_backbone", + "title": "LlamaBackbone model", + "generate": [ + "keras_hub.models.LlamaBackbone", + "keras_hub.models.LlamaBackbone.from_preset", + "keras_hub.models.LlamaBackbone.token_embedding", + "keras_hub.models.LlamaBackbone.enable_lora", + ], + }, + { + "path": "llama_causal_lm", + "title": "LlamaCausalLM model", + "generate": [ + "keras_hub.models.LlamaCausalLM", + "keras_hub.models.LlamaCausalLM.from_preset", + "keras_hub.models.LlamaCausalLM.generate", + "keras_hub.models.LlamaCausalLM.backbone", + "keras_hub.models.LlamaCausalLM.preprocessor", + ], + }, + { + "path": "llama_causal_lm_preprocessor", + "title": "LlamaCausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.LlamaCausalLMPreprocessor", + "keras_hub.models.LlamaCausalLMPreprocessor.from_preset", + "keras_hub.models.LlamaCausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "llama3/", + "title": "Llama3", + "toc": True, + "children": [ + { + 
"path": "llama3_tokenizer", + "title": "Llama3Tokenizer", + "generate": [ + "keras_hub.tokenizers.Llama3Tokenizer", + "keras_hub.tokenizers.Llama3Tokenizer.from_preset", + ], + }, + { + "path": "llama3_backbone", + "title": "Llama3Backbone model", + "generate": [ + "keras_hub.models.Llama3Backbone", + "keras_hub.models.Llama3Backbone.from_preset", + "keras_hub.models.Llama3Backbone.token_embedding", + "keras_hub.models.Llama3Backbone.enable_lora", + ], + }, + { + "path": "llama3_causal_lm", + "title": "Llama3CausalLM model", + "generate": [ + "keras_hub.models.Llama3CausalLM", + "keras_hub.models.Llama3CausalLM.from_preset", + "keras_hub.models.Llama3CausalLM.generate", + "keras_hub.models.Llama3CausalLM.backbone", + "keras_hub.models.Llama3CausalLM.preprocessor", + ], + }, + { + "path": "llama3_causal_lm_preprocessor", + "title": "Llama3CausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.Llama3CausalLMPreprocessor", + "keras_hub.models.Llama3CausalLMPreprocessor.from_preset", + "keras_hub.models.Llama3CausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "mistral/", + "title": "Mistral", + "toc": True, + "children": [ + { + "path": "mistral_tokenizer", + "title": "MistralTokenizer", + "generate": [ + "keras_hub.tokenizers.MistralTokenizer", + "keras_hub.tokenizers.MistralTokenizer.from_preset", + ], + }, + { + "path": "mistral_backbone", + "title": "MistralBackbone model", + "generate": [ + "keras_hub.models.MistralBackbone", + "keras_hub.models.MistralBackbone.from_preset", + "keras_hub.models.MistralBackbone.token_embedding", + "keras_hub.models.MistralBackbone.enable_lora", + ], + }, + { + "path": "mistral_causal_lm", + "title": "MistralCausalLM model", + "generate": [ + "keras_hub.models.MistralCausalLM", + "keras_hub.models.MistralCausalLM.from_preset", + "keras_hub.models.MistralCausalLM.generate", + "keras_hub.models.MistralCausalLM.backbone", + "keras_hub.models.MistralCausalLM.preprocessor", + ], + }, + { + "path": "mistral_causal_lm_preprocessor", + "title": "MistralCausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.MistralCausalLMPreprocessor", + "keras_hub.models.MistralCausalLMPreprocessor.from_preset", + "keras_hub.models.MistralCausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "opt/", + "title": "OPT", + "toc": True, + "children": [ + { + "path": "opt_tokenizer", + "title": "OPTTokenizer", + "generate": [ + "keras_hub.tokenizers.OPTTokenizer", + "keras_hub.tokenizers.OPTTokenizer.from_preset", + ], + }, + { + "path": "opt_backbone", + "title": "OPTBackbone model", + "generate": [ + "keras_hub.models.OPTBackbone", + "keras_hub.models.OPTBackbone.from_preset", + "keras_hub.models.OPTBackbone.token_embedding", + ], + }, + { + "path": "opt_causal_lm", + "title": "OPTCausalLM model", + "generate": [ + "keras_hub.models.OPTCausalLM", + "keras_hub.models.OPTCausalLM.from_preset", + "keras_hub.models.OPTCausalLM.generate", + "keras_hub.models.OPTCausalLM.backbone", + "keras_hub.models.OPTCausalLM.preprocessor", + ], + }, + { + "path": "opt_causal_lm_preprocessor", + "title": "OPTCausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.OPTCausalLMPreprocessor", + "keras_hub.models.OPTCausalLMPreprocessor.from_preset", + "keras_hub.models.OPTCausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "pali_gemma/", + "title": "PaliGemma", + "toc": True, + "children": [ + { + "path": "pali_gemma_tokenizer", + "title": "PaliGemmaTokenizer", + "generate": [ + "keras_hub.tokenizers.PaliGemmaTokenizer", + 
"keras_hub.tokenizers.PaliGemmaTokenizer.from_preset", + ], + }, + { + "path": "pali_gemma_backbone", + "title": "PaliGemmaBackbone model", + "generate": [ + "keras_hub.models.PaliGemmaBackbone", + "keras_hub.models.PaliGemmaBackbone.from_preset", + "keras_hub.models.PaliGemmaBackbone.token_embedding", + ], + }, + { + "path": "pali_gemma_causal_lm", + "title": "PaliGemmaCausalLM model", + "generate": [ + "keras_hub.models.PaliGemmaCausalLM", + "keras_hub.models.PaliGemmaCausalLM.from_preset", + "keras_hub.models.PaliGemmaCausalLM.generate", + "keras_hub.models.PaliGemmaCausalLM.backbone", + "keras_hub.models.PaliGemmaCausalLM.preprocessor", + ], + }, + { + "path": "pali_gemma_causal_lm_preprocessor", + "title": "PaliGemmaCausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.PaliGemmaCausalLMPreprocessor", + "keras_hub.models.PaliGemmaCausalLMPreprocessor.from_preset", + "keras_hub.models.PaliGemmaCausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "phi3/", + "title": "Phi3", + "toc": True, + "children": [ + { + "path": "phi3_tokenizer", + "title": "Phi3Tokenizer", + "generate": [ + "keras_hub.tokenizers.Phi3Tokenizer", + "keras_hub.tokenizers.Phi3Tokenizer.from_preset", + ], + }, + { + "path": "phi3_backbone", + "title": "Phi3Backbone model", + "generate": [ + "keras_hub.models.Phi3Backbone", + "keras_hub.models.Phi3Backbone.from_preset", + "keras_hub.models.Phi3Backbone.token_embedding", + ], + }, + { + "path": "phi3_causal_lm", + "title": "Phi3CausalLM model", + "generate": [ + "keras_hub.models.Phi3CausalLM", + "keras_hub.models.Phi3CausalLM.from_preset", + "keras_hub.models.Phi3CausalLM.generate", + "keras_hub.models.Phi3CausalLM.backbone", + "keras_hub.models.Phi3CausalLM.preprocessor", + ], + }, + { + "path": "phi3_causal_lm_preprocessor", + "title": "Phi3CausalLMPreprocessor layer", + "generate": [ + "keras_hub.models.Phi3CausalLMPreprocessor", + "keras_hub.models.Phi3CausalLMPreprocessor.from_preset", + "keras_hub.models.Phi3CausalLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "resnet/", + "title": "ResNet", + "toc": True, + "children": [ + { + "path": "resnet_image_converter", + "title": "ResNetImageConverter", + "generate": [ + "keras_hub.layers.ResNetImageConverter", + "keras_hub.layers.ResNetImageConverter.from_preset", + ], + }, + { + "path": "resnet_backbone", + "title": "ResNetBackbone model", + "generate": [ + "keras_hub.models.ResNetBackbone", + "keras_hub.models.ResNetBackbone.from_preset", + "keras_hub.models.ResNetBackbone.token_embedding", + ], + }, + { + "path": "resnet_image_classifier", + "title": "ResNetImageClassifier model", + "generate": [ + "keras_hub.models.ResNetImageClassifier", + "keras_hub.models.ResNetImageClassifier.from_preset", + "keras_hub.models.ResNetImageClassifier.backbone", + "keras_hub.models.ResNetImageClassifier.preprocessor", + ], + }, + { + "path": "resnet_image_classifier_preprocessor", + "title": "ResNetImageClassifierPreprocessor layer", + "generate": [ + "keras_hub.models.ResNetImageClassifierPreprocessor", + "keras_hub.models.ResNetImageClassifierPreprocessor.from_preset", + "keras_hub.models.ResNetImageClassifierPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "roberta/", + "title": "Roberta", + "toc": True, + "children": [ + { + "path": "roberta_tokenizer", + "title": "RobertaTokenizer", + "generate": [ + "keras_hub.tokenizers.RobertaTokenizer", + "keras_hub.tokenizers.RobertaTokenizer.from_preset", + ], + }, + { + "path": "roberta_backbone", + "title": "RobertaBackbone model", + 
"generate": [ + "keras_hub.models.RobertaBackbone", + "keras_hub.models.RobertaBackbone.from_preset", + "keras_hub.models.RobertaBackbone.token_embedding", + ], + }, + { + "path": "roberta_text_classifier", + "title": "RobertaTextClassifier model", + "generate": [ + "keras_hub.models.RobertaTextClassifier", + "keras_hub.models.RobertaTextClassifier.from_preset", + "keras_hub.models.RobertaTextClassifier.backbone", + "keras_hub.models.RobertaTextClassifier.preprocessor", + ], + }, + { + "path": "roberta_text_classifier_preprocessor", + "title": "RobertaTextClassifierPreprocessor layer", + "generate": [ + "keras_hub.models.RobertaTextClassifierPreprocessor", + "keras_hub.models.RobertaTextClassifierPreprocessor.from_preset", + "keras_hub.models.RobertaTextClassifierPreprocessor.tokenizer", + ], + }, + { + "path": "roberta_masked_lm", + "title": "RobertaMaskedLM model", + "generate": [ + "keras_hub.models.RobertaMaskedLM", + "keras_hub.models.RobertaMaskedLM.from_preset", + "keras_hub.models.RobertaMaskedLM.backbone", + "keras_hub.models.RobertaMaskedLM.preprocessor", + ], + }, + { + "path": "roberta_masked_lm_preprocessor", + "title": "RobertaMaskedLMPreprocessor layer", + "generate": [ + "keras_hub.models.RobertaMaskedLMPreprocessor", + "keras_hub.models.RobertaMaskedLMPreprocessor.from_preset", + "keras_hub.models.RobertaMaskedLMPreprocessor.tokenizer", + ], + }, + ], + }, + { + "path": "xlm_roberta/", + "title": "XLMRoberta", + "toc": True, + "children": [ + { + "path": "xlm_roberta_tokenizer", + "title": "XLMRobertaTokenizer", + "generate": [ + "keras_hub.tokenizers.XLMRobertaTokenizer", + "keras_hub.tokenizers.XLMRobertaTokenizer.from_preset", + ], + }, + { + "path": "xlm_roberta_backbone", + "title": "XLMRobertaBackbone model", + "generate": [ + "keras_hub.models.XLMRobertaBackbone", + "keras_hub.models.XLMRobertaBackbone.from_preset", + "keras_hub.models.XLMRobertaBackbone.token_embedding", + ], + }, + { + "path": "xlm_roberta_text_classifier", + "title": "XLMRobertaTextClassifier model", + "generate": [ + "keras_hub.models.XLMRobertaTextClassifier", + "keras_hub.models.XLMRobertaTextClassifier.from_preset", + "keras_hub.models.XLMRobertaTextClassifier.backbone", + "keras_hub.models.XLMRobertaTextClassifier.preprocessor", + ], + }, + { + "path": "xlm_roberta_text_classifier_preprocessor", + "title": "XLMRobertaTextClassifierPreprocessor layer", + "generate": [ + "keras_hub.models.XLMRobertaTextClassifierPreprocessor", + "keras_hub.models.XLMRobertaTextClassifierPreprocessor.from_preset", + "keras_hub.models.XLMRobertaTextClassifierPreprocessor.tokenizer", + ], + }, + { + "path": "xlm_roberta_masked_lm", + "title": "XLMRobertaMaskedLM model", + "generate": [ + "keras_hub.models.XLMRobertaMaskedLM", + "keras_hub.models.XLMRobertaMaskedLM.from_preset", + "keras_hub.models.XLMRobertaMaskedLM.backbone", + "keras_hub.models.XLMRobertaMaskedLM.preprocessor", + ], + }, + { + "path": "xlm_roberta_masked_lm_preprocessor", + "title": "XLMRobertaMaskedLMPreprocessor layer", + "generate": [ + "keras_hub.models.XLMRobertaMaskedLMPreprocessor", + "keras_hub.models.XLMRobertaMaskedLMPreprocessor.from_preset", + "keras_hub.models.XLMRobertaMaskedLMPreprocessor.tokenizer", + ], + }, + ], + }, + ], +} + +SAMPLERS_MASTER = { + "path": "samplers/", + "title": "Samplers", + "toc": True, + "children": [ + { + "path": "samplers", + "title": "Sampler base class", + "generate": [ + "keras_hub.samplers.Sampler", + "keras_hub.samplers.Sampler.get_next_token", + ], + }, + { + "path": "beam_sampler", + "title": 
"BeamSampler", + "generate": ["keras_hub.samplers.BeamSampler"], + }, + { + "path": "contrastive_sampler", + "title": "ContrastiveSampler", + "generate": ["keras_hub.samplers.ContrastiveSampler"], + }, + { + "path": "greedy_sampler", + "title": "GreedySampler", + "generate": ["keras_hub.samplers.GreedySampler"], + }, + { + "path": "random_sampler", + "title": "RandomSampler", + "generate": ["keras_hub.samplers.RandomSampler"], + }, + { + "path": "top_k_sampler", + "title": "TopKSampler", + "generate": ["keras_hub.samplers.TopKSampler"], + }, + { + "path": "top_p_sampler", + "title": "TopPSampler", + "generate": ["keras_hub.samplers.TopPSampler"], + }, + ], +} + +TOKENIZERS_MASTER = { + "path": "tokenizers/", + "title": "Tokenizers", + "toc": True, + "children": [ + { + "path": "tokenizer", + "title": "Tokenizer", + "generate": [ + "keras_hub.tokenizers.Tokenizer", + "keras_hub.tokenizers.Tokenizer.from_preset", + "keras_hub.tokenizers.Tokenizer.save_to_preset", + ], + }, + { + "path": "word_piece_tokenizer", + "title": "WordPieceTokenizer", + "generate": [ + "keras_hub.tokenizers.WordPieceTokenizer", + "keras_hub.tokenizers.WordPieceTokenizer.tokenize", + "keras_hub.tokenizers.WordPieceTokenizer.detokenize", + "keras_hub.tokenizers.WordPieceTokenizer.get_vocabulary", + "keras_hub.tokenizers.WordPieceTokenizer.vocabulary_size", + "keras_hub.tokenizers.WordPieceTokenizer.token_to_id", + "keras_hub.tokenizers.WordPieceTokenizer.id_to_token", + ], + }, + { + "path": "sentence_piece_tokenizer", + "title": "SentencePieceTokenizer", + "generate": [ + "keras_hub.tokenizers.SentencePieceTokenizer", + "keras_hub.tokenizers.SentencePieceTokenizer.tokenize", + "keras_hub.tokenizers.SentencePieceTokenizer.detokenize", + "keras_hub.tokenizers.SentencePieceTokenizer.get_vocabulary", + "keras_hub.tokenizers.SentencePieceTokenizer.vocabulary_size", + "keras_hub.tokenizers.SentencePieceTokenizer.token_to_id", + "keras_hub.tokenizers.SentencePieceTokenizer.id_to_token", + ], + }, + { + "path": "byte_pair_tokenizer", + "title": "BytePairTokenizer", + "generate": [ + "keras_hub.tokenizers.BytePairTokenizer", + "keras_hub.tokenizers.BytePairTokenizer.tokenize", + "keras_hub.tokenizers.BytePairTokenizer.detokenize", + "keras_hub.tokenizers.BytePairTokenizer.get_vocabulary", + "keras_hub.tokenizers.BytePairTokenizer.vocabulary_size", + "keras_hub.tokenizers.BytePairTokenizer.token_to_id", + "keras_hub.tokenizers.BytePairTokenizer.id_to_token", + ], + }, + { + "path": "byte_tokenizer", + "title": "ByteTokenizer", + "generate": [ + "keras_hub.tokenizers.ByteTokenizer", + "keras_hub.tokenizers.ByteTokenizer.tokenize", + "keras_hub.tokenizers.ByteTokenizer.detokenize", + "keras_hub.tokenizers.ByteTokenizer.get_vocabulary", + "keras_hub.tokenizers.ByteTokenizer.vocabulary_size", + "keras_hub.tokenizers.ByteTokenizer.token_to_id", + "keras_hub.tokenizers.ByteTokenizer.id_to_token", + ], + }, + { + "path": "unicode_codepoint_tokenizer", + "title": "UnicodeCodepointTokenizer", + "generate": [ + "keras_hub.tokenizers.UnicodeCodepointTokenizer", + "keras_hub.tokenizers.UnicodeCodepointTokenizer.tokenize", + "keras_hub.tokenizers.UnicodeCodepointTokenizer.detokenize", + "keras_hub.tokenizers.UnicodeCodepointTokenizer.get_vocabulary", + "keras_hub.tokenizers.UnicodeCodepointTokenizer.vocabulary_size", + "keras_hub.tokenizers.UnicodeCodepointTokenizer.token_to_id", + "keras_hub.tokenizers.UnicodeCodepointTokenizer.id_to_token", + ], + }, + { + "path": "compute_word_piece_vocabulary", + "title": "compute_word_piece_vocabulary 
function", + "generate": ["keras_hub.tokenizers.compute_word_piece_vocabulary"], + }, + { + "path": "compute_sentence_piece_proto", + "title": "compute_sentence_piece_proto function", + "generate": ["keras_hub.tokenizers.compute_sentence_piece_proto"], + }, + ], +} + +PREPROCESSING_LAYERS_MASTER = { + "path": "preprocessing_layers/", + "title": "Preprocessing Layers", + "toc": True, + "children": [ + { + "path": "audio_converter", + "title": "AudioConverter layer", + "generate": ["keras_hub.layers.AudioConverter"], + }, + { + "path": "image_converter", + "title": "ImageConverter layer", + "generate": ["keras_hub.layers.ImageConverter"], + }, + { + "path": "resizing_image_converter", + "title": "ResizingImageConverter layer", + "generate": ["keras_hub.layers.ResizingImageConverter"], + }, + { + "path": "start_end_packer", + "title": "StartEndPacker layer", + "generate": ["keras_hub.layers.StartEndPacker"], + }, + { + "path": "multi_segment_packer", + "title": "MultiSegmentPacker layer", + "generate": ["keras_hub.layers.MultiSegmentPacker"], + }, + { + "path": "random_swap", + "title": "RandomSwap layer", + "generate": ["keras_hub.layers.RandomSwap"], + }, + { + "path": "random_deletion", + "title": "RandomDeletion layer", + "generate": ["keras_hub.layers.RandomDeletion"], + }, + { + "path": "masked_lm_mask_generator", + "title": "MaskedLMMaskGenerator layer", + "generate": ["keras_hub.layers.MaskedLMMaskGenerator"], + }, + ], +} + +MODELING_LAYERS_MASTER = { + "path": "modeling_layers/", + "title": "Modeling Layers", + "toc": True, + "children": [ + { + "path": "transformer_encoder", + "title": "TransformerEncoder layer", + "generate": [ + "keras_hub.layers.TransformerEncoder", + "keras_hub.layers.TransformerEncoder.call", + ], + }, + { + "path": "transformer_decoder", + "title": "TransformerDecoder layer", + "generate": [ + "keras_hub.layers.TransformerDecoder", + "keras_hub.layers.TransformerDecoder.call", + ], + }, + { + "path": "fnet_encoder", + "title": "FNetEncoder layer", + "generate": ["keras_hub.layers.FNetEncoder"], + }, + { + "path": "position_embedding", + "title": "PositionEmbedding layer", + "generate": ["keras_hub.layers.PositionEmbedding"], + }, + { + "path": "rotary_embedding", + "title": "RotaryEmbedding layer", + "generate": ["keras_hub.layers.RotaryEmbedding"], + }, + { + "path": "sine_position_encoding", + "title": "SinePositionEncoding layer", + "generate": ["keras_hub.layers.SinePositionEncoding"], + }, + { + "path": "reversible_embedding", + "title": "ReversibleEmbedding layer", + "generate": ["keras_hub.layers.ReversibleEmbedding"], + }, + { + "path": "token_and_position_embedding", + "title": "TokenAndPositionEmbedding layer", + "generate": ["keras_hub.layers.TokenAndPositionEmbedding"], + }, + { + "path": "alibi_bias", + "title": "AlibiBias layer", + "generate": ["keras_hub.layers.AlibiBias"], + }, + { + "path": "masked_lm_head", + "title": "MaskedLMHead layer", + "generate": ["keras_hub.layers.MaskedLMHead"], + }, + { + "path": "cached_multi_head_attention", + "title": "CachedMultiHeadAttention layer", + "generate": ["keras_hub.layers.CachedMultiHeadAttention"], + }, + ], +} + + +METRICS_MASTER = { + "path": "metrics/", + "title": "Metrics", + "toc": True, + "children": [ + { + "path": "perplexity", + "title": "Perplexity metric", + "generate": ["keras_hub.metrics.Perplexity"], + }, + ], +} + +HUB_API_MASTER = { + "path": "keras_hub/", + "title": "KerasHub", + "toc": True, + "children": [ + MODELS_MASTER, + BASE_CLASSES, + TOKENIZERS_MASTER, + 
PREPROCESSING_LAYERS_MASTER, + MODELING_LAYERS_MASTER, + SAMPLERS_MASTER, + METRICS_MASTER, + ], +} diff --git a/scripts/master.py b/scripts/master.py index 8425fe70bc..9db79961a6 100644 --- a/scripts/master.py +++ b/scripts/master.py @@ -42,11 +42,15 @@ "path": "keras_tuner/", "title": "KerasTuner: Hyperparameter Tuning", }, + { + "path": "keras_hub/", + "title": "KerasHub: Pretrained Models", + }, { "path": "keras_cv/", "title": "KerasCV: Computer Vision Workflows", }, - { + { "path": "keras_nlp/", "title": "KerasNLP: Natural Language Workflows", }, diff --git a/scripts/render_tags.py b/scripts/render_tags.py index cfa58c08e1..a8533a1dd3 100644 --- a/scripts/render_tags.py +++ b/scripts/render_tags.py @@ -1,4 +1,4 @@ -"""Custom rendering code for the /api/{keras_nlp|keras_cv}/models page. +"""Custom rendering code for the /api/{keras_hub|keras_cv}/models page. The model metadata is pulled from the library, each preset has a metadata dictionary as follows: @@ -55,17 +55,17 @@ def format_path(metadata): def is_base_class(symbol): - import keras_nlp + import keras_hub return symbol in ( - keras_nlp.models.Backbone, - keras_nlp.models.Tokenizer, - keras_nlp.models.Preprocessor, - keras_nlp.models.Task, - keras_nlp.models.Classifier, - keras_nlp.models.CausalLM, - keras_nlp.models.MaskedLM, - keras_nlp.models.Seq2SeqLM, + keras_hub.models.Backbone, + keras_hub.models.Tokenizer, + keras_hub.models.Preprocessor, + keras_hub.models.Task, + keras_hub.models.Classifier, + keras_hub.models.CausalLM, + keras_hub.models.MaskedLM, + keras_hub.models.Seq2SeqLM, ) @@ -83,17 +83,12 @@ def render_backbone_table(symbols): continue presets = symbol.presets # Only keep the ones with pretrained weights for KerasCV Backbones. - if issubclass(symbol, keras_cv.models.Backbone): - presets = symbol.presets_with_weights for preset in presets: if preset in added_presets: continue else: added_presets.add(preset) metadata = presets[preset]["metadata"] - # KerasCV backbones docs' URL has a "backbones/" path. - if issubclass(symbol, keras_cv.models.Backbone) and "path" in metadata: - metadata["path"] = "backbones/" + metadata["path"] table += ( f"{preset} | " f"{format_path(metadata)} | " @@ -106,59 +101,6 @@ def render_backbone_table(symbols): return table -def render_classifier_table(symbols): - """Renders the markdown table for classifier presets as a string.""" - - table = TABLE_HEADER - - # Classifier presets - for name, symbol in symbols: - if "Classifier" not in name: - continue - for preset in symbol.presets: - backbone_cls = symbol.backbone_cls - if backbone_cls is not None and preset not in backbone_cls.presets: - metadata = symbol.presets[preset]["metadata"] - table += ( - f"{preset} | " - f"{format_path(metadata)} | " - f"{format_param_count(metadata)} | " - f"{metadata['description']} \n" - ) - return table - - -def render_task_table(symbols): - """Renders the markdown table for Task presets as a string.""" - table = TABLE_HEADER - - for name, symbol in symbols: - if not inspect.isclass(symbol): - continue - if not issubclass(symbol, keras_cv.models.Task): - continue - for preset in symbol.presets: - # Do not print all backbone presets for a task - if ( - preset - in keras_cv.src.models.backbones.backbone_presets.backbone_presets - ): - continue - if preset not in symbol.presets_with_weights: - continue - # Only render the ones with pretrained_weights for KerasCV. - metadata = symbol.presets_with_weights[preset]["metadata"] - # KerasCV tasks docs' URL has a "tasks/" path. 
- metadata["path"] = "tasks/" + metadata["path"] - table += ( - f"{preset} | " - f"{format_path(metadata)} | " - f"{format_param_count(metadata)} | " - f"{metadata['description']} \n" - ) - return table - - def render_table(symbol): table = TABLE_HEADER_PER_MODEL if is_base_class(symbol) or len(symbol.presets) == 0: @@ -182,18 +124,10 @@ def render_table(symbol): def render_tags(template, lib): - """Replaces all custom KerasNLP/KerasCV tags with rendered content.""" + """Replaces all custom KerasHub/KerasCV tags with rendered content.""" symbols = lib.models.__dict__.items() if "{{backbone_presets_table}}" in template: template = template.replace( "{{backbone_presets_table}}", render_backbone_table(symbols) ) - if "{{classifier_presets_table}}" in template: - template = template.replace( - "{{classifier_presets_table}}", render_classifier_table(symbols) - ) - if "{{task_presets_table}}" in template: - template = template.replace( - "{{task_presets_table}}", render_task_table(symbols) - ) return template diff --git a/templates/api/keras_hub/index.md b/templates/api/keras_hub/index.md new file mode 100644 index 0000000000..faa0a651f0 --- /dev/null +++ b/templates/api/keras_hub/index.md @@ -0,0 +1,9 @@ +# KerasHub + +KerasHub is a toolbox of modular building blocks ranging from pretrained +state-of-the-art models, to low-level Transformer Encoder layers. For an +introduction to the library see the [KerasHub home page](/keras_hub). For a +high-level introduction to the API see our +[getting started guide](/guides/keras_hub/getting_started/). + +{{toc}} diff --git a/templates/api/keras_hub/layers/index.md b/templates/api/keras_hub/layers/index.md new file mode 100644 index 0000000000..9db5473315 --- /dev/null +++ b/templates/api/keras_hub/layers/index.md @@ -0,0 +1,8 @@ +# KerasHub Layers + +KerasHub layers are `keras.Layer` subclasses for NLP-specific use cases. + +These layers are building blocks for common NLP model architectures +(e.g. Transformers). + +{{toc}} diff --git a/templates/api/keras_hub/metrics/index.md b/templates/api/keras_hub/metrics/index.md new file mode 100644 index 0000000000..b9b37b27f4 --- /dev/null +++ b/templates/api/keras_hub/metrics/index.md @@ -0,0 +1,5 @@ +# KerasHub Metrics + +KerasHub metrics are `keras.Metric` subclasses for NLP-specific use cases. + +{{toc}} diff --git a/templates/api/keras_hub/models/bert/index.md b/templates/api/keras_hub/models/bert/index.md new file mode 100644 index 0000000000..2871f5edad --- /dev/null +++ b/templates/api/keras_hub/models/bert/index.md @@ -0,0 +1,9 @@ +# BERT + +Models, tokenizers, and preprocessing layers for BERT, +as described in ["BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding"](https://arxiv.org/abs/1810.04805). + +For a full list of available **presets**, see the +[models page](/api/keras_hub/models). + +{{toc}} diff --git a/templates/api/keras_hub/models/distil_bert/index.md b/templates/api/keras_hub/models/distil_bert/index.md new file mode 100644 index 0000000000..095e6a3bb8 --- /dev/null +++ b/templates/api/keras_hub/models/distil_bert/index.md @@ -0,0 +1,9 @@ +# DistilBERT + +Models, tokenizers, and preprocessing layers for DistilBERT, +as described in ["DistilBERT, a distilled version of BERT: smaller, faster, cheaper and lighter"](https://arxiv.org/abs/1910.01108). + +For a full list of available **presets**, see the +[models page](/api/keras_hub/models). 
+ +{{toc}} diff --git a/templates/api/keras_hub/models/index.md b/templates/api/keras_hub/models/index.md new file mode 100644 index 0000000000..3982cc3832 --- /dev/null +++ b/templates/api/keras_hub/models/index.md @@ -0,0 +1,34 @@ +# KerasHub Models + +KerasHub contains end-to-end implementations of popular model architectures. +These models can be created in two ways: + +- Through the `from_preset()` constructor, which instantiates an object with + a pre-trained configurations, vocabularies, and (optionally) weights. +- Through custom configuration controlled by the user. + +Below, we list all presets available in the library. For more detailed usage, +browse the docstring for a particular class. For an in depth introduction +to our API, see the [getting started guide](/guides/keras_hub/getting_started/). + +## Presets + +The following preset names correspond to a config and weights for a pretrained +model. Any task, preprocessor, backbone or tokenizer `from_preset()` can be used +to create a model from the saved preset. + +```python +backbone = keras_hub.models.Backbone.from_preset("bert_base_en") +tokenizer = keras_hub.models.Tokenizer.from_preset("bert_base_en") +classifier = keras_hub.models.TextClassifier.from_preset("bert_base_en", num_classes=2) +preprocessor = keras_hub.models.TextClassifierPreprocessor.from_preset("bert_base_en") +``` + +{{backbone_presets_table}} + +**Note**: The links provided will lead to the model card or to the official README, +if no model card has been provided by the author. + +## API Documentation + +{{toc}} diff --git a/templates/api/keras_hub/models/roberta/index.md b/templates/api/keras_hub/models/roberta/index.md new file mode 100644 index 0000000000..de765df2c0 --- /dev/null +++ b/templates/api/keras_hub/models/roberta/index.md @@ -0,0 +1,9 @@ +# RoBERTa + +Models, tokenizers, and preprocessing layers for RoBERTa, +as described in ["RoBERTa: A Robustly Optimized BERT Pretraining Approach"](https://arxiv.org/abs/1907.11692). + +For a full list of available **presets**, see the +[models page](/api/keras_hub/models). + +{{toc}} diff --git a/templates/api/keras_hub/models/xlm_roberta/index.md b/templates/api/keras_hub/models/xlm_roberta/index.md new file mode 100644 index 0000000000..e9cc86facc --- /dev/null +++ b/templates/api/keras_hub/models/xlm_roberta/index.md @@ -0,0 +1,9 @@ +# XLM-RoBERTa + +Models, tokenizers, and preprocessing layers for XLM-Roberta, +as described in ["Unsupervised Cross-lingual Representation Learning at Scale"](https://arxiv.org/abs/1911.02116). + +For a full list of available **presets**, see the +[models page](/api/keras_hub/models). + +{{toc}} diff --git a/templates/api/keras_hub/tokenizers/index.md b/templates/api/keras_hub/tokenizers/index.md new file mode 100644 index 0000000000..44e037619b --- /dev/null +++ b/templates/api/keras_hub/tokenizers/index.md @@ -0,0 +1,11 @@ +# KerasHub Tokenizers + +Tokenizers convert raw string input into integer input suitable for a Keras `Embedding` layer. +They can also convert back from predicted integer sequences to raw string output. + +All tokenizers subclass `keras_hub.tokenizers.Tokenizer`, which in turn +subclasses `keras.layers.Layer`. Tokenizers should generally be applied inside a +[tf.data.Dataset.map](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map) +for training, and can be included inside a `keras.Model` for inference. 
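(Editor's illustrative aside, not part of the file added above.) The tokenizers page just described applying tokenizers inside a `tf.data.Dataset.map` for training; a minimal sketch of that pattern, assuming the `bert_tiny_en_uncased` preset used elsewhere in this diff and a toy two-example dataset:

```python
import tensorflow as tf
import keras_hub

# Load a pretrained WordPiece tokenizer (preset name reused from the guides below).
tokenizer = keras_hub.tokenizers.BertTokenizer.from_preset("bert_tiny_en_uncased")

# Toy labeled text dataset standing in for a real corpus.
ds = tf.data.Dataset.from_tensor_slices(
    (["a great movie", "a terrible movie"], [1, 0])
)

# Tokenize inside tf.data for training, as recommended above; the model can then
# consume integer token ids, and the tokenizer can be included in the model for inference.
ds = ds.map(lambda x, y: (tokenizer(x), y), num_parallel_calls=tf.data.AUTOTUNE)
```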
+ +{{toc}} diff --git a/templates/api/keras_hub/utils/index.md b/templates/api/keras_hub/utils/index.md new file mode 100644 index 0000000000..b73489484c --- /dev/null +++ b/templates/api/keras_hub/utils/index.md @@ -0,0 +1,6 @@ +# KerasHub Utils + +Standalone utilitiy methods for KerasHub, including functions for generating +sequences of text with a model. + +{{toc}} diff --git a/templates/getting_started/benchmarks.md b/templates/getting_started/benchmarks.md index 0e1be1848b..0fcee3f563 100644 --- a/templates/getting_started/benchmarks.md +++ b/templates/getting_started/benchmarks.md @@ -21,7 +21,7 @@ selections. We are not measuring the best possible performance achievable by each framework, but the out-of-the-box performance of common user workflows. With this goal in -mind, we leveraged pre-existing implementations from KerasCV and KerasNLP for +mind, we leveraged pre-existing implementations from KerasCV and KerasHub for the Keras versions of the models. ## Hardware @@ -77,7 +77,7 @@ better. | **Mistral
(generate)** | 1 | NA | 743.28 | **679.30** | 11,054.67* | **679.30** | \* _LLM inference with the PyTorch backend is abnormally slow at this time -because KerasNLP uses static sequence padding, unlike HuggingFace. This will be +because KerasHub uses static sequence padding, unlike HuggingFace. This will be addressed soon._ ## Discussion diff --git a/templates/getting_started/ecosystem.md b/templates/getting_started/ecosystem.md index 6b2e6454e3..f24cc655fc 100644 --- a/templates/getting_started/ecosystem.md +++ b/templates/getting_started/ecosystem.md @@ -13,11 +13,11 @@ KerasTuner is an easy-to-use, scalable hyperparameter optimization framework tha --- -## KerasNLP +## KerasHub -[KerasNLP Documentation](/keras_nlp/) - [KerasNLP GitHub repository](https://github.com/keras-team/keras-nlp) +[KerasHub Documentation](/keras_hub/) - [KerasHub GitHub repository](https://github.com/keras-team/keras-hub) -KerasNLP is a natural language processing library that supports users through +KerasHub is a natural language processing library that supports users through their entire development cycle. Our workflows are built from modular components that have state-of-the-art preset weights and architectures when used out-of-the-box and are easily customizable when more control is needed. diff --git a/templates/getting_started/faq.md b/templates/getting_started/faq.md index b948724cc1..d38f8e846e 100644 --- a/templates/getting_started/faq.md +++ b/templates/getting_started/faq.md @@ -748,7 +748,7 @@ intermediate_output = intermediate_layer_model(data) ### How can I use pre-trained models in Keras? You could leverage the [models available in `keras.applications`](/api/applications/), -or the models available in [KerasCV](/keras_cv/) and [KerasNLP](/keras_nlp/). +or the models available in [KerasCV](/keras_cv/) and [KerasHub](/keras_hub/). --- diff --git a/templates/getting_started/index.md b/templates/getting_started/index.md index 0b6c03ff72..420dc2dfc4 100644 --- a/templates/getting_started/index.md +++ b/templates/getting_started/index.md @@ -40,13 +40,13 @@ To use Keras 3, you will also need to install a backend framework -- either JAX, If you install TensorFlow 2.15, you should reinstall Keras 3 afterwards. The cause is that `tensorflow==2.15` will overwrite your Keras installation with `keras==2.15`. This step is not necessary for TensorFlow versions 2.16 onwards as starting in TensorFlow 2.16, it will install Keras 3 by default. -### Installing KerasCV and KerasNLP +### Installing KerasCV and KerasHub -KerasCV and KerasNLP can be installed via pip: +KerasCV and KerasHub can be installed via pip: ``` pip install --upgrade keras-cv -pip install --upgrade keras-nlp +pip install --upgrade keras-hub pip install --upgrade keras ``` diff --git a/templates/guides/keras_hub/getting_started.md b/templates/guides/keras_hub/getting_started.md new file mode 100644 index 0000000000..51afbb54ac --- /dev/null +++ b/templates/guides/keras_hub/getting_started.md @@ -0,0 +1,1065 @@ +# Getting Started with KerasHub + +**Author:** [Jonathan Bischof](https://github.com/jbischof)
+**Date created:** 2022/12/15
+**Last modified:** 2023/07/01
+**Description:** An introduction to the KerasHub API. + + + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_hub/getting_started.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_hub/getting_started.py) + + + +--- +## Introduction + +KerasHub is a natural language processing library that supports users through +their entire development cycle. Our workflows are built from modular components +that have state-of-the-art preset weights and architectures when used +out-of-the-box and are easily customizable when more control is needed. + +This library is an extension of the core Keras API; all high-level modules are +[`Layers`](/api/layers/) or [`Models`](/api/models/). If you are familiar with Keras, +congratulations! You already understand most of KerasHub. + +KerasHub uses Keras 3 to work with any of TensorFlow, Pytorch and Jax. In the +guide below, we will use the `jax` backend for training our models, and +[tf.data](https://www.tensorflow.org/guide/data) for efficiently running our +input preprocessing. But feel free to mix things up! This guide runs in +TensorFlow or PyTorch backends with zero changes, simply update the +`KERAS_BACKEND` below. + +This guide demonstrates our modular approach using a sentiment analysis example at six +levels of complexity: + +* Inference with a pretrained classifier +* Fine tuning a pretrained backbone +* Fine tuning with user-controlled preprocessing +* Fine tuning a custom model +* Pretraining a backbone model +* Build and train your own transformer from scratch + +Throughout our guide, we use Professor Keras, the official Keras mascot, as a visual +reference for the complexity of the material: + +drawing + + +```python +!pip install -q --upgrade keras-hub +!pip install -q --upgrade keras # Upgrade to Keras 3. +``` + +```python +import os + +os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" + +import keras_hub +import keras + +# Use mixed precision to speed up all training in this guide. +keras.mixed_precision.set_global_policy("mixed_float16") +``` +
+``` + + +``` +
+--- +## API quickstart + +Our highest level API is `keras_hub.models`. These symbols cover the complete user +journey of converting strings to tokens, tokens to dense features, and dense features to +task-specific output. For each `XX` architecture (e.g., `Bert`), we offer the following +modules: + +* **Tokenizer**: `keras_hub.models.XXTokenizer` + * **What it does**: Converts strings to sequences of token ids. + * **Why it's important**: The raw bytes of a string are too high dimensional to be useful + features so we first map them to a small number of tokens, for example `"The quick brown + fox"` to `["the", "qu", "##ick", "br", "##own", "fox"]`. + * **Inherits from**: `keras.layers.Layer`. +* **Preprocessor**: `keras_hub.models.XXPreprocessor` + * **What it does**: Converts strings to a dictionary of preprocessed tensors consumed by + the backbone, starting with tokenization. + * **Why it's important**: Each model uses special tokens and extra tensors to understand + the input such as delimiting input segments and identifying padding tokens. Padding each + sequence to the same length improves computational efficiency. + * **Has a**: `XXTokenizer`. + * **Inherits from**: `keras.layers.Layer`. +* **Backbone**: `keras_hub.models.XXBackbone` + * **What it does**: Converts preprocessed tensors to dense features. *Does not handle + strings; call the preprocessor first.* + * **Why it's important**: The backbone distills the input tokens into dense features that + can be used in downstream tasks. It is generally pretrained on a language modeling task + using massive amounts of unlabeled data. Transferring this information to a new task is a + major breakthrough in modern NLP. + * **Inherits from**: `keras.Model`. +* **Task**: e.g., `keras_hub.models.XXClassifier` + * **What it does**: Converts strings to task-specific output (e.g., classification + probabilities). + * **Why it's important**: Task models combine string preprocessing and the backbone model + with task-specific `Layers` to solve a problem such as sentence classification, token + classification, or text generation. The additional `Layers` must be fine-tuned on labeled + data. + * **Has a**: `XXBackbone` and `XXPreprocessor`. + * **Inherits from**: `keras.Model`. + +Here is the modular hierarchy for `BertClassifier` (all relationships are compositional): + +drawing + +All modules can be used independently and have a `from_preset()` method in addition to +the standard constructor that instantiates the class with **preset** architecture and +weights (see examples below). + +--- +## Data + +We will use a running example of sentiment analysis of IMDB movie reviews. In this task, +we use the text to predict whether the review was positive (`label = 1`) or negative +(`label = 0`). + +We load the data using `keras.utils.text_dataset_from_directory`, which utilizes the +powerful `tf.data.Dataset` format for examples. + + +```python +!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +!tar -xf aclImdb_v1.tar.gz +!# Remove unsupervised examples +!rm -r aclImdb/train/unsup +``` + +```python +BATCH_SIZE = 16 +imdb_train = keras.utils.text_dataset_from_directory( + "aclImdb/train", + batch_size=BATCH_SIZE, +) +imdb_test = keras.utils.text_dataset_from_directory( + "aclImdb/test", + batch_size=BATCH_SIZE, +) + +# Inspect first review +# Format is (review text tensor, label tensor) +print(imdb_train.unbatch().take(1).get_single_element()) + +``` +
+``` + % Total % Received % Xferd Average Speed Time Time Time Current + Dload Upload Total Spent Left Speed +100 80.2M 100 80.2M 0 0 88.0M 0 --:--:-- --:--:-- --:--:-- 87.9M + +Found 25000 files belonging to 2 classes. +Found 25000 files belonging to 2 classes. +(, ) + +``` +
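+To make the modular hierarchy from the quickstart above concrete, here is a minimal
+sketch that loads each level of the `Bert` family from the same **preset** (purely
+illustrative; none of these objects are needed for the rest of the guide):
+
+
+```python
+# Each module can be instantiated independently from the same preset.
+tokenizer = keras_hub.models.BertTokenizer.from_preset("bert_tiny_en_uncased")
+preprocessor = keras_hub.models.BertPreprocessor.from_preset("bert_tiny_en_uncased")
+backbone = keras_hub.models.BertBackbone.from_preset("bert_tiny_en_uncased")
+classifier = keras_hub.models.BertClassifier.from_preset(
+    "bert_tiny_en_uncased", num_classes=2
+)
+
+# Strings -> token ids -> preprocessed tensors -> dense features.
+token_ids = tokenizer(["The quick brown fox"])
+features = preprocessor(["The quick brown fox"])
+dense_features = backbone(features)
+```
+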
+--- +## Inference with a pretrained classifier + +drawing + +The highest level module in KerasHub is a **task**. A **task** is a `keras.Model` +consisting of a (generally pretrained) **backbone** model and task-specific layers. +Here's an example using `keras_hub.models.BertClassifier`. + +**Note**: Outputs are the logits per class (e.g., `[0, 0]` is 50% chance of positive). The output is +[negative, positive] for binary classification. + + +```python +classifier = keras_hub.models.BertClassifier.from_preset("bert_tiny_en_uncased_sst2") +# Note: batched inputs expected so must wrap string in iterable +classifier.predict(["I love modular workflows in keras-hub!"]) +``` + +
+``` + 1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 689ms/step + +array([[-1.539, 1.543]], dtype=float16) + +``` +
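+Since these are logits, we can turn them into class probabilities with a softmax; a
+tiny illustrative sketch (not part of the original workflow):
+
+
+```python
+logits = classifier.predict(["I love modular workflows in keras-hub!"])
+# Convert [negative, positive] logits to probabilities that sum to one.
+probabilities = keras.ops.softmax(logits)
+print(probabilities)
+```
+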
+All **tasks** have a `from_preset` method that constructs a `keras.Model` instance with
+preset preprocessing, architecture and weights. This means that we can pass raw strings
+in any format accepted by a `keras.Model` and get output specific to our task.
+
+This particular **preset** is a `"bert_tiny_en_uncased"` **backbone** fine-tuned on
+`sst2`, another movie review sentiment analysis dataset (this time from Rotten Tomatoes).
+We use the `tiny` architecture for demo purposes, but larger models are recommended for
+SoTA performance. For all the task-specific presets available for `BertClassifier`, see
+our keras.io [models page](https://keras.io/api/keras_hub/models/).
+
+Let's evaluate our classifier on the IMDB dataset. You will note we don't need to
+call `keras.Model.compile` here. All **task** models like `BertClassifier` ship with
+compilation defaults, meaning we can just call `keras.Model.evaluate` directly. You
+can always call `compile` as normal to override these defaults (e.g. to add new metrics).
+
+The output below is `[loss, accuracy]`.
+
+
+```python
+classifier.evaluate(imdb_test)
+```
+
+``` + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 0.4610 - sparse_categorical_accuracy: 0.7882 + +[0.4630218744277954, 0.783519983291626] + +``` +
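+As an aside, overriding the compilation defaults is just a normal `compile()` call; a
+minimal sketch (the loss and optimizer below are illustrative choices, not the library
+defaults):
+
+
+```python
+classifier.compile(
+    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    optimizer=keras.optimizers.Adam(5e-5),
+    metrics=[keras.metrics.SparseCategoricalAccuracy()],
+)
+```
+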
+Our result is 78% accuracy without training anything. Not bad! + +--- +## Fine tuning a pretrained BERT backbone + +drawing + +When labeled text specific to our task is available, fine-tuning a custom classifier can +improve performance. If we want to predict IMDB review sentiment, using IMDB data should +perform better than Rotten Tomatoes data! And for many tasks, no relevant pretrained model +will be available (e.g., categorizing customer reviews). + +The workflow for fine-tuning is almost identical to above, except that we request a +**preset** for the **backbone**-only model rather than the entire classifier. When passed +a **backbone** **preset**, a **task** `Model` will randomly initialize all task-specific +layers in preparation for training. For all the **backbone** presets available for +`BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_hub/models/). + +To train your classifier, use `keras.Model.fit` as with any other +`keras.Model`. As with our inference example, we can rely on the compilation +defaults for the **task** and skip `keras.Model.compile`. As preprocessing is +included, we again pass the raw data. + + +```python +classifier = keras_hub.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", + num_classes=2, +) +classifier.fit( + imdb_train, + validation_data=imdb_test, + epochs=1, +) +``` + +
+``` + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 16s 9ms/step - loss: 0.5202 - sparse_categorical_accuracy: 0.7281 - val_loss: 0.3254 - val_sparse_categorical_accuracy: 0.8621 + + + +``` +
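+Because preprocessing is part of the **task**, the fine-tuned classifier can be called
+on raw strings right away; a quick illustrative check (the reviews below are made up):
+
+
+```python
+classifier.predict(
+    [
+        "An instant classic, I laughed the whole way through!",
+        "Two hours of my life I will never get back.",
+    ]
+)
+```
+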
+In the training output above, we see a significant lift in validation accuracy
+(0.78 -> 0.87) with a single epoch of training even though the IMDB dataset is much
+smaller than `sst2`.
+
+---
+## Fine tuning with user-controlled preprocessing
+drawing
+
+For some advanced training scenarios, users might prefer direct control over
+preprocessing. For large datasets, examples can be preprocessed in advance and saved to
+disk (see the sketch after the training run below) or preprocessed by a separate worker
+pool using `tf.data.experimental.service`. In other cases, custom preprocessing is
+needed to handle the inputs.
+
+Pass `preprocessor=None` to the constructor of a **task** `Model` to skip automatic
+preprocessing, or pass a custom `BertPreprocessor` instead.
+
+### Separate preprocessing from the same preset
+
+Each model architecture has a parallel **preprocessor** `Layer` with its own
+`from_preset` constructor. Using the same **preset** for this `Layer` will return the
+**preprocessor** that matches the **task**.
+
+In this workflow we train the model over three epochs using `tf.data.Dataset.cache()`,
+which computes the preprocessing once during the first epoch and reuses the cached
+result for the remaining epochs.
+
+**Note:** we can use `tf.data` for preprocessing while running on the
+JAX or PyTorch backend. The input dataset will automatically be converted to
+backend native tensor types during fit. In fact, given the efficiency of `tf.data`
+for running preprocessing, this is good practice on all backends.
+
+
+```python
+import tensorflow as tf
+
+preprocessor = keras_hub.models.BertPreprocessor.from_preset(
+    "bert_tiny_en_uncased",
+    sequence_length=512,
+)
+
+# Apply the preprocessor to every sample of train and test data using `map()`.
+# `tf.data.AUTOTUNE` and `prefetch()` are options to tune performance, see
+# https://www.tensorflow.org/guide/data_performance for details.
+
+# Note: only call `cache()` if your training data fits in CPU memory!
+imdb_train_cached = (
+    imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)
+)
+imdb_test_cached = (
+    imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)
+)
+
+classifier = keras_hub.models.BertClassifier.from_preset(
+    "bert_tiny_en_uncased", preprocessor=None, num_classes=2
+)
+classifier.fit(
+    imdb_train_cached,
+    validation_data=imdb_test_cached,
+    epochs=3,
+)
+```
+
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 15s 8ms/step - loss: 0.5194 - sparse_categorical_accuracy: 0.7272 - val_loss: 0.3032 - val_sparse_categorical_accuracy: 0.8728 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 10s 7ms/step - loss: 0.2871 - sparse_categorical_accuracy: 0.8805 - val_loss: 0.2809 - val_sparse_categorical_accuracy: 0.8818 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 10s 7ms/step - loss: 0.2134 - sparse_categorical_accuracy: 0.9178 - val_loss: 0.3043 - val_sparse_categorical_accuracy: 0.8790 + + + +``` +
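+As mentioned above, preprocessing can also be computed ahead of time and written to
+disk when the dataset is too large to cache in memory. A rough sketch using
+`tf.data.Dataset.save` and `tf.data.Dataset.load` (the directory name is just an
+example):
+
+
+```python
+# Precompute the preprocessing once and persist it to disk.
+imdb_train.map(preprocessor, tf.data.AUTOTUNE).save("imdb_train_preprocessed")
+
+# Later (possibly in a different process), reload the preprocessed examples.
+imdb_train_saved = tf.data.Dataset.load("imdb_train_preprocessed").prefetch(
+    tf.data.AUTOTUNE
+)
+```
+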
+After three epochs, our validation accuracy has only increased to 0.88. This is both a +function of the small size of our dataset and our model. To exceed 90% accuracy, try +larger **presets** such as `"bert_base_en_uncased"`. For all the **backbone** presets +available for `BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_hub/models/). + +### Custom preprocessing + +In cases where custom preprocessing is required, we offer direct access to the +`Tokenizer` class that maps raw strings to tokens. It also has a `from_preset()` +constructor to get the vocabulary matching pretraining. + +**Note:** `BertTokenizer` does not pad sequences by default, so the output is +ragged (each sequence has varying length). The `MultiSegmentPacker` below +handles padding these ragged sequences to dense tensor types (e.g. `tf.Tensor` +or `torch.Tensor`). + + +```python +tokenizer = keras_hub.models.BertTokenizer.from_preset("bert_tiny_en_uncased") +tokenizer(["I love modular workflows!", "Libraries over frameworks!"]) + +# Write your own packer or use one of our `Layers` +packer = keras_hub.layers.MultiSegmentPacker( + start_value=tokenizer.cls_token_id, + end_value=tokenizer.sep_token_id, + # Note: This cannot be longer than the preset's `sequence_length`, and there + # is no check for a custom preprocessor! + sequence_length=64, +) + + +# This function that takes a text sample `x` and its +# corresponding label `y` as input and converts the +# text into a format suitable for input into a BERT model. +def preprocessor(x, y): + token_ids, segment_ids = packer(tokenizer(x)) + x = { + "token_ids": token_ids, + "segment_ids": segment_ids, + "padding_mask": token_ids != 0, + } + return x, y + + +imdb_train_preprocessed = imdb_train.map(preprocessor, tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) +imdb_test_preprocessed = imdb_test.map(preprocessor, tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) + +# Preprocessed example +print(imdb_train_preprocessed.unbatch().take(1).get_single_element()) +``` + +
+``` +({'token_ids': , 'segment_ids': , 'padding_mask': }, ) + +``` +
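+These hand-preprocessed datasets plug into a **task** model the same way as before, by
+constructing it with `preprocessor=None`; a short sketch mirroring the cached workflow
+above:
+
+
+```python
+classifier = keras_hub.models.BertClassifier.from_preset(
+    "bert_tiny_en_uncased", preprocessor=None, num_classes=2
+)
+classifier.fit(
+    imdb_train_preprocessed,
+    validation_data=imdb_test_preprocessed,
+    epochs=1,
+)
+```
+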
+--- +## Fine tuning with a custom model +drawing + +For more advanced applications, an appropriate **task** `Model` may not be available. In +this case, we provide direct access to the **backbone** `Model`, which has its own +`from_preset` constructor and can be composed with custom `Layer`s. Detailed examples can +be found at our [transfer learning guide](https://keras.io/guides/transfer_learning/). + +A **backbone** `Model` does not include automatic preprocessing but can be paired with a +matching **preprocessor** using the same **preset** as shown in the previous workflow. + +In this workflow, we experiment with freezing our backbone model and adding two trainable +transformer layers to adapt to the new input. + +**Note**: We can ignore the warning about gradients for the `pooled_dense` layer because +we are using BERT's sequence output. + + +```python +preprocessor = keras_hub.models.BertPreprocessor.from_preset("bert_tiny_en_uncased") +backbone = keras_hub.models.BertBackbone.from_preset("bert_tiny_en_uncased") + +imdb_train_preprocessed = ( + imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) +imdb_test_preprocessed = ( + imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) + +backbone.trainable = False +inputs = backbone.input +sequence = backbone(inputs)["sequence_output"] +for _ in range(2): + sequence = keras_hub.layers.TransformerEncoder( + num_heads=2, + intermediate_dim=512, + dropout=0.1, + )(sequence) +# Use [CLS] token output to classify +outputs = keras.layers.Dense(2)(sequence[:, backbone.cls_token_index, :]) + +model = keras.Model(inputs, outputs) +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(5e-5), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) +model.summary() +model.fit( + imdb_train_preprocessed, + validation_data=imdb_test_preprocessed, + epochs=3, +) +``` + + +
Model: "functional_1"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)         Output Shape       Param #  Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ padding_mask        │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ segment_ids         │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ token_ids           │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ bert_backbone_3     │ [(None, 128),     │ 4,385,… │ padding_mask[0][0],  │
+│ (BertBackbone)      │ (None, None,      │         │ segment_ids[0][0],   │
+│                     │ 128)]             │         │ token_ids[0][0]      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ transformer_encoder │ (None, None, 128) │ 198,272 │ bert_backbone_3[0][ │
+│ (TransformerEncode… │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ transformer_encode… │ (None, None, 128) │ 198,272 │ transformer_encoder… │
+│ (TransformerEncode… │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ get_item_4          │ (None, 128)       │       0 │ transformer_encoder… │
+│ (GetItem)           │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ dense (Dense)       │ (None, 2)         │     258 │ get_item_4[0][0]     │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
+ + + + +
 Total params: 4,782,722 (18.24 MB)
+
+ + + + +
 Trainable params: 396,802 (1.51 MB)
+
+ + + + +
 Non-trainable params: 4,385,920 (16.73 MB)
+
+ + + +
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 17s 10ms/step - loss: 0.6208 - sparse_categorical_accuracy: 0.6612 - val_loss: 0.6119 - val_sparse_categorical_accuracy: 0.6758 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 12s 8ms/step - loss: 0.5324 - sparse_categorical_accuracy: 0.7347 - val_loss: 0.5484 - val_sparse_categorical_accuracy: 0.7320 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 12s 8ms/step - loss: 0.4735 - sparse_categorical_accuracy: 0.7723 - val_loss: 0.4874 - val_sparse_categorical_accuracy: 0.7742 + + + +``` +
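+Since this custom model has no built-in preprocessing, inference requires applying the
+matching **preprocessor** manually before calling the model; a small illustrative
+sketch:
+
+
+```python
+# Preprocess raw strings into the dict of tensors the backbone expects.
+preprocessed = preprocessor(["A wonderful, heartfelt film."])
+logits = model.predict(preprocessed)
+print(keras.ops.softmax(logits))
+```
+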
+This model achieves reasonable accuracy despite having only 10% of the trainable parameters +of our `BertClassifier` model. Each training step takes about 1/3 of the time---even +accounting for cached preprocessing. + +--- +## Pretraining a backbone model +drawing + +Do you have access to large unlabeled datasets in your domain? Are they around the +same size as used to train popular backbones such as BERT, RoBERTa, or GPT2 (XX+ GiB)? If +so, you might benefit from domain-specific pretraining of your own backbone models. + +NLP models are generally pretrained on a language modeling task, predicting masked words +given the visible words in an input sentence. For example, given the input +`"The fox [MASK] over the [MASK] dog"`, the model might be asked to predict `["jumped", "lazy"]`. +The lower layers of this model are then packaged as a **backbone** to be combined with +layers relating to a new task. + +The KerasHub library offers SoTA **backbones** and **tokenizers** to be trained from +scratch without presets. + +In this workflow, we pretrain a BERT **backbone** using our IMDB review text. We skip the +"next sentence prediction" (NSP) loss because it adds significant complexity to the data +processing and was dropped by later models like RoBERTa. See our e2e +[Transformer pretraining](https://keras.io/guides/keras_hub/transformer_pretraining/#pretraining) +for step-by-step details on how to replicate the original paper. + +### Preprocessing + + +```python +# All BERT `en` models have the same vocabulary, so reuse preprocessor from +# "bert_tiny_en_uncased" +preprocessor = keras_hub.models.BertPreprocessor.from_preset( + "bert_tiny_en_uncased", + sequence_length=256, +) +packer = preprocessor.packer +tokenizer = preprocessor.tokenizer + +# keras.Layer to replace some input tokens with the "[MASK]" token +masker = keras_hub.layers.MaskedLMMaskGenerator( + vocabulary_size=tokenizer.vocabulary_size(), + mask_selection_rate=0.25, + mask_selection_length=64, + mask_token_id=tokenizer.token_to_id("[MASK]"), + unselectable_token_ids=[ + tokenizer.token_to_id(x) for x in ["[CLS]", "[PAD]", "[SEP]"] + ], +) + + +def preprocess(inputs, label): + inputs = preprocessor(inputs) + masked_inputs = masker(inputs["token_ids"]) + # Split the masking layer outputs into a (features, labels, and weights) + # tuple that we can use with keras.Model.fit(). + features = { + "token_ids": masked_inputs["token_ids"], + "segment_ids": inputs["segment_ids"], + "padding_mask": inputs["padding_mask"], + "mask_positions": masked_inputs["mask_positions"], + } + labels = masked_inputs["mask_ids"] + weights = masked_inputs["mask_weights"] + return features, labels, weights + + +pretrain_ds = imdb_train.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) +pretrain_val_ds = imdb_test.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Tokens with ID 103 are "masked" +print(pretrain_ds.unbatch().take(1).get_single_element()) +``` + +
+``` +({'token_ids': , 'segment_ids': , 'padding_mask': , 'mask_positions': }, , ) + +``` +
+### Pretraining model + + +```python +# BERT backbone +backbone = keras_hub.models.BertBackbone( + vocabulary_size=tokenizer.vocabulary_size(), + num_layers=2, + num_heads=2, + hidden_dim=128, + intermediate_dim=512, +) + +# Language modeling head +mlm_head = keras_hub.layers.MaskedLMHead( + token_embedding=backbone.token_embedding, +) + +inputs = { + "token_ids": keras.Input(shape=(None,), dtype=tf.int32, name="token_ids"), + "segment_ids": keras.Input(shape=(None,), dtype=tf.int32, name="segment_ids"), + "padding_mask": keras.Input(shape=(None,), dtype=tf.int32, name="padding_mask"), + "mask_positions": keras.Input(shape=(None,), dtype=tf.int32, name="mask_positions"), +} + +# Encoded token sequence +sequence = backbone(inputs)["sequence_output"] + +# Predict an output word for each masked input token. +# We use the input token embedding to project from our encoded vectors to +# vocabulary logits, which has been shown to improve training efficiency. +outputs = mlm_head(sequence, mask_positions=inputs["mask_positions"]) + +# Define and compile our pretraining model. +pretraining_model = keras.Model(inputs, outputs) +pretraining_model.summary() +pretraining_model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(learning_rate=5e-4), + weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) + +# Pretrain on IMDB dataset +pretraining_model.fit( + pretrain_ds, + validation_data=pretrain_val_ds, + epochs=3, # Increase to 6 for higher accuracy +) +``` + + +
Model: "functional_3"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)         Output Shape       Param #  Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ mask_positions      │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ padding_mask        │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ segment_ids         │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ token_ids           │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ bert_backbone_4     │ [(None, 128),     │ 4,385,… │ mask_positions[0][0… │
+│ (BertBackbone)      │ (None, None,      │         │ padding_mask[0][0],  │
+│                     │ 128)]             │         │ segment_ids[0][0],   │
+│                     │                   │         │ token_ids[0][0]      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ masked_lm_head      │ (None, None,      │ 3,954,… │ bert_backbone_4[0][ │
+│ (MaskedLMHead)      │ 30522)            │         │ mask_positions[0][0] │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
+ + + + +
 Total params: 4,433,210 (16.91 MB)
+
+ + + + +
 Trainable params: 4,433,210 (16.91 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + + +
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 22s 12ms/step - loss: 5.7032 - sparse_categorical_accuracy: 0.0566 - val_loss: 5.0685 - val_sparse_categorical_accuracy: 0.1044 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 13s 8ms/step - loss: 5.0701 - sparse_categorical_accuracy: 0.1096 - val_loss: 4.9363 - val_sparse_categorical_accuracy: 0.1239 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 13s 8ms/step - loss: 4.9607 - sparse_categorical_accuracy: 0.1240 - val_loss: 4.7913 - val_sparse_categorical_accuracy: 0.1417 + + + +``` +
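+For example, the pretrained `backbone` can be saved with the standard Keras saving API
+and wrapped in a fresh **task** model for fine-tuning. A rough sketch (the file name is
+arbitrary, and the classifier below reuses the `preprocessor` from this section):
+
+
+```python
+# Persist the pretrained backbone.
+backbone.save("imdb_pretrained_backbone.keras")
+
+# Reuse it as the feature extractor of a new classifier.
+classifier = keras_hub.models.BertClassifier(
+    backbone=backbone,
+    preprocessor=preprocessor,
+    num_classes=2,
+)
+classifier.fit(imdb_train, validation_data=imdb_test, epochs=1)
+```
+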
+After pretraining save your `backbone` submodel to use in a new task! + +--- +## Build and train your own transformer from scratch +drawing + +Want to implement a novel transformer architecture? The KerasHub library offers all the +low-level modules used to build SoTA architectures in our `models` API. This includes the +`keras_hub.tokenizers` API which allows you to train your own subword tokenizer using +`WordPieceTokenizer`, `BytePairTokenizer`, or `SentencePieceTokenizer`. + +In this workflow, we train a custom tokenizer on the IMDB data and design a backbone with +custom transformer architecture. For simplicity, we then train directly on the +classification task. Interested in more details? We wrote an entire guide to pretraining +and finetuning a custom transformer on +[keras.io](https://keras.io/guides/keras_hub/transformer_pretraining/), + +### Train custom vocabulary from IMDB data + + +```python +vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( + imdb_train.map(lambda x, y: x), + vocabulary_size=20_000, + lowercase=True, + strip_accents=True, + reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"], +) +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( + vocabulary=vocab, + lowercase=True, + strip_accents=True, + oov_token="[UNK]", +) +``` + +### Preprocess data with a custom tokenizer + + +```python +packer = keras_hub.layers.StartEndPacker( + start_value=tokenizer.token_to_id("[START]"), + end_value=tokenizer.token_to_id("[END]"), + pad_value=tokenizer.token_to_id("[PAD]"), + sequence_length=512, +) + + +def preprocess(x, y): + token_ids = packer(tokenizer(x)) + return token_ids, y + + +imdb_preproc_train_ds = imdb_train.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +imdb_preproc_val_ds = imdb_test.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +print(imdb_preproc_train_ds.unbatch().take(1).get_single_element()) +``` + +
+``` +(, ) + +``` +
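+As a quick sanity check of the new vocabulary, we can round-trip a sentence through the
+tokenizer; a small illustrative sketch (the sentence is made up):
+
+
+```python
+sample = ["What an incredible movie, easily the best I have seen this year."]
+token_ids = tokenizer(sample)
+print(token_ids)
+print(tokenizer.detokenize(token_ids))
+```
+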
+### Design a tiny transformer + + +```python +token_id_input = keras.Input( + shape=(None,), + dtype="int32", + name="token_ids", +) +outputs = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=len(vocab), + sequence_length=packer.sequence_length, + embedding_dim=64, +)(token_id_input) +outputs = keras_hub.layers.TransformerEncoder( + num_heads=2, + intermediate_dim=128, + dropout=0.1, +)(outputs) +# Use "[START]" token to classify +outputs = keras.layers.Dense(2)(outputs[:, 0, :]) +model = keras.Model( + inputs=token_id_input, + outputs=outputs, +) + +model.summary() +``` + + +
Model: "functional_5"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Layer (type)                     Output Shape                  Param # ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ token_ids (InputLayer)          │ (None, None)              │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ token_and_position_embedding    │ (None, None, 64)          │  1,259,648 │
+│ (TokenAndPositionEmbedding)     │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_2           │ (None, None, 64)          │     33,472 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ get_item_6 (GetItem)            │ (None, 64)                │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dense_1 (Dense)                 │ (None, 2)                 │        130 │
+└─────────────────────────────────┴───────────────────────────┴────────────┘
+
+ + + + +
 Total params: 1,293,250 (4.93 MB)
+
+ + + + +
 Trainable params: 1,293,250 (4.93 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + + +### Train the transformer directly on the classification objective + + +```python +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(5e-5), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) +model.fit( + imdb_preproc_train_ds, + validation_data=imdb_preproc_val_ds, + epochs=3, +) +``` + +
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 8s 4ms/step - loss: 0.7790 - sparse_categorical_accuracy: 0.5367 - val_loss: 0.4420 - val_sparse_categorical_accuracy: 0.8120 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 0.3654 - sparse_categorical_accuracy: 0.8443 - val_loss: 0.3046 - val_sparse_categorical_accuracy: 0.8752 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 0.2471 - sparse_categorical_accuracy: 0.9019 - val_loss: 0.3060 - val_sparse_categorical_accuracy: 0.8748 + + + +``` +
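+To score new text with the trained model, we reuse the same `tokenizer` and `packer`
+from the preprocessing step; a short illustrative sketch:
+
+
+```python
+token_ids = packer(tokenizer(["This movie was an absolute delight."]))
+logits = model.predict(token_ids)
+print(keras.ops.softmax(logits))
+```
+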
+Excitingly, our custom classifier performs similarly to fine-tuning
+`"bert_tiny_en_uncased"`! To see the advantages of pretraining and exceed 90% accuracy, we
+would need to use larger **presets** such as `"bert_base_en_uncased"`.
diff --git a/templates/guides/keras_hub/transformer_pretraining.md b/templates/guides/keras_hub/transformer_pretraining.md
new file mode 100644
index 0000000000..15e94ea486
--- /dev/null
+++ b/templates/guides/keras_hub/transformer_pretraining.md
@@ -0,0 +1,635 @@
+# Pretraining a Transformer from scratch with KerasHub
+
+**Author:** [Matthew Watson](https://github.com/mattdangerw/)
+**Date created:** 2022/04/18
+**Last modified:** 2023/07/15
+**Description:** Use KerasHub to train a Transformer model from scratch. + + + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_hub/transformer_pretraining.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_hub/transformer_pretraining.py) + + + +KerasHub aims to make it easy to build state-of-the-art text processing models. In this +guide, we will show how library components simplify pretraining and fine-tuning a +Transformer model from scratch. + +This guide is broken into three parts: + +1. *Setup*, task definition, and establishing a baseline. +2. *Pretraining* a Transformer model. +3. *Fine-tuning* the Transformer model on our classification task. + +--- +## Setup + +The following guide uses Keras 3 to work in any of `tensorflow`, `jax` or +`torch`. We select the `jax` backend below, which will give us a particularly +fast train step below, but feel free to mix it up. + + +```python +!pip install -q --upgrade keras-hub +!pip install -q --upgrade keras # Upgrade to Keras 3. +``` + +```python +import os + +os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" + + +import keras_hub +import tensorflow as tf +import keras +``` +
+``` + +``` +
+Next up, we can download two datasets. + +- [SST-2](https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary) a text +classification dataset and our "end goal". This dataset is often used to benchmark +language models. +- [WikiText-103](https://paperswithcode.com/dataset/wikitext-103): A medium sized +collection of featured articles from English Wikipedia, which we will use for +pretraining. + +Finally, we will download a WordPiece vocabulary, to do sub-word tokenization later on in +this guide. + + +```python +# Download pretraining data. +keras.utils.get_file( + origin="https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip", + extract=True, +) +wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-103-raw/") + +# Download finetuning data. +keras.utils.get_file( + origin="https://dl.fbaipublicfiles.com/glue/data/SST-2.zip", + extract=True, +) +sst_dir = os.path.expanduser("~/.keras/datasets/SST-2/") + +# Download vocabulary data. +vocab_file = keras.utils.get_file( + origin="https://storage.googleapis.com/tensorflow/keras-hub/examples/bert/bert_vocab_uncased.txt", +) +``` + +Next, we define some hyperparameters we will use during training. + + +```python +# Preprocessing params. +PRETRAINING_BATCH_SIZE = 128 +FINETUNING_BATCH_SIZE = 32 +SEQ_LENGTH = 128 +MASK_RATE = 0.25 +PREDICTIONS_PER_SEQ = 32 + +# Model params. +NUM_LAYERS = 3 +MODEL_DIM = 256 +INTERMEDIATE_DIM = 512 +NUM_HEADS = 4 +DROPOUT = 0.1 +NORM_EPSILON = 1e-5 + +# Training params. +PRETRAINING_LEARNING_RATE = 5e-4 +PRETRAINING_EPOCHS = 8 +FINETUNING_LEARNING_RATE = 5e-5 +FINETUNING_EPOCHS = 3 +``` + +### Load data + +We load our data with [tf.data](https://www.tensorflow.org/guide/data), which will allow +us to define input pipelines for tokenizing and preprocessing text. + + +```python +# Load SST-2. +sst_train_ds = tf.data.experimental.CsvDataset( + sst_dir + "train.tsv", [tf.string, tf.int32], header=True, field_delim="\t" +).batch(FINETUNING_BATCH_SIZE) +sst_val_ds = tf.data.experimental.CsvDataset( + sst_dir + "dev.tsv", [tf.string, tf.int32], header=True, field_delim="\t" +).batch(FINETUNING_BATCH_SIZE) + +# Load wikitext-103 and filter out short lines. +wiki_train_ds = ( + tf.data.TextLineDataset(wiki_dir + "wiki.train.raw") + .filter(lambda x: tf.strings.length(x) > 100) + .batch(PRETRAINING_BATCH_SIZE) +) +wiki_val_ds = ( + tf.data.TextLineDataset(wiki_dir + "wiki.valid.raw") + .filter(lambda x: tf.strings.length(x) > 100) + .batch(PRETRAINING_BATCH_SIZE) +) + +# Take a peak at the sst-2 dataset. +print(sst_train_ds.unbatch().batch(4).take(1).get_single_element()) +``` + +
+``` +(, ) + +``` +
+You can see that our `SST-2` dataset contains relatively short snippets of movie review
+text. Our goal is to predict the sentiment of the snippet. A label of 1 indicates
+positive sentiment, and a label of 0 negative sentiment.
+
+### Establish a baseline
+
+As a first step, we will establish a baseline of good performance. We don't actually need
+KerasHub for this; we can just use core Keras layers.
+
+We will train a simple bag-of-words model, where we learn a positive or negative weight
+for each word in our vocabulary. A sample's score is simply the sum of the weights of all
+words that are present in the sample.
+
+
+```python
+# This layer will turn our input sentence into a list of 1s and 0s the same size as
+# our vocabulary, indicating whether a word is present or absent.
+multi_hot_layer = keras.layers.TextVectorization(
+    max_tokens=4000, output_mode="multi_hot"
+)
+multi_hot_layer.adapt(sst_train_ds.map(lambda x, y: x))
+multi_hot_ds = sst_train_ds.map(lambda x, y: (multi_hot_layer(x), y))
+multi_hot_val_ds = sst_val_ds.map(lambda x, y: (multi_hot_layer(x), y))
+
+# We then fit a logistic regression on top of that layer, and that's our entire
+# baseline model!
+
+inputs = keras.Input(shape=(4000,), dtype="int32")
+outputs = keras.layers.Dense(1, activation="sigmoid")(inputs)
+baseline_model = keras.Model(inputs, outputs)
+baseline_model.compile(loss="binary_crossentropy", metrics=["accuracy"])
+baseline_model.fit(multi_hot_ds, validation_data=multi_hot_val_ds, epochs=5)
+```
+
+``` +Epoch 1/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 2s 698us/step - accuracy: 0.6421 - loss: 0.6469 - val_accuracy: 0.7567 - val_loss: 0.5391 +Epoch 2/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 493us/step - accuracy: 0.7524 - loss: 0.5392 - val_accuracy: 0.7868 - val_loss: 0.4891 +Epoch 3/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 513us/step - accuracy: 0.7832 - loss: 0.4871 - val_accuracy: 0.7991 - val_loss: 0.4671 +Epoch 4/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 475us/step - accuracy: 0.7991 - loss: 0.4543 - val_accuracy: 0.8069 - val_loss: 0.4569 +Epoch 5/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 476us/step - accuracy: 0.8100 - loss: 0.4313 - val_accuracy: 0.8036 - val_loss: 0.4530 + + + +``` +
+A bag-of-words approach can be a fast and surprisingly powerful, especially when input +examples contain a large number of words. With shorter sequences, it can hit a +performance ceiling. + +To do better, we would like to build a model that can evaluate words *in context*. Instead +of evaluating each word in a void, we need to use the information contained in the +*entire ordered sequence* of our input. + +This runs us into a problem. `SST-2` is very small dataset, and there's simply not enough +example text to attempt to build a larger, more parameterized model that can learn on a +sequence. We would quickly start to overfit and memorize our training set, without any +increase in our ability to generalize to unseen examples. + +Enter **pretraining**, which will allow us to learn on a larger corpus, and transfer our +knowledge to the `SST-2` task. And enter **KerasHub**, which will allow us to pretrain a +particularly powerful model, the Transformer, with ease. + +--- +## Pretraining + +To beat our baseline, we will leverage the `WikiText103` dataset, an unlabeled +collection of Wikipedia articles that is much bigger than `SST-2`. + +We are going to train a *transformer*, a highly expressive model which will learn +to embed each word in our input as a low dimensional vector. Our wikipedia dataset has no +labels, so we will use an unsupervised training objective called the *Masked Language +Modeling* (MaskedLM) objective. + +Essentially, we will be playing a big game of "guess the missing word". For each input +sample we will obscure 25% of our input data, and train our model to predict the parts we +covered up. + +### Preprocess data for the MaskedLM task + +Our text preprocessing for the MaskedLM task will occur in two stages. + +1. Tokenize input text into integer sequences of token ids. +2. Mask certain positions in our input to predict on. + +To tokenize, we can use a `keras_hub.tokenizers.Tokenizer` -- the KerasHub building block +for transforming text into sequences of integer token ids. + +In particular, we will use `keras_hub.tokenizers.WordPieceTokenizer` which does +*sub-word* tokenization. Sub-word tokenization is popular when training models on large +text corpora. Essentially, it allows our model to learn from uncommon words, while not +requiring a massive vocabulary of every word in our training set. + +The second thing we need to do is mask our input for the MaskedLM task. To do this, we can use +`keras_hub.layers.MaskedLMMaskGenerator`, which will randomly select a set of tokens in each +input and mask them out. + +The tokenizer and the masking layer can both be used inside a call to +[tf.data.Dataset.map](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map). +We can use `tf.data` to efficiently pre-compute each batch on the CPU, while our GPU or TPU +works on training with the batch that came before. Because our masking layer will +choose new words to mask each time, each epoch over our dataset will give us a totally +new set of labels to train on. + + +```python +# Setting sequence_length will trim or pad the token outputs to shape +# (batch_size, SEQ_LENGTH). +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( + vocabulary=vocab_file, + sequence_length=SEQ_LENGTH, + lowercase=True, + strip_accents=True, +) +# Setting mask_selection_length will trim or pad the mask outputs to shape +# (batch_size, PREDICTIONS_PER_SEQ). 
+masker = keras_hub.layers.MaskedLMMaskGenerator( + vocabulary_size=tokenizer.vocabulary_size(), + mask_selection_rate=MASK_RATE, + mask_selection_length=PREDICTIONS_PER_SEQ, + mask_token_id=tokenizer.token_to_id("[MASK]"), +) + + +def preprocess(inputs): + inputs = tokenizer(inputs) + outputs = masker(inputs) + # Split the masking layer outputs into a (features, labels, and weights) + # tuple that we can use with keras.Model.fit(). + features = { + "token_ids": outputs["token_ids"], + "mask_positions": outputs["mask_positions"], + } + labels = outputs["mask_ids"] + weights = outputs["mask_weights"] + return features, labels, weights + + +# We use prefetch() to pre-compute preprocessed batches on the fly on the CPU. +pretrain_ds = wiki_train_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +pretrain_val_ds = wiki_val_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Preview a single input example. +# The masks will change each time you run the cell. +print(pretrain_val_ds.take(1).get_single_element()) +``` + +
+``` +({'token_ids': , 'mask_positions': }, , ) + +``` +
+The above block sorts our dataset into a `(features, labels, weights)` tuple, which can be +passed directly to `keras.Model.fit()`. + +We have two features: + +1. `"token_ids"`, where some tokens have been replaced with our mask token id. +2. `"mask_positions"`, which keeps track of which tokens we masked out. + +Our labels are simply the ids we masked out. + +Because not all sequences will have the same number of masks, we also keep a +`sample_weight` tensor, which removes padded labels from our loss function by giving them +zero weight. + +### Create the Transformer encoder + +KerasHub provides all the building blocks to quickly build a Transformer encoder. + +We use `keras_hub.layers.TokenAndPositionEmbedding` to first embed our input token ids. +This layer simultaneously learns two embeddings -- one for words in a sentence and another +for integer positions in a sentence. The output embedding is simply the sum of the two. + +Then we can add a series of `keras_hub.layers.TransformerEncoder` layers. These are the +bread and butter of the Transformer model, using an attention mechanism to attend to +different parts of the input sentence, followed by a multi-layer perceptron block. + +The output of this model will be a encoded vector per input token id. Unlike the +bag-of-words model we used as a baseline, this model will embed each token accounting for +the context in which it appeared. + + +```python +inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32") + +# Embed our tokens with a positional embedding. +embedding_layer = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=tokenizer.vocabulary_size(), + sequence_length=SEQ_LENGTH, + embedding_dim=MODEL_DIM, +) +outputs = embedding_layer(inputs) + +# Apply layer normalization and dropout to the embedding. +outputs = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs) +outputs = keras.layers.Dropout(rate=DROPOUT)(outputs) + +# Add a number of encoder blocks +for i in range(NUM_LAYERS): + outputs = keras_hub.layers.TransformerEncoder( + intermediate_dim=INTERMEDIATE_DIM, + num_heads=NUM_HEADS, + dropout=DROPOUT, + layer_norm_epsilon=NORM_EPSILON, + )(outputs) + +encoder_model = keras.Model(inputs, outputs) +encoder_model.summary() +``` + + +
Model: "functional_3"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Layer (type)                     Output Shape                  Param # ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ input_layer_1 (InputLayer)      │ (None, 128)               │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ token_and_position_embedding    │ (None, 128, 256)          │  7,846,400 │
+│ (TokenAndPositionEmbedding)     │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ layer_normalization             │ (None, 128, 256)          │        512 │
+│ (LayerNormalization)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dropout (Dropout)               │ (None, 128, 256)          │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder             │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_1           │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_2           │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+└─────────────────────────────────┴───────────────────────────┴────────────┘
+
+ + + + +
 Total params: 9,428,224 (287.73 MB)
+
+ + + + +
 Trainable params: 9,428,224 (287.73 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + + +### Pretrain the Transformer + +You can think of the `encoder_model` as it's own modular unit, it is the piece of our +model that we are really interested in for our downstream task. However we still need to +set up the encoder to train on the MaskedLM task; to do that we attach a +`keras_hub.layers.MaskedLMHead`. + +This layer will take as one input the token encodings, and as another the positions we +masked out in the original input. It will gather the token encodings we masked, and +transform them back in predictions over our entire vocabulary. + +With that, we are ready to compile and run pretraining. If you are running this in a +Colab, note that this will take about an hour. Training Transformer is famously compute +intensive, so even this relatively small Transformer will take some time. + + +```python +# Create the pretraining model by attaching a masked language model head. +inputs = { + "token_ids": keras.Input(shape=(SEQ_LENGTH,), dtype="int32", name="token_ids"), + "mask_positions": keras.Input( + shape=(PREDICTIONS_PER_SEQ,), dtype="int32", name="mask_positions" + ), +} + +# Encode the tokens. +encoded_tokens = encoder_model(inputs["token_ids"]) + +# Predict an output word for each masked input token. +# We use the input token embedding to project from our encoded vectors to +# vocabulary logits, which has been shown to improve training efficiency. +outputs = keras_hub.layers.MaskedLMHead( + token_embedding=embedding_layer.token_embedding, + activation="softmax", +)(encoded_tokens, mask_positions=inputs["mask_positions"]) + +# Define and compile our pretraining model. +pretraining_model = keras.Model(inputs, outputs) +pretraining_model.compile( + loss="sparse_categorical_crossentropy", + optimizer=keras.optimizers.AdamW(PRETRAINING_LEARNING_RATE), + weighted_metrics=["sparse_categorical_accuracy"], + jit_compile=True, +) + +# Pretrain the model on our wiki text dataset. +pretraining_model.fit( + pretrain_ds, + validation_data=pretrain_val_ds, + epochs=PRETRAINING_EPOCHS, +) + +# Save this base model for further finetuning. +encoder_model.save("encoder_model.keras") +``` + +
+``` +Epoch 1/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 242s 41ms/step - loss: 5.4679 - sparse_categorical_accuracy: 0.1353 - val_loss: 3.4570 - val_sparse_categorical_accuracy: 0.3522 +Epoch 2/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 234s 40ms/step - loss: 3.6031 - sparse_categorical_accuracy: 0.3396 - val_loss: 3.0514 - val_sparse_categorical_accuracy: 0.4032 +Epoch 3/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 232s 40ms/step - loss: 3.2609 - sparse_categorical_accuracy: 0.3802 - val_loss: 2.8858 - val_sparse_categorical_accuracy: 0.4240 +Epoch 4/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 233s 40ms/step - loss: 3.1099 - sparse_categorical_accuracy: 0.3978 - val_loss: 2.7897 - val_sparse_categorical_accuracy: 0.4375 +Epoch 5/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 235s 40ms/step - loss: 3.0145 - sparse_categorical_accuracy: 0.4090 - val_loss: 2.7504 - val_sparse_categorical_accuracy: 0.4419 +Epoch 6/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 252s 43ms/step - loss: 2.9530 - sparse_categorical_accuracy: 0.4157 - val_loss: 2.6925 - val_sparse_categorical_accuracy: 0.4474 +Epoch 7/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 232s 40ms/step - loss: 2.9088 - sparse_categorical_accuracy: 0.4210 - val_loss: 2.6554 - val_sparse_categorical_accuracy: 0.4513 +Epoch 8/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 236s 40ms/step - loss: 2.8721 - sparse_categorical_accuracy: 0.4250 - val_loss: 2.6389 - val_sparse_categorical_accuracy: 0.4548 + +``` +
+--- +## Fine-tuning + +After pretraining, we can now fine-tune our model on the `SST-2` dataset. We can +leverage the ability of the encoder we build to predict on words in context to boost +our performance on the downstream task. + +### Preprocess data for classification + +Preprocessing for fine-tuning is much simpler than for our pretraining MaskedLM task. We just +tokenize our input sentences and we are ready for training! + + +```python + +def preprocess(sentences, labels): + return tokenizer(sentences), labels + + +# We use prefetch() to pre-compute preprocessed batches on the fly on our CPU. +finetune_ds = sst_train_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +finetune_val_ds = sst_val_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Preview a single input example. +print(finetune_val_ds.take(1).get_single_element()) +``` + +
+``` +(, ) + +``` +
+### Fine-tune the Transformer + +To go from our encoded token output to a classification prediction, we need to attach +another "head" to our Transformer model. We can afford to be simple here. We pool +the encoded tokens together, and use a single dense layer to make a prediction. + + +```python +# Reload the encoder model from disk so we can restart fine-tuning from scratch. +encoder_model = keras.models.load_model("encoder_model.keras", compile=False) + +# Take as input the tokenized input. +inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32") + +# Encode and pool the tokens. +encoded_tokens = encoder_model(inputs) +pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens[0]) + +# Predict an output label. +outputs = keras.layers.Dense(1, activation="sigmoid")(pooled_tokens) + +# Define and compile our fine-tuning model. +finetuning_model = keras.Model(inputs, outputs) +finetuning_model.compile( + loss="binary_crossentropy", + optimizer=keras.optimizers.AdamW(FINETUNING_LEARNING_RATE), + metrics=["accuracy"], +) + +# Finetune the model for the SST-2 task. +finetuning_model.fit( + finetune_ds, + validation_data=finetune_val_ds, + epochs=FINETUNING_EPOCHS, +) +``` + +
+``` +Epoch 1/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 21s 9ms/step - accuracy: 0.7500 - loss: 0.4891 - val_accuracy: 0.8036 - val_loss: 0.4099 +Epoch 2/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 16s 8ms/step - accuracy: 0.8826 - loss: 0.2779 - val_accuracy: 0.8482 - val_loss: 0.3964 +Epoch 3/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 16s 8ms/step - accuracy: 0.9176 - loss: 0.2066 - val_accuracy: 0.8549 - val_loss: 0.4142 + + + +``` +
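+As a quick check, the fine-tuned model can score new sentences directly after
+tokenization; a small illustrative sketch (scores near 1 are positive, near 0 negative):
+
+
+```python
+examples = tokenizer(
+    ["an unforgettable, deeply moving film", "a dull and lifeless mess"]
+)
+print(finetuning_model.predict(examples))
+```
+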
+Pretraining was enough to boost our performance to 84%, and this is hardly the ceiling
+for Transformer models. You may have noticed during pretraining that our validation
+performance was still steadily increasing. Our model is still significantly undertrained.
+Training for more epochs, training a larger Transformer, and training on more unlabeled
+text would all continue to boost performance significantly.
+
+One of the key goals of KerasHub is to provide a modular approach to NLP model building.
+We have shown one approach to building a Transformer here, but KerasHub supports an
+ever-growing array of components for preprocessing text and building models. We hope it
+makes it easier to experiment with solutions to your natural language problems.
diff --git a/templates/guides/keras_hub/upload.md b/templates/guides/keras_hub/upload.md
new file mode 100644
index 0000000000..3817d354e9
--- /dev/null
+++ b/templates/guides/keras_hub/upload.md
@@ -0,0 +1,308 @@
+# Uploading Models with KerasHub
+
+**Author:** [Samaneh Saadat](https://github.com/SamanehSaadat/), [Matthew Watson](https://github.com/mattdangerw/)
+**Date created:** 2024/04/29
+**Last modified:** 2024/04/29
+**Description:** An introduction on how to upload a fine-tuned KerasHub model to model hubs. + + + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_hub/upload.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_hub/upload.py) + + + +# Introduction + +Fine-tuning a machine learning model can yield impressive results for specific tasks. +Uploading your fine-tuned model to a model hub allows you to share it with the broader community. +By sharing your models, you'll enhance accessibility for other researchers and developers, +making your contributions an integral part of the machine learning landscape. +This can also streamline the integration of your model into real-world applications. + +This guide walks you through how to upload your fine-tuned models to popular model hubs such as +[Kaggle Models](https://www.kaggle.com/models) and [Hugging Face Hub](https://huggingface.co/models). + +# Setup + +Let's start by installing and importing all the libraries we need. We use KerasHub for this guide. + + +```python +!pip install -q --upgrade keras-hub huggingface-hub kagglehub +``` + + +```python +import os + +os.environ["KERAS_BACKEND"] = "jax" + +import keras_hub + +``` + +# Data + +We can use the IMDB reviews dataset for this guide. Let's load the dataset from `tensorflow_dataset`. + + +```python +import tensorflow_datasets as tfds + +imdb_train, imdb_test = tfds.load( + "imdb_reviews", + split=["train", "test"], + as_supervised=True, + batch_size=4, +) +``` + +We only use a small subset of the training samples to make the guide run faster. +However, if you need a higher quality model, consider using a larger number of training samples. + + +```python +imdb_train = imdb_train.take(100) +``` + +# Task Upload + +A `keras_hub.models.Task`, wraps a `keras_hub.models.Backbone` and a `keras_hub.models.Preprocessor` to create +a model that can be directly used for training, fine-tuning, and prediction for a given text problem. +In this section, we explain how to create a `Task`, fine-tune and upload it to a model hub. + +--- +## Load Model + +If you want to build a Causal LM based on a base model, simply call `keras_hub.models.CausalLM.from_preset` +and pass a built-in preset identifier. + + +```python +causal_lm = keras_hub.models.CausalLM.from_preset("gpt2_base_en") + +``` + +
+``` +Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/task.json... + +Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/preprocessor.json... + +``` +
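+Before fine-tuning, we can sanity-check the loaded model with a quick generation (the
+prompt is illustrative):
+
+
+```python
+output = causal_lm.generate("I really enjoyed this movie because", max_length=64)
+print(output)
+```
+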
+--- +## Fine-tune Model + +After loading the model, you can call `.fit()` on the model to fine-tune it. +Here, we fine-tune the model on the IMDB reviews which makes the model movie domain-specific. + + +```python +# Drop labels and keep the review text only for the Causal LM. +imdb_train_reviews = imdb_train.map(lambda x, y: x) + +# Fine-tune the Causal LM. +causal_lm.fit(imdb_train_reviews) +``` + 100/100 ━━━━━━━━━━━━━━━━━━━━ 151s 1s/step - loss: 1.0198 - sparse_categorical_accuracy: 0.3271 + +--- +## Save the Model Locally + +To upload a model, you need to first save the model locally using `save_to_preset`. + + +```python +preset_dir = "./gpt2_imdb" +causal_lm.save_to_preset(preset_dir) +``` + +Let's see the saved files. + + +```python +os.listdir(preset_dir) +``` + + + + +
+``` +['preprocessor.json', + 'tokenizer.json', + 'task.json', + 'model.weights.h5', + 'config.json', + 'metadata.json', + 'assets'] + +``` +
+### Load a Locally Saved Model
+
+A model that is saved to a local preset can be loaded using `from_preset`.
+What you save is what you get back out.
+
+
+```python
+causal_lm = keras_hub.models.CausalLM.from_preset(preset_dir)
+```
+
+You can also load the `keras_hub.models.Backbone` and `keras_hub.models.Tokenizer` objects from this preset directory.
+Note that these objects are equivalent to `causal_lm.backbone` and `causal_lm.preprocessor.tokenizer` above.
+
+
+```python
+backbone = keras_hub.models.Backbone.from_preset(preset_dir)
+tokenizer = keras_hub.models.Tokenizer.from_preset(preset_dir)
+```
+
+---
+## Upload the Model to a Model Hub
+
+After saving a preset to a directory, this directory can be uploaded to a model hub such as Kaggle or Hugging Face directly from the KerasHub library.
+To upload the model to Kaggle, the URI must start with `kaggle://`; to upload to Hugging Face, it should start with `hf://`.
+
+### Upload to Kaggle
+
+To upload a model to Kaggle, first, we need to authenticate with Kaggle.
+This can be done in one of the following ways:
+1. Set environment variables `KAGGLE_USERNAME` and `KAGGLE_KEY`.
+2. Provide a local `~/.kaggle/kaggle.json`.
+3. Call `kagglehub.login()`.
+
+Let's make sure we are logged in before continuing.
+
+
+```python
+import kagglehub
+
+if "KAGGLE_USERNAME" not in os.environ or "KAGGLE_KEY" not in os.environ:
+    kagglehub.login()
+
+```
+
+To upload a model, we can use the `keras_hub.upload_preset(uri, preset_dir)` API, where `uri` has the format
+`kaggle://<KAGGLE_USERNAME>/<MODEL>/keras/<VARIATION>` for uploading to Kaggle and `preset_dir` is the directory that the model is saved in.
+
+Running the following uploads the model that is saved in `preset_dir` to Kaggle:
+
+
+```python
+kaggle_username = kagglehub.whoami()["username"]
+kaggle_uri = f"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb"
+keras_hub.upload_preset(kaggle_uri, preset_dir)
+```
+
+``` +Upload successful: preprocessor.json (834B) +Upload successful: tokenizer.json (322B) +Upload successful: task.json (2KB) +Upload successful: model.weights.h5 (475MB) +Upload successful: config.json (431B) +Upload successful: metadata.json (142B) +Upload successful: merges.txt (446KB) +Upload successful: vocabulary.json (1018KB) + +Your model instance version has been created. + +``` +
+### Upload to Hugging Face
+
+To upload a model to Hugging Face, first, we need to authenticate with Hugging Face.
+This can be done in one of the following ways:
+1. Set environment variables `HF_USERNAME` and `HF_TOKEN`.
+2. Call `huggingface_hub.notebook_login()`.
+
+Let's make sure we are logged in before continuing.
+
+
+```python
+import huggingface_hub
+
+if "HF_USERNAME" not in os.environ or "HF_TOKEN" not in os.environ:
+    huggingface_hub.notebook_login()
+```
+
+`keras_hub.upload_preset(uri, preset_dir)` can be used to upload a model to Hugging Face if `uri` has the format
+`hf://<HF_USERNAME>/<MODEL>`.
+
+Running the following uploads the model that is saved in `preset_dir` to Hugging Face:
+
+
+```python
+hf_username = huggingface_hub.whoami()["name"]
+hf_uri = f"hf://{hf_username}/gpt2_imdb"
+keras_hub.upload_preset(hf_uri, preset_dir)
+
+```
+
+---
+## Load a User Uploaded Model
+
+After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`.
+
+```python
+causal_lm = keras_hub.models.CausalLM.from_preset(
+    f"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb"
+)
+```
+
+We can also load the model uploaded to Hugging Face by calling `from_preset`.
+
+```python
+causal_lm = keras_hub.models.CausalLM.from_preset(f"hf://{hf_username}/gpt2_imdb")
+```
+
+# Classifier Upload
+
+Uploading a classifier model is similar to Causal LM upload.
+To upload a fine-tuned model, it should first be saved to a local directory using the `save_to_preset`
+API; it can then be uploaded via `keras_hub.upload_preset`.
+
+
+```python
+# Load the base model.
+classifier = keras_hub.models.Classifier.from_preset(
+    "bert_tiny_en_uncased", num_classes=2
+)
+
+# Fine-tune the classifier.
+classifier.fit(imdb_train)
+
+# Save the model to a local preset directory.
+preset_dir = "./bert_tiny_imdb"
+classifier.save_to_preset(preset_dir)
+
+# Upload to Kaggle.
+keras_hub.upload_preset(
+    f"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb", preset_dir
+)
+```
+    100/100 ━━━━━━━━━━━━━━━━━━━━ 7s 31ms/step - loss: 0.6975 - sparse_categorical_accuracy: 0.5164
+
+
+``` +Upload successful: preprocessor.json (947B) +Upload successful: tokenizer.json (461B) +Upload successful: task.json (2KB) +Upload successful: task.weights.h5 (50MB) +Upload successful: model.weights.h5 (17MB) +Upload successful: config.json (454B) +Upload successful: metadata.json (140B) +Upload successful: vocabulary.txt (226KB) + +Your model instance version has been created. +``` +
+After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`. + +```python +classifier = keras_hub.models.Classifier.from_preset( + f"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb" +) +``` \ No newline at end of file diff --git a/templates/guides/keras_nlp/getting_started.md b/templates/guides/keras_nlp/getting_started.md new file mode 100644 index 0000000000..92da7dccd1 --- /dev/null +++ b/templates/guides/keras_nlp/getting_started.md @@ -0,0 +1,1065 @@ +# Getting Started with KerasNLP + +**Author:** [Jonathan Bischof](https://github.com/jbischof)
+**Date created:** 2022/12/15
+**Last modified:** 2023/07/01
+**Description:** An introduction to the KerasNLP API.
+
+
+ [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_nlp/getting_started.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_nlp/getting_started.py)
+
+
+
+---
+## Introduction
+
+KerasNLP is a natural language processing library that supports users through
+their entire development cycle. Our workflows are built from modular components
+that have state-of-the-art preset weights and architectures when used
+out-of-the-box and are easily customizable when more control is needed.
+
+This library is an extension of the core Keras API; all high-level modules are
+[`Layers`](/api/layers/) or [`Models`](/api/models/). If you are familiar with Keras,
+congratulations! You already understand most of KerasNLP.
+
+KerasNLP uses Keras 3 to work with any of TensorFlow, PyTorch and JAX. In the
+guide below, we will use the `jax` backend for training our models, and
+[tf.data](https://www.tensorflow.org/guide/data) for efficiently running our
+input preprocessing. But feel free to mix things up! This guide runs in the
+TensorFlow or PyTorch backends with zero changes; simply update the
+`KERAS_BACKEND` below.
+
+This guide demonstrates our modular approach using a sentiment analysis example at six
+levels of complexity:
+
+* Inference with a pretrained classifier
+* Fine tuning a pretrained backbone
+* Fine tuning with user-controlled preprocessing
+* Fine tuning a custom model
+* Pretraining a backbone model
+* Build and train your own transformer from scratch
+
+Throughout our guide, we use Professor Keras, the official Keras mascot, as a visual
+reference for the complexity of the material:
+
+drawing
+
+
+```python
+!pip install -q --upgrade keras-nlp
+!pip install -q --upgrade keras  # Upgrade to Keras 3.
+```
+
+```python
+import os
+
+os.environ["KERAS_BACKEND"] = "jax"  # or "tensorflow" or "torch"
+
+import keras_nlp
+import keras
+
+# Use mixed precision to speed up all training in this guide.
+keras.mixed_precision.set_global_policy("mixed_float16")
+```
+
+``` + + +``` +
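+As a quick check, you can confirm which backend Keras 3 picked up from the environment
+variable set above. This is a small sketch; `keras.backend.backend()` simply reports the
+active backend name.
+
+
+```python
+# Should print "jax", given the KERAS_BACKEND value set above.
+print(keras.backend.backend())
+```
+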
+--- +## API quickstart + +Our highest level API is `keras_nlp.models`. These symbols cover the complete user +journey of converting strings to tokens, tokens to dense features, and dense features to +task-specific output. For each `XX` architecture (e.g., `Bert`), we offer the following +modules: + +* **Tokenizer**: `keras_nlp.models.XXTokenizer` + * **What it does**: Converts strings to sequences of token ids. + * **Why it's important**: The raw bytes of a string are too high dimensional to be useful + features so we first map them to a small number of tokens, for example `"The quick brown + fox"` to `["the", "qu", "##ick", "br", "##own", "fox"]`. + * **Inherits from**: `keras.layers.Layer`. +* **Preprocessor**: `keras_nlp.models.XXPreprocessor` + * **What it does**: Converts strings to a dictionary of preprocessed tensors consumed by + the backbone, starting with tokenization. + * **Why it's important**: Each model uses special tokens and extra tensors to understand + the input such as delimiting input segments and identifying padding tokens. Padding each + sequence to the same length improves computational efficiency. + * **Has a**: `XXTokenizer`. + * **Inherits from**: `keras.layers.Layer`. +* **Backbone**: `keras_nlp.models.XXBackbone` + * **What it does**: Converts preprocessed tensors to dense features. *Does not handle + strings; call the preprocessor first.* + * **Why it's important**: The backbone distills the input tokens into dense features that + can be used in downstream tasks. It is generally pretrained on a language modeling task + using massive amounts of unlabeled data. Transferring this information to a new task is a + major breakthrough in modern NLP. + * **Inherits from**: `keras.Model`. +* **Task**: e.g., `keras_nlp.models.XXClassifier` + * **What it does**: Converts strings to task-specific output (e.g., classification + probabilities). + * **Why it's important**: Task models combine string preprocessing and the backbone model + with task-specific `Layers` to solve a problem such as sentence classification, token + classification, or text generation. The additional `Layers` must be fine-tuned on labeled + data. + * **Has a**: `XXBackbone` and `XXPreprocessor`. + * **Inherits from**: `keras.Model`. + +Here is the modular hierarchy for `BertClassifier` (all relationships are compositional): + +drawing + +All modules can be used independently and have a `from_preset()` method in addition to +the standard constructor that instantiates the class with **preset** architecture and +weights (see examples below). + +--- +## Data + +We will use a running example of sentiment analysis of IMDB movie reviews. In this task, +we use the text to predict whether the review was positive (`label = 1`) or negative +(`label = 0`). + +We load the data using `keras.utils.text_dataset_from_directory`, which utilizes the +powerful `tf.data.Dataset` format for examples. + + +```python +!curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +!tar -xf aclImdb_v1.tar.gz +!# Remove unsupervised examples +!rm -r aclImdb/train/unsup +``` + +```python +BATCH_SIZE = 16 +imdb_train = keras.utils.text_dataset_from_directory( + "aclImdb/train", + batch_size=BATCH_SIZE, +) +imdb_test = keras.utils.text_dataset_from_directory( + "aclImdb/test", + batch_size=BATCH_SIZE, +) + +# Inspect first review +# Format is (review text tensor, label tensor) +print(imdb_train.unbatch().take(1).get_single_element()) + +``` +
+``` + % Total % Received % Xferd Average Speed Time Time Time Current + Dload Upload Total Spent Left Speed +100 80.2M 100 80.2M 0 0 88.0M 0 --:--:-- --:--:-- --:--:-- 87.9M + +Found 25000 files belonging to 2 classes. +Found 25000 files belonging to 2 classes. +(, ) + +``` +
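+As an aside, the modular hierarchy described in the API quickstart can be spelled out
+explicitly. The sketch below is illustrative only; it uses the same four `Bert` classes
+and the `bert_tiny_en_uncased` preset that appear throughout the rest of this guide.
+
+
+```python
+# Each level of the hierarchy can be instantiated independently from the same preset.
+tokenizer = keras_nlp.models.BertTokenizer.from_preset("bert_tiny_en_uncased")
+preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_tiny_en_uncased")
+backbone = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased")
+classifier = keras_nlp.models.BertClassifier.from_preset(
+    "bert_tiny_en_uncased", num_classes=2
+)
+
+# Tokenizer: strings -> token ids.
+# Preprocessor: strings -> backbone-ready tensors (has a Tokenizer).
+# Backbone: preprocessed tensors -> dense features.
+# Task: strings -> task-specific output (has a Backbone and a Preprocessor).
+```
+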
+--- +## Inference with a pretrained classifier + +drawing + +The highest level module in KerasNLP is a **task**. A **task** is a `keras.Model` +consisting of a (generally pretrained) **backbone** model and task-specific layers. +Here's an example using `keras_nlp.models.BertClassifier`. + +**Note**: Outputs are the logits per class (e.g., `[0, 0]` is 50% chance of positive). The output is +[negative, positive] for binary classification. + + +```python +classifier = keras_nlp.models.BertClassifier.from_preset("bert_tiny_en_uncased_sst2") +# Note: batched inputs expected so must wrap string in iterable +classifier.predict(["I love modular workflows in keras-nlp!"]) +``` + +
+``` + 1/1 ━━━━━━━━━━━━━━━━━━━━ 1s 689ms/step + +array([[-1.539, 1.543]], dtype=float16) + +``` +
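+To turn the logits above into class probabilities, you can apply a softmax yourself; a
+minimal sketch using `keras.ops`:
+
+
+```python
+logits = classifier.predict(["I love modular workflows in keras-nlp!"])
+# Softmax over the last axis gives [P(negative), P(positive)].
+probabilities = keras.ops.softmax(logits, axis=-1)
+print(probabilities)
+```
+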
+All **tasks** have a `from_preset` method that constructs a `keras.Model` instance with
+preset preprocessing, architecture and weights. This means that we can pass raw strings
+in any format accepted by a `keras.Model` and get output specific to our task.
+
+This particular **preset** is a `"bert_tiny_en_uncased"` **backbone** fine-tuned on
+`sst2`, another movie review sentiment analysis dataset (this time from Rotten Tomatoes). We use
+the `tiny` architecture for demo purposes, but larger models are recommended for SoTA
+performance. For all the task-specific presets available for `BertClassifier`, see
+our keras.io [models page](https://keras.io/api/keras_nlp/models/).
+
+Let's evaluate our classifier on the IMDB dataset. You will note we don't need to
+call `keras.Model.compile` here. All **task** models like `BertClassifier` ship with
+compilation defaults, meaning we can just call `keras.Model.evaluate` directly. You
+can always call `compile` as normal to override these defaults (e.g. to add new metrics);
+a short sketch follows the output below.
+
+The output below is `[loss, accuracy]`.
+
+
+```python
+classifier.evaluate(imdb_test)
+```
+
+``` + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - loss: 0.4610 - sparse_categorical_accuracy: 0.7882 + +[0.4630218744277954, 0.783519983291626] + +``` +
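+As noted above, the compilation defaults can be overridden by calling `compile()` as
+usual, for example to track extra metrics before re-running `evaluate()`. A minimal
+sketch (the optimizer, learning rate, and metric choices here are illustrative, not the
+shipped defaults):
+
+
+```python
+classifier.compile(
+    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    optimizer=keras.optimizers.Adam(5e-5),
+    metrics=[keras.metrics.SparseCategoricalAccuracy()],
+)
+```
+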
+Our result is 78% accuracy without training anything. Not bad! + +--- +## Fine tuning a pretrained BERT backbone + +drawing + +When labeled text specific to our task is available, fine-tuning a custom classifier can +improve performance. If we want to predict IMDB review sentiment, using IMDB data should +perform better than Rotten Tomatoes data! And for many tasks, no relevant pretrained model +will be available (e.g., categorizing customer reviews). + +The workflow for fine-tuning is almost identical to above, except that we request a +**preset** for the **backbone**-only model rather than the entire classifier. When passed +a **backbone** **preset**, a **task** `Model` will randomly initialize all task-specific +layers in preparation for training. For all the **backbone** presets available for +`BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_nlp/models/). + +To train your classifier, use `keras.Model.fit` as with any other +`keras.Model`. As with our inference example, we can rely on the compilation +defaults for the **task** and skip `keras.Model.compile`. As preprocessing is +included, we again pass the raw data. + + +```python +classifier = keras_nlp.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", + num_classes=2, +) +classifier.fit( + imdb_train, + validation_data=imdb_test, + epochs=1, +) +``` + +
+``` + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 16s 9ms/step - loss: 0.5202 - sparse_categorical_accuracy: 0.7281 - val_loss: 0.3254 - val_sparse_categorical_accuracy: 0.8621 + + + +``` +
+Here we see a significant lift in validation accuracy (0.78 -> 0.87) with a single epoch of
+training even though the IMDB dataset is much smaller than `sst2`.
+
+---
+## Fine tuning with user-controlled preprocessing
+drawing
+
+For some advanced training scenarios, users might prefer direct control over
+preprocessing. For large datasets, examples can be preprocessed in advance and saved to
+disk (a short sketch of this appears at the end of this section) or preprocessed by a
+separate worker pool using `tf.data.experimental.service`. In
+other cases, custom preprocessing is needed to handle the inputs.
+
+Pass `preprocessor=None` to the constructor of a **task** `Model` to skip automatic
+preprocessing or pass a custom `BertPreprocessor` instead.
+
+### Separate preprocessing from the same preset
+
+Each model architecture has a parallel **preprocessor** `Layer` with its own
+`from_preset` constructor. Using the same **preset** for this `Layer` will return the
+matching **preprocessor** as the **task**.
+
+In this workflow we train the model over three epochs using `tf.data.Dataset.cache()`,
+which computes the preprocessing once and caches the result before fitting begins.
+
+**Note:** we can use `tf.data` for preprocessing while running on the
+JAX or PyTorch backend. The input dataset will automatically be converted to
+backend native tensor types during fit. In fact, given the efficiency of `tf.data`
+for running preprocessing, this is good practice on all backends.
+
+
+```python
+import tensorflow as tf
+
+preprocessor = keras_nlp.models.BertPreprocessor.from_preset(
+    "bert_tiny_en_uncased",
+    sequence_length=512,
+)
+
+# Apply the preprocessor to every sample of train and test data using `map()`.
+# `tf.data.AUTOTUNE` and `prefetch()` are options to tune performance; see
+# https://www.tensorflow.org/guide/data_performance for details.
+
+# Note: only call `cache()` if your training data fits in CPU memory!
+imdb_train_cached = (
+    imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)
+)
+imdb_test_cached = (
+    imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE)
+)
+
+classifier = keras_nlp.models.BertClassifier.from_preset(
+    "bert_tiny_en_uncased", preprocessor=None, num_classes=2
+)
+classifier.fit(
+    imdb_train_cached,
+    validation_data=imdb_test_cached,
+    epochs=3,
+)
+```
+
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 15s 8ms/step - loss: 0.5194 - sparse_categorical_accuracy: 0.7272 - val_loss: 0.3032 - val_sparse_categorical_accuracy: 0.8728 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 10s 7ms/step - loss: 0.2871 - sparse_categorical_accuracy: 0.8805 - val_loss: 0.2809 - val_sparse_categorical_accuracy: 0.8818 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 10s 7ms/step - loss: 0.2134 - sparse_categorical_accuracy: 0.9178 - val_loss: 0.3043 - val_sparse_categorical_accuracy: 0.8790 + + + +``` +
+After three epochs, our validation accuracy has only increased to 0.88. This is both a
+function of the small size of our dataset and our model. To exceed 90% accuracy, try
+larger **presets** such as `"bert_base_en_uncased"`. For all the **backbone** presets
+available for `BertClassifier`, see our keras.io [models page](https://keras.io/api/keras_nlp/models/).
+
+### Custom preprocessing
+
+In cases where custom preprocessing is required, we offer direct access to the
+`Tokenizer` class that maps raw strings to tokens. It also has a `from_preset()`
+constructor to get the vocabulary matching pretraining.
+
+**Note:** `BertTokenizer` does not pad sequences by default, so the output is
+ragged (each sequence has varying length). The `MultiSegmentPacker` below
+handles padding these ragged sequences to dense tensor types (e.g. `tf.Tensor`
+or `torch.Tensor`).
+
+
+```python
+tokenizer = keras_nlp.models.BertTokenizer.from_preset("bert_tiny_en_uncased")
+tokenizer(["I love modular workflows!", "Libraries over frameworks!"])
+
+# Write your own packer or use one of our `Layers`
+packer = keras_nlp.layers.MultiSegmentPacker(
+    start_value=tokenizer.cls_token_id,
+    end_value=tokenizer.sep_token_id,
+    # Note: This cannot be longer than the preset's `sequence_length`, and there
+    # is no check for a custom preprocessor!
+    sequence_length=64,
+)
+
+
+# This function takes a text sample `x` and its corresponding label `y` as input
+# and converts the text into a format suitable for input to a BERT model.
+def preprocessor(x, y):
+    token_ids, segment_ids = packer(tokenizer(x))
+    x = {
+        "token_ids": token_ids,
+        "segment_ids": segment_ids,
+        "padding_mask": token_ids != 0,
+    }
+    return x, y
+
+
+imdb_train_preprocessed = imdb_train.map(preprocessor, tf.data.AUTOTUNE).prefetch(
+    tf.data.AUTOTUNE
+)
+imdb_test_preprocessed = imdb_test.map(preprocessor, tf.data.AUTOTUNE).prefetch(
+    tf.data.AUTOTUNE
+)
+
+# Preprocessed example
+print(imdb_train_preprocessed.unbatch().take(1).get_single_element())
+```
+
+``` +({'token_ids': , 'segment_ids': , 'padding_mask': }, ) + +``` +
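+Returning to the note at the start of this section about preprocessing large datasets in
+advance: instead of `cache()`, the preprocessed examples can be written to disk once and
+reloaded later. A minimal sketch, assuming `tf.data.Dataset.save`/`Dataset.load`
+(available in TensorFlow 2.10+); the path is illustrative, and `preprocessor` is whichever
+preprocessing function or layer you are using:
+
+
+```python
+preprocessed_dir = "./imdb_train_preprocessed"  # illustrative path
+
+# Run preprocessing once and persist the result.
+imdb_train.map(preprocessor, tf.data.AUTOTUNE).save(preprocessed_dir)
+
+# Later (possibly in a different job), reload without re-running preprocessing.
+imdb_train_from_disk = tf.data.Dataset.load(preprocessed_dir).prefetch(tf.data.AUTOTUNE)
+```
+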
+--- +## Fine tuning with a custom model +drawing + +For more advanced applications, an appropriate **task** `Model` may not be available. In +this case, we provide direct access to the **backbone** `Model`, which has its own +`from_preset` constructor and can be composed with custom `Layer`s. Detailed examples can +be found at our [transfer learning guide](https://keras.io/guides/transfer_learning/). + +A **backbone** `Model` does not include automatic preprocessing but can be paired with a +matching **preprocessor** using the same **preset** as shown in the previous workflow. + +In this workflow, we experiment with freezing our backbone model and adding two trainable +transformer layers to adapt to the new input. + +**Note**: We can ignore the warning about gradients for the `pooled_dense` layer because +we are using BERT's sequence output. + + +```python +preprocessor = keras_nlp.models.BertPreprocessor.from_preset("bert_tiny_en_uncased") +backbone = keras_nlp.models.BertBackbone.from_preset("bert_tiny_en_uncased") + +imdb_train_preprocessed = ( + imdb_train.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) +imdb_test_preprocessed = ( + imdb_test.map(preprocessor, tf.data.AUTOTUNE).cache().prefetch(tf.data.AUTOTUNE) +) + +backbone.trainable = False +inputs = backbone.input +sequence = backbone(inputs)["sequence_output"] +for _ in range(2): + sequence = keras_nlp.layers.TransformerEncoder( + num_heads=2, + intermediate_dim=512, + dropout=0.1, + )(sequence) +# Use [CLS] token output to classify +outputs = keras.layers.Dense(2)(sequence[:, backbone.cls_token_index, :]) + +model = keras.Model(inputs, outputs) +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(5e-5), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) +model.summary() +model.fit( + imdb_train_preprocessed, + validation_data=imdb_test_preprocessed, + epochs=3, +) +``` + + +
Model: "functional_1"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)         Output Shape       Param #  Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ padding_mask        │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ segment_ids         │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ token_ids           │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ bert_backbone_3     │ [(None, 128),     │ 4,385,… │ padding_mask[0][0],  │
+│ (BertBackbone)      │ (None, None,      │         │ segment_ids[0][0],   │
+│                     │ 128)]             │         │ token_ids[0][0]      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ transformer_encoder │ (None, None, 128) │ 198,272 │ bert_backbone_3[0][ │
+│ (TransformerEncode… │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ transformer_encode… │ (None, None, 128) │ 198,272 │ transformer_encoder… │
+│ (TransformerEncode… │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ get_item_4          │ (None, 128)       │       0 │ transformer_encoder… │
+│ (GetItem)           │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ dense (Dense)       │ (None, 2)         │     258 │ get_item_4[0][0]     │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
+ + + + +
 Total params: 4,782,722 (18.24 MB)
+
+ + + + +
 Trainable params: 396,802 (1.51 MB)
+
+ + + + +
 Non-trainable params: 4,385,920 (16.73 MB)
+
+ + + +
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 17s 10ms/step - loss: 0.6208 - sparse_categorical_accuracy: 0.6612 - val_loss: 0.6119 - val_sparse_categorical_accuracy: 0.6758 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 12s 8ms/step - loss: 0.5324 - sparse_categorical_accuracy: 0.7347 - val_loss: 0.5484 - val_sparse_categorical_accuracy: 0.7320 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 12s 8ms/step - loss: 0.4735 - sparse_categorical_accuracy: 0.7723 - val_loss: 0.4874 - val_sparse_categorical_accuracy: 0.7742 + + + +``` +
+This model achieves reasonable accuracy despite having only 10% of the trainable parameters +of our `BertClassifier` model. Each training step takes about 1/3 of the time---even +accounting for cached preprocessing. + +--- +## Pretraining a backbone model +drawing + +Do you have access to large unlabeled datasets in your domain? Are they around the +same size as used to train popular backbones such as BERT, RoBERTa, or GPT2 (XX+ GiB)? If +so, you might benefit from domain-specific pretraining of your own backbone models. + +NLP models are generally pretrained on a language modeling task, predicting masked words +given the visible words in an input sentence. For example, given the input +`"The fox [MASK] over the [MASK] dog"`, the model might be asked to predict `["jumped", "lazy"]`. +The lower layers of this model are then packaged as a **backbone** to be combined with +layers relating to a new task. + +The KerasNLP library offers SoTA **backbones** and **tokenizers** to be trained from +scratch without presets. + +In this workflow, we pretrain a BERT **backbone** using our IMDB review text. We skip the +"next sentence prediction" (NSP) loss because it adds significant complexity to the data +processing and was dropped by later models like RoBERTa. See our e2e +[Transformer pretraining](https://keras.io/guides/keras_nlp/transformer_pretraining/#pretraining) +for step-by-step details on how to replicate the original paper. + +### Preprocessing + + +```python +# All BERT `en` models have the same vocabulary, so reuse preprocessor from +# "bert_tiny_en_uncased" +preprocessor = keras_nlp.models.BertPreprocessor.from_preset( + "bert_tiny_en_uncased", + sequence_length=256, +) +packer = preprocessor.packer +tokenizer = preprocessor.tokenizer + +# keras.Layer to replace some input tokens with the "[MASK]" token +masker = keras_nlp.layers.MaskedLMMaskGenerator( + vocabulary_size=tokenizer.vocabulary_size(), + mask_selection_rate=0.25, + mask_selection_length=64, + mask_token_id=tokenizer.token_to_id("[MASK]"), + unselectable_token_ids=[ + tokenizer.token_to_id(x) for x in ["[CLS]", "[PAD]", "[SEP]"] + ], +) + + +def preprocess(inputs, label): + inputs = preprocessor(inputs) + masked_inputs = masker(inputs["token_ids"]) + # Split the masking layer outputs into a (features, labels, and weights) + # tuple that we can use with keras.Model.fit(). + features = { + "token_ids": masked_inputs["token_ids"], + "segment_ids": inputs["segment_ids"], + "padding_mask": inputs["padding_mask"], + "mask_positions": masked_inputs["mask_positions"], + } + labels = masked_inputs["mask_ids"] + weights = masked_inputs["mask_weights"] + return features, labels, weights + + +pretrain_ds = imdb_train.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE).prefetch( + tf.data.AUTOTUNE +) +pretrain_val_ds = imdb_test.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Tokens with ID 103 are "masked" +print(pretrain_ds.unbatch().take(1).get_single_element()) +``` + +
+``` +({'token_ids': , 'segment_ids': , 'padding_mask': , 'mask_positions': }, , ) + +``` +
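+As a quick sanity check on the statement above, you can look up the mask id directly:
+
+
+```python
+# The masker replaces selected tokens with this id (103 for the BERT uncased vocabulary).
+print(tokenizer.token_to_id("[MASK]"))
+```
+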
+### Pretraining model + + +```python +# BERT backbone +backbone = keras_nlp.models.BertBackbone( + vocabulary_size=tokenizer.vocabulary_size(), + num_layers=2, + num_heads=2, + hidden_dim=128, + intermediate_dim=512, +) + +# Language modeling head +mlm_head = keras_nlp.layers.MaskedLMHead( + token_embedding=backbone.token_embedding, +) + +inputs = { + "token_ids": keras.Input(shape=(None,), dtype=tf.int32, name="token_ids"), + "segment_ids": keras.Input(shape=(None,), dtype=tf.int32, name="segment_ids"), + "padding_mask": keras.Input(shape=(None,), dtype=tf.int32, name="padding_mask"), + "mask_positions": keras.Input(shape=(None,), dtype=tf.int32, name="mask_positions"), +} + +# Encoded token sequence +sequence = backbone(inputs)["sequence_output"] + +# Predict an output word for each masked input token. +# We use the input token embedding to project from our encoded vectors to +# vocabulary logits, which has been shown to improve training efficiency. +outputs = mlm_head(sequence, mask_positions=inputs["mask_positions"]) + +# Define and compile our pretraining model. +pretraining_model = keras.Model(inputs, outputs) +pretraining_model.summary() +pretraining_model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(learning_rate=5e-4), + weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) + +# Pretrain on IMDB dataset +pretraining_model.fit( + pretrain_ds, + validation_data=pretrain_val_ds, + epochs=3, # Increase to 6 for higher accuracy +) +``` + + +
Model: "functional_3"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━┓
+┃ Layer (type)         Output Shape       Param #  Connected to         ┃
+┡━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━┩
+│ mask_positions      │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ padding_mask        │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ segment_ids         │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ token_ids           │ (None, None)      │       0 │ -                    │
+│ (InputLayer)        │                   │         │                      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ bert_backbone_4     │ [(None, 128),     │ 4,385,… │ mask_positions[0][0… │
+│ (BertBackbone)      │ (None, None,      │         │ padding_mask[0][0],  │
+│                     │ 128)]             │         │ segment_ids[0][0],   │
+│                     │                   │         │ token_ids[0][0]      │
+├─────────────────────┼───────────────────┼─────────┼──────────────────────┤
+│ masked_lm_head      │ (None, None,      │ 3,954,… │ bert_backbone_4[0][ │
+│ (MaskedLMHead)      │ 30522)            │         │ mask_positions[0][0] │
+└─────────────────────┴───────────────────┴─────────┴──────────────────────┘
+
+ + + + +
 Total params: 4,433,210 (16.91 MB)
+
+ + + + +
 Trainable params: 4,433,210 (16.91 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + + +
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 22s 12ms/step - loss: 5.7032 - sparse_categorical_accuracy: 0.0566 - val_loss: 5.0685 - val_sparse_categorical_accuracy: 0.1044 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 13s 8ms/step - loss: 5.0701 - sparse_categorical_accuracy: 0.1096 - val_loss: 4.9363 - val_sparse_categorical_accuracy: 0.1239 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 13s 8ms/step - loss: 4.9607 - sparse_categorical_accuracy: 0.1240 - val_loss: 4.7913 - val_sparse_categorical_accuracy: 0.1417 + + + +``` +
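+The `backbone` submodel is the piece worth keeping from pretraining. A minimal sketch of
+saving it for later reuse (this assumes the standard Keras 3 `.keras` saving format; the
+file name is illustrative):
+
+
+```python
+# Save only the backbone; the MaskedLM head is not needed for downstream tasks.
+backbone.save("imdb_bert_backbone.keras")
+
+# Later, reload it and attach task-specific layers, as in the
+# "Fine tuning with a custom model" section above.
+restored_backbone = keras.models.load_model("imdb_bert_backbone.keras")
+```
+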
+After pretraining, save your `backbone` submodel to use in a new task, as sketched above!
+
+---
+## Build and train your own transformer from scratch
+drawing
+
+Want to implement a novel transformer architecture? The KerasNLP library offers all the
+low-level modules used to build SoTA architectures in our `models` API. This includes the
+`keras_nlp.tokenizers` API, which allows you to train your own subword tokenizer using
+`WordPieceTokenizer`, `BytePairTokenizer`, or `SentencePieceTokenizer`.
+
+In this workflow, we train a custom tokenizer on the IMDB data and design a backbone with
+a custom transformer architecture. For simplicity, we then train directly on the
+classification task. Interested in more details? We wrote an entire guide to pretraining
+and finetuning a custom transformer on
+[keras.io](https://keras.io/guides/keras_nlp/transformer_pretraining/).
+
+### Train custom vocabulary from IMDB data
+
+
+```python
+vocab = keras_nlp.tokenizers.compute_word_piece_vocabulary(
+    imdb_train.map(lambda x, y: x),
+    vocabulary_size=20_000,
+    lowercase=True,
+    strip_accents=True,
+    reserved_tokens=["[PAD]", "[START]", "[END]", "[MASK]", "[UNK]"],
+)
+tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+    vocabulary=vocab,
+    lowercase=True,
+    strip_accents=True,
+    oov_token="[UNK]",
+)
+```
+
+### Preprocess data with a custom tokenizer
+
+
+```python
+packer = keras_nlp.layers.StartEndPacker(
+    start_value=tokenizer.token_to_id("[START]"),
+    end_value=tokenizer.token_to_id("[END]"),
+    pad_value=tokenizer.token_to_id("[PAD]"),
+    sequence_length=512,
+)
+
+
+def preprocess(x, y):
+    token_ids = packer(tokenizer(x))
+    return token_ids, y
+
+
+imdb_preproc_train_ds = imdb_train.map(
+    preprocess, num_parallel_calls=tf.data.AUTOTUNE
+).prefetch(tf.data.AUTOTUNE)
+imdb_preproc_val_ds = imdb_test.map(
+    preprocess, num_parallel_calls=tf.data.AUTOTUNE
+).prefetch(tf.data.AUTOTUNE)
+
+print(imdb_preproc_train_ds.unbatch().take(1).get_single_element())
+```
+
+``` +(, ) + +``` +
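+Before building the model, a quick sanity check of the custom tokenizer can be useful.
+This is a small sketch; the sample sentence is illustrative:
+
+
+```python
+# Should be close to the 20,000 tokens requested above.
+print(tokenizer.vocabulary_size())
+
+sample = ["Inspecting the custom WordPiece vocabulary."]
+token_ids = tokenizer(sample)
+print(token_ids)
+# Round-trips back to lowercased, accent-stripped text.
+print(tokenizer.detokenize(token_ids))
+```
+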
+### Design a tiny transformer + + +```python +token_id_input = keras.Input( + shape=(None,), + dtype="int32", + name="token_ids", +) +outputs = keras_nlp.layers.TokenAndPositionEmbedding( + vocabulary_size=len(vocab), + sequence_length=packer.sequence_length, + embedding_dim=64, +)(token_id_input) +outputs = keras_nlp.layers.TransformerEncoder( + num_heads=2, + intermediate_dim=128, + dropout=0.1, +)(outputs) +# Use "[START]" token to classify +outputs = keras.layers.Dense(2)(outputs[:, 0, :]) +model = keras.Model( + inputs=token_id_input, + outputs=outputs, +) + +model.summary() +``` + + +
Model: "functional_5"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Layer (type)                     Output Shape                  Param # ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ token_ids (InputLayer)          │ (None, None)              │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ token_and_position_embedding    │ (None, None, 64)          │  1,259,648 │
+│ (TokenAndPositionEmbedding)     │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_2           │ (None, None, 64)          │     33,472 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ get_item_6 (GetItem)            │ (None, 64)                │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dense_1 (Dense)                 │ (None, 2)                 │        130 │
+└─────────────────────────────────┴───────────────────────────┴────────────┘
+
+ + + + +
 Total params: 1,293,250 (4.93 MB)
+
+ + + + +
 Trainable params: 1,293,250 (4.93 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+ + + +### Train the transformer directly on the classification objective + + +```python +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW(5e-5), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=True, +) +model.fit( + imdb_preproc_train_ds, + validation_data=imdb_preproc_val_ds, + epochs=3, +) +``` + +
+``` +Epoch 1/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 8s 4ms/step - loss: 0.7790 - sparse_categorical_accuracy: 0.5367 - val_loss: 0.4420 - val_sparse_categorical_accuracy: 0.8120 +Epoch 2/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 0.3654 - sparse_categorical_accuracy: 0.8443 - val_loss: 0.3046 - val_sparse_categorical_accuracy: 0.8752 +Epoch 3/3 + 1563/1563 ━━━━━━━━━━━━━━━━━━━━ 5s 3ms/step - loss: 0.2471 - sparse_categorical_accuracy: 0.9019 - val_loss: 0.3060 - val_sparse_categorical_accuracy: 0.8748 + + + +``` +
+Excitingly, our custom classifier's performance is similar to that of fine-tuning
+`"bert_tiny_en_uncased"`! To see the advantages of pretraining and exceed 90% accuracy, we
+would need to use larger **presets** such as `"bert_base_en_uncased"`.
diff --git a/templates/guides/keras_nlp/transformer_pretraining.md b/templates/guides/keras_nlp/transformer_pretraining.md
new file mode 100644
index 0000000000..40397c881c
--- /dev/null
+++ b/templates/guides/keras_nlp/transformer_pretraining.md
@@ -0,0 +1,635 @@
+# Pretraining a Transformer from scratch with KerasNLP
+
+**Author:** [Matthew Watson](https://github.com/mattdangerw/)
+**Date created:** 2022/04/18
+**Last modified:** 2023/07/15
+**Description:** Use KerasNLP to train a Transformer model from scratch. + + + [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_nlp/transformer_pretraining.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_nlp/transformer_pretraining.py) + + + +KerasNLP aims to make it easy to build state-of-the-art text processing models. In this +guide, we will show how library components simplify pretraining and fine-tuning a +Transformer model from scratch. + +This guide is broken into three parts: + +1. *Setup*, task definition, and establishing a baseline. +2. *Pretraining* a Transformer model. +3. *Fine-tuning* the Transformer model on our classification task. + +--- +## Setup + +The following guide uses Keras 3 to work in any of `tensorflow`, `jax` or +`torch`. We select the `jax` backend below, which will give us a particularly +fast train step below, but feel free to mix it up. + + +```python +!pip install -q --upgrade keras-nlp +!pip install -q --upgrade keras # Upgrade to Keras 3. +``` + +```python +import os + +os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" + + +import keras_nlp +import tensorflow as tf +import keras +``` +
+``` + +``` +
+Next up, we can download two datasets.
+
+- [SST-2](https://paperswithcode.com/sota/sentiment-analysis-on-sst-2-binary): a text
+classification dataset and our "end goal". This dataset is often used to benchmark
+language models.
+- [WikiText-103](https://paperswithcode.com/dataset/wikitext-103): A medium-sized
+collection of featured articles from English Wikipedia, which we will use for
+pretraining.
+
+Finally, we will download a WordPiece vocabulary to do sub-word tokenization later on in
+this guide.
+
+
+```python
+# Download pretraining data.
+keras.utils.get_file(
+    origin="https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-103-raw-v1.zip",
+    extract=True,
+)
+wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-103-raw/")
+
+# Download finetuning data.
+keras.utils.get_file(
+    origin="https://dl.fbaipublicfiles.com/glue/data/SST-2.zip",
+    extract=True,
+)
+sst_dir = os.path.expanduser("~/.keras/datasets/SST-2/")
+
+# Download vocabulary data.
+vocab_file = keras.utils.get_file(
+    origin="https://storage.googleapis.com/tensorflow/keras-nlp/examples/bert/bert_vocab_uncased.txt",
+)
+```
+
+Next, we define some hyperparameters we will use during training.
+
+
+```python
+# Preprocessing params.
+PRETRAINING_BATCH_SIZE = 128
+FINETUNING_BATCH_SIZE = 32
+SEQ_LENGTH = 128
+MASK_RATE = 0.25
+PREDICTIONS_PER_SEQ = 32
+
+# Model params.
+NUM_LAYERS = 3
+MODEL_DIM = 256
+INTERMEDIATE_DIM = 512
+NUM_HEADS = 4
+DROPOUT = 0.1
+NORM_EPSILON = 1e-5
+
+# Training params.
+PRETRAINING_LEARNING_RATE = 5e-4
+PRETRAINING_EPOCHS = 8
+FINETUNING_LEARNING_RATE = 5e-5
+FINETUNING_EPOCHS = 3
+```
+
+### Load data
+
+We load our data with [tf.data](https://www.tensorflow.org/guide/data), which will allow
+us to define input pipelines for tokenizing and preprocessing text.
+
+
+```python
+# Load SST-2.
+sst_train_ds = tf.data.experimental.CsvDataset(
+    sst_dir + "train.tsv", [tf.string, tf.int32], header=True, field_delim="\t"
+).batch(FINETUNING_BATCH_SIZE)
+sst_val_ds = tf.data.experimental.CsvDataset(
+    sst_dir + "dev.tsv", [tf.string, tf.int32], header=True, field_delim="\t"
+).batch(FINETUNING_BATCH_SIZE)
+
+# Load wikitext-103 and filter out short lines.
+wiki_train_ds = (
+    tf.data.TextLineDataset(wiki_dir + "wiki.train.raw")
+    .filter(lambda x: tf.strings.length(x) > 100)
+    .batch(PRETRAINING_BATCH_SIZE)
+)
+wiki_val_ds = (
+    tf.data.TextLineDataset(wiki_dir + "wiki.valid.raw")
+    .filter(lambda x: tf.strings.length(x) > 100)
+    .batch(PRETRAINING_BATCH_SIZE)
+)
+
+# Take a peek at the sst-2 dataset.
+print(sst_train_ds.unbatch().batch(4).take(1).get_single_element())
+```
+
+``` +(, ) + +``` +
+You can see that our `SST-2` dataset contains relatively short snippets of movie review
+text. Our goal is to predict the sentiment of the snippet. A label of 1 indicates
+positive sentiment, and a label of 0 negative sentiment.
+
+### Establish a baseline
+
+As a first step, we will establish a baseline of good performance. We don't actually need
+KerasNLP for this; we can just use core Keras layers.
+
+We will train a simple bag-of-words model, where we learn a positive or negative weight
+for each word in our vocabulary. A sample's score is simply the sum of the weights of all
+words that are present in the sample.
+
+
+```python
+# This layer will turn our input sentence into a list of 1s and 0s the same size as
+# our vocabulary, indicating whether a word is present or absent.
+multi_hot_layer = keras.layers.TextVectorization(
+    max_tokens=4000, output_mode="multi_hot"
+)
+multi_hot_layer.adapt(sst_train_ds.map(lambda x, y: x))
+multi_hot_ds = sst_train_ds.map(lambda x, y: (multi_hot_layer(x), y))
+multi_hot_val_ds = sst_val_ds.map(lambda x, y: (multi_hot_layer(x), y))
+
+# We then learn a logistic regression over that layer, and that's our entire
+# baseline model!
+
+inputs = keras.Input(shape=(4000,), dtype="int32")
+outputs = keras.layers.Dense(1, activation="sigmoid")(inputs)
+baseline_model = keras.Model(inputs, outputs)
+baseline_model.compile(loss="binary_crossentropy", metrics=["accuracy"])
+baseline_model.fit(multi_hot_ds, validation_data=multi_hot_val_ds, epochs=5)
+```
+
+``` +Epoch 1/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 2s 698us/step - accuracy: 0.6421 - loss: 0.6469 - val_accuracy: 0.7567 - val_loss: 0.5391 +Epoch 2/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 493us/step - accuracy: 0.7524 - loss: 0.5392 - val_accuracy: 0.7868 - val_loss: 0.4891 +Epoch 3/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 513us/step - accuracy: 0.7832 - loss: 0.4871 - val_accuracy: 0.7991 - val_loss: 0.4671 +Epoch 4/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 475us/step - accuracy: 0.7991 - loss: 0.4543 - val_accuracy: 0.8069 - val_loss: 0.4569 +Epoch 5/5 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 1s 476us/step - accuracy: 0.8100 - loss: 0.4313 - val_accuracy: 0.8036 - val_loss: 0.4530 + + + +``` +
+A bag-of-words approach can be fast and surprisingly powerful, especially when input
+examples contain a large number of words. With shorter sequences, it can hit a
+performance ceiling.
+
+To do better, we would like to build a model that can evaluate words *in context*. Instead
+of evaluating each word in a void, we need to use the information contained in the
+*entire ordered sequence* of our input.
+
+This runs us into a problem. `SST-2` is a very small dataset, and there's simply not enough
+example text to attempt to build a larger, more parameterized model that can learn on a
+sequence. We would quickly start to overfit and memorize our training set, without any
+increase in our ability to generalize to unseen examples.
+
+Enter **pretraining**, which will allow us to learn on a larger corpus, and transfer our
+knowledge to the `SST-2` task. And enter **KerasNLP**, which will allow us to pretrain a
+particularly powerful model, the Transformer, with ease.
+
+---
+## Pretraining
+
+To beat our baseline, we will leverage the `WikiText103` dataset, an unlabeled
+collection of Wikipedia articles that is much bigger than `SST-2`.
+
+We are going to train a *transformer*, a highly expressive model which will learn
+to embed each word in our input as a low-dimensional vector. Our Wikipedia dataset has no
+labels, so we will use an unsupervised training objective called the *Masked Language
+Modeling* (MaskedLM) objective.
+
+Essentially, we will be playing a big game of "guess the missing word". For each input
+sample we will obscure 25% of our input data, and train our model to predict the parts we
+covered up.
+
+### Preprocess data for the MaskedLM task
+
+Our text preprocessing for the MaskedLM task will occur in two stages.
+
+1. Tokenize input text into integer sequences of token ids.
+2. Mask certain positions in our input to predict on.
+
+To tokenize, we can use a `keras_nlp.tokenizers.Tokenizer` -- the KerasNLP building block
+for transforming text into sequences of integer token ids.
+
+In particular, we will use `keras_nlp.tokenizers.WordPieceTokenizer`, which does
+*sub-word* tokenization. Sub-word tokenization is popular when training models on large
+text corpora. Essentially, it allows our model to learn from uncommon words, while not
+requiring a massive vocabulary of every word in our training set.
+
+The second thing we need to do is mask our input for the MaskedLM task. To do this, we can use
+`keras_nlp.layers.MaskedLMMaskGenerator`, which will randomly select a set of tokens in each
+input and mask them out.
+
+The tokenizer and the masking layer can both be used inside a call to
+[tf.data.Dataset.map](https://www.tensorflow.org/api_docs/python/tf/data/Dataset#map).
+We can use `tf.data` to efficiently pre-compute each batch on the CPU, while our GPU or TPU
+works on training with the batch that came before. Because our masking layer will
+choose new words to mask each time, each epoch over our dataset will give us a totally
+new set of labels to train on.
+
+
+```python
+# Setting sequence_length will trim or pad the token outputs to shape
+# (batch_size, SEQ_LENGTH).
+tokenizer = keras_nlp.tokenizers.WordPieceTokenizer(
+    vocabulary=vocab_file,
+    sequence_length=SEQ_LENGTH,
+    lowercase=True,
+    strip_accents=True,
+)
+# Setting mask_selection_length will trim or pad the mask outputs to shape
+# (batch_size, PREDICTIONS_PER_SEQ).
+masker = keras_nlp.layers.MaskedLMMaskGenerator( + vocabulary_size=tokenizer.vocabulary_size(), + mask_selection_rate=MASK_RATE, + mask_selection_length=PREDICTIONS_PER_SEQ, + mask_token_id=tokenizer.token_to_id("[MASK]"), +) + + +def preprocess(inputs): + inputs = tokenizer(inputs) + outputs = masker(inputs) + # Split the masking layer outputs into a (features, labels, and weights) + # tuple that we can use with keras.Model.fit(). + features = { + "token_ids": outputs["token_ids"], + "mask_positions": outputs["mask_positions"], + } + labels = outputs["mask_ids"] + weights = outputs["mask_weights"] + return features, labels, weights + + +# We use prefetch() to pre-compute preprocessed batches on the fly on the CPU. +pretrain_ds = wiki_train_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +pretrain_val_ds = wiki_val_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Preview a single input example. +# The masks will change each time you run the cell. +print(pretrain_val_ds.take(1).get_single_element()) +``` + +
+``` +({'token_ids': , 'mask_positions': }, , ) + +``` +
+The above block sorts our dataset into a `(features, labels, weights)` tuple, which can be
+passed directly to `keras.Model.fit()`.
+
+We have two features:
+
+1. `"token_ids"`, where some tokens have been replaced with our mask token id.
+2. `"mask_positions"`, which keeps track of which tokens we masked out.
+
+Our labels are simply the ids we masked out.
+
+Because not all sequences will have the same number of masks, we also keep a
+`sample_weight` tensor, which removes padded labels from our loss function by giving them
+zero weight.
+
+### Create the Transformer encoder
+
+KerasNLP provides all the building blocks to quickly build a Transformer encoder.
+
+We use `keras_nlp.layers.TokenAndPositionEmbedding` to first embed our input token ids.
+This layer simultaneously learns two embeddings -- one for words in a sentence and another
+for integer positions in a sentence. The output embedding is simply the sum of the two.
+
+Then we can add a series of `keras_nlp.layers.TransformerEncoder` layers. These are the
+bread and butter of the Transformer model, using an attention mechanism to attend to
+different parts of the input sentence, followed by a multi-layer perceptron block.
+
+The output of this model will be an encoded vector per input token id. Unlike the
+bag-of-words model we used as a baseline, this model will embed each token accounting for
+the context in which it appeared.
+
+
+```python
+inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32")
+
+# Embed our tokens with a positional embedding.
+embedding_layer = keras_nlp.layers.TokenAndPositionEmbedding(
+    vocabulary_size=tokenizer.vocabulary_size(),
+    sequence_length=SEQ_LENGTH,
+    embedding_dim=MODEL_DIM,
+)
+outputs = embedding_layer(inputs)
+
+# Apply layer normalization and dropout to the embedding.
+outputs = keras.layers.LayerNormalization(epsilon=NORM_EPSILON)(outputs)
+outputs = keras.layers.Dropout(rate=DROPOUT)(outputs)
+
+# Add a number of encoder blocks
+for i in range(NUM_LAYERS):
+    outputs = keras_nlp.layers.TransformerEncoder(
+        intermediate_dim=INTERMEDIATE_DIM,
+        num_heads=NUM_HEADS,
+        dropout=DROPOUT,
+        layer_norm_epsilon=NORM_EPSILON,
+    )(outputs)
+
+encoder_model = keras.Model(inputs, outputs)
+encoder_model.summary()
+```
+
Model: "functional_3"
+
+ + + + +
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┓
+┃ Layer (type)                     Output Shape                  Param # ┃
+┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━┩
+│ input_layer_1 (InputLayer)      │ (None, 128)               │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ token_and_position_embedding    │ (None, 128, 256)          │  7,846,400 │
+│ (TokenAndPositionEmbedding)     │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ layer_normalization             │ (None, 128, 256)          │        512 │
+│ (LayerNormalization)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ dropout (Dropout)               │ (None, 128, 256)          │          0 │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder             │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_1           │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+├─────────────────────────────────┼───────────────────────────┼────────────┤
+│ transformer_encoder_2           │ (None, 128, 256)          │    527,104 │
+│ (TransformerEncoder)            │                           │            │
+└─────────────────────────────────┴───────────────────────────┴────────────┘
+
+ + + + +
 Total params: 9,428,224 (287.73 MB)
+
+ + + + +
 Trainable params: 9,428,224 (287.73 MB)
+
+ + + + +
 Non-trainable params: 0 (0.00 B)
+
+
+
+
+### Pretrain the Transformer
+
+You can think of the `encoder_model` as its own modular unit; it is the piece of our
+model that we are really interested in for our downstream task. However we still need to
+set up the encoder to train on the MaskedLM task; to do that we attach a
+`keras_nlp.layers.MaskedLMHead`.
+
+This layer will take as one input the token encodings, and as another the positions we
+masked out in the original input. It will gather the token encodings we masked, and
+transform them back into predictions over our entire vocabulary.
+
+With that, we are ready to compile and run pretraining. If you are running this in a
+Colab, note that this will take about an hour. Training a Transformer is famously compute
+intensive, so even this relatively small Transformer will take some time.
+
+
+```python
+# Create the pretraining model by attaching a masked language model head.
+inputs = {
+    "token_ids": keras.Input(shape=(SEQ_LENGTH,), dtype="int32", name="token_ids"),
+    "mask_positions": keras.Input(
+        shape=(PREDICTIONS_PER_SEQ,), dtype="int32", name="mask_positions"
+    ),
+}
+
+# Encode the tokens.
+encoded_tokens = encoder_model(inputs["token_ids"])
+
+# Predict an output word for each masked input token.
+# We use the input token embedding to project from our encoded vectors to
+# vocabulary logits, which has been shown to improve training efficiency.
+outputs = keras_nlp.layers.MaskedLMHead(
+    token_embedding=embedding_layer.token_embedding,
+    activation="softmax",
+)(encoded_tokens, mask_positions=inputs["mask_positions"])
+
+# Define and compile our pretraining model.
+pretraining_model = keras.Model(inputs, outputs)
+pretraining_model.compile(
+    loss="sparse_categorical_crossentropy",
+    optimizer=keras.optimizers.AdamW(PRETRAINING_LEARNING_RATE),
+    weighted_metrics=["sparse_categorical_accuracy"],
+    jit_compile=True,
+)
+
+# Pretrain the model on our wiki text dataset.
+pretraining_model.fit(
+    pretrain_ds,
+    validation_data=pretrain_val_ds,
+    epochs=PRETRAINING_EPOCHS,
+)
+
+# Save this base model for further finetuning.
+encoder_model.save("encoder_model.keras")
+```
+
+``` +Epoch 1/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 242s 41ms/step - loss: 5.4679 - sparse_categorical_accuracy: 0.1353 - val_loss: 3.4570 - val_sparse_categorical_accuracy: 0.3522 +Epoch 2/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 234s 40ms/step - loss: 3.6031 - sparse_categorical_accuracy: 0.3396 - val_loss: 3.0514 - val_sparse_categorical_accuracy: 0.4032 +Epoch 3/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 232s 40ms/step - loss: 3.2609 - sparse_categorical_accuracy: 0.3802 - val_loss: 2.8858 - val_sparse_categorical_accuracy: 0.4240 +Epoch 4/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 233s 40ms/step - loss: 3.1099 - sparse_categorical_accuracy: 0.3978 - val_loss: 2.7897 - val_sparse_categorical_accuracy: 0.4375 +Epoch 5/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 235s 40ms/step - loss: 3.0145 - sparse_categorical_accuracy: 0.4090 - val_loss: 2.7504 - val_sparse_categorical_accuracy: 0.4419 +Epoch 6/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 252s 43ms/step - loss: 2.9530 - sparse_categorical_accuracy: 0.4157 - val_loss: 2.6925 - val_sparse_categorical_accuracy: 0.4474 +Epoch 7/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 232s 40ms/step - loss: 2.9088 - sparse_categorical_accuracy: 0.4210 - val_loss: 2.6554 - val_sparse_categorical_accuracy: 0.4513 +Epoch 8/8 + 5857/5857 ━━━━━━━━━━━━━━━━━━━━ 236s 40ms/step - loss: 2.8721 - sparse_categorical_accuracy: 0.4250 - val_loss: 2.6389 - val_sparse_categorical_accuracy: 0.4548 + +``` +
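+Before fine-tuning, you might want a qualitative look at what the pretrained model has
+learned. The following is a rough sketch, not part of the original workflow: it decodes
+the model's guesses for one preprocessed validation batch (padded mask slots will decode
+as filler tokens).
+
+
+```python
+import numpy as np
+
+features, labels, weights = pretrain_val_ds.take(1).get_single_element()
+probs = pretraining_model.predict(features)  # (batch, PREDICTIONS_PER_SEQ, vocab_size)
+predicted_ids = np.argmax(probs, axis=-1)
+
+# Compare the model's guesses with the actual masked-out words for one sequence.
+print("guessed:", tokenizer.detokenize(predicted_ids[:1]))
+print("actual: ", tokenizer.detokenize(np.asarray(labels)[:1]))
+```
+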
+--- +## Fine-tuning + +After pretraining, we can now fine-tune our model on the `SST-2` dataset. We can +leverage the ability of the encoder we build to predict on words in context to boost +our performance on the downstream task. + +### Preprocess data for classification + +Preprocessing for fine-tuning is much simpler than for our pretraining MaskedLM task. We just +tokenize our input sentences and we are ready for training! + + +```python + +def preprocess(sentences, labels): + return tokenizer(sentences), labels + + +# We use prefetch() to pre-compute preprocessed batches on the fly on our CPU. +finetune_ds = sst_train_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +finetune_val_ds = sst_val_ds.map( + preprocess, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) + +# Preview a single input example. +print(finetune_val_ds.take(1).get_single_element()) +``` + +
+``` +(, ) + +``` +
+### Fine-tune the Transformer + +To go from our encoded token output to a classification prediction, we need to attach +another "head" to our Transformer model. We can afford to be simple here. We pool +the encoded tokens together, and use a single dense layer to make a prediction. + + +```python +# Reload the encoder model from disk so we can restart fine-tuning from scratch. +encoder_model = keras.models.load_model("encoder_model.keras", compile=False) + +# Take as input the tokenized input. +inputs = keras.Input(shape=(SEQ_LENGTH,), dtype="int32") + +# Encode and pool the tokens. +encoded_tokens = encoder_model(inputs) +pooled_tokens = keras.layers.GlobalAveragePooling1D()(encoded_tokens[0]) + +# Predict an output label. +outputs = keras.layers.Dense(1, activation="sigmoid")(pooled_tokens) + +# Define and compile our fine-tuning model. +finetuning_model = keras.Model(inputs, outputs) +finetuning_model.compile( + loss="binary_crossentropy", + optimizer=keras.optimizers.AdamW(FINETUNING_LEARNING_RATE), + metrics=["accuracy"], +) + +# Finetune the model for the SST-2 task. +finetuning_model.fit( + finetune_ds, + validation_data=finetune_val_ds, + epochs=FINETUNING_EPOCHS, +) +``` + +
+``` +Epoch 1/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 21s 9ms/step - accuracy: 0.7500 - loss: 0.4891 - val_accuracy: 0.8036 - val_loss: 0.4099 +Epoch 2/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 16s 8ms/step - accuracy: 0.8826 - loss: 0.2779 - val_accuracy: 0.8482 - val_loss: 0.3964 +Epoch 3/3 + 2105/2105 ━━━━━━━━━━━━━━━━━━━━ 16s 8ms/step - accuracy: 0.9176 - loss: 0.2066 - val_accuracy: 0.8549 - val_loss: 0.4142 + + + +``` +
+Pretraining was enough to boost our performance to 84%, and this is hardly the ceiling +for Transformer models. You may have noticed during pretraining that our validation +performance was still steadily increasing. Our model is still significantly undertrained. +Training for more epochs, training a large Transformer, and training on more unlabeled +text would all continue to boost performance significantly. + +One of the key goals of KerasNLP is to provide a modular approach to NLP model building. +We have shown one approach to building a Transformer here, but KerasNLP supports an ever +growing array of components for preprocessing text and building models. We hope it makes +it easier to experiment on solutions to your natural language problems. diff --git a/templates/guides/keras_nlp/upload.md b/templates/guides/keras_nlp/upload.md new file mode 100644 index 0000000000..76d3872fb0 --- /dev/null +++ b/templates/guides/keras_nlp/upload.md @@ -0,0 +1,308 @@ +# Uploading Models with KerasNLP + +**Author:** [Samaneh Saadat](https://github.com/SamanehSaadat/), [Matthew Watson](https://github.com/mattdangerw/)
+**Date created:** 2024/04/29
+**Last modified:** 2024/04/29
+**Description:** An introduction on how to upload a fine-tuned KerasNLP model to model hubs.
+
+
+ [**View in Colab**](https://colab.research.google.com/github/keras-team/keras-io/blob/master/guides/ipynb/keras_nlp/upload.ipynb) [**GitHub source**](https://github.com/keras-team/keras-io/blob/master/guides/keras_nlp/upload.py)
+
+
+
+# Introduction
+
+Fine-tuning a machine learning model can yield impressive results for specific tasks.
+Uploading your fine-tuned model to a model hub allows you to share it with the broader community.
+By sharing your models, you'll enhance accessibility for other researchers and developers,
+making your contributions an integral part of the machine learning landscape.
+This can also streamline the integration of your model into real-world applications.
+
+This guide walks you through how to upload your fine-tuned models to popular model hubs such as
+[Kaggle Models](https://www.kaggle.com/models) and [Hugging Face Hub](https://huggingface.co/models).
+
+# Setup
+
+Let's start by installing and importing all the libraries we need. We use KerasNLP for this guide.
+
+
+```python
+!pip install -q --upgrade keras-nlp huggingface-hub kagglehub
+```
+
+
+```python
+import os
+
+os.environ["KERAS_BACKEND"] = "jax"
+
+import keras_nlp
+
+```
+
+# Data
+
+We can use the IMDB reviews dataset for this guide. Let's load the dataset from `tensorflow_datasets`.
+
+
+```python
+import tensorflow_datasets as tfds
+
+imdb_train, imdb_test = tfds.load(
+    "imdb_reviews",
+    split=["train", "test"],
+    as_supervised=True,
+    batch_size=4,
+)
+```
+
+We only use a small subset of the training samples to make the guide run faster.
+However, if you need a higher-quality model, consider using a larger number of training samples.
+
+
+```python
+imdb_train = imdb_train.take(100)
+```
+
+# Task Upload
+
+A `keras_nlp.models.Task` wraps a `keras_nlp.models.Backbone` and a `keras_nlp.models.Preprocessor` to create
+a model that can be directly used for training, fine-tuning, and prediction for a given text problem.
+In this section, we explain how to create a `Task`, fine-tune it, and upload it to a model hub.
+
+---
+## Load Model
+
+If you want to build a Causal LM based on a base model, simply call `keras_nlp.models.CausalLM.from_preset`
+and pass a built-in preset identifier.
+
+
+```python
+causal_lm = keras_nlp.models.CausalLM.from_preset("gpt2_base_en")
+
+```
+
+``` +Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/task.json... + +Downloading from https://www.kaggle.com/api/v1/models/keras/gpt2/keras/gpt2_base_en/2/download/preprocessor.json... + +``` +
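+Optionally, before fine-tuning, you can sanity-check the loaded model by generating a
+short completion. This is a minimal sketch: the prompt below is arbitrary and the
+generated text will vary from run to run.
+
+```python
+# Optional sanity check: generate a short completion with the pre-trained model.
+# The prompt is just an example; generation output is non-deterministic.
+print(causal_lm.generate("I really enjoyed this movie because", max_length=64))
+```
+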
+---
+## Fine-tune Model
+
+After loading the model, you can call `.fit()` to fine-tune it.
+Here, we fine-tune the model on the IMDB reviews, which adapts the model to the movie domain.
+
+
+```python
+# Drop labels and keep the review text only for the Causal LM.
+imdb_train_reviews = imdb_train.map(lambda x, y: x)
+
+# Fine-tune the Causal LM.
+causal_lm.fit(imdb_train_reviews)
+```
+ 100/100 ━━━━━━━━━━━━━━━━━━━━ 151s 1s/step - loss: 1.0198 - sparse_categorical_accuracy: 0.3271
+
+---
+## Save the Model Locally
+
+To upload a model, you first need to save it locally using `save_to_preset`.
+
+
+```python
+preset_dir = "./gpt2_imdb"
+causal_lm.save_to_preset(preset_dir)
+```
+
+Let's see the saved files.
+
+
+```python
+os.listdir(preset_dir)
+```
+
+
+``` +['preprocessor.json', + 'tokenizer.json', + 'task.json', + 'model.weights.h5', + 'config.json', + 'metadata.json', + 'assets'] + +``` +
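+Before uploading, it can be useful to check how much data you are about to push; as the
+upload logs later in this guide show, the weights file for this GPT-2 preset alone is a
+few hundred megabytes. A small sketch using only the standard library:
+
+```python
+import os
+
+# Sum the size of every file under the preset directory and report it in MB.
+total_bytes = sum(
+    os.path.getsize(os.path.join(root, name))
+    for root, _, files in os.walk(preset_dir)
+    for name in files
+)
+print(f"Preset size: {total_bytes / 1e6:.1f} MB")
+```
+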
+### Load a Locally Saved Model
+
+A model that is saved to a local preset can be loaded using `from_preset`.
+What you save in is what you get back out.
+
+
+```python
+causal_lm = keras_nlp.models.CausalLM.from_preset(preset_dir)
+```
+
+You can also load the `keras_nlp.models.Backbone` and `keras_nlp.models.Tokenizer` objects from this preset directory.
+Note that these objects are equivalent to `causal_lm.backbone` and `causal_lm.preprocessor.tokenizer` above.
+
+
+```python
+backbone = keras_nlp.models.Backbone.from_preset(preset_dir)
+tokenizer = keras_nlp.models.Tokenizer.from_preset(preset_dir)
+```
+
+---
+## Upload the Model to a Model Hub
+
+After saving a preset to a directory, this directory can be uploaded to a model hub such as Kaggle or Hugging Face directly from the KerasNLP library.
+To upload the model to Kaggle, the URI must start with `kaggle://`; to upload to Hugging Face, it should start with `hf://`.
+
+### Upload to Kaggle
+
+To upload a model to Kaggle, we first need to authenticate with Kaggle.
+This can be done in one of the following ways:
+1. Set the environment variables `KAGGLE_USERNAME` and `KAGGLE_KEY` (a short sketch of this option appears at the end of this subsection).
+2. Provide a local `~/.kaggle/kaggle.json`.
+3. Call `kagglehub.login()`.
+
+Let's make sure we are logged in before continuing.
+
+
+```python
+import kagglehub
+
+if "KAGGLE_USERNAME" not in os.environ or "KAGGLE_KEY" not in os.environ:
+    kagglehub.login()
+
+```
+
+To upload a model, we can use the `keras_nlp.upload_preset(uri, preset_dir)` API, where `uri` has the format
+`kaggle://<KAGGLE_USERNAME>/<MODEL>/Keras/<VARIATION>` for uploading to Kaggle and `preset_dir` is the directory that the model is saved in.
+
+Running the following uploads the model that is saved in `preset_dir` to Kaggle:
+
+
+```python
+kaggle_username = kagglehub.whoami()["username"]
+kaggle_uri = f"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb"
+keras_nlp.upload_preset(kaggle_uri, preset_dir)
+```
+
+``` +Upload successful: preprocessor.json (834B) +Upload successful: tokenizer.json (322B) +Upload successful: task.json (2KB) +Upload successful: model.weights.h5 (475MB) +Upload successful: config.json (431B) +Upload successful: metadata.json (142B) +Upload successful: merges.txt (446KB) +Upload successful: vocabulary.json (1018KB) + +Your model instance version has been created. + +``` +
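+As referenced in the authentication options above, you can also skip the interactive
+`kagglehub.login()` flow entirely by setting your Kaggle credentials as environment
+variables before running the upload. This is a minimal sketch; the values below are
+placeholders, not real credentials.
+
+```python
+import os
+
+# Option 1 from the list above: provide Kaggle credentials via environment
+# variables. Replace the placeholder strings with your own username and API key.
+os.environ["KAGGLE_USERNAME"] = "your_kaggle_username"
+os.environ["KAGGLE_KEY"] = "your_kaggle_api_key"
+```
+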
+### Upload to Hugging Face
+
+To upload a model to Hugging Face, we first need to authenticate with Hugging Face.
+This can be done in one of the following ways:
+1. Set the environment variables `HF_USERNAME` and `HF_TOKEN`.
+2. Call `huggingface_hub.notebook_login()`.
+
+Let's make sure we are logged in before continuing.
+
+
+```python
+import huggingface_hub
+
+if "HF_USERNAME" not in os.environ or "HF_TOKEN" not in os.environ:
+    huggingface_hub.notebook_login()
+```
+
+`keras_nlp.upload_preset(uri, preset_dir)` can be used to upload a model to Hugging Face if `uri` has the format
+`hf://<HF_USERNAME>/<MODEL>`.
+
+Running the following uploads the model that is saved in `preset_dir` to Hugging Face:
+
+
+```python
+hf_username = huggingface_hub.whoami()["name"]
+hf_uri = f"hf://{hf_username}/gpt2_imdb"
+keras_nlp.upload_preset(hf_uri, preset_dir)
+
+```
+
+---
+## Load a User Uploaded Model
+
+After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`.
+
+```python
+causal_lm = keras_nlp.models.CausalLM.from_preset(
+    f"kaggle://{kaggle_username}/gpt2/keras/gpt2_imdb"
+)
+```
+
+We can also load the model uploaded to Hugging Face by calling `from_preset`.
+
+```python
+causal_lm = keras_nlp.models.CausalLM.from_preset(f"hf://{hf_username}/gpt2_imdb")
+```
+
+# Classifier Upload
+
+Uploading a classifier model is similar to the Causal LM upload.
+To upload the fine-tuned model, it should first be saved to a local directory using the `save_to_preset`
+API, and then it can be uploaded via `keras_nlp.upload_preset`.
+
+
+```python
+# Load the base model.
+classifier = keras_nlp.models.Classifier.from_preset(
+    "bert_tiny_en_uncased", num_classes=2
+)
+
+# Fine-tune the classifier.
+classifier.fit(imdb_train)
+
+# Save the model to a local preset directory.
+preset_dir = "./bert_tiny_imdb"
+classifier.save_to_preset(preset_dir)
+
+# Upload to Kaggle.
+keras_nlp.upload_preset(
+    f"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb", preset_dir
+)
+```
+ 100/100 ━━━━━━━━━━━━━━━━━━━━ 7s 31ms/step - loss: 0.6975 - sparse_categorical_accuracy: 0.5164
+
+
+``` +Upload successful: preprocessor.json (947B) +Upload successful: tokenizer.json (461B) +Upload successful: task.json (2KB) +Upload successful: task.weights.h5 (50MB) +Upload successful: model.weights.h5 (17MB) +Upload successful: config.json (454B) +Upload successful: metadata.json (140B) +Upload successful: vocabulary.txt (226KB) + +Your model instance version has been created. +``` +
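+The same preset directory can also be pushed to Hugging Face, mirroring the Causal LM
+flow above. This is a minimal sketch that assumes you are already authenticated with
+Hugging Face; `bert_tiny_imdb` is just an example repository name.
+
+```python
+# Optionally, upload the same classifier preset to Hugging Face as well.
+keras_nlp.upload_preset(f"hf://{hf_username}/bert_tiny_imdb", preset_dir)
+```
+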
+After verifying that the model is uploaded to Kaggle, we can load the model by calling `from_preset`.
+
+```python
+classifier = keras_nlp.models.Classifier.from_preset(
+    f"kaggle://{kaggle_username}/bert/keras/bert_tiny_imdb"
+)
+```
\ No newline at end of file
diff --git a/templates/keras_3/keras_3_announcement.md b/templates/keras_3/keras_3_announcement.md
index b57d985d75..2be864b4a7 100644
--- a/templates/keras_3/keras_3_announcement.md
+++ b/templates/keras_3/keras_3_announcement.md
@@ -171,7 +171,7 @@ you can start using today with Keras 3.
 All 40 Keras Applications models (the `keras.applications` namespace)
 are available in all backends.
 The vast array of pretrained models in [KerasCV](https://keras.io/api/keras_cv/)
-and [KerasNLP](https://keras.io/api/keras_nlp/) also work with all backends. This includes:
+and [KerasHub](https://keras.io/api/keras_hub/) also work with all backends. This includes:
 
 - BERT
 - OPT
diff --git a/templates/keras_cv/index.md b/templates/keras_cv/index.md
index 9b6287c348..1c3615f649 100644
--- a/templates/keras_cv/index.md
+++ b/templates/keras_cv/index.md
@@ -3,20 +3,20 @@
 Star
 
 KerasCV is a library of modular computer vision components that work natively
-with TensorFlow, JAX, or PyTorch. Built on Keras 3, these models, layers,
-metrics, callbacks, etc., can be trained and serialized in any framework and
+with TensorFlow, JAX, or PyTorch. Built on Keras 3, these models, layers, 
+metrics, callbacks, etc., can be trained and serialized in any framework and 
 re-used in another without costly migrations.
 
-KerasCV can be understood as a horizontal extension of the Keras API: the
-components are new first-party Keras objects that are too specialized to be
-added to core Keras. They receive the same level of polish and backwards
-compatibility guarantees as the core Keras API, and they are maintained by the
+KerasCV can be understood as a horizontal extension of the Keras API: the 
+components are new first-party Keras objects that are too specialized to be 
+added to core Keras. They receive the same level of polish and backwards 
+compatibility guarantees as the core Keras API, and they are maintained by the 
 Keras team.
 
-Our APIs assist in common computer vision tasks such as data augmentation,
+Our APIs assist in common computer vision tasks such as data augmentation, 
 classification, object detection, segmentation, image generation, and more.
-Applied computer vision engineers can leverage KerasCV to quickly assemble
-production-grade, state-of-the-art training and inference pipelines for all of
+Applied computer vision engineers can leverage KerasCV to quickly assemble 
+production-grade, state-of-the-art training and inference pipelines for all of 
 these common tasks.
 
diff --git a/templates/keras_hub/index.md b/templates/keras_hub/index.md
new file mode 100644
index 0000000000..042d796c27
--- /dev/null
+++ b/templates/keras_hub/index.md
@@ -0,0 +1,136 @@
+# KerasHub
+
+Star
+
+**KerasHub** is a pretrained modeling library that aims to be simple, flexible,
+and fast. The library provides [Keras 3](https://keras.io/keras_3/)
+implementations of popular model architectures, paired with a collection of
+pretrained checkpoints available on [Kaggle Models](https://kaggle.com/models/).
+Models can be used for both training and inference on any of the TensorFlow,
+JAX, and PyTorch backends.
+
+KerasHub is an extension of the core Keras API; KerasHub components are provided
+as [`Layers`](/api/layers/) and [`Models`](/api/models/).
+If you are familiar with Keras, congratulations! You already understand most of
+KerasHub.
+
+See our [Getting Started guide](/guides/keras_hub/getting_started)
+to start learning our API. We welcome
+[contributions](https://github.com/keras-team/keras-hub/issues/1835).
+
+---
+## Quick links
+
+* [KerasHub API reference](/api/keras_hub/)
+* [KerasHub on GitHub](https://github.com/keras-team/keras-hub)
+* [KerasHub models on Kaggle](https://www.kaggle.com/organizations/keras/models)
+* [List of available pretrained models](/api/keras_hub/models/)
+
+## Guides
+
+* [Getting Started with KerasHub](/guides/keras_hub/getting_started/)
+* [Uploading Models with KerasHub](/guides/keras_hub/upload/)
+
+---
+## Installation
+
+To install the latest KerasHub release with Keras 3, simply run:
+
+```
+pip install --upgrade keras-hub
+```
+
+To install the latest nightly changes for both KerasHub and Keras, you can use
+our nightly package.
+
+```
+pip install --upgrade keras-hub-nightly
+```
+
+Note that currently, installing KerasHub will always pull in TensorFlow for use
+of the `tf.data` API for preprocessing. Even when preprocessing with `tf.data`,
+training can still happen on any backend.
+
+Read [Getting started with Keras](https://keras.io/getting_started/) for more
+information on installing Keras 3 and compatibility with different frameworks.
+
+**Note:** We recommend using KerasHub with TensorFlow 2.16 or later, as TF 2.16
+packages Keras 3 by default.
+
+---
+## Quickstart
+
+Below is a quick example using ResNet to predict an image, and BERT to train a
+classifier:
+
+```python
+import os
+os.environ["KERAS_BACKEND"] = "jax"  # Or "tensorflow" or "torch"!
+
+import keras
+import keras_hub
+import numpy as np
+import tensorflow_datasets as tfds
+
+# Load a ResNet model.
+classifier = keras_hub.models.ImageClassifier.from_preset(
+    "resnet_50_imagenet",
+    activation="softmax",
+)
+# Predict a label for a single image.
+image_url = "https://upload.wikimedia.org/wikipedia/commons/a/aa/California_quail.jpg"
+image_path = keras.utils.get_file(origin=image_url)
+image = keras.utils.load_img(image_path)
+batch = np.array([image])
+preds = classifier.predict(batch)
+print(keras_hub.utils.decode_imagenet_predictions(preds))
+
+# Load a BERT model.
+classifier = keras_hub.models.BertClassifier.from_preset(
+    "bert_base_en_uncased",
+    activation="softmax",
+    num_classes=2,
+)
+
+# Fine-tune on IMDb movie reviews.
+imdb_train, imdb_test = tfds.load(
+    "imdb_reviews",
+    split=["train", "test"],
+    as_supervised=True,
+    batch_size=16,
+)
+classifier.fit(imdb_train, validation_data=imdb_test)
+# Predict two new examples.
+preds = classifier.predict(
+    ["What an amazing movie!", "A total waste of my time."]
+)
+print(preds)
+```
+
+---
+## Compatibility
+
+We follow [Semantic Versioning](https://semver.org/), and plan to
+provide backwards compatibility guarantees both for code and saved models built
+with our components. While we continue with pre-release `0.y.z` development, we
+may break compatibility at any time and APIs should not be considered stable.
+
+## Disclaimer
+
+KerasHub provides access to pre-trained models via the `keras_hub.models` API.
+These pre-trained models are provided on an "as is" basis, without warranties
+or conditions of any kind.
+
+## Citing KerasHub
+
+If KerasHub helps your research, we appreciate your citations.
+Here is the BibTeX entry:
+
+```bibtex
+@misc{kerashub2022,
+  title={KerasHub},
+  author={Watson, Matthew and Qian, Chen and Bischof, Jonathan and Chollet,
+  Fran\c{c}ois and others},
+  year={2022},
+  howpublished={\url{https://github.com/keras-team/keras-hub}},
+}
+```