
Commit 32b7ddc

rope theta + nits
1 parent 3afbe13 commit 32b7ddc

5 files changed: +59 -250 lines

example_chat_completion.py (+8 -12)
@@ -18,19 +18,15 @@ def main(
     max_gen_len: Optional[int] = None,
 ):
     """
-    Entry point of the program for generating text using a pretrained model.
+    Examples to run with the models finetuned for chat. Prompts correspond of chat
+    turns between the user and assistant with the final one always being the user.
 
-    Args:
-        ckpt_dir (str): The directory containing checkpoint files for the pretrained model.
-        tokenizer_path (str): The path to the tokenizer model used for text encoding/decoding.
-        temperature (float, optional): The temperature value for controlling randomness in generation.
-            Defaults to 0.6.
-        top_p (float, optional): The top-p sampling parameter for controlling diversity in generation.
-            Defaults to 0.9.
-        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 512.
-        max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 8.
-        max_gen_len (int, optional): The maximum length of generated sequences. If None, it will be
-            set to the model's max sequence length. Defaults to None.
+    An optional system prompt at the beginning to control how the model should respond
+    is also supported.
+
+    The context window of llama3 models is 8192 tokens, so `max_seq_len` needs to be <= 8192.
+
+    `max_gen_len` is optional because finetuned models are able to stop generations naturally.
     """
     generator = Llama.build(
         ckpt_dir=ckpt_dir,
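
For orientation, here is a minimal sketch of what the updated docstring describes: a dialog with an optional leading system turn, always ending on a user turn, passed through Llama.build and chat_completion. The checkpoint and tokenizer paths are placeholders, and the exact Dialog/Message shape is assumed from the repository's public API.

# Minimal usage sketch (paths are placeholders; API assumed from this repo).
from llama import Llama

generator = Llama.build(
    ckpt_dir="path/to/Meta-Llama-3-8B-Instruct/",   # placeholder checkpoint dir
    tokenizer_path="path/to/tokenizer.model",       # placeholder tokenizer path
    max_seq_len=8192,   # must stay <= the 8192-token context window
    max_batch_size=4,
)

dialogs = [
    [
        {"role": "system", "content": "Always answer in one sentence."},   # optional system prompt
        {"role": "user", "content": "What is the capital of France?"},     # final turn is the user's
    ]
]

# max_gen_len is omitted: chat-finetuned models stop generation naturally.
results = generator.chat_completion(dialogs, temperature=0.6, top_p=0.9)
for result in results:
    print(result["generation"]["content"])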

example_text_completion.py (+4 -11)
@@ -18,18 +18,11 @@ def main(
     max_batch_size: int = 4,
 ):
     """
-    Entry point of the program for generating text using a pretrained model.
+    Examples to run with the pre-trained models (no fine-tuning). Prompts are
+    usually in the form of an incomplete text prefix that the model can then try to complete.
 
-    Args:
-        ckpt_dir (str): The directory containing checkpoint files for the pretrained model.
-        tokenizer_path (str): The path to the tokenizer model used for text encoding/decoding.
-        temperature (float, optional): The temperature value for controlling randomness in generation.
-            Defaults to 0.6.
-        top_p (float, optional): The top-p sampling parameter for controlling diversity in generation.
-            Defaults to 0.9.
-        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 128.
-        max_gen_len (int, optional): The maximum length of generated sequences. Defaults to 64.
-        max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 4.
+    The context window of llama3 models is 8192 tokens, so `max_seq_len` needs to be <= 8192.
+    `max_gen_len` is needed because pre-trained models usually do not stop completions naturally.
     """
     generator = Llama.build(
         ckpt_dir=ckpt_dir,
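
And the corresponding prefix-completion usage the text-completion docstring describes, again as a hedged sketch: paths are placeholders, the API is assumed from the repository, and the prompts are just illustrative prefixes.

# Minimal usage sketch (paths are placeholders; API assumed from this repo).
from llama import Llama

generator = Llama.build(
    ckpt_dir="path/to/Meta-Llama-3-8B/",        # placeholder base-model checkpoint dir
    tokenizer_path="path/to/tokenizer.model",   # placeholder tokenizer path
    max_seq_len=128,
    max_batch_size=4,
)

# Prompts are incomplete text prefixes for the base model to continue.
prompts = [
    "I believe the meaning of life is",
    "Translate English to French:\n\n  sea otter => loutre de mer\n  cheese =>",
]

# max_gen_len is required: pre-trained models usually do not stop on their own.
results = generator.text_completion(prompts, max_gen_len=64, temperature=0.6, top_p=0.9)
for prompt, result in zip(prompts, results):
    print(prompt + result["generation"])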

llama/generation.py (+9 -14)
@@ -17,7 +17,7 @@
 )
 
 from llama.model import ModelArgs, Transformer
-from llama.tokenizer import Dialog, Message, ChatFormat, Tokenizer
+from llama.tokenizer import ChatFormat, Dialog, Message, Tokenizer
 
 
 class CompletionPrediction(TypedDict, total=False):
@@ -43,7 +43,7 @@ def build(
         seed: int = 1,
     ) -> "Llama":
         """
-        Build a Llama instance by initializing and loading a pre-trained model.
+        Build a Llama instance by initializing and loading a model checkpoint.
 
         Args:
             ckpt_dir (str): Path to the directory containing checkpoint files.
@@ -63,7 +63,6 @@ def build(
         Note:
            This method initializes the distributed process group, sets the device to CUDA,
            and loads the pre-trained model and tokenizer.
-
         """
         if not torch.distributed.is_initialized():
             torch.distributed.init_process_group("nccl")
@@ -99,7 +98,10 @@ def build(
         )
         tokenizer = Tokenizer(model_path=tokenizer_path)
         assert model_args.vocab_size == tokenizer.n_words
-        torch.set_default_tensor_type(torch.cuda.HalfTensor)
+        if torch.cuda.is_bf16_supported():
+            torch.set_default_tensor_type(torch.cuda.BFloat16Tensor)
+        else:
+            torch.set_default_tensor_type(torch.cuda.HalfTensor)
         model = Transformer(model_args)
         model.load_state_dict(checkpoint, strict=False)
         print(f"Loaded in {time.time() - start_time:.2f} seconds")
@@ -212,8 +214,8 @@ def generate(
             for stop_token in self.tokenizer.stop_tokens:
                 try:
                     eos_idx = toks.index(stop_token)
-                    toks = toks[: eos_idx]
-                    probs = probs[: eos_idx] if logprobs else None
+                    toks = toks[:eos_idx]
+                    probs = probs[:eos_idx] if logprobs else None
                 except ValueError:
                     pass
             out_tokens.append(toks)
@@ -293,22 +295,16 @@ def chat_completion(
         Returns:
             List[ChatPrediction]: List of chat predictions, each containing the assistant's generated response.
 
-        Raises:
-            AssertionError: If the last message in a dialog is not from the user.
-            AssertionError: If the dialog roles are not in the required 'user', 'assistant', and optional 'system' order.
-
         Note:
             This method generates assistant responses for the provided conversational dialogs.
             It employs nucleus sampling to introduce controlled randomness in text generation.
             If logprobs is True, token log probabilities are computed for each generated token.
-
         """
         if max_gen_len is None:
             max_gen_len = self.model.params.max_seq_len - 1
 
         prompt_tokens = [
-            self.formatter.encode_dialog_prompt(dialog)
-            for dialog in dialogs
+            self.formatter.encode_dialog_prompt(dialog) for dialog in dialogs
         ]
         generation_tokens, generation_logprobs = self.generate(
             prompt_tokens=prompt_tokens,
@@ -354,7 +350,6 @@ def sample_top_p(probs, p):
     Note:
         Top-p sampling selects the smallest set of tokens whose cumulative probability mass
         exceeds the threshold p. The distribution is renormalized based on the selected tokens.
-
     """
     probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
     probs_sum = torch.cumsum(probs_sort, dim=-1)
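
The note in that last hunk fully determines the algorithm. For reference, a minimal sketch of how the rest of sample_top_p can be completed from that description (illustrative; the in-repo implementation may differ in detail):

import torch

def sample_top_p(probs, p):
    # Sort token probabilities in descending order and take the running sum.
    probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
    probs_sum = torch.cumsum(probs_sort, dim=-1)
    # Drop tokens once the cumulative mass *before* them already exceeds p,
    # i.e. keep the smallest prefix whose mass passes the threshold.
    mask = probs_sum - probs_sort > p
    probs_sort[mask] = 0.0
    # Renormalize over the surviving tokens and sample one index per row.
    probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True))
    next_token = torch.multinomial(probs_sort, num_samples=1)
    # Map the sampled position back to the original vocabulary index.
    next_token = torch.gather(probs_idx, -1, next_token)
    return next_token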
