[Eval] feat: add ppl eval and setup.py for ppl.pmx #94

Open · wants to merge 10 commits into `master`
9 changes: 9 additions & 0 deletions README.md
@@ -18,6 +18,15 @@ About add new operator: [Link](docs/AddNewOp.md)

About update an operator's version: [Link](docs/UpdateOp.md)

# Install OPMX

You can install OPMX in editable mode with:

```bash
pip install -e .
```

Then you can use OPMX as a Python API.

# Use OPMX Python API

OPMX provides a functional API based on `torch.autograd.Function`.
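
For orientation, below is a minimal, generic sketch of the `torch.autograd.Function` pattern that such a functional API builds on. The `ScaleOp`/`scale` names are purely illustrative and are not OPMX operators; refer to the operator docs linked above for the real API.

```python
import torch

class ScaleOp(torch.autograd.Function):
    # Illustrative only: a trivial custom op in the torch.autograd.Function style.
    @staticmethod
    def forward(ctx, x: torch.Tensor, scale: float) -> torch.Tensor:
        ctx.scale = scale  # stash what backward needs
        return x * scale

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        # gradient w.r.t. x; `scale` is a plain float, so it receives no gradient
        return grad_output * ctx.scale, None


def scale(x: torch.Tensor, s: float) -> torch.Tensor:
    # functional wrapper around the Function, the calling style a functional API exposes
    return ScaleOp.apply(x, s)


if __name__ == "__main__":
    x = torch.randn(4, requires_grad=True)
    scale(x, 2.0).sum().backward()
    print(x.grad)  # -> tensor([2., 2., 2., 2.])
```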
83 changes: 83 additions & 0 deletions eval/eval_ppl.py
@@ -0,0 +1,83 @@
import torch
import torch.nn as nn
from tqdm import tqdm
from datasets import load_dataset


def evaluate_perplexity(generator, tokenizer):
    def _perplexity(nlls, n_samples, seqlen):
        # perplexity over all evaluated windows
        return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen))

    # load and prepare dataset
    data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    data = tokenizer("\n\n".join(data["text"]), return_tensors="pt")
    data = data.input_ids.to("cuda")

    seqlen = 2048
    n_samples = data.numel() // seqlen

    nlls = []

    with tqdm(range(n_samples), desc="Perplexity -") as progress_bar:
        for i in progress_bar:
            start_index = i * seqlen
            end_index = (i + 1) * seqlen
            batch = data[:, start_index:end_index].to("cuda")
            batch = batch[0]
            with torch.no_grad():
                # logit_forward args
                ## attn_mask
                attn_mask = torch.empty(0, dtype=torch.float16)
                ## seqstarts
                seqstarts = torch.zeros(2, dtype=torch.int64)
                token_len = len(batch)
                seqstarts[1:] = torch.tensor(token_len, dtype=torch.int64)
                seqstarts = seqstarts.cumsum(0).cuda()
                ## kvstarts
                kvstarts = torch.zeros(2, dtype=torch.int64)
                kvlens = [token_len]
                kvstarts[1:] = torch.tensor(kvlens, dtype=torch.int64)
                kvstarts = kvstarts.cumsum(0).cuda()
                ## cachestarts
                cachestarts = torch.tensor([0], dtype=torch.int64).cuda()
                ## decoding_batches
                decoding_batches = torch.tensor([0])
                ## start_pos
                start_pos = torch.tensor([0], dtype=torch.int64).cuda()
                ## max_seqlen
                max_seqlen = torch.tensor([token_len])
                ## max_kvlen
                max_kvlen = torch.tensor([token_len])
                ## kv_cache
                total_cache_len = token_len + seqlen
                num_layers = generator.model.params.num_layers
                num_local_kv_heads = generator.model.params.num_kv_heads
                cache_prefix_shape = (total_cache_len, num_layers, 2, num_local_kv_heads)
                head_dim = generator.model.params.hidden_dim // generator.model.params.num_heads
                scale_head_dim = head_dim // generator.model.params.cache_quant_group
                kv_cache = torch.zeros(cache_prefix_shape + (head_dim,), dtype=torch.float16).cuda()
                kv_scale = torch.zeros(cache_prefix_shape + (scale_head_dim,), dtype=torch.float16).cuda()

                # print(attn_mask, seqstarts, kvstarts, cachestarts, decoding_batches, start_pos, max_seqlen, max_kvlen)
                logits = generator.model.logit_forward(batch, attn_mask, seqstarts, kvstarts,
                                                       cachestarts, decoding_batches, start_pos,
                                                       max_seqlen, max_kvlen, kv_cache, kv_scale)

            shift_logits = logits[:-1, :].contiguous().float()
            shift_labels = data[:, start_index:end_index][:, 1:]

            loss_fct = nn.CrossEntropyLoss(reduction='sum')
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )
            neg_log_likelihood = loss.float()
            nlls.append(neg_log_likelihood)

            curr_ppl = _perplexity(nlls, i + 1, seqlen)
            progress_bar.set_description(f"Perplexity {curr_ppl:.3f}")

    ppl = _perplexity(nlls, n_samples, seqlen)

    return ppl.item()
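
For reference, the quantity `_perplexity` computes above is the standard fixed-window perplexity, with $N$ = `n_samples`, $L$ = `seqlen`, and $\mathrm{NLL}_i$ the summed token cross-entropy of window $i$:

$$
\mathrm{PPL} = \exp\left(\frac{1}{N \cdot L}\sum_{i=1}^{N} \mathrm{NLL}_i\right)
$$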
47 changes: 47 additions & 0 deletions model_zoo/llama/modeling/dynamic_batching/Model.py
@@ -383,6 +383,53 @@ def forward(self, tokens: torch.Tensor, attn_mask: Optional[torch.Tensor],
        return output


    @torch.inference_mode()
    def logit_forward(self, tokens: torch.Tensor, attn_mask: Optional[torch.Tensor],
                      seqstarts: torch.Tensor, kvstarts: torch.Tensor,
                      cachestarts: torch.Tensor, decoding_batches: torch.Tensor,
                      start_pos: torch.Tensor, max_seqlen: torch.Tensor, max_kvlen: torch.Tensor,
                      kv_cache: torch.Tensor, kv_scale: torch.Tensor = None):
        h = self.tok_embeddings(tokens)
        # TensorDumper.dump(h, "emb_out")

        _kv_scale = kv_scale
        TensorDumper.dump(tokens, "token_ids")
        if attn_mask is not None:
            TensorDumper.dump(attn_mask, "attn_mask")
        if self.fused_kvcache and attn_mask is not None:
            if kv_scale is None: # mount an empty scale for friendly exporting
                _kv_scale = torch.empty(0, dtype=h.dtype)
        TensorDumper.dump(seqstarts, "seqstarts")
        TensorDumper.dump(kvstarts, "kvstarts")
        TensorDumper.dump(cachestarts, "cachestarts")
        TensorDumper.dump(decoding_batches, "decoding_batches")
        TensorDumper.dump(start_pos, "start_pos")
        TensorDumper.dump(max_seqlen, "max_seqlen")
        TensorDumper.dump(max_kvlen, "max_kvlen")
        TensorDumper.dump(kv_cache, "kv_cache")
        if kv_scale is not None:
            TensorDumper.dump(kv_scale, "kv_scale")

        if self.with_alibi and not self.fused_alibi:
            attn_mask = OPMX.dynamic_batching.alibi_mask(seqstarts, kvstarts, attn_mask, self.params.num_heads, h.dtype)
            # TensorDumper.dump(attn_mask, "alibi_mask")

        norm = None
        for layer in self.layers:
            h, norm = layer(h, norm, attn_mask, seqstarts, kvstarts, cachestarts,
                            decoding_batches, start_pos, max_seqlen, max_kvlen,
                            kv_cache, _kv_scale)

        h, norm = self.norm(h, norm)
        # TensorDumper.dump(h, "last_rms_norm")
        output = self.output(h) # compute logits for every position (no last-token gather, needed for perplexity)
        # TensorDumper.dump(output, "logits_before_cast")
        output = output.float()
        TensorDumper.dump(output, "logits")
        return output


    @torch.no_grad()
    def load_state_dict(self, state_dict: Mapping[str, Any]):
        loaded_params = set()
38 changes: 38 additions & 0 deletions model_zoo/llama/modeling/static_batching/Model.py
@@ -342,6 +342,44 @@ def forward(self, tokens: torch.Tensor, attn_mask: Optional[torch.Tensor],
        TensorDumper.dump(output, "logits")
        return output


    @torch.inference_mode()
    def logit_forward(self, tokens: torch.Tensor, attn_mask: Optional[torch.Tensor],
                      start_pos: torch.Tensor, kv_cache: torch.Tensor, kv_scale: torch.Tensor = None):
        h = self.tok_embeddings(tokens)
        # TensorDumper.dump(h, "emb_out")

        _kv_scale = kv_scale
        TensorDumper.dump(tokens, "token_ids")
        if attn_mask is not None:
            TensorDumper.dump(attn_mask, "attn_mask")
        if self.fused_kvcache and attn_mask is not None:
            if kv_scale is None: # mount an empty scale for friendly exporting
                _kv_scale = torch.empty(0, dtype=h.dtype)
        TensorDumper.dump(start_pos, "start_pos")
        TensorDumper.dump(kv_cache, "kv_cache")
        if kv_scale is not None:
            TensorDumper.dump(kv_scale, "kv_scale")

        if self.with_alibi and not self.fused_alibi:
            attn_mask = OPMX.alibi_mask(
                torch.tensor(tokens.shape[1], dtype=torch.int64),
                torch.tensor(tokens.shape[1], dtype=torch.int64) + start_pos, attn_mask, self.params.num_heads, h.dtype)
            # TensorDumper.dump(attn_mask, "alibi_mask")

        norm = None
        for layer in self.layers:
            h, norm = layer(h, norm, attn_mask, start_pos, kv_cache, _kv_scale)

        h, norm = self.norm(h, norm)
        # TensorDumper.dump(h, "last_rms_norm")
        output = self.output(h) # compute logits for every position (no last-token gather, needed for perplexity)
        # TensorDumper.dump(output, "logits_before_cast")
        output = output.float()
        TensorDumper.dump(output, "logits")
        return output


    @torch.no_grad()
    def load_state_dict(self, state_dict: Mapping[str, Any]):
        loaded_params = set()
76 changes: 76 additions & 0 deletions model_zoo/llama3/huggingface/Eval.py
@@ -0,0 +1,76 @@
import fire
import sys
import os
import json

from pathlib import Path
from typing import List

sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../..")

import model_zoo.llama.modeling.Loader as Loader
from eval.eval_ppl import evaluate_perplexity
from transformers import AutoTokenizer
from ModelParams import ModelParams

def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.0,
    top_p: float = 0.95,
    batch: int = 4,
    seqlen_scale_up: int = 1,
    unaligned_batch: bool = False,
    max_gen_len: int = 256,
    friendly_gqa: bool = False, # done gqa by repeating key and value by key_value_cache op
    fused_qkv: bool = True, # fuse qkv linear
    fused_kvcache: bool = True, # fuse key_value_cache and multi_head_attention
    fused_ffn_glu: bool = True, # fuse feed forward gate linear unit
    auto_causal: bool = True, # causal mask is auto done by attention op, no need to pass additional mask to the model
    quantized_cache: bool = True, # 8bit kv cache quantization
    cache_layout: int = 0, # change kv cache layout for hardware performance friendly
    cache_mode: int = 0, # change kv cache indexing mode for memory management friendly, only affected when dynamic_batching == True
    dynamic_batching: bool = True, # use dynamic batching scheduling
    context_chunking: bool = True, # enable context chunking for dynamic batching
    dump_tensor_path: str = None,
    dump_steps: List[int] = []
):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    with open(Path(ckpt_dir) / "opmx_params.json", "r") as f:
        params = json.loads(f.read())
    params: ModelParams = ModelParams(**params)

    generator = Loader.load(
        ckpt_dir, params,
        friendly_gqa=friendly_gqa,
        fused_qkv=fused_qkv,
        fused_kvcache=fused_kvcache,
        fused_ffn_glu=fused_ffn_glu,
        fused_alibi=False,
        auto_causal=auto_causal,
        with_rope=True,
        with_alibi=False,
        quantized_cache=quantized_cache,
        cache_layout=cache_layout,
        cache_mode=cache_mode,
        dynamic_batching=dynamic_batching,
        attn_wqkv_bias_term=False,
        attn_wo_bias_term=False,
        ffn_linear_bias_term=False,
        load_to_cpu=False,
        rotary_dim=0,
        dump_tensor_path=dump_tensor_path,
        dump_steps=dump_steps
    )

    generator.context_chunking = context_chunking if dynamic_batching else False

    ppl = evaluate_perplexity(generator, tokenizer)

    print("model eval ppl is : ", ppl)


if __name__ == "__main__":
    fire.Fire(main)
10 changes: 10 additions & 0 deletions model_zoo/llama3_woqu/README.md
@@ -46,6 +46,16 @@ OMP_NUM_THREADS=1 torchrun --nproc_per_node $num_gpu huggingface/Demo.py --ckpt_
- `OMP_NUM_THREADS`: Sets the number of OpenMP threads. Each PyTorch process opens its own OpenMP thread pool, so this is set to 1 to avoid occupying too many CPU cores.
- `--nproc_per_node`: Specifies the number of model slices per node.

## Eval Model
`Eval.py` runs a perplexity (PPL) evaluation of the LLaMA 3 model on WikiText-2; a sketch of the expected console output follows the flag notes below.

```bash
OMP_NUM_THREADS=1 torchrun --nproc_per_node $num_gpu huggingface/Eval.py --ckpt_dir <convert_dir> --tokenizer_path <llama_tokenizer_dir> --fused_qkv 1 --fused_kvcache 1 --auto_causal 1 --quantized_cache 1 --dynamic_batching 1 --quant_data_type "int4" --quant_method "weight_only" --quant_axis 1 --group_size 128 --storage_bits 32
```

- `OMP_NUM_THREADS`: Sets the number of OpenMP threads. Each PyTorch process opens its own OpenMP thread pool, so this is set to 1 to avoid occupying too many CPU cores.
- `--nproc_per_node`: Specifies the number of model slices per node.
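
When the run finishes, the script reports a running perplexity in the progress bar and prints the final value. The sketch below shows the shape of the expected console output, with placeholders instead of real numbers, since the values depend on the checkpoint and quantization settings:

```
Perplexity <running_ppl>: 100%|██████████| <n_samples>/<n_samples>
model eval ppl is :  <final_ppl>
```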

## Exporting Model

To export a model, you will use the `Export.py` script provided. Here's an example command for exporting a 13B model with 1 GPU:
21 changes: 17 additions & 4 deletions model_zoo/llama3_woqu/huggingface/ConvertWeightToOpmx.py
@@ -55,6 +55,17 @@ def write_json(text, path):
        json.dump(text, f)


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def write_pmx_model(model_path, input_base_path, model_type, quant, group_size, n_bits, has_zeropoint, storage_bits):
    os.makedirs(model_path, exist_ok=True)
    print ("Loading the checkpoint in a HF model")
@@ -272,9 +283,10 @@ def main():
help="Input model type",
)
parser.add_argument(
"--quant",
default=False,
help="Enable quantization for the model. Set to True to quantize the model weights.",
"--quant",
type=str2bool,
default=False,
help="Enable quantization for the model. Set to True to quantize the model weights.",
)
parser.add_argument(
"--group_size",
Expand All @@ -288,7 +300,8 @@ def main():
)
parser.add_argument(
"--has_zeropoint",
default=False,
type=str2bool,
default=True,
help="Include zero-point in quantization. Set to True to use zero-point quantization.",
)
parser.add_argument(