[Eval] feat: add ppl eval and setup.py for ppl.pmx #94

Open · wants to merge 10 commits into `master`
9 changes: 9 additions & 0 deletions README.md
@@ -18,6 +18,15 @@ About add new operator: [Link](docs/AddNewOp.md)

About update an operator's version: [Link](docs/UpdateOp.md)

# Install OPMX

You can install OPMX in editable mode with:

```bash
pip install -e .
```

Then you can use OPMX as a Python API.

# Use OPMX Python API

OPMX provides a functional API based on `torch.autograd.Function`.
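
For orientation, below is a minimal, generic sketch of the `torch.autograd.Function` pattern that such a functional API builds on. The `ScaleOp`/`scale` names are purely illustrative and are not OPMX operators; refer to the operator docs linked above for the real API.

```python
import torch

class ScaleOp(torch.autograd.Function):
    # Illustrative only: a trivial custom op in the torch.autograd.Function style.
    @staticmethod
    def forward(ctx, x: torch.Tensor, scale: float) -> torch.Tensor:
        ctx.scale = scale  # stash what backward needs
        return x * scale

    @staticmethod
    def backward(ctx, grad_output: torch.Tensor):
        # gradient w.r.t. x; `scale` is a plain float, so it receives no gradient
        return grad_output * ctx.scale, None


def scale(x: torch.Tensor, s: float) -> torch.Tensor:
    # functional wrapper around the Function, the calling style a functional API exposes
    return ScaleOp.apply(x, s)


if __name__ == "__main__":
    x = torch.randn(4, requires_grad=True)
    scale(x, 2.0).sum().backward()
    print(x.grad)  # -> tensor([2., 2., 2., 2.])
```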
83 changes: 83 additions & 0 deletions eval/eval_ppl.py
@@ -0,0 +1,83 @@
import torch
import torch.nn as nn
from tqdm import tqdm
from datasets import load_dataset


def evaluate_perplexity(generator, tokenizer):
    def _perplexity(nlls, n_samples, seqlen):
        # perplexity over all evaluated windows
        return torch.exp(torch.stack(nlls).sum() / (n_samples * seqlen))

    # load and prepare dataset
    data = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
    data = tokenizer("\n\n".join(data["text"]), return_tensors="pt")
    data = data.input_ids.to("cuda")

    seqlen = 2048
    n_samples = data.numel() // seqlen

    nlls = []

    with tqdm(range(n_samples), desc="Perplexity -") as progress_bar:
        for i in progress_bar:
            start_index = i * seqlen
            end_index = (i + 1) * seqlen
            batch = data[:, start_index:end_index].to("cuda")
            batch = batch[0]
            with torch.no_grad():
                # logit_forward args
                ## attn_mask
                attn_mask = torch.empty(0, dtype=torch.float16)
                ## seqstarts
                seqstarts = torch.zeros(2, dtype=torch.int64)
                token_len = len(batch)
                seqstarts[1:] = torch.tensor(token_len, dtype=torch.int64)
                seqstarts = seqstarts.cumsum(0).cuda()
                ## kvstarts
                kvstarts = torch.zeros(2, dtype=torch.int64)
                kvlens = [token_len]
                kvstarts[1:] = torch.tensor(kvlens, dtype=torch.int64)
                kvstarts = kvstarts.cumsum(0).cuda()
                ## cachestarts
                cachestarts = torch.tensor([0], dtype=torch.int64).cuda()
                ## decoding_batches
                decoding_batches = torch.tensor([0])
                ## start_pos
                start_pos = torch.tensor([0], dtype=torch.int64).cuda()
                ## max_seqlen
                max_seqlen = torch.tensor([token_len])
                ## max_kvlen
                max_kvlen = torch.tensor([token_len])
                ## kv_cache
                total_cache_len = token_len + seqlen
                num_layers = generator.model.params.num_layers
                num_local_kv_heads = generator.model.params.num_kv_heads
                cache_prefix_shape = (total_cache_len, num_layers, 2, num_local_kv_heads)
                head_dim = generator.model.params.hidden_dim // generator.model.params.num_heads
                scale_head_dim = head_dim // generator.model.params.cache_quant_group
                kv_cache = torch.zeros(cache_prefix_shape + (head_dim,), dtype=torch.float16).cuda()
                kv_scale = torch.zeros(cache_prefix_shape + (scale_head_dim,), dtype=torch.float16).cuda()

                # print(attn_mask, seqstarts, kvstarts, cachestarts, decoding_batches, start_pos, max_seqlen, max_kvlen)
                logits = generator.model.logit_forward(batch, attn_mask, seqstarts, kvstarts,
                                                       cachestarts, decoding_batches, start_pos,
                                                       max_seqlen, max_kvlen, kv_cache, kv_scale)

            shift_logits = logits[:-1, :].contiguous().float()
            shift_labels = data[:, start_index:end_index][:, 1:]

            loss_fct = nn.CrossEntropyLoss(reduction='sum')
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1)
            )
            neg_log_likelihood = loss.float()
            nlls.append(neg_log_likelihood)

            curr_ppl = _perplexity(nlls, i + 1, seqlen)
            progress_bar.set_description(f"Perplexity {curr_ppl:.3f}")

    ppl = _perplexity(nlls, n_samples, seqlen)

    return ppl.item()
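
For reference, the quantity `_perplexity` computes above is the standard fixed-window perplexity, with $N$ = `n_samples`, $L$ = `seqlen`, and $\mathrm{NLL}_i$ the summed token cross-entropy of window $i$:

$$
\mathrm{PPL} = \exp\left(\frac{1}{N \cdot L}\sum_{i=1}^{N} \mathrm{NLL}_i\right)
$$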
47 changes: 47 additions & 0 deletions model_zoo/llama/modeling/dynamic_batching/Model.py
@@ -383,6 +383,53 @@ def forward(self, tokens: torch.Tensor, attn_mask: Optional[torch.Tensor],
        return output


    @torch.inference_mode()
    def logit_forward(self, tokens: torch.Tensor, attn_mask: Optional[torch.Tensor],
                      seqstarts: torch.Tensor, kvstarts: torch.Tensor,
                      cachestarts: torch.Tensor, decoding_batches: torch.Tensor,
                      start_pos: torch.Tensor, max_seqlen: torch.Tensor, max_kvlen: torch.Tensor,
                      kv_cache: torch.Tensor, kv_scale: torch.Tensor = None):
        h = self.tok_embeddings(tokens)
        # TensorDumper.dump(h, "emb_out")

        _kv_scale = kv_scale
        TensorDumper.dump(tokens, "token_ids")
        if attn_mask is not None:
            TensorDumper.dump(attn_mask, "attn_mask")
        if self.fused_kvcache and attn_mask is not None:
            if kv_scale is None: # mount an empty scale for friendly exporting
                _kv_scale = torch.empty(0, dtype=h.dtype)
        TensorDumper.dump(seqstarts, "seqstarts")
        TensorDumper.dump(kvstarts, "kvstarts")
        TensorDumper.dump(cachestarts, "cachestarts")
        TensorDumper.dump(decoding_batches, "decoding_batches")
        TensorDumper.dump(start_pos, "start_pos")
        TensorDumper.dump(max_seqlen, "max_seqlen")
        TensorDumper.dump(max_kvlen, "max_kvlen")
        TensorDumper.dump(kv_cache, "kv_cache")
        if kv_scale is not None:
            TensorDumper.dump(kv_scale, "kv_scale")

        if self.with_alibi and not self.fused_alibi:
            attn_mask = OPMX.dynamic_batching.alibi_mask(seqstarts, kvstarts, attn_mask, self.params.num_heads, h.dtype)
            # TensorDumper.dump(attn_mask, "alibi_mask")

        norm = None
        for layer in self.layers:
            h, norm = layer(h, norm, attn_mask, seqstarts, kvstarts, cachestarts,
                            decoding_batches, start_pos, max_seqlen, max_kvlen,
                            kv_cache, _kv_scale)

        h, norm = self.norm(h, norm)
        # TensorDumper.dump(h, "last_rms_norm")
        output = self.output(h) # compute logits for every position (no last-token gather, needed for perplexity)
        # TensorDumper.dump(output, "logits_before_cast")
        output = output.float()
        TensorDumper.dump(output, "logits")
        return output


    @torch.no_grad()
    def load_state_dict(self, state_dict: Mapping[str, Any]):
        loaded_params = set()
38 changes: 38 additions & 0 deletions model_zoo/llama/modeling/static_batching/Model.py
@@ -342,6 +342,44 @@ def forward(self, tokens: torch.Tensor, attn_mask: Optional[torch.Tensor],
        TensorDumper.dump(output, "logits")
        return output


    @torch.inference_mode()
    def logit_forward(self, tokens: torch.Tensor, attn_mask: Optional[torch.Tensor],
                      start_pos: torch.Tensor, kv_cache: torch.Tensor, kv_scale: torch.Tensor = None):
        h = self.tok_embeddings(tokens)
        # TensorDumper.dump(h, "emb_out")

        _kv_scale = kv_scale
        TensorDumper.dump(tokens, "token_ids")
        if attn_mask is not None:
            TensorDumper.dump(attn_mask, "attn_mask")
        if self.fused_kvcache and attn_mask is not None:
            if kv_scale is None: # mount an empty scale for friendly exporting
                _kv_scale = torch.empty(0, dtype=h.dtype)
        TensorDumper.dump(start_pos, "start_pos")
        TensorDumper.dump(kv_cache, "kv_cache")
        if kv_scale is not None:
            TensorDumper.dump(kv_scale, "kv_scale")

        if self.with_alibi and not self.fused_alibi:
            attn_mask = OPMX.alibi_mask(
                torch.tensor(tokens.shape[1], dtype=torch.int64),
                torch.tensor(tokens.shape[1], dtype=torch.int64) + start_pos, attn_mask, self.params.num_heads, h.dtype)
            # TensorDumper.dump(attn_mask, "alibi_mask")

        norm = None
        for layer in self.layers:
            h, norm = layer(h, norm, attn_mask, start_pos, kv_cache, _kv_scale)

        h, norm = self.norm(h, norm)
        # TensorDumper.dump(h, "last_rms_norm")
        output = self.output(h) # compute logits for every position (no last-token gather, needed for perplexity)
        # TensorDumper.dump(output, "logits_before_cast")
        output = output.float()
        TensorDumper.dump(output, "logits")
        return output


    @torch.no_grad()
    def load_state_dict(self, state_dict: Mapping[str, Any]):
        loaded_params = set()
76 changes: 76 additions & 0 deletions model_zoo/llama3/huggingface/Eval.py
@@ -0,0 +1,76 @@
import fire
import sys
import os
import json

from pathlib import Path
from typing import List

sys.path.append(os.path.dirname(os.path.realpath(__file__)) + "/../..")

import model_zoo.llama.modeling.Loader as Loader
from eval.eval_ppl import evaluate_perplexity
from transformers import AutoTokenizer
from ModelParams import ModelParams

def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.0,
    top_p: float = 0.95,
    batch: int = 4,
    seqlen_scale_up: int = 1,
    unaligned_batch: bool = False,
    max_gen_len: int = 256,
    friendly_gqa: bool = False, # done gqa by repeating key and value by key_value_cache op
    fused_qkv: bool = True, # fuse qkv linear
    fused_kvcache: bool = True, # fuse key_value_cache and multi_head_attention
    fused_ffn_glu: bool = True, # fuse feed forward gate linear unit
    auto_causal: bool = True, # causal mask is auto done by attention op, no need to pass additional mask to the model
    quantized_cache: bool = True, # 8bit kv cache quantization
    cache_layout: int = 0, # change kv cache layout for hardware performance friendly
    cache_mode: int = 0, # change kv cache indexing mode for memory management friendly, only affected when dynamic_batching == True
    dynamic_batching: bool = True, # use dynamic batching scheduling
    context_chunking: bool = True, # enable context chunking for dynamic batching
    dump_tensor_path: str = None,
    dump_steps: List[int] = []
):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    with open(Path(ckpt_dir) / "opmx_params.json", "r") as f:
        params = json.loads(f.read())
    params: ModelParams = ModelParams(**params)

    generator = Loader.load(
        ckpt_dir, params,
        friendly_gqa=friendly_gqa,
        fused_qkv=fused_qkv,
        fused_kvcache=fused_kvcache,
        fused_ffn_glu=fused_ffn_glu,
        fused_alibi=False,
        auto_causal=auto_causal,
        with_rope=True,
        with_alibi=False,
        quantized_cache=quantized_cache,
        cache_layout=cache_layout,
        cache_mode=cache_mode,
        dynamic_batching=dynamic_batching,
        attn_wqkv_bias_term=False,
        attn_wo_bias_term=False,
        ffn_linear_bias_term=False,
        load_to_cpu=False,
        rotary_dim=0,
        dump_tensor_path=dump_tensor_path,
        dump_steps=dump_steps
    )

    generator.context_chunking = context_chunking if dynamic_batching else False

    ppl = evaluate_perplexity(generator, tokenizer)

    print("model eval ppl is : ", ppl)


if __name__ == "__main__":
    fire.Fire(main)
10 changes: 10 additions & 0 deletions model_zoo/llama3_woqu/README.md
@@ -46,6 +46,16 @@ OMP_NUM_THREADS=1 torchrun --nproc_per_node $num_gpu huggingface/Demo.py --ckpt_
- `OMP_NUM_THREADS`: Sets the number of OpenMP threads. Each PyTorch process opens its own OpenMP thread pool, so this is set to 1 to avoid occupying too many CPU cores.
- `--nproc_per_node`: Specifies the number of model slices per node.

## Eval Model
`Eval.py` runs a perplexity (PPL) evaluation of the LLaMA 3 model on WikiText-2; a sketch of the expected console output follows the flag notes below.

```bash
OMP_NUM_THREADS=1 torchrun --nproc_per_node $num_gpu huggingface/Eval.py --ckpt_dir <convert_dir> --tokenizer_path <llama_tokenizer_dir> --fused_qkv 1 --fused_kvcache 1 --auto_causal 1 --quantized_cache 1 --dynamic_batching 1 --quant_data_type "int4" --quant_method "weight_only" --quant_axis 1 --group_size 128 --storage_bits 32
```

- `OMP_NUM_THREADS`: Sets the number of OpenMP threads. Each PyTorch process opens its own OpenMP thread pool, so this is set to 1 to avoid occupying too many CPU cores.
- `--nproc_per_node`: Specifies the number of model slices per node.
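
When the run finishes, the script reports a running perplexity in the progress bar and prints the final value. The sketch below shows the shape of the expected console output, with placeholders instead of real numbers, since the values depend on the checkpoint and quantization settings:

```
Perplexity <running_ppl>: 100%|██████████| <n_samples>/<n_samples>
model eval ppl is :  <final_ppl>
```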

## Exporting Model

To export a model, you will use the `Export.py` script provided. Here's an example command for exporting a 13B model with 1 GPU:
21 changes: 17 additions & 4 deletions model_zoo/llama3_woqu/huggingface/ConvertWeightToOpmx.py
@@ -55,6 +55,17 @@ def write_json(text, path):
        json.dump(text, f)


def str2bool(v):
    if isinstance(v, bool):
        return v
    if v.lower() in ('yes', 'true', 't', 'y', '1'):
        return True
    elif v.lower() in ('no', 'false', 'f', 'n', '0'):
        return False
    else:
        raise argparse.ArgumentTypeError('Boolean value expected.')


def write_pmx_model(model_path, input_base_path, model_type, quant, group_size, n_bits, has_zeropoint, storage_bits):
    os.makedirs(model_path, exist_ok=True)
    print ("Loading the checkpoint in a HF model")
@@ -272,9 +283,10 @@ def main():
help="Input model type",
)
parser.add_argument(
"--quant",
default=False,
help="Enable quantization for the model. Set to True to quantize the model weights.",
"--quant",
type=str2bool,
default=False,
help="Enable quantization for the model. Set to True to quantize the model weights.",
)
parser.add_argument(
"--group_size",
Expand All @@ -288,7 +300,8 @@ def main():
)
parser.add_argument(
"--has_zeropoint",
default=False,
type=str2bool,
default=True,
help="Include zero-point in quantization. Set to True to use zero-point quantization.",
)
parser.add_argument(