diff --git a/.github/workflows/ci_eval.yaml b/.github/workflows/ci_eval.yaml
index 54aa3c763..e6794bff9 100644
--- a/.github/workflows/ci_eval.yaml
+++ b/.github/workflows/ci_eval.yaml
@@ -21,10 +21,10 @@ concurrency:
   cancel-in-progress: true

 jobs:
-  test_perplexity_vmfb:
+  test_perplexity_iree:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
     timeout-minutes: 1000
-    name: "IREE/vmfb"
+    name: "Perplexity-IREE"
     strategy:
       matrix:
         version: [3.11]
@@ -74,13 +74,21 @@ jobs:
             iree-base-compiler \
             iree-base-runtime

-      - name: Run perplexity test with vmfb
-        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_vmfb_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+      - name: Run perplexity test with IREE
+        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_iree_test.py --longrun --iree-device='hip://7' --iree-hip-target=gfx942 --iree-hal-target-backends=rocm --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/iree_perplexity/index.html
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out/llm/llama/perplexity/iree_perplexity
+          destination_dir: ./llm/llama/perplexity/iree_perplexity
+          keep_files: true

   test_perplexity_torch:
     if: ${{ github.repository_owner == 'nod-ai' || github.event_name != 'schedule' }}
     timeout-minutes: 1000
-    name: "Torch/eager mode"
+    name: "Perplexity-Torch"
     strategy:
       matrix:
         version: [3.11]
@@ -123,5 +131,13 @@ jobs:
         pip install --no-compile -f https://iree.dev/pip-release-links.html --src deps \
           -e "git+https://github.com/iree-org/iree-turbine.git#egg=iree-turbine"

-      - name: Run perplexity test in eager mode
-        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json
+      - name: Run perplexity test with Torch
+        run: pytest -n 8 -v -s sharktank/tests/evaluate/perplexity_torch_test.py --longrun --llama3-8b-f16-model-path=/data/llama3.1/8b/llama8b_f16.irpa --llama3-8b-tokenizer-path=/data/llama3.1/8b/tokenizer_config.json --html=out/llm/llama/perplexity/torch_perplexity/index.html
+
+      - name: Deploy to GitHub Pages
+        uses: peaceiris/actions-gh-pages@4f9cc6602d3f66b9c108549d475ec49e8ef4d45e # v4.0.0
+        with:
+          github_token: ${{ secrets.SHARK_PLATFORM_GH_TOKEN }}
+          publish_dir: ./out/llm/llama/perplexity/torch_perplexity
+          destination_dir: ./llm/llama/perplexity/torch_perplexity
+          keep_files: true
diff --git a/sharktank/sharktank/evaluate/README.md b/sharktank/sharktank/evaluate/README.md
index 784bb24fd..beb0281cd 100644
--- a/sharktank/sharktank/evaluate/README.md
+++ b/sharktank/sharktank/evaluate/README.md
@@ -9,16 +9,32 @@ pip install -r sharktank/requirements-tests.txt

 ### Perplexity

-Test perplexity for Llama3.1 8B & 405B (FP16 & FP8) models:
+The perplexity score measures the ability of a language model to predict the next token in a sequence. A lower score indicates that a model has higher certainty in its predictions. Perplexity acts as an intrinsic evaluation metric that measures model quality, independent of any downstream task.
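+
+For intuition, here is a minimal sketch of how perplexity can be computed from per-token log-probabilities (illustrative only, not the sharktank implementation):
+
+```python
+import math
+
+# Log-probability the model assigned to each actual next token in a sequence.
+log_probs = [-2.1, -0.4, -1.3, -0.9]
+
+# Perplexity is the exponential of the average negative log-likelihood.
+perplexity = math.exp(-sum(log_probs) / len(log_probs))
+print(perplexity)  # ~3.24
+```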
+
+In SHARK-Platform, we use perplexity to track code regressions and quality loss across quantized models (with FP16 as the baseline). We use 100 prompts randomly selected from the Wikitext-2 test set and calculate the mean perplexities shown below. These numbers are not comparable across models with different tokenizers, or with other projects, due to differences in implementation.
+
+* Test perplexity for the Llama3.1 8B (FP16) model:

 ```bash
 pytest sharktank/tests/evaluate/perplexity_test.py --longrun
 ```

-Get perplexity for a new model:
+* Calculate perplexity for a new model:

 ```bash
 python -m sharktank.evaluate.perplexity \
   --gguf-file=llama3_70b_f16.gguf \
   --tokenizer-config-json=tokenizer_config.json
 ```
+
+### Perplexity Scoreboard
+
+| CPU            | GPU    |
+|:--------------:|:------:|
+| AMD EPYC 9554  | MI300X |
+
+#### LLaMA 3.1
+
+|Models                 |Model size (GB) |Torch score |IREE score |
+|:----------------------|:---------------|:-----------|:----------|
+|8B FP16 TP1 decomposed |16.07           |14.930181   |14.991893  |
diff --git a/sharktank/sharktank/evaluate/perplexity_vmfb.py b/sharktank/sharktank/evaluate/perplexity_iree.py
similarity index 96%
rename from sharktank/sharktank/evaluate/perplexity_vmfb.py
rename to sharktank/sharktank/evaluate/perplexity_iree.py
index 4f95ae1bd..9701bed34 100644
--- a/sharktank/sharktank/evaluate/perplexity_vmfb.py
+++ b/sharktank/sharktank/evaluate/perplexity_iree.py
@@ -9,6 +9,7 @@
 import json
 import time
 import random
+import re
 from datetime import timedelta
 from tqdm import tqdm

@@ -83,11 +84,18 @@ def wrapper(*args, **kwargs):
         start = time.time()
         result = func(*args, **kwargs)
         end = time.time()
-        seconds = end - start
-        time_taken = abs(timedelta(seconds=round(seconds)))
-
-        if seconds < 1:
-            time_taken = f" {seconds * 1000} ms"
+        total_seconds = end - start
+        time_taken = abs(timedelta(seconds=total_seconds))
+        hours, minutes, seconds = re.split(":", str(time_taken))
+
+        if total_seconds < 1:
+            time_taken = f" {round(total_seconds * 1000, 3)} ms"
+        elif total_seconds < 60:
+            time_taken = "{:.2f} secs".format(round(float(total_seconds), 2))
+        else:
+            time_taken = "{:02d} hrs : {:02d} mins : {:.2f} secs".format(
+                int(hours), int(minutes), round(float(seconds), 2)
+            )

         func_name = func.__name__
         if func_name == "get_perplexity":
diff --git a/sharktank/sharktank/evaluate/perplexity_torch.py b/sharktank/sharktank/evaluate/perplexity_torch.py
index fc3aa5fca..da5fc104a 100644
--- a/sharktank/sharktank/evaluate/perplexity_torch.py
+++ b/sharktank/sharktank/evaluate/perplexity_torch.py
@@ -8,6 +8,7 @@
 import logging
 import time
 import random
+import re
 from datetime import timedelta
 import json
 import numpy as np
@@ -69,11 +70,18 @@ def wrapper(*args, **kwargs):
         start = time.time()
         result = func(*args, **kwargs)
         end = time.time()
-        seconds = end - start
-        time_taken = abs(timedelta(seconds=round(seconds)))
-
-        if seconds < 1:
-            time_taken = f" {seconds * 1000} ms"
+        total_seconds = end - start
+        time_taken = abs(timedelta(seconds=total_seconds))
+        hours, minutes, seconds = re.split(":", str(time_taken))
+
+        if total_seconds < 1:
+            time_taken = f" {round(total_seconds * 1000, 3)} ms"
+        elif total_seconds < 60:
+            time_taken = "{:.2f} secs".format(round(float(total_seconds), 2))
+        else:
+            time_taken = "{:02d} hrs : {:02d} mins : {:.2f} secs".format(
+                int(hours), int(minutes), round(float(seconds), 2)
+            )

         func_name = func.__name__
         if func_name == "get_perplexity":
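The same elapsed-time formatting is applied to the `timeit` decorators in all three files touched here (including `export_artifacts.py` below): milliseconds under one second, plain seconds under a minute, and otherwise hours/minutes/seconds recovered by splitting `str(timedelta)` on `":"`. A standalone sketch of that approach, with an illustrative function name and sample values rather than code from this patch:

```python
import re
from datetime import timedelta


def format_duration(total_seconds: float) -> str:
    # Durations under a second are reported in milliseconds.
    if total_seconds < 1:
        return f"{round(total_seconds * 1000, 3)} ms"
    # Durations under a minute are reported in seconds.
    if total_seconds < 60:
        return "{:.2f} secs".format(total_seconds)
    # str(timedelta) renders as "H:MM:SS[.ffffff]", so splitting on ":" yields the
    # hour, minute, and second fields (runs longer than a day are not handled,
    # matching the decorators above).
    hours, minutes, seconds = re.split(":", str(timedelta(seconds=total_seconds)))
    return "{:02d} hrs : {:02d} mins : {:.2f} secs".format(
        int(hours), int(minutes), float(seconds)
    )


print(format_duration(0.1234))  # 123.4 ms
print(format_duration(42.5))    # 42.50 secs
print(format_duration(3723.5))  # 01 hrs : 02 mins : 03.50 secs
```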
diff --git a/sharktank/sharktank/utils/export_artifacts.py b/sharktank/sharktank/utils/export_artifacts.py
index bd33e1a62..e7851ac37 100644
--- a/sharktank/sharktank/utils/export_artifacts.py
+++ b/sharktank/sharktank/utils/export_artifacts.py
@@ -9,6 +9,7 @@
 import subprocess
 import logging
 import time
+import re
 from pathlib import Path
 from datetime import timedelta
 from typing import List, Optional
@@ -107,11 +108,18 @@ def wrapper(*args, **kwargs):
         start = time.time()
         result = func(*args, **kwargs)
         end = time.time()
-        seconds = end - start
-        time_taken = abs(timedelta(seconds=round(seconds)))
-
-        if seconds < 1:
-            time_taken = f" {seconds * 1000} ms"
+        total_seconds = end - start
+        time_taken = abs(timedelta(seconds=total_seconds))
+        hours, minutes, seconds = re.split(":", str(time_taken))
+
+        if total_seconds < 1:
+            time_taken = f" {round(total_seconds * 1000, 3)} ms"
+        elif total_seconds < 60:
+            time_taken = "{:.2f} secs".format(round(float(total_seconds), 2))
+        else:
+            time_taken = "{:02d} hrs : {:02d} mins : {:.2f} secs".format(
+                int(hours), int(minutes), round(float(seconds), 2)
+            )

         func_name = func.__name__
         logger.info(f" {func_name}: {time_taken}")
diff --git a/sharktank/tests/evaluate/baseline_perplexity_scores.json b/sharktank/tests/evaluate/baseline_perplexity_scores.json
index ac2cd7b83..24511b05f 100644
--- a/sharktank/tests/evaluate/baseline_perplexity_scores.json
+++ b/sharktank/tests/evaluate/baseline_perplexity_scores.json
@@ -210,7 +210,7 @@
         ],
         "mean_perplexity": 6.060831
     },
-    "llama3_8B_f16_decomposed_vmfb": {
+    "llama3_8B_f16_decomposed_iree": {
         "perplexities": [
             6.651368,
             22.059452,
diff --git a/sharktank/tests/evaluate/perplexity_vmfb_test.py b/sharktank/tests/evaluate/perplexity_iree_test.py
similarity index 92%
rename from sharktank/tests/evaluate/perplexity_vmfb_test.py
rename to sharktank/tests/evaluate/perplexity_iree_test.py
index 93ffbe61c..8cf2055c9 100644
--- a/sharktank/tests/evaluate/perplexity_vmfb_test.py
+++ b/sharktank/tests/evaluate/perplexity_iree_test.py
@@ -8,7 +8,7 @@
 import pytest
 import json

-from sharktank.evaluate import perplexity_vmfb
+from sharktank.evaluate import perplexity_iree

 longrun = pytest.mark.skipif("not config.getoption('longrun')")

@@ -32,10 +32,10 @@ def test_llama3_8B_f16_decomposed(self):

         # Llama 3.1 8B decomposed

-        model_name = "llama3_8B_f16_decomposed_vmfb"
+        model_name = "llama3_8B_f16_decomposed_iree"
         baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_8b_f16_model}",
                 f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -67,10 +67,10 @@ def test_llama3_8B_f16(self):

         # Llama 3.1 8B non-decomposed

-        model_name = "llama3_8B_f16_vmfb"
+        model_name = "llama3_8B_f16_iree"
         baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_8b_f16_model}",
                 f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -102,10 +102,10 @@ def test_llama3_8B_fp8_decomposed(self):

         # Llama 3.1 8B decomposed

-        model_name = "llama3_8B_fp8_decomposed_vmfb"
+        model_name = "llama3_8B_fp8_decomposed_iree"
         baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_8b_fp8_model}",
                 f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -137,10 +137,10 @@ def test_llama3_8B_fp8(self):

         # Llama 3.1 8B non-decomposed

-        model_name = "llama3_8B_fp8_vmfb"
+        model_name = "llama3_8B_fp8_iree"
         baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_8b_fp8_model}",
                 f"--tokenizer-config-json={self.llama3_8b_tokenizer}",
@@ -172,10 +172,10 @@ def test_llama3_405B_f16_decomposed(self):

         # Llama 3.1 405B decomposed

-        model_name = "llama3_405B_f16_decomposed_vmfb"
+        model_name = "llama3_405B_f16_decomposed_iree"
         baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_405b_f16_model}",
                 f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -207,10 +207,10 @@ def test_llama3_405B_f16(self):

         # Llama 3.1 405B non-decomposed

-        model_name = "llama3_405B_f16_vmfb"
+        model_name = "llama3_405B_f16_iree"
         baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_405b_f16_model}",
                 f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -242,10 +242,10 @@ def test_llama3_405B_fp8_decomposed(self):

         # Llama 3.1 405B decomposed

-        model_name = "llama3_405B_fp8_decomposed_vmfb"
+        model_name = "llama3_405B_fp8_decomposed_iree"
         baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_405b_fp8_model}",
                 f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
@@ -277,10 +277,10 @@ def test_llama3_405B_fp8(self):

         # Llama 3.1 405B non-decomposed

-        model_name = "llama3_405B_fp8_vmfb"
+        model_name = "llama3_405B_fp8_iree"
         baseline_perplexity = self.baseline_perplexity[model_name]

-        current_perplexity = perplexity_vmfb.main(
+        current_perplexity = perplexity_iree.main(
             [
                 f"--irpa-file={self.llama3_405b_fp8_model}",
                 f"--tokenizer-config-json={self.llama3_405b_tokenizer}",
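The renamed baseline entries in `baseline_perplexity_scores.json` keep the shape shown above: a `perplexities` list plus a `mean_perplexity` value, keyed by model name. A minimal sketch of reading one renamed entry and checking a freshly computed score against it (the `current` values and the tolerance are illustrative, not the assertions used by `perplexity_iree_test.py`):

```python
import json

with open("sharktank/tests/evaluate/baseline_perplexity_scores.json") as f:
    baselines = json.load(f)

# Renamed key from this patch; each entry holds per-prompt perplexities and their mean.
baseline = baselines["llama3_8B_f16_decomposed_iree"]

# Hypothetical freshly computed result with the same structure as a baseline entry.
current = {"perplexities": [6.7, 22.1], "mean_perplexity": 14.99}

delta = abs(current["mean_perplexity"] - baseline["mean_perplexity"])
assert delta < 0.5, f"mean perplexity drifted by {delta:.4f} from the baseline"
```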