[ CI ] LM Eval Testing Expansion (#326)
SUMMARY:
* updated hf lm-eval gsm baseline script to use accelerate, enabling us
to generate baselines on big models
* added vllm lm-eval gsm baseline script for scenarios that hf does not
support (e.g. fp8)
* added lm-eval GSM model configs for broad set of models (small +
large)
* refactored smoke / full configs to reference the model configs +
trigger one test at a time
* refactored lm-eval accuracy test to avoid using ray to launch the server,
which caused cleanup issues in the server case
* moved configs into `.github` folder so they are closer to the scripts

FOLLOW-UP PRS:
* enable distributed
* enable H100 for large models
* eliminate the `neuralmagic` directory
robertgshaw2-redhat authored Jun 26, 2024
1 parent 89a0e3c commit ec8b450
Showing 29 changed files with 368 additions and 123 deletions.
9 changes: 9 additions & 0 deletions .github/lm-eval-configs/full-large-models.txt
@@ -0,0 +1,9 @@
Meta-Llama-3-70B-Instruct-FP8.yaml
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x22B-Instruct-v0.1-FP8.yaml
Mixtral-8x22B-Instruct-v0.1.yaml
Mixtral-8x7B-Instruct-v0.1-FP8.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
Qwen2-57B-A14B-Instruct.yaml
Qwen2-72B-Instruct.yaml
Phi-3-medium-4k-instruct.yaml
7 changes: 7 additions & 0 deletions .github/lm-eval-configs/full-small-models.txt
@@ -0,0 +1,7 @@
gemma-7b-it.yaml
Meta-Llama-3-8B-Instruct-FP8-KV.yaml
Meta-Llama-3-8B-Instruct-FP8.yaml
Meta-Llama-3-8B-Instruct-W4A16.yaml
Meta-Llama-3-8B-Instruct.yaml
Mistral-7B-Instruct-v0.3.yaml
Qwen2-7B-Instruct.yaml
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.900
  - name: "exact_match,flexible-extract"
    value: 0.900
limit: 250
num_fewshot: 5
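
Each model YAML above pairs a model stub with the GSM8k baseline numbers the accuracy test checks against. As a rough illustration of how such a config could be consumed, the sketch below loads the YAML and compares fresh lm-eval scores to the recorded values; the tolerance constant and helper names are illustrative assumptions, not the actual code in tests/accuracy/test_lm_eval_correctness.py.

# Hypothetical consumer of a model config like the one above. The YAML
# structure matches the files in .github/lm-eval-configs/models; RTOL and
# the function names are assumptions for illustration.
import yaml

RTOL = 0.05  # assumed relative tolerance around the recorded baseline

def load_config(path: str) -> dict:
    with open(path) as f:
        return yaml.safe_load(f)

def within_baseline(config: dict, measured: dict) -> bool:
    """Check measured metrics (task name -> metric name -> score) against baselines."""
    ok = True
    for task in config["tasks"]:
        for metric in task["metrics"]:
            baseline = metric["value"]
            score = measured[task["name"]][metric["name"]]
            if abs(score - baseline) > RTOL * baseline:
                print(f"{task['name']}/{metric['name']}: {score} vs baseline {baseline}")
                ok = False
    return ok
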
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Meta-Llama-3-70B-Instruct.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-70B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-70B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.888
  - name: "exact_match,flexible-extract"
    value: 0.888
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8-KV.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.764
  - name: "exact_match,flexible-extract"
    value: 0.764
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-FP8.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Meta-Llama-3-8B-Instruct-FP8 -b 32 -l 250 -f 5 -t 1
model_name: "neuralmagic/Meta-Llama-3-8B-Instruct-FP8"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.744
  - name: "exact_match,flexible-extract"
    value: 0.740
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct-W4A16.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ -b 32 -l 250 -f 5
model_name: "TechxGenus/Meta-Llama-3-8B-Instruct-GPTQ"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.684
  - name: "exact_match,flexible-extract"
    value: 0.688
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Meta-Llama-3-8B-Instruct.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m meta-llama/Meta-Llama-3-8B-Instruct -b 32 -l 250 -f 5
model_name: "meta-llama/Meta-Llama-3-8B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.74
  - name: "exact_match,flexible-extract"
    value: 0.74
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Mistral-7B-Instruct-v0.3.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mistral-7B-Instruct-v0.3 -b 32 -l 250 -f 5
model_name: "mistralai/Mistral-7B-Instruct-v0.3"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.524
  - name: "exact_match,flexible-extract"
    value: 0.524
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x22B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x22B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.840
  - name: "exact_match,flexible-extract"
    value: 0.844
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Mixtral-8x22B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x22B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x22B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.876
  - name: "exact_match,flexible-extract"
    value: 0.880
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1-FP8.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m mistralai/Mixtral-8x7B-Instruct-v0.1 -b 32 -l 250 -f 5
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.620
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Mixtral-8x7B-Instruct-v0.1.yaml
@@ -0,0 +1,11 @@
# bash ./nm-run-lm-eval-gsm-vllm-baseline.sh -m neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8 -b 32 -l 250 -f 5 -t 4
model_name: "mistralai/Mixtral-8x7B-Instruct-v0.1"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.616
  - name: "exact_match,flexible-extract"
    value: 0.628
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Phi-3-medium-4k-instruct.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m microsoft/Phi-3-medium-4k-instruct -b 16 -l 250 -f 5
model_name: "microsoft/Phi-3-medium-4k-instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.840
  - name: "exact_match,flexible-extract"
    value: 0.852
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Qwen2-57B-A14B-Instruct.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-57B-A14B-Instruct -b 32 -l 250 -f 5
model_name: "Qwen/Qwen2-57B-A14B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.736
  - name: "exact_match,flexible-extract"
    value: 0.800
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Qwen2-72B-Instruct.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-72B-Instruct -b 16 -l 250 -f 5
model_name: "Qwen/Qwen2-72B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.828
  - name: "exact_match,flexible-extract"
    value: 0.856
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/Qwen2-7B-Instruct.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m Qwen/Qwen2-7B-Instruct -b 32 -l 250 -f 5
model_name: "Qwen/Qwen2-7B-Instruct"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.680
  - name: "exact_match,flexible-extract"
    value: 0.756
limit: 250
num_fewshot: 5
11 changes: 11 additions & 0 deletions .github/lm-eval-configs/models/gemma-7b-it.yaml
@@ -0,0 +1,11 @@
# ./nm-run-lm-eval-gsm-hf-baseline.sh -m google/gemma-7b-it -b 16 -l 250 -f 5
model_name: "google/gemma-7b-it"
tasks:
- name: "gsm8k"
  metrics:
  - name: "exact_match,strict-match"
    value: 0.284
  - name: "exact_match,flexible-extract"
    value: 0.324
limit: 250
num_fewshot: 5
2 changes: 2 additions & 0 deletions .github/lm-eval-configs/smoke-large-models.txt
@@ -0,0 +1,2 @@
Meta-Llama-3-70B-Instruct.yaml
Mixtral-8x7B-Instruct-v0.1.yaml
1 change: 1 addition & 0 deletions .github/lm-eval-configs/smoke-small-models.txt
@@ -0,0 +1 @@
Meta-Llama-3-8B-Instruct.yaml
10 changes: 3 additions & 7 deletions .github/scripts/nm-run-lm-eval-gsm-hf-baseline.sh
@@ -14,23 +14,19 @@ usage() {
   echo
   echo "  -m    - huggingface stub or local directory of the model"
   echo "  -b    - batch size to run the evaluation at"
-  echo "  -d    - device to use (e.g. cuda, cuda:0, auto, cpu)"
   echo "  -l    - limit number of samples to run"
   echo "  -f    - number of fewshot samples to use"
   echo
 }

-while getopts "m:b:d:l:f:" OPT; do
+while getopts "m:b:l:f:" OPT; do
   case ${OPT} in
     m )
       MODEL="$OPTARG"
       ;;
     b )
       BATCH_SIZE="$OPTARG"
       ;;
-    d )
-      DEVICE="$OPTARG"
-      ;;
     l )
       LIMIT="$OPTARG"
       ;;
@@ -45,6 +41,6 @@ while getopts "m:b:d:l:f:" OPT; do
 done

 lm_eval --model hf \
-  --model_args pretrained=$MODEL \
+  --model_args pretrained=$MODEL,parallelize=True \
   --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
-  --batch_size $BATCH_SIZE --device $DEVICE
+  --batch_size $BATCH_SIZE
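
For reference, the updated CLI call maps onto the harness's Python API roughly as follows. This is a sketch assuming the pinned lm-evaluation-harness commit exposes the same lm_eval.simple_evaluate signature as the released 0.4.x line; verify against the installed version before relying on it.

# Hedged Python-API equivalent of the lm_eval CLI invocation above,
# assuming a 0.4.x-style simple_evaluate signature.
import lm_eval

results = lm_eval.simple_evaluate(
    model="hf",
    # parallelize=True shards the model across visible GPUs via accelerate,
    # which is what enables generating baselines on big models
    model_args="pretrained=meta-llama/Meta-Llama-3-8B-Instruct,parallelize=True",
    tasks=["gsm8k"],
    num_fewshot=5,
    limit=250,
    batch_size=32,
)
print(results["results"]["gsm8k"])
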
51 changes: 51 additions & 0 deletions .github/scripts/nm-run-lm-eval-gsm-vllm-baseline.sh
@@ -0,0 +1,51 @@
#!/bin/bash
# We can use this script to compute baseline accuracy on GSM for vllm.
# We use this for fp8, which HF does not support.
#
# Make sure you have lm-eval-harness installed:
# pip install git+https://github.com/EleutherAI/lm-evaluation-harness.git@9516087b81a61d0e220b22cc1b75be76de23bc10

usage() {
    echo
    echo "Runs lm eval harness on GSM8k using vllm."
    echo "This pathway is intended to be used to create baselines for "
    echo "our automated nm-test-accuracy workflow."
    echo
    echo "usage: ${0} <options>"
    echo
    echo "  -m    - huggingface stub or local directory of the model"
    echo "  -b    - batch size to run the evaluation at"
    echo "  -l    - limit number of samples to run"
    echo "  -f    - number of fewshot samples to use"
    echo "  -t    - tensor parallel size to run at"
    echo
}

while getopts "m:b:l:f:t:" OPT; do
  case ${OPT} in
    m )
      MODEL="$OPTARG"
      ;;
    b )
      BATCH_SIZE="$OPTARG"
      ;;
    l )
      LIMIT="$OPTARG"
      ;;
    f )
      FEWSHOT="$OPTARG"
      ;;
    t )
      TP_SIZE="$OPTARG"
      ;;
    \? )
      usage
      exit 1
      ;;
  esac
done

lm_eval --model vllm \
  --model_args pretrained=$MODEL,tensor_parallel_size=$TP_SIZE \
  --tasks gsm8k --num_fewshot $FEWSHOT --limit $LIMIT \
  --batch_size $BATCH_SIZE
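
Once a baseline run finishes, the two exact-match numbers recorded in the model configs come straight out of the harness results dictionary. The sketch below shows one way they could be folded into the YAML format used above; the results-dict key layout assumes 0.4.x harness output, and the helper name is illustrative.

# Sketch: dump a finished GSM8k baseline into the config format used by
# .github/lm-eval-configs/models. Key names assume lm-eval 0.4.x output.
import yaml

def to_baseline_yaml(model_name: str, results: dict,
                     limit: int = 250, num_fewshot: int = 5) -> str:
    gsm8k = results["results"]["gsm8k"]
    metric_names = ("exact_match,strict-match", "exact_match,flexible-extract")
    config = {
        "model_name": model_name,
        "tasks": [{
            "name": "gsm8k",
            "metrics": [{"name": n, "value": round(gsm8k[n], 3)}
                        for n in metric_names],
        }],
        "limit": limit,
        "num_fewshot": num_fewshot,
    }
    return yaml.dump(config, sort_keys=False)
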
38 changes: 34 additions & 4 deletions .github/scripts/nm-run-lm-eval-vllm.sh
@@ -7,15 +7,19 @@
 usage() {
     echo
     echo "Runs lm eval harness on GSM8k using vllm server and compares to "
-    echo "precomputed baseline (measured by HF transformers."
+    echo "precomputed baseline (measured by HF transformers.)"
     echo
+    echo "This script should be run from the /nm-vllm directory"
+    echo
     echo "usage: ${0} <options>"
     echo
-    echo "  -c    - path to the test data config (e.g. neuralmagic/lm-eval/YOUR_CONFIG.yaml)"
+    echo "  -c    - path to the test data config (e.g. .github/lm-eval-configs/smoke-small-models.txt)"
     echo
 }

-while getopts "c:" OPT; do
+SUCCESS=0
+
+while getopts "c:t:" OPT; do
   case ${OPT} in
     c )
       CONFIG="$OPTARG"
@@ -27,4 +31,30 @@ while getopts "c:" OPT; do
   esac
 done

-LM_EVAL_TEST_DATA_FILE=$CONFIG pytest -v tests/accuracy/test_lm_eval_correctness.py
+# Parse list of configs.
+IFS=$'\n' read -d '' -r -a MODEL_CONFIGS < $CONFIG
+
+for MODEL_CONFIG in "${MODEL_CONFIGS[@]}"
+do
+    LOCAL_SUCCESS=0
+
+    echo "=== RUNNING MODEL: $MODEL_CONFIG ==="
+
+    MODEL_CONFIG_PATH=$PWD/.github/lm-eval-configs/models/${MODEL_CONFIG}
+    LM_EVAL_TEST_DATA_FILE=$MODEL_CONFIG_PATH pytest -s tests/accuracy/test_lm_eval_correctness.py || LOCAL_SUCCESS=$?
+
+    if [[ $LOCAL_SUCCESS == 0 ]]; then
+        echo "=== PASSED MODEL: ${MODEL_CONFIG} ==="
+    else
+        echo "=== FAILED MODEL: ${MODEL_CONFIG} ==="
+    fi
+
+    SUCCESS=$((SUCCESS + LOCAL_SUCCESS))
+
+done
+
+if [ "${SUCCESS}" -eq "0" ]; then
+    exit 0
+else
+    exit 1
+fi
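
The runner above fails soft per config and hard at the end: every model in the list gets a run, failures are tallied, and the exit code reflects the sum. Below is a rough Python analogue of that control flow, shown only to make it explicit; the repository drives this from bash.

# Rough Python analogue of the per-config loop in nm-run-lm-eval-vllm.sh.
import os
import pathlib
import subprocess
import sys

def run_configs(list_file: str) -> int:
    """Run the accuracy test once per model config; return the failure count."""
    failures = 0
    for name in pathlib.Path(list_file).read_text().splitlines():
        cfg = pathlib.Path(".github/lm-eval-configs/models") / name
        env = dict(os.environ, LM_EVAL_TEST_DATA_FILE=str(cfg))
        print(f"=== RUNNING MODEL: {name} ===")
        rc = subprocess.run(
            ["pytest", "-s", "tests/accuracy/test_lm_eval_correctness.py"],
            env=env,
        ).returncode
        print(f"=== {'PASSED' if rc == 0 else 'FAILED'} MODEL: {name} ===")
        failures += rc != 0
    return failures

if __name__ == "__main__":
    sys.exit(1 if run_configs(sys.argv[1]) else 0)
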
2 changes: 1 addition & 1 deletion .github/workflows/nm-build-test.yml
@@ -85,7 +85,7 @@ on:
       type: string
       default: "60"
     lm_eval_configuration:
-      description: "configuration for lm-eval test (see neuralmagic/lm-eval)"
+      description: "configuration for lm-eval test (see .github/lm-eval-configs)"
       type: string
       default: ""
2 changes: 1 addition & 1 deletion .github/workflows/nm-nightly.yml
@@ -45,6 +45,6 @@ jobs:
     push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}"

     lm_eval_label: gcp-k8s-l4-solo
-    lm_eval_configuration: ./neuralmagic/lm-eval/full-small-models.yaml
+    lm_eval_configuration: ./.github/lm-eval-configs/smoke-small-models.txt
     lm_eval_timeout: 60
   secrets: inherit
2 changes: 1 addition & 1 deletion .github/workflows/nm-remote-push.yml
@@ -30,6 +30,6 @@ jobs:
     benchmark_timeout: 480

     lm_eval_label: gcp-k8s-l4-solo
-    lm_eval_configuration: ./neuralmagic/lm-eval/smoke-small-models.yaml
+    lm_eval_configuration: ./.github/lm-eval-configs/smoke-small-models.txt
     lm_eval_timeout: 60
   secrets: inherit
11 changes: 0 additions & 11 deletions neuralmagic/lm-eval/full-small-models.yaml

This file was deleted.
