From 5c7a85d4be83f8af58bbe78dd71396fa76aa6417 Mon Sep 17 00:00:00 2001 From: Andy Linfoot <78757007+andy-neuma@users.noreply.github.com> Date: Fri, 3 May 2024 08:34:31 -0400 Subject: [PATCH] update workflows to use generated whls (#204) SUMMARY: * update NIGHTLY workflow to be whl centric * update benchmarking jobs to use generated whl TEST PLAN: runs on remote push. i'm also triggering NIGHTLY manually. --------- Co-authored-by: andy-neuma Co-authored-by: Domenic Barbuzzi Co-authored-by: Domenic Barbuzzi --- .github/actions/nm-benchmark/action.yml | 3 + .../actions/nm-install-test-whl/action.yml | 6 +- .github/actions/nm-install-whl/action.yml | 27 ++++ .github/actions/nm-set-python/action.yml | 2 +- .github/scripts/nm-run-benchmarks.sh | 6 +- .github/workflows/build-test.yml | 143 +++++++++++++++--- .github/workflows/build.yml | 1 + .github/workflows/nightly.yml | 67 +++----- .github/workflows/nm-benchmark.yml | 71 ++++----- .github/workflows/remote-push.yml | 32 ++-- .github/workflows/test.yml | 6 +- neuralmagic/benchmarks/common.py | 7 +- .../benchmarks/requirements-benchmark.txt | 1 - neuralmagic/tests/skip-almost-all.txt | 40 ++++- 14 files changed, 267 insertions(+), 145 deletions(-) create mode 100644 .github/actions/nm-install-whl/action.yml diff --git a/.github/actions/nm-benchmark/action.yml b/.github/actions/nm-benchmark/action.yml index 2c91778a31b29..62c516eeef083 100644 --- a/.github/actions/nm-benchmark/action.yml +++ b/.github/actions/nm-benchmark/action.yml @@ -19,6 +19,9 @@ runs: - id: benchmark run: | mkdir -p ${{ inputs.output_directory }} + # move source directories + mv vllm vllm-ignore || echo "no 'vllm' folder to move" + mv csrc csrc-ignore || echo "no 'csrc' folder to move" COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate diff --git a/.github/actions/nm-install-test-whl/action.yml b/.github/actions/nm-install-test-whl/action.yml index 7a34c1e31a8a1..193dad8f99820 100644 --- a/.github/actions/nm-install-test-whl/action.yml +++ b/.github/actions/nm-install-test-whl/action.yml @@ -44,14 +44,12 @@ runs: pip3 install coverage pip3 install pytest-cov pip3 install pytest-xdist - pip3 install --index-url http://${{ inputs.pypi }}:8080/ --trusted-host ${{ inputs.pypi }} nm-magic-wand-nightly - pip3 list + pip3 install -r requirements-dev.txt BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) WHL=$(find . -type f -iname "*${BASE}*.whl") WHL_BASENAME=$(basename ${WHL}) echo "whl=${WHL_BASENAME}" >> "$GITHUB_OUTPUT" - pip3 install ${WHL} - pip3 install -r requirements-dev.txt + pip3 install ${WHL}[sparse] # report magic_wand version MAGIC_WAND=$(pip3 show nm-magic-wand-nightly | grep "Version" | cut -d' ' -f2) echo "magic_wand=${MAGIC_WAND}" >> "$GITHUB_OUTPUT" diff --git a/.github/actions/nm-install-whl/action.yml b/.github/actions/nm-install-whl/action.yml new file mode 100644 index 0000000000000..d67183a7239e8 --- /dev/null +++ b/.github/actions/nm-install-whl/action.yml @@ -0,0 +1,27 @@ +name: install whl +description: 'installs found whl based on python version into specified venv' +inputs: + python: + description: 'python version, e.g. 
3.10.12' + required: true + venv: + description: 'name for python virtual environment' + required: true +runs: + using: composite + steps: + - id: install_whl + run: | + # move source directories + mv vllm vllm-ignore + mv csrc csrc-ignore + # activate and install + COMMIT=${{ github.sha }} + VENV="${{ env.VENV_BASE }}-${COMMIT:0:7}" + source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate + pip3 install -r requirements-dev.txt + BASE=$(./.github/scripts/convert-version ${{ inputs.python }}) + WHL=$(find . -type f -iname "*${BASE}*.whl") + WHL_BASENAME=$(basename ${WHL}) + pip3 install ${WHL}[sparse] + shell: bash diff --git a/.github/actions/nm-set-python/action.yml b/.github/actions/nm-set-python/action.yml index 8558f97c5efe6..1a3092b735bd3 100644 --- a/.github/actions/nm-set-python/action.yml +++ b/.github/actions/nm-set-python/action.yml @@ -20,7 +20,7 @@ runs: pyenv local ${{ inputs.python }} COMMIT=${{ github.sha }} VENV="${{ inputs.venv }}-${COMMIT:0:7}" - pyenv virtualenv ${VENV} || true + pyenv virtualenv --force ${VENV} source $(pyenv root)/versions/${{ inputs.python }}/envs/${VENV}/bin/activate VERSION=$(python --version) echo "version=${VERSION}" >> "$GITHUB_OUTPUT" diff --git a/.github/scripts/nm-run-benchmarks.sh b/.github/scripts/nm-run-benchmarks.sh index 9bb975530079c..7e44c0a7a7f98 100755 --- a/.github/scripts/nm-run-benchmarks.sh +++ b/.github/scripts/nm-run-benchmarks.sh @@ -3,7 +3,7 @@ set -e set -u - + if [ $# -ne 2 ]; then echo "run_benchmarks needs exactly 2 arguments: " @@ -11,10 +11,10 @@ then echo " 2. The output path to store the benchmark results" exit 1 fi - + benchmark_config_list_file=$1 output_directory=$2 - + for bench_config in `cat $benchmark_config_list_file` do echo "Running benchmarks for config " $bench_config diff --git a/.github/workflows/build-test.yml b/.github/workflows/build-test.yml index f9005f86dffcd..0b3ce56982081 100644 --- a/.github/workflows/build-test.yml +++ b/.github/workflows/build-test.yml @@ -3,34 +3,69 @@ on: # makes workflow reusable workflow_call: inputs: - build_label: - description: "requested runner label (specifies instance)" + wf_category: + description: "categories: REMOTE, NIGHTLY, RELEASE" type: string - required: true - timeout: - description: "time limit for run in minutes " + default: "REMOTE" + python: + description: "python version, e.g. 3.10.12" type: string required: true - gitref: - description: "git commit hash or branch name" + # build related parameters + build_label: + description: "requested runner label (specifies instance)" type: string - required: true + default: "gcp-build-static" + build_timeout: + description: "time limit for build in minutes " + type: string + default: "60" Gi_per_thread: description: 'requested GiB to reserve per thread' type: string - required: true + default: "1" nvcc_threads: description: "number of threads nvcc build threads" type: string + default: "4" + # test related parameters + test_label_solo: + description: "requested runner label (specifies instance)" + type: string required: true - python: - description: "python version, e.g. 
3.10.12" + test_label_multi: + description: "requested runner label (specifies instance)" + type: string + required: true + test_timeout: + description: "time limit for test run in minutes " + type: string + required: true + gitref: + description: "git commit hash or branch name" type: string required: true test_skip_list: description: 'file containing tests to skip' type: string required: true + # benchmark related parameters + benchmark_label: + description: "requested benchmark label (specifies instance)" + type: string + default: "" + benchmark_config_list_file: + description: "benchmark configs file, e.g. 'nm_benchmark_nightly_configs_list.txt'" + type: string + required: true + benchmark_timeout: + description: "time limit for benchmarking" + type: string + default: "720" + push_benchmark_results_to_gh_pages: + description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" + type: string + default: "false" # makes workflow manually callable workflow_dispatch: @@ -39,8 +74,20 @@ on: description: "requested runner label (specifies instance)" type: string required: true - timeout: - description: "time limit for run in minutes " + build_timeout: + description: "time limit for build in minutes " + type: string + required: true + test_label_solo: + description: "requested runner label (specifies instance)" + type: string + required: true + test_label_multi: + description: "requested runner label (specifies instance)" + type: string + required: true + test_timeout: + description: "time limit for test run in minutes " type: string required: true gitref: @@ -70,25 +117,77 @@ jobs: uses: ./.github/workflows/build.yml with: build_label: ${{ inputs.build_label }} - timeout: ${{ inputs.timeout }} - gitref: ${{ inputs.gitref }} + timeout: ${{ inputs.build_timeout }} + gitref: ${{ github.ref }} Gi_per_thread: ${{ inputs.Gi_per_thread }} nvcc_threads: ${{ inputs.nvcc_threads }} python: ${{ inputs.python }} secrets: inherit - TEST: + TEST-SOLO: needs: [BUILD] if: success() - strategy: - matrix: - test_label: [aws-avx2-192G-4-a10g-96G] uses: ./.github/workflows/test.yml with: - test_label: ${{ matrix.test_label }} - timeout: ${{ inputs.timeout }} - gitref: ${{ inputs.gitref }} + test_label: ${{ inputs.test_label_solo }} + timeout: ${{ inputs.test_timeout }} + gitref: ${{ github.ref }} + python: ${{ inputs.python }} + whl: ${{ needs.BUILD.outputs.whl }} + test_skip_list: ${{ inputs.test_skip_list }} + secrets: inherit + + TEST-MULTI: + needs: [BUILD] + if: success() && contains(fromJSON('["NIGHTLY", "RELEASE"]'), inputs.wf_category) + uses: ./.github/workflows/test.yml + with: + test_label: ${{ inputs.test_label_multi }} + timeout: ${{ inputs.test_timeout }} + gitref: ${{ github.ref }} python: ${{ inputs.python }} whl: ${{ needs.BUILD.outputs.whl }} test_skip_list: ${{ inputs.test_skip_list }} secrets: inherit + + PUBLISH: + needs: [TEST-SOLO, TEST-MULTI] + uses: ./.github/workflows/nm-publish.yml + with: + label: ${{ inputs.build_label }} + timeout: ${{ inputs.build_timeout }} + gitref: ${{ github.ref }} + python: ${{ inputs.python }} + whl: ${{ needs.BUILD.outputs.whl }} + tarfile: ${{ needs.BUILD.outputs.tarfile }} + secrets: inherit + + BENCHMARK: + needs: [BUILD] + if: success() + uses: ./.github/workflows/nm-benchmark.yml + with: + label: ${{ inputs.test_label_solo }} + benchmark_config_list_file: ${{ inputs.benchmark_config_list_file }} + timeout: ${{ inputs.benchmark_timeout }} + gitref: ${{ github.ref }} + python: ${{ inputs.python }} + whl: ${{ 
needs.BUILD.outputs.whl }} + # Always push if it is a scheduled job + push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" + secrets: inherit + + # TODO: decide if this should build or use the whl + # single gpu + # TODO: this should only run if doing a NIGHTLY or RELEASE + # Accuracy-Smoke-AWS-AVX2-32G-A10G-24G: + # if: ${{ inputs.wf_category == 'NIGHTLY' || inputs.wf_category == 'RELEASE' }} + # uses: ./.github/workflows/nm-lm-eval-smoke.yml + # with: + # label: ${{ inputs.test_label_solo }} + # timeout: ${{ inputs.benchmark_timeout }} + # gitref: ${{ github.ref }} + # Gi_per_thread: ${{ inputs.Gi_per_thread }} + # nvcc_threads: ${{ inputs.nvcc_threads }} + # python: ${{ inputs.python }} + # secrets: inherit diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 4687314766874..0c2b2f3fa8727 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -67,6 +67,7 @@ jobs: timeout-minutes: ${{ fromJson(inputs.timeout) }} outputs: whl: ${{ steps.build.outputs.whl }} + tarfile: ${{ steps.build.outputs.tarfile }} steps: diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 510bfcc896ac3..d45a0be2b0288 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -6,64 +6,31 @@ on: - cron: '0 1 * * *' workflow_dispatch: - inputs: - push_benchmark_results_to_gh_pages: - description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI " - type: choice - options: - - 'true' - - 'false' - default: 'false' + inputs: + push_benchmark_results_to_gh_pages: + description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI " + type: choice + options: + - 'true' + - 'false' + default: 'false' jobs: - NIGHTLY-MULTI: + BUILD-TEST: uses: ./.github/workflows/build-test.yml with: - build_label: aws-avx2-192G-4-a10g-96G - timeout: 480 - gitref: ${{ github.ref }} - Gi_per_thread: 4 - nvcc_threads: 8 + wf_category: NIGHTLY python: 3.10.12 - test_skip_list: - secrets: inherit - - NIGHTLY-SOLO: - uses: ./.github/workflows/build-test.yml - with: - build_label: aws-avx2-32G-a10g-24G - timeout: 480 gitref: ${{ github.ref }} - Gi_per_thread: 12 - nvcc_threads: 1 - python: 3.11.4 + + test_label_solo: aws-avx2-32G-a10g-24G + test_label_multi: aws-avx2-192G-4-a10g-96G + test_timeout: 480 test_skip_list: - secrets: inherit - # single gpu - AWS-AVX2-32G-A10G-24G-Benchmark: - uses: ./.github/workflows/nm-benchmark.yml - with: - label: aws-avx2-32G-a10g-24G - benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt - timeout: 720 - gitref: '${{ github.ref }}' - Gi_per_thread: 12 - nvcc_threads: 1 - python: "3.10.12" - # Always push if it is a scheduled job + benchmark_label: aws-avx2-32G-a10g-24G + benchmark_config_list_file: ./.github/data/nm_benchmark_nightly_configs_list.txt + benchmark_timeout: 720 push_benchmark_results_to_gh_pages: "${{ github.event_name == 'schedule' || inputs.push_benchmark_results_to_gh_pages }}" secrets: inherit - - # single gpu - Accuracy-Smoke-AWS-AVX2-32G-A10G-24G: - uses: ./.github/workflows/nm-lm-eval-smoke.yml - with: - label: aws-avx2-32G-a10g-24G - timeout: 240 - gitref: '${{ github.ref }}' - Gi_per_thread: 12 - nvcc_threads: 1 - python: "3.10.12" - secrets: inherit diff --git a/.github/workflows/nm-benchmark.yml b/.github/workflows/nm-benchmark.yml index 18be16f0bb2d5..4733775621432 100644 --- a/.github/workflows/nm-benchmark.yml +++ 
b/.github/workflows/nm-benchmark.yml @@ -1,4 +1,4 @@ -name: benchmark +name: benchmark on: # makes workflow reusable workflow_call: @@ -19,18 +19,14 @@ on: description: "git commit hash or branch name" type: string required: true - Gi_per_thread: - description: 'requested GiB to reserve per thread' - type: string - required: true - nvcc_threads: - description: "number of threads nvcc build threads" - type: string - required: true python: description: "python version, e.g. 3.10.12" type: string required: true + whl: + description: "whl to test (variable appears late binding so unusable outside 'download artifact')" + type: string + required: true push_benchmark_results_to_gh_pages: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" type: string @@ -55,18 +51,14 @@ on: description: "git commit hash or branch name" type: string required: true - Gi_per_thread: - description: 'requested GiB to reserve per thread' - type: string - required: true - nvcc_threads: - description: "number of threads nvcc build threads" - type: string - required: true python: description: "python version, e.g. 3.10.12" type: string required: true + whl: + description: "whl to test (variable appears late binding so unusable outside 'download artifact')" + type: string + required: true push_benchmark_results_to_gh_pages: description: "When set to true, the workflow pushes all benchmarking results to gh-pages UI" type: choice @@ -75,11 +67,16 @@ on: - 'false' default: 'false' +env: + VENV_BASE: "BENCHMARK" + jobs: + BENCHMARK: runs-on: ${{ inputs.label }} timeout-minutes: ${{ fromJSON(inputs.timeout) }} + outputs: gh_action_benchmark_input_artifact_name: ${{ steps.set_gh_action_benchmark_input_artifact_name.outputs.gh_action_benchmark_input_artifact_name}} @@ -96,15 +93,15 @@ jobs: uses: ./.github/actions/nm-set-env/ with: hf_token: ${{ secrets.NM_HF_TOKEN }} - Gi_per_thread: ${{ inputs.Gi_per_thread }} - nvcc_threads: ${{ inputs.nvcc_threads }} + Gi_per_thread: 1 + nvcc_threads: 0 - name: set python id: set_python uses: ./.github/actions/nm-set-python/ with: python: ${{ inputs.python }} - venv: TEST + venv: ${{ env.VENV_BASE }} - name: hf cache id: hf_cache @@ -112,13 +109,19 @@ jobs: with: fs_cache: ${{ secrets.HF_FS_CACHE }} - - name: build - id: build - uses: ./.github/actions/nm-build-vllm/ + - name: download whl + id: download + uses: actions/download-artifact@v4 with: - python: ${{ inputs.python }} - venv: TEST - pypi: ${{ secrets.NM_PRIVATE_PYPI_LOCATION }} + name: ${{ inputs.whl }} + path: ${{ inputs.whl }} + + - name: install whl + id: install_whl + uses: ./.github/actions/nm-install-whl/ + with: + python: ${{ inputs.python }} + venv: ${{ env.VENV_BASE }} - name: run benchmarks uses: ./.github/actions/nm-benchmark/ @@ -126,7 +129,7 @@ jobs: benchmark_config_list_file: ${{ inputs.benchmark_config_list_file }} output_directory: benchmark-results python: ${{ inputs.python }} - venv: TEST + venv: ${{ env.VENV_BASE }} - name: store benchmark result artifacts if: success() @@ -145,10 +148,10 @@ jobs: efs_dst: /EFS/benchmark_results # Produce GHA benchmark JSONs - - name: make github-action-benchmark JSONs + - name: make github-action-benchmark JSONs uses: ./.github/actions/nm-produce-gha-benchmark-json with: - vllm_benchmark_jsons_path: benchmark-results + vllm_benchmark_jsons_path: benchmark-results # Metrics that are "better" when the value is greater are stored here bigger_is_better_output_file_path: gh-action-benchmark-jsons/bigger_is_better.json # Metrics that are 
"better" when the value is smaller are stored here @@ -156,7 +159,7 @@ jobs: # Metrics that we only want to observe are stored here observation_metrics_output_file_path: gh-action-benchmark-jsons/observation_metrics.json python: ${{ inputs.python }} - venv: TEST + venv: ${{ env.VENV_BASE }} - name: set gh action benchmark input artifact name id: set_gh_action_benchmark_input_artifact_name @@ -169,7 +172,7 @@ jobs: uses: actions/upload-artifact@v4 with: name: ${{ steps.set_gh_action_benchmark_input_artifact_name.outputs.gh_action_benchmark_input_artifact_name}} - path: gh-action-benchmark-jsons + path: gh-action-benchmark-jsons retention-days: 1 - name: copy gh action benchmark JSONs to EFS store @@ -180,13 +183,13 @@ jobs: src: gh-action-benchmark-jsons efs_dst: /EFS/benchmark_results - NM_GH_ACTION_BENCHMARK: + BENCHMARK_REPORT: - needs: BENCHMARK + needs: [BENCHMARK] runs-on: ubuntu-latest timeout-minutes: 20 permissions: - # Permissions required to be able to push to the nm-gh-pages branch + # Permissions required to be able to push to the nm-gh-pages branch contents: write steps: diff --git a/.github/workflows/remote-push.yml b/.github/workflows/remote-push.yml index 56c26fc367f9e..5bc25d574e145 100644 --- a/.github/workflows/remote-push.yml +++ b/.github/workflows/remote-push.yml @@ -11,32 +11,18 @@ concurrency: jobs: - # multi-gpu BUILD-TEST: - strategy: - matrix: - python: [3.10.12] uses: ./.github/workflows/build-test.yml with: - build_label: gcp-build-static - timeout: 240 - gitref: '${{ github.ref }}' - Gi_per_thread: 1 - nvcc_threads: 4 - python: ${{ matrix.python }} + python: 3.10.12 + gitref: ${{ github.ref }} + + test_label_solo: aws-avx2-32G-a10g-24G + test_label_multi: ignore + test_timeout: 480 test_skip_list: neuralmagic/tests/skip-for-remote-push.txt - secrets: inherit - # Benchmarks - AWS-AVX2-32G-A10G-24G-Benchmark: - uses: ./.github/workflows/nm-benchmark.yml - with: - label: aws-avx2-32G-a10g-24G - benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt - timeout: 180 - gitref: '${{ github.ref }}' - Gi_per_thread: 1 - nvcc_threads: 4 - python: 3.10.12 - push_benchmark_results_to_gh_pages: "false" + benchmark_label: aws-avx2-32G-a10g-24G + benchmark_config_list_file: ./.github/data/nm_benchmark_remote_push_configs_list.txt + benchmark_timeout: 180 secrets: inherit diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index bcf7d73a695ed..b081a63b7e9e1 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -56,6 +56,9 @@ on: type: string required: true +env: + VENV_BASE: "TEST" + jobs: TEST: @@ -79,13 +82,14 @@ jobs: with: hf_token: ${{ secrets.NM_HF_TOKEN }} Gi_per_thread: 1 + nvcc_threads: 0 - name: set python id: set_python uses: ./.github/actions/nm-set-python/ with: python: ${{ inputs.python }} - venv: TEST + venv: ${{ env.VENV_BASE }} - name: hf cache id: hf_cache diff --git a/neuralmagic/benchmarks/common.py b/neuralmagic/benchmarks/common.py index fbfa6153332b2..459ee47eddadc 100644 --- a/neuralmagic/benchmarks/common.py +++ b/neuralmagic/benchmarks/common.py @@ -6,7 +6,8 @@ # TODO (varun) : find a workaround so we avoid using private methods from vllm.config import _get_and_verify_max_len -from vllm.model_executor.weight_utils import prepare_hf_model_weights +from vllm.model_executor.model_loader.weight_utils import ( + download_weights_from_hf) from vllm.transformers_utils.config import get_config from vllm.transformers_utils.tokenizer import get_tokenizer @@ -15,7 +16,7 @@ def 
download_model(model: str) -> None: """ Downloads a hugging face model to cache """ - prepare_hf_model_weights(model) + download_weights_from_hf(model) get_tokenizer(model) @@ -56,7 +57,7 @@ def script_args_to_cla(config: NamedTuple) -> Iterable[dict]: def benchmark_configs(config_file_path: Path) -> Iterable[NamedTuple]: """ - Give a path to a config file in `neuralmagic/benchmarks/configs/*` + Give a path to a config file in `neuralmagic/benchmarks/configs/*` return an Iterable of (sub)configs in the file """ assert config_file_path.exists() diff --git a/neuralmagic/benchmarks/requirements-benchmark.txt b/neuralmagic/benchmarks/requirements-benchmark.txt index df1c80adcfc17..095bba70f1946 100644 --- a/neuralmagic/benchmarks/requirements-benchmark.txt +++ b/neuralmagic/benchmarks/requirements-benchmark.txt @@ -2,4 +2,3 @@ requests aiohttp datasets -nm-magic-wand-nightly diff --git a/neuralmagic/tests/skip-almost-all.txt b/neuralmagic/tests/skip-almost-all.txt index 543086461bc47..99a541c7e1628 100644 --- a/neuralmagic/tests/skip-almost-all.txt +++ b/neuralmagic/tests/skip-almost-all.txt @@ -1,14 +1,27 @@ tests/test_sequence.py tests/metrics/test_metrics.py tests/kernels/test_prefix_prefill.py +tests/kernels/test_pos_encoding.py tests/kernels/test_activation.py tests/kernels/test_moe.py tests/kernels/test_layernorm.py tests/kernels/test_attention.py +tests/kernels/test_rand.py tests/kernels/test_cache.py +tests/kernels/test_sampler.py tests/core/test_block_manager.py +tests/core/test_chunked_prefill_scheduler.py tests/core/test_scheduler.py +tests/core/block/test_cpu_gpu_block_allocator.py +tests/core/block/test_common.py +tests/core/block/test_prefix_caching_block.py +tests/core/block/test_block_table.py +tests/core/block/test_block_manager_v2.py +tests/core/block/test_naive_block.py +tests/core/block/e2e/test_correctness.py tests/distributed/test_basic_distributed_correctness.py +tests/distributed/test_pynccl.py +tests/distributed/test_chunked_prefill_distributed.py tests/distributed/test_custom_all_reduce.py tests/distributed/test_comm_ops.py tests/prefix_caching/test_prefix_caching.py @@ -17,12 +30,20 @@ tests/models/test_compressed_memory.py tests/models/test_marlin.py tests/models/test_compressed.py tests/models/test_models_logprobs.py +tests/models/test_big_models.py tests/models/test_models.py +tests/models/test_llava.py +tests/models/test_oot_registration.py +tests/tokenization/test_detokenize.py +tests/tokenization/test_tokenizer_group.py +tests/tokenization/test_cached_tokenizer.py tests/spec_decode/test_utils.py tests/spec_decode/test_spec_decode_worker.py tests/spec_decode/test_metrics.py tests/spec_decode/test_batch_expansion.py +tests/spec_decode/e2e/test_correctness.py tests/spec_decode/test_multi_step_worker.py +tests/quantization/test_autogptq_marlin_configs.py tests/test_sampling_params.py tests/async_engine/test_async_llm_engine.py tests/async_engine/test_api_server.py @@ -30,24 +51,37 @@ tests/async_engine/test_chat_template.py tests/async_engine/test_request_tracker.py tests/samplers/test_beam_search.py tests/samplers/test_logprobs.py +tests/samplers/test_ranks.py +tests/samplers/test_logits_processor.py tests/samplers/test_seeded_generate.py tests/samplers/test_rejection_sampler.py tests/samplers/test_sampler.py +tests/test_config.py +tests/entrypoints/test_server_oot_registration.py tests/entrypoints/test_guided_processors.py tests/entrypoints/test_openai_server.py tests/lora/test_llama.py tests/lora/test_utils.py -tests/lora/test_tokenizer.py 
tests/lora/test_layer_variation.py tests/lora/test_gemma.py tests/lora/test_lora_manager.py tests/lora/test_layers.py +tests/lora/test_lora_checkpoints.py +tests/lora/test_baichuan.py tests/lora/test_worker.py tests/lora/test_mixtral.py tests/lora/test_punica.py +tests/lora/test_tokenizer_group.py +tests/lora/test_quant_model.py +tests/lora/test_chatglm3.py tests/lora/test_lora.py +tests/test_logits_processor.py +tests/worker/test_swap.py tests/worker/test_model_runner.py -tests/engine/test_detokenize.py +tests/engine/test_stop_reason.py +tests/engine/test_stop_strings.py +tests/engine/test_detokenization.py tests/engine/test_computed_prefix_blocks.py +tests/basic_correctness/test_chunked_prefill.py +tests/basic_correctness/test_basic_correctness.py tests/test_cache_block_hashing.py -tests/test_regression.py
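
Note: the whl-centric install flow introduced by the new `nm-install-whl` action (and mirrored in `nm-install-test-whl` above) can be read as the standalone sketch below. It is illustrative only: `PYTHON_VERSION` and `VENV_BASE` are hypothetical local stand-ins for the action inputs and workflow env, and it assumes `./.github/scripts/convert-version` maps a version like `3.10.12` to the fragment matched in the wheel filename, with exactly one such wheel present in the tree.

```bash
#!/usr/bin/env bash
# Sketch of the whl-centric install flow from the composite actions.
set -euo pipefail

PYTHON_VERSION="3.10.12"   # hypothetical stand-in for the 'python' input
VENV_BASE="TEST"           # hypothetical stand-in for VENV_BASE / 'venv' input
COMMIT=$(git rev-parse HEAD)

# Move source directories aside so tests/benchmarks import the installed
# wheel rather than the checked-out sources.
mv vllm vllm-ignore || echo "no 'vllm' folder to move"
mv csrc csrc-ignore || echo "no 'csrc' folder to move"

# Activate the per-commit virtualenv, named as nm-set-python creates it.
VENV="${VENV_BASE}-${COMMIT:0:7}"
source "$(pyenv root)/versions/${PYTHON_VERSION}/envs/${VENV}/bin/activate"

pip3 install -r requirements-dev.txt

# Locate the generated wheel for this python version and install it; the
# [sparse] extra pulls in nm-magic-wand-nightly, replacing the previous
# explicit install from the private pypi index (see the requirements and
# nm-install-test-whl changes above).
BASE=$(./.github/scripts/convert-version "${PYTHON_VERSION}")
WHL=$(find . -type f -iname "*${BASE}*.whl")
pip3 install "${WHL}[sparse]"

# Confirm which magic_wand version came in via the extra.
pip3 show nm-magic-wand-nightly | grep "Version"
```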
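
The two-argument contract of `nm-run-benchmarks.sh` is unchanged by the whitespace cleanup above; a usage sketch with the nightly inputs wired through `build-test.yml`:

```bash
# Run every benchmark config listed in the nightly list; results land in
# per-config files under benchmark-results/.
./.github/scripts/nm-run-benchmarks.sh \
  ./.github/data/nm_benchmark_nightly_configs_list.txt \
  benchmark-results
```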