diff --git a/.github/workflows/amd_perf_kernel_tests.yml b/.github/workflows/amd_perf_kernel_Integration_tests.yml
similarity index 95%
rename from .github/workflows/amd_perf_kernel_tests.yml
rename to .github/workflows/amd_perf_kernel_Integration_tests.yml
index 07424924a832..a8a8b3d50b9e 100644
--- a/.github/workflows/amd_perf_kernel_tests.yml
+++ b/.github/workflows/amd_perf_kernel_Integration_tests.yml
@@ -1,4 +1,4 @@
-name: AMD Perf Kernel Tests
+name: AMD Perf Kernel Integration Tests
 
 on:
   workflow_dispatch:
@@ -7,8 +7,6 @@ on:
   merge_group:
     branches: [main_perf]
     types: [checks_requested]
-  push:
-    branches: [main_perf]
 
 concurrency:
   group: ${{ github.ref }}
@@ -36,8 +34,8 @@ jobs:
           changed_files=$(git diff --name-only origin/${{ github.base_ref }} ${{ github.sha }})
           echo "Changed files:"
           echo "$changed_files"
-          if echo "$changed_files" | grep -v "^python/perf-kernels/"; then
-            echo "Changes detected outside of the python/perf-kernels directory. Failing the workflow."
+          if echo "$changed_files" | grep -vE "^python/perf-kernels/|^\.github/workflows/amd_"; then
+            echo "Changes detected outside of the python/perf-kernels directory or .github/workflows/amd_ files. Failing the workflow."
             exit 1
           fi
 
diff --git a/.github/workflows/amd_perf_kernel_postmerge_tests.yml b/.github/workflows/amd_perf_kernel_postmerge_tests.yml
new file mode 100644
index 000000000000..40f211118541
--- /dev/null
+++ b/.github/workflows/amd_perf_kernel_postmerge_tests.yml
@@ -0,0 +1,92 @@
+name: AMD Perf Kernel Post-Merge Tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [main_perf, micmelesse/post_merge_ci]
+
+concurrency:
+  group: ${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main_perf' }}
+
+permissions: read-all
+
+env:
+  TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
+  TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
+  TRITON_DISABLE_LINE_INFO: 1
+
+jobs:
+  Runner-Preparation-AMD:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    outputs:
+      matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
+    steps:
+      - name: Prepare runner matrix
+        id: set-matrix
+        run: |
+          if [ x"${{ github.repository }}" == x"ROCm/triton" ]; then
+            echo '::set-output name=matrix-HIP::[["self-hosted", "rocm.gfx90a"]]'
+          else
+            echo '::set-output name=matrix-HIP::[["ubuntu-latest"]]'
+          fi
+
+  PostMerge-Tests-AMD:
+    needs: Runner-Preparation-AMD
+    if: needs.Runner-Preparation-AMD.outputs.matrix-HIP != ''
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        runner: ${{fromJson(needs.Runner-Preparation-AMD.outputs.matrix-HIP)}}
+    container:
+      image: rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2
+      options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Ensure the entire history is fetched for rebase
+      - name: Add upstream remote
+        run: |
+          git config --global --add safe.directory /__w/triton/triton
+          if [ $(git remote | grep -c upstream) -eq 0 ]; then
+            git remote add upstream https://github.com/triton-lang/triton.git
+          fi
+          git fetch upstream
+      - name: Rebase onto upstream/main
+        run: |
+          git config --global user.email "ci@amd.com"
+          git config --global user.name "Github Actions Post-Merge CI Script"
+          git rebase upstream/main || { echo "Rebase failed"; exit 1; }
+      - name: Show Git Log
+        run: |
+          echo "Git log after rebase from upstream/main to HEAD:"
+          git log $(git rev-parse upstream/main~2)..HEAD --oneline --graph --decorate
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Clear cache
+        run: |
+          rm -rf ~/.triton
+          mkdir -p ~/.triton
+          ls -alh ~/.triton
+      - name: Update PATH
+        run: |
+          echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH
+      - name: Install pip dependencies
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install lit matplotlib pandas
+      - name: Install Triton
+        run: |
+          echo "PATH is '$PATH'"
+          pip uninstall -y triton
+          cd python
+          pip install -v -e .
+      - name: Run Perf Kernels Unit Tests
+        run: |
+          pytest -vvv ./python/perf-kernels/flash-attention.py
+      - name: Run Perf Kernels Benchmark
+        run: |
+          python ./python/perf-kernels/flash-attention.py
diff --git a/python/perf-kernels/flash-attention.py b/python/perf-kernels/flash-attention.py
index d36caaf61952..8177cf4ebf30 100644
--- a/python/perf-kernels/flash-attention.py
+++ b/python/perf-kernels/flash-attention.py
@@ -309,8 +309,8 @@ def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stri
                    num_warps=8),
     triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': True}, num_stages=1,
                   num_warps=4),
-    triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': False}, num_stages=1,
-                  num_warps=4),
+    # triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': False}, num_stages=1,
+    #               num_warps=4),
     triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'waves_per_eu': 4, 'PRE_LOAD_V': False}, num_stages=1,
                   num_warps=8),
     triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 1, 'PRE_LOAD_V': False}, num_stages=1,
@@ -1166,7 +1166,8 @@ def test_op_fwd(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_alibi, layout,
 ])
 @pytest.mark.parametrize('causal', [True, False])
 @pytest.mark.parametrize('use_bias', [True])
-def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype=torch.float16):
+@pytest.mark.parametrize('dtype', [torch.float16, torch.bfloat16])
+def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype):
     torch.manual_seed(20)
     sm_scale = D_HEAD**-0.5
     input_metadata = MetaData(sm_scale=sm_scale)
@@ -1174,7 +1175,7 @@ def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype=tor
     if causal:
         input_metadata.need_causal()
     if use_bias:
-        bias = torch.randn((1, H, N_CTX_Q, N_CTX_K), dtype=torch.float32, device="cuda")
+        bias = torch.randn((1, H, N_CTX_Q, N_CTX_K), dtype=dtype, device="cuda")
         input_metadata.need_bias(bias, Z, H, N_CTX_Q, N_CTX_K)
     else:
         bias = None
@@ -1197,7 +1198,7 @@ def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype=tor
         # this by converting the NaNs to 0s, which is what they should be out of the softmax.
         nan_mask = torch.isnan(p)
         p[nan_mask == 1] = 0
-    ref_out = torch.einsum('bhqk,bhkd->bhqd', p.half(), v)
+    ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(dtype), v)
     # compare
     torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)