diff --git a/.github/workflows/amd_perf_kernel_tests.yml b/.github/workflows/amd_perf_kernel_Integration_tests.yml
similarity index 95%
rename from .github/workflows/amd_perf_kernel_tests.yml
rename to .github/workflows/amd_perf_kernel_Integration_tests.yml
index 07424924a832..a8a8b3d50b9e 100644
--- a/.github/workflows/amd_perf_kernel_tests.yml
+++ b/.github/workflows/amd_perf_kernel_Integration_tests.yml
@@ -1,4 +1,4 @@
-name: AMD Perf Kernel Tests
+name: AMD Perf Kernel Integration Tests
 
 on:
   workflow_dispatch:
@@ -7,8 +7,6 @@ on:
   merge_group:
     branches: [main_perf]
     types: [checks_requested]
-  push:
-    branches: [main_perf]
 
 concurrency:
   group: ${{ github.ref }}
@@ -36,8 +34,8 @@ jobs:
           changed_files=$(git diff --name-only origin/${{ github.base_ref }} ${{ github.sha }})
           echo "Changed files:"
           echo "$changed_files"
-          if echo "$changed_files" | grep -v "^python/perf-kernels/"; then
-            echo "Changes detected outside of the python/perf-kernels directory. Failing the workflow."
+          if echo "$changed_files" | grep -vE "^python/perf-kernels/|^\.github/workflows/amd_"; then
+            echo "Changes detected outside of the python/perf-kernels directory or .github/workflows/amd_ files. Failing the workflow."
             exit 1
           fi
 
diff --git a/.github/workflows/amd_perf_kernel_postmerge_tests.yml b/.github/workflows/amd_perf_kernel_postmerge_tests.yml
new file mode 100644
index 000000000000..40f211118541
--- /dev/null
+++ b/.github/workflows/amd_perf_kernel_postmerge_tests.yml
@@ -0,0 +1,92 @@
+name: AMD Perf Kernel Post-Merge Tests
+
+on:
+  workflow_dispatch:
+  push:
+    branches: [main_perf, micmelesse/post_merge_ci]
+
+concurrency:
+  group: ${{ github.ref }}
+  cancel-in-progress: ${{ github.ref != 'refs/heads/main_perf' }}
+
+permissions: read-all
+
+env:
+  TRITON_BUILD_WITH_CLANG_LLD: "TRUE"
+  TRITON_USE_ASSERT_ENABLED_LLVM: "TRUE"
+  TRITON_DISABLE_LINE_INFO: 1
+
+jobs:
+  Runner-Preparation-AMD:
+    runs-on: ubuntu-latest
+    timeout-minutes: 30
+    outputs:
+      matrix-HIP: ${{ steps.set-matrix.outputs.matrix-HIP }}
+    steps:
+      - name: Prepare runner matrix
+        id: set-matrix
+        run: |
+          if [ x"${{ github.repository }}" == x"ROCm/triton" ]; then
+            echo '::set-output name=matrix-HIP::[["self-hosted", "rocm.gfx90a"]]'
+          else
+            echo '::set-output name=matrix-HIP::[["ubuntu-latest"]]'
+          fi
+
+  PostMerge-Tests-AMD:
+    needs: Runner-Preparation-AMD
+    if: needs.Runner-Preparation-AMD.outputs.matrix-HIP != ''
+    runs-on: ${{ matrix.runner }}
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        runner: ${{fromJson(needs.Runner-Preparation-AMD.outputs.matrix-HIP)}}
+    container:
+      image: rocm/pytorch:rocm6.0.2_ubuntu22.04_py3.10_pytorch_2.1.2
+      options: --device=/dev/kfd --device=/dev/dri --security-opt seccomp=unconfined --group-add video --user root
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0 # Ensure the entire history is fetched for rebase
+      - name: Add upstream remote
+        run: |
+          git config --global --add safe.directory /__w/triton/triton
+          if [ $(git remote | grep -c upstream) -eq 0 ]; then
+            git remote add upstream https://github.com/triton-lang/triton.git
+          fi
+          git fetch upstream
+      - name: Rebase onto upstream/main
+        run: |
+          git config --global user.email "ci@amd.com"
+          git config --global user.name "Github Actions Post-Merge CI Script"
+          git rebase upstream/main || { echo "Rebase failed"; exit 1; }
+      - name: Show Git Log
+        run: |
+          echo "Git log after rebase from upstream/main to HEAD:"
+          git log $(git rev-parse upstream/main~2)..HEAD --oneline --graph --decorate
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: Clear cache
+        run: |
+          rm -rf ~/.triton
+          mkdir -p ~/.triton
+          ls -alh ~/.triton
+      - name: Update PATH
+        run: |
+          echo "/opt/rocm/llvm/bin" >> $GITHUB_PATH
+      - name: Install pip dependencies
+        run: |
+          python3 -m pip install --upgrade pip
+          python3 -m pip install lit matplotlib pandas
+      - name: Install Triton
+        run: |
+          echo "PATH is '$PATH'"
+          pip uninstall -y triton
+          cd python
+          pip install -v -e .
+      - name: Run Perf Kernels Unit Tests
+        run: |
+          pytest -vvv ./python/perf-kernels/flash-attention.py
+      - name: Run Perf Kernels Benchmark
+        run: |
+          python ./python/perf-kernels/flash-attention.py
diff --git a/python/perf-kernels/flash-attention.py b/python/perf-kernels/flash-attention.py
index d36caaf61952..8177cf4ebf30 100644
--- a/python/perf-kernels/flash-attention.py
+++ b/python/perf-kernels/flash-attention.py
@@ -309,8 +309,8 @@ def _attn_fwd_inner(acc, l_i, m_i, q, k_ptrs, v_ptrs, bias_ptrs, stride_kn, stri
                    num_warps=8),
     triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': True}, num_stages=1,
                   num_warps=4),
-    triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': False}, num_stages=1,
-                  num_warps=4),
+    # triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': False}, num_stages=1,
+    #               num_warps=4),
     triton.Config({'BLOCK_M': 64, 'BLOCK_N': 64, 'waves_per_eu': 4, 'PRE_LOAD_V': False}, num_stages=1,
                   num_warps=8),
     triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 1, 'PRE_LOAD_V': False}, num_stages=1,
@@ -1166,7 +1166,8 @@ def test_op_fwd(Z, HQ, HK, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_alibi, layout,
 ])
 @pytest.mark.parametrize('causal', [True, False])
 @pytest.mark.parametrize('use_bias', [True])
-def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype=torch.float16):
+@pytest.mark.parametrize('dtype', [torch.float16, torch.bfloat16])
+def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype):
     torch.manual_seed(20)
     sm_scale = D_HEAD**-0.5
     input_metadata = MetaData(sm_scale=sm_scale)
@@ -1174,7 +1175,7 @@ def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype=tor
     if causal:
         input_metadata.need_causal()
     if use_bias:
-        bias = torch.randn((1, H, N_CTX_Q, N_CTX_K), dtype=torch.float32, device="cuda")
+        bias = torch.randn((1, H, N_CTX_Q, N_CTX_K), dtype=dtype, device="cuda")
         input_metadata.need_bias(bias, Z, H, N_CTX_Q, N_CTX_K)
     else:
         bias = None
@@ -1197,7 +1198,7 @@ def test_op_fwd_bias(Z, H, N_CTX_Q, N_CTX_K, D_HEAD, causal, use_bias, dtype=tor
         # this by converting the NaNs to 0s, which is what they should be out of the softmax.
         nan_mask = torch.isnan(p)
         p[nan_mask == 1] = 0
-    ref_out = torch.einsum('bhqk,bhkd->bhqd', p.half(), v)
+    ref_out = torch.einsum('bhqk,bhkd->bhqd', p.to(dtype), v)
     # compare
     torch.testing.assert_close(ref_out, tri_out, atol=2e-2, rtol=2e-2)