diff --git a/.github/workflows/amd_tests.yml b/.github/workflows/amd_tests.yml
index 21cfe22d7..ab3615dea 100644
--- a/.github/workflows/amd_tests.yml
+++ b/.github/workflows/amd_tests.yml
@@ -4,11 +4,6 @@ on:
   workflow_dispatch:
   pull_request:
     branches: [main_perf]
-  merge_group:
-    branches: [main_perf]
-    types: [checks_requested]
-  push:
-    branches: [main_perf]
 
 concurrency:
   group: ${{ github.ref }}
@@ -55,6 +50,12 @@ jobs:
         python setup.py install
 
       # CDNA Tests
+      - name: Flash Attention Tests Using Reference Impl
+        if: matrix.runner == 'linux-mi300-gpu-1'
+        run: |
+          export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
+          export FLASH_ATTENTION_TRITON_AMD_REF=1
+          pytest tests/test_flash_attn_triton_amd.py
       - name: Flash Attention CDNA Tests
         if: matrix.runner == 'linux-mi300-gpu-1'
         run: |
@@ -78,12 +79,6 @@ jobs:
           python flash_attn/flash_attn_triton_amd/bench.py
 
       # RDNA Tests
-      - name: Flash Attention Tests Using Reference Impl
-        if: matrix.runner == 'gfx1100'
-        run: |
-          export FLASH_ATTENTION_TRITON_AMD_ENABLE="TRUE"
-          export FLASH_ATTENTION_TRITON_AMD_REF=1
-          pytest tests/test_flash_attn_triton_amd.py
       - name: Flash Attention RDNA Tests
         if: matrix.runner == 'gfx1100'
         run: |
diff --git a/.gitignore b/.gitignore
index b1f8a9715..ddc0f514c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,6 +32,7 @@ csrc/flash_attn_ck
 .eggs
 *.log
 core.*
+gpucore.*
 *.csv
 *.png
 *.html
diff --git a/README.md b/README.md
index 776824f75..fbe00936c 100644
--- a/README.md
+++ b/README.md
@@ -149,7 +149,7 @@ These features are in development
 2) Sliding Window
 5) Performance Improvements
 
-#### Getting Started
+##### Getting Started
 To get started with the triton backend for AMD, follow the steps below.
 
 First install the recommended Triton [commit](https://github.com/triton-lang/triton/commit/3ca2f498e98ed7249b82722587c511a5610e00c4).
diff --git a/flash_attn/flash_attn_triton_amd/README.md b/flash_attn/flash_attn_triton_amd/README.md
index 353b493f6..be1e8ded6 100644
--- a/flash_attn/flash_attn_triton_amd/README.md
+++ b/flash_attn/flash_attn_triton_amd/README.md
@@ -23,7 +23,7 @@ These features are in development
 2) Sliding Window
 5) Performance Improvements
 
-#### Getting Started
+##### Getting Started
 To get started with the triton backend for AMD, follow the steps below.
 
 First install the recommended Triton [commit](https://github.com/triton-lang/triton/commit/3ca2f498e98ed7249b82722587c511a5610e00c4).
@@ -43,7 +43,7 @@ python setup.py install
 pytest tests/test_flash_attn_triton_amd.py
 ```
 
-#### Credits
+##### Credits
 AMD Triton kernels team
 
 OpenAI kernel team
diff --git a/tests/test_flash_attn_triton_amd.py b/tests/test_flash_attn_triton_amd.py
index 623eb1e9c..28f947beb 100755
--- a/tests/test_flash_attn_triton_amd.py
+++ b/tests/test_flash_attn_triton_amd.py
@@ -575,8 +575,8 @@ def get_dropout_fraction(
 @pytest.mark.parametrize("alibi", [False])
 # @pytest.mark.parametrize("local", [False, True])
 @pytest.mark.parametrize("local", [False])
-# @pytest.mark.parametrize("causal", [False, True])
-@pytest.mark.parametrize("causal", [False])
+@pytest.mark.parametrize("causal", [False, True])
+# @pytest.mark.parametrize("causal", [False])
 @pytest.mark.parametrize("d", [32, 40, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize('d', [32, 64, 96, 128])
@@ -738,8 +738,8 @@ def test_flash_attn_qkvpacked(seqlen, d, dropout_p, causal, local, alibi, determ
 @pytest.mark.parametrize("alibi", [False])
 # @pytest.mark.parametrize("local", [False, True])
 @pytest.mark.parametrize("local", [False])
-# @pytest.mark.parametrize("causal", [False, True])
-@pytest.mark.parametrize('causal', [False])
+@pytest.mark.parametrize("causal", [False, True])
+# @pytest.mark.parametrize("causal", [False])
 @pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize('d', [32])
@@ -898,8 +898,8 @@ def test_flash_attn_varlen_qkvpacked(
 @pytest.mark.parametrize("alibi", [False])
 # @pytest.mark.parametrize("local", [False, True])
 @pytest.mark.parametrize("local", [False])
-# @pytest.mark.parametrize("causal", [False, True])
-@pytest.mark.parametrize("causal", [False])
+@pytest.mark.parametrize("causal", [False, True])
+# @pytest.mark.parametrize("causal", [False])
 @pytest.mark.parametrize("d", [32, 40, 59, 64, 96, 111, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize('d', [32, 40, 64, 80, 96, 128, 160, 192])
@@ -930,6 +930,10 @@ def test_flash_attn_output(
     seqlen_q, seqlen_k, d, dropout_p, causal, local, alibi, deterministic, mha_type, dtype, kvpacked, softcap
 ):
     if USE_TRITON_ROCM:
+        if causal:
+            if seqlen_q == 1024 and seqlen_k == 1024 and d == 160:
+                pytest.skip("This test with causal=True is flakey")
+
         if softcap != 0.0:
             pytest.skip("softcap not supported on AMD's Triton Backend yet")
 
@@ -1205,8 +1209,8 @@ def test_flash_attn_output(
 @pytest.mark.parametrize("alibi", [False])
 # @pytest.mark.parametrize("local", [False, True])
 @pytest.mark.parametrize("local", [False])
-# @pytest.mark.parametrize("causal", [False, True])
-@pytest.mark.parametrize('causal', [False])
+@pytest.mark.parametrize("causal", [False, True])
+# @pytest.mark.parametrize("causal", [False])
 @pytest.mark.parametrize("d", [32, 59, 64, 80, 96, 111, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize("d", [32, 64, 96, 128, 160, 192, 224, 256])
 # @pytest.mark.parametrize('d', [32])