Skip to content

Commit

Permalink
Merge branch 'reduce_lds_usage' into fa_fwd_benchmark_2gpus
Browse files Browse the repository at this point in the history
  • Loading branch information
zhanglx13 committed Oct 11, 2023
2 parents 20f414f + a8e659c commit 3e525c9
Show file tree
Hide file tree
Showing 281 changed files with 35,748 additions and 5,005 deletions.
1 change: 1 addition & 0 deletions .github/workflows/amd-offline-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ jobs:
- name: Install Triton
run: |
cd python
pip3 install ninja
# Install in system, because need to override system triton. Otherwise lit tests will use wrong version
DEBUG=TRUE TRITON_USE_ROCM=TRUE TRITON_USE_ASSERT_ENABLED_LLVM=TRUE python3 -m pip install --no-build-isolation -vvv -e .
Expand Down
85 changes: 85 additions & 0 deletions .github/workflows/compare-artifacts.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
# Comments on a PR when the "Integration Tests" workflow reports that the PR's
# kernels are not bitwise identical to the target branch. Runs in a separate
# workflow_run-triggered workflow so it has permission to comment on PRs from forks.
name: Compare Artifacts
on:
  workflow_run:
    workflows:
      - Integration Tests
    types:
      - completed

jobs:
  Compare-artifacts:
    runs-on: ubuntu-latest
    # Only act when the triggering Integration Tests run succeeded.
    if: ${{ github.event.workflow_run.conclusion == 'success' }}

    steps:
      # Both artifacts are uploaded by the triggering run, so fetch them in one
      # pass instead of duplicating the list/download script per artifact.
      - name: Download pr_number and comparison_result artifacts
        uses: actions/github-script@v6
        with:
          script: |
            let fs = require('fs');
            let allArtifacts = await github.rest.actions.listWorkflowRunArtifacts({
              owner: context.repo.owner,
              repo: context.repo.repo,
              run_id: context.payload.workflow_run.id,
            });
            for (const name of ['pr_number', 'comparison_result']) {
              let matchArtifact = allArtifacts.data.artifacts.find((artifact) => {
                return artifact.name == name;
              });
              // Fail loudly if the triggering run did not upload the artifact,
              // instead of crashing on `matchArtifact.id` being undefined.
              if (!matchArtifact) {
                core.setFailed(`Artifact '${name}' not found in workflow run ${context.payload.workflow_run.id}`);
                return;
              }
              let download = await github.rest.actions.downloadArtifact({
                owner: context.repo.owner,
                repo: context.repo.repo,
                artifact_id: matchArtifact.id,
                archive_format: 'zip',
              });
              fs.writeFileSync(`${process.env.GITHUB_WORKSPACE}/${name}.zip`, Buffer.from(download.data));
            }
      - name: Unzip artifacts
        run: |
          unzip pr_number.zip
          unzip comparison_result.zip
      # Log what was downloaded so failures below are easy to diagnose.
      - name: Print artifacts
        uses: actions/github-script@v6
        with:
          script: |
            let fs = require('fs');
            let pr_number = Number(fs.readFileSync('./pr_number'));
            let comparison_result = fs.readFileSync('./comparison_result', 'utf8');
            console.log("PR number = ", pr_number);
            console.log("Comparison result = ", comparison_result);
      # Comment on the originating PR only when the comparison did not succeed.
      - name: Comment on PR
        uses: actions/github-script@v6
        with:
          github-token: ${{ secrets.GITHUB_TOKEN }}
          script: |
            let fs = require('fs');
            let run_id = context.payload.workflow_run.id;
            let issue_number = Number(fs.readFileSync('./pr_number'));
            let comparison_result = fs.readFileSync('./comparison_result', 'utf8');
            const message = `:warning: **This PR does not produce bitwise identical kernels as the branch it's merged against.** Please check artifacts for details. [Download the output file here](https://github.com/${{ github.repository }}/actions/runs/${run_id}).`;
            if (comparison_result.trim() !== 'SUCCESS') {
              await github.rest.issues.createComment({
                owner: context.repo.owner,
                repo: context.repo.repo,
                issue_number: issue_number,
                body: message
              });
            }
3 changes: 2 additions & 1 deletion .github/workflows/documentation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ jobs:
run: |
pip3 install tabulate
pip3 install cmake
pip3 install sphinx
#- name: Fetch dependent branches
# run: |
Expand All @@ -33,7 +34,7 @@ jobs:
run: |
cd docs
export PATH=$(python3 -c "import cmake; print(cmake.CMAKE_BIN_DIR)"):$PATH
python3 -m sphinx_multiversion . _build/html/
python3 -m sphinx . _build/html/main
- name: Update docs
run: |
Expand Down
115 changes: 83 additions & 32 deletions .github/workflows/integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ jobs:
run: |
if [ x"${{ github.repository }}" == x"openai/triton" ]; then
echo '::set-output name=matrix-required::[["self-hosted", "A100"], ["self-hosted", "H100"]]'
echo '::set-output name=matrix-optional::[["self-hosted", "gfx908"], ["self-hosted", "arc770"]]'
echo '::set-output name=matrix-optional::[]'
else
echo '::set-output name=matrix-required::["ubuntu-latest"]'
echo '::set-output name=matrix-optional::["ubuntu-latest"]'
Expand All @@ -50,6 +50,9 @@ jobs:
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
run: |
echo "BACKEND=CUDA" >> "${GITHUB_ENV}"
echo "ENABLE_TMA=0" >> "${GITHUB_ENV}"
echo "ENABLE_MMA_V3=0" >> "${GITHUB_ENV}"
echo "TRITON_DISABLE_LINE_INFO=1" >> "${GITHUB_ENV}"
- name: Clear cache
run: |
Expand All @@ -59,12 +62,18 @@ jobs:
run: |
echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
- name: Check pre-commit
run: |
python3 -m pip install --upgrade pre-commit
python3 -m pre_commit run --all-files --verbose
- name: Install Triton
if: ${{ env.BACKEND == 'CUDA'}}
run: |
cd python
python3 -m pip install --upgrade pip
python3 -m pip install cmake==3.24
python3 -m pip install ninja
python3 -m pip install --no-build-isolation -vvv '.[tests]'
python3 -m pip install pytest-xdist
Expand All @@ -79,19 +88,53 @@ jobs:
fi
lit -v "${LIT_TEST_DIR}"
- name: Run python tests on CUDA
if: ${{ env.BACKEND == 'CUDA'}}
- name: Enable MMAV3 and TMA
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'H100')}}
run: |
echo "ENABLE_TMA=1" >> "${GITHUB_ENV}"
echo "ENABLE_MMA_V3=1" >> "${GITHUB_ENV}"
- name: Run python tests on CUDA with ENABLE_TMA=1 and ENABLE_MMA_V3=1
if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1' && env.ENABLE_MMA_V3 == '1'}}
run: |
cd python/test/unit
python3 -m pytest -n 8 --ignore=runtime
python3 -m pytest -n 8 --ignore=runtime --ignore=operators --ignore=language/test_line_info.py
# run runtime tests serially to avoid race condition with cache handling.
python3 -m pytest runtime/
# run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest language/test_line_info.py
- name: Run python tests on CUDA with ENABLE_TMA=0 and ENABLE_MMA_V3=0
if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0' && env.ENABLE_MMA_V3 == '0'}}
run: |
cd python/test/unit
python3 -m pytest -n 8 --ignore=runtime --ignore=hopper --ignore=operators --ignore=language/test_line_info.py
# run runtime tests serially to avoid race condition with cache handling.
python3 -m pytest runtime/
# run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest language/test_line_info.py
- name: Clear cache
run: |
rm -rf ~/.triton
- name: Run partial tests on CUDA with ENABLE_TMA=1 and ENABLE_MMA_V3=1
if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1' && env.ENABLE_MMA_V3 == '1'}}
run: |
cd python/test/unit
python3 -m pytest -n 8 operators
- name: Run partial tests on CUDA with ENABLE_TMA=0 and ENABLE_MMA_V3=0
if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0' && env.ENABLE_MMA_V3 == '0'}}
run: |
cd python/test/unit
python3 -m pytest -n 8 operators
- name: Create artifacts archive
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
run: |
cd ~/.triton
tar -czvf artifacts.tar.gz cache
tar -czf artifacts.tar.gz cache
- name: Upload artifacts archive
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
Expand Down Expand Up @@ -119,6 +162,7 @@ jobs:
Integration-Tests-Third-Party:
needs: Runner-Preparation
if: false

runs-on: ${{ matrix.runner }}

Expand Down Expand Up @@ -218,10 +262,22 @@ jobs:
sudo apt update
sudo apt install gh
- name: Save PR number to a file
env:
PR_NUMBER: ${{ github.event.number }}
run: |
echo $PR_NUMBER > pr_number
- name: Upload PR number to artifacts
uses: actions/upload-artifact@v3
with:
name: pr_number
path: pr_number

- name: Download latest main artifacts
env:
ARTIFACT_NAME: artifacts A100
ARTIFACT_JOB_NAME: Integration-Tests-Nvidia
MAX_NUM_ACTIONS_PAGES: 30
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
OWNER_REPO="${{ github.repository }}"
Expand All @@ -238,18 +294,27 @@ jobs:
USER_ID=$(gh api repos/$OWNER_REPO/pulls/$PR_NUMBER --jq '.user.id')
echo "USER_ID: $USER_ID"
run_id_found=false
page=1
while true; do
if [ "$page" -gt $MAX_NUM_ACTIONS_PAGES ]; then
break
fi
run_id=$(gh api --method GET "repos/$OWNER_REPO/actions/runs?page=$page&per_page=100" | jq --arg branch_name "$BRANCH_NAME" --arg run_name "Integration Tests" --arg user_id "$USER_ID" '.workflow_runs[] | select(.head_branch == $branch_name and .name == $run_name and .actor.id == ($user_id | tonumber))' | jq '.id' | head -1)
if [ "$run_id" != "" ]; then
echo "First run ID on branch $BRANCH_NAME is: $run_id"
WORKFLOW_RUN_ID=$run_id
run_id_found=true
break
fi
((page++))
done
if ! $run_id_found; then
echo "No run_id found for PR ${PR_NUMBER}, moving to the next PR."
continue
fi
echo "WORKFLOW_RUN_ID: $WORKFLOW_RUN_ID"
ARTIFACT_URL=$(gh api repos/$OWNER_REPO/actions/runs/$WORKFLOW_RUN_ID/artifacts | jq --arg artifact_name "$ARTIFACT_NAME" '.artifacts[] | select(.name == $artifact_name).archive_download_url' --raw-output)
echo "ARTIFACT_URL: $ARTIFACT_URL"
Expand Down Expand Up @@ -289,7 +354,7 @@ jobs:
- name: Compare artifacts
run: |
set +e
python3 python/test/tools/compare_files.py --path1 reference --path2 current --kernels python/test/kernel_comparison/kernels.yml
python3 python/test/tools/compare_files.py --path1 reference --path2 current
exit_code=$?
set -e
echo $exit_code
Expand All @@ -303,34 +368,20 @@ jobs:
echo "Error while comparing artifacts"
echo "COMPARISON_RESULT=error" >> $GITHUB_ENV
fi
echo "COMPARISON_RESULT=env.COMPARISON_RESULT"
- name: Check exit code and handle failure
if: ${{ env.COMPARISON_RESULT == 'error' }}
- name: Check comparison result and write to file
run: |
echo "Error while comparing artifacts"
exit 1
- name: Fetch Run ID
id: get_run_id
run: echo "RUN_ID=${{ github.run_id }}" >> $GITHUB_ENV

if [ "${{ env.COMPARISON_RESULT }}" = "true" ]; then
echo "SUCCESS" > comparison_result
else
echo "FAILED" > comparison_result
fi
- name: Upload comparison result to artifacts
uses: actions/upload-artifact@v3
with:
name: comparison_result
path: comparison_result
- name: Upload results as artifact
uses: actions/upload-artifact@v2
with:
name: kernels-reference-check
path: kernels_reference_check.txt

- name: Check output and comment on PR
if: ${{ env.COMPARISON_RESULT == 'false' }}
uses: actions/github-script@v5
with:
github-token: ${{ secrets.CI_ACCESS_TOKEN }}
script: |
const run_id = ${{ env.RUN_ID }};
const issue_number = context.payload.pull_request.number;
const message = `:warning: **This PR does not produce bitwise identical kernels as the branch it's merged against.** Please check artifacts for details. [Download the output file here](https://github.com/${{ github.repository }}/actions/runs/${run_id}).`;
await github.rest.issues.createComment({
owner: context.repo.owner,
repo: context.repo.repo,
issue_number: issue_number,
body: message
});
8 changes: 2 additions & 6 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,5 @@ venv.bak/
.idea
cmake-build-*

# cache dumps
triton_cache*
log_*

#
python/triton/third_party/cuda/bin/ptxas
# Third-party binaries
ptxas
25 changes: 22 additions & 3 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ set(TRITON_CODEGEN_BACKENDS "" CACHE STRING "Enable different codegen backends")

# Force TRITON_USE_ROCM for ROCm support
set(TRITON_USE_ROCM ON)
set(ROCM_DEFAULT_DIR "/opt/rocm")
add_definitions( -DROCM_DEFAULT_DIR="${ROCM_DEFAULT_DIR}")

# Ensure Python3 vars are set correctly
# used conditionally in this file and by lit tests
Expand Down Expand Up @@ -200,6 +202,10 @@ include_directories(${LLVM_INCLUDE_DIRS})
include_directories(${PROJECT_SOURCE_DIR}/include)
include_directories(${PROJECT_BINARY_DIR}/include) # Tablegen'd files

set(ROCM_LIBRARIES
${CMAKE_CURRENT_SOURCE_DIR}/lib/rocm/libhsa-runtime64.so
)

# link_directories(${LLVM_LIBRARY_DIR})
add_subdirectory(include)
add_subdirectory(lib)
Expand All @@ -218,6 +224,7 @@ if(TRITON_BUILD_PYTHON_MODULE)
TritonAnalysis
TritonTransforms
TritonGPUTransforms
TritonNvidiaGPUTransforms
TritonLLVMIR
TritonPTX
TritonHSACO
Expand All @@ -238,9 +245,21 @@ if(TRITON_BUILD_PYTHON_MODULE)
MLIRIR
)

set(ROCM_LIBRARIES
${CMAKE_CURRENT_SOURCE_DIR}/lib/rocm/libhsa-runtime64.so
)
if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/python/triton/third_party/rocm/lib/libhsa-runtime64.so)
set(ROCM_LIBRARIES
${CMAKE_CURRENT_SOURCE_DIR}/python/triton/third_party/rocm/lib/libhsa-runtime64.so
)
elseif(EXISTS "$ENV{ROCM_PATH}/lib/libhsa-runtime64.so" )
set(ROCM_LIBRARIES
"$ENV{ROCM_PATH}/lib/libhsa-runtime64.so"
)
elseif(EXISTS "${ROCM_DEFAULT_DIR}/lib/libhsa-runtime64.so" )
set(ROCM_LIBRARIES
"${ROCM_DEFAULT_DIR}/lib/libhsa-runtime64.so"
)
else()
message(STATUS "WARNING: Can't find libhsa-runtime64.so")
endif()

if(WIN32)
target_link_libraries(triton PRIVATE ${ROCM_LIBRARIES} ${LLVM_LIBRARIES} ${CMAKE_DL_LIBS}
Expand Down
Loading

0 comments on commit 3e525c9

Please sign in to comment.