Merge pull request #382 from ROCmSoftwarePlatform/ifu231005-rebase
Ifu231005
jayfurmanek authored Nov 7, 2023
2 parents c65f1e6 + 85216ea commit 3c1fe61
Showing 163 changed files with 5,949 additions and 4,459 deletions.
1 change: 1 addition & 0 deletions .github/workflows/documentation.yml
@@ -25,6 +25,7 @@ jobs:
pip3 install tabulate
pip3 install cmake
pip3 install sphinx
+ pip3 install myst_parser
#- name: Fetch dependent branches
# run: |
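The added `myst_parser` dependency lets Sphinx render Markdown sources alongside reStructuredText. A minimal sketch of reproducing the docs build locally, assuming a standard Sphinx layout under `docs/` (the source and output paths are assumptions, not taken from the workflow):

```shell
# Same documentation dependencies the workflow installs.
pip3 install tabulate cmake sphinx myst_parser

# Build HTML docs; adjust paths to the repository's actual Sphinx layout.
sphinx-build -b html docs docs/_build/html
```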
83 changes: 69 additions & 14 deletions .github/workflows/integration-tests.yml
@@ -33,6 +33,7 @@ jobs:
echo '::set-output name=matrix-optional::["ubuntu-latest"]'
fi
Integration-Tests-Nvidia:
needs: Runner-Preparation

@@ -44,14 +45,14 @@ jobs:

steps:
- name: Checkout
- uses: actions/checkout@v2
+ uses: actions/checkout@v3
with:
submodules: 'true'
- name: Set CUDA ENV
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'V100' || matrix.runner[1] == 'A100' || matrix.runner[1] == 'H100')}}
run: |
echo "BACKEND=CUDA" >> "${GITHUB_ENV}"
echo "ENABLE_TMA=0" >> "${GITHUB_ENV}"
echo "ENABLE_MMA_V3=0" >> "${GITHUB_ENV}"
echo "TRITON_DISABLE_LINE_INFO=1" >> "${GITHUB_ENV}"
- name: Clear cache
@@ -88,24 +89,26 @@ jobs:
fi
lit -v "${LIT_TEST_DIR}"
- - name: Enable MMAV3 and TMA
+ - name: Enable TMA
if: ${{(matrix.runner[0] == 'self-hosted') && (matrix.runner[1] == 'H100')}}
run: |
echo "ENABLE_TMA=1" >> "${GITHUB_ENV}"
echo "ENABLE_MMA_V3=1" >> "${GITHUB_ENV}"
- - name: Run python tests on CUDA with ENABLE_TMA=1 and ENABLE_MMA_V3=1
- if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1' && env.ENABLE_MMA_V3 == '1'}}
+ - name: Run python tests on CUDA with ENABLE_TMA=1
+ if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1'}}
run: |
cd python/test/unit
- python3 -m pytest -n 8 --ignore=runtime --ignore=operators --ignore=language/test_line_info.py
+ python3 -m pytest -n 8 --ignore=runtime --ignore=operators --ignore=language/test_line_info.py --ignore=language/test_subprocess.py
+ python3 -m pytest -n 8 language/test_subprocess.py
# run runtime tests serially to avoid race condition with cache handling.
python3 -m pytest runtime/
# run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
TRITON_DISABLE_LINE_INFO=0 python3 -m pytest language/test_line_info.py
# run hopper/test_flashattention.py separately to avoid running out of GPU memory
python3 -m pytest hopper/test_flashattention.py
- - name: Run python tests on CUDA with ENABLE_TMA=0 and ENABLE_MMA_V3=0
- if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0' && env.ENABLE_MMA_V3 == '0'}}
+ - name: Run python tests on CUDA with ENABLE_TMA=0
+ if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0'}}
run: |
cd python/test/unit
python3 -m pytest -n 8 --ignore=runtime --ignore=hopper --ignore=operators --ignore=language/test_line_info.py
@@ -118,14 +121,22 @@ jobs:
run: |
rm -rf ~/.triton
- - name: Run partial tests on CUDA with ENABLE_TMA=1 and ENABLE_MMA_V3=1
- if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1' && env.ENABLE_MMA_V3 == '1'}}
+ - name: Run interpreter tests
+ env:
+ # TRITON_INTERPRET: "1"
+ CUDA_VISIBLE_DEVICES: ""
+ run: |
+ cd python/test/unit
+ python3 -m pytest -vs operators/test_flash_attention.py
+ - name: Run partial tests on CUDA with ENABLE_TMA=1
+ if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '1'}}
run: |
cd python/test/unit
python3 -m pytest -n 8 operators
- - name: Run partial tests on CUDA with ENABLE_TMA=0 and ENABLE_MMA_V3=0
- if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0' && env.ENABLE_MMA_V3 == '0'}}
+ - name: Run partial tests on CUDA with ENABLE_TMA=0
+ if: ${{ env.BACKEND == 'CUDA' && env.ENABLE_TMA == '0'}}
run: |
cd python/test/unit
python3 -m pytest -n 8 operators
@@ -160,6 +171,50 @@ jobs:
python3 -m pytest -vs . --reruns 10
sudo nvidia-smi -i 0 -rgc
Integration-Tests-Shared-Middle-Layer:

runs-on: ubuntu-latest

steps:
- name: Checkout
uses: actions/checkout@v2

- name: Clear cache
run: |
rm -rf ~/.triton
- name: Update PATH
run: |
echo "PATH=${HOME}/.local/bin:${PATH}" >> "${GITHUB_ENV}"
- name: Check pre-commit
run: |
python3 -m pip install --upgrade pre-commit
python3 -m pre_commit run --all-files --verbose
- name: Install Triton
run: |
export TRITON_CODEGEN_TRITON_SHARED=1
git submodule update --init --recursive
cd python
python3 -m pip install --upgrade pip
python3 -m pip install cmake==3.24
python3 -m pip install ninja
python3 -m pip uninstall -y triton
python3 setup.py build
python3 -m pip install --no-build-isolation -vvv '.[tests]'
- name: Run shared middle-layer lit tests
run: |
python3 -m pip install lit
cd python
LIT_TEST_DIR="build/$(ls build | grep -i cmake)/third_party/triton_shared/test"
if [ ! -d "${LIT_TEST_DIR}" ]; then
echo "Coult not find '${LIT_TEST_DIR}'" ; exit -1
fi
lit -v "${LIT_TEST_DIR}"
Integration-Tests-Third-Party:
needs: Runner-Preparation
if: false
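The interpreter-tests step added above hides all GPUs by setting an empty `CUDA_VISIBLE_DEVICES`, so the flash-attention test exercises the CPU path; the `TRITON_INTERPRET: "1"` switch stays commented out for now. A rough local equivalent, assuming Triton is already built and installed:

```shell
cd python/test/unit
# Mask all GPUs so the test cannot silently run on a CUDA device.
# Setting TRITON_INTERPRET=1 as well would mirror the commented-out env line.
CUDA_VISIBLE_DEVICES="" python3 -m pytest -vs operators/test_flash_attention.py
```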
4 changes: 2 additions & 2 deletions .github/workflows/wheels.yml
@@ -46,8 +46,8 @@ jobs:
export CIBW_MANYLINUX_X86_64_IMAGE="quay.io/pypa/manylinux2014_x86_64:latest"
#export CIBW_MANYLINUX_PYPY_X86_64_IMAGE="quay.io/pypa/manylinux2014_x86_64:latest"
export CIBW_BEFORE_BUILD="pip install cmake;"
- export CIBW_SKIP="{cp,pp}{35,36}-*"
- export CIBW_BUILD="{cp,pp}3*-manylinux_x86_64"
+ export CIBW_SKIP="cp{35,36}-*"
+ export CIBW_BUILD="cp3*-manylinux_x86_64"
python3 -m cibuildwheel python --output-dir wheelhouse
- name: Install Azure CLI
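The tightened CIBW selectors above drop PyPy (`pp`) wheels from the matrix, leaving CPython-only manylinux builds. A sketch of running the same selection locally, assuming `cibuildwheel` and Docker are available:

```shell
export CIBW_MANYLINUX_X86_64_IMAGE="quay.io/pypa/manylinux2014_x86_64:latest"
export CIBW_BEFORE_BUILD="pip install cmake;"
# CPython only: skip 3.5/3.6 and build every other cp3* manylinux wheel.
export CIBW_SKIP="cp{35,36}-*"
export CIBW_BUILD="cp3*-manylinux_x86_64"
python3 -m cibuildwheel python --output-dir wheelhouse
```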
19 changes: 16 additions & 3 deletions .gitignore
@@ -26,8 +26,21 @@ venv.bak/
cmake-build-*

# Third-party binaries
+ cuobjdump
+ nvdisasm
+ ptxas

# HIP
log*
- python/triton/third_party/cuda/bin/ptxas
# Docs
docs/_build/
docs/python-api/generated/
docs/dialects/
docs/getting-started/tutorials
!python/tutorials/*.py
!python/tutorials/*.rst

# clangd index. (".clangd" is a config file now, thus trailing slash)
.clangd/
.cache
/compile_commands.json
.vscode
.vs
3 changes: 3 additions & 0 deletions .gitmodules
@@ -5,3 +5,6 @@
path = third_party/amd_hip_backend
url = https://github.com/ROCmSoftwarePlatform/triton
branch = third_party_backend_2
+ [submodule "third_party/triton_shared"]
+ path = third_party/triton_shared
+ url = https://github.com/microsoft/triton-shared
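With the `triton_shared` submodule registered, a fresh clone needs it fetched before the shared middle layer can build. A sketch mirroring the CI job above:

```shell
# Fetch the new microsoft/triton-shared submodule along with the existing ones.
git submodule update --init --recursive

# Opt in to the shared middle-layer backend, then build Triton.
export TRITON_CODEGEN_TRITON_SHARED=1
pip install -e python
```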
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -44,7 +44,7 @@ repos:
^docs/conf.py$
)
- repo: https://github.com/pre-commit/mirrors-clang-format
- rev: v14.0.6
+ rev: v16.0.6
hooks:
- id: clang-format
stages: [commit, push, manual]
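Since CI formats with the pinned clang-format mirror, contributors should refresh their cached hooks after the version bump so local results match. A minimal sketch:

```shell
python3 -m pip install --upgrade pre-commit
# Drop cached hook environments so the new v16.0.6 mirror is fetched.
pre-commit clean
pre-commit run --all-files
```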
85 changes: 54 additions & 31 deletions README.md
@@ -10,32 +10,6 @@ We're hiring! If you are interested in working on Triton at OpenAI, we have role
------------------- |
[![Documentation](https://github.com/openai/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/)

# Triton Developer Conference Registration Open
The Triton Developer Conference will be held in a hybrid mode at the Microsoft Silicon Valley Campus in Mountain View, California. The conference will be held on September 20th from 10am to 4pm, followed by a reception till 5:30 pm. Please use the link below to register to attend either in-person or virtually online.

Registration Link for Triton Developer Conference is [here](https://forms.office.com/r/m4jQXShDts)

Tentative Agenda for the conference (subject to change):

|Time |Title |Speaker
|--------|-------|-------|
|10:00 AM|Welcome|Kevin Scott (Microsoft)|
|10:20 AM|The Triton Compiler: Past, Present and Future|Phil Tillet (OpenAI)|
|11:00 AM|**Break**||
|11:20 AM|Hopper support in Triton|Gustav Zhu (Nvidia)|
|11:40 AM|Bringing Triton to AMD GPUs|Jason Furmanek, Lixun Zhang (AMD)|
|12:00 PM|Intel XPU Backend for Triton|Eikan Wang (Intel)|
|12:20 PM|Vectorization of Triton Kernels for Qualcomm Hexagon Backend|Javed Absar (Qualcomm)|
|12:30 PM|**Lunch**||
|1:40 PM |Triton for MTIA|Roman Levenstein et al, (Meta)|
|2:00 PM |Using Triton IR for high-performance fusions in XLA|George Karpenkov (Google)|
|2:20 PM |Triton for All: Triton as a device-independent language|Ian Bearman (Microsoft)|
|2:40 PM|**Break**||
|3:00 PM|PyTorch 2.0 and TorchInductor|Jason Ansel, Horace He (Meta)|
|3:20 PM|Pallas: A JAX Kernel Language|Sharad Vikram (Google)|
|3:40 PM|Writing Grouped GEMMs in Triton|Vinod Grover (Nvidia)|
|4:00 PM|**Reception**||


# Triton

@@ -86,9 +60,23 @@ lit -v test

```
git clone https://github.com/openai/triton.git;
- cd triton/python;
- pip install ninja cmake; # build-time dependencies
- pip install -e .
+ cd triton;
+ pip install ninja cmake wheel; # build-time dependencies
+ pip install -e python
```

Or with a virtualenv:

```
git clone https://github.com/openai/triton.git;
cd triton;
python -m venv .venv --prompt triton;
source .venv/bin/activate;
pip install ninja cmake wheel; # build-time dependencies
pip install -e python
```

# Building with a custom LLVM
@@ -125,11 +113,46 @@ arbitrary LLVM version.
# Modify as appropriate to point to your LLVM build.
$ export LLVM_BUILD_DIR=$HOME/llvm-project/build

- $ cd <triton install>/python
+ $ cd <triton install>
$ LLVM_INCLUDE_DIRS=$LLVM_BUILD_DIR/include \
LLVM_LIBRARY_DIR=$LLVM_BUILD_DIR/lib \
LLVM_SYSPATH=$LLVM_BUILD_DIR \
- pip install -e .
+ pip install -e python

# Tips for building

- Set `TRITON_BUILD_WITH_CLANG_LLD=true` as an environment variable to use clang
and lld. lld in particular results in faster builds.

- Set `TRITON_BUILD_WITH_CCACHE=true` to build with ccache.

- Pass `--no-build-isolation` to `pip install` to make no-op builds faster.
  Without this, every invocation of `pip install` uses a different symlink to
  cmake, which forces ninja to rebuild most of the `.a` files (see the combined
  sketch after this list).
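Combining the three tips, a typical fast developer build might look like this (a sketch; the variables are exactly the ones documented above):

```shell
# clang+lld for faster link times, ccache for rebuilds, and no build
# isolation so repeated `pip install` runs don't force ninja rebuilds.
TRITON_BUILD_WITH_CLANG_LLD=true \
TRITON_BUILD_WITH_CCACHE=true \
pip install -e python --no-build-isolation
```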

# Running tests

There currently isn't a turnkey way to run all the Triton tests, but you can
follow the recipe below.

```shell
# One-time setup. Note we have to reinstall local Triton because torch
# overwrites it with the public version.
$ pip install scipy numpy torch pytest lit && pip install -e python

# Run Python tests using your local GPU.
$ python3 -m pytest python/test/unit

# Move to builddir. Fill in <...> with the full path, e.g.
# `cmake.linux-x86_64-cpython-3.11`.
$ cd python/build/cmake<...>

# Run C++ unit tests.
$ ninja test

# Run lit tests.
$ lit test
```

# Changelog

19 changes: 18 additions & 1 deletion bin/CMakeLists.txt
@@ -53,7 +53,7 @@ llvm_update_compile_flags(triton-translate)
TritonNvidiaGPUTransforms
TritonLLVMIR
TritonPTX
TritonHSACO
${dialect_libs}
${conversion_libs}
# tests
@@ -80,3 +80,20 @@ llvm_update_compile_flags(triton-translate)
MLIRROCDLToLLVMIRTranslation
)
mlir_check_all_link_libraries(triton-translate)

add_llvm_executable(triton-llvm-opt
triton-llvm-opt.cpp

DEPENDS
intrinsics_gen
SUPPORT_PLUGINS
)
target_link_libraries(triton-llvm-opt PRIVATE
TritonLLVMIR

LLVMCore
LLVMSupport
LLVMOption
LLVMCodeGen
)
export_executable_symbols_for_plugins(triton-llvm-opt)
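`triton-llvm-opt` is registered with `add_llvm_executable`, so it gets its own build target. A sketch of building and smoke-testing it from the CMake build tree (the build-directory name and the binary's location within it vary by platform, so treat both paths as assumptions):

```shell
cd python/build/cmake.linux-x86_64-cpython-3.11  # example; use your actual build dir
ninja triton-llvm-opt
# LLVM-style tools describe their registered options via --help.
./bin/triton-llvm-opt --help
```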
5 changes: 4 additions & 1 deletion bin/RegisterTritonDialects.h
@@ -1,4 +1,5 @@
#pragma once
#include "triton/Dialect/NVGPU/IR/Dialect.h"
#include "triton/Dialect/Triton/IR/Dialect.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
@@ -11,6 +12,7 @@
#include "triton/Conversion/TritonGPUToLLVM/Passes.h"
#include "triton/Conversion/TritonToTritonGPU/Passes.h"

#include "mlir/Dialect/LLVMIR/NVVMDialect.h"
#include "mlir/InitAllPasses.h"

namespace mlir {
@@ -40,5 +42,6 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect,
mlir::triton::gpu::TritonGPUDialect, mlir::math::MathDialect,
mlir::arith::ArithDialect, mlir::scf::SCFDialect,
- mlir::gpu::GPUDialect>();
+ mlir::gpu::GPUDialect, mlir::LLVM::LLVMDialect,
+ mlir::NVVM::NVVMDialect, mlir::triton::nvgpu::NVGPUDialect>();
}