
Commit 3233785

ThomasRaoux authored and memfrob committed
[mlir][gpu] Move async copy ops to NVGPU and add caching hints
Move the async copy operations to NVGPU, since they exist only on the NV target and are designed to match PTX semantics. This also allows us to add finer-grained caching hint attributes to the op. Add a hint to bypass the L1 cache and hook it up to the NVVM op.

Differential Revision: https://reviews.llvm.org/D125244
1 parent 75231af commit 3233785
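
For orientation, below is a minimal sketch of what a copy using the relocated op and the new hint could look like after this change. The value names, shapes, and indices are placeholders; the `bypassL1` unit attribute is the caching hint added by this commit and, per the assembly format in the diff below, it would appear in the attribute dictionary.

```mlir
// Hypothetical example: copy 4 elements from a global memref into a shared
// memory memref (address space 3) and hint that the copy should bypass L1.
%token = nvgpu.device_async_copy %src[%c0], %dst[%c0], 4 {bypassL1} :
  memref<16xf32> to memref<16xf32, 3>
```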

File tree

19 files changed (+407, -349 lines)


mlir/include/mlir/Dialect/GPU/GPUBase.td

Lines changed: 0 additions & 7 deletions
@@ -60,13 +60,6 @@ def GPU_AsyncToken : DialectType<
     GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::AsyncTokenType>()">, "async token type">,
     BuildableType<"mlir::gpu::AsyncTokenType::get($_builder.getContext())">;
 
-/// Device-side synchronization token.
-def GPU_DeviceAsyncToken : DialectType<
-    GPU_Dialect, CPred<"$_self.isa<::mlir::gpu::DeviceAsyncTokenType>()">,
-    "device async token type">,
-    BuildableType<
-      "mlir::gpu::DeviceAsyncTokenType::get($_builder.getContext())">;
-
 // Predicat to check if type is gpu::MMAMatrixType.
 def IsMMAMatrixTypePred : CPred<"$_self.isa<::mlir::gpu::MMAMatrixType>()">;
 

mlir/include/mlir/Dialect/GPU/GPUDialect.h

Lines changed: 0 additions & 8 deletions
@@ -43,14 +43,6 @@ class AsyncTokenType
   using Base::Base;
 };
 
-/// Device-side token storage type. There is only one type of device-side token.
-class DeviceAsyncTokenType
-    : public Type::TypeBase<DeviceAsyncTokenType, Type, TypeStorage> {
-public:
-  // Used for generic hooks in TypeBase.
-  using Base::Base;
-};
-
 /// MMAMatrixType storage and uniquing. Array is uniqued based on its shape
 /// and type.
 struct MMAMatrixStorageType : public TypeStorage {

mlir/include/mlir/Dialect/GPU/GPUOps.td

Lines changed: 0 additions & 101 deletions
@@ -1280,105 +1280,4 @@ def GPU_SubgroupMmaElementwiseOp : GPU_Op<"subgroup_mma_elementwise",
   }];
 }
 
-def GPU_DeviceAsyncCopyOp : GPU_Op<"device_async_copy",
-                                   [AttrSizedOperandSegments]> {
-  let summary = "device-side asynchronous copy";
-  let description = [{
-    The `gpu.device_async_copy` op initiates an asynchronous copy operation of
-    `$size` elements from source to the destination without blocking the thread.
-    The destination has to be in shared memory.
-
-    This is memory access will be pending to be added to a group.
-
-    This op is meant to be used with `gpu.device_async_create_group` and
-    `gpu.device_async_wait` to synchronize copies as explained in those ops
-    descriptions.
-
-    In order to do a copy and wait for the result we need the following
-    combination:
-    ```
-    // copy 1.
-    %cp1 = gpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
-    // copy 2.
-    %cp2 = gpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
-    // group 1 contains copy 1 and copy 2.
-    %token1 = gpu.device_async_create_group %cp1, %cp2
-    // copy 3.
-    %cp3 = gpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
-    // group 2 contains copy 3.
-    %token2 = gpu.device_async_create_group %cp3
-    // after the wait copy 1 and copy 2 are complete.
-    gpu.device_async_wait %token1
-    // after the wait copy 3 is complete.
-    gpu.device_async_wait %token2
-    ```
-
-    Example:
-
-    ```mlir
-    %0 = gpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
-      memref<4x5xf32> to memref<2x7x5xf32, 3>
-    ```
-  }];
-  let results = (outs GPU_DeviceAsyncToken:$asyncToken);
-  let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
-                       Variadic<Index>:$dstIndices,
-                       Arg<AnyMemRef, "", [MemRead]>:$src,
-                       Variadic<Index>:$srcIndices,
-                       IndexAttr:$numElements);
-  let assemblyFormat = [{
-    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $numElements
-      attr-dict `:` type($src) `to` type($dst)
-  }];
-  let hasVerifier = 1;
-}
-
-def GPU_DeviceAsyncCreateGroupOp : GPU_Op<"device_async_create_group", []> {
-  let summary = "device side asynchronous create group operation";
-  let description = [{
-    The `gpu.device_async_create_group` op creates a group of memory accesses
-    containing all the pending `device_async_copy` operations associated with
-    argument tokens. Each token can only be part of one group.
-
-    It returns a token that can be use to wait until the group fully completes.
-
-    This is meant to be used with `gpu.device_async_wait` to synchronize copies
-    as explained in those ops descriptions.
-
-    Groups are executed in the order they are created.
-
-    Example:
-
-    ```mlir
-    %0 = gpu.device_async_create_group
-    ```
-  }];
-  let results = (outs GPU_DeviceAsyncToken:$asyncToken);
-  let arguments = (ins Variadic<GPU_DeviceAsyncToken>:$inputTokens);
-  let assemblyFormat = [{
-    $inputTokens attr-dict
-  }];
-}
-
-def GPU_DeviceAsyncWaitOp : GPU_Op<"device_async_wait", []> {
-  let summary = "Wait for async gpu ops to complete.";
-  let description = [{
-    The `gpu.device_async_wait` op will block the execution thread until the group
-    associated with the source token is fully completed.
-
-    The optional `$numGroup` attribute gives a lower bound of the number of
-    groups uncompleted when the wait can unblock the thread.
-    Example:
-
-    ```mlir
-    gpu.device_async_wait %0
-    ```
-  }];
-  let arguments = (ins GPU_DeviceAsyncToken:$asyncDependencies,
-                       OptionalAttr<I32Attr>:$numGroups);
-  let assemblyFormat = [{
-    $asyncDependencies attr-dict
-  }];
-}
-
 #endif // GPU_OPS

mlir/include/mlir/Dialect/LLVMIR/NVVMDialect.h

Lines changed: 9 additions & 0 deletions
@@ -25,6 +25,15 @@
 
 namespace mlir {
 namespace NVVM {
+
+/// NVVM memory space identifiers.
+enum NVVMMemorySpace {
+  /// Global memory space identifier.
+  kGlobalMemorySpace = 1,
+  /// Shared memory space identifier.
+  kSharedMemorySpace = 3
+};
+
 /// Return the element type and number of elements associated with a wmma matrix
 /// of given chracteristics. This matches the logic in IntrinsicsNVVM.td
 /// WMMA_REGS structure.
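
These numeric identifiers match the address-space annotations that the copy examples in this commit rely on. A small illustrative sketch (the function and value names are made up) of how the two spaces appear on memref types:

```mlir
// Illustrative only: address space 1 corresponds to kGlobalMemorySpace and
// address space 3 to kSharedMemorySpace in the enum above.
func.func @memory_spaces(%global: memref<16xf32, 1>, %shared: memref<16xf32, 3>) {
  return
}
```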

mlir/include/mlir/Dialect/NVGPU/NVGPU.td

Lines changed: 121 additions & 6 deletions
@@ -32,8 +32,17 @@ def NVGPU_Dialect : Dialect {
     representing PTX specific operations while using MLIR high level concepts
     like memref and 2-D vector.
   }];
+  let useDefaultAttributePrinterParser = 1;
 }
 
+/// Device-side synchronization token.
+def NVGPU_DeviceAsyncToken : DialectType<
+    NVGPU_Dialect, CPred<"$_self.isa<::mlir::nvgpu::DeviceAsyncTokenType>()">,
+    "device async token type">,
+    BuildableType<
+      "mlir::nvgpu::DeviceAsyncTokenType::get($_builder.getContext())">;
+
+
 //===----------------------------------------------------------------------===//
 // NVGPU Op definitions
 //===----------------------------------------------------------------------===//
@@ -73,24 +82,24 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {
   let description = [{
     The `nvgpu.mma.sync` op represents the distributed form of a collective
     matrix-multiply-and-accumulate (mma) operation that is compatible with
-    `nvvm.mma.sync`. The operands and results are fragments of the full matrix 
+    `nvvm.mma.sync`. The operands and results are fragments of the full matrix
     operands. The full shape of the distributed mma operation is given by the
-    `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.  
+    `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.
 
     This operation is meant to be lowered to the `nvvm.mma.sync` instruction, and
     is an intermediate point between lowering from `vector.contract` to
     `nvvm.mma.sync`.
-    
+
     This operation is meant to follow the semantic of described here:
     https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
-    
+
     Example:
-    
+
     ```mlir
     nvgpu.mma.sync (%a, %b, %c) :
       (vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
     ```
-  }];  
+  }];
   let arguments = (ins AnyVector:$matrixA, AnyVector:$matrixB,
                        AnyVector:$matrixC, I64ArrayAttr:$mmaShape);
 
@@ -102,4 +111,110 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {
   }];
 }
 
+
+def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",
+                                       [AttrSizedOperandSegments]> {
+  let summary = "device-side asynchronous copy";
+  let description = [{
+    The `gpu.device_async_copy` op initiates an asynchronous copy operation of
+    `$size` elements from source to the destination without blocking the thread.
+    The destination has to be in shared memory.
+
+    This is memory access will be pending to be added to a group.
+
+    This op is meant to be used with `gpu.device_async_create_group` and
+    `gpu.device_async_wait` to synchronize copies as explained in those ops
+    descriptions.
+    `bypassL1` attribute is hint to the backend and hardware that
+    the copy should by pass the L1 cache, this may be dropped by the backend or
+    hardware.
+
+    In order to do a copy and wait for the result we need the following
+    combination:
+    ```
+    // copy 1.
+    %cp1 = gpu.device_async_copy %A[%c0], %B[%c0], 4 :memref<16xf32> to memref<16xf32, 3>
+    // copy 2.
+    %cp2 = gpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    // group 1 contains copy 1 and copy 2.
+    %token1 = gpu.device_async_create_group %cp1, %cp2
+    // copy 3.
+    %cp3 = gpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
+    // group 2 contains copy 3.
+    %token2 = gpu.device_async_create_group %cp3
+    // after the wait copy 1 and copy 2 are complete.
+    gpu.device_async_wait %token1
+    // after the wait copy 3 is complete.
+    gpu.device_async_wait %token2
+    ```
+
+    Example:
+
+    ```mlir
+    %0 = gpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
+      memref<4x5xf32> to memref<2x7x5xf32, 3>
+    ```
+  }];
+  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+  let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
+                       Variadic<Index>:$dstIndices,
+                       Arg<AnyMemRef, "", [MemRead]>:$src,
+                       Variadic<Index>:$srcIndices,
+                       IndexAttr:$numElements,
+                       OptionalAttr<UnitAttr>:$bypassL1);
+  let assemblyFormat = [{
+    $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $numElements
+      attr-dict `:` type($src) `to` type($dst)
+  }];
+  let hasVerifier = 1;
+}
+
+def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
+  let summary = "device side asynchronous create group operation";
+  let description = [{
+    The `gpu.device_async_create_group` op creates a group of memory accesses
+    containing all the pending `device_async_copy` operations associated with
+    argument tokens. Each token can only be part of one group.
+
+    It returns a token that can be use to wait until the group fully completes.
+
+    This is meant to be used with `gpu.device_async_wait` to synchronize copies
+    as explained in those ops descriptions.
+
+    Groups are executed in the order they are created.
+
+    Example:
+
+    ```mlir
+    %0 = gpu.device_async_create_group
+    ```
+  }];
+  let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
+  let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
+  let assemblyFormat = [{
+    $inputTokens attr-dict
+  }];
+}
+
+def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
+  let summary = "Wait for async gpu ops to complete.";
+  let description = [{
+    The `gpu.device_async_wait` op will block the execution thread until the group
+    associated with the source token is fully completed.
+
+    The optional `$numGroup` attribute gives a lower bound of the number of
+    groups uncompleted when the wait can unblock the thread.
+    Example:
+
+    ```mlir
+    gpu.device_async_wait %0
+    ```
+  }];
+  let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
+                       OptionalAttr<I32Attr>:$numGroups);
+  let assemblyFormat = [{
+    $asyncDependencies attr-dict
+  }];
+}
+
 #endif // NVGPU
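
Since the ops now live in the NVGPU dialect, the copy/group/wait pattern from the descriptions above would be spelled with the `nvgpu.` prefix. A hedged sketch, assuming the assembly formats defined in this file (memrefs, indices, and sizes are placeholders):

```mlir
// Two async copies into shared memory (address space 3); the second one also
// carries the new bypassL1 caching hint.
%cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
%cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 {bypassL1} : memref<16xf32> to memref<16xf32, 3>
// Group the two pending copies, then block until the whole group completes.
%token = nvgpu.device_async_create_group %cp1, %cp2
nvgpu.device_async_wait %token
```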

mlir/include/mlir/Dialect/NVGPU/NVGPUDialect.h

Lines changed: 14 additions & 0 deletions
@@ -18,6 +18,20 @@
 #include "mlir/IR/OpDefinition.h"
 #include "mlir/Interfaces/SideEffectInterfaces.h"
 
+namespace mlir {
+namespace nvgpu {
+
+/// Device-side token storage type. There is only one type of device-side token.
+class DeviceAsyncTokenType
+    : public Type::TypeBase<DeviceAsyncTokenType, Type, TypeStorage> {
+public:
+  // Used for generic hooks in TypeBase.
+  using Base::Base;
+};
+
+} // namespace nvgpu
+} // namespace mlir
+
 #include "mlir/Dialect/NVGPU/NVGPUDialect.h.inc"
 
 #define GET_OP_CLASSES
