@@ -32,8 +32,17 @@ def NVGPU_Dialect : Dialect {
32
32
representing PTX specific operations while using MLIR high level concepts
33
33
like memref and 2-D vector.
34
34
}];
35
+ let useDefaultAttributePrinterParser = 1;
35
36
}
36
37
38
+ /// Device-side synchronization token.
39
+ def NVGPU_DeviceAsyncToken : DialectType<
40
+ NVGPU_Dialect, CPred<"$_self.isa<::mlir::nvgpu::DeviceAsyncTokenType>()">,
41
+ "device async token type">,
42
+ BuildableType<
43
+ "mlir::nvgpu::DeviceAsyncTokenType::get($_builder.getContext())">;
44
+
45
+
37
46
//===----------------------------------------------------------------------===//
38
47
// NVGPU Op definitions
39
48
//===----------------------------------------------------------------------===//
@@ -73,24 +82,24 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {
73
82
let description = [{
74
83
The `nvgpu.mma.sync` op represents the distributed form of a collective
75
84
matrix-multiply-and-accumulate (mma) operation that is compatible with
76
- `nvvm.mma.sync`. The operands and results are fragments of the full matrix
85
+ `nvvm.mma.sync`. The operands and results are fragments of the full matrix
77
86
operands. The full shape of the distributed mma operation is given by the
78
- `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.
87
+ `mmaShape` attribute in the form of a list of dimensions `[m, n, k]`.
79
88
80
89
This operation is meant to be lowered to the `nvvm.mma.sync` instruction, and
81
90
is an intermediate point between lowering from `vector.contract` to
82
91
`nvvm.mma.sync`.
83
-
92
+
84
93
This operation is meant to follow the semantics described here:
85
94
https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#warp-level-matrix-instructions-mma
86
-
95
+
87
96
Example:
88
-
97
+
89
98
```mlir
90
99
nvgpu.mma.sync (%a, %b, %c) :
91
100
(vector<4x2xf16>, vector<2x2xf16>, vector<2x2xf16>) -> vector<2x2xf16>
92
101
```
93
- }];
102
+ }];
94
103
let arguments = (ins AnyVector:$matrixA, AnyVector:$matrixB,
95
104
AnyVector:$matrixC, I64ArrayAttr:$mmaShape);
96
105
@@ -102,4 +111,110 @@ def NVGPU_MmaSyncOp : NVGPU_Op<"mma.sync", [NoSideEffect]> {
102
111
}];
103
112
}
104
113
114
+
115
+ def NVGPU_DeviceAsyncCopyOp : NVGPU_Op<"device_async_copy",
116
+ [AttrSizedOperandSegments]> {
117
+ let summary = "device-side asynchronous copy";
118
+ let description = [{
119
+ The `nvgpu.device_async_copy` op initiates an asynchronous copy operation of
120
+ `$numElements` elements from source to the destination without blocking the thread.
121
+ The destination has to be in shared memory.
122
+
123
+ This memory access will be pending, waiting to be added to a group.
124
+
125
+ This op is meant to be used with `nvgpu.device_async_create_group` and
126
+ `nvgpu.device_async_wait` to synchronize copies as explained in those ops'
127
+ descriptions.
128
+ The `bypassL1` attribute is a hint to the backend and hardware that
129
+ the copy should bypass the L1 cache; this hint may be dropped by the backend or
130
+ hardware.
131
+
132
+ In order to do a copy and wait for the result we need the following
133
+ combination:
134
+ ```
135
+ // copy 1.
136
+ %cp1 = nvgpu.device_async_copy %A[%c0], %B[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
137
+ // copy 2.
138
+ %cp2 = nvgpu.device_async_copy %C[%c0], %D[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
139
+ // group 1 contains copy 1 and copy 2.
140
+ %token1 = nvgpu.device_async_create_group %cp1, %cp2
141
+ // copy 3.
142
+ %cp3 = nvgpu.device_async_copy %E[%c0], %F[%c0], 4 : memref<16xf32> to memref<16xf32, 3>
143
+ // group 2 contains copy 3.
144
+ %token2 = nvgpu.device_async_create_group %cp3
145
+ // after the wait copy 1 and copy 2 are complete.
146
+ nvgpu.device_async_wait %token1
147
+ // after the wait copy 3 is complete.
148
+ nvgpu.device_async_wait %token2
149
+ ```
150
+
151
+ Example:
152
+
153
+ ```mlir
154
+ %0 = nvgpu.device_async_copy %src[%c0, %c0], %dst[%c0, %c0, %c0], 4 :
155
+ memref<4x5xf32> to memref<2x7x5xf32, 3>
156
+ ```
157
+ }];
158
+ let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
159
+ let arguments = (ins Arg<AnyMemRef, "", [MemWrite]>:$dst,
160
+ Variadic<Index>:$dstIndices,
161
+ Arg<AnyMemRef, "", [MemRead]>:$src,
162
+ Variadic<Index>:$srcIndices,
163
+ IndexAttr:$numElements,
164
+ OptionalAttr<UnitAttr>:$bypassL1);
165
+ let assemblyFormat = [{
166
+ $src `[` $srcIndices `]` `,` $dst `[` $dstIndices `]` `,` $numElements
167
+ attr-dict `:` type($src) `to` type($dst)
168
+ }];
169
+ let hasVerifier = 1;
170
+ }
171
+
172
+ def NVGPU_DeviceAsyncCreateGroupOp : NVGPU_Op<"device_async_create_group", []> {
173
+ let summary = "device side asynchronous create group operation";
174
+ let description = [{
175
+ The `nvgpu.device_async_create_group` op creates a group of memory accesses
176
+ containing all the pending `device_async_copy` operations associated with
177
+ argument tokens. Each token can only be part of one group.
178
+
179
+ It returns a token that can be used to wait until the group fully completes.
180
+
181
+ This is meant to be used with `nvgpu.device_async_wait` to synchronize copies
182
+ as explained in those ops descriptions.
183
+
184
+ Groups are executed in the order they are created.
185
+
186
+ Example:
187
+
188
+ ```mlir
189
+ %0 = nvgpu.device_async_create_group
190
+ ```
191
+ }];
192
+ let results = (outs NVGPU_DeviceAsyncToken:$asyncToken);
193
+ let arguments = (ins Variadic<NVGPU_DeviceAsyncToken>:$inputTokens);
194
+ let assemblyFormat = [{
195
+ $inputTokens attr-dict
196
+ }];
197
+ }
198
+
199
+ def NVGPU_DeviceAsyncWaitOp : NVGPU_Op<"device_async_wait", []> {
200
+ let summary = "Wait for async gpu ops to complete.";
201
+ let description = [{
202
+ The `nvgpu.device_async_wait` op will block the execution thread until the group
203
+ associated with the source token is fully completed.
204
+
205
+ The optional `$numGroups` attribute gives an upper bound of the number of
206
+ groups uncompleted when the wait can unblock the thread.
207
+ Example:
208
+
209
+ ```mlir
210
+ nvgpu.device_async_wait %0
211
+ ```
212
+ }];
213
+ let arguments = (ins NVGPU_DeviceAsyncToken:$asyncDependencies,
214
+ OptionalAttr<I32Attr>:$numGroups);
215
+ let assemblyFormat = [{
216
+ $asyncDependencies attr-dict
217
+ }];
218
+ }
219
+
105
220
#endif // NVGPU
0 commit comments