[mlir][gpu] Allow gpu.launch_func to be async.

This is a roll-forward of rGec7780ebdab4, now that the remaining
gpu.launch_func ops have been converted to custom form in rGb22f111023ba.

Reviewed By: antiagainst

Differential Revision: https://reviews.llvm.org/D90420
This commit is contained in:
Christian Sigg
2020-10-22 07:49:50 +02:00
parent 20b386aae0
commit 3556114083
4 changed files with 45 additions and 17 deletions

View File

@@ -291,12 +291,14 @@ def GPU_GPUFuncOp : GPU_Op<"func", [HasParent<"GPUModuleOp">,
let parser = [{ return parseGPUFuncOp(parser, result); }];
}
def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
Arguments<(ins SymbolRefAttr:$kernel,
def GPU_LaunchFuncOp : GPU_Op<"launch_func",
[GPU_AsyncOpInterface, AttrSizedOperandSegments]>,
Arguments<(ins Variadic<GPU_AsyncToken>:$asyncDependencies,
SymbolRefAttr:$kernel,
Index:$gridSizeX, Index:$gridSizeY, Index:$gridSizeZ,
Index:$blockSizeX, Index:$blockSizeY, Index:$blockSizeZ,
Variadic<AnyType>:$operands)>,
Results<(outs)> {
Results<(outs Optional<GPU_AsyncToken>:$asyncToken)> {
let summary = "Launches a function as a GPU kernel";
let description = [{
@@ -308,14 +310,22 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
function is required to be a gpu.module. And finally, the module containing
the kernel module (which thus cannot be the top-level module) is required
to have the `gpu.container_module` attribute. The `gpu.launch_func`
operation has a symbol attribute named `kernel` to identify the fully
operation has a symbol attribute named `kernel` to identify the fully
specified kernel function to launch (both the gpu.module and func).
The operation takes at least six operands, with the first three operands
being grid sizes along x,y,z dimensions and the following three being block
sizes along x,y,z dimensions. When a lower-dimensional kernel is required,
unused sizes must be explicitly set to `1`. The remaining operands are
passed as arguments to the kernel function.
The `gpu.launch_func` supports async dependencies: the kernel does not start
executing until the ops producing those async dependencies have completed.
By default, the host implicitly blocks until kernel execution has
completed. If the `async` keyword is present, the host does not block but
instead a `!gpu.async.token` is returned. Other async GPU ops can take this
token as a dependency.
The operation requires at least the grid and block sizes along the x,y,z
dimensions as arguments. When a lower-dimensional kernel is required,
unused sizes must be explicitly set to `1`.
The remaining operands are passed as arguments to the kernel function.
Example:
@@ -351,11 +361,15 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
}
}
%t0 = gpu.wait async
gpu.launch_func
@kernels::@kernel_1 // Kernel function.
blocks in (%cst, %cst, %cst) // Grid size.
threads in (%cst, %cst, %cst) // Block size.
args(%arg0 : f32, %arg1 : memref<?xf32, 1>) // Kernel arguments.
async // (Optional) Don't block host, return token.
[%t0] // (Optional) Execute only after %t0 has completed.
@kernels::@kernel_1 // Kernel function.
blocks in (%cst, %cst, %cst) // Grid size.
threads in (%cst, %cst, %cst) // Block size.
args(%arg0 : f32, // (Optional) Kernel arguments.
%arg1 : memref<?xf32, 1>)
}
```
}];
@@ -401,6 +415,7 @@ def GPU_LaunchFuncOp : GPU_Op<"launch_func">,
let verifier = [{ return ::verify(*this); }];
let assemblyFormat = [{
custom<AsyncDependencies>(type($asyncToken), $asyncDependencies)
$kernel
`blocks` `in` ` ` `(`$gridSizeX`,` $gridSizeY`,` $gridSizeZ`)`
`threads` `in` ` ` `(`$blockSizeX`,` $blockSizeY`,` $blockSizeZ`)`

View File

@@ -438,10 +438,15 @@ void LaunchFuncOp::build(OpBuilder &builder, OperationState &result,
auto kernelSymbol = builder.getSymbolRefAttr(
kernelModule.getName(), {builder.getSymbolRefAttr(kernelFunc.getName())});
result.addAttribute(getKernelAttrName(), kernelSymbol);
SmallVector<int32_t, 8> segmentSizes(8, 1);
segmentSizes.front() = 0; // Initially no async dependencies.
segmentSizes.back() = static_cast<int32_t>(kernelOperands.size());
result.addAttribute(getOperandSegmentSizeAttr(),
builder.getI32VectorAttr(segmentSizes));
}
unsigned LaunchFuncOp::getNumKernelOperands() {
return getNumOperands() - kNumConfigOperands;
return getNumOperands() - asyncDependencies().size() - kNumConfigOperands;
}
StringRef LaunchFuncOp::getKernelModuleName() {
@@ -451,15 +456,17 @@ StringRef LaunchFuncOp::getKernelModuleName() {
StringRef LaunchFuncOp::getKernelName() { return kernel().getLeafReference(); }
Value LaunchFuncOp::getKernelOperand(unsigned i) {
return getOperation()->getOperand(i + kNumConfigOperands);
return getOperand(asyncDependencies().size() + kNumConfigOperands + i);
}
KernelDim3 LaunchFuncOp::getGridSizeOperandValues() {
return KernelDim3{getOperand(0), getOperand(1), getOperand(2)};
auto operands = getOperands().drop_front(asyncDependencies().size());
return KernelDim3{operands[0], operands[1], operands[2]};
}
KernelDim3 LaunchFuncOp::getBlockSizeOperandValues() {
return KernelDim3{getOperand(3), getOperand(4), getOperand(5)};
auto operands = getOperands().drop_front(asyncDependencies().size());
return KernelDim3{operands[3], operands[4], operands[5]};
}
static LogicalResult verify(LaunchFuncOp op) {

View File

@@ -37,6 +37,7 @@ func @launch_requires_gpu_return(%sz : index) {
func @launch_func_too_few_operands(%sz : index) {
// expected-error@+1 {{expected 6 or more operands}}
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz)
{operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 0, 0]> : vector<8xi32>}
: (index, index, index, index, index) -> ()
return
}
@@ -55,6 +56,7 @@ module attributes {gpu.container_module} {
func @launch_func_missing_callee_attribute(%sz : index) {
// expected-error@+1 {{'gpu.launch_func' op requires attribute 'kernel'}}
"gpu.launch_func"(%sz, %sz, %sz, %sz, %sz, %sz)
{operand_segment_sizes = dense<[0, 1, 1, 1, 1, 1, 1, 0]> : vector<8xi32>}
: (index, index, index, index, index, index) -> ()
return
}

View File

@@ -73,6 +73,7 @@ module attributes {gpu.container_module} {
%1 = "op"() : () -> (memref<?xf32, 1>)
// CHECK: %{{.*}} = constant 8
%cst = constant 8 : index
%t0 = gpu.wait async
// CHECK: gpu.launch_func @kernels::@kernel_1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) args(%{{.*}} : f32, %{{.*}} : memref<?xf32, 1>)
gpu.launch_func @kernels::@kernel_1 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst) args(%0 : f32, %1 : memref<?xf32, 1>)
@@ -80,6 +81,9 @@ module attributes {gpu.container_module} {
// CHECK: gpu.launch_func @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
gpu.launch_func @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
// CHECK: %{{.*}} = gpu.launch_func async [%{{.*}}] @kernels::@kernel_2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}})
%t1 = gpu.launch_func async [%t0] @kernels::@kernel_2 blocks in (%cst, %cst, %cst) threads in (%cst, %cst, %cst)
return
}