[offload] Fix CUDA args size by subtracting tail padding (#172249)
This commit makes the cuLaunchKernel call pass the total size of the kernel arguments without tail padding.
committed by GitHub
parent 35b23172c5
commit 35315a84b4
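For context, here is a minimal sketch (not part of the commit) of the padding problem the change addresses, using the argument list of the new multiargs test kernel. It assumes an LP64 host ABI and that the device parameter layout matches the host layout for these simple types; the MultiArgs struct name and the main() driver are illustrative only:

// Illustrative only: padded vs. unpadded argument size for a kernel
// taking (char, int *, short), assuming an LP64 host ABI.
#include <cstddef>
#include <cstdio>

struct MultiArgs {
  char A;  // offset 0
  int *B;  // offset 8 (7 bytes of padding after A)
  short C; // offset 16
};         // sizeof == 24 because of 6 bytes of tail padding after C

int main() {
  size_t Padded = sizeof(MultiArgs);                        // 24
  size_t Unpadded = offsetof(MultiArgs, C) + sizeof(short); // 18
  std::printf("padded=%zu unpadded=%zu\n", Padded, Unpadded);
  return 0;
}

The host-side buffer handed to the plugin is the padded 24 bytes; after this change the plugin reports the unpadded 18 bytes (the last parameter's offset plus its size, as queried from the driver) to cuLaunchKernel.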
@@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
 DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
+DLWRAP(cuFuncGetParamInfo, 4)

 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
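The dynamic_cuda layer resolves driver entry points at runtime instead of linking libcuda directly, which is why the new API also needs a DLWRAP entry. Below is a rough sketch of the general dlsym-wrapper pattern such an entry stands for; this is not the actual dlwrap.h macro expansion, and LibCudaHandle is a hypothetical dlopen handle:

// Hand-written stand-in for a dynamically resolved driver call.
// Not the real DLWRAP expansion; LibCudaHandle is hypothetical.
#include <cuda.h>
#include <dlfcn.h>
#include <cstddef>

extern void *LibCudaHandle; // e.g. from dlopen("libcuda.so.1", RTLD_NOW)

CUresult cuFuncGetParamInfoShim(CUfunction Func, size_t ParamIndex,
                                size_t *ParamOffset, size_t *ParamSize) {
  using FnTy = CUresult (*)(CUfunction, size_t, size_t *, size_t *);
  static FnTy Real =
      reinterpret_cast<FnTy>(dlsym(LibCudaHandle, "cuFuncGetParamInfo"));
  return Real(Func, ParamIndex, ParamOffset, ParamSize);
}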
@@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                        CUmemAllocationGranularity_flags option);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
                                           CUoccupancyB2DSize, size_t, int);
+CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);

 #endif
@@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
     // The maximum number of threads cannot exceed the maximum of the kernel.
     MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);

-    return Plugin::success();
+    // Retrieve the size of the arguments.
+    return initArgsSize();
   }

   /// Launch the CUDA kernel function.
@@ -173,11 +174,32 @@ struct CUDAKernelTy : public GenericKernelTy {
   }

 private:
+  /// Initialize the size of the arguments.
+  Error initArgsSize() {
+    CUresult Res;
+    size_t ArgOffset, ArgSize;
+    size_t Arg = 0;
+
+    ArgsSize = 0;
+
+    // Find the last argument to know the total size of the arguments.
+    while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
+           CUDA_SUCCESS)
+      ArgsSize = ArgOffset + ArgSize;
+
+    if (Res != CUDA_ERROR_INVALID_VALUE)
+      return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
+    return Plugin::success();
+  }
+
   /// The CUDA kernel function to execute.
   CUfunction Func;
   /// The maximum amount of dynamic shared memory per thread group. By default,
   /// this is set to 48 KB.
   mutable uint32_t MaxDynCGroupMemLimit = 49152;
+
+  /// The size of the kernel arguments.
+  size_t ArgsSize;
 };

 /// Class wrapping a CUDA stream reference. These are the objects handled by the
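A worked trace of initArgsSize for the new multiargs(char, int *, short) kernel, assuming the driver reports naturally aligned parameter offsets: the probe loop sees (offset 0, size 1), (offset 8, size 8) and (offset 16, size 2), then cuFuncGetParamInfo fails with CUDA_ERROR_INVALID_VALUE for index 3, so ArgsSize ends up as 16 + 2 = 18 rather than the padded 24 bytes of the host struct. For a kernel with no parameters (the existing noargs test), the very first probe fails the same way and ArgsSize stays 0.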
@@ -1430,6 +1452,12 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                                AsyncInfoWrapperTy &AsyncInfoWrapper) const {
   CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

+  // The args size passed in LaunchParams may have tail padding, which is not
+  // accepted by the CUDA driver.
+  if (ArgsSize > LaunchParams.Size)
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "mismatch in kernel arguments");
+
   CUstream Stream;
   if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
     return Err;
@@ -1437,9 +1465,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
   uint32_t MaxDynCGroupMem =
       std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());

+  size_t ConfigArgsSize = ArgsSize;
   void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                     CU_LAUNCH_PARAM_BUFFER_SIZE,
-                    reinterpret_cast<void *>(&LaunchParams.Size),
+                    reinterpret_cast<void *>(&ConfigArgsSize),
                     CU_LAUNCH_PARAM_END};

   // If we are running an RPC server we want to wake up the server thread
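For context, this Config array becomes the extra parameter of cuLaunchKernel, with kernelParams left null when the packed-buffer form is used. A rough, self-contained sketch of that driver-API pattern (not the plugin's actual launch path; the grid/block sizes and the Func/Stream handles are placeholders):

// Sketch of launching with a packed argument buffer via the "extra" array.
// Placeholder handles; real code must create a context, load a module, etc.
#include <cuda.h>
#include <cstddef>

static CUresult launchPacked(CUfunction Func, CUstream Stream, void *ArgsBuffer,
                             size_t ArgsSizeNoPadding) {
  void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, ArgsBuffer,
                    CU_LAUNCH_PARAM_BUFFER_SIZE, &ArgsSizeNoPadding,
                    CU_LAUNCH_PARAM_END};
  // kernelParams is null: the driver reads arguments from the packed buffer,
  // using exactly ArgsSizeNoPadding bytes.
  return cuLaunchKernel(Func, /*gridDimX=*/1, 1, 1, /*blockDimX=*/1, 1, 1,
                        /*sharedMemBytes=*/0, Stream,
                        /*kernelParams=*/nullptr, Config);
}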
@@ -6,9 +6,6 @@
 // clang-format on

 // REQUIRES: gpu
-//
-// FIXME: https://github.com/llvm/llvm-project/issues/161265
-// UNSUPPORTED: gpu

 #include <stdio.h>
@@ -2,6 +2,7 @@ add_offload_test_device_code(foo.cpp foo)
 add_offload_test_device_code(bar.cpp bar)
 # Compile with optimizations to eliminate AMDGPU implicit arguments.
 add_offload_test_device_code(noargs.cpp noargs -O3)
+add_offload_test_device_code(multiargs.cpp multiargs -O3)
 add_offload_test_device_code(byte.cpp byte)
 add_offload_test_device_code(localmem.cpp localmem)
 add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -15,6 +16,7 @@ add_custom_target(offload_device_binaries DEPENDS
   foo.bin
   bar.bin
   noargs.bin
+  multiargs.bin
   byte.bin
   localmem.bin
   localmem_reduction.bin
offload/unittests/OffloadAPI/device_code/multiargs.cpp (new file, 3 lines)
@@ -0,0 +1,3 @@
+#include <gpuintrin.h>
+
+extern "C" __gpu_kernel void multiargs(char, int *, short) { (void)0; }
@@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {

 KERNEL_TEST(Foo, foo)
 KERNEL_TEST(NoArgs, noargs)
+KERNEL_TEST(MultiArgs, multiargs)
 KERNEL_TEST(Byte, byte)
 KERNEL_TEST(LocalMem, localmem)
 KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -135,6 +136,19 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
   ASSERT_SUCCESS(olSyncQueue(Queue));
 }

+TEST_P(olLaunchKernelMultiArgsTest, Success) {
+  struct {
+    char A;
+    int *B;
+    short C;
+  } Args{0, nullptr, 0};
+
+  ASSERT_SUCCESS(
+      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+}
+
 TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
   void *Mem;
   ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,