[offload] Fix CUDA args size by subtracting tail padding (#172249)

This commit makes the cuLaunchKernel call pass the total size of the kernel arguments without any tail padding.
This commit is contained in:
Kevin Sala Penades
2025-12-14 21:57:25 -08:00
committed by GitHub
parent 35b23172c5
commit 35315a84b4
7 changed files with 52 additions and 5 deletions

View File

@@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
DLWRAP(cuDevicePrimaryCtxRetain, 2)
DLWRAP(cuModuleLoadDataEx, 5)
DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
DLWRAP(cuFuncGetParamInfo, 4)
DLWRAP(cuDeviceCanAccessPeer, 3)
DLWRAP(cuCtxEnablePeerAccess, 2)

View File

@@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
CUmemAllocationGranularity_flags option);
CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
CUoccupancyB2DSize, size_t, int);
// Arguments, in order: kernel function, parameter index, out-pointer receiving
// the parameter's offset, out-pointer receiving the parameter's size.
CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);
#endif

View File

@@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
// The maximum number of threads cannot exceed the maximum of the kernel.
MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);
return Plugin::success();
// Retrieve the size of the arguments.
return initArgsSize();
}
/// Launch the CUDA kernel function.
@@ -173,11 +174,32 @@ struct CUDAKernelTy : public GenericKernelTy {
}
private:
/// Initialize the size of the arguments.
/// Compute and cache in ArgsSize the total size of the kernel arguments.
///
/// The parameters of Func are queried one index at a time, and ArgsSize is
/// updated to the end (offset + size) of each parameter successfully reported.
/// A kernel for which no parameter is reported ends up with a size of zero.
///
/// \returns success once the driver answers CUDA_ERROR_INVALID_VALUE, which is
/// taken as the end of the parameter list; any other failure is forwarded
/// through Plugin::check.
Error initArgsSize() {
  ArgsSize = 0;

  size_t ParamOffset, ParamSize;
  CUresult Status = CUDA_SUCCESS;
  for (size_t ParamIdx = 0;; ++ParamIdx) {
    Status = cuFuncGetParamInfo(Func, ParamIdx, &ParamOffset, &ParamSize);
    if (Status != CUDA_SUCCESS)
      break;

    // Each reported parameter overwrites the previous result, so after the
    // loop this holds the end of the last parameter.
    ArgsSize = ParamOffset + ParamSize;
  }

  // Running past the last parameter is the expected way out of the loop.
  if (Status == CUDA_ERROR_INVALID_VALUE)
    return Plugin::success();

  return Plugin::check(Status, "error in cuFuncGetParamInfo: %s");
}
/// The CUDA kernel function to execute.
CUfunction Func;
/// The maximum amount of dynamic shared memory per thread group. By default,
/// this is set to 48 KB.
mutable uint32_t MaxDynCGroupMemLimit = 49152;
/// The size of the kernel arguments. It has no default initializer here;
/// initArgsSize() resets it to zero and then sets it to the offset plus size
/// of the last parameter reported by cuFuncGetParamInfo. The launch path
/// compares it against the caller-provided size and passes it to the driver
/// as the argument buffer size.
size_t ArgsSize;
};
/// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1430,6 +1452,12 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
AsyncInfoWrapperTy &AsyncInfoWrapper) const {
CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);
// The args size passed in LaunchParams may have tail padding, which is not
// accepted by the CUDA driver.
if (ArgsSize > LaunchParams.Size)
return Plugin::error(ErrorCode::INVALID_ARGUMENT,
"mismatch in kernel arguments");
CUstream Stream;
if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
return Err;
@@ -1437,9 +1465,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
uint32_t MaxDynCGroupMem =
std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());
size_t ConfigArgsSize = ArgsSize;
void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
CU_LAUNCH_PARAM_BUFFER_SIZE,
reinterpret_cast<void *>(&LaunchParams.Size),
reinterpret_cast<void *>(&ConfigArgsSize),
CU_LAUNCH_PARAM_END};
// If we are running an RPC server we want to wake up the server thread

View File

@@ -6,9 +6,6 @@
// clang-format on
// REQUIRES: gpu
//
// FIXME: https://github.com/llvm/llvm-project/issues/161265
// UNSUPPORTED: gpu
#include <stdio.h>

View File

@@ -2,6 +2,7 @@ add_offload_test_device_code(foo.cpp foo)
add_offload_test_device_code(bar.cpp bar)
# Compile with optimizations to eliminate AMDGPU implicit arguments.
add_offload_test_device_code(noargs.cpp noargs -O3)
add_offload_test_device_code(multiargs.cpp multiargs -O3)
add_offload_test_device_code(byte.cpp byte)
add_offload_test_device_code(localmem.cpp localmem)
add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -15,6 +16,7 @@ add_custom_target(offload_device_binaries DEPENDS
foo.bin
bar.bin
noargs.bin
multiargs.bin
byte.bin
localmem.bin
localmem_reduction.bin

View File

@@ -0,0 +1,3 @@
#include <gpuintrin.h>
// Kernel with an empty body that takes three arguments of different types
// (char, int pointer, short).
extern "C" __gpu_kernel void multiargs(char, int *, short) {
  (void)0;
}

View File

@@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {
KERNEL_TEST(Foo, foo)
KERNEL_TEST(NoArgs, noargs)
KERNEL_TEST(MultiArgs, multiargs)
KERNEL_TEST(Byte, byte)
KERNEL_TEST(LocalMem, localmem)
KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -135,6 +136,19 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
ASSERT_SUCCESS(olSyncQueue(Queue));
}
// Launch the `multiargs` kernel with a host-side argument struct mirroring its
// (char, int *, short) signature, and check that both the launch and the queue
// synchronization succeed.
//
// The fixture is the one declared by KERNEL_TEST(MultiArgs, multiargs), so it
// must be spelled olLaunchKernelMultiArgsTest, following the same pattern as
// KERNEL_TEST(NoArgs, noargs) / olLaunchKernelNoArgsTest.
TEST_P(olLaunchKernelMultiArgsTest, Success) {
  // sizeof(Args) is deliberately passed as-is: on common ABIs the struct ends
  // with padding after the trailing short, which is the situation this test
  // is meant to cover.
  struct {
    char A;
    int *B;
    short C;
  } Args{0, nullptr, 0};

  // The argument buffer is passed by address, not by value.
  ASSERT_SUCCESS(
      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));

  ASSERT_SUCCESS(olSyncQueue(Queue));
}
TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
void *Mem;
ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,