[offload] Fix CUDA args size by subtracting tail padding (#172249)

This commit makes the cuLaunchKernel call pass the total size of the arguments without tail padding.
2026-01-13 02:38:07 +08:00 · 2025-12-14 21:57:25 -08:00
parent 35b23172c5
commit 35315a84b4
7 changed files with 52 additions and 5 deletions
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.cpp
@@ -81,6 +81,7 @@ DLWRAP(cuDevicePrimaryCtxSetFlags, 2)
 DLWRAP(cuDevicePrimaryCtxRetain, 2)
 DLWRAP(cuModuleLoadDataEx, 5)
 DLWRAP(cuOccupancyMaxPotentialBlockSize, 6)
+DLWRAP(cuFuncGetParamInfo, 4)

 DLWRAP(cuDeviceCanAccessPeer, 3)
 DLWRAP(cuCtxEnablePeerAccess, 2)
--- a/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
+++ b/offload/plugins-nextgen/cuda/dynamic_cuda/cuda.h
@@ -390,5 +390,6 @@ CUresult cuMemGetAllocationGranularity(size_t *granularity,
                                       CUmemAllocationGranularity_flags option);
 CUresult cuOccupancyMaxPotentialBlockSize(int *, int *, CUfunction,
                                          CUoccupancyB2DSize, size_t, int);
+CUresult cuFuncGetParamInfo(CUfunction, size_t, size_t *, size_t *);

 #endif
--- a/offload/plugins-nextgen/cuda/src/rtl.cpp
+++ b/offload/plugins-nextgen/cuda/src/rtl.cpp
@@ -149,7 +149,8 @@ struct CUDAKernelTy : public GenericKernelTy {
    // The maximum number of threads cannot exceed the maximum of the kernel.
    MaxNumThreads = std::min(MaxNumThreads, (uint32_t)MaxThreads);

-    return Plugin::success();
+    // Retrieve the size of the arguments.
+    return initArgsSize();
  }

  /// Launch the CUDA kernel function.
@@ -173,11 +174,32 @@ struct CUDAKernelTy : public GenericKernelTy {
  }

 private:
+  /// Set ArgsSize to the end (offset + size) of the last kernel argument.
+  Error initArgsSize() {
+    CUresult Res;
+    size_t ArgOffset, ArgSize;
+    size_t Arg = 0;
+
+    ArgsSize = 0;
+
+    // Query arguments by index until the call fails; the last success wins.
+    while ((Res = cuFuncGetParamInfo(Func, Arg++, &ArgOffset, &ArgSize)) ==
+           CUDA_SUCCESS)
+      ArgsSize = ArgOffset + ArgSize;
+
+    if (Res != CUDA_ERROR_INVALID_VALUE) // any other code is a real failure
+      return Plugin::check(Res, "error in cuFuncGetParamInfo: %s");
+    return Plugin::success();
+  }
+
  /// The CUDA kernel function to execute.
  CUfunction Func;
  /// The maximum amount of dynamic shared memory per thread group. By default,
  /// this is set to 48 KB.
  mutable uint32_t MaxDynCGroupMemLimit = 49152;
+
+  /// The size of the kernel arguments.
+  size_t ArgsSize;
 };

 /// Class wrapping a CUDA stream reference. These are the objects handled by the
@@ -1430,6 +1452,12 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
                               AsyncInfoWrapperTy &AsyncInfoWrapper) const {
  CUDADeviceTy &CUDADevice = static_cast<CUDADeviceTy &>(GenericDevice);

+  // The args size passed in LaunchParams may have tail padding, which is not
+  // accepted by the CUDA driver.
+  if (ArgsSize > LaunchParams.Size)
+    return Plugin::error(ErrorCode::INVALID_ARGUMENT,
+                         "mismatch in kernel arguments");
+
  CUstream Stream;
  if (auto Err = CUDADevice.getStream(AsyncInfoWrapper, Stream))
    return Err;
@@ -1437,9 +1465,10 @@ Error CUDAKernelTy::launchImpl(GenericDeviceTy &GenericDevice,
  uint32_t MaxDynCGroupMem =
      std::max(KernelArgs.DynCGroupMem, GenericDevice.getDynamicMemorySize());

+  size_t ConfigArgsSize = ArgsSize;
  void *Config[] = {CU_LAUNCH_PARAM_BUFFER_POINTER, LaunchParams.Data,
                    CU_LAUNCH_PARAM_BUFFER_SIZE,
-                    reinterpret_cast<void *>(&LaunchParams.Size),
+                    reinterpret_cast<void *>(&ConfigArgsSize),
                    CU_LAUNCH_PARAM_END};

  // If we are running an RPC server we want to wake up the server thread
--- a/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
+++ b/offload/test/offloading/CUDA/basic_launch_multi_arg.cu
@@ -6,9 +6,6 @@
 // clang-format on

 // REQUIRES: gpu
-//
-// FIXME: https://github.com/llvm/llvm-project/issues/161265
-// UNSUPPORTED: gpu

 #include <stdio.h>

--- a/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
+++ b/offload/unittests/OffloadAPI/device_code/CMakeLists.txt
@@ -2,6 +2,7 @@ add_offload_test_device_code(foo.cpp foo)
 add_offload_test_device_code(bar.cpp bar)
 # Compile with optimizations to eliminate AMDGPU implicit arguments.
 add_offload_test_device_code(noargs.cpp noargs -O3)
+add_offload_test_device_code(multiargs.cpp multiargs -O3)
 add_offload_test_device_code(byte.cpp byte)
 add_offload_test_device_code(localmem.cpp localmem)
 add_offload_test_device_code(localmem_reduction.cpp localmem_reduction)
@@ -15,6 +16,7 @@ add_custom_target(offload_device_binaries DEPENDS
    foo.bin
    bar.bin
    noargs.bin
+    multiargs.bin
    byte.bin
    localmem.bin
    localmem_reduction.bin
--- a/offload/unittests/OffloadAPI/device_code/multiargs.cpp
+++ b/offload/unittests/OffloadAPI/device_code/multiargs.cpp
@@ -0,0 +1,3 @@
+#include <gpuintrin.h>
+
+extern "C" __gpu_kernel void multiargs(char, int *, short) { (void)0; }
--- a/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
+++ b/offload/unittests/OffloadAPI/kernel/olLaunchKernel.cpp
@@ -55,6 +55,7 @@ struct LaunchSingleKernelTestBase : LaunchKernelTestBase {

 KERNEL_TEST(Foo, foo)
 KERNEL_TEST(NoArgs, noargs)
+KERNEL_TEST(MultiArgs, multiargs)
 KERNEL_TEST(Byte, byte)
 KERNEL_TEST(LocalMem, localmem)
 KERNEL_TEST(LocalMemReduction, localmem_reduction)
@@ -135,6 +136,19 @@ TEST_P(olLaunchKernelNoArgsTest, Success) {
  ASSERT_SUCCESS(olSyncQueue(Queue));
 }

+TEST_P(olLaunchKernelMultiArgsTest, Success) {
+  struct {
+    char A;
+    int *B;
+    short C;
+  } Args{0, nullptr, 0}; // sizeof(Args) typically includes tail padding.
+
+  ASSERT_SUCCESS(
+      olLaunchKernel(Queue, Device, Kernel, &Args, sizeof(Args), &LaunchArgs));
+
+  ASSERT_SUCCESS(olSyncQueue(Queue));
+}
+
 TEST_P(olLaunchKernelFooTest, SuccessSynchronous) {
  void *Mem;
  ASSERT_SUCCESS(olMemAlloc(Device, OL_ALLOC_TYPE_MANAGED,