[Offload] Add device info for shared memory (#167817)

Author:       Kevin Sala Penades
Date:         2025-11-13 11:00:12 -08:00
Committed by: GitHub
Parent:       e5f499f48f
Commit:       1a86f0aae7

8 changed files with 47 additions and 4 deletions
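
This change adds a new OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE query to the
liboffload device-info API and wires it through the AMDGPU and CUDA plugins.
A minimal usage sketch, not taken from this commit, assuming liboffload's
olGetDeviceInfo(device, property, size, value) entry point, that OL_SUCCESS
denotes a successful result, and the OffloadAPI.h header name:

    #include <OffloadAPI.h>

    #include <cstdint>
    #include <cstdio>

    // Hypothetical helper: query the new key for an already-obtained device.
    static void printGroupLocalMemSize(ol_device_handle_t Device) {
      uint64_t LocalMemSize = 0;
      // Fixed-size query; the value is tagged as uint64_t in the tablegen entry below.
      if (olGetDeviceInfo(Device, OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE,
                          sizeof(LocalMemSize), &LocalMemSize) != OL_SUCCESS)
        return; // Handle the query error as appropriate.
      // The host device reports 0; GPU devices report their per-work-group limit.
      std::printf("Work group local memory: %llu bytes\n",
                  (unsigned long long)LocalMemSize);
    }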


@@ -43,6 +43,7 @@ def ol_device_info_t : Enum {
TaggedEtor<"ADDRESS_BITS", "uint32_t", "Number of bits used to represent an address in device memory">,
TaggedEtor<"MAX_MEM_ALLOC_SIZE", "uint64_t", "The maximum size of memory object allocation in bytes">,
TaggedEtor<"GLOBAL_MEM_SIZE", "uint64_t", "The size of global device memory in bytes">,
TaggedEtor<"WORK_GROUP_LOCAL_MEM_SIZE", "uint64_t", "The maximum size of local shared memory per work group in bytes">,
];
list<TaggedEtor> fp_configs = !foreach(type, ["Single", "Double", "Half"], TaggedEtor<type # "_FP_CONFIG", "ol_device_fp_capability_flags_t", type # " precision floating point capability">);
list<TaggedEtor> native_vec_widths = !foreach(type, ["char","short","int","long","float","double","half"], TaggedEtor<"NATIVE_VECTOR_WIDTH_" # type, "uint32_t", "Native vector width for " # type>);
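
For reference, a rough sketch, not generated output, of how the new TaggedEtor
is expected to surface in the C header produced by offload-tblgen: an extra
ol_device_info_t enumerator whose query result is typed uint64_t (the
doc-comment style and enumerator placement here are assumptions):

    typedef enum ol_device_info_t {
      // ... existing enumerators ...
      /// [uint64_t] The maximum size of local shared memory per work group in bytes
      OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE,
      // ... remaining enumerators ...
    } ol_device_info_t;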


@@ -495,6 +495,13 @@ Error olGetDeviceInfoImplDetail(ol_device_handle_t Device,
     return Info.write(static_cast<uint32_t>(Value));
   }
+  case OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE: {
+    if (!std::holds_alternative<uint64_t>(Entry->Value))
+      return makeError(ErrorCode::BACKEND_FAILURE,
+                       "plugin returned incorrect type");
+    return Info.write(std::get<uint64_t>(Entry->Value));
+  }
   case OL_DEVICE_INFO_MAX_WORK_SIZE_PER_DIMENSION:
   case OL_DEVICE_INFO_MAX_WORK_GROUP_SIZE_PER_DIMENSION: {
     // {x, y, z} triples
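
The new case follows the same pattern as the other fixed-size queries: the
plugin reports device info values through a std::variant, and the uint64_t
alternative must be active before the value is written back to the caller.
A standalone illustration of that type-checked read, using a hypothetical
variant composition:

    #include <cstdint>
    #include <optional>
    #include <string>
    #include <variant>

    // Hypothetical stand-in for the plugin-side info value type.
    using InfoValue = std::variant<std::string, bool, uint32_t, uint64_t>;

    // Returns the value only if a uint64_t was actually stored, mirroring the
    // BACKEND_FAILURE path above.
    std::optional<uint64_t> readUInt64(const InfoValue &Value) {
      if (!std::holds_alternative<uint64_t>(Value))
        return std::nullopt;
      return std::get<uint64_t>(Value);
    }
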
@@ -590,6 +597,7 @@ Error olGetDeviceInfoImplDetailHost(ol_device_handle_t Device,
     return Info.write<uint32_t>(std::numeric_limits<uintptr_t>::digits);
   case OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE:
   case OL_DEVICE_INFO_GLOBAL_MEM_SIZE:
+  case OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE:
     return Info.write<uint64_t>(0);
   default:
     return createOffloadError(ErrorCode::INVALID_ENUMERATION,


@@ -2186,6 +2186,16 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = checkIfAPU())
       return Err;
+    // Retrieve the size of the group memory.
+    for (const auto *Pool : AllMemoryPools) {
+      if (Pool->isGroup()) {
+        if (auto Err = Pool->getAttr(HSA_AMD_MEMORY_POOL_INFO_SIZE,
+                                     MaxBlockSharedMemSize))
+          return Err;
+        break;
+      }
+    }
     return Plugin::success();
   }
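
On AMD GPUs the per-work-group limit corresponds to the size of the group (LDS)
memory pool, so the code above picks the first pool flagged as a group pool and
reads its size into MaxBlockSharedMemSize. A rough sketch, not plugin code, of
what the Pool->isGroup()/getAttr helpers boil down to in raw HSA terms (the
header path and helper name are assumptions):

    #include <hsa/hsa_ext_amd.h>

    #include <cstddef>

    // Returns true and sets SizeOut if Pool is the group (LDS) memory pool.
    static bool getGroupPoolSize(hsa_amd_memory_pool_t Pool, size_t &SizeOut) {
      hsa_amd_segment_t Segment;
      if (hsa_amd_memory_pool_get_info(Pool, HSA_AMD_MEMORY_POOL_INFO_SEGMENT,
                                       &Segment) != HSA_STATUS_SUCCESS ||
          Segment != HSA_AMD_SEGMENT_GROUP)
        return false;
      return hsa_amd_memory_pool_get_info(Pool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
                                          &SizeOut) == HSA_STATUS_SUCCESS;
    }
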
@@ -2923,6 +2933,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Cacheline Size", TmpUInt);
+    Info.add("Max Shared Memory per Work Group", MaxBlockSharedMemSize, "bytes",
+             DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
     Status = getDeviceAttrRaw(HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY, TmpUInt);
     if (Status == HSA_STATUS_SUCCESS)
       Info.add("Max Clock Freq", TmpUInt, "MHz",


@@ -794,6 +794,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   /// Get the unique identifier of the device.
   const char *getDeviceUid() const { return DeviceUid.c_str(); }
+  /// Get the total shared memory per block (in bytes) that can be used in any
+  /// kernel.
+  size_t getMaxBlockSharedMemSize() const { return MaxBlockSharedMemSize; }
   /// Set the context of the device if needed, before calling device-specific
   /// functions. Plugins may implement this function as a no-op if not needed.
   virtual Error setContext() = 0;
@@ -1251,6 +1255,9 @@ protected:
   /// Internal representation for OMPT device (initialize & finalize)
   std::atomic<bool> OmptInitialized;
 #endif
+  /// The total per-block native shared memory that a kernel may use.
+  size_t MaxBlockSharedMemSize = 0;
 };
 /// Class implementing common functionalities of offload plugins. Each plugin


@@ -379,6 +379,12 @@ struct CUDADeviceTy : public GenericDeviceTy {
       return Err;
     HardwareParallelism = NumMuliprocessors * (MaxThreadsPerSM / WarpSize);
+    uint32_t MaxSharedMem;
+    if (auto Err = getDeviceAttr(
+            CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, MaxSharedMem))
+      return Err;
+    MaxBlockSharedMemSize = MaxSharedMem;
     return Plugin::success();
   }
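
For the CUDA plugin the value comes from the driver's per-block shared memory
attribute, queried during device setup and cached in MaxBlockSharedMemSize.
A rough sketch, not plugin code, of the driver call that getDeviceAttr wraps
(the helper name below is hypothetical):

    #include <cuda.h>

    #include <cstddef>

    // Queries the static per-block shared memory limit (commonly 48 KiB).
    static bool getMaxSharedMemPerBlock(CUdevice Device, size_t &SizeOut) {
      int MaxSharedMem = 0;
      if (cuDeviceGetAttribute(&MaxSharedMem,
                               CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
                               Device) != CUDA_SUCCESS)
        return false;
      SizeOut = static_cast<size_t>(MaxSharedMem);
      return true;
    }
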
@@ -1089,10 +1095,8 @@ struct CUDADeviceTy : public GenericDeviceTy {
     if (Res == CUDA_SUCCESS)
       Info.add("Total Constant Memory", TmpInt, "bytes");
-    Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK,
-                           TmpInt);
-    if (Res == CUDA_SUCCESS)
-      Info.add("Max Shared Memory per Block", TmpInt, "bytes");
+    Info.add("Max Shared Memory per Block", MaxBlockSharedMemSize, "bytes",
+             DeviceInfo::WORK_GROUP_LOCAL_MEM_SIZE);
     Res = getDeviceAttrRaw(CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, TmpInt);
     if (Res == CUDA_SUCCESS)


@@ -205,6 +205,9 @@ ol_result_t printDevice(std::ostream &S, ol_device_handle_t D) {
       S, D, OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE, "Max Mem Allocation Size", "B"));
   OFFLOAD_ERR(printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_GLOBAL_MEM_SIZE,
                                          "Global Mem Size", "B"));
+  OFFLOAD_ERR(
+      printDeviceValue<uint64_t>(S, D, OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE,
+                                 "Work Group Shared Mem Size", "B"));
   OFFLOAD_ERR(
       (printDeviceValue<ol_device_fp_capability_flags_t, PrintKind::FP_FLAGS>(
           S, D, OL_DEVICE_INFO_SINGLE_FP_CONFIG,


@@ -217,6 +217,11 @@ OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(GlobalMemSize, uint64_t,
                                     OL_DEVICE_INFO_GLOBAL_MEM_SIZE, 0);
 OL_DEVICE_INFO_TEST_HOST_SUCCESS(GlobalMemSize, uint64_t,
                                  OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_TEST_DEVICE_VALUE_GT(SharedMemSize, uint64_t,
+                                    OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE,
+                                    0);
+OL_DEVICE_INFO_TEST_HOST_SUCCESS(SharedMemSize, uint64_t,
+                                 OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE);
 TEST_P(olGetDeviceInfoTest, InvalidNullHandleDevice) {
   ol_device_type_t DeviceType;


@@ -71,6 +71,8 @@ OL_DEVICE_INFO_SIZE_TEST_EQ(MaxMemAllocSize, uint64_t,
                             OL_DEVICE_INFO_MAX_MEM_ALLOC_SIZE);
 OL_DEVICE_INFO_SIZE_TEST_EQ(GlobalMemSize, uint64_t,
                             OL_DEVICE_INFO_GLOBAL_MEM_SIZE);
+OL_DEVICE_INFO_SIZE_TEST_EQ(SharedMemSize, uint64_t,
+                            OL_DEVICE_INFO_WORK_GROUP_LOCAL_MEM_SIZE);
 TEST_P(olGetDeviceInfoSizeTest, SuccessMaxWorkGroupSizePerDimension) {
   size_t Size = 0;