[Offload] Remove handling for device memory pool (#163629)

Summary:
This was a lot of code that was only used for upstream LLVM builds of
AMDGPU offloading. We now have a generic and fast `malloc` in `libc`, so
we just use that. This simplifies the code; it can be added back if we
start providing alternate forms, but I don't think there is a single
use case that would justify it yet.
This commit is contained in:
Joseph Huber
2025-11-06 10:15:18 -06:00
committed by GitHub
parent 527b7a48c6
commit 670c453aeb
15 changed files with 50 additions and 215 deletions

View File

@@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t {
Assertion = 1U << 0,
FunctionTracing = 1U << 1,
CommonIssues = 1U << 2,
AllocationTracker = 1U << 3,
PGODump = 1U << 4,
};
@@ -36,27 +35,6 @@ struct DeviceEnvironmentTy {
uint64_t HardwareParallelism;
};
struct DeviceMemoryPoolTy {
void *Ptr;
uint64_t Size;
};
struct DeviceMemoryPoolTrackingTy {
uint64_t NumAllocations;
uint64_t AllocationTotal;
uint64_t AllocationMin;
uint64_t AllocationMax;
void combine(DeviceMemoryPoolTrackingTy &Other) {
NumAllocations += Other.NumAllocations;
AllocationTotal += Other.AllocationTotal;
AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
: AllocationMin;
AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
: AllocationMax;
}
};
// NOTE: Please don't change the order of those members as their indices are
// used in the middle end. Always add the new data member at the end.
// Different from KernelEnvironmentTy below, this structure contains members

View File

@@ -3109,17 +3109,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
StackSize = Value;
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
Value = DeviceMemoryPoolSize;
return Plugin::success();
}
Error setDeviceHeapSize(uint64_t Value) override {
for (DeviceImageTy *Image : LoadedImages)
if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
return Err;
DeviceMemoryPoolSize = Value;
return Plugin::success();
}
Error getDeviceMemorySize(uint64_t &Value) override {
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
if (Pool->isGlobal()) {
@@ -3321,9 +3310,6 @@ private:
/// Reference to the host device.
AMDHostDeviceTy &HostDevice;
/// The current size of the global device memory pool (managed by us).
uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
/// The current size of the stack that will be used in cases where it could
/// not be statically determined.
uint64_t StackSize = 16 * 1024 /* 16 KB */;

View File

@@ -819,10 +819,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
Error unloadBinary(DeviceImageTy *Image);
virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
/// Setup the global device memory pool, if the plugin requires one.
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
uint64_t PoolSize);
// Setup the RPC server for this device if needed. This may not run on some
// plugins like the CPU targets. By default, it will not be executed so it is
// up to the target to override this using the shouldSetupRPCServer function.
@@ -1067,6 +1063,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
virtual Error getDeviceStackSize(uint64_t &V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) {
return Plugin::error(error::ErrorCode::UNSUPPORTED,
"%s not supported by platform", __func__);
}
virtual Error setDeviceHeapSize(uint64_t V) {
return Plugin::error(error::ErrorCode::UNSUPPORTED,
"%s not supported by platform", __func__);
}
/// Returns true if current plugin architecture is an APU
/// and unified_shared_memory was not requested by the program.
bool useAutoZeroCopy();
@@ -1159,12 +1164,6 @@ private:
/// plugin can implement the setters as no-op and setting the output
/// value to zero for the getters.
virtual Error setDeviceStackSize(uint64_t V) = 0;
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
virtual Error setDeviceHeapSize(uint64_t V) = 0;
/// Indicate whether the device should set up the global device memory pool. If
/// false is returned, the value on the device will be uninitialized.
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
/// Indicate whether or not the device should setup the RPC server. This is
/// only necessary for unhosted targets like the GPU.
@@ -1251,10 +1250,6 @@ protected:
/// Internal representation for OMPT device (initialize & finalize)
std::atomic<bool> OmptInitialized;
#endif
private:
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
};
/// Class implementing common functionalities of offload plugins. Each plugin

View File

@@ -795,19 +795,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
if (auto Err = callGlobalDestructors(Plugin, *Image))
return Err;
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&ImageDeviceMemoryPoolTracking);
if (auto Err =
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
consumeError(std::move(Err));
}
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
}
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
if (!ProfOrErr)
@@ -833,22 +820,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
return Err;
LoadedImages.clear();
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
// TODO: Write this by default into a file.
printf("\n\n|-----------------------\n"
"| Device memory tracker:\n"
"|-----------------------\n"
"| #Allocations: %lu\n"
"| Byes allocated: %lu\n"
"| Minimal allocation: %lu\n"
"| Maximal allocation: %lu\n"
"|-----------------------\n\n\n",
DeviceMemoryPoolTracking.NumAllocations,
DeviceMemoryPoolTracking.AllocationTotal,
DeviceMemoryPoolTracking.AllocationMin,
DeviceMemoryPoolTracking.AllocationMax);
}
// Delete the memory manager before deinitializing the device. Otherwise,
// we may delete device allocations after the device is deinitialized.
if (MemoryManager)
@@ -901,18 +872,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
// Add the image to list.
LoadedImages.push_back(Image);
// Setup the global device memory pool if needed.
if (!Plugin.getRecordReplay().isReplaying() &&
shouldSetupDeviceMemoryPool()) {
uint64_t HeapSize;
auto SizeOrErr = getDeviceHeapSize(HeapSize);
if (SizeOrErr) {
REPORT("No global device memory pool due to error: %s\n",
toString(std::move(SizeOrErr)).data());
} else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
return std::move(Err);
}
if (auto Err = setupRPCServer(Plugin, *Image))
return std::move(Err);
@@ -936,51 +895,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
return Image;
}
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
DeviceImageTy &Image,
uint64_t PoolSize) {
// Free the old pool, if any.
if (DeviceMemoryPool.Ptr) {
if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
TargetAllocTy::TARGET_ALLOC_DEVICE))
return Err;
}
DeviceMemoryPool.Size = PoolSize;
auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
TargetAllocTy::TARGET_ALLOC_DEVICE);
if (AllocOrErr) {
DeviceMemoryPool.Ptr = *AllocOrErr;
} else {
auto Err = AllocOrErr.takeError();
REPORT("Failure to allocate device memory for global memory pool: %s\n",
toString(std::move(Err)).data());
DeviceMemoryPool.Ptr = nullptr;
DeviceMemoryPool.Size = 0;
}
// Create the metainfo of the device environment global.
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
if (!GHandler.isSymbolInImage(*this, Image,
"__omp_rtl_device_memory_pool_tracker")) {
DP("Skip the memory pool as there is no tracker symbol in the image.");
return Error::success();
}
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
sizeof(DeviceMemoryPoolTrackingTy),
&DeviceMemoryPoolTracking);
if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
return Err;
// Create the metainfo of the device environment global.
GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
// Write device environment values to the device.
return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
}
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
DeviceImageTy &Image) {
// The plugin either does not need an RPC server or it is unavailable.

View File

@@ -1235,11 +1235,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
return Info;
}
virtual bool shouldSetupDeviceMemoryPool() const override {
/// We use the CUDA malloc for now.
return false;
}
/// Getters and setters for stack and heap sizes.
Error getDeviceStackSize(uint64_t &Value) override {
return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);

View File

@@ -380,9 +380,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
return Info;
}
/// This plugin should not setup the device environment or memory pool.
virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
/// Getters and setters for stack size and heap size not relevant.
Error getDeviceStackSize(uint64_t &Value) override {
Value = 0;
@@ -391,11 +388,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
Error setDeviceStackSize(uint64_t Value) override {
return Plugin::success();
}
Error getDeviceHeapSize(uint64_t &Value) override {
Value = 0;
return Plugin::success();
}
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
private:
/// Grid values for Generic ELF64 plugins.

View File

@@ -4,6 +4,8 @@
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
// RUN: %libomptarget-compileoptxx-run-and-check-generic
// REQUIRES: libc
#include <iostream>
template <typename LOOP_BODY>

View File

@@ -10,7 +10,7 @@ int main() {
int Threads = 64;
int Teams = 10;
// Allocate ~55MB on the device.
// Allocate ~160 KiB on the device.
#pragma omp target map(from : DP)
DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);

View File

@@ -14,18 +14,12 @@
#include "DeviceTypes.h"
// Forward declaration.
struct KernelEnvironmentTy;
namespace ompx {
namespace allocator {
static uint64_t constexpr ALIGNMENT = 16;
/// Initialize the allocator according to \p KernelEnvironment
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
/// Allocate \p Size bytes.
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
alloc(uint64_t Size);

View File

@@ -18,42 +18,36 @@
#include "Synchronization.h"
using namespace ompx;
using namespace allocator;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility(
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
[[gnu::used, gnu::retain, gnu::weak,
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
__omp_rtl_device_memory_pool_tracker;
// Provide a default implementation of malloc / free for AMDGPU platforms built
// without 'libc' support.
extern "C" {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
#else
[[gnu::leaf]] void *malloc(size_t Size);
[[gnu::leaf]] void free(void *Ptr);
#endif
}
/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
/// directly.
static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024;
alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0};
// Fallback bump pointer interface for platforms without a functioning
// allocator.
struct BumpAllocatorTy final {
uint64_t Offset = 0;
void *alloc(uint64_t Size) {
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
atomic::seq_cst);
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
atomic::seq_cst);
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
atomic::seq_cst);
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
atomic::seq_cst);
}
uint64_t *Data =
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
uint64_t End =
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
if (OldData + Size > End)
uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst);
if (OldData + Size >= MEMORY_SIZE)
__builtin_trap();
return reinterpret_cast<void *>(OldData);
return &Memory[OldData];
}
void free(void *) {}
@@ -65,13 +59,20 @@ BumpAllocatorTy BumpAllocator;
///
///{
void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
// more than one.
void *allocator::alloc(uint64_t Size) {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
return BumpAllocator.alloc(Size);
#else
return ::malloc(Size);
#endif
}
void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
void allocator::free(void *Ptr) {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
BumpAllocator.free(Ptr);
#else
::free(Ptr);
#endif
}
///}

View File

@@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
synchronize::init(IsSPMD);
mapping::init(IsSPMD);
state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
allocator::init(IsSPMD, KernelEnvironment);
workshare::init(IsSPMD);
}

View File

@@ -100,7 +100,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
case omp_const_mem_alloc:
case omp_high_bw_mem_alloc:
case omp_low_lat_mem_alloc:
return malloc(size);
return ompx::allocator::alloc(size);
default:
return nullptr;
}
@@ -113,7 +113,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
case omp_const_mem_alloc:
case omp_high_bw_mem_alloc:
case omp_low_lat_mem_alloc:
free(ptr);
ompx::allocator::free(ptr);
return;
case omp_null_allocator:
default:

View File

@@ -44,26 +44,6 @@ using namespace ompx;
namespace {
/// Fallback implementations are missing to trigger a link time error.
/// Implementations for new devices, including the host, should go into a
/// dedicated begin/end declare variant.
///
///{
extern "C" {
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
#else
[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
#endif
}
///}
/// A "smart" stack in shared memory.
///
/// The stack exposes a malloc/free interface but works like a stack internally.
@@ -171,13 +151,13 @@ void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
}
void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
void *Ptr = malloc(Bytes);
void *Ptr = allocator::alloc(Bytes);
if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
printf("nullptr returned by malloc!\n");
return Ptr;
}
void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
void memory::freeGlobal(void *Ptr, const char *Reason) { allocator::free(Ptr); }
///}

View File

@@ -1521,5 +1521,4 @@ debugging features are supported.
* Enable debugging assertions in the device. ``0x01``
* Enable diagnosing common problems during offloading. ``0x4``
* Enable device malloc statistics (amdgpu only). ``0x8``
* Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``