[AMDGPU][Libomptarget] Collect allocatable memory pools using HSA

The logic is similar to that of system.cpp, with one change: instead of adding all the memory pools to a device struct, it keeps only a single pool. The existing approach also always allocated memory on the first HSA pool found for a GPU. This depends on D104691. The goal of this series of patches is to remove the g_atl_machine global; the next patch will drop g_atl_machine entirely. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D104695
2026-02-03 10:39:35 +08:00 · 2021-06-28 11:27:05 +00:00
parent 8474bb13c3
commit 20df2c7052
1 changed files with 154 additions and 10 deletions
--- a/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins/amdgpu/src/rtl.cpp
@@ -17,6 +17,7 @@
 #include <cstring>
 #include <elf.h>
 #include <fstream>
+#include <functional>
 #include <iostream>
 #include <libelf.h>
 #include <list>
@@ -104,6 +105,16 @@ template <typename C> hsa_status_t iterate_agents(C cb) {
  return hsa_iterate_agents(L, static_cast<void *>(&cb));
 }

+/// Runs hsa_amd_agent_iterate_memory_pools over \p Agent, forwarding each
+/// reported pool to \p cb and passing \p cb's status back to the runtime.
+template <typename C>
+hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
+  auto Thunk = [](hsa_amd_memory_pool_t Pool, void *Ctx) -> hsa_status_t {
+    return (*static_cast<C *>(Ctx))(Pool);
+  };
+  return hsa_amd_agent_iterate_memory_pools(Agent, Thunk, &cb);
+}
+
 } // namespace hsa

 /// Keep entries table per device
@@ -329,18 +340,60 @@ hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
    return err;
  }

+  size_t size = 0;
+  err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
+                                     &size);
+  if (err != HSA_STATUS_SUCCESS) {
+    fprintf(stderr, "Get memory pool size failed: %s\n", get_error_string(err));
+    return err;
+  }
+
  if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
-      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
-    size_t size = 0;
-    err = hsa_amd_memory_pool_get_info(MemoryPool,
-                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
-    if (err != HSA_STATUS_SUCCESS) {
-      fprintf(stderr, "Get memory pool size failed: %s\n",
-              get_error_string(err));
-      return err;
+      (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) &&
+      size > 0) {
+    Result->push_back(MemoryPool);
+  }
+
+  return HSA_STATUS_SUCCESS;
+}
+
+/// Asks HSA whether runtime allocation is allowed from \p MemoryPool.
+/// \returns {status, allowed}; a failed query is logged to stderr and
+/// reported as {error status, false}.
+std::pair<hsa_status_t, bool>
+isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
+  bool Allowed = false;
+  const hsa_status_t Status = hsa_amd_memory_pool_get_info(
+      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED, &Allowed);
+  if (Status == HSA_STATUS_SUCCESS)
+    return {Status, Allowed};
+  fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
+          get_error_string(Status));
+  return {Status, false};
+}
+
+template <typename AccumulatorFunc>
+hsa_status_t collectMemoryPools(const std::vector<hsa_agent_t> &Agents,
+                                AccumulatorFunc Func) {
+  for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
+    hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
+        Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
+          hsa_status_t Err;
+          bool Valid = false;
+          std::tie(Err, Valid) = isValidMemoryPool(MemoryPool);
+          if (Err != HSA_STATUS_SUCCESS) {
+            return Err;
+          }
+          if (Valid)
+            Func(MemoryPool, DeviceId);
+          return HSA_STATUS_SUCCESS;
+        });
+
+    if (Err != HSA_STATUS_SUCCESS) {
+      printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
+             "Iterate all memory pools", get_error_string(Err));
+      return Err;
    }
-    if (size > 0)
-      Result->push_back(MemoryPool);
  }

  return HSA_STATUS_SUCCESS;
@@ -421,6 +474,13 @@ public:

  hsa_amd_memory_pool_t KernArgPool;

+  // Fine-grained memory pool for host allocations (set by addHostMemoryPool).
+  hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
+  // Fine- and coarse-grained memory pools per offloading device, indexed by
+  // device id; both vectors are sized to NumberOfDevices in the constructor.
+  std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
+  std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
+
  struct atmiFreePtrDeletor {
    void operator()(void *p) {
      core::Runtime::Memfree(p); // ignore failure to free
@@ -523,6 +583,82 @@ public:
    E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
  }

+  /// Stores \p MemoryPool as device \p DeviceId's fine- or coarse-grained pool
+  /// according to its global flags, overwriting any pool of that kind stored
+  /// earlier. \returns the flags-query status; nothing is stored on failure.
+  hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
+                                   int DeviceId) {
+    const size_t Slot = static_cast<size_t>(DeviceId); // negative wraps high
+    assert(Slot < DeviceFineGrainedMemoryPools.size() &&
+           Slot < DeviceCoarseGrainedMemoryPools.size() && "Invalid device Id");
+    uint32_t GlobalFlags = 0;
+    const hsa_status_t Err = hsa_amd_memory_pool_get_info(
+        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+    if (Err != HSA_STATUS_SUCCESS)
+      return Err;
+    if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED)
+      DeviceFineGrainedMemoryPools[Slot] = MemoryPool;
+    else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED)
+      DeviceCoarseGrainedMemoryPools[Slot] = MemoryPool;
+    return HSA_STATUS_SUCCESS;
+  }
+
+  /// Keeps \p MemoryPool as the host pool if it is fine-grained and non-empty;
+  /// the last qualifying pool wins. \p DeviceId is not used by this function.
+  /// \returns the first failing HSA query status, else HSA_STATUS_SUCCESS.
+  hsa_status_t addHostMemoryPool(hsa_amd_memory_pool_t MemoryPool,
+                                 int DeviceId) {
+    uint32_t GlobalFlags = 0;
+    hsa_status_t Err = hsa_amd_memory_pool_get_info(
+        MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);
+    if (Err != HSA_STATUS_SUCCESS)
+      return Err;
+    // HSA_AMD_MEMORY_POOL_INFO_SIZE is reported as a size_t, so the
+    // destination must be that wide; a uint32_t here would be overrun.
+    size_t Size = 0;
+    Err = hsa_amd_memory_pool_get_info(MemoryPool,
+                                       HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size);
+    if (Err != HSA_STATUS_SUCCESS)
+      return Err;
+
+    if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
+        Size > 0) {
+      HostFineGrainedMemoryPool = MemoryPool;
+    }
+    return HSA_STATUS_SUCCESS;
+  }
+
+  /// Gathers the host pool from CPUAgents, then the per-device pools from
+  /// HSAAgents; a failing step is logged to stderr and its status returned.
+  hsa_status_t setupMemoryPools() {
+    using namespace std::placeholders;
+    hsa_status_t Status = core::collectMemoryPools(
+        CPUAgents, std::bind(&RTLDeviceInfoTy::addHostMemoryPool, this, _1, _2));
+    if (Status != HSA_STATUS_SUCCESS) {
+      fprintf(stderr, "HSA error in collecting memory pools for CPU: %s\n",
+              get_error_string(Status));
+      return Status;
+    }
+
+    Status = core::collectMemoryPools(
+        HSAAgents, std::bind(&RTLDeviceInfoTy::addDeviceMemoryPool, this, _1, _2));
+    if (Status != HSA_STATUS_SUCCESS)
+      fprintf(stderr,
+              "HSA error in collecting memory pools for offload devices: %s\n",
+              get_error_string(Status));
+    return Status;
+  }
+
+  hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
+    assert(DeviceId >= 0 && DeviceId < DeviceCoarseGrainedMemoryPools.size() &&
+           "Invalid device Id"); // DeviceId must index the coarse-grained table
+    return DeviceCoarseGrainedMemoryPools[DeviceId]; // coarse-grained only
+  }
+
+  hsa_amd_memory_pool_t getHostMemoryPool() {
+    return HostFineGrainedMemoryPool; // pool last kept by addHostMemoryPool
+  }
+
  RTLDeviceInfoTy() {
    // LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
    // anytime. You do not need a debug library build.
@@ -581,6 +717,14 @@ public:
    deviceStateStore.resize(NumberOfDevices);
    KernelInfoTable.resize(NumberOfDevices);
    SymbolInfoTable.resize(NumberOfDevices);
+    DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
+    DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
+
+    err = setupMemoryPools();
+    if (err != HSA_STATUS_SUCCESS) {
+      DP("Error when setting up memory pools");
+      return;
+    }

    for (int i = 0; i < NumberOfDevices; i++) {
      HSAQueues[i] = nullptr;