mirror of
https://github.com/intel/llvm.git
synced 2026-02-03 10:39:35 +08:00
[AMDGPU][Libomptarget] Collect allocatable memory pools using HSA
The logic closely follows that of system.cpp, with one change: instead of adding all the memory pools to a device struct, it keeps only a single pool. The existing approach also always allocated memory on the first HSA pool found for a GPU. This depends on D104691. The goal of this series of patches is to remove the g_atl_machine global. The next patch will drop g_atl_machine entirely. Reviewed By: JonChesterfield Differential Revision: https://reviews.llvm.org/D104695
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
#include <cstring>
|
||||
#include <elf.h>
|
||||
#include <fstream>
|
||||
#include <functional>
|
||||
#include <iostream>
|
||||
#include <libelf.h>
|
||||
#include <list>
|
||||
@@ -104,6 +105,16 @@ template <typename C> hsa_status_t iterate_agents(C cb) {
|
||||
return hsa_iterate_agents(L, static_cast<void *>(&cb));
|
||||
}
|
||||
|
||||
template <typename C>
|
||||
hsa_status_t amd_agent_iterate_memory_pools(hsa_agent_t Agent, C cb) {
|
||||
auto L = [](hsa_amd_memory_pool_t MemoryPool, void *data) -> hsa_status_t {
|
||||
C *unwrapped = static_cast<C *>(data);
|
||||
return (*unwrapped)(MemoryPool);
|
||||
};
|
||||
|
||||
return hsa_amd_agent_iterate_memory_pools(Agent, L, static_cast<void *>(&cb));
|
||||
}
|
||||
|
||||
} // namespace hsa
|
||||
|
||||
/// Keep entries table per device
|
||||
@@ -329,18 +340,60 @@ hsa_status_t addKernArgPool(hsa_amd_memory_pool_t MemoryPool, void *Data) {
|
||||
return err;
|
||||
}
|
||||
|
||||
size_t size = 0;
|
||||
err = hsa_amd_memory_pool_get_info(MemoryPool, HSA_AMD_MEMORY_POOL_INFO_SIZE,
|
||||
&size);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
fprintf(stderr, "Get memory pool size failed: %s\n", get_error_string(err));
|
||||
return err;
|
||||
}
|
||||
|
||||
if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
|
||||
(GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT)) {
|
||||
size_t size = 0;
|
||||
err = hsa_amd_memory_pool_get_info(MemoryPool,
|
||||
HSA_AMD_MEMORY_POOL_INFO_SIZE, &size);
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
fprintf(stderr, "Get memory pool size failed: %s\n",
|
||||
get_error_string(err));
|
||||
return err;
|
||||
(GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_KERNARG_INIT) &&
|
||||
size > 0) {
|
||||
Result->push_back(MemoryPool);
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
}
|
||||
|
||||
std::pair<hsa_status_t, bool>
|
||||
isValidMemoryPool(hsa_amd_memory_pool_t MemoryPool) {
|
||||
bool AllocAllowed = false;
|
||||
hsa_status_t Err = hsa_amd_memory_pool_get_info(
|
||||
MemoryPool, HSA_AMD_MEMORY_POOL_INFO_RUNTIME_ALLOC_ALLOWED,
|
||||
&AllocAllowed);
|
||||
if (Err != HSA_STATUS_SUCCESS) {
|
||||
fprintf(stderr, "Alloc allowed in memory pool check failed: %s\n",
|
||||
get_error_string(Err));
|
||||
return {Err, false};
|
||||
}
|
||||
|
||||
return {HSA_STATUS_SUCCESS, AllocAllowed};
|
||||
}
|
||||
|
||||
template <typename AccumulatorFunc>
|
||||
hsa_status_t collectMemoryPools(const std::vector<hsa_agent_t> &Agents,
|
||||
AccumulatorFunc Func) {
|
||||
for (int DeviceId = 0; DeviceId < Agents.size(); DeviceId++) {
|
||||
hsa_status_t Err = hsa::amd_agent_iterate_memory_pools(
|
||||
Agents[DeviceId], [&](hsa_amd_memory_pool_t MemoryPool) {
|
||||
hsa_status_t Err;
|
||||
bool Valid = false;
|
||||
std::tie(Err, Valid) = isValidMemoryPool(MemoryPool);
|
||||
if (Err != HSA_STATUS_SUCCESS) {
|
||||
return Err;
|
||||
}
|
||||
if (Valid)
|
||||
Func(MemoryPool, DeviceId);
|
||||
return HSA_STATUS_SUCCESS;
|
||||
});
|
||||
|
||||
if (Err != HSA_STATUS_SUCCESS) {
|
||||
printf("[%s:%d] %s failed: %s\n", __FILE__, __LINE__,
|
||||
"Iterate all memory pools", get_error_string(Err));
|
||||
return Err;
|
||||
}
|
||||
if (size > 0)
|
||||
Result->push_back(MemoryPool);
|
||||
}
|
||||
|
||||
return HSA_STATUS_SUCCESS;
|
||||
@@ -421,6 +474,13 @@ public:
|
||||
|
||||
hsa_amd_memory_pool_t KernArgPool;
|
||||
|
||||
// fine grained memory pool for host allocations
|
||||
hsa_amd_memory_pool_t HostFineGrainedMemoryPool;
|
||||
|
||||
// fine and coarse-grained memory pools per offloading device
|
||||
std::vector<hsa_amd_memory_pool_t> DeviceFineGrainedMemoryPools;
|
||||
std::vector<hsa_amd_memory_pool_t> DeviceCoarseGrainedMemoryPools;
|
||||
|
||||
struct atmiFreePtrDeletor {
|
||||
void operator()(void *p) {
|
||||
core::Runtime::Memfree(p); // ignore failure to free
|
||||
@@ -523,6 +583,82 @@ public:
|
||||
E.Table.EntriesBegin = E.Table.EntriesEnd = 0;
|
||||
}
|
||||
|
||||
/// Record \p MemoryPool in the per-device pool tables at index \p DeviceId.
///
/// The pool's global flags select the table: a pool with the fine-grained
/// flag is stored in DeviceFineGrainedMemoryPools, otherwise a pool with the
/// coarse-grained flag is stored in DeviceCoarseGrainedMemoryPools. A pool
/// with neither flag is ignored. Storing overwrites whatever was previously
/// recorded for that device in the selected table.
///
/// \returns the failing status if the global flags cannot be queried,
/// HSA_STATUS_SUCCESS otherwise.
hsa_status_t addDeviceMemoryPool(hsa_amd_memory_pool_t MemoryPool,
                                 int DeviceId) {
  // Either table may be written below, so the index has to be valid for both.
  assert(DeviceId >= 0 &&
         static_cast<size_t>(DeviceId) < DeviceFineGrainedMemoryPools.size() &&
         static_cast<size_t>(DeviceId) <
             DeviceCoarseGrainedMemoryPools.size() &&
         "Device id out of range of the per-device memory pool tables");

  uint32_t GlobalFlags = 0;
  hsa_status_t Err = hsa_amd_memory_pool_get_info(
      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);

  if (Err != HSA_STATUS_SUCCESS) {
    return Err;
  }

  if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) {
    DeviceFineGrainedMemoryPools[DeviceId] = MemoryPool;
  } else if (GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_COARSE_GRAINED) {
    DeviceCoarseGrainedMemoryPools[DeviceId] = MemoryPool;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/// Record \p MemoryPool as the host fine-grained pool if it qualifies.
///
/// The pool is stored in HostFineGrainedMemoryPool when its global flags
/// include the fine-grained flag and its reported size is non-zero; any other
/// pool is ignored. \p DeviceId is not used by this function.
///
/// \returns the failing status if either attribute query fails,
/// HSA_STATUS_SUCCESS otherwise.
hsa_status_t addHostMemoryPool(hsa_amd_memory_pool_t MemoryPool,
                               int DeviceId) {
  uint32_t GlobalFlags = 0;
  hsa_status_t Err = hsa_amd_memory_pool_get_info(
      MemoryPool, HSA_AMD_MEMORY_POOL_INFO_GLOBAL_FLAGS, &GlobalFlags);

  if (Err != HSA_STATUS_SUCCESS) {
    return Err;
  }

  // The SIZE attribute is written as a size_t; a narrower destination would
  // be overrun on targets where size_t is wider than 32 bits.
  size_t Size = 0;
  Err = hsa_amd_memory_pool_get_info(MemoryPool,
                                     HSA_AMD_MEMORY_POOL_INFO_SIZE, &Size);
  if (Err != HSA_STATUS_SUCCESS) {
    return Err;
  }

  if ((GlobalFlags & HSA_AMD_MEMORY_POOL_GLOBAL_FLAG_FINE_GRAINED) &&
      Size > 0) {
    HostFineGrainedMemoryPool = MemoryPool;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/// Fill in the host and per-device memory pool members.
///
/// Runs core::collectMemoryPools over CPUAgents with addHostMemoryPool as the
/// accumulator, then over HSAAgents with addDeviceMemoryPool. If either
/// collection fails, a diagnostic is written to stderr and that status is
/// returned without attempting the remaining step.
///
/// \returns HSA_STATUS_SUCCESS when both collections succeed.
hsa_status_t setupMemoryPools() {
  auto RecordHostPool = [this](hsa_amd_memory_pool_t Pool, int Id) {
    return addHostMemoryPool(Pool, Id);
  };
  const hsa_status_t HostStatus =
      core::collectMemoryPools(CPUAgents, RecordHostPool);
  if (HostStatus != HSA_STATUS_SUCCESS) {
    fprintf(stderr, "HSA error in collecting memory pools for CPU: %s\n",
            get_error_string(HostStatus));
    return HostStatus;
  }

  auto RecordDevicePool = [this](hsa_amd_memory_pool_t Pool, int Id) {
    return addDeviceMemoryPool(Pool, Id);
  };
  const hsa_status_t DeviceStatus =
      core::collectMemoryPools(HSAAgents, RecordDevicePool);
  if (DeviceStatus != HSA_STATUS_SUCCESS) {
    fprintf(stderr,
            "HSA error in collecting memory pools for offload devices: %s\n",
            get_error_string(DeviceStatus));
    return DeviceStatus;
  }

  return HSA_STATUS_SUCCESS;
}
|
||||
|
||||
/// Return the entry of DeviceCoarseGrainedMemoryPools at index \p DeviceId.
///
/// Asserts that \p DeviceId is a valid index into that table.
hsa_amd_memory_pool_t getDeviceMemoryPool(int DeviceId) {
  assert(0 <= DeviceId &&
         static_cast<size_t>(DeviceId) <
             DeviceCoarseGrainedMemoryPools.size() &&
         "Invalid device Id");
  return DeviceCoarseGrainedMemoryPools[static_cast<size_t>(DeviceId)];
}
|
||||
|
||||
/// Return the pool currently stored in HostFineGrainedMemoryPool.
hsa_amd_memory_pool_t getHostMemoryPool() {
  return this->HostFineGrainedMemoryPool;
}
|
||||
|
||||
RTLDeviceInfoTy() {
|
||||
// LIBOMPTARGET_KERNEL_TRACE provides a kernel launch trace to stderr
|
||||
// anytime. You do not need a debug library build.
|
||||
@@ -581,6 +717,14 @@ public:
|
||||
deviceStateStore.resize(NumberOfDevices);
|
||||
KernelInfoTable.resize(NumberOfDevices);
|
||||
SymbolInfoTable.resize(NumberOfDevices);
|
||||
DeviceCoarseGrainedMemoryPools.resize(NumberOfDevices);
|
||||
DeviceFineGrainedMemoryPools.resize(NumberOfDevices);
|
||||
|
||||
err = setupMemoryPools();
|
||||
if (err != HSA_STATUS_SUCCESS) {
|
||||
DP("Error when setting up memory pools");
|
||||
return;
|
||||
}
|
||||
|
||||
for (int i = 0; i < NumberOfDevices; i++) {
|
||||
HSAQueues[i] = nullptr;
|
||||
|
||||
Reference in New Issue
Block a user