mirror of
https://github.com/intel/llvm.git
synced 2026-01-19 09:31:59 +08:00
[Offload] Remove handling for device memory pool (#163629)
Summary: This was a lot of code that was only used for upstream LLVM builds of AMDGPU offloading. We have a generic and fast `malloc` in `libc` now so just use that. Simplifies code, can be added back if we start providing alternate forms but I don't think there's a single use-case that would justify it yet.
This commit is contained in:
@@ -21,7 +21,6 @@ enum class DeviceDebugKind : uint32_t {
|
||||
Assertion = 1U << 0,
|
||||
FunctionTracing = 1U << 1,
|
||||
CommonIssues = 1U << 2,
|
||||
AllocationTracker = 1U << 3,
|
||||
PGODump = 1U << 4,
|
||||
};
|
||||
|
||||
@@ -36,27 +35,6 @@ struct DeviceEnvironmentTy {
|
||||
uint64_t HardwareParallelism;
|
||||
};
|
||||
|
||||
struct DeviceMemoryPoolTy {
|
||||
void *Ptr;
|
||||
uint64_t Size;
|
||||
};
|
||||
|
||||
struct DeviceMemoryPoolTrackingTy {
|
||||
uint64_t NumAllocations;
|
||||
uint64_t AllocationTotal;
|
||||
uint64_t AllocationMin;
|
||||
uint64_t AllocationMax;
|
||||
|
||||
void combine(DeviceMemoryPoolTrackingTy &Other) {
|
||||
NumAllocations += Other.NumAllocations;
|
||||
AllocationTotal += Other.AllocationTotal;
|
||||
AllocationMin = AllocationMin > Other.AllocationMin ? Other.AllocationMin
|
||||
: AllocationMin;
|
||||
AllocationMax = AllocationMax < Other.AllocationMax ? Other.AllocationMax
|
||||
: AllocationMax;
|
||||
}
|
||||
};
|
||||
|
||||
// NOTE: Please don't change the order of those members as their indices are
|
||||
// used in the middle end. Always add the new data member at the end.
|
||||
// Different from KernelEnvironmentTy below, this structure contains members
|
||||
|
||||
@@ -3109,17 +3109,6 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
|
||||
StackSize = Value;
|
||||
return Plugin::success();
|
||||
}
|
||||
Error getDeviceHeapSize(uint64_t &Value) override {
|
||||
Value = DeviceMemoryPoolSize;
|
||||
return Plugin::success();
|
||||
}
|
||||
Error setDeviceHeapSize(uint64_t Value) override {
|
||||
for (DeviceImageTy *Image : LoadedImages)
|
||||
if (auto Err = setupDeviceMemoryPool(Plugin, *Image, Value))
|
||||
return Err;
|
||||
DeviceMemoryPoolSize = Value;
|
||||
return Plugin::success();
|
||||
}
|
||||
Error getDeviceMemorySize(uint64_t &Value) override {
|
||||
for (AMDGPUMemoryPoolTy *Pool : AllMemoryPools) {
|
||||
if (Pool->isGlobal()) {
|
||||
@@ -3321,9 +3310,6 @@ private:
|
||||
/// Reference to the host device.
|
||||
AMDHostDeviceTy &HostDevice;
|
||||
|
||||
/// The current size of the global device memory pool (managed by us).
|
||||
uint64_t DeviceMemoryPoolSize = 1L << 29L /*512MB=*/;
|
||||
|
||||
/// The current size of the stack that will be used in cases where it could
|
||||
/// not be statically determined.
|
||||
uint64_t StackSize = 16 * 1024 /* 16 KB */;
|
||||
|
||||
@@ -819,10 +819,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
|
||||
Error unloadBinary(DeviceImageTy *Image);
|
||||
virtual Error unloadBinaryImpl(DeviceImageTy *Image) = 0;
|
||||
|
||||
/// Setup the global device memory pool, if the plugin requires one.
|
||||
Error setupDeviceMemoryPool(GenericPluginTy &Plugin, DeviceImageTy &Image,
|
||||
uint64_t PoolSize);
|
||||
|
||||
// Setup the RPC server for this device if needed. This may not run on some
|
||||
// plugins like the CPU targets. By default, it will not be executed so it is
|
||||
// up to the target to override this using the shouldSetupRPCServer function.
|
||||
@@ -1067,6 +1063,15 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
|
||||
|
||||
virtual Error getDeviceStackSize(uint64_t &V) = 0;
|
||||
|
||||
virtual Error getDeviceHeapSize(uint64_t &V) {
|
||||
return Plugin::error(error::ErrorCode::UNSUPPORTED,
|
||||
"%s not supported by platform", __func__);
|
||||
}
|
||||
virtual Error setDeviceHeapSize(uint64_t V) {
|
||||
return Plugin::error(error::ErrorCode::UNSUPPORTED,
|
||||
"%s not supported by platform", __func__);
|
||||
}
|
||||
|
||||
/// Returns true if current plugin architecture is an APU
|
||||
/// and unified_shared_memory was not requested by the program.
|
||||
bool useAutoZeroCopy();
|
||||
@@ -1159,12 +1164,6 @@ private:
|
||||
/// plugin can implement the setters as no-op and setting the output
|
||||
/// value to zero for the getters.
|
||||
virtual Error setDeviceStackSize(uint64_t V) = 0;
|
||||
virtual Error getDeviceHeapSize(uint64_t &V) = 0;
|
||||
virtual Error setDeviceHeapSize(uint64_t V) = 0;
|
||||
|
||||
/// Indicate whether the device should setup the global device memory pool. If
|
||||
/// false is return the value on the device will be uninitialized.
|
||||
virtual bool shouldSetupDeviceMemoryPool() const { return true; }
|
||||
|
||||
/// Indicate whether or not the device should setup the RPC server. This is
|
||||
/// only necessary for unhosted targets like the GPU.
|
||||
@@ -1251,10 +1250,6 @@ protected:
|
||||
/// Internal representation for OMPT device (initialize & finalize)
|
||||
std::atomic<bool> OmptInitialized;
|
||||
#endif
|
||||
|
||||
private:
|
||||
DeviceMemoryPoolTy DeviceMemoryPool = {nullptr, 0};
|
||||
DeviceMemoryPoolTrackingTy DeviceMemoryPoolTracking = {0, 0, ~0U, 0};
|
||||
};
|
||||
|
||||
/// Class implementing common functionalities of offload plugins. Each plugin
|
||||
|
||||
@@ -795,19 +795,6 @@ Error GenericDeviceTy::unloadBinary(DeviceImageTy *Image) {
|
||||
if (auto Err = callGlobalDestructors(Plugin, *Image))
|
||||
return Err;
|
||||
|
||||
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
|
||||
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
|
||||
DeviceMemoryPoolTrackingTy ImageDeviceMemoryPoolTracking = {0, 0, ~0U, 0};
|
||||
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
|
||||
sizeof(DeviceMemoryPoolTrackingTy),
|
||||
&ImageDeviceMemoryPoolTracking);
|
||||
if (auto Err =
|
||||
GHandler.readGlobalFromDevice(*this, *Image, TrackerGlobal)) {
|
||||
consumeError(std::move(Err));
|
||||
}
|
||||
DeviceMemoryPoolTracking.combine(ImageDeviceMemoryPoolTracking);
|
||||
}
|
||||
|
||||
GenericGlobalHandlerTy &Handler = Plugin.getGlobalHandler();
|
||||
auto ProfOrErr = Handler.readProfilingGlobals(*this, *Image);
|
||||
if (!ProfOrErr)
|
||||
@@ -833,22 +820,6 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
|
||||
return Err;
|
||||
LoadedImages.clear();
|
||||
|
||||
if (OMPX_DebugKind.get() & uint32_t(DeviceDebugKind::AllocationTracker)) {
|
||||
// TODO: Write this by default into a file.
|
||||
printf("\n\n|-----------------------\n"
|
||||
"| Device memory tracker:\n"
|
||||
"|-----------------------\n"
|
||||
"| #Allocations: %lu\n"
|
||||
"| Byes allocated: %lu\n"
|
||||
"| Minimal allocation: %lu\n"
|
||||
"| Maximal allocation: %lu\n"
|
||||
"|-----------------------\n\n\n",
|
||||
DeviceMemoryPoolTracking.NumAllocations,
|
||||
DeviceMemoryPoolTracking.AllocationTotal,
|
||||
DeviceMemoryPoolTracking.AllocationMin,
|
||||
DeviceMemoryPoolTracking.AllocationMax);
|
||||
}
|
||||
|
||||
// Delete the memory manager before deinitializing the device. Otherwise,
|
||||
// we may delete device allocations after the device is deinitialized.
|
||||
if (MemoryManager)
|
||||
@@ -901,18 +872,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
|
||||
// Add the image to list.
|
||||
LoadedImages.push_back(Image);
|
||||
|
||||
// Setup the global device memory pool if needed.
|
||||
if (!Plugin.getRecordReplay().isReplaying() &&
|
||||
shouldSetupDeviceMemoryPool()) {
|
||||
uint64_t HeapSize;
|
||||
auto SizeOrErr = getDeviceHeapSize(HeapSize);
|
||||
if (SizeOrErr) {
|
||||
REPORT("No global device memory pool due to error: %s\n",
|
||||
toString(std::move(SizeOrErr)).data());
|
||||
} else if (auto Err = setupDeviceMemoryPool(Plugin, *Image, HeapSize))
|
||||
return std::move(Err);
|
||||
}
|
||||
|
||||
if (auto Err = setupRPCServer(Plugin, *Image))
|
||||
return std::move(Err);
|
||||
|
||||
@@ -936,51 +895,6 @@ Expected<DeviceImageTy *> GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
|
||||
return Image;
|
||||
}
|
||||
|
||||
Error GenericDeviceTy::setupDeviceMemoryPool(GenericPluginTy &Plugin,
|
||||
DeviceImageTy &Image,
|
||||
uint64_t PoolSize) {
|
||||
// Free the old pool, if any.
|
||||
if (DeviceMemoryPool.Ptr) {
|
||||
if (auto Err = dataDelete(DeviceMemoryPool.Ptr,
|
||||
TargetAllocTy::TARGET_ALLOC_DEVICE))
|
||||
return Err;
|
||||
}
|
||||
|
||||
DeviceMemoryPool.Size = PoolSize;
|
||||
auto AllocOrErr = dataAlloc(PoolSize, /*HostPtr=*/nullptr,
|
||||
TargetAllocTy::TARGET_ALLOC_DEVICE);
|
||||
if (AllocOrErr) {
|
||||
DeviceMemoryPool.Ptr = *AllocOrErr;
|
||||
} else {
|
||||
auto Err = AllocOrErr.takeError();
|
||||
REPORT("Failure to allocate device memory for global memory pool: %s\n",
|
||||
toString(std::move(Err)).data());
|
||||
DeviceMemoryPool.Ptr = nullptr;
|
||||
DeviceMemoryPool.Size = 0;
|
||||
}
|
||||
|
||||
// Create the metainfo of the device environment global.
|
||||
GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
|
||||
if (!GHandler.isSymbolInImage(*this, Image,
|
||||
"__omp_rtl_device_memory_pool_tracker")) {
|
||||
DP("Skip the memory pool as there is no tracker symbol in the image.");
|
||||
return Error::success();
|
||||
}
|
||||
|
||||
GlobalTy TrackerGlobal("__omp_rtl_device_memory_pool_tracker",
|
||||
sizeof(DeviceMemoryPoolTrackingTy),
|
||||
&DeviceMemoryPoolTracking);
|
||||
if (auto Err = GHandler.writeGlobalToDevice(*this, Image, TrackerGlobal))
|
||||
return Err;
|
||||
|
||||
// Create the metainfo of the device environment global.
|
||||
GlobalTy DevEnvGlobal("__omp_rtl_device_memory_pool",
|
||||
sizeof(DeviceMemoryPoolTy), &DeviceMemoryPool);
|
||||
|
||||
// Write device environment values to the device.
|
||||
return GHandler.writeGlobalToDevice(*this, Image, DevEnvGlobal);
|
||||
}
|
||||
|
||||
Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
|
||||
DeviceImageTy &Image) {
|
||||
// The plugin either does not need an RPC server or it is unavailable.
|
||||
|
||||
@@ -1235,11 +1235,6 @@ struct CUDADeviceTy : public GenericDeviceTy {
|
||||
return Info;
|
||||
}
|
||||
|
||||
virtual bool shouldSetupDeviceMemoryPool() const override {
|
||||
/// We use the CUDA malloc for now.
|
||||
return false;
|
||||
}
|
||||
|
||||
/// Getters and setters for stack and heap sizes.
|
||||
Error getDeviceStackSize(uint64_t &Value) override {
|
||||
return getCtxLimit(CU_LIMIT_STACK_SIZE, Value);
|
||||
|
||||
@@ -380,9 +380,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
|
||||
return Info;
|
||||
}
|
||||
|
||||
/// This plugin should not setup the device environment or memory pool.
|
||||
virtual bool shouldSetupDeviceMemoryPool() const override { return false; };
|
||||
|
||||
/// Getters and setters for stack size and heap size not relevant.
|
||||
Error getDeviceStackSize(uint64_t &Value) override {
|
||||
Value = 0;
|
||||
@@ -391,11 +388,6 @@ struct GenELF64DeviceTy : public GenericDeviceTy {
|
||||
Error setDeviceStackSize(uint64_t Value) override {
|
||||
return Plugin::success();
|
||||
}
|
||||
Error getDeviceHeapSize(uint64_t &Value) override {
|
||||
Value = 0;
|
||||
return Plugin::success();
|
||||
}
|
||||
Error setDeviceHeapSize(uint64_t Value) override { return Plugin::success(); }
|
||||
|
||||
private:
|
||||
/// Grid values for Generic ELF64 plugins.
|
||||
|
||||
@@ -4,6 +4,8 @@
|
||||
// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic
|
||||
// RUN: %libomptarget-compileoptxx-run-and-check-generic
|
||||
|
||||
// REQUIRES: libc
|
||||
|
||||
#include <iostream>
|
||||
|
||||
template <typename LOOP_BODY>
|
||||
|
||||
@@ -10,7 +10,7 @@ int main() {
|
||||
int Threads = 64;
|
||||
int Teams = 10;
|
||||
|
||||
// Allocate ~55MB on the device.
|
||||
// Allocate ~160 KiB on the device.
|
||||
#pragma omp target map(from : DP)
|
||||
DP = (long unsigned *)malloc(sizeof(long unsigned) * N * Threads * Teams);
|
||||
|
||||
|
||||
@@ -14,18 +14,12 @@
|
||||
|
||||
#include "DeviceTypes.h"
|
||||
|
||||
// Forward declaration.
|
||||
struct KernelEnvironmentTy;
|
||||
|
||||
namespace ompx {
|
||||
|
||||
namespace allocator {
|
||||
|
||||
static uint64_t constexpr ALIGNMENT = 16;
|
||||
|
||||
/// Initialize the allocator according to \p KernelEnvironment
|
||||
void init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment);
|
||||
|
||||
/// Allocate \p Size bytes.
|
||||
[[gnu::alloc_size(1), gnu::assume_aligned(ALIGNMENT), gnu::malloc]] void *
|
||||
alloc(uint64_t Size);
|
||||
|
||||
@@ -18,42 +18,36 @@
|
||||
#include "Synchronization.h"
|
||||
|
||||
using namespace ompx;
|
||||
using namespace allocator;
|
||||
|
||||
[[gnu::used, gnu::retain, gnu::weak,
|
||||
gnu::visibility(
|
||||
"protected")]] DeviceMemoryPoolTy __omp_rtl_device_memory_pool;
|
||||
[[gnu::used, gnu::retain, gnu::weak,
|
||||
gnu::visibility("protected")]] DeviceMemoryPoolTrackingTy
|
||||
__omp_rtl_device_memory_pool_tracker;
|
||||
// Provide a default implementation of malloc / free for AMDGPU platforms built
|
||||
// without 'libc' support.
|
||||
extern "C" {
|
||||
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
|
||||
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
|
||||
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
|
||||
#else
|
||||
[[gnu::leaf]] void *malloc(size_t Size);
|
||||
[[gnu::leaf]] void free(void *Ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
/// Stateless bump allocator that uses the __omp_rtl_device_memory_pool
|
||||
/// directly.
|
||||
static constexpr uint64_t MEMORY_SIZE = /* 1 MiB */ 1024 * 1024;
|
||||
alignas(ALIGNMENT) static uint8_t Memory[MEMORY_SIZE] = {0};
|
||||
|
||||
// Fallback bump pointer interface for platforms without a functioning
|
||||
// allocator.
|
||||
struct BumpAllocatorTy final {
|
||||
uint64_t Offset = 0;
|
||||
|
||||
void *alloc(uint64_t Size) {
|
||||
Size = utils::roundUp(Size, uint64_t(allocator::ALIGNMENT));
|
||||
|
||||
if (config::isDebugMode(DeviceDebugKind::AllocationTracker)) {
|
||||
atomic::add(&__omp_rtl_device_memory_pool_tracker.NumAllocations, 1,
|
||||
atomic::seq_cst);
|
||||
atomic::add(&__omp_rtl_device_memory_pool_tracker.AllocationTotal, Size,
|
||||
atomic::seq_cst);
|
||||
atomic::min(&__omp_rtl_device_memory_pool_tracker.AllocationMin, Size,
|
||||
atomic::seq_cst);
|
||||
atomic::max(&__omp_rtl_device_memory_pool_tracker.AllocationMax, Size,
|
||||
atomic::seq_cst);
|
||||
}
|
||||
|
||||
uint64_t *Data =
|
||||
reinterpret_cast<uint64_t *>(&__omp_rtl_device_memory_pool.Ptr);
|
||||
uint64_t End =
|
||||
reinterpret_cast<uint64_t>(Data) + __omp_rtl_device_memory_pool.Size;
|
||||
|
||||
uint64_t OldData = atomic::add(Data, Size, atomic::seq_cst);
|
||||
if (OldData + Size > End)
|
||||
uint64_t OldData = atomic::add(&Offset, Size, atomic::seq_cst);
|
||||
if (OldData + Size >= MEMORY_SIZE)
|
||||
__builtin_trap();
|
||||
|
||||
return reinterpret_cast<void *>(OldData);
|
||||
return &Memory[OldData];
|
||||
}
|
||||
|
||||
void free(void *) {}
|
||||
@@ -65,13 +59,20 @@ BumpAllocatorTy BumpAllocator;
|
||||
///
|
||||
///{
|
||||
|
||||
void allocator::init(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment) {
|
||||
// TODO: Check KernelEnvironment for an allocator choice as soon as we have
|
||||
// more than one.
|
||||
void *allocator::alloc(uint64_t Size) {
|
||||
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
|
||||
return BumpAllocator.alloc(Size);
|
||||
#else
|
||||
return ::malloc(Size);
|
||||
#endif
|
||||
}
|
||||
|
||||
void *allocator::alloc(uint64_t Size) { return BumpAllocator.alloc(Size); }
|
||||
|
||||
void allocator::free(void *Ptr) { BumpAllocator.free(Ptr); }
|
||||
void allocator::free(void *Ptr) {
|
||||
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
|
||||
BumpAllocator.free(Ptr);
|
||||
#else
|
||||
::free(Ptr);
|
||||
#endif
|
||||
}
|
||||
|
||||
///}
|
||||
|
||||
@@ -41,7 +41,6 @@ inititializeRuntime(bool IsSPMD, KernelEnvironmentTy &KernelEnvironment,
|
||||
synchronize::init(IsSPMD);
|
||||
mapping::init(IsSPMD);
|
||||
state::init(IsSPMD, KernelEnvironment, KernelLaunchEnvironment);
|
||||
allocator::init(IsSPMD, KernelEnvironment);
|
||||
workshare::init(IsSPMD);
|
||||
}
|
||||
|
||||
|
||||
@@ -100,7 +100,7 @@ void *omp_alloc(size_t size, omp_allocator_handle_t allocator) {
|
||||
case omp_const_mem_alloc:
|
||||
case omp_high_bw_mem_alloc:
|
||||
case omp_low_lat_mem_alloc:
|
||||
return malloc(size);
|
||||
return ompx::allocator::alloc(size);
|
||||
default:
|
||||
return nullptr;
|
||||
}
|
||||
@@ -113,7 +113,7 @@ void omp_free(void *ptr, omp_allocator_handle_t allocator) {
|
||||
case omp_const_mem_alloc:
|
||||
case omp_high_bw_mem_alloc:
|
||||
case omp_low_lat_mem_alloc:
|
||||
free(ptr);
|
||||
ompx::allocator::free(ptr);
|
||||
return;
|
||||
case omp_null_allocator:
|
||||
default:
|
||||
|
||||
@@ -44,26 +44,6 @@ using namespace ompx;
|
||||
|
||||
namespace {
|
||||
|
||||
/// Fallback implementations are missing to trigger a link time error.
|
||||
/// Implementations for new devices, including the host, should go into a
|
||||
/// dedicated begin/end declare variant.
|
||||
///
|
||||
///{
|
||||
extern "C" {
|
||||
#if defined(__AMDGPU__) && !defined(OMPTARGET_HAS_LIBC)
|
||||
|
||||
[[gnu::weak]] void *malloc(size_t Size) { return allocator::alloc(Size); }
|
||||
[[gnu::weak]] void free(void *Ptr) { allocator::free(Ptr); }
|
||||
|
||||
#else
|
||||
|
||||
[[gnu::weak, gnu::leaf]] void *malloc(size_t Size);
|
||||
[[gnu::weak, gnu::leaf]] void free(void *Ptr);
|
||||
|
||||
#endif
|
||||
}
|
||||
///}
|
||||
|
||||
/// A "smart" stack in shared memory.
|
||||
///
|
||||
/// The stack exposes a malloc/free interface but works like a stack internally.
|
||||
@@ -171,13 +151,13 @@ void memory::freeShared(void *Ptr, uint64_t Bytes, const char *Reason) {
|
||||
}
|
||||
|
||||
void *memory::allocGlobal(uint64_t Bytes, const char *Reason) {
|
||||
void *Ptr = malloc(Bytes);
|
||||
void *Ptr = allocator::alloc(Bytes);
|
||||
if (config::isDebugMode(DeviceDebugKind::CommonIssues) && Ptr == nullptr)
|
||||
printf("nullptr returned by malloc!\n");
|
||||
return Ptr;
|
||||
}
|
||||
|
||||
void memory::freeGlobal(void *Ptr, const char *Reason) { free(Ptr); }
|
||||
void memory::freeGlobal(void *Ptr, const char *Reason) { allocator::free(Ptr); }
|
||||
|
||||
///}
|
||||
|
||||
|
||||
@@ -1521,5 +1521,4 @@ debugging features are supported.
|
||||
|
||||
* Enable debugging assertions in the device. ``0x01``
|
||||
* Enable diagnosing common problems during offloading . ``0x4``
|
||||
* Enable device malloc statistics (amdgpu only). ``0x8``
|
||||
* Dump device PGO counters (only if PGO on GPU is enabled). ``0x10``
|
||||
|
||||
Reference in New Issue
Block a user