//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Functionality for managing devices that are handled by RTL plugins.
//
//===----------------------------------------------------------------------===//

#include "device.h"
#include "OffloadEntry.h"
#include "OpenMP/Mapping.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
#include "PluginManager.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "omptarget.h"
#include "private.h"
#include "rtl.h"

#include "Shared/EnvironmentVar.h"
#include "llvm/Support/Error.h"

#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <string>
#include <thread>

#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif

using namespace llvm::omp::target::plugin;
using namespace llvm::omp::target::debug;

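// Record an event on this mapping's data transfers so later operations can
// order themselves against the (possibly still running) transfer. The event is
// created on first use and cached in the mapping entry.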
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
                                            AsyncInfoTy &AsyncInfo) const {
  // First, check if the user disabled atomic map transfer/malloc/dealloc.
  if (!MappingConfig::get().UseEventsForAtomicTransfers)
    return OFFLOAD_SUCCESS;

  void *Event = getEvent();
  bool NeedNewEvent = Event == nullptr;
  if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) {
    REPORT() << "Failed to create event";
    return OFFLOAD_FAIL;
  }

  // We cannot assume the event is non-null because we do not know whether the
  // target supports events. If a target does not, recordEvent should always
  // return success.
  if (Device.recordEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
    REPORT() << "Failed to set dependence on event " << Event;
    return OFFLOAD_FAIL;
  }

  if (NeedNewEvent)
    setEvent(Event);

  return OFFLOAD_SUCCESS;
}

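// DeviceID is libomptarget's global device number; RTLDeviceID is the number
// the owning plugin (RTL) uses for the same device.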
DeviceTy::DeviceTy(GenericPluginTy *RTL, int32_t DeviceID, int32_t RTLDeviceID)
    : DeviceID(DeviceID), RTL(RTL), RTLDeviceID(RTLDeviceID),
      MappingInfo(*this) {}

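// On destruction, dump the remaining target pointer mappings if the user
// requested mapping tables via the info flags.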
DeviceTy::~DeviceTy() {
  if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE))
    return;

  ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
  dumpTargetPointerMappings(&Loc, *this);
}

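// Initialize the device through the plugin and, if LIBOMPTARGET_RECORD is
// set, enable kernel record-replay for it.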
llvm::Error DeviceTy::init() {
  int32_t Ret = RTL->init_device(RTLDeviceID);
  if (Ret != OFFLOAD_SUCCESS)
    return error::createOffloadError(error::ErrorCode::BACKEND_FAILURE,
                                     "failed to initialize device %d\n",
                                     DeviceID);

  // Enables recording kernels if set.
  BoolEnvar OMPX_RecordKernel("LIBOMPTARGET_RECORD", false);
  if (OMPX_RecordKernel) {
    // Enables saving the device memory kernel output post execution if set.
    BoolEnvar OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT", false);

    uint64_t ReqPtrArgOffset;
    RTL->initialize_record_replay(RTLDeviceID, 0, nullptr, true,
                                  OMPX_ReplaySaveOutput, ReqPtrArgOffset);
  }

  return llvm::Error::success();
}

// Extract the mapping of host function pointers to device function pointers
// from the entry table. Functions marked as 'indirect' in OpenMP will have
// offloading entries generated for them which map the host's function pointer
// to a global containing the corresponding function pointer on the device.
static llvm::Expected<std::pair<void *, uint64_t>>
setupIndirectCallTable(DeviceTy &Device, __tgt_device_image *Image,
                       __tgt_device_binary Binary) {
  AsyncInfoTy AsyncInfo(Device);
  llvm::ArrayRef<llvm::offloading::EntryTy> Entries(Image->EntriesBegin,
                                                    Image->EntriesEnd);
  llvm::SmallVector<std::pair<void *, void *>> IndirectCallTable;
  for (const auto &Entry : Entries) {
    if (Entry.Kind != llvm::object::OffloadKind::OFK_OpenMP ||
        Entry.Size == 0 ||
        (!(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT) &&
         !(Entry.Flags & OMP_DECLARE_TARGET_INDIRECT_VTABLE)))
      continue;

    size_t PtrSize = sizeof(void *);
    if (Entry.Flags & OMP_DECLARE_TARGET_INDIRECT_VTABLE) {
      // This is a VTable entry: the current entry is the first index of the
      // VTable and Entry.Size is the total size of the VTable. Unlike the
      // indirect function case below, the global is not of size Entry.Size
      // but of size PtrSize (sizeof(void *)).
      void *Vtable;
      void *Res;
      if (Device.RTL->get_global(Binary, PtrSize, Entry.SymbolName, &Vtable))
        return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
                                         "failed to load %s", Entry.SymbolName);

      if (Device.retrieveData(&Res, Vtable, PtrSize, AsyncInfo))
        return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
                                         "failed to load %s", Entry.SymbolName);
      if (Device.synchronize(AsyncInfo))
        return error::createOffloadError(
            error::ErrorCode::INVALID_BINARY,
            "failed to synchronize after retrieving %s", Entry.SymbolName);
      // Compute and emplace a host/device pointer pair for every VTable slot,
      // starting from the VTable's base address.
      for (uint64_t i = 0; i < Entry.Size / PtrSize; ++i) {
        auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
        HstPtr = reinterpret_cast<void *>(
            reinterpret_cast<uintptr_t>(Entry.Address) + i * PtrSize);
        DevPtr = reinterpret_cast<void *>(reinterpret_cast<uintptr_t>(Res) +
                                          i * PtrSize);
      }
    } else {
      // Indirect function case: Entry.Size should equal PtrSize since we're
      // dealing with a single function pointer (not a VTable).
      assert(Entry.Size == PtrSize && "Global not a function pointer?");
      auto &[HstPtr, DevPtr] = IndirectCallTable.emplace_back();
      void *Ptr;
      if (Device.RTL->get_global(Binary, Entry.Size, Entry.SymbolName, &Ptr))
        return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
                                         "failed to load %s", Entry.SymbolName);

      HstPtr = Entry.Address;
      if (Device.retrieveData(&DevPtr, Ptr, Entry.Size, AsyncInfo))
        return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
                                         "failed to load %s", Entry.SymbolName);
    }
    if (Device.synchronize(AsyncInfo))
      return error::createOffloadError(
          error::ErrorCode::INVALID_BINARY,
          "failed to synchronize after retrieving %s", Entry.SymbolName);
  }

  // If we do not have any indirect globals, we exit early.
  if (IndirectCallTable.empty())
    return std::pair{nullptr, 0};

  // Sort the array to allow for more efficient lookup of device pointers.
  llvm::sort(IndirectCallTable,
             [](const auto &x, const auto &y) { return x.first < y.first; });

  uint64_t TableSize =
      IndirectCallTable.size() * sizeof(std::pair<void *, void *>);
  void *DevicePtr = Device.allocData(TableSize, nullptr, TARGET_ALLOC_DEVICE);
  if (Device.submitData(DevicePtr, IndirectCallTable.data(), TableSize,
                        AsyncInfo))
    return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
                                     "failed to copy data");
  return std::pair<void *, uint64_t>(DevicePtr, IndirectCallTable.size());
}

// Load binary to device and perform global initialization if needed.
llvm::Expected<__tgt_device_binary>
DeviceTy::loadBinary(__tgt_device_image *Img) {
  __tgt_device_binary Binary;

  if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS)
    return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
                                     "failed to load binary %p", Img);

  // This symbol is optional; if the image does not provide a device
  // environment, there is nothing else to initialize.
  void *DeviceEnvironmentPtr;
  if (RTL->get_global(Binary, sizeof(DeviceEnvironmentTy),
                      "__omp_rtl_device_environment", &DeviceEnvironmentPtr))
    return Binary;

  // Obtain a table mapping host function pointers to device function pointers.
  auto CallTablePairOrErr = setupIndirectCallTable(*this, Img, Binary);
  if (!CallTablePairOrErr)
    return CallTablePairOrErr.takeError();

  GenericDeviceTy &GenericDevice = RTL->getDevice(RTLDeviceID);
  DeviceEnvironmentTy DeviceEnvironment;
  DeviceEnvironment.DeviceDebugKind = GenericDevice.getDebugKind();
  DeviceEnvironment.NumDevices = RTL->getNumDevices();
  // TODO: The device ID used here is not the real device ID used by OpenMP.
  DeviceEnvironment.DeviceNum = RTLDeviceID;
  DeviceEnvironment.DynamicMemSize = GenericDevice.getDynamicMemorySize();
  DeviceEnvironment.ClockFrequency = GenericDevice.getClockFrequency();
  DeviceEnvironment.IndirectCallTable =
      reinterpret_cast<uintptr_t>(CallTablePairOrErr->first);
  DeviceEnvironment.IndirectCallTableSize = CallTablePairOrErr->second;
  DeviceEnvironment.HardwareParallelism =
      GenericDevice.getHardwareParallelism();

  AsyncInfoTy AsyncInfo(*this);
  if (submitData(DeviceEnvironmentPtr, &DeviceEnvironment,
                 sizeof(DeviceEnvironment), AsyncInfo))
    return error::createOffloadError(error::ErrorCode::INVALID_BINARY,
                                     "failed to copy data");

  return Binary;
}

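// Allocate Size bytes of device memory of the given TARGET_ALLOC_* Kind,
// optionally associated with the host pointer HstPtr.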
void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
  /// RAII to establish tool anchors before and after data allocation.
  void *TargetPtr = nullptr;
  OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII(
                    RegionInterface.getCallbacks<ompt_target_data_alloc>(),
                    DeviceID, HstPtr, &TargetPtr, Size,
                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  TargetPtr = RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
  return TargetPtr;
}

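// Free a device allocation starting at TgtAllocBegin that was previously
// returned by allocData with the same Kind.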
int32_t DeviceTy::deleteData(void *TgtAllocBegin, int32_t Kind) {
  /// RAII to establish tool anchors before and after data deletion.
  OMPT_IF_BUILT(InterfaceRAII TargetDataDeleteRAII(
                    RegionInterface.getCallbacks<ompt_target_data_delete>(),
                    DeviceID, TgtAllocBegin,
                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  return RTL->data_delete(RTLDeviceID, TgtAllocBegin, Kind);
}

// Submit data to device.
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
                             AsyncInfoTy &AsyncInfo, HostDataToTargetTy *Entry,
                             MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
  if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER)
    MappingInfo.printCopyInfo(TgtPtrBegin, HstPtrBegin, Size, /*H2D=*/true,
                              Entry, HDTTMapPtr);

  /// RAII to establish tool anchors before and after data submission.
  OMPT_IF_BUILT(
      InterfaceRAII TargetDataSubmitRAII(
          RegionInterface.getCallbacks<ompt_target_data_transfer_to_device>(),
          omp_get_initial_device(), HstPtrBegin, DeviceID, TgtPtrBegin, Size,
          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
                                AsyncInfo);
}

// Retrieve data from device.
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
                               int64_t Size, AsyncInfoTy &AsyncInfo,
                               HostDataToTargetTy *Entry,
                               MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
  if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER)
    MappingInfo.printCopyInfo(TgtPtrBegin, HstPtrBegin, Size, /*H2D=*/false,
                              Entry, HDTTMapPtr);

  /// RAII to establish tool anchors before and after data retrieval.
  OMPT_IF_BUILT(
      InterfaceRAII TargetDataRetrieveRAII(
          RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
          DeviceID, TgtPtrBegin, omp_get_initial_device(), HstPtrBegin, Size,
          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
                                  AsyncInfo);
}

// Copy data from current device to destination device directly.
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                               int64_t Size, AsyncInfoTy &AsyncInfo) {
  /// RAII to establish tool anchors before and after data exchange.
  /// Note: Despite the fact that this is a data exchange, we use the
  /// 'from_device' operation enum (w.r.t. ompt_target_data_op_t) as there is
  /// currently no better alternative. It is still possible to distinguish this
  /// scenario from a real data retrieve by checking if both involved
  /// device numbers are less than omp_get_num_devices().
  OMPT_IF_BUILT(
      InterfaceRAII TargetDataExchangeRAII(
          RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
          RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size,
          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
  if (!AsyncInfo) {
    return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
                              Size);
  }
  return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
                                  DstPtr, Size, AsyncInfo);
}

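// Enforce ordering of the data operations queued in AsyncInfo via the
// plugin's data_fence entry point.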
int32_t DeviceTy::dataFence(AsyncInfoTy &AsyncInfo) {
  return RTL->data_fence(RTLDeviceID, AsyncInfo);
}

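// Notify the plugin that the host range [HstPtr, HstPtr + Size) has been
// mapped; plugins may use this, for instance, to register the host memory.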
int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
  ODBG(ODT_Mapping) << "Notifying about new mapping: HstPtr=" << HstPtr
                    << ", Size=" << Size;

  if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) {
    REPORT() << "Notifying about data mapping failed.";
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

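// Notify the plugin that the host pointer HstPtr is no longer mapped.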
int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) {
  ODBG(ODT_Mapping) << "Notifying about an unmapping: HstPtr=" << HstPtr;

  if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) {
    REPORT() << "Notifying about data unmapping failed.";
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Run region on device.
int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
                               ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
                               AsyncInfoTy &AsyncInfo) {
  return RTL->launch_kernel(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
                            &KernelArgs, AsyncInfo);
}

// Print information about the device.
bool DeviceTy::printDeviceInfo() {
  RTL->print_device_info(RTLDeviceID);
  return true;
}

// Whether data can be copied to DstDevice directly.
bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
  if (RTL != DstDevice.RTL)
    return false;

  if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID))
    return true;
  return false;
}

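// Block until all operations queued in AsyncInfo have completed.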
int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
  return RTL->synchronize(RTLDeviceID, AsyncInfo);
}

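// Non-blocking check whether the operations queued in AsyncInfo have
// completed.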
int32_t DeviceTy::queryAsync(AsyncInfoTy &AsyncInfo) {
  return RTL->query_async(RTLDeviceID, AsyncInfo);
}

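// Thin wrappers around the plugin's event API; see addEventIfNecessary above
// for how events are used to order atomic map transfers.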
int32_t DeviceTy::createEvent(void **Event) {
  return RTL->create_event(RTLDeviceID, Event);
}

int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) {
  return RTL->record_event(RTLDeviceID, Event, AsyncInfo);
}

int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) {
  return RTL->wait_event(RTLDeviceID, Event, AsyncInfo);
}

int32_t DeviceTy::syncEvent(void *Event) {
  return RTL->sync_event(RTLDeviceID, Event);
}

int32_t DeviceTy::destroyEvent(void *Event) {
  return RTL->destroy_event(RTLDeviceID, Event);
}

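// Print all offload entries registered for this device to stderr, tagging
// each one as a kernel, a link, or a global variable.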
void DeviceTy::dumpOffloadEntries() {
  fprintf(stderr, "Device %i offload entries:\n", DeviceID);
  for (auto &It : *DeviceOffloadEntries.getExclusiveAccessor()) {
    const char *Kind = "kernel";
    if (It.second.isLink())
      Kind = "link";
    else if (It.second.isGlobal())
      Kind = "global var.";
    fprintf(stderr, " %11s: %s\n", Kind, It.second.getNameAsCStr());
  }
}

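// Automatic zero-copy is only considered when the program did not already
// require unified shared memory; otherwise the plugin decides.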
bool DeviceTy::useAutoZeroCopy() {
  if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY)
    return false;
  return RTL->use_auto_zero_copy(RTLDeviceID);
}

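// Query whether the host range [Ptr, Ptr + Size) is accessible from this
// device.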
bool DeviceTy::isAccessiblePtr(const void *Ptr, size_t Size) {
  return RTL->is_accessible_ptr(RTLDeviceID, Ptr, Size);
}