//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Functionality for managing devices that are handled by RTL plugins.
//
//===----------------------------------------------------------------------===//

#include "device.h"
#include "OffloadEntry.h"
#include "OpenMP/Mapping.h"
#include "OpenMP/OMPT/Callback.h"
#include "OpenMP/OMPT/Interface.h"
#include "PluginManager.h"
#include "Shared/APITypes.h"
#include "Shared/Debug.h"
#include "omptarget.h"
#include "private.h"
#include "rtl.h"

#include "Shared/EnvironmentVar.h"
#include "llvm/Support/Error.h"

#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <mutex>
#include <string>
#include <thread>

#ifdef OMPT_SUPPORT
using namespace llvm::omp::target::ompt;
#endif

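// Attach a completion event to this mapping entry, if events are enabled, so
// that later operations on the same entry can be ordered after the transfer
// already queued in AsyncInfo. A new event is created only when the entry does
// not already own one.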
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
                                            AsyncInfoTy &AsyncInfo) const {
  // First, check if the user disabled atomic map transfer/malloc/dealloc.
  if (!MappingConfig::get().UseEventsForAtomicTransfers)
    return OFFLOAD_SUCCESS;

  void *Event = getEvent();
  bool NeedNewEvent = Event == nullptr;
  if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) {
    REPORT("Failed to create event\n");
    return OFFLOAD_FAIL;
  }

  // We cannot assume the event is non-null here because we do not know whether
  // the target supports events. If a target does not, recordEvent is expected
  // to simply return success.
  if (Device.recordEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
    REPORT("Failed to set dependence on event " DPxMOD "\n", DPxPTR(Event));
    return OFFLOAD_FAIL;
  }

  if (NeedNewEvent)
    setEvent(Event);

  return OFFLOAD_SUCCESS;
}
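
// DeviceID is the user-observable device number, while RTLDeviceID is the
// device's index within the plugin (RTL) that owns it.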
DeviceTy::DeviceTy(PluginAdaptorTy *RTL, int32_t DeviceID, int32_t RTLDeviceID)
    : DeviceID(DeviceID), RTL(RTL), RTLDeviceID(RTLDeviceID),
      MappingInfo(*this) {}
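
// On destruction, dump the host-target pointer mapping table if the user
// requested it via the OMP_INFOTYPE_DUMP_TABLE info flag.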
DeviceTy::~DeviceTy() {
  if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE))
    return;

  ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
  dumpTargetPointerMappings(&Loc, *this);
}
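
// Initialize the device: forward the global requirement flags to the plugin,
// initialize the plugin-side device, and optionally set up kernel record and
// replay.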
llvm::Error DeviceTy::init() {
  // Make call to init_requires if it exists for this plugin.
  int32_t Ret = 0;
  if (RTL->init_requires)
    Ret = RTL->init_requires(PM->getRequirements());
  if (Ret != OFFLOAD_SUCCESS)
    return llvm::createStringError(
        llvm::inconvertibleErrorCode(),
        "Failed to initialize requirements for device %d\n", DeviceID);

  Ret = RTL->init_device(RTLDeviceID);
  if (Ret != OFFLOAD_SUCCESS)
    return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                   "Failed to initialize device %d\n",
                                   DeviceID);

  // Enables recording kernels if set.
  BoolEnvar OMPX_RecordKernel("LIBOMPTARGET_RECORD", false);
  if (OMPX_RecordKernel) {
    // Enables saving the device memory kernel output post execution if set.
    BoolEnvar OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT", false);

    uint64_t ReqPtrArgOffset;
    RTL->initialize_record_replay(RTLDeviceID, 0, nullptr, true,
                                  OMPX_ReplaySaveOutput, ReqPtrArgOffset);
  }

  return llvm::Error::success();
}
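
// Record/replay above is controlled purely by the environment variables read
// in init(); e.g., with a POSIX shell (application name illustrative):
//   LIBOMPTARGET_RECORD=1 LIBOMPTARGET_RR_SAVE_OUTPUT=1 ./offload-app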

// Load binary to device.
llvm::Expected<__tgt_device_binary>
DeviceTy::loadBinary(__tgt_device_image *Img) {
  __tgt_device_binary Binary;

  if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS)
    return llvm::createStringError(llvm::inconvertibleErrorCode(),
                                   "Failed to load binary %p", Img);
  return Binary;
}
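
// Allocate Size bytes of target memory. HstPtr is the host address, if any,
// that the allocation is associated with; Kind selects the memory kind (a
// TARGET_ALLOC_* value from omptarget.h).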
void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
  /// RAII to establish tool anchors before and after data allocation
  void *TargetPtr = nullptr;
  OMPT_IF_BUILT(InterfaceRAII TargetDataAllocRAII(
                    RegionInterface.getCallbacks<ompt_target_data_alloc>(),
                    DeviceID, HstPtr, &TargetPtr, Size,
                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  TargetPtr = RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
  return TargetPtr;
}

int32_t DeviceTy::deleteData(void *TgtAllocBegin, int32_t Kind) {
  /// RAII to establish tool anchors before and after data deletion
  OMPT_IF_BUILT(InterfaceRAII TargetDataDeleteRAII(
                    RegionInterface.getCallbacks<ompt_target_data_delete>(),
                    DeviceID, TgtAllocBegin,
                    /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  return RTL->data_delete(RTLDeviceID, TgtAllocBegin, Kind);
}

// Submit data to device
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
                             AsyncInfoTy &AsyncInfo, HostDataToTargetTy *Entry,
                             MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
  if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER)
    MappingInfo.printCopyInfo(TgtPtrBegin, HstPtrBegin, Size, /*H2D=*/true,
                              Entry, HDTTMapPtr);

  /// RAII to establish tool anchors before and after data submission
  OMPT_IF_BUILT(
      InterfaceRAII TargetDataSubmitRAII(
          RegionInterface.getCallbacks<ompt_target_data_transfer_to_device>(),
          omp_get_initial_device(), HstPtrBegin, DeviceID, TgtPtrBegin, Size,
          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
    return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
  return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
                                AsyncInfo);
}

// Retrieve data from device
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
                               int64_t Size, AsyncInfoTy &AsyncInfo,
                               HostDataToTargetTy *Entry,
                               MappingInfoTy::HDTTMapAccessorTy *HDTTMapPtr) {
  if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER)
    MappingInfo.printCopyInfo(TgtPtrBegin, HstPtrBegin, Size, /*H2D=*/false,
                              Entry, HDTTMapPtr);

  /// RAII to establish tool anchors before and after data retrieval
  OMPT_IF_BUILT(
      InterfaceRAII TargetDataRetrieveRAII(
          RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
          DeviceID, TgtPtrBegin, omp_get_initial_device(), HstPtrBegin, Size,
          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)

  if (!RTL->data_retrieve_async || !RTL->synchronize)
    return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
  return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
                                  AsyncInfo);
}

// Copy data from current device to destination device directly
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
                               int64_t Size, AsyncInfoTy &AsyncInfo) {
  /// RAII to establish tool anchors before and after data exchange.
  /// Note: although this is a data exchange, we use the 'from_device'
  /// operation enum (w.r.t. ompt_target_data_op_t) because there is currently
  /// no better alternative. It is still possible to distinguish this scenario
  /// from a real data retrieve by checking whether both involved device
  /// numbers are less than omp_get_num_devices().
  OMPT_IF_BUILT(
      InterfaceRAII TargetDataExchangeRAII(
          RegionInterface.getCallbacks<ompt_target_data_transfer_from_device>(),
          RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr, Size,
          /*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
  if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
    assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
    return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
                              Size);
  }
  return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
                                  DstPtr, Size, AsyncInfo);
}
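
// Notify the plugin that HstPtr was mapped. A plugin may use the notification,
// for example, to pre-lock (pin) host memory and speed up later transfers;
// plugins that do not care simply leave the entry point unimplemented.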
int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
  if (!RTL->data_notify_mapped)
    return OFFLOAD_SUCCESS;

  DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n",
     DPxPTR(HstPtr), Size);

  if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) {
    REPORT("Notifying about data mapping failed.\n");
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) {
  if (!RTL->data_notify_unmapped)
    return OFFLOAD_SUCCESS;

  DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr));

  if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) {
    REPORT("Notifying about data unmapping failed.\n");
    return OFFLOAD_FAIL;
  }
  return OFFLOAD_SUCCESS;
}

// Run region on device
int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
                               ptrdiff_t *TgtOffsets, KernelArgsTy &KernelArgs,
                               AsyncInfoTy &AsyncInfo) {
  return RTL->launch_kernel(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
                            &KernelArgs, AsyncInfo);
}

// Print information about the device, if the plugin supports it.
bool DeviceTy::printDeviceInfo() {
  if (!RTL->print_device_info)
    return false;
  RTL->print_device_info(RTLDeviceID);
  return true;
}

// Whether data can be copied to DstDevice directly
bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
  if (RTL != DstDevice.RTL || !RTL->is_data_exchangable)
    return false;

  if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID))
    return (RTL->data_exchange != nullptr) ||
           (RTL->data_exchange_async != nullptr);

  return false;
}

int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
  if (RTL->synchronize)
    return RTL->synchronize(RTLDeviceID, AsyncInfo);
  return OFFLOAD_SUCCESS;
}

int32_t DeviceTy::queryAsync(AsyncInfoTy &AsyncInfo) {
  if (RTL->query_async)
    return RTL->query_async(RTLDeviceID, AsyncInfo);

  return synchronize(AsyncInfo);
}
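
// The event entry points below are optional for a plugin. When an entry point
// is missing, the corresponding operation degrades to a no-op that reports
// success, so callers may use events unconditionally.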
int32_t DeviceTy::createEvent(void **Event) {
  if (RTL->create_event)
    return RTL->create_event(RTLDeviceID, Event);

  return OFFLOAD_SUCCESS;
}

int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) {
  if (RTL->record_event)
    return RTL->record_event(RTLDeviceID, Event, AsyncInfo);

  return OFFLOAD_SUCCESS;
}

int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) {
  if (RTL->wait_event)
    return RTL->wait_event(RTLDeviceID, Event, AsyncInfo);

  return OFFLOAD_SUCCESS;
}

int32_t DeviceTy::syncEvent(void *Event) {
  if (RTL->sync_event)
    return RTL->sync_event(RTLDeviceID, Event);

  return OFFLOAD_SUCCESS;
}

int32_t DeviceTy::destroyEvent(void *Event) {
  if (RTL->destroy_event)
    return RTL->destroy_event(RTLDeviceID, Event);

  return OFFLOAD_SUCCESS;
}
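
// Dump the offload entries (kernels and global variables) registered for this
// device. Example output (entry names illustrative):
//   Device 0 offload entries:
//        kernel: __omp_offloading_fd00_12345_main_l10
//   global var.: Counter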
void DeviceTy::dumpOffloadEntries() {
  fprintf(stderr, "Device %i offload entries:\n", DeviceID);
  for (auto &It : *DeviceOffloadEntries.getExclusiveAccessor()) {
    const char *Kind = "kernel";
    if (It.second.isLink())
      Kind = "link";
    else if (It.second.isGlobal())
      Kind = "global var.";
    fprintf(stderr, " %11s: %s\n", Kind, It.second.getNameAsCStr());
  }
}
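
// Whether the plugin suggests automatic zero-copy, i.e., letting the device
// access host memory in place instead of copying mapped data. Conservatively
// false when the plugin does not implement the query.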
|
2024-01-22 10:30:22 -06:00
|
|
|
|
|
|
|
|
bool DeviceTy::useAutoZeroCopy() {
|
|
|
|
|
if (RTL->use_auto_zero_copy)
|
|
|
|
|
return RTL->use_auto_zero_copy(RTLDeviceID);
|
|
|
|
|
return false;
|
|
|
|
|
}
|