llvm/openmp/libomptarget/src/device.cpp

//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Functionality for managing devices that are handled by RTL plugins.
//
//===----------------------------------------------------------------------===//
#include "device.h"
#include "omptarget.h"
#include "private.h"
#include "rtl.h"
#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <string>
#include <thread>
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
AsyncInfoTy &AsyncInfo) const {
// First, check if the user disabled atomic map transfer/malloc/dealloc.
if (!PM->UseEventsForAtomicTransfers)
return OFFLOAD_SUCCESS;
void *Event = getEvent();
bool NeedNewEvent = Event == nullptr;
if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) {
REPORT("Failed to create event\n");
return OFFLOAD_FAIL;
}
// We cannot assume the event is non-null because we don't know whether
// the target supports events. If a target doesn't, recordEvent should
// always return success.
if (Device.recordEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
REPORT("Failed to set dependence on event " DPxMOD "\n", DPxPTR(Event));
return OFFLOAD_FAIL;
}
if (NeedNewEvent)
setEvent(Event);
return OFFLOAD_SUCCESS;
}
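// Illustrative caller-side sketch (condensed from getTargetPointer below,
// not a separate upstream API): the event is attached right after a data
// transfer is issued, while the entry lock is still held, so that later
// lookups of the same entry can wait on the in-flight copy instead of
// racing with it:
//
//   if (submitData(TargetPointer, HstPtrBegin, Size, AsyncInfo) ==
//           OFFLOAD_SUCCESS &&
//       Entry->addEventIfNecessary(*this, AsyncInfo) != OFFLOAD_SUCCESS)
//     TargetPointer = nullptr; // see below for the actual error handling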
DeviceTy::DeviceTy(RTLInfoTy *RTL)
: DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(),
PendingGlobalsMtx(), ShadowMtx() {}
DeviceTy::~DeviceTy() {
if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE))
return;
ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
dumpTargetPointerMappings(&Loc, *this);
}
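// Usage note (assumed example, not part of the upstream file): getInfoLevel
// reads the LIBOMPTARGET_INFO environment variable, so the mapping-table
// dump above is emitted at device teardown only when the
// OMP_INFOTYPE_DUMP_TABLE bit is set in that variable, e.g.
//
//   LIBOMPTARGET_INFO=-1 ./offloading_app   // assuming -1 sets every bit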
int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
// Check if entry exists
auto It = HDTTMap->find(HstPtrBegin);
if (It != HDTTMap->end()) {
HostDataToTargetTy &HDTT = *It->HDTT;
// Mapping already exists
bool IsValid = HDTT.HstPtrEnd == (uintptr_t)HstPtrBegin + Size &&
HDTT.TgtPtrBegin == (uintptr_t)TgtPtrBegin;
if (IsValid) {
DP("Attempt to re-associate the same device ptr+offset with the same "
"host ptr, nothing to do\n");
return OFFLOAD_SUCCESS;
}
REPORT("Not allowed to re-associate a different device ptr+offset with "
"the same host ptr\n");
return OFFLOAD_FAIL;
}
// Mapping does not exist, allocate it with refCount=INF
const HostDataToTargetTy &NewEntry =
*HDTTMap
->emplace(new HostDataToTargetTy(
/*HstPtrBase=*/(uintptr_t)HstPtrBegin,
/*HstPtrBegin=*/(uintptr_t)HstPtrBegin,
/*HstPtrEnd=*/(uintptr_t)HstPtrBegin + Size,
/*TgtPtrBegin=*/(uintptr_t)TgtPtrBegin,
/*UseHoldRefCount=*/false, /*Name=*/nullptr,
/*IsRefCountINF=*/true))
.first->HDTT;
DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD
", HstEnd=" DPxMOD ", TgtBegin=" DPxMOD ", DynRefCount=%s, "
"HoldRefCount=%s\n",
DPxPTR(NewEntry.HstPtrBase), DPxPTR(NewEntry.HstPtrBegin),
DPxPTR(NewEntry.HstPtrEnd), DPxPTR(NewEntry.TgtPtrBegin),
NewEntry.dynRefCountToStr().c_str(), NewEntry.holdRefCountToStr().c_str());
(void)NewEntry;
// Notify the plugin about the new mapping.
return notifyDataMapped(HstPtrBegin, Size);
}
int DeviceTy::disassociatePtr(void *HstPtrBegin) {
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
auto It = HDTTMap->find(HstPtrBegin);
if (It != HDTTMap->end()) {
HostDataToTargetTy &HDTT = *It->HDTT;
// Mapping exists
if (HDTT.getHoldRefCount()) {
// This is based on OpenACC 3.1, sec 3.2.33 "acc_unmap_data", L3656-3657:
// "It is an error to call acc_unmap_data if the structured reference
// count for the pointer is not zero."
REPORT("Trying to disassociate a pointer with a non-zero hold reference "
"count\n");
} else if (HDTT.isDynRefCountInf()) {
DP("Association found, removing it\n");
void *Event = HDTT.getEvent();
delete &HDTT;
if (Event)
destroyEvent(Event);
HDTTMap->erase(It);
// Notify the plugin about the unmapped memory.
return notifyDataUnmapped(HstPtrBegin);
} else {
REPORT("Trying to disassociate a pointer which was not mapped via "
"omp_target_associate_ptr\n");
}
} else {
REPORT("Association not found\n");
}
// Mapping not found
return OFFLOAD_FAIL;
}
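// Illustrative user-side sketch (assumed example, not part of this file):
// associatePtr/disassociatePtr back the omp_target_associate_ptr and
// omp_target_disassociate_ptr API routines, which pin a user-managed device
// allocation to a host pointer with an infinite dynamic reference count:
//
//   #include <omp.h>
//   int Dev = omp_get_default_device();
//   double Buf[1024];
//   void *DevPtr = omp_target_alloc(sizeof(Buf), Dev);
//   omp_target_associate_ptr(Buf, DevPtr, sizeof(Buf), /*device_offset=*/0,
//                            Dev);
//   // ... target regions now reuse DevPtr for Buf instead of remapping ...
//   omp_target_disassociate_ptr(Buf, Dev);
//   omp_target_free(DevPtr, Dev);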
LookupResult DeviceTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,
void *HstPtrBegin, int64_t Size) {
uintptr_t HP = (uintptr_t)HstPtrBegin;
LookupResult LR;
DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%" PRId64 ")...\n",
DPxPTR(HP), Size);
if (HDTTMap->empty())
return LR;
auto Upper = HDTTMap->upper_bound(HP);
if (Size == 0) {
// OpenMP specification v5.1, "Pointer Initialization for Device Data
// Environments": upper_bound satisfies
// std::prev(Upper)->HDTT->HstPtrBegin <= HP < Upper->HDTT->HstPtrBegin
if (Upper != HDTTMap->begin()) {
LR.Entry = std::prev(Upper)->HDTT;
auto &HT = *LR.Entry;
// The left side of the extended address range is satisfied:
// HP >= HT.HstPtrBegin || HP >= HT.HstPtrBase
LR.Flags.IsContained = HP < HT.HstPtrEnd || HP < HT.HstPtrBase;
}
if (!LR.Flags.IsContained && Upper != HDTTMap->end()) {
LR.Entry = Upper->HDTT;
auto &HT = *LR.Entry;
// The right side of the extended address range is satisfied:
// HP < HT.HstPtrEnd || HP < HT.HstPtrBase
LR.Flags.IsContained = HP >= HT.HstPtrBase;
}
} else {
// check the left bin
if (Upper != HDTTMap->begin()) {
LR.Entry = std::prev(Upper)->HDTT;
auto &HT = *LR.Entry;
// Is it contained?
LR.Flags.IsContained = HP >= HT.HstPtrBegin && HP < HT.HstPtrEnd &&
(HP + Size) <= HT.HstPtrEnd;
// Does it extend beyond the mapped region?
LR.Flags.ExtendsAfter = HP < HT.HstPtrEnd && (HP + Size) > HT.HstPtrEnd;
}
// check the right bin
if (!(LR.Flags.IsContained || LR.Flags.ExtendsAfter) &&
Upper != HDTTMap->end()) {
LR.Entry = Upper->HDTT;
auto &HT = *LR.Entry;
// Does it extend into an already mapped region?
LR.Flags.ExtendsBefore =
HP < HT.HstPtrBegin && (HP + Size) > HT.HstPtrBegin;
// Does it extend beyond the mapped region?
LR.Flags.ExtendsAfter = HP < HT.HstPtrEnd && (HP + Size) > HT.HstPtrEnd;
}
if (LR.Flags.ExtendsBefore) {
DP("WARNING: Pointer is not mapped but section extends into already "
"mapped data\n");
}
if (LR.Flags.ExtendsAfter) {
DP("WARNING: Pointer is already mapped but section extends beyond mapped "
"region\n");
}
}
return LR;
}
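// Worked example for the lookup flags above (addresses are illustrative):
// assume a single entry mapping host range [0x1000, 0x1100).
//  * HstPtrBegin=0x1040, Size=0x20  -> IsContained   (0x1040 >= 0x1000 and
//    0x1040 + 0x20 = 0x1060 <= 0x1100)
//  * HstPtrBegin=0x10c0, Size=0x80  -> ExtendsAfter  (starts inside, but
//    0x10c0 + 0x80 = 0x1140 > 0x1100)
//  * HstPtrBegin=0x0f80, Size=0x100 -> ExtendsBefore (starts below 0x1000,
//    but 0x0f80 + 0x100 = 0x1080 > 0x1000)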
TargetPointerResultTy DeviceTy::getTargetPointer(
void *HstPtrBegin, void *HstPtrBase, int64_t Size,
map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways,
bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier,
bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo) {
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
void *TargetPointer = nullptr;
bool IsHostPtr = false;
bool IsPresent = true;
bool IsNew = false;
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
auto *Entry = LR.Entry;
// Check if the pointer is contained.
// If a variable is mapped to the device manually by the user - which would
// cause the IsContained flag to be true - then we must ensure that the
// device address is returned even under unified memory conditions.
if (LR.Flags.IsContained ||
((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && IsImplicit)) {
auto &HT = *LR.Entry;
const char *RefCountAction;
if (UpdateRefCount) {
// After this, reference count >= 1. If the reference count was 0 but the
// entry was still there we can reuse the data on the device and avoid a
// new submission.
HT.incRefCount(HasHoldModifier);
RefCountAction = " (incremented)";
} else {
// It might have been allocated with the parent, but it's still new.
IsNew = HT.getTotalRefCount() == 1;
RefCountAction = " (update suppressed)";
}
const char *DynRefCountAction = HasHoldModifier ? "" : RefCountAction;
const char *HoldRefCountAction = HasHoldModifier ? RefCountAction : "";
uintptr_t Ptr = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID,
"Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
", Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s, Name=%s\n",
(IsImplicit ? " (implicit)" : ""), DPxPTR(HstPtrBegin), DPxPTR(Ptr),
Size, HT.dynRefCountToStr().c_str(), DynRefCountAction,
HT.holdRefCountToStr().c_str(), HoldRefCountAction,
(HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
TargetPointer = (void *)Ptr;
} else if ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && !IsImplicit) {
// Explicit extension of mapped data - not allowed.
MESSAGE("explicit extension not allowed: host address specified is " DPxMOD
" (%" PRId64
" bytes), but device allocation maps to host at " DPxMOD
" (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), Size, DPxPTR(Entry->HstPtrBegin),
Entry->HstPtrEnd - Entry->HstPtrBegin);
if (HasPresentModifier)
MESSAGE("device mapping required by 'present' map type modifier does not "
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), Size);
} else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
!HasCloseModifier) {
// If unified shared memory is active, implicitly mapped variables that are
// not privatized use host address. Any explicitly mapped variables also use
// host address where correctness is not impeded. In all other cases maps
// are respected.
// In addition to the mapping rules above, the close map modifier forces the
// mapping of the variable to the device.
if (Size) {
DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
"memory\n",
DPxPTR((uintptr_t)HstPtrBegin), Size);
IsPresent = false;
IsHostPtr = true;
TargetPointer = HstPtrBegin;
}
} else if (HasPresentModifier) {
DP("Mapping required by 'present' map type modifier does not exist for "
"HstPtrBegin=" DPxMOD ", Size=%" PRId64 "\n",
DPxPTR(HstPtrBegin), Size);
MESSAGE("device mapping required by 'present' map type modifier does not "
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), Size);
} else if (Size) {
// If it is not contained and Size > 0, we should create a new entry for it.
IsNew = true;
uintptr_t Ptr = (uintptr_t)allocData(Size, HstPtrBegin);
Entry = HDTTMap
->emplace(new HostDataToTargetTy(
(uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
(uintptr_t)HstPtrBegin + Size, Ptr, HasHoldModifier,
HstPtrName))
.first->HDTT;
INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
"Creating new map entry with HstPtrBase=" DPxMOD
", HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", Size=%" PRId64 ", "
"DynRefCount=%s, HoldRefCount=%s, Name=%s\n",
DPxPTR(HstPtrBase), DPxPTR(HstPtrBegin), DPxPTR(Ptr), Size,
Entry->dynRefCountToStr().c_str(), Entry->holdRefCountToStr().c_str(),
(HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
TargetPointer = (void *)Ptr;
// Notify the plugin about the new mapping.
if (notifyDataMapped(HstPtrBegin, Size))
return {{false /* IsNewEntry */, false /* IsHostPointer */},
nullptr /* Entry */,
nullptr /* TargetPointer */};
} else {
// This entry is not present and we did not create a new entry for it.
IsPresent = false;
}
// If the target pointer is valid, and we need to transfer data, issue the
// data transfer.
if (TargetPointer && !IsHostPtr && HasFlagTo && (IsNew || HasFlagAlways) &&
Size != 0) {
// Lock the entry before releasing the mapping table lock so that another
// thread that issues data movement for the same entry observes a
// consistent state.
std::lock_guard<decltype(*Entry)> LG(*Entry);
// Release the mapping table lock right after the entry is locked.
HDTTMap.destroy();
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", Size,
DPxPTR(HstPtrBegin), DPxPTR(TargetPointer));
int Ret = submitData(TargetPointer, HstPtrBegin, Size, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
// We also return nullptr if the data movement fails because the target
// pointer then refers to a corrupted memory region, so it doesn't make
// any sense to continue using it.
TargetPointer = nullptr;
} else if (Entry->addEventIfNecessary(*this, AsyncInfo) != OFFLOAD_SUCCESS)
return {{false /* IsNewEntry */, false /* IsHostPointer */},
nullptr /* Entry */,
nullptr /* TargetPointer */};
} else {
// Release the mapping table lock directly.
HDTTMap.destroy();
// If this is not a host pointer and there is no present modifier, we need
// to wait for the event if it exists.
// Note: Entry might be nullptr because of a zero-length array section.
if (Entry && !IsHostPtr && !HasPresentModifier) {
std::lock_guard<decltype(*Entry)> LG(*Entry);
void *Event = Entry->getEvent();
if (Event) {
int Ret = waitEvent(Event, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
// If waiting for the event fails, we need to return nullptr to avoid
// using data that may still be in flight (a potential data race).
REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
return {{false /* IsNewEntry */, false /* IsHostPointer */},
nullptr /* Entry */,
nullptr /* TargetPointer */};
}
}
}
}
return {{IsNew, IsHostPtr, IsPresent}, Entry, TargetPointer};
}
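// Illustrative user-side view of the main branches above (assumed example,
// not part of this file):
//
//   #pragma omp target enter data map(to: A[0:N]) // no entry yet: new entry
//                                                 // is created, data copied
//   #pragma omp target map(present, to: A[0:N])   // entry found: reused,
//   { /* ... */ }                                 // DynRefCount incremented
//   #pragma omp target map(always, to: A[0:N])    // copy re-issued even
//   { /* ... */ }                                 // though IsNew is false
//
// Under "#pragma omp requires unified_shared_memory", implicit maps (and
// explicit maps without the close modifier) fall through to the host
// pointer instead of allocating device memory.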
TargetPointerResultTy
DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
bool UpdateRefCount, bool UseHoldRefCount,
bool &IsHostPtr, bool MustContain, bool ForceDelete,
bool FromDataEnd) {
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
void *TargetPointer = nullptr;
bool IsNew = false;
bool IsPresent = true;
IsHostPtr = false;
IsLast = false;
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
if (LR.Flags.IsContained ||
(!MustContain && (LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter))) {
auto &HT = *LR.Entry;
IsLast = HT.decShouldRemove(UseHoldRefCount, ForceDelete);
if (ForceDelete) {
HT.resetRefCount(UseHoldRefCount);
assert(IsLast == HT.decShouldRemove(UseHoldRefCount) &&
"expected correct IsLast prediction for reset");
}
// Increment the number of threads that are using the entry on a
// targetDataEnd, tracking the number of possible "deleters". A thread may
// come to own the entry deletion even if it was not the last one querying
// for it. Thus, we must track every query on targetDataEnds to ensure only
// the last thread that holds a reference to an entry actually deletes it.
if (FromDataEnd)
HT.incDataEndThreadCount();
const char *RefCountAction;
if (!UpdateRefCount) {
RefCountAction = " (update suppressed)";
} else if (IsLast) {
HT.decRefCount(UseHoldRefCount);
assert(HT.getTotalRefCount() == 0 &&
"Expected zero reference count when deletion is scheduled");
if (ForceDelete)
RefCountAction = " (reset, delayed deletion)";
else
RefCountAction = " (decremented, delayed deletion)";
} else {
HT.decRefCount(UseHoldRefCount);
RefCountAction = " (decremented)";
}
const char *DynRefCountAction = UseHoldRefCount ? "" : RefCountAction;
const char *HoldRefCountAction = UseHoldRefCount ? RefCountAction : "";
uintptr_t TP = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID,
"Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
"Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s\n",
DPxPTR(HstPtrBegin), DPxPTR(TP), Size, HT.dynRefCountToStr().c_str(),
DynRefCountAction, HT.holdRefCountToStr().c_str(), HoldRefCountAction);
TargetPointer = (void *)TP;
} else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
// If the value isn't found in the mapping and unified shared memory
// is on then it means we have stumbled upon a value which we need to
// use directly from the host.
DP("Get HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
"memory\n",
DPxPTR((uintptr_t)HstPtrBegin), Size);
IsPresent = false;
IsHostPtr = true;
TargetPointer = HstPtrBegin;
} else {
// OpenMP Specification v5.2: if a matching list item is not found, the
// pointer retains its original value as per firstprivate semantics.
IsPresent = false;
IsHostPtr = false;
TargetPointer = HstPtrBegin;
}
return {{IsNew, IsHostPtr, IsPresent}, LR.Entry, TargetPointer};
}
// Return the target pointer begin (where the data will be moved).
void *DeviceTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
int64_t Size) {
uintptr_t HP = (uintptr_t)HstPtrBegin;
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
if (LR.Flags.IsContained || LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) {
auto &HT = *LR.Entry;
uintptr_t TP = HT.TgtPtrBegin + (HP - HT.HstPtrBegin);
return (void *)TP;
}
return nullptr;
}
[OpenMP][Fix] Track all threads that may delete an entry

The entries inside a "target data end" are processed in three steps:

1. Query the internal data maps for the entries and dispatch any necessary device-side operations (i.e., data retrieval);
2. Synchronize those operations;
3. Update the host-side pointers and remove any entry whose reference counter reached zero.

These steps may be executed by multiple threads, which may even operate on the same entries. The current implementation (D121058) tries to synchronize these threads by tracking the "owner" of the deletion of each entry using their thread ID. Unfortunately, it may fail to do so for the following reasons:

1. The owner is assigned at the first step only if the reference count is 0 when the map is queried. This does not work when such an owner thread is faster than a previous one that is also processing the same entry on another "target data end", leading to use-after-free problems.
2. The entry is only added for post-processing (step 3) if its reference count was 0 at query time (step 1). This does not allow threads to exchange responsibility for the deletion, leading again to use-after-free problems.
3. An entry may appear multiple times in the arguments array of a "target data end", which may lead to deleting the entry prematurely, leading, again, to use-after-free problems.

This patch addresses these problems by tracking, through a counter, all the threads that are using an entry at a "target data end" region, ensuring only the last one deletes it when needed. It also ensures that all entries successfully found inside the data maps in step 1 are processed in step 3, regardless of whether their reference count was zeroed at query time. This ensures deletion ownership may be passed to any thread that is using such an entry.

Reviewed By: ye-luo

Differential Revision: https://reviews.llvm.org/D132676
2023-01-19 12:11:20 -03:00
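A minimal, self-contained sketch of the "count the users, let the last one delete" scheme described above. The type and member names are illustrative stand-ins rather than the actual libomptarget declarations; the real entry type exposes getTotalRefCount() and getDataEndThreadCount(), which are checked in the assertion in eraseMapEntry below.

```
#include <atomic>

// Illustrative sketch only. Each thread that finds an entry during a
// "target data end" registers itself (step 1) and deregisters when it is
// done with the entry (step 3). Only the last thread to leave, and only if
// the mapping's reference count has dropped to zero, deallocates and erases
// the entry.
struct MapEntrySketch {
  std::atomic<int> TotalRefCount{1};      // mapping reference count
  std::atomic<int> DataEndThreadCount{0}; // threads still using this entry

  void beginDataEndUse() { DataEndThreadCount.fetch_add(1); }

  // Returns true if the caller was the last user of an unmapped entry and
  // therefore must deallocate the device memory and erase the entry.
  bool endDataEndUse() {
    return DataEndThreadCount.fetch_sub(1) == 1 && TotalRefCount.load() == 0;
  }
};
```

Counting users instead of pinning a single owner thread lets the deletion responsibility pass to whichever thread happens to finish last, which is the property the previous thread-ID ownership scheme lacked.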
int DeviceTy::eraseMapEntry(HDTTMapAccessorTy &HDTTMap,
HostDataToTargetTy *Entry, int64_t Size) {
assert(Entry && "Trying to delete a null entry from the HDTT map.");
assert(Entry->getTotalRefCount() == 0 && Entry->getDataEndThreadCount() == 0 &&
"Trying to delete entry that is in use or owned by another thread.");
INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
"Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
", Size=%" PRId64 ", Name=%s\n",
DPxPTR(Entry->HstPtrBegin), DPxPTR(Entry->TgtPtrBegin), Size,
(Entry->HstPtrName) ? getNameFromMapping(Entry->HstPtrName).c_str()
: "unknown");
if (HDTTMap->erase(Entry) == 0) {
REPORT("Trying to remove a non-existent map entry\n");
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
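/// Deallocate the device memory associated with \p Entry: destroy any event
/// attached to the entry, release the target allocation, notify the plugin
/// that the corresponding host pointer is no longer mapped, and finally
/// delete the entry object itself.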
int DeviceTy::deallocTgtPtrAndEntry(HostDataToTargetTy *Entry, int64_t Size) {
assert(Entry && "Trying to deallocate a null entry.");
DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n",
DPxPTR(Entry->TgtPtrBegin), Size);
void *Event = Entry->getEvent();
if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) {
REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event));
return OFFLOAD_FAIL;
}
int Ret = deleteData((void *)Entry->TgtPtrBegin);
// Notify the plugin about the unmapped memory.
Ret |= notifyDataUnmapped((void *)Entry->HstPtrBegin);
delete Entry;
return Ret;
}
/// Init device, should not be called directly.
void DeviceTy::init() {
// Make call to init_requires if it exists for this plugin.
if (RTL->init_requires)
RTL->init_requires(PM->RTLs.RequiresFlags);
int32_t Ret = RTL->init_device(RTLDeviceID);
if (Ret != OFFLOAD_SUCCESS)
return;
IsInit = true;
}
/// Thread-safe method to initialize the device only once.
int32_t DeviceTy::initOnce() {
std::call_once(InitFlag, &DeviceTy::init, this);
// At this point, if IsInit is true, then either this thread or some other
// thread in the past successfully initialized the device, so we can return
// OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it
// failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means
// that some other thread already attempted to execute init() and if IsInit
// is still false, return OFFLOAD_FAIL.
if (IsInit)
return OFFLOAD_SUCCESS;
return OFFLOAD_FAIL;
}
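/// Deinitialize the device through the plugin, if the plugin provides a
/// deinit_device entry point.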
void DeviceTy::deinit() {
if (RTL->deinit_device)
RTL->deinit_device(RTLDeviceID);
}
// Load binary to device.
__tgt_target_table *DeviceTy::loadBinary(void *Img) {
std::lock_guard<decltype(RTL->Mtx)> LG(RTL->Mtx);
return RTL->load_binary(RTLDeviceID, Img);
}
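// Allocate data on the device; HstPtr is the associated host pointer passed
// through to the plugin.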
void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
}
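// Release device memory of the given kind previously obtained from allocData.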
int32_t DeviceTy::deleteData(void *TgtPtrBegin, int32_t Kind) {
return RTL->data_delete(RTLDeviceID, TgtPtrBegin, Kind);
}
// Submit data to device
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
AsyncInfoTy &AsyncInfo) {
if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) {
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
auto *HT = &*LR.Entry;
INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID,
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
", Size=%" PRId64 ", Name=%s\n",
DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin), Size,
(HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
: "unknown");
}
if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
AsyncInfo);
}
// Retrieve data from device
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
int64_t Size, AsyncInfoTy &AsyncInfo) {
if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) {
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
auto *HT = &*LR.Entry;
INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID,
"Copying data from device to host, TgtPtr=" DPxMOD ", HstPtr=" DPxMOD
", Size=%" PRId64 ", Name=%s\n",
DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin), Size,
(HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
: "unknown");
}
if (!RTL->data_retrieve_async || !RTL->synchronize)
return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
AsyncInfo);
}
// Copy data from current device to destination device directly
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
int64_t Size, AsyncInfoTy &AsyncInfo) {
if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
Size);
}
return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
DstPtr, Size, AsyncInfo);
}
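// Notify the plugin about a new host-to-device mapping, if it implements the
// optional data_notify_mapped entry point.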
int32_t DeviceTy::notifyDataMapped(void *HstPtr, int64_t Size) {
if (!RTL->data_notify_mapped)
return OFFLOAD_SUCCESS;
DP("Notifying about new mapping: HstPtr=" DPxMOD ", Size=%" PRId64 "\n",
DPxPTR(HstPtr), Size);
if (RTL->data_notify_mapped(RTLDeviceID, HstPtr, Size)) {
REPORT("Notifiying about data mapping failed.\n");
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::notifyDataUnmapped(void *HstPtr) {
if (!RTL->data_notify_unmapped)
return OFFLOAD_SUCCESS;
DP("Notifying about an unmapping: HstPtr=" DPxMOD "\n", DPxPTR(HstPtr));
if (RTL->data_notify_unmapped(RTLDeviceID, HstPtr)) {
REPORT("Notifiying about data unmapping failed.\n");
return OFFLOAD_FAIL;
}
return OFFLOAD_SUCCESS;
}
// Run region on device
int32_t DeviceTy::launchKernel(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets,
const KernelArgsTy &KernelArgs,
AsyncInfoTy &AsyncInfo) {
return RTL->launch_kernel(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
&KernelArgs, AsyncInfo);
}
// Print information about the device.
bool DeviceTy::printDeviceInfo(int32_t RTLDevId) {
if (!RTL->print_device_info)
return false;
RTL->print_device_info(RTLDevId);
return true;
}
// Whether data can be copied to DstDevice directly
bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
if (RTL != DstDevice.RTL || !RTL->is_data_exchangable)
return false;
if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID))
return (RTL->data_exchange != nullptr) ||
(RTL->data_exchange_async != nullptr);
return false;
}
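// Synchronize all pending operations recorded in AsyncInfo; a plugin without
// a synchronize entry point is assumed to operate synchronously.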
int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
if (RTL->synchronize)
return RTL->synchronize(RTLDeviceID, AsyncInfo);
return OFFLOAD_SUCCESS;
}
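// Query completion of the operations in AsyncInfo without blocking; fall back
// to a blocking synchronize if the plugin has no query_async entry point.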
int32_t DeviceTy::queryAsync(AsyncInfoTy &AsyncInfo) {
if (RTL->query_async)
return RTL->query_async(RTLDeviceID, AsyncInfo);
return synchronize(AsyncInfo);
}
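// Event management. Event support is optional: if the plugin does not
// implement one of these entry points, the call is treated as a success.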
int32_t DeviceTy::createEvent(void **Event) {
if (RTL->create_event)
return RTL->create_event(RTLDeviceID, Event);
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) {
if (RTL->record_event)
return RTL->record_event(RTLDeviceID, Event, AsyncInfo);
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) {
if (RTL->wait_event)
return RTL->wait_event(RTLDeviceID, Event, AsyncInfo);
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::syncEvent(void *Event) {
if (RTL->sync_event)
return RTL->sync_event(RTLDeviceID, Event);
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::destroyEvent(void *Event) {
if (RTL->destroy_event)
return RTL->destroy_event(RTLDeviceID, Event);
return OFFLOAD_SUCCESS;
}
/// Check whether a device has an associated RTL and initialize it if it's not
/// already initialized.
bool deviceIsReady(int DeviceNum) {
DP("Checking whether device %d is ready.\n", DeviceNum);
// Devices.size() can only change while registering a new library, so
// acquire the RTLs' mutex before reading it.
size_t DevicesSize;
{
std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);
DevicesSize = PM->Devices.size();
}
if (DevicesSize <= (size_t)DeviceNum) {
DP("Device ID %d does not have a matching RTL\n", DeviceNum);
return false;
}
// Get device info
DeviceTy &Device = *PM->Devices[DeviceNum];
DP("Is the device %d (local ID %d) initialized? %d\n", DeviceNum,
Device.RTLDeviceID, Device.IsInit);
// Init the device if not done before
if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) {
DP("Failed to init device %d\n", DeviceNum);
return false;
}
DP("Device %d is ready to use.\n", DeviceNum);
return true;
}