llvm/openmp/libomptarget/src/device.cpp

//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// Functionality for managing devices that are handled by RTL plugins.
//
//===----------------------------------------------------------------------===//
#include "device.h"
#include "omptarget.h"
#include "private.h"
#include "rtl.h"
#include <cassert>
#include <climits>
#include <cstdint>
#include <cstdio>
#include <string>
#include <thread>
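
/// Record an event on \p AsyncInfo for this mapping entry so that later
/// operations on the same entry can synchronize with the pending data
/// transfer. A new event is created lazily if the entry does not hold one yet.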
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
                                            AsyncInfoTy &AsyncInfo) const {
  // First, check if the user disabled atomic map transfer/malloc/dealloc.
  if (!PM->UseEventsForAtomicTransfers)
    return OFFLOAD_SUCCESS;

  void *Event = getEvent();
  bool NeedNewEvent = Event == nullptr;
  if (NeedNewEvent && Device.createEvent(&Event) != OFFLOAD_SUCCESS) {
    REPORT("Failed to create event\n");
    return OFFLOAD_FAIL;
  }

  // We cannot assume the event should not be nullptr because we don't
  // know if the target supports events. But if a target doesn't,
  // recordEvent should always return success.
  if (Device.recordEvent(Event, AsyncInfo) != OFFLOAD_SUCCESS) {
    REPORT("Failed to set dependence on event " DPxMOD "\n", DPxPTR(Event));
    return OFFLOAD_FAIL;
  }

  if (NeedNewEvent)
    setEvent(Event);

  return OFFLOAD_SUCCESS;
}

DeviceTy::DeviceTy(RTLInfoTy *RTL)
    : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
      HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(),
      PendingGlobalsMtx(), ShadowMtx() {}

DeviceTy::~DeviceTy() {
  if (DeviceID == -1 || !(getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE))
    return;

  ident_t loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
  dumpTargetPointerMappings(&loc, *this);
}
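
/// Associate a user-provided target pointer with a host pointer (used by
/// omp_target_associate_ptr). The new entry is created with an infinite
/// dynamic reference count so it is never removed by implicit unmapping.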
int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
  HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();

  // Check if entry exists
  auto It = HDTTMap->find(HstPtrBegin);
  if (It != HDTTMap->end()) {
    HostDataToTargetTy &HDTT = *It->HDTT;
    // Mapping already exists
    bool isValid = HDTT.HstPtrEnd == (uintptr_t)HstPtrBegin + Size &&
                   HDTT.TgtPtrBegin == (uintptr_t)TgtPtrBegin;
    if (isValid) {
      DP("Attempt to re-associate the same device ptr+offset with the same "
         "host ptr, nothing to do\n");
      return OFFLOAD_SUCCESS;
    } else {
      REPORT("Not allowed to re-associate a different device ptr+offset with "
             "the same host ptr\n");
      return OFFLOAD_FAIL;
    }
  }

  // Mapping does not exist, allocate it with refCount=INF
  const HostDataToTargetTy &newEntry =
      *HDTTMap
           ->emplace(new HostDataToTargetTy(
               /*HstPtrBase=*/(uintptr_t)HstPtrBegin,
               /*HstPtrBegin=*/(uintptr_t)HstPtrBegin,
               /*HstPtrEnd=*/(uintptr_t)HstPtrBegin + Size,
               /*TgtPtrBegin=*/(uintptr_t)TgtPtrBegin,
               /*UseHoldRefCount=*/false, /*Name=*/nullptr,
               /*IsRefCountINF=*/true))
           .first->HDTT;
  DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD
     ", HstEnd=" DPxMOD ", TgtBegin=" DPxMOD ", DynRefCount=%s, "
     "HoldRefCount=%s\n",
     DPxPTR(newEntry.HstPtrBase), DPxPTR(newEntry.HstPtrBegin),
     DPxPTR(newEntry.HstPtrEnd), DPxPTR(newEntry.TgtPtrBegin),
     newEntry.dynRefCountToStr().c_str(), newEntry.holdRefCountToStr().c_str());
  (void)newEntry;

  return OFFLOAD_SUCCESS;
}
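
/// Remove an association created by associatePtr (used by
/// omp_target_disassociate_ptr). Fails if the entry still has hold references
/// or was not created with an infinite dynamic reference count.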
int DeviceTy::disassociatePtr(void *HstPtrBegin) {
  HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();

  auto It = HDTTMap->find(HstPtrBegin);
  if (It != HDTTMap->end()) {
    HostDataToTargetTy &HDTT = *It->HDTT;
    // Mapping exists
    if (HDTT.getHoldRefCount()) {
      // This is based on OpenACC 3.1, sec 3.2.33 "acc_unmap_data", L3656-3657:
      // "It is an error to call acc_unmap_data if the structured reference
      // count for the pointer is not zero."
      REPORT("Trying to disassociate a pointer with a non-zero hold reference "
             "count\n");
    } else if (HDTT.isDynRefCountInf()) {
      DP("Association found, removing it\n");
      void *Event = HDTT.getEvent();
      delete &HDTT;
      if (Event)
        destroyEvent(Event);
      HDTTMap->erase(It);
      return OFFLOAD_SUCCESS;
    } else {
      REPORT("Trying to disassociate a pointer which was not mapped via "
             "omp_target_associate_ptr\n");
    }
  } else {
    REPORT("Association not found\n");
  }

  // Mapping not found
  return OFFLOAD_FAIL;
}
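
/// Look up the mapping table entry overlapping [HstPtrBegin, HstPtrBegin+Size)
/// and report how the requested range relates to it (contained, extends
/// before, or extends after). The caller must hold the map accessor.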
LookupResult DeviceTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,
                                     void *HstPtrBegin, int64_t Size) {
  uintptr_t hp = (uintptr_t)HstPtrBegin;
  LookupResult lr;

  DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%" PRId64 ")...\n",
     DPxPTR(hp), Size);

  if (HDTTMap->empty())
    return lr;

  auto upper = HDTTMap->upper_bound(hp);

  if (Size == 0) {
    // specification v5.1 Pointer Initialization for Device Data Environments
    // upper_bound satisfies
    //   std::prev(upper)->HDTT.HstPtrBegin <= hp < upper->HDTT.HstPtrBegin
    if (upper != HDTTMap->begin()) {
      lr.Entry = std::prev(upper)->HDTT;
      auto &HT = *lr.Entry;
      // the left side of extended address range is satisfied.
      // hp >= HT.HstPtrBegin || hp >= HT.HstPtrBase
      lr.Flags.IsContained = hp < HT.HstPtrEnd || hp < HT.HstPtrBase;
    }

    if (!lr.Flags.IsContained && upper != HDTTMap->end()) {
      lr.Entry = upper->HDTT;
      auto &HT = *lr.Entry;
      // the right side of extended address range is satisfied.
      // hp < HT.HstPtrEnd || hp < HT.HstPtrBase
      lr.Flags.IsContained = hp >= HT.HstPtrBase;
    }
  } else {
    // check the left bin
    if (upper != HDTTMap->begin()) {
      lr.Entry = std::prev(upper)->HDTT;
      auto &HT = *lr.Entry;
      // Is it contained?
      lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd &&
                             (hp + Size) <= HT.HstPtrEnd;
      // Does it extend beyond the mapped region?
      lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp + Size) > HT.HstPtrEnd;
    }

    // check the right bin
    if (!(lr.Flags.IsContained || lr.Flags.ExtendsAfter) &&
        upper != HDTTMap->end()) {
      lr.Entry = upper->HDTT;
      auto &HT = *lr.Entry;
      // Does it extend into an already mapped region?
      lr.Flags.ExtendsBefore =
          hp < HT.HstPtrBegin && (hp + Size) > HT.HstPtrBegin;
      // Does it extend beyond the mapped region?
      lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp + Size) > HT.HstPtrEnd;
    }

    if (lr.Flags.ExtendsBefore) {
      DP("WARNING: Pointer is not mapped but section extends into already "
         "mapped data\n");
    }
    if (lr.Flags.ExtendsAfter) {
      DP("WARNING: Pointer is already mapped but section extends beyond mapped "
         "region\n");
    }
  }

  return lr;
}
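
/// Return the target pointer for the given host range, creating a new mapping
/// and issuing the host-to-device transfer when required. Reference counts are
/// updated according to the map-type modifiers passed by the caller.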
TargetPointerResultTy DeviceTy::getTargetPointer(
    void *HstPtrBegin, void *HstPtrBase, int64_t Size,
    map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways,
    bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier,
    bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo) {
  HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
  void *TargetPointer = nullptr;
  bool IsHostPtr = false;
  bool IsNew = false;

  LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
  auto *Entry = LR.Entry;

  // Check if the pointer is contained.
  // If a variable is mapped to the device manually by the user - which would
  // lead to the IsContained flag being true - then we must ensure that the
  // device address is returned even under unified memory conditions.
  if (LR.Flags.IsContained ||
      ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && IsImplicit)) {
    auto &HT = *LR.Entry;
    const char *RefCountAction;
    if (UpdateRefCount) {
      // After this, reference count >= 1. If the reference count was 0 but the
      // entry was still there, we can reuse the data on the device and avoid a
      // new submission.
      HT.incRefCount(HasHoldModifier);
      RefCountAction = " (incremented)";
    } else {
      // It might have been allocated with the parent, but it's still new.
      IsNew = HT.getTotalRefCount() == 1;
      RefCountAction = " (update suppressed)";
    }
    const char *DynRefCountAction = HasHoldModifier ? "" : RefCountAction;
    const char *HoldRefCountAction = HasHoldModifier ? RefCountAction : "";
    uintptr_t Ptr = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
    INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID,
         "Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
         ", Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s, Name=%s\n",
         (IsImplicit ? " (implicit)" : ""), DPxPTR(HstPtrBegin), DPxPTR(Ptr),
         Size, HT.dynRefCountToStr().c_str(), DynRefCountAction,
         HT.holdRefCountToStr().c_str(), HoldRefCountAction,
         (HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
    TargetPointer = (void *)Ptr;
  } else if ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && !IsImplicit) {
    // Explicit extension of mapped data - not allowed.
    MESSAGE("explicit extension not allowed: host address specified is " DPxMOD
            " (%" PRId64
            " bytes), but device allocation maps to host at " DPxMOD
            " (%" PRId64 " bytes)",
            DPxPTR(HstPtrBegin), Size, DPxPTR(Entry->HstPtrBegin),
            Entry->HstPtrEnd - Entry->HstPtrBegin);
    if (HasPresentModifier)
      MESSAGE("device mapping required by 'present' map type modifier does not "
              "exist for host address " DPxMOD " (%" PRId64 " bytes)",
              DPxPTR(HstPtrBegin), Size);
  } else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
             !HasCloseModifier) {
    // If unified shared memory is active, implicitly mapped variables that are
    // not privatized use host address. Any explicitly mapped variables also use
    // host address where correctness is not impeded. In all other cases maps
    // are respected.
    // In addition to the mapping rules above, the close map modifier forces the
    // mapping of the variable to the device.
    if (Size) {
DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
"memory\n",
DPxPTR((uintptr_t)HstPtrBegin), Size);
IsHostPtr = true;
TargetPointer = HstPtrBegin;
}
} else if (HasPresentModifier) {
DP("Mapping required by 'present' map type modifier does not exist for "
"HstPtrBegin=" DPxMOD ", Size=%" PRId64 "\n",
DPxPTR(HstPtrBegin), Size);
MESSAGE("device mapping required by 'present' map type modifier does not "
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
DPxPTR(HstPtrBegin), Size);
} else if (Size) {
// If it is not contained and Size > 0, we should create a new entry for it.
IsNew = true;
uintptr_t Ptr = (uintptr_t)allocData(Size, HstPtrBegin);
Entry = HDTTMap
->emplace(new HostDataToTargetTy(
(uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
(uintptr_t)HstPtrBegin + Size, Ptr, HasHoldModifier,
HstPtrName))
.first->HDTT;
INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
"Creating new map entry with HstPtrBase=" DPxMOD
", HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", Size=%ld, "
"DynRefCount=%s, HoldRefCount=%s, Name=%s\n",
DPxPTR(HstPtrBase), DPxPTR(HstPtrBegin), DPxPTR(Ptr), Size,
Entry->dynRefCountToStr().c_str(), Entry->holdRefCountToStr().c_str(),
(HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
TargetPointer = (void *)Ptr;
}

  // If the target pointer is valid, and we need to transfer data, issue the
  // data transfer.
  if (TargetPointer && !IsHostPtr && HasFlagTo && (IsNew || HasFlagAlways)) {
    // Lock the entry before releasing the mapping table lock such that another
    // thread that could issue data movement will get the right result.
    std::lock_guard<decltype(*Entry)> LG(*Entry);
    // Release the mapping table lock right after the entry is locked.
    HDTTMap.destroy();
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", Size,
DPxPTR(HstPtrBegin), DPxPTR(TargetPointer));
int Ret = submitData(TargetPointer, HstPtrBegin, Size, AsyncInfo);
if (Ret != OFFLOAD_SUCCESS) {
REPORT("Copying data to device failed.\n");
// We will also return nullptr if the data movement fails because that
// pointer points to a corrupted memory region so it doesn't make any
// sense to continue to use it.
TargetPointer = nullptr;
} else if (Entry->addEventIfNecessary(*this, AsyncInfo) != OFFLOAD_SUCCESS)
return {{false /* IsNewEntry */, false /* IsHostPointer */},
nullptr /* Entry */,
nullptr /* TargetPointer */};
  } else {
    // Release the mapping table lock directly.
    HDTTMap.destroy();
    // If not a host pointer and no present modifier, we need to wait for the
    // event if it exists.
    // Note: Entry might be nullptr because of zero length array section.
    if (Entry && !IsHostPtr && !HasPresentModifier) {
      std::lock_guard<decltype(*Entry)> LG(*Entry);
      void *Event = Entry->getEvent();
      if (Event) {
        int Ret = waitEvent(Event, AsyncInfo);
        if (Ret != OFFLOAD_SUCCESS) {
          // If it fails to wait for the event, we need to return nullptr in
          // case of any data race.
          REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
          return {{false /* IsNewEntry */, false /* IsHostPointer */},
                  nullptr /* Entry */,
                  nullptr /* TargetPointer */};
        }
      }
    }
  }

  return {{IsNew, IsHostPtr}, Entry, TargetPointer};
}

// Used by targetDataBegin, targetDataEnd, targetDataUpdate and target.
// Return the target pointer begin (where the data will be moved).
// Decrement the reference counter if called from targetDataEnd.
TargetPointerResultTy
DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
                         bool UpdateRefCount, bool UseHoldRefCount,
                         bool &IsHostPtr, bool MustContain, bool ForceDelete) {
  HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();

  void *TargetPointer = NULL;
  bool IsNew = false;
  IsHostPtr = false;
  IsLast = false;
  LookupResult lr = lookupMapping(HDTTMap, HstPtrBegin, Size);

  if (lr.Flags.IsContained ||
      (!MustContain && (lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter))) {
    auto &HT = *lr.Entry;
    IsLast = HT.decShouldRemove(UseHoldRefCount, ForceDelete);

    if (ForceDelete) {
      HT.resetRefCount(UseHoldRefCount);
      assert(IsLast == HT.decShouldRemove(UseHoldRefCount) &&
"expected correct IsLast prediction for reset");
}
const char *RefCountAction;
if (!UpdateRefCount) {
RefCountAction = " (update suppressed)";
    } else if (IsLast) {
      // Mark the entry as to be deleted by this thread. Another thread might
      // reuse the entry and take "ownership" for the deletion while this thread
      // is waiting for data transfers. That is fine and the current thread will
      // simply skip the deletion step then.
      HT.setDeleteThreadId();
      HT.decRefCount(UseHoldRefCount);
      assert(HT.getTotalRefCount() == 0 &&
             "Expected zero reference count when deletion is scheduled");
      if (ForceDelete)
        RefCountAction = " (reset, delayed deletion)";
      else
        RefCountAction = " (decremented, delayed deletion)";
    } else {
      HT.decRefCount(UseHoldRefCount);
      RefCountAction = " (decremented)";
    }
    const char *DynRefCountAction = UseHoldRefCount ? "" : RefCountAction;
    const char *HoldRefCountAction = UseHoldRefCount ? RefCountAction : "";
    uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
    INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID,
         "Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
         "Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s\n",
         DPxPTR(HstPtrBegin), DPxPTR(tp), Size, HT.dynRefCountToStr().c_str(),
         DynRefCountAction, HT.holdRefCountToStr().c_str(), HoldRefCountAction);
    TargetPointer = (void *)tp;
  } else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
    // If the value isn't found in the mapping and unified shared memory
    // is on then it means we have stumbled upon a value which we need to
    // use directly from the host.
DP("Get HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
"memory\n",
DPxPTR((uintptr_t)HstPtrBegin), Size);
IsHostPtr = true;
TargetPointer = HstPtrBegin;
}
return {{IsNew, IsHostPtr}, lr.Entry, TargetPointer};
}
// Return the target pointer begin (where the data will be moved).
void *DeviceTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
int64_t Size) {
uintptr_t hp = (uintptr_t)HstPtrBegin;
LookupResult lr = lookupMapping(HDTTMap, HstPtrBegin, Size);
if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
auto &HT = *lr.Entry;
uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin);
return (void *)tp;
}
return NULL;
}
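// Delete the mapping entry found by a previous lookup: free the associated
// device memory, remove the entry from the mapping table, and destroy the
// entry's event, if any. The caller must hold the exclusive map accessor,
// and the entry's total reference count must already be zero.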
int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR,
int64_t Size) {
// Check that the section to delete was actually found in the mapping table.
if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore ||
LR.Flags.ExtendsAfter)) {
REPORT("Section to delete (hst addr " DPxMOD ") does not exist in the"
" allocated memory\n",
DPxPTR(LR.Entry->HstPtrBegin));
return OFFLOAD_FAIL;
}
auto &HT = *LR.Entry;
// Verify this thread is still in charge of deleting the entry.
assert(HT.getTotalRefCount() == 0 &&
HT.getDeleteThreadId() == std::this_thread::get_id() &&
"Trying to delete entry that is in use or owned by another thread.");
DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n",
DPxPTR(HT.TgtPtrBegin), Size);
deleteData((void *)HT.TgtPtrBegin);
INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
"Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
", Size=%" PRId64 ", Name=%s\n",
DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size,
(HT.HstPtrName) ? getNameFromMapping(HT.HstPtrName).c_str() : "unknown");
void *Event = LR.Entry->getEvent();
HDTTMap->erase(LR.Entry);
delete LR.Entry;
int Ret = OFFLOAD_SUCCESS;
if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) {
REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event));
Ret = OFFLOAD_FAIL;
}
return Ret;
}
/// Init device, should not be called directly.
void DeviceTy::init() {
// Make call to init_requires if it exists for this plugin.
if (RTL->init_requires)
RTL->init_requires(PM->RTLs.RequiresFlags);
int32_t Ret = RTL->init_device(RTLDeviceID);
if (Ret != OFFLOAD_SUCCESS)
return;
IsInit = true;
}
/// Thread-safe method to initialize the device only once.
int32_t DeviceTy::initOnce() {
std::call_once(InitFlag, &DeviceTy::init, this);
// At this point, if IsInit is true, then either this thread or some other
// thread in the past successfully initialized the device, so we can return
// OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it
// failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means
// that some other thread already attempted to execute init() and if IsInit
// is still false, return OFFLOAD_FAIL.
if (IsInit)
return OFFLOAD_SUCCESS;
else
return OFFLOAD_FAIL;
}
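// Tear down the plugin-side device state, if the plugin provides a
// deinit_device entry point.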
void DeviceTy::deinit() {
if (RTL->deinit_device)
RTL->deinit_device(RTLDeviceID);
}
// Load binary to device.
__tgt_target_table *DeviceTy::load_binary(void *Img) {
std::lock_guard<decltype(RTL->Mtx)> LG(RTL->Mtx);
__tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img);
return rc;
}
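// Allocate Size bytes of device memory by forwarding the request to the
// plugin's data_alloc entry point; HstPtr and Kind are passed through
// unchanged.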
void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
}
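// Free device memory previously obtained from allocData via the plugin's
// data_delete entry point.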
int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
return RTL->data_delete(RTLDeviceID, TgtPtrBegin);
}
// Submit data to device
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
AsyncInfoTy &AsyncInfo) {
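// When data-transfer messages are enabled, look up the mapping so the
// message can report the variable name, if one was recorded.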
if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) {
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
auto *HT = &*LR.Entry;
INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID,
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
", Size=%" PRId64 ", Name=%s\n",
DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin), Size,
(HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
: "unknown");
}
if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
else
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
AsyncInfo);
}
// Retrieve data from device
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
int64_t Size, AsyncInfoTy &AsyncInfo) {
if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) {
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
auto *HT = &*LR.Entry;
INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID,
"Copying data from device to host, TgtPtr=" DPxMOD ", HstPtr=" DPxMOD
", Size=%" PRId64 ", Name=%s\n",
DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin), Size,
(HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
: "unknown");
}
if (!RTL->data_retrieve_async || !RTL->synchronize)
return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
else
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
AsyncInfo);
}
// Copy data from current device to destination device directly
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
int64_t Size, AsyncInfoTy &AsyncInfo) {
if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
Size);
} else
return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
DstPtr, Size, AsyncInfo);
}
// Run region on device
int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
AsyncInfoTy &AsyncInfo) {
// Fall back to the synchronous entry point when the plugin provides no
// asynchronous one or cannot synchronize.
if (!RTL->run_region_async || !RTL->synchronize)
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
TgtVarsSize);
else
return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, AsyncInfo);
}
// Print information about the device, if the plugin supports it.
bool DeviceTy::printDeviceInfo(int32_t RTLDevId) {
if (!RTL->print_device_info)
return false;
RTL->print_device_info(RTLDevId);
return true;
}
// Run team region on device.
int32_t DeviceTy::runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
int32_t NumTeams, int32_t ThreadLimit,
uint64_t LoopTripCount,
AsyncInfoTy &AsyncInfo) {
if (!RTL->run_team_region_async || !RTL->synchronize)
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
LoopTripCount);
else
return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
TgtOffsets, TgtVarsSize, NumTeams,
ThreadLimit, LoopTripCount, AsyncInfo);
}
// Whether data can be copied to DstDevice directly
bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
if (RTL != DstDevice.RTL || !RTL->is_data_exchangable)
return false;
if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID))
return (RTL->data_exchange != nullptr) ||
(RTL->data_exchange_async != nullptr);
return false;
}
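// Complete all pending operations associated with AsyncInfo. Plugins without
// a synchronize entry point are treated as synchronous, so this trivially
// succeeds.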
int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
if (RTL->synchronize)
return RTL->synchronize(RTLDeviceID, AsyncInfo);
return OFFLOAD_SUCCESS;
}
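// Event management: each call below is forwarded to the plugin only if it
// implements the corresponding entry point; otherwise events are considered
// unsupported and the call trivially succeeds.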
int32_t DeviceTy::createEvent(void **Event) {
if (RTL->create_event)
return RTL->create_event(RTLDeviceID, Event);
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) {
if (RTL->record_event)
return RTL->record_event(RTLDeviceID, Event, AsyncInfo);
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) {
if (RTL->wait_event)
return RTL->wait_event(RTLDeviceID, Event, AsyncInfo);
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::syncEvent(void *Event) {
if (RTL->sync_event)
return RTL->sync_event(RTLDeviceID, Event);
return OFFLOAD_SUCCESS;
}
int32_t DeviceTy::destroyEvent(void *Event) {
if (RTL->destroy_event)
return RTL->destroy_event(RTLDeviceID, Event);
return OFFLOAD_SUCCESS;
}
/// Check whether a device has an associated RTL and initialize it if it's not
/// already initialized.
bool device_is_ready(int device_num) {
DP("Checking whether device %d is ready.\n", device_num);
// Devices.size() can only change while a new library is being registered,
// so acquire the RTLs mutex before reading it.
size_t DevicesSize;
{
std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);
DevicesSize = PM->Devices.size();
}
if (DevicesSize <= (size_t)device_num) {
DP("Device ID %d does not have a matching RTL\n", device_num);
return false;
}
// Get device info
DeviceTy &Device = *PM->Devices[device_num];
DP("Is the device %d (local ID %d) initialized? %d\n", device_num,
Device.RTLDeviceID, Device.IsInit);
// Init the device if not done before
if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) {
DP("Failed to init device %d\n", device_num);
return false;
}
DP("Device %d is ready to use.\n", device_num);
return true;
}