2017-12-06 21:59:09 +00:00
|
|
|
//===--------- device.cpp - Target independent OpenMP target RTL ----------===//
|
|
|
|
|
//
|
2019-01-19 10:56:40 +00:00
|
|
|
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
|
|
|
|
|
// See https://llvm.org/LICENSE.txt for license information.
|
|
|
|
|
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
|
2017-12-06 21:59:09 +00:00
|
|
|
//
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
//
|
|
|
|
|
// Functionality for managing devices that are handled by RTL plugins.
|
|
|
|
|
//
|
|
|
|
|
//===----------------------------------------------------------------------===//
|
|
|
|
|
|
|
|
|
|
#include "device.h"
|
2022-03-02 13:34:24 -06:00
|
|
|
#include "omptarget.h"
|
2017-12-06 21:59:09 +00:00
|
|
|
#include "private.h"
|
|
|
|
|
#include "rtl.h"
|
|
|
|
|
|
|
|
|
|
#include <cassert>
|
|
|
|
|
#include <climits>
|
2022-03-05 15:14:20 -06:00
|
|
|
#include <cstdint>
|
2020-09-15 15:04:37 -04:00
|
|
|
#include <cstdio>
|
2017-12-06 21:59:09 +00:00
|
|
|
#include <string>
|
2022-03-02 13:46:01 -06:00
|
|
|
#include <thread>
|
2017-12-06 21:59:09 +00:00
|
|
|
|
2022-03-02 13:34:24 -06:00
|
|
|
/// Record an event on \p AsyncInfo for this mapping entry, creating the event
/// first if the entry does not own one yet. Returns OFFLOAD_SUCCESS /
/// OFFLOAD_FAIL.
int HostDataToTargetTy::addEventIfNecessary(DeviceTy &Device,
                                            AsyncInfoTy &AsyncInfo) const {
  // Respect the user's choice to disable atomic map transfer/malloc/dealloc:
  // in that mode no events are used at all.
  if (!PM->UseEventsForAtomicTransfers)
    return OFFLOAD_SUCCESS;

  void *Ev = getEvent();
  const bool CreatedHere = !Ev;
  if (CreatedHere) {
    if (Device.createEvent(&Ev) != OFFLOAD_SUCCESS) {
      REPORT("Failed to create event\n");
      return OFFLOAD_FAIL;
    }
  }

  // The event may still be null here: we don't know whether the target
  // supports events. A target without event support is expected to make
  // recordEvent succeed regardless.
  if (Device.recordEvent(Ev, AsyncInfo) != OFFLOAD_SUCCESS) {
    REPORT("Failed to set dependence on event " DPxMOD "\n", DPxPTR(Ev));
    return OFFLOAD_FAIL;
  }

  // Only publish the event on the entry if this call created it.
  if (CreatedHere)
    setEvent(Ev);

  return OFFLOAD_SUCCESS;
}
|
|
|
|
|
|
[OpenMP] Introduce target memory manager
Target memory manager is introduced in this patch which aims to manage target
memory such that they will not be freed immediately when they are not used
because the overhead of memory allocation and free is very large. For CUDA
device, cuMemFree even blocks the context switch on device which affects
concurrent kernel execution.
The memory manager can be taken as a memory pool. It divides the pool into
multiple buckets according to the size such that memory allocation/free
distributed to different buckets will not affect each other.
In this version, we use the exact-equality policy to find a free buffer. This
is an open question: will best-fit work better here? IMO, best-fit is not good
for target memory management because computation on GPU usually requires GBs of
data. Best-fit might lead to a serious waste. For example, there is a free
buffer of size 1960MB, and now we need a buffer of size 1200MB. If best-fit,
the free buffer will be returned, leading to a 760MB waste.
The allocation will happen when there is no free memory left, and the memory
free on device will take place in the following two cases:
1. The program ends. Obviously. However, there is a little problem that plugin
library is destroyed before the memory manager is destroyed, leading to a fact
that the call to target plugin will not succeed.
2. Device is out of memory when we request a new memory. The manager will walk
through all free buffers from the bucket with largest base size, pick up one
buffer, free it, and try to allocate immediately. If it succeeds, it will
return right away rather than freeing all buffers in free list.
Update:
A threshold (8KB by default) is set such that users could control what size of memory
will be managed by the manager. It can also be configured by an environment variable
`LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD`.
Reviewed By: jdoerfert, ye-luo, JonChesterfield
Differential Revision: https://reviews.llvm.org/D81054
2020-08-19 23:12:02 -04:00
|
|
|
// Construct a device bound to the given plugin RTL. Both the global DeviceID
// and the plugin-local RTLDeviceID start at -1 (i.e. not yet assigned), and
// no plugin calls are made here; initialization happens later, guarded by
// IsInit/InitFlag.
DeviceTy::DeviceTy(RTLInfoTy *RTL)
    : DeviceID(-1), RTL(RTL), RTLDeviceID(-1), IsInit(false), InitFlag(),
      HasPendingGlobals(false), PendingCtorsDtors(), ShadowPtrMap(),
      PendingGlobalsMtx(), ShadowMtx() {}
|
[OpenMP] Introduce target memory manager
Target memory manager is introduced in this patch which aims to manage target
memory such that they will not be freed immediately when they are not used
because the overhead of memory allocation and free is very large. For CUDA
device, cuMemFree even blocks the context switch on device which affects
concurrent kernel execution.
The memory manager can be taken as a memory pool. It divides the pool into
multiple buckets according to the size such that memory allocation/free
distributed to different buckets will not affect each other.
In this version, we use the exact-equality policy to find a free buffer. This
is an open question: will best-fit work better here? IMO, best-fit is not good
for target memory management because computation on GPU usually requires GBs of
data. Best-fit might lead to a serious waste. For example, there is a free
buffer of size 1960MB, and now we need a buffer of size 1200MB. If best-fit,
the free buffer will be returned, leading to a 760MB waste.
The allocation will happen when there is no free memory left, and the memory
free on device will take place in the following two cases:
1. The program ends. Obviously. However, there is a little problem that plugin
library is destroyed before the memory manager is destroyed, leading to a fact
that the call to target plugin will not succeed.
2. Device is out of memory when we request a new memory. The manager will walk
through all free buffers from the bucket with largest base size, pick up one
buffer, free it, and try to allocate immediately. If it succeeds, it will
return right away rather than freeing all buffers in free list.
Update:
A threshold (8KB by default) is set such that users could control what size of memory
will be managed by the manager. It can also be configured by an environment variable
`LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD`.
Reviewed By: jdoerfert, ye-luo, JonChesterfield
Differential Revision: https://reviews.llvm.org/D81054
2020-08-19 23:12:02 -04:00
|
|
|
|
2020-09-15 15:04:37 -04:00
|
|
|
/// On destruction, optionally dump this device's pointer-mapping table.
DeviceTy::~DeviceTy() {
  // Only dump for devices that were actually assigned an ID, and only when
  // the user requested table dumps via the info level.
  const bool WantDump =
      DeviceID != -1 && (getInfoLevel() & OMP_INFOTYPE_DUMP_TABLE);
  if (!WantDump)
    return;

  ident_t Loc = {0, 0, 0, 0, ";libomptarget;libomptarget;0;0;;"};
  dumpTargetPointerMappings(&Loc, *this);
}
|
[OpenMP] Introduce target memory manager
Target memory manager is introduced in this patch which aims to manage target
memory such that they will not be freed immediately when they are not used
because the overhead of memory allocation and free is very large. For CUDA
device, cuMemFree even blocks the context switch on device which affects
concurrent kernel execution.
The memory manager can be taken as a memory pool. It divides the pool into
multiple buckets according to the size such that memory allocation/free
distributed to different buckets will not affect each other.
In this version, we use the exact-equality policy to find a free buffer. This
is an open question: will best-fit work better here? IMO, best-fit is not good
for target memory management because computation on GPU usually requires GBs of
data. Best-fit might lead to a serious waste. For example, there is a free
buffer of size 1960MB, and now we need a buffer of size 1200MB. If best-fit,
the free buffer will be returned, leading to a 760MB waste.
The allocation will happen when there is no free memory left, and the memory
free on device will take place in the following two cases:
1. The program ends. Obviously. However, there is a little problem that plugin
library is destroyed before the memory manager is destroyed, leading to a fact
that the call to target plugin will not succeed.
2. Device is out of memory when we request a new memory. The manager will walk
through all free buffers from the bucket with largest base size, pick up one
buffer, free it, and try to allocate immediately. If it succeeds, it will
return right away rather than freeing all buffers in free list.
Update:
A threshold (8KB by default) is set such that users could control what size of memory
will be managed by the manager. It can also be configured by an environment variable
`LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD`.
Reviewed By: jdoerfert, ye-luo, JonChesterfield
Differential Revision: https://reviews.llvm.org/D81054
2020-08-19 23:12:02 -04:00
|
|
|
|
2017-12-06 21:59:09 +00:00
|
|
|
int DeviceTy::associatePtr(void *HstPtrBegin, void *TgtPtrBegin, int64_t Size) {
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
|
2017-12-06 21:59:09 +00:00
|
|
|
|
|
|
|
|
// Check if entry exists
|
2022-03-05 15:14:20 -06:00
|
|
|
auto It = HDTTMap->find(HstPtrBegin);
|
|
|
|
|
if (It != HDTTMap->end()) {
|
|
|
|
|
HostDataToTargetTy &HDTT = *It->HDTT;
|
2020-06-24 12:18:00 -04:00
|
|
|
// Mapping already exists
|
2022-03-05 15:14:20 -06:00
|
|
|
bool isValid = HDTT.HstPtrEnd == (uintptr_t)HstPtrBegin + Size &&
|
|
|
|
|
HDTT.TgtPtrBegin == (uintptr_t)TgtPtrBegin;
|
2020-06-24 12:18:00 -04:00
|
|
|
if (isValid) {
|
|
|
|
|
DP("Attempt to re-associate the same device ptr+offset with the same "
|
|
|
|
|
"host ptr, nothing to do\n");
|
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
} else {
|
2020-08-26 18:11:26 -04:00
|
|
|
REPORT("Not allowed to re-associate a different device ptr+offset with "
|
|
|
|
|
"the same host ptr\n");
|
2020-06-24 12:18:00 -04:00
|
|
|
return OFFLOAD_FAIL;
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2020-01-14 16:30:38 -08:00
|
|
|
// Mapping does not exist, allocate it with refCount=INF
|
[OpenMP] Improve ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(release: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, updated RefCount=1
```
There are two problems in this example:
* `RefCount` is not reported when a mapping is created, but it might
be 1 or infinite. In this case, because it's created by `omp target
enter data`, it's 1. Seeing that would make later `RefCount`
messages easier to understand.
* `RefCount` is still 1 at the `omp target exit data`, but it's
reported as `updated`. The reason it's still 1 is that, upon
deletions, the reference count is generally not updated in
`DeviceTy::getTgtPtrBegin`, where the report is produced. Instead,
it's zeroed later in `DeviceTy::deallocTgtPtr`, where it's actually
removed from the mapping table.
This patch makes the following changes:
* Report the reference count when creating a mapping.
* Where an existing mapping is reported, always report a reference
count action:
* `update suppressed` when `UpdateRefCount=false`
* `incremented`
* `decremented`
* `deferred final decrement`, which replaces the misleading
`updated` in the above example
* Add comments to `DeviceTy::getTgtPtrBegin` to explain why it does
not zero the reference count. (Please advise if these comments miss
the point.)
* For unified shared memory, don't report confusing messages like
`RefCount=` or `RefCount= updated` given that reference counts are
irrelevant in this case. Instead, just report `for unified shared
memory`.
* Use `INFO` not `DP` consistently for `Mapping exists` messages.
* Fix device table dumps to print `INF` instead of `-1` for an
infinite reference count.
Reviewed By: jhuber6, grokos
Differential Revision: https://reviews.llvm.org/D104559
2021-06-23 09:37:54 -04:00
|
|
|
const HostDataToTargetTy &newEntry =
|
2022-03-05 15:14:20 -06:00
|
|
|
*HDTTMap
|
|
|
|
|
->emplace(new HostDataToTargetTy(
|
[OpenMP] Improve ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(release: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, updated RefCount=1
```
There are two problems in this example:
* `RefCount` is not reported when a mapping is created, but it might
be 1 or infinite. In this case, because it's created by `omp target
enter data`, it's 1. Seeing that would make later `RefCount`
messages easier to understand.
* `RefCount` is still 1 at the `omp target exit data`, but it's
reported as `updated`. The reason it's still 1 is that, upon
deletions, the reference count is generally not updated in
`DeviceTy::getTgtPtrBegin`, where the report is produced. Instead,
it's zeroed later in `DeviceTy::deallocTgtPtr`, where it's actually
removed from the mapping table.
This patch makes the following changes:
* Report the reference count when creating a mapping.
* Where an existing mapping is reported, always report a reference
count action:
* `update suppressed` when `UpdateRefCount=false`
* `incremented`
* `decremented`
* `deferred final decrement`, which replaces the misleading
`updated` in the above example
* Add comments to `DeviceTy::getTgtPtrBegin` to explain why it does
not zero the reference count. (Please advise if these comments miss
the point.)
* For unified shared memory, don't report confusing messages like
`RefCount=` or `RefCount= updated` given that reference counts are
irrelevant in this case. Instead, just report `for unified shared
memory`.
* Use `INFO` not `DP` consistently for `Mapping exists` messages.
* Fix device table dumps to print `INF` instead of `-1` for an
infinite reference count.
Reviewed By: jhuber6, grokos
Differential Revision: https://reviews.llvm.org/D104559
2021-06-23 09:37:54 -04:00
|
|
|
/*HstPtrBase=*/(uintptr_t)HstPtrBegin,
|
|
|
|
|
/*HstPtrBegin=*/(uintptr_t)HstPtrBegin,
|
|
|
|
|
/*HstPtrEnd=*/(uintptr_t)HstPtrBegin + Size,
|
2021-08-31 15:21:16 -04:00
|
|
|
/*TgtPtrBegin=*/(uintptr_t)TgtPtrBegin,
|
|
|
|
|
/*UseHoldRefCount=*/false, /*Name=*/nullptr,
|
2022-03-05 15:14:20 -06:00
|
|
|
/*IsRefCountINF=*/true))
|
|
|
|
|
.first->HDTT;
|
2021-02-10 11:45:39 -06:00
|
|
|
DP("Creating new map entry: HstBase=" DPxMOD ", HstBegin=" DPxMOD
|
2021-08-31 15:21:16 -04:00
|
|
|
", HstEnd=" DPxMOD ", TgtBegin=" DPxMOD ", DynRefCount=%s, "
|
|
|
|
|
"HoldRefCount=%s\n",
|
[OpenMP] Improve ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(release: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, updated RefCount=1
```
There are two problems in this example:
* `RefCount` is not reported when a mapping is created, but it might
be 1 or infinite. In this case, because it's created by `omp target
enter data`, it's 1. Seeing that would make later `RefCount`
messages easier to understand.
* `RefCount` is still 1 at the `omp target exit data`, but it's
reported as `updated`. The reason it's still 1 is that, upon
deletions, the reference count is generally not updated in
`DeviceTy::getTgtPtrBegin`, where the report is produced. Instead,
it's zeroed later in `DeviceTy::deallocTgtPtr`, where it's actually
removed from the mapping table.
This patch makes the following changes:
* Report the reference count when creating a mapping.
* Where an existing mapping is reported, always report a reference
count action:
* `update suppressed` when `UpdateRefCount=false`
* `incremented`
* `decremented`
* `deferred final decrement`, which replaces the misleading
`updated` in the above example
* Add comments to `DeviceTy::getTgtPtrBegin` to explain why it does
not zero the reference count. (Please advise if these comments miss
the point.)
* For unified shared memory, don't report confusing messages like
`RefCount=` or `RefCount= updated` given that reference counts are
irrelevant in this case. Instead, just report `for unified shared
memory`.
* Use `INFO` not `DP` consistently for `Mapping exists` messages.
* Fix device table dumps to print `INF` instead of `-1` for an
infinite reference count.
Reviewed By: jhuber6, grokos
Differential Revision: https://reviews.llvm.org/D104559
2021-06-23 09:37:54 -04:00
|
|
|
DPxPTR(newEntry.HstPtrBase), DPxPTR(newEntry.HstPtrBegin),
|
|
|
|
|
DPxPTR(newEntry.HstPtrEnd), DPxPTR(newEntry.TgtPtrBegin),
|
2021-08-31 15:21:16 -04:00
|
|
|
newEntry.dynRefCountToStr().c_str(), newEntry.holdRefCountToStr().c_str());
|
2021-07-26 09:54:30 +01:00
|
|
|
(void)newEntry;
|
2017-12-06 21:59:09 +00:00
|
|
|
|
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int DeviceTy::disassociatePtr(void *HstPtrBegin) {
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
|
2017-12-06 21:59:09 +00:00
|
|
|
|
2022-03-05 15:14:20 -06:00
|
|
|
auto It = HDTTMap->find(HstPtrBegin);
|
|
|
|
|
if (It != HDTTMap->end()) {
|
|
|
|
|
HostDataToTargetTy &HDTT = *It->HDTT;
|
2020-06-24 12:18:00 -04:00
|
|
|
// Mapping exists
|
2022-03-05 15:14:20 -06:00
|
|
|
if (HDTT.getHoldRefCount()) {
|
2021-08-31 15:21:16 -04:00
|
|
|
// This is based on OpenACC 3.1, sec 3.2.33 "acc_unmap_data", L3656-3657:
|
|
|
|
|
// "It is an error to call acc_unmap_data if the structured reference
|
|
|
|
|
// count for the pointer is not zero."
|
|
|
|
|
REPORT("Trying to disassociate a pointer with a non-zero hold reference "
|
|
|
|
|
"count\n");
|
2022-03-05 15:14:20 -06:00
|
|
|
} else if (HDTT.isDynRefCountInf()) {
|
2020-06-24 12:18:00 -04:00
|
|
|
DP("Association found, removing it\n");
|
2022-03-05 15:14:20 -06:00
|
|
|
void *Event = HDTT.getEvent();
|
|
|
|
|
delete &HDTT;
|
2022-01-05 20:19:55 -05:00
|
|
|
if (Event)
|
|
|
|
|
destroyEvent(Event);
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMap->erase(It);
|
2020-06-24 12:18:00 -04:00
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
} else {
|
2020-08-26 18:11:26 -04:00
|
|
|
REPORT("Trying to disassociate a pointer which was not mapped via "
|
|
|
|
|
"omp_target_associate_ptr\n");
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
2021-08-31 15:21:16 -04:00
|
|
|
} else {
|
|
|
|
|
REPORT("Association not found\n");
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Mapping not found
|
|
|
|
|
return OFFLOAD_FAIL;
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-05 15:14:20 -06:00
|
|
|
/// Look up the mapping-table entry (if any) for the host range
/// [HstPtrBegin, HstPtrBegin + Size). The result's flags distinguish a range
/// fully contained in an entry from one that overlaps an entry's start
/// (ExtendsBefore) or end (ExtendsAfter). Size == 0 uses the relaxed
/// zero-length-array-section rules. Callers must already hold the map
/// accessor \p HDTTMap.
LookupResult DeviceTy::lookupMapping(HDTTMapAccessorTy &HDTTMap,
                                     void *HstPtrBegin, int64_t Size) {

  uintptr_t hp = (uintptr_t)HstPtrBegin;
  LookupResult lr;

  DP("Looking up mapping(HstPtrBegin=" DPxMOD ", Size=%" PRId64 ")...\n",
     DPxPTR(hp), Size);

  if (HDTTMap->empty())
    return lr;

  // First entry whose HstPtrBegin is strictly greater than hp; the candidate
  // entries are this one and its predecessor.
  auto upper = HDTTMap->upper_bound(hp);

  if (Size == 0) {
    // Zero-length array section: see OpenMP specification v5.1, "Pointer
    // Initialization for Device Data Environments".
    // upper_bound satisfies
    // std::prev(upper)->HDTT.HstPtrBegin <= hp < upper->HDTT.HstPtrBegin
    if (upper != HDTTMap->begin()) {
      lr.Entry = std::prev(upper)->HDTT;
      auto &HT = *lr.Entry;
      // The left side of the extended address range is satisfied:
      // hp >= HT.HstPtrBegin || hp >= HT.HstPtrBase
      lr.Flags.IsContained = hp < HT.HstPtrEnd || hp < HT.HstPtrBase;
    }

    if (!lr.Flags.IsContained && upper != HDTTMap->end()) {
      lr.Entry = upper->HDTT;
      auto &HT = *lr.Entry;
      // The right side of the extended address range is satisfied:
      // hp < HT.HstPtrEnd || hp < HT.HstPtrBase
      lr.Flags.IsContained = hp >= HT.HstPtrBase;
    }
  } else {
    // Check the left bin: the entry starting at or before hp.
    if (upper != HDTTMap->begin()) {
      lr.Entry = std::prev(upper)->HDTT;
      auto &HT = *lr.Entry;
      // Is the whole section inside this entry?
      lr.Flags.IsContained = hp >= HT.HstPtrBegin && hp < HT.HstPtrEnd &&
                             (hp + Size) <= HT.HstPtrEnd;
      // Does it start inside but run past the entry's end?
      lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp + Size) > HT.HstPtrEnd;
    }

    // Check the right bin: the first entry starting after hp (only if the
    // left bin didn't already classify the section).
    if (!(lr.Flags.IsContained || lr.Flags.ExtendsAfter) &&
        upper != HDTTMap->end()) {
      lr.Entry = upper->HDTT;
      auto &HT = *lr.Entry;
      // Does the section start before this entry but reach into it?
      lr.Flags.ExtendsBefore =
          hp < HT.HstPtrBegin && (hp + Size) > HT.HstPtrBegin;
      // Does it also run past this entry's end?
      lr.Flags.ExtendsAfter = hp < HT.HstPtrEnd && (hp + Size) > HT.HstPtrEnd;
    }

    if (lr.Flags.ExtendsBefore) {
      DP("WARNING: Pointer is not mapped but section extends into already "
         "mapped data\n");
    }
    if (lr.Flags.ExtendsAfter) {
      DP("WARNING: Pointer is already mapped but section extends beyond mapped "
         "region\n");
    }
  }

  return lr;
}
|
|
|
|
|
|
2022-03-02 13:34:24 -06:00
|
|
|
TargetPointerResultTy DeviceTy::getTargetPointer(
|
|
|
|
|
void *HstPtrBegin, void *HstPtrBase, int64_t Size,
|
|
|
|
|
map_var_info_t HstPtrName, bool HasFlagTo, bool HasFlagAlways,
|
|
|
|
|
bool IsImplicit, bool UpdateRefCount, bool HasCloseModifier,
|
|
|
|
|
bool HasPresentModifier, bool HasHoldModifier, AsyncInfoTy &AsyncInfo) {
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
|
|
|
|
|
|
[OpenMP][Offloading] Fix data race in data mapping by using two locks
This patch tries to partially fix one of the two data race issues reported in
[1] by introducing a per-entry mutex. Additional discussion can also be found in
D104418, which will also be refined to fix another data race problem.
Here is how it works. Like before, `DataMapMtx` is still being used for mapping
table lookup and update. In any case, we will get a table entry. If we need to
make a data transfer (update the data on the device), we need to lock the entry
right before releasing `DataMapMtx`, and the issue of data transfer should be
after releasing `DataMapMtx`, and the entry is unlocked afterwards. This can
guarantee that: 1) issue of data movement is not in critical region, which will
not affect performance too much, and also will not affect other threads that don't
touch the same entry; 2) if another thread accesses the same entry, the state of
data movement is consistent (which requires that a thread must first get the
update lock before getting data movement information).
For a target that doesn't support async data transfer, issue of data movement is
data transfer. This two-lock design can potentially improve concurrency compared
with the design that guards data movement with `DataMapMtx` as well. For a target
that supports async data movement, we could simply attach the event between the
issue of data movement and unlock the entry. For a thread that wants to get the
event, it must first get the lock. This can also get rid of the busy wait until
the event pointer is valid.
Reference:
[1] https://bugs.llvm.org/show_bug.cgi?id=49940
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104555
2021-07-23 16:10:42 -04:00
|
|
|
void *TargetPointer = nullptr;
|
2021-07-01 12:31:45 -04:00
|
|
|
bool IsHostPtr = false;
|
[OpenMP][Offloading] Fix data race in data mapping by using two locks
This patch tries to partially fix one of the two data race issues reported in
[1] by introducing a per-entry mutex. Additional discussion can also be found in
D104418, which will also be refined to fix another data race problem.
Here is how it works. Like before, `DataMapMtx` is still being used for mapping
table lookup and update. In any case, we will get a table entry. If we need to
make a data transfer (update the data on the device), we need to lock the entry
right before releasing `DataMapMtx`, and the issue of data transfer should be
after releasing `DataMapMtx`, and the entry is unlocked afterwards. This can
guarantee that: 1) issue of data movement is not in critical region, which will
not affect performance too much, and also will not affect other threads that don't
touch the same entry; 2) if another thread accesses the same entry, the state of
data movement is consistent (which requires that a thread must first get the
update lock before getting data movement information).
For a target that doesn't support async data transfer, issue of data movement is
data transfer. This two-lock design can potentially improve concurrency compared
with the design that guards data movement with `DataMapMtx` as well. For a target
that supports async data movement, we could simply attach the event between the
issue of data movement and unlock the entry. For a thread that wants to get the
event, it must first get the lock. This can also get rid of the busy wait until
the event pointer is valid.
Reference:
[1] https://bugs.llvm.org/show_bug.cgi?id=49940
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104555
2021-07-23 16:10:42 -04:00
|
|
|
bool IsNew = false;
|
|
|
|
|
|
2022-03-05 15:14:20 -06:00
|
|
|
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
|
|
|
|
|
auto *Entry = LR.Entry;
|
2017-12-06 21:59:09 +00:00
|
|
|
|
|
|
|
|
// Check if the pointer is contained.
|
2019-08-07 17:29:45 +00:00
|
|
|
// If a variable is mapped to the device manually by the user - which would
|
|
|
|
|
// lead to the IsContained flag to be true - then we must ensure that the
|
|
|
|
|
// device address is returned even under unified memory conditions.
|
2021-07-01 12:31:45 -04:00
|
|
|
if (LR.Flags.IsContained ||
|
|
|
|
|
((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && IsImplicit)) {
|
|
|
|
|
auto &HT = *LR.Entry;
|
2021-08-31 15:21:16 -04:00
|
|
|
const char *RefCountAction;
|
|
|
|
|
if (UpdateRefCount) {
|
2022-03-02 13:46:01 -06:00
|
|
|
// After this, reference count >= 1. If the reference count was 0 but the
|
|
|
|
|
// entry was still there we can reuse the data on the device and avoid a
|
|
|
|
|
// new submission.
|
2021-08-31 15:21:16 -04:00
|
|
|
HT.incRefCount(HasHoldModifier);
|
|
|
|
|
RefCountAction = " (incremented)";
|
|
|
|
|
} else {
|
[OpenMP] Avoid checking parent reference count in targetDataBegin
This patch is an attempt to do for `targetDataBegin` what D104924 does
for `targetDataEnd`:
* Eliminates a lock/unlock of the data mapping table.
* Clarifies the logic that determines whether a struct member's
host-to-device transfer occurs. The old logic, which checks the
parent struct's reference count, is a leftover from back when we had
a different map interface (as pointed out at
<https://reviews.llvm.org/D104924#2846972>).
Additionally, it eliminates the `DeviceTy::getMapEntryRefCnt`, which
is no longer used after this patch.
While D104924 does not change the computation of `IsLast`, I found I
needed to change the computation of `IsNew` for this patch. As far as
I can tell, the change is correct, and this patch does not cause any
additional `openmp` tests to fail. However, I'm not sure I've thought
of all use cases. Please advise.
Reviewed By: jdoerfert, jhuber6, protze.joachim, tianshilei1992, grokos, RaviNarayanaswamy
Differential Revision: https://reviews.llvm.org/D105121
2021-07-10 12:01:45 -04:00
|
|
|
// It might have been allocated with the parent, but it's still new.
|
2021-08-31 15:21:16 -04:00
|
|
|
IsNew = HT.getTotalRefCount() == 1;
|
|
|
|
|
RefCountAction = " (update suppressed)";
|
|
|
|
|
}
|
|
|
|
|
const char *DynRefCountAction = HasHoldModifier ? "" : RefCountAction;
|
|
|
|
|
const char *HoldRefCountAction = HasHoldModifier ? RefCountAction : "";
|
2021-07-01 12:31:45 -04:00
|
|
|
uintptr_t Ptr = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
|
2021-01-21 09:59:29 -05:00
|
|
|
INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID,
|
|
|
|
|
"Mapping exists%s with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
|
2021-08-31 15:21:16 -04:00
|
|
|
", Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s, Name=%s\n",
|
2021-07-01 12:31:45 -04:00
|
|
|
(IsImplicit ? " (implicit)" : ""), DPxPTR(HstPtrBegin), DPxPTR(Ptr),
|
2021-08-31 15:21:16 -04:00
|
|
|
Size, HT.dynRefCountToStr().c_str(), DynRefCountAction,
|
|
|
|
|
HT.holdRefCountToStr().c_str(), HoldRefCountAction,
|
2021-01-21 09:59:29 -05:00
|
|
|
(HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
|
2021-07-01 12:31:45 -04:00
|
|
|
TargetPointer = (void *)Ptr;
|
|
|
|
|
} else if ((LR.Flags.ExtendsBefore || LR.Flags.ExtendsAfter) && !IsImplicit) {
|
2017-12-06 21:59:09 +00:00
|
|
|
// Explicit extension of mapped data - not allowed.
|
2020-08-05 16:47:29 -04:00
|
|
|
MESSAGE("explicit extension not allowed: host address specified is " DPxMOD
|
2021-02-10 11:45:39 -06:00
|
|
|
" (%" PRId64
|
|
|
|
|
" bytes), but device allocation maps to host at " DPxMOD
|
|
|
|
|
" (%" PRId64 " bytes)",
|
2021-07-01 12:31:45 -04:00
|
|
|
DPxPTR(HstPtrBegin), Size, DPxPTR(Entry->HstPtrBegin),
|
|
|
|
|
Entry->HstPtrEnd - Entry->HstPtrBegin);
|
2020-08-05 16:47:29 -04:00
|
|
|
if (HasPresentModifier)
|
|
|
|
|
MESSAGE("device mapping required by 'present' map type modifier does not "
|
|
|
|
|
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
|
|
|
|
|
DPxPTR(HstPtrBegin), Size);
|
2020-10-30 01:04:34 -04:00
|
|
|
} else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY &&
|
2020-07-22 14:04:58 -04:00
|
|
|
!HasCloseModifier) {
|
|
|
|
|
// If unified shared memory is active, implicitly mapped variables that are
|
|
|
|
|
// not privatized use host address. Any explicitly mapped variables also use
|
|
|
|
|
// host address where correctness is not impeded. In all other cases maps
|
|
|
|
|
// are respected.
|
|
|
|
|
// In addition to the mapping rules above, the close map modifier forces the
|
|
|
|
|
// mapping of the variable to the device.
|
|
|
|
|
if (Size) {
|
[OpenMP] Improve ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(release: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, updated RefCount=1
```
There are two problems in this example:
* `RefCount` is not reported when a mapping is created, but it might
be 1 or infinite. In this case, because it's created by `omp target
enter data`, it's 1. Seeing that would make later `RefCount`
messages easier to understand.
* `RefCount` is still 1 at the `omp target exit data`, but it's
reported as `updated`. The reason it's still 1 is that, upon
deletions, the reference count is generally not updated in
`DeviceTy::getTgtPtrBegin`, where the report is produced. Instead,
it's zeroed later in `DeviceTy::deallocTgtPtr`, where it's actually
removed from the mapping table.
This patch makes the following changes:
* Report the reference count when creating a mapping.
* Where an existing mapping is reported, always report a reference
count action:
* `update suppressed` when `UpdateRefCount=false`
* `incremented`
* `decremented`
* `deferred final decrement`, which replaces the misleading
`updated` in the above example
* Add comments to `DeviceTy::getTgtPtrBegin` to explain why it does
not zero the reference count. (Please advise if these comments miss
the point.)
* For unified shared memory, don't report confusing messages like
`RefCount=` or `RefCount= updated` given that reference counts are
irrelevant in this case. Instead, just report `for unified shared
memory`.
* Use `INFO` not `DP` consistently for `Mapping exists` messages.
* Fix device table dumps to print `INF` instead of `-1` for an
infinite reference count.
Reviewed By: jhuber6, grokos
Differential Revision: https://reviews.llvm.org/D104559
2021-06-23 09:37:54 -04:00
|
|
|
DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
|
|
|
|
|
"memory\n",
|
|
|
|
|
DPxPTR((uintptr_t)HstPtrBegin), Size);
|
2019-08-07 17:29:45 +00:00
|
|
|
IsHostPtr = true;
|
2021-07-01 12:31:45 -04:00
|
|
|
TargetPointer = HstPtrBegin;
|
2019-08-07 17:29:45 +00:00
|
|
|
}
|
2020-07-22 14:04:58 -04:00
|
|
|
} else if (HasPresentModifier) {
|
|
|
|
|
DP("Mapping required by 'present' map type modifier does not exist for "
|
2020-08-05 13:06:27 -07:00
|
|
|
"HstPtrBegin=" DPxMOD ", Size=%" PRId64 "\n",
|
2020-07-22 14:04:58 -04:00
|
|
|
DPxPTR(HstPtrBegin), Size);
|
|
|
|
|
MESSAGE("device mapping required by 'present' map type modifier does not "
|
2020-08-05 13:06:27 -07:00
|
|
|
"exist for host address " DPxMOD " (%" PRId64 " bytes)",
|
2020-07-22 14:04:58 -04:00
|
|
|
DPxPTR(HstPtrBegin), Size);
|
|
|
|
|
} else if (Size) {
|
|
|
|
|
// If it is not contained and Size > 0, we should create a new entry for it.
|
|
|
|
|
IsNew = true;
|
2021-07-01 12:31:45 -04:00
|
|
|
uintptr_t Ptr = (uintptr_t)allocData(Size, HstPtrBegin);
|
2022-03-05 15:14:20 -06:00
|
|
|
Entry = HDTTMap
|
|
|
|
|
->emplace(new HostDataToTargetTy(
|
|
|
|
|
(uintptr_t)HstPtrBase, (uintptr_t)HstPtrBegin,
|
|
|
|
|
(uintptr_t)HstPtrBegin + Size, Ptr, HasHoldModifier,
|
|
|
|
|
HstPtrName))
|
|
|
|
|
.first->HDTT;
|
2021-04-15 17:27:17 -04:00
|
|
|
INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
|
2022-04-18 14:08:16 -04:00
|
|
|
"Creating new map entry with HstPtrBase=" DPxMOD
|
2022-04-07 21:06:09 -05:00
|
|
|
", HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", Size=%ld, "
|
2021-08-31 15:21:16 -04:00
|
|
|
"DynRefCount=%s, HoldRefCount=%s, Name=%s\n",
|
2022-04-07 21:06:09 -05:00
|
|
|
DPxPTR(HstPtrBase), DPxPTR(HstPtrBegin), DPxPTR(Ptr), Size,
|
2021-08-31 15:21:16 -04:00
|
|
|
Entry->dynRefCountToStr().c_str(), Entry->holdRefCountToStr().c_str(),
|
2021-04-15 17:27:17 -04:00
|
|
|
(HstPtrName) ? getNameFromMapping(HstPtrName).c_str() : "unknown");
|
2021-07-01 12:31:45 -04:00
|
|
|
TargetPointer = (void *)Ptr;
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
[OpenMP][Offloading] Fix data race in data mapping by using two locks
This patch tries to partially fix one of the two data race issues reported in
[1] by introducing a per-entry mutex. Additional discussion can also be found in
D104418, which will also be refined to fix another data race problem.
Here is how it works. Like before, `DataMapMtx` is still being used for mapping
table lookup and update. In any case, we will get a table entry. If we need to
make a data transfer (update the data on the device), we need to lock the entry
right before releasing `DataMapMtx`, and the issue of data transfer should be
after releasing `DataMapMtx`, and the entry is unlocked afterwards. This can
guarantee that: 1) issue of data movement is not in critical region, which will
not affect performance too much, and also will not affect other threads that don't
touch the same entry; 2) if another thread accesses the same entry, the state of
data movement is consistent (which requires that a thread must first get the
update lock before getting data movement information).
For a target that doesn't support async data transfer, issue of data movement is
data transfer. This two-lock design can potentially improve concurrency compared
with the design that guards data movement with `DataMapMtx` as well. For a target
that supports async data movement, we could simply attach the event between the
issue of data movement and unlock the entry. For a thread that wants to get the
event, it must first get the lock. This can also get rid of the busy wait until
the event pointer is valid.
Reference:
[1] https://bugs.llvm.org/show_bug.cgi?id=49940
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104555
2021-07-23 16:10:42 -04:00
|
|
|
// If the target pointer is valid, and we need to transfer data, issue the
|
|
|
|
|
// data transfer.
|
2021-09-01 16:24:34 -04:00
|
|
|
if (TargetPointer && !IsHostPtr && HasFlagTo && (IsNew || HasFlagAlways)) {
|
[OpenMP][Offloading] Fix data race in data mapping by using two locks
This patch tries to partially fix one of the two data race issues reported in
[1] by introducing a per-entry mutex. Additional discussion can also be found in
D104418, which will also be refined to fix another data race problem.
Here is how it works. Like before, `DataMapMtx` is still being used for mapping
table lookup and update. In any case, we will get a table entry. If we need to
make a data transfer (update the data on the device), we need to lock the entry
right before releasing `DataMapMtx`, and the issue of data transfer should be
after releasing `DataMapMtx`, and the entry is unlocked afterwards. This can
guarantee that: 1) issue of data movement is not in critical region, which will
not affect performance too much, and also will not affect other threads that don't
touch the same entry; 2) if another thread accesses the same entry, the state of
data movement is consistent (which requires that a thread must first get the
update lock before getting data movement information).
For a target that doesn't support async data transfer, issue of data movement is
data transfer. This two-lock design can potentially improve concurrency compared
with the design that guards data movement with `DataMapMtx` as well. For a target
that supports async data movement, we could simply attach the event between the
issue of data movement and unlock the entry. For a thread that wants to get the
event, it must first get the lock. This can also get rid of the busy wait until
the event pointer is valid.
Reference:
[1] https://bugs.llvm.org/show_bug.cgi?id=49940
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104555
2021-07-23 16:10:42 -04:00
|
|
|
// Lock the entry before releasing the mapping table lock such that another
|
|
|
|
|
// thread that could issue data movement will get the right result.
|
2022-03-02 13:34:24 -06:00
|
|
|
std::lock_guard<decltype(*Entry)> LG(*Entry);
|
[OpenMP][Offloading] Fix data race in data mapping by using two locks
This patch tries to partially fix one of the two data race issues reported in
[1] by introducing a per-entry mutex. Additional discussion can also be found in
D104418, which will also be refined to fix another data race problem.
Here is how it works. Like before, `DataMapMtx` is still being used for mapping
table lookup and update. In any case, we will get a table entry. If we need to
make a data transfer (update the data on the device), we need to lock the entry
right before releasing `DataMapMtx`, and the issue of data transfer should be
after releasing `DataMapMtx`, and the entry is unlocked afterwards. This can
guarantee that: 1) issue of data movement is not in critical region, which will
not affect performance too much, and also will not affect other threads that don't
touch the same entry; 2) if another thread accesses the same entry, the state of
data movement is consistent (which requires that a thread must first get the
update lock before getting data movement information).
For a target that doesn't support async data transfer, issue of data movement is
data transfer. This two-lock design can potentially improve concurrency compared
with the design that guards data movement with `DataMapMtx` as well. For a target
that supports async data movement, we could simply attach the event between the
issue of data movement and unlock the entry. For a thread that wants to get the
event, it must first get the lock. This can also get rid of the busy wait until
the event pointer is valid.
Reference:
[1] https://bugs.llvm.org/show_bug.cgi?id=49940
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104555
2021-07-23 16:10:42 -04:00
|
|
|
// Release the mapping table lock right after the entry is locked.
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMap.destroy();
|
[OpenMP][Offloading] Fix data race in data mapping by using two locks
This patch tries to partially fix one of the two data race issues reported in
[1] by introducing a per-entry mutex. Additional discussion can also be found in
D104418, which will also be refined to fix another data race problem.
Here is how it works. Like before, `DataMapMtx` is still being used for mapping
table lookup and update. In any case, we will get a table entry. If we need to
make a data transfer (update the data on the device), we need to lock the entry
right before releasing `DataMapMtx`, and the issue of data transfer should be
after releasing `DataMapMtx`, and the entry is unlocked afterwards. This can
guarantee that: 1) issue of data movement is not in critical region, which will
not affect performance too much, and also will not affect other threads that don't
touch the same entry; 2) if another thread accesses the same entry, the state of
data movement is consistent (which requires that a thread must first get the
update lock before getting data movement information).
For a target that doesn't support async data transfer, issue of data movement is
data transfer. This two-lock design can potentially improve concurrency compared
with the design that guards data movement with `DataMapMtx` as well. For a target
that supports async data movement, we could simply attach the event between the
issue of data movement and unlock the entry. For a thread that wants to get the
event, it must first get the lock. This can also get rid of the busy wait until
the event pointer is valid.
Reference:
[1] https://bugs.llvm.org/show_bug.cgi?id=49940
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104555
2021-07-23 16:10:42 -04:00
|
|
|
|
|
|
|
|
DP("Moving %" PRId64 " bytes (hst:" DPxMOD ") -> (tgt:" DPxMOD ")\n", Size,
|
|
|
|
|
DPxPTR(HstPtrBegin), DPxPTR(TargetPointer));
|
|
|
|
|
|
|
|
|
|
int Ret = submitData(TargetPointer, HstPtrBegin, Size, AsyncInfo);
|
|
|
|
|
if (Ret != OFFLOAD_SUCCESS) {
|
|
|
|
|
REPORT("Copying data to device failed.\n");
|
|
|
|
|
// We will also return nullptr if the data movement fails because that
|
|
|
|
|
// pointer points to a corrupted memory region so it doesn't make any
|
|
|
|
|
// sense to continue to use it.
|
|
|
|
|
TargetPointer = nullptr;
|
2022-03-02 13:34:24 -06:00
|
|
|
} else if (Entry->addEventIfNecessary(*this, AsyncInfo) != OFFLOAD_SUCCESS)
|
2022-01-05 20:19:55 -05:00
|
|
|
return {{false /* IsNewEntry */, false /* IsHostPointer */},
|
2022-03-05 15:14:20 -06:00
|
|
|
nullptr /* Entry */,
|
2022-01-05 20:19:55 -05:00
|
|
|
nullptr /* TargetPointer */};
|
[OpenMP][Offloading] Fix data race in data mapping by using two locks
This patch tries to partially fix one of the two data race issues reported in
[1] by introducing a per-entry mutex. Additional discussion can also be found in
D104418, which will also be refined to fix another data race problem.
Here is how it works. Like before, `DataMapMtx` is still being used for mapping
table lookup and update. In any case, we will get a table entry. If we need to
make a data transfer (update the data on the device), we need to lock the entry
right before releasing `DataMapMtx`, and the issue of data transfer should be
after releasing `DataMapMtx`, and the entry is unlocked afterwards. This can
guarantee that: 1) issue of data movement is not in critical region, which will
not affect performance too much, and also will not affect other threads that don't
touch the same entry; 2) if another thread accesses the same entry, the state of
data movement is consistent (which requires that a thread must first get the
update lock before getting data movement information).
For a target that doesn't support async data transfer, issue of data movement is
data transfer. This two-lock design can potentially improve concurrency compared
with the design that guards data movement with `DataMapMtx` as well. For a target
that supports async data movement, we could simply attach the event between the
issue of data movement and unlock the entry. For a thread that wants to get the
event, it must first get the lock. This can also get rid of the busy wait until
the event pointer is valid.
Reference:
[1] https://bugs.llvm.org/show_bug.cgi?id=49940
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104555
2021-07-23 16:10:42 -04:00
|
|
|
} else {
|
|
|
|
|
// Release the mapping table lock directly.
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMap.destroy();
|
2022-01-05 20:19:55 -05:00
|
|
|
// If not a host pointer and no present modifier, we need to wait for the
|
|
|
|
|
// event if it exists.
|
2022-01-05 23:04:17 -05:00
|
|
|
// Note: Entry might be nullptr because of zero length array section.
|
2022-03-05 15:14:20 -06:00
|
|
|
if (Entry && !IsHostPtr && !HasPresentModifier) {
|
2022-03-02 13:34:24 -06:00
|
|
|
std::lock_guard<decltype(*Entry)> LG(*Entry);
|
2022-01-05 20:19:55 -05:00
|
|
|
void *Event = Entry->getEvent();
|
|
|
|
|
if (Event) {
|
|
|
|
|
int Ret = waitEvent(Event, AsyncInfo);
|
|
|
|
|
if (Ret != OFFLOAD_SUCCESS) {
|
|
|
|
|
// If it fails to wait for the event, we need to return nullptr in
|
|
|
|
|
// case of any data race.
|
|
|
|
|
REPORT("Failed to wait for event " DPxMOD ".\n", DPxPTR(Event));
|
|
|
|
|
return {{false /* IsNewEntry */, false /* IsHostPointer */},
|
2022-03-05 15:14:20 -06:00
|
|
|
nullptr /* Entry */,
|
2022-01-05 20:19:55 -05:00
|
|
|
nullptr /* TargetPointer */};
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|
[OpenMP][Offloading] Fix data race in data mapping by using two locks
This patch tries to partially fix one of the two data race issues reported in
[1] by introducing a per-entry mutex. Additional discussion can also be found in
D104418, which will also be refined to fix another data race problem.
Here is how it works. Like before, `DataMapMtx` is still being used for mapping
table lookup and update. In any case, we will get a table entry. If we need to
make a data transfer (update the data on the device), we need to lock the entry
right before releasing `DataMapMtx`, and the issue of data transfer should be
after releasing `DataMapMtx`, and the entry is unlocked afterwards. This can
guarantee that: 1) issue of data movement is not in critical region, which will
not affect performance too much, and also will not affect other threads that don't
touch the same entry; 2) if another thread accesses the same entry, the state of
data movement is consistent (which requires that a thread must first get the
update lock before getting data movement information).
For a target that doesn't support async data transfer, issue of data movement is
data transfer. This two-lock design can potentially improve concurrency compared
with the design that guards data movement with `DataMapMtx` as well. For a target
that supports async data movement, we could simply attach the event between the
issue of data movement and unlock the entry. For a thread that wants to get the
event, it must first get the lock. This can also get rid of the busy wait until
the event pointer is valid.
Reference:
[1] https://bugs.llvm.org/show_bug.cgi?id=49940
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104555
2021-07-23 16:10:42 -04:00
|
|
|
}
|
|
|
|
|
|
2021-07-01 12:31:45 -04:00
|
|
|
return {{IsNew, IsHostPtr}, Entry, TargetPointer};
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
2020-11-19 11:16:09 -06:00
|
|
|
// Used by targetDataBegin, targetDataEnd, targetDataUpdate and target.
|
2017-12-06 21:59:09 +00:00
|
|
|
// Return the target pointer begin (where the data will be moved).
|
2021-12-10 15:56:55 -05:00
|
|
|
// Decrement the reference counter if called from targetDataEnd.
|
2021-11-03 11:52:21 -05:00
|
|
|
TargetPointerResultTy
|
|
|
|
|
DeviceTy::getTgtPtrBegin(void *HstPtrBegin, int64_t Size, bool &IsLast,
|
|
|
|
|
bool UpdateRefCount, bool UseHoldRefCount,
|
|
|
|
|
bool &IsHostPtr, bool MustContain, bool ForceDelete) {
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
|
|
|
|
|
|
2021-11-03 11:52:21 -05:00
|
|
|
void *TargetPointer = NULL;
|
|
|
|
|
bool IsNew = false;
|
2019-08-07 17:29:45 +00:00
|
|
|
IsHostPtr = false;
|
|
|
|
|
IsLast = false;
|
2022-03-05 15:14:20 -06:00
|
|
|
LookupResult lr = lookupMapping(HDTTMap, HstPtrBegin, Size);
|
2017-12-06 21:59:09 +00:00
|
|
|
|
[OpenMP] Fix `omp target update` for array extension
OpenMP TR8 sec. 2.15.6 "target update Construct", p. 183, L3-4 states:
> If the corresponding list item is not present in the device data
> environment and there is no present modifier in the clause, then no
> assignment occurs to or from the original list item.
L10-11 states:
> If a present modifier appears in the clause and the corresponding
> list item is not present in the device data environment then an
> error occurs and the program terminates.
(OpenMP 5.0 also has the first passage but without mention of the
present modifier of course.)
In both passages, I assume "is not present" includes the case of
partially but not entirely present. However, without this patch, the
target update directive misbehaves in this case both with and without
the present modifier. For example:
```
#pragma omp target enter data map(to:arr[0:3])
#pragma omp target update to(arr[0:5]) // might fail on data transfer
#pragma omp target update to(present:arr[0:5]) // might fail on data transfer
```
The problem is that `DeviceTy::getTgtPtrBegin` does not return a null
pointer in that case, so `target_data_update` sees the data as fully
present, and the data transfer then might fail depending on the target
device. However, without the present modifier, there should never be
a failure. Moreover, with the present modifier, there should always
be a failure, and the diagnostic should mention the present modifier.
This patch fixes `DeviceTy::getTgtPtrBegin` to return null when
`target_data_update` is the caller. I'm wondering if it should do the
same for more callers.
Reviewed By: grokos, jdoerfert
Differential Revision: https://reviews.llvm.org/D85246
2020-08-05 09:00:12 -04:00
|
|
|
if (lr.Flags.IsContained ||
|
|
|
|
|
(!MustContain && (lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter))) {
|
2017-12-06 21:59:09 +00:00
|
|
|
auto &HT = *lr.Entry;
|
2021-08-31 15:21:16 -04:00
|
|
|
IsLast = HT.decShouldRemove(UseHoldRefCount, ForceDelete);
|
2022-03-02 13:46:01 -06:00
|
|
|
|
|
|
|
|
if (ForceDelete) {
|
2021-08-31 15:21:16 -04:00
|
|
|
HT.resetRefCount(UseHoldRefCount);
|
|
|
|
|
assert(IsLast == HT.decShouldRemove(UseHoldRefCount) &&
|
[OpenMP] Fix delete map type in ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target enter data map(alloc: x)
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(delete: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists\|last'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=1, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=2 (incremented), Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=3 (incremented), Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=2 (decremented)
Libomptarget --> There are 4 bytes allocated at target address 0x00000000013bb040 - is not last
```
`RefCount` is reported as decremented to 2, but it ought to be reset
because of the `delete` map type, and `is not last` is incorrect.
This patch migrates the reset of reference counts from
`DeviceTy::deallocTgtPtr` to `DeviceTy::getTgtPtrBegin`, which then
correctly reports the reset. Based on the `IsLast` result from
`DeviceTy::getTgtPtrBegin`, `targetDataEnd` then correctly reports `is
last` for any deletion. `DeviceTy::deallocTgtPtr` is responsible only
for the final reference count decrement and mapping removal.
An obscure side effect of this patch is that a `delete` map type when
the reference count is infinite yields `DelEntry=IsLast=false` in
`targetDataEnd` and so no longer results in a
`DeviceTy::deallocTgtPtr` call. Without this patch, that call is a
no-op anyway besides some unnecessary locking and mapping table
lookups.
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104560
2021-06-23 09:39:04 -04:00
|
|
|
"expected correct IsLast prediction for reset");
|
2022-03-02 13:46:01 -06:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
const char *RefCountAction;
|
|
|
|
|
if (!UpdateRefCount) {
|
|
|
|
|
RefCountAction = " (update suppressed)";
|
[OpenMP] Fix delete map type in ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target enter data map(alloc: x)
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(delete: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists\|last'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=1, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=2 (incremented), Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=3 (incremented), Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=2 (decremented)
Libomptarget --> There are 4 bytes allocated at target address 0x00000000013bb040 - is not last
```
`RefCount` is reported as decremented to 2, but it ought to be reset
because of the `delete` map type, and `is not last` is incorrect.
This patch migrates the reset of reference counts from
`DeviceTy::deallocTgtPtr` to `DeviceTy::getTgtPtrBegin`, which then
correctly reports the reset. Based on the `IsLast` result from
`DeviceTy::getTgtPtrBegin`, `targetDataEnd` then correctly reports `is
last` for any deletion. `DeviceTy::deallocTgtPtr` is responsible only
for the final reference count decrement and mapping removal.
An obscure side effect of this patch is that a `delete` map type when
the reference count is infinite yields `DelEntry=IsLast=false` in
`targetDataEnd` and so no longer results in a
`DeviceTy::deallocTgtPtr` call. Without this patch, that call is a
no-op anyway besides some unnecessary locking and mapping table
lookups.
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104560
2021-06-23 09:39:04 -04:00
|
|
|
} else if (IsLast) {
|
2022-03-02 13:46:01 -06:00
|
|
|
// Mark the entry as to be deleted by this thread. Another thread might
|
|
|
|
|
// reuse the entry and take "ownership" for the deletion while this thread
|
|
|
|
|
// is waiting for data transfers. That is fine and the current thread will
|
|
|
|
|
// simply skip the deletion step then.
|
|
|
|
|
HT.setDeleteThreadId();
|
|
|
|
|
HT.decRefCount(UseHoldRefCount);
|
|
|
|
|
assert(HT.getTotalRefCount() == 0 &&
|
|
|
|
|
"Expected zero reference count when deletion is scheduled");
|
|
|
|
|
if (ForceDelete)
|
|
|
|
|
RefCountAction = " (reset, delayed deletion)";
|
|
|
|
|
else
|
|
|
|
|
RefCountAction = " (decremented, delayed deletion)";
|
[OpenMP] Fix delete map type in ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target enter data map(alloc: x)
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(delete: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists\|last'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=1, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=2 (incremented), Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=3 (incremented), Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffddf1eaea8, TgtPtrBegin=0x00000000013bb040, Size=4, RefCount=2 (decremented)
Libomptarget --> There are 4 bytes allocated at target address 0x00000000013bb040 - is not last
```
`RefCount` is reported as decremented to 2, but it ought to be reset
because of the `delete` map type, and `is not last` is incorrect.
This patch migrates the reset of reference counts from
`DeviceTy::deallocTgtPtr` to `DeviceTy::getTgtPtrBegin`, which then
correctly reports the reset. Based on the `IsLast` result from
`DeviceTy::getTgtPtrBegin`, `targetDataEnd` then correctly reports `is
last` for any deletion. `DeviceTy::deallocTgtPtr` is responsible only
for the final reference count decrement and mapping removal.
An obscure side effect of this patch is that a `delete` map type when
the reference count is infinite yields `DelEntry=IsLast=false` in
`targetDataEnd` and so no longer results in a
`DeviceTy::deallocTgtPtr` call. Without this patch, that call is a
no-op anyway besides some unnecessary locking and mapping table
lookups.
Reviewed By: grokos
Differential Revision: https://reviews.llvm.org/D104560
2021-06-23 09:39:04 -04:00
|
|
|
} else {
|
2021-08-31 15:21:16 -04:00
|
|
|
HT.decRefCount(UseHoldRefCount);
|
|
|
|
|
RefCountAction = " (decremented)";
|
[OpenMP] Improve ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(release: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, updated RefCount=1
```
There are two problems in this example:
* `RefCount` is not reported when a mapping is created, but it might
be 1 or infinite. In this case, because it's created by `omp target
enter data`, it's 1. Seeing that would make later `RefCount`
messages easier to understand.
* `RefCount` is still 1 at the `omp target exit data`, but it's
reported as `updated`. The reason it's still 1 is that, upon
deletions, the reference count is generally not updated in
`DeviceTy::getTgtPtrBegin`, where the report is produced. Instead,
it's zeroed later in `DeviceTy::deallocTgtPtr`, where it's actually
removed from the mapping table.
This patch makes the following changes:
* Report the reference count when creating a mapping.
* Where an existing mapping is reported, always report a reference
count action:
* `update suppressed` when `UpdateRefCount=false`
* `incremented`
* `decremented`
* `deferred final decrement`, which replaces the misleading
`updated` in the above example
* Add comments to `DeviceTy::getTgtPtrBegin` to explain why it does
not zero the reference count. (Please advise if these comments miss
the point.)
* For unified shared memory, don't report confusing messages like
`RefCount=` or `RefCount= updated` given that reference counts are
irrelevant in this case. Instead, just report `for unified shared
memory`.
* Use `INFO` not `DP` consistently for `Mapping exists` messages.
* Fix device table dumps to print `INF` instead of `-1` for an
infinite reference count.
Reviewed By: jhuber6, grokos
Differential Revision: https://reviews.llvm.org/D104559
2021-06-23 09:37:54 -04:00
|
|
|
}
|
2021-08-31 15:21:16 -04:00
|
|
|
const char *DynRefCountAction = UseHoldRefCount ? "" : RefCountAction;
|
|
|
|
|
const char *HoldRefCountAction = UseHoldRefCount ? RefCountAction : "";
|
2017-12-06 21:59:09 +00:00
|
|
|
uintptr_t tp = HT.TgtPtrBegin + ((uintptr_t)HstPtrBegin - HT.HstPtrBegin);
|
[OpenMP] Improve ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(release: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, updated RefCount=1
```
There are two problems in this example:
* `RefCount` is not reported when a mapping is created, but it might
be 1 or infinite. In this case, because it's created by `omp target
enter data`, it's 1. Seeing that would make later `RefCount`
messages easier to understand.
* `RefCount` is still 1 at the `omp target exit data`, but it's
reported as `updated`. The reason it's still 1 is that, upon
deletions, the reference count is generally not updated in
`DeviceTy::getTgtPtrBegin`, where the report is produced. Instead,
it's zeroed later in `DeviceTy::deallocTgtPtr`, where it's actually
removed from the mapping table.
This patch makes the following changes:
* Report the reference count when creating a mapping.
* Where an existing mapping is reported, always report a reference
count action:
* `update suppressed` when `UpdateRefCount=false`
* `incremented`
* `decremented`
* `deferred final decrement`, which replaces the misleading
`updated` in the above example
* Add comments to `DeviceTy::getTgtPtrBegin` to explain why it does
not zero the reference count. (Please advise if these comments miss
the point.)
* For unified shared memory, don't report confusing messages like
`RefCount=` or `RefCount= updated` given that reference counts are
irrelevant in this case. Instead, just report `for unified shared
memory`.
* Use `INFO` not `DP` consistently for `Mapping exists` messages.
* Fix device table dumps to print `INF` instead of `-1` for an
infinite reference count.
Reviewed By: jhuber6, grokos
Differential Revision: https://reviews.llvm.org/D104559
2021-06-23 09:37:54 -04:00
|
|
|
INFO(OMP_INFOTYPE_MAPPING_EXISTS, DeviceID,
|
|
|
|
|
"Mapping exists with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD ", "
|
2021-08-31 15:21:16 -04:00
|
|
|
"Size=%" PRId64 ", DynRefCount=%s%s, HoldRefCount=%s%s\n",
|
|
|
|
|
DPxPTR(HstPtrBegin), DPxPTR(tp), Size, HT.dynRefCountToStr().c_str(),
|
|
|
|
|
DynRefCountAction, HT.holdRefCountToStr().c_str(), HoldRefCountAction);
|
2021-11-03 11:52:21 -05:00
|
|
|
TargetPointer = (void *)tp;
|
2020-10-30 01:04:34 -04:00
|
|
|
} else if (PM->RTLs.RequiresFlags & OMP_REQ_UNIFIED_SHARED_MEMORY) {
|
2019-08-07 17:29:45 +00:00
|
|
|
// If the value isn't found in the mapping and unified shared memory
|
|
|
|
|
// is on then it means we have stumbled upon a value which we need to
|
|
|
|
|
// use directly from the host.
|
[OpenMP] Improve ref count debug messages
For example, without this patch:
```
$ cat test.c
int main() {
int x;
#pragma omp target enter data map(alloc: x)
#pragma omp target exit data map(release: x)
;
return 0;
}
$ clang -fopenmp -fopenmp-targets=nvptx64-nvidia-cuda test.c
$ LIBOMPTARGET_DEBUG=1 ./a.out |& grep 'Creating\|Mapping exists'
Libomptarget --> Creating new map entry with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, Name=unknown
Libomptarget --> Mapping exists with HstPtrBegin=0x00007ffcace8e448, TgtPtrBegin=0x00007f12ef600000, Size=4, updated RefCount=1
```
There are two problems in this example:
* `RefCount` is not reported when a mapping is created, but it might
be 1 or infinite. In this case, because it's created by `omp target
enter data`, it's 1. Seeing that would make later `RefCount`
messages easier to understand.
* `RefCount` is still 1 at the `omp target exit data`, but it's
reported as `updated`. The reason it's still 1 is that, upon
deletions, the reference count is generally not updated in
`DeviceTy::getTgtPtrBegin`, where the report is produced. Instead,
it's zeroed later in `DeviceTy::deallocTgtPtr`, where it's actually
removed from the mapping table.
This patch makes the following changes:
* Report the reference count when creating a mapping.
* Where an existing mapping is reported, always report a reference
count action:
* `update suppressed` when `UpdateRefCount=false`
* `incremented`
* `decremented`
* `deferred final decrement`, which replaces the misleading
`updated` in the above example
* Add comments to `DeviceTy::getTgtPtrBegin` to explain why it does
not zero the reference count. (Please advise if these comments miss
the point.)
* For unified shared memory, don't report confusing messages like
`RefCount=` or `RefCount= updated` given that reference counts are
irrelevant in this case. Instead, just report `for unified shared
memory`.
* Use `INFO` not `DP` consistently for `Mapping exists` messages.
* Fix device table dumps to print `INF` instead of `-1` for an
infinite reference count.
Reviewed By: jhuber6, grokos
Differential Revision: https://reviews.llvm.org/D104559
2021-06-23 09:37:54 -04:00
|
|
|
DP("Get HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
|
|
|
|
|
"memory\n",
|
|
|
|
|
DPxPTR((uintptr_t)HstPtrBegin), Size);
|
2019-08-07 17:29:45 +00:00
|
|
|
IsHostPtr = true;
|
2021-11-03 11:52:21 -05:00
|
|
|
TargetPointer = HstPtrBegin;
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
2021-11-03 11:52:21 -05:00
|
|
|
return {{IsNew, IsHostPtr}, lr.Entry, TargetPointer};
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Return the target pointer begin (where the data will be moved).
|
2022-03-05 15:14:20 -06:00
|
|
|
void *DeviceTy::getTgtPtrBegin(HDTTMapAccessorTy &HDTTMap, void *HstPtrBegin,
|
|
|
|
|
int64_t Size) {
|
2017-12-06 21:59:09 +00:00
|
|
|
uintptr_t hp = (uintptr_t)HstPtrBegin;
|
2022-03-05 15:14:20 -06:00
|
|
|
LookupResult lr = lookupMapping(HDTTMap, HstPtrBegin, Size);
|
2017-12-06 21:59:09 +00:00
|
|
|
if (lr.Flags.IsContained || lr.Flags.ExtendsBefore || lr.Flags.ExtendsAfter) {
|
|
|
|
|
auto &HT = *lr.Entry;
|
|
|
|
|
uintptr_t tp = HT.TgtPtrBegin + (hp - HT.HstPtrBegin);
|
|
|
|
|
return (void *)tp;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2022-03-02 13:46:01 -06:00
|
|
|
// Finalize the removal of a mapping entry whose reference counts have dropped
// to zero: free the device memory, erase the entry from the map, and destroy
// its associated event (if any). The caller must already own the map accessor
// and have been designated as the deleting thread.
int DeviceTy::deallocTgtPtr(HDTTMapAccessorTy &HDTTMap, LookupResult LR,
                            int64_t Size) {
  // Check if the pointer is contained in any sub-nodes.
  if (!(LR.Flags.IsContained || LR.Flags.ExtendsBefore ||
        LR.Flags.ExtendsAfter)) {
    // NOTE(review): LR.Entry is dereferenced here even though the lookup did
    // not match -- confirm callers guarantee LR.Entry is valid on this path.
    REPORT("Section to delete (hst addr " DPxMOD ") does not exist in the"
           " allocated memory\n",
           DPxPTR(LR.Entry->HstPtrBegin));
    return OFFLOAD_FAIL;
  }

  auto &HT = *LR.Entry;
  // Verify this thread is still in charge of deleting the entry.
  assert(HT.getTotalRefCount() == 0 &&
         HT.getDeleteThreadId() == std::this_thread::get_id() &&
         "Trying to delete entry that is in use or owned by another thread.");

  DP("Deleting tgt data " DPxMOD " of size %" PRId64 "\n",
     DPxPTR(HT.TgtPtrBegin), Size);
  deleteData((void *)HT.TgtPtrBegin);
  INFO(OMP_INFOTYPE_MAPPING_CHANGED, DeviceID,
       "Removing map entry with HstPtrBegin=" DPxMOD ", TgtPtrBegin=" DPxMOD
       ", Size=%" PRId64 ", Name=%s\n",
       DPxPTR(HT.HstPtrBegin), DPxPTR(HT.TgtPtrBegin), Size,
       (HT.HstPtrName) ? getNameFromMapping(HT.HstPtrName).c_str() : "unknown");
  // Grab the event before the entry is destroyed; it must outlive the erase.
  void *Event = LR.Entry->getEvent();
  HDTTMap->erase(LR.Entry);
  delete LR.Entry;

  // Event destruction failure is reported but does not undo the removal,
  // which has already happened above.
  int Ret = OFFLOAD_SUCCESS;
  if (Event && destroyEvent(Event) != OFFLOAD_SUCCESS) {
    REPORT("Failed to destroy event " DPxMOD "\n", DPxPTR(Event));
    Ret = OFFLOAD_FAIL;
  }

  return Ret;
}
|
|
|
|
|
|
|
|
|
|
/// Init device, should not be called directly.
|
|
|
|
|
void DeviceTy::init() {
|
2019-06-04 15:05:53 +00:00
|
|
|
// Make call to init_requires if it exists for this plugin.
|
|
|
|
|
if (RTL->init_requires)
|
2020-10-30 01:04:34 -04:00
|
|
|
RTL->init_requires(PM->RTLs.RequiresFlags);
|
[OpenMP] Introduce target memory manager
Target memory manager is introduced in this patch which aims to manage target
memory such that they will not be freed immediately when they are not used
because the overhead of memory allocation and free is very large. For CUDA
device, cuMemFree even blocks the context switch on device which affects
concurrent kernel execution.
The memory manager can be taken as a memory pool. It divides the pool into
multiple buckets according to the size such that memory allocation/free
distributed to different buckets will not affect each other.
In this version, we use the exact-equality policy to find a free buffer. This
is an open question: will best-fit work better here? IMO, best-fit is not good
for target memory management because computation on GPU usually requires GBs of
data. Best-fit might lead to a serious waste. For example, there is a free
buffer of size 1960MB, and now we need a buffer of size 1200MB. If best-fit,
the free buffer will be returned, leading to a 760MB waste.
The allocation will happen when there is no free memory left, and the memory
free on device will take place in the following two cases:
1. The program ends. Obviously. However, there is a little problem that plugin
library is destroyed before the memory manager is destroyed, leading to a fact
that the call to target plugin will not succeed.
2. Device is out of memory when we request a new memory. The manager will walk
through all free buffers from the bucket with largest base size, pick up one
buffer, free it, and try to allocate immediately. If it succeeds, it will
return right away rather than freeing all buffers in free list.
Update:
A threshold (8KB by default) is set such that users could control what size of memory
will be managed by the manager. It can also be configured by an environment variable
`LIBOMPTARGET_MEMORY_MANAGER_THRESHOLD`.
Reviewed By: jdoerfert, ye-luo, JonChesterfield
Differential Revision: https://reviews.llvm.org/D81054
2020-08-19 23:12:02 -04:00
|
|
|
int32_t Ret = RTL->init_device(RTLDeviceID);
|
|
|
|
|
if (Ret != OFFLOAD_SUCCESS)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
IsInit = true;
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/// Thread-safe method to initialize the device only once.
|
|
|
|
|
int32_t DeviceTy::initOnce() {
|
|
|
|
|
std::call_once(InitFlag, &DeviceTy::init, this);
|
|
|
|
|
|
|
|
|
|
// At this point, if IsInit is true, then either this thread or some other
|
|
|
|
|
// thread in the past successfully initialized the device, so we can return
|
|
|
|
|
// OFFLOAD_SUCCESS. If this thread executed init() via call_once() and it
|
|
|
|
|
// failed, return OFFLOAD_FAIL. If call_once did not invoke init(), it means
|
|
|
|
|
// that some other thread already attempted to execute init() and if IsInit
|
|
|
|
|
// is still false, return OFFLOAD_FAIL.
|
|
|
|
|
if (IsInit)
|
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
else
|
|
|
|
|
return OFFLOAD_FAIL;
|
|
|
|
|
}
|
|
|
|
|
|
2022-02-17 16:04:12 -06:00
|
|
|
void DeviceTy::deinit() {
|
|
|
|
|
if (RTL->deinit_device)
|
|
|
|
|
RTL->deinit_device(RTLDeviceID);
|
|
|
|
|
}
|
|
|
|
|
|
2017-12-06 21:59:09 +00:00
|
|
|
// Load binary to device.
|
|
|
|
|
__tgt_target_table *DeviceTy::load_binary(void *Img) {
|
2022-03-02 13:34:24 -06:00
|
|
|
std::lock_guard<decltype(RTL->Mtx)> LG(RTL->Mtx);
|
2017-12-06 21:59:09 +00:00
|
|
|
__tgt_target_table *rc = RTL->load_binary(RTLDeviceID, Img);
|
|
|
|
|
return rc;
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-03 11:48:32 -08:00
|
|
|
// Allocate Size bytes on this device; HstPtr and the allocation Kind are
// forwarded to the plugin's allocator. Returns the target pointer, or
// whatever the plugin returns on failure.
void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
  return RTL->data_alloc(RTLDeviceID, Size, HstPtr, Kind);
}
|
|
|
|
|
|
2020-07-28 20:10:59 -04:00
|
|
|
// Free device memory previously obtained from allocData, delegating to the
// plugin's data_delete entry point.
int32_t DeviceTy::deleteData(void *TgtPtrBegin) {
  return RTL->data_delete(RTLDeviceID, TgtPtrBegin);
}
|
|
|
|
|
|
[OpenMP] Optimized stream selection by scheduling data mapping for the same target region into a same stream
Summary:
This patch introduces two things for offloading:
1. Asynchronous data transferring: those functions are suffix with `_async`. They have one more argument compared with their synchronous counterparts: `__tgt_async_info*`, which is a new struct that only has one field, `void *Identifier`. This struct is for information exchange between different asynchronous operations. It can be used for stream selection, like in this case, or operation synchronization, which is also used. We may expect more usages in the future.
2. Optimization of stream selection for data mapping. Previous implementation was using asynchronous device memory transfer but synchronizing after each memory transfer. Actually, if we say kernel A needs four memory copy to device and two memory copy back to host, then we can schedule these seven operations (four H2D, two D2H, and one kernel launch) into a same stream and just need synchronization after memory copy from device to host. In this way, we can save a huge overhead compared with synchronization after each operation.
Reviewers: jdoerfert, ye-luo
Reviewed By: jdoerfert
Subscribers: yaxunl, lildmh, guansong, openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D77005
2020-04-07 14:51:56 -04:00
|
|
|
// Submit data to device
|
2020-07-28 20:10:59 -04:00
|
|
|
int32_t DeviceTy::submitData(void *TgtPtrBegin, void *HstPtrBegin, int64_t Size,
|
2021-02-10 11:06:00 -06:00
|
|
|
AsyncInfoTy &AsyncInfo) {
|
2021-06-08 16:43:59 -04:00
|
|
|
if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) {
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
|
|
|
|
|
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
|
2021-06-08 16:43:59 -04:00
|
|
|
auto *HT = &*LR.Entry;
|
|
|
|
|
|
|
|
|
|
INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID,
|
|
|
|
|
"Copying data from host to device, HstPtr=" DPxMOD ", TgtPtr=" DPxMOD
|
|
|
|
|
", Size=%" PRId64 ", Name=%s\n",
|
|
|
|
|
DPxPTR(HstPtrBegin), DPxPTR(TgtPtrBegin), Size,
|
|
|
|
|
(HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
|
|
|
|
|
: "unknown");
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-10 11:06:00 -06:00
|
|
|
if (!AsyncInfo || !RTL->data_submit_async || !RTL->synchronize)
|
2020-04-09 22:40:30 -04:00
|
|
|
return RTL->data_submit(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size);
|
|
|
|
|
else
|
|
|
|
|
return RTL->data_submit_async(RTLDeviceID, TgtPtrBegin, HstPtrBegin, Size,
|
2021-02-10 11:06:00 -06:00
|
|
|
AsyncInfo);
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
[OpenMP] Optimized stream selection by scheduling data mapping for the same target region into a same stream
Summary:
This patch introduces two things for offloading:
1. Asynchronous data transferring: those functions are suffixed with `_async`. They have one more argument compared with their synchronous counterparts: `__tgt_async_info*`, which is a new struct that only has one field, `void *Identifier`. This struct is for information exchange between different asynchronous operations. It can be used for stream selection, like in this case, or operation synchronization, which is also used. We may expect more usages in the future.
2. Optimization of stream selection for data mapping. Previous implementation was using asynchronous device memory transfer but synchronizing after each memory transfer. Actually, if we say kernel A needs four memory copy to device and two memory copy back to host, then we can schedule these seven operations (four H2D, two D2H, and one kernel launch) into a same stream and just need synchronization after memory copy from device to host. In this way, we can save a huge overhead compared with synchronization after each operation.
Reviewers: jdoerfert, ye-luo
Reviewed By: jdoerfert
Subscribers: yaxunl, lildmh, guansong, openmp-commits
Tags: #openmp
Differential Revision: https://reviews.llvm.org/D77005
2020-04-07 14:51:56 -04:00
|
|
|
// Retrieve data from device
|
2020-07-30 21:37:01 -04:00
|
|
|
int32_t DeviceTy::retrieveData(void *HstPtrBegin, void *TgtPtrBegin,
|
2021-02-10 11:06:00 -06:00
|
|
|
int64_t Size, AsyncInfoTy &AsyncInfo) {
|
2021-06-08 16:43:59 -04:00
|
|
|
if (getInfoLevel() & OMP_INFOTYPE_DATA_TRANSFER) {
|
2022-03-05 15:14:20 -06:00
|
|
|
HDTTMapAccessorTy HDTTMap = HostDataToTargetMap.getExclusiveAccessor();
|
|
|
|
|
LookupResult LR = lookupMapping(HDTTMap, HstPtrBegin, Size);
|
2021-06-08 16:43:59 -04:00
|
|
|
auto *HT = &*LR.Entry;
|
|
|
|
|
INFO(OMP_INFOTYPE_DATA_TRANSFER, DeviceID,
|
|
|
|
|
"Copying data from device to host, TgtPtr=" DPxMOD ", HstPtr=" DPxMOD
|
|
|
|
|
", Size=%" PRId64 ", Name=%s\n",
|
|
|
|
|
DPxPTR(TgtPtrBegin), DPxPTR(HstPtrBegin), Size,
|
|
|
|
|
(HT && HT->HstPtrName) ? getNameFromMapping(HT->HstPtrName).c_str()
|
|
|
|
|
: "unknown");
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-10 11:06:00 -06:00
|
|
|
if (!RTL->data_retrieve_async || !RTL->synchronize)
|
2020-04-09 22:40:30 -04:00
|
|
|
return RTL->data_retrieve(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size);
|
|
|
|
|
else
|
|
|
|
|
return RTL->data_retrieve_async(RTLDeviceID, HstPtrBegin, TgtPtrBegin, Size,
|
2021-02-10 11:06:00 -06:00
|
|
|
AsyncInfo);
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
2020-06-04 16:58:37 -04:00
|
|
|
// Copy data from current device to destination device directly
|
2020-08-19 16:07:58 -04:00
|
|
|
int32_t DeviceTy::dataExchange(void *SrcPtr, DeviceTy &DstDev, void *DstPtr,
|
2021-02-10 11:06:00 -06:00
|
|
|
int64_t Size, AsyncInfoTy &AsyncInfo) {
|
2020-08-19 16:07:58 -04:00
|
|
|
if (!AsyncInfo || !RTL->data_exchange_async || !RTL->synchronize) {
|
2020-06-04 16:58:37 -04:00
|
|
|
assert(RTL->data_exchange && "RTL->data_exchange is nullptr");
|
|
|
|
|
return RTL->data_exchange(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID, DstPtr,
|
|
|
|
|
Size);
|
|
|
|
|
} else
|
|
|
|
|
return RTL->data_exchange_async(RTLDeviceID, SrcPtr, DstDev.RTLDeviceID,
|
2020-08-19 16:07:58 -04:00
|
|
|
DstPtr, Size, AsyncInfo);
|
2020-06-04 16:58:37 -04:00
|
|
|
}
|
|
|
|
|
|
2017-12-06 21:59:09 +00:00
|
|
|
// Run region on device
|
2020-07-28 20:10:59 -04:00
|
|
|
int32_t DeviceTy::runRegion(void *TgtEntryPtr, void **TgtVarsPtr,
|
|
|
|
|
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
|
2021-02-10 11:06:00 -06:00
|
|
|
AsyncInfoTy &AsyncInfo) {
|
|
|
|
|
if (!RTL->run_region || !RTL->synchronize)
|
2020-04-09 22:40:30 -04:00
|
|
|
return RTL->run_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr, TgtOffsets,
|
|
|
|
|
TgtVarsSize);
|
|
|
|
|
else
|
|
|
|
|
return RTL->run_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
|
2021-02-10 11:06:00 -06:00
|
|
|
TgtOffsets, TgtVarsSize, AsyncInfo);
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
|
|
|
|
|
2021-07-27 21:47:40 -04:00
|
|
|
// Print information about this device via the plugin's optional
// print_device_info entry point. Returns false when the plugin does not
// implement it. (The previous header comment, "Run region on device", was a
// copy-paste leftover.)
bool DeviceTy::printDeviceInfo(int32_t RTLDevId) {
  if (!RTL->print_device_info)
    return false;
  RTL->print_device_info(RTLDevId);
  return true;
}
|
|
|
|
|
|
2017-12-06 21:59:09 +00:00
|
|
|
// Run team region on device.
|
2020-07-28 20:10:59 -04:00
|
|
|
int32_t DeviceTy::runTeamRegion(void *TgtEntryPtr, void **TgtVarsPtr,
|
|
|
|
|
ptrdiff_t *TgtOffsets, int32_t TgtVarsSize,
|
|
|
|
|
int32_t NumTeams, int32_t ThreadLimit,
|
|
|
|
|
uint64_t LoopTripCount,
|
2021-02-10 11:06:00 -06:00
|
|
|
AsyncInfoTy &AsyncInfo) {
|
|
|
|
|
if (!RTL->run_team_region_async || !RTL->synchronize)
|
2020-04-09 22:40:30 -04:00
|
|
|
return RTL->run_team_region(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
|
|
|
|
|
TgtOffsets, TgtVarsSize, NumTeams, ThreadLimit,
|
|
|
|
|
LoopTripCount);
|
|
|
|
|
else
|
|
|
|
|
return RTL->run_team_region_async(RTLDeviceID, TgtEntryPtr, TgtVarsPtr,
|
|
|
|
|
TgtOffsets, TgtVarsSize, NumTeams,
|
2021-02-10 11:06:00 -06:00
|
|
|
ThreadLimit, LoopTripCount, AsyncInfo);
|
2017-12-06 21:59:09 +00:00
|
|
|
}
|
2017-12-06 21:59:12 +00:00
|
|
|
|
2020-06-04 16:58:37 -04:00
|
|
|
// Whether data can be copied to DstDevice directly
|
|
|
|
|
bool DeviceTy::isDataExchangable(const DeviceTy &DstDevice) {
|
|
|
|
|
if (RTL != DstDevice.RTL || !RTL->is_data_exchangable)
|
|
|
|
|
return false;
|
|
|
|
|
|
|
|
|
|
if (RTL->is_data_exchangable(RTLDeviceID, DstDevice.RTLDeviceID))
|
|
|
|
|
return (RTL->data_exchange != nullptr) ||
|
|
|
|
|
(RTL->data_exchange_async != nullptr);
|
|
|
|
|
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-10 11:06:00 -06:00
|
|
|
// Synchronize the given async queue; plugins without a synchronize entry
// point are implicitly synchronous, so this is then a successful no-op.
int32_t DeviceTy::synchronize(AsyncInfoTy &AsyncInfo) {
  return RTL->synchronize ? RTL->synchronize(RTLDeviceID, AsyncInfo)
                          : OFFLOAD_SUCCESS;
}
|
|
|
|
|
|
2021-08-28 16:24:06 -04:00
|
|
|
int32_t DeviceTy::createEvent(void **Event) {
|
|
|
|
|
if (RTL->create_event)
|
|
|
|
|
return RTL->create_event(RTLDeviceID, Event);
|
|
|
|
|
|
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int32_t DeviceTy::recordEvent(void *Event, AsyncInfoTy &AsyncInfo) {
|
|
|
|
|
if (RTL->record_event)
|
|
|
|
|
return RTL->record_event(RTLDeviceID, Event, AsyncInfo);
|
|
|
|
|
|
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int32_t DeviceTy::waitEvent(void *Event, AsyncInfoTy &AsyncInfo) {
|
|
|
|
|
if (RTL->wait_event)
|
|
|
|
|
return RTL->wait_event(RTLDeviceID, Event, AsyncInfo);
|
|
|
|
|
|
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int32_t DeviceTy::syncEvent(void *Event) {
|
|
|
|
|
if (RTL->sync_event)
|
|
|
|
|
return RTL->sync_event(RTLDeviceID, Event);
|
|
|
|
|
|
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
int32_t DeviceTy::destroyEvent(void *Event) {
|
|
|
|
|
if (RTL->create_event)
|
|
|
|
|
return RTL->destroy_event(RTLDeviceID, Event);
|
|
|
|
|
|
|
|
|
|
return OFFLOAD_SUCCESS;
|
|
|
|
|
}
|
|
|
|
|
|
2017-12-06 21:59:12 +00:00
|
|
|
/// Check whether a device has an associated RTL and initialize it if it's not
|
|
|
|
|
/// already initialized.
|
|
|
|
|
bool device_is_ready(int device_num) {
|
|
|
|
|
DP("Checking whether device %d is ready.\n", device_num);
|
|
|
|
|
// Devices.size() can only change while registering a new
|
|
|
|
|
// library, so try to acquire the lock of RTLs' mutex.
|
2022-03-02 13:34:24 -06:00
|
|
|
size_t DevicesSize;
|
|
|
|
|
{
|
|
|
|
|
std::lock_guard<decltype(PM->RTLsMtx)> LG(PM->RTLsMtx);
|
|
|
|
|
DevicesSize = PM->Devices.size();
|
|
|
|
|
}
|
2020-10-30 01:04:34 -04:00
|
|
|
if (DevicesSize <= (size_t)device_num) {
|
2017-12-06 21:59:12 +00:00
|
|
|
DP("Device ID %d does not have a matching RTL\n", device_num);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Get device info
|
2021-09-04 14:07:41 -05:00
|
|
|
DeviceTy &Device = *PM->Devices[device_num];
|
2017-12-06 21:59:12 +00:00
|
|
|
|
|
|
|
|
DP("Is the device %d (local ID %d) initialized? %d\n", device_num,
|
2021-02-10 11:45:39 -06:00
|
|
|
Device.RTLDeviceID, Device.IsInit);
|
2017-12-06 21:59:12 +00:00
|
|
|
|
|
|
|
|
// Init the device if not done before
|
|
|
|
|
if (!Device.IsInit && Device.initOnce() != OFFLOAD_SUCCESS) {
|
|
|
|
|
DP("Failed to init device %d\n", device_num);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
DP("Device %d is ready to use.\n", device_num);
|
|
|
|
|
|
|
|
|
|
return true;
|
|
|
|
|
}
|