feature direct submission: use tag allocation as a completion fence

use the tag allocation address as the completion address in the exec call
wait for the completion value to be reached before destroying the DRM direct submission

Related-To: NEO-6643
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
Author: Mateusz Jablonski
Date: 2022-04-20 17:32:39 +00:00
Committed by: Compute-Runtime-Automation
Parent: 38190c5d17
Commit: 03185f7111
12 changed files with 408 additions and 5 deletions
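
At a high level the change works like this: every ring submission increments a software completion value and hands the kernel the tag allocation's address together with that value; teardown then waits until that memory holds at least the last submitted value before the ring resources are destroyed. Below is a minimal, self-contained sketch of that pattern in plain C++ (hypothetical names, a busy-wait standing in for the blocking DRM wait, not NEO code):

#include <atomic>
#include <cstdint>
#include <thread>

// Hypothetical stand-ins: fenceSlot plays the role of the tag allocation memory,
// and the busy-wait stands in for the blocking wait on the DRM side.
struct FakeDirectSubmission {
    std::atomic<uint32_t> fenceSlot{0}; // memory the GPU writes on batch completion
    uint32_t completionFenceValue = 0;  // last value handed to the kernel with exec

    // submit: bump the expected value; the real code also passes the slot's GPU
    // address and this value to the exec call
    uint32_t submit() {
        return ++completionFenceValue;
    }

    // teardown: do not free ring resources until every submission has signalled
    void waitForIdle() const {
        while (fenceSlot.load(std::memory_order_acquire) < completionFenceValue) {
            std::this_thread::yield();
        }
    }
};

int main() {
    FakeDirectSubmission ds;
    uint32_t expected = ds.submit();
    // simulate the GPU/kernel writing the fence value once the batch retires
    std::thread gpu([&] { ds.fenceSlot.store(expected, std::memory_order_release); });
    ds.waitForIdle(); // after this returns it is safe to tear the ring down
    gpu.join();
    return 0;
}

The real implementation does not spin: it blocks in the kernel through the Drm::waitUserFence call shown in the destructor hunk below; the sketch only mirrors the value/address bookkeeping.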


@@ -321,7 +321,7 @@ class CommandStreamReceiver {
     MOCKABLE_VIRTUAL bool isGpuHangDetected() const;
-    uint64_t getCompletionAddress() {
+    uint64_t getCompletionAddress() const {
         uint64_t completionFenceAddress = castToUint64(const_cast<uint32_t *>(getTagAddress()));
         if (completionFenceAddress == 0) {
             return 0;


@@ -14,6 +14,7 @@ DirectSubmissionInputParams::DirectSubmissionInputParams(const CommandStreamRece
     memoryManager = commandStreamReceiver.getMemoryManager();
     globalFenceAllocation = commandStreamReceiver.getGlobalFenceAllocation();
     workPartitionAllocation = commandStreamReceiver.getWorkPartitionAllocation();
+    completionFenceAllocation = commandStreamReceiver.getTagAllocation();
 }
 } // namespace NEO


@@ -64,6 +64,7 @@ struct DirectSubmissionInputParams : NonCopyableClass {
     MemoryManager *memoryManager = nullptr;
     const GraphicsAllocation *globalFenceAllocation = nullptr;
     GraphicsAllocation *workPartitionAllocation = nullptr;
+    GraphicsAllocation *completionFenceAllocation = nullptr;
     const uint32_t rootDeviceIndex;
 };
@@ -160,6 +161,7 @@ class DirectSubmissionHw {
     MemoryOperationsHandler *memoryOperationHandler = nullptr;
     const HardwareInfo *hwInfo = nullptr;
     const GraphicsAllocation *globalFenceAllocation = nullptr;
+    GraphicsAllocation *completionFenceAllocation = nullptr;
     GraphicsAllocation *ringBuffer = nullptr;
     GraphicsAllocation *ringBuffer2 = nullptr;
     GraphicsAllocation *semaphores = nullptr;


@@ -100,6 +100,10 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::allocateResources() {
         allocations.push_back(workPartitionAllocation);
     }
+    if (completionFenceAllocation != nullptr) {
+        allocations.push_back(completionFenceAllocation);
+    }
     if (DebugManager.flags.DirectSubmissionPrintBuffers.get()) {
         printf("Ring buffer 1 - gpu address: %" PRIx64 " - %" PRIx64 ", cpu address: %p - %p, size: %zu \n",
                ringBuffer->getGpuAddress(),


@@ -35,7 +35,8 @@ class DrmDirectSubmission : public DirectSubmissionHw<GfxFamily, Dispatcher> {
     MOCKABLE_VIRTUAL void wait(uint32_t taskCountToWait);
-    TagData currentTagData;
+    TagData currentTagData{};
     volatile uint32_t *tagAddress;
+    uint32_t completionFenceValue{};
 };
 } // namespace NEO


@@ -39,12 +39,17 @@ DrmDirectSubmission<GfxFamily, Dispatcher>::DrmDirectSubmission(const DirectSubm
     this->partitionedMode = this->activeTiles > 1u;
     this->partitionConfigSet = !this->partitionedMode;
-    osContextLinux->getDrm().setDirectSubmissionActive(true);
+    auto &drm = osContextLinux->getDrm();
+    drm.setDirectSubmissionActive(true);
     if (this->partitionedMode) {
         this->workPartitionAllocation = inputParams.workPartitionAllocation;
         UNRECOVERABLE_IF(this->workPartitionAllocation == nullptr);
     }
+    if (drm.completionFenceSupport()) {
+        this->completionFenceAllocation = inputParams.completionFenceAllocation;
+    }
 }
 template <typename GfxFamily, typename Dispatcher>
@@ -53,6 +58,24 @@ inline DrmDirectSubmission<GfxFamily, Dispatcher>::~DrmDirectSubmission() {
         this->stopRingBuffer();
         this->wait(static_cast<uint32_t>(this->currentTagData.tagValue));
     }
+    if (this->completionFenceAllocation) {
+        auto osContextLinux = static_cast<OsContextLinux *>(&this->osContext);
+        auto &drm = osContextLinux->getDrm();
+        auto &drmContextIds = osContextLinux->getDrmContextIds();
+        uint32_t drmContextId = 0u;
+        auto completionFenceCpuAddress = reinterpret_cast<uint64_t>(this->completionFenceAllocation->getUnderlyingBuffer()) + Drm::completionFenceOffset;
+        for (auto drmIterator = 0u; drmIterator < osContextLinux->getDeviceBitfield().size(); drmIterator++) {
+            if (osContextLinux->getDeviceBitfield().test(drmIterator)) {
+                if (*reinterpret_cast<uint32_t *>(completionFenceCpuAddress) < completionFenceValue) {
+                    constexpr int64_t timeout = -1;
+                    constexpr uint16_t flags = 0;
+                    drm.waitUserFence(drmContextIds[drmContextId], completionFenceCpuAddress, completionFenceValue, Drm::ValueWidth::U32, timeout, flags);
+                }
+                drmContextId++;
+                completionFenceCpuAddress = ptrOffset(completionFenceCpuAddress, this->postSyncOffset);
+            }
+        }
+    }
     this->deallocateResources();
 }
@@ -81,6 +104,14 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, siz
     bool ret = false;
     uint32_t drmContextId = 0u;
+    uint32_t completionValue = 0u;
+    uint64_t completionFenceGpuAddress = 0u;
+    if (this->completionFenceAllocation) {
+        completionValue = ++completionFenceValue;
+        completionFenceGpuAddress = this->completionFenceAllocation->getGpuAddress() + Drm::completionFenceOffset;
+    }
     for (auto drmIterator = 0u; drmIterator < osContextLinux->getDeviceBitfield().size(); drmIterator++) {
         if (osContextLinux->getDeviceBitfield().test(drmIterator)) {
             ret |= !!bb->exec(static_cast<uint32_t>(size),
@@ -93,9 +124,12 @@ bool DrmDirectSubmission<GfxFamily, Dispatcher>::submit(uint64_t gpuAddress, siz
                               nullptr,
                               0,
                               &execObject,
-                              0,
-                              0);
+                              completionFenceGpuAddress,
+                              completionValue);
             drmContextId++;
+            if (completionFenceGpuAddress) {
+                completionFenceGpuAddress += this->postSyncOffset;
+            }
         }
     }
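
One detail worth calling out from the submit and destructor hunks above: on a multi-tile context there is one fence slot per enabled tile. Both loops walk the device bitfield and advance their fence address by this->postSyncOffset for every enabled tile, starting from the allocation's base address plus Drm::completionFenceOffset. A small self-contained sketch of that address arithmetic (hypothetical helper and example values, not NEO code):

#include <bitset>
#include <cinttypes>
#include <cstdint>
#include <cstdio>
#include <vector>

// Hypothetical helper: one completion fence slot per enabled tile, starting at
// base + fenceOffset and stepping by the post-sync stride, mirroring how both
// the submit loop and the destructor loop advance their fence addresses.
std::vector<uint64_t> perTileFenceAddresses(uint64_t base, uint64_t fenceOffset,
                                            uint32_t postSyncOffset,
                                            std::bitset<4> deviceBitfield) {
    std::vector<uint64_t> addresses;
    uint64_t current = base + fenceOffset;
    for (size_t tile = 0; tile < deviceBitfield.size(); ++tile) {
        if (deviceBitfield.test(tile)) {
            addresses.push_back(current);
            current += postSyncOffset; // next enabled tile uses the next slot
        }
    }
    return addresses;
}

int main() {
    // example: tiles 0 and 1 enabled, fence at offset 0x8, 16-byte post-sync stride
    for (auto address : perTileFenceAddresses(0x1000, 0x8, 16, std::bitset<4>{0b0011})) {
        std::printf("fence slot at 0x%" PRIx64 "\n", address);
    }
    return 0;
}

The submit side uses the GPU-visible address of each slot when filling the exec call, while the destructor reads the same slots through their CPU mapping before deciding whether it still has to call waitUserFence.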