feature: CPU copy path for in-order CommandList

Related-To: NEO-7966

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-05-15 17:11:02 +00:00
committed by Compute-Runtime-Automation
parent 5a908f6634
commit 7b207d5e11
5 changed files with 148 additions and 0 deletions

View File

@@ -40,6 +40,7 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
using BaseClass = CommandListCoreFamily<gfxCoreFamily>;
using BaseClass::BaseClass;
using BaseClass::copyThroughLockedPtrEnabled;
using BaseClass::executeCommandListImmediate;
using BaseClass::isCopyOnly;
using BaseClass::isInOrderExecutionEnabled;
@@ -174,6 +175,8 @@ struct CommandListCoreFamilyImmediate : public CommandListCoreFamily<gfxCoreFami
using BaseClass::inOrderDependencyCounterAllocation;
void printKernelsPrintfOutput(bool hangDetected);
ze_result_t synchronizeInOrderExecution() const;
MOCKABLE_VIRTUAL void checkAssert();
std::atomic<bool> dependenciesPresent{false};
};

View File

@@ -20,6 +20,7 @@
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/utilities/wait_util.h"
#include "level_zero/core/source/cmdlist/cmdlist_hw_immediate.h"
#include "level_zero/core/source/cmdqueue/cmdqueue_hw.h"
@@ -787,6 +788,10 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(cons
return ZE_RESULT_ERROR_UNKNOWN;
}
if (isInOrderExecutionEnabled()) {
this->dependenciesPresent = false; // wait only for waitlist and in-order sync value
}
if (numWaitEvents > 0) {
uint32_t numEventsThreshold = 5;
if (NEO::DebugManager.flags.ExperimentalCopyThroughLockWaitlistSizeThreshold.get() != -1) {
@@ -826,6 +831,13 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(cons
this->dependenciesPresent = false;
}
if (isInOrderExecutionEnabled()) {
auto status = synchronizeInOrderExecution();
if (status != ZE_RESULT_SUCCESS) {
return status;
}
}
if (signalEvent) {
signalEvent->setGpuStartTimestamp();
}
@@ -1028,4 +1040,24 @@ bool CommandListCoreFamilyImmediate<gfxCoreFamily>::isRelaxedOrderingDispatchAll
return NEO::RelaxedOrderingHelper::isRelaxedOrderingDispatchAllowed(*this->csr, numEvents);
}
/// Blocks the calling thread until the in-order dependency counter, read through the
/// host pointer of inOrderDependencyCounterAllocation, is at least the value held in
/// inOrderDependencyCounter when this call started.
///
/// Every pass asks the CSR to download the counter allocation and then waits on the
/// counter with a greater-or-equal predicate. The GPU hang check is consulted only
/// when that wait reports failure.
///
/// @return ZE_RESULT_SUCCESS once the counter reached the expected value,
///         ZE_RESULT_ERROR_DEVICE_LOST when the CSR reports a GPU hang.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::synchronizeInOrderExecution() const {
    auto counterPtr = static_cast<uint32_t *>(this->inOrderDependencyCounterAllocation->getUnderlyingBuffer());
    auto expectedValue = this->inOrderDependencyCounter;
    auto previousHangCheck = std::chrono::high_resolution_clock::now();

    for (;;) {
        if (*counterPtr >= expectedValue) {
            return ZE_RESULT_SUCCESS;
        }

        this->csr->downloadAllocation(*this->inOrderDependencyCounterAllocation);

        const bool counterReached = NEO::WaitUtils::waitFunctionWithPredicate<const uint32_t>(counterPtr, expectedValue, std::greater_equal<uint32_t>());
        if (counterReached) {
            // Skip the hang check; the condition at the top of the loop decides whether we are done.
            continue;
        }

        if (this->csr->checkGpuHangDetected(std::chrono::high_resolution_clock::now(), previousHangCheck)) {
            return ZE_RESULT_ERROR_DEVICE_LOST;
        }
    }
}
} // namespace L0

View File

@@ -173,6 +173,7 @@ struct WhiteBox<L0::CommandListCoreFamilyImmediate<gfxCoreFamily>>
using BaseClass::signalAllEventPackets;
using BaseClass::stateBaseAddressTracking;
using BaseClass::stateComputeModeTracking;
using BaseClass::synchronizeInOrderExecution;
WhiteBox() : BaseClass(BaseClass::defaultNumIddsPerBlock) {}
};

View File

@@ -1115,6 +1115,107 @@ HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenProgrammingAppendBarrierThenS
EXPECT_EQ(2u, pcCmd->getImmediateData());
}
// Verifies that synchronizeInOrderExecution() keeps downloading the in-order counter
// allocation until the counter reaches the expected value, and that it reports success.
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenCallingSyncThenHandleCompletion, IsAtLeastXeHpCore) {
    auto immCmdList = createImmCmdList<gfxCoreFamily>();
    auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);

    immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);

    // Start from a counter that has not been signaled yet.
    auto hostAddress = static_cast<uint32_t *>(immCmdList->inOrderDependencyCounterAllocation->getUnderlyingBuffer());
    *hostAddress = 0;

    const uint32_t failCounter = 3;
    uint32_t callCounter = 0;

    // Emulate the counter being written only when the allocation is downloaded for the third time.
    ultCsr->downloadAllocationImpl = [&](GraphicsAllocation &graphicsAllocation) {
        callCounter++;
        if (callCounter >= failCounter) {
            *hostAddress = 1;
        }
    };

    // The returned status must not be dropped: a completed wait has to be reported as success.
    auto status = immCmdList->synchronizeInOrderExecution();
    EXPECT_EQ(ZE_RESULT_SUCCESS, status);

    EXPECT_EQ(3u, callCounter);
    EXPECT_EQ(2u, ultCsr->checkGpuHangDetectedCalled);
    EXPECT_EQ(1u, *hostAddress);
}
// Checks that a memory copy issued with copyThroughLockedPtrEnabled set waits on the
// in-order counter (via allocation downloads and hang checks) without calling
// waitForCompletionWithTimeout or flushing a tag update.
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenDoingCpuCopyThenSynchronize, IsAtLeastXeHpCore) {
    auto immCmdList = createImmCmdList<gfxCoreFamily>();
    immCmdList->copyThroughLockedPtrEnabled = true;

    auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);

    auto eventPool = createEvents(1, false);
    auto waitEventHandle = events[0]->toHandle();

    auto counterPtr = static_cast<uint32_t *>(immCmdList->inOrderDependencyCounterAllocation->getUnderlyingBuffer());
    *counterPtr = 0;

    const uint32_t downloadsBeforeSignal = 3;
    uint32_t downloadCount = 0;

    // From the third download onwards every download bumps the counter by one.
    ultCsr->downloadAllocationImpl = [&](GraphicsAllocation &graphicsAllocation) {
        if (++downloadCount >= downloadsBeforeSignal) {
            ++(*counterPtr);
        }
    };

    immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, waitEventHandle, 0, nullptr, launchParams, false);
    events[0]->setIsCompleted();

    // Reset the CSR bookkeeping so the checks below observe only the copy call.
    ultCsr->waitForCompletionWithTimeoutTaskCountCalled = 0;
    ultCsr->flushTagUpdateCalled = false;

    void *deviceBuffer = nullptr;
    ze_device_mem_alloc_desc_t deviceMemDesc = {};
    auto result = context->allocDeviceMem(device->toHandle(), &deviceMemDesc, 128, 128, &deviceBuffer);
    ASSERT_EQ(result, ZE_RESULT_SUCCESS);

    uint32_t hostSource = 0;
    immCmdList->appendMemoryCopy(deviceBuffer, &hostSource, 1, nullptr, 1, &waitEventHandle, false);

    EXPECT_EQ(3u, downloadCount);
    EXPECT_EQ(1u, *counterPtr);
    EXPECT_EQ(2u, ultCsr->checkGpuHangDetectedCalled);
    EXPECT_EQ(0u, ultCsr->waitForCompletionWithTimeoutTaskCountCalled);
    EXPECT_FALSE(ultCsr->flushTagUpdateCalled);

    context->freeMem(deviceBuffer);
}
// Checks that appendMemoryCopy returns ZE_RESULT_ERROR_DEVICE_LOST when the CSR is forced
// to report a GPU hang while the in-order counter is still at zero.
HWTEST2_F(InOrderCmdListTests, givenInOrderModeWhenGpuHangDetectedInCpuCopyPathThenReportError, IsAtLeastXeHpCore) {
    auto immCmdList = createImmCmdList<gfxCoreFamily>();
    immCmdList->copyThroughLockedPtrEnabled = true;

    auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);

    // Leave the counter at zero so it never reaches the value expected after the kernel launch.
    auto counterPtr = static_cast<uint32_t *>(immCmdList->inOrderDependencyCounterAllocation->getUnderlyingBuffer());
    *counterPtr = 0;

    immCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);

    void *deviceBuffer = nullptr;
    ze_device_mem_alloc_desc_t deviceMemDesc = {};
    auto result = context->allocDeviceMem(device->toHandle(), &deviceMemDesc, 128, 128, &deviceBuffer);
    ASSERT_EQ(result, ZE_RESULT_SUCCESS);

    uint32_t hostSource = 0;

    // Force the hang report only for the duration of the copy call.
    ultCsr->forceReturnGpuHang = true;
    auto copyStatus = immCmdList->appendMemoryCopy(deviceBuffer, &hostSource, 1, nullptr, 0, nullptr, false);
    EXPECT_EQ(ZE_RESULT_ERROR_DEVICE_LOST, copyStatus);
    ultCsr->forceReturnGpuHang = false;

    context->freeMem(deviceBuffer);
}
struct CommandListAppendLaunchKernelWithImplicitArgs : CommandListAppendLaunchKernel {
template <typename FamilyType>
uint64_t getIndirectHeapOffsetForImplicitArgsBuffer(const Mock<::L0::Kernel> &kernel) {

View File

@@ -379,6 +379,15 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
pollForCompletionCalled++;
}
// Test override: counts every invocation, reports a hang unconditionally when
// forceReturnGpuHang is set, and otherwise defers to the base implementation.
bool checkGpuHangDetected(CommandStreamReceiver::TimeType currentTime, CommandStreamReceiver::TimeType &lastHangCheckTime) const override {
    ++checkGpuHangDetectedCalled;
    // Short-circuit keeps the base class from being consulted while a hang is forced.
    return forceReturnGpuHang || BaseClass::checkGpuHangDetected(currentTime, lastHangCheckTime);
}
SubmissionStatus sendRenderStateCacheFlush() override {
if (callBaseSendRenderStateCacheFlush) {
return BaseClass::sendRenderStateCacheFlush();
@@ -404,6 +413,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
uint32_t initDirectSubmissionCalled = 0;
uint32_t fillReusableAllocationsListCalled = 0;
uint32_t pollForCompletionCalled = 0;
mutable uint32_t checkGpuHangDetectedCalled = 0;
int ensureCommandBufferAllocationCalled = 0;
DispatchFlags recordedDispatchFlags;
BlitPropertiesContainer receivedBlitProperties = {};
@@ -435,6 +445,7 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
bool callBaseFillReusableAllocationsList = false;
bool callBaseFlushBcsTask{true};
bool callBaseSendRenderStateCacheFlush = true;
bool forceReturnGpuHang = false;
};
} // namespace NEO