fix: Use immediate command queue instead of CSR to obtain TaskCount.

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz
2023-08-21 10:35:14 +00:00
committed by Compute-Runtime-Automation
parent bef6b64148
commit f3b2458a9c
7 changed files with 89 additions and 7 deletions

View File

@@ -383,6 +383,8 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
ze_result_t status = ZE_RESULT_SUCCESS;
this->cmdQImmediate->setTaskCount(completionStamp.taskCount);
if (this->isSyncModeQueue || this->printfKernelContainer.size() > 0u) {
status = hostSynchronize(std::numeric_limits<uint64_t>::max(), completionStamp.taskCount, true);
}
@@ -849,7 +851,7 @@ template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint64_t timeout, TaskCountType taskCount, bool handlePostWaitOperations) {
ze_result_t status = ZE_RESULT_SUCCESS;
if (isInOrderExecutionEnabled()) {
if (isInOrderExecutionEnabled() && NEO::DebugManager.flags.UseCounterAllocToSyncInOrderCmdList.get() != 0) {
status = synchronizeInOrderExecution(timeout);
} else {
const int64_t timeoutInMicroSeconds = timeout / 1000;
@@ -876,7 +878,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint6
// Single-argument overload: forwards to hostSynchronize(timeout, taskCount, handlePostWaitOperations)
// with handlePostWaitOperations = true.
// NOTE(review): this excerpt appears to show both sides of the change with the +/- markers lost.
// The csr->peekTaskCount() return is presumably the pre-change line and the
// cmdQImmediate->getTaskCount() return its replacement (per the commit title); read literally,
// the second return is unreachable — confirm against the real file.
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint64_t timeout) {
return hostSynchronize(timeout, this->csr->peekTaskCount(), true);
return hostSynchronize(timeout, this->cmdQImmediate->getTaskCount(), true);
}
template <GFXCORE_FAMILY gfxCoreFamily>
@@ -1041,7 +1043,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(cons
void *cpuMemcpyDstPtr = dstLockPointer ? dstLockPointer : cpuMemCopyInfo.dstPtr;
if (this->dependenciesPresent || isInOrderExecutionEnabled()) {
auto waitStatus = hostSynchronize(std::numeric_limits<uint64_t>::max(), this->csr->peekTaskCount(), false);
auto waitStatus = hostSynchronize(std::numeric_limits<uint64_t>::max(), this->cmdQImmediate->getTaskCount(), false);
if (waitStatus != ZE_RESULT_SUCCESS) {
return waitStatus;

View File

@@ -6,10 +6,12 @@
*/
#pragma once
#include "shared/source/command_stream/task_count_helper.h"
#include "shared/source/helpers/heap_base_address_model.h"
#include <level_zero/ze_api.h>
#include <atomic>
#include <mutex>
#include <vector>
@@ -64,6 +66,9 @@ struct CommandQueue : _ze_command_queue_handle_t {
void setClientId(uint32_t value) { this->clientId = value; }
virtual void unregisterCsrClient() = 0;
// Returns the task count currently stored on this queue (atomic read).
TaskCountType getTaskCount() const {
    return taskCount.load();
}
// Overwrites the task count stored on this queue (atomic write).
void setTaskCount(TaskCountType newTaskCount) {
    taskCount.store(newTaskCount);
}
static constexpr uint32_t clientNotRegistered = std::numeric_limits<uint32_t>::max();
protected:
@@ -72,6 +77,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
uint32_t clientId = clientNotRegistered;
uint32_t partitionCount = 1;
uint32_t activeSubDevices = 1;
std::atomic<TaskCountType> taskCount = 0;
NEO::HeapAddressModel cmdListHeapAddressModel = NEO::HeapAddressModel::PrivateHeaps;
bool preemptionCmdSyncProgramming = true;

View File

@@ -80,8 +80,6 @@ struct CommandQueueImp : public CommandQueue {
// Returns the device pointer held by this queue.
Device *getDevice() {
    return this->device;
}
TaskCountType getTaskCount() { return taskCount; }
// Returns the command stream receiver pointer held by this queue.
NEO::CommandStreamReceiver *getCsr() {
    return this->csr;
}
MOCKABLE_VIRTUAL NEO::WaitStatus reserveLinearStreamSize(size_t size);
@@ -141,8 +139,6 @@ struct CommandQueueImp : public CommandQueue {
ze_command_queue_desc_t desc;
std::vector<Kernel *> printfKernelContainer;
std::atomic<TaskCountType> taskCount{0};
Device *device = nullptr;
NEO::CommandStreamReceiver *csr = nullptr;
NEO::LinearStream *startingCmdBuffer = nullptr;

View File

@@ -2401,8 +2401,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAnd
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndCpuMemcpyWhenGpuHangThenDontSynchronizeEvent, IsAtLeastSkl) {
ze_command_queue_desc_t desc = {};
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.cmdQImmediate = mockCmdQ.get();
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->callBaseWaitForCompletionWithTimeout = false;
@@ -2441,8 +2446,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithBarrierThenWaitForTagUpdate, IsAtLeastSkl) {
ze_command_queue_desc_t desc = {};
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.cmdQImmediate = mockCmdQ.get();
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -2455,8 +2465,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrierThenSetDependenciesPresent, IsAtLeastSkl) {
ze_command_queue_desc_t desc = {};
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.cmdQImmediate = mockCmdQ.get();
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -2473,8 +2488,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrier
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendWaitOnEventsThenSetDependenciesPresent, IsAtLeastSkl) {
ze_command_queue_desc_t desc = {};
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
cmdList.copyThroughLockedPtrEnabled = true;
cmdList.cmdQImmediate = mockCmdQ.get();
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -2627,8 +2647,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndH2DCopyWhenSiz
}
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithDependencyThenAppendBarrierCalled, IsAtLeastSkl) {
ze_command_queue_desc_t desc = {};
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.cmdQImmediate = mockCmdQ.get();
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
constexpr uint32_t numEvents = 5;
@@ -2658,8 +2683,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithD
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithDependencyWithinThresholdThenWaitOnHost, IsAtLeastSkl) {
DebugManagerStateRestore restore;
ze_command_queue_desc_t desc = {};
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
cmdList.cmdQImmediate = mockCmdQ.get();
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
constexpr uint32_t numEvents = 4;

View File

@@ -806,6 +806,52 @@ HWTEST2_F(InOrderCmdListTests, givenQueueFlagWhenCreatingCmdListThenEnableRelaxe
EXPECT_EQ(ZE_RESULT_SUCCESS, zeCommandListDestroy(cmdList));
}
HWTEST2_F(InOrderCmdListTests, givenCmdListsWhenDispatchingThenUseInternalTaskCountForWaits, IsAtLeastSkl) {
    // With the flag at 0, hostSynchronize skips synchronizeInOrderExecution and takes its other branch.
    DebugManager.flags.UseCounterAllocToSyncInOrderCmdList.set(0);

    auto firstCmdList = createImmCmdList<gfxCoreFamily>();
    auto secondCmdList = createImmCmdList<gfxCoreFamily>();

    auto *defaultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);

    // Task count most recently passed to the CSR's wait-with-timeout, as recorded by the ULT CSR.
    const auto lastWaitedTaskCount = [&defaultCsr]() {
        return defaultCsr->latestWaitForCompletionWithTimeoutTaskCount.load();
    };

    // One kernel per list: each immediate queue is expected to hold its own task count.
    firstCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
    secondCmdList->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);

    EXPECT_EQ(1u, firstCmdList->cmdQImmediate->getTaskCount());
    EXPECT_EQ(2u, secondCmdList->cmdQImmediate->getTaskCount());

    // Explicit wait: each list must wait on its own queue's task count.
    {
        firstCmdList->hostSynchronize(0);
        EXPECT_EQ(1u, lastWaitedTaskCount());

        secondCmdList->hostSynchronize(0);
        EXPECT_EQ(2u, lastWaitedTaskCount());
    }

    // Implicit wait: triggered from appendMemoryCopy with the locked-pointer copy path enabled.
    {
        firstCmdList->copyThroughLockedPtrEnabled = true;
        secondCmdList->copyThroughLockedPtrEnabled = true;

        ze_device_mem_alloc_desc_t allocDesc = {};
        void *deviceBuffer = nullptr;
        auto allocStatus = context->allocDeviceMem(device->toHandle(), &allocDesc, 128, 128, &deviceBuffer);
        ASSERT_EQ(allocStatus, ZE_RESULT_SUCCESS);

        uint32_t hostSrc = 0;

        firstCmdList->appendMemoryCopy(deviceBuffer, &hostSrc, 1, nullptr, 0, nullptr, false, false);
        EXPECT_EQ(1u, lastWaitedTaskCount());

        secondCmdList->appendMemoryCopy(deviceBuffer, &hostSrc, 1, nullptr, 0, nullptr, false, false);
        EXPECT_EQ(2u, lastWaitedTaskCount());

        context->freeMem(deviceBuffer);
    }
}
HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenEventHostSyncCalledThenCallWaitUserFence, IsAtLeastXeHpCore) {
NEO::DebugManager.flags.WaitForUserFenceOnEventHostSynchronize.set(1);