mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
fix: Use immediate command queue instead of CSR to obtain TaskCount.
Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
committed by
Compute-Runtime-Automation
parent
bef6b64148
commit
f3b2458a9c
@@ -383,6 +383,8 @@ inline ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::executeCommand
|
||||
|
||||
ze_result_t status = ZE_RESULT_SUCCESS;
|
||||
|
||||
this->cmdQImmediate->setTaskCount(completionStamp.taskCount);
|
||||
|
||||
if (this->isSyncModeQueue || this->printfKernelContainer.size() > 0u) {
|
||||
status = hostSynchronize(std::numeric_limits<uint64_t>::max(), completionStamp.taskCount, true);
|
||||
}
|
||||
@@ -849,7 +851,7 @@ template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint64_t timeout, TaskCountType taskCount, bool handlePostWaitOperations) {
|
||||
ze_result_t status = ZE_RESULT_SUCCESS;
|
||||
|
||||
- if (isInOrderExecutionEnabled()) {
|
||||
+ if (isInOrderExecutionEnabled() && NEO::DebugManager.flags.UseCounterAllocToSyncInOrderCmdList.get() != 0) {
|
||||
status = synchronizeInOrderExecution(timeout);
|
||||
} else {
|
||||
const int64_t timeoutInMicroSeconds = timeout / 1000;
|
||||
@@ -876,7 +878,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint6
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint64_t timeout) {
|
||||
- return hostSynchronize(timeout, this->csr->peekTaskCount(), true);
|
||||
+ return hostSynchronize(timeout, this->cmdQImmediate->getTaskCount(), true);
|
||||
}
|
||||
|
||||
template <GFXCORE_FAMILY gfxCoreFamily>
|
||||
@@ -1041,7 +1043,7 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::performCpuMemcpy(cons
|
||||
void *cpuMemcpyDstPtr = dstLockPointer ? dstLockPointer : cpuMemCopyInfo.dstPtr;
|
||||
|
||||
if (this->dependenciesPresent || isInOrderExecutionEnabled()) {
|
||||
- auto waitStatus = hostSynchronize(std::numeric_limits<uint64_t>::max(), this->csr->peekTaskCount(), false);
|
||||
+ auto waitStatus = hostSynchronize(std::numeric_limits<uint64_t>::max(), this->cmdQImmediate->getTaskCount(), false);
|
||||
|
||||
if (waitStatus != ZE_RESULT_SUCCESS) {
|
||||
return waitStatus;
|
||||
|
||||
@@ -6,10 +6,12 @@
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "shared/source/command_stream/task_count_helper.h"
|
||||
#include "shared/source/helpers/heap_base_address_model.h"
|
||||
|
||||
#include <level_zero/ze_api.h>
|
||||
|
||||
#include <atomic>
|
||||
#include <mutex>
|
||||
#include <vector>
|
||||
|
||||
@@ -64,6 +66,9 @@ struct CommandQueue : _ze_command_queue_handle_t {
|
||||
void setClientId(uint32_t value) { this->clientId = value; }
|
||||
virtual void unregisterCsrClient() = 0;
|
||||
|
||||
TaskCountType getTaskCount() const { return taskCount; }
|
||||
void setTaskCount(TaskCountType newTaskCount) { taskCount = newTaskCount; }
|
||||
|
||||
static constexpr uint32_t clientNotRegistered = std::numeric_limits<uint32_t>::max();
|
||||
|
||||
protected:
|
||||
@@ -72,6 +77,7 @@ struct CommandQueue : _ze_command_queue_handle_t {
|
||||
uint32_t clientId = clientNotRegistered;
|
||||
uint32_t partitionCount = 1;
|
||||
uint32_t activeSubDevices = 1;
|
||||
std::atomic<TaskCountType> taskCount = 0;
|
||||
NEO::HeapAddressModel cmdListHeapAddressModel = NEO::HeapAddressModel::PrivateHeaps;
|
||||
|
||||
bool preemptionCmdSyncProgramming = true;
|
||||
|
||||
@@ -80,8 +80,6 @@ struct CommandQueueImp : public CommandQueue {
|
||||
|
||||
Device *getDevice() { return device; }
|
||||
|
||||
- TaskCountType getTaskCount() { return taskCount; }
|
||||
|
||||
NEO::CommandStreamReceiver *getCsr() { return csr; }
|
||||
|
||||
MOCKABLE_VIRTUAL NEO::WaitStatus reserveLinearStreamSize(size_t size);
|
||||
@@ -141,8 +139,6 @@ struct CommandQueueImp : public CommandQueue {
|
||||
ze_command_queue_desc_t desc;
|
||||
std::vector<Kernel *> printfKernelContainer;
|
||||
|
||||
- std::atomic<TaskCountType> taskCount{0};
|
||||
|
||||
Device *device = nullptr;
|
||||
NEO::CommandStreamReceiver *csr = nullptr;
|
||||
NEO::LinearStream *startingCmdBuffer = nullptr;
|
||||
|
||||
@@ -2401,8 +2401,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAnd
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndSignalEventAndCpuMemcpyWhenGpuHangThenDontSynchronizeEvent, IsAtLeastSkl) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
|
||||
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
|
||||
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.copyThroughLockedPtrEnabled = true;
|
||||
cmdList.cmdQImmediate = mockCmdQ.get();
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
reinterpret_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(cmdList.csr)->callBaseWaitForCompletionWithTimeout = false;
|
||||
@@ -2441,8 +2446,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWithBarrierThenWaitForTagUpdate, IsAtLeastSkl) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
|
||||
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
|
||||
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.copyThroughLockedPtrEnabled = true;
|
||||
cmdList.cmdQImmediate = mockCmdQ.get();
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
|
||||
@@ -2455,8 +2465,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenCpuMemcpyWith
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrierThenSetDependenciesPresent, IsAtLeastSkl) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
|
||||
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
|
||||
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.copyThroughLockedPtrEnabled = true;
|
||||
cmdList.cmdQImmediate = mockCmdQ.get();
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
|
||||
@@ -2473,8 +2488,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendBarrier
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListWhenAppendWaitOnEventsThenSetDependenciesPresent, IsAtLeastSkl) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
|
||||
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
|
||||
|
||||
MockCommandListImmediateHw<gfxCoreFamily> cmdList;
|
||||
cmdList.copyThroughLockedPtrEnabled = true;
|
||||
cmdList.cmdQImmediate = mockCmdQ.get();
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
|
||||
@@ -2627,8 +2647,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndH2DCopyWhenSiz
|
||||
}
|
||||
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithDependencyThenAppendBarrierCalled, IsAtLeastSkl) {
|
||||
ze_command_queue_desc_t desc = {};
|
||||
|
||||
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
|
||||
|
||||
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
cmdList.cmdQImmediate = mockCmdQ.get();
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
|
||||
constexpr uint32_t numEvents = 5;
|
||||
@@ -2658,8 +2683,13 @@ HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithD
|
||||
HWTEST2_F(AppendMemoryLockedCopyTest, givenImmediateCommandListAndCpuMemcpyWithDependencyWithinThresholdThenWaitOnHost, IsAtLeastSkl) {
|
||||
DebugManagerStateRestore restore;
|
||||
|
||||
ze_command_queue_desc_t desc = {};
|
||||
|
||||
auto mockCmdQ = std::make_unique<Mock<CommandQueue>>(device, device->getNEODevice()->getInternalEngine().commandStreamReceiver, &desc);
|
||||
|
||||
MockAppendMemoryLockedCopyTestImmediateCmdList<gfxCoreFamily> cmdList;
|
||||
cmdList.csr = device->getNEODevice()->getInternalEngine().commandStreamReceiver;
|
||||
cmdList.cmdQImmediate = mockCmdQ.get();
|
||||
cmdList.initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
|
||||
|
||||
constexpr uint32_t numEvents = 4;
|
||||
|
||||
@@ -806,6 +806,52 @@ HWTEST2_F(InOrderCmdListTests, givenQueueFlagWhenCreatingCmdListThenEnableRelaxe
|
||||
EXPECT_EQ(ZE_RESULT_SUCCESS, zeCommandListDestroy(cmdList));
|
||||
}
|
||||
|
||||
HWTEST2_F(InOrderCmdListTests, givenCmdListsWhenDispatchingThenUseInternalTaskCountForWaits, IsAtLeastSkl) {
|
||||
DebugManager.flags.UseCounterAllocToSyncInOrderCmdList.set(0);
|
||||
|
||||
auto immCmdList0 = createImmCmdList<gfxCoreFamily>();
|
||||
auto immCmdList1 = createImmCmdList<gfxCoreFamily>();
|
||||
|
||||
auto ultCsr = static_cast<UltCommandStreamReceiver<FamilyType> *>(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
|
||||
|
||||
immCmdList0->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
|
||||
|
||||
immCmdList1->appendLaunchKernel(kernel->toHandle(), &groupCount, nullptr, 0, nullptr, launchParams, false);
|
||||
|
||||
EXPECT_EQ(1u, immCmdList0->cmdQImmediate->getTaskCount());
|
||||
EXPECT_EQ(2u, immCmdList1->cmdQImmediate->getTaskCount());
|
||||
|
||||
// explicit wait
|
||||
{
|
||||
immCmdList0->hostSynchronize(0);
|
||||
EXPECT_EQ(1u, ultCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
|
||||
immCmdList1->hostSynchronize(0);
|
||||
EXPECT_EQ(2u, ultCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
}
|
||||
|
||||
// implicit wait
|
||||
{
|
||||
immCmdList0->copyThroughLockedPtrEnabled = true;
|
||||
immCmdList1->copyThroughLockedPtrEnabled = true;
|
||||
|
||||
void *deviceAlloc = nullptr;
|
||||
ze_device_mem_alloc_desc_t deviceDesc = {};
|
||||
auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 128, 128, &deviceAlloc);
|
||||
ASSERT_EQ(result, ZE_RESULT_SUCCESS);
|
||||
|
||||
uint32_t hostCopyData = 0;
|
||||
|
||||
immCmdList0->appendMemoryCopy(deviceAlloc, &hostCopyData, 1, nullptr, 0, nullptr, false, false);
|
||||
EXPECT_EQ(1u, ultCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
|
||||
immCmdList1->appendMemoryCopy(deviceAlloc, &hostCopyData, 1, nullptr, 0, nullptr, false, false);
|
||||
EXPECT_EQ(2u, ultCsr->latestWaitForCompletionWithTimeoutTaskCount.load());
|
||||
|
||||
context->freeMem(deviceAlloc);
|
||||
}
|
||||
}
|
||||
|
||||
HWTEST2_F(InOrderCmdListTests, givenDebugFlagSetWhenEventHostSyncCalledThenCallWaitUserFence, IsAtLeastXeHpCore) {
|
||||
NEO::DebugManager.flags.WaitForUserFenceOnEventHostSynchronize.set(1);
|
||||
|
||||
|
||||
Reference in New Issue
Block a user