feature: unregister CSR client on Event host synchronize

Related-To: NEO-7458

Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
This commit is contained in:
Dunajski, Bartosz 2023-06-22 16:38:44 +00:00 committed by Compute-Runtime-Automation
parent 61fb19caab
commit aea5f435db
10 changed files with 145 additions and 6 deletions

View File

@ -727,6 +727,8 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::hostSynchronize(uint6
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_result_t inputRet, bool performMigration, bool hasStallingCmds,
bool hasRelaxedOrderingDependencies, ze_event_handle_t hSignalEvent) {
auto signalEvent = Event::fromHandle(hSignalEvent);
if (inputRet == ZE_RESULT_SUCCESS) {
if (isInOrderExecutionEnabled()) {
inOrderDependencyCounter++;
@ -735,14 +737,15 @@ ze_result_t CommandListCoreFamilyImmediate<gfxCoreFamily>::flushImmediate(ze_res
}
if (this->isFlushTaskSubmissionEnabled) {
if (signalEvent && (NEO::DebugManager.flags.TrackNumCsrClientsOnSyncPoints.get() != 0)) {
signalEvent->setLatestUsedCmdQueue(this->cmdQImmediate);
}
inputRet = executeCommandListImmediateWithFlushTask(performMigration, hasStallingCmds, hasRelaxedOrderingDependencies);
} else {
inputRet = executeCommandListImmediate(performMigration);
}
}
auto signalEvent = Event::fromHandle(hSignalEvent);
if (signalEvent) {
signalEvent->setCsr(this->csr);

View File

@ -53,9 +53,7 @@ CommandQueueImp::CommandQueueImp(Device *device, NEO::CommandStreamReceiver *csr
}
ze_result_t CommandQueueImp::destroy() {
if (this->clientId != CommandQueue::clientNotRegistered) {
this->csr->unregisterClient();
}
unregisterCsrClient();
if (commandStream.getCpuBase() != nullptr) {
commandStream.replaceGraphicsAllocation(nullptr);
@ -252,6 +250,13 @@ CommandQueue *CommandQueue::create(uint32_t productFamily, Device *device, NEO::
return commandQueue;
}
// Releases this queue's client registration on its CSR, if it currently holds one.
// The client id is reset to clientNotRegistered afterwards, so a repeated call
// (e.g. from an event sync point and later from destroy()) is a no-op.
void CommandQueueImp::unregisterCsrClient() {
    const bool isRegistered = (getClientId() != CommandQueue::clientNotRegistered);
    if (!isRegistered) {
        return;
    }

    this->csr->unregisterClient();
    setClientId(CommandQueue::clientNotRegistered);
}
// Returns the mode field of the descriptor stored in this queue (desc.mode).
ze_command_queue_mode_t CommandQueueImp::getSynchronousMode() const {
return desc.mode;
}

View File

@ -60,8 +60,9 @@ struct CommandQueue : _ze_command_queue_handle_t {
// Returns the isCopyOnlyCommandQueue flag.
bool peekIsCopyOnlyCommandQueue() const { return this->isCopyOnlyCommandQueue; }
// Returns the stored client id; equals clientNotRegistered when no registration is held.
uint32_t getClientId() { return this->clientId; }
uint32_t getClientId() const { return this->clientId; }
// Overwrites the stored client id.
void setClientId(uint32_t value) { this->clientId = value; }
// Implemented by the concrete queue; expected to release the queue's CSR client registration.
virtual void unregisterCsrClient() = 0;
// Sentinel client id (maximum uint32_t value) meaning "not registered".
static constexpr uint32_t clientNotRegistered = std::numeric_limits<uint32_t>::max();

View File

@ -91,6 +91,7 @@ struct CommandQueueImp : public CommandQueue {
void makeResidentAndMigrate(bool performMigration, const NEO::ResidencyContainer &residencyContainer) override;
void printKernelsPrintfOutput(bool hangDetected);
void checkAssert();
void unregisterCsrClient() override;
protected:
MOCKABLE_VIRTUAL NEO::SubmissionStatus submitBatchBuffer(size_t offset, NEO::ResidencyContainer &residencyContainer, void *endingCmdPtr,

View File

@ -388,6 +388,7 @@ void Event::setIsCompleted() {
if (this->isCompleted.load() == STATE_CLEARED) {
this->isCompleted = STATE_SIGNALED;
}
unsetCmdQueue(true);
}
void Event::enableInOrderExecMode(NEO::GraphicsAllocation &inOrderDependenciesAllocation, uint64_t signalValue) {
@ -397,4 +398,15 @@ void Event::enableInOrderExecMode(NEO::GraphicsAllocation &inOrderDependenciesAl
inOrderExecDataAllocation = &inOrderDependenciesAllocation;
}
// Remembers the command queue that most recently used this event, so that
// unsetCmdQueue(true) can later call unregisterCsrClient() on it.
// The pointer is stored as-is; no ownership is taken here.
void Event::setLatestUsedCmdQueue(CommandQueue *newCmdQ) {
this->latestUsedCmdQueue = newCmdQ;
}
void Event::unsetCmdQueue(bool unregisterClient) {
if (latestUsedCmdQueue && unregisterClient) {
latestUsedCmdQueue->unregisterCsrClient();
}
latestUsedCmdQueue = nullptr;
}
} // namespace L0

View File

@ -37,6 +37,7 @@ struct EventPool;
struct MetricStreamer;
struct ContextImp;
struct Context;
struct CommandQueue;
struct DriverHandle;
struct DriverHandleImp;
struct Device;
@ -214,6 +215,7 @@ struct Event : _ze_event_handle_t {
// Returns the inOrderExecEvent flag.
bool isInOrderExecEvent() const { return inOrderExecEvent; }
// Returns the stored in-order dependencies allocation pointer (may be nullptr).
NEO::GraphicsAllocation *getInOrderExecDataAllocation() const { return inOrderExecDataAllocation; }
// Returns the stored in-order signal value.
uint64_t getInOrderExecSignalValue() const { return inOrderExecSignalValue; }
// Records the command queue that most recently used this event; the pointer is stored as passed.
void setLatestUsedCmdQueue(CommandQueue *newCmdQ);
// Stores a copy of the given timestamp data as the reference timestamp.
void setReferenceTs(NEO::TimeStampData &timestamp) {
referenceTs = timestamp;
}
@ -222,6 +224,8 @@ struct Event : _ze_event_handle_t {
protected:
Event(EventPool *eventPool, int index, Device *device) : device(device), eventPool(eventPool), index(index) {}
void unsetCmdQueue(bool unregisterClient);
uint64_t globalStartTS = 1;
uint64_t globalEndTS = 1;
uint64_t contextStartTS = 1;
@ -253,6 +257,7 @@ struct Event : _ze_event_handle_t {
EventPool *eventPool = nullptr;
Kernel *kernelWithPrintf = nullptr;
NEO::GraphicsAllocation *inOrderExecDataAllocation = nullptr;
// Queue that most recently used this event; set by setLatestUsedCmdQueue(), cleared by unsetCmdQueue().
// NOTE(review): raw non-owning pointer — nothing visible here clears it when the queue itself is
// destroyed; confirm a queue cannot be destroyed while an event still points at it.
CommandQueue *latestUsedCmdQueue = nullptr;
uint32_t maxKernelCount = 0;
uint32_t kernelCount = 1u;

View File

@ -163,6 +163,7 @@ void EventImp<TagSizeT>::handleSuccessfulHostSynchronization() {
}
}
this->setIsCompleted();
unsetCmdQueue(true);
for (auto &csr : csrs) {
csr->getInternalAllocationStorage()->cleanAllocationList(csr->peekTaskCount(), NEO::AllocationUsage::TEMPORARY_ALLOCATION);
}
@ -396,6 +397,7 @@ ze_result_t EventImp<TagSizeT>::reset() {
inOrderExecDataAllocation = nullptr;
inOrderExecSignalValue = 0;
}
unsetCmdQueue(false);
this->resetCompletionStatus();
this->resetDeviceCompletionData(false);
this->l3FlushAppliedOnKernel.reset();

View File

@ -10,6 +10,7 @@
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/test/common/cmd_parse/gen_cmd_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/relaxed_ordering_commands_helper.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/libult/ult_command_stream_receiver.h"
@ -1108,6 +1109,113 @@ HWTEST2_F(CommandListCreate, whenDispatchingThenPassNumCsrClients, IsAtLeastXeHp
EXPECT_EQ(ultCsr->latestFlushedBatchBuffer.numCsrClients, ultCsr->getNumClients());
}
// Checks the CSR client count around event sync points for an immediate command list:
//  1) event marked completed on the host, then synchronized  -> client unregistered,
//  2) event memory written as signaled, then synchronized     -> client unregistered,
//  3) event reset before a synchronize with a short timeout   -> client stays registered.
HWTEST_F(CommandListCreate, givenSignalEventWhenCallingSynchronizeThenUnregisterClient) {
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
ASSERT_NE(nullptr, commandList);
auto whiteBoxCmdList = static_cast<CommandList *>(commandList.get());
Mock<::L0::Kernel> kernel;
ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
// The CSR behind the command list is inspected directly to read the number of registered clients.
auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->csr);
// One event per scenario, all created from the same pool.
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 3;
ze_event_desc_t eventDesc = {};
ze_event_handle_t event1 = nullptr;
ze_event_handle_t event2 = nullptr;
ze_event_handle_t event3 = nullptr;
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event1));
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event2));
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event3));
// Nothing has been submitted yet, so no client is registered.
EXPECT_EQ(ultCsr->getNumClients(), 0u);
{
// Scenario 1: the launch registers one client; completing the event on the host and
// synchronizing on it is expected to bring the count back to zero.
commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, event1, 0, nullptr, launchParams, false);
EXPECT_EQ(ultCsr->getNumClients(), 1u);
Event::fromHandle(event1)->setIsCompleted();
zeEventHostSynchronize(event1, std::numeric_limits<uint64_t>::max());
EXPECT_EQ(ultCsr->getNumClients(), 0u);
}
{
// Scenario 2: instead of calling setIsCompleted(), STATE_SIGNALED is written straight into the
// event's host address (presumably emulating device-side completion) before synchronizing.
commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, event2, 0, nullptr, launchParams, false);
EXPECT_EQ(ultCsr->getNumClients(), 1u);
*reinterpret_cast<uint32_t *>(Event::fromHandle(event2)->getHostAddress()) = static_cast<uint32_t>(Event::STATE_SIGNALED);
zeEventHostSynchronize(event2, std::numeric_limits<uint64_t>::max());
EXPECT_EQ(ultCsr->getNumClients(), 0u);
}
{
// Scenario 3: the event is reset before the synchronize call (timeout value 1);
// the client registered by the launch is expected to remain.
commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, event3, 0, nullptr, launchParams, false);
EXPECT_EQ(ultCsr->getNumClients(), 1u);
zeEventHostReset(event3);
zeEventHostSynchronize(event3, 1);
EXPECT_EQ(ultCsr->getNumClients(), 1u);
}
// Events were created through raw handles, so they are destroyed explicitly.
zeEventDestroy(event1);
zeEventDestroy(event2);
zeEventDestroy(event3);
}
// With TrackNumCsrClientsOnSyncPoints forced to 0, completing and synchronizing on the signal
// event must leave the CSR client count untouched (it stays at 1 after the launch).
HWTEST_F(CommandListCreate, givenDebugFlagSetWhenCallingSynchronizeThenDontUnregister) {
// NOTE(review): DebugManagerStateRestore presumably restores the debug flags when it goes out of scope — confirm.
DebugManagerStateRestore restore;
DebugManager.flags.TrackNumCsrClientsOnSyncPoints.set(0);
ze_command_queue_desc_t desc = {};
desc.mode = ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS;
ze_result_t returnValue;
std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::RenderCompute, returnValue));
ASSERT_NE(nullptr, commandList);
auto whiteBoxCmdList = static_cast<CommandList *>(commandList.get());
Mock<::L0::Kernel> kernel;
ze_group_count_t groupCount{1, 1, 1};
CmdListKernelLaunchParams launchParams = {};
// The CSR behind the command list is inspected directly to read the number of registered clients.
auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->csr);
ze_event_pool_desc_t eventPoolDesc = {};
eventPoolDesc.count = 1;
ze_event_desc_t eventDesc = {};
ze_event_handle_t event = nullptr;
std::unique_ptr<L0::EventPool> eventPool(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, returnValue));
ASSERT_EQ(ZE_RESULT_SUCCESS, eventPool->createEvent(&eventDesc, &event));
EXPECT_EQ(ultCsr->getNumClients(), 0u);
// The launch registers one client on the CSR.
commandList->appendLaunchKernel(kernel.toHandle(), &groupCount, event, 0, nullptr, launchParams, false);
EXPECT_EQ(ultCsr->getNumClients(), 1u);
// Same completion + synchronize sequence that unregisters the client when tracking is enabled;
// here the count is expected to stay at 1.
Event::fromHandle(event)->setIsCompleted();
zeEventHostSynchronize(event, std::numeric_limits<uint64_t>::max());
EXPECT_EQ(ultCsr->getNumClients(), 1u);
zeEventDestroy(event);
}
HWTEST2_F(CommandListCreate, givenDirectSubmissionAndImmCmdListWhenDispatchingThenPassRelaxedOrderingDependenciesInfo, IsAtLeastXeHpcCore) {
DebugManagerStateRestore restore;
DebugManager.flags.DirectSubmissionRelaxedOrdering.set(1);

View File

@ -245,6 +245,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, DebugSetMemoryDiagnosticsDelay, -1, "-1: default
DECLARE_DEBUG_VARIABLE(int32_t, EnableDeviceStateVerification, -1, "-1: default, 0: disable, 1: enable check of device state before submit on Windows")
DECLARE_DEBUG_VARIABLE(int32_t, EnableDynamicPostSyncAllocLayout, -1, "-1: default, 0: Keep Timestamp size layout, 1: Use write immediate layout (qword) and switch dynamically to TS for profiling")
DECLARE_DEBUG_VARIABLE(int32_t, PrintTimestampPacketUsage, -1, "-1: default, 0: Disabled, 1: Print when TSP is allocated, initialized, returned to pool, etc.")
DECLARE_DEBUG_VARIABLE(int32_t, TrackNumCsrClientsOnSyncPoints, -1, "-1: default, 0: Disabled, 1: If set, synchronization points like zeEventHostSynchronize will unregister CmdQ from CSR clients")
/*LOGGING FLAGS*/
DECLARE_DEBUG_VARIABLE(int32_t, PrintDriverDiagnostics, -1, "prints driver diagnostics messages to standard output, value corresponds to hint level")

View File

@ -536,5 +536,6 @@ VfBarResourceAllocationWa = 1
EnableDynamicPostSyncAllocLayout = -1
ForceNumberOfThreadsInGpgpuThreadGroup = -1
PrintTimestampPacketUsage = -1
TrackNumCsrClientsOnSyncPoints = -1
CommandListTimestampRefreshIntervalInMilliSec = -1
# Please don't edit below this line