diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index f491b3706c..320dd3e65e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -200,15 +200,15 @@ void CommandListCoreFamily::handleInOrderCounterOverflow(bool cop CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue() + 1, inOrderExecInfo->getAllocationOffset(), false, true, false, false, isDualStreamCopyOffloadOperation(copyOffloadOperation)); - inOrderExecInfo->resetCounterValue(); - uint32_t newOffset = 0; if (inOrderExecInfo->getAllocationOffset() == 0) { // multitile immediate writes are uint64_t aligned newOffset = alignUp(this->partitionCount * device->getL0GfxCoreHelper().getImmediateWritePostSyncOffset(), MemoryConstants::cacheLineSize * 4); + UNRECOVERABLE_IF(newOffset == 0); } inOrderExecInfo->setAllocationOffset(newOffset); + inOrderExecInfo->resetCounterValue(); inOrderExecInfo->initializeAllocationsFromHost(); CommandListCoreFamily::appendSignalInOrderDependencyCounter(nullptr, copyOffloadOperation, false, false, false); // signal counter on new offset @@ -2970,7 +2970,7 @@ bool CommandListCoreFamily::handleInOrderImplicitDependencies(boo } if (hasInOrderDependencies()) { - if (inOrderExecInfo->isCounterAlreadyDone(inOrderExecInfo->getCounterValue())) { + if (inOrderExecInfo->isCounterAlreadyDone(inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset())) { this->latestOperationHasOptimizedCbEvent = false; return false; } @@ -4697,7 +4697,7 @@ void CommandListCoreFamily::patchInOrderCmds() { } template bool CommandListCoreFamily::hasInOrderDependencies() const { - return (inOrderExecInfo.get() && inOrderExecInfo->getCounterValue() > 0); + return (inOrderExecInfo.get() && inOrderExecInfo->getCounterValue() > inOrderExecInfo->getInitialCounterValue()); } template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index b6b17b684c..e0bf7b3266 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -1215,9 +1215,9 @@ ze_result_t CommandListCoreFamilyImmediate::hostSynchronize(uint6 uint64_t inOrderSyncValue = this->inOrderExecInfo.get() ? inOrderExecInfo->getCounterValue() : 0; - if (inOrderWaitAllowed) { + if (inOrderWaitAllowed && !inOrderExecInfo->isCounterAlreadyDone(inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset())) { status = synchronizeInOrderExecution(timeout, (waitQueue == this->cmdQImmediateCopyOffload)); - } else { + } else if (!inOrderWaitAllowed) { const int64_t timeoutInMicroSeconds = timeout / 1000; const auto indefinitelyPoll = timeout == std::numeric_limits::max(); const auto waitStatus = waitCsr->waitForCompletionWithTimeout(NEO::WaitParams{indefinitelyPoll, !indefinitelyPoll, false, timeoutInMicroSeconds}, waitTaskCount); @@ -1230,7 +1230,7 @@ ze_result_t CommandListCoreFamilyImmediate::hostSynchronize(uint6 if (status != ZE_RESULT_NOT_READY) { if (isInOrderExecutionEnabled()) { - inOrderExecInfo->setLastWaitedCounterValue(inOrderSyncValue); + inOrderExecInfo->setLastWaitedCounterValue(inOrderSyncValue, inOrderExecInfo->getAllocationOffset()); } if (this->isTbxMode && (status == ZE_RESULT_SUCCESS)) { diff --git a/level_zero/core/source/event/event.cpp b/level_zero/core/source/event/event.cpp index c96b726f31..f2a1ce2475 100644 --- a/level_zero/core/source/event/event.cpp +++ b/level_zero/core/source/event/event.cpp @@ -695,7 +695,7 @@ void Event::unsetInOrderExecInfo() { void Event::resetInOrderTimestampNode(NEO::TagNodeBase *newNode, uint32_t partitionCount) { if (inOrderIncrementValue == 0 || !newNode) { for (auto &node : inOrderTimestampNode) { - inOrderExecInfo->pushTempTimestampNode(node, inOrderExecSignalValue); + inOrderExecInfo->pushTempTimestampNode(node, inOrderExecSignalValue, this->getInOrderAllocationOffset()); } inOrderTimestampNode.clear(); @@ -720,7 +720,7 @@ void Event::resetAdditionalTimestampNode(NEO::TagNodeBase *newNode, uint32_t par } else if (resetAggregatedEvent) { // If we are resetting aggregated event, we need to clear all additional timestamp nodes for (auto &node : additionalTimestampNode) { - inOrderExecInfo->pushTempTimestampNode(node, inOrderExecSignalValue); + inOrderExecInfo->pushTempTimestampNode(node, inOrderExecSignalValue, this->getInOrderAllocationOffset()); } additionalTimestampNode.clear(); } @@ -731,7 +731,7 @@ void Event::resetAdditionalTimestampNode(NEO::TagNodeBase *newNode, uint32_t par for (auto &node : additionalTimestampNode) { if (inOrderExecInfo) { // Push to temp node vector and releaseNotUsedTempTimestampNodes will clear when needed - inOrderExecInfo->pushTempTimestampNode(node, inOrderExecSignalValue); + inOrderExecInfo->pushTempTimestampNode(node, inOrderExecSignalValue, this->getInOrderAllocationOffset()); } else { node->returnTag(); } diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl index 58235ca1c7..52e6ef9b27 100644 --- a/level_zero/core/source/event/event_impl.inl +++ b/level_zero/core/source/event/event_impl.inl @@ -305,7 +305,7 @@ ze_result_t EventImp::queryCounterBasedEventStatus() { auto waitValue = getInOrderExecSignalValueWithSubmissionCounter(); - if (!inOrderExecInfo->isCounterAlreadyDone(waitValue)) { + if (!inOrderExecInfo->isCounterAlreadyDone(waitValue, this->getInOrderAllocationOffset())) { bool signaled = true; if (this->optimizedCbEvent) { @@ -327,7 +327,7 @@ ze_result_t EventImp::queryCounterBasedEventStatus() { if (!signaled) { return ZE_RESULT_NOT_READY; } - inOrderExecInfo->setLastWaitedCounterValue(waitValue); + inOrderExecInfo->setLastWaitedCounterValue(waitValue, this->getInOrderAllocationOffset()); } handleSuccessfulHostSynchronization(); @@ -770,7 +770,7 @@ ze_result_t EventImp::hostSynchronize(uint64_t timeout) { if (this->optimizedCbEvent) { synchronizeTimestampCompletionWithTimeout(); if (this->isTimestampPopulated()) { - inOrderExecInfo->setLastWaitedCounterValue(getInOrderExecSignalValueWithSubmissionCounter()); + inOrderExecInfo->setLastWaitedCounterValue(getInOrderExecSignalValueWithSubmissionCounter(), this->getInOrderAllocationOffset()); handleSuccessfulHostSynchronization(); ret = ZE_RESULT_SUCCESS; this->optimizedCbEvent = false; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp index 79a1a351d4..6a414b3d3d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp @@ -328,25 +328,24 @@ HWTEST_F(InOrderCmdListTests, givenCounterBasedEventsWhenHostWaitsAreCalledThenL EXPECT_EQ(ZE_RESULT_SUCCESS, status); auto counterValue = events[1]->inOrderExecSignalValue; - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(counterValue)); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(events[0]->inOrderExecSignalValue)); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(counterValue + 1)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(counterValue, 0)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(events[0]->inOrderExecSignalValue, 0)); + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(counterValue + 1, 0)); // setting lower counter ignored - inOrderExecInfo->setLastWaitedCounterValue(counterValue - 1); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(counterValue)); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(events[0]->inOrderExecSignalValue)); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(counterValue + 1)); + inOrderExecInfo->setLastWaitedCounterValue(counterValue - 1, 0); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(counterValue, 0)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(events[0]->inOrderExecSignalValue, 0)); + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(counterValue + 1, 0)); status = events[0]->hostSynchronize(-1); EXPECT_EQ(ZE_RESULT_SUCCESS, status); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(counterValue)); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(counterValue + 1)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(counterValue, 0)); + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(counterValue + 1, 0)); - // setting offset disables mechanism inOrderExecInfo->setAllocationOffset(4u); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(0u)); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(counterValue)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(0u, 0)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(counterValue, 0)); completeHostAddress>>(immCmdList.get()); } @@ -529,9 +528,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenCounterBasedTimestampEven cmdList->appendLaunchKernel(kernel->toHandle(), groupCount, event3->toHandle(), 0, nullptr, launchParams); event3->hostEventSetValue(Event::STATE_CLEARED); - event1->getInOrderExecInfo()->setLastWaitedCounterValue(2); - event2->getInOrderExecInfo()->setLastWaitedCounterValue(2); - event3->getInOrderExecInfo()->setLastWaitedCounterValue(3); + event1->getInOrderExecInfo()->setLastWaitedCounterValue(2, 0); + event2->getInOrderExecInfo()->setLastWaitedCounterValue(2, 0); + event3->getInOrderExecInfo()->setLastWaitedCounterValue(3, 0); EXPECT_EQ(ZE_RESULT_SUCCESS, event1->queryStatus()); EXPECT_EQ(ZE_RESULT_SUCCESS, event2->queryStatus()); @@ -1781,7 +1780,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenImmediateCmdListWhenDispa EXPECT_EQ(Event::CounterBasedMode::implicitlyEnabled, events[0]->counterBasedMode); } if (!events[0]->inOrderTimestampNode.empty()) { - copyOnlyCmdList->inOrderExecInfo->pushTempTimestampNode(events[0]->inOrderTimestampNode[0], events[0]->inOrderExecSignalValue); + copyOnlyCmdList->inOrderExecInfo->pushTempTimestampNode(events[0]->inOrderTimestampNode[0], events[0]->inOrderExecSignalValue, 0); } events[0]->inOrderTimestampNode.clear(); events[0]->makeCounterBasedInitiallyDisabled(eventPool->getAllocation()); @@ -5206,12 +5205,13 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenInOrderModeWhenCallingSyn ultCsr->forceReturnGpuHang = false; forceFail = false; callCounter = 0; + immCmdList->getInOrderExecInfo()->addCounterValue(1); EXPECT_EQ(ZE_RESULT_SUCCESS, immCmdList->hostSynchronize(std::numeric_limits::max(), false)); EXPECT_EQ(downloadedAlloc, expectedAlloc); - EXPECT_EQ(failCounter, callCounter); - EXPECT_EQ(failCounter - 1, ultCsr->checkGpuHangDetectedCalled); - EXPECT_EQ(1u, *hostAddress); + EXPECT_EQ(failCounter + 1, callCounter); + EXPECT_EQ(failCounter, ultCsr->checkGpuHangDetectedCalled); + EXPECT_EQ(2u, *hostAddress); } immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams); @@ -5299,6 +5299,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenDebugFlagSetWhenCallingSy // success { + immCmdList->getInOrderExecInfo()->addCounterValue(1); + ultCsr->checkGpuHangDetectedCalled = 0; ultCsr->forceReturnGpuHang = false; forceFail = false; @@ -5306,9 +5308,9 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenDebugFlagSetWhenCallingSy EXPECT_EQ(downloadedAlloc, hostAlloc); EXPECT_EQ(ZE_RESULT_SUCCESS, immCmdList->hostSynchronize(std::numeric_limits::max(), false)); - EXPECT_EQ(failCounter, callCounter); - EXPECT_EQ(failCounter - 1, ultCsr->checkGpuHangDetectedCalled); - EXPECT_EQ(1u, *hostAddress); + EXPECT_EQ(failCounter + 1, callCounter); + EXPECT_EQ(failCounter, ultCsr->checkGpuHangDetectedCalled); + EXPECT_EQ(2u, *hostAddress); } immCmdList->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams); diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index 32fb7936e3..755c426fed 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -4948,12 +4948,12 @@ HWTEST2_F(EventMultiTileDynamicPacketUseTest, givenEventCounterBasedUsedCreatedO event2->eventPoolAllocation = nullptr; auto inOrderExecInfo0 = NEO::InOrderExecInfo::create(device->getDeviceInOrderCounterAllocator()->getTag(), nullptr, *device->getNEODevice(), 1, false); - inOrderExecInfo0->setLastWaitedCounterValue(1); + inOrderExecInfo0->setLastWaitedCounterValue(1, 0); event0->updateInOrderExecState(inOrderExecInfo0, 1, 0); uint64_t counter = 2; auto inOrderExecInfo1 = NEO::InOrderExecInfo::createFromExternalAllocation(*device->getNEODevice(), nullptr, 0x1, nullptr, &counter, 1, 1, 1); - inOrderExecInfo1->setLastWaitedCounterValue(1); + inOrderExecInfo1->setLastWaitedCounterValue(1, 0); event1->updateInOrderExecState(inOrderExecInfo1, 1, 0); MockGraphicsAllocation mockAlloc(rootDeviceIndex, nullptr, 1); diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 3d0f546d1b..238f49d27e 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -268,6 +268,7 @@ DECLARE_DEBUG_VARIABLE(int32_t, ForceInOrderImmediateCmdListExecution, -1, "-1: DECLARE_DEBUG_VARIABLE(int32_t, ForceInOrderEvents, -1, "-1: default, 0: disabled, 1: Enable all Events as in-order, to rely on command list counter value") DECLARE_DEBUG_VARIABLE(int32_t, ForceCopyOperationOffloadForComputeCmdList, -1, "-1: default, 0: disabled, 1: Enabled for immediate in-order cmd lists, 2: Enabled for all types. If enabled, all compute cmdlist will try to offload copy operations to copy engine") DECLARE_DEBUG_VARIABLE(int32_t, EnableImplicitConvertionToCounterBasedEvents, -1, "-1: default, 0: Disable, 1: Enable. If enabled, try to convert Regular Events used on Immediate CL to CounterBased") +DECLARE_DEBUG_VARIABLE(int64_t, InitialCounterBasedEventValue, -1, "-1: default, >=0: initial value set during counter creation") DECLARE_DEBUG_VARIABLE(int32_t, ForceTlbFlush, -1, "-1: default, 0: Tlb flush disabled, 1: Tlb Flush enabled") DECLARE_DEBUG_VARIABLE(int32_t, AllowDcFlush, -1, "-1: default, 0: DC flush disabled, 1: DC flush enabled") DECLARE_DEBUG_VARIABLE(int32_t, DebugSetMemoryDiagnosticsDelay, -1, "-1: default, >=0: delay time in minutes necessary for completion of Memory diagnostics") diff --git a/shared/source/helpers/in_order_cmd_helpers.cpp b/shared/source/helpers/in_order_cmd_helpers.cpp index b48200b501..5c2fec5158 100644 --- a/shared/source/helpers/in_order_cmd_helpers.cpp +++ b/shared/source/helpers/in_order_cmd_helpers.cpp @@ -73,7 +73,9 @@ InOrderExecInfo::InOrderExecInfo(TagNodeBase *deviceCounterNode, TagNodeBase *ho deviceAddress = deviceCounterNode->getGpuAddress(); } - isTbx = device.getDefaultEngine().commandStreamReceiver->isTbxMode(); + auto csr = device.getDefaultEngine().commandStreamReceiver; + isTbx = csr->isTbxMode(); + immWritePostSyncWriteOffset = std::max(csr->getImmWritePostSyncWriteOffset(), static_cast(sizeof(uint64_t))); reset(); } @@ -98,20 +100,28 @@ void InOrderExecInfo::uploadToTbx(TagNodeBase &node, size_t size) { } void InOrderExecInfo::initializeAllocationsFromHost() { + const uint64_t initialValue = getInitialCounterValue(); + if (deviceCounterNode) { - const size_t deviceAllocationWriteSize = sizeof(uint64_t) * numDevicePartitionsToWait; - memset(ptrOffset(deviceCounterNode->getCpuBase(), allocationOffset), 0, deviceAllocationWriteSize); + for (uint32_t i = 0; i < numDevicePartitionsToWait; i++) { + uint64_t *ptr = reinterpret_cast(ptrOffset(deviceCounterNode->getCpuBase(), allocationOffset + (i * immWritePostSyncWriteOffset))); + *ptr = initialValue; + } if (isTbx) { + const size_t deviceAllocationWriteSize = alignUp(sizeof(uint64_t), immWritePostSyncWriteOffset) * numDevicePartitionsToWait; uploadToTbx(*deviceCounterNode, deviceAllocationWriteSize); } } if (hostCounterNode) { - const size_t hostAllocationWriteSize = sizeof(uint64_t) * numHostPartitionsToWait; - memset(ptrOffset(hostCounterNode->getCpuBase(), allocationOffset), 0, hostAllocationWriteSize); + for (uint32_t i = 0; i < numHostPartitionsToWait; i++) { + uint64_t *ptr = reinterpret_cast(ptrOffset(hostCounterNode->getCpuBase(), allocationOffset + (i * immWritePostSyncWriteOffset))); + *ptr = initialValue; + } if (isTbx) { + const size_t hostAllocationWriteSize = alignUp(sizeof(uint64_t), immWritePostSyncWriteOffset) * numHostPartitionsToWait; uploadToTbx(*hostCounterNode, hostAllocationWriteSize); } } @@ -125,6 +135,11 @@ void InOrderExecInfo::reset() { initializeAllocationsFromHost(); } +void InOrderExecInfo::resetCounterValue() { + counterValue = getInitialCounterValue(); + lastWaitedCounterValue[allocationOffset != 0].store(getInitialCounterValue()); +} + NEO::GraphicsAllocation *InOrderExecInfo::getDeviceCounterAllocation() const { if (externalDeviceAllocation) { return externalDeviceAllocation; @@ -143,19 +158,20 @@ uint64_t InOrderExecInfo::getBaseHostGpuAddress() const { return hostCounterNode->getGpuAddress(); } -void InOrderExecInfo::pushTempTimestampNode(TagNodeBase *node, uint64_t value) { +void InOrderExecInfo::pushTempTimestampNode(TagNodeBase *node, uint64_t value, uint32_t allocationOffset) { std::unique_lock lock(mutex); - tempTimestampNodes.emplace_back(node, value); + tempTimestampNodes.emplace_back(node, std::make_pair(value, allocationOffset)); } void InOrderExecInfo::releaseNotUsedTempTimestampNodes(bool forceReturn) { std::unique_lock lock(mutex); - std::vector> tempVector; + std::vector> tempVector; for (auto &node : tempTimestampNodes) { - if (forceReturn || lastWaitedCounterValue >= node.second) { + const auto &counterAndOffsetPair = node.second; + if (forceReturn || isCounterAlreadyDone(counterAndOffsetPair.first, counterAndOffsetPair.second)) { node.first->returnTag(); } else { tempVector.push_back(node); @@ -179,4 +195,8 @@ uint64_t InOrderExecInfo::getDeviceNodeGpuAddress() const { return 0; } +uint64_t InOrderExecInfo::getInitialCounterValue() const { + return debugManager.flags.InitialCounterBasedEventValue.getIfNotDefault(0); +} + } // namespace NEO diff --git a/shared/source/helpers/in_order_cmd_helpers.h b/shared/source/helpers/in_order_cmd_helpers.h index 02fbb1fea1..51f1d44672 100644 --- a/shared/source/helpers/in_order_cmd_helpers.h +++ b/shared/source/helpers/in_order_cmd_helpers.h @@ -8,6 +8,7 @@ #pragma once #include "shared/source/helpers/common_types.h" +#include "shared/source/helpers/mt_helpers.h" #include "shared/source/helpers/non_copyable_or_moveable.h" #include "shared/source/helpers/ptr_math.h" #include "shared/source/memory_manager/allocation_type.h" @@ -82,7 +83,7 @@ class InOrderExecInfo : public NEO::NonCopyableClass { uint64_t getCounterValue() const { return counterValue; } void addCounterValue(uint64_t addValue) { counterValue += addValue; } - void resetCounterValue() { counterValue = 0; } + void resetCounterValue(); uint64_t getRegularCmdListSubmissionCounter() const { return regularCmdListSubmissionCounter; } void addRegularCmdListSubmissionCounter(uint64_t addValue) { regularCmdListSubmissionCounter += addValue; } @@ -100,23 +101,27 @@ class InOrderExecInfo : public NEO::NonCopyableClass { void reset(); bool isExternalMemoryExecInfo() const { return deviceCounterNode == nullptr; } - void setLastWaitedCounterValue(uint64_t value) { + void setLastWaitedCounterValue(uint64_t value, uint32_t allocationOffset) { if (!isExternalMemoryExecInfo()) { - lastWaitedCounterValue = std::max(value, lastWaitedCounterValue); + NEO::MultiThreadHelpers::interlockedMax(lastWaitedCounterValue[allocationOffset != 0], value); } } - bool isCounterAlreadyDone(uint64_t waitValue) const { - return lastWaitedCounterValue >= waitValue && this->allocationOffset == 0u; + bool isCounterAlreadyDone(uint64_t waitValue, uint32_t allocationOffset) const { + return lastWaitedCounterValue[allocationOffset != 0] >= waitValue; } NEO::GraphicsAllocation *getExternalHostAllocation() const { return externalHostAllocation; } NEO::GraphicsAllocation *getExternalDeviceAllocation() const { return externalDeviceAllocation; } - void pushTempTimestampNode(TagNodeBase *node, uint64_t value); + void pushTempTimestampNode(TagNodeBase *node, uint64_t value, uint32_t allocationOffset); void releaseNotUsedTempTimestampNodes(bool forceReturn); + uint64_t getInitialCounterValue() const; + protected: + using CounterAndOffsetPairT = std::pair; + void uploadToTbx(TagNodeBase &node, size_t size); NEO::Device &device; @@ -124,12 +129,12 @@ class InOrderExecInfo : public NEO::NonCopyableClass { NEO::TagNodeBase *hostCounterNode = nullptr; NEO::GraphicsAllocation *externalHostAllocation = nullptr; NEO::GraphicsAllocation *externalDeviceAllocation = nullptr; - std::vector> tempTimestampNodes; + std::vector> tempTimestampNodes; std::mutex mutex; + std::atomic lastWaitedCounterValue[2] = {0, 0}; // [0] for offset == 0, [1] for offset != 0 uint64_t counterValue = 0; - uint64_t lastWaitedCounterValue = 0; uint64_t regularCmdListSubmissionCounter = 0; uint64_t deviceAddress = 0; uint64_t *hostAddress = nullptr; @@ -137,6 +142,7 @@ class InOrderExecInfo : public NEO::NonCopyableClass { uint32_t numHostPartitionsToWait = 0; uint32_t allocationOffset = 0; uint32_t rootDeviceIndex = 0; + uint32_t immWritePostSyncWriteOffset = 0; bool regularCmdList = false; bool duplicatedHostStorage = false; bool atomicDeviceSignalling = false; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index d997b74f27..c19b8e8957 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -673,4 +673,5 @@ LimitIsaPrefetchSize = -1 EnableUsmAllocationPoolManager = -1 ForceTotalWMTPDataSize = -1 DirectSubmissionInitialSemaphoreValue = -1 +InitialCounterBasedEventValue = -1 # Please don't edit below this line diff --git a/shared/test/unit_test/command_container/command_encoder_tests.cpp b/shared/test/unit_test/command_container/command_encoder_tests.cpp index 84700aa120..f1625bf160 100644 --- a/shared/test/unit_test/command_container/command_encoder_tests.cpp +++ b/shared/test/unit_test/command_container/command_encoder_tests.cpp @@ -120,10 +120,10 @@ HWTEST_F(CommandEncoderTests, givenTsNodesWhenStoringOnTempListThenHandleOwnersh { MyMockInOrderExecInfo inOrderExecInfo(nullptr, nullptr, mockDevice, 1, false, false); - inOrderExecInfo.lastWaitedCounterValue = 0; + inOrderExecInfo.lastWaitedCounterValue[0] = 0; - inOrderExecInfo.pushTempTimestampNode(node0, 1); - inOrderExecInfo.pushTempTimestampNode(node1, 2); + inOrderExecInfo.pushTempTimestampNode(node0, 1, 0); + inOrderExecInfo.pushTempTimestampNode(node1, 2, 0); EXPECT_EQ(2u, inOrderExecInfo.tempTimestampNodes.size()); @@ -133,7 +133,7 @@ HWTEST_F(CommandEncoderTests, givenTsNodesWhenStoringOnTempListThenHandleOwnersh EXPECT_FALSE(tsAllocator.freeTags.peekContains(*node0)); EXPECT_FALSE(tsAllocator.freeTags.peekContains(*node1)); - inOrderExecInfo.lastWaitedCounterValue = 1; + inOrderExecInfo.lastWaitedCounterValue[0] = 1; inOrderExecInfo.releaseNotUsedTempTimestampNodes(false); EXPECT_EQ(1u, inOrderExecInfo.tempTimestampNodes.size()); EXPECT_EQ(node1, inOrderExecInfo.tempTimestampNodes[0].first); @@ -141,7 +141,7 @@ HWTEST_F(CommandEncoderTests, givenTsNodesWhenStoringOnTempListThenHandleOwnersh EXPECT_TRUE(tsAllocator.freeTags.peekContains(*node0)); EXPECT_FALSE(tsAllocator.freeTags.peekContains(*node1)); - inOrderExecInfo.lastWaitedCounterValue = 2; + inOrderExecInfo.lastWaitedCounterValue[0] = 2; inOrderExecInfo.releaseNotUsedTempTimestampNodes(false); EXPECT_EQ(0u, inOrderExecInfo.tempTimestampNodes.size()); EXPECT_TRUE(tsAllocator.freeTags.peekContains(*node0)); @@ -153,8 +153,8 @@ HWTEST_F(CommandEncoderTests, givenTsNodesWhenStoringOnTempListThenHandleOwnersh EXPECT_FALSE(tsAllocator.freeTags.peekContains(*node0)); EXPECT_FALSE(tsAllocator.freeTags.peekContains(*node1)); - inOrderExecInfo.pushTempTimestampNode(node0, 3); - inOrderExecInfo.pushTempTimestampNode(node1, 4); + inOrderExecInfo.pushTempTimestampNode(node0, 3, 0); + inOrderExecInfo.pushTempTimestampNode(node1, 4, 0); } // forced release on destruction @@ -162,6 +162,64 @@ HWTEST_F(CommandEncoderTests, givenTsNodesWhenStoringOnTempListThenHandleOwnersh EXPECT_TRUE(tsAllocator.freeTags.peekContains(*node1)); } +HWTEST_F(CommandEncoderTests, givenDebugFlagSetWhenHandlingTheCounterThenUseInitialValue) { + DebugManagerStateRestore restore; + + constexpr uint64_t initialValue = 16; + + debugManager.flags.InOrderDuplicatedCounterStorageEnabled.set(1); + debugManager.flags.InOrderAtomicSignallingEnabled.set(0); + debugManager.flags.InitialCounterBasedEventValue.set(static_cast(initialValue)); + + constexpr uint32_t partitionCount = 2u; + MockDevice mockDevice; + + MockTagAllocator> deviceTagAllocator(0, mockDevice.getMemoryManager()); + MockTagAllocator> hostTagAllocator(0, mockDevice.getMemoryManager()); + + const auto immWritePartitionOffset = ImplicitScalingDispatch::getImmediateWritePostSyncOffset(); + + // initialize + auto deviceNode = deviceTagAllocator.getTag(); + auto hostNode = hostTagAllocator.getTag(); + + auto devicePtrBase = reinterpret_cast(deviceNode->getCpuBase()); + auto hostPtrBase = reinterpret_cast(hostNode->getCpuBase()); + + auto inOrderExecInfo = InOrderExecInfo::create(deviceNode, hostNode, mockDevice, partitionCount, false); + + for (uint32_t i = 0; i < partitionCount; i++) { + auto devicePtr = ptrOffset(devicePtrBase, i * immWritePartitionOffset); + EXPECT_EQ(initialValue, *devicePtr); + + auto hostPtr = ptrOffset(hostPtrBase, i * immWritePartitionOffset); + EXPECT_EQ(initialValue, *hostPtr); + } + + // update + for (uint32_t i = 0; i < partitionCount; i++) { + auto devicePtr = ptrOffset(devicePtrBase, i * immWritePartitionOffset); + *devicePtr = initialValue + 10; + + auto hostPtr = ptrOffset(hostPtrBase, i * immWritePartitionOffset); + *hostPtr = initialValue + 20; + } + + inOrderExecInfo->setLastWaitedCounterValue(initialValue + 5, 0); + + // reset + inOrderExecInfo->reset(); + + for (uint32_t i = 0; i < partitionCount; i++) { + auto devicePtr = ptrOffset(devicePtrBase, i * immWritePartitionOffset); + EXPECT_EQ(initialValue, *devicePtr); + + auto hostPtr = ptrOffset(hostPtrBase, i * immWritePartitionOffset); + EXPECT_EQ(initialValue, *hostPtr); + } + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(initialValue + 1, 0)); +} + HWTEST_F(CommandEncoderTests, givenDifferentInputParamsWhenCreatingInOrderExecInfoThenSetupCorrectly) { MockDevice mockDevice; @@ -296,29 +354,29 @@ HWTEST_F(CommandEncoderTests, givenInOrderExecutionInfoWhenSetLastCounterValueIs auto node = tagAllocator.getTag(); auto inOrderExecInfo = std::make_unique(node, nullptr, mockDevice, 2, true, false); - inOrderExecInfo->setLastWaitedCounterValue(1u); + inOrderExecInfo->setLastWaitedCounterValue(1u, 0); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(2u)); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(1u)); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(0u)); + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(2u, 0)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(1u, 0)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(0u, 0)); - inOrderExecInfo->setLastWaitedCounterValue(0u); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(2u)); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(1u)); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(0u)); + inOrderExecInfo->setLastWaitedCounterValue(0u, 0); + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(2u, 0)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(1u, 0)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(0u, 0)); - inOrderExecInfo->setLastWaitedCounterValue(3u); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(2u)); - EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(3u)); + inOrderExecInfo->setLastWaitedCounterValue(3u, 0); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(2u, 0)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(3u, 0)); inOrderExecInfo->setAllocationOffset(4u); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(2u)); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(3u)); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(0u)); + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(2u, 4)); + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(3u, 4)); + EXPECT_TRUE(inOrderExecInfo->isCounterAlreadyDone(0u, 4)); inOrderExecInfo = std::make_unique(nullptr, nullptr, mockDevice, 2, true, false); - inOrderExecInfo->setLastWaitedCounterValue(2); - EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(1)); + inOrderExecInfo->setLastWaitedCounterValue(2, 0); + EXPECT_FALSE(inOrderExecInfo->isCounterAlreadyDone(1, 0)); } HWTEST_F(CommandEncoderTests, givenInOrderExecutionInfoWhenResetCalledThenUploadToTbx) {