diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 0f5d546c3d..40dd40bb4c 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -340,7 +340,7 @@ struct CommandListCoreFamily : public CommandListImp { void addCmdForPatching(std::shared_ptr *externalInOrderExecInfo, void *cmd1, void *cmd2, uint64_t counterValue, InOrderPatchCommandHelpers::PatchCmdType patchCmdType); - bool inOrderAtomicSignallingEnabled() const; + bool inOrderAtomicSignallingEnabled() const override; uint64_t getInOrderIncrementValue() const; InOrderPatchCommandsContainer inOrderPatchCmds; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 9acf49d550..671e107e3f 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2420,7 +2420,7 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(std::sh uint64_t gpuAddress = dependencyCounterAllocation.getGpuAddress() + offset; - for (uint32_t i = 0; i < this->partitionCount; i++) { + for (uint32_t i = 0; i < inOrderExecInfo->getNumDevicePartitionsToWait(); i++) { if (relaxedOrderingAllowed) { NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::Less, true, isQwordInOrderCounter()); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 4603baf4df..9b49884019 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -1284,9 +1284,9 @@ ze_result_t CommandListCoreFamilyImmediate::synchronizeInOrderExe bool signaled = true; - auto hostAddress = static_cast(ptrOffset(inOrderExecInfo->getDeviceCounterAllocation().getUnderlyingBuffer(), this->inOrderAllocationOffset)); + const uint64_t *hostAddress = ptrOffset(inOrderExecInfo->getHostAddress(), this->inOrderAllocationOffset); - for (uint32_t i = 0; i < this->partitionCount; i++) { + for (uint32_t i = 0; i < inOrderExecInfo->getNumHostPartitionsToWait(); i++) { if (!NEO::WaitUtils::waitFunctionWithPredicate(hostAddress, waitValue, std::greater_equal())) { signaled = false; break; diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.cpp b/level_zero/core/source/cmdlist/cmdlist_imp.cpp index 07373e386e..85dbd44123 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.cpp +++ b/level_zero/core/source/cmdlist/cmdlist_imp.cpp @@ -234,9 +234,7 @@ void CommandListImp::enableInOrderExecution() { UNRECOVERABLE_IF(!inOrderDependencyCounterAllocation); - memset(inOrderDependencyCounterAllocation->getUnderlyingBuffer(), 0, inOrderDependencyCounterAllocation->getUnderlyingBufferSize()); - - inOrderExecInfo = std::make_shared(*inOrderDependencyCounterAllocation, *device->getMemoryManager(), (this->cmdListType == TYPE_REGULAR)); + inOrderExecInfo = std::make_shared(*inOrderDependencyCounterAllocation, *device->getMemoryManager(), this->partitionCount, (this->cmdListType == TYPE_REGULAR), inOrderAtomicSignallingEnabled()); } void CommandListImp::storeReferenceTsToMappedEvents(bool isClearEnabled) { diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.h b/level_zero/core/source/cmdlist/cmdlist_imp.h index 1a629e6588..800e8b7412 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.h +++ b/level_zero/core/source/cmdlist/cmdlist_imp.h @@ -44,6 +44,8 @@ struct CommandListImp : public CommandList { ~CommandListImp() override = default; + virtual bool inOrderAtomicSignallingEnabled() const = 0; + static constexpr int32_t cmdListDefaultEngineInstancedDevice = NEO::StreamProperty::initValue; static constexpr bool cmdListDefaultCoherency = false; static constexpr bool cmdListDefaultDisableOverdispatch = true; diff --git a/level_zero/core/source/event/event_impl.inl b/level_zero/core/source/event/event_impl.inl index 0bfa9fd00f..77028005e6 100644 --- a/level_zero/core/source/event/event_impl.inl +++ b/level_zero/core/source/event/event_impl.inl @@ -163,11 +163,11 @@ ze_result_t EventImp::queryCounterBasedEventStatus() { return ZE_RESULT_NOT_READY; } - auto hostAddress = static_cast(ptrOffset(inOrderExecInfo->getDeviceCounterAllocation().getUnderlyingBuffer(), this->inOrderAllocationOffset)); + const uint64_t *hostAddress = ptrOffset(inOrderExecInfo->getHostAddress(), this->inOrderAllocationOffset); auto waitValue = getInOrderExecSignalValueWithSubmissionCounter(); bool signaled = true; - for (uint32_t i = 0; i < this->getPacketsInUse(); i++) { + for (uint32_t i = 0; i < inOrderExecInfo->getNumHostPartitionsToWait(); i++) { if (!NEO::WaitUtils::waitFunctionWithPredicate(hostAddress, waitValue, std::greater_equal())) { signaled = false; break; @@ -438,7 +438,7 @@ ze_result_t EventImp::waitForUserFence(uint64_t timeout) { return ZE_RESULT_NOT_READY; } - uint64_t waitAddress = castToUint64(ptrOffset(inOrderExecInfo->getDeviceCounterAllocation().getUnderlyingBuffer(), this->inOrderAllocationOffset)); + uint64_t waitAddress = castToUint64(ptrOffset(inOrderExecInfo->getHostAddress(), this->inOrderAllocationOffset)); if (!csrs[0]->waitUserFence(getInOrderExecSignalValueWithSubmissionCounter(), waitAddress, timeout)) { return ZE_RESULT_NOT_READY; diff --git a/level_zero/core/source/helpers/in_order_cmd_helpers.cpp b/level_zero/core/source/helpers/in_order_cmd_helpers.cpp index 926f777247..aad991f3d4 100644 --- a/level_zero/core/source/helpers/in_order_cmd_helpers.cpp +++ b/level_zero/core/source/helpers/in_order_cmd_helpers.cpp @@ -19,8 +19,15 @@ InOrderExecInfo::~InOrderExecInfo() { memoryManager.freeGraphicsMemory(&deviceCounterAllocation); } -InOrderExecInfo::InOrderExecInfo(NEO::GraphicsAllocation &deviceCounterAllocation, NEO::MemoryManager &memoryManager, bool regularCmdList) +InOrderExecInfo::InOrderExecInfo(NEO::GraphicsAllocation &deviceCounterAllocation, NEO::MemoryManager &memoryManager, uint32_t partitionCount, bool regularCmdList, bool atomicDeviceSignalling) : deviceCounterAllocation(deviceCounterAllocation), memoryManager(memoryManager), regularCmdList(regularCmdList) { + + numDevicePartitionsToWait = atomicDeviceSignalling ? 1 : partitionCount; + numHostPartitionsToWait = partitionCount; + + hostAddress = reinterpret_cast(deviceCounterAllocation.getUnderlyingBuffer()); + + reset(); } void InOrderExecInfo::reset() { diff --git a/level_zero/core/source/helpers/in_order_cmd_helpers.h b/level_zero/core/source/helpers/in_order_cmd_helpers.h index 4c3afe3f4c..7a6f87f502 100644 --- a/level_zero/core/source/helpers/in_order_cmd_helpers.h +++ b/level_zero/core/source/helpers/in_order_cmd_helpers.h @@ -27,9 +27,10 @@ class InOrderExecInfo : public NEO::NonCopyableClass { InOrderExecInfo() = delete; - InOrderExecInfo(NEO::GraphicsAllocation &deviceCounterAllocation, NEO::MemoryManager &memoryManager, bool regularCmdList); + InOrderExecInfo(NEO::GraphicsAllocation &deviceCounterAllocation, NEO::MemoryManager &memoryManager, uint32_t partitionCount, bool regularCmdList, bool atomicDeviceSignalling); NEO::GraphicsAllocation &getDeviceCounterAllocation() const { return deviceCounterAllocation; } + uint64_t *getHostAddress() const { return hostAddress; } uint64_t getCounterValue() const { return counterValue; } void addCounterValue(uint64_t addValue) { counterValue += addValue; } @@ -40,6 +41,9 @@ class InOrderExecInfo : public NEO::NonCopyableClass { bool isRegularCmdList() const { return regularCmdList; } + uint32_t getNumDevicePartitionsToWait() const { return numDevicePartitionsToWait; } + uint32_t getNumHostPartitionsToWait() const { return numHostPartitionsToWait; } + void reset(); protected: @@ -47,6 +51,9 @@ class InOrderExecInfo : public NEO::NonCopyableClass { NEO::MemoryManager &memoryManager; uint64_t counterValue = 0; uint64_t regularCmdListSubmissionCounter = 0; + uint64_t *hostAddress = nullptr; + uint32_t numDevicePartitionsToWait = 0; + uint32_t numHostPartitionsToWait = 0; bool regularCmdList = false; }; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 32f1f69d21..dbe92a9ac9 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -293,6 +293,7 @@ struct MockCommandList : public CommandList { ADDMETHOD_NOBASE(close, ze_result_t, ZE_RESULT_SUCCESS, ()); ADDMETHOD_NOBASE(destroy, ze_result_t, ZE_RESULT_SUCCESS, ()); ADDMETHOD_NOBASE_VOIDRETURN(patchInOrderCmds, (void)); + ADDMETHOD_CONST_NOBASE(inOrderAtomicSignallingEnabled, bool, false, (void)); ADDMETHOD_NOBASE(appendLaunchKernel, ze_result_t, ZE_RESULT_SUCCESS, (ze_kernel_handle_t kernelHandle, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp index 695abf2350..1f3bed5be5 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_3.cpp @@ -4164,6 +4164,47 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenAtomicSignallingEnabledWhenSignalli EXPECT_EQ(0u, atomicCmd->getCsStall()); } +HWTEST2_F(MultiTileInOrderCmdListTests, givenAtomicSignallingEnabledWhenWaitingForDependencyThenUseOnlyOneSemaphore, IsAtLeastXeHpCore) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + debugManager.flags.InOrderAtomicSignallingEnabled.set(1); + + auto immCmdList1 = createMultiTileImmCmdList(); + auto immCmdList2 = createMultiTileImmCmdList(); + + auto eventPool = createEvents(1, false); + + auto handle = events[0]->toHandle(); + + immCmdList1->appendLaunchKernel(kernel->toHandle(), groupCount, handle, 0, nullptr, launchParams, false); + + EXPECT_EQ(partitionCount, immCmdList1->inOrderExecInfo->getCounterValue()); + + auto cmdStream = immCmdList2->getCmdContainer().getCommandStream(); + + immCmdList2->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false); + + size_t offset = cmdStream->getUsed(); + + immCmdList2->appendLaunchKernel(kernel->toHandle(), groupCount, nullptr, 1, &handle, launchParams, false); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, ptrOffset(cmdStream->getCpuBase(), offset), (cmdStream->getUsed() - offset))); + + auto semaphores = findAll(cmdList.begin(), cmdList.end()); + ASSERT_EQ(2u + (ImplicitScalingDispatch::getPipeControlStallRequired() ? 1 : 0), semaphores.size()); + + auto itor = cmdList.begin(); + + // implicit dependency + auto gpuAddress = immCmdList2->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress(); + + ASSERT_TRUE(verifyInOrderDependency(itor, partitionCount, gpuAddress, immCmdList2->isQwordInOrderCounter())); + + // event + ASSERT_TRUE(verifyInOrderDependency(itor, partitionCount, events[0]->inOrderExecInfo->getDeviceCounterAllocation().getGpuAddress(), immCmdList2->isQwordInOrderCounter())); +} + HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgrammingWaitOnEventsThenHandleAllEventPackets, IsAtLeastXeHpCore) { using MI_STORE_DATA_IMM = typename FamilyType::MI_STORE_DATA_IMM; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; diff --git a/level_zero/core/test/unit_tests/sources/event/test_event.cpp b/level_zero/core/test/unit_tests/sources/event/test_event.cpp index e8bd697be1..19a1888fda 100644 --- a/level_zero/core/test/unit_tests/sources/event/test_event.cpp +++ b/level_zero/core/test/unit_tests/sources/event/test_event.cpp @@ -3274,7 +3274,8 @@ HWTEST_F(EventTests, givenInOrderEventWhenHostSynchronizeIsCalledThenAllocationI auto syncAllocation = new NEO::MockGraphicsAllocation(&storage, sizeof(storage)); - auto inOrderExecInfo = std::make_shared(*syncAllocation, *neoDevice->getMemoryManager(), false); + auto inOrderExecInfo = std::make_shared(*syncAllocation, *neoDevice->getMemoryManager(), 1, false, false); + *inOrderExecInfo->getHostAddress() = 1; event->enableCounterBasedMode(true); event->updateInOrderExecState(inOrderExecInfo, 1, 0);