diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 203655f698..87e65206a5 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -233,6 +233,7 @@ struct CommandListCoreFamily : public CommandListImp { void patchInOrderCmds() override; MOCKABLE_VIRTUAL bool handleCounterBasedEventOperations(Event *signalEvent, bool skipAddingEventToResidency); bool isCbEventBoundToCmdList(Event *event) const; + bool kernelMemoryPrefetchEnabled() const override; protected: MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc, @@ -420,7 +421,6 @@ struct CommandListCoreFamily : public CommandListImp { virtual uint64_t getPrefetchCmdId() const { return std::numeric_limits::max(); } virtual uint32_t getIohSizeForPrefetch(const Kernel &kernel, uint32_t reserveExtraSpace) const; virtual void ensureCmdBufferSpaceForPrefetch() {} - bool kernelMemoryPrefetchEnabled() const { return NEO::debugManager.flags.EnableMemoryPrefetch.get() == 1; } NEO::InOrderPatchCommandsContainer inOrderPatchCmds; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 23b83e0a41..dbd49d70c0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1575,6 +1575,9 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyBlit(uintptr_t return ZE_RESULT_SUCCESS; } +template +bool CommandListCoreFamily::kernelMemoryPrefetchEnabled() const { return NEO::debugManager.flags.EnableMemoryPrefetch.get() == 1; } + template ze_result_t CommandListCoreFamily::appendMemoryCopyBlitRegion(AlignedAllocationData *srcAllocationData, AlignedAllocationData *dstAllocationData, diff --git a/level_zero/core/source/cmdlist/cmdlist_imp.h b/level_zero/core/source/cmdlist/cmdlist_imp.h index 46328415ae..d4c383fe32 100644 --- a/level_zero/core/source/cmdlist/cmdlist_imp.h +++ b/level_zero/core/source/cmdlist/cmdlist_imp.h @@ -48,6 +48,7 @@ struct CommandListImp : public CommandList { NEO::SynchronizedDispatchMode getSynchronizedDispatchMode() const { return synchronizedDispatchMode; } void enableCopyOperationOffload(); void setInterruptEventsCsr(NEO::CommandStreamReceiver &csr); + virtual bool kernelMemoryPrefetchEnabled() const = 0; protected: std::shared_ptr inOrderExecInfo; diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index a2ae0353e9..6d5dd9ba9e 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -355,6 +355,8 @@ struct MockCommandList : public CommandList { ADDMETHOD_NOBASE(destroy, ze_result_t, ZE_RESULT_SUCCESS, ()); ADDMETHOD_NOBASE_VOIDRETURN(patchInOrderCmds, (void)); + ADDMETHOD_CONST_NOBASE(kernelMemoryPrefetchEnabled, bool, false, (void)); + ADDMETHOD_NOBASE(appendLaunchKernel, ze_result_t, ZE_RESULT_SUCCESS, (ze_kernel_handle_t kernelHandle, const ze_group_count_t &threadGroupDimensions, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index 64faecb9fa..86ce28e4d4 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -2143,7 +2143,10 @@ HWTEST2_F(CommandListCreateTests, givenInOrderExecutionWhenDispatchingRelaxedOrd cmdStream->getUsed() - offset)); // init registers - auto lrrCmd = genCmdCast(*genCmdList.begin()); + auto iter = genCmdList.begin(); + UnitTestHelper::skipStatePrefetch(iter); + + auto lrrCmd = genCmdCast(*iter); ASSERT_NE(nullptr, lrrCmd); lrrCmd++; lrrCmd++; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp index 70a567b66b..54c9de5ecf 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_5.cpp @@ -3446,7 +3446,14 @@ HWTEST2_F(CommandListStateBaseAddressPrivateHeapTest, if (ssh) { EXPECT_EQ(oldGfxHeapAllocation, ssh->getGraphicsAllocation()); } - EXPECT_EQ(usedBefore, cmdListStream.getUsed()); + + size_t prefetchSize = 0; + if (commandList->kernelMemoryPrefetchEnabled()) { + prefetchSize = NEO::EncodeMemoryPrefetch::getSizeForMemoryPrefetch(kernel->getIndirectSize(), device->getNEODevice()->getRootDeviceEnvironment()) + + NEO::EncodeMemoryPrefetch::getSizeForMemoryPrefetch(kernel->getImmutableData()->getIsaSize(), device->getNEODevice()->getRootDeviceEnvironment()); + } + + EXPECT_EQ(usedBefore + prefetchSize, cmdListStream.getUsed()); } } // namespace ult diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp index 1ee6b2fad3..d29040766d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_7.cpp @@ -882,6 +882,7 @@ HWTEST2_F(FrontEndMultiReturnCommandListTest, givenFrontEndTrackingIsUsedWhenPro } HWTEST2_F(FrontEndMultiReturnCommandListTest, givenFrontEndTrackingIsUsedWhenPropertyComputeDispatchAllWalkerSupportedThenExpectReturnPointsAndBbEndProgramming, FrontEndMultiReturnMatcher) { + debugManager.flags.EnableMemoryPrefetch.set(0); using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END; NEO::FrontEndPropertiesSupport fePropertiesSupport = {}; auto &productHelper = device->getProductHelper(); @@ -1296,6 +1297,7 @@ HWTEST2_F(FrontEndMultiReturnCommandListTest, HWTEST2_F(FrontEndMultiReturnCommandListTest, givenFrontEndTrackingCmdListIsExecutedWhenPropertyComputeDispatchAllWalkerSupportedThenExpectFrontEndProgrammingInCmdQueue, FrontEndMultiReturnMatcher) { + debugManager.flags.EnableMemoryPrefetch.set(0); using FrontEndStateCommand = typename FamilyType::FrontEndStateCommand; using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp index cc3bf96737..eb9db2ba60 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_xehp_and_later.cpp @@ -2741,12 +2741,14 @@ HWTEST2_F(CommandListAppendLaunchKernel, ASSERT_NE(0u, storeRegMemList.size()); ASSERT_NE(0u, outStoreRegMemCmdList.size()); - ASSERT_EQ(storeRegMemList.size(), outStoreRegMemCmdList.size()); + size_t additionalPatchCmdsSize = commandList->kernelMemoryPrefetchEnabled() ? 1 : 0; + + ASSERT_EQ(storeRegMemList.size(), outStoreRegMemCmdList.size() - additionalPatchCmdsSize); for (size_t i = 0; i < storeRegMemList.size(); i++) { MI_STORE_REGISTER_MEM *storeRegMem = genCmdCast(*storeRegMemList[i]); - auto &cmdToPatch = outStoreRegMemCmdList[i]; + auto &cmdToPatch = outStoreRegMemCmdList[i + additionalPatchCmdsSize]; EXPECT_EQ(CommandToPatch::TimestampEventPostSyncStoreRegMem, cmdToPatch.type); MI_STORE_REGISTER_MEM *outStoreRegMem = genCmdCast(cmdToPatch.pDestination); ASSERT_NE(nullptr, outStoreRegMem); @@ -2804,7 +2806,9 @@ HWTEST2_F(CommandListAppendLaunchKernel, ptrOffset(cmdStream->getCpuBase(), commandStreamOffset), cmdStream->getUsed() - commandStreamOffset)); - ASSERT_EQ(0u, outCbEventCmds.size()); + size_t additionalPatchCmdsSize = commandList->kernelMemoryPrefetchEnabled() ? 1 : 0; + + ASSERT_EQ(additionalPatchCmdsSize, outCbEventCmds.size()); auto eventBaseAddress = event->getGpuAddress(device); WalkerVariant walkerVariant = NEO::UnitTestHelper::getWalkerVariant(launchParams.outWalker); @@ -2878,8 +2882,9 @@ HWTEST2_F(CommandListAppendLaunchKernel, auto inOrderAllocation = event->getInOrderExecInfo()->getDeviceCounterAllocation(); size_t expectedLoadRegImmCount = FamilyType::isQwordInOrderCounter ? 2 : 0; + size_t additionalPatchCmdsSize = commandList->kernelMemoryPrefetchEnabled() ? 1 : 0; - size_t expectedWaitCmds = 1 + expectedLoadRegImmCount; + size_t expectedWaitCmds = 1 + expectedLoadRegImmCount + additionalPatchCmdsSize; ASSERT_EQ(expectedWaitCmds, outCbWaitEventCmds.size()); auto loadRegImmList = findAll(cmdList.begin(), cmdList.end()); @@ -2889,27 +2894,31 @@ HWTEST2_F(CommandListAppendLaunchKernel, size_t outCbWaitEventCmdsIndex = 0; for (; outCbWaitEventCmdsIndex < expectedLoadRegImmCount; outCbWaitEventCmdsIndex++) { - EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type); - ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); - ASSERT_EQ(*loadRegImmList[outCbWaitEventCmdsIndex], outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); - auto loadRegImmCmd = genCmdCast(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); + auto &cmd = outCbWaitEventCmds[outCbWaitEventCmdsIndex + additionalPatchCmdsSize]; + + EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, cmd.type); + ASSERT_NE(nullptr, cmd.pDestination); + ASSERT_EQ(*loadRegImmList[outCbWaitEventCmdsIndex], cmd.pDestination); + auto loadRegImmCmd = genCmdCast(cmd.pDestination); ASSERT_NE(nullptr, loadRegImmCmd); - EXPECT_EQ(0u, outCbWaitEventCmds[outCbWaitEventCmdsIndex].inOrderPatchListIndex); + EXPECT_EQ(0u, cmd.inOrderPatchListIndex); auto registerNumber = 0x2600 + (4 * outCbWaitEventCmdsIndex); - EXPECT_EQ(registerNumber, outCbWaitEventCmds[outCbWaitEventCmdsIndex].offset); + EXPECT_EQ(registerNumber, cmd.offset); } - EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type); - ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); - ASSERT_EQ(*semaphoreWaitList[0], outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); - auto semaphoreWaitCmd = genCmdCast(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); + auto &cmd = outCbWaitEventCmds[outCbWaitEventCmdsIndex + additionalPatchCmdsSize]; + + EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, cmd.type); + ASSERT_NE(nullptr, cmd.pDestination); + ASSERT_EQ(*semaphoreWaitList[0], cmd.pDestination); + auto semaphoreWaitCmd = genCmdCast(cmd.pDestination); ASSERT_NE(nullptr, semaphoreWaitCmd); - EXPECT_EQ(eventCompletionAddress + outCbWaitEventCmds[outCbWaitEventCmdsIndex].offset, semaphoreWaitCmd->getSemaphoreGraphicsAddress()); + EXPECT_EQ(eventCompletionAddress + cmd.offset, semaphoreWaitCmd->getSemaphoreGraphicsAddress()); if (FamilyType::isQwordInOrderCounter) { - EXPECT_EQ(std::numeric_limits::max(), outCbWaitEventCmds[outCbWaitEventCmdsIndex].inOrderPatchListIndex); + EXPECT_EQ(std::numeric_limits::max(), cmd.inOrderPatchListIndex); } else { - EXPECT_EQ(0u, outCbWaitEventCmds[outCbWaitEventCmdsIndex].inOrderPatchListIndex); + EXPECT_EQ(0u, cmd.inOrderPatchListIndex); } auto &residencyContainer = commandContainer.getResidencyContainer(); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp index fe1c7f5492..f0a0ecbc77 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_1.cpp @@ -6330,24 +6330,29 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenInOrderModeAndNoopWaitEve size_t expectedLoadRegImmCount = FamilyType::isQwordInOrderCounter ? 2 : 0; - size_t expectedWaitCmds = 1 + expectedLoadRegImmCount; + size_t additionalPatchCmdsSize = regularCmdList->kernelMemoryPrefetchEnabled() ? 1 : 0; + size_t expectedWaitCmds = 1 + expectedLoadRegImmCount + additionalPatchCmdsSize; ASSERT_EQ(expectedWaitCmds, outCbWaitEventCmds.size()); size_t outCbWaitEventCmdsIndex = 0; for (; outCbWaitEventCmdsIndex < expectedLoadRegImmCount; outCbWaitEventCmdsIndex++) { - EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type); - auto registerNumber = 0x2600 + (4 * outCbWaitEventCmdsIndex); - EXPECT_EQ(registerNumber, outCbWaitEventCmds[outCbWaitEventCmdsIndex].offset); + auto &cmd = outCbWaitEventCmds[outCbWaitEventCmdsIndex + additionalPatchCmdsSize]; - ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); - auto memCmpRet = memcmp(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination, noopedLriBuffer, sizeof(MI_LOAD_REGISTER_IMM)); + EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, cmd.type); + auto registerNumber = 0x2600 + (4 * outCbWaitEventCmdsIndex); + EXPECT_EQ(registerNumber, cmd.offset); + + ASSERT_NE(nullptr, cmd.pDestination); + auto memCmpRet = memcmp(cmd.pDestination, noopedLriBuffer, sizeof(MI_LOAD_REGISTER_IMM)); EXPECT_EQ(0, memCmpRet); } - EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type); + auto &cmd = outCbWaitEventCmds[outCbWaitEventCmdsIndex + additionalPatchCmdsSize]; - ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination); - auto memCmpRet = memcmp(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination, noopedSemWaitBuffer, sizeof(MI_SEMAPHORE_WAIT)); + EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, cmd.type); + + ASSERT_NE(nullptr, cmd.pDestination); + auto memCmpRet = memcmp(cmd.pDestination, noopedSemWaitBuffer, sizeof(MI_SEMAPHORE_WAIT)); EXPECT_EQ(0, memCmpRet); } diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp index 06f0110f7b..88947baa32 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist_2.cpp @@ -2908,6 +2908,9 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending } auto itor = cmdList.begin(); + + UnitTestHelper::skipStatePrefetch(itor); + if (hasDependencySemaphore) { auto nPartition = std::min(immCmdList->inOrderExecInfo->getNumDevicePartitionsToWait(), partitionCount); for (uint32_t i = 0; i < nPartition; i++) { @@ -3527,6 +3530,7 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenAtomicSignallingEnabledWhenWaitingF ASSERT_EQ(2u + (ImplicitScalingDispatch::getPipeControlStallRequired() ? 1 : 0), semaphores.size()); auto itor = cmdList.begin(); + UnitTestHelper::skipStatePrefetch(itor); // implicit dependency auto gpuAddress = immCmdList2->inOrderExecInfo->getBaseDeviceAddress(); @@ -3582,6 +3586,8 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgramming (cmdStream->getUsed() - offset))); auto itor = cmdList.begin(); + UnitTestHelper::skipStatePrefetch(itor); + if (immCmdList->isQwordInOrderCounter()) { std::advance(itor, 2); } diff --git a/shared/test/common/gen12lp/unit_test_helper_gen12lp.cpp b/shared/test/common/gen12lp/unit_test_helper_gen12lp.cpp index 2fb7ba8967..49c724857f 100644 --- a/shared/test/common/gen12lp/unit_test_helper_gen12lp.cpp +++ b/shared/test/common/gen12lp/unit_test_helper_gen12lp.cpp @@ -223,6 +223,9 @@ uint64_t UnitTestHelper::getWalkerActivePostSyncAddress(WalkerType *walk return 0; } +template <> +void UnitTestHelper::skipStatePrefetch(GenCmdList::iterator &iter) {} + template struct UnitTestHelper; template struct UnitTestHelperWithHeap; diff --git a/shared/test/common/helpers/unit_test_helper.h b/shared/test/common/helpers/unit_test_helper.h index 7d3006e36a..197142659f 100644 --- a/shared/test/common/helpers/unit_test_helper.h +++ b/shared/test/common/helpers/unit_test_helper.h @@ -135,6 +135,7 @@ struct UnitTestHelper : public UnitTestHelperBase { static size_t getWalkerSize(bool isHeaplessEnabled); template static uint64_t getWalkerActivePostSyncAddress(WalkerType *walkerCmd); + static void skipStatePrefetch(GenCmdList::iterator &iter); static bool isHeaplessAllowed(); }; diff --git a/shared/test/common/helpers/unit_test_helper_xe_hpc_and_later.inl b/shared/test/common/helpers/unit_test_helper_xe_hpc_and_later.inl index d5da992c07..0d0f2b3e12 100644 --- a/shared/test/common/helpers/unit_test_helper_xe_hpc_and_later.inl +++ b/shared/test/common/helpers/unit_test_helper_xe_hpc_and_later.inl @@ -35,4 +35,11 @@ bool UnitTestHelper::isAdditionalSynchronizationRequired() { return true; } +template +void UnitTestHelper::skipStatePrefetch(GenCmdList::iterator &iter) { + while (genCmdCast(*iter)) { + iter++; + } +} + } // namespace NEO diff --git a/shared/test/common/xe_hpg_core/unit_test_helper_xe_hpg_core.cpp b/shared/test/common/xe_hpg_core/unit_test_helper_xe_hpg_core.cpp index 746703e91f..e0fd09e4c9 100644 --- a/shared/test/common/xe_hpg_core/unit_test_helper_xe_hpg_core.cpp +++ b/shared/test/common/xe_hpg_core/unit_test_helper_xe_hpg_core.cpp @@ -49,6 +49,9 @@ bool UnitTestHelper::requiresTimestampPacketsInSystemMemory(HardwareInfo return true; } +template <> +void UnitTestHelper::skipStatePrefetch(GenCmdList::iterator &iter) {} + template struct UnitTestHelper; template struct UnitTestHelperWithHeap;