From 8d83f7603cb98ab43a812496eef4c455f785652a Mon Sep 17 00:00:00 2001 From: Mateusz Jablonski Date: Tue, 12 Mar 2024 11:12:51 +0000 Subject: [PATCH] performance: skip dummy blits prior to flush without postsync Related-To: NEO-9996 Signed-off-by: Mateusz Jablonski --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 1 + .../sources/cmdlist/test_cmdlist_1.cpp | 28 ++++++++++++++++++- .../command_container/command_encoder.inl | 1 + 3 files changed, 29 insertions(+), 1 deletion(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 30054ccad9..d9dbb60098 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -3809,6 +3809,7 @@ uint64_t CommandListCoreFamily::getInOrderIncrementValue() const template void CommandListCoreFamily::encodeMiFlush(uint64_t immediateDataGpuAddress, uint64_t immediateData, NEO::MiFlushArgs &args) { + args.waArgs.isWaRequired &= args.commandWithPostSync; auto isDummyBlitRequired = NEO::BlitCommandsHelper::isDummyBlitWaNeeded(args.waArgs); NEO::EncodeMiFlushDW::programWithWa(*commandContainer.getCommandStream(), immediateDataGpuAddress, immediateData, args); if (isDummyBlitRequired) { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index f4600bfaf5..af80880435 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -3024,7 +3024,7 @@ TEST(BuiltinTypeHelperTest, givenHeaplessWhenAdjustBuiltinTypeIsCalledThenCorrec EXPECT_EQ(Builtin::fillBufferMiddleStatelessHeapless, BuiltinTypeHelper::adjustBuiltinType(isStateless, isHeapless)); EXPECT_EQ(Builtin::fillBufferRightLeftoverStatelessHeapless, BuiltinTypeHelper::adjustBuiltinType(isStateless, isHeapless)); } -HWTEST2_F(CommandListCreate, givenDummyBlitRequiredWhenEncodeMiFlushThenDummyBlitIsProgrammedPriorToMiFlushAndDummyAllocationIsAddedToResidencyContainer, IsAtLeastXeHpCore) { +HWTEST2_F(CommandListCreate, givenDummyBlitRequiredWhenEncodeMiFlushWithPostSyncThenDummyBlitIsProgrammedPriorToMiFlushAndDummyAllocationIsAddedToResidencyContainer, IsAtLeastXeHpCore) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; DebugManagerStateRestore restorer; debugManager.flags.ForceDummyBlitWa.set(1); @@ -3034,6 +3034,7 @@ HWTEST2_F(CommandListCreate, givenDummyBlitRequiredWhenEncodeMiFlushThenDummyBli auto &commandContainer = cmdlist.getCmdContainer(); cmdlist.dummyBlitWa.isWaRequired = true; MiFlushArgs args{cmdlist.dummyBlitWa}; + args.commandWithPostSync = true; auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironmentRef(); commandContainer.getResidencyContainer().clear(); EXPECT_EQ(nullptr, rootDeviceEnvironment.getDummyAllocation()); @@ -3051,6 +3052,31 @@ HWTEST2_F(CommandListCreate, givenDummyBlitRequiredWhenEncodeMiFlushThenDummyBli EXPECT_EQ(commandContainer.getResidencyContainer()[0], rootDeviceEnvironment.getDummyAllocation()); } +HWTEST2_F(CommandListCreate, givenDummyBlitRequiredWhenEncodeMiFlushWithoutPostSyncThenDummyBlitIsNotProgrammedAndDummyAllocationIsNotAddedToResidencyContainer, IsAtLeastXeHpCore) { + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + DebugManagerStateRestore restorer; + debugManager.flags.ForceDummyBlitWa.set(1); + MockCommandListCoreFamily cmdlist; + cmdlist.initialize(device, NEO::EngineGroupType::copy, 0u); + cmdlist.csr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver; + auto &commandContainer = cmdlist.getCmdContainer(); + cmdlist.dummyBlitWa.isWaRequired = true; + MiFlushArgs args{cmdlist.dummyBlitWa}; + args.commandWithPostSync = false; + auto &rootDeviceEnvironment = device->getNEODevice()->getRootDeviceEnvironmentRef(); + rootDeviceEnvironment.initDummyAllocation(); + EXPECT_NE(nullptr, rootDeviceEnvironment.getDummyAllocation()); + commandContainer.getResidencyContainer().clear(); + cmdlist.encodeMiFlush(0, 0, args); + GenCmdList programmedCommands; + ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( + programmedCommands, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed())); + auto itor = find(programmedCommands.begin(), programmedCommands.end()); + EXPECT_EQ(programmedCommands.begin(), itor); + EXPECT_NE(programmedCommands.end(), itor); + EXPECT_EQ(commandContainer.getResidencyContainer().size(), 0u); +} + HWTEST2_F(CommandListCreate, givenDummyBlitNotRequiredWhenEncodeMiFlushThenDummyBlitIsNotProgrammedAndDummyAllocationIsNotAddedToResidencyContainer, IsAtLeastXeHpCore) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; DebugManagerStateRestore restorer; diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 32c3248494..f8ad09e418 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -1116,6 +1116,7 @@ void EncodeMiFlushDW::appendWa(LinearStream &commandStream, MiFlushAr template void EncodeMiFlushDW::programWithWa(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData, MiFlushArgs &args) { + UNRECOVERABLE_IF(args.waArgs.isWaRequired && !args.commandWithPostSync); appendWa(commandStream, args); args.waArgs.isWaRequired = false;