diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 0b9abd45df..c8deae35b2 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -1494,6 +1494,11 @@ ze_result_t CommandListCoreFamily::setGlobalWorkSizeIndirect(NEO: template void CommandListCoreFamily::programStateBaseAddress(NEO::CommandContainer &container) { + NEO::PipeControlArgs args(true); + args.hdcPipelineFlush = true; + args.textureCacheInvalidationEnable = true; + NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); + NEO::EncodeStateBaseAddress::encode(commandContainer); if (device->getL0Debugger()) { device->getL0Debugger()->captureStateBaseAddress(commandContainer); diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl index 494df5336a..09d095665e 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl @@ -8,7 +8,6 @@ #pragma once #include "shared/source/command_container/command_encoder.h" -#include "shared/source/command_container/command_encoder_base.inl" #include "shared/source/command_stream/csr_definitions.h" #include "shared/source/command_stream/linear_stream.h" #include "shared/source/device/device.h" diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index 7b73320715..39badeb2b6 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -191,8 +191,9 @@ TEST_F(CommandListCreate, givenInvalidProductFamilyThenReturnsNullPointer) { EXPECT_EQ(nullptr, commandList); } -HWTEST_F(CommandListCreate, whenCommandListIsCreatedThenStateBaseAddressCmdIsAddedAndCorrectlyProgrammed) { +HWTEST_F(CommandListCreate, whenCommandListIsCreatedThenPCAndStateBaseAddressCmdsAreAddedAndCorrectlyProgrammed) { using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; std::unique_ptr commandList(CommandList::create(productFamily, device, false)); auto &commandContainer = commandList->commandContainer; @@ -210,8 +211,17 @@ HWTEST_F(CommandListCreate, whenCommandListIsCreatedThenStateBaseAddressCmdIsAdd GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), usedSpaceAfter)); - auto itor = find(cmdList.begin(), cmdList.end()); + + auto itorPc = find(cmdList.begin(), cmdList.end()); + ASSERT_NE(cmdList.end(), itorPc); + auto cmdPc = genCmdCast(*itorPc); + EXPECT_TRUE(cmdPc->getDcFlushEnable()); + EXPECT_TRUE(cmdPc->getCommandStreamerStallEnable()); + EXPECT_TRUE(cmdPc->getTextureCacheInvalidationEnable()); + + auto itor = find(itorPc, cmdList.end()); ASSERT_NE(cmdList.end(), itor); + auto cmdSba = genCmdCast(*itor); auto dsh = commandContainer.getIndirectHeap(NEO::HeapType::DYNAMIC_STATE); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp index 464b888a1d..ea6b250061 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp @@ -31,8 +31,8 @@ HWTEST_F(CommandListAppendBarrier, WhenAppendingBarrierThenPipeControlIsGenerate GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, - ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), 0), - usedSpaceAfter)); + ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), usedSpaceBefore), + usedSpaceAfter - usedSpaceBefore)); // Find a PC w/ CS stall auto itorPC = find(cmdList.begin(), cmdList.end()); diff --git a/shared/source/command_container/command_encoder_base.inl b/shared/source/command_container/command_encoder_base.inl index 78db2a08f3..41bebe57bc 100644 --- a/shared/source/command_container/command_encoder_base.inl +++ b/shared/source/command_container/command_encoder_base.inl @@ -159,12 +159,21 @@ void EncodeDispatchKernel::encode(CommandContainer &container, } auto slmSizeNew = dispatchInterface->getSlmTotalSize(); - bool flush = container.slmSize != slmSizeNew || container.isAnyHeapDirty(); + bool dirtyHeaps = container.isAnyHeapDirty(); + bool flush = container.slmSize != slmSizeNew || dirtyHeaps; if (flush) { PipeControlArgs args(true); + if (dirtyHeaps) { + args.hdcPipelineFlush = true; + } MemorySynchronizationCommands::addPipeControl(*container.getCommandStream(), args); + if (dirtyHeaps) { + EncodeStateBaseAddress::encode(container); + container.setDirtyStateForAllHeaps(false); + } + if (container.slmSize != slmSizeNew) { EncodeL3State::encode(container, slmSizeNew != 0u); container.slmSize = slmSizeNew; @@ -173,11 +182,6 @@ void EncodeDispatchKernel::encode(CommandContainer &container, EncodeMediaInterfaceDescriptorLoad::encode(container); } } - - if (container.isAnyHeapDirty()) { - EncodeStateBaseAddress::encode(container); - container.setDirtyStateForAllHeaps(false); - } } uint32_t numIDD = 0u; diff --git a/shared/source/gen11/command_encoder_gen11.cpp b/shared/source/gen11/command_encoder_gen11.cpp index 2deeec60a5..886d77f48e 100644 --- a/shared/source/gen11/command_encoder_gen11.cpp +++ b/shared/source/gen11/command_encoder_gen11.cpp @@ -6,16 +6,16 @@ */ #include "shared/source/command_container/command_encoder.h" -#include "shared/source/command_container/command_encoder.inl" -#include "shared/source/command_container/command_encoder_base.inl" -#include "shared/source/command_container/encode_compute_mode_bdw_plus.inl" #include "shared/source/gen11/hw_cmds_base.h" #include "shared/source/gen11/reg_configs.h" +using Family = NEO::ICLFamily; + +#include "shared/source/command_container/command_encoder.inl" +#include "shared/source/command_container/command_encoder_base.inl" +#include "shared/source/command_container/encode_compute_mode_bdw_plus.inl" + namespace NEO { - -using Family = ICLFamily; - template struct EncodeDispatchKernel; template struct EncodeStates; template struct EncodeMath; @@ -32,4 +32,5 @@ template struct EncodeSempahore; template struct EncodeBatchBufferStartOrEnd; template struct EncodeMiFlushDW; template struct EncodeMemoryPrefetch; +template struct EncodeWA; } // namespace NEO diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp index 7262895a69..ecf6eee9a8 100644 --- a/shared/source/gen12lp/command_encoder_gen12lp.cpp +++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp @@ -6,19 +6,19 @@ */ #include "shared/source/command_container/command_encoder.h" -#include "shared/source/command_container/command_encoder.inl" -#include "shared/source/command_container/command_encoder_base.inl" -#include "shared/source/command_container/encode_compute_mode_tgllp_plus.inl" #include "shared/source/gen12lp/hw_cmds_base.h" #include "shared/source/gen12lp/reg_configs.h" #include "shared/source/helpers/preamble.h" +using Family = NEO::TGLLPFamily; + +#include "shared/source/command_container/command_encoder.inl" +#include "shared/source/command_container/command_encoder_base.inl" +#include "shared/source/command_container/encode_compute_mode_tgllp_plus.inl" + namespace NEO { - -using Family = TGLLPFamily; - template <> -inline size_t EncodeWA::getAdditionalPipelineSelectSize(Device &device) { +size_t EncodeWA::getAdditionalPipelineSelectSize(Device &device) { size_t size = 0; if (device.getDefaultEngine().commandStreamReceiver->isRcs()) { size += 2 * PreambleHelper::getCmdSizeForPipelineSelect(device.getHardwareInfo()); @@ -39,7 +39,7 @@ void EncodeComputeMode::adjustComputeMode(LinearStream &csr, uint32_t nu } template <> -inline void EncodeWA::encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline) { +void EncodeWA::encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline) { if (device.getDefaultEngine().commandStreamReceiver->isRcs()) { PipelineSelectArgs args; args.is3DPipelineRequired = is3DPipeline; diff --git a/shared/source/gen8/command_encoder_gen8.cpp b/shared/source/gen8/command_encoder_gen8.cpp index 4a09392770..d95277d8c7 100644 --- a/shared/source/gen8/command_encoder_gen8.cpp +++ b/shared/source/gen8/command_encoder_gen8.cpp @@ -6,16 +6,16 @@ */ #include "shared/source/command_container/command_encoder.h" -#include "shared/source/command_container/command_encoder.inl" -#include "shared/source/command_container/command_encoder_base.inl" -#include "shared/source/command_container/encode_compute_mode_bdw_plus.inl" #include "shared/source/gen8/hw_cmds_base.h" #include "shared/source/gen8/reg_configs.h" +using Family = NEO::BDWFamily; + +#include "shared/source/command_container/command_encoder.inl" +#include "shared/source/command_container/command_encoder_base.inl" +#include "shared/source/command_container/encode_compute_mode_bdw_plus.inl" + namespace NEO { - -using Family = BDWFamily; - template struct EncodeDispatchKernel; template struct EncodeStates; template struct EncodeMath; @@ -32,4 +32,5 @@ template struct EncodeSempahore; template struct EncodeBatchBufferStartOrEnd; template struct EncodeMiFlushDW; template struct EncodeMemoryPrefetch; +template struct EncodeWA; } // namespace NEO diff --git a/shared/source/gen9/command_encoder_gen9.cpp b/shared/source/gen9/command_encoder_gen9.cpp index 8f65fde7ba..eda87d15f0 100644 --- a/shared/source/gen9/command_encoder_gen9.cpp +++ b/shared/source/gen9/command_encoder_gen9.cpp @@ -6,16 +6,16 @@ */ #include "shared/source/command_container/command_encoder.h" -#include "shared/source/command_container/command_encoder.inl" -#include "shared/source/command_container/command_encoder_base.inl" -#include "shared/source/command_container/encode_compute_mode_bdw_plus.inl" #include "shared/source/gen9/hw_cmds_base.h" #include "shared/source/gen9/reg_configs.h" +using Family = NEO::SKLFamily; + +#include "shared/source/command_container/command_encoder.inl" +#include "shared/source/command_container/command_encoder_base.inl" +#include "shared/source/command_container/encode_compute_mode_bdw_plus.inl" + namespace NEO { - -using Family = SKLFamily; - template struct EncodeDispatchKernel; template struct EncodeStates; template struct EncodeMath; @@ -32,4 +32,5 @@ template struct EncodeSempahore; template struct EncodeBatchBufferStartOrEnd; template struct EncodeMiFlushDW; template struct EncodeMemoryPrefetch; +template struct EncodeWA; } // namespace NEO diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp index f60f49ae89..d527838e13 100644 --- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp +++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp @@ -69,8 +69,6 @@ HWTEST_F(CommandEncodeStatesTest, givenCommandContainerWithUsedAvailableSizeWhen HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenSlmTotalSizeGraterThanZeroWhenDispatchingKernelThenSharedMemorySizeSetCorrectly) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; - using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; - using SAMPLER_STATE = typename FamilyType::SAMPLER_STATE; uint32_t dims[] = {2, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); uint32_t slmTotalSize = 1; @@ -87,8 +85,6 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenSlmTotalSizeGraterThan HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenSlmTotalSizeEqualZeroWhenDispatchingKernelThenSharedMemorySizeSetCorrectly) { using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; - using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; - using SAMPLER_STATE = typename FamilyType::SAMPLER_STATE; uint32_t dims[] = {2, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); uint32_t slmTotalSize = 0; @@ -285,6 +281,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenCleanHeapsAndSlmNotCha HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenDirtyHeapsAndSlmNotChangedWhenDispatchKernelThenHeapsAreCleanAndFlushAdded) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + uint32_t dims[] = {2, 1, 1}; std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); cmdContainer->slmSize = 1; @@ -301,6 +298,43 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenDirtyHeapsAndSlmNotCha EXPECT_FALSE(cmdContainer->isAnyHeapDirty()); } +HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenDirtyHeapsWhenDispatchKernelThenPCIsAddedBeforeSBA) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS; + + uint32_t dims[] = {2, 1, 1}; + std::unique_ptr dispatchInterface(new MockDispatchKernelEncoder()); + cmdContainer->slmSize = 1; + EXPECT_CALL(*dispatchInterface.get(), getSlmTotalSize()).WillRepeatedly(::testing::Return(cmdContainer->slmSize)); + cmdContainer->setDirtyStateForAllHeaps(true); + + EncodeDispatchKernel::encode(*cmdContainer.get(), dims, false, false, dispatchInterface.get(), 0, pDevice, NEO::PreemptionMode::Disabled); + + GenCmdList cmdList; + CmdParse::parseCommandBuffer(cmdList, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); + + auto itor = reverse_find(cmdList.rbegin(), cmdList.rend()); + ASSERT_NE(cmdList.rend(), itor); + + auto cmdSba = genCmdCast(*itor); + EXPECT_NE(nullptr, cmdSba); + + auto itorPc = reverse_find(itor, cmdList.rend()); + ASSERT_NE(cmdList.rend(), itorPc); + + bool foundPcWithDCFlush = false; + + do { + auto cmdPc = genCmdCast(*itorPc); + if (cmdPc && cmdPc->getDcFlushEnable()) { + foundPcWithDCFlush = true; + break; + } + } while (++itorPc != cmdList.rend()); + + EXPECT_TRUE(foundPcWithDCFlush); +} + HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenCleanHeapsAndSlmChangedWhenDispatchKernelThenFlushAdded) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; uint32_t dims[] = {2, 1, 1};