From d1bc7199de140e3c064498ab4299c00921759290 Mon Sep 17 00:00:00 2001
From: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
Date: Wed, 25 Mar 2020 10:04:42 +0100
Subject: [PATCH] Switch to 3D pipeline to program selected commands - part 2

Resolves: NEO-4447

Change-Id: I1dd6a9694cdf3be19aadec1cd139c466baecbcd7
Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
---
 Jenkinsfile                                   |  2 +-
 .../core/source/cmdqueue/cmdqueue_hw_base.inl |  7 ++-
 .../command_container/command_encoder.h       |  6 +++
 .../command_encoder_base.inl                  | 17 ++++++
 .../gen12lp/command_encoder_gen12lp.cpp       | 21 ++++++++
 .../encoders/test_encode_dispatch_kernel.cpp  |  9 +++-
 .../unit_test/encoders/test_encode_states.cpp |  7 ++-
 .../gen12lp/test_command_encoder_gen12lp.cpp  | 52 +++++++++++++++++++
 8 files changed, 115 insertions(+), 6 deletions(-)
diff --git a/Jenkinsfile b/Jenkinsfile
index 0c8ebfa847..e765ad311b 100644
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -1,5 +1,5 @@
 #!groovy
 dependenciesRevision='3232e5d67b5c3dd2323f13bede3ab1558b5aa4b9-1401'
 strategy='EQUAL'
-allowedCD=222
+allowedCD=221
 allowedF=11
diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl
index e1e416ac6b..8dd738631a 100644
--- a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl
+++ b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl
@@ -7,6 +7,8 @@
 
 #pragma once
 
+#include "shared/source/command_container/command_encoder.h"
+#include "shared/source/command_container/command_encoder_base.inl"
 #include "shared/source/command_stream/csr_definitions.h"
 #include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/device/device.h"
@@ -40,6 +42,7 @@ void CommandQueueHw<gfxCoreFamily>::programGeneralStateBaseAddress(uint64_t gsba
     pcCmd->setCommandStreamerStallEnable(true);
 
     auto gmmHelper = device->getNEODevice()->getGmmHelper();
+    NEO::EncodeWA<GfxFamily>::encodeAdditionalPipelineSelect(*device->getNEODevice(), commandStream, true);
 
     NEO::StateBaseAddressHelper<GfxFamily>::programStateBaseAddress(commandStream,
                                                                     nullptr,
@@ -54,6 +57,8 @@ void CommandQueueHw<gfxCoreFamily>::programGeneralStateBaseAddress(uint64_t gsba
                                                                     false);
 
     gsbaInit = true;
+
+    NEO::EncodeWA<GfxFamily>::encodeAdditionalPipelineSelect(*device->getNEODevice(), commandStream, false);
 }
 
 template <GFXCORE_FAMILY gfxCoreFamily>
@@ -62,7 +67,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStateBaseAddressCmdSize() {
     using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS;
     using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
 
-    constexpr size_t size = sizeof(STATE_BASE_ADDRESS) + sizeof(PIPE_CONTROL);
+    size_t size = sizeof(STATE_BASE_ADDRESS) + sizeof(PIPE_CONTROL) + NEO::EncodeWA<GfxFamily>::getAdditionalPipelineSelectSize(*device->getNEODevice());
     return size;
 }
 
diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h
index ee064fd988..2fc27f7c90 100644
--- a/shared/source/command_container/command_encoder.h
+++ b/shared/source/command_container/command_encoder.h
@@ -173,6 +173,12 @@ struct EncodeComputeMode {
     static void adjustPipelineSelect(CommandContainer &container, uint32_t numGrfRequired);
 };
 
+template <typename GfxFamily>
+struct EncodeWA { // per-family hooks for the additional PIPELINE_SELECT workaround
+    static void encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline); // may emit a PIPELINE_SELECT into stream; no-op unless a family specializes it
+    static size_t getAdditionalPipelineSelectSize(Device &device); // bytes added to command-size estimates to cover those commands
+};
+
 template <typename GfxFamily>
 struct EncodeSempahore {
     using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
diff --git a/shared/source/command_container/command_encoder_base.inl b/shared/source/command_container/command_encoder_base.inl
index a2e8db4914..99f4bc3f11 100644
--- a/shared/source/command_container/command_encoder_base.inl
+++ b/shared/source/command_container/command_encoder_base.inl
@@ -53,7 +53,10 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
         idd.setKernelStartPointer(offset);
         idd.setKernelStartPointerHigh(0u);
     }
+
+    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true);
     EncodeStates<Family>::adjustStateComputeMode(*container.getCommandStream(), container.lastSentNumGrfRequired, nullptr, false, false);
+    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);
 
     auto threadsPerThreadGroup = dispatchInterface->getThreadsPerThreadGroupCount();
     idd.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
@@ -224,6 +227,8 @@ void EncodeMediaInterfaceDescriptorLoad<Family>::encode(CommandContainer &contai
 
 template <typename Family>
 void EncodeStateBaseAddress<Family>::encode(CommandContainer &container) {
+    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), true);
+
     auto gmmHelper = container.getDevice()->getGmmHelper();
 
     StateBaseAddressHelper<Family>::programStateBaseAddress(
@@ -238,6 +243,8 @@ void EncodeStateBaseAddress<Family>::encode(CommandContainer &container) {
         false,
         gmmHelper,
         false);
+
+    EncodeWA<Family>::encodeAdditionalPipelineSelect(*container.getDevice(), *container.getCommandStream(), false);
 }
 
 template <typename Family>
@@ -259,6 +266,7 @@ size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device
     totalSize += sizeof(MEDIA_STATE_FLUSH);
     totalSize += issueMediaInterfaceDescriptorLoad;
     totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
+    totalSize += EncodeWA<Family>::getAdditionalPipelineSelectSize(*device);
     totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
     totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
     totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
@@ -278,4 +286,13 @@ template <typename GfxFamily>
 size_t EncodeMiFlushDW<GfxFamily>::getMiFlushDwWaSize() {
     return 0;
 }
+
+template <typename GfxFamily>
+inline void EncodeWA<GfxFamily>::encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline) {} // generic fallback: nothing is emitted unless a family specializes this
+
+template <typename GfxFamily>
+inline size_t EncodeWA<GfxFamily>::getAdditionalPipelineSelectSize(Device &device) {
+    return size_t{0}; // generic fallback: no additional PIPELINE_SELECT, so no extra space is reserved
+}
+
 } // namespace NEO
diff --git a/shared/source/gen12lp/command_encoder_gen12lp.cpp b/shared/source/gen12lp/command_encoder_gen12lp.cpp
index 3ea23237f7..64e4f1535a 100644
--- a/shared/source/gen12lp/command_encoder_gen12lp.cpp
+++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp
@@ -11,10 +11,21 @@
 #include "shared/source/command_container/encode_compute_mode_tgllp_plus.inl"
 #include "shared/source/gen12lp/hw_cmds_base.h"
 #include "shared/source/gen12lp/reg_configs.h"
+#include "shared/source/helpers/preamble.h"
 
 namespace NEO {
 
 using Family = TGLLPFamily;
+
+// Gen12LP: space for two PIPELINE_SELECT commands when the default engine is RCS, otherwise zero.
+template <>
+inline size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device) {
+    if (!device.getDefaultEngine().commandStreamReceiver->isRcs()) {
+        return 0;
+    }
+    return 2 * PreambleHelper<Family>::getCmdSizeForPipelineSelect(device.getHardwareInfo());
+}
+
 template <>
 size_t EncodeStates<Family>::getAdjustStateComputeModeSize() {
     return sizeof(typename Family::STATE_COMPUTE_MODE);
@@ -27,6 +38,15 @@ void EncodeComputeMode<Family>::adjustComputeMode(LinearStream &csr, uint32_t nu
     *reinterpret_cast<STATE_COMPUTE_MODE *>(buffer) = *stateComputeMode;
 }
 
+template <>
+inline void EncodeWA<Family>::encodeAdditionalPipelineSelect(Device &device, LinearStream &stream, bool is3DPipeline) {
+    if (device.getDefaultEngine().commandStreamReceiver->isRcs()) { // nothing is programmed unless the default engine is RCS
+        PipelineSelectArgs pipelineSelectArgs;
+        pipelineSelectArgs.is3DPipelineRequired = is3DPipeline; // only this field is set here; the rest keep their defaults
+        PreambleHelper<Family>::programPipelineSelect(&stream, pipelineSelectArgs, device.getHardwareInfo());
+    }
+}
+
 template struct EncodeDispatchKernel<Family>;
 template struct EncodeStates<Family>;
 template struct EncodeMath<Family>;
@@ -42,4 +62,5 @@ template struct EncodeAtomic<Family>;
 template struct EncodeSempahore<Family>;
 template struct EncodeBatchBufferStartOrEnd<Family>;
 template struct EncodeMiFlushDW<Family>;
+template struct EncodeWA<Family>;
 } // namespace NEO
diff --git a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp
index e7e319db42..41feab2173 100644
--- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp
+++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel.cpp
@@ -260,8 +260,13 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenCleanHeapsAndSlmNotCha
     GenCmdList commands;
     CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed());
 
-    auto itorPC = find<PIPE_CONTROL *>(commands.begin(), commands.end());
-    ASSERT_EQ(itorPC, commands.end());
+    if (HardwareCommandsHelper<TGLLPFamily>::isPipeControlPriorToPipelineSelectWArequired(pDevice->getHardwareInfo())) {
+        auto itorPC = findAll<PIPE_CONTROL *>(commands.begin(), commands.end());
+        EXPECT_EQ(2u, itorPC.size());
+    } else {
+        auto itorPC = find<PIPE_CONTROL *>(commands.begin(), commands.end());
+        ASSERT_EQ(itorPC, commands.end());
+    }
 }
 
 HWCMDTEST_F(IGFX_GEN8_CORE, CommandEncodeStatesTest, givenDirtyHeapsAndSlmNotChangedWhenDispatchKernelThenHeapsAreCleanAndFlushAdded) {
diff --git a/shared/test/unit_test/encoders/test_encode_states.cpp b/shared/test/unit_test/encoders/test_encode_states.cpp
index 4d784bdbe7..44f5882a02 100644
--- a/shared/test/unit_test/encoders/test_encode_states.cpp
+++ b/shared/test/unit_test/encoders/test_encode_states.cpp
@@ -107,7 +107,6 @@ HWTEST_F(CommandEncodeStatesTest, givenCreatedSurfaceStateBufferWhenGpuCoherency
 HWTEST_F(CommandEncodeStatesTest, givenCommandContainerWithDirtyHeapsWhenSetStateBaseAddressCalledThenStateBaseAddressAreNotSet) {
     using STATE_BASE_ADDRESS = typename FamilyType::STATE_BASE_ADDRESS;
     cmdContainer->dirtyHeaps = 0;
-    auto baseAddres = cmdContainer->getCommandStream()->getCpuBase();
 
     cmdContainer->setHeapDirty(NEO::HeapType::DYNAMIC_STATE);
     cmdContainer->setHeapDirty(NEO::HeapType::INDIRECT_OBJECT);
@@ -119,7 +118,11 @@ HWTEST_F(CommandEncodeStatesTest, givenCommandContainerWithDirtyHeapsWhenSetStat
     auto ioh = cmdContainer->getIndirectHeap(NEO::HeapType::INDIRECT_OBJECT);
     auto ssh = cmdContainer->getIndirectHeap(NEO::HeapType::SURFACE_STATE);
 
-    auto pCmd = static_cast<STATE_BASE_ADDRESS *>(baseAddres);
+    GenCmdList commands;
+    CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed());
+
+    auto itorCmd = find<STATE_BASE_ADDRESS *>(commands.begin(), commands.end());
+    auto pCmd = genCmdCast<STATE_BASE_ADDRESS *>(*itorCmd);
 
     EXPECT_EQ(dsh->getHeapGpuBase(), pCmd->getDynamicStateBaseAddress());
     EXPECT_EQ(ioh->getHeapGpuBase(), pCmd->getIndirectObjectBaseAddress());
diff --git a/shared/test/unit_test/gen12lp/test_command_encoder_gen12lp.cpp b/shared/test/unit_test/gen12lp/test_command_encoder_gen12lp.cpp
index 1a5f20e841..4935ed8dc9 100644
--- a/shared/test/unit_test/gen12lp/test_command_encoder_gen12lp.cpp
+++ b/shared/test/unit_test/gen12lp/test_command_encoder_gen12lp.cpp
@@ -7,6 +7,8 @@
 
 #include "shared/source/command_container/cmdcontainer.h"
 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/helpers/preamble.h"
+#include "shared/source/os_interface/os_context.h"
 #include "shared/test/unit_test/cmd_parse/gen_cmd_parse.h"
 
 #include "opencl/test/unit_test/fixtures/device_fixture.h"
@@ -62,3 +64,53 @@ GEN12LPTEST_F(CommandEncoderTest, givenCommandContainerWhenEncodeL3StateThenSetC
     EXPECT_EQ(cmd->getRegisterOffset(), 0xB134u);
     EXPECT_EQ(cmd->getDataDword(), 0xD0000020u);
 }
+
+struct MockOsContext : OsContext { // test-only view of OsContext
+    using OsContext::engineType; // exposed so tests can overwrite the engine type
+};
+
+GEN12LPTEST_F(CommandEncoderTest, givenVariousEngineTypesWhenEncodeSBAThenAdditionalPipelineSelectWAIsAppliedOnlyToRcs) {
+    // State base address is encoded twice: once on the fixture's default engine and once with the engine type forced to CCS.
+    using PIPELINE_SELECT = typename FamilyType::PIPELINE_SELECT;
+
+    CommandContainer cmdContainer;
+
+    bool ret = cmdContainer.initialize(pDevice);
+    ASSERT_TRUE(ret);
+
+    { // default engine (the test expects it to be RCS): a PIPELINE_SELECT must be present
+        EncodeStateBaseAddress<FamilyType>::encode(cmdContainer);
+
+        GenCmdList commands;
+        CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
+        auto itorPipelineSelect = find<PIPELINE_SELECT *>(commands.begin(), commands.end());
+        EXPECT_NE(itorPipelineSelect, commands.end());
+    }
+
+    cmdContainer.reset();
+
+    { // engine type overridden to CCS: no PIPELINE_SELECT may be emitted
+        static_cast<MockOsContext *>(pDevice->getDefaultEngine().osContext)->engineType = aub_stream::ENGINE_CCS;
+
+        EncodeStateBaseAddress<FamilyType>::encode(cmdContainer);
+
+        GenCmdList commands;
+        CmdParse<FamilyType>::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed());
+        auto itorPipelineSelect = find<PIPELINE_SELECT *>(commands.begin(), commands.end());
+        EXPECT_EQ(itorPipelineSelect, commands.end());
+    }
+}
+
+GEN12LPTEST_F(CommandEncoderTest, givenVariousEngineTypesWhenEstimateCommandBufferSizeThenRcsHasAdditionalPipelineSelectWASize) {
+    // Estimate with the fixture's default engine first, then again after forcing the engine type to CCS;
+    // the two estimates must differ by exactly twice the PIPELINE_SELECT command size.
+
+    const auto sizeWithWa = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(pDevice);
+    static_cast<MockOsContext *>(pDevice->getDefaultEngine().osContext)->engineType = aub_stream::ENGINE_CCS;
+    const auto sizeWithoutWa = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(pDevice);
+
+    const auto expectedDiff = 2 * PreambleHelper<FamilyType>::getCmdSizeForPipelineSelect(pDevice->getHardwareInfo());
+    const auto diff = sizeWithWa - sizeWithoutWa;
+
+    EXPECT_EQ(expectedDiff, diff);
+}