Command container appends BB_END on cmd buffer allocation end

When the linear stream created for a command container does not have enough space for both the requested command and BB_END, it programs BB_END and allocates a new command buffer allocation. In this case, the pointer returned from getSpace points to storage in the new command buffer allocation. Related-To: NEO-5707 Signed-off-by: Maciej Plewka <maciej.plewka@intel.com>
2025-12-21 01:04:57 +08:00 · 2022-01-12 16:57:42 +00:00
parent 92316c48f2
commit 9d8ce7aace
31 changed files with 262 additions and 306 deletions
--- a/level_zero/core/source/cmdlist/cmdlist_hw.h
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -159,7 +159,6 @@ struct CommandListCoreFamily : CommandListImp {
    ze_result_t reset() override;
    ze_result_t executeCommandListImmediate(bool performMigration) override;
    size_t getReserveSshSize();
-    void increaseCommandStreamSpace(size_t commandSize);

  protected:
    MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -332,7 +332,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendEventReset(ze_event_hand
        if (this->partitionCount > 1) {
            estimateSize += estimateBufferSizeMultiTileBarrier(hwInfo);
        }
-        increaseCommandStreamSpace(estimateSize);

        for (uint32_t i = 0u; i < packetsToReset; i++) {
            NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
@@ -896,13 +895,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlit(uintptr_t
    commandContainer.addToResidencyContainer(clearColorAllocation);

    NEO::BlitPropertiesContainer blitPropertiesContainer{blitProperties};
-    bool blitterDirectSubmission = true; // assume direct submission enabled, since usually MI_BATCH_BUFFER_START is bigger than MI_BATCH_BUFFER_END
-    size_t estimatedSize = NEO::BlitCommandsHelper<GfxFamily>::template BlitCommandsHelper<GfxFamily>::estimateBlitCommandsSize(blitPropertiesContainer,
-                                                                                                                                false,
-                                                                                                                                false,
-                                                                                                                                blitterDirectSubmission,
-                                                                                                                                *device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);
-    increaseCommandStreamSpace(estimatedSize);

    NEO::BlitCommandsHelper<GfxFamily>::dispatchBlitCommandsForBufferPerRow(blitProperties, *commandContainer.getCommandStream(), *device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);

@@ -946,13 +938,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(NEO
    }

    NEO::BlitPropertiesContainer blitPropertiesContainer{blitProperties};
-    bool blitterDirectSubmission = true; // assume direct submission enabled, since usually MI_BATCH_BUFFER_START is bigger than MI_BATCH_BUFFER_END
-    size_t estimatedSize = NEO::BlitCommandsHelper<GfxFamily>::template BlitCommandsHelper<GfxFamily>::estimateBlitCommandsSize(blitPropertiesContainer,
-                                                                                                                                false,
-                                                                                                                                false,
-                                                                                                                                blitterDirectSubmission,
-                                                                                                                                *device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);
-    increaseCommandStreamSpace(estimatedSize);

    appendEventForProfiling(hSignalEvent, true);
    bool copyRegionPreferred = NEO::BlitCommandsHelper<GfxFamily>::isCopyRegionPreferred(copySizeModified, *device->getNEODevice()->getExecutionEnvironment()->rootDeviceEnvironments[device->getRootDeviceIndex()]);
@@ -1684,11 +1669,9 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSignalEventPostWalker(ze_event_
        if (isCopyOnly()) {
            NEO::MiFlushArgs args;
            args.commandWithPostSync = true;
-            increaseCommandStreamSpace(NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite());
            NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), baseAddr, Event::STATE_SIGNALED,
                                                              args, hwInfo);
        } else {
-            increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo));
            NEO::PipeControlArgs args;
            args.dcFlushEnable = NEO::MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(event->signalScope, hwInfo);
            if (this->partitionCount > 1) {
@@ -1839,7 +1822,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
    if (isCopyOnly()) {
        NEO::MiFlushArgs args;
        args.commandWithPostSync = true;
-        increaseCommandStreamSpace(NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite());
        NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), ptrOffset(baseAddr, eventSignalOffset),
                                                          Event::STATE_SIGNALED, args, hwInfo);
    } else {
@@ -1851,7 +1833,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
            event->setPacketsInUse(this->partitionCount);
        }
        if (applyScope || event->isEventTimestampFlagSet()) {
-            increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo));
            NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
                *commandContainer.getCommandStream(),
                POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
@@ -1860,7 +1841,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendSignalEvent(ze_event_han
                hwInfo,
                args);
        } else {
-            increaseCommandStreamSpace(NEO::EncodeStoreMemory<GfxFamily>::getStoreDataImmSize());
            NEO::EncodeStoreMemory<GfxFamily>::programStoreDataImm(
                *commandContainer.getCommandStream(),
                ptrOffset(baseAddr, eventSignalOffset),
@@ -1928,7 +1908,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendWaitOnEvents(uint32_t nu
            estimatedBufferSize += NEO::EncodeSempahore<GfxFamily>::getSizeMiSemaphoreWait();
        }
    }
-    increaseCommandStreamSpace(estimatedBufferSize);

    if (dcFlushRequired) {
        if (isCopyOnly()) {
@@ -2204,17 +2183,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::reserveSpace(size_t size, void
    return ZE_RESULT_SUCCESS;
 }

-template <GFXCORE_FAMILY gfxCoreFamily>
-void CommandListCoreFamily<gfxCoreFamily>::increaseCommandStreamSpace(size_t commandSize) {
-    using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END;
-    size_t estimatedSizeRequired = commandSize + sizeof(MI_BATCH_BUFFER_END);
-    if (commandContainer.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
-        auto bbEnd = commandContainer.getCommandStream()->template getSpaceForCmd<MI_BATCH_BUFFER_END>();
-        *bbEnd = GfxFamily::cmdInitBatchBufferEnd;
-        commandContainer.allocateNextCommandBuffer();
-    }
-}
-
 template <GFXCORE_FAMILY gfxCoreFamily>
 ze_result_t CommandListCoreFamily<gfxCoreFamily>::prepareIndirectParams(const ze_group_count_t *pThreadGroupDimensions) {
    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
@@ -2353,9 +2321,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendBarrier(ze_event_handle_
    const auto &hwInfo = this->device->getHwInfo();
    if (!hSignalEvent) {
        if (isCopyOnly()) {
-            size_t estimatedSizeRequired = NEO::EncodeMiFlushDW<GfxFamily>::getMiFlushDwCmdSizeForDataWrite();
-            increaseCommandStreamSpace(estimatedSizeRequired);
-
            NEO::MiFlushArgs args;
            NEO::EncodeMiFlushDW<GfxFamily>::programMiFlushDw(*commandContainer.getCommandStream(), 0, 0, args, hwInfo);
        } else {
--- a/level_zero/core/source/cmdlist/cmdlist_hw_base.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_base.inl
@@ -183,9 +183,6 @@ void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionEpilogue() {}

 template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
-    size_t estimatedSizeRequired = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
-    increaseCommandStreamSpace(estimatedSizeRequired);
-
    NEO::PipeControlArgs args = createBarrierFlags();
    NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
 }
--- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl
@@ -134,8 +134,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z

    const auto &hwInfo = this->device->getHwInfo();
    if (NEO::DebugManager.flags.ForcePipeControlPriorToWalker.get()) {
-        increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl());
-
        NEO::PipeControlArgs args;
        NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
    }
@@ -245,8 +243,6 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z
            event->setPacketsInUse(partitionCount);
        }
        if (L3FlushEnable) {
-            size_t estimatedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForPipeControlWithPostSyncOperation(hwInfo);
-            increaseCommandStreamSpace(estimatedSize);
            programEventL3Flush<gfxCoreFamily>(hEvent, this->device, partitionCount, commandContainer);
        }
    }
@@ -302,16 +298,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(z

 template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionPrologue(uint32_t partitionDataSize) {
-    size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch<GfxFamily>::getOffsetRegisterSize();
-    increaseCommandStreamSpace(estimatedSizeRequired);
    NEO::ImplicitScalingDispatch<GfxFamily>::dispatchOffsetRegister(*commandContainer.getCommandStream(),
                                                                    partitionDataSize);
 }

 template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendMultiPartitionEpilogue() {
-    const size_t estimatedSizeRequired = NEO::ImplicitScalingDispatch<GfxFamily>::getOffsetRegisterSize();
-    increaseCommandStreamSpace(estimatedSizeRequired);
    NEO::ImplicitScalingDispatch<GfxFamily>::dispatchOffsetRegister(*commandContainer.getCommandStream(),
                                                                    NEO::ImplicitScalingDispatch<GfxFamily>::getPostSyncOffset());
 }
@@ -320,14 +312,9 @@ template <GFXCORE_FAMILY gfxCoreFamily>
 void CommandListCoreFamily<gfxCoreFamily>::appendComputeBarrierCommand() {
    if (this->partitionCount > 1) {
        auto neoDevice = device->getNEODevice();
-        auto &hwInfo = neoDevice->getHardwareInfo();
-
-        increaseCommandStreamSpace(estimateBufferSizeMultiTileBarrier(hwInfo));
        appendMultiTileBarrier(*neoDevice);
    } else {
        NEO::PipeControlArgs args = createBarrierFlags();
-        size_t estimatedSizeRequired = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl();
-        increaseCommandStreamSpace(estimatedSizeRequired);
        NEO::MemorySynchronizationCommands<GfxFamily>::addPipeControl(*commandContainer.getCommandStream(), args);
    }
 }
--- a/level_zero/core/source/xe_hpc_core/cmdlist_xe_hpc_core.cpp
+++ b/level_zero/core/source/xe_hpc_core/cmdlist_xe_hpc_core.cpp
@@ -44,9 +44,6 @@ ze_result_t CommandListCoreFamily<IGFX_XE_HPC_CORE>::appendMemoryPrefetch(const

    NEO::LinearStream &cmdStream = *commandContainer.getCommandStream();

-    size_t estimatedSizeRequired = NEO::EncodeMemoryPrefetch<GfxFamily>::getSizeForMemoryPrefetch(size);
-    increaseCommandStreamSpace(estimatedSizeRequired);
-
    NEO::EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(cmdStream, *gpuAlloc, static_cast<uint32_t>(size), offset, hwInfo);

    return ZE_RESULT_SUCCESS;
@@ -56,9 +53,6 @@ template <>
 void CommandListCoreFamily<IGFX_XE_HPC_CORE>::applyMemoryRangesBarrier(uint32_t numRanges,
                                                                       const size_t *pRangeSizes,
                                                                       const void **pRanges) {
-
-    increaseCommandStreamSpace(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSinglePipeControl());
-
    NEO::PipeControlArgs args;
    args.hdcPipelineFlush = true;
    args.unTypedDataPortCacheFlush = true;
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_1.cpp
@@ -106,10 +106,9 @@ HWTEST_F(CommandListAppendLaunchKernel, givenNotEnoughSpaceInCommandStreamWhenAp
    const auto streamCpu = stream->getCpuBase();

    Vec3<size_t> groupCount{1, 1, 1};
-    auto requiredSizeEstimate = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        device->getNEODevice(), {0, 0, 0}, groupCount, false, false, false, kernel.get(), false);
+    auto sizeLeftInStream = sizeof(MI_BATCH_BUFFER_END);
    auto available = stream->getAvailableSpace();
-    stream->getSpace(available - requiredSizeEstimate + 1);
+    stream->getSpace(available - sizeLeftInStream);
    auto bbEndPosition = stream->getSpace(0);

    const uint32_t threadGroupDimensions[3] = {1, 1, 1};
@@ -236,38 +235,6 @@ HWTEST_F(CommandListAppendLaunchKernel, WhenAppendingMultipleTimesThenSshIsNotDe
    EXPECT_NE(initialAllocation, reallocatedAllocation);
 }

-HWTEST2_F(CommandListAppendLaunchKernel, WhenAppendingFunctionThenUsedCmdBufferSizeDoesNotExceedEstimate, IsAtLeastSkl) {
-    createKernel();
-    ze_group_count_t groupCount{1, 1, 1};
-
-    auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
-    ze_result_t ret = commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
-    ASSERT_EQ(ZE_RESULT_SUCCESS, ret);
-
-    auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
-
-    auto result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false, false);
-    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
-
-    auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
-    auto estimate = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        device->getNEODevice(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1), false, false, false, kernel.get(), false);
-
-    EXPECT_LE(sizeAfter - sizeBefore, estimate);
-
-    sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
-
-    result = commandList->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, true, false, false);
-    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
-
-    sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
-    estimate = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        device->getNEODevice(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1), false, false, false, kernel.get(), false);
-
-    EXPECT_LE(sizeAfter - sizeBefore, estimate);
-    EXPECT_LE(sizeAfter - sizeBefore, estimate);
-}
-
 HWCMDTEST_F(IGFX_GEN8_CORE, CommandListAppendLaunchKernel, givenEventsWhenAppendingKernelThenPostSyncToEventIsGenerated) {
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
@@ -689,19 +656,10 @@ HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWithImplicitArgsWhe
    auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc);
    ASSERT_EQ(result, ZE_RESULT_SUCCESS);

-    auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
-
    result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
                                                     static_cast<ze_group_count_t *>(alloc),
                                                     nullptr, 0, nullptr);
    EXPECT_EQ(result, ZE_RESULT_SUCCESS);
-
-    auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
-    auto estimate = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        device->getNEODevice(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1), false, false, true, &kernel, false);
-
-    EXPECT_LE(sizeAfter - sizeBefore, estimate);
-
    auto heap = commandList->commandContainer.getIndirectHeap(HeapType::INDIRECT_OBJECT);
    uint64_t pImplicitArgsGPUVA = heap->getGraphicsAllocation()->getGpuAddress() + kernel.getSizeForImplicitArgsPatching() - sizeof(ImplicitArgs);
    auto workDimStoreRegisterMemCmd = FamilyType::cmdInitStoreRegisterMem;
@@ -869,19 +827,11 @@ HWTEST_F(CommandListAppendLaunchKernel, givenIndirectDispatchWhenAppendingThenWo
    auto result = context->allocDeviceMem(device->toHandle(), &deviceDesc, 16384u, 4096u, &alloc);
    ASSERT_EQ(result, ZE_RESULT_SUCCESS);

-    auto sizeBefore = commandList->commandContainer.getCommandStream()->getUsed();
-
    result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
                                                     static_cast<ze_group_count_t *>(alloc),
                                                     nullptr, 0, nullptr);
    EXPECT_EQ(result, ZE_RESULT_SUCCESS);

-    auto sizeAfter = commandList->commandContainer.getCommandStream()->getUsed();
-    auto estimate = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        device->getNEODevice(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(1, 1, 1), false, false, true, &kernel, false);
-
-    EXPECT_LE(sizeAfter - sizeBefore, estimate);
-
    kernel.groupSize[2] = 2;
    result = commandList->appendLaunchKernelIndirect(kernel.toHandle(),
                                                     static_cast<ze_group_count_t *>(alloc),
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel_2.cpp
@@ -1209,20 +1209,19 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, MultiTileCommandListAppendLaunchFunctionXeHpCoreTes
 HWTEST2_F(MultiTileCommandListAppendLaunchFunctionXeHpCoreTest, givenCooperativeKernelWhenAppendingKernelsThenDoNotUseImplicitScaling, IsAtLeastXeHpCore) {
    ze_group_count_t groupCount{1, 1, 1};

-    auto estimateWithNonCooperativeKernel = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        device->getNEODevice(), Vec3<size_t>{0, 0, 0}, Vec3<size_t>{1, 1, 1}, false, false, false, kernel.get(), true);
-    auto estimateWithCooperativeKernel = NEO::EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        device->getNEODevice(), Vec3<size_t>{0, 0, 0}, Vec3<size_t>{1, 1, 1}, false, true, false, kernel.get(), true);
-    EXPECT_GT(estimateWithNonCooperativeKernel, estimateWithCooperativeKernel);
-
    auto commandListWithNonCooperativeKernel = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
    auto result = commandListWithNonCooperativeKernel->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
    auto sizeBefore = commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getUsed();
    result = commandListWithNonCooperativeKernel->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false, false);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
-    auto sizeUsedWithNonCooperativeKernel = commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getUsed() - sizeBefore;
-    EXPECT_LE(sizeUsedWithNonCooperativeKernel, estimateWithNonCooperativeKernel);
+    auto sizeAfter = commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getUsed();
+    GenCmdList cmdList;
+    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
+        cmdList, ptrOffset(commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getCpuBase(), sizeBefore), sizeAfter - sizeBefore));
+    auto itorWalker = find<typename FamilyType::WALKER_TYPE *>(cmdList.begin(), cmdList.end());
+    auto cmd = genCmdCast<typename FamilyType::WALKER_TYPE *>(*itorWalker);
+    EXPECT_TRUE(cmd->getWorkloadPartitionEnable());

    auto commandListWithCooperativeKernel = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
    result = commandListWithCooperativeKernel->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
@@ -1230,8 +1229,14 @@ HWTEST2_F(MultiTileCommandListAppendLaunchFunctionXeHpCoreTest, givenCooperative
    sizeBefore = commandListWithCooperativeKernel->commandContainer.getCommandStream()->getUsed();
    result = commandListWithCooperativeKernel->appendLaunchKernelWithParams(kernel->toHandle(), &groupCount, nullptr, false, false, true);
    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
-    auto sizeUsedWithCooperativeKernel = commandListWithCooperativeKernel->commandContainer.getCommandStream()->getUsed() - sizeBefore;
-    EXPECT_LE(sizeUsedWithCooperativeKernel, estimateWithCooperativeKernel);
+    sizeAfter = commandListWithCooperativeKernel->commandContainer.getCommandStream()->getUsed();
+    cmdList.clear();
+    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
+        cmdList, ptrOffset(commandListWithNonCooperativeKernel->commandContainer.getCommandStream()->getCpuBase(), sizeBefore), sizeAfter - sizeBefore));
+
+    itorWalker = find<typename FamilyType::WALKER_TYPE *>(cmdList.begin(), cmdList.end());
+    cmd = genCmdCast<typename FamilyType::WALKER_TYPE *>(*itorWalker);
+    EXPECT_TRUE(cmd->getWorkloadPartitionEnable());
 }

 } // namespace ult
--- a/opencl/test/unit_test/helpers/hw_helper_tests.cpp
+++ b/opencl/test/unit_test/helpers/hw_helper_tests.cpp
@@ -1449,3 +1449,13 @@ HWTEST2_F(HwHelperTest, givenHwInfoConfigWhenCheckingForceNonGpuCoherencyWAThenF
    EXPECT_FALSE(hwHelper.forceNonGpuCoherencyWA(true));
    EXPECT_FALSE(hwHelper.forceNonGpuCoherencyWA(false));
 }
+
+HWTEST_F(HwHelperTest, GivenHwInfoWhenGetBatchBufferEndSizeCalledThenCorrectSizeReturned) {
+    const auto &hwHelper = HwHelper::get(renderCoreFamily);
+    EXPECT_EQ(hwHelper.getBatchBufferEndSize(), sizeof(typename FamilyType::MI_BATCH_BUFFER_END));
+}
+
+HWTEST_F(HwHelperTest, GivenHwInfoWhenGetBatchBufferEndReferenceCalledThenCorrectPtrReturned) {
+    const auto &hwHelper = HwHelper::get(renderCoreFamily);
+    EXPECT_EQ(hwHelper.getBatchBufferEndReference(), reinterpret_cast<const void *>(&FamilyType::cmdInitBatchBufferEnd));
+}
--- a/shared/source/command_container/cmdcontainer.cpp
+++ b/shared/source/command_container/cmdcontainer.cpp
@@ -69,8 +69,10 @@ ErrorCode CommandContainer::initialize(Device *device, AllocationsList *reusable

    cmdBufferAllocations.push_back(cmdBufferAllocation);

-    commandStream = std::unique_ptr<LinearStream>(new LinearStream(cmdBufferAllocation->getUnderlyingBuffer(),
-                                                                   defaultListCmdBufferSize));
+    const auto &hardwareInfo = device->getHardwareInfo();
+    auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
+    commandStream = std::make_unique<LinearStream>(cmdBufferAllocation->getUnderlyingBuffer(),
+                                                   alignedSize - cmdBufferReservedSize, this, hwHelper.getBatchBufferEndSize());

    commandStream->replaceGraphicsAllocation(cmdBufferAllocation);

@@ -264,7 +266,8 @@ void CommandContainer::allocateNextCommandBuffer() {

    cmdBufferAllocations.push_back(cmdBufferAllocation);

-    commandStream->replaceBuffer(cmdBufferAllocation->getUnderlyingBuffer(), defaultListCmdBufferSize);
+    size_t alignedSize = alignUp<size_t>(totalCmdBufferSize, MemoryConstants::pageSize64k);
+    commandStream->replaceBuffer(cmdBufferAllocation->getUnderlyingBuffer(), alignedSize - cmdBufferReservedSize);
    commandStream->replaceGraphicsAllocation(cmdBufferAllocation);

    if (!getFlushTaskUsedForImmediate()) {
@@ -272,6 +275,14 @@ void CommandContainer::allocateNextCommandBuffer() {
    }
 }

+void CommandContainer::closeAndAllocateNextCommandBuffer() {
+    auto &hwHelper = NEO::HwHelper::get(device->getHardwareInfo().platform.eRenderCoreFamily);
+    auto bbEndSize = hwHelper.getBatchBufferEndSize();
+    auto ptr = commandStream->getSpace(0u);
+    memcpy_s(ptr, bbEndSize, hwHelper.getBatchBufferEndReference(), bbEndSize);
+    allocateNextCommandBuffer();
+}
+
 void CommandContainer::prepareBindfulSsh() {
    if (ApiSpecificConfig::getBindlessConfiguration()) {
        if (allocationIndirectHeaps[IndirectHeap::Type::SURFACE_STATE] == nullptr) {
--- a/shared/source/command_container/cmdcontainer.h
+++ b/shared/source/command_container/cmdcontainer.h
@@ -36,10 +36,9 @@ enum class ErrorCode {
 class CommandContainer : public NonCopyableOrMovableClass {
  public:
    static constexpr size_t defaultListCmdBufferSize = MemoryConstants::kiloByte * 256;
-    static constexpr size_t totalCmdBufferSize =
-        defaultListCmdBufferSize +
-        MemoryConstants::cacheLineSize +
-        CSRequirements::csOverfetchSize;
+    static constexpr size_t cmdBufferReservedSize = MemoryConstants::cacheLineSize +
+                                                    CSRequirements::csOverfetchSize;
+    static constexpr size_t totalCmdBufferSize = defaultListCmdBufferSize + cmdBufferReservedSize;

    CommandContainer();

@@ -86,6 +85,7 @@ class CommandContainer : public NonCopyableOrMovableClass {

    IndirectHeap *getHeapWithRequiredSizeAndAlignment(HeapType heapType, size_t sizeRequired, size_t alignment);
    void allocateNextCommandBuffer();
+    void closeAndAllocateNextCommandBuffer();

    void handleCmdBufferAllocations(size_t startIndex);
    GraphicsAllocation *obtainNextCommandBufferAllocation();
--- a/shared/source/command_container/command_encoder.h
+++ b/shared/source/command_container/command_encoder.h
@@ -7,7 +7,6 @@

 #pragma once
 #include "shared/source/command_container/cmdcontainer.h"
-#include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/debugger/debugger.h"
 #include "shared/source/execution_environment/execution_environment.h"
 #include "shared/source/helpers/definitions/mi_flush_args.h"
@@ -64,10 +63,6 @@ struct EncodeDispatchKernel {

    static void *getInterfaceDescriptor(CommandContainer &container, uint32_t &iddOffset);

-    static size_t estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart, const Vec3<size_t> &groupCount,
-                                                       bool isInternal, bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
-                                                       bool isPartitioned);
-
    static bool isRuntimeLocalIdsGenerationRequired(uint32_t activeChannels,
                                                    size_t *lws,
                                                    std::array<uint8_t, 3> walkOrder,
@@ -116,8 +111,6 @@ struct EncodeStates {
                                     const void *fnDynamicStateHeap,
                                     BindlessHeapsHelper *bindlessHeapHelper,
                                     const HardwareInfo &hwInfo);
-
-    static size_t getAdjustStateComputeModeSize();
 };

 template <typename GfxFamily>
@@ -186,9 +179,6 @@ struct EncodeIndirectParams {
    static void setWorkDimIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offset, uint64_t crossThreadAddress, const uint32_t *groupSize);
    static void setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws);

-    static size_t getCmdsSizeForIndirectParams();
-    static size_t getCmdsSizeForSetGroupSizeIndirect();
-    static size_t getCmdsSizeForSetGroupCountIndirect();
    static size_t getCmdsSizeForSetWorkDimIndirect(const uint32_t *groupSize, bool misalignedPtr);
 };

--- a/shared/source/command_container/command_encoder.inl
+++ b/shared/source/command_container/command_encoder.inl
@@ -92,11 +92,6 @@ uint32_t EncodeStates<Family>::copySamplerState(IndirectHeap *dsh,
    return samplerStateOffsetInDsh;
 } // namespace NEO

-template <typename Family>
-inline size_t EncodeStates<Family>::getAdjustStateComputeModeSize() {
-    return 0;
-}
-
 template <typename Family>
 void EncodeMathMMIO<Family>::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress) {
    int logLws = 0;
@@ -665,22 +660,6 @@ void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &c
    }
 }

-template <typename Family>
-inline size_t EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams() {
-    return 3 * sizeof(typename Family::MI_LOAD_REGISTER_MEM);
-}
-
-template <typename Family>
-inline size_t EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect() {
-    return 3 * (sizeof(MI_STORE_REGISTER_MEM));
-}
-
-template <typename Family>
-inline size_t EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect() {
-    constexpr uint32_t aluCmdSize = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * NUM_ALU_INST_FOR_READ_MODIFY_WRITE;
-    return 3 * (sizeof(MI_LOAD_REGISTER_REG) + sizeof(MI_LOAD_REGISTER_IMM) + aluCmdSize + sizeof(MI_STORE_REGISTER_MEM));
-}
-
 template <typename Family>
 inline size_t EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(const uint32_t *groupSize, bool misaligedPtr) {
    constexpr uint32_t aluCmdSize = sizeof(MI_MATH) + sizeof(MI_MATH_ALU_INST_INLINE) * NUM_ALU_INST_FOR_READ_MODIFY_WRITE;
--- a/shared/source/command_container/command_encoder_bdw_and_later.inl
+++ b/shared/source/command_container/command_encoder_bdw_and_later.inl
@@ -66,15 +66,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
    if (!args.isIndirect) {
        threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
    }
-    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(args.device, threadStartVec, threadDimsVec,
-                                                                        args.isInternal, args.isCooperative, args.isIndirect,
-                                                                        args.dispatchInterface, false);
-    if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
-        auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
-        *bbEnd = Family::cmdInitBatchBufferEnd;
-
-        container.allocateNextCommandBuffer();
-    }

    WALKER_TYPE cmd = Family::cmdInitGpgpuWalker;
    auto idd = Family::cmdInitInterfaceDescriptorData;
@@ -343,40 +334,6 @@ inline void EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(const Har
 template <typename Family>
 void EncodeDispatchKernel<Family>::appendAdditionalIDDFields(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const HardwareInfo &hwInfo, const uint32_t threadsPerThreadGroup, uint32_t slmTotalSize, SlmPolicy slmPolicy) {}

-template <typename Family>
-size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart,
-                                                                          const Vec3<size_t> &groupCount, bool isInternal,
-                                                                          bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
-                                                                          bool isPartitioned) {
-    using MEDIA_STATE_FLUSH = typename Family::MEDIA_STATE_FLUSH;
-    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename Family::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
-    using MI_BATCH_BUFFER_END = typename Family::MI_BATCH_BUFFER_END;
-
-    size_t issueMediaInterfaceDescriptorLoad = sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD);
-    size_t totalSize = sizeof(WALKER_TYPE);
-    totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
-    totalSize += sizeof(MEDIA_STATE_FLUSH);
-    totalSize += issueMediaInterfaceDescriptorLoad;
-    totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
-    totalSize += EncodeWA<Family>::getAdditionalPipelineSelectSize(*device);
-    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
-    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
-    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
-    if (isIndirect) {
-        UNRECOVERABLE_IF(dispatchInterface == nullptr);
-        totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), false);
-        if (dispatchInterface->getImplicitArgs()) {
-            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
-            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
-            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), true);
-        }
-    }
-
-    totalSize += sizeof(MI_BATCH_BUFFER_END);
-
-    return totalSize;
-}
-
 template <typename Family>
 inline void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const HardwareInfo &hwInfo) {
 }
--- a/shared/source/command_container/command_encoder_xehp_and_later.inl
+++ b/shared/source/command_container/command_encoder_xehp_and_later.inl
@@ -63,15 +63,6 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container,
    if (!args.isIndirect) {
        threadDimsVec = {threadDims[0], threadDims[1], threadDims[2]};
    }
-    size_t estimatedSizeRequired = estimateEncodeDispatchKernelCmdsSize(args.device, threadStartVec, threadDimsVec,
-                                                                        args.isInternal, args.isCooperative, args.isIndirect, args.dispatchInterface,
-                                                                        args.partitionCount > 1);
-    if (container.getCommandStream()->getAvailableSpace() < estimatedSizeRequired) {
-        auto bbEnd = listCmdBufferStream->getSpaceForCmd<MI_BATCH_BUFFER_END>();
-        *bbEnd = Family::cmdInitBatchBufferEnd;
-
-        container.allocateNextCommandBuffer();
-    }

    bool specialModeRequired = kernelDescriptor.kernelAttributes.flags.usesSpecialPipelineSelectMode;
    if (PreambleHelper<Family>::isSpecialPipelineSelectModeChanged(container.lastPipelineSelectModeRequired, specialModeRequired, hwInfo)) {
@@ -448,36 +439,6 @@ void EncodeDispatchKernel<Family>::encodeThreadData(WALKER_TYPE &walkerCmd,
    }
 }

-template <typename Family>
-size_t EncodeDispatchKernel<Family>::estimateEncodeDispatchKernelCmdsSize(Device *device, const Vec3<size_t> &groupStart,
-                                                                          const Vec3<size_t> &groupCount, bool isInternal,
-                                                                          bool isCooperative, bool isIndirect, DispatchKernelEncoderI *dispatchInterface,
-                                                                          bool isPartitioned) {
-    size_t totalSize = sizeof(WALKER_TYPE);
-    totalSize += PreemptionHelper::getPreemptionWaCsSize<Family>(*device);
-    totalSize += EncodeStates<Family>::getAdjustStateComputeModeSize();
-    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForIndirectParams();
-    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
-    totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
-    if (isIndirect) {
-        UNRECOVERABLE_IF(dispatchInterface == nullptr);
-        totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), false);
-        if (dispatchInterface->getImplicitArgs()) {
-            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupCountIndirect();
-            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetGroupSizeIndirect();
-            totalSize += EncodeIndirectParams<Family>::getCmdsSizeForSetWorkDimIndirect(dispatchInterface->getGroupSize(), true);
-        }
-    }
-
-    if ((isPartitioned && !isCooperative) &&
-        !isInternal) {
-        const bool staticPartitioning = device->getDefaultEngine().commandStreamReceiver->isStaticWorkPartitioningEnabled();
-        totalSize += ImplicitScalingDispatch<Family>::getSize(true, staticPartitioning, device->getDeviceBitfield(), groupStart, groupCount);
-    }
-
-    return totalSize;
-}
-
 template <typename Family>
 void EncodeStateBaseAddress<Family>::setIohAddressForDebugger(NEO::Debugger::SbaAddresses &sbaAddress, const STATE_BASE_ADDRESS &sbaCmd) {
 }
--- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl
+++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl
@@ -100,8 +100,10 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
                                                                                      staticPartitioning,
                                                                                      useSecondaryBatchBuffer);

-    uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed();
-    void *commandBuffer = commandStream.getSpace(0u);
+    auto dispatchCommandsSize = getSize(apiSelfCleanup, preferStaticPartitioning, devices, {walkerCmd.getThreadGroupIdStartingX(), walkerCmd.getThreadGroupIdStartingY(), walkerCmd.getThreadGroupIdStartingZ()}, {walkerCmd.getThreadGroupIdXDimension(), walkerCmd.getThreadGroupIdYDimension(), walkerCmd.getThreadGroupIdZDimension()});
+    void *commandBuffer = commandStream.getSpace(dispatchCommandsSize);
+    uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - dispatchCommandsSize;
+
    if (staticPartitioning) {
        UNRECOVERABLE_IF(tileCount != partitionCount);
        WalkerPartition::constructStaticallyPartitionedCommandBuffer<GfxFamily>(commandBuffer,
@@ -126,7 +128,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchCommands(LinearStream &commandS
                                                                                 args,
                                                                                 hwInfo);
    }
-    commandStream.getSpace(totalProgrammedSize);
+    UNRECOVERABLE_IF(totalProgrammedSize != dispatchCommandsSize);
 }

 template <typename GfxFamily>
@@ -166,8 +168,9 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(LinearStream &c
    args.postSyncGpuAddress = gpuAddress;
    args.postSyncImmediateValue = immediateData;

-    uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed();
-    void *commandBuffer = commandStream.getSpace(0u);
+    auto barrierCommandsSize = getBarrierSize(hwInfo, apiSelfCleanup, args.usePostSync);
+    void *commandBuffer = commandStream.getSpace(barrierCommandsSize);
+    uint64_t cmdBufferGpuAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed() - barrierCommandsSize;

    WalkerPartition::constructBarrierCommandBuffer<GfxFamily>(commandBuffer,
                                                              cmdBufferGpuAddress,
@@ -175,7 +178,7 @@ void ImplicitScalingDispatch<GfxFamily>::dispatchBarrierCommands(LinearStream &c
                                                              args,
                                                              flushArgs,
                                                              hwInfo);
-    commandStream.getSpace(totalProgrammedSize);
+    UNRECOVERABLE_IF(totalProgrammedSize != barrierCommandsSize);
 }

 template <typename GfxFamily>
--- a/shared/source/command_stream/command_stream_receiver.h
+++ b/shared/source/command_stream/command_stream_receiver.h
@@ -273,7 +273,7 @@ class CommandStreamReceiver {

    uint64_t getWorkPartitionAllocationGpuAddress() const;

-    bool isRcs() const;
+    MOCKABLE_VIRTUAL bool isRcs() const;

    virtual void initializeDefaultsForInternalEngine(){};

--- a/shared/source/command_stream/linear_stream.cpp
+++ b/shared/source/command_stream/linear_stream.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -33,4 +33,10 @@ LinearStream::LinearStream(GraphicsAllocation *gfxAllocation)
 LinearStream::LinearStream()
    : LinearStream(nullptr) {
 }
+
+LinearStream::LinearStream(void *buffer, size_t bufferSize, CommandContainer *cmdContainer, size_t batchBufferEndSize)
+    : LinearStream(buffer, bufferSize) {
+    this->cmdContainer = cmdContainer;
+    this->batchBufferEndSize = batchBufferEndSize;
+}
 } // namespace NEO
--- a/shared/source/command_stream/linear_stream.h
+++ b/shared/source/command_stream/linear_stream.h
@@ -1,13 +1,16 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

 #pragma once
+#include "shared/source/command_container/cmdcontainer.h"
 #include "shared/source/helpers/debug_helpers.h"
+#include "shared/source/helpers/hw_helper.h"
 #include "shared/source/helpers/ptr_math.h"
+#include "shared/source/helpers/string.h"

 #include <atomic>
 #include <cstddef>
@@ -23,6 +26,7 @@ class LinearStream {
    LinearStream(void *buffer, size_t bufferSize);
    LinearStream(GraphicsAllocation *buffer);
    LinearStream(GraphicsAllocation *gfxAllocation, void *buffer, size_t bufferSize);
+    LinearStream(void *buffer, size_t bufferSize, CommandContainer *cmdContainer, size_t batchBufferEndSize);
    void *getCpuBase() const;
    void *getSpace(size_t size);
    size_t getMaxAvailableSpace() const;
@@ -44,6 +48,8 @@ class LinearStream {
    size_t maxAvailableSpace;
    void *buffer;
    GraphicsAllocation *graphicsAllocation;
+    CommandContainer *cmdContainer = nullptr;
+    size_t batchBufferEndSize = 0;
 };

 inline void *LinearStream::getCpuBase() const {
@@ -51,6 +57,10 @@ inline void *LinearStream::getCpuBase() const {
 }

 inline void *LinearStream::getSpace(size_t size) {
+    if (cmdContainer != nullptr && getAvailableSpace() < batchBufferEndSize + size) {
+        UNRECOVERABLE_IF(sizeUsed + batchBufferEndSize > maxAvailableSpace);
+        cmdContainer->closeAndAllocateNextCommandBuffer();
+    }
    UNRECOVERABLE_IF(sizeUsed + size > maxAvailableSpace);
    auto memory = ptrOffset(buffer, sizeUsed);
    sizeUsed += size;
--- a/shared/source/gen12lp/command_encoder_gen12lp.cpp
+++ b/shared/source/gen12lp/command_encoder_gen12lp.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2021 Intel Corporation
+ * Copyright (C) 2020-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -32,11 +32,6 @@ size_t EncodeWA<Family>::getAdditionalPipelineSelectSize(Device &device) {
    return size;
 }

-template <>
-size_t EncodeStates<Family>::getAdjustStateComputeModeSize() {
-    return sizeof(typename Family::STATE_COMPUTE_MODE);
-}
-
 template <>
 void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const HardwareInfo &hwInfo) {
    using STATE_COMPUTE_MODE = typename Family::STATE_COMPUTE_MODE;
--- a/shared/source/helpers/hw_helper.h
+++ b/shared/source/helpers/hw_helper.h
@@ -9,7 +9,6 @@
 #include "shared/source/aub_mem_dump/aub_mem_dump.h"
 #include "shared/source/built_ins/sip.h"
 #include "shared/source/command_container/command_encoder.h"
-#include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/commands/bxml_generator_glue.h"
 #include "shared/source/helpers/aux_translation.h"
 #include "shared/source/helpers/definitions/engine_group_types.h"
@@ -28,6 +27,7 @@ namespace NEO {
 class GmmHelper;
 class GraphicsAllocation;
 class TagAllocatorBase;
+class LinearStream;
 class Gmm;
 struct AllocationData;
 struct AllocationProperties;
@@ -155,6 +155,8 @@ class HwHelper {
    virtual bool forceNonGpuCoherencyWA(bool requiresCoherency) const = 0;
    virtual bool platformSupportsImplicitScaling(const NEO::HardwareInfo &hwInfo) const = 0;
    virtual bool isLinuxCompletionFenceSupported() const = 0;
+    virtual size_t getBatchBufferEndSize() const = 0;
+    virtual const void *getBatchBufferEndReference() const = 0;

  protected:
    HwHelper() = default;
@@ -391,6 +393,8 @@ class HwHelperHw : public HwHelper {
    bool forceNonGpuCoherencyWA(bool requiresCoherency) const override;
    bool platformSupportsImplicitScaling(const NEO::HardwareInfo &hwInfo) const override;
    bool isLinuxCompletionFenceSupported() const override;
+    size_t getBatchBufferEndSize() const override;
+    const void *getBatchBufferEndReference() const override;

  protected:
    static const AuxTranslationMode defaultAuxTranslationMode;
--- a/shared/source/helpers/hw_helper_base.inl
+++ b/shared/source/helpers/hw_helper_base.inl
@@ -710,4 +710,12 @@ template <typename GfxFamily>
 bool HwHelperHw<GfxFamily>::forceNonGpuCoherencyWA(bool requiresCoherency) const {
    return requiresCoherency;
 }
+template <typename GfxFamily>
+size_t HwHelperHw<GfxFamily>::getBatchBufferEndSize() const {
+    return sizeof(typename GfxFamily::MI_BATCH_BUFFER_END);
+}
+template <typename GfxFamily>
+const void *HwHelperHw<GfxFamily>::getBatchBufferEndReference() const {
+    return &GfxFamily::cmdInitBatchBufferEnd; // object pointer converts implicitly to const void *
+}
 } // namespace NEO
--- a/shared/source/helpers/preamble_bdw_and_later.inl
+++ b/shared/source/helpers/preamble_bdw_and_later.inl
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2021 Intel Corporation
+ * Copyright (C) 2019-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -7,7 +7,6 @@

 #include "shared/source/command_stream/stream_properties.h"
 #include "shared/source/helpers/flat_batch_buffer_helper.h"
-#include "shared/source/helpers/hw_helper.h"
 #include "shared/source/helpers/preamble_base.inl"
 #include "shared/source/kernel/kernel_execution_type.h"

--- a/shared/source/utilities/software_tags_manager.h
+++ b/shared/source/utilities/software_tags_manager.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -7,6 +7,7 @@

 #pragma once
 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/debug_settings/debug_settings_manager.h"
 #include "shared/source/memory_manager/memory_manager.h"
 #include "shared/source/utilities/software_tags.h"
--- a/shared/test/common/gen12lp/command_encoder_tests_gen12lp.cpp
+++ b/shared/test/common/gen12lp/command_encoder_tests_gen12lp.cpp
@@ -1,11 +1,14 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/helpers/preamble.h"
+#include "shared/test/common/mocks/mock_command_stream_receiver.h"
+#include "shared/test/common/mocks/mock_device.h"
 #include "shared/test/common/test_macros/test.h"

 using namespace NEO;
@@ -14,3 +17,30 @@ using Gen12LpCommandEncodeTest = testing::Test;
 GEN12LPTEST_F(Gen12LpCommandEncodeTest, givenGen12LpPlatformWhenDoBindingTablePrefetchIsCalledThenReturnsTrue) {
    EXPECT_FALSE(EncodeSurfaceState<FamilyType>::doBindingTablePrefetch());
 }
+
+template <bool rcs>
+class MyCommandStreamReceiverMock : public MockCommandStreamReceiver {
+  public:
+    MyCommandStreamReceiverMock(ExecutionEnvironment &executionEnvironment, uint32_t rootDeviceIndex, const DeviceBitfield deviceBitfield) : MockCommandStreamReceiver(executionEnvironment, rootDeviceIndex, deviceBitfield) {}
+    bool isRcs() const override {
+        return rcs;
+    }
+};
+
+GEN12LPTEST_F(Gen12LpCommandEncodeTest, givenGen12LpPlatformWhenDefaultEngineIsRcsThenAdditionalPipelineSelectSizeEqualTwoPipelineSelectSize) {
+    MockDevice device;
+    auto csr = std::make_unique<MyCommandStreamReceiverMock<true>>(*device.getExecutionEnvironment(), 0, device.getDeviceBitfield());
+    auto oldCsr = device.getDefaultEngine().commandStreamReceiver;
+    device.getDefaultEngine().commandStreamReceiver = csr.get();
+    EXPECT_EQ(2 * PreambleHelper<FamilyType>::getCmdSizeForPipelineSelect(device.getHardwareInfo()), EncodeWA<FamilyType>::getAdditionalPipelineSelectSize(device));
+    device.getDefaultEngine().commandStreamReceiver = oldCsr;
+}
+
+GEN12LPTEST_F(Gen12LpCommandEncodeTest, givenGen12LpPlatformWhenDefaultEngineIsNotRcsThenAdditionalPipelineSelectSizeEqualZero) {
+    MockDevice device;
+    auto csr = std::make_unique<MyCommandStreamReceiverMock<false>>(*device.getExecutionEnvironment(), 0, device.getDeviceBitfield());
+    auto oldCsr = device.getDefaultEngine().commandStreamReceiver;
+    device.getDefaultEngine().commandStreamReceiver = csr.get();
+    EXPECT_EQ(0u, EncodeWA<FamilyType>::getAdditionalPipelineSelectSize(device));
+    device.getDefaultEngine().commandStreamReceiver = oldCsr;
+}
--- a/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp
+++ b/shared/test/common/gen12lp/test_command_encoder_gen12lp.cpp
@@ -103,22 +103,6 @@ GEN12LPTEST_F(CommandEncoderTest, givenVariousEngineTypesWhenEncodeSBAThenAdditi
    }
 }

-GEN12LPTEST_F(CommandEncoderTest, givenVariousEngineTypesWhenEstimateCommandBufferSizeThenRcsHasAdditionalPipelineSelectWASize) {
-    using PIPELINE_SELECT = typename FamilyType::PIPELINE_SELECT;
-    using STATE_COMPUTE_MODE = typename FamilyType::STATE_COMPUTE_MODE;
-
-    auto sizeWA = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(pDevice, Vec3<size_t>(0, 0, 0),
-                                                                                         Vec3<size_t>(1, 1, 1), false, false, false, nullptr, false);
-    static_cast<MockOsContext *>(pDevice->getDefaultEngine().osContext)->engineType = aub_stream::ENGINE_CCS;
-    auto size = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(pDevice, Vec3<size_t>(0, 0, 0),
-                                                                                       Vec3<size_t>(1, 1, 1), false, false, false, nullptr, false);
-
-    auto expectedDiff = 2 * PreambleHelper<FamilyType>::getCmdSizeForPipelineSelect(pDevice->getHardwareInfo());
-    auto diff = sizeWA - size;
-
-    EXPECT_EQ(expectedDiff, diff);
-}
-
 GEN12LPTEST_F(CommandEncoderTest, GivenGen12LpWhenProgrammingL3StateOnThenExpectNoCommandsDispatched) {
    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;

--- a/shared/test/common/gen12lp/test_encode_gen12lp.cpp
+++ b/shared/test/common/gen12lp/test_encode_gen12lp.cpp
@@ -1,11 +1,12 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/command_stream/stream_properties.h"
 #include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/test_macros/test.h"
--- a/shared/test/common/xe_hpg_core/dg2/test_encode_dg2.cpp
+++ b/shared/test/common/xe_hpg_core/dg2/test_encode_dg2.cpp
@@ -1,11 +1,12 @@
 /*
- * Copyright (C) 2021 Intel Corporation
+ * Copyright (C) 2021-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/command_stream/stream_properties.h"
 #include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/test_macros/test.h"
--- a/shared/test/unit_test/command_container/command_container_tests.cpp
+++ b/shared/test/unit_test/command_container/command_container_tests.cpp
@@ -6,6 +6,7 @@
 */

 #include "shared/source/command_container/cmdcontainer.h"
+#include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/memory_manager/allocations_list.h"
 #include "shared/test/common/fixtures/device_fixture.h"
 #include "shared/test/common/helpers/debug_manager_state_restore.h"
@@ -475,7 +476,8 @@ TEST_F(CommandContainerTest, whenAllocateNextCmdBufferIsCalledThenNewAllocationI
    EXPECT_NE(nullptr, nextBuffer);
    EXPECT_EQ(0u, sizeUsed);
    EXPECT_NE(initialBuffer, nextBuffer);
-    const size_t cmdBufSize = CommandContainer::defaultListCmdBufferSize;
+    size_t alignedSize = alignUp<size_t>(CommandContainer::totalCmdBufferSize, MemoryConstants::pageSize64k);
+    const size_t cmdBufSize = alignedSize - CommandContainer::cmdBufferReservedSize;
    EXPECT_EQ(cmdBufSize, availableSize);

    ASSERT_EQ(2u, cmdContainer->getCmdBufferAllocations().size());
@@ -682,3 +684,48 @@ TEST_F(CommandContainerTest, givenContainerAllocatesNextCommandBufferWhenResetin
    }
    EXPECT_TRUE(firstAllocationFound);
 }
+
+class MyLinearStreamMock : public LinearStream {
+  public:
+    using LinearStream::cmdContainer;
+};
+
+TEST_F(CommandContainerTest, givenCmdContainerWhenContainerIsInitializedThenStreamContainsContainerPtr) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+
+    EXPECT_EQ(reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream())->cmdContainer, &cmdContainer);
+}
+
+TEST_F(CommandContainerTest, givenCmdContainerWhenContainerIsInitializedThenStreamSizeEqualAlignedTotalCmdBuffSizeDecreasedOfReservedSize) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    size_t alignedSize = alignUp<size_t>(CommandContainer::totalCmdBufferSize, MemoryConstants::pageSize64k);
+    EXPECT_EQ(cmdContainer.getCommandStream()->getMaxAvailableSpace(), alignedSize - CommandContainer::cmdBufferReservedSize);
+}
+
+TEST_F(CommandContainerTest, givenCmdContainerWhenAlocatingNextCmdBufferThenStreamSizeEqualAlignedTotalCmdBuffSizeDecreasedOfReservedSize) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    cmdContainer.allocateNextCommandBuffer();
+    size_t alignedSize = alignUp<size_t>(CommandContainer::totalCmdBufferSize, MemoryConstants::pageSize64k);
+    EXPECT_EQ(cmdContainer.getCommandStream()->getMaxAvailableSpace(), alignedSize - CommandContainer::cmdBufferReservedSize);
+}
+
+TEST_F(CommandContainerTest, givenCmdContainerWhenCloseAndAllocateNextCommandBufferCalledThenBBEndPlacedAtEndOfLinearStream) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    auto &hwInfo = pDevice->getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    auto ptr = cmdContainer.getCommandStream()->getSpace(0u);
+    cmdContainer.closeAndAllocateNextCommandBuffer();
+    EXPECT_EQ(memcmp(ptr, hwHelper.getBatchBufferEndReference(), hwHelper.getBatchBufferEndSize()), 0);
+}
+
+TEST_F(CommandContainerTest, givenCmdContainerWhenCloseAndAllocateNextCommandBufferCalledThenNewCmdBufferAllocationCreated) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
+    cmdContainer.closeAndAllocateNextCommandBuffer();
+    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u);
+}
--- a/shared/test/unit_test/command_container/command_encoder_tests.cpp
+++ b/shared/test/unit_test/command_container/command_encoder_tests.cpp
@@ -6,6 +6,7 @@
 */

 #include "shared/source/command_container/command_encoder.h"
+#include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/memory_manager/graphics_allocation.h"
 #include "shared/test/common/helpers/default_hw_info.h"
 #include "shared/test/common/helpers/unit_test_helper.h"
--- a/shared/test/unit_test/command_stream/linear_stream_tests.cpp
+++ b/shared/test/unit_test/command_stream/linear_stream_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2021 Intel Corporation
+ * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@@ -7,6 +7,7 @@

 #include "shared/source/command_stream/linear_stream.h"
 #include "shared/source/memory_manager/graphics_allocation.h"
+#include "shared/test/common/fixtures/device_fixture.h"
 #include "shared/test/common/fixtures/linear_stream_fixture.h"
 #include "shared/test/common/mocks/mock_graphics_allocation.h"

@@ -109,3 +110,87 @@ TEST_F(LinearStreamTest, givenNewGraphicsAllocationWhenReplaceIsCalledThenLinear
    linearStream.replaceGraphicsAllocation(&newGraphicsAllocation);
    EXPECT_EQ(&newGraphicsAllocation, linearStream.getGraphicsAllocation());
 }
+
+class MyLinearStreamMock : public LinearStream {
+  public:
+    using LinearStream::sizeUsed;
+};
+
+TEST_F(LinearStreamTest, givenLinearStreamWithoutCmdContainerWhenOneByteLeftInStreamThenGetSpaceDontThrowAbort) {
+    reinterpret_cast<MyLinearStreamMock *>(&linearStream)->sizeUsed = linearStream.getMaxAvailableSpace() - 1;
+    EXPECT_NO_THROW(linearStream.getSpace(1));
+}
+using CommandContainerLinearStreamTest = Test<DeviceFixture>;
+TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenOneByteLeftInStreamThenGetSpaceThrowAbort) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    auto stream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());
+    stream->sizeUsed = stream->getMaxAvailableSpace() - 1;
+    EXPECT_THROW(stream->getSpace(1), std::exception);
+}
+
+TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsNoSpaceForCommandAndBBEndThenNewCmdBufferAllocated) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    auto &hwInfo = pDevice->getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    auto stream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());
+    size_t dummyCommandSize = 2;
+    stream->sizeUsed = stream->getMaxAvailableSpace() - hwHelper.getBatchBufferEndSize() - (dummyCommandSize - 1);
+    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
+    stream->getSpace(dummyCommandSize);
+    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 2u);
+}
+
+TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsNoSpaceForCommandAndBBEndThenLinearStreamHasNewAllocation) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    auto &hwInfo = pDevice->getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    auto stream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());
+    size_t dummyCommandSize = 2;
+    stream->sizeUsed = stream->getMaxAvailableSpace() - hwHelper.getBatchBufferEndSize() - (dummyCommandSize - 1);
+    auto oldBuffer = stream->getCpuBase();
+    stream->getSpace(dummyCommandSize);
+    auto newBuffer = stream->getCpuBase();
+    EXPECT_NE(newBuffer, oldBuffer);
+}
+
+TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsNoSpaceForCommandAndBBEndThenGetSpaceReturnPtrFromNewAllocation) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    auto &hwInfo = pDevice->getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    auto stream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());
+    size_t dummyCommandSize = 2;
+    stream->sizeUsed = stream->getMaxAvailableSpace() - hwHelper.getBatchBufferEndSize() - (dummyCommandSize - 1);
+    auto ptr = stream->getSpace(dummyCommandSize);
+    auto buffer = stream->getCpuBase();
+    EXPECT_EQ(buffer, ptr);
+}
+
+TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsSpaceForCommandAndBBEndThenNewCmdBufferIsNotAllocated) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    auto &hwInfo = pDevice->getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    auto stream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());
+    size_t dummyCommandSize = 2;
+    stream->sizeUsed = stream->getMaxAvailableSpace() - hwHelper.getBatchBufferEndSize() - (dummyCommandSize);
+    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
+    stream->getSpace(dummyCommandSize);
+    EXPECT_EQ(cmdContainer.getCmdBufferAllocations().size(), 1u);
+}
+
+TEST_F(CommandContainerLinearStreamTest, givenLinearStreamWithCmdContainerWhenThereIsNoSpaceForCommandAndBBEndThenBBEndAddedAtEndOfStream) {
+    CommandContainer cmdContainer;
+    cmdContainer.initialize(pDevice, nullptr);
+    auto &hwInfo = pDevice->getHardwareInfo();
+    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
+    auto stream = reinterpret_cast<MyLinearStreamMock *>(cmdContainer.getCommandStream());
+    size_t dummyCommandSize = 2;
+    stream->sizeUsed = stream->getMaxAvailableSpace() - hwHelper.getBatchBufferEndSize() - (dummyCommandSize - 1);
+    auto ptr = stream->getSpace(0u);
+    stream->getSpace(dummyCommandSize);
+    EXPECT_EQ(memcmp(ptr, hwHelper.getBatchBufferEndReference(), hwHelper.getBatchBufferEndSize()), 0);
+}
--- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
+++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
@@ -955,8 +955,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp

    bool requiresUncachedMocs = false;
    bool isInternal = false;
-    size_t regularEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, nullptr, false);

    EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
    dispatchArgs.isInternal = isInternal;
@@ -972,8 +970,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
    EXPECT_EQ(WALKER_TYPE::PARTITION_TYPE::PARTITION_TYPE_DISABLED, baseWalkerCmd->getPartitionType());
    EXPECT_EQ(16u, baseWalkerCmd->getThreadGroupIdXDimension());

-    size_t partitionEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, nullptr, true);
    dispatchArgs.partitionCount = 2;
    EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs);

@@ -982,7 +978,6 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp

    size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
    EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);
-    EXPECT_EQ(partitionEstimateSize, regularEstimateSize + expectedPartitionedWalkerSize);

    GenCmdList partitionedWalkerList;
    CmdParse<FamilyType>::parseCommandBuffer(
@@ -1020,23 +1015,18 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
    std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());

    bool isInternal = false;
-    size_t baseEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), false);

    bool requiresUncachedMocs = false;
    EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
    dispatchArgs.isInternal = isInternal;
    dispatchArgs.partitionCount = 2;

-    size_t partitionEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), true);
    EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs);

    EXPECT_EQ(2u, dispatchArgs.partitionCount);
    size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed();

    size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
-    EXPECT_EQ(partitionEstimateSize, baseEstimateSize + expectedPartitionedWalkerSize);
    EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);

    GenCmdList partitionedWalkerList;
@@ -1124,23 +1114,17 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling,
    std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());

    bool isInternal = false;
-    size_t baseEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), false);
-
    bool requiresUncachedMocs = false;
    EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
    dispatchArgs.isInternal = isInternal;
    dispatchArgs.partitionCount = 2;

-    size_t partitionEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), true);
    EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs);

    EXPECT_EQ(2u, dispatchArgs.partitionCount);
    size_t partitionedWalkerSize = cmdContainer->getCommandStream()->getUsed();

    size_t expectedPartitionedWalkerSize = ImplicitScalingDispatch<FamilyType>::getSize(true, false, pDevice->getDeviceBitfield(), Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1));
-    EXPECT_EQ(partitionEstimateSize, baseEstimateSize + expectedPartitionedWalkerSize);
    EXPECT_EQ(expectedPartitionedWalkerSize, partitionedWalkerSize);

    GenCmdList partitionedWalkerList;
@@ -1187,20 +1171,12 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesDynamicImplicitScaling, givenImp
    uint32_t dims[] = {16, 1, 1};
    std::unique_ptr<MockDispatchKernelEncoder> dispatchInterface(new MockDispatchKernelEncoder());

-    bool isInternal = false;
-    size_t baseEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), false);
-
-    isInternal = true;
+    bool isInternal = true;
    bool requiresUncachedMocs = false;
    EncodeDispatchKernelArgs dispatchArgs = createDefaultDispatchKernelArgs(pDevice, dispatchInterface.get(), dims, requiresUncachedMocs);
    dispatchArgs.isInternal = isInternal;
    dispatchArgs.partitionCount = 2;

-    size_t internalEstimateSize = EncodeDispatchKernel<FamilyType>::estimateEncodeDispatchKernelCmdsSize(
-        pDevice, Vec3<size_t>(0, 0, 0), Vec3<size_t>(16, 1, 1), isInternal, false, false, dispatchInterface.get(), true);
-    EXPECT_EQ(baseEstimateSize, internalEstimateSize);
-
    EncodeDispatchKernel<FamilyType>::encode(*cmdContainer.get(), dispatchArgs);

    size_t internalWalkerSize = cmdContainer->getCommandStream()->getUsed();