refactor: Enable CSR heap sharing on Older Generation platforms
Related-To: LOCI-4312
Signed-off-by: Jitendra Sharma <jitendra.sharma@intel.com>

parent 5e4ea627f7
commit 8a01619310
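In short: the per-family helper platformSupportsCmdListHeapSharing() now returns true for the pre-Xe cores (see the Gen9, Gen11 and Gen12LP overrides and tests below), so immediate command lists on those platforms can place their surface-state (SSH) and dynamic-state (DSH) heaps in the CSR's shared heaps. Because split operations (multi-kernel copies and fills) dispatch several kernels against one heap reservation, CmdListKernelLaunchParams also gains numKernelsInSplitLaunch / numKernelsExecutedInSplitLaunch so that the first kernel of a split can reserve space for the whole split up front. A minimal sketch of the gating, using only names that appear in this diff (the surrounding test setup is assumed):

    // Sketch: how the ULTs below enable heap sharing on an immediate command list.
    auto &l0GfxCoreHelper = getHelper<L0GfxCoreHelper>();      // test-fixture accessor
    commandList->immediateCmdListHeapSharing =
        l0GfxCoreHelper.platformSupportsCmdListHeapSharing();  // now true on Gen9/11/12LP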
@@ -43,6 +43,8 @@ struct CmdListKernelLaunchParams {
     bool isDestinationAllocationInSystemMemory = false;
     bool isHostSignalScopeEvent = false;
     bool skipInOrderNonWalkerSignaling = false;
+    uint32_t numKernelsInSplitLaunch = 0;
+    uint32_t numKernelsExecutedInSplitLaunch = 0;
 };
 
 struct CmdListReturnPoint {
@@ -1281,7 +1281,8 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
                                            size);
     } else {
         CmdListKernelLaunchParams launchParams = {};
-        launchParams.isKernelSplitOperation = rightSize > 1;
+        launchParams.isKernelSplitOperation = rightSize > 0;
+        launchParams.numKernelsInSplitLaunch = 2;
         ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAddress),
                                            dstAllocation, 0,
                                            reinterpret_cast<void *>(&srcAddress),
@@ -1292,6 +1293,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
                                            nullptr,
                                            isStateless,
                                            launchParams);
+        launchParams.numKernelsExecutedInSplitLaunch++;
         if (ret == ZE_RESULT_SUCCESS && rightSize) {
             ret = appendMemoryCopyKernelWithGA(reinterpret_cast<void *>(&dstAddress),
                                                dstAllocation, size - rightSize,
@@ -1302,6 +1304,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendPageFaultCopy(NEO::Graph
                                                nullptr,
                                                isStateless,
                                                launchParams);
+            launchParams.numKernelsExecutedInSplitLaunch++;
         }
 
         if (this->dcFlushSupport) {
@@ -1397,6 +1400,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
             dcFlush = getDcFlushRequired(signalEvent->isSignalScope());
         }
 
+        launchParams.numKernelsInSplitLaunch = kernelCounter;
         launchParams.isKernelSplitOperation = kernelCounter > 1;
         bool singlePipeControlPacket = eventSignalPipeControl(launchParams.isKernelSplitOperation, dcFlush);
 
@@ -1423,6 +1427,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
                                                signalEvent,
                                                isStateless,
                                                launchParams);
+            launchParams.numKernelsExecutedInSplitLaunch++;
         }
 
         if (ret == ZE_RESULT_SUCCESS && middleSizeBytes) {
@@ -1441,6 +1446,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
                                                signalEvent,
                                                isStateless,
                                                launchParams);
+            launchParams.numKernelsExecutedInSplitLaunch++;
         }
 
         if (ret == ZE_RESULT_SUCCESS && rightSize) {
@@ -1458,6 +1464,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
                                                signalEvent,
                                                isStateless,
                                                launchParams);
+            launchParams.numKernelsExecutedInSplitLaunch++;
         }
     }
 
@@ -1856,12 +1863,21 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
 
     appendEventForProfilingAllWalkers(signalEvent, true, singlePipeControlPacket);
 
+    if (fillArguments.leftRemainingBytes > 0) {
+        launchParams.numKernelsInSplitLaunch++;
+    }
+    if (fillArguments.rightRemainingBytes > 0) {
+        launchParams.numKernelsInSplitLaunch++;
+    }
+
     if (patternSize == 1) {
+        launchParams.numKernelsInSplitLaunch++;
         if (fillArguments.leftRemainingBytes > 0) {
             res = appendUnalignedFillKernel(isStateless, fillArguments.leftRemainingBytes, dstAllocation, pattern, signalEvent, launchParams);
             if (res) {
                 return res;
             }
+            launchParams.numKernelsExecutedInSplitLaunch++;
         }
 
         ze_result_t ret = builtinKernel->setGroupSize(static_cast<uint32_t>(fillArguments.mainGroupSize), 1u, 1u);
@@ -1882,6 +1898,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
         if (res) {
             return res;
         }
+        launchParams.numKernelsExecutedInSplitLaunch++;
 
         if (fillArguments.rightRemainingBytes > 0) {
             dstAllocation.offset = fillArguments.rightOffset;
@@ -1889,6 +1906,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
             if (res) {
                 return res;
             }
+            launchParams.numKernelsExecutedInSplitLaunch++;
         }
     } else {
         builtinKernel->setGroupSize(static_cast<uint32_t>(fillArguments.mainGroupSize), 1, 1);
@@ -1923,10 +1941,12 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
             builtinKernel->setArgumentValue(3, sizeof(fillArguments.patternSizeInEls), &fillArguments.patternSizeInEls);
 
             ze_group_count_t dispatchKernelArgs{static_cast<uint32_t>(fillArguments.groups), 1u, 1u};
+            launchParams.numKernelsInSplitLaunch++;
             res = appendLaunchKernelSplit(builtinKernel, dispatchKernelArgs, signalEvent, launchParams);
             if (res) {
                 return res;
             }
+            launchParams.numKernelsExecutedInSplitLaunch++;
         } else {
             uint32_t dstOffsetRemainder = static_cast<uint32_t>(dstAllocation.offset);
 
@@ -1955,6 +1975,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
             if (res) {
                 return res;
             }
+            launchParams.numKernelsExecutedInSplitLaunch++;
         }
 
         if (fillArguments.rightRemainingBytes > 0) {
@@ -1986,6 +2007,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryFill(void *ptr,
             if (res) {
                 return res;
             }
+            launchParams.numKernelsExecutedInSplitLaunch++;
         }
     }
 
@@ -87,10 +87,23 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendLaunchKernelWithParams(K
             NEO::EncodeDispatchKernel<GfxFamily>::getSizeRequiredDsh(kernelDescriptor, commandContainer.getNumIddPerBlock()),
             NEO::EncodeDispatchKernel<GfxFamily>::getDefaultDshAlignment()};
 
-        commandContainer.reserveSpaceForDispatch(
-            sshReserveArgs,
-            dshReserveArgs, true);
-
+        if (launchParams.isKernelSplitOperation) {
+            // When appendLaunchKernel is called as part of a kernel-split operation,
+            // reserve enough SSH and DSH space for the whole split on the first kernel,
+            // by multiplying the per-kernel heap sizes computed above by the number of
+            // kernels in the split; the remaining kernels then reuse that reservation.
+            if (launchParams.numKernelsExecutedInSplitLaunch == 0) {
+                dshReserveArgs.size = launchParams.numKernelsInSplitLaunch * dshReserveArgs.size;
+                sshReserveArgs.size = launchParams.numKernelsInSplitLaunch * sshReserveArgs.size;
+                commandContainer.reserveSpaceForDispatch(
+                    sshReserveArgs,
+                    dshReserveArgs, true);
+            }
+        } else {
+            commandContainer.reserveSpaceForDispatch(
+                sshReserveArgs,
+                dshReserveArgs, true);
+        }
         ssh = sshReserveArgs.indirectHeapReservation;
         dsh = dshReserveArgs.indirectHeapReservation;
     }
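The hunk above is the core of the change. A self-contained distillation of the same reservation accounting, with plain structs standing in for the NEO types and a made-up per-kernel size (illustration only, not driver code):

    #include <cassert>
    #include <cstddef>
    #include <cstdint>

    struct ReserveArgs {
        size_t size = 0; // per-kernel heap estimate, in bytes
    };

    struct LaunchParams {
        bool isKernelSplitOperation = false;
        uint32_t numKernelsInSplitLaunch = 0;
        uint32_t numKernelsExecutedInSplitLaunch = 0;
    };

    struct Heap {
        size_t used = 0;
        void reserve(size_t bytes) { used += bytes; }
    };

    // First kernel of a split reserves space for the whole split; the
    // remaining kernels reuse that reservation and skip the estimation.
    void reserveForDispatch(Heap &heap, ReserveArgs args, const LaunchParams &lp) {
        if (lp.isKernelSplitOperation) {
            if (lp.numKernelsExecutedInSplitLaunch == 0) {
                args.size *= lp.numKernelsInSplitLaunch;
                heap.reserve(args.size);
            }
        } else {
            heap.reserve(args.size);
        }
    }

    int main() {
        Heap ssh;
        LaunchParams lp{true, 3, 0}; // three-kernel split, e.g. left/middle/right fill
        for (uint32_t i = 0; i < 3; ++i) {
            lp.numKernelsExecutedInSplitLaunch = i;
            reserveForDispatch(ssh, {256}, lp); // 256 bytes of SSH per kernel
        }
        assert(ssh.used == 3 * 256); // reserved once, up front
        return 0;
    }

This is also why the ULTs near the end of this page compare consumed heap space against estimate × numKernelsInSplitLaunch rather than against a per-kernel estimate.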
@@ -11,7 +11,7 @@ namespace L0 {
 
 template <typename Family>
 bool L0GfxCoreHelperHw<Family>::platformSupportsCmdListHeapSharing() const {
-    return false;
+    return true;
 }
 
 template <typename Family>
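Because the helper is templated on the gfx family, a platform that cannot share CSR heaps would opt out through a specialization rather than by changing this default. A hypothetical opt-out (FooFamily is not a real platform, just a placeholder):

    // Hypothetical specialization: keep private command-list heaps on FooFamily.
    template <>
    bool L0GfxCoreHelperHw<FooFamily>::platformSupportsCmdListHeapSharing() const {
        return false;
    }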
@@ -98,13 +98,7 @@ ze_result_t KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device
                          kernelDescriptor->payloadMappings.implicitArgs.simdSize, kernelDescriptor->kernelAttributes.simdSize);
     }
 
-    if (kernelInfo->heapInfo.surfaceStateHeapSize != 0) {
-        this->surfaceStateHeapSize = kernelInfo->heapInfo.surfaceStateHeapSize;
-        surfaceStateHeapTemplate.reset(new uint8_t[surfaceStateHeapSize]);
-
-        memcpy_s(surfaceStateHeapTemplate.get(), surfaceStateHeapSize,
-                 kernelInfo->heapInfo.pSsh, surfaceStateHeapSize);
-    } else if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelInfo->kernelDescriptor)) {
+    if (NEO::KernelDescriptor::isBindlessAddressingKernel(kernelInfo->kernelDescriptor)) {
         auto &gfxCoreHelper = deviceImp->getNEODevice()->getGfxCoreHelper();
         auto surfaceStateSize = static_cast<uint32_t>(gfxCoreHelper.getRenderSurfaceStateSize());
 
@@ -112,6 +106,12 @@ ze_result_t KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, Device
         UNRECOVERABLE_IF(kernelInfo->kernelDescriptor.kernelAttributes.numArgsStateful != kernelInfo->kernelDescriptor.getBindlessOffsetToSurfaceState().size());
 
         surfaceStateHeapTemplate.reset(new uint8_t[surfaceStateHeapSize]);
+    } else if (kernelInfo->heapInfo.surfaceStateHeapSize != 0) {
+        this->surfaceStateHeapSize = kernelInfo->heapInfo.surfaceStateHeapSize;
+        surfaceStateHeapTemplate.reset(new uint8_t[surfaceStateHeapSize]);
+
+        memcpy_s(surfaceStateHeapTemplate.get(), surfaceStateHeapSize,
+                 kernelInfo->heapInfo.pSsh, surfaceStateHeapSize);
     }
 
     if (kernelInfo->heapInfo.dynamicStateHeapSize != 0) {
@@ -16,9 +16,9 @@ namespace ult {
 
 using L0GfxCoreHelperTestGen11 = Test<DeviceFixture>;
 
-GEN11TEST_F(L0GfxCoreHelperTestGen11, GivenGen11WhenCheckingL0HelperForCmdListHeapSharingSupportThenReturnFalse) {
+GEN11TEST_F(L0GfxCoreHelperTestGen11, GivenGen11WhenCheckingL0HelperForCmdListHeapSharingSupportThenReturnTrue) {
     auto &l0GfxCoreHelper = getHelper<L0GfxCoreHelper>();
-    EXPECT_FALSE(l0GfxCoreHelper.platformSupportsCmdListHeapSharing());
+    EXPECT_TRUE(l0GfxCoreHelper.platformSupportsCmdListHeapSharing());
 }
 
 GEN11TEST_F(L0GfxCoreHelperTestGen11, GivenGen11WhenCheckingL0HelperForStateComputeModeTrackingSupportThenReturnFalse) {
@@ -28,10 +28,10 @@ GEN12LPTEST_F(L0GfxCoreHelperTestGen12Lp, GivenGen12LpWhenGetRegsetTypeForLargeG
     EXPECT_EQ(ZET_DEBUG_REGSET_TYPE_INVALID_INTEL_GPU, l0GfxCoreHelper.getRegsetTypeForLargeGrfDetection());
 }
 
-GEN12LPTEST_F(L0GfxCoreHelperTestGen12Lp, GivenGen12LpWhenCheckingL0HelperForCmdListHeapSharingSupportThenReturnFalse) {
+GEN12LPTEST_F(L0GfxCoreHelperTestGen12Lp, GivenGen12LpWhenCheckingL0HelperForCmdListHeapSharingSupportThenReturnTrue) {
     auto &l0GfxCoreHelper = getHelper<L0GfxCoreHelper>();
 
-    EXPECT_FALSE(l0GfxCoreHelper.platformSupportsCmdListHeapSharing());
+    EXPECT_TRUE(l0GfxCoreHelper.platformSupportsCmdListHeapSharing());
 }
 
 GEN12LPTEST_F(L0GfxCoreHelperTestGen12Lp, GivenGen12LpWhenCheckingL0HelperForStateComputeModeTrackingSupportThenReturnFalse) {
@@ -16,9 +16,9 @@ namespace ult {
 
 using L0GfxCoreHelperTestGen9 = Test<DeviceFixture>;
 
-GEN9TEST_F(L0GfxCoreHelperTestGen9, GivenGen9WhenCheckingL0HelperForCmdListHeapSharingSupportThenReturnFalse) {
+GEN9TEST_F(L0GfxCoreHelperTestGen9, GivenGen9WhenCheckingL0HelperForCmdListHeapSharingSupportThenReturnTrue) {
     auto &l0GfxCoreHelper = getHelper<L0GfxCoreHelper>();
-    EXPECT_FALSE(l0GfxCoreHelper.platformSupportsCmdListHeapSharing());
+    EXPECT_TRUE(l0GfxCoreHelper.platformSupportsCmdListHeapSharing());
 }
 
 GEN9TEST_F(L0GfxCoreHelperTestGen9, GivenGen9WhenCheckingL0HelperForStateComputeModeTrackingSupportThenReturnFalse) {
@@ -108,6 +108,9 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
                                           const CmdListKernelLaunchParams &launchParams) override {
 
         usedKernelLaunchParams = launchParams;
+        if (launchParams.isKernelSplitOperation && (launchParams.numKernelsExecutedInSplitLaunch == 0)) {
+            firstKernelInSplitOperation = kernel;
+        }
         appendKernelEventValue = event;
        return BaseClass::appendLaunchKernelWithParams(kernel, threadGroupDimensions,
                                                       event, launchParams);
@@ -140,6 +143,7 @@ struct WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>
 
     CmdListKernelLaunchParams usedKernelLaunchParams;
     ::L0::Event *appendKernelEventValue = nullptr;
+    ::L0::Kernel *firstKernelInSplitOperation = nullptr;
     ze_event_handle_t appendEventMultipleKernelIndirectEventHandleValue = nullptr;
     ze_event_handle_t appendEventKernelIndirectEventHandleValue = nullptr;
 };
@@ -998,6 +998,130 @@ HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillRequiresMultiKer
     context->freeMem(dstBuffer);
 }
 
+using IsPlatformSklToDg1 = IsWithinProducts<IGFX_SKYLAKE, IGFX_DG1>;
+HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryCopyInUsmDeviceAllocationThenSplitFlagIsSetAndHeapsEstimationIsProper, IsPlatformSklToDg1) {
+    auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
+    commandList->isFlushTaskSubmissionEnabled = true;
+    commandList->immediateCmdListHeapSharing = true;
+    commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
+    commandList->commandContainer.setImmediateCmdListCsr(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
+
+    constexpr size_t size = 4096u;
+    constexpr size_t alignment = 0;
+    void *dstBuffer = nullptr;
+
+    ze_device_mem_alloc_desc_t deviceDesc = {};
+    auto result = context->allocDeviceMem(device->toHandle(),
+                                          &deviceDesc,
+                                          size, alignment, &dstBuffer);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
+
+    void *srcPtr = reinterpret_cast<void *>(0x1234);
+
+    auto &cmdContainer = commandList->commandContainer;
+    auto csrDshHeap = &device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getIndirectHeap(HeapType::DYNAMIC_STATE, MemoryConstants::pageSize64k);
+    auto csrSshHeap = &device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getIndirectHeap(HeapType::SURFACE_STATE, MemoryConstants::pageSize64k);
+
+    size_t dshUsed = csrDshHeap->getUsed();
+    size_t sshUsed = csrSshHeap->getUsed();
+
+    commandList->appendMemoryCopy(dstBuffer, srcPtr, 0x101, nullptr, 0, nullptr, false, false);
+    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
+    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
+    EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
+
+    // numKernelsExecutedInSplitLaunch is incremented after each split kernel launch,
+    // while usedKernelLaunchParams is captured before the launch; the comparison below
+    // therefore shows that (numKernelsExecutedInSplitLaunch + 1) split kernels were launched.
+    EXPECT_EQ(commandList->usedKernelLaunchParams.numKernelsInSplitLaunch, commandList->usedKernelLaunchParams.numKernelsExecutedInSplitLaunch + 1);
+
+    size_t dshEstimated = NEO::EncodeDispatchKernel<FamilyType>::getSizeRequiredDsh(
+        commandList->firstKernelInSplitOperation->getKernelDescriptor(),
+        cmdContainer.getNumIddPerBlock());
+    size_t sshEstimated = NEO::EncodeDispatchKernel<FamilyType>::getSizeRequiredSsh(*commandList->firstKernelInSplitOperation->getImmutableData()->getKernelInfo());
+
+    auto expectedDshToBeConsumed = dshEstimated * commandList->usedKernelLaunchParams.numKernelsInSplitLaunch;
+    auto expectedSshToBeConsumed = sshEstimated * commandList->usedKernelLaunchParams.numKernelsInSplitLaunch;
+    auto consumedDsh1 = csrDshHeap->getUsed();
+    auto consumedSsh1 = csrSshHeap->getUsed();
+
+    EXPECT_EQ(expectedDshToBeConsumed, (consumedDsh1 - dshUsed));
+    EXPECT_EQ(expectedSshToBeConsumed, (consumedSsh1 - sshUsed));
+
+    context->freeMem(dstBuffer);
+}
+
+HWTEST2_F(CommandListTest, givenComputeCommandListWhenMemoryFillRequiresMultiKernelsThenSplitFlagIsSetAndHeapsEstimationIsProper, IsPlatformSklToDg1) {
+    auto commandList = std::make_unique<WhiteBox<::L0::CommandListCoreFamily<gfxCoreFamily>>>();
+    commandList->isFlushTaskSubmissionEnabled = true;
+    commandList->immediateCmdListHeapSharing = true;
+    commandList->initialize(device, NEO::EngineGroupType::RenderCompute, 0u);
+    commandList->commandContainer.setImmediateCmdListCsr(device->getNEODevice()->getDefaultEngine().commandStreamReceiver);
+
+    constexpr size_t patternSize = 8;
+    uint8_t pattern[patternSize] = {1, 2, 3, 4};
+
+    constexpr size_t size = 4096u;
+    constexpr size_t alignment = 4096u;
+    void *dstBuffer = nullptr;
+
+    ze_device_mem_alloc_desc_t deviceDesc = {};
+    auto result = context->allocDeviceMem(device->toHandle(),
+                                          &deviceDesc,
+                                          size, alignment, &dstBuffer);
+    EXPECT_EQ(ZE_RESULT_SUCCESS, result);
+
+    constexpr size_t fillSize = size - 1;
+
+    auto &cmdContainer = commandList->commandContainer;
+    auto csrDshHeap = &device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getIndirectHeap(HeapType::DYNAMIC_STATE, MemoryConstants::pageSize64k);
+    auto csrSshHeap = &device->getNEODevice()->getDefaultEngine().commandStreamReceiver->getIndirectHeap(HeapType::SURFACE_STATE, MemoryConstants::pageSize64k);
+
+    size_t dshUsed = csrDshHeap->getUsed();
+    size_t sshUsed = csrSshHeap->getUsed();
+
+    commandList->appendMemoryFill(dstBuffer, pattern, patternSize, fillSize, nullptr, 0, nullptr, false);
+    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
+    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
+    EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
+
+    // Same captured-before-launch accounting as in the copy test above.
+    EXPECT_EQ(commandList->usedKernelLaunchParams.numKernelsInSplitLaunch, commandList->usedKernelLaunchParams.numKernelsExecutedInSplitLaunch + 1);
+
+    size_t dshEstimated = NEO::EncodeDispatchKernel<FamilyType>::getSizeRequiredDsh(
+        commandList->firstKernelInSplitOperation->getKernelDescriptor(),
+        cmdContainer.getNumIddPerBlock());
+    size_t sshEstimated = NEO::EncodeDispatchKernel<FamilyType>::getSizeRequiredSsh(*commandList->firstKernelInSplitOperation->getImmutableData()->getKernelInfo());
+
+    auto expectedDshToBeConsumed = dshEstimated * commandList->usedKernelLaunchParams.numKernelsInSplitLaunch;
+    auto expectedSshToBeConsumed = sshEstimated * commandList->usedKernelLaunchParams.numKernelsInSplitLaunch;
+    auto consumedDsh1 = csrDshHeap->getUsed();
+    auto consumedSsh1 = csrSshHeap->getUsed();
+
+    EXPECT_EQ(expectedDshToBeConsumed, (consumedDsh1 - dshUsed));
+    EXPECT_EQ(expectedSshToBeConsumed, (consumedSsh1 - sshUsed));
+
+    commandList->appendMemoryFill(dstBuffer, pattern, 1, fillSize, nullptr, 0, nullptr, false);
+    EXPECT_TRUE(commandList->usedKernelLaunchParams.isBuiltInKernel);
+    EXPECT_TRUE(commandList->usedKernelLaunchParams.isKernelSplitOperation);
+    EXPECT_FALSE(commandList->usedKernelLaunchParams.isDestinationAllocationInSystemMemory);
+    EXPECT_EQ(commandList->usedKernelLaunchParams.numKernelsInSplitLaunch, commandList->usedKernelLaunchParams.numKernelsExecutedInSplitLaunch + 1);
+
+    dshEstimated = NEO::EncodeDispatchKernel<FamilyType>::getSizeRequiredDsh(
+        commandList->firstKernelInSplitOperation->getKernelDescriptor(),
+        cmdContainer.getNumIddPerBlock());
+    sshEstimated = NEO::EncodeDispatchKernel<FamilyType>::getSizeRequiredSsh(*commandList->firstKernelInSplitOperation->getImmutableData()->getKernelInfo());
+
+    expectedDshToBeConsumed = dshEstimated * commandList->usedKernelLaunchParams.numKernelsInSplitLaunch;
+    expectedSshToBeConsumed = sshEstimated * commandList->usedKernelLaunchParams.numKernelsInSplitLaunch;
+    auto consumedDsh2 = csrDshHeap->getUsed();
+    auto consumedSsh2 = csrSshHeap->getUsed();
+    EXPECT_EQ(expectedDshToBeConsumed, (consumedDsh2 - consumedDsh1));
+    EXPECT_EQ(expectedSshToBeConsumed, (consumedSsh2 - consumedSsh1));
+
+    context->freeMem(dstBuffer);
+}
+
 TEST(CommandList, whenAsMutableIsCalledNullptrIsReturned) {
     MockCommandList cmdList;
     EXPECT_EQ(nullptr, cmdList.asMutable());
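To make the captured-before-launch accounting in these tests concrete: suppose the 0x101-byte copy splits into two kernels, an aligned main copy plus a right remainder (the exact count is platform-dependent; two is assumed here for illustration). The mock's usedKernelLaunchParams snapshot, taken just before each appendLaunchKernelWithParams call, would then read:

    // launch 1: numKernelsInSplitLaunch = 2, numKernelsExecutedInSplitLaunch = 0
    // launch 2: numKernelsInSplitLaunch = 2, numKernelsExecutedInSplitLaunch = 1

so the final snapshot satisfies numKernelsInSplitLaunch == numKernelsExecutedInSplitLaunch + 1, which is exactly what the EXPECT_EQ asserts.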
@@ -156,6 +156,51 @@ HWTEST2_F(singleAddressSpaceModeTest, givenImmediateCommandListWhenExecutingWith
     commandList->destroy();
 }
 
+HWTEST2_F(singleAddressSpaceModeTest, givenUseCsrImmediateSubmissionEnabledAndSharedHeapsDisbledForImmediateCommandListWhenExecutingWithFlushTaskThenGPR15isProgrammed, Gen12Plus) {
+    using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
+    Mock<::L0::KernelImp> kernel;
+    DebugManagerStateRestore restorer;
+    NEO::DebugManager.flags.EnableFlushTaskSubmission.set(true);
+    NEO::DebugManager.flags.EnableImmediateCmdListHeapSharing.set(0);
+    NEO::DebugManager.flags.UseImmediateFlushTask.set(0);
+
+    ze_command_queue_desc_t queueDesc = {};
+    ze_result_t returnValue = ZE_RESULT_SUCCESS;
+    ze_group_count_t groupCount{1, 1, 1};
+
+    auto &csr = neoDevice->getUltCommandStreamReceiver<FamilyType>();
+    csr.storeMakeResidentAllocations = true;
+
+    auto commandList = whiteboxCast(CommandList::createImmediate(productFamily, device, &queueDesc, false, NEO::EngineGroupType::RenderCompute, returnValue));
+
+    EXPECT_TRUE(commandList->isFlushTaskSubmissionEnabled);
+    EXPECT_EQ(&csr, commandList->csr);
+
+    csr.lastFlushedCommandStream = nullptr;
+    CmdListKernelLaunchParams launchParams = {};
+    auto result = commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams, false);
+    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
+
+    EXPECT_NE(nullptr, csr.lastFlushedCommandStream);
+
+    GenCmdList cmdList;
+    ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(
+        cmdList, commandList->csr->getCS().getCpuBase(), commandList->csr->getCS().getUsed()));
+    bool gpr15Found = false;
+    auto miLoadImm = findAll<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
+    for (size_t i = 0; i < miLoadImm.size(); i++) {
+        MI_LOAD_REGISTER_IMM *miLoad = genCmdCast<MI_LOAD_REGISTER_IMM *>(*miLoadImm[i]);
+        ASSERT_NE(nullptr, miLoad);
+
+        if (miLoad->getRegisterOffset() == CS_GPR_R15) {
+            gpr15Found = true;
+            break;
+        }
+    }
+    EXPECT_TRUE(gpr15Found);
+    commandList->destroy();
+}
+
 HWTEST2_P(L0DebuggerWithBlitterTest, givenImmediateCommandListWhenExecutingWithFlushTaskThenSipIsInstalledAndDebuggerAllocationsAreResident, Gen12Plus) {
     using STATE_SIP = typename FamilyType::STATE_SIP;
     using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM;
@@ -736,6 +736,7 @@ size_t EncodeDispatchKernel<Family>::getSizeRequiredDsh(const KernelDescriptor &
     size = alignUp(size, INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
 
     if (additionalDshSize > 0) {
         size = alignUp(size, EncodeStates<Family>::alignInterfaceDescriptorData);
         size += additionalDshSize;
+        size = alignUp(size, EncodeDispatchKernel<Family>::getDefaultDshAlignment());
     }
 