refactor: Improve scratch programming in heapless mode

Related-To: NEO-7621
Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
2024-01-30 01:00:53 +00:00 · 2024-01-30 01:00:53 +00:00 · 6d3a53fe7f
parent a104d9199d
commit 6d3a53fe7f
8 changed files with 68 additions and 54 deletions
--- a/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
+++ b/opencl/source/command_queue/hardware_interface_xehp_and_later.inl
@ -107,41 +107,12 @@ inline void HardwareInterface<GfxFamily>::programWalker(
                                                                                numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
                                                                                localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);

+    auto requiredScratchSlot0Size = queueCsr.getRequiredScratchSlot0Size();
+    auto requiredScratchSlot1Size = queueCsr.getRequiredScratchSlot1Size();
+    uint64_t scratchAddress = 0u;
+    EncodeDispatchKernel<GfxFamily>::template setScratchAddress<heaplessModeEnabled>(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, &ssh, queueCsr);
+
    auto interfaceDescriptor = &walkerCmd.getInterfaceDescriptor();
-    uint64_t scratchAddress = 0;
-
-    if constexpr (heaplessModeEnabled) {
-        auto scratchAllocation = queueCsr.getScratchAllocation();
-        auto scratchSpaceController = queueCsr.getScratchSpaceController();
-        if (scratchAllocation) {
-            scratchAddress = ssh.getGpuBase() + scratchSpaceController->getScratchPatchAddress();
-        } else {
-            auto requiredScratchSlot0Size = queueCsr.getRequiredScratchSlot0Size();
-            auto requiredScratchSlot1Size = queueCsr.getRequiredScratchSlot1Size();
-            bool stateBaseAddressDirty = false;
-            bool checkVfeStateDirty = false;
-
-            if (requiredScratchSlot0Size || requiredScratchSlot1Size) {
-
-                scratchSpaceController->setRequiredScratchSpace(ssh.getCpuBase(),
-                                                                0u,
-                                                                requiredScratchSlot0Size,
-                                                                requiredScratchSlot1Size,
-                                                                queueCsr.peekTaskCount(), queueCsr.getOsContext(),
-                                                                stateBaseAddressDirty,
-                                                                checkVfeStateDirty);
-
-                if (scratchSpaceController->getScratchSpaceSlot0Allocation()) {
-                    queueCsr.makeResident(*scratchSpaceController->getScratchSpaceSlot0Allocation());
-                }
-                if (scratchSpaceController->getScratchSpaceSlot1Allocation()) {
-                    queueCsr.makeResident(*scratchSpaceController->getScratchSpaceSlot1Allocation());
-                }
-
-                scratchAddress = ssh.getGpuBase() + scratchSpaceController->getScratchPatchAddress();
-            }
-        }
-    }

    HardwareCommandsHelper<GfxFamily>::template sendIndirectState<WalkerType, InterfaceDescriptorType>(
        commandStream,
--- a/shared/source/command_container/command_encoder.h
+++ b/shared/source/command_container/command_encoder.h
@ -184,6 +184,9 @@ struct EncodeDispatchKernel {
        using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
        return BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE;
    }
+
+    template <bool isHeapless> // callers in this change pass their heaplessModeEnabled constant for isHeapless
+    static void setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr); // scratchAddress is an in/out reference; callers in this change initialise it to 0 before the call
 };

 template <typename GfxFamily>
--- a/shared/source/command_container/command_encoder.inl
+++ b/shared/source/command_container/command_encoder.inl
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2023 Intel Corporation
+ * Copyright (C) 2020-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -768,6 +768,11 @@ size_t EncodeDispatchKernel<Family>::getDefaultDshAlignment() {
    return EncodeStates<Family>::alignIndirectStatePointer;
 }

+template <typename Family> // Generic definition of the scratch-address hook called from programWalker and EncodeDispatchKernel::encode.
+template <bool isHeapless>
+void EncodeDispatchKernel<Family>::setScratchAddress(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr) {
+} // Empty body: no argument is read or written, so scratchAddress keeps the caller's value. NOTE(review): a heapless-specific implementation presumably lives outside this diff - confirm.
+
 template <typename Family>
 void EncodeIndirectParams<Family>::setGlobalWorkSizeIndirect(CommandContainer &container, const NEO::CrossThreadDataOffset offsets[3], uint64_t crossThreadAddress, const uint32_t *lws) {
    for (int i = 0; i < 3; ++i) {
--- a/shared/source/command_container/command_encoder_enablers.inl
+++ b/shared/source/command_container/command_encoder_enablers.inl
@ -22,6 +22,8 @@ template void NEO::EncodeDispatchKernel<Family>::encode<Family::DefaultWalkerTyp
 template void NEO::EncodeDispatchKernel<Family>::encodeThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const uint32_t *startWorkGroup, const uint32_t *numWorkGroups, const uint32_t *workGroupSizes, uint32_t simd, uint32_t localIdDimensions, uint32_t threadsPerThreadGroup, uint32_t threadExecutionMask, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, bool isIndirect, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
 template void NEO::EncodeDispatchKernel<Family>::adjustWalkOrder<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, uint32_t requiredWorkGroupOrder, const RootDeviceEnvironment &rootDeviceEnvironment);
 template void NEO::EncodeDispatchKernel<Family>::programBarrierEnable<Family::INTERFACE_DESCRIPTOR_DATA>(Family::INTERFACE_DESCRIPTOR_DATA &interfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
+template void NEO::EncodeDispatchKernel<Family>::setScratchAddress<false>(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr);
+template void NEO::EncodeDispatchKernel<Family>::setScratchAddress<true>(uint64_t &scratchAddress, uint32_t requiredScratchSlot0Size, uint32_t requiredScratchSlot1Size, IndirectHeap *ssh, CommandStreamReceiver &csr);

 template struct NEO::EncodeStates<Family>;
 template struct NEO::EncodeMath<Family>;
--- a/shared/source/command_container/command_encoder_xehp_and_later.inl
+++ b/shared/source/command_container/command_encoder_xehp_and_later.inl
@ -321,27 +321,15 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
    }

    if constexpr (heaplessModeEnabled) {
+
        auto requiredScratchSlot0Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
        auto requiredScratchSlot1Size = kernelDescriptor.kernelAttributes.perThreadScratchSize[1];
-        uint64_t scratchAddress = 0;
-        if (requiredScratchSlot0Size > 0 || requiredScratchSlot1Size > 0) {
-            auto csr = args.device->getDefaultEngine().commandStreamReceiver;
-            auto scratchController = csr->getScratchSpaceController();
-            bool gsbaState = false;
-            bool frontEndState = false;
-            auto ssh = container.getIndirectHeap(HeapType::surfaceState);
-            scratchController->setRequiredScratchSpace(ssh->getCpuBase(), 0, requiredScratchSlot0Size, requiredScratchSlot1Size,
-                                                       csr->peekTaskCount(), csr->getOsContext(), gsbaState, frontEndState);
+        auto csr = args.device->getDefaultEngine().commandStreamReceiver;
+        auto ssh = container.getIndirectHeap(HeapType::surfaceState);

-            if (scratchController->getScratchSpaceSlot0Allocation()) {
-                csr->makeResident(*scratchController->getScratchSpaceSlot0Allocation());
-            }
-            if (scratchController->getScratchSpaceSlot1Allocation()) {
-                csr->makeResident(*scratchController->getScratchSpaceSlot1Allocation());
-            }
+        uint64_t scratchAddress = 0u;

-            scratchAddress = ssh->getGpuBase() + scratchController->getScratchPatchAddress();
-        }
+        EncodeDispatchKernel<Family>::template setScratchAddress<heaplessModeEnabled>(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, ssh, *csr);

        auto inlineDataPointer = reinterpret_cast<char *>(walkerCmd.getInlineDataPointer());
        auto indirectDataPointerAddress = kernelDescriptor.payloadMappings.implicitArgs.indirectDataPointerAddress;
--- a/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
+++ b/shared/test/unit_test/encoders/test_encode_dispatch_kernel_xehp_and_later.cpp
@ -1539,3 +1539,19 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandEncodeStatesImplicitScalingSecondaryBufferTe
            givenDispatchImplicitScalingWithBbStartOverControlSectionWhenDispatchingAsSecondaryBufferContainerThenExpectSecondaryBatchBuffer) {
    testBodyFindPrimaryBatchBuffer<FamilyType>();
 }
+
+using EncodeKernelScratchProgrammingTest = Test<ScratchProgrammingFixture>;
+
+HWTEST2_F(EncodeKernelScratchProgrammingTest, givenHeaplessModeDisabledWhenSetScratchAddressIsCalledThenDoNothing, IsAtLeastXeHpCore) {
+    // Calls setScratchAddress<false> with a non-zero slot-0 size and checks that scratchAddress is still 0 afterwards.
+    static constexpr bool heaplessModeEnabled = false; // selects the setScratchAddress<false> instantiation
+    auto &ultCsr = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    uint64_t scratchAddress = 0;
+    uint32_t requiredScratchSlot0Size = 64; // non-zero slot-0 request
+    uint32_t requiredScratchSlot1Size = 0;
+
+    EncodeDispatchKernel<FamilyType>::template setScratchAddress<heaplessModeEnabled>(scratchAddress, requiredScratchSlot0Size, requiredScratchSlot1Size, ssh, ultCsr); // ssh comes from ScratchProgrammingFixture
+
+    uint64_t expectedScratchAddress = 0; // same as the initial value: the call must not modify scratchAddress
+    EXPECT_EQ(expectedScratchAddress, scratchAddress);
+}
--- a/shared/test/unit_test/fixtures/command_container_fixture.cpp
+++ b/shared/test/unit_test/fixtures/command_container_fixture.cpp
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2022-2023 Intel Corporation
+ * Copyright (C) 2022-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -8,8 +8,10 @@
 #include "shared/test/unit_test/fixtures/command_container_fixture.h"

 #include "shared/source/indirect_heap/heap_size.h"
+#include "shared/source/indirect_heap/indirect_heap.h"
 #include "shared/source/os_interface/product_helper.h"
 #include "shared/test/common/mocks/mock_device.h"
+#include "shared/test/common/mocks/mock_graphics_allocation.h"

 namespace NEO {

@ -72,6 +74,25 @@ EncodeDispatchKernelArgs CommandEncodeStatesFixture::createDefaultDispatchKernel
    return args;
 }

+void ScratchProgrammingFixture::setUp() { // Sets up the device fixture, then builds an IndirectHeap over a 512-byte host buffer with a mock graphics allocation attached.
+    NEO::DeviceFixture::setUp();
+    size_t sizeStream = 512;
+    size_t alignmentStream = 0x1000; // 4096; passed as the second argument of alignedMalloc below
+    ssh = new IndirectHeap{nullptr}; // constructed with nullptr; buffer and graphics allocation are attached below
+    sshBuffer = alignedMalloc(sizeStream, alignmentStream);
+    ASSERT_NE(nullptr, sshBuffer); // fatal assertion: the remaining setup is skipped if the allocation failed
+    ssh->replaceBuffer(sshBuffer, sizeStream);
+    auto graphicsAllocation = new MockGraphicsAllocation(sshBuffer, sizeStream); // mock allocation built from the same buffer and size
+    ssh->replaceGraphicsAllocation(graphicsAllocation); // tearDown deletes whatever ssh->getGraphicsAllocation() returns
+}
+
+void ScratchProgrammingFixture::tearDown() { // Releases the objects created in setUp, then tears down the device fixture.
+    delete ssh->getGraphicsAllocation(); // presumably the MockGraphicsAllocation attached in setUp; read from ssh before ssh is deleted
+    delete ssh;
+    alignedFree(sshBuffer); // buffer obtained from alignedMalloc in setUp
+    NEO::DeviceFixture::tearDown();
+}
+
 } // namespace NEO

 void WalkerThreadFixture::setUp() {
--- a/shared/test/unit_test/fixtures/command_container_fixture.h
+++ b/shared/test/unit_test/fixtures/command_container_fixture.h
@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2020-2023 Intel Corporation
+ * Copyright (C) 2020-2024 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
@ -65,6 +65,14 @@ class CommandEncodeStatesFixture : public DeviceFixture {
    NEO::L1CachePolicy l1CachePolicyData;
 };

+struct ScratchProgrammingFixture : public NEO::DeviceFixture { // Device fixture that additionally owns an IndirectHeap (ssh) and its backing buffer.
+    void setUp();
+    void tearDown();
+    // Both pointers are assigned in setUp() and released in tearDown().
+    IndirectHeap *ssh = nullptr;
+    void *sshBuffer = nullptr; // host memory handed to ssh via replaceBuffer() in setUp()
+};
+
 } // namespace NEO

 struct WalkerThreadFixture {