Pass private scratch size to scratch space controller

Related-To: NEO-3190
Change-Id: I6f1e71481679492516d898226de6a1e721896e81
Signed-off-by: Mateusz Jablonski <mateusz.jablonski@intel.com>
2025-09-15 13:01:45 +08:00 · 2019-06-28 09:37:04 +02:00
parent 2f42f332d8
commit 27f3f8ea8f
10 changed files with 68 additions and 22 deletions
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@ -442,7 +442,7 @@ void CommandQueueHw<GfxFamily>::processDispatchForKernels(const MultiDispatchInf
        }
    }

-    getCommandStreamReceiver().setRequiredScratchSize(multiDispatchInfo.getRequiredScratchSize());
+    getCommandStreamReceiver().setRequiredScratchSizes(multiDispatchInfo.getRequiredScratchSize(), multiDispatchInfo.getRequiredPrivateScratchSize());
 }

 template <typename GfxFamily>
--- a/runtime/command_stream/command_stream_receiver.cpp
+++ b/runtime/command_stream/command_stream_receiver.cpp
@ -214,10 +214,13 @@ void CommandStreamReceiver::setTagAllocation(GraphicsAllocation *allocation) {
    this->tagAddress = allocation ? reinterpret_cast<uint32_t *>(allocation->getUnderlyingBuffer()) : nullptr;
 }

-void CommandStreamReceiver::setRequiredScratchSize(uint32_t newRequiredScratchSize) {
+void CommandStreamReceiver::setRequiredScratchSizes(uint32_t newRequiredScratchSize, uint32_t newRequiredPrivateScratchSize) { // records both scratch requirements; each stored size is only ever raised, never lowered
    if (newRequiredScratchSize > requiredScratchSize) {
        requiredScratchSize = newRequiredScratchSize;
    }
+    if (newRequiredPrivateScratchSize > requiredPrivateScratchSize) { // private scratch is compared and stored independently of the regular scratch size
+        requiredPrivateScratchSize = newRequiredPrivateScratchSize;
+    }
 }

 GraphicsAllocation *CommandStreamReceiver::getScratchAllocation() {
--- a/runtime/command_stream/command_stream_receiver.h
+++ b/runtime/command_stream/command_stream_receiver.h
@ -116,7 +116,7 @@ class CommandStreamReceiver {

    void setMediaVFEStateDirty(bool dirty) { mediaVfeStateDirty = dirty; }

-    void setRequiredScratchSize(uint32_t newRequiredScratchSize);
+    void setRequiredScratchSizes(uint32_t newRequiredScratchSize, uint32_t newRequiredPrivateScratchSize);
    GraphicsAllocation *getScratchAllocation();
    GraphicsAllocation *getDebugSurfaceAllocation() const { return debugSurface; }
    GraphicsAllocation *allocateDebugSurface(size_t size);
@ -235,6 +235,7 @@ class CommandStreamReceiver {
    uint32_t lastSentThreadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;

    uint32_t requiredScratchSize = 0;
+    uint32_t requiredPrivateScratchSize = 0;

    int8_t lastSentCoherencyRequest = -1;
    int8_t lastMediaSamplerConfig = -1;
--- a/runtime/command_stream/command_stream_receiver_hw_base.inl
+++ b/runtime/command_stream/command_stream_receiver_hw_base.inl
@ -224,7 +224,7 @@ CompletionStamp CommandStreamReceiverHw<GfxFamily>::flushTask(
    if (requiredScratchSize) {
        scratchSpaceController->setRequiredScratchSpace(ssh.getCpuBase(),
                                                        requiredScratchSize,
-                                                        0u,
+                                                        requiredPrivateScratchSize,
                                                        this->taskCount,
                                                        this->osContext->getContextId(),
                                                        stateBaseAddressDirty,
--- a/runtime/helpers/dispatch_info.cpp
+++ b/runtime/helpers/dispatch_info.cpp
@ -22,6 +22,10 @@ uint32_t DispatchInfo::getRequiredScratchSize() const {
    return (kernel == nullptr) ? 0 : kernel->getScratchSize();
 }

+uint32_t DispatchInfo::getRequiredPrivateScratchSize() const { // private scratch size reported by the attached kernel
+    return (kernel == nullptr) ? 0 : kernel->getPrivateScratchSize(); // a dispatch without a kernel requires no private scratch
+}
+
 Kernel *MultiDispatchInfo::peekMainKernel() const {
    if (dispatchInfos.size() == 0) {
        return nullptr;
--- a/runtime/helpers/dispatch_info.h
+++ b/runtime/helpers/dispatch_info.h
@ -21,16 +21,17 @@ class Kernel;

 class DispatchInfo {
  public:
-    DispatchInfo() : gws(0, 0, 0), elws(0, 0, 0), offset(0, 0, 0), agws(0, 0, 0), lws(0, 0, 0), twgs(0, 0, 0), nwgs(0, 0, 0), swgs(0, 0, 0) {}
-    DispatchInfo(Kernel *k, uint32_t d, Vec3<size_t> gws, Vec3<size_t> elws, Vec3<size_t> offset)
-        : kernel(k), dim(d), gws(gws), elws(elws), offset(offset), agws(0, 0, 0), lws(0, 0, 0), twgs(0, 0, 0), nwgs(0, 0, 0), swgs(0, 0, 0) {}
-    DispatchInfo(Kernel *k, uint32_t d, Vec3<size_t> gws, Vec3<size_t> elws, Vec3<size_t> offset, Vec3<size_t> agws, Vec3<size_t> lws, Vec3<size_t> twgs, Vec3<size_t> nwgs, Vec3<size_t> swgs)
-        : kernel(k), dim(d), gws(gws), elws(elws), offset(offset), agws(agws), lws(lws), twgs(twgs), nwgs(nwgs), swgs(swgs) {}
+    DispatchInfo() = default;
+    DispatchInfo(Kernel *kernel, uint32_t dim, Vec3<size_t> gws, Vec3<size_t> elws, Vec3<size_t> offset)
+        : kernel(kernel), dim(dim), gws(gws), elws(elws), offset(offset) {}
+    DispatchInfo(Kernel *kernel, uint32_t dim, Vec3<size_t> gws, Vec3<size_t> elws, Vec3<size_t> offset, Vec3<size_t> agws, Vec3<size_t> lws, Vec3<size_t> twgs, Vec3<size_t> nwgs, Vec3<size_t> swgs)
+        : kernel(kernel), dim(dim), gws(gws), elws(elws), offset(offset), agws(agws), lws(lws), twgs(twgs), nwgs(nwgs), swgs(swgs) {}
    bool isPipeControlRequired() const { return pipeControlRequired; }
    void setPipeControlRequired(bool blocking) { this->pipeControlRequired = blocking; }
    bool usesSlm() const;
    bool usesStatelessPrintfSurface() const;
    uint32_t getRequiredScratchSize() const;
+    uint32_t getRequiredPrivateScratchSize() const;
    void setKernel(Kernel *kernel) { this->kernel = kernel; }
    Kernel *getKernel() const { return kernel; }
    uint32_t getDim() const { return dim; }
@ -60,14 +61,14 @@ class DispatchInfo {
    Kernel *kernel = nullptr;
    uint32_t dim = 0;

-    Vec3<size_t> gws;    //global work size
-    Vec3<size_t> elws;   //enqueued local work size
-    Vec3<size_t> offset; //global offset
-    Vec3<size_t> agws;   //actual global work size
-    Vec3<size_t> lws;    //local work size
-    Vec3<size_t> twgs;   //total number of work groups
-    Vec3<size_t> nwgs;   //number of work groups
-    Vec3<size_t> swgs;   //start of work groups
+    Vec3<size_t> gws{0, 0, 0};    //global work size
+    Vec3<size_t> elws{0, 0, 0};   //enqueued local work size
+    Vec3<size_t> offset{0, 0, 0}; //global offset
+    Vec3<size_t> agws{0, 0, 0};   //actual global work size
+    Vec3<size_t> lws{0, 0, 0};    //local work size
+    Vec3<size_t> twgs{0, 0, 0};   //total number of work groups
+    Vec3<size_t> nwgs{0, 0, 0};   //number of work groups
+    Vec3<size_t> swgs{0, 0, 0};   //start of work groups
 };

 struct MultiDispatchInfo {
@ -113,6 +114,14 @@ struct MultiDispatchInfo {
        return ret;
    }

+    uint32_t getRequiredPrivateScratchSize() const { // largest private scratch size requested by any contained dispatch info
+        uint32_t ret = 0; // stays 0 when dispatchInfos is empty
+        for (const auto &dispatchInfo : dispatchInfos) {
+            ret = std::max(ret, dispatchInfo.getRequiredPrivateScratchSize());
+        }
+        return ret;
+    }
+
    DispatchInfo *begin() {
        return dispatchInfos.begin();
    }
--- a/unit_tests/command_queue/enqueue_kernel_1_tests.cpp
+++ b/unit_tests/command_queue/enqueue_kernel_1_tests.cpp
@ -334,6 +334,26 @@ HWCMDTEST_F(IGFX_GEN8_CORE, EnqueueKernelTest, givenSecondEnqueueWithTheSameScra
    EXPECT_EQ(csr.getScratchAllocation(), scratchAlloc);
 }

+HWTEST_F(EnqueueKernelTest, whenEnqueueingKernelThatRequirePrivateScratchThenPrivateScratchIsSetInCommandStreamReceiver) {
+    pDevice->setPreemptionMode(PreemptionMode::ThreadGroup);
+    auto &csr = pDevice->getUltCommandStreamReceiver<FamilyType>();
+    csr.getMemoryManager()->setForce32BitAllocations(false);
+    size_t off[3] = {0, 0, 0};
+    size_t gws[3] = {1, 1, 1};
+
+    // Value-initialize the patch token: only PerThreadScratchSpace is assigned below, so the remaining fields must not be left indeterminate.
+    SPatchMediaVFEState mediaVFEstate = {};
+    uint32_t privateScratchSize = 4096u;
+    mediaVFEstate.PerThreadScratchSpace = privateScratchSize;
+
+    MockKernelWithInternals mockKernel(*pDevice);
+    mockKernel.kernelInfo.patchInfo.mediaVfeStateSlot1 = &mediaVFEstate;
+
+    pCmdQ->enqueueKernel(mockKernel.mockKernel, 1, off, gws, nullptr, 0, nullptr, nullptr);
+
+    // The size placed in mediaVfeStateSlot1 is expected to reach the CSR as its private scratch requirement.
+    EXPECT_EQ(privateScratchSize, csr.requiredPrivateScratchSize);
+}
+
 HWTEST_F(EnqueueKernelTest, givenEnqueueWithGlobalWorkSizeWhenZeroValueIsPassedInDimensionThenTheKernelCommandWillTriviallySucceed) {
    size_t gws[3] = {0, 0, 0};
    MockKernelWithInternals mockKernel(*pDevice);
--- a/unit_tests/command_stream/command_stream_receiver_flush_task_2_tests.cpp
+++ b/unit_tests/command_stream/command_stream_receiver_flush_task_2_tests.cpp
@ -6,6 +6,7 @@
 */

 #include "runtime/command_stream/csr_definitions.h"
+#include "runtime/command_stream/scratch_space_controller.h"
 #include "runtime/gmm_helper/gmm_helper.h"
 #include "runtime/helpers/hw_helper.h"
 #include "runtime/helpers/state_base_address.h"
@ -410,7 +411,7 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, handleTagAndScratchAllocationsResi
    auto commandStreamReceiver = new MockCsrHw<FamilyType>(*pDevice->executionEnvironment);
    pDevice->resetCommandStreamReceiver(commandStreamReceiver);

-    commandStreamReceiver->setRequiredScratchSize(1024); // whatever > 0
+    commandStreamReceiver->setRequiredScratchSizes(1024, 0); // whatever > 0

    flushTask(*commandStreamReceiver);

@ -686,13 +687,13 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, InForced32BitAllocationsModeDoNotS

    pDevice->resetCommandStreamReceiver(commandStreamReceiver);

-    commandStreamReceiver->setRequiredScratchSize(4096); // whatever > 0 (in page size)
+    commandStreamReceiver->setRequiredScratchSizes(4096, 0); // whatever > 0 (in page size)
    flushTask(*commandStreamReceiver);

    auto scratchAllocation = commandStreamReceiver->getScratchAllocation();
    ASSERT_NE(scratchAllocation, nullptr);

-    commandStreamReceiver->setRequiredScratchSize(8196); // whatever > first size
+    commandStreamReceiver->setRequiredScratchSizes(8196, 0); // whatever > first size

    flushTask(*commandStreamReceiver); // 2nd flush

@ -720,13 +721,13 @@ HWTEST_F(CommandStreamReceiverFlushTaskTests, InForced32BitAllocationsModeStore3

        pDevice->resetCommandStreamReceiver(commandStreamReceiver);

-        commandStreamReceiver->setRequiredScratchSize(4096); // whatever > 0 (in page size)
+        commandStreamReceiver->setRequiredScratchSizes(4096, 0); // whatever > 0 (in page size)
        flushTask(*commandStreamReceiver);

        auto scratchAllocation = commandStreamReceiver->getScratchAllocation();
        ASSERT_NE(scratchAllocation, nullptr);

-        commandStreamReceiver->setRequiredScratchSize(8196); // whatever > first size
+        commandStreamReceiver->setRequiredScratchSizes(8196, 0); // whatever > first size

        flushTask(*commandStreamReceiver); // 2nd flush

--- a/unit_tests/helpers/dispatch_info_tests.cpp
+++ b/unit_tests/helpers/dispatch_info_tests.cpp
@ -337,3 +337,9 @@ TEST(DispatchInfoBasicTests, givenDispatchInfoWhenSetCanBePartitionIsCalledThenS
    dispatchInfo.setCanBePartitioned(true);
    EXPECT_TRUE(dispatchInfo.peekCanBePartitioned());
 }
+
+TEST(DispatchInfoBasicTests, givenDispatchInfoWithoutKernelWhenGettingSizeForPrivateScratchThenZeroIsReturned) {
+    DispatchInfo dispatchInfo; // default-constructed, so no kernel is attached
+    EXPECT_EQ(nullptr, dispatchInfo.getKernel());
+    EXPECT_EQ(0u, dispatchInfo.getRequiredPrivateScratchSize()); // the null-kernel path must report zero
+}
--- a/unit_tests/libult/ult_command_stream_receiver.h
+++ b/unit_tests/libult/ult_command_stream_receiver.h
@ -56,9 +56,11 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw<GfxFamily>, publ
    using BaseClass::CommandStreamReceiver::mediaVfeStateDirty;
    using BaseClass::CommandStreamReceiver::perfCounterAllocator;
    using BaseClass::CommandStreamReceiver::profilingTimeStampAllocator;
+    using BaseClass::CommandStreamReceiver::requiredPrivateScratchSize;
    using BaseClass::CommandStreamReceiver::requiredScratchSize;
    using BaseClass::CommandStreamReceiver::requiredThreadArbitrationPolicy;
    using BaseClass::CommandStreamReceiver::samplerCacheFlushRequired;
+    using BaseClass::CommandStreamReceiver::scratchSpaceController;
    using BaseClass::CommandStreamReceiver::stallingPipeControlOnNextFlushRequired;
    using BaseClass::CommandStreamReceiver::submissionAggregator;
    using BaseClass::CommandStreamReceiver::taskCount;