[26/n] Internal 4GB allocator.

- Change the way we handle blocked commands.
- Instead of allocating a CPU pointer and populating it with commands, create a real IndirectHeap that may later be submitted to the GPU.
- This removes a lot of copy operations that were happening at submit time.
- For device enqueue, this requires dsh & ssh to be passed directly to the underlying commands; in that scenario device queue buffers are not used.

Change-Id: I1124a8edbb46777ea7f7d3a5946f302e7fdf9665
2025-12-31 20:13:04 +08:00 · 2018-04-05 15:12:28 +02:00
parent 100f559daa
commit ffa9b097f5
20 changed files with 331 additions and 319 deletions
--- a/runtime/command_queue/command_queue.cpp
+++ b/runtime/command_queue/command_queue.cpp
@@ -239,35 +239,7 @@ IndirectHeap &CommandQueue::getIndirectHeap(IndirectHeap::Type heapType,
    }

    if (!heapMemory) {
-        size_t reservedSize = 0;
-        auto finalHeapSize = defaultHeapSize;
-
-        minRequiredSize += reservedSize;
-
-        finalHeapSize = alignUp(std::max(finalHeapSize, minRequiredSize), MemoryConstants::pageSize);
-
-        heapMemory = memoryManager->obtainReusableAllocation(finalHeapSize).release();
-
-        if (!heapMemory) {
-            heapMemory = memoryManager->allocateGraphicsMemory(finalHeapSize, MemoryConstants::pageSize);
-        } else {
-            finalHeapSize = std::max(heapMemory->getUnderlyingBufferSize(), finalHeapSize);
-        }
-
-        heapMemory->setAllocationType(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM);
-
-        if (IndirectHeap::SURFACE_STATE == heapType) {
-            DEBUG_BREAK_IF(minRequiredSize > maxSshSize);
-            finalHeapSize = maxSshSize;
-        }
-
-        if (heap) {
-            heap->replaceBuffer(heapMemory->getUnderlyingBuffer(), finalHeapSize);
-            heap->replaceGraphicsAllocation(heapMemory);
-        } else {
-            heap = new IndirectHeap(heapMemory);
-            heap->overrideMaxSize(finalHeapSize);
-        }
+        allocateHeapMemory(heapType, minRequiredSize, heap);
    }

    return *heap;
@@ -650,4 +622,37 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) {
    return true;
 }

+void CommandQueue::allocateHeapMemory(IndirectHeap::Type heapType,
+                                      size_t minRequiredSize, IndirectHeap *&indirectHeap) { // Backs indirectHeap with a graphics allocation; creates the heap object when the pointer is null.
+    auto memoryManager = device->getMemoryManager();
+    size_t reservedSize = 0; // currently zero, so the addition below leaves minRequiredSize unchanged
+    auto finalHeapSize = defaultHeapSize;
+    // Fold the reserved bytes into the caller's requirement.
+    minRequiredSize += reservedSize;
+    // Take the larger of the default size and the requirement, aligned up to MemoryConstants::pageSize.
+    finalHeapSize = alignUp(std::max(finalHeapSize, minRequiredSize), MemoryConstants::pageSize);
+    // Try a reusable allocation first; release() hands the raw pointer over to this function.
+    auto heapMemory = memoryManager->obtainReusableAllocation(finalHeapSize).release();
+    // Nothing reusable: allocate fresh memory. Otherwise adopt the reused buffer's size when it is larger.
+    if (!heapMemory) {
+        heapMemory = memoryManager->allocateGraphicsMemory(finalHeapSize, MemoryConstants::pageSize);
+    } else {
+        finalHeapSize = std::max(heapMemory->getUnderlyingBufferSize(), finalHeapSize);
+    }
+    // NOTE(review): heapMemory is dereferenced below without a null check - confirm allocateGraphicsMemory cannot return nullptr.
+    heapMemory->setAllocationType(GraphicsAllocation::ALLOCATION_TYPE_LINEAR_STREAM);
+    // Surface state heaps always report maxSshSize as their size, whatever size was computed above.
+    if (IndirectHeap::SURFACE_STATE == heapType) {
+        DEBUG_BREAK_IF(minRequiredSize > maxSshSize); // debug-only check that the request fits within maxSshSize
+        finalHeapSize = maxSshSize;
+    }
+    // Re-point an existing heap at the new allocation, or create a heap on top of it.
+    if (indirectHeap) {
+        indirectHeap->replaceBuffer(heapMemory->getUnderlyingBuffer(), finalHeapSize);
+        indirectHeap->replaceGraphicsAllocation(heapMemory);
+    } else {
+        indirectHeap = new IndirectHeap(heapMemory); // raw new: the pointer is handed back through the reference parameter
+        indirectHeap->overrideMaxSize(finalHeapSize);
+    }
+}
 } // namespace OCLRT
--- a/runtime/command_queue/command_queue.h
+++ b/runtime/command_queue/command_queue.h
@@ -336,6 +336,9 @@ class CommandQueue : public BaseObject<_cl_command_queue> {
    IndirectHeap &getIndirectHeap(IndirectHeap::Type heapType,
                                  size_t minRequiredSize = 0u);

+    void allocateHeapMemory(IndirectHeap::Type heapType,
+                            size_t minRequiredSize, IndirectHeap *&indirectHeap);
+
    MOCKABLE_VIRTUAL void releaseIndirectHeap(IndirectHeap::Type heapType);

    cl_command_queue_properties getCommandQueueProperties() const {
--- a/runtime/command_queue/enqueue_common.h
+++ b/runtime/command_queue/enqueue_common.h
@@ -275,6 +275,7 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,

            uint32_t taskCount = commandStreamReceiver.peekTaskCount() + 1;
            devQueueHw->setupExecutionModelDispatch(getIndirectHeap(IndirectHeap::SURFACE_STATE, minSizeSSHForEM),
+                                                    *devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
                                                    multiDispatchInfo.begin()->getKernel(),
                                                    (uint32_t)multiDispatchInfo.size(),
                                                    taskCount,
@@ -297,7 +298,9 @@ void CommandQueueHw<GfxFamily>::enqueueHandler(Surface **surfacesForResidency,
                *this,
                *devQueueHw,
                preemption,
-                scheduler);
+                scheduler,
+                &getIndirectHeap(IndirectHeap::SURFACE_STATE),
+                devQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE));

            scheduler.makeResident(commandStreamReceiver);

--- a/runtime/command_queue/gpgpu_walker.h
+++ b/runtime/command_queue/gpgpu_walker.h
@@ -119,13 +119,6 @@ inline cl_uint computeDimensions(const size_t workItems[3]) {
    return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
 }

-template <typename SizeAndAllocCalcT, typename... CalcArgsT>
-IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
-    size_t alignment = MemoryConstants::pageSize;
-    size_t size = calc(std::forward<CalcArgsT>(args)...);
-    return new IndirectHeap(alignedMalloc(size, alignment), size);
-}
-
 template <typename GfxFamily>
 class GpgpuWalkerHelper {
  public:
@@ -227,7 +220,9 @@ class GpgpuWalkerHelper {
        CommandQueue &commandQueue,
        DeviceQueueHw<GfxFamily> &devQueueHw,
        PreemptionMode preemptionMode,
-        SchedulerKernel &scheduler);
+        SchedulerKernel &scheduler,
+        IndirectHeap *ssh,
+        IndirectHeap *dsh);
 };

 template <typename GfxFamily, uint32_t eventType>
--- a/runtime/command_queue/gpgpu_walker.inl
+++ b/runtime/command_queue/gpgpu_walker.inl
@@ -458,20 +458,27 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchWalker(
        using KCH = KernelCommandsHelper<GfxFamily>;
        commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
        if (executionModelKernel) {
-            uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
            uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;

-            dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
+            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE,
+                                            commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
+                                            dsh);
+
            dsh->getSpace(colorCalcSize);
            ioh = dsh;
+            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
+                                            KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*(multiDispatchInfo.begin()->getKernel())) +
+                                                KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
+                                            ssh);
        } else {
-            dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
-            ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
+            commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
+            commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
+            commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
        }

-        ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });
        using UniqueIH = std::unique_ptr<IndirectHeap>;
-        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh), UniqueIH(ssh));
+        *blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh), UniqueIH(ssh),
+                                                   *commandQueue.getDevice().getMemoryManager());
        if (executionModelKernel) {
            (*blockedCommandsData)->doNotFreeISH = true;
        }
@@ -671,7 +678,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    CommandQueue &commandQueue,
    DeviceQueueHw<GfxFamily> &devQueueHw,
    PreemptionMode preemptionMode,
-    SchedulerKernel &scheduler) {
+    SchedulerKernel &scheduler,
+    IndirectHeap *ssh,
+    IndirectHeap *dsh) {

    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
@@ -679,13 +688,9 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;

    OCLRT::LinearStream *commandStream = nullptr;
-    OCLRT::IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
+    OCLRT::IndirectHeap *ioh = nullptr;

    commandStream = &commandQueue.getCS(0);
-    // note : below code assumes that caller to dispatchScheduler "preallocated" memory
-    //        required for execution model in below heap managers
-    dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
-    ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);

    bool dcFlush = false;
    commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);