Move hw specific GpgpuWalkerHelper functions to separate file

Change-Id: If2e793d0c3de1a5245bbdee065111a504807b134
Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
2026-01-09 06:23:01 +08:00 · 2018-10-02 15:09:06 +02:00
parent ce29770d61
commit 3fdb17bc7f
6 changed files with 193 additions and 176 deletions
--- a/runtime/command_queue/gpgpu_walker.inl
+++ b/runtime/command_queue/gpgpu_walker.inl
@@ -99,44 +99,6 @@ void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
    pCmd5->setStateCacheInvalidationEnable(true);
 }

-template <typename GfxFamily>
-inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
-    WALKER_TYPE<GfxFamily> *walkerCmd,
-    const size_t globalOffsets[3],
-    const size_t startWorkGroups[3],
-    const size_t numWorkGroups[3],
-    const size_t localWorkSizesIn[3],
-    uint32_t simd,
-    uint32_t workDim,
-    bool localIdsGeneration) {
-    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
-
-    auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
-    walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
-
-    walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
-    walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
-    walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
-
-    // compute executionMask - to tell which SIMD lines are active within thread
-    auto remainderSimdLanes = localWorkSize & (simd - 1);
-    uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
-    if (!executionMask)
-        executionMask = ~executionMask;
-
-    using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
-
-    walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
-    walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
-    walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
-
-    walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
-    walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
-    walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
-
-    return localWorkSize;
-}
-
 template <typename GfxFamily>
 void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
    HwTimeStamps &hwTimeStamps,
@@ -427,144 +389,6 @@ inline void GpgpuWalkerHelper<GfxFamily>::dispatchOnDeviceWaitlistSemaphores(Lin
    }
 }

-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
-    LinearStream *cmdStream,
-    WALKER_TYPE<GfxFamily> *walkerCmd,
-    TimestampPacket *timestampPacket,
-    TimestampPacket::WriteOperationType writeOperationType) {
-
-    if (TimestampPacket::WriteOperationType::AfterWalker == writeOperationType) {
-        uint64_t address = timestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);
-        auto pipeControlCmd = cmdStream->getSpaceForCmd<PIPE_CONTROL>();
-        *pipeControlCmd = PIPE_CONTROL::sInit();
-        pipeControlCmd->setCommandStreamerStallEnable(true);
-        pipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
-        pipeControlCmd->setAddress(static_cast<uint32_t>(address & 0x0000FFFFFFFFULL));
-        pipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
-        pipeControlCmd->setImmediateData(0);
-    }
-}
-
-template <typename GfxFamily>
-void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
-    CommandQueue &commandQueue,
-    DeviceQueueHw<GfxFamily> &devQueueHw,
-    PreemptionMode preemptionMode,
-    SchedulerKernel &scheduler,
-    IndirectHeap *ssh,
-    IndirectHeap *dsh) {
-
-    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
-    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
-    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
-
-    OCLRT::LinearStream *commandStream = nullptr;
-    OCLRT::IndirectHeap *ioh = nullptr;
-
-    commandStream = &commandQueue.getCS(0);
-
-    bool dcFlush = false;
-    commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
-
-    uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
-    const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
-    const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
-    const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
-
-    // Program media interface descriptor load
-    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
-        *commandStream,
-        offsetInterfaceDescriptor,
-        totalInterfaceDescriptorTableSize);
-
-    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
-
-    // Determine SIMD size
-    uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
-    DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
-
-    // Patch our kernel constants
-    *scheduler.globalWorkOffsetX = 0;
-    *scheduler.globalWorkOffsetY = 0;
-    *scheduler.globalWorkOffsetZ = 0;
-
-    *scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
-    *scheduler.globalWorkSizeY = 1;
-    *scheduler.globalWorkSizeZ = 1;
-
-    *scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
-    *scheduler.localWorkSizeY = 1;
-    *scheduler.localWorkSizeZ = 1;
-
-    *scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
-    *scheduler.localWorkSizeY2 = 1;
-    *scheduler.localWorkSizeZ2 = 1;
-
-    *scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
-    *scheduler.enqueuedLocalWorkSizeY = 1;
-    *scheduler.enqueuedLocalWorkSizeZ = 1;
-
-    *scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
-    *scheduler.numWorkGroupsY = 0;
-    *scheduler.numWorkGroupsZ = 0;
-
-    *scheduler.workDim = 1;
-
-    // Send our indirect object data
-    size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
-    size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
-
-    // Create indirectHeap for IOH that is located at the end of device enqueue DSH
-    size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
-    IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
-    indirectObjectHeap.getSpace(curbeOffset);
-    ioh = &indirectObjectHeap;
-
-    // Program the walker.  Invokes execution so all state should already be programmed
-    auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
-    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
-
-    bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
-    KernelCommandsHelper<GfxFamily>::sendIndirectState(
-        *commandStream,
-        *dsh,
-        *ioh,
-        *ssh,
-        scheduler,
-        simd,
-        localWorkSizes,
-        offsetInterfaceDescriptorTable,
-        interfaceDescriptorIndex,
-        preemptionMode,
-        pGpGpuWalkerCmd,
-        nullptr,
-        localIdsGeneration);
-
-    // Implement enabling special WA DisableLSQCROPERFforOCL if needed
-    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
-
-    size_t globalOffsets[3] = {0, 0, 0};
-    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
-    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration);
-
-    // Implement disabling special WA DisableLSQCROPERFforOCL if needed
-    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
-
-    // Do not put BB_START only when returning in first Scheduler run
-    if (devQueueHw.getSchedulerReturnInstance() != 1) {
-
-        commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);
-
-        // Add BB Start Cmd to the SLB in the Primary Batch Buffer
-        auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
-        *bbStart = MI_BATCH_BUFFER_START::sInit();
-        bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
-        uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
-        bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
-    }
-}
-
 template <typename GfxFamily>
 void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(OCLRT::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
 }
--- a/runtime/command_queue/gpgpu_walker_base.inl
+++ b/runtime/command_queue/gpgpu_walker_base.inl
@@ -0,0 +1,189 @@
+/*
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ */
+
+#pragma once
+#include "runtime/command_queue/gpgpu_walker.h"
+
+namespace OCLRT {
+// Fills the thread-count, work-group-count, execution-mask, SIMD-size and starting-group fields of walkerCmd; returns the product of the three local work sizes.
+template <typename GfxFamily>
+inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
+    WALKER_TYPE<GfxFamily> *walkerCmd,
+    const size_t globalOffsets[3],
+    const size_t startWorkGroups[3],
+    const size_t numWorkGroups[3],
+    const size_t localWorkSizesIn[3],
+    uint32_t simd,
+    uint32_t workDim,
+    bool localIdsGeneration) {
+    auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
+    // globalOffsets, workDim and localIdsGeneration are not read in this function body
+    auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
+    walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
+
+    walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
+    walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
+    walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
+
+    // Compute executionMask: the low (localWorkSize & (simd - 1)) bits tell which SIMD lanes are active within the thread
+    auto remainderSimdLanes = localWorkSize & (simd - 1);
+    uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
+    if (!executionMask)
+        executionMask = ~executionMask; // remainder of 0: set all mask bits instead of none
+
+    using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
+
+    walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask)); // 64-bit mask truncated to 32 bits
+    walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
+    walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4)); // 8 -> 0, 16 -> 1, 32 -> 2 (presumably the SIMD_SIZE encoding - confirm)
+
+    walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
+    walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
+    walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
+
+    return localWorkSize;
+}
+// Writes the scheduler kernel dispatch into commandQueue.getCS(0): pipe control, media interface descriptor load, GPGPU_WALKER and, unless getSchedulerReturnInstance() == 1, a pipe control plus MI_BATCH_BUFFER_START to the SLB.
+template <typename GfxFamily>
+void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
+    CommandQueue &commandQueue,
+    DeviceQueueHw<GfxFamily> &devQueueHw,
+    PreemptionMode preemptionMode,
+    SchedulerKernel &scheduler,
+    IndirectHeap *ssh,
+    IndirectHeap *dsh) {
+
+    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
+    using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
+    using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
+
+    OCLRT::LinearStream *commandStream = nullptr;
+    OCLRT::IndirectHeap *ioh = nullptr;
+
+    commandStream = &commandQueue.getCS(0);
+
+    bool dcFlush = false;
+    commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
+
+    uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
+    const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
+    const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
+    const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
+
+    // Program media interface descriptor load for the whole interface descriptor table
+    KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
+        *commandStream,
+        offsetInterfaceDescriptor,
+        totalInterfaceDescriptorTableSize);
+
+    DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
+
+    // Determine SIMD size from the scheduler kernel info
+    uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
+    DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
+
+    // Patch the scheduler kernel constants for a one-dimensional dispatch (workDim = 1)
+    *scheduler.globalWorkOffsetX = 0;
+    *scheduler.globalWorkOffsetY = 0;
+    *scheduler.globalWorkOffsetZ = 0;
+
+    *scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
+    *scheduler.globalWorkSizeY = 1;
+    *scheduler.globalWorkSizeZ = 1;
+
+    *scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
+    *scheduler.localWorkSizeY = 1;
+    *scheduler.localWorkSizeZ = 1;
+
+    *scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
+    *scheduler.localWorkSizeY2 = 1;
+    *scheduler.localWorkSizeZ2 = 1;
+
+    *scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
+    *scheduler.enqueuedLocalWorkSizeY = 1;
+    *scheduler.enqueuedLocalWorkSizeZ = 1;
+
+    *scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
+    *scheduler.numWorkGroupsY = 0;
+    *scheduler.numWorkGroupsZ = 0;
+    // NOTE(review): numWorkGroupsY/Z are patched to 0 here, while the walker below is programmed with 1 for Y/Z - confirm intended
+    *scheduler.workDim = 1;
+
+    // Local and global work sizes used for the indirect state and walker programming below
+    size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
+    size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
+
+    // Create an IndirectHeap for IOH over the DSH CPU buffer and take curbeOffset bytes from it (located at the end of device enqueue DSH)
+    size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
+    IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
+    indirectObjectHeap.getSpace(curbeOffset);
+    ioh = &indirectObjectHeap;
+
+    // Program the walker. It invokes execution, so all state should already be programmed
+    auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
+    *pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
+
+    bool localIdsGeneration = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
+    KernelCommandsHelper<GfxFamily>::sendIndirectState(
+        *commandStream,
+        *dsh,
+        *ioh,
+        *ssh,
+        scheduler,
+        simd,
+        localWorkSizes,
+        offsetInterfaceDescriptorTable,
+        interfaceDescriptorIndex,
+        preemptionMode,
+        pGpGpuWalkerCmd,
+        nullptr,
+        localIdsGeneration);
+
+    // Enable the special WA DisableLSQCROPERFforOCL if needed (disablePerfMode = true)
+    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, true);
+
+    size_t globalOffsets[3] = {0, 0, 0};
+    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
+    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd, 1, localIdsGeneration);
+
+    // Disable the special WA DisableLSQCROPERFforOCL if needed (disablePerfMode = false)
+    GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, scheduler, false);
+
+    // Emit BB_START unless returning in the first scheduler run (getSchedulerReturnInstance() == 1)
+    if (devQueueHw.getSchedulerReturnInstance() != 1) {
+
+        commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);
+
+        // Add a BB_START command in the primary batch buffer whose start address is the SLB GPU address
+        auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
+        *bbStart = MI_BATCH_BUFFER_START::sInit();
+        bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
+        uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
+        bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
+    }
+}
+// For WriteOperationType::AfterWalker, appends a PIPE_CONTROL to cmdStream with post-sync WRITE_IMMEDIATE_DATA, the packet's ContextEnd address and immediate data 0; other write types emit nothing here.
+template <typename GfxFamily>
+void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
+    LinearStream *cmdStream,
+    WALKER_TYPE<GfxFamily> *walkerCmd,
+    TimestampPacket *timestampPacket,
+    TimestampPacket::WriteOperationType writeOperationType) {
+    // walkerCmd is not read in this function body
+    if (TimestampPacket::WriteOperationType::AfterWalker == writeOperationType) {
+        uint64_t address = timestampPacket->pickAddressForDataWrite(TimestampPacket::DataIndex::ContextEnd);
+        auto pipeControlCmd = cmdStream->getSpaceForCmd<PIPE_CONTROL>();
+        *pipeControlCmd = PIPE_CONTROL::sInit();
+        pipeControlCmd->setCommandStreamerStallEnable(true);
+        pipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA);
+        pipeControlCmd->setAddress(static_cast<uint32_t>(address & 0x0000FFFFFFFFULL)); // low 32 bits of the address
+        pipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32)); // high 32 bits of the address
+        pipeControlCmd->setImmediateData(0);
+    }
+}
+
+} // namespace OCLRT
--- a/runtime/gen10/gpgpu_walker_gen10.cpp
+++ b/runtime/gen10/gpgpu_walker_gen10.cpp
@@ -8,6 +8,7 @@
 #include "runtime/gen10/hw_info.h"
 #include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_queue/gpgpu_walker.inl"
+#include "runtime/command_queue/gpgpu_walker_base.inl"
 #include "runtime/command_queue/hardware_interface.h"
 #include "runtime/command_queue/hardware_interface.inl"
 #include "runtime/command_queue/hardware_interface_base.inl"
--- a/runtime/gen8/gpgpu_walker_gen8.cpp
+++ b/runtime/gen8/gpgpu_walker_gen8.cpp
@@ -8,6 +8,7 @@
 #include "runtime/gen8/hw_info.h"
 #include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_queue/gpgpu_walker.inl"
+#include "runtime/command_queue/gpgpu_walker_base.inl"
 #include "runtime/command_queue/hardware_interface.h"
 #include "runtime/command_queue/hardware_interface.inl"
 #include "runtime/command_queue/hardware_interface_base.inl"
--- a/runtime/gen9/gpgpu_walker_gen9.cpp
+++ b/runtime/gen9/gpgpu_walker_gen9.cpp
@@ -8,6 +8,7 @@
 #include "runtime/gen9/hw_cmds_base.h"
 #include "runtime/command_queue/gpgpu_walker.h"
 #include "runtime/command_queue/gpgpu_walker.inl"
+#include "runtime/command_queue/gpgpu_walker_base.inl"
 #include "runtime/command_queue/hardware_interface.h"
 #include "runtime/command_queue/hardware_interface.inl"
 #include "runtime/command_queue/hardware_interface_base.inl"