Introduce initial implementation of DirectSubmission relaxed ordering mode.

Initial implementation of task store section Related-To: NEO-7458 Signed-off-by: Dunajski, Bartosz <bartosz.dunajski@intel.com>
2025-12-26 15:03:02 +08:00 · 2022-11-16 17:24:04 +00:00
parent 8927399cce
commit 89b96e5e8f
16 changed files with 393 additions and 5 deletions
--- a/shared/source/direct_submission/direct_submission_hw.inl
+++ b/shared/source/direct_submission/direct_submission_hw.inl
@@ -12,6 +12,7 @@
 #include "shared/source/device/device.h"
 #include "shared/source/direct_submission/direct_submission_hw.h"
 #include "shared/source/direct_submission/direct_submission_hw_diagnostic_mode.h"
+#include "shared/source/direct_submission/relaxed_ordering_helper.h"
 #include "shared/source/helpers/flush_stamp.h"
 #include "shared/source/helpers/logical_state_helper.h"
 #include "shared/source/helpers/ptr_math.h"
@@ -76,6 +77,7 @@ DirectSubmissionHw<GfxFamily, Dispatcher>::DirectSubmissionHw(const DirectSubmis
    setPostSyncOffset();

    dcFlushRequired = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, *hwInfo);
+    relaxedOrderingEnabled = (DebugManager.flags.DirectSubmissionRelaxedOrdering.get() == 1);
 }

 template <typename GfxFamily, typename Dispatcher>
@@ -118,6 +120,18 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::allocateResources() {
        allocations.push_back(completionFenceAllocation);
    }

+    if (this->relaxedOrderingEnabled) {
+        const AllocationProperties allocationProperties(rootDeviceIndex,
+                                                        true, MemoryConstants::pageSize64k,
+                                                        AllocationType::DEFERRED_TASKS_LIST,
+                                                        isMultiOsContextCapable, false, osContext.getDeviceBitfield());
+
+        deferredTasksListAllocation = memoryManager->allocateGraphicsMemoryWithProperties(allocationProperties);
+        UNRECOVERABLE_IF(deferredTasksListAllocation == nullptr);
+
+        allocations.push_back(deferredTasksListAllocation);
+    }
+
    if (DebugManager.flags.DirectSubmissionPrintBuffers.get()) {
        for (uint32_t ringBufferIndex = 0; ringBufferIndex < RingBufferUse::initialRingBufferCount; ringBufferIndex++) {
            const auto ringBuffer = this->ringBuffers[ringBufferIndex].ringBuffer;
@@ -214,6 +228,10 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::initialize(bool submitOnInit, bo

            this->systemMemoryFenceAddressSet = true;
        }
+        if (this->relaxedOrderingEnabled) {
+            preinitializeTaskStoreSection();
+            this->relaxedOrderingInitialized = true;
+        }
        if (workloadMode == 1) {
            dispatchDiagnosticModeSection();
            startBufferSize += getDiagnosticModeSection();
@@ -257,6 +275,11 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::startRingBuffer() {
        this->systemMemoryFenceAddressSet = true;
    }

+    if (this->relaxedOrderingEnabled && !this->relaxedOrderingInitialized) {
+        preinitializeTaskStoreSection();
+        this->relaxedOrderingInitialized = true;
+    }
+
    currentQueueWorkCount++;
    dispatchSemaphoreSection(currentQueueWorkCount);

@@ -388,7 +411,7 @@ inline size_t DirectSubmissionHw<GfxFamily, Dispatcher>::getSizeDispatch() {
    } else if (workloadMode == 1) {
        size += getDiagnosticModeSection();
    }
-    //mode 2 does not dispatch any commands
+    // mode 2 does not dispatch any commands

    if (!disableCacheFlush) {
        size += Dispatcher::getSizeCacheFlush(*hwInfo);
@@ -435,7 +458,11 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
        DirectSubmissionDiagnostics::diagnosticModeOneDispatch(diagnostic.get());
        dispatchDiagnosticModeSection();
    }
-    //mode 2 does not dispatch any commands
+    // mode 2 does not dispatch any commands
+
+    if (this->relaxedOrderingEnabled) {
+        dispatchTaskStoreSection(0);
+    }

    if (!disableCacheFlush) {
        Dispatcher::dispatchCacheFlush(ringCommandStream, *hwInfo, gpuVaForMiFlush);
@@ -452,9 +479,63 @@ void *DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchWorkloadSection(BatchBu
    return currentPosition;
 }

+template <typename GfxFamily, typename Dispatcher>
+void DirectSubmissionHw<GfxFamily, Dispatcher>::preinitializeTaskStoreSection() {
+    preinitializedTaskStoreSection = std::make_unique<uint8_t[]>(RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
+
+    LinearStream stream(preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
+
+    EncodeMiPredicate<GfxFamily>::encode(stream, MiPredicateType::Disable);
+
+    uint64_t deferredWalkerListGpuVa = deferredTasksListAllocation->getGpuAddress();
+    LriHelper<GfxFamily>::program(&stream, CS_GPR_R6, static_cast<uint32_t>(deferredWalkerListGpuVa & 0xFFFF'FFFFULL), true);
+    LriHelper<GfxFamily>::program(&stream, CS_GPR_R6 + 4, static_cast<uint32_t>(deferredWalkerListGpuVa >> 32), true);
+
+    // Task start VA
+    LriHelper<GfxFamily>::program(&stream, CS_GPR_R7, 0, true);
+    LriHelper<GfxFamily>::program(&stream, CS_GPR_R7 + 4, 0, true);
+
+    // Shift by 8 = multiply by 256. Address must by 64b aligned (shift by 6), but SHL accepts only 1, 2, 4, 8, 16 and 32
+    LriHelper<GfxFamily>::program(&stream, CS_GPR_R8, 8, true);
+    LriHelper<GfxFamily>::program(&stream, CS_GPR_R8 + 4, 0, true);
+
+    EncodeAluHelper<GfxFamily, 9> aluHelper;
+    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_1);
+    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_8);
+    aluHelper.setNextAlu(AluRegisters::OPCODE_SHL);
+    aluHelper.setNextAlu(AluRegisters::OPCODE_STORE, AluRegisters::R_8, AluRegisters::R_ACCU);
+    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCA, AluRegisters::R_8);
+    aluHelper.setNextAlu(AluRegisters::OPCODE_LOAD, AluRegisters::R_SRCB, AluRegisters::R_6);
+    aluHelper.setNextAlu(AluRegisters::OPCODE_ADD);
+    aluHelper.setNextAlu(AluRegisters::OPCODE_STOREIND, AluRegisters::R_ACCU, AluRegisters::R_7);
+    aluHelper.setNextAlu(AluRegisters::OPCODE_FENCE_WR);
+
+    aluHelper.copyToCmdStream(stream);
+
+    EncodeMathMMIO<GfxFamily>::encodeIncrement(stream, AluRegisters::R_1);
+
+    UNRECOVERABLE_IF(stream.getUsed() != RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
+}
+
+template <typename GfxFamily, typename Dispatcher>
+void DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchTaskStoreSection(uint64_t taskStartSectionVa) {
+    using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM;
+
+    constexpr size_t patchOffset = EncodeMiPredicate<GfxFamily>::getCmdSize() + (2 * sizeof(MI_LOAD_REGISTER_IMM));
+
+    auto lri = reinterpret_cast<MI_LOAD_REGISTER_IMM *>(ptrOffset(preinitializedTaskStoreSection.get(), patchOffset));
+
+    lri->setDataDword(static_cast<uint32_t>(taskStartSectionVa & 0xFFFF'FFFFULL));
+    lri++;
+    lri->setDataDword(static_cast<uint32_t>(taskStartSectionVa >> 32));
+
+    auto dst = ringCommandStream.getSpace(RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
+    memcpy_s(dst, RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>(), preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>());
+}
+
 template <typename GfxFamily, typename Dispatcher>
 bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffer &batchBuffer, FlushStampTracker &flushStamp) {
-    //for now workloads requiring cache coherency are not supported
+    // for now workloads requiring cache coherency are not supported
    UNRECOVERABLE_IF(batchBuffer.requiresCoherency);

    if (batchBuffer.ringBufferRestartRequest) {
@@ -466,6 +547,9 @@ bool DirectSubmissionHw<GfxFamily, Dispatcher>::dispatchCommandBuffer(BatchBuffe
    size_t dispatchSize = getSizeDispatch();
    size_t cycleSize = getSizeSwitchRingBufferSection();
    size_t requiredMinimalSize = dispatchSize + cycleSize + getSizeEnd();
+    if (this->relaxedOrderingEnabled) {
+        requiredMinimalSize += RelaxedOrderingHelper::getSizeTaskStoreSection<GfxFamily>();
+    }

    getCommandBufferPositionGpuAddress(ringCommandStream.getSpace(0));

@@ -588,6 +672,8 @@ void DirectSubmissionHw<GfxFamily, Dispatcher>::deallocateResources() {
        memoryManager->freeGraphicsMemory(semaphores);
        semaphores = nullptr;
    }
+
+    memoryManager->freeGraphicsMemory(deferredTasksListAllocation);
 }

 template <typename GfxFamily, typename Dispatcher>