From 680fa1ec1d705986a787e3bf82dc789b2d59ece6 Mon Sep 17 00:00:00 2001 From: Lukasz Jobczyk Date: Thu, 22 May 2025 10:02:17 +0000 Subject: [PATCH] refactor: Split direct_submission_hw.inl Signed-off-by: Lukasz Jobczyk --- .../source/direct_submission/CMakeLists.txt | 1 + .../direct_submission_hw.inl | 350 ----------------- .../direct_submission_relaxed_ordering.inl | 360 ++++++++++++++++++ .../linux/direct_submission_gen12lp.cpp | 1 + .../windows/direct_submission_gen12lp.cpp | 1 + .../linux/direct_submission_xe2_hpg_core.cpp | 3 +- .../direct_submission_xe2_hpg_core.cpp | 3 +- .../linux/direct_submission_xe3_core.cpp | 1 + .../windows/direct_submission_xe3_core.cpp | 1 + .../linux/direct_submission_xe_hpc_core.cpp | 3 +- .../windows/direct_submission_xe_hpc_core.cpp | 3 +- .../linux/direct_submission_xe_hpg_core.cpp | 3 +- .../windows/direct_submission_xe_hpg_core.cpp | 3 +- 13 files changed, 377 insertions(+), 356 deletions(-) create mode 100644 shared/source/direct_submission/direct_submission_relaxed_ordering.inl diff --git a/shared/source/direct_submission/CMakeLists.txt b/shared/source/direct_submission/CMakeLists.txt index 18505b90e4..169f3d9efb 100644 --- a/shared/source/direct_submission/CMakeLists.txt +++ b/shared/source/direct_submission/CMakeLists.txt @@ -15,6 +15,7 @@ set(NEO_CORE_DIRECT_SUBMISSION ${CMAKE_CURRENT_SOURCE_DIR}/direct_submission_hw.inl ${CMAKE_CURRENT_SOURCE_DIR}/direct_submission_tgllp_and_later.inl ${CMAKE_CURRENT_SOURCE_DIR}/direct_submission_properties.h + ${CMAKE_CURRENT_SOURCE_DIR}/direct_submission_relaxed_ordering.inl ${CMAKE_CURRENT_SOURCE_DIR}/relaxed_ordering_helper.cpp ${CMAKE_CURRENT_SOURCE_DIR}/relaxed_ordering_helper.h ) diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index 4ea83c8d1c..13b59abdf8 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -114,210 +114,6 @@ DirectSubmissionHw::DirectSubmissionHw(const DirectSubmis } } -template -void DirectSubmissionHw::dispatchStaticRelaxedOrderingScheduler() { - LinearStream schedulerCmdStream(this->relaxedOrderingSchedulerAllocation); - uint64_t schedulerStartAddress = schedulerCmdStream.getGpuBase(); - uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress(); - - uint64_t loopSectionStartAddress = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::loopStartSectionStart; - - const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getL3EnabledMOCS(); - - constexpr bool isBcs = Dispatcher::isCopy(); - - // 1. 
Init section - { - EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - - EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9, isBcs); - EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4, isBcs); - - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcs); - - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcs); - - uint64_t removeTaskVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::removeTaskSectionStart; - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3, static_cast(removeTaskVa & 0xFFFF'FFFFULL), true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3 + 4, static_cast(removeTaskVa >> 32), true, isBcs); - - uint64_t walkersLoopConditionCheckVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::tasksListLoopCheckSectionStart; - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4, static_cast(walkersLoopConditionCheckVa & 0xFFFF'FFFFULL), true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4 + 4, static_cast(walkersLoopConditionCheckVa >> 32), true, isBcs); - } - - // 2. Dispatch task section (loop start) - { - UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::loopStartSectionStart); - - EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6, 8, true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6 + 4, 0, true, isBcs); - - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs); - - EncodeAluHelper aluHelper({{ - {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr2}, - {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr6}, - {AluRegisters::opcodeShl, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeStore, AluRegisters::gpr7, AluRegisters::accu}, - {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr7}, - {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr8}, - {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeStore, AluRegisters::gpr6, AluRegisters::accu}, - {AluRegisters::opcodeLoadind, AluRegisters::gpr0, AluRegisters::accu}, - {AluRegisters::opcodeFenceRd, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - }}); - - aluHelper.setMocs(miMathMocs); - aluHelper.copyToCmdStream(schedulerCmdStream); - - EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerCmdStream, 0, false, true, false); - } - - // 3. 
Remove task section - { - UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::removeTaskSectionStart); - - EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - - EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr1, isBcs); - EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr2, isBcs); - - EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9, isBcs); - EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4, isBcs); - - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcs); - - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7, 8, true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); - - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs); - - EncodeAluHelper aluHelper({{ - {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr1}, - {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr7}, - {AluRegisters::opcodeShl, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeStore, AluRegisters::gpr7, AluRegisters::accu}, - {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr7}, - {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr8}, - {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeLoadind, AluRegisters::gpr7, AluRegisters::accu}, - {AluRegisters::opcodeFenceRd, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr6}, - {AluRegisters::opcodeLoad0, AluRegisters::srcb, AluRegisters::opcodeNone}, - {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeStoreind, AluRegisters::accu, AluRegisters::gpr7}, - {AluRegisters::opcodeFenceWr, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - }}); - - aluHelper.setMocs(miMathMocs); - aluHelper.copyToCmdStream(schedulerCmdStream); - } - - // 4. List loop check section - { - UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::tasksListLoopCheckSectionStart); - - EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - - EncodeMathMMIO::encodeIncrement(schedulerCmdStream, AluRegisters::gpr2, isBcs); - - EncodeBatchBufferStartOrEnd::programConditionalRegRegBatchBufferStart( - schedulerCmdStream, - loopSectionStartAddress, - AluRegisters::gpr1, AluRegisters::gpr2, CompareOperation::notEqual, false, isBcs); - - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcs); - } - - // 5. 
Drain request section - { - UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::drainRequestSectionStart); - - EncodeMiArbCheck::program(schedulerCmdStream, std::nullopt); - - if (debugManager.flags.DirectSubmissionRelaxedOrderingQueueSizeLimit.get() != -1) { - currentRelaxedOrderingQueueSize = static_cast(debugManager.flags.DirectSubmissionRelaxedOrderingQueueSizeLimit.get()); - } - - this->relaxedOrderingQueueSizeLimitValueVa = schedulerCmdStream.getCurrentGpuAddressPosition() + RelaxedOrderingHelper::getQueueSizeLimitValueOffset(); - - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart( - schedulerCmdStream, - loopSectionStartAddress, - RegisterOffsets::csGprR1, currentRelaxedOrderingQueueSize, CompareOperation::greaterOrEqual, false, false, isBcs); - - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart( - schedulerCmdStream, - loopSectionStartAddress, - RegisterOffsets::csGprR5, 1, CompareOperation::equal, false, false, isBcs); - } - - // 6. Scheduler loop check section - { - UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::schedulerLoopCheckSectionStart); - - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10, static_cast(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionSize), true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10 + 4, 0, true, isBcs); - - EncodeAluHelper aluHelper({{ - {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr9}, - {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr10}, - {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeStore, AluRegisters::gpr0, AluRegisters::accu}, - }}); - aluHelper.setMocs(miMathMocs); - aluHelper.copyToCmdStream(schedulerCmdStream); - - EncodeBatchBufferStartOrEnd::programConditionalRegMemBatchBufferStart(schedulerCmdStream, 0, semaphoreGpuVa, RegisterOffsets::csGprR11, CompareOperation::greaterOrEqual, true, isBcs); - - EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerCmdStream, schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::loopStartSectionStart, - false, false, false); - } - - UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::totalSize); -} - -template -void DirectSubmissionHw::dispatchRelaxedOrderingSchedulerSection(uint32_t value) { - LinearStream schedulerCmdStream(this->preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); - - // 1. Init section - - uint64_t schedulerStartVa = ringCommandStream.getCurrentGpuAddressPosition(); - - uint64_t semaphoreSectionVa = schedulerStartVa + RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionStart; - - constexpr bool isBcs = Dispatcher::isCopy(); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR11, value, true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9, static_cast(semaphoreSectionVa & 0xFFFF'FFFFULL), true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9 + 4, static_cast(semaphoreSectionVa >> 32), true, isBcs); - - schedulerCmdStream.getSpace(sizeof(typename GfxFamily::MI_BATCH_BUFFER_START)); // skip patching - - // 2. 
Semaphore section - { - using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; - - schedulerCmdStream.getSpace(EncodeMiPredicate::getCmdSize()); // skip patching - - EncodeSemaphore::addMiSemaphoreWaitCommand(schedulerCmdStream, semaphoreGpuVa, value, - COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, false, false, false, false, nullptr); - } - - // skip patching End section - - auto dst = ringCommandStream.getSpace(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); - memcpy_s(dst, RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize, - this->preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); -} - template DirectSubmissionHw::~DirectSubmissionHw() = default; @@ -652,15 +448,6 @@ inline size_t DirectSubmissionHw::getSizeDispatch(bool re return size; } -template -void DirectSubmissionHw::updateRelaxedOrderingQueueSize(uint32_t newSize) { - this->currentRelaxedOrderingQueueSize = newSize; - - EncodeStoreMemory::programStoreDataImm(this->ringCommandStream, this->relaxedOrderingQueueSizeLimitValueVa, - this->currentRelaxedOrderingQueueSize, 0, false, false, - nullptr); -} - template void *DirectSubmissionHw::dispatchWorkloadSection(BatchBuffer &batchBuffer, bool dispatchMonitorFence) { void *currentPosition = ringCommandStream.getSpace(0); @@ -748,143 +535,6 @@ void *DirectSubmissionHw::dispatchWorkloadSection(BatchBu return currentPosition; } -template -void DirectSubmissionHw::dispatchRelaxedOrderingQueueStall() { - LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false)), - EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false)); - - constexpr bool isBcs = Dispatcher::isCopy(); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 1, true, isBcs); - dispatchSemaphoreSection(currentQueueWorkCount); - - // patch conditional bb_start with current GPU address - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(), - RegisterOffsets::csGprR1, 0, CompareOperation::equal, false, false, isBcs); - - relaxedOrderingSchedulerRequired = false; -} - -template -size_t DirectSubmissionHw::getSizeDispatchRelaxedOrderingQueueStall() { - return getSizeSemaphoreSection(true) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) + - EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false); -} - -template -void DirectSubmissionHw::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) { - - constexpr bool isBcs = Dispatcher::isCopy(); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR4, static_cast(returnPtr & 0xFFFF'FFFFULL), true, isBcs); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR4 + 4, static_cast(returnPtr >> 32), true, isBcs); - - uint64_t returnPtrAfterTaskStoreSection = returnPtr; - - returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection(); - - LriHelper::program(&cmdStream, RegisterOffsets::csGprR3, static_cast(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true, isBcs); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR3 + 4, static_cast(returnPtrAfterTaskStoreSection >> 32), true, isBcs); -} - -template -void DirectSubmissionHw::initRelaxedOrderingRegisters() { - - constexpr bool isBcs = Dispatcher::isCopy(); - LriHelper::program(&ringCommandStream, 
RegisterOffsets::csGprR1, 0, true, isBcs); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1 + 4, 0, true, isBcs); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 0, true, isBcs); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5 + 4, 0, true, isBcs); -} - -template -void DirectSubmissionHw::preinitializeRelaxedOrderingSections() { - // Task store section - preinitializedTaskStoreSection = std::make_unique(RelaxedOrderingHelper::getSizeTaskStoreSection()); - - LinearStream stream(preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection()); - - EncodeMiPredicate::encode(stream, MiPredicateType::disable); - - uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress(); - - constexpr bool isBcs = Dispatcher::isCopy(); - LriHelper::program(&stream, RegisterOffsets::csGprR6, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs); - LriHelper::program(&stream, RegisterOffsets::csGprR6 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs); - - // Task start VA - LriHelper::program(&stream, RegisterOffsets::csGprR7, 0, true, isBcs); - LriHelper::program(&stream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); - - // Shift by 8 = multiply by 256. Address must by 64b aligned (shift by 6), but SHL accepts only 1, 2, 4, 8, 16 and 32 - LriHelper::program(&stream, RegisterOffsets::csGprR8, 8, true, isBcs); - LriHelper::program(&stream, RegisterOffsets::csGprR8 + 4, 0, true, isBcs); - - const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getL3EnabledMOCS(); - - EncodeAluHelper aluHelper({{ - {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr1}, - {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr8}, - {AluRegisters::opcodeShl, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeStore, AluRegisters::gpr8, AluRegisters::accu}, - {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr8}, - {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr6}, - {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - {AluRegisters::opcodeStoreind, AluRegisters::accu, AluRegisters::gpr7}, - {AluRegisters::opcodeFenceWr, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, - }}); - aluHelper.setMocs(miMathMocs); - aluHelper.copyToCmdStream(stream); - - EncodeMathMMIO::encodeIncrement(stream, AluRegisters::gpr1, isBcs); - - UNRECOVERABLE_IF(stream.getUsed() != RelaxedOrderingHelper::getSizeTaskStoreSection()); - - // Scheduler section - preinitializedRelaxedOrderingScheduler = std::make_unique(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); - LinearStream schedulerStream(preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); - - uint64_t schedulerStartAddress = relaxedOrderingSchedulerAllocation->getGpuAddress(); - - // 1. Init section - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR11, 0, true, isBcs); - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9, 0, true, isBcs); - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9 + 4, 0, true, isBcs); - EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerStream, schedulerStartAddress, false, false, false); - - // 2. 
Semaphore section - { - using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; - - EncodeMiPredicate::encode(schedulerStream, MiPredicateType::disable); - - EncodeSemaphore::addMiSemaphoreWaitCommand(schedulerStream, 0, 0, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, false, false, false, false, nullptr); - } - - // 3. End section - { - EncodeMiPredicate::encode(schedulerStream, MiPredicateType::disable); - - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR5, 0, true, isBcs); - } - - UNRECOVERABLE_IF(schedulerStream.getUsed() != RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); -} - -template -void DirectSubmissionHw::dispatchTaskStoreSection(uint64_t taskStartSectionVa) { - using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM; - - constexpr size_t patchOffset = EncodeMiPredicate::getCmdSize() + (2 * sizeof(MI_LOAD_REGISTER_IMM)); - - auto lri = reinterpret_cast(ptrOffset(preinitializedTaskStoreSection.get(), patchOffset)); - - lri->setDataDword(static_cast(taskStartSectionVa & 0xFFFF'FFFFULL)); - lri++; - lri->setDataDword(static_cast(taskStartSectionVa >> 32)); - - auto dst = ringCommandStream.getSpace(RelaxedOrderingHelper::getSizeTaskStoreSection()); - memcpy_s(dst, RelaxedOrderingHelper::getSizeTaskStoreSection(), preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection()); -} - template bool DirectSubmissionHw::copyCommandBufferIntoRing(BatchBuffer &batchBuffer) { /* Command buffer can't be copied into ring if implicit scaling or metrics are enabled, diff --git a/shared/source/direct_submission/direct_submission_relaxed_ordering.inl b/shared/source/direct_submission/direct_submission_relaxed_ordering.inl new file mode 100644 index 0000000000..31d38c666b --- /dev/null +++ b/shared/source/direct_submission/direct_submission_relaxed_ordering.inl @@ -0,0 +1,360 @@ +/* + * Copyright (C) 2025 Intel Corporation + * + * SPDX-License-Identifier: MIT + * + */ + +#include "shared/source/direct_submission/direct_submission_hw.h" + +namespace NEO { +template +void DirectSubmissionHw::dispatchStaticRelaxedOrderingScheduler() { + LinearStream schedulerCmdStream(this->relaxedOrderingSchedulerAllocation); + uint64_t schedulerStartAddress = schedulerCmdStream.getGpuBase(); + uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress(); + + uint64_t loopSectionStartAddress = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::loopStartSectionStart; + + const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getL3EnabledMOCS(); + + constexpr bool isBcs = Dispatcher::isCopy(); + + // 1. 
Init section
+    {
+        EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable);
+
+        EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9, isBcs);
+        EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4, isBcs);
+
+        EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcs);
+
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcs);
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcs);
+
+        uint64_t removeTaskVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::removeTaskSectionStart;
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3, static_cast(removeTaskVa & 0xFFFF'FFFFULL), true, isBcs);
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3 + 4, static_cast(removeTaskVa >> 32), true, isBcs);
+
+        uint64_t walkersLoopConditionCheckVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::tasksListLoopCheckSectionStart;
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4, static_cast(walkersLoopConditionCheckVa & 0xFFFF'FFFFULL), true, isBcs);
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4 + 4, static_cast(walkersLoopConditionCheckVa >> 32), true, isBcs);
+    }
+
+    // 2. Dispatch task section (loop start)
+    {
+        UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::loopStartSectionStart);
+
+        EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable);
+
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6, 8, true, isBcs);
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6 + 4, 0, true, isBcs);
+
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs);
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs);
+
+        EncodeAluHelper aluHelper({{
+            {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr2},
+            {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr6},
+            {AluRegisters::opcodeShl, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+            {AluRegisters::opcodeStore, AluRegisters::gpr7, AluRegisters::accu},
+            {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr7},
+            {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr8},
+            {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+            {AluRegisters::opcodeStore, AluRegisters::gpr6, AluRegisters::accu},
+            {AluRegisters::opcodeLoadind, AluRegisters::gpr0, AluRegisters::accu},
+            {AluRegisters::opcodeFenceRd, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+        }});
+
+        aluHelper.setMocs(miMathMocs);
+        aluHelper.copyToCmdStream(schedulerCmdStream);
+
+        EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerCmdStream, 0, false, true, false);
+    }
+
+    // 3. Remove task section
+    {
+        UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::removeTaskSectionStart);
+
+        EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable);
+
+        EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr1, isBcs);
+        EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr2, isBcs);
+
+        EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9, isBcs);
+        EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4, isBcs);
+
+        EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcs);
+
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7, 8, true, isBcs);
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);
+
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs);
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs);
+
+        EncodeAluHelper aluHelper({{
+            {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr1},
+            {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr7},
+            {AluRegisters::opcodeShl, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+            {AluRegisters::opcodeStore, AluRegisters::gpr7, AluRegisters::accu},
+            {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr7},
+            {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr8},
+            {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+            {AluRegisters::opcodeLoadind, AluRegisters::gpr7, AluRegisters::accu},
+            {AluRegisters::opcodeFenceRd, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+            {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr6},
+            {AluRegisters::opcodeLoad0, AluRegisters::srcb, AluRegisters::opcodeNone},
+            {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+            {AluRegisters::opcodeStoreind, AluRegisters::accu, AluRegisters::gpr7},
+            {AluRegisters::opcodeFenceWr, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+        }});
+
+        aluHelper.setMocs(miMathMocs);
+        aluHelper.copyToCmdStream(schedulerCmdStream);
+    }
+
+    // 4. List loop check section
+    {
+        UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::tasksListLoopCheckSectionStart);
+
+        EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable);
+
+        EncodeMathMMIO::encodeIncrement(schedulerCmdStream, AluRegisters::gpr2, isBcs);
+
+        EncodeBatchBufferStartOrEnd::programConditionalRegRegBatchBufferStart(
+            schedulerCmdStream,
+            loopSectionStartAddress,
+            AluRegisters::gpr1, AluRegisters::gpr2, CompareOperation::notEqual, false, isBcs);
+
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcs);
+        LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcs);
+    }
+
+    // 5. 
Drain request section + { + UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::drainRequestSectionStart); + + EncodeMiArbCheck::program(schedulerCmdStream, std::nullopt); + + if (debugManager.flags.DirectSubmissionRelaxedOrderingQueueSizeLimit.get() != -1) { + currentRelaxedOrderingQueueSize = static_cast(debugManager.flags.DirectSubmissionRelaxedOrderingQueueSizeLimit.get()); + } + + this->relaxedOrderingQueueSizeLimitValueVa = schedulerCmdStream.getCurrentGpuAddressPosition() + RelaxedOrderingHelper::getQueueSizeLimitValueOffset(); + + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart( + schedulerCmdStream, + loopSectionStartAddress, + RegisterOffsets::csGprR1, currentRelaxedOrderingQueueSize, CompareOperation::greaterOrEqual, false, false, isBcs); + + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart( + schedulerCmdStream, + loopSectionStartAddress, + RegisterOffsets::csGprR5, 1, CompareOperation::equal, false, false, isBcs); + } + + // 6. Scheduler loop check section + { + UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::schedulerLoopCheckSectionStart); + + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10, static_cast(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionSize), true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10 + 4, 0, true, isBcs); + + EncodeAluHelper aluHelper({{ + {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr9}, + {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr10}, + {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone}, + {AluRegisters::opcodeStore, AluRegisters::gpr0, AluRegisters::accu}, + }}); + aluHelper.setMocs(miMathMocs); + aluHelper.copyToCmdStream(schedulerCmdStream); + + EncodeBatchBufferStartOrEnd::programConditionalRegMemBatchBufferStart(schedulerCmdStream, 0, semaphoreGpuVa, RegisterOffsets::csGprR11, CompareOperation::greaterOrEqual, true, isBcs); + + EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerCmdStream, schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::loopStartSectionStart, + false, false, false); + } + + UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::totalSize); +} + +template +void DirectSubmissionHw::dispatchRelaxedOrderingSchedulerSection(uint32_t value) { + LinearStream schedulerCmdStream(this->preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); + + // 1. Init section + + uint64_t schedulerStartVa = ringCommandStream.getCurrentGpuAddressPosition(); + + uint64_t semaphoreSectionVa = schedulerStartVa + RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionStart; + + constexpr bool isBcs = Dispatcher::isCopy(); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR11, value, true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9, static_cast(semaphoreSectionVa & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9 + 4, static_cast(semaphoreSectionVa >> 32), true, isBcs); + + schedulerCmdStream.getSpace(sizeof(typename GfxFamily::MI_BATCH_BUFFER_START)); // skip patching + + // 2. 
Semaphore section + { + using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; + + schedulerCmdStream.getSpace(EncodeMiPredicate::getCmdSize()); // skip patching + + EncodeSemaphore::addMiSemaphoreWaitCommand(schedulerCmdStream, semaphoreGpuVa, value, + COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, false, false, false, false, nullptr); + } + + // skip patching End section + + auto dst = ringCommandStream.getSpace(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); + memcpy_s(dst, RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize, + this->preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); +} + +template +void DirectSubmissionHw::updateRelaxedOrderingQueueSize(uint32_t newSize) { + this->currentRelaxedOrderingQueueSize = newSize; + + EncodeStoreMemory::programStoreDataImm(this->ringCommandStream, this->relaxedOrderingQueueSizeLimitValueVa, + this->currentRelaxedOrderingQueueSize, 0, false, false, + nullptr); +} + +template +void DirectSubmissionHw::dispatchRelaxedOrderingQueueStall() { + LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false)), + EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false)); + + constexpr bool isBcs = Dispatcher::isCopy(); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 1, true, isBcs); + dispatchSemaphoreSection(currentQueueWorkCount); + + // patch conditional bb_start with current GPU address + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(), + RegisterOffsets::csGprR1, 0, CompareOperation::equal, false, false, isBcs); + + relaxedOrderingSchedulerRequired = false; +} + +template +size_t DirectSubmissionHw::getSizeDispatchRelaxedOrderingQueueStall() { + return getSizeSemaphoreSection(true) + sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM) + + EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false); +} + +template +void DirectSubmissionHw::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) { + + constexpr bool isBcs = Dispatcher::isCopy(); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR4, static_cast(returnPtr & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR4 + 4, static_cast(returnPtr >> 32), true, isBcs); + + uint64_t returnPtrAfterTaskStoreSection = returnPtr; + + returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection(); + + LriHelper::program(&cmdStream, RegisterOffsets::csGprR3, static_cast(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR3 + 4, static_cast(returnPtrAfterTaskStoreSection >> 32), true, isBcs); +} + +template +void DirectSubmissionHw::initRelaxedOrderingRegisters() { + + constexpr bool isBcs = Dispatcher::isCopy(); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1, 0, true, isBcs); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1 + 4, 0, true, isBcs); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 0, true, isBcs); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5 + 4, 0, true, isBcs); +} + +template +void DirectSubmissionHw::preinitializeRelaxedOrderingSections() { + // Task store section + 
preinitializedTaskStoreSection = std::make_unique(RelaxedOrderingHelper::getSizeTaskStoreSection());
+
+    LinearStream stream(preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection());
+
+    EncodeMiPredicate::encode(stream, MiPredicateType::disable);
+
+    uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress();
+
+    constexpr bool isBcs = Dispatcher::isCopy();
+    LriHelper::program(&stream, RegisterOffsets::csGprR6, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs);
+    LriHelper::program(&stream, RegisterOffsets::csGprR6 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs);
+
+    // Task start VA
+    LriHelper::program(&stream, RegisterOffsets::csGprR7, 0, true, isBcs);
+    LriHelper::program(&stream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs);
+
+    // Shift by 8 = multiply by 256. Address must be 64B aligned (shift by 6), but SHL accepts only 1, 2, 4, 8, 16 and 32
+    LriHelper::program(&stream, RegisterOffsets::csGprR8, 8, true, isBcs);
+    LriHelper::program(&stream, RegisterOffsets::csGprR8 + 4, 0, true, isBcs);
+
+    const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getL3EnabledMOCS();
+
+    EncodeAluHelper aluHelper({{
+        {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr1},
+        {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr8},
+        {AluRegisters::opcodeShl, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+        {AluRegisters::opcodeStore, AluRegisters::gpr8, AluRegisters::accu},
+        {AluRegisters::opcodeLoad, AluRegisters::srca, AluRegisters::gpr8},
+        {AluRegisters::opcodeLoad, AluRegisters::srcb, AluRegisters::gpr6},
+        {AluRegisters::opcodeAdd, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+        {AluRegisters::opcodeStoreind, AluRegisters::accu, AluRegisters::gpr7},
+        {AluRegisters::opcodeFenceWr, AluRegisters::opcodeNone, AluRegisters::opcodeNone},
+    }});
+    aluHelper.setMocs(miMathMocs);
+    aluHelper.copyToCmdStream(stream);
+
+    EncodeMathMMIO::encodeIncrement(stream, AluRegisters::gpr1, isBcs);
+
+    UNRECOVERABLE_IF(stream.getUsed() != RelaxedOrderingHelper::getSizeTaskStoreSection());
+
+    // Scheduler section
+    preinitializedRelaxedOrderingScheduler = std::make_unique(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize);
+    LinearStream schedulerStream(preinitializedRelaxedOrderingScheduler.get(), RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize);
+
+    uint64_t schedulerStartAddress = relaxedOrderingSchedulerAllocation->getGpuAddress();
+
+    // 1. Init section
+    LriHelper::program(&schedulerStream, RegisterOffsets::csGprR11, 0, true, isBcs);
+    LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9, 0, true, isBcs);
+    LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9 + 4, 0, true, isBcs);
+    EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerStream, schedulerStartAddress, false, false, false);
+
+    // 2. Semaphore section
+    {
+        using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
+
+        EncodeMiPredicate::encode(schedulerStream, MiPredicateType::disable);
+
+        EncodeSemaphore::addMiSemaphoreWaitCommand(schedulerStream, 0, 0, COMPARE_OPERATION::COMPARE_OPERATION_SAD_GREATER_THAN_OR_EQUAL_SDD, false, false, false, false, nullptr);
+    }
+
+    // 3. 
End section + { + EncodeMiPredicate::encode(schedulerStream, MiPredicateType::disable); + + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR5, 0, true, isBcs); + } + + UNRECOVERABLE_IF(schedulerStream.getUsed() != RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); +} + +template +void DirectSubmissionHw::dispatchTaskStoreSection(uint64_t taskStartSectionVa) { + using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM; + + constexpr size_t patchOffset = EncodeMiPredicate::getCmdSize() + (2 * sizeof(MI_LOAD_REGISTER_IMM)); + + auto lri = reinterpret_cast(ptrOffset(preinitializedTaskStoreSection.get(), patchOffset)); + + lri->setDataDword(static_cast(taskStartSectionVa & 0xFFFF'FFFFULL)); + lri++; + lri->setDataDword(static_cast(taskStartSectionVa >> 32)); + + auto dst = ringCommandStream.getSpace(RelaxedOrderingHelper::getSizeTaskStoreSection()); + memcpy_s(dst, RelaxedOrderingHelper::getSizeTaskStoreSection(), preinitializedTaskStoreSection.get(), RelaxedOrderingHelper::getSizeTaskStoreSection()); +} +} // namespace NEO \ No newline at end of file diff --git a/shared/source/gen12lp/linux/direct_submission_gen12lp.cpp b/shared/source/gen12lp/linux/direct_submission_gen12lp.cpp index 642774fbf9..fbc8ee76cc 100644 --- a/shared/source/gen12lp/linux/direct_submission_gen12lp.cpp +++ b/shared/source/gen12lp/linux/direct_submission_gen12lp.cpp @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_base.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_base.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_tgllp_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/gen12lp/windows/direct_submission_gen12lp.cpp b/shared/source/gen12lp/windows/direct_submission_gen12lp.cpp index b1916c8d1b..9431bb601a 100644 --- a/shared/source/gen12lp/windows/direct_submission_gen12lp.cpp +++ b/shared/source/gen12lp/windows/direct_submission_gen12lp.cpp @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_base.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_base.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_tgllp_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/xe2_hpg_core/linux/direct_submission_xe2_hpg_core.cpp b/shared/source/xe2_hpg_core/linux/direct_submission_xe2_hpg_core.cpp index 089e268a47..21183c307d 100644 --- a/shared/source/xe2_hpg_core/linux/direct_submission_xe2_hpg_core.cpp +++ b/shared/source/xe2_hpg_core/linux/direct_submission_xe2_hpg_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2024-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl" #include 
"shared/source/direct_submission/direct_submission_prefetcher_pvc_and_later.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/xe2_hpg_core/windows/direct_submission_xe2_hpg_core.cpp b/shared/source/xe2_hpg_core/windows/direct_submission_xe2_hpg_core.cpp index d15ced7635..057951d70c 100644 --- a/shared/source/xe2_hpg_core/windows/direct_submission_xe2_hpg_core.cpp +++ b/shared/source/xe2_hpg_core/windows/direct_submission_xe2_hpg_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2024 Intel Corporation + * Copyright (C) 2024-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_pvc_and_later.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/xe3_core/linux/direct_submission_xe3_core.cpp b/shared/source/xe3_core/linux/direct_submission_xe3_core.cpp index b633d3e58f..6b314f56b8 100644 --- a/shared/source/xe3_core/linux/direct_submission_xe3_core.cpp +++ b/shared/source/xe3_core/linux/direct_submission_xe3_core.cpp @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_pvc_and_later.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/xe3_core/windows/direct_submission_xe3_core.cpp b/shared/source/xe3_core/windows/direct_submission_xe3_core.cpp index 5f4e5d5c94..4e8186bd5b 100644 --- a/shared/source/xe3_core/windows/direct_submission_xe3_core.cpp +++ b/shared/source/xe3_core/windows/direct_submission_xe3_core.cpp @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_pvc_and_later.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/xe_hpc_core/linux/direct_submission_xe_hpc_core.cpp b/shared/source/xe_hpc_core/linux/direct_submission_xe_hpc_core.cpp index 4997008fb9..c33daaeda8 100644 --- a/shared/source/xe_hpc_core/linux/direct_submission_xe_hpc_core.cpp +++ 
b/shared/source/xe_hpc_core/linux/direct_submission_xe_hpc_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_pvc_and_later.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/xe_hpc_core/windows/direct_submission_xe_hpc_core.cpp b/shared/source/xe_hpc_core/windows/direct_submission_xe_hpc_core.cpp index 4fc740922c..9f7cab8303 100644 --- a/shared/source/xe_hpc_core/windows/direct_submission_xe_hpc_core.cpp +++ b/shared/source/xe_hpc_core/windows/direct_submission_xe_hpc_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_pvc_and_later.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/xe_hpg_core/linux/direct_submission_xe_hpg_core.cpp b/shared/source/xe_hpg_core/linux/direct_submission_xe_hpg_core.cpp index 6120f5f04f..43ae9a2b45 100644 --- a/shared/source/xe_hpg_core/linux/direct_submission_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/linux/direct_submission_xe_hpg_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include "shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_xe_hp_core_and_later.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl" diff --git a/shared/source/xe_hpg_core/windows/direct_submission_xe_hpg_core.cpp b/shared/source/xe_hpg_core/windows/direct_submission_xe_hpg_core.cpp index dd25a9fdf8..f08ebc4577 100644 --- a/shared/source/xe_hpg_core/windows/direct_submission_xe_hpg_core.cpp +++ b/shared/source/xe_hpg_core/windows/direct_submission_xe_hpg_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2022 Intel Corporation + * Copyright (C) 2021-2025 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -8,6 +8,7 @@ #include "shared/source/direct_submission/direct_submission_hw.inl" #include 
"shared/source/direct_submission/direct_submission_prefetch_mitigation_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/direct_submission_prefetcher_xe_hp_core_and_later.inl" +#include "shared/source/direct_submission/direct_submission_relaxed_ordering.inl" #include "shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl" #include "shared/source/direct_submission/dispatchers/blitter_dispatcher.inl" #include "shared/source/direct_submission/dispatchers/dispatcher.inl"