From b0e7a11e9adfde37f46014ced7e0266b0e3ad385 Mon Sep 17 00:00:00 2001 From: Andrzej Koska Date: Wed, 4 Sep 2024 21:24:17 +0000 Subject: [PATCH] refactor: Improving information transfer about the copy engine Related-To: NEO-11934 Signed-off-by: Andrzej Koska --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 19 ++- .../source/cmdlist/cmdlist_hw_immediate.inl | 2 +- .../cmdlist/cmdlist_hw_xe2_hpg_and_later.inl | 4 +- opencl/source/command_queue/enqueue_common.h | 16 +-- .../command_queue/hardware_interface_base.inl | 2 +- .../command_container/command_encoder.h | 16 +-- .../command_container/command_encoder.inl | 72 +++++------ .../implicit_scaling_xehp_and_later.inl | 2 +- .../command_stream_receiver_hw.h | 2 +- .../command_stream_receiver_hw_base.inl | 22 ++-- .../debugger/debugger_l0_tgllp_and_later.inl | 4 +- .../direct_submission_hw.inl | 122 +++++++++--------- .../relaxed_ordering_helper.h | 6 +- .../helpers/blit_commands_helper_base.inl | 8 +- shared/source/helpers/gfx_core_helper.h | 2 +- .../source/helpers/gfx_core_helper_base.inl | 2 +- .../helpers/gfx_core_helper_xe2_and_later.inl | 6 +- ...ommand_stream_receiver_hw_xe2_hpg_core.cpp | 16 +-- .../direct_submission_tests_1.cpp | 4 +- .../direct_submission_tests_2.cpp | 8 +- .../command_encoder_tests_xehp_and_later.cpp | 6 +- .../encoders/test_encode_pvc_and_later.cpp | 2 +- .../encoders/test_encode_set_mmio.cpp | 18 +-- .../helpers/gfx_core_helper_tests.cpp | 2 +- .../helpers/gfx_core_helper_xe2_and_later.cpp | 2 +- 25 files changed, 183 insertions(+), 182 deletions(-) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index a659fb691a..61aec886d4 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -2407,7 +2407,7 @@ bool CommandListCoreFamily::handleInOrderImplicitDependencies(boo } if (relaxedOrderingAllowed) { - NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream()); + NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream(), isCopyOnly()); } CommandListCoreFamily::appendWaitOnInOrderDependency(inOrderExecInfo, nullptr, inOrderExecInfo->getCounterValue(), inOrderExecInfo->getAllocationOffset(), relaxedOrderingAllowed, true, false, false); @@ -2432,7 +2432,7 @@ inline ze_result_t CommandListCoreFamily::addEventsToCmdList(uint } if (relaxedOrderingAllowed && numWaitEvents > 0 && !inOrderDependenciesSent) { - NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream()); + NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandContainer.getCommandStream(), isCopyOnly()); } if (numWaitEvents > 0) { @@ -2811,11 +2811,11 @@ void CommandListCoreFamily::appendWriteKernelTimestamp(Event *eve uint64_t contextAddress = ptrOffset(baseAddr, contextOffset); if (maskLsb) { - NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::globalTimestampLdw, mask, globalAddress, workloadPartition, globalPostSyncCmdBuffer, copyOperation); - NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, mask, contextAddress, workloadPartition, contextPostSyncCmdBuffer, copyOperation); + NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::globalTimestampLdw, mask, globalAddress, workloadPartition, globalPostSyncCmdBuffer, isCopyOnly()); + NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, mask, contextAddress, workloadPartition, contextPostSyncCmdBuffer, isCopyOnly()); } else { - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::globalTimestampLdw, globalAddress, workloadPartition, globalPostSyncCmdBuffer); - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextAddress, workloadPartition, contextPostSyncCmdBuffer); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::globalTimestampLdw, globalAddress, workloadPartition, globalPostSyncCmdBuffer, isCopyOnly()); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextAddress, workloadPartition, contextPostSyncCmdBuffer, isCopyOnly()); } if (outTimeStampSyncCmds != nullptr) { @@ -3094,13 +3094,12 @@ ze_result_t CommandListCoreFamily::prepareIndirectParams(const ze } auto groupCount = ptrOffset(alloc->getGpuAddress(), groupCountOffset); - NEO::EncodeSetMMIO::encodeMEM(commandContainer, RegisterOffsets::gpgpuDispatchDimX, - ptrOffset(groupCount, offsetof(ze_group_count_t, groupCountX))); + ptrOffset(groupCount, offsetof(ze_group_count_t, groupCountX)), isCopyOnly()); NEO::EncodeSetMMIO::encodeMEM(commandContainer, RegisterOffsets::gpgpuDispatchDimY, - ptrOffset(groupCount, offsetof(ze_group_count_t, groupCountY))); + ptrOffset(groupCount, offsetof(ze_group_count_t, groupCountY)), isCopyOnly()); NEO::EncodeSetMMIO::encodeMEM(commandContainer, RegisterOffsets::gpgpuDispatchDimZ, - ptrOffset(groupCount, offsetof(ze_group_count_t, groupCountZ))); + ptrOffset(groupCount, offsetof(ze_group_count_t, groupCountZ)), isCopyOnly()); } return ZE_RESULT_SUCCESS; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index b31ef54d6b..2224ecc069 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -553,7 +553,7 @@ void CommandListCoreFamilyImmediate::handleInOrderNonWalkerSignal if (nonWalkerSignalingHasRelaxedOrdering) { result = flushImmediate(result, true, hasStallingCmds, relaxedOrderingDispatch, true, false, nullptr, false); - NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*this->commandContainer.getCommandStream()); + NEO::RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*this->commandContainer.getCommandStream(), isCopyOnly()); relaxedOrderingDispatch = true; hasStallingCmds = hasStallingCmdsForRelaxedOrdering(1, relaxedOrderingDispatch); } diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xe2_hpg_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xe2_hpg_and_later.inl index 194eecc38c..d059289564 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xe2_hpg_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xe2_hpg_and_later.inl @@ -43,8 +43,8 @@ void CommandListCoreFamily::adjustWriteKernelTimestamp(uint64_t g NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::globalTimestampUn, mask, globalHighAddress, workloadPartition, globalPostSyncCmdBuffer, copyOperation); NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, mask, contextHighAddress, workloadPartition, contextPostSyncCmdBuffer, copyOperation); } else { - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::globalTimestampUn, globalHighAddress, workloadPartition, globalPostSyncCmdBuffer); - NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, contextHighAddress, workloadPartition, contextPostSyncCmdBuffer); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::globalTimestampUn, globalHighAddress, workloadPartition, globalPostSyncCmdBuffer, copyOperation); + NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, contextHighAddress, workloadPartition, contextPostSyncCmdBuffer, copyOperation); } if (outTimeStampSyncCmds != nullptr) { diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index a1979a82d7..6329e612b1 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -316,7 +316,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } if (relaxedOrderingEnabled) { - RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(commandStream); + RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(commandStream, isCopyOnly); } TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, csrDeps, relaxedOrderingEnabled, isCopyOnly); @@ -717,17 +717,17 @@ void CommandQueueHw::processDispatchForMarkerWithTimestampPacket(Comm auto timestampContextStartGpuAddress = TimestampPacketHelper::getContextStartGpuAddress(*currentTimestampPacketNode); auto timestampGlobalStartAddress = TimestampPacketHelper::getGlobalStartGpuAddress(*currentTimestampPacketNode); - - EncodeStoreMMIO::encode(*commandStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextStartGpuAddress, false, nullptr); - EncodeStoreMMIO::encode(*commandStream, RegisterOffsets::globalTimestampLdw, timestampGlobalStartAddress, false, nullptr); - MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(*commandStream, timestampContextStartGpuAddress, timestampGlobalStartAddress); + bool isBcs = NEO::EngineHelpers::isBcs(getGpgpuCommandStreamReceiver().getOsContext().getEngineType()); + EncodeStoreMMIO::encode(*commandStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextStartGpuAddress, false, nullptr, isBcs); + EncodeStoreMMIO::encode(*commandStream, RegisterOffsets::globalTimestampLdw, timestampGlobalStartAddress, false, nullptr, isBcs); + MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(*commandStream, timestampContextStartGpuAddress, timestampGlobalStartAddress, isBcs); auto timestampContextEndGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*currentTimestampPacketNode); auto timestampGlobalEndAddress = TimestampPacketHelper::getGlobalEndGpuAddress(*currentTimestampPacketNode); - EncodeStoreMMIO::encode(*commandStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextEndGpuAddress, false, nullptr); - EncodeStoreMMIO::encode(*commandStream, RegisterOffsets::globalTimestampLdw, timestampGlobalEndAddress, false, nullptr); - MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(*commandStream, timestampContextEndGpuAddress, timestampGlobalEndAddress); + EncodeStoreMMIO::encode(*commandStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextEndGpuAddress, false, nullptr, isBcs); + EncodeStoreMMIO::encode(*commandStream, RegisterOffsets::globalTimestampLdw, timestampGlobalEndAddress, false, nullptr, isBcs); + MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(*commandStream, timestampContextEndGpuAddress, timestampGlobalEndAddress, isBcs); } template diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index d22a1dd368..e703076b26 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -114,7 +114,7 @@ void HardwareInterface::dispatchWalker( } if (walkerArgs.relaxedOrderingEnabled) { - RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandStream); + RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandStream, false); } TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(*commandStream, csrDependencies, walkerArgs.relaxedOrderingEnabled, commandQueue.isBcs()); diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index b106e6d856..9481069006 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -353,12 +353,12 @@ struct EncodeSetMMIO { static const size_t sizeREG = sizeof(MI_LOAD_REGISTER_REG); static void encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap, bool isBcs); - static void encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address); - static void encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset); + static void encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address, bool isBcs); + static void encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset, bool isBcs); static void encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap, bool isBcs); - static void encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address); - static void encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset); + static void encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address, bool isBcs); + static void encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset, bool isBcs); static bool isRemapApplicable(uint32_t offset); static void remapOffset(MI_LOAD_REGISTER_MEM *pMiLoadReg); @@ -414,8 +414,8 @@ struct EncodeStoreMMIO { using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM; static const size_t size = sizeof(MI_STORE_REGISTER_MEM); - static void encode(LinearStream &csr, uint32_t offset, uint64_t address, bool workloadPartition, void **outCmdBuffer); - static void encode(MI_STORE_REGISTER_MEM *cmdBuffer, uint32_t offset, uint64_t address, bool workloadPartition); + static void encode(LinearStream &csr, uint32_t offset, uint64_t address, bool workloadPartition, void **outCmdBuffer, bool isBcs); + static void encode(MI_STORE_REGISTER_MEM *cmdBuffer, uint32_t offset, uint64_t address, bool workloadPartition, bool isBcs); static void appendFlags(MI_STORE_REGISTER_MEM *storeRegMem, bool workloadPartition); }; @@ -513,7 +513,7 @@ struct EncodeBatchBufferStartOrEnd { static void programConditionalDataMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs); static void programConditionalDataRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint32_t compareReg, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs); - static void programConditionalRegRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, AluRegisters compareReg0, AluRegisters compareReg1, CompareOperation compareOperation, bool indirect); + static void programConditionalRegRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, AluRegisters compareReg0, AluRegisters compareReg1, CompareOperation compareOperation, bool indirect, bool isBcs); static void programConditionalRegMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint32_t compareReg, CompareOperation compareOperation, bool indirect, bool isBcs); static size_t constexpr getCmdSizeConditionalDataMemBatchBufferStart(bool useQwordData) { @@ -540,7 +540,7 @@ struct EncodeBatchBufferStartOrEnd { protected: static void appendBatchBufferStart(MI_BATCH_BUFFER_START &cmd, bool indirect, bool predicate); - static void programConditionalBatchBufferStartBase(LinearStream &commandStream, uint64_t startAddress, AluRegisters regA, AluRegisters regB, CompareOperation compareOperation, bool indirect); + static void programConditionalBatchBufferStartBase(LinearStream &commandStream, uint64_t startAddress, AluRegisters regA, AluRegisters regB, CompareOperation compareOperation, bool indirect, bool isBcs); static size_t constexpr getCmdSizeConditionalBufferStartBase() { return (EncodeAluHelper::getCmdsSize() + sizeof(typename GfxFamily::MI_LOAD_REGISTER_REG) + diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index 35aa0ab7b6..331a2a710b 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -111,7 +111,7 @@ void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32 logLws++; } - EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR0, offset); + EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR0, offset, isBcs); EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR1, 0, true, isBcs); i = 0; @@ -119,14 +119,14 @@ void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32 if (val & (1 << i)) { EncodeMath::addition(container, AluRegisters::gpr1, AluRegisters::gpr0, AluRegisters::gpr2); - EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR1, RegisterOffsets::csGprR2); + EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR1, RegisterOffsets::csGprR2, isBcs); } EncodeMath::addition(container, AluRegisters::gpr0, AluRegisters::gpr0, AluRegisters::gpr2); - EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR0, RegisterOffsets::csGprR2); + EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR0, RegisterOffsets::csGprR2, isBcs); i++; } - EncodeStoreMMIO::encode(*container.getCommandStream(), RegisterOffsets::csGprR1, dstAddress, false, nullptr); + EncodeStoreMMIO::encode(*container.getCommandStream(), RegisterOffsets::csGprR1, dstAddress, false, nullptr, isBcs); } /* @@ -139,14 +139,14 @@ void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32 */ template void EncodeMathMMIO::encodeGreaterThanPredicate(CommandContainer &container, uint64_t firstOperand, uint32_t secondOperand, bool isBcs) { - EncodeSetMMIO::encodeMEM(container, RegisterOffsets::csGprR0, firstOperand); + EncodeSetMMIO::encodeMEM(container, RegisterOffsets::csGprR0, firstOperand, isBcs); EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR1, secondOperand, true, isBcs); /* RegisterOffsets::csGprR* registers map to AluRegisters::gpr* registers */ EncodeMath::greaterThan(container, AluRegisters::gpr0, AluRegisters::gpr1, AluRegisters::gpr2); - EncodeSetMMIO::encodeREG(container, RegisterOffsets::csPredicateResult, RegisterOffsets::csGprR2); + EncodeSetMMIO::encodeREG(container, RegisterOffsets::csPredicateResult, RegisterOffsets::csGprR2, isBcs); } /* @@ -156,13 +156,13 @@ void EncodeMathMMIO::encodeGreaterThanPredicate(CommandContainer &contai template void EncodeMathMMIO::encodeBitwiseAndVal(CommandContainer &container, uint32_t regOffset, uint32_t immVal, uint64_t dstAddress, bool workloadPartition, void **outCmdBuffer, bool isBcs) { - EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR13, regOffset); + EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR13, regOffset, isBcs); EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR14, immVal, true, isBcs); EncodeMath::bitwiseAnd(container, AluRegisters::gpr13, AluRegisters::gpr14, AluRegisters::gpr12); EncodeStoreMMIO::encode(*container.getCommandStream(), - RegisterOffsets::csGprR12, dstAddress, workloadPartition, outCmdBuffer); + RegisterOffsets::csGprR12, dstAddress, workloadPartition, outCmdBuffer, isBcs); } /* @@ -341,13 +341,13 @@ inline void EncodeSetMMIO::encodeIMM(CommandContainer &container, uint32 } template -inline void EncodeSetMMIO::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address) { - EncodeSetMMIO::encodeMEM(*container.getCommandStream(), offset, address); +inline void EncodeSetMMIO::encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address, bool isBcs) { + EncodeSetMMIO::encodeMEM(*container.getCommandStream(), offset, address, isBcs); } template -inline void EncodeSetMMIO::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset) { - EncodeSetMMIO::encodeREG(*container.getCommandStream(), dstOffset, srcOffset); +inline void EncodeSetMMIO::encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset, bool isBcs) { + EncodeSetMMIO::encodeREG(*container.getCommandStream(), dstOffset, srcOffset, isBcs); } template @@ -373,7 +373,7 @@ inline void EncodeStateBaseAddress::setSbaTrackingForL0DebuggerIfEnabled } template -void EncodeSetMMIO::encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address) { +void EncodeSetMMIO::encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address, bool isBcs) { MI_LOAD_REGISTER_MEM cmd = Family::cmdInitLoadRegisterMem; cmd.setRegisterAddress(offset); cmd.setMemoryAddress(address); @@ -384,7 +384,7 @@ void EncodeSetMMIO::encodeMEM(LinearStream &cmdStream, uint32_t offset, } template -void EncodeSetMMIO::encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset) { +void EncodeSetMMIO::encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset, bool isBcs) { MI_LOAD_REGISTER_REG cmd = Family::cmdInitLoadRegisterReg; cmd.setSourceRegisterAddress(srcOffset); cmd.setDestinationRegisterAddress(dstOffset); @@ -394,16 +394,16 @@ void EncodeSetMMIO::encodeREG(LinearStream &cmdStream, uint32_t dstOffse } template -void EncodeStoreMMIO::encode(LinearStream &csr, uint32_t offset, uint64_t address, bool workloadPartition, void **outCmdBuffer) { +void EncodeStoreMMIO::encode(LinearStream &csr, uint32_t offset, uint64_t address, bool workloadPartition, void **outCmdBuffer, bool isBcs) { auto buffer = csr.getSpaceForCmd(); if (outCmdBuffer != nullptr) { *outCmdBuffer = buffer; } - EncodeStoreMMIO::encode(buffer, offset, address, workloadPartition); + EncodeStoreMMIO::encode(buffer, offset, address, workloadPartition, isBcs); } template -inline void EncodeStoreMMIO::encode(MI_STORE_REGISTER_MEM *cmdBuffer, uint32_t offset, uint64_t address, bool workloadPartition) { +inline void EncodeStoreMMIO::encode(MI_STORE_REGISTER_MEM *cmdBuffer, uint32_t offset, uint64_t address, bool workloadPartition, bool isBcs) { MI_STORE_REGISTER_MEM cmd = Family::cmdInitStoreRegisterMem; cmd.setRegisterAddress(offset); cmd.setMemoryAddress(address); @@ -620,7 +620,7 @@ void EncodeIndirectParams::setGroupCountIndirect(CommandContainer &conta if (NEO::isUndefinedOffset(offsets[i])) { continue; } - EncodeStoreMMIO::encode(*container.getCommandStream(), RegisterOffsets::gpgpuDispatchDim[i], ptrOffset(crossThreadAddress, offsets[i]), false, nullptr); + EncodeStoreMMIO::encode(*container.getCommandStream(), RegisterOffsets::gpgpuDispatchDim[i], ptrOffset(crossThreadAddress, offsets[i]), false, nullptr, false); } } @@ -673,7 +673,7 @@ void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &containe constexpr AluRegisters offsetAluRegister = AluRegisters::gpr8; if (offset) { - EncodeSetMMIO::encodeMEM(container, backupRegister, dstPtr); + EncodeSetMMIO::encodeMEM(container, backupRegister, dstPtr, false); EncodeSetMMIO::encodeIMM(container, memoryMaskRegister, memoryMask, true, false); EncodeMath::bitwiseAnd(container, memoryMaskAluRegister, backupAluRegister, backupAluRegister); EncodeSetMMIO::encodeIMM(container, offsetRegister, offset, true, false); @@ -682,13 +682,13 @@ void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &containe EncodeSetMMIO::encodeIMM(container, constantOneRegister, 1, true, false); EncodeSetMMIO::encodeIMM(container, constantTwoRegister, 2, true, false); - EncodeSetMMIO::encodeREG(container, groupCount2Register, RegisterOffsets::gpgpuDispatchDim[2]); + EncodeSetMMIO::encodeREG(container, groupCount2Register, RegisterOffsets::gpgpuDispatchDim[2], false); EncodeMath::greaterThan(container, groupCount2AluRegister, constantOneAluRegister, workDimEq3AluRegister); EncodeMath::bitwiseAnd(container, workDimEq3AluRegister, constantOneAluRegister, workDimEq3AluRegister); EncodeSetMMIO::encodeIMM(container, groupSize1Register, groupSize[1], true, false); - EncodeSetMMIO::encodeREG(container, groupCount1Register, RegisterOffsets::gpgpuDispatchDim[1]); + EncodeSetMMIO::encodeREG(container, groupCount1Register, RegisterOffsets::gpgpuDispatchDim[1], false); EncodeMath::addition(container, groupSize1AluRegister, groupCount1AluRegister, sumAluRegister); EncodeMath::addition(container, sumAluRegister, workDimEq3AluRegister, sumAluRegister); @@ -703,7 +703,7 @@ void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &containe EncodeMath::bitwiseAnd(container, workDimGe2AluRegister, constantOneAluRegister, workDimGe2AluRegister); } - EncodeSetMMIO::encodeREG(container, resultRegister, constantOneRegister); + EncodeSetMMIO::encodeREG(container, resultRegister, constantOneRegister, false); EncodeMath::addition(container, resultAluRegister, workDimGe2AluRegister, resultAluRegister); EncodeMath::addition(container, resultAluRegister, workDimEq3AluRegister, resultAluRegister); @@ -711,7 +711,7 @@ void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &containe EncodeMath::addition(container, resultAluRegister, backupAluRegister, resultAluRegister); } } - EncodeStoreMMIO::encode(*container.getCommandStream(), resultRegister, dstPtr, false, nullptr); + EncodeStoreMMIO::encode(*container.getCommandStream(), resultRegister, dstPtr, false, nullptr, false); } } @@ -1020,10 +1020,10 @@ void EncodeAtomic::programMiAtomic(LinearStream &commandStream, template void EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs) { - EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress); + EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress, isBcs); if (useQwordData) { - EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7 + 4, compareAddress + 4); + EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7 + 4, compareAddress + 4, isBcs); } else { LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); } @@ -1034,15 +1034,15 @@ void EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferSt LriHelper::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true, isBcs); LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true, isBcs); - programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect); + programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs); } template void EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint32_t compareReg, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs) { - EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR7, compareReg); + EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR7, compareReg, isBcs); if (useQwordData) { - EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR7 + 4, compareReg + 4); + EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR7 + 4, compareReg + 4, isBcs); } else { LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); } @@ -1053,31 +1053,31 @@ void EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferSt LriHelper::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true, isBcs); LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true, isBcs); - programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect); + programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs); } template void EncodeBatchBufferStartOrEnd::programConditionalRegRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, AluRegisters compareReg0, - AluRegisters compareReg1, CompareOperation compareOperation, bool indirect) { + AluRegisters compareReg1, CompareOperation compareOperation, bool indirect, bool isBcs) { - programConditionalBatchBufferStartBase(commandStream, startAddress, compareReg0, compareReg1, compareOperation, indirect); + programConditionalBatchBufferStartBase(commandStream, startAddress, compareReg0, compareReg1, compareOperation, indirect, isBcs); } template void EncodeBatchBufferStartOrEnd::programConditionalRegMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint32_t compareReg, CompareOperation compareOperation, bool indirect, bool isBcs) { - EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress); + EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress, isBcs); LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); - EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR8, compareReg); + EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR8, compareReg, isBcs); LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, 0, true, isBcs); - programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect); + programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect, isBcs); } template void EncodeBatchBufferStartOrEnd::programConditionalBatchBufferStartBase(LinearStream &commandStream, uint64_t startAddress, AluRegisters regA, AluRegisters regB, - CompareOperation compareOperation, bool indirect) { + CompareOperation compareOperation, bool indirect, bool isBcs) { EncodeAluHelper aluHelper; aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srca, regA); aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srcb, regB); @@ -1093,7 +1093,7 @@ void EncodeBatchBufferStartOrEnd::programConditionalBatchBufferStartBase aluHelper.copyToCmdStream(commandStream); - EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csPredicateResult2, RegisterOffsets::csGprR7); + EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csPredicateResult2, RegisterOffsets::csGprR7, isBcs); MiPredicateType predicateType = MiPredicateType::noopOnResult2Clear; // Equal or Less if ((compareOperation == CompareOperation::notEqual) || (compareOperation == CompareOperation::greaterOrEqual)) { diff --git a/shared/source/command_container/implicit_scaling_xehp_and_later.inl b/shared/source/command_container/implicit_scaling_xehp_and_later.inl index c303046828..b9c4864884 100644 --- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl +++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl @@ -214,7 +214,7 @@ inline void ImplicitScalingDispatch::dispatchRegisterConfiguration(Li bool isBcs) { EncodeSetMMIO::encodeMEM(commandStream, PartitionRegisters::wparidCCSOffset, - workPartitionSurfaceAddress); + workPartitionSurfaceAddress, isBcs); dispatchOffsetRegister(commandStream, addressOffset, isBcs); } diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index b3026618c8..850e320aba 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -80,7 +80,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t getCmdsSizeForHardwareContext() const override; static void addBatchBufferEnd(LinearStream &commandStream, void **patchLocation); - void programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, bool hasRelaxedOrderingDependencies); + void programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, bool hasRelaxedOrderingDependencies, bool isBcs); void addBatchBufferStart(MI_BATCH_BUFFER_START *commandBufferMemory, uint64_t startAddress, bool secondary); size_t getRequiredStateBaseAddressSize(const Device &device) const; diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index e3a71181ff..916631be57 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -107,7 +107,7 @@ inline void CommandStreamReceiverHw::addBatchBufferEnd(LinearStream & template inline void CommandStreamReceiverHw::programEndingCmd(LinearStream &commandStream, void **patchLocation, bool directSubmissionEnabled, - bool hasRelaxedOrderingDependencies) { + bool hasRelaxedOrderingDependencies, bool isBcs) { if (directSubmissionEnabled) { uint64_t startAddress = commandStream.getGraphicsAllocation()->getGpuAddress() + commandStream.getUsed(); if (debugManager.flags.BatchBufferStartPrepatchingWaEnabled.get() == 0) { @@ -123,8 +123,8 @@ inline void CommandStreamReceiverHw::programEndingCmd(LinearStream &c bool indirect = false; if (relaxedOrderingEnabled && hasRelaxedOrderingDependencies) { - NEO::EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR3); - NEO::EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR3 + 4); + NEO::EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR3, isBcs); + NEO::EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR3 + 4, isBcs); indirect = true; } @@ -249,7 +249,7 @@ CompletionStamp CommandStreamReceiverHw::flushBcsTask(LinearStream &c bool submitCSR = (commandStreamStartCSR != commandStreamCSR.getUsed()); void *bbEndLocation = nullptr; - programEndingCmd(commandStreamTask, &bbEndLocation, isBlitterDirectSubmissionEnabled(), dispatchBcsFlags.hasRelaxedOrderingDependencies); + programEndingCmd(commandStreamTask, &bbEndLocation, isBlitterDirectSubmissionEnabled(), dispatchBcsFlags.hasRelaxedOrderingDependencies, true); EncodeNoop::alignToCacheLine(commandStreamTask); if (submitCSR) { @@ -1022,7 +1022,7 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert } if (isRelaxedOrderingDispatch) { - RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(commandStream); + RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(commandStream, true); } NEO::EncodeDummyBlitWaArgs waArgs{false, rootDeviceEnvironment.get()}; @@ -1109,7 +1109,7 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert } void *endingCmdPtr = nullptr; - programEndingCmd(commandStream, &endingCmdPtr, blitterDirectSubmission, isRelaxedOrderingDispatch); + programEndingCmd(commandStream, &endingCmdPtr, blitterDirectSubmission, isRelaxedOrderingDispatch, true); EncodeNoop::alignToCacheLine(commandStream); @@ -1240,7 +1240,7 @@ SubmissionStatus CommandStreamReceiverHw::flushPipeControl(bool state template SubmissionStatus CommandStreamReceiverHw::flushSmallTask(LinearStream &commandStreamTask, size_t commandStreamStartTask) { void *endingCmdPtr = nullptr; - programEndingCmd(commandStreamTask, &endingCmdPtr, isAnyDirectSubmissionEnabled(), false); + programEndingCmd(commandStreamTask, &endingCmdPtr, isAnyDirectSubmissionEnabled(), false, EngineHelpers::isBcs(this->osContext->getEngineType())); auto bytesToPad = EncodeBatchBufferStartOrEnd::getBatchBufferStartSize() - EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); @@ -1328,7 +1328,7 @@ inline void CommandStreamReceiverHw::programEpilogue(LinearStream &cs addBatchBufferStart(reinterpret_cast(*batchBufferEndLocation), gpuAddress, false); this->programEpliogueCommands(csr, dispatchFlags); - programEndingCmd(csr, batchBufferEndLocation, isDirectSubmissionEnabled(), false); + programEndingCmd(csr, batchBufferEndLocation, isDirectSubmissionEnabled(), false, EngineHelpers::isBcs(this->osContext->getEngineType())); EncodeNoop::alignToCacheLine(csr); } } @@ -2190,7 +2190,7 @@ void CommandStreamReceiverHw::dispatchImmediateFlushClientBufferComma makeResident(*immediateCommandStream.getGraphicsAllocation()); - programEndingCmd(immediateCommandStream, &flushData.endPtr, isDirectSubmissionEnabled(), dispatchFlags.hasRelaxedOrderingDependencies); + programEndingCmd(immediateCommandStream, &flushData.endPtr, isDirectSubmissionEnabled(), dispatchFlags.hasRelaxedOrderingDependencies, EngineHelpers::isBcs(this->osContext->getEngineType())); EncodeNoop::alignToCacheLine(immediateCommandStream); } @@ -2293,7 +2293,7 @@ inline BatchBuffer CommandStreamReceiverHw::prepareBatchBufferForSubm // If the CSR has work in its CS, flush it before the task if (submitTask) { - programEndingCmd(commandStreamTask, &bbEndLocation, directSubmissionEnabled, dispatchFlags.hasRelaxedOrderingDependencies); + programEndingCmd(commandStreamTask, &bbEndLocation, directSubmissionEnabled, dispatchFlags.hasRelaxedOrderingDependencies, EngineHelpers::isBcs(this->osContext->getEngineType())); EncodeNoop::emitNoop(commandStreamTask, bbEndPaddingSize); EncodeNoop::alignToCacheLine(commandStreamTask); @@ -2306,7 +2306,7 @@ inline BatchBuffer CommandStreamReceiverHw::prepareBatchBufferForSubm this->programEpilogue(commandStreamCSR, device, &bbEndLocation, dispatchFlags); } else if (submitCSR) { - programEndingCmd(commandStreamCSR, &bbEndLocation, directSubmissionEnabled, dispatchFlags.hasRelaxedOrderingDependencies); + programEndingCmd(commandStreamCSR, &bbEndLocation, directSubmissionEnabled, dispatchFlags.hasRelaxedOrderingDependencies, EngineHelpers::isBcs(this->osContext->getEngineType())); EncodeNoop::emitNoop(commandStreamCSR, bbEndPaddingSize); EncodeNoop::alignToCacheLine(commandStreamCSR); DEBUG_BREAK_IF(commandStreamCSR.getUsed() > commandStreamCSR.getMaxAvailableSpace()); diff --git a/shared/source/debugger/debugger_l0_tgllp_and_later.inl b/shared/source/debugger/debugger_l0_tgllp_and_later.inl index b9880a88bb..eb062d7aa6 100644 --- a/shared/source/debugger/debugger_l0_tgllp_and_later.inl +++ b/shared/source/debugger/debugger_l0_tgllp_and_later.inl @@ -102,8 +102,8 @@ void DebuggerL0Hw::programSbaTrackingCommandsSingleAddressSpace(NEO:: const auto gmmHelper = device->getGmmHelper(); const auto gpuVaOfDataDWORD1 = gmmHelper->decanonize(gpuVaOfData + 4); - NEO::EncodeStoreMMIO::encode(miStoreRegMemLow, RegisterOffsets::csGprR1, gpuVaOfAddress, false); - NEO::EncodeStoreMMIO::encode(miStoreRegMemHigh, RegisterOffsets::csGprR1 + 4, gpuVaOfAddress + 4, false); + NEO::EncodeStoreMMIO::encode(miStoreRegMemLow, RegisterOffsets::csGprR1, gpuVaOfAddress, false, false); + NEO::EncodeStoreMMIO::encode(miStoreRegMemHigh, RegisterOffsets::csGprR1 + 4, gpuVaOfAddress + 4, false, false); MI_STORE_DATA_IMM setSbaBufferAddress = GfxFamily::cmdInitStoreDataImm; gpuVaOfData = gmmHelper->decanonize(gpuVaOfData); diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index 8509103592..ad1bc0b168 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -124,27 +124,27 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER); - bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); + bool isBcs = EngineHelpers::isBcs(this->osContext.getEngineType()); // 1. Init section { EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9); - EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4); + EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9, isBcs); + EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4, isBcs); - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcsEngine); + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcs); uint64_t removeTaskVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::removeTaskSectionStart; - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3, static_cast(removeTaskVa & 0xFFFF'FFFFULL), true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3 + 4, static_cast(removeTaskVa >> 32), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3, static_cast(removeTaskVa & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3 + 4, static_cast(removeTaskVa >> 32), true, isBcs); uint64_t walkersLoopConditionCheckVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::tasksListLoopCheckSectionStart; - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4, static_cast(walkersLoopConditionCheckVa & 0xFFFF'FFFFULL), true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4 + 4, static_cast(walkersLoopConditionCheckVa >> 32), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4, static_cast(walkersLoopConditionCheckVa & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4 + 4, static_cast(walkersLoopConditionCheckVa >> 32), true, isBcs); } // 2. Dispatch task section (loop start) @@ -153,11 +153,11 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6, 8, true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6 + 4, 0, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6, 8, true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6 + 4, 0, true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs); EncodeAluHelper aluHelper; aluHelper.setMocs(miMathMocs); @@ -183,19 +183,19 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr1, isBcsEngine); - EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr2, isBcsEngine); + EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr1, isBcs); + EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr2, isBcs); - EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9); - EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4); + EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9, isBcs); + EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4, isBcs); - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcsEngine); + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7, 8, true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7, 8, true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs); EncodeAluHelper aluHelper; aluHelper.setMocs(miMathMocs); @@ -223,15 +223,15 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - EncodeMathMMIO::encodeIncrement(schedulerCmdStream, AluRegisters::gpr2, isBcsEngine); + EncodeMathMMIO::encodeIncrement(schedulerCmdStream, AluRegisters::gpr2, isBcs); EncodeBatchBufferStartOrEnd::programConditionalRegRegBatchBufferStart( schedulerCmdStream, loopSectionStartAddress, - AluRegisters::gpr1, AluRegisters::gpr2, CompareOperation::notEqual, false); + AluRegisters::gpr1, AluRegisters::gpr2, CompareOperation::notEqual, false, isBcs); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcs); } // 5. Drain request section @@ -251,20 +251,20 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart( schedulerCmdStream, loopSectionStartAddress, - RegisterOffsets::csGprR1, currentRelaxedOrderingQueueSize, CompareOperation::greaterOrEqual, false, false, isBcsEngine); + RegisterOffsets::csGprR1, currentRelaxedOrderingQueueSize, CompareOperation::greaterOrEqual, false, false, isBcs); EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart( schedulerCmdStream, loopSectionStartAddress, - RegisterOffsets::csGprR5, 1, CompareOperation::equal, false, false, isBcsEngine); + RegisterOffsets::csGprR5, 1, CompareOperation::equal, false, false, isBcs); } // 6. Scheduler loop check section { UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::schedulerLoopCheckSectionStart); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10, static_cast(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionSize), true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10 + 4, 0, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10, static_cast(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionSize), true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10 + 4, 0, true, isBcs); EncodeAluHelper aluHelper; aluHelper.setMocs(miMathMocs); @@ -274,7 +274,7 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch aluHelper.setNextAlu(AluRegisters::opcodeStore, AluRegisters::gpr0, AluRegisters::accu); aluHelper.copyToCmdStream(schedulerCmdStream); - EncodeBatchBufferStartOrEnd::programConditionalRegMemBatchBufferStart(schedulerCmdStream, 0, semaphoreGpuVa, RegisterOffsets::csGprR11, CompareOperation::greaterOrEqual, true, isBcsEngine); + EncodeBatchBufferStartOrEnd::programConditionalRegMemBatchBufferStart(schedulerCmdStream, 0, semaphoreGpuVa, RegisterOffsets::csGprR11, CompareOperation::greaterOrEqual, true, isBcs); EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerCmdStream, schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::loopStartSectionStart, false, false, false); @@ -293,10 +293,10 @@ void DirectSubmissionHw::dispatchRelaxedOrderingScheduler uint64_t semaphoreSectionVa = schedulerStartVa + RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionStart; - bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR11, value, true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9, static_cast(semaphoreSectionVa & 0xFFFF'FFFFULL), true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9 + 4, static_cast(semaphoreSectionVa >> 32), true, isBcsEngine); + bool isBcs = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR11, value, true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9, static_cast(semaphoreSectionVa & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9 + 4, static_cast(semaphoreSectionVa >> 32), true, isBcs); schedulerCmdStream.getSpace(sizeof(typename GfxFamily::MI_BATCH_BUFFER_START)); // skip patching @@ -769,13 +769,13 @@ void DirectSubmissionHw::dispatchRelaxedOrderingQueueStal LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false)), EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false)); - bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 1, true, isBcsEngine); + bool isBcs = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 1, true, isBcs); dispatchSemaphoreSection(currentQueueWorkCount); // patch conditional bb_start with current GPU address EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(), - RegisterOffsets::csGprR1, 0, CompareOperation::equal, false, false, isBcsEngine); + RegisterOffsets::csGprR1, 0, CompareOperation::equal, false, false, isBcs); relaxedOrderingSchedulerRequired = false; } @@ -789,26 +789,26 @@ size_t DirectSubmissionHw::getSizeDispatchRelaxedOrdering template void DirectSubmissionHw::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) { - bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR4, static_cast(returnPtr & 0xFFFF'FFFFULL), true, isBcsEngine); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR4 + 4, static_cast(returnPtr >> 32), true, isBcsEngine); + bool isBcs = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR4, static_cast(returnPtr & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR4 + 4, static_cast(returnPtr >> 32), true, isBcs); uint64_t returnPtrAfterTaskStoreSection = returnPtr; returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection(); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR3, static_cast(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true, isBcsEngine); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR3 + 4, static_cast(returnPtrAfterTaskStoreSection >> 32), true, isBcsEngine); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR3, static_cast(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR3 + 4, static_cast(returnPtrAfterTaskStoreSection >> 32), true, isBcs); } template void DirectSubmissionHw::initRelaxedOrderingRegisters() { - bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1, 0, true, isBcsEngine); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1 + 4, 0, true, isBcsEngine); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 0, true, isBcsEngine); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5 + 4, 0, true, isBcsEngine); + bool isBcs = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1, 0, true, isBcs); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1 + 4, 0, true, isBcs); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 0, true, isBcs); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5 + 4, 0, true, isBcs); } template @@ -822,17 +822,17 @@ void DirectSubmissionHw::preinitializeRelaxedOrderingSect uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress(); - bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); - LriHelper::program(&stream, RegisterOffsets::csGprR6, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcsEngine); - LriHelper::program(&stream, RegisterOffsets::csGprR6 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcsEngine); + bool isBcs = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&stream, RegisterOffsets::csGprR6, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcs); + LriHelper::program(&stream, RegisterOffsets::csGprR6 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcs); // Task start VA - LriHelper::program(&stream, RegisterOffsets::csGprR7, 0, true, isBcsEngine); - LriHelper::program(&stream, RegisterOffsets::csGprR7 + 4, 0, true, isBcsEngine); + LriHelper::program(&stream, RegisterOffsets::csGprR7, 0, true, isBcs); + LriHelper::program(&stream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); // Shift by 8 = multiply by 256. Address must by 64b aligned (shift by 6), but SHL accepts only 1, 2, 4, 8, 16 and 32 - LriHelper::program(&stream, RegisterOffsets::csGprR8, 8, true, isBcsEngine); - LriHelper::program(&stream, RegisterOffsets::csGprR8 + 4, 0, true, isBcsEngine); + LriHelper::program(&stream, RegisterOffsets::csGprR8, 8, true, isBcs); + LriHelper::program(&stream, RegisterOffsets::csGprR8 + 4, 0, true, isBcs); const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER); @@ -850,7 +850,7 @@ void DirectSubmissionHw::preinitializeRelaxedOrderingSect aluHelper.copyToCmdStream(stream); - EncodeMathMMIO::encodeIncrement(stream, AluRegisters::gpr1, isBcsEngine); + EncodeMathMMIO::encodeIncrement(stream, AluRegisters::gpr1, isBcs); UNRECOVERABLE_IF(stream.getUsed() != RelaxedOrderingHelper::getSizeTaskStoreSection()); @@ -861,9 +861,9 @@ void DirectSubmissionHw::preinitializeRelaxedOrderingSect uint64_t schedulerStartAddress = relaxedOrderingSchedulerAllocation->getGpuAddress(); // 1. Init section - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR11, 0, true, isBcsEngine); - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9, 0, true, isBcsEngine); - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9 + 4, 0, true, isBcsEngine); + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR11, 0, true, isBcs); + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9, 0, true, isBcs); + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9 + 4, 0, true, isBcs); EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerStream, schedulerStartAddress, false, false, false); // 2. Semaphore section @@ -879,7 +879,7 @@ void DirectSubmissionHw::preinitializeRelaxedOrderingSect { EncodeMiPredicate::encode(schedulerStream, MiPredicateType::disable); - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR5, 0, true, isBcsEngine); + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR5, 0, true, isBcs); } UNRECOVERABLE_IF(schedulerStream.getUsed() != RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); diff --git a/shared/source/direct_submission/relaxed_ordering_helper.h b/shared/source/direct_submission/relaxed_ordering_helper.h index 8af189f436..2d39c201bf 100644 --- a/shared/source/direct_submission/relaxed_ordering_helper.h +++ b/shared/source/direct_submission/relaxed_ordering_helper.h @@ -20,10 +20,10 @@ static constexpr uint32_t queueSizeMultiplier = 4; static constexpr uint32_t maxQueueSize = 16; template -void encodeRegistersBeforeDependencyCheckers(LinearStream &cmdStream) { +void encodeRegistersBeforeDependencyCheckers(LinearStream &cmdStream, bool isBcs) { // Indirect BB_START operates only on GPR_0 - EncodeSetMMIO::encodeREG(cmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR4); - EncodeSetMMIO::encodeREG(cmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR4 + 4); + EncodeSetMMIO::encodeREG(cmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR4, isBcs); + EncodeSetMMIO::encodeREG(cmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR4 + 4, isBcs); } template diff --git a/shared/source/helpers/blit_commands_helper_base.inl b/shared/source/helpers/blit_commands_helper_base.inl index 66d60c4584..e9686facd8 100644 --- a/shared/source/helpers/blit_commands_helper_base.inl +++ b/shared/source/helpers/blit_commands_helper_base.inl @@ -520,8 +520,8 @@ void BlitCommandsHelper::encodeProfilingStartMmios(LinearStream &cmdS auto timestampContextStartGpuAddress = TimestampPacketHelper::getContextStartGpuAddress(timestampPacketNode); auto timestampGlobalStartAddress = TimestampPacketHelper::getGlobalStartGpuAddress(timestampPacketNode); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextStartGpuAddress, false, nullptr); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampLdw, timestampGlobalStartAddress, false, nullptr); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextStartGpuAddress, false, nullptr, true); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampLdw, timestampGlobalStartAddress, false, nullptr, true); } template @@ -529,8 +529,8 @@ void BlitCommandsHelper::encodeProfilingEndMmios(LinearStream &cmdStr auto timestampContextEndGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(timestampPacketNode); auto timestampGlobalEndAddress = TimestampPacketHelper::getGlobalEndGpuAddress(timestampPacketNode); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextEndGpuAddress, false, nullptr); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampLdw, timestampGlobalEndAddress, false, nullptr); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextEndGpuAddress, false, nullptr, true); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampLdw, timestampGlobalEndAddress, false, nullptr, true); } template diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index ad31d6928f..80e0b072ea 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -504,7 +504,7 @@ struct MemorySynchronizationCommands { static bool isBarrierPriorToPipelineSelectWaRequired(const RootDeviceEnvironment &rootDeviceEnvironment); static void setBarrierExtraProperties(void *barrierCmd, PipeControlArgs &args); - static void encodeAdditionalTimestampOffsets(LinearStream &commandStream, uint64_t contextAddress, uint64_t globalAddress); + static void encodeAdditionalTimestampOffsets(LinearStream &commandStream, uint64_t contextAddress, uint64_t globalAddress, bool isBcs); }; } // namespace NEO diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index 052031738d..04bd04c75f 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -804,7 +804,7 @@ void *LriHelper::program(LinearStream *cmdStream, uint32_t address, u } template -void MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(LinearStream &commandStream, uint64_t contextAddress, uint64_t globalAddress) { +void MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(LinearStream &commandStream, uint64_t contextAddress, uint64_t globalAddress, bool isBcs) { } template diff --git a/shared/source/helpers/gfx_core_helper_xe2_and_later.inl b/shared/source/helpers/gfx_core_helper_xe2_and_later.inl index 2671a00403..afa53188b8 100644 --- a/shared/source/helpers/gfx_core_helper_xe2_and_later.inl +++ b/shared/source/helpers/gfx_core_helper_xe2_and_later.inl @@ -31,9 +31,9 @@ bool GfxCoreHelperHw::isTimestampShiftRequired() const { } template <> -void MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(LinearStream &commandStream, uint64_t contextAddress, uint64_t globalAddress) { - EncodeStoreMMIO::encode(commandStream, RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, contextAddress + sizeof(uint32_t), false, nullptr); - EncodeStoreMMIO::encode(commandStream, RegisterOffsets::globalTimestampUn, globalAddress + sizeof(uint32_t), false, nullptr); +void MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(LinearStream &commandStream, uint64_t contextAddress, uint64_t globalAddress, bool isBcs) { + EncodeStoreMMIO::encode(commandStream, RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, contextAddress + sizeof(uint32_t), false, nullptr, isBcs); + EncodeStoreMMIO::encode(commandStream, RegisterOffsets::globalTimestampUn, globalAddress + sizeof(uint32_t), false, nullptr, isBcs); } } // namespace NEO diff --git a/shared/source/xe2_hpg_core/command_stream_receiver_hw_xe2_hpg_core.cpp b/shared/source/xe2_hpg_core/command_stream_receiver_hw_xe2_hpg_core.cpp index 09d7ef6199..53cdd05c97 100644 --- a/shared/source/xe2_hpg_core/command_stream_receiver_hw_xe2_hpg_core.cpp +++ b/shared/source/xe2_hpg_core/command_stream_receiver_hw_xe2_hpg_core.cpp @@ -247,11 +247,11 @@ void BlitCommandsHelper::encodeProfilingStartMmios(LinearStream &cmdStre auto timestampContextStartGpuAddress = TimestampPacketHelper::getContextStartGpuAddress(timestampPacketNode); auto timestampGlobalStartAddress = TimestampPacketHelper::getGlobalStartGpuAddress(timestampPacketNode); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, timestampContextStartGpuAddress + sizeof(uint32_t), false, nullptr); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampUn, timestampGlobalStartAddress + sizeof(uint32_t), false, nullptr); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, timestampContextStartGpuAddress + sizeof(uint32_t), false, nullptr, true); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampUn, timestampGlobalStartAddress + sizeof(uint32_t), false, nullptr, true); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextStartGpuAddress, false, nullptr); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampLdw, timestampGlobalStartAddress, false, nullptr); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextStartGpuAddress, false, nullptr, true); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampLdw, timestampGlobalStartAddress, false, nullptr, true); } template <> @@ -259,11 +259,11 @@ void BlitCommandsHelper::encodeProfilingEndMmios(LinearStream &cmdStream auto timestampContextEndGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(timestampPacketNode); auto timestampGlobalEndAddress = TimestampPacketHelper::getGlobalEndGpuAddress(timestampPacketNode); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, timestampContextEndGpuAddress + sizeof(uint32_t), false, nullptr); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampUn, timestampGlobalEndAddress + sizeof(uint32_t), false, nullptr); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetHigh, timestampContextEndGpuAddress + sizeof(uint32_t), false, nullptr, true); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampUn, timestampGlobalEndAddress + sizeof(uint32_t), false, nullptr, true); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextEndGpuAddress, false, nullptr); - EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampLdw, timestampGlobalEndAddress, false, nullptr); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, timestampContextEndGpuAddress, false, nullptr, true); + EncodeStoreMMIO::encode(cmdStream, RegisterOffsets::globalTimestampLdw, timestampGlobalEndAddress, false, nullptr, true); } template <> diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp index a44c7d82f5..208d9476c6 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_1.cpp @@ -750,7 +750,7 @@ HWTEST_F(DirectSubmissionTest, givenDirectSubmissionAvailableWhenProgrammingEndi uint8_t buffer[128]; mockCsr->commandStream.replaceBuffer(&buffer[0], 128u); mockCsr->commandStream.replaceGraphicsAllocation(&mockAllocation); - mockCsr->programEndingCmd(mockCsr->commandStream, &location, ret, false); + mockCsr->programEndingCmd(mockCsr->commandStream, &location, ret, false, false); EXPECT_EQ(sizeof(MI_BATCH_BUFFER_START), mockCsr->commandStream.getUsed()); DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); @@ -794,7 +794,7 @@ HWTEST_F(DirectSubmissionTest, givenDebugFlagSetWhenProgrammingEndingCommandThen auto currectBbStartCmd = reinterpret_cast(cmdStream.getSpace(0)); uint64_t expectedGpuVa = cmdStream.getGraphicsAllocation()->getGpuAddress() + cmdStream.getUsed(); - mockCsr->programEndingCmd(cmdStream, &location, ret, false); + mockCsr->programEndingCmd(cmdStream, &location, ret, false, false); EncodeNoop::alignToCacheLine(cmdStream); if (value == 0) { diff --git a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp index 30a159d70a..37b1aaa721 100644 --- a/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp +++ b/shared/test/unit_test/direct_submission/direct_submission_tests_2.cpp @@ -2190,7 +2190,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBcsRelaxedOrderingEnabledWh auto endingPtr = commandStream.getSpace(0); - ultCsr->programEndingCmd(commandStream, &endingPtr, true, true); + ultCsr->programEndingCmd(commandStream, &endingPtr, true, true, true); HardwareParse hwParser; hwParser.parseCommands(commandStream, 0); @@ -2227,7 +2227,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBcsRelaxedOrderingDisabledW auto &commandStream = ultCsr->getCS(0x100); auto endingPtr = commandStream.getSpace(0); - ultCsr->programEndingCmd(commandStream, &endingPtr, true, false); + ultCsr->programEndingCmd(commandStream, &endingPtr, true, false, true); HardwareParse hwParser; hwParser.parseCommands(commandStream, 0); @@ -2255,7 +2255,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, whenProgrammingEndingCmdsThenSet auto endingPtr = commandStream.getSpace(0); - ultCsr->programEndingCmd(commandStream, &endingPtr, true, true); + ultCsr->programEndingCmd(commandStream, &endingPtr, true, true, true); HardwareParse hwParser; hwParser.parseCommands(commandStream, 0); @@ -2289,7 +2289,7 @@ HWTEST2_F(DirectSubmissionRelaxedOrderingTests, givenBbWithoutRelaxedOrderingDep auto endingPtr = commandStream.getSpace(0); - ultCsr->programEndingCmd(commandStream, &endingPtr, true, false); + ultCsr->programEndingCmd(commandStream, &endingPtr, true, false, false); HardwareParse hwParser; hwParser.parseCommands(commandStream, 0); diff --git a/shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp b/shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp index 42a96bada5..a559ee77ba 100644 --- a/shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp @@ -119,7 +119,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterHardwareCommandsTest, givenWorkloadPart offset, gpuAddress, true, - nullptr); + nullptr, + false); auto storeRegMem = genCmdCast(buffer); ASSERT_NE(nullptr, storeRegMem); @@ -131,7 +132,8 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterHardwareCommandsTest, givenWorkloadPart offset, gpuAddress, true, - &outCmdBuffer); + &outCmdBuffer, + false); storeRegMem = genCmdCast(ptrOffset(buffer, beforeEncode)); ASSERT_NE(nullptr, storeRegMem); diff --git a/shared/test/unit_test/encoders/test_encode_pvc_and_later.cpp b/shared/test/unit_test/encoders/test_encode_pvc_and_later.cpp index 107aa42484..219f521555 100644 --- a/shared/test/unit_test/encoders/test_encode_pvc_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_pvc_and_later.cpp @@ -284,7 +284,7 @@ HWTEST2_F(EncodeConditionalBatchBufferStartTest, whenProgrammingConditionalRegRe uint8_t buffer[expectedSize] = {}; LinearStream cmdStream(buffer, expectedSize); - EncodeBatchBufferStartOrEnd::programConditionalRegRegBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareReg1, compareReg2, compareOperation, indirect); + EncodeBatchBufferStartOrEnd::programConditionalRegRegBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareReg1, compareReg2, compareOperation, indirect, false); EXPECT_EQ(expectedSize, cmdStream.getUsed()); diff --git a/shared/test/unit_test/encoders/test_encode_set_mmio.cpp b/shared/test/unit_test/encoders/test_encode_set_mmio.cpp index f07a2c0bd7..24ce8fa61a 100644 --- a/shared/test/unit_test/encoders/test_encode_set_mmio.cpp +++ b/shared/test/unit_test/encoders/test_encode_set_mmio.cpp @@ -47,7 +47,7 @@ HWTEST_F(CommandSetMMIOTest, WhenProgrammingThenLoadRegisterImmIsUsed) { } HWTEST_F(CommandSetMMIOTest, WhenProgrammingThenLoadRegisterMemIsUsed) { - EncodeSetMMIO::encodeMEM(*cmdContainer.get(), 0x2000, 0xDEADBEEFCAF0); + EncodeSetMMIO::encodeMEM(*cmdContainer.get(), 0x2000, 0xDEADBEEFCAF0, false); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); @@ -63,7 +63,7 @@ HWTEST_F(CommandSetMMIOTest, WhenProgrammingThenLoadRegisterMemIsUsed) { } HWTEST_F(CommandSetMMIOTest, WhenProgrammingThenLoadRegisterRegIsUsed) { - EncodeSetMMIO::encodeREG(*cmdContainer.get(), 0x2000, 0x2000); + EncodeSetMMIO::encodeREG(*cmdContainer.get(), 0x2000, 0x2000, false); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); @@ -99,7 +99,7 @@ HWTEST2_F(CommandSetMMIOTest, givenRegisterWithinRemapRangeWhenEncodingLoadingMM for (uint32_t offset = remapApplicableOffsets[2 * i]; offset < remapApplicableOffsets[2 * i + 1]; offset += 32) { MI_LOAD_REGISTER_MEM *miLoadReg = reinterpret_cast(cmdContainer->getCommandStream()->getSpace(0)); - EncodeSetMMIO::encodeMEM(*cmdContainer.get(), offset, 0xDEADBEEFCAF0); + EncodeSetMMIO::encodeMEM(*cmdContainer.get(), offset, 0xDEADBEEFCAF0, false); EXPECT_EQ(offset, miLoadReg->getRegisterAddress()); EXPECT_EQ(0xDEADBEEFCAF0u, miLoadReg->getMemoryAddress()); @@ -109,7 +109,7 @@ HWTEST2_F(CommandSetMMIOTest, givenRegisterWithinRemapRangeWhenEncodingLoadingMM { MI_LOAD_REGISTER_MEM *miLoadReg = reinterpret_cast(cmdContainer->getCommandStream()->getSpace(0)); - EncodeSetMMIO::encodeMEM(*cmdContainer.get(), 0x3000, 0xDEADBEEFCAF0); + EncodeSetMMIO::encodeMEM(*cmdContainer.get(), 0x3000, 0xDEADBEEFCAF0, false); EXPECT_EQ(0x3000u, miLoadReg->getRegisterAddress()); EXPECT_EQ(0xDEADBEEFCAF0u, miLoadReg->getMemoryAddress()); @@ -117,7 +117,7 @@ HWTEST2_F(CommandSetMMIOTest, givenRegisterWithinRemapRangeWhenEncodingLoadingMM } { MI_LOAD_REGISTER_MEM *miLoadReg = reinterpret_cast(cmdContainer->getCommandStream()->getSpace(0)); - EncodeSetMMIO::encodeMEM(*cmdContainer.get(), 0x4300, 0xDEADBEEFCAF0); + EncodeSetMMIO::encodeMEM(*cmdContainer.get(), 0x4300, 0xDEADBEEFCAF0, false); EXPECT_EQ(0x4300u, miLoadReg->getRegisterAddress()); EXPECT_EQ(0xDEADBEEFCAF0u, miLoadReg->getMemoryAddress()); @@ -125,7 +125,7 @@ HWTEST2_F(CommandSetMMIOTest, givenRegisterWithinRemapRangeWhenEncodingLoadingMM } { MI_LOAD_REGISTER_MEM *miLoadReg = reinterpret_cast(cmdContainer->getCommandStream()->getSpace(0)); - EncodeSetMMIO::encodeMEM(*cmdContainer.get(), 0x5000, 0xDEADBEEFCAF0); + EncodeSetMMIO::encodeMEM(*cmdContainer.get(), 0x5000, 0xDEADBEEFCAF0, false); EXPECT_EQ(0x5000u, miLoadReg->getRegisterAddress()); EXPECT_EQ(0xDEADBEEFCAF0u, miLoadReg->getMemoryAddress()); @@ -143,7 +143,7 @@ HWTEST2_F(CommandSetMMIOTest, givenRegisterWithinRemapRangeWhenEncodingLoadingMM for (int i = 0; i < 3; i++) { for (uint32_t offset = remapApplicableOffsets[2 * i]; offset < remapApplicableOffsets[2 * i + 1]; offset += 32) { MI_LOAD_REGISTER_REG *miLoadReg = reinterpret_cast(cmdContainer->getCommandStream()->getSpace(0)); - EncodeSetMMIO::encodeREG(*cmdContainer.get(), offset, offset); + EncodeSetMMIO::encodeREG(*cmdContainer.get(), offset, offset, false); EXPECT_EQ(offset, miLoadReg->getSourceRegisterAddress()); EXPECT_EQ(offset, miLoadReg->getDestinationRegisterAddress()); @@ -154,7 +154,7 @@ HWTEST2_F(CommandSetMMIOTest, givenRegisterWithinRemapRangeWhenEncodingLoadingMM { MI_LOAD_REGISTER_REG *miLoadReg = reinterpret_cast(cmdContainer->getCommandStream()->getSpace(0)); - EncodeSetMMIO::encodeREG(*cmdContainer.get(), 0x1000, 0x2500); + EncodeSetMMIO::encodeREG(*cmdContainer.get(), 0x1000, 0x2500, false); EXPECT_EQ(0x2500u, miLoadReg->getSourceRegisterAddress()); EXPECT_EQ(0x1000u, miLoadReg->getDestinationRegisterAddress()); @@ -163,7 +163,7 @@ HWTEST2_F(CommandSetMMIOTest, givenRegisterWithinRemapRangeWhenEncodingLoadingMM } { MI_LOAD_REGISTER_REG *miLoadReg = reinterpret_cast(cmdContainer->getCommandStream()->getSpace(0)); - EncodeSetMMIO::encodeREG(*cmdContainer.get(), 0x2200, 0x4000); + EncodeSetMMIO::encodeREG(*cmdContainer.get(), 0x2200, 0x4000, false); EXPECT_EQ(0x4000u, miLoadReg->getSourceRegisterAddress()); EXPECT_EQ(0x2200u, miLoadReg->getDestinationRegisterAddress()); diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index a56cf23388..7a7b005c9c 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -1902,7 +1902,7 @@ HWTEST_F(GfxCoreHelperTest, whenEncodeAdditionalTimestampOffsetsThenNothingEncod LinearStream stream(streamBuffer, bufferSize); uint64_t fstAddress = 0; uint64_t sndAddress = 0; - MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(stream, fstAddress, sndAddress); + MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(stream, fstAddress, sndAddress, false); HardwareParse hwParser; hwParser.parseCommands(stream, 0); diff --git a/shared/test/unit_test/helpers/gfx_core_helper_xe2_and_later.cpp b/shared/test/unit_test/helpers/gfx_core_helper_xe2_and_later.cpp index fe6ab84a2c..ec617a7a73 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_xe2_and_later.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_xe2_and_later.cpp @@ -89,7 +89,7 @@ HWTEST2_F(GfxCoreHelperXe2AndLaterTests, givenAtLeastXe2HpgWhenEncodeAdditionalT LinearStream stream(streamBuffer, bufferSize); uint64_t fstAddress = 12; uint64_t sndAddress = 100; - MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(stream, fstAddress, sndAddress); + MemorySynchronizationCommands::encodeAdditionalTimestampOffsets(stream, fstAddress, sndAddress, false); HardwareParse hwParser; hwParser.parseCommands(stream, 0);