diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 5c2600ebb1..ed7f65413e 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -154,7 +154,7 @@ struct CommandList : _ze_command_list_handle_t { const size_t *pOffsets, ze_event_handle_t hSignalEvent, uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents) = 0; - virtual ze_result_t appendMILoadRegImm(uint32_t reg, uint32_t value) = 0; + virtual ze_result_t appendMILoadRegImm(uint32_t reg, uint32_t value, bool isBcs) = 0; virtual ze_result_t appendMILoadRegReg(uint32_t reg1, uint32_t reg2) = 0; virtual ze_result_t appendMILoadRegMem(uint32_t reg1, uint64_t address) = 0; virtual ze_result_t appendMIStoreRegMem(uint32_t reg1, uint64_t address) = 0; diff --git a/level_zero/core/source/cmdlist/cmdlist_extended/cmdlist_extended.inl b/level_zero/core/source/cmdlist/cmdlist_extended/cmdlist_extended.inl index 425d577ee9..f201dc2216 100644 --- a/level_zero/core/source/cmdlist/cmdlist_extended/cmdlist_extended.inl +++ b/level_zero/core/source/cmdlist/cmdlist_extended/cmdlist_extended.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -14,7 +14,7 @@ namespace L0 { template -ze_result_t CommandListCoreFamily::appendMILoadRegImm(uint32_t reg, uint32_t value) { +ze_result_t CommandListCoreFamily::appendMILoadRegImm(uint32_t reg, uint32_t value, bool isBcs) { return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; } template diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 004cd39a09..e16cb2beca 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -162,7 +162,7 @@ struct CommandListCoreFamily : public CommandListImp { uint32_t numWaitEvents, ze_event_handle_t *phWaitEvents, bool relaxedOrderingDispatch) override; - ze_result_t appendMILoadRegImm(uint32_t reg, uint32_t value) override; + ze_result_t appendMILoadRegImm(uint32_t reg, uint32_t value, bool isBcs) override; ze_result_t appendMILoadRegReg(uint32_t reg1, uint32_t reg2) override; ze_result_t appendMILoadRegMem(uint32_t reg1, uint64_t address) override; ze_result_t appendMIStoreRegMem(uint32_t reg1, uint64_t address) override; diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 847abde509..feee9f0c6e 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -543,7 +543,7 @@ ze_result_t CommandListCoreFamily::appendLaunchMultipleKernelsInd commandContainer.addToResidencyContainer(alloc); for (uint32_t i = 0; i < numKernels; i++) { - NEO::EncodeMathMMIO::encodeGreaterThanPredicate(commandContainer, alloc->getGpuAddress(), i); + NEO::EncodeMathMMIO::encodeGreaterThanPredicate(commandContainer, alloc->getGpuAddress(), i, isCopyOnly()); ret = appendLaunchKernelWithParams(Kernel::fromHandle(kernelHandles[i]), pLaunchArgumentsBuffer[i], @@ -2518,7 +2518,7 @@ void CommandListCoreFamily::appendWaitOnInOrderDependency(std::sh for (uint32_t i = 0; i < inOrderExecInfo->getNumDevicePartitionsToWait(); i++) { if (relaxedOrderingAllowed) { - NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::less, true, isQwordInOrderCounter()); + NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddress, waitValue, NEO::CompareOperation::less, true, isQwordInOrderCounter(), isCopyOnly()); } else { using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; @@ -2784,8 +2784,8 @@ void CommandListCoreFamily::appendWriteKernelTimestamp(Event *eve uint64_t contextAddress = ptrOffset(baseAddr, contextOffset); if (maskLsb) { - NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::globalTimestampLdw, mask, globalAddress, workloadPartition, globalPostSyncCmdBuffer); - NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, mask, contextAddress, workloadPartition, contextPostSyncCmdBuffer); + NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::globalTimestampLdw, mask, globalAddress, workloadPartition, globalPostSyncCmdBuffer, isCopyOnly()); + NEO::EncodeMathMMIO::encodeBitwiseAndVal(commandContainer, RegisterOffsets::gpThreadTimeRegAddressOffsetLow, mask, contextAddress, workloadPartition, contextPostSyncCmdBuffer, isCopyOnly()); } else { NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::globalTimestampLdw, globalAddress, workloadPartition, globalPostSyncCmdBuffer); NEO::EncodeStoreMMIO::encode(*commandContainer.getCommandStream(), RegisterOffsets::gpThreadTimeRegAddressOffsetLow, contextAddress, workloadPartition, contextPostSyncCmdBuffer); @@ -3593,8 +3593,8 @@ ze_result_t CommandListCoreFamily::appendWaitOnMemory(void *desc, if (isQwordInOrderCounter()) { indirectMode = true; - NEO::LriHelper::program(commandContainer.getCommandStream(), RegisterOffsets::csGprR0, getLowPart(data), true); - NEO::LriHelper::program(commandContainer.getCommandStream(), RegisterOffsets::csGprR0 + 4, getHighPart(data), true); + NEO::LriHelper::program(commandContainer.getCommandStream(), RegisterOffsets::csGprR0, getLowPart(data), true, isCopyOnly()); + NEO::LriHelper::program(commandContainer.getCommandStream(), RegisterOffsets::csGprR0 + 4, getHighPart(data), true, isCopyOnly()); } else { return ZE_RESULT_ERROR_UNSUPPORTED_FEATURE; @@ -3850,7 +3850,7 @@ void CommandListCoreFamily::appendWaitOnSingleEvent(Event *event, for (uint32_t i = 0u; i < packetsToWait; i++) { if (relaxedOrderingAllowed) { NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*commandContainer.getCommandStream(), 0, gpuAddr, Event::STATE_CLEARED, - NEO::CompareOperation::equal, true, false); + NEO::CompareOperation::equal, true, false, isCopyOnly()); } else { NEO::EncodeSemaphore::addMiSemaphoreWaitCommand(*commandContainer.getCommandStream(), gpuAddr, @@ -4045,7 +4045,7 @@ void CommandListCoreFamily::appendFullSynchronizedDispatchInit() // Patch Primary Tile section skip (to Secondary Tile section) NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(skipPrimaryTileSectionCmdStream, cmdStream->getCurrentGpuAddressPosition(), workPartitionAllocationGpuVa, 0, - NEO::CompareOperation::notEqual, false, false); + NEO::CompareOperation::notEqual, false, false, isCopyOnly()); // Secondary Tile section { @@ -4059,7 +4059,7 @@ void CommandListCoreFamily::appendFullSynchronizedDispatchInit() // Patch Primary Tile section jump to end NEO::EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(jumpToEndSectionFromPrimaryTile, cmdStream->getCurrentGpuAddressPosition(), syncAllocationGpuVa + sizeof(uint32_t), queueId, - NEO::CompareOperation::equal, false, false); + NEO::CompareOperation::equal, false, false, isCopyOnly()); // End section NEO::EncodeMiPredicate::encode(*cmdStream, NEO::MiPredicateType::disable); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl index 166aa55e3d..0b5964aad0 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_xehp_and_later.inl @@ -473,13 +473,15 @@ ze_result_t CommandListCoreFamily::appendLaunchKernelWithParams(K template void CommandListCoreFamily::appendMultiPartitionPrologue(uint32_t partitionDataSize) { NEO::ImplicitScalingDispatch::dispatchOffsetRegister(*commandContainer.getCommandStream(), - partitionDataSize); + partitionDataSize, + isCopyOnly()); } template void CommandListCoreFamily::appendMultiPartitionEpilogue() { NEO::ImplicitScalingDispatch::dispatchOffsetRegister(*commandContainer.getCommandStream(), - NEO::ImplicitScalingDispatch::getImmediateWritePostSyncOffset()); + NEO::ImplicitScalingDispatch::getImmediateWritePostSyncOffset(), + isCopyOnly()); } template @@ -568,7 +570,7 @@ void CommandListCoreFamily::appendDispatchOffsetRegister(bool wor if (workloadPartitionEvent && !device->getL0GfxCoreHelper().hasUnifiedPostSyncAllocationLayout()) { auto offset = beforeProfilingCmds ? NEO::ImplicitScalingDispatch::getTimeStampPostSyncOffset() : NEO::ImplicitScalingDispatch::getImmediateWritePostSyncOffset(); - NEO::ImplicitScalingDispatch::dispatchOffsetRegister(*commandContainer.getCommandStream(), offset); + NEO::ImplicitScalingDispatch::dispatchOffsetRegister(*commandContainer.getCommandStream(), offset, isCopyOnly()); } } diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 09a6f7658c..c9f281f9e0 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -23,6 +23,7 @@ #include "shared/source/helpers/api_specific_config.h" #include "shared/source/helpers/compiler_product_helper.h" #include "shared/source/helpers/definitions/command_encoder_args.h" +#include "shared/source/helpers/engine_node_helper.h" #include "shared/source/helpers/gfx_core_helper.h" #include "shared/source/helpers/heap_base_address_model.h" #include "shared/source/helpers/pause_on_gpu_properties.h" @@ -1005,7 +1006,8 @@ void CommandQueueHw::programCommandQueueDebugCmdsForSourceLevelOr if (isDebugEnabled && !this->commandQueueDebugCmdsProgrammed) { if (this->device->getL0Debugger()) { this->device->getL0Debugger()->programSbaAddressLoad(cmdStream, - device->getL0Debugger()->getSbaTrackingBuffer(csr->getOsContext().getContextId())->getGpuAddress()); + device->getL0Debugger()->getSbaTrackingBuffer(csr->getOsContext().getContextId())->getGpuAddress(), + NEO::EngineHelpers::isBcs(this->csr->getOsContext().getEngineType())); this->commandQueueDebugCmdsProgrammed = true; } } diff --git a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h index 922181f10d..655dfa6115 100644 --- a/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h +++ b/level_zero/core/test/unit_tests/mocks/mock_cmdlist.h @@ -517,7 +517,8 @@ struct MockCommandList : public CommandList { ADDMETHOD_NOBASE(appendMILoadRegImm, ze_result_t, ZE_RESULT_SUCCESS, (uint32_t reg, - uint32_t value)); + uint32_t value, + bool isBcs)); ADDMETHOD_NOBASE(appendMILoadRegReg, ze_result_t, ZE_RESULT_SUCCESS, (uint32_t reg1, diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp index 1417756537..9a97e0ae0d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_1.cpp @@ -1857,7 +1857,7 @@ HWTEST2_F(CommandListCreate, givenInOrderExecutionWhenDispatchingRelaxedOrdering lrrCmd++; EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(lrrCmd, 0, cmdList->inOrderExecInfo->getBaseDeviceAddress(), 2, - NEO::CompareOperation::less, true, FamilyType::isQwordInOrderCounter)); + NEO::CompareOperation::less, true, FamilyType::isQwordInOrderCounter, false)); } TEST_F(CommandListCreate, GivenGpuHangWhenCreatingImmCmdListWithSyncModeAndAppendBarrierThenAppendBarrierReturnsDeviceLost) { diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp index c90b6522e9..6302b75c56 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_in_order_cmdlist.cpp @@ -1969,7 +1969,7 @@ HWTEST2_F(InOrderCmdListTests, givenRelaxedOrderingWhenProgrammingTimestampEvent auto eventEndGpuVa = events[0]->getCompletionFieldGpuAddress(device); EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(lrrCmd, 0, eventEndGpuVa, static_cast(Event::STATE_CLEARED), - NEO::CompareOperation::equal, true, false)); + NEO::CompareOperation::equal, true, false, false)); auto sdiCmd = genCmdCast(ptrOffset(lrrCmd, EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(false))); ASSERT_NE(nullptr, sdiCmd); @@ -6393,7 +6393,7 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending } // Primary Tile section skip - patching - if (!RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(primaryTileSectionSkipVa, castToUint64(miPredicate), workPartitionGpuVa, 0, NEO::CompareOperation::notEqual, false, false)) { + if (!RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(primaryTileSectionSkipVa, castToUint64(miPredicate), workPartitionGpuVa, 0, NEO::CompareOperation::notEqual, false, false, false)) { return false; } @@ -6409,7 +6409,7 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending } // Jump to end from Primary Tile section - patching - if (!RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(jumpToEndSectionFromPrimaryTile, castToUint64(miPredicate), syncAllocGpuVa + sizeof(uint32_t), queueId, NEO::CompareOperation::equal, false, false)) { + if (!RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(jumpToEndSectionFromPrimaryTile, castToUint64(miPredicate), syncAllocGpuVa + sizeof(uint32_t), queueId, NEO::CompareOperation::equal, false, false, false)) { return false; } diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 92a0e39bb5..452edf9d3b 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -386,6 +386,7 @@ class CommandQueue : public BaseObject<_cl_command_queue> { bool getHeaplessStateInitEnabled() const { return this->heaplessStateInitEnabled; } bool isBcsSplitInitialized() const { return this->bcsSplitInitialized; } + bool isBcs() const { return isCopyOnly; }; protected: void *enqueueReadMemObjForMap(TransferProperties &transferProperties, EventsRequest &eventsRequest, cl_int &errcodeRet); diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index e7f99da55f..d001995466 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -269,7 +269,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, if (programBarrierInTaskStream) { CsrDependencies csrDeps{}; fillCsrDependenciesWithLastBcsPackets(csrDeps); - TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, csrDeps, false); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, csrDeps, false, isCopyOnly); setupBarrierTimestampForBcsEngines(getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), timestampPacketDependencies); getGpgpuCommandStreamReceiver().programStallingCommandsForBarrier(commandStream, ×tampPacketDependencies.barrierNodes, isDcFlushRequiredOnStallingCommandsOnNextFlush()); @@ -314,7 +314,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(commandStream); } - TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, csrDeps, relaxedOrderingEnabled); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, csrDeps, relaxedOrderingEnabled, isCopyOnly); } if (isNonStallingIoqBarrierWithDependencies) { @@ -660,7 +660,7 @@ void CommandQueueHw::processDispatchForCacheFlush(Surface **surfaces, LinearStream *commandStream, CsrDependencies &csrDeps) { - TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(*commandStream, csrDeps, false); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(*commandStream, csrDeps, false, isCopyOnly); uint64_t postSyncAddress = 0; if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index 41b9472a5f..2da4f5783c 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -45,6 +45,7 @@ void GpgpuWalkerHelper::addAluReadModifyWriteRegister( LriHelper::program(pCommandStream, RegisterOffsets::csGprR1, mask, + false, false); // Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands diff --git a/opencl/source/command_queue/hardware_interface_base.inl b/opencl/source/command_queue/hardware_interface_base.inl index 543ab73527..778150415b 100644 --- a/opencl/source/command_queue/hardware_interface_base.inl +++ b/opencl/source/command_queue/hardware_interface_base.inl @@ -117,7 +117,7 @@ void HardwareInterface::dispatchWalker( RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers(*commandStream); } - TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(*commandStream, csrDependencies, walkerArgs.relaxedOrderingEnabled); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(*commandStream, csrDependencies, walkerArgs.relaxedOrderingEnabled, commandQueue.isBcs()); dsh->align(EncodeStates::alignInterfaceDescriptorData); @@ -165,7 +165,7 @@ void HardwareInterface::dispatchWalker( if (PauseOnGpuProperties::gpuScratchRegWriteAllowed(debugManager.flags.GpuScratchRegWriteAfterWalker.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount())) { uint32_t registerOffset = debugManager.flags.GpuScratchRegWriteRegisterOffset.get(); uint32_t registerData = debugManager.flags.GpuScratchRegWriteRegisterData.get(); - LriHelper::program(commandStream, registerOffset, registerData, EncodeSetMMIO::isRemapApplicable(registerOffset)); + LriHelper::program(commandStream, registerOffset, registerData, EncodeSetMMIO::isRemapApplicable(registerOffset), commandQueue.isBcs()); } if (PauseOnGpuProperties::pauseModeAllowed(debugManager.flags.PauseOnEnqueue.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(), PauseOnGpuProperties::PauseMode::AfterWorkload)) { diff --git a/opencl/source/gen12lp/gpgpu_walker_gen12lp.cpp b/opencl/source/gen12lp/gpgpu_walker_gen12lp.cpp index 047323875a..efa98a9171 100644 --- a/opencl/source/gen12lp/gpgpu_walker_gen12lp.cpp +++ b/opencl/source/gen12lp/gpgpu_walker_gen12lp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -40,7 +40,8 @@ void HardwareInterface::dispatchWorkarounds( NEO::LriHelper::program(commandStream, 0x7010, value, - false); + false, + commandQueue.isBcs()); } } diff --git a/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp b/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp index 14dd7e96ac..06ca8501d9 100644 --- a/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp +++ b/opencl/test/unit_test/aub_tests/command_stream/mi_math_aub_tests_dg2_and_later.cpp @@ -586,7 +586,7 @@ void ConditionalBbStartTests::whenDispatchingEqualModeThenResultsAreValidImpl { uint64_t jumpAddress = taskStream->getCurrentGpuAddressPosition() + EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(isQwordData) + EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa, baseCompareValue, NEO::CompareOperation::equal, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa, baseCompareValue, NEO::CompareOperation::equal, false, isQwordData, false); NEO::EncodeBatchBufferStartOrEnd::programBatchBufferEnd(*taskStream); // should be skipped @@ -599,7 +599,7 @@ void ConditionalBbStartTests::whenDispatchingEqualModeThenResultsAreValidImpl // Greater { - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa + sizeof(TestCompareDataT), baseCompareValue, NEO::CompareOperation::equal, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa + sizeof(TestCompareDataT), baseCompareValue, NEO::CompareOperation::equal, false, isQwordData, false); EncodeAtomic::programMiAtomic(*taskStream, baseWriteGpuVa + sizeof(TestCompareDataT), getAtomicOpcode(), @@ -609,7 +609,7 @@ void ConditionalBbStartTests::whenDispatchingEqualModeThenResultsAreValidImpl // Less { - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa + (sizeof(TestCompareDataT) * 2), baseCompareValue, NEO::CompareOperation::equal, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa + (sizeof(TestCompareDataT) * 2), baseCompareValue, NEO::CompareOperation::equal, false, isQwordData, false); EncodeAtomic::programMiAtomic(*taskStream, baseWriteGpuVa + (sizeof(TestCompareDataT) * 2), getAtomicOpcode(), @@ -641,7 +641,7 @@ void ConditionalBbStartTests::whenDispatchingNotEqualModeThenResultsAreValidI // Equal { - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa, baseCompareValue, NEO::CompareOperation::notEqual, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa, baseCompareValue, NEO::CompareOperation::notEqual, false, isQwordData, false); EncodeAtomic::programMiAtomic(*taskStream, baseWriteGpuVa, getAtomicOpcode(), @@ -654,7 +654,7 @@ void ConditionalBbStartTests::whenDispatchingNotEqualModeThenResultsAreValidI uint64_t jumpAddress = taskStream->getCurrentGpuAddressPosition() + EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(isQwordData) + EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa + sizeof(TestCompareDataT), baseCompareValue, NEO::CompareOperation::notEqual, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa + sizeof(TestCompareDataT), baseCompareValue, NEO::CompareOperation::notEqual, false, isQwordData, false); NEO::EncodeBatchBufferStartOrEnd::programBatchBufferEnd(*taskStream); // should be skipped @@ -668,7 +668,7 @@ void ConditionalBbStartTests::whenDispatchingNotEqualModeThenResultsAreValidI { uint64_t jumpAddress = taskStream->getCurrentGpuAddressPosition() + EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(isQwordData) + EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa + (sizeof(TestCompareDataT) * 2), baseCompareValue, NEO::CompareOperation::notEqual, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa + (sizeof(TestCompareDataT) * 2), baseCompareValue, NEO::CompareOperation::notEqual, false, isQwordData, false); NEO::EncodeBatchBufferStartOrEnd::programBatchBufferEnd(*taskStream); // should be skipped @@ -703,7 +703,7 @@ void ConditionalBbStartTests::whenDispatchingGreaterOrEqualModeThenResultsAre { uint64_t jumpAddress = taskStream->getCurrentGpuAddressPosition() + EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(isQwordData) + EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa, baseCompareValue, NEO::CompareOperation::greaterOrEqual, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa, baseCompareValue, NEO::CompareOperation::greaterOrEqual, false, isQwordData, false); NEO::EncodeBatchBufferStartOrEnd::programBatchBufferEnd(*taskStream); // should be skipped @@ -718,7 +718,7 @@ void ConditionalBbStartTests::whenDispatchingGreaterOrEqualModeThenResultsAre uint64_t jumpAddress = taskStream->getCurrentGpuAddressPosition() + EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(isQwordData) + EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa + sizeof(TestCompareDataT), baseCompareValue, NEO::CompareOperation::greaterOrEqual, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa + sizeof(TestCompareDataT), baseCompareValue, NEO::CompareOperation::greaterOrEqual, false, isQwordData, false); NEO::EncodeBatchBufferStartOrEnd::programBatchBufferEnd(*taskStream); // should be skipped @@ -730,7 +730,7 @@ void ConditionalBbStartTests::whenDispatchingGreaterOrEqualModeThenResultsAre // Less { - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa + (sizeof(TestCompareDataT) * 2), baseCompareValue, NEO::CompareOperation::greaterOrEqual, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa + (sizeof(TestCompareDataT) * 2), baseCompareValue, NEO::CompareOperation::greaterOrEqual, false, isQwordData, false); EncodeAtomic::programMiAtomic(*taskStream, baseWriteGpuVa + (sizeof(TestCompareDataT) * 2), getAtomicOpcode(), @@ -761,7 +761,7 @@ void ConditionalBbStartTests::whenDispatchingLessModeThenResultsAreValidImpl( // Equal { - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa, baseCompareValue, NEO::CompareOperation::less, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa, baseCompareValue, NEO::CompareOperation::less, false, isQwordData, false); EncodeAtomic::programMiAtomic(*taskStream, baseWriteGpuVa, getAtomicOpcode(), @@ -771,7 +771,7 @@ void ConditionalBbStartTests::whenDispatchingLessModeThenResultsAreValidImpl( // Greater { - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa + sizeof(TestCompareDataT), baseCompareValue, NEO::CompareOperation::less, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, invalidGpuVa, baseGpuVa + sizeof(TestCompareDataT), baseCompareValue, NEO::CompareOperation::less, false, isQwordData, false); EncodeAtomic::programMiAtomic(*taskStream, baseWriteGpuVa + sizeof(TestCompareDataT), getAtomicOpcode(), @@ -783,7 +783,7 @@ void ConditionalBbStartTests::whenDispatchingLessModeThenResultsAreValidImpl( { uint64_t jumpAddress = taskStream->getCurrentGpuAddressPosition() + EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(isQwordData) + EncodeBatchBufferStartOrEnd::getBatchBufferEndSize(); - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa + (sizeof(TestCompareDataT) * 2), baseCompareValue, NEO::CompareOperation::less, false, isQwordData); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(*taskStream, jumpAddress, baseGpuVa + (sizeof(TestCompareDataT) * 2), baseCompareValue, NEO::CompareOperation::less, false, isQwordData, false); NEO::EncodeBatchBufferStartOrEnd::programBatchBufferEnd(*taskStream); // should be skipped diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index 2653c17f42..e7e08615ab 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -140,6 +140,11 @@ TEST(CommandQueue, WhenGettingErrorCodeFromTaskCountThenProperValueIsReturned) { EXPECT_EQ(CL_OUT_OF_RESOURCES, CommandQueue::getErrorCodeFromTaskCount(CompletionStamp::failed)); } +TEST(CommandQueue, GivenCommandQueueWhenIsBcsIsCalledThenIsCopyOnlyIsReturned) { + MockCommandQueue cmdQ(nullptr, nullptr, 0, false); + EXPECT_EQ(cmdQ.isBcs(), cmdQ.isCopyOnly); +} + TEST(CommandQueue, WhenConstructingCommandQueueThenTaskLevelAndTaskCountAreZero) { MockCommandQueue cmdQ(nullptr, nullptr, 0, false); EXPECT_EQ(0u, cmdQ.taskLevel); diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp index c157179ad8..0d138c0d48 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp @@ -1171,7 +1171,7 @@ HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenEnqueueKernelWhenProgrammingDe auto eventNode = castToObject(outEvent)->getTimestampPacketNodes()->peekNodes()[0]; auto compareAddress = eventNode->getGpuAddress() + eventNode->getContextEndOffset(); - EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(++lrrCmd, 0, compareAddress, 1, CompareOperation::equal, true, false)); + EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(++lrrCmd, 0, compareAddress, 1, CompareOperation::equal, true, false, false)); mockCmdQueueHw.enqueueBarrierWithWaitList(1, &outEvent, nullptr); @@ -1269,10 +1269,10 @@ HWTEST2_F(RelaxedOrderingEnqueueKernelTests, givenBarrierWithDependenciesWhenFlu auto eventNode = castToObject(outEvent)->getTimestampPacketNodes()->peekNodes()[0]; auto compareAddress = eventNode->getGpuAddress() + eventNode->getContextEndOffset(); - EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(++lrrCmd, 0, compareAddress, 1, CompareOperation::equal, true, false)); + EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(++lrrCmd, 0, compareAddress, 1, CompareOperation::equal, true, false, false)); auto conditionalBbStart2 = reinterpret_cast(ptrOffset(lrrCmd, EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(false))); - EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(conditionalBbStart2, 0, compareAddress, 1, CompareOperation::equal, true, false)); + EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(conditionalBbStart2, 0, compareAddress, 1, CompareOperation::equal, true, false, false)); auto sdiCmd = genCmdCast(ptrOffset(conditionalBbStart2, EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataMemBatchBufferStart(false))); EXPECT_NE(nullptr, sdiCmd); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index 7b140019d1..3de6ee2765 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -1068,8 +1068,7 @@ HWTEST2_F(RelaxedOrderingBcsTests, givenDependenciesWhenFlushingThenProgramCorre auto eventNode = timestamp.peekNodes()[0]; auto compareAddress = eventNode->getGpuAddress() + eventNode->getContextEndOffset(); - - EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(++lrrCmd, 0, compareAddress, 1, CompareOperation::equal, true, false)); + EXPECT_TRUE(RelaxedOrderingCommandsHelper::verifyConditionalDataMemBbStart(++lrrCmd, 0, compareAddress, 1, CompareOperation::equal, true, false, true)); } HWTEST2_F(RelaxedOrderingBcsTests, givenDependenciesWhenFlushingThenProgramProgramRelaxedOrderingOnlyIfAllowed, IsAtLeastXeHpcCore) { diff --git a/opencl/test/unit_test/gen12lp/gfx_core_helper_tests_gen12lp.inl b/opencl/test/unit_test/gen12lp/gfx_core_helper_tests_gen12lp.inl index f9564a0f16..aa33270940 100644 --- a/opencl/test/unit_test/gen12lp/gfx_core_helper_tests_gen12lp.inl +++ b/opencl/test/unit_test/gen12lp/gfx_core_helper_tests_gen12lp.inl @@ -321,7 +321,7 @@ GEN12LPTEST_F(LriHelperTestsGen12Lp, whenProgrammingLriCommandThenExpectMmioRema expectedLri.setDataDword(data); expectedLri.setMmioRemapEnable(false); - LriHelper::program(&stream, address, data, false); + LriHelper::program(&stream, address, data, false, false); MI_LOAD_REGISTER_IMM *lri = genCmdCast(buffer.get()); ASSERT_NE(nullptr, lri); diff --git a/opencl/test/unit_test/gen8/command_stream_receiver_hw_tests_gen8.cpp b/opencl/test/unit_test/gen8/command_stream_receiver_hw_tests_gen8.cpp index a464a3e195..fbe1611650 100644 --- a/opencl/test/unit_test/gen8/command_stream_receiver_hw_tests_gen8.cpp +++ b/opencl/test/unit_test/gen8/command_stream_receiver_hw_tests_gen8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2022 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -50,7 +50,7 @@ GEN8TEST_F(CommandStreamReceiverHwTestGen8, GivenChangedL3ConfigWhenL3IsProgramm uint32_t l3Config = 0x12345678; - csr.programL3(stream, l3Config); + csr.programL3(stream, l3Config, false); this->parseCommands(stream); diff --git a/opencl/test/unit_test/xe_hpc_core/gfx_core_helper_tests_xe_hpc_core.cpp b/opencl/test/unit_test/xe_hpc_core/gfx_core_helper_tests_xe_hpc_core.cpp index 7c4a465fe4..86b4634b6a 100644 --- a/opencl/test/unit_test/xe_hpc_core/gfx_core_helper_tests_xe_hpc_core.cpp +++ b/opencl/test/unit_test/xe_hpc_core/gfx_core_helper_tests_xe_hpc_core.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -691,7 +691,7 @@ XE_HPC_CORETEST_F(LriHelperTestsXeHpcCore, whenProgrammingLriCommandThenExpectMm expectedLri.setDataDword(data); expectedLri.setMmioRemapEnable(true); - LriHelper::program(&stream, address, data, true); + LriHelper::program(&stream, address, data, true, false); MI_LOAD_REGISTER_IMM *lri = genCmdCast(buffer.get()); ASSERT_NE(nullptr, lri); diff --git a/shared/source/command_container/command_encoder.h b/shared/source/command_container/command_encoder.h index f7f29c2665..3dc7ca0b05 100644 --- a/shared/source/command_container/command_encoder.h +++ b/shared/source/command_container/command_encoder.h @@ -266,16 +266,17 @@ struct EncodeMathMMIO { static const size_t size = sizeof(MI_STORE_REGISTER_MEM); - static void encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress); + static void encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs); - static void encodeGreaterThanPredicate(CommandContainer &container, uint64_t lhsVal, uint32_t rhsVal); + static void encodeGreaterThanPredicate(CommandContainer &container, uint64_t lhsVal, uint32_t rhsVal, bool isBcs); static void encodeBitwiseAndVal(CommandContainer &container, uint32_t regOffset, uint32_t immVal, uint64_t dstAddress, bool workloadPartition, - void **outCmdBuffer); + void **outCmdBuffer, + bool isBcs); static void encodeAlu(MI_MATH_ALU_INST_INLINE *pAluParam, AluRegisters srcA, AluRegisters srcB, AluRegisters op, AluRegisters dest, AluRegisters result); @@ -291,8 +292,8 @@ struct EncodeMathMMIO { AluRegisters secondOperandRegister, AluRegisters finalResultRegister); - static void encodeIncrement(LinearStream &cmdStream, AluRegisters operandRegister); - static void encodeDecrement(LinearStream &cmdStream, AluRegisters operandRegister); + static void encodeIncrement(LinearStream &cmdStream, AluRegisters operandRegister, bool isBcs); + static void encodeDecrement(LinearStream &cmdStream, AluRegisters operandRegister, bool isBcs); static constexpr size_t getCmdSizeForIncrementOrDecrement() { return (EncodeAluHelper::getCmdsSize() + (2 * sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM))); } @@ -303,7 +304,7 @@ struct EncodeMathMMIO { decrement = 1, }; - static void encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType); + static void encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType, bool isBcs); }; template @@ -333,11 +334,11 @@ struct EncodeSetMMIO { static const size_t sizeMEM = sizeof(MI_LOAD_REGISTER_MEM); static const size_t sizeREG = sizeof(MI_LOAD_REGISTER_REG); - static void encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap); + static void encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap, bool isBcs); static void encodeMEM(CommandContainer &container, uint32_t offset, uint64_t address); static void encodeREG(CommandContainer &container, uint32_t dstOffset, uint32_t srcOffset); - static void encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap); + static void encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap, bool isBcs); static void encodeMEM(LinearStream &cmdStream, uint32_t offset, uint64_t address); static void encodeREG(LinearStream &cmdStream, uint32_t dstOffset, uint32_t srcOffset); @@ -492,10 +493,10 @@ struct EncodeBatchBufferStartOrEnd { static void programBatchBufferEnd(CommandContainer &container); static void programBatchBufferEnd(LinearStream &commandStream); - static void programConditionalDataMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData); - static void programConditionalDataRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint32_t compareReg, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData); + static void programConditionalDataMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs); + static void programConditionalDataRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint32_t compareReg, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs); static void programConditionalRegRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, AluRegisters compareReg0, AluRegisters compareReg1, CompareOperation compareOperation, bool indirect); - static void programConditionalRegMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint32_t compareReg, CompareOperation compareOperation, bool indirect); + static void programConditionalRegMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint32_t compareReg, CompareOperation compareOperation, bool indirect, bool isBcs); static size_t constexpr getCmdSizeConditionalDataMemBatchBufferStart(bool useQwordData) { size_t size = (getCmdSizeConditionalBufferStartBase() + sizeof(typename GfxFamily::MI_LOAD_REGISTER_MEM) + (2 * sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM))); diff --git a/shared/source/command_container/command_encoder.inl b/shared/source/command_container/command_encoder.inl index cd3ff5aea2..5142e878f2 100644 --- a/shared/source/command_container/command_encoder.inl +++ b/shared/source/command_container/command_encoder.inl @@ -100,7 +100,7 @@ uint32_t EncodeStates::copySamplerState(IndirectHeap *dsh, } // namespace NEO template -void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress) { +void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32_t offset, uint32_t val, uint64_t dstAddress, bool isBcs) { int logLws = 0; int i = val; while (val >> logLws) { @@ -108,7 +108,7 @@ void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32 } EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR0, offset); - EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR1, 0, true); + EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR1, 0, true, isBcs); i = 0; while (i < logLws) { @@ -134,9 +134,9 @@ void EncodeMathMMIO::encodeMulRegVal(CommandContainer &container, uint32 * set, then (*firstOperand) is greater than secondOperand. */ template -void EncodeMathMMIO::encodeGreaterThanPredicate(CommandContainer &container, uint64_t firstOperand, uint32_t secondOperand) { +void EncodeMathMMIO::encodeGreaterThanPredicate(CommandContainer &container, uint64_t firstOperand, uint32_t secondOperand, bool isBcs) { EncodeSetMMIO::encodeMEM(container, RegisterOffsets::csGprR0, firstOperand); - EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR1, secondOperand, true); + EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR1, secondOperand, true, isBcs); /* RegisterOffsets::csGprR* registers map to AluRegisters::gpr* registers */ EncodeMath::greaterThan(container, AluRegisters::gpr0, @@ -151,9 +151,9 @@ void EncodeMathMMIO::encodeGreaterThanPredicate(CommandContainer &contai */ template void EncodeMathMMIO::encodeBitwiseAndVal(CommandContainer &container, uint32_t regOffset, uint32_t immVal, uint64_t dstAddress, - bool workloadPartition, void **outCmdBuffer) { + bool workloadPartition, void **outCmdBuffer, bool isBcs) { EncodeSetMMIO::encodeREG(container, RegisterOffsets::csGprR13, regOffset); - EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR14, immVal, true); + EncodeSetMMIO::encodeIMM(container, RegisterOffsets::csGprR14, immVal, true, isBcs); EncodeMath::bitwiseAnd(container, AluRegisters::gpr13, AluRegisters::gpr14, AluRegisters::gpr12); @@ -250,9 +250,9 @@ void EncodeMathMMIO::encodeAluAnd(MI_MATH_ALU_INST_INLINE *pAluParam, } template -void EncodeMathMMIO::encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType) { - LriHelper::program(&cmdStream, RegisterOffsets::csGprR7, 1, true); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR7 + 4, 0, true); +void EncodeMathMMIO::encodeIncrementOrDecrement(LinearStream &cmdStream, AluRegisters operandRegister, IncrementOrDecrementOperation operationType, bool isBcs) { + LriHelper::program(&cmdStream, RegisterOffsets::csGprR7, 1, true, isBcs); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); EncodeAluHelper aluHelper; aluHelper.setNextAlu(AluRegisters::opcodeLoad, AluRegisters::srca, operandRegister); @@ -265,13 +265,13 @@ void EncodeMathMMIO::encodeIncrementOrDecrement(LinearStream &cmdStream, } template -void EncodeMathMMIO::encodeIncrement(LinearStream &cmdStream, AluRegisters operandRegister) { - encodeIncrementOrDecrement(cmdStream, operandRegister, IncrementOrDecrementOperation::increment); +void EncodeMathMMIO::encodeIncrement(LinearStream &cmdStream, AluRegisters operandRegister, bool isBcs) { + encodeIncrementOrDecrement(cmdStream, operandRegister, IncrementOrDecrementOperation::increment, isBcs); } template -void EncodeMathMMIO::encodeDecrement(LinearStream &cmdStream, AluRegisters operandRegister) { - encodeIncrementOrDecrement(cmdStream, operandRegister, IncrementOrDecrementOperation::decrement); +void EncodeMathMMIO::encodeDecrement(LinearStream &cmdStream, AluRegisters operandRegister, bool isBcs) { + encodeIncrementOrDecrement(cmdStream, operandRegister, IncrementOrDecrementOperation::decrement, isBcs); } /* @@ -332,8 +332,8 @@ void EncodeMath::bitwiseAnd(CommandContainer &container, } template -inline void EncodeSetMMIO::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap) { - EncodeSetMMIO::encodeIMM(*container.getCommandStream(), offset, data, remap); +inline void EncodeSetMMIO::encodeIMM(CommandContainer &container, uint32_t offset, uint32_t data, bool remap, bool isBcs) { + EncodeSetMMIO::encodeIMM(*container.getCommandStream(), offset, data, remap, isBcs); } template @@ -347,11 +347,12 @@ inline void EncodeSetMMIO::encodeREG(CommandContainer &container, uint32 } template -inline void EncodeSetMMIO::encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap) { +inline void EncodeSetMMIO::encodeIMM(LinearStream &cmdStream, uint32_t offset, uint32_t data, bool remap, bool isBcs) { LriHelper::program(&cmdStream, offset, data, - remap); + remap, + isBcs); } template @@ -630,7 +631,7 @@ void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &containe */ if (groupSize[2] > 1) { - EncodeSetMMIO::encodeIMM(container, resultRegister, 3 << (8 * (dstPtr & 0b11)), true); + EncodeSetMMIO::encodeIMM(container, resultRegister, 3 << (8 * (dstPtr & 0b11)), true, false); } else { constexpr uint32_t groupCount2Register = RegisterOffsets::csGprR1; @@ -664,20 +665,20 @@ void EncodeIndirectParams::setWorkDimIndirect(CommandContainer &containe if (offset) { EncodeSetMMIO::encodeMEM(container, backupRegister, dstPtr); - EncodeSetMMIO::encodeIMM(container, memoryMaskRegister, memoryMask, true); + EncodeSetMMIO::encodeIMM(container, memoryMaskRegister, memoryMask, true, false); EncodeMath::bitwiseAnd(container, memoryMaskAluRegister, backupAluRegister, backupAluRegister); - EncodeSetMMIO::encodeIMM(container, offsetRegister, offset, true); + EncodeSetMMIO::encodeIMM(container, offsetRegister, offset, true, false); } - EncodeSetMMIO::encodeIMM(container, constantOneRegister, 1, true); - EncodeSetMMIO::encodeIMM(container, constantTwoRegister, 2, true); + EncodeSetMMIO::encodeIMM(container, constantOneRegister, 1, true, false); + EncodeSetMMIO::encodeIMM(container, constantTwoRegister, 2, true, false); EncodeSetMMIO::encodeREG(container, groupCount2Register, RegisterOffsets::gpgpuDispatchDim[2]); EncodeMath::greaterThan(container, groupCount2AluRegister, constantOneAluRegister, workDimEq3AluRegister); EncodeMath::bitwiseAnd(container, workDimEq3AluRegister, constantOneAluRegister, workDimEq3AluRegister); - EncodeSetMMIO::encodeIMM(container, groupSize1Register, groupSize[1], true); + EncodeSetMMIO::encodeIMM(container, groupSize1Register, groupSize[1], true, false); EncodeSetMMIO::encodeREG(container, groupCount1Register, RegisterOffsets::gpgpuDispatchDim[1]); EncodeMath::addition(container, groupSize1AluRegister, groupCount1AluRegister, sumAluRegister); @@ -897,7 +898,7 @@ void EncodeIndirectParams::setGlobalWorkSizeIndirect(CommandContainer &c if (NEO::isUndefinedOffset(offsets[i])) { continue; } - EncodeMathMMIO::encodeMulRegVal(container, RegisterOffsets::gpgpuDispatchDim[i], lws[i], ptrOffset(crossThreadAddress, offsets[i])); + EncodeMathMMIO::encodeMulRegVal(container, RegisterOffsets::gpgpuDispatchDim[i], lws[i], ptrOffset(crossThreadAddress, offsets[i]), false); } } @@ -994,39 +995,39 @@ void EncodeAtomic::programMiAtomic(LinearStream &commandStream, template void EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, - uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData) { + uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs) { EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress); if (useQwordData) { EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7 + 4, compareAddress + 4); } else { - LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true); + LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); } uint32_t compareDataLow = static_cast(compareData & std::numeric_limits::max()); uint32_t compareDataHigh = useQwordData ? static_cast(compareData >> 32) : 0; - LriHelper::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true); - LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true); + LriHelper::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true, isBcs); + LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true, isBcs); programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect); } template void EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint32_t compareReg, - uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData) { + uint64_t compareData, CompareOperation compareOperation, bool indirect, bool useQwordData, bool isBcs) { EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR7, compareReg); if (useQwordData) { EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR7 + 4, compareReg + 4); } else { - LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true); + LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); } uint32_t compareDataLow = static_cast(compareData & std::numeric_limits::max()); uint32_t compareDataHigh = useQwordData ? static_cast(compareData >> 32) : 0; - LriHelper::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true); - LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true); + LriHelper::program(&commandStream, RegisterOffsets::csGprR8, compareDataLow, true, isBcs); + LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, compareDataHigh, true, isBcs); programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect); } @@ -1040,12 +1041,12 @@ void EncodeBatchBufferStartOrEnd::programConditionalRegRegBatchBufferSta template void EncodeBatchBufferStartOrEnd::programConditionalRegMemBatchBufferStart(LinearStream &commandStream, uint64_t startAddress, uint64_t compareAddress, uint32_t compareReg, - CompareOperation compareOperation, bool indirect) { + CompareOperation compareOperation, bool indirect, bool isBcs) { EncodeSetMMIO::encodeMEM(commandStream, RegisterOffsets::csGprR7, compareAddress); - LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true); + LriHelper::program(&commandStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcs); EncodeSetMMIO::encodeREG(commandStream, RegisterOffsets::csGprR8, compareReg); - LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, 0, true); + LriHelper::program(&commandStream, RegisterOffsets::csGprR8 + 4, 0, true, isBcs); programConditionalBatchBufferStartBase(commandStream, startAddress, AluRegisters::gpr7, AluRegisters::gpr8, compareOperation, indirect); } diff --git a/shared/source/command_container/command_encoder_bdw_and_later.inl b/shared/source/command_container/command_encoder_bdw_and_later.inl index 26dcc572d4..21991729db 100644 --- a/shared/source/command_container/command_encoder_bdw_and_later.inl +++ b/shared/source/command_container/command_encoder_bdw_and_later.inl @@ -507,7 +507,7 @@ template void EncodeL3State::encode(CommandContainer &container, bool enableSLM) { auto offset = L3CNTLRegisterOffset::registerOffset; auto data = PreambleHelper::getL3Config(container.getDevice()->getHardwareInfo(), enableSLM); - EncodeSetMMIO::encodeIMM(container, offset, data, false); + EncodeSetMMIO::encodeIMM(container, offset, data, false, false); } template diff --git a/shared/source/command_container/implicit_scaling.h b/shared/source/command_container/implicit_scaling.h index 281e6aec19..0ae6bffb69 100644 --- a/shared/source/command_container/implicit_scaling.h +++ b/shared/source/command_container/implicit_scaling.h @@ -79,11 +79,12 @@ struct ImplicitScalingDispatch { static size_t getRegisterConfigurationSize(); static void dispatchRegisterConfiguration(LinearStream &commandStream, uint64_t workPartitionSurfaceAddress, - uint32_t addressOffset); + uint32_t addressOffset, + bool isBcs); static size_t getOffsetRegisterSize(); static void dispatchOffsetRegister(LinearStream &commandStream, - uint32_t addressOffset); + uint32_t addressOffset, bool isBcs); static uint32_t getImmediateWritePostSyncOffset(); static uint32_t getTimeStampPostSyncOffset(); diff --git a/shared/source/command_container/implicit_scaling_before_xe_hp.inl b/shared/source/command_container/implicit_scaling_before_xe_hp.inl index 4ec9283165..bdc3f036b9 100644 --- a/shared/source/command_container/implicit_scaling_before_xe_hp.inl +++ b/shared/source/command_container/implicit_scaling_before_xe_hp.inl @@ -44,7 +44,7 @@ inline size_t ImplicitScalingDispatch::getRegisterConfigurationSize() } template -inline void ImplicitScalingDispatch::dispatchRegisterConfiguration(LinearStream &commandStream, uint64_t workPartitionSurfaceAddress, uint32_t addressOffset) { +inline void ImplicitScalingDispatch::dispatchRegisterConfiguration(LinearStream &commandStream, uint64_t workPartitionSurfaceAddress, uint32_t addressOffset, bool isBcs) { } template @@ -53,7 +53,7 @@ inline size_t ImplicitScalingDispatch::getOffsetRegisterSize() { } template -inline void ImplicitScalingDispatch::dispatchOffsetRegister(LinearStream &commandStream, uint32_t addressOffset) { +inline void ImplicitScalingDispatch::dispatchOffsetRegister(LinearStream &commandStream, uint32_t addressOffset, bool isBcs) { } template diff --git a/shared/source/command_container/implicit_scaling_xehp_and_later.inl b/shared/source/command_container/implicit_scaling_xehp_and_later.inl index 32b8f0447c..d133e60543 100644 --- a/shared/source/command_container/implicit_scaling_xehp_and_later.inl +++ b/shared/source/command_container/implicit_scaling_xehp_and_later.inl @@ -218,11 +218,12 @@ inline size_t ImplicitScalingDispatch::getRegisterConfigurationSize() template inline void ImplicitScalingDispatch::dispatchRegisterConfiguration(LinearStream &commandStream, uint64_t workPartitionSurfaceAddress, - uint32_t addressOffset) { + uint32_t addressOffset, + bool isBcs) { EncodeSetMMIO::encodeMEM(commandStream, PartitionRegisters::wparidCCSOffset, workPartitionSurfaceAddress); - dispatchOffsetRegister(commandStream, addressOffset); + dispatchOffsetRegister(commandStream, addressOffset, isBcs); } template @@ -232,11 +233,12 @@ inline size_t ImplicitScalingDispatch::getOffsetRegisterSize() { template inline void ImplicitScalingDispatch::dispatchOffsetRegister(LinearStream &commandStream, - uint32_t addressOffset) { + uint32_t addressOffset, bool isBcs) { EncodeSetMMIO::encodeIMM(commandStream, PartitionRegisters::addressOffsetCCSOffset, addressOffset, - true); + true, + isBcs); } template diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 6197c92587..57d29a2c06 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -202,7 +202,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { protected: void programPreemption(LinearStream &csr, DispatchFlags &dispatchFlags); - void programL3(LinearStream &csr, uint32_t &newL3Config); + void programL3(LinearStream &csr, uint32_t &newL3Config, bool isBcs); void programPreamble(LinearStream &csr, Device &device, uint32_t &newL3Config); void programPipelineSelect(LinearStream &csr, PipelineSelectArgs &pipelineSelectArgs); void programEpilogue(LinearStream &csr, Device &device, void **batchBufferEndLocation, DispatchFlags &dispatchFlags); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 8243329cee..9bbd1acb80 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -481,7 +481,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( auto &commandStreamCSR = this->getCS(estimatedSize); auto commandStreamStartCSR = commandStreamCSR.getUsed(); - TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStreamCSR, dispatchFlags.csrDependencies, false); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStreamCSR, dispatchFlags.csrDependencies, false, EngineHelpers::isBcs(this->osContext->getEngineType())); TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStreamCSR, dispatchFlags.csrDependencies); programActivePartitionConfigFlushTask(commandStreamCSR); @@ -494,7 +494,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( programHardwareContext(commandStreamCSR); programPipelineSelect(commandStreamCSR, dispatchFlags.pipelineSelectArgs); programComputeMode(commandStreamCSR, dispatchFlags, hwInfo); - programL3(commandStreamCSR, newL3Config); + programL3(commandStreamCSR, newL3Config, EngineHelpers::isBcs(this->osContext->getEngineType())); programPreamble(commandStreamCSR, device, newL3Config); programMediaSampler(commandStreamCSR, dispatchFlags); addPipeControlBefore3dState(commandStreamCSR, dispatchFlags); @@ -882,7 +882,7 @@ inline void CommandStreamReceiverHw::programStateSip(LinearStream &cm template inline void CommandStreamReceiverHw::programPreamble(LinearStream &csr, Device &device, uint32_t &newL3Config) { if (!this->isPreambleSent) { - PreambleHelper::programPreamble(&csr, device, newL3Config, this->preemptionAllocation); + PreambleHelper::programPreamble(&csr, device, newL3Config, this->preemptionAllocation, EngineHelpers::isBcs(osContext->getEngineType())); this->isPreambleSent = true; this->lastSentL3Config = newL3Config; } @@ -1045,7 +1045,7 @@ TaskCountType CommandStreamReceiverHw::flushBcsTask(const BlitPropert MiFlushArgs args{waArgs}; for (auto &blitProperties : blitPropertiesContainer) { - TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, blitProperties.csrDependencies, isRelaxedOrderingDispatch); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, blitProperties.csrDependencies, isRelaxedOrderingDispatch, EngineHelpers::isBcs(this->osContext->getEngineType())); TimestampPacketHelper::programCsrDependenciesForForMultiRootDeviceSyncContainer(commandStream, blitProperties.csrDependencies); BlitCommandsHelper::encodeWa(commandStream, blitProperties, latestSentBcsWaValue); @@ -1740,7 +1740,8 @@ inline void CommandStreamReceiverHw::programStateBaseAddressCommon( bool sbaTrackingEnabled = debuggingEnabled; if (sbaTrackingEnabled) { device.getL0Debugger()->programSbaAddressLoad(csrCommandStream, - device.getL0Debugger()->getSbaTrackingBuffer(this->getOsContext().getContextId())->getGpuAddress()); + device.getL0Debugger()->getSbaTrackingBuffer(this->getOsContext().getContextId())->getGpuAddress(), + EngineHelpers::isBcs(this->osContext->getEngineType())); } NEO::EncodeStateBaseAddress::setSbaTrackingForL0DebuggerIfEnabled(sbaTrackingEnabled, diff --git a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl index ccf40c7e8d..39733d735a 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -15,7 +15,7 @@ template bool CommandStreamReceiverHw::are4GbHeapsAvailable() const { return true; } template -inline void CommandStreamReceiverHw::programL3(LinearStream &csr, uint32_t &newL3Config) { +inline void CommandStreamReceiverHw::programL3(LinearStream &csr, uint32_t &newL3Config, bool isBcs) { typedef typename GfxFamily::PIPE_CONTROL PIPE_CONTROL; if (csrSizeRequestFlags.l3ConfigChanged && this->isPreambleSent) { // Add a PIPE_CONTROL w/ CS_stall @@ -24,7 +24,7 @@ inline void CommandStreamReceiverHw::programL3(LinearStream &csr, uin setClearSlmWorkAroundParameter(args); MemorySynchronizationCommands::addSingleBarrier(csr, args); - PreambleHelper::programL3(&csr, newL3Config); + PreambleHelper::programL3(&csr, newL3Config, isBcs); this->lastSentL3Config = newL3Config; } } diff --git a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl index e3ef42240c..3f379b2144 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -20,7 +20,7 @@ template bool CommandStreamReceiverHw::are4GbHeapsAvailable() const { return is64bit; } template -void CommandStreamReceiverHw::programL3(LinearStream &csr, uint32_t &newL3Config) {} +void CommandStreamReceiverHw::programL3(LinearStream &csr, uint32_t &newL3Config, bool isBcs) {} template size_t CommandStreamReceiverHw::getRequiredStateBaseAddressSize(const Device &device) const { @@ -127,7 +127,8 @@ template inline void CommandStreamReceiverHw::programActivePartitionConfig(LinearStream &csr) { if (this->staticWorkPartitioningEnabled) { uint64_t workPartitionAddress = getWorkPartitionAllocationGpuAddress(); - ImplicitScalingDispatch::dispatchRegisterConfiguration(csr, workPartitionAddress, this->immWritePostSyncWriteOffset); + ImplicitScalingDispatch::dispatchRegisterConfiguration(csr, workPartitionAddress, + this->immWritePostSyncWriteOffset, EngineHelpers::isBcs(osContext->getEngineType())); } this->activePartitionsConfig = this->activePartitions; } diff --git a/shared/source/command_stream/preemption.inl b/shared/source/command_stream/preemption.inl index ee19599558..263cc64a53 100644 --- a/shared/source/command_stream/preemption.inl +++ b/shared/source/command_stream/preemption.inl @@ -85,7 +85,7 @@ void PreemptionHelper::programCmdStream(LinearStream &cmdStream, PreemptionMode regVal = PreemptionConfig::cmdLevelVal | PreemptionConfig::mask; } - LriHelper::program(&cmdStream, PreemptionConfig::mmioAddress, regVal, true); + LriHelper::program(&cmdStream, PreemptionConfig::mmioAddress, regVal, true, false); } template diff --git a/shared/source/debugger/debugger_l0.h b/shared/source/debugger/debugger_l0.h index 0f23dfe824..881aa86b78 100644 --- a/shared/source/debugger/debugger_l0.h +++ b/shared/source/debugger/debugger_l0.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -101,7 +101,7 @@ class DebuggerL0 : public NEO::Debugger, NEO::NonCopyableOrMovableClass { void initSbaTrackingMode(); virtual size_t getSbaAddressLoadCommandsSize() = 0; - virtual void programSbaAddressLoad(NEO::LinearStream &cmdStream, uint64_t sbaGpuVa) = 0; + virtual void programSbaAddressLoad(NEO::LinearStream &cmdStream, uint64_t sbaGpuVa, bool isBcs) = 0; MOCKABLE_VIRTUAL bool attachZebinModuleToSegmentAllocations(const StackVec &kernelAlloc, uint32_t &moduleHandle, uint32_t elfHandle); MOCKABLE_VIRTUAL bool removeZebinModule(uint32_t moduleHandle); @@ -145,7 +145,7 @@ class DebuggerL0Hw : public DebuggerL0 { void captureStateBaseAddress(NEO::LinearStream &cmdStream, SbaAddresses sba, bool useFirstLevelBB) override; size_t getSbaTrackingCommandsSize(size_t trackedAddressCount) override; size_t getSbaAddressLoadCommandsSize() override; - void programSbaAddressLoad(NEO::LinearStream &cmdStream, uint64_t sbaGpuVa) override; + void programSbaAddressLoad(NEO::LinearStream &cmdStream, uint64_t sbaGpuVa, bool isBcs) override; void programSbaTrackingCommandsSingleAddressSpace(NEO::LinearStream &cmdStream, const SbaAddresses &sba, bool useFirstLevelBB); diff --git a/shared/source/debugger/debugger_l0.inl b/shared/source/debugger/debugger_l0.inl index a04c72f504..8f7c19cca0 100644 --- a/shared/source/debugger/debugger_l0.inl +++ b/shared/source/debugger/debugger_l0.inl @@ -121,7 +121,7 @@ size_t DebuggerL0Hw::getSbaAddressLoadCommandsSize() { } template -void DebuggerL0Hw::programSbaAddressLoad(NEO::LinearStream &cmdStream, uint64_t sbaGpuVa) { +void DebuggerL0Hw::programSbaAddressLoad(NEO::LinearStream &cmdStream, uint64_t sbaGpuVa, bool isBcs) { if (!singleAddressSpaceSbaTracking) { return; } @@ -131,12 +131,14 @@ void DebuggerL0Hw::programSbaAddressLoad(NEO::LinearStream &cmdStream NEO::LriHelper::program(&cmdStream, DebuggerRegisterOffsets::csGprR15, low, - true); + true, + isBcs); NEO::LriHelper::program(&cmdStream, DebuggerRegisterOffsets::csGprR15 + 4, high, - true); + true, + isBcs); } } // namespace NEO diff --git a/shared/source/debugger/debugger_l0_tgllp_and_later.inl b/shared/source/debugger/debugger_l0_tgllp_and_later.inl index e1f6eaeff9..b9880a88bb 100644 --- a/shared/source/debugger/debugger_l0_tgllp_and_later.inl +++ b/shared/source/debugger/debugger_l0_tgllp_and_later.inl @@ -75,7 +75,7 @@ void DebuggerL0Hw::programSbaTrackingCommandsSingleAddressSpace(NEO:: for (const auto &pair : fieldOffsetAndValue) { // Store SBA field offset to R0 - NEO::EncodeSetMMIO::encodeIMM(cmdStream, RegisterOffsets::csGprR0, static_cast(pair.first), true); + NEO::EncodeSetMMIO::encodeIMM(cmdStream, RegisterOffsets::csGprR0, static_cast(pair.first), true, false); // Add GPR0 to GPR15, store result in GPR1 NEO::EncodeMath::addition(cmdStream, AluRegisters::gpr0, static_cast(DebuggerAluRegisters::gpr15), AluRegisters::gpr1); diff --git a/shared/source/direct_submission/direct_submission_hw.inl b/shared/source/direct_submission/direct_submission_hw.inl index b0c9c9030b..b83c681e6c 100644 --- a/shared/source/direct_submission/direct_submission_hw.inl +++ b/shared/source/direct_submission/direct_submission_hw.inl @@ -124,6 +124,8 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER); + bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); + // 1. Init section { EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); @@ -131,18 +133,18 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9); EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4); - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false); + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcsEngine); uint64_t removeTaskVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::removeTaskSectionStart; - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3, static_cast(removeTaskVa & 0xFFFF'FFFFULL), true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3 + 4, static_cast(removeTaskVa >> 32), true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3, static_cast(removeTaskVa & 0xFFFF'FFFFULL), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR3 + 4, static_cast(removeTaskVa >> 32), true, isBcsEngine); uint64_t walkersLoopConditionCheckVa = schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::tasksListLoopCheckSectionStart; - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4, static_cast(walkersLoopConditionCheckVa & 0xFFFF'FFFFULL), true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4 + 4, static_cast(walkersLoopConditionCheckVa >> 32), true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4, static_cast(walkersLoopConditionCheckVa & 0xFFFF'FFFFULL), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR4 + 4, static_cast(walkersLoopConditionCheckVa >> 32), true, isBcsEngine); } // 2. Dispatch task section (loop start) @@ -151,11 +153,11 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6, 8, true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6 + 4, 0, true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6, 8, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR6 + 4, 0, true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcsEngine); EncodeAluHelper aluHelper; aluHelper.setMocs(miMathMocs); @@ -181,19 +183,19 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr1); - EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr2); + EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr1, isBcsEngine); + EncodeMathMMIO::encodeDecrement(schedulerCmdStream, AluRegisters::gpr2, isBcsEngine); EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0, RegisterOffsets::csGprR9); EncodeSetMMIO::encodeREG(schedulerCmdStream, RegisterOffsets::csGprR0 + 4, RegisterOffsets::csGprR9 + 4); - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false); + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(schedulerCmdStream, 0, RegisterOffsets::csGprR1, 0, CompareOperation::equal, true, false, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7, 8, true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7 + 4, 0, true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7, 8, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR7 + 4, 0, true, isBcsEngine); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR8 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcsEngine); EncodeAluHelper aluHelper; aluHelper.setMocs(miMathMocs); @@ -221,15 +223,15 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeMiPredicate::encode(schedulerCmdStream, MiPredicateType::disable); - EncodeMathMMIO::encodeIncrement(schedulerCmdStream, AluRegisters::gpr2); + EncodeMathMMIO::encodeIncrement(schedulerCmdStream, AluRegisters::gpr2, isBcsEngine); EncodeBatchBufferStartOrEnd::programConditionalRegRegBatchBufferStart( schedulerCmdStream, loopSectionStartAddress, AluRegisters::gpr1, AluRegisters::gpr2, CompareOperation::notEqual, false); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2, 0, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR2 + 4, 0, true, isBcsEngine); } // 5. Drain request section @@ -249,20 +251,20 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart( schedulerCmdStream, loopSectionStartAddress, - RegisterOffsets::csGprR1, currentRelaxedOrderingQueueSize, CompareOperation::greaterOrEqual, false, false); + RegisterOffsets::csGprR1, currentRelaxedOrderingQueueSize, CompareOperation::greaterOrEqual, false, false, isBcsEngine); EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart( schedulerCmdStream, loopSectionStartAddress, - RegisterOffsets::csGprR5, 1, CompareOperation::equal, false, false); + RegisterOffsets::csGprR5, 1, CompareOperation::equal, false, false, isBcsEngine); } // 6. Scheduler loop check section { UNRECOVERABLE_IF(schedulerCmdStream.getUsed() != RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::schedulerLoopCheckSectionStart); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10, static_cast(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionSize), true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10 + 4, 0, true); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10, static_cast(RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionSize), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR10 + 4, 0, true, isBcsEngine); EncodeAluHelper aluHelper; aluHelper.setMocs(miMathMocs); @@ -272,7 +274,7 @@ void DirectSubmissionHw::dispatchStaticRelaxedOrderingSch aluHelper.setNextAlu(AluRegisters::opcodeStore, AluRegisters::gpr0, AluRegisters::accu); aluHelper.copyToCmdStream(schedulerCmdStream); - EncodeBatchBufferStartOrEnd::programConditionalRegMemBatchBufferStart(schedulerCmdStream, 0, semaphoreGpuVa, RegisterOffsets::csGprR11, CompareOperation::greaterOrEqual, true); + EncodeBatchBufferStartOrEnd::programConditionalRegMemBatchBufferStart(schedulerCmdStream, 0, semaphoreGpuVa, RegisterOffsets::csGprR11, CompareOperation::greaterOrEqual, true, isBcsEngine); EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerCmdStream, schedulerStartAddress + RelaxedOrderingHelper::StaticSchedulerSizeAndOffsetSection::loopStartSectionStart, false, false, false); @@ -291,9 +293,10 @@ void DirectSubmissionHw::dispatchRelaxedOrderingScheduler uint64_t semaphoreSectionVa = schedulerStartVa + RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::semaphoreSectionStart; - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR11, value, true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9, static_cast(semaphoreSectionVa & 0xFFFF'FFFFULL), true); - LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9 + 4, static_cast(semaphoreSectionVa >> 32), true); + bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR11, value, true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9, static_cast(semaphoreSectionVa & 0xFFFF'FFFFULL), true, isBcsEngine); + LriHelper::program(&schedulerCmdStream, RegisterOffsets::csGprR9 + 4, static_cast(semaphoreSectionVa >> 32), true, isBcsEngine); schedulerCmdStream.getSpace(sizeof(typename GfxFamily::MI_BATCH_BUFFER_START)); // skip patching @@ -746,12 +749,13 @@ void DirectSubmissionHw::dispatchRelaxedOrderingQueueStal LinearStream bbStartStream(ringCommandStream.getSpace(EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false)), EncodeBatchBufferStartOrEnd::getCmdSizeConditionalDataRegBatchBufferStart(false)); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 1, true); + bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 1, true, isBcsEngine); dispatchSemaphoreSection(currentQueueWorkCount); // patch conditional bb_start with current GPU address EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(bbStartStream, ringCommandStream.getCurrentGpuAddressPosition(), - RegisterOffsets::csGprR1, 0, CompareOperation::equal, false, false); + RegisterOffsets::csGprR1, 0, CompareOperation::equal, false, false, isBcsEngine); relaxedOrderingSchedulerRequired = false; } @@ -764,23 +768,27 @@ size_t DirectSubmissionHw::getSizeDispatchRelaxedOrdering template void DirectSubmissionHw::dispatchRelaxedOrderingReturnPtrRegs(LinearStream &cmdStream, uint64_t returnPtr) { - LriHelper::program(&cmdStream, RegisterOffsets::csGprR4, static_cast(returnPtr & 0xFFFF'FFFFULL), true); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR4 + 4, static_cast(returnPtr >> 32), true); + + bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR4, static_cast(returnPtr & 0xFFFF'FFFFULL), true, isBcsEngine); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR4 + 4, static_cast(returnPtr >> 32), true, isBcsEngine); uint64_t returnPtrAfterTaskStoreSection = returnPtr; returnPtrAfterTaskStoreSection += RelaxedOrderingHelper::getSizeTaskStoreSection(); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR3, static_cast(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true); - LriHelper::program(&cmdStream, RegisterOffsets::csGprR3 + 4, static_cast(returnPtrAfterTaskStoreSection >> 32), true); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR3, static_cast(returnPtrAfterTaskStoreSection & 0xFFFF'FFFFULL), true, isBcsEngine); + LriHelper::program(&cmdStream, RegisterOffsets::csGprR3 + 4, static_cast(returnPtrAfterTaskStoreSection >> 32), true, isBcsEngine); } template void DirectSubmissionHw::initRelaxedOrderingRegisters() { - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1, 0, true); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1 + 4, 0, true); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 0, true); - LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5 + 4, 0, true); + + bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1, 0, true, isBcsEngine); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR1 + 4, 0, true, isBcsEngine); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5, 0, true, isBcsEngine); + LriHelper::program(&ringCommandStream, RegisterOffsets::csGprR5 + 4, 0, true, isBcsEngine); } template @@ -793,16 +801,18 @@ void DirectSubmissionHw::preinitializeRelaxedOrderingSect EncodeMiPredicate::encode(stream, MiPredicateType::disable); uint64_t deferredTasksListGpuVa = deferredTasksListAllocation->getGpuAddress(); - LriHelper::program(&stream, RegisterOffsets::csGprR6, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true); - LriHelper::program(&stream, RegisterOffsets::csGprR6 + 4, static_cast(deferredTasksListGpuVa >> 32), true); + + bool isBcsEngine = EngineHelpers::isBcs(this->osContext.getEngineType()); + LriHelper::program(&stream, RegisterOffsets::csGprR6, static_cast(deferredTasksListGpuVa & 0xFFFF'FFFFULL), true, isBcsEngine); + LriHelper::program(&stream, RegisterOffsets::csGprR6 + 4, static_cast(deferredTasksListGpuVa >> 32), true, isBcsEngine); // Task start VA - LriHelper::program(&stream, RegisterOffsets::csGprR7, 0, true); - LriHelper::program(&stream, RegisterOffsets::csGprR7 + 4, 0, true); + LriHelper::program(&stream, RegisterOffsets::csGprR7, 0, true, isBcsEngine); + LriHelper::program(&stream, RegisterOffsets::csGprR7 + 4, 0, true, isBcsEngine); // Shift by 8 = multiply by 256. Address must by 64b aligned (shift by 6), but SHL accepts only 1, 2, 4, 8, 16 and 32 - LriHelper::program(&stream, RegisterOffsets::csGprR8, 8, true); - LriHelper::program(&stream, RegisterOffsets::csGprR8 + 4, 0, true); + LriHelper::program(&stream, RegisterOffsets::csGprR8, 8, true, isBcsEngine); + LriHelper::program(&stream, RegisterOffsets::csGprR8 + 4, 0, true, isBcsEngine); const uint32_t miMathMocs = this->rootDeviceEnvironment.getGmmHelper()->getMOCS(GMM_RESOURCE_USAGE_OCL_BUFFER); @@ -820,7 +830,7 @@ void DirectSubmissionHw::preinitializeRelaxedOrderingSect aluHelper.copyToCmdStream(stream); - EncodeMathMMIO::encodeIncrement(stream, AluRegisters::gpr1); + EncodeMathMMIO::encodeIncrement(stream, AluRegisters::gpr1, isBcsEngine); UNRECOVERABLE_IF(stream.getUsed() != RelaxedOrderingHelper::getSizeTaskStoreSection()); @@ -831,9 +841,9 @@ void DirectSubmissionHw::preinitializeRelaxedOrderingSect uint64_t schedulerStartAddress = relaxedOrderingSchedulerAllocation->getGpuAddress(); // 1. Init section - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR11, 0, true); - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9, 0, true); - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9 + 4, 0, true); + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR11, 0, true, isBcsEngine); + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9, 0, true, isBcsEngine); + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR9 + 4, 0, true, isBcsEngine); EncodeBatchBufferStartOrEnd::programBatchBufferStart(&schedulerStream, schedulerStartAddress, false, false, false); // 2. Semaphore section @@ -849,7 +859,7 @@ void DirectSubmissionHw::preinitializeRelaxedOrderingSect { EncodeMiPredicate::encode(schedulerStream, MiPredicateType::disable); - LriHelper::program(&schedulerStream, RegisterOffsets::csGprR5, 0, true); + LriHelper::program(&schedulerStream, RegisterOffsets::csGprR5, 0, true, isBcsEngine); } UNRECOVERABLE_IF(schedulerStream.getUsed() != RelaxedOrderingHelper::DynamicSchedulerSizeAndOffsetSection::totalSize); diff --git a/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl b/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl index 5ab3f3c436..ed922d652e 100644 --- a/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl +++ b/shared/source/direct_submission/direct_submission_xe_hp_core_and_later.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -15,7 +15,8 @@ template inline void DirectSubmissionHw::dispatchPartitionRegisterConfiguration() { ImplicitScalingDispatch::dispatchRegisterConfiguration(ringCommandStream, this->workPartitionAllocation->getGpuAddress(), - this->immWritePostSyncOffset); + this->immWritePostSyncOffset, + EngineHelpers::isBcs(this->osContext.getEngineType())); } template diff --git a/shared/source/gen11/command_encoder_gen11.cpp b/shared/source/gen11/command_encoder_gen11.cpp index 3664ddf696..fba0d9c87a 100644 --- a/shared/source/gen11/command_encoder_gen11.cpp +++ b/shared/source/gen11/command_encoder_gen11.cpp @@ -66,6 +66,7 @@ void EncodeComputeMode::programComputeModeCommand(LinearStream &csr, Sta LriHelper::program(&csr, RowChickenReg4::address, RowChickenReg4::regDataForArbitrationPolicy[properties.threadArbitrationPolicy.value], + false, false); } if (properties.isCoherencyRequired.isDirty) { @@ -73,6 +74,7 @@ void EncodeComputeMode::programComputeModeCommand(LinearStream &csr, Sta LriHelper::program(&csr, gen11HdcModeRegister::address, DwordBuilder::build(gen11HdcModeRegister::forceNonCoherentEnableBit, true, nonCoherentEnable), + false, false); } } diff --git a/shared/source/gen11/command_stream_receiver_hw_gen11.cpp b/shared/source/gen11/command_stream_receiver_hw_gen11.cpp index d9114c386f..f9ea2d7155 100644 --- a/shared/source/gen11/command_stream_receiver_hw_gen11.cpp +++ b/shared/source/gen11/command_stream_receiver_hw_gen11.cpp @@ -54,6 +54,7 @@ void CommandStreamReceiverHw::programMediaSampler(LinearStream &stream, LriHelper::program(&stream, PWR_CLK_STATE_REGISTER::REG_ADDRESS, reg.TheStructure.RawData[0], + false, false); args = {}; @@ -96,6 +97,7 @@ void CommandStreamReceiverHw::programMediaSampler(LinearStream &stream, LriHelper::program(&stream, PWR_CLK_STATE_REGISTER::REG_ADDRESS, reg.TheStructure.RawData[0], + false, false); MemorySynchronizationCommands::addSingleBarrier(stream, args); diff --git a/shared/source/gen12lp/command_stream_receiver_hw_gen12lp.cpp b/shared/source/gen12lp/command_stream_receiver_hw_gen12lp.cpp index 6483f02ecf..b6dfd5b72a 100644 --- a/shared/source/gen12lp/command_stream_receiver_hw_gen12lp.cpp +++ b/shared/source/gen12lp/command_stream_receiver_hw_gen12lp.cpp @@ -21,7 +21,7 @@ namespace NEO { static auto gfxCore = IGFX_GEN12LP_CORE; template <> -void CommandStreamReceiverHw::programL3(LinearStream &csr, uint32_t &newL3Config) { +void CommandStreamReceiverHw::programL3(LinearStream &csr, uint32_t &newL3Config, bool isBcs) { } template <> diff --git a/shared/source/gen12lp/preamble_gen12lp.cpp b/shared/source/gen12lp/preamble_gen12lp.cpp index c5b9417422..0134cb2baf 100644 --- a/shared/source/gen12lp/preamble_gen12lp.cpp +++ b/shared/source/gen12lp/preamble_gen12lp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -90,7 +90,7 @@ void PreambleHelper::addPipeControlBeforeVfeCmd(LinearStream *pCommandSt } template <> -void PreambleHelper::programL3(LinearStream *pCommandStream, uint32_t l3Config) { +void PreambleHelper::programL3(LinearStream *pCommandStream, uint32_t l3Config, bool isBcs) { } template <> diff --git a/shared/source/gen8/preemption_gen8.cpp b/shared/source/gen8/preemption_gen8.cpp index 681fcc95d6..ac54494414 100644 --- a/shared/source/gen8/preemption_gen8.cpp +++ b/shared/source/gen8/preemption_gen8.cpp @@ -41,7 +41,7 @@ void PreemptionHelper::programCmdStream(LinearStream &cmdStream, Pree regVal = PreemptionConfig::cmdLevelVal; } - LriHelper::program(&cmdStream, PreemptionConfig::mmioAddress, regVal, false); + LriHelper::program(&cmdStream, PreemptionConfig::mmioAddress, regVal, false, false); } template <> @@ -94,6 +94,7 @@ void PreemptionHelper::applyPreemptionWaCmdsBegin(LinearStream *pComm LriHelper::program(pCommandStream, RegisterOffsets::csGprR0, RegisterConstants::gpgpuWalkerCookieValueBeforeWalker, + false, false); } } @@ -109,6 +110,7 @@ void PreemptionHelper::applyPreemptionWaCmdsEnd(LinearStream *pComman LriHelper::program(pCommandStream, RegisterOffsets::csGprR0, RegisterConstants::gpgpuWalkerCookieValueAfterWalker, + false, false); } } diff --git a/shared/source/gen9/command_encoder_gen9.cpp b/shared/source/gen9/command_encoder_gen9.cpp index f83d2ad9d4..7c6dbd9421 100644 --- a/shared/source/gen9/command_encoder_gen9.cpp +++ b/shared/source/gen9/command_encoder_gen9.cpp @@ -58,6 +58,7 @@ void EncodeComputeMode::programComputeModeCommand(LinearStream &csr, Sta LriHelper::program(&csr, DebugControlReg2::address, DebugControlReg2::getRegData(properties.threadArbitrationPolicy.value), + false, false); } } diff --git a/shared/source/gen9/preemption_gen9.cpp b/shared/source/gen9/preemption_gen9.cpp index 444379a6bb..0b97281640 100644 --- a/shared/source/gen9/preemption_gen9.cpp +++ b/shared/source/gen9/preemption_gen9.cpp @@ -43,6 +43,7 @@ void PreemptionHelper::applyPreemptionWaCmdsBegin(LinearStream *pComm LriHelper::program(pCommandStream, RegisterOffsets::csGprR0, RegisterConstants::gpgpuWalkerCookieValueBeforeWalker, + false, false); } } @@ -58,6 +59,7 @@ void PreemptionHelper::applyPreemptionWaCmdsEnd(LinearStream *pComman LriHelper::program(pCommandStream, RegisterOffsets::csGprR0, RegisterConstants::gpgpuWalkerCookieValueAfterWalker, + false, false); } } diff --git a/shared/source/helpers/blit_commands_helper_xehp_and_later.inl b/shared/source/helpers/blit_commands_helper_xehp_and_later.inl index cab46f7970..875e8f3dd3 100644 --- a/shared/source/helpers/blit_commands_helper_xehp_and_later.inl +++ b/shared/source/helpers/blit_commands_helper_xehp_and_later.inl @@ -288,7 +288,7 @@ void BlitCommandsHelper::programGlobalSequencerFlush(LinearStream &co if (debugManager.flags.GlobalSequencerFlushOnCopyEngine.get() != 0) { using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION; constexpr uint32_t globalInvalidationRegister = 0xB404u; - LriHelper::program(&commandStream, globalInvalidationRegister, 1u, false); + LriHelper::program(&commandStream, globalInvalidationRegister, 1u, false, true); EncodeSemaphore::addMiSemaphoreWaitCommand(commandStream, globalInvalidationRegister, 0u, diff --git a/shared/source/helpers/gfx_core_helper.h b/shared/source/helpers/gfx_core_helper.h index 51345f37db..6b7f058b3e 100644 --- a/shared/source/helpers/gfx_core_helper.h +++ b/shared/source/helpers/gfx_core_helper.h @@ -453,7 +453,7 @@ template struct LriHelper { using MI_LOAD_REGISTER_IMM = typename GfxFamily::MI_LOAD_REGISTER_IMM; - static void *program(LinearStream *cmdStream, uint32_t address, uint32_t value, bool remap); + static void *program(LinearStream *cmdStream, uint32_t address, uint32_t value, bool remap, bool isBcs); static void *program(MI_LOAD_REGISTER_IMM *lriCmd, uint32_t address, uint32_t value, bool remap); }; diff --git a/shared/source/helpers/gfx_core_helper_base.inl b/shared/source/helpers/gfx_core_helper_base.inl index 3bfd43dc6e..e4f54a013e 100644 --- a/shared/source/helpers/gfx_core_helper_base.inl +++ b/shared/source/helpers/gfx_core_helper_base.inl @@ -782,7 +782,7 @@ bool GfxCoreHelperHw::isRuntimeLocalIdsGenerationRequired(uint32_t ac } template -void *LriHelper::program(LinearStream *cmdStream, uint32_t address, uint32_t value, bool remap) { +void *LriHelper::program(LinearStream *cmdStream, uint32_t address, uint32_t value, bool remap, bool isBcs) { auto lri = cmdStream->getSpaceForCmd(); return LriHelper::program(lri, address, value, remap); } diff --git a/shared/source/helpers/preamble.h b/shared/source/helpers/preamble.h index 0b4d898f0d..3c464bd184 100644 --- a/shared/source/helpers/preamble.h +++ b/shared/source/helpers/preamble.h @@ -29,7 +29,7 @@ struct PreambleHelper { using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; using FrontEndStateCommand = typename GfxFamily::FrontEndStateCommand; - static void programL3(LinearStream *pCommandStream, uint32_t l3Config); + static void programL3(LinearStream *pCommandStream, uint32_t l3Config, bool isBcs); static void programPipelineSelect(LinearStream *pCommandStream, const PipelineSelectArgs &pipelineSelectArgs, const RootDeviceEnvironment &rootDeviceEnvironment); @@ -47,8 +47,8 @@ struct PreambleHelper { const StreamProperties &streamProperties); static uint64_t getScratchSpaceAddressOffsetForVfeState(LinearStream *pCommandStream, void *pVfeState); static void programPreamble(LinearStream *pCommandStream, Device &device, uint32_t l3Config, - GraphicsAllocation *preemptionCsr); - static void programSemaphoreDelay(LinearStream *pCommandStream); + GraphicsAllocation *preemptionCsr, bool isBcs); + static void programSemaphoreDelay(LinearStream *pCommandStream, bool isBcs); static uint32_t getL3Config(const HardwareInfo &hwInfo, bool useSLM); static bool isSystolicModeConfigurable(const RootDeviceEnvironment &rootDeviceEnvironment); static size_t getAdditionalCommandsSize(const Device &device); diff --git a/shared/source/helpers/preamble_base.inl b/shared/source/helpers/preamble_base.inl index ae2a405596..5f1fef5737 100644 --- a/shared/source/helpers/preamble_base.inl +++ b/shared/source/helpers/preamble_base.inl @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -30,13 +30,14 @@ void PreambleHelper::programGenSpecificPreambleWorkArounds(LinearStre } template -void PreambleHelper::programSemaphoreDelay(LinearStream *pCommandStream) { +void PreambleHelper::programSemaphoreDelay(LinearStream *pCommandStream, bool isBcs) { if (debugManager.flags.ForceSemaphoreDelayBetweenWaits.get() > -1) { uint32_t valueOfNewSemaphoreDelay = debugManager.flags.ForceSemaphoreDelayBetweenWaits.get(); LriHelper::program(pCommandStream, RegisterOffsets::semaWaitPoll, valueOfNewSemaphoreDelay, - true); + true, + isBcs); }; } @@ -55,11 +56,11 @@ size_t PreambleHelper::getAdditionalCommandsSize(const Device &device template void PreambleHelper::programPreamble(LinearStream *pCommandStream, Device &device, uint32_t l3Config, - GraphicsAllocation *preemptionCsr) { - programL3(pCommandStream, l3Config); + GraphicsAllocation *preemptionCsr, bool isBcs) { + programL3(pCommandStream, l3Config, isBcs); programPreemption(pCommandStream, device, preemptionCsr); programGenSpecificPreambleWorkArounds(pCommandStream, device.getHardwareInfo()); - programSemaphoreDelay(pCommandStream); + programSemaphoreDelay(pCommandStream, isBcs); } template diff --git a/shared/source/helpers/preamble_bdw_and_later.inl b/shared/source/helpers/preamble_bdw_and_later.inl index 94a7c72d5c..a68aa41840 100644 --- a/shared/source/helpers/preamble_bdw_and_later.inl +++ b/shared/source/helpers/preamble_bdw_and_later.inl @@ -13,11 +13,12 @@ namespace NEO { template -void PreambleHelper::programL3(LinearStream *pCommandStream, uint32_t l3Config) { +void PreambleHelper::programL3(LinearStream *pCommandStream, uint32_t l3Config, bool isBcs) { LriHelper::program(pCommandStream, L3CNTLRegisterOffset::registerOffset, l3Config, - false); + false, + isBcs); } template diff --git a/shared/source/helpers/preamble_xehp_and_later.inl b/shared/source/helpers/preamble_xehp_and_later.inl index 430bee1028..6736eced79 100644 --- a/shared/source/helpers/preamble_xehp_and_later.inl +++ b/shared/source/helpers/preamble_xehp_and_later.inl @@ -22,7 +22,7 @@ void PreambleHelper::addPipeControlBeforeVfeCmd(LinearStream *pComman } template -void PreambleHelper::programL3(LinearStream *pCommandStream, uint32_t l3Config) { +void PreambleHelper::programL3(LinearStream *pCommandStream, uint32_t l3Config, bool isBcs) { } template diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index 902bef4307..e0fd301ff9 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -109,23 +109,23 @@ struct TimestampPacketHelper { } template - static void programConditionalBbStartForRelaxedOrdering(LinearStream &cmdStream, TagNodeBase ×tampPacketNode) { + static void programConditionalBbStartForRelaxedOrdering(LinearStream &cmdStream, TagNodeBase ×tampPacketNode, bool isBcs) { auto compareAddress = getContextEndGpuAddress(timestampPacketNode); for (uint32_t packetId = 0; packetId < timestampPacketNode.getPacketsUsed(); packetId++) { uint64_t compareOffset = packetId * timestampPacketNode.getSinglePacketSize(); EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(cmdStream, 0, compareAddress + compareOffset, TimestampPacketConstants::initValue, - NEO::CompareOperation::equal, true, false); + NEO::CompareOperation::equal, true, false, isBcs); } } template - static void programCsrDependenciesForTimestampPacketContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies, bool relaxedOrderingEnabled) { + static void programCsrDependenciesForTimestampPacketContainer(LinearStream &cmdStream, const CsrDependencies &csrDependencies, bool relaxedOrderingEnabled, bool isBcs) { for (auto timestampPacketContainer : csrDependencies.timestampPacketContainer) { for (auto &node : timestampPacketContainer->peekNodes()) { if (relaxedOrderingEnabled) { - TimestampPacketHelper::programConditionalBbStartForRelaxedOrdering(cmdStream, *node); + TimestampPacketHelper::programConditionalBbStartForRelaxedOrdering(cmdStream, *node, isBcs); } else { TimestampPacketHelper::programSemaphore(cmdStream, *node); } diff --git a/shared/source/helpers/windows/gmm_callbacks_tgllp_and_later.inl b/shared/source/helpers/windows/gmm_callbacks_tgllp_and_later.inl index 8b6c8463ec..7a55b35016 100644 --- a/shared/source/helpers/windows/gmm_callbacks_tgllp_and_later.inl +++ b/shared/source/helpers/windows/gmm_callbacks_tgllp_and_later.inl @@ -39,12 +39,14 @@ int __stdcall TTCallbacks::writeL3Address(void *queueHandle, uint64_t LriHelper::program(&csr->getCS(0), static_cast(regOffset & 0xFFFFFFFF), static_cast(l3GfxAddress & 0xFFFFFFFF), - true); + true, + false); LriHelper::program(&csr->getCS(0), static_cast(regOffset >> 32), static_cast(l3GfxAddress >> 32), - true); + true, + false); return 1; } diff --git a/shared/test/common/helpers/relaxed_ordering_commands_helper.h b/shared/test/common/helpers/relaxed_ordering_commands_helper.h index a1396f7315..369b23209f 100644 --- a/shared/test/common/helpers/relaxed_ordering_commands_helper.h +++ b/shared/test/common/helpers/relaxed_ordering_commands_helper.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2023 Intel Corporation + * Copyright (C) 2023-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,7 +29,7 @@ template bool verifyConditionalDataRegBbStart(void *cmd, uint64_t startAddress, uint32_t compareReg, uint32_t compareData, CompareOperation compareOperation, bool indirect); template -bool verifyConditionalDataMemBbStart(void *cmd, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool qwordData); +bool verifyConditionalDataMemBbStart(void *cmd, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool qwordData, bool isBcs); template bool verifyConditionalRegRegBbStart(void *cmd, uint64_t startAddress, AluRegisters compareReg0, AluRegisters compareReg1, CompareOperation compareOperation, bool indirect); @@ -224,7 +224,7 @@ bool verifyConditionalRegRegBbStart(void *cmd, uint64_t startAddress, AluRegiste } template -bool verifyConditionalDataMemBbStart(void *cmd, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool qwordData) { +bool verifyConditionalDataMemBbStart(void *cmd, uint64_t startAddress, uint64_t compareAddress, uint64_t compareData, CompareOperation compareOperation, bool indirect, bool qwordData, bool isBcs) { using MI_LOAD_REGISTER_MEM = typename FamilyType::MI_LOAD_REGISTER_MEM; using MI_LOAD_REGISTER_IMM = typename FamilyType::MI_LOAD_REGISTER_IMM; diff --git a/shared/test/common/mocks/mock_l0_debugger.h b/shared/test/common/mocks/mock_l0_debugger.h index e5ed7766af..71751541ef 100644 --- a/shared/test/common/mocks/mock_l0_debugger.h +++ b/shared/test/common/mocks/mock_l0_debugger.h @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ class MockDebuggerL0 : public NEO::DebuggerL0 { } size_t getSbaAddressLoadCommandsSize() override { return 0; }; - void programSbaAddressLoad(NEO::LinearStream &cmdStream, uint64_t sbaGpuVa) override{}; + void programSbaAddressLoad(NEO::LinearStream &cmdStream, uint64_t sbaGpuVa, bool isBcs) override{}; }; template diff --git a/shared/test/unit_test/command_container/command_encoder_tests.cpp b/shared/test/unit_test/command_container/command_encoder_tests.cpp index c0a8b2a032..9623e8a881 100644 --- a/shared/test/unit_test/command_container/command_encoder_tests.cpp +++ b/shared/test/unit_test/command_container/command_encoder_tests.cpp @@ -620,12 +620,12 @@ HWTEST2_F(CommandEncoderTests, whenAskingForImplicitScalingValuesThenAlwaysRetur EXPECT_EQ(0u, ImplicitScalingDispatch::getRegisterConfigurationSize()); - ImplicitScalingDispatch::dispatchRegisterConfiguration(linearStream, 0, 0); + ImplicitScalingDispatch::dispatchRegisterConfiguration(linearStream, 0, 0, false); EXPECT_EQ(0u, linearStream.getUsed()); EXPECT_EQ(0u, ImplicitScalingDispatch::getOffsetRegisterSize()); - ImplicitScalingDispatch::dispatchOffsetRegister(linearStream, 0); + ImplicitScalingDispatch::dispatchOffsetRegister(linearStream, 0, 0); EXPECT_EQ(0u, linearStream.getUsed()); EXPECT_EQ(static_cast(sizeof(uint64_t)), ImplicitScalingDispatch::getImmediateWritePostSyncOffset()); diff --git a/shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp b/shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp index feb02dcac4..42a96bada5 100644 --- a/shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/command_encoder_tests_xehp_and_later.cpp @@ -152,7 +152,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, XeHPAndLaterCommandEncoderTest, givenOffsetAndValue constexpr uint32_t immVal = 0xbaau; constexpr uint64_t dstAddress = 0xDEADCAF0u; void *storeRegMem = nullptr; - EncodeMathMMIO::encodeBitwiseAndVal(cmdContainer, regOffset, immVal, dstAddress, true, &storeRegMem); + EncodeMathMMIO::encodeBitwiseAndVal(cmdContainer, regOffset, immVal, dstAddress, true, &storeRegMem, false); CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), diff --git a/shared/test/unit_test/encoders/test_encode_math.cpp b/shared/test/unit_test/encoders/test_encode_math.cpp index 5929166f07..4f9a67b056 100644 --- a/shared/test/unit_test/encoders/test_encode_math.cpp +++ b/shared/test/unit_test/encoders/test_encode_math.cpp @@ -183,7 +183,7 @@ HWTEST_F(CommandEncoderMathTest, givenOffsetAndValueWhenEncodeBitwiseAndValIsCal constexpr uint32_t immVal = 0xbaau; constexpr uint64_t dstAddress = 0xDEADCAF0u; void *storeRegMem = nullptr; - EncodeMathMMIO::encodeBitwiseAndVal(cmdContainer, regOffset, immVal, dstAddress, false, &storeRegMem); + EncodeMathMMIO::encodeBitwiseAndVal(cmdContainer, regOffset, immVal, dstAddress, false, &storeRegMem, false); CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), @@ -296,7 +296,7 @@ HWTEST_F(CommandEncodeAluTests, whenProgrammingIncrementOperationThenUseCorrectA uint8_t buffer[bufferSize] = {}; LinearStream cmdStream(buffer, bufferSize); - EncodeMathMMIO::encodeIncrement(cmdStream, incRegister); + EncodeMathMMIO::encodeIncrement(cmdStream, incRegister, false); EXPECT_EQ(bufferSize, cmdStream.getUsed()); @@ -343,7 +343,7 @@ HWTEST_F(CommandEncodeAluTests, whenProgrammingDecrementOperationThenUseCorrectA uint8_t buffer[bufferSize] = {}; LinearStream cmdStream(buffer, bufferSize); - EncodeMathMMIO::encodeDecrement(cmdStream, decRegister); + EncodeMathMMIO::encodeDecrement(cmdStream, decRegister, false); EXPECT_EQ(bufferSize, cmdStream.getUsed()); diff --git a/shared/test/unit_test/encoders/test_encode_math_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_encode_math_xehp_and_later.cpp index 78ff83bf58..a6caa30154 100644 --- a/shared/test/unit_test/encoders/test_encode_math_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_math_xehp_and_later.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2021-2023 Intel Corporation + * Copyright (C) 2021-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ HWTEST2_F(XeHPAndLaterCommandEncoderMathTest, WhenAppendsAGreaterThanThenPredica CommandContainer cmdContainer; cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); - EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u); + EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u, false); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); diff --git a/shared/test/unit_test/encoders/test_encode_pvc_and_later.cpp b/shared/test/unit_test/encoders/test_encode_pvc_and_later.cpp index a795a874d6..107aa42484 100644 --- a/shared/test/unit_test/encoders/test_encode_pvc_and_later.cpp +++ b/shared/test/unit_test/encoders/test_encode_pvc_and_later.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2022-2023 Intel Corporation + * Copyright (C) 2022-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -107,7 +107,7 @@ HWTEST2_F(EncodeConditionalBatchBufferStartTest, whenProgrammingConditionalDataM uint8_t buffer[expectedSize] = {}; LinearStream cmdStream(buffer, expectedSize); - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareAddress, compareData, compareOperation, indirect, false); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareAddress, compareData, compareOperation, indirect, false, false); EXPECT_EQ(expectedSize, cmdStream.getUsed()); @@ -152,7 +152,7 @@ HWTEST2_F(EncodeConditionalBatchBufferStartTest, whenProgramming64bConditionalDa uint8_t buffer[expectedSize] = {}; LinearStream cmdStream(buffer, expectedSize); - EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareAddress, compareData, compareOperation, indirect, true); + EncodeBatchBufferStartOrEnd::programConditionalDataMemBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareAddress, compareData, compareOperation, indirect, true, false); EXPECT_EQ(expectedSize, cmdStream.getUsed()); @@ -197,7 +197,7 @@ HWTEST2_F(EncodeConditionalBatchBufferStartTest, whenProgrammingConditionalDataR uint8_t buffer[expectedSize] = {}; LinearStream cmdStream(buffer, expectedSize); - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareReg, compareData, compareOperation, indirect, false); + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareReg, compareData, compareOperation, indirect, false, false); EXPECT_EQ(expectedSize, cmdStream.getUsed()); @@ -242,7 +242,7 @@ HWTEST2_F(EncodeConditionalBatchBufferStartTest, whenProgramming64bConditionalDa uint8_t buffer[expectedSize] = {}; LinearStream cmdStream(buffer, expectedSize); - EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareReg, compareData, compareOperation, indirect, true); + EncodeBatchBufferStartOrEnd::programConditionalDataRegBatchBufferStart(cmdStream, indirect ? 0 : startAddress, compareReg, compareData, compareOperation, indirect, true, false); EXPECT_EQ(expectedSize, cmdStream.getUsed()); diff --git a/shared/test/unit_test/encoders/test_encode_set_mmio.cpp b/shared/test/unit_test/encoders/test_encode_set_mmio.cpp index f649ae1315..f07a2c0bd7 100644 --- a/shared/test/unit_test/encoders/test_encode_set_mmio.cpp +++ b/shared/test/unit_test/encoders/test_encode_set_mmio.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -31,7 +31,7 @@ class CommandSetMMIOFixture : public DeviceFixture { using CommandSetMMIOTest = Test; HWTEST_F(CommandSetMMIOTest, WhenProgrammingThenLoadRegisterImmIsUsed) { - EncodeSetMMIO::encodeIMM(*cmdContainer.get(), 0x2000, 0xbaa, false); + EncodeSetMMIO::encodeIMM(*cmdContainer.get(), 0x2000, 0xbaa, false, false); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer->getCommandStream()->getCpuBase(), 0), cmdContainer->getCommandStream()->getUsed()); diff --git a/shared/test/unit_test/gen11/test_encode_math_gen11.cpp b/shared/test/unit_test/gen11/test_encode_math_gen11.cpp index 6f13515cc3..8394d53204 100644 --- a/shared/test/unit_test/gen11/test_encode_math_gen11.cpp +++ b/shared/test/unit_test/gen11/test_encode_math_gen11.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,7 +29,7 @@ GEN11TEST_F(CommandEncoderMathTestGen11, WhenAppendsAGreaterThanThenPredicateCor CommandContainer cmdContainer; cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); - EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u); + EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u, false); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); diff --git a/shared/test/unit_test/gen11/test_preamble_gen11.cpp b/shared/test/unit_test/gen11/test_preamble_gen11.cpp index e5de1d4d00..199341f986 100644 --- a/shared/test/unit_test/gen11/test_preamble_gen11.cpp +++ b/shared/test/unit_test/gen11/test_preamble_gen11.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -26,7 +26,7 @@ GEN11TEST_F(IclSlm, WhenL3ConfigIsDispatchedThenProperRegisterAddressAndValueAre typedef Gen11Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; LinearStream &cs = linearStream; uint32_t l3Config = PreambleHelper::getL3Config(*defaultHwInfo, true); - PreambleHelper::programL3(&cs, l3Config); + PreambleHelper::programL3(&cs, l3Config, false); parseCommands(cs); @@ -121,7 +121,7 @@ GEN11TEST_F(ThreadArbitrationGen11, givenPreambleWhenItIsProgrammedThenThreadArb LinearStream &cs = linearStream; uint32_t l3Config = PreambleHelper::getL3Config(*defaultHwInfo, true); MockDevice mockDevice; - PreambleHelper::programPreamble(&linearStream, mockDevice, l3Config, nullptr); + PreambleHelper::programPreamble(&linearStream, mockDevice, l3Config, nullptr, false); parseCommands(cs); diff --git a/shared/test/unit_test/gen12lp/test_encode_math_gen12lp.cpp b/shared/test/unit_test/gen12lp/test_encode_math_gen12lp.cpp index a6d2ac4dab..431cdfba13 100644 --- a/shared/test/unit_test/gen12lp/test_encode_math_gen12lp.cpp +++ b/shared/test/unit_test/gen12lp/test_encode_math_gen12lp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -27,7 +27,7 @@ GEN12LPTEST_F(CommandEncoderMathTestGen12Lp, WhenAppendsAGreaterThanThenPredicat CommandContainer cmdContainer; cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); - EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u); + EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u, false); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); diff --git a/shared/test/unit_test/gen12lp/test_preamble_gen12lp.cpp b/shared/test/unit_test/gen12lp/test_preamble_gen12lp.cpp index 1a53b36de6..2673c59524 100644 --- a/shared/test/unit_test/gen12lp/test_preamble_gen12lp.cpp +++ b/shared/test/unit_test/gen12lp/test_preamble_gen12lp.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2019-2023 Intel Corporation + * Copyright (C) 2019-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ HWTEST2_F(TglLpSlm, givenTglLpWhenPreambleIsBeingProgrammedThenThreadArbitration LinearStream &cs = linearStream; uint32_t l3Config = PreambleHelper::getL3Config(pDevice->getHardwareInfo(), true); MockDevice mockDevice; - PreambleHelper::programPreamble(&linearStream, mockDevice, l3Config, nullptr); + PreambleHelper::programPreamble(&linearStream, mockDevice, l3Config, nullptr, false); parseCommands(cs); @@ -36,7 +36,7 @@ HWTEST2_F(TglLpSlm, WhenPreambleIsCreatedThenSlmIsDisabled, IsTGLLP) { typedef Gen12LpFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; LinearStream &cs = linearStream; uint32_t l3Config = PreambleHelper::getL3Config(pDevice->getHardwareInfo(), true); - PreambleHelper::programL3(&cs, l3Config); + PreambleHelper::programL3(&cs, l3Config, false); parseCommands(cs); diff --git a/shared/test/unit_test/gen8/test_encode_math_gen8.cpp b/shared/test/unit_test/gen8/test_encode_math_gen8.cpp index d0ff5b26fd..6101a15db3 100644 --- a/shared/test/unit_test/gen8/test_encode_math_gen8.cpp +++ b/shared/test/unit_test/gen8/test_encode_math_gen8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,7 +29,7 @@ GEN8TEST_F(CommandEncoderMathTestGen8, WhenAppendsAGreaterThanThenPredicateCorre CommandContainer cmdContainer; cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); - EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u); + EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u, false); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); diff --git a/shared/test/unit_test/gen8/test_preamble_gen8.cpp b/shared/test/unit_test/gen8/test_preamble_gen8.cpp index 4e334d30b9..348a6c6fad 100644 --- a/shared/test/unit_test/gen8/test_preamble_gen8.cpp +++ b/shared/test/unit_test/gen8/test_preamble_gen8.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -24,7 +24,7 @@ BDWTEST_F(BdwSlm, WhenL3ConfigIsDispatchedThenProperRegisterAddressAndValueArePr typedef Gen8Family::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM; LinearStream &cs = linearStream; uint32_t l3Config = PreambleHelper::getL3Config(*defaultHwInfo, true); - PreambleHelper::programL3(&cs, l3Config); + PreambleHelper::programL3(&cs, l3Config, false); parseCommands(cs); diff --git a/shared/test/unit_test/gen9/preamble_tests_gen9.cpp b/shared/test/unit_test/gen9/preamble_tests_gen9.cpp index 7d6d615d3c..fef95338c9 100644 --- a/shared/test/unit_test/gen9/preamble_tests_gen9.cpp +++ b/shared/test/unit_test/gen9/preamble_tests_gen9.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -60,7 +60,7 @@ GEN9TEST_F(ThreadArbitrationGen9, givenPreambleWhenItIsProgrammedThenThreadArbit LinearStream &cs = linearStream; uint32_t l3Config = PreambleHelper::getL3Config(*defaultHwInfo, true); MockDevice mockDevice; - PreambleHelper::programPreamble(&linearStream, mockDevice, l3Config, nullptr); + PreambleHelper::programPreamble(&linearStream, mockDevice, l3Config, nullptr, false); parseCommands(cs); diff --git a/shared/test/unit_test/gen9/test_encode_math_gen9.cpp b/shared/test/unit_test/gen9/test_encode_math_gen9.cpp index 31e767898e..d1c306e9f7 100644 --- a/shared/test/unit_test/gen9/test_encode_math_gen9.cpp +++ b/shared/test/unit_test/gen9/test_encode_math_gen9.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2020-2023 Intel Corporation + * Copyright (C) 2020-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -29,7 +29,7 @@ GEN9TEST_F(CommandEncoderMathTestGen9, WhenAppendsAGreaterThanThenPredicateCorre CommandContainer cmdContainer; cmdContainer.initialize(pDevice, nullptr, HeapSize::defaultHeapSize, true, false); - EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u); + EncodeMathMMIO::encodeGreaterThanPredicate(cmdContainer, 0xDEADBEEFCAF0u, 17u, false); GenCmdList commands; CmdParse::parseCommandBuffer(commands, ptrOffset(cmdContainer.getCommandStream()->getCpuBase(), 0), cmdContainer.getCommandStream()->getUsed()); diff --git a/shared/test/unit_test/gen9/test_preamble_gen9.cpp b/shared/test/unit_test/gen9/test_preamble_gen9.cpp index f767475734..d025d10b96 100644 --- a/shared/test/unit_test/gen9/test_preamble_gen9.cpp +++ b/shared/test/unit_test/gen9/test_preamble_gen9.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -26,7 +26,7 @@ GEN9TEST_F(Gen9Slm, WhenL3ConfigIsDispatchedThenProperRegisterAddressAndValueAre LinearStream &cs = linearStream; uint32_t l3Config = PreambleHelper::getL3Config(*defaultHwInfo, true); - PreambleHelper::programL3(&cs, l3Config); + PreambleHelper::programL3(&cs, l3Config, false); parseCommands(cs); diff --git a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp index 945821f5d4..af277d2c8c 100644 --- a/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp +++ b/shared/test/unit_test/helpers/gfx_core_helper_tests.cpp @@ -228,7 +228,7 @@ HWTEST_F(LriHelperTests, givenAddressAndOffsetWhenHelperIsUsedThenProgramCmdStre expectedLri.setRegisterOffset(address); expectedLri.setDataDword(data); - LriHelper::program(&stream, address, data, false); + LriHelper::program(&stream, address, data, false, false); auto lri = genCmdCast(stream.getCpuBase()); ASSERT_NE(nullptr, lri); diff --git a/shared/test/unit_test/preamble/preamble_tests.cpp b/shared/test/unit_test/preamble/preamble_tests.cpp index ecc6b1a985..6e4efbedda 100644 --- a/shared/test/unit_test/preamble/preamble_tests.cpp +++ b/shared/test/unit_test/preamble/preamble_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (C) 2018-2023 Intel Corporation + * Copyright (C) 2018-2024 Intel Corporation * * SPDX-License-Identifier: MIT * @@ -79,7 +79,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, PreambleTest, givenMidThreadPreemptionWhenPreambleIs uintptr_t minCsrAlignment = 2 * 256 * MemoryConstants::kiloByte; MockGraphicsAllocation csrSurface(reinterpret_cast(minCsrAlignment), 1024); - PreambleHelper::programPreamble(&preambleStream, *mockDevice, 0U, &csrSurface); + PreambleHelper::programPreamble(&preambleStream, *mockDevice, 0U, &csrSurface, false); PreemptionHelper::programStateSip(preemptionStream, *mockDevice, nullptr); @@ -220,7 +220,7 @@ HWTEST_F(PreambleTest, givenSetForceSemaphoreDelayBetweenWaitsWhenProgramSemapho auto buffer = std::unique_ptr(new char[bufferSize]); LinearStream stream(buffer.get(), bufferSize); - PreambleHelper::programSemaphoreDelay(&stream); + PreambleHelper::programSemaphoreDelay(&stream, false); HardwareParse hwParser; hwParser.parseCommands(stream); @@ -244,7 +244,7 @@ HWTEST_F(PreambleTest, givenNotSetForceSemaphoreDelayBetweenWaitsWhenProgramSema auto buffer = std::unique_ptr(new char[bufferSize]); LinearStream stream(buffer.get(), bufferSize); - PreambleHelper::programSemaphoreDelay(&stream); + PreambleHelper::programSemaphoreDelay(&stream, false); HardwareParse hwParser; hwParser.parseCommands(stream); diff --git a/shared/test/unit_test/xe_hpg_core/gfx_core_helper_tests_xe_hpg_core.cpp b/shared/test/unit_test/xe_hpg_core/gfx_core_helper_tests_xe_hpg_core.cpp index a156a252da..4d0144b123 100644 --- a/shared/test/unit_test/xe_hpg_core/gfx_core_helper_tests_xe_hpg_core.cpp +++ b/shared/test/unit_test/xe_hpg_core/gfx_core_helper_tests_xe_hpg_core.cpp @@ -89,7 +89,7 @@ XE_HPG_CORETEST_F(LriHelperTestsXeHpgCore, whenProgrammingLriCommandThenExpectMm expectedLri.setDataDword(data); expectedLri.setMmioRemapEnable(true); - LriHelper::program(&stream, address, data, true); + LriHelper::program(&stream, address, data, true, false); MI_LOAD_REGISTER_IMM *lri = genCmdCast(buffer.get()); ASSERT_NE(nullptr, lri);