diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl index 0805c1b917..85dce1040b 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw.inl @@ -121,7 +121,7 @@ ze_result_t CommandQueueHw::executeCommandListsRegular( linearStreamSizeEstimate += this->estimateLinearStreamSizeComplementary(ctx, phCommandLists, numCommandLists); linearStreamSizeEstimate += this->computePreemptionSize(ctx, phCommandLists, numCommandLists); linearStreamSizeEstimate += this->computeDebuggerCmdsSize(ctx); - linearStreamSizeEstimate += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(this->device->getHwInfo()); + linearStreamSizeEstimate += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(this->device->getHwInfo(), false); NEO::LinearStream child(nullptr); if (const auto ret = this->makeAlignedChildStreamAndSetGpuBase(child, linearStreamSizeEstimate); ret != ZE_RESULT_SUCCESS) { @@ -491,7 +491,7 @@ size_t CommandQueueHw::computePreemptionSize( if (ctx.statePreemption != commandListPreemption) { if (this->preemptionCmdSyncProgramming) { - preemptionSize += NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(); + preemptionSize += NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(false); } preemptionSize += NEO::PreemptionHelper::getRequiredCmdStreamSize(commandListPreemption, ctx.statePreemption); ctx.statePreemption = commandListPreemption; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl index 5f922339ca..cd3237ba57 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_hw_base.inl @@ -92,9 +92,9 @@ void CommandQueueHw::programStateBaseAddress(uint64_t gsba, bool template size_t CommandQueueHw::estimateStateBaseAddressCmdSize() { using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS; - using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - size_t size = sizeof(STATE_BASE_ADDRESS) + sizeof(PIPE_CONTROL) + NEO::EncodeWA::getAdditionalPipelineSelectSize(*device->getNEODevice(), this->csr->isRcs()); + size_t size = sizeof(STATE_BASE_ADDRESS) + NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(false) + + NEO::EncodeWA::getAdditionalPipelineSelectSize(*device->getNEODevice(), this->csr->isRcs()); if (NEO::Debugger::isDebugEnabled(internalUsage) && device->getL0Debugger() != nullptr) { const size_t trackedAddressesCount = 6; diff --git a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl index e552321f74..c9d2ccec18 100644 --- a/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl +++ b/level_zero/core/source/cmdqueue/cmdqueue_xe_hp_core_and_later.inl @@ -94,7 +94,6 @@ void CommandQueueHw::programStateBaseAddress(uint64_t gsba, bool template size_t CommandQueueHw::estimateStateBaseAddressCmdSize() { using STATE_BASE_ADDRESS = typename GfxFamily::STATE_BASE_ADDRESS; - using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; using _3DSTATE_BINDING_TABLE_POOL_ALLOC = typename GfxFamily::_3DSTATE_BINDING_TABLE_POOL_ALLOC; NEO::Device *neoDevice = device->getNEODevice(); @@ -103,7 +102,7 @@ size_t CommandQueueHw::estimateStateBaseAddressCmdSize() { size_t size = 0; if (NEO::ApiSpecificConfig::getBindlessConfiguration()) { - size += sizeof(STATE_BASE_ADDRESS) + sizeof(PIPE_CONTROL) + sizeof(_3DSTATE_BINDING_TABLE_POOL_ALLOC); + size += sizeof(STATE_BASE_ADDRESS) + NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(false) + sizeof(_3DSTATE_BINDING_TABLE_POOL_ALLOC); if (hwInfoConfig.isAdditionalStateBaseAddressWARequired(hwInfo)) { size += sizeof(STATE_BASE_ADDRESS); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp index c172dca78f..5d09847f4d 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_barrier.cpp @@ -341,7 +341,7 @@ HWTEST2_F(MultiTileCommandListAppendBarrier, sizeof(MI_STORE_DATA_IMM) + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT); - size_t postSyncSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(device->getHwInfo()); + size_t postSyncSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(device->getHwInfo(), false); auto useSizeBefore = cmdListStream->getUsed(); auto result = commandList->appendBarrier(eventHandle, 0, nullptr); @@ -450,7 +450,7 @@ HWTEST2_F(MultiTileCommandListAppendBarrier, size_t timestampRegisters = 2 * (sizeof(MI_LOAD_REGISTER_REG) + sizeof(MI_LOAD_REGISTER_IMM) + NEO::EncodeMath::streamCommandSize + sizeof(MI_STORE_REGISTER_MEM)); - size_t postBarrierSynchronization = NEO::MemorySynchronizationCommands::getSizeForSingleBarrier() + + size_t postBarrierSynchronization = NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(false) + NEO::MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(device->getHwInfo()); size_t stopRegisters = timestampRegisters + postBarrierSynchronization; diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp index b67efed19b..7ba220729a 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_event_reset.cpp @@ -327,7 +327,7 @@ HWTEST2_F(CommandListAppendEventReset, auto gpuAddress = event->getGpuAddress(device) + event->getContextEndOffset(); auto &hwInfo = device->getNEODevice()->getHardwareInfo(); - size_t expectedSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo) + + size_t expectedSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false) + ((packets - 1) * sizeof(MI_STORE_DATA_IMM)) + commandList->estimateBufferSizeMultiTileBarrier(hwInfo); size_t usedSize = cmdStream->getUsed(); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp index 25cf6d1bab..8c79581f99 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_signal_event.cpp @@ -214,7 +214,7 @@ HWTEST2_F(CommandListAppendSignalEvent, auto gpuAddress = event->getGpuAddress(device) + event->getContextEndOffset(); auto &hwInfo = device->getNEODevice()->getHardwareInfo(); - size_t expectedSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + size_t expectedSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); size_t usedSize = cmdStream->getUsed(); EXPECT_EQ(expectedSize, usedSize); @@ -327,7 +327,7 @@ HWTEST2_F(CommandListAppendSignalEvent, } auto &hwInfo = device->getNEODevice()->getHardwareInfo(); - size_t expectedSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + size_t expectedSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); size_t usedSize = cmdStream->getUsed(); EXPECT_EQ(expectedSize, usedSize); diff --git a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp index e3ce9173c3..1b3901de11 100644 --- a/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdqueue/test_cmdqueue_2.cpp @@ -445,7 +445,7 @@ HWTEST_F(CommandQueueSynchronizeTest, givenSynchronousCommandQueueWhenTagUpdateF } else { expectedSize += sizeof(MI_BATCH_BUFFER_END); } - expectedSize += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(neoDevice->getHardwareInfo()); + expectedSize += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(neoDevice->getHardwareInfo(), false); expectedSize = alignUp(expectedSize, 8); const ze_command_queue_desc_t desc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL}; diff --git a/opencl/source/built_ins/aux_translation_builtin.h b/opencl/source/built_ins/aux_translation_builtin.h index 7016a02576..5d8a5e9183 100644 --- a/opencl/source/built_ins/aux_translation_builtin.h +++ b/opencl/source/built_ins/aux_translation_builtin.h @@ -87,7 +87,7 @@ class BuiltInOp : public BuiltinDispatchInfoBuilder template static size_t getSizeForSinglePipeControl(size_t, const HardwareInfo &, bool) { - return MemorySynchronizationCommands::getSizeForSingleBarrier(); + return MemorySynchronizationCommands::getSizeForSingleBarrier(false); } template diff --git a/opencl/source/command_queue/gpgpu_walker_base.inl b/opencl/source/command_queue/gpgpu_walker_base.inl index 7cf8f69460..f48ebc211c 100644 --- a/opencl/source/command_queue/gpgpu_walker_base.inl +++ b/opencl/source/command_queue/gpgpu_walker_base.inl @@ -173,7 +173,7 @@ size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, c if (blitEnqueue) { size_t expectedSizeCS = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (commandQueueHw.isCacheFlushForBcsRequired()) { - expectedSizeCS += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + expectedSizeCS += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); } return expectedSizeCS; @@ -195,7 +195,7 @@ size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, c expectedSizeCS += 4 * EncodeStoreMMIO::size; } } else if (isMarkerWithProfiling) { - expectedSizeCS += 2 * MemorySynchronizationCommands::getSizeForSingleBarrier(); + expectedSizeCS += 2 * MemorySynchronizationCommands::getSizeForSingleBarrier(false); if (!HwHelper::get(hwInfo.platform.eRenderCoreFamily).useOnlyGlobalTimestamps()) { expectedSizeCS += 2 * EncodeStoreMMIO::size; } @@ -205,7 +205,7 @@ size_t EnqueueOperation::getTotalSizeRequiredCS(uint32_t eventType, c } if (DebugManager.flags.PauseOnEnqueue.get() != -1) { - expectedSizeCS += MemorySynchronizationCommands::getSizeForSingleBarrier() * 2; + expectedSizeCS += MemorySynchronizationCommands::getSizeForSingleBarrier(false) * 2; expectedSizeCS += sizeof(typename GfxFamily::MI_SEMAPHORE_WAIT) * 2; } @@ -231,7 +231,7 @@ template size_t EnqueueOperation::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) { size_t size = 0; if (reserveProfilingCmdsSpace) { - size += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); + size += 2 * MemorySynchronizationCommands::getSizeForSingleBarrier(false) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM); } return size; } diff --git a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl index 84d9bd7f5b..0ccfb1e681 100644 --- a/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl +++ b/opencl/source/command_queue/gpgpu_walker_xehp_and_later.inl @@ -130,10 +130,10 @@ void GpgpuWalkerHelper::adjustMiStoreRegMemMode(MI_STORE_REG_MEM size_t EnqueueOperation::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel, const DispatchInfo &dispatchInfo) { - size_t numPipeControls = MemorySynchronizationCommands::isBarrierWaRequired(commandQueue.getDevice().getHardwareInfo()) ? 2 : 1; + size_t numBarriers = MemorySynchronizationCommands::isBarrierWaRequired(commandQueue.getDevice().getHardwareInfo()) ? 2 : 1; size_t size = sizeof(typename GfxFamily::COMPUTE_WALKER) + - (sizeof(typename GfxFamily::PIPE_CONTROL) * numPipeControls) + + (MemorySynchronizationCommands::getSizeForSingleBarrier(false) * numBarriers) + HardwareCommandsHelper::getSizeRequiredCS() + EncodeMemoryPrefetch::getSizeForMemoryPrefetch(pKernel->getKernelInfo().heapInfo.KernelHeapSize, commandQueue.getDevice().getHardwareInfo()); auto devices = commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getDeviceBitfield(); @@ -170,7 +170,7 @@ size_t EnqueueOperation::getSizeForCacheFlushAfterWalkerCommands(cons size_t size = 0; if (kernel.requiresCacheFlushCommand(commandQueue)) { - size += sizeof(typename GfxFamily::PIPE_CONTROL); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); if constexpr (GfxFamily::isUsingL3Control) { StackVec allocationsForCacheFlush; diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp index aa2656ab17..feec4a8226 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_1_tests.cpp @@ -481,7 +481,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWithRequiredC size_t numBuffersToEstimate = 2; size_t dependencySize = numBuffersToEstimate * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); - size_t cacheFlushSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + size_t cacheFlushSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index fe7e95741e..2a23ca22e6 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -463,7 +463,7 @@ HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelAndZeroSurfacesWhenE EXPECT_EQ(CL_SUCCESS, enqueueResult); auto requiredCmdStreamSize = alignUp(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation( - pDevice->getHardwareInfo()), + pDevice->getHardwareInfo(), false), MemoryConstants::cacheLineSize); EXPECT_EQ(mockCmdQ->getCS(0).getUsed(), requiredCmdStreamSize); diff --git a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp index d4ee6088dd..91e6a50e38 100644 --- a/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_kernel_2_tests.cpp @@ -900,7 +900,7 @@ HWTEST_F(EnqueueAuxKernelTests, givenMultipleArgsWhenAuxTranslationIsRequiredThe auto pipeControls = findAll(cmdList.begin(), cmdList.end()); auto additionalPcCount = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation( - pDevice->getHardwareInfo()) / + pDevice->getHardwareInfo(), false) / sizeof(typename FamilyType::PIPE_CONTROL); // |AuxToNonAux|NDR|NonAuxToAux| @@ -1016,7 +1016,7 @@ HWTEST_F(EnqueueKernelTest, givenTimestampWriteEnableWhenMarkerProfilingWithoutW auto baseCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false); auto extendedCommandStreamSize = EnqueueOperation::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, true, false); - EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size + MemorySynchronizationCommands::getSizeForSingleBarrier(), extendedCommandStreamSize); + EXPECT_EQ(baseCommandStreamSize + 4 * EncodeStoreMMIO::size + MemorySynchronizationCommands::getSizeForSingleBarrier(false), extendedCommandStreamSize); } HWCMDTEST_F(IGFX_XE_HP_CORE, EnqueueKernelTest, givenTimestampWriteEnableOnMultiTileQueueWhenMarkerProfilingWithoutWaitListThenSizeHasFourMMIOStoresAndCrossTileBarrier) { diff --git a/opencl/test/unit_test/command_queue/get_size_required_tests.cpp b/opencl/test/unit_test/command_queue/get_size_required_tests.cpp index a44b816831..ad74303050 100644 --- a/opencl/test/unit_test/command_queue/get_size_required_tests.cpp +++ b/opencl/test/unit_test/command_queue/get_size_required_tests.cpp @@ -73,7 +73,7 @@ HWTEST_F(GetSizeRequiredTest, WhenEnqueuingMarkerThenHeapsAndCommandBufferAreNot size_t expectedStreamSize = 0; if (pCmdQ->getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { expectedStreamSize = alignUp(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation( - pDevice->getHardwareInfo()), + pDevice->getHardwareInfo(), false), MemoryConstants::cacheLineSize); } EXPECT_EQ(expectedStreamSize, commandStream.getUsed() - usedBeforeCS); @@ -100,7 +100,7 @@ HWTEST_F(GetSizeRequiredTest, WhenEnqueuingBarrierThenHeapsAndCommandBufferAreNo size_t expectedStreamSize = 0; if (pCmdQ->getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { expectedStreamSize = alignUp(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation( - pDevice->getHardwareInfo()), + pDevice->getHardwareInfo(), false), MemoryConstants::cacheLineSize); } diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_1_tests.cpp index e7573ad14f..58ef9ea6c8 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_1_tests.cpp @@ -118,7 +118,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, CommandStreamReceiverFlushTaskTests, givenCsrInBatch //we do level change that will emit PPC, fill all the space so only BB end fits. taskLevel++; - auto ppcSize = MemorySynchronizationCommands::getSizeForSingleBarrier(); + auto ppcSize = MemorySynchronizationCommands::getSizeForSingleBarrier(false); auto fillSize = MemoryConstants::cacheLineSize - ppcSize - sizeof(typename FamilyType::MI_BATCH_BUFFER_END); csrCommandStream.getSpace(fillSize); auto expectedUsedSize = 2 * MemoryConstants::cacheLineSize; diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp index 846e447d60..2cc97232c0 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_flush_task_tests_xehp_and_later.cpp @@ -383,7 +383,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, CommandStreamReceiverFlushTaskXeHPAndLaterTests, gi //we do level change that will emit PPC, fill all the space so only BB end fits. taskLevel++; - auto ppcSize = MemorySynchronizationCommands::getSizeForSingleBarrier(); + auto ppcSize = MemorySynchronizationCommands::getSizeForSingleBarrier(false); auto fillSize = MemoryConstants::cacheLineSize - ppcSize - sizeof(typename FamilyType::MI_BATCH_BUFFER_END); csrCommandStream.getSpace(fillSize); auto expectedUsedSize = 2 * MemoryConstants::cacheLineSize; diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index 6cb2050591..25fbf9281a 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -1502,7 +1502,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, UltCommandStreamReceiverTest, givenBarrierNodeSetWhe DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); dispatchFlags.barrierTimestampPacketNodes = ×tampPacketDependencies.barrierNodes; - size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); EXPECT_EQ(expectedCmdSize, estimatedCmdSize); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp index bfd0e85437..1ae2016e7a 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp @@ -973,7 +973,7 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWh commandStreamReceiver->staticWorkPartitioningEnabled = true; commandStreamReceiver->activePartitions = 1; - size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); EXPECT_EQ(expectedCmdSize, estimatedCmdSize); @@ -1021,7 +1021,7 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionDisabledW commandStreamReceiver->staticWorkPartitioningEnabled = false; commandStreamReceiver->activePartitions = 2; - size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + size_t expectedCmdSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); EXPECT_EQ(expectedCmdSize, estimatedCmdSize); @@ -1072,7 +1072,7 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWh commandStreamReceiver->staticWorkPartitioningEnabled = true; commandStreamReceiver->activePartitions = 2; - size_t expectedSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo) + + size_t expectedSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false) + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + sizeof(MI_BATCH_BUFFER_START) + 2 * sizeof(uint32_t); diff --git a/opencl/test/unit_test/helpers/hw_helper_tests.cpp b/opencl/test/unit_test/helpers/hw_helper_tests.cpp index 26e1228a78..36370bb255 100644 --- a/opencl/test/unit_test/helpers/hw_helper_tests.cpp +++ b/opencl/test/unit_test/helpers/hw_helper_tests.cpp @@ -299,7 +299,7 @@ HWTEST_F(PipeControlHelperTests, givenPostSyncWriteTimestampModeWhenHelperIsUsed PipeControlArgs args; MemorySynchronizationCommands::addBarrierWithPostSyncOperation( stream, PostSyncMode::Timestamp, address, immediateData, hardwareInfo, args); - auto additionalPcSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hardwareInfo) - sizeof(PIPE_CONTROL); + auto additionalPcSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hardwareInfo, false) - sizeof(PIPE_CONTROL); auto pipeControlLocationSize = additionalPcSize - MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hardwareInfo); auto pipeControl = genCmdCast(ptrOffset(stream.getCpuBase(), pipeControlLocationSize)); ASSERT_NE(nullptr, pipeControl); @@ -348,7 +348,7 @@ HWTEST_F(PipeControlHelperTests, givenPostSyncWriteImmediateDataModeWhenHelperIs PipeControlArgs args; MemorySynchronizationCommands::addBarrierWithPostSyncOperation( stream, PostSyncMode::ImmediateData, address, immediateData, hardwareInfo, args); - auto additionalPcSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hardwareInfo) - sizeof(PIPE_CONTROL); + auto additionalPcSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hardwareInfo, false) - sizeof(PIPE_CONTROL); auto pipeControlLocationSize = additionalPcSize - MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hardwareInfo); auto pipeControl = genCmdCast(ptrOffset(stream.getCpuBase(), pipeControlLocationSize)); ASSERT_NE(nullptr, pipeControl); @@ -378,7 +378,7 @@ HWTEST_F(PipeControlHelperTests, givenNotifyEnableArgumentIsTrueWhenHelperIsUsed args.notifyEnable = true; MemorySynchronizationCommands::addBarrierWithPostSyncOperation( stream, PostSyncMode::ImmediateData, address, immediateData, hardwareInfo, args); - auto additionalPcSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hardwareInfo) - sizeof(PIPE_CONTROL); + auto additionalPcSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hardwareInfo, false) - sizeof(PIPE_CONTROL); auto pipeControlLocationSize = additionalPcSize - MemorySynchronizationCommands::getSizeForSingleAdditionalSynchronization(hardwareInfo); auto pipeControl = genCmdCast(ptrOffset(stream.getCpuBase(), pipeControlLocationSize)); ASSERT_NE(nullptr, pipeControl); diff --git a/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp b/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp index be4b1ceb9d..bab58544ea 100644 --- a/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp +++ b/opencl/test/unit_test/helpers/timestamp_packet_2_tests.cpp @@ -257,7 +257,7 @@ HWTEST_F(TimestampPacketTests, givenPipeControlRequestWithBarrierWriteWhenEstima csr.stallingCommandsOnNextFlushRequired = true; auto sizeWithPcRequest = device->getUltCommandStreamReceiver().getRequiredCmdStreamSize(flags, device->getDevice()); - size_t extendedSize = sizeWithoutPcRequest + MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(device->getHardwareInfo()); + size_t extendedSize = sizeWithoutPcRequest + MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(device->getHardwareInfo(), false); EXPECT_EQ(sizeWithPcRequest, extendedSize); } diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 7070f2eae1..a8f32b5a7d 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -769,7 +769,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (cmdQ->isCacheFlushForBcsRequired()) { - expectedSize += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + expectedSize += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); } EXPECT_EQ(expectedSize, readBufferCmdsSize); diff --git a/shared/source/command_container/encode_compute_mode_tgllp_and_later.inl b/shared/source/command_container/encode_compute_mode_tgllp_and_later.inl index 32e788cd74..e87d51ee5f 100644 --- a/shared/source/command_container/encode_compute_mode_tgllp_and_later.inl +++ b/shared/source/command_container/encode_compute_mode_tgllp_and_later.inl @@ -20,11 +20,11 @@ size_t EncodeComputeMode::getCmdSizeForComputeMode(const HardwareInfo &h std::ignore = isExtendedWARequired; if (isBasicWARequired) { - size += sizeof(typename Family::PIPE_CONTROL); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); } size += sizeof(typename Family::STATE_COMPUTE_MODE); if (hasSharedHandles) { - size += sizeof(typename Family::PIPE_CONTROL); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); } if (hwInfoConfig.is3DPipelineSelectWARequired() && isRcs) { size += (2 * PreambleHelper::getCmdSizeForPipelineSelect(hwInfo)); @@ -37,8 +37,6 @@ inline void EncodeComputeMode::programComputeModeCommandWithSynchronizat LinearStream &csr, StateComputeModeProperties &properties, const PipelineSelectArgs &args, bool hasSharedHandles, const HardwareInfo &hwInfo, bool isRcs, LogicalStateHelper *logicalStateHelper) { - using PIPE_CONTROL = typename Family::PIPE_CONTROL; - NEO::EncodeWA::encodeAdditionalPipelineSelect(csr, args, true, hwInfo, isRcs); auto &hwInfoConfig = (*HwInfoConfig::get(hwInfo.platform.eProductFamily)); diff --git a/shared/source/command_container/walker_partition_xehp_and_later.h b/shared/source/command_container/walker_partition_xehp_and_later.h index 72859497ca..1e3fb10318 100644 --- a/shared/source/command_container/walker_partition_xehp_and_later.h +++ b/shared/source/command_container/walker_partition_xehp_and_later.h @@ -318,7 +318,7 @@ void programPostSyncPipeControlCommand(void *&inputAddress, hwInfo, flushArgs); - totalBytesProgrammed += static_cast(NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo)); + totalBytesProgrammed += static_cast(NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, flushArgs.tlbInvalidation)); } template @@ -755,9 +755,9 @@ uint64_t computeBarrierControlSectionOffset(WalkerPartitionArgs &args, } if (args.usePostSync) { - offset += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + offset += NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); } else { - offset += sizeof(PIPE_CONTROL); + offset += NEO::MemorySynchronizationCommands::getSizeForSingleBarrier(false); } offset += (computeTilesSynchronizationWithAtomicsSectionSize() + diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index cd4f6e622b..bdd2d5c2a3 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -723,7 +723,6 @@ inline bool CommandStreamReceiverHw::flushBatchedSubmissions() { ResidencyContainer surfacesForSubmit; ResourcePackage resourcePackage; const auto &hwInfo = peekHwInfo(); - auto pipeControlLocationSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); void *currentPipeControlForNooping = nullptr; void *epiloguePipeControlLocation = nullptr; @@ -736,6 +735,8 @@ inline bool CommandStreamReceiverHw::flushBatchedSubmissions() { auto lastTaskCount = primaryCmdBuffer->taskCount; auto lastPipeControlArgs = primaryCmdBuffer->epiloguePipeControlArgs; + auto pipeControlLocationSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, lastPipeControlArgs.tlbInvalidation); + FlushStampUpdateHelper flushStampUpdateHelper; flushStampUpdateHelper.insert(primaryCmdBuffer->flushStamp->getStampReference()); @@ -848,7 +849,7 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat if (!this->isStateSipSent || device.getDebugger()) { size += PreemptionHelper::getRequiredStateSipCmdSize(device, isRcs()); } - size += MemorySynchronizationCommands::getSizeForSingleBarrier(); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); size += sizeof(typename GfxFamily::MI_BATCH_BUFFER_START); size += getCmdSizeForL3Config(); @@ -886,11 +887,11 @@ size_t CommandStreamReceiverHw::getRequiredCmdStreamSize(const Dispat } if (requiresInstructionCacheFlush) { - size += sizeof(typename GfxFamily::PIPE_CONTROL); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); } if (DebugManager.flags.ForcePipeControlPriorToWalker.get()) { - size += 2 * sizeof(PIPE_CONTROL); + size += 2 * MemorySynchronizationCommands::getSizeForSingleBarrier(false); } return size; @@ -1222,13 +1223,15 @@ void CommandStreamReceiverHw::flushPipeControl() { auto lock = obtainUniqueOwnership(); const auto &hwInfo = peekHwInfo(); - auto &commandStream = getCS(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo)); - auto commandStreamStart = commandStream.getUsed(); PipeControlArgs args; args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); args.notifyEnable = isUsedNotifyEnableForPostSync(); args.workloadPartitionOffset = isMultiTileOperationEnabled(); + + auto &commandStream = getCS(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, args.tlbInvalidation)); + auto commandStreamStart = commandStream.getUsed(); + MemorySynchronizationCommands::addBarrierWithPostSyncOperation(commandStream, PostSyncMode::ImmediateData, getTagAllocation()->getGpuAddress(), diff --git a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl index 7b0f7c8061..6f31bab436 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_bdw_and_later.inl @@ -141,7 +141,7 @@ inline size_t CommandStreamReceiverHw::getCmdSizeForStallingNoPostSyn template inline size_t CommandStreamReceiverHw::getCmdSizeForStallingPostSyncCommands() const { - return MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(peekHwInfo()); + return MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(peekHwInfo(), false); } template diff --git a/shared/source/command_stream/command_stream_receiver_hw_dg2_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_dg2_and_later.inl index 25a8b746bc..1aabcb3bd2 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_dg2_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_dg2_and_later.inl @@ -41,7 +41,7 @@ size_t CommandStreamReceiverHw::getCmdSizeForPerDssBackedBuffer(const Ha std::ignore = isBasicWARequired; if (isExtendedWARequired) { - size += sizeof(typename Family::PIPE_CONTROL); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); } return size; diff --git a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl index cd8093da72..80b2c7cbc4 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_xehp_and_later.inl @@ -30,7 +30,7 @@ template size_t CommandStreamReceiverHw::getRequiredStateBaseAddressSize(const Device &device) const { size_t size = sizeof(typename GfxFamily::STATE_BASE_ADDRESS); size += sizeof(typename GfxFamily::_3DSTATE_BINDING_TABLE_POOL_ALLOC); - size += sizeof(PIPE_CONTROL); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); auto &hwInfo = *device.getRootDeviceEnvironment().getHardwareInfo(); auto &hwInfoConfig = *HwInfoConfig::get(hwInfo.platform.eProductFamily); @@ -157,7 +157,7 @@ inline size_t CommandStreamReceiverHw::getCmdSizeForStallingNoPostSyn false, false); } else { - return sizeof(typename GfxFamily::PIPE_CONTROL); + return MemorySynchronizationCommands::getSizeForSingleBarrier(false); } } @@ -168,7 +168,7 @@ inline size_t CommandStreamReceiverHw::getCmdSizeForStallingPostSyncC false, true); } else { - return MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(peekHwInfo()); + return MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(peekHwInfo(), false); } } diff --git a/shared/source/command_stream/experimental_command_buffer.inl b/shared/source/command_stream/experimental_command_buffer.inl index ffa3662c00..b415337ebf 100644 --- a/shared/source/command_stream/experimental_command_buffer.inl +++ b/shared/source/command_stream/experimental_command_buffer.inl @@ -62,11 +62,9 @@ size_t ExperimentalCommandBuffer::getTotalExperimentalSize() noexcept { template size_t ExperimentalCommandBuffer::getTimeStampPipeControlSize() noexcept { - using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - // Two P_C for timestamps return 2 * MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation( - *commandStreamReceiver->peekExecutionEnvironment().rootDeviceEnvironments[commandStreamReceiver->getRootDeviceIndex()]->getHardwareInfo()); + *commandStreamReceiver->peekExecutionEnvironment().rootDeviceEnvironments[commandStreamReceiver->getRootDeviceIndex()]->getHardwareInfo(), false); } template diff --git a/shared/source/command_stream/preemption_xehp_and_later.inl b/shared/source/command_stream/preemption_xehp_and_later.inl index 0196e2d0d1..acc0943f15 100644 --- a/shared/source/command_stream/preemption_xehp_and_later.inl +++ b/shared/source/command_stream/preemption_xehp_and_later.inl @@ -79,7 +79,7 @@ size_t PreemptionHelper::getRequiredStateSipCmdSize(Device &device, b HwHelper &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); if (hwHelper.isSipWANeeded(hwInfo)) { - size += sizeof(typename GfxFamily::PIPE_CONTROL); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); size += 2 * sizeof(typename GfxFamily::MI_LOAD_REGISTER_IMM); } else { auto hwInfoConfig = HwInfoConfig::get(hwInfo.platform.eProductFamily); @@ -87,7 +87,7 @@ size_t PreemptionHelper::getRequiredStateSipCmdSize(Device &device, b const auto isWARequired = isBasicWARequired || isExtendedWARequired; if (isWARequired) { - size += sizeof(typename GfxFamily::PIPE_CONTROL); + size += MemorySynchronizationCommands::getSizeForSingleBarrier(false); } size += sizeof(typename GfxFamily::STATE_SIP); } diff --git a/shared/source/direct_submission/dispatchers/render_dispatcher.inl b/shared/source/direct_submission/dispatchers/render_dispatcher.inl index 6d44b73b93..5fa8f9f13e 100644 --- a/shared/source/direct_submission/dispatchers/render_dispatcher.inl +++ b/shared/source/direct_submission/dispatchers/render_dispatcher.inl @@ -46,7 +46,7 @@ inline void RenderDispatcher::dispatchMonitorFence(LinearStream &cmdB template inline size_t RenderDispatcher::getSizeMonitorFence(const HardwareInfo &hwInfo) { - return MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + return MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); } template @@ -72,7 +72,7 @@ inline size_t RenderDispatcher::getSizeCacheFlush(const HardwareInfo template inline size_t RenderDispatcher::getSizeTlbFlush() { - return MemorySynchronizationCommands::getSizeForSingleBarrier(); + return MemorySynchronizationCommands::getSizeForSingleBarrier(true); } } // namespace NEO diff --git a/shared/source/helpers/hw_helper.h b/shared/source/helpers/hw_helper.h index 2aa3f88a00..fee2902faf 100644 --- a/shared/source/helpers/hw_helper.h +++ b/shared/source/helpers/hw_helper.h @@ -452,9 +452,9 @@ struct MemorySynchronizationCommands { static void addFullCacheFlush(LinearStream &commandStream, const HardwareInfo &hwInfo); static void setCacheFlushExtraProperties(PipeControlArgs &args); - static size_t getSizeForBarrierWithPostSyncOperation(const HardwareInfo &hwInfo); + static size_t getSizeForBarrierWithPostSyncOperation(const HardwareInfo &hwInfo, bool tlbInvalidationRequired); static size_t getSizeForBarrierWa(const HardwareInfo &hwInfo); - static size_t getSizeForSingleBarrier(); + static size_t getSizeForSingleBarrier(bool tlbInvalidationRequired); static size_t getSizeForSingleAdditionalSynchronizationForDirectSubmission(const HardwareInfo &hwInfo); static size_t getSizeForSingleAdditionalSynchronization(const HardwareInfo &hwInfo); static size_t getSizeForAdditonalSynchronization(const HardwareInfo &hwInfo); @@ -462,8 +462,6 @@ struct MemorySynchronizationCommands { static bool isBarrierWaRequired(const HardwareInfo &hwInfo); static bool isBarrierlPriorToPipelineSelectWaRequired(const HardwareInfo &hwInfo); - - protected: static void setBarrierExtraProperties(void *barrierCmd, PipeControlArgs &args); }; diff --git a/shared/source/helpers/hw_helper_base.inl b/shared/source/helpers/hw_helper_base.inl index 3ecae1b534..fd502f0f93 100644 --- a/shared/source/helpers/hw_helper_base.inl +++ b/shared/source/helpers/hw_helper_base.inl @@ -216,7 +216,7 @@ template void MemorySynchronizationCommands::addBarrierWithPostSyncOperation(LinearStream &commandStream, PostSyncMode postSyncMode, uint64_t gpuAddress, uint64_t immediateData, const HardwareInfo &hwInfo, PipeControlArgs &args) { - void *commandBuffer = commandStream.getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo)); + void *commandBuffer = commandStream.getSpace(MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, args.tlbInvalidation)); MemorySynchronizationCommands::setBarrierWithPostSyncOperation(commandBuffer, postSyncMode, gpuAddress, immediateData, hwInfo, args); } @@ -234,7 +234,7 @@ void MemorySynchronizationCommands::setBarrierWithPostSyncOperation( setPostSyncExtraProperties(args, hwInfo); MemorySynchronizationCommands::setSingleBarrier(commandsBuffer, postSyncMode, gpuAddress, immediateData, args); - commandsBuffer = ptrOffset(commandsBuffer, getSizeForSingleBarrier()); + commandsBuffer = ptrOffset(commandsBuffer, getSizeForSingleBarrier(args.tlbInvalidation)); MemorySynchronizationCommands::setAdditionalSynchronization(commandsBuffer, gpuAddress, false, hwInfo); } @@ -251,7 +251,7 @@ void MemorySynchronizationCommands::setSingleBarrier(void *commandsBu template void MemorySynchronizationCommands::addSingleBarrier(LinearStream &commandStream, PostSyncMode postSyncMode, uint64_t gpuAddress, uint64_t immediateData, PipeControlArgs &args) { - auto barrier = commandStream.getSpace(MemorySynchronizationCommands::getSizeForSingleBarrier()); + auto barrier = commandStream.getSpace(MemorySynchronizationCommands::getSizeForSingleBarrier(args.tlbInvalidation)); setSingleBarrier(barrier, postSyncMode, gpuAddress, immediateData, args); } @@ -368,13 +368,13 @@ bool MemorySynchronizationCommands::getDcFlushEnable(bool isFlushPref } template -size_t MemorySynchronizationCommands::getSizeForSingleBarrier() { +size_t MemorySynchronizationCommands::getSizeForSingleBarrier(bool tlbInvalidationRequired) { return sizeof(typename GfxFamily::PIPE_CONTROL); } template -size_t MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(const HardwareInfo &hwInfo) { - size_t size = getSizeForSingleBarrier() + +size_t MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(const HardwareInfo &hwInfo, bool tlbInvalidationRequired) { + size_t size = getSizeForSingleBarrier(tlbInvalidationRequired) + getSizeForBarrierWa(hwInfo) + getSizeForSingleAdditionalSynchronization(hwInfo); return size; @@ -384,7 +384,7 @@ template size_t MemorySynchronizationCommands::getSizeForBarrierWa(const HardwareInfo &hwInfo) { size_t size = 0; if (MemorySynchronizationCommands::isBarrierWaRequired(hwInfo)) { - size = getSizeForSingleBarrier() + + size = getSizeForSingleBarrier(false) + getSizeForSingleAdditionalSynchronization(hwInfo); } return size; @@ -532,16 +532,11 @@ size_t HwHelperHw::getSingleTimestampPacketSizeHw() { template size_t MemorySynchronizationCommands::getSizeForFullCacheFlush() { - return sizeof(typename GfxFamily::PIPE_CONTROL); + return MemorySynchronizationCommands::getSizeForSingleBarrier(true); } template void MemorySynchronizationCommands::addFullCacheFlush(LinearStream &commandStream, const HardwareInfo &hwInfo) { - using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL; - - PIPE_CONTROL *pipeControl = commandStream.getSpaceForCmd(); - PIPE_CONTROL cmd = GfxFamily::cmdInitPipeControl; - PipeControlArgs args; args.dcFlushEnable = MemorySynchronizationCommands::getDcFlushEnable(true, hwInfo); args.renderTargetCacheFlushEnable = true; @@ -552,8 +547,7 @@ void MemorySynchronizationCommands::addFullCacheFlush(LinearStream &c args.stateCacheInvalidationEnable = true; args.tlbInvalidation = true; MemorySynchronizationCommands::setCacheFlushExtraProperties(args); - MemorySynchronizationCommands::setSingleBarrier(&cmd, args); - *pipeControl = cmd; + MemorySynchronizationCommands::addSingleBarrier(commandStream, args); } template diff --git a/shared/source/helpers/timestamp_packet.h b/shared/source/helpers/timestamp_packet.h index e51b497f29..60d1082647 100644 --- a/shared/source/helpers/timestamp_packet.h +++ b/shared/source/helpers/timestamp_packet.h @@ -189,7 +189,7 @@ struct TimestampPacketHelper { size_t size = count * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue(); if (auxTranslationDirection == AuxTranslationDirection::NonAuxToAux && cacheFlushForBcsRequired) { - size += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo); + size += MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false); } return size; diff --git a/shared/test/common/helpers/ult_hw_helper.h b/shared/test/common/helpers/ult_hw_helper.h index 1cb6a43d53..01d0bc7ded 100644 --- a/shared/test/common/helpers/ult_hw_helper.h +++ b/shared/test/common/helpers/ult_hw_helper.h @@ -14,7 +14,7 @@ namespace NEO { template struct UltMemorySynchronizationCommands : MemorySynchronizationCommands { static size_t getExpectedPipeControlCount(const HardwareInfo &hwInfo) { - return (MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo) - + return (MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hwInfo, false) - MemorySynchronizationCommands::getSizeForAdditonalSynchronization(hwInfo)) / sizeof(typename GfxFamily::PIPE_CONTROL); } diff --git a/shared/test/unit_test/direct_submission/dispatchers/render_dispatcher_tests.cpp b/shared/test/unit_test/direct_submission/dispatchers/render_dispatcher_tests.cpp index 04089f9d37..66ad585819 100644 --- a/shared/test/unit_test/direct_submission/dispatchers/render_dispatcher_tests.cpp +++ b/shared/test/unit_test/direct_submission/dispatchers/render_dispatcher_tests.cpp @@ -50,7 +50,7 @@ HWTEST_F(RenderDispatcherTest, givenRenderWhenAddingPreemptionCmdThenExpectPrope } HWTEST_F(RenderDispatcherTest, givenRenderWhenAskingForMonitorFenceCmdSizeThenReturnRequiredPipeControlCmdSize) { - size_t expectedSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hardwareInfo); + size_t expectedSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(hardwareInfo, false); EXPECT_EQ(expectedSize, RenderDispatcher::getSizeMonitorFence(hardwareInfo)); } diff --git a/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp b/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp index a3e71b2c65..4a743d0d36 100644 --- a/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp +++ b/shared/test/unit_test/encoders/test_implicit_scaling_xehp_and_later.cpp @@ -1153,7 +1153,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; - size_t expectedSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo) + + size_t expectedSize = MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo, false) + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + sizeof(MI_BATCH_BUFFER_START) + sizeof(WalkerPartition::BarrierControlSection); @@ -1230,7 +1230,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START; size_t expectedSize = sizeof(MI_STORE_DATA_IMM) + - MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo) + + MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo, false) + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + sizeof(MI_BATCH_BUFFER_START) + sizeof(WalkerPartition::BarrierControlSection) + @@ -1314,7 +1314,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests, testHardwareInfo.featureTable.flags.ftrLocalMemory = true; size_t expectedSize = sizeof(MI_ATOMIC) + - MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo) + + MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo, false) + sizeof(MI_ATOMIC) + sizeof(MI_SEMAPHORE_WAIT) + sizeof(MI_BATCH_BUFFER_START) + sizeof(WalkerPartition::BarrierControlSection) + diff --git a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_1.cpp b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_1.cpp index 79e05ff674..dc49ef80e4 100644 --- a/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_1.cpp +++ b/shared/test/unit_test/encoders/walker_partition_tests_xehp_and_later_1.cpp @@ -1385,7 +1385,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, uint32_t totalBytesProgrammed = 0u; uint64_t gpuVirtualAddress = 0xFF0000; - auto expectedOffsetSectionSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo) + + auto expectedOffsetSectionSize = NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo, false) + sizeof(WalkerPartition::MI_ATOMIC) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + sizeof(WalkerPartition::BATCH_BUFFER_START); @@ -1473,7 +1473,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, uint64_t gpuVirtualAddress = 0xFF0000; auto expectedOffsetSectionSize = sizeof(WalkerPartition::MI_STORE_DATA_IMM) + - NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo) + + NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo, false) + sizeof(WalkerPartition::MI_ATOMIC) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + sizeof(WalkerPartition::BATCH_BUFFER_START); @@ -1614,7 +1614,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests, uint64_t gpuVirtualAddress = 0xFF0000; auto expectedOffsetSectionSize = sizeof(WalkerPartition::MI_ATOMIC) + - NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo) + + NEO::MemorySynchronizationCommands::getSizeForBarrierWithPostSyncOperation(testHardwareInfo, false) + sizeof(WalkerPartition::MI_ATOMIC) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT) + sizeof(WalkerPartition::BATCH_BUFFER_START);