performance: Skip fence after PC when no post sync write

Related-To: NEO-14642

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk 2025-05-22 15:59:42 +00:00 committed by Compute-Runtime-Automation
parent 8e5b29f55e
commit 86e7d5b276
32 changed files with 58 additions and 54 deletions

View File

@ -262,7 +262,7 @@ size_t CommandQueueHw<gfxCoreFamily>::estimateStreamSizeForExecuteCommandListsRe
}
if (ctx.isDispatchTaskCountPostSyncRequired) {
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(this->device->getNEODevice()->getRootDeviceEnvironment());
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(this->device->getNEODevice()->getRootDeviceEnvironment(), true);
}
if (instructionCacheFlushRequired) {
@ -327,7 +327,7 @@ ze_result_t CommandQueueHw<gfxCoreFamily>::executeCommandListsRegular(
linearStreamSizeEstimate += this->computeDebuggerCmdsSize(ctx);
if (ctx.isDispatchTaskCountPostSyncRequired) {
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(neoDevice->getRootDeviceEnvironment());
linearStreamSizeEstimate += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(neoDevice->getRootDeviceEnvironment(), true);
}
NEO::LinearStream child(nullptr);

View File

@ -240,7 +240,7 @@ struct MultiTileCommandListAppendBarrierFixture : public MultiTileCommandListFix
sizeof(MI_STORE_DATA_IMM) +
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait();
size_t postSyncSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment());
size_t postSyncSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), true);
auto useSizeBefore = cmdListStream->getUsed();
auto result = commandList->appendBarrier(eventHandle, 0, nullptr, false);

View File

@ -390,7 +390,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
auto gpuAddress = event->getGpuAddress(device) + event->getContextEndOffset();
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment()) +
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), true) +
commandList->estimateBufferSizeMultiTileBarrier(device->getNEODevice()->getRootDeviceEnvironment());
size_t usedSize = cmdStream->getUsed();
EXPECT_EQ(expectedSize, usedSize);

View File

@ -520,7 +520,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
auto gpuAddress = event->getGpuAddress(device) + event->getContextEndOffset();
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment());
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), true);
size_t usedSize = cmdStream->getUsed();
EXPECT_EQ(expectedSize, usedSize);
@ -561,7 +561,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent, givenMultiTileAndDynamicPostSy
commandList->partitionCount = 2;
EXPECT_EQ(ZE_RESULT_SUCCESS, commandList->appendSignalEvent(event->toHandle(), false));
size_t expectedSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment());
size_t expectedSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), true);
auto unifiedPostSyncLayout = device->getL0GfxCoreHelper().hasUnifiedPostSyncAllocationLayout();
@ -713,7 +713,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
auto gpuAddress = event->getCompletionFieldGpuAddress(device);
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment());
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), true);
size_t usedSize = cmdStream->getUsed();
EXPECT_EQ(expectedSize, usedSize);
@ -777,7 +777,7 @@ HWTEST2_F(CommandListAppendUsedPacketSignalEvent,
auto gpuAddress = event->getCompletionFieldGpuAddress(device);
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment());
size_t expectedSize = NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), true);
size_t usedSize = cmdStream->getUsed();
EXPECT_EQ(expectedSize, usedSize);

View File

@ -494,7 +494,7 @@ HWTEST_F(CommandQueueCreate, GivenDispatchTaskCountPostSyncRequiredWhenExecuteCo
commandQueue->executeCommandLists(1, &cmdListHandle, nullptr, false, nullptr, nullptr);
auto estimatedSizeWithtBarrier = commandQueue->requiredSizeCalled;
auto sizeForBarrier = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment());
auto sizeForBarrier = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getNEODevice()->getRootDeviceEnvironment(), true);
EXPECT_GT(sizeForBarrier, 0u);
EXPECT_EQ(estimatedSizeWithtBarrier, estimatedSizeWithoutBarrier + sizeForBarrier);

View File

@ -485,7 +485,7 @@ HWTEST_F(CommandQueueSynchronizeTest, givenSynchronousCommandQueueWhenTagUpdateF
} else {
expectedSize += sizeof(MI_BATCH_BUFFER_END);
}
expectedSize += NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(neoDevice->getRootDeviceEnvironment());
expectedSize += NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(neoDevice->getRootDeviceEnvironment(), true);
expectedSize = alignUp(expectedSize, 8);
const ze_command_queue_desc_t desc{ZE_STRUCTURE_TYPE_COMMAND_QUEUE_DESC, nullptr, 0, 0, 0, ZE_COMMAND_QUEUE_MODE_SYNCHRONOUS, ZE_COMMAND_QUEUE_PRIORITY_NORMAL};

View File

@ -75,7 +75,7 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, c
if (blitEnqueue) {
size_t expectedSizeCS = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<GfxFamily>();
if (commandQueueHw.isCacheFlushForBcsRequired()) {
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
}
return expectedSizeCS;
@ -129,7 +129,7 @@ size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, c
if (outEvent) {
auto pEvent = castToObjectOrAbort<Event>(*outEvent);
if ((pEvent->getContext()->getRootDeviceIndices().size() > 1) && (!pEvent->isUserEvent())) {
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
}
}
expectedSizeCS += MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();

View File

@ -508,7 +508,7 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWithRequiredC
size_t dependencySize = numBuffersToEstimate * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<FamilyType>();
auto &rootDeviceEnvironment = device->getRootDeviceEnvironment();
size_t cacheFlushSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
size_t cacheFlushSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
setMockKernelArgs(std::array<Buffer *, 3>{{buffer0.get(), buffer1.get(), buffer2.get()}});

View File

@ -571,7 +571,7 @@ HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelAndZeroSurfacesWhenE
EXPECT_EQ(CL_SUCCESS, enqueueResult);
auto requiredCmdStreamSize = alignUp(MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(
pDevice->getRootDeviceEnvironment()),
pDevice->getRootDeviceEnvironment(), true),
MemoryConstants::cacheLineSize);
EXPECT_EQ(mockCmdQ->getCS(0).getUsed(), requiredCmdStreamSize);

View File

@ -996,7 +996,7 @@ HWTEST_F(EnqueueAuxKernelTests, givenMultipleArgsWhenAuxTranslationIsRequiredThe
auto pipeControls = findAll<typename FamilyType::PIPE_CONTROL *>(cmdList.begin(), cmdList.end());
auto additionalPcCount = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(
pDevice->getRootDeviceEnvironment()) /
pDevice->getRootDeviceEnvironment(), true) /
sizeof(typename FamilyType::PIPE_CONTROL);
// |AuxToNonAux|NDR|NonAuxToAux|

View File

@ -442,7 +442,7 @@ HWTEST_F(GetSizeRequiredBufferTest, GivenOutEventForMultiDeviceContextWhenCalcul
auto baseCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, false, nullptr);
auto extendedCommandStreamSize = EnqueueOperation<FamilyType>::getTotalSizeRequiredCS(CL_COMMAND_MARKER, {}, false, false, false, *pCmdQ, multiDispatchInfo, false, false, false, &clEvent);
EXPECT_EQ(baseCommandStreamSize + MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pContext.getDevices()[0]->getRootDeviceEnvironment()), extendedCommandStreamSize);
EXPECT_EQ(baseCommandStreamSize + MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pContext.getDevices()[0]->getRootDeviceEnvironment(), true), extendedCommandStreamSize);
}
HWTEST2_F(GetSizeRequiredBufferTest, givenMultipleKernelRequiringSshWhenTotalSizeIsComputedThenItIsProperlyAligned, IsHeapfulSupported) {

View File

@ -74,7 +74,7 @@ HWTEST_F(GetSizeRequiredTest, WhenEnqueuingMarkerThenHeapsAndCommandBufferAreNot
size_t expectedStreamSize = 0;
if (pCmdQ->getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() && (!pCmdQ->getGpgpuCommandStreamReceiver().isUpdateTagFromWaitEnabled())) {
expectedStreamSize = alignUp(MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(
pDevice->getRootDeviceEnvironment()),
pDevice->getRootDeviceEnvironment(), true),
MemoryConstants::cacheLineSize);
}
EXPECT_EQ(expectedStreamSize, commandStream.getUsed() - usedBeforeCS);
@ -100,7 +100,7 @@ HWTEST_F(GetSizeRequiredTest, WhenEnqueuingBarrierThenHeapsAndCommandBufferAreNo
size_t expectedStreamSize = 0;
if (pCmdQ->getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
auto unalignedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment()) +
auto unalignedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment(), true) +
EncodeStoreMemory<FamilyType>::getStoreDataImmSize() +
sizeof(typename FamilyType::MI_BATCH_BUFFER_END);
expectedStreamSize = alignUp(unalignedSize, MemoryConstants::cacheLineSize);

View File

@ -1719,7 +1719,7 @@ HWCMDTEST_F(IGFX_GEN12LP_CORE, UltCommandStreamReceiverTest, givenBarrierNodeSet
DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags();
dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies.barrierNodes;
size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags);
EXPECT_EQ(expectedCmdSize, estimatedCmdSize);

View File

@ -801,7 +801,7 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWh
commandStreamReceiver->staticWorkPartitioningEnabled = true;
commandStreamReceiver->activePartitions = 1;
size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags);
EXPECT_EQ(expectedCmdSize, estimatedCmdSize);
@ -848,7 +848,7 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionDisabledW
commandStreamReceiver->staticWorkPartitioningEnabled = false;
commandStreamReceiver->activePartitions = 2;
size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
size_t expectedCmdSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags);
EXPECT_EQ(expectedCmdSize, estimatedCmdSize);
@ -898,7 +898,7 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWh
commandStreamReceiver->staticWorkPartitioningEnabled = true;
commandStreamReceiver->activePartitions = 2;
size_t expectedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment) +
size_t expectedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true) +
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
sizeof(MI_BATCH_BUFFER_START) +
2 * sizeof(uint32_t);

View File

@ -400,7 +400,7 @@ HWTEST_F(TimestampPacketTests, givenPipeControlRequestWithBarrierWriteWhenEstima
flags.isStallingCommandsOnNextFlushRequired = true;
auto sizeWithPcRequest = device->getUltCommandStreamReceiver<FamilyType>().getRequiredCmdStreamSize(flags, device->getDevice());
size_t extendedSize = sizeWithoutPcRequest + MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getRootDeviceEnvironment());
size_t extendedSize = sizeWithoutPcRequest + MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(device->getRootDeviceEnvironment(), true);
EXPECT_EQ(sizeWithPcRequest, extendedSize);
}

View File

@ -962,7 +962,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBufferOperationWithoutKernelWhenEstimati
auto expectedSize = TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<FamilyType>();
if (cmdQ->isCacheFlushForBcsRequired()) {
expectedSize += MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(cmdQ->getDevice().getRootDeviceEnvironment());
expectedSize += MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(cmdQ->getDevice().getRootDeviceEnvironment(), true);
}
EXPECT_EQ(expectedSize, readBufferCmdsSize);

View File

@ -338,7 +338,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
if (!args.makeCommandView) {
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment));
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
args.additionalCommands->push_back(commandBuffer);
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
@ -475,7 +475,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
if (!args.makeCommandView) {
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment));
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, false));
args.additionalCommands->push_back(commandBuffer);
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);

View File

@ -345,7 +345,7 @@ void programPostSyncPipeControlCommand(void *&inputAddress,
rootDeviceEnvironment,
flushArgs);
totalBytesProgrammed += static_cast<uint32_t>(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment));
totalBytesProgrammed += static_cast<uint32_t>(NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true));
}
template <typename GfxFamily>
@ -828,7 +828,7 @@ uint64_t computeBarrierControlSectionOffset(WalkerPartitionArgs &args,
}
if (args.usePostSync) {
offset += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
offset += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
} else {
offset += NEO::MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier();
}

View File

@ -643,7 +643,7 @@ inline bool CommandStreamReceiverHw<GfxFamily>::flushBatchedSubmissions() {
auto lastTaskCount = primaryCmdBuffer->taskCount;
auto lastPipeControlArgs = primaryCmdBuffer->epiloguePipeControlArgs;
auto pipeControlLocationSize = MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment());
auto pipeControlLocationSize = MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment(), true);
FlushStampUpdateHelper flushStampUpdateHelper;
flushStampUpdateHelper.insert(primaryCmdBuffer->flushStamp->getStampReference());
@ -1235,7 +1235,7 @@ SubmissionStatus CommandStreamReceiverHw<GfxFamily>::flushPipeControl(bool state
args.tlbInvalidation = this->isTlbFlushRequiredForStateCacheFlush();
}
auto dispatchSize = MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment()) + this->getCmdSizeForPrologue();
auto dispatchSize = MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment(), true) + this->getCmdSizeForPrologue();
auto &commandStream = getCS(dispatchSize);
auto commandStreamStart = commandStream.getUsed();
@ -2413,7 +2413,7 @@ bool CommandStreamReceiverHw<GfxFamily>::submitDependencyUpdate(TagNodeBase *tag
}
auto ownership = obtainUniqueOwnership();
PipeControlArgs args;
auto expectedSize = MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment()) + this->getCmdSizeForPrologue();
auto expectedSize = MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment(), true) + this->getCmdSizeForPrologue();
auto &commandStream = getCS(expectedSize);
auto commandStreamStart = commandStream.getUsed();
auto cacheFlushTimestampPacketGpuAddress = TimestampPacketHelper::getContextEndGpuAddress(*tag);

View File

@ -169,7 +169,7 @@ inline size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForStallingPostSyncC
false,
true);
} else {
return MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment());
return MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment(), true);
}
}

View File

@ -49,7 +49,7 @@ inline void RenderDispatcher<GfxFamily>::dispatchMonitorFence(LinearStream &cmdB
template <typename GfxFamily>
inline size_t RenderDispatcher<GfxFamily>::getSizeMonitorFence(const RootDeviceEnvironment &rootDeviceEnvironment) {
return MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
return MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
}
template <typename GfxFamily>

View File

@ -309,7 +309,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment()));
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
args.additionalCommands->push_back(commandBuffer);
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);
@ -326,7 +326,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
args.partitionCount = 1;
if (NEO::PauseOnGpuProperties::pauseModeAllowed(NEO::debugManager.flags.PauseOnEnqueue.get(), args.device->debugExecutionCounter.load(), NEO::PauseOnGpuProperties::PauseMode::AfterWorkload)) {
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment()));
void *commandBuffer = listCmdBufferStream->getSpace(MemorySynchronizationCommands<Family>::getSizeForBarrierWithPostSyncOperation(args.device->getRootDeviceEnvironment(), false));
args.additionalCommands->push_back(commandBuffer);
EncodeSemaphore<Family>::applyMiSemaphoreWaitCommand(*listCmdBufferStream, *args.additionalCommands);

View File

@ -129,7 +129,7 @@ inline size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForStallingNoPostSyn
template <typename GfxFamily>
inline size_t CommandStreamReceiverHw<GfxFamily>::getCmdSizeForStallingPostSyncCommands() const {
return MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment());
return MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(peekRootDeviceEnvironment(), true);
}
template <typename GfxFamily>

View File

@ -522,7 +522,7 @@ struct MemorySynchronizationCommands {
static void addStateCacheFlush(LinearStream &commandStream, const RootDeviceEnvironment &rootDeviceEnvironment);
static void addInstructionCacheFlush(LinearStream &commandStream);
static size_t getSizeForBarrierWithPostSyncOperation(const RootDeviceEnvironment &rootDeviceEnvironment);
static size_t getSizeForBarrierWithPostSyncOperation(const RootDeviceEnvironment &rootDeviceEnvironment, bool postSyncWrite);
static size_t getSizeForBarrierWa(const RootDeviceEnvironment &rootDeviceEnvironment);
static size_t getSizeForSingleBarrier();
static size_t getSizeForSingleAdditionalSynchronizationForDirectSubmission(const RootDeviceEnvironment &rootDeviceEnvironment);

View File

@ -198,7 +198,7 @@ template <typename GfxFamily>
void MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(LinearStream &commandStream, PostSyncMode postSyncMode, uint64_t gpuAddress, uint64_t immediateData,
const RootDeviceEnvironment &rootDeviceEnvironment, PipeControlArgs &args) {
void *commandBuffer = commandStream.getSpace(MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment));
void *commandBuffer = commandStream.getSpace(MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, postSyncMode != PostSyncMode::noWrite));
MemorySynchronizationCommands<GfxFamily>::setBarrierWithPostSyncOperation(commandBuffer, postSyncMode, gpuAddress, immediateData, rootDeviceEnvironment, args);
}
@ -220,7 +220,9 @@ void MemorySynchronizationCommands<GfxFamily>::setBarrierWithPostSyncOperation(
MemorySynchronizationCommands<GfxFamily>::setSingleBarrier(commandsBuffer, postSyncMode, gpuAddress, immediateData, args);
commandsBuffer = ptrOffset(commandsBuffer, getSizeForSingleBarrier());
MemorySynchronizationCommands<GfxFamily>::setAdditionalSynchronization(commandsBuffer, gpuAddress, false, rootDeviceEnvironment);
if (postSyncMode != PostSyncMode::noWrite) {
MemorySynchronizationCommands<GfxFamily>::setAdditionalSynchronization(commandsBuffer, gpuAddress, false, rootDeviceEnvironment);
}
}
template <typename GfxFamily>
@ -358,11 +360,13 @@ size_t MemorySynchronizationCommands<GfxFamily>::getSizeForSingleBarrier() {
}
template <typename GfxFamily>
size_t MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(const RootDeviceEnvironment &rootDeviceEnvironment) {
size_t MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(const RootDeviceEnvironment &rootDeviceEnvironment, bool postSyncWrite) {
size_t size = getSizeForSingleBarrier();
size += getSizeForBarrierWa(rootDeviceEnvironment);
size += getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
if (postSyncWrite) {
size += getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
}
return size;
}

View File

@ -182,7 +182,7 @@ struct TimestampPacketHelper {
size_t size = count * TimestampPacketHelper::getRequiredCmdStreamSizeForNodeDependencyWithBlitEnqueue<GfxFamily>();
if (auxTranslationDirection == AuxTranslationDirection::nonAuxToAux && cacheFlushForBcsRequired) {
size += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment);
size += MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true);
}
return size;

View File

@ -15,7 +15,7 @@ namespace NEO {
template <typename GfxFamily>
struct UltMemorySynchronizationCommands : MemorySynchronizationCommands<GfxFamily> {
static size_t getExpectedPipeControlCount(const RootDeviceEnvironment &rootDeviceEnvironment) {
return (MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment) -
return (MemorySynchronizationCommands<GfxFamily>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true) -
MemorySynchronizationCommands<GfxFamily>::getSizeForAdditonalSynchronization(rootDeviceEnvironment)) /
sizeof(typename GfxFamily::PIPE_CONTROL);
}

View File

@ -5775,7 +5775,7 @@ HWTEST2_F(CommandStreamReceiverHwTest, givenImplicitScalingEnabledWhenProgrammin
ultCsr.activePartitions = 2;
ultCsr.staticWorkPartitioningEnabled = true;
size_t barrierWithPostSyncOperationSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment());
size_t barrierWithPostSyncOperationSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment(), true);
size_t expectedSize = barrierWithPostSyncOperationSize +
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
sizeof(MI_BATCH_BUFFER_START) +

View File

@ -53,7 +53,7 @@ HWTEST_F(RenderDispatcherTest, givenRenderWhenAddingPreemptionCmdThenExpectPrope
}
HWTEST_F(RenderDispatcherTest, givenRenderWhenAskingForMonitorFenceCmdSizeThenReturnRequiredPipeControlCmdSize) {
size_t expectedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(this->pDevice->getRootDeviceEnvironment());
size_t expectedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(this->pDevice->getRootDeviceEnvironment(), true);
EXPECT_EQ(expectedSize, RenderDispatcher<FamilyType>::getSizeMonitorFence(this->pDevice->getRootDeviceEnvironment()));
}

View File

@ -1193,7 +1193,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
MockExecutionEnvironment mockExecutionEnvironment{};
size_t expectedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment()) +
size_t expectedSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment(), true) +
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
sizeof(MI_BATCH_BUFFER_START) +
sizeof(WalkerPartition::BarrierControlSection);
@ -1270,7 +1270,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
size_t expectedSize = sizeof(MI_STORE_DATA_IMM) +
MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment()) +
MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment(), true) +
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
sizeof(MI_BATCH_BUFFER_START) +
sizeof(WalkerPartition::BarrierControlSection) +
@ -1354,7 +1354,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, ImplicitScalingTests,
testHardwareInfo.featureTable.flags.ftrLocalMemory = true;
size_t expectedSize = sizeof(MI_ATOMIC) +
MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment()) +
MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(pDevice->getRootDeviceEnvironment(), true) +
sizeof(MI_ATOMIC) + NEO::EncodeSemaphore<FamilyType>::getSizeMiSemaphoreWait() +
sizeof(MI_BATCH_BUFFER_START) +
sizeof(WalkerPartition::BarrierControlSection) +

View File

@ -1493,7 +1493,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests,
MockExecutionEnvironment mockExecutionEnvironment{};
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto expectedOffsetSectionSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment) +
auto expectedOffsetSectionSize = NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
@ -1584,7 +1584,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests,
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto expectedOffsetSectionSize = sizeof(WalkerPartition::MI_STORE_DATA_IMM<FamilyType>) +
NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment) +
NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);
@ -1727,7 +1727,7 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, WalkerPartitionTests,
auto &rootDeviceEnvironment = *mockExecutionEnvironment.rootDeviceEnvironments[0];
auto expectedOffsetSectionSize = sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) +
NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment) +
NEO::MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true) +
sizeof(WalkerPartition::MI_ATOMIC<FamilyType>) + sizeof(WalkerPartition::MI_SEMAPHORE_WAIT<FamilyType>) +
sizeof(WalkerPartition::BATCH_BUFFER_START<FamilyType>);

View File

@ -326,7 +326,7 @@ HWTEST_F(PipeControlHelperTests, givenPostSyncWriteTimestampModeWhenHelperIsUsed
PipeControlArgs args;
MemorySynchronizationCommands<FamilyType>::addBarrierWithPostSyncOperation(
stream, PostSyncMode::timestamp, address, immediateData, rootDeviceEnvironment, args);
auto additionalPcSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment) - sizeof(PIPE_CONTROL);
auto additionalPcSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true) - sizeof(PIPE_CONTROL);
auto pipeControlLocationSize = additionalPcSize - MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
void *cpuPipeControlBuffer = ptrOffset(stream.getCpuBase(), pipeControlLocationSize);
auto pipeControl = genCmdCast<PIPE_CONTROL *>(cpuPipeControlBuffer);
@ -386,7 +386,7 @@ HWTEST_F(PipeControlHelperTests, givenPostSyncWriteImmediateDataModeWhenHelperIs
PipeControlArgs args{};
MemorySynchronizationCommands<FamilyType>::addBarrierWithPostSyncOperation(
stream, PostSyncMode::immediateData, address, immediateData, rootDeviceEnvironment, args);
auto additionalPcSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment) - sizeof(PIPE_CONTROL);
auto additionalPcSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true) - sizeof(PIPE_CONTROL);
auto pipeControlLocationSize = additionalPcSize - MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
void *cpuPipeControlBuffer = ptrOffset(stream.getCpuBase(), pipeControlLocationSize);
auto pipeControl = genCmdCast<PIPE_CONTROL *>(cpuPipeControlBuffer);
@ -425,7 +425,7 @@ HWTEST_F(PipeControlHelperTests, givenNotifyEnableArgumentIsTrueWhenHelperIsUsed
args.notifyEnable = true;
MemorySynchronizationCommands<FamilyType>::addBarrierWithPostSyncOperation(
stream, PostSyncMode::immediateData, address, immediateData, rootDeviceEnvironment, args);
auto additionalPcSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment) - sizeof(PIPE_CONTROL);
auto additionalPcSize = MemorySynchronizationCommands<FamilyType>::getSizeForBarrierWithPostSyncOperation(rootDeviceEnvironment, true) - sizeof(PIPE_CONTROL);
auto pipeControlLocationSize = additionalPcSize - MemorySynchronizationCommands<FamilyType>::getSizeForSingleAdditionalSynchronization(rootDeviceEnvironment);
auto pipeControl = genCmdCast<PIPE_CONTROL *>(ptrOffset(stream.getCpuBase(), pipeControlLocationSize));
ASSERT_NE(nullptr, pipeControl);