diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 12f83da17d..7ed8cf6594 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -348,7 +348,12 @@ class CommandQueue : public BaseObject<_cl_command_queue> { void fillCsrDependenciesWithLastBcsPackets(CsrDependencies &csrDeps); void clearLastBcsPackets(); - void setStallingCommandsOnNextFlush(const bool isStallingCommandsOnNextFlushRequired) { stallingCommandsOnNextFlushRequired = isStallingCommandsOnNextFlushRequired; } + void setStallingCommandsOnNextFlush(const bool isStallingCommandsOnNextFlushRequired) { + stallingCommandsOnNextFlushRequired = isStallingCommandsOnNextFlushRequired; + if (!isStallingCommandsOnNextFlushRequired) { + dcFlushRequiredOnStallingCommandsOnNextFlush = false; + } + } bool isStallingCommandsOnNextFlushRequired() const { return stallingCommandsOnNextFlushRequired; } void setDcFlushRequiredOnStallingCommandsOnNextFlush(const bool isDcFlushRequiredOnStallingCommandsOnNextFlush) { dcFlushRequiredOnStallingCommandsOnNextFlush = isDcFlushRequiredOnStallingCommandsOnNextFlush; } diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index f7c888b95e..5a0f6655ee 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -260,10 +260,21 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, bool flushDependenciesForNonKernelCommand = false; bool relaxedOrderingEnabled = false; - + bool programBarrierInTaskStream = false; if (multiDispatchInfo.empty() == false) { relaxedOrderingEnabled = relaxedOrderingForGpgpuAllowed(static_cast(csrDeps.timestampPacketContainer.size())); + programBarrierInTaskStream = !relaxedOrderingEnabled && !getGpgpuCommandStreamReceiver().isMultiTileOperationEnabled() && isStallingCommandsOnNextFlushRequired() && 
!isBlitAuxTranslationRequired(multiDispatchInfo); + if (programBarrierInTaskStream) { + CsrDependencies csrDeps{}; + fillCsrDependenciesWithLastBcsPackets(csrDeps); + TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer(commandStream, csrDeps, false); + setupBarrierTimestampForBcsEngines(getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), timestampPacketDependencies); + getGpgpuCommandStreamReceiver().programStallingCommandsForBarrier(commandStream, &timestampPacketDependencies.barrierNodes, isDcFlushRequiredOnStallingCommandsOnNextFlush()); + + clearLastBcsPackets(); + setStallingCommandsOnNextFlush(false); + } processDispatchForKernels(multiDispatchInfo, printfHandler, eventBuilder.getEvent(), hwTimeStamps, blockQueue, csrDeps, blockedCommandsData.get(), timestampPacketDependencies, relaxedOrderingEnabled); @@ -272,11 +283,10 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } else if (computeCommandStreamReceiver.peekTimestampPacketWriteEnabled()) { if (CL_COMMAND_BARRIER == commandType && !isNonStallingIoqBarrier) { setStallingCommandsOnNextFlush(true); - if (NEO::DebugManager.flags.SkipDcFlushOnBarrierWithoutEvents.get() == 0 || event) { - setDcFlushRequiredOnStallingCommandsOnNextFlush(true); - } - this->splitBarrierRequired = true; + const bool isDcFlushRequiredOnBarrier = NEO::DebugManager.flags.SkipDcFlushOnBarrierWithoutEvents.get() == 0 || event; + setDcFlushRequiredOnStallingCommandsOnNextFlush(isDcFlushRequiredOnBarrier); } + this->splitBarrierRequired = true; for (size_t i = 0; i < eventsRequest.numEventsInWaitList; i++) { auto waitlistEvent = castToObjectOrAbort(eventsRequest.eventWaitList[i]); @@ -339,7 +349,7 @@ cl_int CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; const EnqueueProperties enqueueProperties(false, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType), - 
flushDependenciesForNonKernelCommand, isMarkerWithPostSyncWrite, &blitPropertiesContainer); + flushDependenciesForNonKernelCommand, isMarkerWithPostSyncWrite, programBarrierInTaskStream, &blitPropertiesContainer); if (!blockQueue && isOOQEnabled()) { setupBarrierTimestampForBcsEngines(computeCommandStreamReceiver.getOsContext().getEngineType(), timestampPacketDependencies); @@ -862,7 +872,7 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( } auto memoryCompressionState = getGpgpuCommandStreamReceiver().getMemoryCompressionState(auxTranslationRequired); - bool hasStallingCmds = !relaxedOrderingEnabled && (eventsRequest.numEventsInWaitList > 0 || timestampPacketDependencies.previousEnqueueNodes.peekNodes().size() > 0); + bool hasStallingCmds = enqueueProperties.hasStallingCmds || (!relaxedOrderingEnabled && (eventsRequest.numEventsInWaitList > 0 || timestampPacketDependencies.previousEnqueueNodes.peekNodes().size() > 0)); DispatchFlags dispatchFlags( {}, // csrDependencies @@ -960,7 +970,6 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( if (isHandlingBarrier) { clearLastBcsPackets(); setStallingCommandsOnNextFlush(false); - setDcFlushRequiredOnStallingCommandsOnNextFlush(false); } if (gtpinIsGTPinInitialized()) { @@ -1180,7 +1189,6 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( if (isHandlingBarrier) { clearLastBcsPackets(); setStallingCommandsOnNextFlush(false); - setDcFlushRequiredOnStallingCommandsOnNextFlush(false); } } @@ -1418,7 +1426,7 @@ cl_int CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDisp CompletionStamp completionStamp = {CompletionStamp::notReady, taskLevel, 0}; - const EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); + const EnqueueProperties enqueueProperties(true, false, false, false, false, false, &blitPropertiesContainer); LinearStream *gpgpuCommandStream = {}; size_t gpgpuCommandStreamStart = {}; diff --git a/opencl/source/helpers/enqueue_properties.h 
b/opencl/source/helpers/enqueue_properties.h index 980b702d8c..003ee2d02a 100644 --- a/opencl/source/helpers/enqueue_properties.h +++ b/opencl/source/helpers/enqueue_properties.h @@ -22,8 +22,8 @@ struct EnqueueProperties { }; EnqueueProperties() = delete; - EnqueueProperties(bool blitEnqueue, bool hasKernels, bool isCacheFlushCmd, bool flushDependenciesOnly, bool isMarkerWithEvent, - const BlitPropertiesContainer *blitPropertiesContainer) { + EnqueueProperties(bool blitEnqueue, bool hasKernels, bool isCacheFlushCmd, bool flushDependenciesOnly, bool isMarkerWithEvent, bool hasStallingCmds, + const BlitPropertiesContainer *blitPropertiesContainer) : hasStallingCmds(hasStallingCmds) { if (blitEnqueue) { operation = Operation::Blit; this->blitPropertiesContainer = blitPropertiesContainer; @@ -61,5 +61,6 @@ struct EnqueueProperties { const BlitPropertiesContainer *blitPropertiesContainer = nullptr; Operation operation = Operation::EnqueueWithoutSubmission; + const bool hasStallingCmds; }; } // namespace NEO diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index 9899243ae0..4eb1dad766 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -269,7 +269,6 @@ CompletionStamp &CommandComputeKernel::submit(TaskCountType taskLevel, bool term if (isHandlingBarrier) { commandQueue.clearLastBcsPackets(); commandQueue.setStallingCommandsOnNextFlush(false); - commandQueue.setDcFlushRequiredOnStallingCommandsOnNextFlush(false); } if (kernelOperation->blitPropertiesContainer.size() > 0) { @@ -429,7 +428,6 @@ CompletionStamp &CommandWithoutKernel::submit(TaskCountType taskLevel, bool term if (isHandlingBarrier) { commandQueue.clearLastBcsPackets(); commandQueue.setStallingCommandsOnNextFlush(false); - commandQueue.setDcFlushRequiredOnStallingCommandsOnNextFlush(false); } if (kernelOperation->blitEnqueue) { diff --git 
a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp index 716db8cf29..1df3ba1ec0 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_hw_2_tests.cpp @@ -1171,6 +1171,7 @@ HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitBeforeBarrierWhenEnqueueingCommandT using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; + using COMPUTE_WALKER = typename FamilyType::WALKER_TYPE; if (pCmdQ->getTimestampPacketContainer() == nullptr) { GTEST_SKIP(); @@ -1192,31 +1193,51 @@ HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitBeforeBarrierWhenEnqueueingCommandT EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); uint64_t lastBlitNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*pCmdQ->getTimestampPacketContainer()->peekNodes()[0]); EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr)); - auto ccsStart = pCmdQ->getGpgpuCommandStreamReceiver().getCS().getUsed(); + auto bcsStart = pCmdQ->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getCS(0).getUsed(); EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr)); EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr)); - EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); - EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); uint64_t barrierNodeAddress = 0u; { - HardwareParse ccsHwParser; - ccsHwParser.parseCommands(pCmdQ->getGpgpuCommandStreamReceiver().getCS(0), ccsStart); + HardwareParse queueHwParser; + 
queueHwParser.parseCommands(*pDevice->getUltCommandStreamReceiver().lastFlushedCommandStream, 0); - const auto semaphoreItor = find(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end()); + const auto semaphoreItor = find(queueHwParser.cmdList.begin(), queueHwParser.cmdList.end()); const auto semaphore = genCmdCast(*semaphoreItor); EXPECT_EQ(lastBlitNodeAddress, semaphore->getSemaphoreGraphicsAddress()); - const auto pipeControlItor = find(semaphoreItor, ccsHwParser.cmdList.end()); + const auto pipeControlItor = find(semaphoreItor, queueHwParser.cmdList.end()); const auto pipeControl = genCmdCast(*pipeControlItor); EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); barrierNodeAddress = pipeControl->getAddress() | (static_cast(pipeControl->getAddressHigh()) << 32); // There shouldn't be any more semaphores before the barrier EXPECT_EQ(pipeControlItor, find(std::next(semaphoreItor), pipeControlItor)); + + // Make sure the gpgpu semaphore is programmed before the second compute walker + auto itor = queueHwParser.cmdList.begin(); + auto semaphoreIndex = 0u; + auto lastComputeWalkerIndex = 0u; + auto index = 0u; + while (itor != queueHwParser.cmdList.end()) { + const auto semaphore = genCmdCast(*itor); + if (semaphore) { + semaphoreIndex = index; + } + const auto computeWalker = genCmdCast(*itor); + if (computeWalker) { + lastComputeWalkerIndex = index; + } + ++itor; + ++index; + } + EXPECT_LT(semaphoreIndex, lastComputeWalkerIndex); } + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); + { HardwareParse bcsHwParser; bcsHwParser.parseCommands(pCmdQ->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getCS(0), bcsStart); diff --git a/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp 
b/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp index c3db186d52..215d8f00df 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests_pvc_and_later.cpp @@ -819,16 +819,18 @@ HWTEST2_F(OoqCommandQueueHwBlitTest, givenBarrierBeforeFirstKernelWhenEnqueueNDR EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); - EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr)); auto ccsStart = pCmdQ->getGpgpuCommandStreamReceiver().getCS().getUsed(); - + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr)); EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr)); + HardwareParse queueHwParser; + queueHwParser.parseCommands(*pDevice->getUltCommandStreamReceiver().lastFlushedCommandStream, 0u); + const auto memFenceItor = find(queueHwParser.cmdList.begin(), queueHwParser.cmdList.end()); + EXPECT_NE(queueHwParser.cmdList.end(), memFenceItor); + HardwareParse ccsHwParser; ccsHwParser.parseCommands(pCmdQ->getGpgpuCommandStreamReceiver().getCS(0), ccsStart); const auto memFenceStateItor = find(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end()); - const auto memFenceItor = find(memFenceStateItor, ccsHwParser.cmdList.end()); - EXPECT_NE(ccsHwParser.cmdList.end(), memFenceItor); EXPECT_NE(ccsHwParser.cmdList.end(), memFenceStateItor); } diff --git a/opencl/test/unit_test/command_queue/enqueue_barrier_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_barrier_tests.cpp index 9197fc4345..259e384399 100644 --- a/opencl/test/unit_test/command_queue/enqueue_barrier_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_barrier_tests.cpp @@ -202,7 +202,7 @@ HWTEST_F(BarrierTest, 
WhenEnqueingBarrierWithWaitListThenDependenciesShouldSync) auto pEvent = castToObject(event); auto &csr = pCmdQ->getGpgpuCommandStreamReceiver(); - // in this case only cmdQ raises the taskLevel why csr stay intact + // in this case only cmdQ raises the taskLevel while csr stays intact EXPECT_EQ(8u, pCmdQ->taskLevel); if (csr.peekTimestampPacketWriteEnabled()) { EXPECT_EQ(pCmdQ->taskLevel + 1, commandStreamReceiver.peekTaskLevel()); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp index ad94eaaad3..cf2a325a24 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests.cpp @@ -56,7 +56,7 @@ HWTEST_F(EnqueueHandlerTest, GivenCommandStreamWithoutKernelWhenCommandEnqueuedT TimestampPacketDependencies timestampPacketDependencies; CsrDependencies csrDeps; - EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, false, nullptr); mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, csrDeps, nullptr, false); @@ -96,7 +96,7 @@ HWTEST_F(EnqueueHandlerTimestampEnabledTest, givenProflingAndTimeStampPacketsEna TimestampPacketDependencies timestampPacketDependencies; CsrDependencies csrDeps; - EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, false, nullptr); EXPECT_EQ(ev->submitTimeStamp.cpuTimeinNS, 0u); EXPECT_EQ(ev->submitTimeStamp.gpuTimeStamp, 0u); @@ -134,7 +134,7 @@ HWTEST_F(EnqueueHandlerTimestampDisabledTest, givenProflingEnabledTimeStampPacke TimestampPacketDependencies timestampPacketDependencies; CsrDependencies csrDeps; - 
EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, false, nullptr); EXPECT_EQ(ev->submitTimeStamp.cpuTimeinNS, 0u); EXPECT_EQ(ev->submitTimeStamp.gpuTimeStamp, 0u); @@ -167,7 +167,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - const EnqueueProperties enqueuePropertiesForDependencyFlush(false, false, false, true, false, nullptr); + const EnqueueProperties enqueuePropertiesForDependencyFlush(false, false, false, true, false, false, nullptr); auto blockedCommandsData = std::unique_ptr(blockedCommandsDataForDependencyFlush); Surface *surfaces[] = {nullptr}; @@ -200,7 +200,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl blitProperties.dstAllocation = reinterpret_cast(0x56789); BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); - const EnqueueProperties enqueuePropertiesForBlitEnqueue(true, false, false, false, false, &blitPropertiesContainer); + const EnqueueProperties enqueuePropertiesForBlitEnqueue(true, false, false, false, false, false, &blitPropertiesContainer); auto blockedCommandsData = std::unique_ptr(blockedCommandsDataForBlitEnqueue); Surface *surfaces[] = {nullptr}; @@ -225,7 +225,7 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectDispa EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, false, nullptr); mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, csrDeps, nullptr, false); @@ -251,7 +251,7 @@ HWTEST_F(DispatchFlagsTests, 
whenEnqueueCommandWithoutKernelThenPassCorrectThrot EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, false, nullptr); bool blocking = true; mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, @@ -295,7 +295,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitEnqueueWhenDispatchingCommandsWithoutK BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); - EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); + EnqueueProperties enqueueProperties(true, false, false, false, false, false, &blitPropertiesContainer); mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false); @@ -334,7 +334,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKern BlitPropertiesContainer blitPropertiesContainer; blitPropertiesContainer.push_back(blitProperties); - EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); + EnqueueProperties enqueueProperties(true, false, false, false, false, false, &blitPropertiesContainer); mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false); @@ -359,7 +359,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenBlitOperationWhenEnqueueCommandWithoutKern BlitPropertiesContainer blitPropertiesContainer; - EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); + EnqueueProperties enqueueProperties(true, false, false, false, false, false, 
&blitPropertiesContainer); mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocking, enqueueProperties, timestampPacketDependencies, eventsRequest, eventBuilder, 0, csrDeps, &bcsCsr, false); EXPECT_TRUE(mockCsr->passedDispatchFlags.isStallingCommandsOnNextFlushRequired); @@ -399,7 +399,7 @@ HWTEST_F(DispatchFlagsBlitTests, givenN1EnabledWhenDispatchingWithoutKernelThenA blitPropertiesContainer.push_back(blitProperties); CsrDependencies csrDeps; - EnqueueProperties enqueueProperties(true, false, false, false, false, &blitPropertiesContainer); + EnqueueProperties enqueueProperties(true, false, false, false, false, false, &blitPropertiesContainer); mockCsr->nTo1SubmissionModelEnabled = false; mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, &mockCmdQ->getCS(0), 0, blocked, enqueueProperties, timestampPacketDependencies, @@ -423,7 +423,7 @@ HWTEST_F(DispatchFlagsTests, givenMockKernelWhenSettingAdditionalKernelExecInfoT EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, false, nullptr); auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 4096, AllocationType::COMMAND_BUFFER, device->getDeviceBitfield()})); auto blockedCommandsData = std::make_unique(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); diff --git a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp index f3d47baf7a..fe7ab0ee89 100644 --- a/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp +++ b/opencl/test/unit_test/command_queue/enqueue_command_without_kernel_tests_dg2_and_later.cpp @@ -37,7 +37,7 @@ 
HWTEST2_F(DispatchFlagsTests, whenSubmittingKernelWithAdditionalKernelExecInfoTh EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - EnqueueProperties enqueueProperties(false, false, false, true, false, nullptr); + EnqueueProperties enqueueProperties(false, false, false, true, false, false, nullptr); auto cmdStream = new LinearStream(device->getMemoryManager()->allocateGraphicsMemoryWithProperties({device->getRootDeviceIndex(), 4096, AllocationType::COMMAND_BUFFER, device->getDeviceBitfield()})); auto blockedCommandsData = std::make_unique(cmdStream, *mockCmdQ->getGpgpuCommandStreamReceiver().getInternalAllocationStorage()); diff --git a/opencl/test/unit_test/command_queue/ooq_task_tests.cpp b/opencl/test/unit_test/command_queue/ooq_task_tests.cpp index 172b5192ec..7fa62b69a2 100644 --- a/opencl/test/unit_test/command_queue/ooq_task_tests.cpp +++ b/opencl/test/unit_test/command_queue/ooq_task_tests.cpp @@ -368,7 +368,7 @@ HWTEST_F(OOQTaskTests, givenSkipDcFlushOnBarrierWithEventsEnabledWhenEnqueingBar outEvent->release(); } -HWTEST_F(OOQTaskTests, givenSkipDcFlushOnBarrierWithoutEventsDisableddWhenEnqueingBarrierWithWaitListThenDcFlushSet) { +HWTEST_F(OOQTaskTests, givenSkipDcFlushOnBarrierWithoutEventsDisabledWhenEnqueingBarrierWithWaitListThenDcFlushSet) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); if (false == commandStreamReceiver.peekTimestampPacketWriteEnabled()) { GTEST_SKIP(); @@ -388,7 +388,7 @@ HWTEST_F(OOQTaskTests, givenSkipDcFlushOnBarrierWithoutEventsDisableddWhenEnquei EXPECT_TRUE(pCmdQ->isDcFlushRequiredOnStallingCommandsOnNextFlush()); } -HWTEST_F(OOQTaskTests, givenEnqueueMarkerWithWaitListWhenIsMarkerWithPostSyncWriteThenBcsTimestapLastBarrierToWaitForIsNotEmpty) { +HWTEST_F(OOQTaskTests, givenEnqueueMarkerWithWaitListWhenIsMarkerWithPostSyncWriteThenBcsTimestampLastBarrierToWaitForIsNotEmpty) { auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); if (false == 
commandStreamReceiver.peekTimestampPacketWriteEnabled()) { GTEST_SKIP(); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp index 029f0320d7..4af5b66548 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_1_tests.cpp @@ -1749,7 +1749,7 @@ HWCMDTEST_F(IGFX_GEN8_CORE, UltCommandStreamReceiverTest, givenBarrierNodeSetWhe size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); EXPECT_EQ(expectedCmdSize, estimatedCmdSize); - commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); + commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags.barrierTimestampPacketNodes, false); EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed()); parseCommands(commandStreamCSR, 0); diff --git a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp index b62046e7e8..851f54d265 100644 --- a/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp +++ b/opencl/test/unit_test/command_stream/command_stream_receiver_hw_tests_xehp_and_later.cpp @@ -945,7 +945,7 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWh size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); EXPECT_EQ(expectedCmdSize, estimatedCmdSize); - commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); + commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags.barrierTimestampPacketNodes, false); EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed()); parseCommands(commandStreamCSR, 0); @@ -992,7 +992,7 @@ 
HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionDisabledW size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); EXPECT_EQ(expectedCmdSize, estimatedCmdSize); - commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); + commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags.barrierTimestampPacketNodes, false); EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed()); parseCommands(commandStreamCSR, 0); @@ -1045,7 +1045,7 @@ HWTEST2_F(CommandStreamReceiverHwTestXeHPAndLater, givenStaticPartitionEnabledWh size_t estimatedCmdSize = commandStreamReceiver->getCmdSizeForStallingCommands(dispatchFlags); EXPECT_EQ(expectedSize, estimatedCmdSize); - commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); + commandStreamReceiver->programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags.barrierTimestampPacketNodes, false); EXPECT_EQ(estimatedCmdSize, commandStreamCSR.getUsed()); EXPECT_EQ(2u, tagNode->getPacketsUsed()); diff --git a/opencl/test/unit_test/kernel/kernel_tests.cpp b/opencl/test/unit_test/kernel/kernel_tests.cpp index 2406e33259..c511fc2fa6 100644 --- a/opencl/test/unit_test/kernel/kernel_tests.cpp +++ b/opencl/test/unit_test/kernel/kernel_tests.cpp @@ -590,6 +590,8 @@ class CommandStreamReceiverMock : public CommandStreamReceiver { size_t getCmdsSizeForComputeBarrierCommand() const override { return 0; } + void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override { + } GraphicsAllocation *getClearColorAllocation() override { return nullptr; } bool createPreemptionAllocation() override { diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 34a14ed241..6c2b137c04 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp 
+++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -811,7 +811,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnq } } -HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWithEmptyWaitlistWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) { +HWTEST_TEMPLATED_F(BcsBufferTests, givenStallingCommandsOnNextFlushWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) { DebugManager.flags.OptimizeIoqBarriersHandling.set(0); using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; @@ -827,7 +827,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWithEmptyWaitlistWhenReleasingMul cl_event waitlist0[] = {&userEvent0}; cl_event waitlist1[] = {&userEvent1}; - cmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr); + cmdQ->setStallingCommandsOnNextFlush(true); cmdQ->enqueueWriteBuffer(buffer.get(), false, 0, 1, hostPtr, nullptr, 1, waitlist0, nullptr); cmdQ->enqueueWriteBuffer(buffer.get(), false, 0, 1, hostPtr, nullptr, 1, waitlist1, nullptr); diff --git a/opencl/test/unit_test/xe_hpg_core/dg2/command_queue_tests_dg2.cpp b/opencl/test/unit_test/xe_hpg_core/dg2/command_queue_tests_dg2.cpp index 9e3b6f8ffd..a37fcd90aa 100644 --- a/opencl/test/unit_test/xe_hpg_core/dg2/command_queue_tests_dg2.cpp +++ b/opencl/test/unit_test/xe_hpg_core/dg2/command_queue_tests_dg2.cpp @@ -69,7 +69,7 @@ DG2TEST_F(CommandQueueHwTest, GivenKernelWithDpasAndOddWorkGroupWhenenqueueNonBl auto pKernel = mockKernelWithInternals.mockKernel; MockMultiDispatchInfo multiDispatchInfo(mockDevice.get(), pKernel); BlitPropertiesContainer blitPropertiesContainer; - const EnqueueProperties enqueueProperties(false, true, false, false, false, &blitPropertiesContainer); + const EnqueueProperties enqueueProperties(false, true, false, false, false, false, &blitPropertiesContainer); TimestampPacketDependencies timestampPacketDependencies; EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; @@ -100,7 +100,7 @@ DG2TEST_F(CommandQueueHwTest, 
GivenKernelWithDpasAndNotOddWorkGroupWhenenqueueNo auto pKernel = mockKernelWithInternals.mockKernel; MockMultiDispatchInfo multiDispatchInfo(mockDevice.get(), pKernel); BlitPropertiesContainer blitPropertiesContainer; - const EnqueueProperties enqueueProperties(false, true, false, false, false, &blitPropertiesContainer); + const EnqueueProperties enqueueProperties(false, true, false, false, false, false, &blitPropertiesContainer); TimestampPacketDependencies timestampPacketDependencies; EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; @@ -136,7 +136,7 @@ DG2TEST_F(CommandQueueHwTest, GivenKernelWithRequiredDisableEuFusionWhenenqueueN auto pKernel = mockKernelWithInternals.mockKernel; MockMultiDispatchInfo multiDispatchInfo(mockDevice.get(), pKernel); BlitPropertiesContainer blitPropertiesContainer; - const EnqueueProperties enqueueProperties(false, true, false, false, false, &blitPropertiesContainer); + const EnqueueProperties enqueueProperties(false, true, false, false, false, false, &blitPropertiesContainer); TimestampPacketDependencies timestampPacketDependencies; EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; @@ -163,7 +163,7 @@ DG2TEST_F(CommandQueueHwTest, GivenKernelWithoutRequiredDisableEuFusionWhenenque auto pKernel = mockKernelWithInternals.mockKernel; MockMultiDispatchInfo multiDispatchInfo(mockDevice.get(), pKernel); BlitPropertiesContainer blitPropertiesContainer; - const EnqueueProperties enqueueProperties(false, true, false, false, false, &blitPropertiesContainer); + const EnqueueProperties enqueueProperties(false, true, false, false, false, false, &blitPropertiesContainer); TimestampPacketDependencies timestampPacketDependencies; EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; diff --git a/shared/source/command_stream/command_stream_receiver.h b/shared/source/command_stream/command_stream_receiver.h index 7de7dc05a7..53af4d9c6e 100644 --- 
a/shared/source/command_stream/command_stream_receiver.h +++ b/shared/source/command_stream/command_stream_receiver.h @@ -356,6 +356,8 @@ class CommandStreamReceiver { virtual void programComputeBarrierCommand(LinearStream &cmdStream) = 0; virtual size_t getCmdsSizeForComputeBarrierCommand() const = 0; + virtual void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) = 0; + const HardwareInfo &peekHwInfo() const; const RootDeviceEnvironment &peekRootDeviceEnvironment() const; diff --git a/shared/source/command_stream/command_stream_receiver_hw.h b/shared/source/command_stream/command_stream_receiver_hw.h index 9ec2dc9f71..44fb9cf66c 100644 --- a/shared/source/command_stream/command_stream_receiver_hw.h +++ b/shared/source/command_stream/command_stream_receiver_hw.h @@ -162,6 +162,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { size_t getCmdsSizeForComputeBarrierCommand() const override { return getCmdSizeForStallingNoPostSyncCommands(); } + void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override; SubmissionStatus initializeDeviceWithFirstSubmission() override; HeapDirtyState &getDshState() { @@ -187,7 +188,6 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { void programPerDssBackedBuffer(LinearStream &scr, Device &device, DispatchFlags &dispatchFlags); void programStateSip(LinearStream &cmdStream, Device &device); void programVFEState(LinearStream &csr, DispatchFlags &dispatchFlags, uint32_t maxFrontEndThreads); - void programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags); void programStallingNoPostSyncCommandsForBarrier(LinearStream &cmdStream); void programStallingPostSyncCommandsForBarrier(LinearStream &cmdStream, TagNodeBase &tagNode, bool dcFlushRequired); void programEngineModeCommands(LinearStream 
&csr, const DispatchFlags &dispatchFlags); diff --git a/shared/source/command_stream/command_stream_receiver_hw_base.inl b/shared/source/command_stream/command_stream_receiver_hw_base.inl index 133b375556..1c5ae24afb 100644 --- a/shared/source/command_stream/command_stream_receiver_hw_base.inl +++ b/shared/source/command_stream/command_stream_receiver_hw_base.inl @@ -498,11 +498,7 @@ CompletionStamp CommandStreamReceiverHw::flushTask( programPreemption(commandStreamCSR, dispatchFlags); if (dispatchFlags.isStallingCommandsOnNextFlushRequired) { - if (DebugManager.flags.ProgramBarrierInCommandStreamTask.get() == 1) { - programStallingCommandsForBarrier(commandStreamTask, dispatchFlags); - } else { - programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags); - } + programStallingCommandsForBarrier(commandStreamCSR, dispatchFlags.barrierTimestampPacketNodes, dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush); } programStateBaseAddress(dsh, ioh, ssh, dispatchFlags, device, commandStreamCSR, stateBaseAddressDirty); @@ -727,12 +723,9 @@ void CommandStreamReceiverHw::programComputeMode(LinearStream &stream } template -inline void CommandStreamReceiverHw::programStallingCommandsForBarrier(LinearStream &cmdStream, DispatchFlags &dispatchFlags) { - - auto barrierTimestampPacketNodes = dispatchFlags.barrierTimestampPacketNodes; - +inline void CommandStreamReceiverHw::programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) { if (barrierTimestampPacketNodes && barrierTimestampPacketNodes->peekNodes().size() != 0) { - programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0], dispatchFlags.isDcFlushRequiredOnStallingCommandsOnNextFlush); + programStallingPostSyncCommandsForBarrier(cmdStream, *barrierTimestampPacketNodes->peekNodes()[0], isDcFlushRequired); barrierTimestampPacketNodes->makeResident(*this); } else { 
programStallingNoPostSyncCommandsForBarrier(cmdStream); diff --git a/shared/source/debug_settings/debug_variables_base.inl b/shared/source/debug_settings/debug_variables_base.inl index 2509847ed4..e4d6541065 100644 --- a/shared/source/debug_settings/debug_variables_base.inl +++ b/shared/source/debug_settings/debug_variables_base.inl @@ -467,7 +467,6 @@ DECLARE_DEBUG_VARIABLE(int32_t, CompactL3FlushEventPacket, -1, "Compact COMPUTE_ DECLARE_DEBUG_VARIABLE(int32_t, UseDynamicEventPacketsCount, -1, "Use dynamic estimation for event packet count based on a given device configuration, -1: default , 0: disabled, 1: enabled") DECLARE_DEBUG_VARIABLE(int32_t, SignalAllEventPackets, -1, "All packets of event are signaled, reset and waited/synchronized, -1: default, 0: disabled, 1: enabled") DECLARE_DEBUG_VARIABLE(int32_t, EnableBcsSwControlWa, -1, "Enable BCS WA via BCSSWCONTROL MMIO. -1: default, 0: disabled, 1: if src in system mem, 2: if dst in system mem, 3: if src and dst in system mem, 4: always") -DECLARE_DEBUG_VARIABLE(int32_t, ProgramBarrierInCommandStreamTask, -1, "Program barrier pipecontrol in task command stream. 
-1: default(disabled), 0: disabled (program in CSR command stream, 1: enabled") DECLARE_DEBUG_VARIABLE(bool, EnableAIL, true, "Enables AIL") /* IMPLICIT SCALING */ diff --git a/shared/test/common/mocks/mock_command_stream_receiver.h b/shared/test/common/mocks/mock_command_stream_receiver.h index f367bbbe30..5029bf6fdd 100644 --- a/shared/test/common/mocks/mock_command_stream_receiver.h +++ b/shared/test/common/mocks/mock_command_stream_receiver.h @@ -164,6 +164,9 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { size_t getCmdsSizeForComputeBarrierCommand() const override { return 0; } + void programStallingCommandsForBarrier(LinearStream &cmdStream, TimestampPacketContainer *barrierTimestampPacketNodes, const bool isDcFlushRequired) override { + programStallingCommandsForBarrierCalled = true; + } bool createPreemptionAllocation() override { if (createPreemptionAllocationParentCall) { @@ -212,6 +215,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { bool createPreemptionAllocationReturn = true; bool createPreemptionAllocationParentCall = false; bool programComputeBarrierCommandCalled = false; + bool programStallingCommandsForBarrierCalled = false; std::optional isGpuHangDetectedReturnValue{}; std::optional testTaskCountReadyReturnValue{}; WaitStatus waitForCompletionWithTimeoutReturnValue{WaitStatus::Ready}; diff --git a/shared/test/common/test_files/igdrcl.config b/shared/test/common/test_files/igdrcl.config index 6459938923..8c7940617c 100644 --- a/shared/test/common/test_files/igdrcl.config +++ b/shared/test/common/test_files/igdrcl.config @@ -546,7 +546,6 @@ SkipDcFlushOnBarrierWithoutEvents = -1 EnableAIL=1 WaitForUserFenceOnEventHostSynchronize = -1 ProgramUserInterruptOnResolvedDependency = -1 -ProgramBarrierInCommandStreamTask = -1 DisableSystemPointerKernelArgument = -1 DoNotValidateDriverPath = 0 # Please don't edit below this line diff --git 
a/shared/test/unit_test/command_stream/command_stream_receiver_simulated_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_simulated_tests.cpp index d1f6a21c71..ad0b55995e 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_simulated_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_simulated_tests.cpp @@ -9,11 +9,14 @@ #include "shared/source/command_stream/command_stream_receiver_simulated_hw.h" #include "shared/source/helpers/array_count.h" #include "shared/source/helpers/hardware_context_controller.h" +#include "shared/source/helpers/timestamp_packet.h" #include "shared/source/memory_manager/memory_pool.h" #include "shared/source/os_interface/os_context.h" +#include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/engine_descriptor_helper.h" #include "shared/test/common/helpers/gfx_core_helper_tests.h" +#include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/mocks/mock_aub_manager.h" #include "shared/test/common/mocks/mock_gmm.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" @@ -595,3 +598,98 @@ HWTEST_F(CommandStreamSimulatedTests, givenSpecificMemoryPoolAllocationWhenWrite } } } + +HWTEST_F(CommandStreamSimulatedTests, givenBarrierNodesWhenProgramStallingCommandsForBarrierCalledThenPostSyncWritePipeControlIsProgrammed) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + auto csr = std::make_unique>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor()); + csr->setupContext(osContext); + + TagAllocatorBase *allocator = pDevice->getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); + auto barrierNode = allocator->getTag(); + const auto barrierNodeAddress = 
TimestampPacketHelper::getContextEndGpuAddress(*barrierNode); + TimestampPacketContainer barrierNodes{}; + barrierNodes.add(barrierNode); + + { + MockGraphicsAllocation streamAllocation{}; + uint32_t streamBuffer[100] = {}; + LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer)); + + csr->programStallingCommandsForBarrier(linearStream, &barrierNodes, false); + + HardwareParse hwParser; + hwParser.parseCommands(linearStream); + auto pipeControlItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + auto pipeControl = genCmdCast(*pipeControlItor); + if (UnitTestHelper::isPipeControlWArequired(hardwareInfo)) { + auto nextPipeControlItor = find(++pipeControlItor, hwParser.cmdList.end()); + pipeControl = genCmdCast(*nextPipeControlItor); + } + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + EXPECT_FALSE(pipeControl->getDcFlushEnable()); + EXPECT_EQ(barrierNodeAddress, UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); + } + { + MockGraphicsAllocation streamAllocation{}; + uint32_t streamBuffer[100] = {}; + LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer)); + + csr->programStallingCommandsForBarrier(linearStream, &barrierNodes, true); + + HardwareParse hwParser; + hwParser.parseCommands(linearStream); + auto pipeControlItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + auto pipeControl = genCmdCast(*pipeControlItor); + if (UnitTestHelper::isPipeControlWArequired(hardwareInfo)) { + auto nextPipeControlItor = find(++pipeControlItor, hwParser.cmdList.end()); + pipeControl = genCmdCast(*nextPipeControlItor); + } + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + EXPECT_EQ(csr->getDcFlushSupport(), pipeControl->getDcFlushEnable()); + EXPECT_EQ(barrierNodeAddress, 
UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); + } +} + +HWTEST_F(CommandStreamSimulatedTests, givenEmptyBarrierNodesWhenProgramStallingCommandsForBarrierCalledThenNoWritePipeControlIsProgrammed) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + auto csr = std::make_unique>(*pDevice->executionEnvironment, pDevice->getRootDeviceIndex(), pDevice->getDeviceBitfield()); + MockOsContext osContext(0, EngineDescriptorHelper::getDefaultDescriptor()); + csr->setupContext(osContext); + + { + TimestampPacketContainer barrierNodes{}; + + MockGraphicsAllocation streamAllocation{}; + uint32_t streamBuffer[100] = {}; + LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer)); + + csr->programStallingCommandsForBarrier(linearStream, &barrierNodes, false); + + HardwareParse hwParser; + hwParser.parseCommands(linearStream); + const auto pipeControlItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + const auto pipeControl = genCmdCast(*pipeControlItor); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE, pipeControl->getPostSyncOperation()); + EXPECT_EQ(0u, UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); + } + + { + MockGraphicsAllocation streamAllocation{}; + uint32_t streamBuffer[100] = {}; + LinearStream linearStream(&streamAllocation, streamBuffer, sizeof(streamBuffer)); + + csr->programStallingCommandsForBarrier(linearStream, nullptr, false); + + HardwareParse hwParser; + hwParser.parseCommands(linearStream); + const auto pipeControlItor = find(hwParser.cmdList.begin(), hwParser.cmdList.end()); + const auto pipeControl = genCmdCast(*pipeControlItor); + ASSERT_NE(nullptr, pipeControl); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_NO_WRITE, pipeControl->getPostSyncOperation()); + EXPECT_EQ(0u, UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl)); + } +} \ No newline at end of file diff --git 
a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp index 06a66d1e7a..270ff24cd0 100644 --- a/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp +++ b/shared/test/unit_test/command_stream/command_stream_receiver_tests.cpp @@ -2665,6 +2665,33 @@ HWTEST_F(CommandStreamReceiverHwTest, givenDcFlushFlagSetWhenGettingCsrFlagValue EXPECT_EQ(helperValue, csrValue); } +HWTEST_F(CommandStreamReceiverHwTest, givenBarrierTimestampPacketNodesWhenGetCmdSizeForStallingCommandsCalledThenReturnCorrectSize) { + + auto &commandStreamReceiver = pDevice->getUltCommandStreamReceiver(); + + const auto expectedCmdSizeNoPostSync = commandStreamReceiver.getCmdSizeForStallingNoPostSyncCommands(); + { + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + dispatchFlags.barrierTimestampPacketNodes = nullptr; + EXPECT_EQ(expectedCmdSizeNoPostSync, commandStreamReceiver.getCmdSizeForStallingCommands(dispatchFlags)); + } + { + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + TimestampPacketContainer emptyContainer; + dispatchFlags.barrierTimestampPacketNodes = &emptyContainer; + EXPECT_EQ(expectedCmdSizeNoPostSync, commandStreamReceiver.getCmdSizeForStallingCommands(dispatchFlags)); + } + + const auto expectedCmdSizePostSync = commandStreamReceiver.getCmdSizeForStallingPostSyncCommands(); + { + DispatchFlags dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); + TimestampPacketContainer barrierNodes; + barrierNodes.add(commandStreamReceiver.getTimestampPacketAllocator()->getTag()); + dispatchFlags.barrierTimestampPacketNodes = &barrierNodes; + EXPECT_EQ(expectedCmdSizePostSync, commandStreamReceiver.getCmdSizeForStallingCommands(dispatchFlags)); + } +} + struct MockRequiredScratchSpaceController : public ScratchSpaceControllerBase { MockRequiredScratchSpaceController(uint32_t rootDeviceIndex, 
ExecutionEnvironment &environment, @@ -4426,39 +4453,6 @@ HWTEST_F(CommandStreamReceiverHwTest, givenDcFlushRequiredFalseWhenProgramStalli EXPECT_FALSE(pipeControl->getDcFlushEnable()); } -HWTEST_F(CommandStreamReceiverHwTest, givenFlagProgramBarrierInCommandStreamTaskWhenFlushTaskThenPipeControlProgrammedInTaskCommandStream) { - using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; - DebugManagerStateRestore restorer; - DebugManager.flags.ProgramBarrierInCommandStreamTask.set(1); - auto &ultCsr = pDevice->getUltCommandStreamReceiver(); - - GraphicsAllocation *allocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties({ultCsr.getRootDeviceIndex(), MemoryConstants::pageSize, AllocationType::COMMAND_BUFFER, pDevice->getDeviceBitfield()}); - LinearStream commandStream{allocation}; - ASSERT_NE(nullptr, commandStream.getGraphicsAllocation()); - auto dispatchFlags = DispatchFlagsHelper::createDefaultDispatchFlags(); - dispatchFlags.isStallingCommandsOnNextFlushRequired = true; - ultCsr.flushTask(commandStream, - MemoryConstants::pageSize, - &dsh, - &ioh, - &ssh, - 0, - dispatchFlags, - *pDevice); - - GenCmdList cmdList; - ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( - cmdList, - commandStream.getCpuBase(), - commandStream.getUsed())); - auto pipeControlIteratorVector = findAll(cmdList.begin(), cmdList.end()); - ASSERT_EQ(pipeControlIteratorVector.size(), 1u); - auto pipeControlIterator = pipeControlIteratorVector[0]; - auto pipeControl = genCmdCast(*pipeControlIterator); - ASSERT_NE(nullptr, pipeControl); - pDevice->getMemoryManager()->freeGraphicsMemory(allocation); -} - HWTEST2_F(CommandStreamReceiverHwTest, givenImmediateFlushTaskWhenNextDispatchRequiresScratchSpaceAndSshPointerIsNullThenFrontEndCommandIsNotDispatched, IsAtLeastXeHpCore) {