From e3bb526067bcc21747c635085abcfd97b7a82659 Mon Sep 17 00:00:00 2001 From: Maciej Dziuban Date: Tue, 23 Nov 2021 16:23:18 +0000 Subject: [PATCH] Optimize timestamp packet dependencies - Clear dependencies even if last engine changed - Do no program semaphore waiting for blit when blit is submitted with gpgpu - Track barrier timestamps to correctly synchronize blits in OOQ Related-To: NEO-6444 Signed-off-by: Maciej Dziuban --- opencl/source/command_queue/command_queue.cpp | 62 +++++- opencl/source/command_queue/command_queue.h | 11 + opencl/source/command_queue/enqueue_common.h | 31 ++- opencl/source/helpers/task_information.cpp | 27 +++ .../command_queue/blit_enqueue_tests.cpp | 19 +- .../command_queue/command_queue_hw_tests.cpp | 202 +++++++++++++++++- .../command_queue/command_queue_tests.cpp | 198 +++++++++++++++++ .../unit_test/mem_obj/buffer_bcs_tests.cpp | 14 +- .../test/unit_test/mocks/mock_command_queue.h | 1 + .../helpers/engine_node_helper_extra.cpp | 5 + shared/source/utilities/tag_allocator.h | 2 - 11 files changed, 534 insertions(+), 38 deletions(-) diff --git a/opencl/source/command_queue/command_queue.cpp b/opencl/source/command_queue/command_queue.cpp index b8e459244e..ecfce63b97 100644 --- a/opencl/source/command_queue/command_queue.cpp +++ b/opencl/source/command_queue/command_queue.cpp @@ -681,7 +681,6 @@ void CommandQueue::updateBcsTaskCount(aub_stream::EngineType bcsEngineType, uint uint32_t CommandQueue::peekBcsTaskCount(aub_stream::EngineType bcsEngineType) const { const CopyEngineState &state = bcsStates[EngineHelpers::getBcsIndex(bcsEngineType)]; - DEBUG_BREAK_IF(!state.isValid()); return state.taskCount; } @@ -706,10 +705,6 @@ void CommandQueue::obtainNewTimestampPacketNodes(size_t numberOfNodes, Timestamp previousNodes.swapNodes(*timestampPacketContainer); - if ((previousNodes.peekNodes().size() > 0) && (previousNodes.peekNodes()[0]->getAllocator() != allocator)) { - clearAllDependencies = false; - } - if (clearAllDependencies) { 
previousNodes.moveNodesToNewContainer(*deferredTimestampPackets); } @@ -1007,4 +1002,61 @@ void CommandQueue::waitForAllEngines(bool blockedQueue, PrintfHandler *printfHan } } +void CommandQueue::setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies &timestampPacketDependencies) { + if (!getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired()) { + return; + } + + // Ensure we have exactly 1 barrier node. + if (timestampPacketDependencies.barrierNodes.peekNodes().empty()) { + timestampPacketDependencies.barrierNodes.add(getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + } + + if (isOOQEnabled()) { + // Barrier node will be signalled on gpgpuCsr. Save it for later use on blitters. + for (auto currentBcsIndex = 0u; currentBcsIndex < bcsTimestampPacketContainers.size(); currentBcsIndex++) { + const auto currentBcsEngineType = EngineHelpers::mapBcsIndexToEngineType(currentBcsIndex, true); + if (currentBcsEngineType == engineType) { + // Node is already added to barrierNodes for this engine, no need to save it. + continue; + } + + // Save latest timestamp (override previous, if any). 
+ TimestampPacketContainer newContainer{}; + newContainer.assignAndIncrementNodesRefCounts(timestampPacketDependencies.barrierNodes); + bcsTimestampPacketContainers[currentBcsIndex].lastBarrierToWaitFor.swapNodes(newContainer); + } + } +} + +void CommandQueue::processBarrierTimestampForBcsEngine(aub_stream::EngineType bcsEngineType, TimestampPacketDependencies &blitDependencies) { + BcsTimestampPacketContainers &bcsContainers = bcsTimestampPacketContainers[EngineHelpers::getBcsIndex(bcsEngineType)]; + bcsContainers.lastBarrierToWaitFor.moveNodesToNewContainer(blitDependencies.barrierNodes); +} + +void CommandQueue::setLastBcsPacket(aub_stream::EngineType bcsEngineType) { + if (isOOQEnabled()) { + TimestampPacketContainer dummyContainer{}; + dummyContainer.assignAndIncrementNodesRefCounts(*this->timestampPacketContainer); + + BcsTimestampPacketContainers &bcsContainers = bcsTimestampPacketContainers[EngineHelpers::getBcsIndex(bcsEngineType)]; + bcsContainers.lastSignalledPacket.swapNodes(dummyContainer); + } +} + +void CommandQueue::fillCsrDependenciesWithLastBcsPackets(CsrDependencies &csrDeps) { + for (BcsTimestampPacketContainers &bcsContainers : bcsTimestampPacketContainers) { + if (bcsContainers.lastSignalledPacket.peekNodes().empty()) { + continue; + } + csrDeps.timestampPacketContainer.push_back(&bcsContainers.lastSignalledPacket); + } +} + +void CommandQueue::clearLastBcsPackets() { + for (BcsTimestampPacketContainers &bcsContainers : bcsTimestampPacketContainers) { + bcsContainers.lastSignalledPacket.moveNodesToNewContainer(*deferredTimestampPackets); + } +} + } // namespace NEO diff --git a/opencl/source/command_queue/command_queue.h b/opencl/source/command_queue/command_queue.h index 64f3aff962..ba894a2c5d 100644 --- a/opencl/source/command_queue/command_queue.h +++ b/opencl/source/command_queue/command_queue.h @@ -325,6 +325,12 @@ class CommandQueue : public BaseObject<_cl_command_queue> { void updateLatestSentEnqueueType(EnqueueProperties::Operation 
newEnqueueType) { this->latestSentEnqueueType = newEnqueueType; } + void setupBarrierTimestampForBcsEngines(aub_stream::EngineType engineType, TimestampPacketDependencies &timestampPacketDependencies); + void processBarrierTimestampForBcsEngine(aub_stream::EngineType bcsEngineType, TimestampPacketDependencies &blitDependencies); + void setLastBcsPacket(aub_stream::EngineType bcsEngineType); + void fillCsrDependenciesWithLastBcsPackets(CsrDependencies &csrDeps); + void clearLastBcsPackets(); + // taskCount of last task uint32_t taskCount = 0; @@ -409,6 +415,11 @@ class CommandQueue : public BaseObject<_cl_command_queue> { std::unique_ptr deferredTimestampPackets; std::unique_ptr timestampPacketContainer; + struct BcsTimestampPacketContainers { + TimestampPacketContainer lastBarrierToWaitFor; + TimestampPacketContainer lastSignalledPacket; + }; + std::array bcsTimestampPacketContainers; }; template diff --git a/opencl/source/command_queue/enqueue_common.h b/opencl/source/command_queue/enqueue_common.h index d6abd24c7d..22eba9744a 100644 --- a/opencl/source/command_queue/enqueue_common.h +++ b/opencl/source/command_queue/enqueue_common.h @@ -249,6 +249,10 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, timestampPacketDependencies, eventsRequest, blockQueue); } + if (!blockQueue && isOOQEnabled()) { + setupBarrierTimestampForBcsEngines(getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), timestampPacketDependencies); + } + if (eventBuilder.getEvent() && getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { eventBuilder.getEvent()->addTimestampPacketNodes(*timestampPacketContainer); eventBuilder.getEvent()->addTimestampPacketNodes(timestampPacketDependencies.nonAuxToAuxNodes); @@ -536,8 +540,6 @@ BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(CommandS device->getHardwareInfo(), args); } - - TimestampPacketHelper::programSemaphore(*commandStream, *currentTimestampPacketNode); } return blitProperties; } @@ -898,8 
+900,13 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( dispatchFlags.pipelineSelectArgs.mediaSamplerRequired = mediaSamplerRequired; dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = specialPipelineSelectMode; + const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); + if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled() && !clearDependenciesForSubCapture) { eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); + if (isHandlingBarrier) { + fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies); + } dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver()); } @@ -937,6 +944,10 @@ CompletionStamp CommandQueueHw::enqueueNonBlocked( dispatchFlags, getDevice()); + if (isHandlingBarrier) { + clearLastBcsPackets(); + } + if (gtpinIsGTPinInitialized()) { gtpinNotifyFlushTask(completionStamp.taskCount); } @@ -1119,8 +1130,13 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( false, //memoryMigrationRequired false); //textureCacheFlush + const bool isHandlingBarrier = getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); + if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OutOfCsr); + if (isHandlingBarrier) { + fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies); + } dispatchFlags.csrDependencies.makeResident(getGpgpuCommandStreamReceiver()); } @@ -1133,6 +1149,10 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( taskLevel, dispatchFlags, getDevice()); + + if (isHandlingBarrier) { + clearLastBcsPackets(); + } } if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) { @@ -1208,9 +1228,10 @@ 
void CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDispat timestampPacketDependencies.cacheFlushNodes.add(allocator->getTag()); } - if (!blockQueue && getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired()) { - timestampPacketDependencies.barrierNodes.add(allocator->getTag()); + if (!blockQueue) { + setupBarrierTimestampForBcsEngines(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies); } + processBarrierTimestampForBcsEngine(bcsCsr.getOsContext().getEngineType(), timestampPacketDependencies); obtainNewTimestampPacketNodes(1, timestampPacketDependencies.previousEnqueueNodes, clearAllDependencies, bcsCsr); csrDeps.timestampPacketContainer.push_back(&timestampPacketDependencies.previousEnqueueNodes); @@ -1243,6 +1264,8 @@ void CommandQueueHw::enqueueBlit(const MultiDispatchInfo &multiDispat } this->latestSentEnqueueType = enqueueProperties.operation; + + setLastBcsPacket(bcsCsr.getOsContext().getEngineType()); } updateFromCompletionStamp(completionStamp, eventBuilder.getEvent()); diff --git a/opencl/source/helpers/task_information.cpp b/opencl/source/helpers/task_information.cpp index af9bc46e38..48030ecab0 100644 --- a/opencl/source/helpers/task_information.cpp +++ b/opencl/source/helpers/task_information.cpp @@ -215,6 +215,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate commandQueue.getGpgpuCommandStreamReceiver(), *bcsCsrForAuxTranslation); } + if (timestampPacketDependencies && commandQueue.isOOQEnabled()) { + commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies); + } + const auto &kernelDescriptor = kernel->getKernelInfo().kernelDescriptor; auto memoryCompressionState = commandStreamReceiver.getMemoryCompressionState(kernel->isAuxTranslationRequired(), commandQueue.getDevice().getHardwareInfo()); @@ -254,8 +258,13 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool 
terminate eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); } + const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); + if (timestampPacketDependencies) { eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); + if (isHandlingBarrier) { + commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies); + } dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies->barrierNodes; } dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = kernel->requiresSpecialPipelineSelectMode(); @@ -289,6 +298,10 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate dispatchFlags, commandQueue.getDevice()); + if (isHandlingBarrier) { + commandQueue.clearLastBcsPackets(); + } + if (kernelOperation->blitPropertiesContainer.size() > 0) { const auto newTaskCount = bcsCsrForAuxTranslation->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice()); commandQueue.updateBcsTaskCount(bcsCsrForAuxTranslation->getOsContext().getEngineType(), newTaskCount); @@ -330,6 +343,7 @@ void CommandWithoutKernel::dispatchBlitOperation() { const auto newTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false, commandQueue.isProfilingEnabled(), commandQueue.getDevice()); commandQueue.updateBcsTaskCount(bcsCsr->getOsContext().getEngineType(), newTaskCount); + commandQueue.setLastBcsPacket(bcsCsr->getOsContext().getEngineType()); } CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminated) { @@ -361,6 +375,10 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate } } + if (timestampPacketDependencies && commandQueue.isOOQEnabled()) { + 
commandQueue.setupBarrierTimestampForBcsEngines(commandQueue.getGpgpuCommandStreamReceiver().getOsContext().getEngineType(), *timestampPacketDependencies); + } + auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex(); DispatchFlags dispatchFlags( {}, //csrDependencies @@ -397,8 +415,13 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate eventsRequest.fillCsrDependenciesForTaskCountContainer(dispatchFlags.csrDependencies, commandStreamReceiver); } + const bool isHandlingBarrier = commandQueue.getGpgpuCommandStreamReceiver().isStallingCommandsOnNextFlushRequired(); + if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) { eventsRequest.fillCsrDependenciesForTimestampPacketContainer(dispatchFlags.csrDependencies, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); + if (isHandlingBarrier) { + commandQueue.fillCsrDependenciesWithLastBcsPackets(dispatchFlags.csrDependencies); + } makeTimestampPacketsResident(commandStreamReceiver); } @@ -413,6 +436,10 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate dispatchFlags, commandQueue.getDevice()); + if (isHandlingBarrier) { + commandQueue.clearLastBcsPackets(); + } + if (kernelOperation->blitEnqueue) { dispatchBlitOperation(); } diff --git a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp index a3c9040ea0..b41609b0fb 100644 --- a/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp +++ b/opencl/test/unit_test/command_queue/blit_enqueue_tests.cpp @@ -236,6 +236,12 @@ struct BlitEnqueueTests : public ::testing::Test { return commandItor; } + template + void expectNoCommand(GenCmdList::iterator itorStart, GenCmdList::iterator itorEnd) { + auto commandItor = find(itorStart, itorEnd); + EXPECT_TRUE(commandItor == itorEnd); + } + template void verifySemaphore(GenCmdList::iterator &semaphoreItor, uint64_t expectedAddress) { using MI_SEMAPHORE_WAIT = 
typename Family::MI_SEMAPHORE_WAIT; @@ -965,13 +971,10 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithNoTimestampPacketTests, givenNoTimestampPacket auto cmdFound = expectCommand(bcsCommands.begin(), bcsCommands.end()); cmdFound = expectMiFlush(cmdFound++, bcsCommands.end()); - auto miflushDwCmd = genCmdCast(*cmdFound); - const auto bcsSignalAddress = miflushDwCmd->getDestinationAddress(); cmdFound = expectCommand(ccsCommands.begin(), ccsCommands.end()); - cmdFound = expectCommand(cmdFound++, ccsCommands.end()); - verifySemaphore(cmdFound, bcsSignalAddress); + expectNoCommand(cmdFound++, ccsCommands.end()); } struct BlitEnqueueWithDebugCapabilityTests : public BlitEnqueueTests<0> { @@ -1726,7 +1729,7 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenCacheFlushR } } -HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenSubmissionToDifferentEngineWhenRequestingForNewTimestmapPacketThenDontClearDependencies) { +HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenSubmissionToDifferentEngineWhenRequestingForNewTimestmapPacketThenClearDependencies) { auto mockCommandQueue = static_cast *>(commandQueue.get()); const bool clearDependencies = true; @@ -1736,12 +1739,6 @@ HWTEST_TEMPLATED_F(BlitEnqueueWithDisabledGpgpuSubmissionTests, givenSubmissionT EXPECT_EQ(0u, previousNodes.peekNodes().size()); } - { - TimestampPacketContainer previousNodes; - mockCommandQueue->obtainNewTimestampPacketNodes(1, previousNodes, clearDependencies, *bcsCsr); - EXPECT_EQ(1u, previousNodes.peekNodes().size()); - } - { TimestampPacketContainer previousNodes; mockCommandQueue->obtainNewTimestampPacketNodes(1, previousNodes, clearDependencies, *bcsCsr); diff --git a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp index b30e6ce780..8203ac24ea 100644 --- a/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp +++ 
b/opencl/test/unit_test/command_queue/command_queue_hw_tests.cpp @@ -5,6 +5,7 @@ * */ +#include "shared/test/common/cmd_parse/hw_parse.h" #include "shared/test/common/helpers/debug_manager_state_restore.h" #include "shared/test/common/helpers/unit_test_helper.h" #include "shared/test/common/libult/ult_command_stream_receiver.h" @@ -1506,20 +1507,23 @@ HWTEST_F(CommandQueueHwTest, WhenForcePerDssBackedBufferProgrammingSetThenDispat EXPECT_TRUE(csr.recordedDispatchFlags.usePerDssBackedBuffer); } +template struct CommandQueueHwBlitTest : ClDeviceFixture, ContextFixture, CommandQueueHwFixture, ::testing::Test { using ContextFixture::SetUp; void SetUp() override { - REQUIRE_FULL_BLITTER_OR_SKIP(defaultHwInfo.get()); + hwInfo = *::defaultHwInfo; + hwInfo.capabilityTable.blitterOperationsSupported = true; + REQUIRE_FULL_BLITTER_OR_SKIP(&hwInfo); DebugManager.flags.EnableBlitterOperationsSupport.set(1); DebugManager.flags.EnableTimestampPacket.set(1); DebugManager.flags.PreferCopyEngineForCopyBufferToBuffer.set(1); - ClDeviceFixture::SetUp(); - pDevice->getRootDeviceEnvironment().getMutableHardwareInfo()->capabilityTable.blitterOperationsSupported = true; + ClDeviceFixture::SetUpImpl(&hwInfo); cl_device_id device = pClDevice; ContextFixture::SetUp(1, &device); - CommandQueueHwFixture::SetUp(pClDevice, 0); + cl_command_queue_properties queueProperties = ooq ? 
CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE : 0; + CommandQueueHwFixture::SetUp(pClDevice, queueProperties); } void TearDown() override { @@ -1528,10 +1532,14 @@ struct CommandQueueHwBlitTest : ClDeviceFixture, ContextFixture, CommandQueueHwF ClDeviceFixture::TearDown(); } + HardwareInfo hwInfo{}; DebugManagerStateRestore state{}; }; -HWTEST_F(CommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsThenGpgpuCommandStreamIsNotObtained) { +using IoqCommandQueueHwBlitTest = CommandQueueHwBlitTest; +using OoqCommandQueueHwBlitTest = CommandQueueHwBlitTest; + +HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsThenGpgpuCommandStreamIsNotObtained) { auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver(); auto srcBuffer = std::unique_ptr{BufferHelper<>::create(pContext)}; auto dstBuffer = std::unique_ptr{BufferHelper<>::create(pContext)}; @@ -1561,7 +1569,7 @@ HWTEST_F(CommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingSubsequentBlitsThenG EXPECT_EQ(0, gpgpuCsr.ensureCommandBufferAllocationCalled); } -HWTEST_F(CommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingBlitAfterKernelThenGpgpuCommandStreamIsObtained) { +HWTEST_F(IoqCommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingBlitAfterKernelThenGpgpuCommandStreamIsObtained) { auto &gpgpuCsr = pDevice->getUltCommandStreamReceiver(); auto srcBuffer = std::unique_ptr{BufferHelper<>::create(pContext)}; auto dstBuffer = std::unique_ptr{BufferHelper<>::create(pContext)}; @@ -1586,3 +1594,185 @@ HWTEST_F(CommandQueueHwBlitTest, givenGpgpuCsrWhenEnqueueingBlitAfterKernelThenG ASSERT_EQ(CL_SUCCESS, retVal); EXPECT_NE(ensureCommandBufferAllocationCalledAfterKernel, gpgpuCsr.ensureCommandBufferAllocationCalled); } + +HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitAfterBarrierWhenEnqueueingCommandThenWaitForBarrierOnBlit) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + if (pCmdQ->getTimestampPacketContainer() == nullptr) { 
+ GTEST_SKIP(); + } + DebugManagerStateRestore restore{}; + DebugManager.flags.DoCpuCopyOnReadBuffer.set(0); + DebugManager.flags.ForceCacheFlushForBcs.set(0); + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + MockKernelWithInternals mockKernelWithInternals(*pClDevice); + MockKernel *kernel = mockKernelWithInternals.mockKernel; + size_t offset = 0; + size_t gws = 1; + BufferDefaults::context = context; + auto buffer = clUniquePtr(BufferHelper<>::create()); + char ptr[1] = {}; + + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr)); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr)); + auto ccsStart = pCmdQ->getGpgpuCommandStreamReceiver().getCS().getUsed(); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr)); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); + + uint64_t barrierNodeAddress = 0u; + { + HardwareParse ccsHwParser; + ccsHwParser.parseCommands(pCmdQ->getGpgpuCommandStreamReceiver().getCS(0), ccsStart); + + const auto pipeControlItor = find(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end()); + auto pipeControl = genCmdCast(*pipeControlItor); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + barrierNodeAddress = pipeControl->getAddress() | (static_cast(pipeControl->getAddressHigh()) << 32); + + // There shouldn't be any semaphores before the barrier + const auto semaphoreItor = find(ccsHwParser.cmdList.begin(), pipeControlItor); + EXPECT_EQ(pipeControlItor, semaphoreItor); + } + + { + HardwareParse bcsHwParser; + bcsHwParser.parseCommands(pCmdQ->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getCS(0), 0u); + + const auto semaphoreItor = find(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end()); + auto semaphore = genCmdCast(*semaphoreItor); + 
EXPECT_EQ(barrierNodeAddress, semaphore->getSemaphoreGraphicsAddress()); + + const auto pipeControlItor = find(semaphoreItor, bcsHwParser.cmdList.end()); + EXPECT_EQ(bcsHwParser.cmdList.end(), pipeControlItor); + } + + EXPECT_EQ(CL_SUCCESS, pCmdQ->finish()); +} + +HWTEST_F(OoqCommandQueueHwBlitTest, givenBlitBeforeBarrierWhenEnqueueingCommandThenWaitForBlitBeforeBarrier) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + if (pCmdQ->getTimestampPacketContainer() == nullptr) { + GTEST_SKIP(); + } + DebugManagerStateRestore restore{}; + DebugManager.flags.DoCpuCopyOnReadBuffer.set(0); + DebugManager.flags.ForceCacheFlushForBcs.set(0); + DebugManager.flags.UpdateTaskCountFromWait.set(1); + + MockKernelWithInternals mockKernelWithInternals(*pClDevice); + MockKernel *kernel = mockKernelWithInternals.mockKernel; + size_t offset = 0; + size_t gws = 1; + BufferDefaults::context = context; + auto buffer = clUniquePtr(BufferHelper<>::create()); + char ptr[1] = {}; + + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); + uint64_t lastBlitNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*pCmdQ->getTimestampPacketContainer()->peekNodes()[0]); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr)); + auto ccsStart = pCmdQ->getGpgpuCommandStreamReceiver().getCS().getUsed(); + auto bcsStart = pCmdQ->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getCS(0).getUsed(); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr)); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr)); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); 
+ EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); + + uint64_t barrierNodeAddress = 0u; + { + HardwareParse ccsHwParser; + ccsHwParser.parseCommands(pCmdQ->getGpgpuCommandStreamReceiver().getCS(0), ccsStart); + + const auto semaphoreItor = find(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end()); + const auto semaphore = genCmdCast(*semaphoreItor); + EXPECT_EQ(lastBlitNodeAddress, semaphore->getSemaphoreGraphicsAddress()); + + const auto pipeControlItor = find(semaphoreItor, ccsHwParser.cmdList.end()); + const auto pipeControl = genCmdCast(*pipeControlItor); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + barrierNodeAddress = pipeControl->getAddress() | (static_cast(pipeControl->getAddressHigh()) << 32); + + // There shouldn't be any more semaphores before the barrier + EXPECT_EQ(pipeControlItor, find(std::next(semaphoreItor), pipeControlItor)); + } + + { + HardwareParse bcsHwParser; + bcsHwParser.parseCommands(pCmdQ->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getCS(0), bcsStart); + + const auto semaphoreItor = find(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end()); + const auto semaphore = genCmdCast(*semaphoreItor); + EXPECT_EQ(barrierNodeAddress, semaphore->getSemaphoreGraphicsAddress()); + EXPECT_EQ(bcsHwParser.cmdList.end(), find(semaphoreItor, bcsHwParser.cmdList.end())); + } + + EXPECT_EQ(CL_SUCCESS, pCmdQ->finish()); +} + +HWTEST_F(OoqCommandQueueHwBlitTest, givenBlockedBlitAfterBarrierWhenEnqueueingCommandThenWaitForBlitBeforeBarrier) { + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + if (pCmdQ->getTimestampPacketContainer() == nullptr) { + GTEST_SKIP(); + } + DebugManagerStateRestore restore{}; + DebugManager.flags.DoCpuCopyOnReadBuffer.set(0); + DebugManager.flags.ForceCacheFlushForBcs.set(0); + 
DebugManager.flags.UpdateTaskCountFromWait.set(1); + + UserEvent userEvent; + cl_event userEventWaitlist[] = {&userEvent}; + MockKernelWithInternals mockKernelWithInternals(*pClDevice); + MockKernel *kernel = mockKernelWithInternals.mockKernel; + size_t offset = 0; + size_t gws = 1; + BufferDefaults::context = context; + auto buffer = clUniquePtr(BufferHelper<>::create()); + char ptr[1] = {}; + + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 0, nullptr, nullptr)); + uint64_t lastBlitNodeAddress = TimestampPacketHelper::getContextEndGpuAddress(*pCmdQ->getTimestampPacketContainer()->peekNodes()[0]); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueKernel(kernel, 1, &offset, &gws, nullptr, 0, nullptr, nullptr)); + auto ccsStart = pCmdQ->getGpgpuCommandStreamReceiver().getCS().getUsed(); + auto bcsStart = pCmdQ->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getCS(0).getUsed(); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr)); + EXPECT_EQ(CL_SUCCESS, pCmdQ->enqueueReadBuffer(buffer.get(), CL_FALSE, 0, 1u, ptr, nullptr, 1, userEventWaitlist, nullptr)); + + userEvent.setStatus(CL_COMPLETE); + + uint64_t barrierNodeAddress = 0u; + { + HardwareParse ccsHwParser; + ccsHwParser.parseCommands(pCmdQ->getGpgpuCommandStreamReceiver().getCS(0), ccsStart); + + const auto semaphoreItor = find(ccsHwParser.cmdList.begin(), ccsHwParser.cmdList.end()); + const auto semaphore = genCmdCast(*semaphoreItor); + EXPECT_EQ(lastBlitNodeAddress, semaphore->getSemaphoreGraphicsAddress()); + + const auto pipeControlItor = find(semaphoreItor, ccsHwParser.cmdList.end()); + const auto pipeControl = genCmdCast(*pipeControlItor); + EXPECT_EQ(PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, pipeControl->getPostSyncOperation()); + barrierNodeAddress = pipeControl->getAddress() | 
(static_cast(pipeControl->getAddressHigh()) << 32); + + // There shouldn't be any more semaphores before the barrier + EXPECT_EQ(pipeControlItor, find(std::next(semaphoreItor), pipeControlItor)); + } + + { + HardwareParse bcsHwParser; + bcsHwParser.parseCommands(pCmdQ->getBcsCommandStreamReceiver(aub_stream::ENGINE_BCS)->getCS(0), bcsStart); + + const auto semaphoreItor = find(bcsHwParser.cmdList.begin(), bcsHwParser.cmdList.end()); + const auto semaphore = genCmdCast(*semaphoreItor); + EXPECT_EQ(barrierNodeAddress, semaphore->getSemaphoreGraphicsAddress()); + EXPECT_EQ(bcsHwParser.cmdList.end(), find(semaphoreItor, bcsHwParser.cmdList.end())); + } + + EXPECT_EQ(CL_SUCCESS, pCmdQ->finish()); +} diff --git a/opencl/test/unit_test/command_queue/command_queue_tests.cpp b/opencl/test/unit_test/command_queue/command_queue_tests.cpp index 1ded418863..bdad289f4e 100644 --- a/opencl/test/unit_test/command_queue/command_queue_tests.cpp +++ b/opencl/test/unit_test/command_queue/command_queue_tests.cpp @@ -1837,6 +1837,204 @@ TEST(CommandQueue, givenSupportForOutEventAndOutEventIsPassedWhenValidatingSuppo EXPECT_TRUE(queue.validateCapabilityForOperation(CL_QUEUE_CAPABILITY_TRANSFER_BUFFER_INTEL, 0, nullptr, &outEvent)); } +struct CommandQueueWithTimestampPacketTests : ::testing::Test { + void SetUp() override { + DebugManager.flags.EnableTimestampPacket.set(1); + } + + DebugManagerStateRestore restore{}; +}; + +TEST_F(CommandQueueWithTimestampPacketTests, givenInOrderQueueWhenSetupBarrierTimestampForBcsEnginesCalledThenEnsureBarrierNodeIsPresent) { + MockContext context{}; + MockCommandQueue queue{context}; + TimestampPacketDependencies dependencies{}; + for (auto &containers : queue.bcsTimestampPacketContainers) { + EXPECT_TRUE(containers.lastBarrierToWaitFor.peekNodes().empty()); + } + + // No pending barrier, skip + queue.setupBarrierTimestampForBcsEngines(aub_stream::EngineType::ENGINE_RCS, dependencies); + EXPECT_EQ(0u, dependencies.barrierNodes.peekNodes().size()); + + 
// Add barrier node + queue.getGpgpuCommandStreamReceiver().requestStallingCommandsOnNextFlush(); + queue.setupBarrierTimestampForBcsEngines(aub_stream::EngineType::ENGINE_RCS, dependencies); + EXPECT_EQ(1u, dependencies.barrierNodes.peekNodes().size()); + auto node1 = dependencies.barrierNodes.peekNodes()[0]; + + // Do not add new node, if it exists + queue.setupBarrierTimestampForBcsEngines(aub_stream::EngineType::ENGINE_RCS, dependencies); + EXPECT_EQ(1u, dependencies.barrierNodes.peekNodes().size()); + auto node2 = dependencies.barrierNodes.peekNodes()[0]; + EXPECT_EQ(node2, node1); + + for (auto &containers : queue.bcsTimestampPacketContainers) { + EXPECT_TRUE(containers.lastBarrierToWaitFor.peekNodes().empty()); /* in-order queue: the barrier is not saved in any per-BCS container */ + } +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenOutOfOrderQueueWhenSetupBarrierTimestampForBcsEnginesCalledOnBcsEngineThenEnsureBarrierNodeIsPresentAndSaveItForOtherBcses) { + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; + MockContext context{}; + MockCommandQueue queue{&context, context.getDevice(0), props, false}; + TimestampPacketDependencies dependencies{}; + queue.getGpgpuCommandStreamReceiver().requestStallingCommandsOnNextFlush(); + for (auto &containers : queue.bcsTimestampPacketContainers) { + EXPECT_TRUE(containers.lastBarrierToWaitFor.peekNodes().empty()); + } + + queue.setupBarrierTimestampForBcsEngines(aub_stream::EngineType::ENGINE_BCS, dependencies); + EXPECT_EQ(1u, dependencies.barrierNodes.peekNodes().size()); + auto barrierNode = dependencies.barrierNodes.peekNodes()[0]; + + for (auto currentBcsIndex = 0u; currentBcsIndex < queue.bcsTimestampPacketContainers.size(); currentBcsIndex++) { + auto &containers = queue.bcsTimestampPacketContainers[currentBcsIndex]; + if (currentBcsIndex == 0) { /* index 0 is expected to be ENGINE_BCS, the engine passed to the call above; its barrier lives only in dependencies.barrierNodes, so nothing is saved for it */ + EXPECT_EQ(0u, containers.lastBarrierToWaitFor.peekNodes().size()); + } else { + EXPECT_EQ(1u, containers.lastBarrierToWaitFor.peekNodes().size()); + EXPECT_EQ(barrierNode,
containers.lastBarrierToWaitFor.peekNodes()[0]); + } + } + EXPECT_EQ(queue.bcsTimestampPacketContainers.size(), barrierNode->refCountFetchSub(0)); /* refCountFetchSub(0) is used as a read of the current count (presumably fetch-and-subtract of zero - confirm); expected: every BCS container except index 0 saved the node, plus presumably the node's own initial reference */ +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenOutOfOrderQueueWhenSetupBarrierTimestampForBcsEnginesCalledOnNonBcsEngineThenEnsureBarrierNodeIsPresentAndSaveItForBcses) { + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; + MockContext context{}; + MockCommandQueue queue{&context, context.getDevice(0), props, false}; + TimestampPacketDependencies dependencies{}; + queue.getGpgpuCommandStreamReceiver().requestStallingCommandsOnNextFlush(); + for (auto &containers : queue.bcsTimestampPacketContainers) { + EXPECT_TRUE(containers.lastBarrierToWaitFor.peekNodes().empty()); + } + + for (auto engineType : {aub_stream::EngineType::ENGINE_RCS, + aub_stream::EngineType::ENGINE_CCS}) { + queue.setupBarrierTimestampForBcsEngines(engineType, dependencies); + EXPECT_EQ(1u, dependencies.barrierNodes.peekNodes().size()); + auto barrierNode = dependencies.barrierNodes.peekNodes()[0]; + + for (auto &containers : queue.bcsTimestampPacketContainers) { + EXPECT_EQ(1u, containers.lastBarrierToWaitFor.peekNodes().size()); + EXPECT_EQ(barrierNode, containers.lastBarrierToWaitFor.peekNodes()[0]); + } + EXPECT_EQ(1u + queue.bcsTimestampPacketContainers.size(), barrierNode->refCountFetchSub(0)); /* every BCS container holds the node (checked in the loop above), plus presumably the node's own initial reference */ + } +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenSavedBarrierWhenProcessBarrierTimestampForBcsEngineCalledThenMoveSaveBarrierPacketToBarrierNodes) { + MockContext context{}; + MockCommandQueue queue{context}; + TimestampPacketDependencies dependencies{}; + + // No saved barriers + queue.processBarrierTimestampForBcsEngine(aub_stream::EngineType::ENGINE_BCS, dependencies); + EXPECT_TRUE(dependencies.barrierNodes.peekNodes().empty()); + + // Save barrier + TagNodeBase *node = queue.getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag(); +
queue.bcsTimestampPacketContainers[0].lastBarrierToWaitFor.add(node); + queue.processBarrierTimestampForBcsEngine(aub_stream::EngineType::ENGINE_BCS, dependencies); + EXPECT_EQ(1u, dependencies.barrierNodes.peekNodes().size()); + EXPECT_EQ(node, dependencies.barrierNodes.peekNodes()[0]); + EXPECT_TRUE(queue.bcsTimestampPacketContainers[0].lastBarrierToWaitFor.peekNodes().empty()); /* the saved barrier was moved out of the per-engine slot, not copied */ +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenOutOfOrderQueueWhenBarrierTimestampAreSetupOnComputeEngineAndProcessedOnBcsThenPacketIsInBarrierNodes) { + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; + MockContext context{}; + MockCommandQueue queue{&context, context.getDevice(0), props, false}; + queue.getGpgpuCommandStreamReceiver().requestStallingCommandsOnNextFlush(); + + for (auto engineType : {aub_stream::EngineType::ENGINE_RCS, + aub_stream::EngineType::ENGINE_CCS}) { + TimestampPacketDependencies dependencies{}; + queue.setupBarrierTimestampForBcsEngines(engineType, dependencies); + + TimestampPacketDependencies blitDependencies{}; + queue.processBarrierTimestampForBcsEngine(aub_stream::EngineType::ENGINE_BCS, blitDependencies); + EXPECT_EQ(1u, blitDependencies.barrierNodes.peekNodes().size()); /* the barrier saved by the compute-engine call above ends up in the separate blit dependencies */ + } +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenOutOfOrderQueueWhenBarrierTimestampAreSetupOnBcsEngineAndProcessedOnBcsThenPacketIsInBarrierNodes) { + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; + MockContext context{}; + MockCommandQueue queue{&context, context.getDevice(0), props, false}; + queue.getGpgpuCommandStreamReceiver().requestStallingCommandsOnNextFlush(); + + TimestampPacketDependencies dependencies{}; + queue.setupBarrierTimestampForBcsEngines(aub_stream::EngineType::ENGINE_BCS, dependencies); + queue.processBarrierTimestampForBcsEngine(aub_stream::EngineType::ENGINE_BCS, dependencies); + EXPECT_EQ(1u,
dependencies.barrierNodes.peekNodes().size()); +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenInOrderQueueWhenSettingLastBcsPacketThenDoNotSaveThePacket) { + MockContext context{}; + MockCommandQueue queue{context}; + + queue.setLastBcsPacket(aub_stream::EngineType::ENGINE_BCS); + EXPECT_TRUE(queue.bcsTimestampPacketContainers[0].lastSignalledPacket.peekNodes().empty()); /* NOTE(review): queue.timestampPacketContainer is never populated in this test, so this check presumably passes regardless of queue ordering - consider adding a packet before setLastBcsPacket */ +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenOutOfOrderQueueWhenSettingLastBcsPacketThenSaveOnlyOneLastPacket) { + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; + MockContext context{}; + MockCommandQueue queue{&context, context.getDevice(0), props, false}; + + queue.timestampPacketContainer->add(queue.getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + queue.setLastBcsPacket(aub_stream::EngineType::ENGINE_BCS); + EXPECT_EQ(queue.timestampPacketContainer->peekNodes(), queue.bcsTimestampPacketContainers[0].lastSignalledPacket.peekNodes()); + EXPECT_EQ(1u, queue.timestampPacketContainer->peekNodes().size()); + + queue.timestampPacketContainer->moveNodesToNewContainer(*queue.getDeferredTimestampPackets()); + + queue.timestampPacketContainer->add(queue.getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + queue.setLastBcsPacket(aub_stream::EngineType::ENGINE_BCS); + EXPECT_EQ(queue.timestampPacketContainer->peekNodes(), queue.bcsTimestampPacketContainers[0].lastSignalledPacket.peekNodes()); /* equal to the queue's single current packet, i.e. the previously saved packet was replaced rather than appended */ + EXPECT_EQ(1u, queue.timestampPacketContainer->peekNodes().size()); +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenLastSignalledPacketWhenFillingCsrDependenciesThenMovePacketToCsrDependencies) { + MockContext context{}; + MockCommandQueue queue{context}; + queue.bcsTimestampPacketContainers[0].lastSignalledPacket.add(queue.getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + + CsrDependencies csrDeps; + queue.fillCsrDependenciesWithLastBcsPackets(csrDeps); +
EXPECT_EQ(1u, queue.bcsTimestampPacketContainers[0].lastSignalledPacket.peekNodes().size()); /* the packet stays in the queue's own container; csrDeps only references that container, as the next check shows */ + EXPECT_EQ(&queue.bcsTimestampPacketContainers[0].lastSignalledPacket, csrDeps.timestampPacketContainer[0]); +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenLastSignalledPacketWhenClearingPacketsThenClearThePacket) { + MockContext context{}; + MockCommandQueue queue{context}; + queue.bcsTimestampPacketContainers[0].lastSignalledPacket.add(queue.getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + + queue.clearLastBcsPackets(); + EXPECT_EQ(0u, queue.bcsTimestampPacketContainers[0].lastSignalledPacket.peekNodes().size()); /* assert on the container populated above; lastBarrierToWaitFor was never filled in this test, so checking it could not detect a failed clear */ +} + +TEST_F(CommandQueueWithTimestampPacketTests, givenQueueWhenSettingAndQueryingLastBcsPacketThenReturnCorrectResults) { + const cl_queue_properties props[3] = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0}; + MockContext context{}; + MockCommandQueue queue{&context, context.getDevice(0), props, false}; + queue.timestampPacketContainer->add(queue.getGpgpuCommandStreamReceiver().getTimestampPacketAllocator()->getTag()); + + queue.setLastBcsPacket(aub_stream::EngineType::ENGINE_BCS); + + CsrDependencies csrDeps; + queue.fillCsrDependenciesWithLastBcsPackets(csrDeps); + EXPECT_FALSE(csrDeps.timestampPacketContainer.empty()); + + queue.clearLastBcsPackets(); + for (auto &containers : queue.bcsTimestampPacketContainers) { + EXPECT_TRUE(containers.lastSignalledPacket.peekNodes().empty()); + } +} + using KernelExecutionTypesTests = DispatchFlagsTests; HWTEST_F(KernelExecutionTypesTests, givenConcurrentKernelWhileDoingNonBlockedEnqueueThenCorrectKernelTypeIsSetInCSR) { using CsrType = MockCsrHw2; diff --git a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp index 435959b2b4..8e46f3c81b 100644 --- a/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp +++ b/opencl/test/unit_test/mem_obj/buffer_bcs_tests.cpp @@ -489,7 +489,7 @@ HWTEST_TEMPLATED_F(BcsBufferTests,
givenMapAllocationWhenEnqueueingReadOrWriteBu EXPECT_EQ(mapAllocation, mockCmdQ->kernelParams.transferAllocation); } -HWTEST_TEMPLATED_F(BcsBufferTests, givenWriteBufferEnqueueWhenProgrammingCommandStreamThenAddSemaphoreWait) { +HWTEST_TEMPLATED_F(BcsBufferTests, givenWriteBufferEnqueueWithGpgpuSubmissionWhenProgrammingCommandStreamThenDoNotAddSemaphoreWaitOnGpgpu) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto cmdQ = clUniquePtr(new MockCommandQueueHw(bcsMockContext.get(), device.get(), nullptr)); @@ -503,7 +503,6 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenWriteBufferEnqueueWhenProgrammingCommand void *hostPtr = reinterpret_cast(0x12340000); cmdQ->enqueueWriteBuffer(buffer.get(), true, 0, 1, hostPtr, nullptr, 0, nullptr, nullptr); - auto timestampPacketNode = cmdQ->timestampPacketContainer->peekNodes().at(0); HardwareParse hwParser; hwParser.parseCommands(*cmdQ->peekCommandStream()); @@ -515,15 +514,13 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenWriteBufferEnqueueWhenProgrammingCommand continue; } semaphoresCount++; - auto dataAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode); - EXPECT_EQ(dataAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); } } - EXPECT_EQ(1u, semaphoresCount); + EXPECT_EQ(0u, semaphoresCount); EXPECT_EQ(initialTaskCount + 1, queueCsr->peekTaskCount()); } -HWTEST_TEMPLATED_F(BcsBufferTests, givenReadBufferEnqueueWhenProgrammingCommandStreamThenAddSemaphoreWait) { +HWTEST_TEMPLATED_F(BcsBufferTests, givenReadBufferEnqueueWithGpgpuSubmissionWhenProgrammingCommandStreamThenDoNotAddSemaphoreWaitOnGpgpu) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; auto cmdQ = clUniquePtr(new MockCommandQueueHw(bcsMockContext.get(), device.get(), nullptr)); @@ -537,7 +534,6 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenReadBufferEnqueueWhenProgrammingCommandS void *hostPtr = reinterpret_cast(0x12340000); cmdQ->enqueueWriteBuffer(buffer.get(), true, 0, 1, hostPtr, nullptr, 0, nullptr, 
nullptr); - auto timestampPacketNode = cmdQ->timestampPacketContainer->peekNodes().at(0); HardwareParse hwParser; hwParser.parseCommands(*cmdQ->peekCommandStream()); @@ -549,11 +545,9 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenReadBufferEnqueueWhenProgrammingCommandS continue; } semaphoresCount++; - auto dataAddress = TimestampPacketHelper::getContextEndGpuAddress(*timestampPacketNode); - EXPECT_EQ(dataAddress, semaphoreCmd->getSemaphoreGraphicsAddress()); } } - EXPECT_EQ(1u, semaphoresCount); + EXPECT_EQ(0u, semaphoresCount); EXPECT_EQ(initialTaskCount + 1, queueCsr->peekTaskCount()); } diff --git a/opencl/test/unit_test/mocks/mock_command_queue.h b/opencl/test/unit_test/mocks/mock_command_queue.h index 237e868205..151105957d 100644 --- a/opencl/test/unit_test/mocks/mock_command_queue.h +++ b/opencl/test/unit_test/mocks/mock_command_queue.h @@ -20,6 +20,7 @@ class MockCommandQueue : public CommandQueue { public: using CommandQueue::bcsEngines; using CommandQueue::bcsEngineTypes; + using CommandQueue::bcsTimestampPacketContainers; using CommandQueue::blitEnqueueAllowed; using CommandQueue::blitEnqueueImageAllowed; using CommandQueue::bufferCpuCopyAllowed; diff --git a/shared/source/helpers/engine_node_helper_extra.cpp b/shared/source/helpers/engine_node_helper_extra.cpp index 6c3b83bdd4..ffefce15a8 100644 --- a/shared/source/helpers/engine_node_helper_extra.cpp +++ b/shared/source/helpers/engine_node_helper_extra.cpp @@ -31,5 +31,10 @@ uint32_t getBcsIndex(aub_stream::EngineType engineType) { return 0; } +aub_stream::EngineType mapBcsIndexToEngineType(uint32_t index, bool includeMainCopyEngine) { + DEBUG_BREAK_IF(index != 0); /* this variant always returns ENGINE_BCS, so only index 0 is expected; includeMainCopyEngine is not consulted here */ + return aub_stream::ENGINE_BCS; +} + } // namespace EngineHelpers } // namespace NEO diff --git a/shared/source/utilities/tag_allocator.h b/shared/source/utilities/tag_allocator.h index 117fa10581..4edab91851 100644 --- a/shared/source/utilities/tag_allocator.h +++ b/shared/source/utilities/tag_allocator.h @@ -54,8 +54,6 @@ class TagNodeBase
: public NonCopyableOrMovableClass { bool isProfilingCapable() const { return profilingCapable; } - const TagAllocatorBase *getAllocator() const { return allocator; } - // TagType specific calls virtual void assignDataToAllTimestamps(uint32_t packetIndex, void *source) = 0;