From 80407aec155ef647af355c59997c21002dc4d970 Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Tue, 12 Nov 2019 09:37:16 +0100 Subject: [PATCH] Program barrierTimestampPacketNodes only on first unblocked command Change-Id: I8ebba9f8326e3da2365c001b0c350efb372a3774 Signed-off-by: Dunajski, Bartosz --- runtime/command_queue/command_queue_hw.h | 3 +- runtime/command_queue/enqueue_common.h | 9 ++-- .../command_stream_receiver_hw_base.inl | 2 + runtime/command_stream/csr_definitions.h | 4 +- runtime/helpers/task_information.cpp | 20 ++++---- runtime/helpers/task_information.h | 5 +- .../enqueue_command_without_kernel_tests.cpp | 6 +-- unit_tests/mem_obj/buffer_tests.cpp | 50 +++++++++++++++++++ 8 files changed, 71 insertions(+), 28 deletions(-) diff --git a/runtime/command_queue/command_queue_hw.h b/runtime/command_queue/command_queue_hw.h index 7f146f0840..3bf63337a5 100644 --- a/runtime/command_queue/command_queue_hw.h +++ b/runtime/command_queue/command_queue_hw.h @@ -345,7 +345,6 @@ class CommandQueueHw : public CommandQueue { size_t surfacesCount, const MultiDispatchInfo &multiDispatchInfo, TimestampPacketContainer &previousTimestampPacketNodes, - TimestampPacketContainer &barrierTimestampPacketNode, std::unique_ptr &blockedCommandsData, const EnqueueProperties &enqueueProperties, EventsRequest &eventsRequest, @@ -359,7 +358,7 @@ class CommandQueueHw : public CommandQueue { bool &blocking, const EnqueueProperties &enqueueProperties, TimestampPacketContainer *previousTimestampPacketNodes, - const TimestampPacketContainer &barrierTimestampPacketNodes, + TimestampPacketContainer &barrierTimestampPacketNodes, EventsRequest &eventsRequest, EventBuilder &eventBuilder, uint32_t taskLevel); diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 9e092c5d8c..0706ef0ea2 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -205,7 +205,7 @@ void 
CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, nodesCount = estimateTimestampPacketNodesCount(multiDispatchInfo); } - if (blitEnqueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) { + if (blitEnqueue && !blockQueue && getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()) { auto allocator = getGpgpuCommandStreamReceiver().getTimestampPacketAllocator(); barrierTimestampPacketNode.add(allocator->getTag()); } @@ -346,7 +346,6 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, numSurfaceForResidency, multiDispatchInfo, previousTimestampPacketNodes, - barrierTimestampPacketNode, blockedCommandsData, enqueueProperties, eventsRequest, @@ -741,7 +740,6 @@ void CommandQueueHw::enqueueBlocked( size_t surfaceCount, const MultiDispatchInfo &multiDispatchInfo, TimestampPacketContainer &previousTimestampPacketNodes, - TimestampPacketContainer &barrierTimestampPacketNode, std::unique_ptr &blockedCommandsData, const EnqueueProperties &enqueueProperties, EventsRequest &eventsRequest, @@ -820,8 +818,7 @@ void CommandQueueHw::enqueueBlocked( auto event = castToObjectOrAbort(eventsRequest.eventWaitList[i]); event->incRefInternal(); } - command->setTimestampPacketNode(*timestampPacketContainer, std::move(previousTimestampPacketNodes), - std::move(barrierTimestampPacketNode)); + command->setTimestampPacketNode(*timestampPacketContainer, std::move(previousTimestampPacketNodes)); command->setEventsRequest(eventsRequest); } outEvent->setCommand(std::move(command)); @@ -846,7 +843,7 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( bool &blocking, const EnqueueProperties &enqueueProperties, TimestampPacketContainer *previousTimestampPacketNodes, - const TimestampPacketContainer &barrierTimestampPacketNodes, + TimestampPacketContainer &barrierTimestampPacketNodes, EventsRequest &eventsRequest, EventBuilder &eventBuilder, uint32_t taskLevel) { diff --git 
a/runtime/command_stream/command_stream_receiver_hw_base.inl b/runtime/command_stream/command_stream_receiver_hw_base.inl index 8d5fb2cd6f..c2589f05f2 100644 --- a/runtime/command_stream/command_stream_receiver_hw_base.inl +++ b/runtime/command_stream/command_stream_receiver_hw_base.inl @@ -516,6 +516,8 @@ inline void CommandStreamReceiverHw::programStallingPipeControlForBar stallingPipeControlCmd = PipeControlHelper::obtainPipeControlAndProgramPostSyncOperation( cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, barrierTimestampPacketGpuAddress, 0, false, peekHwInfo()); + + dispatchFlags.barrierTimestampPacketNodes->makeResident(*this); } else { stallingPipeControlCmd = PipeControlHelper::addPipeControl(cmdStream, false); } diff --git a/runtime/command_stream/csr_definitions.h b/runtime/command_stream/csr_definitions.h index 3650bb1c28..b4abddd648 100644 --- a/runtime/command_stream/csr_definitions.h +++ b/runtime/command_stream/csr_definitions.h @@ -44,7 +44,7 @@ constexpr uint32_t l3AndL1On = 2u; struct DispatchFlags { DispatchFlags() = delete; - DispatchFlags(CsrDependencies csrDependencies, const TimestampPacketContainer *barrierTimestampPacketNodes, PipelineSelectArgs pipelineSelectArgs, + DispatchFlags(CsrDependencies csrDependencies, TimestampPacketContainer *barrierTimestampPacketNodes, PipelineSelectArgs pipelineSelectArgs, FlushStampTrackingObj *flushStampReference, QueueThrottle throttle, PreemptionMode preemptionMode, uint32_t numGrfRequired, uint32_t l3CacheSettings, uint64_t sliceCount, bool blocking, bool dcFlush, bool useSLM, bool guardCommandBufferWithPipeControl, bool gsba32BitRequired, @@ -70,7 +70,7 @@ struct DispatchFlags { multiEngineQueue(multiEngineQueue), epilogueRequired(epilogueRequired){}; CsrDependencies csrDependencies; - const TimestampPacketContainer *barrierTimestampPacketNodes = nullptr; + TimestampPacketContainer *barrierTimestampPacketNodes = nullptr; PipelineSelectArgs pipelineSelectArgs; 
FlushStampTrackingObj *flushStampReference = nullptr; QueueThrottle throttle = QueueThrottle::MEDIUM; diff --git a/runtime/helpers/task_information.cpp b/runtime/helpers/task_information.cpp index c8e285c463..8d8d002277 100644 --- a/runtime/helpers/task_information.cpp +++ b/runtime/helpers/task_information.cpp @@ -252,14 +252,14 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate return completionStamp; } -void CommandWithoutKernel::dispatchBlitOperation() { +void CommandWithoutKernel::dispatchBlitOperation(TimestampPacketContainer &barrierTimestampPacketNodes) { auto bcsCsr = commandQueue.getBcsCommandStreamReceiver(); UNRECOVERABLE_IF(kernelOperation->blitPropertiesContainer.size() != 1); auto &blitProperties = *kernelOperation->blitPropertiesContainer.begin(); blitProperties.csrDependencies.fillFromEventsRequest(eventsRequest, *bcsCsr, CsrDependencies::DependenciesType::All); blitProperties.csrDependencies.push_back(previousTimestampPacketNodes.get()); - blitProperties.csrDependencies.push_back(barrierTimestampPacketNodes.get()); + blitProperties.csrDependencies.push_back(&barrierTimestampPacketNodes); blitProperties.outputTimestampPacket = currentTimestampPacketNodes.get(); auto bcsTaskCount = bcsCsr->blitBuffer(kernelOperation->blitPropertiesContainer, false); @@ -283,14 +283,18 @@ CompletionStamp &CommandWithoutKernel::submit(uint32_t taskLevel, bool terminate } auto lockCSR = commandStreamReceiver.obtainUniqueOwnership(); + TimestampPacketContainer barrierTimestampPacketNodes; if (kernelOperation->blitEnqueue) { - dispatchBlitOperation(); + if (commandStreamReceiver.isStallingPipeControlOnNextFlushRequired()) { + barrierTimestampPacketNodes.add(commandStreamReceiver.getTimestampPacketAllocator()->getTag()); + } + dispatchBlitOperation(barrierTimestampPacketNodes); } DispatchFlags dispatchFlags( {}, //csrDependencies - barrierTimestampPacketNodes.get(), //barrierTimestampPacketNodes + &barrierTimestampPacketNodes, 
//barrierTimestampPacketNodes {}, //pipelineSelectArgs commandQueue.flushStamp->getStampReference(), //flushStampReference commandQueue.getThrottle(), //throttle @@ -341,15 +345,12 @@ void Command::setEventsRequest(EventsRequest &eventsRequest) { } } -void Command::setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &&previous, TimestampPacketContainer &&barrier) { +void Command::setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &&previous) { currentTimestampPacketNodes = std::make_unique(); currentTimestampPacketNodes->assignAndIncrementNodesRefCounts(current); previousTimestampPacketNodes = std::make_unique(); *previousTimestampPacketNodes = std::move(previous); - - barrierTimestampPacketNodes = std::make_unique(); - *barrierTimestampPacketNodes = std::move(barrier); } Command::~Command() { @@ -378,9 +379,6 @@ void Command::makeTimestampPacketsResident(CommandStreamReceiver &commandStreamR if (previousTimestampPacketNodes) { previousTimestampPacketNodes->makeResident(commandStreamReceiver); } - if (barrierTimestampPacketNodes) { - barrierTimestampPacketNodes->makeResident(commandStreamReceiver); - } } Command::Command(CommandQueue &commandQueue) : commandQueue(commandQueue) {} diff --git a/runtime/helpers/task_information.h b/runtime/helpers/task_information.h index aa4ccf8436..3f91e8be15 100644 --- a/runtime/helpers/task_information.h +++ b/runtime/helpers/task_information.h @@ -94,7 +94,7 @@ class Command : public IFNode { virtual LinearStream *getCommandStream() { return nullptr; } - void setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &&previous, TimestampPacketContainer &&barrier); + void setTimestampPacketNode(TimestampPacketContainer &current, TimestampPacketContainer &&previous); void setEventsRequest(EventsRequest &eventsRequest); void makeTimestampPacketsResident(CommandStreamReceiver &commandStreamReceiver); @@ -106,7 +106,6 @@ class Command : public IFNode { std::unique_ptr
kernelOperation; std::unique_ptr currentTimestampPacketNodes; std::unique_ptr previousTimestampPacketNodes; - std::unique_ptr barrierTimestampPacketNodes; EventsRequest eventsRequest = {0, nullptr, nullptr}; std::vector eventsWaitlist; }; @@ -153,6 +152,6 @@ class CommandWithoutKernel : public Command { public: using Command::Command; CompletionStamp &submit(uint32_t taskLevel, bool terminated) override; - void dispatchBlitOperation(); + void dispatchBlitOperation(TimestampPacketContainer &barrierTimestampPacketNodes); }; } // namespace NEO diff --git a/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp b/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp index d1b559fd21..c0915a832a 100644 --- a/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp +++ b/unit_tests/command_queue/enqueue_command_without_kernel_tests.cpp @@ -63,7 +63,6 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg auto blockedCommandsDataForDependencyFlush = new KernelOperation(commandStream, *csr.getInternalAllocationStorage()); TimestampPacketContainer previousTimestampPacketNodes; - TimestampPacketContainer barrierTimestampPacketNodes; MultiDispatchInfo multiDispatchInfo; EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; @@ -73,7 +72,7 @@ HWTEST_F(EnqueueHandlerTest, givenNonBlitPropertyWhenEnqueueIsBlockedThenDontReg auto blockedCommandsData = std::unique_ptr(blockedCommandsDataForDependencyFlush); Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_MARKER, surfaces, size_t(0), multiDispatchInfo, previousTimestampPacketNodes, - barrierTimestampPacketNodes, blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest, + blockedCommandsData, enqueuePropertiesForDependencyFlush, eventsRequest, eventBuilder, std::unique_ptr(nullptr)); EXPECT_FALSE(blockedCommandsDataForDependencyFlush->blitEnqueue); } @@ -88,7 +87,6 @@ HWTEST_F(EnqueueHandlerTest, 
givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl auto blockedCommandsDataForBlitEnqueue = new KernelOperation(commandStream, *csr.getInternalAllocationStorage()); TimestampPacketContainer previousTimestampPacketNodes; - TimestampPacketContainer barrierTimestampPacketNodes; MultiDispatchInfo multiDispatchInfo; EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; @@ -103,7 +101,7 @@ HWTEST_F(EnqueueHandlerTest, givenBlitPropertyWhenEnqueueIsBlockedThenRegisterBl auto blockedCommandsData = std::unique_ptr(blockedCommandsDataForBlitEnqueue); Surface *surfaces[] = {nullptr}; mockCmdQ->enqueueBlocked(CL_COMMAND_READ_BUFFER, surfaces, size_t(0), multiDispatchInfo, previousTimestampPacketNodes, - barrierTimestampPacketNodes, blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest, + blockedCommandsData, enqueuePropertiesForBlitEnqueue, eventsRequest, eventBuilder, std::unique_ptr(nullptr)); EXPECT_TRUE(blockedCommandsDataForBlitEnqueue->blitEnqueue); EXPECT_EQ(blitProperties.srcAllocation, blockedCommandsDataForBlitEnqueue->blitPropertiesContainer.begin()->srcAllocation); diff --git a/unit_tests/mem_obj/buffer_tests.cpp b/unit_tests/mem_obj/buffer_tests.cpp index be85a56ca6..4f04dcd40c 100644 --- a/unit_tests/mem_obj/buffer_tests.cpp +++ b/unit_tests/mem_obj/buffer_tests.cpp @@ -1031,6 +1031,56 @@ HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlitEnq EXPECT_EQ(pipeControlWriteAddress, genCmdCast(*(semaphores[0]))->getSemaphoreGraphicsAddress()); } +HWTEST_TEMPLATED_F(BcsBufferTests, givenBarrierWhenReleasingMultipleBlockedEnqueuesThenProgramBarrierOnce) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + + auto cmdQ = clUniquePtr(new MockCommandQueueHw(bcsMockContext.get(), device.get(), nullptr)); + + cl_int retVal = CL_SUCCESS; + auto buffer = clUniquePtr(Buffer::create(bcsMockContext.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + buffer->forceDisallowCPUCopy = true; + void *hostPtr = 
reinterpret_cast(0x12340000); + + UserEvent userEvent0, userEvent1; + cl_event waitlist0[] = {&userEvent0}; + cl_event waitlist1[] = {&userEvent1}; + + cmdQ->enqueueBarrierWithWaitList(0, nullptr, nullptr); + cmdQ->enqueueWriteBuffer(buffer.get(), false, 0, 1, hostPtr, nullptr, 1, waitlist0, nullptr); + cmdQ->enqueueWriteBuffer(buffer.get(), false, 0, 1, hostPtr, nullptr, 1, waitlist1, nullptr); + + auto pipeControlLookup = [](LinearStream &stream, size_t offset) { + HardwareParse hwParser; + hwParser.parseCommands(stream, offset); + + bool stallingPipeControlFound = false; + for (auto &cmd : hwParser.cmdList) { + if (auto pipeControlCmd = genCmdCast(cmd)) { + if (pipeControlCmd->getPostSyncOperation() != PIPE_CONTROL::POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) { + continue; + } + + stallingPipeControlFound = true; + EXPECT_TRUE(pipeControlCmd->getCommandStreamerStallEnable()); + break; + } + } + + return stallingPipeControlFound; + }; + + auto &csrStream = cmdQ->getGpgpuCommandStreamReceiver().getCS(0); + EXPECT_TRUE(cmdQ->getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()); + userEvent0.setStatus(CL_COMPLETE); + EXPECT_FALSE(cmdQ->getGpgpuCommandStreamReceiver().isStallingPipeControlOnNextFlushRequired()); + EXPECT_TRUE(pipeControlLookup(csrStream, 0)); + + auto csrOffset = csrStream.getUsed(); + userEvent1.setStatus(CL_COMPLETE); + EXPECT_FALSE(pipeControlLookup(csrStream, csrOffset)); + cmdQ->isQueueBlocked(); +} + HWTEST_TEMPLATED_F(BcsBufferTests, givenPipeControlRequestWhenDispatchingBlockedBlitEnqueueThenWaitPipeControlOnBcsEngine) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;