diff --git a/runtime/command_queue/command_queue_hw.h b/runtime/command_queue/command_queue_hw.h index 02fbc96092..3982022b26 100644 --- a/runtime/command_queue/command_queue_hw.h +++ b/runtime/command_queue/command_queue_hw.h @@ -347,7 +347,7 @@ class CommandQueueHw : public CommandQueue { LinearStream &commandStream, size_t commandStreamStart, bool &blocking, - bool blitEnqueue, + const EnqueueProperties &enqueueProperties, TimestampPacketContainer *previousTimestampPacketNodes, EventsRequest &eventsRequest, EventBuilder &eventBuilder, @@ -356,11 +356,11 @@ class CommandQueueHw : public CommandQueue { size_t numSurfaces, LinearStream *commandStream, CsrDependencies &csrDeps); - void processDispatchForBlitEnqueue(const MultiDispatchInfo &multiDispatchInfo, - TimestampPacketContainer &previousTimestampPacketNodes, - const EventsRequest &eventsRequest, - LinearStream &commandStream, - uint32_t commandType); + BlitProperties processDispatchForBlitEnqueue(const MultiDispatchInfo &multiDispatchInfo, + TimestampPacketContainer &previousTimestampPacketNodes, + const EventsRequest &eventsRequest, + LinearStream &commandStream, + uint32_t commandType); void submitCacheFlush(Surface **surfaces, size_t numSurfaces, LinearStream *commandStream, diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index d5af6747bb..8b9a0df4a9 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -190,6 +190,7 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, TimestampPacketContainer previousTimestampPacketNodes; EventsRequest eventsRequest(numEventsInWaitList, eventWaitList, event); CsrDependencies csrDeps; + BlitProperties blitProperties; if (getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { csrDeps.fillFromEventsRequestAndMakeResident(eventsRequest, getGpgpuCommandStreamReceiver(), CsrDependencies::DependenciesType::OnCsr); @@ -218,7 +219,7 @@ void 
CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, bool flushDependenciesForNonKernelCommand = false; if (blitEnqueue) { - processDispatchForBlitEnqueue(multiDispatchInfo, previousTimestampPacketNodes, eventsRequest, commandStream, commandType); + blitProperties = processDispatchForBlitEnqueue(multiDispatchInfo, previousTimestampPacketNodes, eventsRequest, commandStream, commandType); } else if (multiDispatchInfo.empty() == false) { processDispatchForKernels(multiDispatchInfo, printfHandler, eventBuilder.getEvent(), hwTimeStamps, blockQueue, devQueueHw, csrDeps, blockedCommandsData.get(), @@ -245,14 +246,15 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, } CompletionStamp completionStamp = {Event::eventNotReady, taskLevel, 0}; + + EnqueueProperties enqueueProperties(blitEnqueue, !multiDispatchInfo.empty(), isCacheFlushCommand(commandType), + flushDependenciesForNonKernelCommand, &blitProperties); if (!blockQueue) { if (parentKernel) { processDeviceEnqueue(devQueueHw, multiDispatchInfo, hwTimeStamps, blocking); } - auto kernelSubmissionRequired = !isCommandWithoutKernel(commandType) && !blitEnqueue; - - if (kernelSubmissionRequired) { + if (enqueueProperties.operation == EnqueueProperties::Operation::GpuKernel) { completionStamp = enqueueNonBlocked( surfacesForResidency, numSurfaceForResidency, @@ -284,19 +286,20 @@ void CommandQueueHw::enqueueHandler(Surface **surfacesForResidency, devQueueHw->getDebugQueue()); } } - } else if (isCacheFlushCommand(commandType) || blitEnqueue || flushDependenciesForNonKernelCommand) { + } else if (enqueueProperties.isFlushWithoutKernelRequired()) { completionStamp = enqueueCommandWithoutKernel( surfacesForResidency, numSurfaceForResidency, commandStream, commandStreamStart, blocking, - blitEnqueue, + enqueueProperties, &previousTimestampPacketNodes, eventsRequest, eventBuilder, taskLevel); } else { + UNRECOVERABLE_IF(enqueueProperties.operation != 
EnqueueProperties::Operation::EnqueueWithoutSubmission); auto maxTaskCount = this->taskCount; for (auto eventId = 0u; eventId < numEventsInWaitList; eventId++) { auto event = castToObject(eventWaitList[eventId]); @@ -432,11 +435,11 @@ void CommandQueueHw::processDispatchForKernels(const MultiDispatchInf } template -void CommandQueueHw::processDispatchForBlitEnqueue(const MultiDispatchInfo &multiDispatchInfo, - TimestampPacketContainer &previousTimestampPacketNodes, - const EventsRequest &eventsRequest, - LinearStream &commandStream, - uint32_t commandType) { +BlitProperties CommandQueueHw::processDispatchForBlitEnqueue(const MultiDispatchInfo &multiDispatchInfo, + TimestampPacketContainer &previousTimestampPacketNodes, + const EventsRequest &eventsRequest, + LinearStream &commandStream, + uint32_t commandType) { auto blitDirection = BlitProperties::obtainBlitDirection(commandType); auto blitCommandStreamReceiver = getBcsCommandStreamReceiver(); @@ -450,12 +453,10 @@ void CommandQueueHw::processDispatchForBlitEnqueue(const MultiDispatc blitProperties.csrDependencies.push_back(&previousTimestampPacketNodes); blitProperties.outputTimestampPacket = timestampPacketContainer.get(); - previousTimestampPacketNodes.makeResident(*blitCommandStreamReceiver); - timestampPacketContainer->makeResident(*blitCommandStreamReceiver); - blitCommandStreamReceiver->blitBuffer(blitProperties); - auto currentTimestampPacketNode = timestampPacketContainer->peekNodes().at(0); TimestampPacketHelper::programSemaphoreWithImplicitDependency(commandStream, *currentTimestampPacketNode); + + return blitProperties; } template @@ -809,7 +810,7 @@ CompletionStamp CommandQueueHw::enqueueCommandWithoutKernel( LinearStream &commandStream, size_t commandStreamStart, bool &blocking, - bool blitEnqueue, + const EnqueueProperties &enqueueProperties, TimestampPacketContainer *previousTimestampPacketNodes, EventsRequest &eventsRequest, EventBuilder &eventBuilder, @@ -826,11 +827,20 @@ CompletionStamp 
struct BlitProperties;

// Describes which single kind of work an enqueue call resolves to.
//
// The constructor collapses a set of boolean flags into exactly one Operation.
// When several flags are set at once, the first match in this order wins:
//   blitEnqueue > hasKernels > isCacheFlushCmd > flushDependenciesOnly,
// and if none is set the result is Operation::EnqueueWithoutSubmission.
struct EnqueueProperties {
    // Enumerator order is kept exactly as declared originally so the underlying
    // values do not change.
    enum class Operation {
        Blit,                     // selected when blitEnqueue is true
        ExplicitCacheFlush,       // selected when isCacheFlushCmd is true
        EnqueueWithoutSubmission, // selected when no flag is set
        DependencyResolveOnGpu,   // selected when flushDependenciesOnly is true
        GpuKernel,                // selected when hasKernels is true
    };

    EnqueueProperties() = delete;

    // blitEnqueue           - highest-priority flag; selects Operation::Blit.
    // hasKernels            - selects Operation::GpuKernel.
    // isCacheFlushCmd       - selects Operation::ExplicitCacheFlush.
    // flushDependenciesOnly - selects Operation::DependencyResolveOnGpu.
    // blitProperties        - non-owning pointer; stored only for Operation::Blit,
    //                         otherwise the member stays nullptr.
    EnqueueProperties(bool blitEnqueue, bool hasKernels, bool isCacheFlushCmd, bool flushDependenciesOnly,
                      const BlitProperties *blitProperties) {
        if (blitEnqueue) {
            this->operation = Operation::Blit;
            this->blitProperties = blitProperties;
        } else if (hasKernels) {
            this->operation = Operation::GpuKernel;
        } else if (isCacheFlushCmd) {
            this->operation = Operation::ExplicitCacheFlush;
        } else if (flushDependenciesOnly) {
            this->operation = Operation::DependencyResolveOnGpu;
        } else {
            this->operation = Operation::EnqueueWithoutSubmission;
        }
    }

    // True for Blit, ExplicitCacheFlush and DependencyResolveOnGpu;
    // false for GpuKernel and EnqueueWithoutSubmission.
    bool isFlushWithoutKernelRequired() const {
        switch (operation) {
        case Operation::Blit:
        case Operation::ExplicitCacheFlush:
        case Operation::DependencyResolveOnGpu:
            return true;
        default:
            return false;
        }
    }

    const BlitProperties *blitProperties = nullptr;
    Operation operation = Operation::EnqueueWithoutSubmission;
};
mockCmdQ->enqueueCommandWithoutKernel(surfaces, 1, mockCmdQ->getCS(0), 0, blocking, enqueueProperties, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); EXPECT_EQ(allocation->getTaskCount(mockCmdQ->getGpgpuCommandStreamReceiver().getOsContext().getContextId()), 1u); } @@ -57,6 +60,7 @@ struct DispatchFlagsTests : public ::testing::Test { std::unique_ptr device; std::unique_ptr context; + DebugManagerStateRestore restore; }; HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectDispatchFlags) { @@ -70,7 +74,9 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectDispa TimestampPacketContainer previousTimestampPacketNodes; EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocking, false, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); + + EnqueueProperties enqueueProperties(false, false, false, true, nullptr); + mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocking, enqueueProperties, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); EXPECT_EQ(blocking, mockCsr->passedDispatchFlags.blocking); EXPECT_FALSE(mockCsr->passedDispatchFlags.implicitFlush); @@ -81,16 +87,32 @@ HWTEST_F(DispatchFlagsTests, whenEnqueueCommandWithoutKernelThenPassCorrectDispa HWTEST_F(DispatchFlagsTests, givenBlitEnqueueWhenDispatchingCommandsWithoutKernelThenDoImplicitFlush) { using CsrType = MockCsrHw2; + DebugManager.flags.EnableTimestampPacket.set(1); SetUpImpl(); auto mockCmdQ = std::make_unique>(context.get(), device.get(), nullptr); auto mockCsr = static_cast(&mockCmdQ->getGpgpuCommandStreamReceiver()); + mockCsr->skipBlitCalls = true; + mockCmdQ->bcsEngine = mockCmdQ->gpgpuEngine; + cl_int retVal = CL_SUCCESS; + auto buffer = std::unique_ptr(Buffer::create(context.get(), 0, 1, nullptr, retVal)); auto blocking = true; TimestampPacketContainer previousTimestampPacketNodes; 
EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; - mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocking, true, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); + BuiltinOpParams builtinOpParams; + builtinOpParams.srcMemObj = buffer.get(); + builtinOpParams.dstPtr = reinterpret_cast(0x1234); + MultiDispatchInfo multiDispatchInfo; + multiDispatchInfo.setBuiltinOpParams(builtinOpParams); + + mockCmdQ->obtainNewTimestampPacketNodes(1, previousTimestampPacketNodes, true); + BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(multiDispatchInfo, previousTimestampPacketNodes, eventsRequest, + mockCmdQ->getCS(0), 0); + + EnqueueProperties enqueueProperties(true, false, false, false, &blitProperties); + mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocking, enqueueProperties, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); EXPECT_TRUE(mockCsr->passedDispatchFlags.implicitFlush); EXPECT_TRUE(mockCsr->passedDispatchFlags.guardCommandBufferWithPipeControl); @@ -98,23 +120,40 @@ HWTEST_F(DispatchFlagsTests, givenBlitEnqueueWhenDispatchingCommandsWithoutKerne HWTEST_F(DispatchFlagsTests, givenN1EnabledWhenDispatchingWithoutKernelTheAllowOutOfOrderExecution) { using CsrType = MockCsrHw2; + DebugManager.flags.EnableTimestampPacket.set(1); + SetUpImpl(); auto mockCmdQ = std::make_unique>(context.get(), device.get(), nullptr); auto mockCsr = static_cast(&mockCmdQ->getGpgpuCommandStreamReceiver()); + mockCsr->skipBlitCalls = true; + mockCmdQ->bcsEngine = mockCmdQ->gpgpuEngine; + cl_int retVal = CL_SUCCESS; + auto buffer = std::unique_ptr(Buffer::create(context.get(), 0, 1, nullptr, retVal)); TimestampPacketContainer previousTimestampPacketNodes; EventsRequest eventsRequest(0, nullptr, nullptr); EventBuilder eventBuilder; bool blocked = false; + BuiltinOpParams builtinOpParams; + builtinOpParams.srcMemObj = buffer.get(); + builtinOpParams.dstPtr = 
reinterpret_cast(0x1234); + MultiDispatchInfo multiDispatchInfo; + multiDispatchInfo.setBuiltinOpParams(builtinOpParams); + + mockCmdQ->obtainNewTimestampPacketNodes(1, previousTimestampPacketNodes, true); + BlitProperties blitProperties = mockCmdQ->processDispatchForBlitEnqueue(multiDispatchInfo, previousTimestampPacketNodes, eventsRequest, + mockCmdQ->getCS(0), 0); + EnqueueProperties enqueueProperties(true, false, false, false, &blitProperties); + enqueueProperties.blitProperties = &blitProperties; mockCsr->nTo1SubmissionModelEnabled = false; - mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocked, true, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); + mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocked, enqueueProperties, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); EXPECT_FALSE(mockCsr->passedDispatchFlags.outOfOrderExecutionAllowed); mockCsr->nTo1SubmissionModelEnabled = true; - mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocked, true, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); + mockCmdQ->enqueueCommandWithoutKernel(nullptr, 0, mockCmdQ->getCS(0), 0, blocked, enqueueProperties, &previousTimestampPacketNodes, eventsRequest, eventBuilder, 0); EXPECT_TRUE(mockCsr->passedDispatchFlags.outOfOrderExecutionAllowed); } diff --git a/unit_tests/mocks/mock_csr.h b/unit_tests/mocks/mock_csr.h index ddb544a191..6acf1d165d 100644 --- a/unit_tests/mocks/mock_csr.h +++ b/unit_tests/mocks/mock_csr.h @@ -213,6 +213,13 @@ class MockCsrHw2 : public CommandStreamReceiverHw { return completionStamp; } + void blitBuffer(const BlitProperties &blitProperites) override { + if (!skipBlitCalls) { + CommandStreamReceiverHw::blitBuffer(blitProperites); + } + } + + bool skipBlitCalls = false; bool storeFlushedTaskStream = false; std::unique_ptr storedTaskStream; size_t storedTaskStreamSize = 0;