diff --git a/Jenkinsfile b/Jenkinsfile index d3277381f0..277c72a75c 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -1,5 +1,5 @@ #!groovy dependenciesRevision='e3bce757f3edc77263cc431a1dceb1b2cd0701dc-1335' strategy='EQUAL' -allowedCD=261 +allowedCD=260 allowedF=7 diff --git a/runtime/command_queue/enqueue_common.h b/runtime/command_queue/enqueue_common.h index 03b83d53a0..0f91a54226 100644 --- a/runtime/command_queue/enqueue_common.h +++ b/runtime/command_queue/enqueue_common.h @@ -489,7 +489,6 @@ void CommandQueueHw::processDispatchForBlitAuxTranslation(const Multi buffer->getGraphicsAllocation()); auto auxToNonAuxNode = nodesAllocator->getTag(); timestampPacketDependencies.auxToNonAuxNodes.add(auxToNonAuxNode); - blitPropertiesContainer[bufferIndex].outputTimestampPacket = auxToNonAuxNode; } { @@ -498,22 +497,14 @@ void CommandQueueHw::processDispatchForBlitAuxTranslation(const Multi buffer->getGraphicsAllocation()); auto nonAuxToAuxNode = nodesAllocator->getTag(); timestampPacketDependencies.nonAuxToAuxNodes.add(nonAuxToAuxNode); - blitPropertiesContainer[bufferIndex + numBuffers].outputTimestampPacket = nonAuxToAuxNode; } bufferIndex++; } if (!queueBlocked) { - getGpgpuCommandStreamReceiver().requestStallingPipeControlOnNextFlush(); - timestampPacketDependencies.barrierNodes.add(nodesAllocator->getTag()); - - // wait for barrier and events before AuxToNonAux - blitPropertiesContainer[0].csrDependencies.push_back(&timestampPacketDependencies.barrierNodes); - blitPropertiesContainer[0].csrDependencies.fillFromEventsRequest(eventsRequest, *getBcsCommandStreamReceiver(), - CsrDependencies::DependenciesType::All); - - // wait for NDR before NonAuxToAux - blitPropertiesContainer[numBuffers].csrDependencies.push_back(this->timestampPacketContainer.get()); + BlitProperties::setupDependenciesForAuxTranslation(blitPropertiesContainer, timestampPacketDependencies, + *this->timestampPacketContainer, eventsRequest, + getGpgpuCommandStreamReceiver(), 
*getBcsCommandStreamReceiver()); } } @@ -832,7 +823,7 @@ void CommandQueueHw::enqueueBlocked( bool storeTimestampPackets = false; if (blockedCommandsData) { - if (enqueueProperties.operation == EnqueueProperties::Operation::Blit) { + if (enqueueProperties.blitPropertiesContainer) { blockedCommandsData->blitPropertiesContainer = *enqueueProperties.blitPropertiesContainer; blockedCommandsData->blitEnqueue = true; } diff --git a/runtime/helpers/blit_commands_helper.cpp b/runtime/helpers/blit_commands_helper.cpp index 96434e27eb..5f0ee2ce2c 100644 --- a/runtime/helpers/blit_commands_helper.cpp +++ b/runtime/helpers/blit_commands_helper.cpp @@ -109,4 +109,27 @@ BlitterConstants::BlitDirection BlitProperties::obtainBlitDirection(uint32_t com : BlitterConstants::BlitDirection::BufferToHostPtr; } +void BlitProperties::setupDependenciesForAuxTranslation(BlitPropertiesContainer &blitPropertiesContainer, TimestampPacketDependencies &timestampPacketDependencies, + TimestampPacketContainer &kernelTimestamps, const EventsRequest &eventsRequest, + CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr) { + auto numObjects = blitPropertiesContainer.size() / 2; + + for (size_t i = 0; i < numObjects; i++) { + blitPropertiesContainer[i].outputTimestampPacket = timestampPacketDependencies.auxToNonAuxNodes.peekNodes()[i]; + blitPropertiesContainer[i + numObjects].outputTimestampPacket = timestampPacketDependencies.nonAuxToAuxNodes.peekNodes()[i]; + } + + gpguCsr.requestStallingPipeControlOnNextFlush(); + auto nodesAllocator = gpguCsr.getTimestampPacketAllocator(); + timestampPacketDependencies.barrierNodes.add(nodesAllocator->getTag()); + + // wait for barrier and events before AuxToNonAux + blitPropertiesContainer[0].csrDependencies.push_back(&timestampPacketDependencies.barrierNodes); + blitPropertiesContainer[0].csrDependencies.fillFromEventsRequest(eventsRequest, bcsCsr, + CsrDependencies::DependenciesType::All); + + // wait for NDR before NonAuxToAux + 
blitPropertiesContainer[numObjects].csrDependencies.push_back(&kernelTimestamps); +} + } // namespace NEO diff --git a/runtime/helpers/blit_commands_helper.h b/runtime/helpers/blit_commands_helper.h index fdf4d40c11..bc8426f327 100644 --- a/runtime/helpers/blit_commands_helper.h +++ b/runtime/helpers/blit_commands_helper.h @@ -24,6 +24,10 @@ struct TimestampPacketStorage; template struct TagNode; +struct BlitProperties; +struct TimestampPacketDependencies; +using BlitPropertiesContainer = StackVec; + struct BlitProperties { static BlitProperties constructPropertiesForReadWriteBuffer(BlitterConstants::BlitDirection blitDirection, CommandStreamReceiver &commandStreamReceiver, @@ -42,6 +46,10 @@ struct BlitProperties { static BlitProperties constructPropertiesForAuxTranslation(AuxTranslationDirection auxTranslationDirection, GraphicsAllocation *allocation); + static void setupDependenciesForAuxTranslation(BlitPropertiesContainer &blitPropertiesContainer, TimestampPacketDependencies &timestampPacketDependencies, + TimestampPacketContainer &kernelTimestamps, const EventsRequest &eventsRequest, + CommandStreamReceiver &gpguCsr, CommandStreamReceiver &bcsCsr); + static BlitterConstants::BlitDirection obtainBlitDirection(uint32_t commandType); TagNode *outputTimestampPacket = nullptr; @@ -56,8 +64,6 @@ struct BlitProperties { uint64_t copySize = 0; }; -using BlitPropertiesContainer = StackVec; - template struct BlitCommandsHelper { static size_t estimateBlitCommandsSize(uint64_t copySize, const CsrDependencies &csrDependencies, bool updateTimestampPacket); diff --git a/runtime/helpers/task_information.cpp b/runtime/helpers/task_information.cpp index 2fc519ca22..7a9fd0343f 100644 --- a/runtime/helpers/task_information.cpp +++ b/runtime/helpers/task_information.cpp @@ -188,6 +188,16 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate this->kernel->getProgram()->getBlockKernelManager()->makeInternalAllocationsResident(commandStreamReceiver); } + if 
(kernelOperation->blitPropertiesContainer.size() > 0) { + auto &bcsCsr = *commandQueue.getBcsCommandStreamReceiver(); + BlitProperties::setupDependenciesForAuxTranslation(kernelOperation->blitPropertiesContainer, *timestampPacketDependencies, + *currentTimestampPacketNodes, eventsRequest, + commandQueue.getGpgpuCommandStreamReceiver(), bcsCsr); + + auto bcsTaskCount = bcsCsr.blitBuffer(kernelOperation->blitPropertiesContainer, false); + commandQueue.updateBcsTaskCount(bcsTaskCount); + } + DispatchFlags dispatchFlags( {}, //csrDependencies nullptr, //barrierTimestampPacketNodes @@ -211,8 +221,9 @@ CompletionStamp &CommandComputeKernel::submit(uint32_t taskLevel, bool terminate false //epilogueRequired ); - if (commandStreamReceiver.peekTimestampPacketWriteEnabled()) { + if (timestampPacketDependencies) { dispatchFlags.csrDependencies.fillFromEventsRequest(eventsRequest, commandStreamReceiver, CsrDependencies::DependenciesType::OutOfCsr); + dispatchFlags.barrierTimestampPacketNodes = &timestampPacketDependencies->barrierNodes; } dispatchFlags.pipelineSelectArgs.specialPipelineSelectMode = kernel->requiresSpecialPipelineSelectMode(); if (anyUncacheableArgs) { diff --git a/unit_tests/command_queue/blit_enqueue_tests.cpp b/unit_tests/command_queue/blit_enqueue_tests.cpp index f1fde7df65..3fa48d64a8 100644 --- a/unit_tests/command_queue/blit_enqueue_tests.cpp +++ b/unit_tests/command_queue/blit_enqueue_tests.cpp @@ -7,6 +7,7 @@ #include "core/unit_tests/helpers/debug_manager_state_restore.h" #include "core/unit_tests/utilities/base_object_utils.h" +#include "runtime/event/user_event.h" #include "test.h" #include "unit_tests/helpers/hw_parse.h" #include "unit_tests/mocks/mock_command_queue.h" @@ -239,6 +240,80 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstruct } } +HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenConstructingBlockedCommandBufferThenEnsureCorrectOrder) { + using PIPE_CONTROL = typename 
FamilyType::PIPE_CONTROL; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + + auto buffer0 = createBuffer(1, true); + auto buffer1 = createBuffer(1, false); + auto buffer2 = createBuffer(1, true); + setMockKernelArgs(std::array{{buffer0.get(), buffer1.get(), buffer2.get()}}); + + auto mockCmdQ = static_cast *>(commandQueue.get()); + auto initialBcsTaskCount = mockCmdQ->bcsTaskCount; + + UserEvent userEvent; + cl_event waitlist[] = {&userEvent}; + + mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, lws, 1, waitlist, nullptr); + userEvent.setStatus(CL_COMPLETE); + + EXPECT_EQ(mockCmdQ->bcsTaskCount, initialBcsTaskCount + 1); + + // Gpgpu command buffer + { + auto cmdListCsr = getCmdList(gpgpuCsr->getCS(0)); + auto ultCsr = static_cast *>(gpgpuCsr); + auto cmdListQueue = getCmdList(*ultCsr->lastFlushedCommandStream); + + // Barrier + expectPipeControl(cmdListCsr.begin(), cmdListCsr.end()); + + // Aux to NonAux + auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + // Walker + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + // NonAux to Aux + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + + // task count + expectPipeControl(++cmdFound, cmdListQueue.end()); + } + + // BCS command buffer + { + auto cmdList = getCmdList(bcsCsr->getCS(0)); + + // Barrier + auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); + + // Aux to NonAux + cmdFound = expectCommand(++cmdFound, cmdList.end()); + cmdFound = expectCommand(++cmdFound, cmdList.end()); + cmdFound = expectCommand(++cmdFound, cmdList.end()); + cmdFound = expectCommand(++cmdFound, 
cmdList.end()); + + // wait for NDR + cmdFound = expectCommand(++cmdFound, cmdList.end()); + + // NonAux to Aux + cmdFound = expectCommand(++cmdFound, cmdList.end()); + cmdFound = expectCommand(++cmdFound, cmdList.end()); + cmdFound = expectCommand(++cmdFound, cmdList.end()); + cmdFound = expectCommand(++cmdFound, cmdList.end()); + + // taskCount + expectCommand(++cmdFound, cmdList.end()); + } + EXPECT_FALSE(mockCmdQ->isQueueBlocked()); +} + HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingCommandBufferThenSynchronizeBarrier) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; @@ -439,3 +514,160 @@ HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitAuxTranslationWhenDispatchi EXPECT_EQ(0u, lastDispatchInfo->dispatchInitCommands.estimateCommandsSize(&memObjects)); EXPECT_EQ(dependencySize, lastDispatchInfo->dispatchEpilogueCommands.estimateCommandsSize(&memObjects)); } + +HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeBarrier) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto buffer = createBuffer(1, true); + setMockKernelArgs(std::array{{buffer.get()}}); + + UserEvent userEvent; + cl_event waitlist[] = {&userEvent}; + + commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr); + userEvent.setStatus(CL_COMPLETE); + + auto cmdListCsr = getCmdList(gpgpuCsr->getCS(0)); + auto pipeControl = expectPipeControl(cmdListCsr.begin(), cmdListCsr.end()); + auto pipeControlCmd = genCmdCast(*pipeControl); + + uint64_t low = pipeControlCmd->getAddress(); + uint64_t high = pipeControlCmd->getAddressHigh(); + uint64_t barrierGpuAddress = (high << 32) | low; + + auto cmdList = getCmdList(bcsCsr->getCS(0)); + auto semaphore = expectCommand(cmdList.begin(), cmdList.end()); + 
verifySemaphore(semaphore, barrierGpuAddress); + + EXPECT_FALSE(commandQueue->isQueueBlocked()); +} + +HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeEvents) { + using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto buffer = createBuffer(1, true); + setMockKernelArgs(std::array{{buffer.get()}}); + + auto event = make_releaseable(commandQueue.get(), CL_COMMAND_READ_BUFFER, 0, 0); + MockTimestampPacketContainer eventDependencyContainer(*bcsCsr->getTimestampPacketAllocator(), 1); + auto eventDependency = eventDependencyContainer.getNode(0); + event->addTimestampPacketNodes(eventDependencyContainer); + + UserEvent userEvent; + cl_event waitlist[] = {&userEvent, event.get()}; + commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 2, waitlist, nullptr); + userEvent.setStatus(CL_COMPLETE); + + auto eventDependencyAddress = eventDependency->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd); + + auto cmdList = getCmdList(bcsCsr->getCS(0)); + + // Barrier + auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); + + // Event + auto semaphore = expectCommand(++cmdFound, cmdList.end()); + verifySemaphore(semaphore, eventDependencyAddress); + + cmdFound = expectCommand(++semaphore, cmdList.end()); + expectCommand(++cmdFound, cmdList.end()); + + EXPECT_FALSE(commandQueue->isQueueBlocked()); +} + +HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeKernel) { + using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + + auto buffer = createBuffer(1, true); + setMockKernelArgs(std::array{{buffer.get()}}); + + auto mockCmdQ = static_cast *>(commandQueue.get()); + UserEvent userEvent; + cl_event waitlist[] = {&userEvent}; + + 
mockCmdQ->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr); + userEvent.setStatus(CL_COMPLETE); + + auto kernelNode = mockCmdQ->timestampPacketContainer->peekNodes()[0]; + auto kernelNodeAddress = kernelNode->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd); + + auto cmdList = getCmdList(bcsCsr->getCS(0)); + + // Aux to nonAux + auto cmdFound = expectCommand(cmdList.begin(), cmdList.end()); + + // semaphore before NonAux to Aux + auto semaphore = expectCommand(++cmdFound, cmdList.end()); + verifySemaphore(semaphore, kernelNodeAddress); + + EXPECT_FALSE(commandQueue->isQueueBlocked()); +} + +HWTEST_TEMPLATED_F(BlitAuxTranslationTests, givenBlitTranslationWhenConstructingBlockedCommandBufferThenSynchronizeBcsOutput) { + using XY_COPY_BLT = typename FamilyType::XY_COPY_BLT; + using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; + using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; + using WALKER_TYPE = typename FamilyType::WALKER_TYPE; + + auto buffer0 = createBuffer(1, true); + auto buffer1 = createBuffer(1, true); + setMockKernelArgs(std::array{{buffer0.get(), buffer1.get()}}); + + UserEvent userEvent; + cl_event waitlist[] = {&userEvent}; + commandQueue->enqueueKernel(mockKernel->mockKernel, 1, nullptr, gws, nullptr, 1, waitlist, nullptr); + userEvent.setStatus(CL_COMPLETE); + + uint64_t auxToNonAuxOutputAddress[2] = {}; + uint64_t nonAuxToAuxOutputAddress[2] = {}; + { + auto cmdListBcs = getCmdList(bcsCsr->getCS(0)); + + auto cmdFound = expectCommand(cmdListBcs.begin(), cmdListBcs.end()); + + cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); + auto miflushDwCmd = genCmdCast(*cmdFound); + auxToNonAuxOutputAddress[0] = miflushDwCmd->getDestinationAddress(); + + cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); + miflushDwCmd = genCmdCast(*cmdFound); + auxToNonAuxOutputAddress[1] = miflushDwCmd->getDestinationAddress(); + + cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); 
+ + cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); + miflushDwCmd = genCmdCast(*cmdFound); + nonAuxToAuxOutputAddress[0] = miflushDwCmd->getDestinationAddress(); + + cmdFound = expectCommand(++cmdFound, cmdListBcs.end()); + miflushDwCmd = genCmdCast(*cmdFound); + nonAuxToAuxOutputAddress[1] = miflushDwCmd->getDestinationAddress(); + } + + { + auto ultCsr = static_cast *>(gpgpuCsr); + auto cmdListQueue = getCmdList(*ultCsr->lastFlushedCommandStream); + + // Aux to NonAux + auto cmdFound = expectCommand(cmdListQueue.begin(), cmdListQueue.end()); + verifySemaphore(cmdFound, auxToNonAuxOutputAddress[0]); + + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + verifySemaphore(cmdFound, auxToNonAuxOutputAddress[1]); + + // Walker + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + + // NonAux to Aux + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + verifySemaphore(cmdFound, nonAuxToAuxOutputAddress[0]); + + cmdFound = expectCommand(++cmdFound, cmdListQueue.end()); + verifySemaphore(cmdFound, nonAuxToAuxOutputAddress[1]); + } + + EXPECT_FALSE(commandQueue->isQueueBlocked()); +}