From b82cdd6b8e999b070b0f3313aaed9be45e61d0fc Mon Sep 17 00:00:00 2001 From: "Dunajski, Bartosz" Date: Mon, 20 May 2019 12:00:02 +0200 Subject: [PATCH] Program MI_SEMAPHORE_WAIT for dependencies during blit operations Change-Id: I8b0e467886bfb23d026a0c13be514343a22a20a1 Related-To: NEO-3020 Signed-off-by: Dunajski, Bartosz --- .../command_stream_receiver.cpp | 6 +- .../command_stream/command_stream_receiver.h | 5 +- .../command_stream_receiver_hw.h | 2 +- .../command_stream_receiver_hw_base.inl | 6 +- runtime/helpers/blit_commands_helper.h | 2 +- runtime/helpers/blit_commands_helper_base.inl | 5 +- runtime/mem_obj/buffer.cpp | 4 +- .../command_stream_receiver_hw_tests.cpp | 89 ++++++++++++++++--- unit_tests/kernel/kernel_tests.cpp | 2 +- .../libult/ult_command_stream_receiver.h | 4 +- unit_tests/mocks/mock_csr.h | 2 +- 11 files changed, 100 insertions(+), 27 deletions(-) diff --git a/runtime/command_stream/command_stream_receiver.cpp b/runtime/command_stream/command_stream_receiver.cpp index 82e8ef5e3d..8ce87f535c 100644 --- a/runtime/command_stream/command_stream_receiver.cpp +++ b/runtime/command_stream/command_stream_receiver.cpp @@ -408,7 +408,7 @@ cl_int CommandStreamReceiver::expectMemory(const void *gfxAddress, const void *s } void CommandStreamReceiver::blitWithHostPtr(Buffer &buffer, void *hostPtr, uint64_t hostPtrSize, - BlitterConstants::BlitWithHostPtrDirection copyDirection) { + BlitterConstants::BlitWithHostPtrDirection copyDirection, CsrDependencies &csrDependencies) { HostPtrSurface hostPtrSurface(hostPtr, static_cast(hostPtrSize), true); bool success = createAllocationForHostSurface(hostPtrSurface, false); UNRECOVERABLE_IF(!success); @@ -420,9 +420,9 @@ void CommandStreamReceiver::blitWithHostPtr(Buffer &buffer, void *hostPtr, uint6 true, false, true)); if (BlitterConstants::BlitWithHostPtrDirection::FromHostPtr == copyDirection) { - blitBuffer(buffer, *hostPtrBuffer, hostPtrSize); + blitBuffer(buffer, *hostPtrBuffer, hostPtrSize, csrDependencies); 
} else { - blitBuffer(*hostPtrBuffer, buffer, hostPtrSize); + blitBuffer(*hostPtrBuffer, buffer, hostPtrSize, csrDependencies); } } } // namespace NEO diff --git a/runtime/command_stream/command_stream_receiver.h b/runtime/command_stream/command_stream_receiver.h index d4c6c43bf3..e03435e335 100644 --- a/runtime/command_stream/command_stream_receiver.h +++ b/runtime/command_stream/command_stream_receiver.h @@ -174,8 +174,9 @@ class CommandStreamReceiver { this->latestSentTaskCount = latestSentTaskCount; } - void blitWithHostPtr(Buffer &buffer, void *hostPtr, uint64_t hostPtrSize, BlitterConstants::BlitWithHostPtrDirection copyDirection); - virtual void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize) = 0; + void blitWithHostPtr(Buffer &buffer, void *hostPtr, uint64_t hostPtrSize, + BlitterConstants::BlitWithHostPtrDirection copyDirection, CsrDependencies &csrDependencies); + virtual void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize, CsrDependencies &csrDependencies) = 0; protected: void cleanupResources(); diff --git a/runtime/command_stream/command_stream_receiver_hw.h b/runtime/command_stream/command_stream_receiver_hw.h index e68bd5e378..e1aa668f3e 100644 --- a/runtime/command_stream/command_stream_receiver_hw.h +++ b/runtime/command_stream/command_stream_receiver_hw.h @@ -70,7 +70,7 @@ class CommandStreamReceiverHw : public CommandStreamReceiver { return CommandStreamReceiverType::CSR_HW; } - void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize) override; + void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize, CsrDependencies &csrDependencies) override; protected: using CommandStreamReceiver::osContext; diff --git a/runtime/command_stream/command_stream_receiver_hw_base.inl b/runtime/command_stream/command_stream_receiver_hw_base.inl index 6e17bbd18e..835ece9d22 100644 --- a/runtime/command_stream/command_stream_receiver_hw_base.inl +++ 
b/runtime/command_stream/command_stream_receiver_hw_base.inl @@ -725,18 +725,20 @@ bool CommandStreamReceiverHw::detectInitProgrammingFlagsRequired(cons } template -void CommandStreamReceiverHw::blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize) { +void CommandStreamReceiverHw::blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize, CsrDependencies &csrDependencies) { using MI_BATCH_BUFFER_END = typename GfxFamily::MI_BATCH_BUFFER_END; using MI_FLUSH_DW = typename GfxFamily::MI_FLUSH_DW; UNRECOVERABLE_IF(osContext->getEngineType() != aub_stream::EngineType::ENGINE_BCS); auto lock = obtainUniqueOwnership(); - auto &commandStream = getCS(BlitCommandsHelper::estimateBlitCommandsSize(sourceSize)); + auto &commandStream = getCS(BlitCommandsHelper::estimateBlitCommandsSize(sourceSize, csrDependencies)); auto commandStreamStart = commandStream.getUsed(); auto newTaskCount = taskCount + 1; latestSentTaskCount = newTaskCount; + TimestampPacketHelper::programCsrDependencies(commandStream, csrDependencies); + BlitCommandsHelper::dispatchBlitCommandsForBuffer(dstBuffer, srcBuffer, commandStream, sourceSize); auto miFlushDwCmd = reinterpret_cast(commandStream.getSpace(sizeof(MI_FLUSH_DW))); diff --git a/runtime/helpers/blit_commands_helper.h b/runtime/helpers/blit_commands_helper.h index 7a9b8c796d..4461352956 100644 --- a/runtime/helpers/blit_commands_helper.h +++ b/runtime/helpers/blit_commands_helper.h @@ -15,7 +15,7 @@ class LinearStream; template struct BlitCommandsHelper { - static size_t estimateBlitCommandsSize(uint64_t copySize); + static size_t estimateBlitCommandsSize(uint64_t copySize, CsrDependencies &csrDependencies); static void dispatchBlitCommandsForBuffer(Buffer &dstBuffer, Buffer &srcBuffer, LinearStream &linearStream, uint64_t copySize); static void appendBlitCommandsForBuffer(Buffer &dstBuffer, Buffer &srcBuffer, typename GfxFamily::XY_COPY_BLT &blitCmd); }; diff --git a/runtime/helpers/blit_commands_helper_base.inl 
b/runtime/helpers/blit_commands_helper_base.inl index c87ee5b275..09cb6823b2 100644 --- a/runtime/helpers/blit_commands_helper_base.inl +++ b/runtime/helpers/blit_commands_helper_base.inl @@ -10,7 +10,7 @@ namespace NEO { template -size_t BlitCommandsHelper::estimateBlitCommandsSize(uint64_t copySize) { +size_t BlitCommandsHelper::estimateBlitCommandsSize(uint64_t copySize, CsrDependencies &csrDependencies) { size_t numberOfBlits = 0; uint64_t sizeToBlit = copySize; uint64_t width = 1; @@ -30,7 +30,8 @@ size_t BlitCommandsHelper::estimateBlitCommandsSize(uint64_t copySize numberOfBlits++; } - size_t size = (sizeof(typename GfxFamily::XY_COPY_BLT) * numberOfBlits) + + size_t size = TimestampPacketHelper::getRequiredCmdStreamSize(csrDependencies) + + (sizeof(typename GfxFamily::XY_COPY_BLT) * numberOfBlits) + sizeof(typename GfxFamily::MI_FLUSH_DW) + sizeof(typename GfxFamily::MI_BATCH_BUFFER_END); diff --git a/runtime/mem_obj/buffer.cpp b/runtime/mem_obj/buffer.cpp index e28ec9699f..30686abf74 100644 --- a/runtime/mem_obj/buffer.cpp +++ b/runtime/mem_obj/buffer.cpp @@ -277,7 +277,9 @@ Buffer *Buffer::create(Context *context, if (gpuCopyRequired) { auto blitCommandStreamReceiver = context->getCommandStreamReceiverForBlitOperation(*pBuffer); if (blitCommandStreamReceiver) { - blitCommandStreamReceiver->blitWithHostPtr(*pBuffer, hostPtr, size, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr); + CsrDependencies dependencies; + blitCommandStreamReceiver->blitWithHostPtr(*pBuffer, hostPtr, size, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr, + dependencies); } else { auto cmdQ = context->getSpecialQueue(); if (CL_SUCCESS != cmdQ->enqueueWriteBuffer(pBuffer, CL_TRUE, 0, size, hostPtr, nullptr, 0, nullptr, nullptr)) { diff --git a/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp b/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp index 546f5b2f2c..c9cbf23af3 100644 --- 
a/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp +++ b/unit_tests/command_stream/command_stream_receiver_hw_tests.cpp @@ -41,6 +41,7 @@ #include "unit_tests/mocks/mock_internal_allocation_storage.h" #include "unit_tests/mocks/mock_kernel.h" #include "unit_tests/mocks/mock_submissions_aggregator.h" +#include "unit_tests/mocks/mock_timestamp_container.h" #include "unit_tests/utilities/base_object_utils.h" #include "reg_configs_common.h" @@ -276,6 +277,7 @@ struct BcsTests : public CommandStreamReceiverHwTest { CommandStreamReceiverHwTest::TearDown(); } + CsrDependencies csrDependencies; std::unique_ptr context; }; @@ -291,13 +293,34 @@ HWTEST_F(BcsTests, givenBltSizeWhenEstimatingCommandSizeThenAddAllRequiredComman auto expectedAlignedSize = alignUp(expectedSize + (sizeof(typename FamilyType::XY_COPY_BLT) * alignedNumberOfBlts), MemoryConstants::cacheLineSize); auto expectedNotAlignedSize = alignUp(expectedSize + (sizeof(typename FamilyType::XY_COPY_BLT) * notAlignedNumberOfBlts), MemoryConstants::cacheLineSize); - auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(alignedBltSize); - auto notAlignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(notAlignedBltSize); + auto alignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(alignedBltSize, csrDependencies); + auto notAlignedEstimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(notAlignedBltSize, csrDependencies); EXPECT_EQ(expectedAlignedSize, alignedEstimatedSize); EXPECT_EQ(expectedNotAlignedSize, notAlignedEstimatedSize); } +HWTEST_F(BcsTests, givenBltSizeAndCsrDependenciesWhenEstimatingCommandSizeThenAddAllRequiredCommands) { + uint32_t numberOfBlts = 1; + size_t numberNodesPerContainer = 5; + auto &csr = pDevice->getUltCommandStreamReceiver(); + + MockTimestampPacketContainer timestamp0(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); + MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), 
numberNodesPerContainer); + csrDependencies.push_back(&timestamp0); + csrDependencies.push_back(&timestamp1); + + size_t expectedSize = sizeof(typename FamilyType::MI_FLUSH_DW) + sizeof(typename FamilyType::MI_BATCH_BUFFER_END) + + (sizeof(typename FamilyType::XY_COPY_BLT) * numberOfBlts) + + TimestampPacketHelper::getRequiredCmdStreamSize(csrDependencies); + + auto expectedAlignedSize = alignUp(expectedSize, MemoryConstants::cacheLineSize); + + auto estimatedSize = BlitCommandsHelper::estimateBlitCommandsSize(1, csrDependencies); + + EXPECT_EQ(expectedAlignedSize, estimatedSize); +} + HWTEST_F(BcsTests, givenBltSizeWithLeftoverWhenDispatchedThenProgramAllRequiredCommands) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; constexpr auto max2DBlitSize = BlitterConstants::maxBlitWidth * BlitterConstants::maxBlitHeight; @@ -316,7 +339,7 @@ HWTEST_F(BcsTests, givenBltSizeWithLeftoverWhenDispatchedThenProgramAllRequiredC uint32_t newTaskCount = 19; csr.taskCount = newTaskCount - 1; EXPECT_EQ(0u, csr.recursiveLockCounter.load()); - csr.blitWithHostPtr(*buffer, hostPtr, bltSize, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr); + csr.blitWithHostPtr(*buffer, hostPtr, bltSize, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr, csrDependencies); EXPECT_EQ(newTaskCount, csr.taskCount); EXPECT_EQ(newTaskCount, csr.latestFlushedTaskCount); EXPECT_EQ(newTaskCount, csr.latestSentTaskCount); @@ -363,6 +386,50 @@ HWTEST_F(BcsTests, givenBltSizeWithLeftoverWhenDispatchedThenProgramAllRequiredC } } +HWTEST_F(BcsTests, givenCsrDependenciesWhenProgrammingCommandStreamThenAddSemaphoreAndAtomic) { + auto &csr = pDevice->getUltCommandStreamReceiver(); + + cl_int retVal = CL_SUCCESS; + auto buffer = clUniquePtr(Buffer::create(context.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); + void *hostPtr = reinterpret_cast(0x12340000); + uint32_t numberOfDependencyContainers = 2; + size_t numberNodesPerContainer = 5; + + MockTimestampPacketContainer 
timestamp0(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); + MockTimestampPacketContainer timestamp1(*csr.getTimestampPacketAllocator(), numberNodesPerContainer); + csrDependencies.push_back(&timestamp0); + csrDependencies.push_back(&timestamp1); + + csr.blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr, csrDependencies); + + HardwareParse hwParser; + hwParser.parseCommands(csr.commandStream); + auto &cmdList = hwParser.cmdList; + bool xyCopyBltCmdFound = false; + bool dependenciesFound = false; + + for (auto cmdIterator = cmdList.begin(); cmdIterator != cmdList.end(); cmdIterator++) { + if (genCmdCast(*cmdIterator)) { + xyCopyBltCmdFound = true; + continue; + } + auto miSemaphore = genCmdCast(*cmdIterator); + if (miSemaphore) { + dependenciesFound = true; + EXPECT_FALSE(xyCopyBltCmdFound); + auto miAtomic = genCmdCast(*(++cmdIterator)); + EXPECT_NE(nullptr, miAtomic); + + for (uint32_t i = 1; i < numberOfDependencyContainers * numberNodesPerContainer; i++) { + EXPECT_NE(nullptr, genCmdCast(*(++cmdIterator))); + EXPECT_NE(nullptr, genCmdCast(*(++cmdIterator))); + } + } + } + EXPECT_TRUE(xyCopyBltCmdFound); + EXPECT_TRUE(dependenciesFound); +} + HWTEST_F(BcsTests, givenInputAllocationsWhenBlitDispatchedThenMakeAllAllocationsResident) { auto &csr = pDevice->getUltCommandStreamReceiver(); csr.storeMakeResidentAllocations = true; @@ -373,7 +440,7 @@ HWTEST_F(BcsTests, givenInputAllocationsWhenBlitDispatchedThenMakeAllAllocations EXPECT_EQ(0u, csr.makeSurfacePackNonResidentCalled); - csr.blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr); + csr.blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr, csrDependencies); EXPECT_TRUE(csr.isMadeResident(buffer->getGraphicsAllocation())); EXPECT_TRUE(csr.isMadeResident(csr.commandStream.getGraphicsAllocation())); @@ -397,7 +464,7 @@ HWTEST_F(BcsTests, givenBufferWhenBlitCalledThenFlushCommandBuffer) { 
uint32_t newTaskCount = 17; csr.taskCount = newTaskCount - 1; - csr.blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr); + csr.blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr, csrDependencies); EXPECT_EQ(commandStream.getGraphicsAllocation(), csr.latestFlushedBatchBuffer.commandBufferAllocation); EXPECT_EQ(commandStreamOffset, csr.latestFlushedBatchBuffer.startOffset); @@ -442,7 +509,7 @@ HWTEST_F(BcsTests, whenBlitFromHostPtrCalledThenCallWaitWithKmdFallback) { auto buffer = clUniquePtr(Buffer::create(context.get(), CL_MEM_READ_WRITE, 1, nullptr, retVal)); void *hostPtr = reinterpret_cast(0x12340000); - myMockCsr->blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr); + myMockCsr->blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr, csrDependencies); EXPECT_EQ(1u, myMockCsr->waitForTaskCountWithKmdNotifyFallbackCalled); EXPECT_EQ(myMockCsr->taskCount, myMockCsr->taskCountToWaitPassed); @@ -464,13 +531,13 @@ HWTEST_F(BcsTests, whenBlitFromHostPtrCalledThenCleanTemporaryAllocations) { bcsCsr.taskCount = newTaskCount - 1; EXPECT_EQ(0u, mockInternalAllocationsStorage->cleanAllocationsCalled); - bcsCsr.blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr); + bcsCsr.blitWithHostPtr(*buffer, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr, csrDependencies); EXPECT_EQ(1u, mockInternalAllocationsStorage->cleanAllocationsCalled); EXPECT_EQ(newTaskCount, mockInternalAllocationsStorage->lastCleanAllocationsTaskCount); EXPECT_TRUE(TEMPORARY_ALLOCATION == mockInternalAllocationsStorage->lastCleanAllocationUsage); } -HWTEST_F(BcsTests, givenHostPtrWhenBlitWithHostPtrCalledThenProgramCorrectGpuAddresses) { +HWTEST_F(BcsTests, givenBufferWhenBlitOperationCalledThenProgramCorrectGpuAddresses) { auto &csr = pDevice->getUltCommandStreamReceiver(); cl_int 
retVal = CL_SUCCESS; @@ -481,7 +548,7 @@ HWTEST_F(BcsTests, givenHostPtrWhenBlitWithHostPtrCalledThenProgramCorrectGpuAdd { // from hostPtr HardwareParse hwParser; - csr.blitWithHostPtr(*buffer1, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr); + csr.blitWithHostPtr(*buffer1, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::FromHostPtr, csrDependencies); hwParser.parseCommands(csr.commandStream); @@ -494,7 +561,7 @@ HWTEST_F(BcsTests, givenHostPtrWhenBlitWithHostPtrCalledThenProgramCorrectGpuAdd // to hostPtr HardwareParse hwParser; auto offset = csr.commandStream.getUsed(); - csr.blitWithHostPtr(*buffer1, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::ToHostPtr); + csr.blitWithHostPtr(*buffer1, hostPtr, 1, BlitterConstants::BlitWithHostPtrDirection::ToHostPtr, csrDependencies); hwParser.parseCommands(csr.commandStream, offset); @@ -507,7 +574,7 @@ HWTEST_F(BcsTests, givenHostPtrWhenBlitWithHostPtrCalledThenProgramCorrectGpuAdd // Buffer to Buffer HardwareParse hwParser; auto offset = csr.commandStream.getUsed(); - csr.blitBuffer(*buffer1, *buffer2, 1); + csr.blitBuffer(*buffer1, *buffer2, 1, csrDependencies); hwParser.parseCommands(csr.commandStream, offset); diff --git a/unit_tests/kernel/kernel_tests.cpp b/unit_tests/kernel/kernel_tests.cpp index 7861cde1ab..2c74e74dd3 100644 --- a/unit_tests/kernel/kernel_tests.cpp +++ b/unit_tests/kernel/kernel_tests.cpp @@ -544,7 +544,7 @@ class CommandStreamReceiverMock : public CommandStreamReceiver { void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { } - void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize) override{}; + void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize, CsrDependencies &csrDependencies) override{}; CompletionStamp flushTask( LinearStream &commandStream, diff --git a/unit_tests/libult/ult_command_stream_receiver.h 
b/unit_tests/libult/ult_command_stream_receiver.h index a82191f60d..db9f00e6f9 100644 --- a/unit_tests/libult/ult_command_stream_receiver.h +++ b/unit_tests/libult/ult_command_stream_receiver.h @@ -165,9 +165,9 @@ class UltCommandStreamReceiver : public CommandStreamReceiverHw, publ return CommandStreamReceiverHw::obtainUniqueOwnership(); } - void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize) override { + void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize, CsrDependencies &csrDependencies) override { blitBufferCalled++; - CommandStreamReceiverHw::blitBuffer(dstBuffer, srcBuffer, sourceSize); + CommandStreamReceiverHw::blitBuffer(dstBuffer, srcBuffer, sourceSize, csrDependencies); } std::atomic recursiveLockCounter; diff --git a/unit_tests/mocks/mock_csr.h b/unit_tests/mocks/mock_csr.h index 3743ec8a32..169450aeeb 100644 --- a/unit_tests/mocks/mock_csr.h +++ b/unit_tests/mocks/mock_csr.h @@ -254,7 +254,7 @@ class MockCommandStreamReceiver : public CommandStreamReceiver { void waitForTaskCountWithKmdNotifyFallback(uint32_t taskCountToWait, FlushStamp flushStampToWait, bool quickKmdSleep, bool forcePowerSavingMode) override { } - void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize) override{}; + void blitBuffer(Buffer &dstBuffer, Buffer &srcBuffer, uint64_t sourceSize, CsrDependencies &csrDependencies) override{}; void setOSInterface(OSInterface *osInterface);