From 7e9461ffa0f727f5600cc9caa5d16ab709d4ffb0 Mon Sep 17 00:00:00 2001 From: Vinod Tipparaju Date: Thu, 28 Jan 2021 13:18:18 +0530 Subject: [PATCH] Fix to optimize PC dispatched during appendMemoryCopy() call. Eliminate the redundant PC when the event signal scope is either device or host. The optimization is applicable to timestamp events as well. Related-To: LOCI-1995 Signed-off-by: Vinod Tipparaju --- level_zero/core/source/cmdlist/cmdlist_hw.inl | 5 + .../sources/cmdlist/test_cmdlist_2.cpp | 109 ++++++++++++++++++ .../test_cmdlist_append_launch_kernel.cpp | 14 +++ 3 files changed, 128 insertions(+) diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 0b3dbaddc1..135ce320b8 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -928,6 +928,11 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, appendEventForProfilingAllWalkers(hSignalEvent, false); + auto event = Event::fromHandle(hSignalEvent); + if (event) { + dstAllocationStruct.needsFlush &= !event->signalScope; + } + if (dstAllocationStruct.needsFlush && !isCopyOnly()) { NEO::PipeControlArgs args(true); NEO::MemorySynchronizationCommands::addPipeControl(*commandContainer.getCommandStream(), args); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index 757972626a..40bcbac22e 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -502,6 +502,103 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventsThenS EXPECT_NE(cmdList.end(), itor); } +using platformSupport = IsWithinProducts; + +HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventScopeSetToDeviceThenSinglePipeControlIsAddedWithDcFlush, platformSupport) { + using PIPE_CONTROL = 
typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + + ze_result_t result = ZE_RESULT_SUCCESS; + std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, result)); + auto &commandContainer = commandList->commandContainer; + + void *srcPtr = reinterpret_cast(0x1234); + void *dstPtr = reinterpret_cast(0x2345); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), 0, nullptr, &eventPoolDesc)); + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE; + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + result = commandList->appendMemoryCopy(dstPtr, srcPtr, 0x1001, event.get(), 0u, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed())); + + auto iterator = findAll(cmdList.begin(), cmdList.end()); + bool postSyncFound = false; + ASSERT_NE(0u, iterator.size()); + uint32_t numPCs = 0; + for (auto it : iterator) { + auto cmd = genCmdCast(*it); + numPCs++; + if ((cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) && + (cmd->getImmediateData() == Event::STATE_SIGNALED) && + (cmd->getDcFlushEnable())) { + postSyncFound = true; + break; + } + } + + ASSERT_TRUE(postSyncFound); + EXPECT_EQ(numPCs, iterator.size()); +} + +HWTEST2_F(CommandListCreate, givenCommandListWhenMemoryCopyWithSignalEventScopeSetToSubDeviceThenB2BPipeControlIsAddedWithDcFlushForLastPC, platformSupport) { + using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; + using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION; + + ze_result_t result = ZE_RESULT_SUCCESS; + 
std::unique_ptr commandList(CommandList::create(productFamily, device, NEO::EngineGroupType::RenderCompute, result)); + auto &commandContainer = commandList->commandContainer; + + void *srcPtr = reinterpret_cast(0x1234); + void *dstPtr = reinterpret_cast(0x2345); + + ze_event_pool_desc_t eventPoolDesc = {}; + eventPoolDesc.count = 1; + auto eventPool = std::unique_ptr(L0::EventPool::create(driverHandle.get(), 0, nullptr, &eventPoolDesc)); + + ze_event_desc_t eventDesc = {}; + eventDesc.index = 0; + auto event = std::unique_ptr(L0::Event::create(eventPool.get(), &eventDesc, device)); + + result = commandList->appendMemoryCopy(dstPtr, srcPtr, 0x1001, event.get(), 0u, nullptr); + EXPECT_EQ(ZE_RESULT_SUCCESS, result); + + GenCmdList cmdList; + ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer( + cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed())); + + auto iterator = findAll(cmdList.begin(), cmdList.end()); + bool postSyncFound = false; + ASSERT_NE(0u, iterator.size()); + uint32_t numPCs = 0; + for (auto it : iterator) { + auto cmd = genCmdCast(*it); + numPCs++; + if ((cmd->getPostSyncOperation() == POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA) && + (cmd->getImmediateData() == Event::STATE_SIGNALED) && + (!cmd->getDcFlushEnable())) { + postSyncFound = true; + break; + } + } + + ASSERT_TRUE(postSyncFound); + EXPECT_EQ(numPCs, iterator.size() - 1); + + auto it = *(iterator.end() - 1); + auto cmd1 = genCmdCast(*it); + EXPECT_TRUE(cmd1->getDcFlushEnable()); +} + using ImageSupport = IsWithinProducts; HWTEST2_F(CommandListCreate, givenCopyCommandListWhenCopyFromMemoryToImageThenBlitImageCopyCalled, ImageSupport) { @@ -1102,9 +1199,11 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenTimestampPassedToMemoryCopyThen cmd = genCmdCast(*itor); EXPECT_EQ(cmd->getSourceRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); + itor++; itor = find(itor, cmdList.end()); 
EXPECT_NE(cmdList.end(), itor); + itor++; itor = find(itor, cmdList.end()); EXPECT_NE(cmdList.end(), itor); cmd = genCmdCast(*itor); @@ -1115,6 +1214,16 @@ HWTEST2_F(CommandListCreate, givenCommandListWhenTimestampPassedToMemoryCopyThen EXPECT_NE(cmdList.end(), itor); cmd = genCmdCast(*itor); EXPECT_EQ(cmd->getSourceRegisterAddress(), GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW); + + auto temp = itor; + auto numPCs = findAll(temp, cmdList.end()); + //we should have only one PC with dcFlush added + ASSERT_EQ(1u, numPCs.size()); + + itor = find(itor, cmdList.end()); + EXPECT_NE(cmdList.end(), itor); + auto cmd1 = genCmdCast(*itor); + EXPECT_TRUE(cmd1->getDcFlushEnable()); } } // namespace ult } // namespace L0 diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp index 8951b22668..d7733462a1 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_launch_kernel.cpp @@ -296,6 +296,7 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTimestampEventsWhenAppendingKernel ze_event_desc_t eventDesc = {}; eventDesc.index = 0; + eventDesc.signal = ZE_EVENT_SCOPE_FLAG_DEVICE; auto eventPool = std::unique_ptr(EventPool::create(driverHandle.get(), 0, nullptr, &eventPoolDesc)); auto event = std::unique_ptr(Event::create(eventPool.get(), &eventDesc, device)); @@ -355,6 +356,19 @@ HWTEST2_F(CommandListAppendLaunchKernel, givenTimestampEventsWhenAppendingKernel auto cmd = genCmdCast(*itor); EXPECT_EQ(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW, cmd->getSourceRegisterAddress()); } + itor++; + + auto temp = itor; + auto numPCs = findAll(temp, cmdList.end()); + //we should have only one PC with dcFlush added + ASSERT_EQ(1u, numPCs.size()); + + itor = find(itor, cmdList.end()); + ASSERT_NE(cmdList.end(), itor); + { + auto cmd = genCmdCast(*itor); + 
EXPECT_TRUE(cmd->getDcFlushEnable()); + } { auto itorEvent = std::find(std::begin(commandList->commandContainer.getResidencyContainer()),