From 945deb8f67b5c2844b9ec1f14fe575dc76af2b0d Mon Sep 17 00:00:00 2001
From: Bartosz Dunajski
Date: Tue, 9 Apr 2024 09:48:03 +0000
Subject: [PATCH] feature: memory copy device to host fence for host visible events in l0 path

Related-To: NEO-10417

Signed-off-by: Bartosz Dunajski
---
 level_zero/core/source/cmdlist/cmdlist_hw.h   |   3 +
 level_zero/core/source/cmdlist/cmdlist_hw.inl |  18 +++
 .../cmdlist/test_cmdlist_append_memory.cpp    | 121 ++++++++++++++++++
 3 files changed, 142 insertions(+)

diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h
index b934760924..f8c1f4cb18 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.h
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.h
@@ -370,6 +370,8 @@ struct CommandListCoreFamily : public CommandListImp {
     void disablePatching(size_t inOrderPatchIndex);
     void enablePatching(size_t inOrderPatchIndex);
 
+    bool copyFenceRequired(Event *signalEvent, NEO::GraphicsAllocation *srcAllocation, NEO::GraphicsAllocation *dstAllocation) const;
+
     NEO::InOrderPatchCommandsContainer<GfxFamily> inOrderPatchCmds;
 
     uint64_t latestHostWaitedInOrderSyncValue = 0;
@@ -377,6 +379,7 @@ struct CommandListCoreFamily : public CommandListImp {
     bool duplicatedInOrderCounterStorageEnabled = false;
     bool inOrderAtomicSignalingEnabled = false;
     bool allowCbWaitEventsNoopDispatch = false;
+    bool copyOperationFenceSupported = false;
 };
 
 template <PRODUCT_FAMILY gfxProductFamily>
diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl
index 1f3191eb8d..debed31ec1 100644
--- a/level_zero/core/source/cmdlist/cmdlist_hw.inl
+++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl
@@ -245,6 +245,7 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::initialize(Device *device, NEO
     this->duplicatedInOrderCounterStorageEnabled = gfxCoreHelper.duplicatedInOrderCounterStorageEnabled(rootDeviceEnvironment);
     this->inOrderAtomicSignalingEnabled = gfxCoreHelper.inOrderAtomicSignallingEnabled(rootDeviceEnvironment);
     this->scratchAddressPatchingEnabled = (this->heaplessModeEnabled && !isImmediateType());
+    this->copyOperationFenceSupported = isCopyOnly() && productHelper.isDeviceToHostCopySignalingFenceRequired();
 
     this->commandContainer.doubleSbaWaRef() = this->doubleSbaWa;
     this->commandContainer.l1CachePolicyDataRef() = &this->l1CachePolicyData;
@@ -1616,6 +1617,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
         }
     }
 
+    if (copyFenceRequired(signalEvent, srcAllocationStruct.alloc, dstAllocationStruct.alloc)) {
+        NEO::MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(*commandContainer.getCommandStream(), 0, false, neoDevice->getRootDeviceEnvironment());
+    }
+
     appendEventForProfilingAllWalkers(signalEvent, nullptr, nullptr, false, singlePipeControlPacket, false);
     addFlushRequiredCommand(dstAllocationStruct.needsFlush, signalEvent);
     addToMappedEventList(signalEvent);
@@ -1725,6 +1730,10 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
         return result;
     }
 
+    if (copyFenceRequired(signalEvent, srcAllocationStruct.alloc, dstAllocationStruct.alloc)) {
+        NEO::MemorySynchronizationCommands<GfxFamily>::addAdditionalSynchronization(*commandContainer.getCommandStream(), 0, false, neoDevice->getRootDeviceEnvironment());
+    }
+
     addToMappedEventList(signalEvent);
     addFlushRequiredCommand(dstAllocationStruct.needsFlush, signalEvent);
 
@@ -4074,4 +4083,13 @@ void CommandListCoreFamily<gfxCoreFamily>::appendSynchronizedDispatchCleanupSect
     NEO::EncodeAtomic<GfxFamily>::programMiAtomic(*cmdStream, syncAllocationGpuVa, ATOMIC_OPCODES::ATOMIC_8B_CMP_WR, DATA_SIZE::DATA_SIZE_QWORD, 1, 1, queueIdToken, 0);
 }
 
+template <GFXCORE_FAMILY gfxCoreFamily>
+bool CommandListCoreFamily<gfxCoreFamily>::copyFenceRequired(Event *signalEvent, NEO::GraphicsAllocation *srcAllocation, NEO::GraphicsAllocation *dstAllocation) const {
+    if (!this->copyOperationFenceSupported || !signalEvent || !signalEvent->isSignalScope(ZE_EVENT_SCOPE_FLAG_HOST)) {
+        return false;
+    }
+
+    return (srcAllocation->isAllocatedInLocalMemoryPool() && !dstAllocation->isAllocatedInLocalMemoryPool());
+}
+
 } // namespace L0
diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp
index 84251e3ccb..007a6eec6d 100644
--- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp
+++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_append_memory.cpp
@@ -17,6 +17,7 @@
 #include "level_zero/core/test/unit_tests/fixtures/device_fixture.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdlist.h"
 #include "level_zero/core/test/unit_tests/mocks/mock_cmdqueue.h"
+#include "level_zero/core/test/unit_tests/sources/helper/ze_object_utils.h"
 
 namespace L0 {
 namespace ult {
@@ -991,6 +992,126 @@ HWTEST2_F(AppendMemoryCopy, givenCopyCommandListWithDummyBlitWaWhenCopyMemoryReg
     context->freeMem(buffer);
 }
 
+struct AppendMemoryCopyFenceTest : public AppendMemoryCopy {
+    void SetUp() override {
+        debugManager.flags.EnableLocalMemory.set(1);
+        AppendMemoryCopy::SetUp();
+    }
+    DebugManagerStateRestore restore;
+};
+
+HWTEST2_F(AppendMemoryCopyFenceTest, givenDeviceToHostCopyWhenProgrammingThenAddFence, IsAtLeastXeHpcCore) {
+    using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
+    using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT;
+    using XY_COLOR_BLT = typename GfxFamily::XY_COLOR_BLT;
+    using MI_MEM_FENCE = typename GfxFamily::MI_MEM_FENCE;
+
+    ze_result_t result = ZE_RESULT_SUCCESS;
+
+    ze_event_pool_desc_t eventPoolDesc = {};
+    eventPoolDesc.flags = ZE_EVENT_POOL_FLAG_HOST_VISIBLE;
+    eventPoolDesc.count = 2;
+
+    ze_event_desc_t eventDesc = {};
+    ze_event_desc_t eventDescHostVisible = {};
+    eventDescHostVisible.signal = ZE_EVENT_SCOPE_FLAG_HOST;
+
+    auto eventPool = std::unique_ptr<L0::EventPool>(EventPool::create(driverHandle.get(), context, 0, nullptr, &eventPoolDesc, result));
+
+    auto hostVisibleEvent = DestroyableZeUniquePtr<L0::Event>(Event::create<typename GfxFamily::TimestampPacketType>(eventPool.get(), &eventDescHostVisible, device));
+    auto regularEvent = DestroyableZeUniquePtr<L0::Event>(Event::create<typename GfxFamily::TimestampPacketType>(eventPool.get(), &eventDesc, device));
+
+    MockCommandListCoreFamily<gfxCoreFamily> cmdList;
+    cmdList.initialize(device, NEO::EngineGroupType::copy, 0u);
+    cmdList.isFlushTaskSubmissionEnabled = true;
+    cmdList.csr = device->getNEODevice()->getDefaultEngine().commandStreamReceiver;
+
+    constexpr size_t allocSize = 1;
+    void *hostBuffer = nullptr;
+    void *deviceBuffer = nullptr;
+    ze_host_mem_alloc_desc_t hostDesc = {};
+    ze_device_mem_alloc_desc_t deviceDesc = {};
+    result = context->allocHostMem(&hostDesc, allocSize, allocSize, &hostBuffer);
+    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
+
+    result = context->allocDeviceMem(device->toHandle(), &deviceDesc, allocSize, allocSize, &deviceBuffer);
+    ASSERT_EQ(ZE_RESULT_SUCCESS, result);
+
+    ze_copy_region_t dstRegion = {0, 0, 0, 1, 1, 1};
+    ze_copy_region_t srcRegion = {0, 0, 0, 1, 1, 1};
+
+    auto cmdStream = cmdList.getCmdContainer().getCommandStream();
+    size_t offset = 0;
+
+    auto verify = [&](bool expected) {
+        expected &= device->getProductHelper().isDeviceToHostCopySignalingFenceRequired();
+
+        GenCmdList genCmdList;
+        EXPECT_TRUE(FamilyType::Parse::parseCommandBuffer(genCmdList, ptrOffset(cmdStream->getCpuBase(), offset), cmdStream->getUsed() - offset));
+        if (::testing::Test::HasFailure()) {
+            return false;
+        }
+
+        auto itor = find<XY_COPY_BLT *>(genCmdList.begin(), genCmdList.end());
+        itor = find<MI_MEM_FENCE *>(itor, genCmdList.end());
+
+        EXPECT_EQ(expected, genCmdList.end() != itor);
+
+        return !::testing::Test::HasFailure();
+    };
+
+    // device to host - host visible event
+    {
+        offset = cmdStream->getUsed();
+        cmdList.appendMemoryCopyRegion(hostBuffer, &dstRegion, 1, 1, deviceBuffer, &srcRegion, 1, 1, hostVisibleEvent->toHandle(), 0, nullptr, false, false);
+
+        EXPECT_TRUE(verify(true));
+    }
+
+    // device to host - regular event
+    {
+        offset = cmdStream->getUsed();
+        cmdList.appendMemoryCopyRegion(hostBuffer, &dstRegion, 1, 1, deviceBuffer, &srcRegion, 1, 1, regularEvent->toHandle(), 0, nullptr, false, false);
+
+        EXPECT_TRUE(verify(false));
+    }
+
+    // device to host - no event
+    {
+        offset = cmdStream->getUsed();
+        cmdList.appendMemoryCopyRegion(hostBuffer, &dstRegion, 1, 1, deviceBuffer, &srcRegion, 1, 1, nullptr, 0, nullptr, false, false);
+
+        EXPECT_TRUE(verify(false));
+    }
+
+    // device to device - host visible event
+    {
+        offset = cmdStream->getUsed();
+        cmdList.appendMemoryCopyRegion(deviceBuffer, &dstRegion, 1, 1, deviceBuffer, &srcRegion, 1, 1, hostVisibleEvent->toHandle(), 0, nullptr, false, false);
+
+        EXPECT_TRUE(verify(false));
+    }
+
+    // host to device - host visible event
+    {
+        offset = cmdStream->getUsed();
+        cmdList.appendMemoryCopyRegion(deviceBuffer, &dstRegion, 1, 1, hostBuffer, &srcRegion, 1, 1, hostVisibleEvent->toHandle(), 0, nullptr, false, false);
+
+        EXPECT_TRUE(verify(false));
+    }
+
+    // host to host - host visible event
+    {
+        offset = cmdStream->getUsed();
+        cmdList.appendMemoryCopyRegion(hostBuffer, &dstRegion, 1, 1, hostBuffer, &srcRegion, 1, 1, hostVisibleEvent->toHandle(), 0, nullptr, false, false);
+
+        EXPECT_TRUE(verify(false));
+    }
+
+    context->freeMem(hostBuffer);
+    context->freeMem(deviceBuffer);
+}
+
 HWTEST2_F(AppendMemoryCopy, givenCopyCommandListImmediateWithDummyBlitWaWhenCopyMemoryThenDummyBlitIsNotProgrammedButIsRequiredForNextFlushProgramming, IsAtLeastXeHpCore) {
     using GfxFamily = typename NEO::GfxFamilyMapper<gfxCoreFamily>::GfxFamily;
     using XY_COPY_BLT = typename GfxFamily::XY_COPY_BLT;