diff --git a/level_zero/core/source/cmdlist/cmdlist.h b/level_zero/core/source/cmdlist/cmdlist.h index 17784ffc5b..e01ec176e5 100644 --- a/level_zero/core/source/cmdlist/cmdlist.h +++ b/level_zero/core/source/cmdlist/cmdlist.h @@ -15,6 +15,7 @@ #include "shared/source/helpers/common_types.h" #include "shared/source/helpers/definitions/command_encoder_args.h" #include "shared/source/helpers/heap_base_address_model.h" +#include "shared/source/memory_manager/graphics_allocation.h" #include "shared/source/memory_manager/prefetch_manager.h" #include "shared/source/unified_memory/unified_memory.h" #include "shared/source/utilities/stackvec.h" @@ -229,6 +230,10 @@ struct CommandList : _ze_command_list_handle_t { return static_cast(handle); } + static bool isExternalHostPtrAlloc(NEO::GraphicsAllocation *alloc) { + return alloc && alloc->getAllocationType() == NEO::AllocationType::externalHostPtr; + } + inline ze_command_list_handle_t toHandle() { return this; } uint32_t getCommandListPerThreadScratchSize(uint32_t slotId) const { diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.h b/level_zero/core/source/cmdlist/cmdlist_hw.h index 206d5c68fb..7f2b165fc3 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.h +++ b/level_zero/core/source/cmdlist/cmdlist_hw.h @@ -178,6 +178,8 @@ struct CommandListCoreFamily : public CommandListImp { ze_result_t appendWaitOnMemory(void *desc, void *ptr, uint64_t data, ze_event_handle_t signalEventHandle, bool useQwordData) override; ze_result_t appendWriteToMemory(void *desc, void *ptr, uint64_t data) override; + ze_result_t appendWriteToMemory(void *desc, void *ptr, + uint64_t data, bool *requireTaskCountUpdate); ze_result_t appendWaitExternalSemaphores(uint32_t numExternalSemaphores, const ze_external_semaphore_ext_handle_t *hSemaphores, const ze_external_semaphore_wait_params_ext_t *params, ze_event_handle_t hSignalEvent, diff --git a/level_zero/core/source/cmdlist/cmdlist_hw.inl b/level_zero/core/source/cmdlist/cmdlist_hw.inl index 057330b67a..98a60f9a89 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw.inl @@ -836,6 +836,8 @@ ze_result_t CommandListCoreFamily::appendImageCopyFromMemoryExt(z return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; } + memoryCopyParams.taskCountUpdateRequired |= CommandList::isExternalHostPtrAlloc(allocationStruct.alloc); + DriverHandleImp *driverHandle = static_cast(device->getDriverHandle()); if (driverHandle->isRemoteImageNeeded(image, device)) { L0::Image *peerImage = nullptr; @@ -1034,6 +1036,8 @@ ze_result_t CommandListCoreFamily::appendImageCopyToMemoryExt(voi return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; } + memoryCopyParams.taskCountUpdateRequired |= CommandList::isExternalHostPtrAlloc(allocationStruct.alloc); + DriverHandleImp *driverHandle = static_cast(device->getDriverHandle()); if (driverHandle->isRemoteImageNeeded(image, device)) { L0::Image *peerImage = nullptr; @@ -1800,8 +1804,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopy(void *dstptr, } if (this->isImmediateType()) { - memoryCopyParams.taskCountUpdateRequired |= (dstAllocationStruct.alloc && dstAllocationStruct.alloc->getAllocationType() == NEO::AllocationType::externalHostPtr) || - (srcAllocationStruct.alloc && srcAllocationStruct.alloc->getAllocationType() == NEO::AllocationType::externalHostPtr); + memoryCopyParams.taskCountUpdateRequired |= CommandList::isExternalHostPtrAlloc(dstAllocationStruct.alloc) || + CommandList::isExternalHostPtrAlloc(srcAllocationStruct.alloc); } if ((dstAllocationStruct.alloc == nullptr) && (NEO::debugManager.flags.EmitMemAdvisePriorToCopyForNonUsm.get() == 1)) { @@ -2058,8 +2062,8 @@ ze_result_t CommandListCoreFamily::appendMemoryCopyRegion(void *d } if (this->isImmediateType()) { - memoryCopyParams.taskCountUpdateRequired |= dstAllocationStruct.alloc->getAllocationType() == NEO::AllocationType::externalHostPtr || - srcAllocationStruct.alloc->getAllocationType() == NEO::AllocationType::externalHostPtr; + memoryCopyParams.taskCountUpdateRequired |= CommandList::isExternalHostPtrAlloc(dstAllocationStruct.alloc) || + CommandList::isExternalHostPtrAlloc(srcAllocationStruct.alloc); } memoryCopyParams.copyOffloadAllowed = isCopyOffloadAllowed(*srcAllocationStruct.alloc, *dstAllocationStruct.alloc); @@ -4308,6 +4312,14 @@ template ze_result_t CommandListCoreFamily::appendWriteToMemory(void *desc, void *ptr, uint64_t data) { + return this->appendWriteToMemory(desc, ptr, data, nullptr); +} + +template +ze_result_t CommandListCoreFamily::appendWriteToMemory(void *desc, + void *ptr, + uint64_t data, + bool *requireTaskCountUpdate) { auto descriptor = reinterpret_cast(desc); size_t bufSize = sizeof(uint64_t); @@ -4315,6 +4327,11 @@ ze_result_t CommandListCoreFamily::appendWriteToMemory(void *desc if (dstAllocationStruct.alloc == nullptr) { return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY; } + + if (requireTaskCountUpdate) { + *requireTaskCountUpdate = CommandList::isExternalHostPtrAlloc(dstAllocationStruct.alloc); + } + UNRECOVERABLE_IF(dstAllocationStruct.alloc == nullptr); commandContainer.addToResidencyContainer(dstAllocationStruct.alloc); diff --git a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl index 72e36bd5ea..cb0ad00fb5 100644 --- a/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl +++ b/level_zero/core/source/cmdlist/cmdlist_hw_immediate.inl @@ -936,7 +936,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyFromMe numWaitEvents, phWaitEvents, memoryCopyParams); return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch), memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, - memoryCopyParams.copyOffloadAllowed, hSignalEvent, false, nullptr, nullptr); + memoryCopyParams.copyOffloadAllowed, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr); } template @@ -955,7 +955,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyToMemo numWaitEvents, phWaitEvents, memoryCopyParams); return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch), memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, - memoryCopyParams.copyOffloadAllowed, hSignalEvent, false, nullptr, nullptr); + memoryCopyParams.copyOffloadAllowed, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr); } template @@ -976,7 +976,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyFromMe hSignalEvent, numWaitEvents, phWaitEvents, memoryCopyParams); return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch), memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, - memoryCopyParams.copyOffloadAllowed, hSignalEvent, false, nullptr, nullptr); + memoryCopyParams.copyOffloadAllowed, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr); } template @@ -997,7 +997,7 @@ ze_result_t CommandListCoreFamilyImmediate::appendImageCopyToMemo hSignalEvent, numWaitEvents, phWaitEvents, memoryCopyParams); return flushImmediate(ret, true, hasStallingCmdsForRelaxedOrdering(numWaitEvents, memoryCopyParams.relaxedOrderingDispatch), memoryCopyParams.relaxedOrderingDispatch, NEO::AppendOperations::kernel, - memoryCopyParams.copyOffloadAllowed, hSignalEvent, false, nullptr, nullptr); + memoryCopyParams.copyOffloadAllowed, hSignalEvent, memoryCopyParams.taskCountUpdateRequired, nullptr, nullptr); } template @@ -1023,8 +1023,9 @@ ze_result_t CommandListCoreFamilyImmediate::appendWaitOnMemory(vo template ze_result_t CommandListCoreFamilyImmediate::appendWriteToMemory(void *desc, void *ptr, uint64_t data) { checkAvailableSpace(0, false, commonImmediateCommandSize, false); - auto ret = CommandListCoreFamily::appendWriteToMemory(desc, ptr, data); - return flushImmediate(ret, true, false, false, NEO::AppendOperations::nonKernel, false, nullptr, false, nullptr, nullptr); + bool requireTaskCountUpdate = false; + auto ret = CommandListCoreFamily::appendWriteToMemory(desc, ptr, data, &requireTaskCountUpdate); + return flushImmediate(ret, true, false, false, NEO::AppendOperations::nonKernel, false, nullptr, requireTaskCountUpdate, nullptr, nullptr); } template diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp index ada0edef94..41e17b0a84 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_2.cpp @@ -834,6 +834,44 @@ HWTEST2_F(CommandListAppend, givenCopyCommandListAndNullDestinationRegionWhenIma EXPECT_TRUE(cmdList.useEvents); } +HWTEST2_F(CommandListAppend, givenImmediateCommandListWhenImageCopyFromOrToMemoryWithExternalHostPtrThenRequireTaskCountUpdate, ImageSupport) { + ze_command_queue_desc_t desc = {}; + ze_result_t ret = ZE_RESULT_SUCCESS; + std::unique_ptr cmdList(CommandList::whiteboxCast(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::renderCompute, ret))); + EXPECT_EQ(ZE_RESULT_SUCCESS, ret); + + void *hostPtr = reinterpret_cast(0x1234); + ze_image_desc_t zeDesc = {}; + zeDesc.stype = ZE_STRUCTURE_TYPE_IMAGE_DESC; + zeDesc.width = 1; + zeDesc.height = 1; + zeDesc.depth = 1; + auto imageHW = std::make_unique>>(); + imageHW->initialize(device, &zeDesc); + + auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::copyImageRegion); + auto mockBuiltinKernel = static_cast *>(kernel); + mockBuiltinKernel->setArgRedescribedImageCallBase = false; + cmdList->appendImageCopyFromMemory(imageHW->toHandle(), hostPtr, nullptr, nullptr, 0, nullptr, copyParams); + auto ultCsr = static_cast *>(cmdList->getCsr(false)); + if (L0GfxCoreHelper::useImmediateComputeFlushTask(device->getNEODevice()->getRootDeviceEnvironment())) { + ImmediateDispatchFlags &recordedImmediateDispatchFlags = ultCsr->recordedImmediateDispatchFlags; + EXPECT_TRUE(recordedImmediateDispatchFlags.requireTaskCountUpdate); + } else { + DispatchFlags &recordedDispatchFlags = ultCsr->recordedDispatchFlags; + EXPECT_TRUE(recordedDispatchFlags.guardCommandBufferWithPipeControl); + } + + cmdList->appendImageCopyToMemory(hostPtr, imageHW->toHandle(), nullptr, nullptr, 0u, nullptr, copyParams); + if (L0GfxCoreHelper::useImmediateComputeFlushTask(device->getNEODevice()->getRootDeviceEnvironment())) { + ImmediateDispatchFlags &recordedImmediateDispatchFlags = ultCsr->recordedImmediateDispatchFlags; + EXPECT_TRUE(recordedImmediateDispatchFlags.requireTaskCountUpdate); + } else { + DispatchFlags &recordedDispatchFlags = ultCsr->recordedDispatchFlags; + EXPECT_TRUE(recordedDispatchFlags.guardCommandBufferWithPipeControl); + } +} + HWTEST2_F(CommandListAppend, givenCopyCommandListAndNullDestinationRegionWhenImageCopyToMemoryThenBlitImageCopyCalledWithCorrectImageSize, ImageSupport) { MockCommandListHw cmdList; cmdList.initialize(device, NEO::EngineGroupType::copy, 0u); diff --git a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp index c2f71ff52f..f53a8d5ed4 100644 --- a/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp +++ b/level_zero/core/test/unit_tests/sources/cmdlist/test_cmdlist_memory_extension.cpp @@ -13,6 +13,7 @@ #include "shared/source/helpers/register_offsets.h" #include "shared/test/common/cmd_parse/gen_cmd_parse.h" #include "shared/test/common/helpers/unit_test_helper.h" +#include "shared/test/common/libult/ult_command_stream_receiver.h" #include "shared/test/common/mocks/mock_graphics_allocation.h" #include "shared/test/common/test_macros/hw_test.h" @@ -1387,6 +1388,7 @@ HWTEST_F(ImmediateCommandListAppendWriteToMem, givenAppendWriteToMemWithNoScopeT EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); EXPECT_FALSE(cmd->getDcFlushEnable()); postSyncFound = true; + break; } } ASSERT_TRUE(postSyncFound); @@ -1414,6 +1416,7 @@ HWTEST_F(ImmediateCommandListAppendWriteToMem, givenAppendWriteToMemOnBcsWithNoS if (cmd->getPostSyncOperation() == MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD) { EXPECT_EQ(cmd->getImmediateData(), data); postSyncFound = true; + break; } } ASSERT_TRUE(postSyncFound); @@ -1432,6 +1435,17 @@ HWTEST_F(ImmediateCommandListAppendWriteToMem, givenAppendWriteToMemWithScopeThe result = immCommandList->appendWriteToMemory(reinterpret_cast(&desc), ptr, data); EXPECT_EQ(ZE_RESULT_SUCCESS, result); + auto whiteBoxCmdList = static_cast(immCommandList.get()); + auto ultCsr = static_cast *>(whiteBoxCmdList->getCsr(false)); + + if (L0GfxCoreHelper::useImmediateComputeFlushTask(device->getNEODevice()->getRootDeviceEnvironment())) { + ImmediateDispatchFlags &recordedImmediateDispatchFlags = ultCsr->recordedImmediateDispatchFlags; + EXPECT_TRUE(recordedImmediateDispatchFlags.requireTaskCountUpdate); + } else { + DispatchFlags &recordedDispatchFlags = ultCsr->recordedDispatchFlags; + EXPECT_TRUE(recordedDispatchFlags.guardCommandBufferWithPipeControl); + } + GenCmdList cmdList; ASSERT_TRUE(FamilyType::Parse::parseCommandBuffer( cmdList, ptrOffset(commandContainer.getCommandStream()->getCpuBase(), 0), commandContainer.getCommandStream()->getUsed())); @@ -1446,6 +1460,7 @@ HWTEST_F(ImmediateCommandListAppendWriteToMem, givenAppendWriteToMemWithScopeThe EXPECT_TRUE(cmd->getCommandStreamerStallEnable()); EXPECT_EQ(NEO::MemorySynchronizationCommands::getDcFlushEnable(true, device->getNEODevice()->getRootDeviceEnvironment()), cmd->getDcFlushEnable()); postSyncFound = true; + break; } } ASSERT_TRUE(postSyncFound);