fix: l0, tag update on mem copy ext host ptr

Require a tag update on memory copy with an external host pointer.
Without this, the temporary allocation might not be cleaned up before the
next copy operation.
If a second copy operation is passed the same pointer after it has been
reallocated, a page fault will occur.

Related-To: NEO-15663

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
This commit is contained in:
Dominik Dabek
2025-08-08 11:39:51 +00:00
committed by Compute-Runtime-Automation
parent 34ddf678ad
commit 77470acf7a
9 changed files with 127 additions and 45 deletions

View File

@@ -1799,6 +1799,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopy(void *dstptr,
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
}
// Immediate command lists only: when either side of the copy is backed by an
// externalHostPtr allocation, request a task-count (tag) update for this copy.
// Per the change description, without it the temporary allocation might not be
// cleaned up before the next copy operation.
// dst alloc can be null on this path (see the nullptr check that follows), so each
// side is guarded before getAllocationType() is called.
if (this->isImmediateType()) {
memoryCopyParams.taskCountUpdateRequired |= (dstAllocationStruct.alloc && dstAllocationStruct.alloc->getAllocationType() == NEO::AllocationType::externalHostPtr) ||
(srcAllocationStruct.alloc && srcAllocationStruct.alloc->getAllocationType() == NEO::AllocationType::externalHostPtr);
}
if ((dstAllocationStruct.alloc == nullptr) && (NEO::debugManager.flags.EmitMemAdvisePriorToCopyForNonUsm.get() == 1)) {
appendMemAdvise(device, reinterpret_cast<void *>(dstAllocationStruct.alignedAllocationPtr), size, static_cast<ze_memory_advice_t>(ZE_MEMORY_ADVICE_SET_SYSTEM_MEMORY_PREFERRED_LOCATION));
}
@@ -2052,6 +2057,11 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyRegion(void *d
return ZE_RESULT_ERROR_OUT_OF_DEVICE_MEMORY;
}
// Immediate command lists only: when either side of the region copy is backed by an
// externalHostPtr allocation, request a task-count (tag) update for this copy.
// Per the change description, without it the temporary allocation might not be
// cleaned up before the next copy operation.
// No null guards here: both allocs are dereferenced unconditionally by the
// isCopyOffloadAllowed() call right after this block.
if (this->isImmediateType()) {
memoryCopyParams.taskCountUpdateRequired |= dstAllocationStruct.alloc->getAllocationType() == NEO::AllocationType::externalHostPtr ||
srcAllocationStruct.alloc->getAllocationType() == NEO::AllocationType::externalHostPtr;
}
memoryCopyParams.copyOffloadAllowed = isCopyOffloadAllowed(*srcAllocationStruct.alloc, *dstAllocationStruct.alloc);
const bool isCopyOnlyEnabled = isCopyOnly(memoryCopyParams.copyOffloadAllowed);
const bool inOrderCopyOnlySignalingAllowed = this->isInOrderExecutionEnabled() && !memoryCopyParams.forceDisableCopyOnlyInOrderSignaling && isCopyOnlyEnabled;

View File

@@ -2201,41 +2201,47 @@ HWTEST2_F(CommandListCreateTests, givenDirectSubmissionAndImmCmdListWhenDispatch
auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->getCsr(false));
auto verifyFlag = [&ultCsr](ze_result_t result, bool dispatchFlag) {
auto verifyWalkerWithProfilingEnqueued = [&ultCsr](ze_result_t result, bool expectEnqueued) {
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
EXPECT_EQ(ultCsr->isWalkerWithProfilingEnqueued, dispatchFlag);
const auto enqueueTimes = ultCsr->walkerWithProfilingEnqueuedTimes + ultCsr->isWalkerWithProfilingEnqueued;
if (expectEnqueued) {
EXPECT_GT(enqueueTimes, 0u);
} else {
EXPECT_EQ(0u, enqueueTimes);
}
ultCsr->walkerWithProfilingEnqueuedTimes = 0u;
ultCsr->isWalkerWithProfilingEnqueued = false;
};
auto expectFlagEnabled = true && this->device->getNEODevice()->getProductHelper().shouldRegisterEnqueuedWalkerWithProfiling();
bool expectWalkerWithProfilingEnqueued = this->device->getNEODevice()->getProductHelper().shouldRegisterEnqueuedWalkerWithProfiling();
// non-pipelined state
verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, launchParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, launchParams), expectWalkerWithProfilingEnqueued);
// non-pipelined state already programmed
verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, launchParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, launchParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams), false);
verifyWalkerWithProfilingEnqueued(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, nullptr, 0, nullptr, launchParams), false);
verifyFlag(commandList->appendLaunchKernelIndirect(kernel.toHandle(), groupCount, event, 0, nullptr, false), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendLaunchKernelIndirect(kernel.toHandle(), groupCount, event, 0, nullptr, false), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendBarrier(event, 0, nullptr, false), false);
verifyWalkerWithProfilingEnqueued(commandList->appendBarrier(event, 0, nullptr, false), false);
CmdListMemoryCopyParams copyParams = {};
verifyFlag(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, event, 0, nullptr, copyParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendMemoryCopy(dstPtr, srcPtr, 8, event, 0, nullptr, copyParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, event, 0, nullptr, copyParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendMemoryCopyRegion(dstPtr, &region, 0, 0, srcPtr, &region, 0, 0, event, 0, nullptr, copyParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, event, 0, nullptr, copyParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendMemoryFill(dstPtr, srcPtr, 8, 1, event, 0, nullptr, copyParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendEventReset(event), false);
verifyWalkerWithProfilingEnqueued(commandList->appendEventReset(event), false);
verifyFlag(commandList->appendSignalEvent(event, false), false);
verifyWalkerWithProfilingEnqueued(commandList->appendSignalEvent(event, false), false);
verifyFlag(commandList->appendPageFaultCopy(kernel.getIsaAllocation(), kernel.getIsaAllocation(), 1, false), false);
verifyWalkerWithProfilingEnqueued(commandList->appendPageFaultCopy(kernel.getIsaAllocation(), kernel.getIsaAllocation(), 1, false), false);
verifyFlag(commandList->appendWaitOnEvents(1, &event, nullptr, false, true, false, false, false, false), false);
verifyWalkerWithProfilingEnqueued(commandList->appendWaitOnEvents(1, &event, nullptr, false, true, false, false, false, false), false);
verifyFlag(commandList->appendWriteGlobalTimestamp(reinterpret_cast<uint64_t *>(dstPtr), event, 0, nullptr), false);
verifyWalkerWithProfilingEnqueued(commandList->appendWriteGlobalTimestamp(reinterpret_cast<uint64_t *>(dstPtr), event, 0, nullptr), false);
if constexpr (FamilyType::supportsSampler) {
auto kernel = device->getBuiltinFunctionsLib()->getImageFunction(ImageBuiltin::copyImageRegion);
@@ -2251,27 +2257,27 @@ HWTEST2_F(CommandListCreateTests, givenDirectSubmissionAndImmCmdListWhenDispatch
CmdListMemoryCopyParams copyParams = {};
verifyFlag(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, event, 0, nullptr, copyParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendImageCopyRegion(image->toHandle(), image->toHandle(), &imgRegion, &imgRegion, event, 0, nullptr, copyParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, event, 0, nullptr, copyParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendImageCopyFromMemory(image->toHandle(), dstPtr, &imgRegion, event, 0, nullptr, copyParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, event, 0, nullptr, copyParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendImageCopyToMemory(dstPtr, image->toHandle(), &imgRegion, event, 0, nullptr, copyParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendImageCopyFromMemoryExt(image->toHandle(), dstPtr, &imgRegion, bytesPerPixel, bytesPerPixel, event, 0, nullptr, copyParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendImageCopyFromMemoryExt(image->toHandle(), dstPtr, &imgRegion, bytesPerPixel, bytesPerPixel, event, 0, nullptr, copyParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendImageCopyToMemoryExt(dstPtr, image->toHandle(), &imgRegion, bytesPerPixel, bytesPerPixel, event, 0, nullptr, copyParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendImageCopyToMemoryExt(dstPtr, image->toHandle(), &imgRegion, bytesPerPixel, bytesPerPixel, event, 0, nullptr, copyParams), expectWalkerWithProfilingEnqueued);
}
size_t rangeSizes = 1;
const void **ranges = reinterpret_cast<const void **>(&dstPtr[0]);
verifyFlag(commandList->appendMemoryRangesBarrier(1, &rangeSizes, ranges, event, 0, nullptr), false);
verifyWalkerWithProfilingEnqueued(commandList->appendMemoryRangesBarrier(1, &rangeSizes, ranges, event, 0, nullptr), false);
CmdListKernelLaunchParams cooperativeParams = {};
cooperativeParams.isCooperative = true;
verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, cooperativeParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, cooperativeParams), expectWalkerWithProfilingEnqueued);
verifyFlag(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, cooperativeParams), expectFlagEnabled);
verifyWalkerWithProfilingEnqueued(commandList->appendLaunchKernel(kernel.toHandle(), groupCount, event, 0, nullptr, cooperativeParams), expectWalkerWithProfilingEnqueued);
driverHandle->releaseImportedPointer(dstPtr);
}
@@ -2959,6 +2965,50 @@ TEST_F(CommandListCreateTests, whenInvokingAppendMemoryCopyFromContextForImmedia
EXPECT_EQ(ZE_RESULT_SUCCESS, result);
}
// Checks that an asynchronous immediate command list asks the CSR for a task-count update
// when the copy source is plain host memory that was not allocated through the context
// (a stack variable here), for both appendMemoryCopy and appendMemoryCopyRegion.
HWTEST_F(CommandListCreateTests, givenImmediateCmdListWhenInvokingAppendMemoryCopyWithExternalHostPtrThenRequireTaskCountUpdate) {
    ze_command_queue_desc_t desc = {};
    desc.mode = ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS;
    ze_result_t returnValue;
    std::unique_ptr<L0::CommandList> commandList(CommandList::createImmediate(productFamily, device, &desc, false, NEO::EngineGroupType::compute, returnValue));
    ASSERT_NE(nullptr, commandList);
    auto whiteBoxCmdList = static_cast<CommandList *>(commandList.get());

    EXPECT_EQ(device, commandList->getDevice());
    EXPECT_TRUE(commandList->isImmediateType());
    EXPECT_NE(nullptr, whiteBoxCmdList->cmdQImmediate);

    constexpr size_t transferSize = sizeof(size_t);

    // Destination: host memory allocated through the context.
    void *hostPtr = nullptr;
    ze_host_mem_alloc_desc_t hostDesc = {};
    ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, transferSize, 0u, &hostPtr));

    // Source: stack memory the driver has no allocation for.
    size_t externalHostAlloc = 0;

    auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->getCsr(false));
    const bool usesImmediateFlushTask = L0GfxCoreHelper::useImmediateComputeFlushTask(device->getNEODevice()->getRootDeviceEnvironment());

    // The flag that carries the request depends on which flush path is in use.
    auto expectTaskCountUpdateRequested = [&]() {
        if (usesImmediateFlushTask) {
            EXPECT_TRUE(ultCsr->recordedImmediateDispatchFlags.requireTaskCountUpdate);
        } else {
            EXPECT_TRUE(ultCsr->recordedDispatchFlags.guardCommandBufferWithPipeControl);
        }
    };

    // Clear the recorded flag so a later expectation cannot be satisfied by an earlier copy.
    auto clearRecordedRequest = [&]() {
        if (usesImmediateFlushTask) {
            ultCsr->recordedImmediateDispatchFlags.requireTaskCountUpdate = false;
        } else {
            ultCsr->recordedDispatchFlags.guardCommandBufferWithPipeControl = false;
        }
    };

    CmdListMemoryCopyParams copyParams = {};
    EXPECT_EQ(ZE_RESULT_SUCCESS, commandList->appendMemoryCopy(hostPtr, &externalHostAlloc, transferSize, nullptr, 0, nullptr, copyParams));
    expectTaskCountUpdateRequested();

    clearRecordedRequest();

    const ze_copy_region_t region = {0U, 0U, 0U, 1, 1, 0U};
    EXPECT_EQ(ZE_RESULT_SUCCESS, commandList->appendMemoryCopyRegion(hostPtr, &region, 0, 0, &externalHostAlloc, &region, 0, 0, nullptr, 0, nullptr, copyParams));
    expectTaskCountUpdateRequested();

    EXPECT_EQ(ZE_RESULT_SUCCESS, context->freeMem(hostPtr));
}
TEST_F(CommandListCreateTests, whenInvokingAppendMemoryCopyFromContextForImmediateCommandListThenSuccessIsReturned) {
const ze_command_queue_desc_t desc = {};
ze_result_t returnValue;

View File

@@ -1208,13 +1208,20 @@ HWTEST2_F(ImmediateCommandListTest, givenCopyEngineAsyncCmdListWhenAppendingCopy
auto ultCsr = static_cast<NEO::UltCommandStreamReceiver<FamilyType> *>(whiteBoxCmdList->getCsr(false));
ultCsr->recordFlushedBatchBuffer = true;
size_t src = 0;
size_t dst = 0;
constexpr size_t transferSize = sizeof(size_t);
void *src, *dst;
ze_host_mem_alloc_desc_t hostDesc = {};
ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, transferSize, 0u, &src));
ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, transferSize, 0u, &dst));
CmdListMemoryCopyParams copyParams = {};
returnValue = commandList->appendMemoryCopy(&dst, &src, sizeof(size_t), nullptr, 0, nullptr, copyParams);
returnValue = commandList->appendMemoryCopy(dst, src, transferSize, nullptr, 0, nullptr, copyParams);
EXPECT_EQ(ZE_RESULT_SUCCESS, returnValue);
EXPECT_FALSE(ultCsr->latestFlushedBatchBuffer.dispatchMonitorFence);
EXPECT_EQ(ZE_RESULT_SUCCESS, context->freeMem(src));
EXPECT_EQ(ZE_RESULT_SUCCESS, context->freeMem(dst));
}
HWTEST2_F(ImmediateCommandListTest, givenCopyEngineSyncCmdListWhenAppendingCopyOperationThenRequireMonitorFence, IsAtLeastXeHpcCore) {

View File

@@ -448,8 +448,12 @@ HWTEST_F(AppendMemoryCopyTests, givenAsyncImmediateCommandListWhenAppendingMemor
auto cmdQueue = std::make_unique<Mock<CommandQueue>>();
cmdQueue->csr = ultCsr;
cmdQueue->isCopyOnlyCommandQueue = true;
size_t src = 0;
size_t dst = 0;
constexpr size_t transferSize = sizeof(size_t);
void *src, *dst;
ze_host_mem_alloc_desc_t hostDesc = {};
ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, transferSize, 0u, &src));
ASSERT_EQ(ZE_RESULT_SUCCESS, context->allocHostMem(&hostDesc, transferSize, 0u, &dst));
auto commandList = std::make_unique<WhiteBox<L0::CommandListCoreFamilyImmediate<FamilyType::gfxCoreFamily>>>();
ASSERT_NE(nullptr, commandList);
@@ -477,7 +481,7 @@ HWTEST_F(AppendMemoryCopyTests, givenAsyncImmediateCommandListWhenAppendingMemor
expectedSize = alignUp(ultCsr->getCmdsSizeForHardwareContext() + sizeof(typename FamilyType::MI_BATCH_BUFFER_START), MemoryConstants::cacheLineSize);
}
ASSERT_EQ(ZE_RESULT_SUCCESS, commandList->appendMemoryCopy(&dst, &src, sizeof(size_t), nullptr, 0, nullptr, copyParams));
ASSERT_EQ(ZE_RESULT_SUCCESS, commandList->appendMemoryCopy(dst, src, transferSize, nullptr, 0, nullptr, copyParams));
EXPECT_EQ(expectedSize, ultCsr->getCS(0).getUsed() - sizeUsedBefore);
@@ -528,13 +532,16 @@ HWTEST_F(AppendMemoryCopyTests, givenAsyncImmediateCommandListWhenAppendingMemor
size_t csrOfffset = ultCsr->getCS(0).getUsed();
size_t cmdListOffset = commandList->commandContainer.getCommandStream()->getUsed();
ASSERT_EQ(ZE_RESULT_SUCCESS, commandList->appendMemoryCopy(&dst, &src, sizeof(size_t), nullptr, 0, nullptr, copyParams));
ASSERT_EQ(ZE_RESULT_SUCCESS, commandList->appendMemoryCopy(dst, src, transferSize, nullptr, 0, nullptr, copyParams));
EXPECT_EQ(csrOfffset, ultCsr->getCS(0).getUsed());
EXPECT_FALSE(findTagUpdate(ptrOffset(commandList->commandContainer.getCommandStream()->getCpuBase(), cmdListOffset),
commandList->commandContainer.getCommandStream()->getUsed() - cmdListOffset,
ultCsr->getTagAllocation()->getGpuAddress()));
EXPECT_EQ(ZE_RESULT_SUCCESS, context->freeMem(src));
EXPECT_EQ(ZE_RESULT_SUCCESS, context->freeMem(dst));
}
HWTEST_F(AppendMemoryCopyTests, givenSyncImmediateCommandListWhenAppendingMemoryCopyWithCopyEngineThenProgramCmdStreamWithFlushTask) {