feature: prework to enable memory prefetch

Related-To: NEO-14703

Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2025-06-09 16:37:11 +00:00
committed by Compute-Runtime-Automation
parent 5714d9170c
commit ff7d0a76cb
14 changed files with 81 additions and 29 deletions

View File

@@ -233,6 +233,7 @@ struct CommandListCoreFamily : public CommandListImp {
void patchInOrderCmds() override;
MOCKABLE_VIRTUAL bool handleCounterBasedEventOperations(Event *signalEvent, bool skipAddingEventToResidency);
bool isCbEventBoundToCmdList(Event *event) const;
// Reports whether kernel memory prefetch is enabled for this command list.
// Overrides the pure virtual declared on CommandListImp.
bool kernelMemoryPrefetchEnabled() const override;
protected:
MOCKABLE_VIRTUAL ze_result_t appendMemoryCopyKernelWithGA(void *dstPtr, NEO::GraphicsAllocation *dstPtrAlloc,
@@ -420,7 +421,6 @@ struct CommandListCoreFamily : public CommandListImp {
// Default prefetch command id: the largest value representable in uint64_t.
// Virtual, so derived command lists may return a different id.
virtual uint64_t getPrefetchCmdId() const {
    constexpr uint64_t maxId = std::numeric_limits<uint64_t>::max();
    return maxId;
}
// NOTE(review): definition is not visible here. Presumably returns the indirect-heap
// size to prefetch for `kernel`, with `reserveExtraSpace` added on top; confirm against the implementation.
virtual uint32_t getIohSizeForPrefetch(const Kernel &kernel, uint32_t reserveExtraSpace) const;
// No-op by default; virtual so a derived command list can override it.
virtual void ensureCmdBufferSpaceForPrefetch() {}
// True only when the EnableMemoryPrefetch debug flag reads exactly 1.
bool kernelMemoryPrefetchEnabled() const {
    const bool flagIsOne = (NEO::debugManager.flags.EnableMemoryPrefetch.get() == 1);
    return flagIsOne;
}
NEO::InOrderPatchCommandsContainer<GfxFamily> inOrderPatchCmds;

View File

@@ -1575,6 +1575,9 @@ ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlit(uintptr_t
return ZE_RESULT_SUCCESS;
}
// Kernel memory prefetch is reported as enabled only when the
// EnableMemoryPrefetch debug flag reads exactly 1; any other value yields false.
template <GFXCORE_FAMILY gfxCoreFamily>
bool CommandListCoreFamily<gfxCoreFamily>::kernelMemoryPrefetchEnabled() const {
    const bool flagIsOne = (NEO::debugManager.flags.EnableMemoryPrefetch.get() == 1);
    return flagIsOne;
}
template <GFXCORE_FAMILY gfxCoreFamily>
ze_result_t CommandListCoreFamily<gfxCoreFamily>::appendMemoryCopyBlitRegion(AlignedAllocationData *srcAllocationData,
AlignedAllocationData *dstAllocationData,

View File

@@ -48,6 +48,7 @@ struct CommandListImp : public CommandList {
NEO::SynchronizedDispatchMode getSynchronizedDispatchMode() const { return synchronizedDispatchMode; }
void enableCopyOperationOffload();
void setInterruptEventsCsr(NEO::CommandStreamReceiver &csr);
// Reports whether kernel memory prefetch is enabled.
// Pure virtual: every concrete command list type must provide the answer.
virtual bool kernelMemoryPrefetchEnabled() const = 0;
protected:
std::shared_ptr<NEO::InOrderExecInfo> inOrderExecInfo;

View File

@@ -355,6 +355,8 @@ struct MockCommandList : public CommandList {
ADDMETHOD_NOBASE(destroy, ze_result_t, ZE_RESULT_SUCCESS, ());
ADDMETHOD_NOBASE_VOIDRETURN(patchInOrderCmds, (void));
// Mock entry for kernelMemoryPrefetchEnabled.
// NOTE(review): presumably `false` is the mocked default return value; confirm against the ADDMETHOD_CONST_NOBASE macro definition.
ADDMETHOD_CONST_NOBASE(kernelMemoryPrefetchEnabled, bool, false, (void));
ADDMETHOD_NOBASE(appendLaunchKernel, ze_result_t, ZE_RESULT_SUCCESS,
(ze_kernel_handle_t kernelHandle,
const ze_group_count_t &threadGroupDimensions,

View File

@@ -2143,7 +2143,10 @@ HWTEST2_F(CommandListCreateTests, givenInOrderExecutionWhenDispatchingRelaxedOrd
cmdStream->getUsed() - offset));
// init registers
auto lrrCmd = genCmdCast<MI_LOAD_REGISTER_REG *>(*genCmdList.begin());
auto iter = genCmdList.begin();
UnitTestHelper<FamilyType>::skipStatePrefetch(iter);
auto lrrCmd = genCmdCast<MI_LOAD_REGISTER_REG *>(*iter);
ASSERT_NE(nullptr, lrrCmd);
lrrCmd++;
lrrCmd++;

View File

@@ -3446,7 +3446,14 @@ HWTEST2_F(CommandListStateBaseAddressPrivateHeapTest,
if (ssh) {
EXPECT_EQ(oldGfxHeapAllocation, ssh->getGraphicsAllocation());
}
EXPECT_EQ(usedBefore, cmdListStream.getUsed());
size_t prefetchSize = 0;
if (commandList->kernelMemoryPrefetchEnabled()) {
prefetchSize = NEO::EncodeMemoryPrefetch<FamilyType>::getSizeForMemoryPrefetch(kernel->getIndirectSize(), device->getNEODevice()->getRootDeviceEnvironment()) +
NEO::EncodeMemoryPrefetch<FamilyType>::getSizeForMemoryPrefetch(kernel->getImmutableData()->getIsaSize(), device->getNEODevice()->getRootDeviceEnvironment());
}
EXPECT_EQ(usedBefore + prefetchSize, cmdListStream.getUsed());
}
} // namespace ult

View File

@@ -882,6 +882,7 @@ HWTEST2_F(FrontEndMultiReturnCommandListTest, givenFrontEndTrackingIsUsedWhenPro
}
HWTEST2_F(FrontEndMultiReturnCommandListTest, givenFrontEndTrackingIsUsedWhenPropertyComputeDispatchAllWalkerSupportedThenExpectReturnPointsAndBbEndProgramming, FrontEndMultiReturnMatcher) {
debugManager.flags.EnableMemoryPrefetch.set(0);
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;
NEO::FrontEndPropertiesSupport fePropertiesSupport = {};
auto &productHelper = device->getProductHelper();
@@ -1296,6 +1297,7 @@ HWTEST2_F(FrontEndMultiReturnCommandListTest,
HWTEST2_F(FrontEndMultiReturnCommandListTest,
givenFrontEndTrackingCmdListIsExecutedWhenPropertyComputeDispatchAllWalkerSupportedThenExpectFrontEndProgrammingInCmdQueue, FrontEndMultiReturnMatcher) {
debugManager.flags.EnableMemoryPrefetch.set(0);
using FrontEndStateCommand = typename FamilyType::FrontEndStateCommand;
using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;
using MI_BATCH_BUFFER_END = typename FamilyType::MI_BATCH_BUFFER_END;

View File

@@ -2741,12 +2741,14 @@ HWTEST2_F(CommandListAppendLaunchKernel,
ASSERT_NE(0u, storeRegMemList.size());
ASSERT_NE(0u, outStoreRegMemCmdList.size());
ASSERT_EQ(storeRegMemList.size(), outStoreRegMemCmdList.size());
size_t additionalPatchCmdsSize = commandList->kernelMemoryPrefetchEnabled() ? 1 : 0;
ASSERT_EQ(storeRegMemList.size(), outStoreRegMemCmdList.size() - additionalPatchCmdsSize);
for (size_t i = 0; i < storeRegMemList.size(); i++) {
MI_STORE_REGISTER_MEM *storeRegMem = genCmdCast<MI_STORE_REGISTER_MEM *>(*storeRegMemList[i]);
auto &cmdToPatch = outStoreRegMemCmdList[i];
auto &cmdToPatch = outStoreRegMemCmdList[i + additionalPatchCmdsSize];
EXPECT_EQ(CommandToPatch::TimestampEventPostSyncStoreRegMem, cmdToPatch.type);
MI_STORE_REGISTER_MEM *outStoreRegMem = genCmdCast<MI_STORE_REGISTER_MEM *>(cmdToPatch.pDestination);
ASSERT_NE(nullptr, outStoreRegMem);
@@ -2804,7 +2806,9 @@ HWTEST2_F(CommandListAppendLaunchKernel,
ptrOffset(cmdStream->getCpuBase(), commandStreamOffset),
cmdStream->getUsed() - commandStreamOffset));
ASSERT_EQ(0u, outCbEventCmds.size());
size_t additionalPatchCmdsSize = commandList->kernelMemoryPrefetchEnabled() ? 1 : 0;
ASSERT_EQ(additionalPatchCmdsSize, outCbEventCmds.size());
auto eventBaseAddress = event->getGpuAddress(device);
WalkerVariant walkerVariant = NEO::UnitTestHelper<FamilyType>::getWalkerVariant(launchParams.outWalker);
@@ -2878,8 +2882,9 @@ HWTEST2_F(CommandListAppendLaunchKernel,
auto inOrderAllocation = event->getInOrderExecInfo()->getDeviceCounterAllocation();
size_t expectedLoadRegImmCount = FamilyType::isQwordInOrderCounter ? 2 : 0;
size_t additionalPatchCmdsSize = commandList->kernelMemoryPrefetchEnabled() ? 1 : 0;
size_t expectedWaitCmds = 1 + expectedLoadRegImmCount;
size_t expectedWaitCmds = 1 + expectedLoadRegImmCount + additionalPatchCmdsSize;
ASSERT_EQ(expectedWaitCmds, outCbWaitEventCmds.size());
auto loadRegImmList = findAll<MI_LOAD_REGISTER_IMM *>(cmdList.begin(), cmdList.end());
@@ -2889,27 +2894,31 @@ HWTEST2_F(CommandListAppendLaunchKernel,
size_t outCbWaitEventCmdsIndex = 0;
for (; outCbWaitEventCmdsIndex < expectedLoadRegImmCount; outCbWaitEventCmdsIndex++) {
EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type);
ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination);
ASSERT_EQ(*loadRegImmList[outCbWaitEventCmdsIndex], outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination);
auto loadRegImmCmd = genCmdCast<MI_LOAD_REGISTER_IMM *>(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination);
auto &cmd = outCbWaitEventCmds[outCbWaitEventCmdsIndex + additionalPatchCmdsSize];
EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, cmd.type);
ASSERT_NE(nullptr, cmd.pDestination);
ASSERT_EQ(*loadRegImmList[outCbWaitEventCmdsIndex], cmd.pDestination);
auto loadRegImmCmd = genCmdCast<MI_LOAD_REGISTER_IMM *>(cmd.pDestination);
ASSERT_NE(nullptr, loadRegImmCmd);
EXPECT_EQ(0u, outCbWaitEventCmds[outCbWaitEventCmdsIndex].inOrderPatchListIndex);
EXPECT_EQ(0u, cmd.inOrderPatchListIndex);
auto registerNumber = 0x2600 + (4 * outCbWaitEventCmdsIndex);
EXPECT_EQ(registerNumber, outCbWaitEventCmds[outCbWaitEventCmdsIndex].offset);
EXPECT_EQ(registerNumber, cmd.offset);
}
EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type);
ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination);
ASSERT_EQ(*semaphoreWaitList[0], outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination);
auto semaphoreWaitCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination);
auto &cmd = outCbWaitEventCmds[outCbWaitEventCmdsIndex + additionalPatchCmdsSize];
EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, cmd.type);
ASSERT_NE(nullptr, cmd.pDestination);
ASSERT_EQ(*semaphoreWaitList[0], cmd.pDestination);
auto semaphoreWaitCmd = genCmdCast<MI_SEMAPHORE_WAIT *>(cmd.pDestination);
ASSERT_NE(nullptr, semaphoreWaitCmd);
EXPECT_EQ(eventCompletionAddress + outCbWaitEventCmds[outCbWaitEventCmdsIndex].offset, semaphoreWaitCmd->getSemaphoreGraphicsAddress());
EXPECT_EQ(eventCompletionAddress + cmd.offset, semaphoreWaitCmd->getSemaphoreGraphicsAddress());
if (FamilyType::isQwordInOrderCounter) {
EXPECT_EQ(std::numeric_limits<size_t>::max(), outCbWaitEventCmds[outCbWaitEventCmdsIndex].inOrderPatchListIndex);
EXPECT_EQ(std::numeric_limits<size_t>::max(), cmd.inOrderPatchListIndex);
} else {
EXPECT_EQ(0u, outCbWaitEventCmds[outCbWaitEventCmdsIndex].inOrderPatchListIndex);
EXPECT_EQ(0u, cmd.inOrderPatchListIndex);
}
auto &residencyContainer = commandContainer.getResidencyContainer();

View File

@@ -6330,24 +6330,29 @@ HWCMDTEST_F(IGFX_XE_HP_CORE, InOrderCmdListTests, givenInOrderModeAndNoopWaitEve
size_t expectedLoadRegImmCount = FamilyType::isQwordInOrderCounter ? 2 : 0;
size_t expectedWaitCmds = 1 + expectedLoadRegImmCount;
size_t additionalPatchCmdsSize = regularCmdList->kernelMemoryPrefetchEnabled() ? 1 : 0;
size_t expectedWaitCmds = 1 + expectedLoadRegImmCount + additionalPatchCmdsSize;
ASSERT_EQ(expectedWaitCmds, outCbWaitEventCmds.size());
size_t outCbWaitEventCmdsIndex = 0;
for (; outCbWaitEventCmdsIndex < expectedLoadRegImmCount; outCbWaitEventCmdsIndex++) {
EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type);
auto registerNumber = 0x2600 + (4 * outCbWaitEventCmdsIndex);
EXPECT_EQ(registerNumber, outCbWaitEventCmds[outCbWaitEventCmdsIndex].offset);
auto &cmd = outCbWaitEventCmds[outCbWaitEventCmdsIndex + additionalPatchCmdsSize];
ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination);
auto memCmpRet = memcmp(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination, noopedLriBuffer, sizeof(MI_LOAD_REGISTER_IMM));
EXPECT_EQ(CommandToPatch::CbWaitEventLoadRegisterImm, cmd.type);
auto registerNumber = 0x2600 + (4 * outCbWaitEventCmdsIndex);
EXPECT_EQ(registerNumber, cmd.offset);
ASSERT_NE(nullptr, cmd.pDestination);
auto memCmpRet = memcmp(cmd.pDestination, noopedLriBuffer, sizeof(MI_LOAD_REGISTER_IMM));
EXPECT_EQ(0, memCmpRet);
}
EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, outCbWaitEventCmds[outCbWaitEventCmdsIndex].type);
auto &cmd = outCbWaitEventCmds[outCbWaitEventCmdsIndex + additionalPatchCmdsSize];
ASSERT_NE(nullptr, outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination);
auto memCmpRet = memcmp(outCbWaitEventCmds[outCbWaitEventCmdsIndex].pDestination, noopedSemWaitBuffer, sizeof(MI_SEMAPHORE_WAIT));
EXPECT_EQ(CommandToPatch::CbWaitEventSemaphoreWait, cmd.type);
ASSERT_NE(nullptr, cmd.pDestination);
auto memCmpRet = memcmp(cmd.pDestination, noopedSemWaitBuffer, sizeof(MI_SEMAPHORE_WAIT));
EXPECT_EQ(0, memCmpRet);
}

View File

@@ -2908,6 +2908,9 @@ HWTEST2_F(MultiTileSynchronizedDispatchTests, givenFullSyncDispatchWhenAppending
}
auto itor = cmdList.begin();
UnitTestHelper<FamilyType>::skipStatePrefetch(itor);
if (hasDependencySemaphore) {
auto nPartition = std::min(immCmdList->inOrderExecInfo->getNumDevicePartitionsToWait(), partitionCount);
for (uint32_t i = 0; i < nPartition; i++) {
@@ -3527,6 +3530,7 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenAtomicSignallingEnabledWhenWaitingF
ASSERT_EQ(2u + (ImplicitScalingDispatch<FamilyType>::getPipeControlStallRequired() ? 1 : 0), semaphores.size());
auto itor = cmdList.begin();
UnitTestHelper<FamilyType>::skipStatePrefetch(itor);
// implicit dependency
auto gpuAddress = immCmdList2->inOrderExecInfo->getBaseDeviceAddress();
@@ -3582,6 +3586,8 @@ HWTEST2_F(MultiTileInOrderCmdListTests, givenMultiTileInOrderModeWhenProgramming
(cmdStream->getUsed() - offset)));
auto itor = cmdList.begin();
UnitTestHelper<FamilyType>::skipStatePrefetch(itor);
if (immCmdList->isQwordInOrderCounter()) {
std::advance(itor, 2);
}