refactor: add setupTimestampPacketFlushL3 function

Signed-off-by: Kamil Kopryk <kamil.kopryk@intel.com>
This commit is contained in:
Kamil Kopryk
2025-03-19 12:15:31 +00:00
committed by Compute-Runtime-Automation
parent c1d184fade
commit 73795ced64
19 changed files with 109 additions and 22 deletions

View File

@@ -1334,8 +1334,14 @@ bool CommandQueue::isWaitForTimestampsEnabled() const {
auto &productHelper = getDevice().getProductHelper();
auto enabled = CommandQueue::isTimestampWaitEnabled();
enabled &= productHelper.isTimestampWaitSupportedForQueues(false);
enabled &= !productHelper.isDcFlushAllowed();
enabled &= productHelper.isTimestampWaitSupportedForQueues(this->heaplessModeEnabled);
if (productHelper.isL3FlushAfterPostSyncRequired(this->heaplessModeEnabled)) {
enabled &= true;
} else {
enabled &= !productHelper.isDcFlushAllowed();
}
enabled &= !getDevice().getRootDeviceEnvironment().isWddmOnLinux();
enabled &= !this->isOOQEnabled(); // TSP for OOQ dispatch is optional. We need to wait for task count.

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2024 Intel Corporation
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -69,6 +69,13 @@ class GpgpuWalkerHelper {
TagNodeBase *timestampPacketNode,
const RootDeviceEnvironment &rootDeviceEnvironment);
// Static helper, templated on the walker command type, that receives a walker
// command together with the product helper and two L3-flush-after-post-sync flags.
// Parameters:
//   walkerCmd                                  - walker command passed in by pointer.
//   productHelper                              - product helper passed by const reference.
//   flushL3AfterPostSyncForHostUsm             - flag for the host USM case.
//   flushL3AfterPostSyncForExternalAllocation  - flag for the external allocation case.
// NOTE(review): the exact meaning of both flags is inferred from their names only;
// confirm against the definition(s) of this function.
template <typename WalkerType>
static void setupTimestampPacketFlushL3(
WalkerType *walkerCmd,
const ProductHelper &productHelper,
bool flushL3AfterPostSyncForHostUsm,
bool flushL3AfterPostSyncForExternalAllocation);
static void adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd);
private:

View File

@@ -160,4 +160,11 @@ size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(bool reserveProfi
return size;
}
// Definition of GpgpuWalkerHelper<GfxFamily>::setupTimestampPacketFlushL3.
// The body is empty: no argument is read and walkerCmd is left untouched.
// NOTE(review): presumably other definitions of this function do the actual
// programming for specific platforms - confirm.
template <typename GfxFamily>
template <typename WalkerType>
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacketFlushL3(
    WalkerType *walkerCmd,
    const ProductHelper &productHelper,
    bool flushL3AfterPostSyncForHostUsm,
    bool flushL3AfterPostSyncForExternalAllocation) {
    // Nothing is programmed by this definition.
}
} // namespace NEO

View File

@@ -84,19 +84,35 @@ inline void HardwareInterface<GfxFamily>::programWalker(
auto &device = commandQueue.getDevice();
auto &rootDeviceEnvironment = device.getRootDeviceEnvironment();
bool kernelSystemAllocation = false;
if (kernel.isBuiltIn) {
kernelSystemAllocation = kernel.getDestinationAllocationInSystemMemory();
} else {
kernelSystemAllocation = kernel.isAnyKernelArgumentUsingSystemMemory();
}
TagNodeBase *timestampPacketNode = nullptr;
if (walkerArgs.currentTimestampPacketNodes && (walkerArgs.currentTimestampPacketNodes->peekNodes().size() > walkerArgs.currentDispatchIndex)) {
timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes()[walkerArgs.currentDispatchIndex];
}
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
if (timestampPacketNode) {
GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacket<WalkerType>(&commandStream, &walkerCmd, timestampPacketNode, rootDeviceEnvironment);
if constexpr (heaplessModeEnabled) {
auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
bool flushL3AfterPostSyncForHostUsm = kernelSystemAllocation;
bool flushL3AfterPostSyncForExternalAllocation = kernel.isUsingSharedObjArgs();
GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacketFlushL3<WalkerType>(&walkerCmd, productHelper, flushL3AfterPostSyncForHostUsm, flushL3AfterPostSyncForExternalAllocation);
}
}
auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
constexpr bool heaplessModeEnabled = GfxFamily::template isHeaplessMode<WalkerType>();
if constexpr (heaplessModeEnabled == false) {
if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.kernelHeapSize, 0, rootDeviceEnvironment);
@@ -135,13 +151,6 @@ inline void HardwareInterface<GfxFamily>::programWalker(
scratchAddress,
device);
bool kernelSystemAllocation = false;
if (kernel.isBuiltIn) {
kernelSystemAllocation = kernel.getDestinationAllocationInSystemMemory();
} else {
kernelSystemAllocation = kernel.isAnyKernelArgumentUsingSystemMemory();
}
EncodeWalkerArgs encodeWalkerArgs{
.kernelExecutionType = kernel.getExecutionType(),
.requiredDispatchWalkOrder = kernelAttributes.dispatchWalkOrder,

View File

@@ -297,6 +297,7 @@ template void HardwareInterface<Family>::dispatchKernelCommands<Family::DefaultW
template Family::DefaultWalkerType *HardwareInterface<Family>::allocateWalkerSpace<Family::DefaultWalkerType>(LinearStream &commandStream, const Kernel &kernel);
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacketFlushL3<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const ProductHelper &productHelper, bool flushL3AfterPostSyncForHostUsm, bool flushL3AfterPostSyncForExternalAllocation);
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::DefaultWalkerType>(LinearStream *cmdStream, Family::DefaultWalkerType *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);

View File

@@ -89,4 +89,8 @@ bool ApiSpecificConfig::isGlobalStatelessEnabled(const RootDeviceEnvironment &ro
return false;
}
// Always returns true in this API-specific configuration.
bool ApiSpecificConfig::isUpdateTagFromWaitEnabledForHeapless() {
    constexpr bool updateTagFromWaitEnabled = true;
    return updateTagFromWaitEnabled;
}
} // namespace NEO

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2024 Intel Corporation
* Copyright (C) 2024-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -15,6 +15,8 @@ namespace NEO {
using Family = Xe2HpgCoreFamily;
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacketFlushL3<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const ProductHelper &productHelper, bool flushL3AfterPostSyncForHostUsm, bool flushL3AfterPostSyncForExternalAllocation);
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::DefaultWalkerType>(LinearStream *cmdStream, Family::DefaultWalkerType *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);

View File

@@ -16,6 +16,8 @@ namespace NEO {
using Family = Xe3CoreFamily;
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacketFlushL3<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const ProductHelper &productHelper, bool flushL3AfterPostSyncForHostUsm, bool flushL3AfterPostSyncForExternalAllocation);
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::DefaultWalkerType>(LinearStream *cmdStream, Family::DefaultWalkerType *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,8 @@ void GpgpuWalkerHelper<Family>::setSystolicModeEnable(Family::COMPUTE_WALKER *wa
}
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacketFlushL3<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const ProductHelper &productHelper, bool flushL3AfterPostSyncForHostUsm, bool flushL3AfterPostSyncForExternalAllocation);
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::DefaultWalkerType>(LinearStream *cmdStream, Family::DefaultWalkerType *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -23,6 +23,8 @@ void GpgpuWalkerHelper<Family>::setSystolicModeEnable(Family::COMPUTE_WALKER *wa
}
template class GpgpuWalkerHelper<Family>;
template void GpgpuWalkerHelper<Family>::setupTimestampPacketFlushL3<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const ProductHelper &productHelper, bool flushL3AfterPostSyncForHostUsm, bool flushL3AfterPostSyncForExternalAllocation);
template void GpgpuWalkerHelper<Family>::setupTimestampPacket<Family::DefaultWalkerType>(LinearStream *cmdStream, Family::DefaultWalkerType *walkerCmd, TagNodeBase *timestampPacketNode, const RootDeviceEnvironment &rootDeviceEnvironment);
template size_t GpgpuWalkerHelper<Family>::setGpgpuWalkerThreadData<Family::DefaultWalkerType>(Family::DefaultWalkerType *walkerCmd, const KernelDescriptor &kernelDescriptor, const size_t startWorkGroups[3],
const size_t numWorkGroups[3], const size_t localWorkSizesIn[3], uint32_t simd, uint32_t workDim, bool localIdsGenerationByRuntime, bool inlineDataProgrammingRequired, uint32_t requiredWorkGroupOrder);

View File

@@ -170,7 +170,18 @@ TEST(CommandQueue, givenEnableTimestampWaitWhenCheckIsTimestampWaitEnabledThenRe
{
debugManager.flags.EnableTimestampWaitForQueues.set(-1);
const auto &productHelper = mockDevice->getProductHelper();
EXPECT_EQ(cmdQ.isWaitForTimestampsEnabled(), productHelper.isTimestampWaitSupportedForQueues(false) && !productHelper.isDcFlushAllowed());
const auto &compilerProductHelper = mockDevice->getCompilerProductHelper();
bool heaplessEnabled = compilerProductHelper.isHeaplessModeEnabled();
auto enabled = productHelper.isTimestampWaitSupportedForQueues(heaplessEnabled);
if (productHelper.isL3FlushAfterPostSyncRequired(heaplessEnabled)) {
enabled &= true;
} else {
enabled &= !productHelper.isDcFlushAllowed();
}
EXPECT_EQ(enabled, cmdQ.isWaitForTimestampsEnabled());
}
{

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2018-2024 Intel Corporation
* Copyright (C) 2018-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -2015,7 +2015,7 @@ TEST(EventTimestampTest, givenEnableTimestampWaitWhenCheckIsTimestampWaitEnabled
{
debugManager.flags.EnableTimestampWaitForEvents.set(-1);
const auto &productHelper = mockDevice->getRootDeviceEnvironment().getHelper<ProductHelper>();
EXPECT_EQ(event.isWaitForTimestampsEnabled(), productHelper.isTimestampWaitSupportedForEvents());
EXPECT_EQ(event.isWaitForTimestampsEnabled(), productHelper.isTimestampWaitSupportedForEvents() && cmdQ.isTimestampWaitEnabled());
}
{

View File

@@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@@ -80,4 +80,8 @@ TEST(ApiSpecificConfigOclTests, WhenCheckingIfCompilerCacheIsEnabledByDefaultThe
EXPECT_EQ(1u, ApiSpecificConfig::compilerCacheDefaultEnabled());
}
// Checks that ApiSpecificConfig::isUpdateTagFromWaitEnabledForHeapless() returns true.
TEST(ApiSpecificConfigOclTests, WhenCheckingIsUpdateTagFromWaitEnabledForHeaplessThenTrueIsReturned) {
    const bool updateTagFromWaitEnabled = ApiSpecificConfig::isUpdateTagFromWaitEnabledForHeapless();
    EXPECT_TRUE(updateTagFromWaitEnabled);
}
} // namespace NEO