performance: Set dispatch all for small TG

Resolves: NEO-11814

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-01-17 15:14:17 +00:00
committed by Compute-Runtime-Automation
parent 143bcc74c8
commit af0bb7964a
13 changed files with 133 additions and 30 deletions

View File

@ -243,8 +243,8 @@ struct EncodeDispatchKernel {
static void overrideDefaultValues(WalkerType &walkerCmd, InterfaceDescriptorType &interfaceDescriptor);
template <typename WalkerType>
static void encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template <typename WalkerType>
static void encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template <typename WalkerType, typename InterfaceDescriptorType>
static void encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs);
};
template <typename GfxFamily>

View File

@ -287,7 +287,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
args.requiresSystemMemoryFence()}; // requiredSystemFence
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, cmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(cmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(cmd, walkerArgs);
EncodeDispatchKernel<Family>::template encodeComputeDispatchAllWalker<WalkerType, INTERFACE_DESCRIPTOR_DATA>(cmd, nullptr, rootDeviceEnvironment, walkerArgs);
memcpy_s(iddPtr, sizeof(idd), &idd, sizeof(idd));
@ -419,8 +419,8 @@ template <typename WalkerType>
inline void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
template <typename Family>
template <typename WalkerType>
inline void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
template <typename WalkerType, typename InterfaceDescriptorType>
inline void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {}
template <typename Family>
template <typename InterfaceDescriptorType>

View File

@ -33,7 +33,7 @@ template void NEO::EncodeDispatchKernel<Family>::setWalkerRegionSettings<Family:
uint32_t workgroupSize, uint32_t threadGroupCount, uint32_t maxWgCountPerTile, bool requiredDispatchWalkOrder);
template void NEO::EncodeDispatchKernel<Family>::overrideDefaultValues<Family::DefaultWalkerType, Family::DefaultWalkerType::InterfaceDescriptorType>(Family::DefaultWalkerType &walkerCmd, Family::DefaultWalkerType::InterfaceDescriptorType &interfaceDescriptor);
template void NEO::EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void NEO::EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker<Family::DefaultWalkerType>(Family::DefaultWalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs);
template void NEO::EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker<Family::DefaultWalkerType, Family::DefaultWalkerType::InterfaceDescriptorType>(Family::DefaultWalkerType &walkerCmd, const Family::DefaultWalkerType::InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs);
template struct NEO::EncodeStates<Family>;
template struct NEO::EncodeMath<Family>;

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2024 Intel Corporation
* Copyright (C) 2024-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -30,9 +30,13 @@ uint32_t EncodeDispatchKernel<Family>::alignPreferredSlmSize(uint32_t slmSize) {
}
template <typename Family>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
bool computeDispatchAllWalkerEnable = walkerArgs.kernelExecutionType == KernelExecutionType::concurrent;
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {
bool computeDispatchAllWalkerEnable = walkerArgs.kernelExecutionType == KernelExecutionType::concurrent || (rootDeviceEnvironment.getNonLimitedNumberOfCcs() == 1u &&
rootDeviceEnvironment.getHardwareInfo()->gtSystemInfo.SliceCount > 2u &&
idd &&
idd->getThreadGroupDispatchSize() == InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1 &&
walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension() * idd->getNumberOfThreadsInGpgpuThreadGroup() < walkerArgs.maxFrontEndThreads);
int32_t overrideComputeDispatchAllWalkerEnable = debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.get();
if (overrideComputeDispatchAllWalkerEnable != -1) {
computeDispatchAllWalkerEnable = !!overrideComputeDispatchAllWalkerEnable;

View File

@ -411,7 +411,7 @@ void EncodeDispatchKernel<Family>::encode(CommandContainer &container, EncodeDis
args.requiresSystemMemoryFence()}; // requiresMemoryFence
EncodeDispatchKernel<Family>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(walkerCmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(walkerCmd, walkerArgs);
EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(walkerCmd, &idd, rootDeviceEnvironment, walkerArgs);
EncodeDispatchKernel<Family>::overrideDefaultValues(walkerCmd, idd);

View File

@ -329,6 +329,7 @@ void ExecutionEnvironment::setDeviceHierarchyMode(const GfxCoreHelper &gfxCoreHe
// Stores the CCS count currently reported by the root device environment's
// hardware info via setNonLimitedNumberOfCcs(), then hands the same mutable
// hardware info to ProductHelper::adjustNumberOfCcs().
// NOTE(review): adjustNumberOfCcs() is not visible here; presumably it may
// change NumberOfCCSEnabled, which is why the value is recorded first - confirm.
void ExecutionEnvironment::adjustCcsCountImpl(RootDeviceEnvironment *rootDeviceEnvironment) const {
    auto hardwareInfo = rootDeviceEnvironment->getMutableHardwareInfo();

    // Record the count before the product helper is invoked on this hardware info.
    const auto enabledCcsCount = hardwareInfo->gtSystemInfo.CCSInfo.NumberOfCCSEnabled;
    rootDeviceEnvironment->setNonLimitedNumberOfCcs(enabledCcsCount);

    auto &helper = rootDeviceEnvironment->getHelper<ProductHelper>();
    helper.adjustNumberOfCcs(*hardwareInfo);
}

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2019-2024 Intel Corporation
* Copyright (C) 2019-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -95,6 +95,8 @@ struct RootDeviceEnvironment : NonCopyableClass {
const ProductHelper &getProductHelper() const;
GraphicsAllocation *getDummyAllocation() const;
void releaseDummyAllocation();
// Stores the supplied CCS count in nonLimitedNumberOfCcs.
void setNonLimitedNumberOfCcs(uint32_t numberOfCss) {
    this->nonLimitedNumberOfCcs = numberOfCss;
}
// Returns the value last stored by setNonLimitedNumberOfCcs().
uint32_t getNonLimitedNumberOfCcs() const {
    return this->nonLimitedNumberOfCcs;
}
std::unique_ptr<SipKernel> sipKernels[static_cast<uint32_t>(SipKernelType::count)];
std::unique_ptr<GmmHelper> gmmHelper;
@ -124,6 +126,7 @@ struct RootDeviceEnvironment : NonCopyableClass {
protected:
using GraphicsAllocationUniquePtrType = std::unique_ptr<GraphicsAllocation, std::function<void(GraphicsAllocation *)>>;
GraphicsAllocationUniquePtrType dummyAllocation = nullptr;
uint32_t nonLimitedNumberOfCcs = 0u;
bool limitedNumberOfCcs = false;
bool isWddmOnLinuxEnable = false;

View File

@ -148,8 +148,8 @@ void EncodeDispatchKernel<Family>::adjustBindingTablePrefetch(INTERFACE_DESCRIPT
}
template <>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {
int32_t overrideDispatchAllWalkerEnableInComputeWalker = debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.get();
if (overrideDispatchAllWalkerEnableInComputeWalker != -1) {
walkerCmd.setComputeDispatchAllWalkerEnable(overrideDispatchAllWalkerEnableInComputeWalker);

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -62,8 +62,8 @@ template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeWalkerPostSyncFields(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
template <>
template <typename WalkerType>
void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const EncodeWalkerArgs &walkerArgs) {}
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {}
template <>
void EncodeComputeMode<Family>::programComputeModeCommand(LinearStream &csr, StateComputeModeProperties &properties, const RootDeviceEnvironment &rootDeviceEnvironment) {

View File

@ -1,5 +1,5 @@
/*
* Copyright (C) 2021-2024 Intel Corporation
* Copyright (C) 2021-2025 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
@ -21,6 +21,7 @@ using WalkerDispatchTestsXeHpcCore = ::testing::Test;
XE_HPC_CORETEST_F(WalkerDispatchTestsXeHpcCore, givenXeHpcWhenEncodeAdditionalWalkerFieldsIsCalledThenComputeDispatchAllIsCorrectlySet) {
DebugManagerStateRestore debugRestorer;
MockExecutionEnvironment executionEnvironment;
auto walkerCmd = FamilyType::cmdInitGpgpuWalker;
KernelDescriptor kernelDescriptor;
@ -28,13 +29,13 @@ XE_HPC_CORETEST_F(WalkerDispatchTestsXeHpcCore, givenXeHpcWhenEncodeAdditionalWa
walkerArgs.requiredSystemFence = true;
{
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, walkerArgs);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_FALSE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
{
debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.set(1);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, walkerArgs);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
}