fix: Apply dispatch all for small TG only on BMG

Signed-off-by: Lukasz Jobczyk <lukasz.jobczyk@intel.com>
This commit is contained in:
Lukasz Jobczyk
2025-01-22 10:33:32 +00:00
committed by Compute-Runtime-Automation
parent 4af92c20e4
commit c0838e1f76
9 changed files with 25 additions and 34 deletions

View File

@ -38,8 +38,6 @@ XE2_HPG_CORETEST_F(WalkerDispatchTestsXe2HpGCore, whenEncodeAdditionalWalkerFiel
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
auto backupCcsNumber = executionEnvironment.rootDeviceEnvironments[0]->getNonLimitedNumberOfCcs();
executionEnvironment.rootDeviceEnvironments[0]->setNonLimitedNumberOfCcs(1);
VariableBackup<uint32_t> sliceCountBackup(&executionEnvironment.rootDeviceEnvironments[0]->getMutableHardwareInfo()->gtSystemInfo.SliceCount, 4);
{
@ -51,7 +49,11 @@ XE2_HPG_CORETEST_F(WalkerDispatchTestsXe2HpGCore, whenEncodeAdditionalWalkerFiel
{
walkerArgs.kernelExecutionType = KernelExecutionType::defaultType;
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());
if (executionEnvironment.rootDeviceEnvironments[0]->getProductHelper().adjustDispatchAllRequired(*executionEnvironment.rootDeviceEnvironments[0]->getHardwareInfo())) {
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());
} else {
EXPECT_FALSE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
}
{
@ -78,14 +80,6 @@ XE2_HPG_CORETEST_F(WalkerDispatchTestsXe2HpGCore, whenEncodeAdditionalWalkerFiel
}
{
walkerCmd.getInterfaceDescriptor().setThreadGroupDispatchSize(FamilyType::INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
executionEnvironment.rootDeviceEnvironments[0]->setNonLimitedNumberOfCcs(2);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_FALSE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
{
executionEnvironment.rootDeviceEnvironments[0]->setNonLimitedNumberOfCcs(backupCcsNumber);
debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.set(1);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());

View File

@ -39,20 +39,18 @@ XE3_CORETEST_F(WalkerDispatchTestsXe3Core, whenEncodeAdditionalWalkerFieldsIsCal
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
auto backupCcsNumber = executionEnvironment.rootDeviceEnvironments[0]->getNonLimitedNumberOfCcs();
executionEnvironment.rootDeviceEnvironments[0]->setNonLimitedNumberOfCcs(1);
VariableBackup<uint32_t> sliceCountBackup(&executionEnvironment.rootDeviceEnvironments[0]->getMutableHardwareInfo()->gtSystemInfo.SliceCount, 4);
{
walkerCmd.getInterfaceDescriptor().setThreadGroupDispatchSize(FamilyType::INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
{
walkerArgs.kernelExecutionType = KernelExecutionType::defaultType;
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());
EXPECT_FALSE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
{
walkerCmd.getInterfaceDescriptor().setThreadGroupDispatchSize(FamilyType::INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_FALSE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
{
@ -79,14 +77,6 @@ XE3_CORETEST_F(WalkerDispatchTestsXe3Core, whenEncodeAdditionalWalkerFieldsIsCal
}
{
walkerCmd.getInterfaceDescriptor().setThreadGroupDispatchSize(FamilyType::INTERFACE_DESCRIPTOR_DATA::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1);
executionEnvironment.rootDeviceEnvironments[0]->setNonLimitedNumberOfCcs(2);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_FALSE(walkerCmd.getComputeDispatchAllWalkerEnable());
}
{
executionEnvironment.rootDeviceEnvironments[0]->setNonLimitedNumberOfCcs(backupCcsNumber);
debugManager.flags.ComputeDispatchAllWalkerEnableInComputeWalker.set(1);
EncodeDispatchKernel<FamilyType>::encodeComputeDispatchAllWalker(walkerCmd, &walkerCmd.getInterfaceDescriptor(), *executionEnvironment.rootDeviceEnvironments[0], walkerArgs);
EXPECT_TRUE(walkerCmd.getComputeDispatchAllWalkerEnable());

View File

@ -32,8 +32,7 @@ uint32_t EncodeDispatchKernel<Family>::alignPreferredSlmSize(uint32_t slmSize) {
template <typename Family>
template <typename WalkerType, typename InterfaceDescriptorType>
void EncodeDispatchKernel<Family>::encodeComputeDispatchAllWalker(WalkerType &walkerCmd, const InterfaceDescriptorType *idd, const RootDeviceEnvironment &rootDeviceEnvironment, const EncodeWalkerArgs &walkerArgs) {
bool computeDispatchAllWalkerEnable = walkerArgs.kernelExecutionType == KernelExecutionType::concurrent || (rootDeviceEnvironment.getNonLimitedNumberOfCcs() == 1u &&
rootDeviceEnvironment.getHardwareInfo()->gtSystemInfo.SliceCount > 2u &&
bool computeDispatchAllWalkerEnable = walkerArgs.kernelExecutionType == KernelExecutionType::concurrent || (rootDeviceEnvironment.getProductHelper().adjustDispatchAllRequired(*rootDeviceEnvironment.getHardwareInfo()) &&
idd &&
idd->getThreadGroupDispatchSize() == InterfaceDescriptorType::THREAD_GROUP_DISPATCH_SIZE_TG_SIZE_1 &&
walkerCmd.getThreadGroupIdXDimension() * walkerCmd.getThreadGroupIdYDimension() * walkerCmd.getThreadGroupIdZDimension() * idd->getNumberOfThreadsInGpgpuThreadGroup() < walkerArgs.maxFrontEndThreads);

View File

@ -347,7 +347,6 @@ void ExecutionEnvironment::setDeviceHierarchyMode(const GfxCoreHelper &gfxCoreHe
// Adjusts the CCS count of a single root device environment by delegating to
// its ProductHelper.
// @param rootDeviceEnvironment environment whose mutable HardwareInfo is handed
//        to the product helper; dereferenced without a null check.
void ExecutionEnvironment::adjustCcsCountImpl(RootDeviceEnvironment *rootDeviceEnvironment) const {
auto hwInfo = rootDeviceEnvironment->getMutableHardwareInfo();
auto &productHelper = rootDeviceEnvironment->getHelper<ProductHelper>();
// Store the currently reported NumberOfCCSEnabled on the environment before
// hwInfo is passed (mutably) to adjustNumberOfCcs.
// NOTE(review): presumably this preserves the pre-adjustment value because
// adjustNumberOfCcs may change it - confirm against the product helper.
rootDeviceEnvironment->setNonLimitedNumberOfCcs(hwInfo->gtSystemInfo.CCSInfo.NumberOfCCSEnabled);
productHelper.adjustNumberOfCcs(*hwInfo);
}

View File

@ -95,8 +95,6 @@ struct RootDeviceEnvironment : NonCopyableClass {
const ProductHelper &getProductHelper() const;
GraphicsAllocation *getDummyAllocation() const;
void releaseDummyAllocation();
void setNonLimitedNumberOfCcs(uint32_t numberOfCss) { this->nonLimitedNumberOfCcs = numberOfCss; };
uint32_t getNonLimitedNumberOfCcs() const { return this->nonLimitedNumberOfCcs; };
std::unique_ptr<SipKernel> sipKernels[static_cast<uint32_t>(SipKernelType::count)];
std::unique_ptr<GmmHelper> gmmHelper;
@ -126,7 +124,6 @@ struct RootDeviceEnvironment : NonCopyableClass {
protected:
using GraphicsAllocationUniquePtrType = std::unique_ptr<GraphicsAllocation, std::function<void(GraphicsAllocation *)>>;
GraphicsAllocationUniquePtrType dummyAllocation = nullptr;
uint32_t nonLimitedNumberOfCcs = 0u;
bool limitedNumberOfCcs = false;
bool isWddmOnLinuxEnable = false;

View File

@ -148,6 +148,7 @@ class ProductHelper {
virtual void adjustScratchSize(size_t &requiredScratchSize) const = 0;
virtual size_t getSvmCpuAlignment() const = 0;
virtual bool isComputeDispatchAllWalkerEnableInCfeStateRequired(const HardwareInfo &hwInfo) const = 0;
virtual bool adjustDispatchAllRequired(const HardwareInfo &hwInfo) const = 0;
virtual bool isVmBindPatIndexProgrammingSupported() const = 0;
virtual bool isIpSamplingSupported(const HardwareInfo &hwInfo) const = 0;
virtual bool isGrfNumReportedWithScm() const = 0;

View File

@ -542,6 +542,11 @@ size_t ProductHelperHw<gfxProduct>::getSvmCpuAlignment() const {
return MemoryConstants::pageSize2M;
}
// Generic implementation of adjustDispatchAllRequired for any gfxProduct:
// always returns false.
// @param hwInfo hardware description; not read by this implementation.
// @return false unconditionally.
template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::adjustDispatchAllRequired(const HardwareInfo &hwInfo) const {
return false;
}
template <PRODUCT_FAMILY gfxProduct>
bool ProductHelperHw<gfxProduct>::isComputeDispatchAllWalkerEnableInCfeStateRequired(const HardwareInfo &hwInfo) const {
return getFrontEndPropertyComputeDispatchAllWalkerSupport();

View File

@ -91,6 +91,7 @@ class ProductHelperHw : public ProductHelper {
void adjustScratchSize(size_t &requiredScratchSize) const override;
size_t getSvmCpuAlignment() const override;
bool isComputeDispatchAllWalkerEnableInCfeStateRequired(const HardwareInfo &hwInfo) const override;
bool adjustDispatchAllRequired(const HardwareInfo &hwInfo) const override;
bool isVmBindPatIndexProgrammingSupported() const override;
bool isIpSamplingSupported(const HardwareInfo &hwInfo) const override;
bool isGrfNumReportedWithScm() const override;

View File

@ -45,6 +45,11 @@ bool ProductHelperHw<gfxProduct>::isDirectSubmissionSupported(ReleaseHelper *rel
return true;
}
// Explicit specialization of adjustDispatchAllRequired for the gfxProduct this
// file is built for.
// NOTE(review): presumably the BMG product, per the commit title - confirm.
// @param hwInfo hardware description; only gtSystemInfo.SliceCount is read.
// @return true when the device reports more than 2 slices, false otherwise.
template <>
bool ProductHelperHw<gfxProduct>::adjustDispatchAllRequired(const HardwareInfo &hwInfo) const {
return hwInfo.gtSystemInfo.SliceCount > 2u;
}
template <>
void ProductHelperHw<gfxProduct>::adjustScratchSize(size_t &requiredScratchSize) const {
requiredScratchSize *= 2;