feature: limit max LWS based on preferred number of workgroups per ss
- limit max LWS size when SLM and barriers are not used

Related-To: GSD-11112
Signed-off-by: Mateusz Hoppe <mateusz.hoppe@intel.com>
This commit is contained in:
parent
8839d62c79
commit
b03f625f03
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2021-2023 Intel Corporation
|
* Copyright (C) 2021-2025 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -101,6 +101,7 @@ WorkSizeInfo createWorkSizeInfoFromDispatchInfo(const DispatchInfo &dispatchInfo
|
||||||
kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion);
|
kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion);
|
||||||
|
|
||||||
wsInfo.setIfUseImg(kernelInfo);
|
wsInfo.setIfUseImg(kernelInfo);
|
||||||
|
wsInfo.setPreferredWgCountPerSubslice(device.getProductHelper().getPreferredWorkgroupCountPerSubslice());
|
||||||
|
|
||||||
return wsInfo;
|
return wsInfo;
|
||||||
}
|
}
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2024 Intel Corporation
|
* Copyright (C) 2018-2025 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -309,6 +309,257 @@ TEST_F(LocalWorkSizeTest, given2DimWorkGroupAndSimdEqual32WhenComputeCalledThenL
|
||||||
EXPECT_EQ(workGroupSize[2], 1u);
|
EXPECT_EQ(workGroupSize[2], 1u);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Checks that, for a SIMD32 kernel created without barriers and without SLM, setting a
// preferred workgroup count of 2 per subslice keeps every computed local work size
// at or below simdSize * numThreadsPerSubSlice / 2.
TEST_F(LocalWorkSizeTest, givenSimdEqual32AndPreferredWgCountPerSubslice2WhenComputeCalledThenLocalGroupSizeIsLimited) {
    DebugManagerStateRestore dbgRestore;
    debugManager.flags.EnableComputeWorkSizeSquared.set(false);
    WorkSizeInfo wsInfo(1024, 0u, 32, 0u, rootDeviceEnvironment, 32u, 0u, false, false, false);
    wsInfo.setPreferredWgCountPerSubslice(2);

    constexpr uint32_t maxLws = 32 * 32 / 2; // simd size * num threadsPerSubslice / preferredWgCountPerSubslice

    uint32_t workDim = 2;
    size_t workGroup[3] = {384, 96, 1};
    size_t workGroupSize[3];

    // wsInfo is handed over by non-const reference and reused by every case below,
    // so the order of the cases is part of the test.

    // Global size 384 x 96
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 128u);
    EXPECT_EQ(workGroupSize[1], 2u);
    EXPECT_EQ(workGroupSize[2], 1u);
    // A local size equal to maxLws is allowed (see the 512 x 1 case below), hence <= and not <.
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size (1024 * 256) x 96
    workGroup[0] = 1024 * 256;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 256u);
    EXPECT_EQ(workGroupSize[1], 1u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 48 x 96
    workGroup[0] = 48;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 16u);
    EXPECT_EQ(workGroupSize[1], 32u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 512 x 1: the expected local size is exactly maxLws
    workGroup[0] = 512;
    workGroup[1] = 1;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 512u);
    EXPECT_EQ(workGroupSize[1], 1u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 12 x 512
    workGroup[0] = 12;
    workGroup[1] = 512;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 4u);
    EXPECT_EQ(workGroupSize[1], 64u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 1 x 384
    workGroup[0] = 1;
    workGroup[1] = 384;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 1u);
    EXPECT_EQ(workGroupSize[1], 128u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 128 x 4, images used
    workGroup[0] = 128;
    workGroup[1] = 4;
    wsInfo.imgUsed = true;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 128u);
    EXPECT_EQ(workGroupSize[1], 4u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 64 x 8, images not used
    workGroup[0] = 64;
    workGroup[1] = 8;
    wsInfo.imgUsed = false;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 64u);
    EXPECT_EQ(workGroupSize[1], 8u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 1024 x 9, images used
    workGroup[0] = 1024;
    workGroup[1] = 9;
    wsInfo.imgUsed = true;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 512u);
    EXPECT_EQ(workGroupSize[1], 1u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);
}
|
||||||
|
|
||||||
|
// Checks that, for a SIMD32 kernel created without barriers and without SLM, setting a
// preferred workgroup count of 4 per subslice keeps every computed local work size
// at or below simdSize * numThreadsPerSubSlice / 4.
TEST_F(LocalWorkSizeTest, givenSimdEqual32AndPreferredWgCountPerSubslice4WhenComputeCalledThenLocalGroupSizeIsLimited) {
    DebugManagerStateRestore dbgRestore;
    debugManager.flags.EnableComputeWorkSizeSquared.set(false);
    WorkSizeInfo wsInfo(1024, 0u, 32, 0u, rootDeviceEnvironment, 32u, 0u, false, false, false);
    wsInfo.setPreferredWgCountPerSubslice(4);

    constexpr uint32_t maxLws = 32 * 32 / 4; // simd size * num threadsPerSubslice / preferredWgCountPerSubslice

    uint32_t workDim = 2;
    size_t workGroup[3] = {384, 96, 1};
    size_t workGroupSize[3];

    // wsInfo is handed over by non-const reference and reused by every case below,
    // so the order of the cases is part of the test.
    // EXPECT_LE is used for the bound so a failure reports both the computed size and maxLws.

    // Global size 384 x 96
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 128u);
    EXPECT_EQ(workGroupSize[1], 2u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size (1024 * 256) x 96
    workGroup[0] = 1024 * 256;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 256u);
    EXPECT_EQ(workGroupSize[1], 1u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 48 x 96
    workGroup[0] = 48;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 16u);
    EXPECT_EQ(workGroupSize[1], 16u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 512 x 1
    workGroup[0] = 512;
    workGroup[1] = 1;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 256u);
    EXPECT_EQ(workGroupSize[1], 1u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 12 x 512
    workGroup[0] = 12;
    workGroup[1] = 512;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 4u);
    EXPECT_EQ(workGroupSize[1], 64u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 1 x 384
    workGroup[0] = 1;
    workGroup[1] = 384;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 1u);
    EXPECT_EQ(workGroupSize[1], 128u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 128 x 4, images used
    workGroup[0] = 128;
    workGroup[1] = 4;
    wsInfo.imgUsed = true;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 64u);
    EXPECT_EQ(workGroupSize[1], 4u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 64 x 8, images not used
    workGroup[0] = 64;
    workGroup[1] = 8;
    wsInfo.imgUsed = false;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 64u);
    EXPECT_EQ(workGroupSize[1], 4u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 1024 x 9, images used
    workGroup[0] = 1024;
    workGroup[1] = 9;
    wsInfo.imgUsed = true;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 256u);
    EXPECT_EQ(workGroupSize[1], 1u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);

    // Global size 2048 x 1, images not used
    workGroup[0] = 2048;
    workGroup[1] = 1;
    wsInfo.imgUsed = false;
    NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
    EXPECT_EQ(workGroupSize[0], 256u);
    EXPECT_EQ(workGroupSize[1], 1u);
    EXPECT_EQ(workGroupSize[2], 1u);
    EXPECT_LE(workGroupSize[0] * workGroupSize[1], maxLws);
}
|
||||||
|
|
||||||
|
// Runs the same set of global sizes twice: first with barriers enabled, then with barriers
// disabled and a non-zero SLM size. In both passes the expected local work sizes go beyond
// the 32 * 32 / 4 bound checked by the sibling "IsLimited" test.
TEST_F(LocalWorkSizeTest, givenSimdEqual32AndPreferredWgCountPerSubslice4WhenBarriersOrSlmUsedThenLocalGroupSizeIsNotLimited) {
    DebugManagerStateRestore dbgRestore;
    debugManager.flags.EnableComputeWorkSizeSquared.set(false);
    WorkSizeInfo wsInfo(1024, 0u, 32, 0u, rootDeviceEnvironment, 32u, 0u, false, false, false);
    wsInfo.setPreferredWgCountPerSubslice(4);

    // false -> pass with barriers only, true -> pass with SLM only
    const bool slmPasses[] = {false, true};
    for (const bool slmPass : slmPasses) {
        wsInfo.hasBarriers = !slmPass;
        if (slmPass) {
            wsInfo.slmTotalSize = 256;
        }

        uint32_t workDim = 2;
        size_t workGroup[3] = {384, 96, 1};
        size_t workGroupSize[3];

        // Global size 384 x 96
        NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
        if (wsInfo.slmTotalSize != 0) {
            // use ratio in algorithm
            EXPECT_EQ(workGroupSize[0], 64u);
            EXPECT_EQ(workGroupSize[1], 16u);
            EXPECT_EQ(workGroupSize[2], 1u);
        } else {
            EXPECT_EQ(workGroupSize[0], 384u);
            EXPECT_EQ(workGroupSize[1], 2u);
            EXPECT_EQ(workGroupSize[2], 1u);
        }

        // Global size (1024 * 256) x 96
        workGroup[0] = 1024 * 256;
        NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
        EXPECT_EQ(workGroupSize[0], 512u);
        EXPECT_EQ(workGroupSize[1], 2u);
        EXPECT_EQ(workGroupSize[2], 1u);

        // Global size 48 x 96
        workGroup[0] = 48;
        NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
        EXPECT_EQ(workGroupSize[0], 48u);
        EXPECT_EQ(workGroupSize[1], 16u);
        EXPECT_EQ(workGroupSize[2], 1u);

        // Global size 128 x 4, images used
        workGroup[0] = 128;
        workGroup[1] = 4;
        wsInfo.imgUsed = true;
        NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
        EXPECT_EQ(workGroupSize[0], 128u);
        EXPECT_EQ(workGroupSize[1], 4u);
        EXPECT_EQ(workGroupSize[2], 1u);

        // Global size 1024 x 9, images not used
        workGroup[0] = 1024;
        workGroup[1] = 9;
        wsInfo.imgUsed = false;
        NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
        if (wsInfo.slmTotalSize != 0) {
            EXPECT_EQ(workGroupSize[0], 256u);
            EXPECT_EQ(workGroupSize[1], 3u);
            EXPECT_EQ(workGroupSize[2], 1u);
        } else {
            EXPECT_EQ(workGroupSize[0], 512u);
            EXPECT_EQ(workGroupSize[1], 1u);
            EXPECT_EQ(workGroupSize[2], 1u);
        }

        // Global size 2048 x 2, images not used
        workGroup[0] = 2048;
        workGroup[1] = 2;
        wsInfo.imgUsed = false;
        NEO::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
        EXPECT_EQ(workGroupSize[0], 512u);
        EXPECT_EQ(workGroupSize[1], 2u);
        EXPECT_EQ(workGroupSize[2], 1u);
    }
}
|
||||||
|
|
||||||
TEST_F(LocalWorkSizeTest, given3DimWorkGroupAndSimdEqual8WhenComputeCalledThenLocalGroupComputed) {
|
TEST_F(LocalWorkSizeTest, given3DimWorkGroupAndSimdEqual8WhenComputeCalledThenLocalGroupComputed) {
|
||||||
WorkSizeInfo wsInfo(256, 0u, 8, 0u, rootDeviceEnvironment, 56u, 0u, false, false, false);
|
WorkSizeInfo wsInfo(256, 0u, 8, 0u, rootDeviceEnvironment, 56u, 0u, false, false, false);
|
||||||
uint32_t workDim = 3;
|
uint32_t workDim = 3;
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2018-2023 Intel Corporation
|
* Copyright (C) 2018-2025 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -360,6 +360,9 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const
|
||||||
workGroupSize[i] = 1;
|
workGroupSize[i] = 1;
|
||||||
|
|
||||||
UNRECOVERABLE_IF(wsInfo.simdSize == 0);
|
UNRECOVERABLE_IF(wsInfo.simdSize == 0);
|
||||||
|
uint64_t totalNumberOfItems = workItems[0] * workItems[1] * workItems[2];
|
||||||
|
auto optimalWgThreadCount = optimalHardwareThreadCountGeneric[0];
|
||||||
|
bool totalRequiredThreadGroupsMoreThanSingleThreadGroup = totalNumberOfItems > wsInfo.simdSize * optimalWgThreadCount;
|
||||||
|
|
||||||
// Find biggest power of two which divides each dimension size
|
// Find biggest power of two which divides each dimension size
|
||||||
if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) {
|
if (wsInfo.slmTotalSize == 0 && !wsInfo.hasBarriers) {
|
||||||
|
@ -367,9 +370,14 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const
|
||||||
return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
|
return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (wsInfo.preferredWgCountPerSubSlice != 0 && wsInfo.simdSize == 32 && totalRequiredThreadGroupsMoreThanSingleThreadGroup) {
|
||||||
|
optimalWgThreadCount = std::min(optimalWgThreadCount, wsInfo.numThreadsPerSubSlice / wsInfo.preferredWgCountPerSubSlice);
|
||||||
|
wsInfo.maxWorkGroupSize = wsInfo.simdSize * optimalWgThreadCount;
|
||||||
|
}
|
||||||
|
|
||||||
size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
|
size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
|
||||||
for (auto i = 0u; i < workDim; i++) {
|
for (auto i = 0u; i < workDim; i++) {
|
||||||
uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]);
|
uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalWgThreadCount);
|
||||||
while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
|
while (requiredWorkItemsCount > 1 && !(Math::isDivisibleByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
|
||||||
requiredWorkItemsCount >>= 1;
|
requiredWorkItemsCount >>= 1;
|
||||||
itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
|
itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
|
||||||
|
@ -382,7 +390,7 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const
|
||||||
// If computed dimension sizes which are powers of two are creating group which is
|
// If computed dimension sizes which are powers of two are creating group which is
|
||||||
// bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it
|
// bigger than maxWorkGroupSize or this group would create more than optimal hardware threads then downsize it
|
||||||
uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2];
|
uint64_t allItems = itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] * itemsPowerOfTwoDivisors[2];
|
||||||
if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalHardwareThreadCountGeneric[0])) {
|
if (allItems > wsInfo.simdSize && (allItems > wsInfo.maxWorkGroupSize || allItems > wsInfo.simdSize * optimalWgThreadCount)) {
|
||||||
return computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4);
|
return computePowerOfTwoLWS(itemsPowerOfTwoDivisors, wsInfo, workGroupSize, workDim, canUseNx4);
|
||||||
}
|
}
|
||||||
// If computed workgroup is at this point in correct size
|
// If computed workgroup is at this point in correct size
|
||||||
|
@ -394,9 +402,8 @@ void computeWorkgroupSizeND(WorkSizeInfo &wsInfo, size_t workGroupSize[3], const
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t totalNuberOfItems = workItems[0] * workItems[1] * workItems[2];
|
|
||||||
// If dimensions are not powers of two but total number of items is less than max work group size
|
// If dimensions are not powers of two but total number of items is less than max work group size
|
||||||
if (totalNuberOfItems <= wsInfo.maxWorkGroupSize) {
|
if (totalNumberOfItems <= wsInfo.maxWorkGroupSize) {
|
||||||
for (auto i = 0u; i < workDim; i++)
|
for (auto i = 0u; i < workDim; i++)
|
||||||
workGroupSize[i] = workItems[i];
|
workGroupSize[i] = workItems[i];
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -95,6 +95,7 @@ class ProductHelper {
|
||||||
virtual bool isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const = 0;
|
virtual bool isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const = 0;
|
||||||
virtual uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const = 0;
|
virtual uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const = 0;
|
||||||
virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const = 0;
|
virtual uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const = 0;
|
||||||
|
virtual uint32_t getPreferredWorkgroupCountPerSubslice() const = 0;
|
||||||
virtual void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
|
virtual void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
|
||||||
virtual void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
|
virtual void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const = 0;
|
||||||
virtual bool obtainBlitterPreference(const HardwareInfo &hwInfo) const = 0;
|
virtual bool obtainBlitterPreference(const HardwareInfo &hwInfo) const = 0;
|
||||||
|
|
|
@ -247,6 +247,11 @@ uint32_t ProductHelperHw<gfxProduct>::getMaxThreadsForWorkgroup(const HardwareIn
|
||||||
return maxNumEUsPerSubSlice * numThreadsPerEU;
|
return maxNumEUsPerSubSlice * numThreadsPerEU;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <PRODUCT_FAMILY gfxProduct>
|
||||||
|
uint32_t ProductHelperHw<gfxProduct>::getPreferredWorkgroupCountPerSubslice() const {
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
|
||||||
template <PRODUCT_FAMILY gfxProduct>
|
template <PRODUCT_FAMILY gfxProduct>
|
||||||
void ProductHelperHw<gfxProduct>::setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const {}
|
void ProductHelperHw<gfxProduct>::setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const {}
|
||||||
|
|
||||||
|
|
|
@ -35,6 +35,7 @@ class ProductHelperHw : public ProductHelper {
|
||||||
bool isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const override;
|
bool isMaxThreadsForWorkgroupWARequired(const HardwareInfo &hwInfo) const override;
|
||||||
uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const override;
|
uint32_t getMaxThreadsForWorkgroupInDSSOrSS(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice, uint32_t maxNumEUsPerDualSubSlice) const override;
|
||||||
uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const override;
|
uint32_t getMaxThreadsForWorkgroup(const HardwareInfo &hwInfo, uint32_t maxNumEUsPerSubSlice) const override;
|
||||||
|
uint32_t getPreferredWorkgroupCountPerSubslice() const override;
|
||||||
void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const override;
|
void setForceNonCoherent(void *const commandPtr, const StateComputeModeProperties &properties) const override;
|
||||||
void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const override;
|
void updateScmCommand(void *const commandPtr, const StateComputeModeProperties &properties) const override;
|
||||||
bool obtainBlitterPreference(const HardwareInfo &hwInfo) const override;
|
bool obtainBlitterPreference(const HardwareInfo &hwInfo) const override;
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2023-2024 Intel Corporation
|
* Copyright (C) 2023-2025 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -73,4 +73,8 @@ void WorkSizeInfo::checkRatio(const size_t workItems[3]) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void WorkSizeInfo::setPreferredWgCountPerSubslice(uint32_t preferredWgCount) {
|
||||||
|
preferredWgCountPerSubSlice = preferredWgCount;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace NEO
|
} // namespace NEO
|
||||||
|
|
|
@ -1,5 +1,5 @@
|
||||||
/*
|
/*
|
||||||
* Copyright (C) 2023 Intel Corporation
|
* Copyright (C) 2023-2025 Intel Corporation
|
||||||
*
|
*
|
||||||
* SPDX-License-Identifier: MIT
|
* SPDX-License-Identifier: MIT
|
||||||
*
|
*
|
||||||
|
@ -29,12 +29,14 @@ struct WorkSizeInfo {
|
||||||
bool useRatio = false;
|
bool useRatio = false;
|
||||||
bool useStrictRatio = false;
|
bool useStrictRatio = false;
|
||||||
float targetRatio = 0;
|
float targetRatio = 0;
|
||||||
|
uint32_t preferredWgCountPerSubSlice = 0;
|
||||||
|
|
||||||
WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, const RootDeviceEnvironment &rootDeviceEnvironment, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface, bool disableEUFusion);
|
WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, const RootDeviceEnvironment &rootDeviceEnvironment, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface, bool disableEUFusion);
|
||||||
|
|
||||||
void setIfUseImg(const KernelInfo &kernelInfo);
|
void setIfUseImg(const KernelInfo &kernelInfo);
|
||||||
void setMinWorkGroupSize(const RootDeviceEnvironment &rootDeviceEnvironment, bool disableEUFusion);
|
void setMinWorkGroupSize(const RootDeviceEnvironment &rootDeviceEnvironment, bool disableEUFusion);
|
||||||
void checkRatio(const size_t workItems[3]);
|
void checkRatio(const size_t workItems[3]);
|
||||||
|
void setPreferredWgCountPerSubslice(uint32_t preferredWgCount);
|
||||||
};
|
};
|
||||||
|
|
||||||
} // namespace NEO
|
} // namespace NEO
|
|
@ -1166,3 +1166,7 @@ HWTEST2_F(ProductHelperTest, givenProductHelperWhenCallingIsResourceUncachedForC
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// The product helper under test is expected to report 0 as its preferred workgroup count per subslice.
HWTEST_F(ProductHelperTest, givenProductHelperWhenGettingPreferredWorkgroupCountPerSubsliceThenZeroReturned) {
    const auto preferredWgCount = productHelper->getPreferredWorkgroupCountPerSubslice();
    EXPECT_EQ(0u, preferredWgCount);
}
|
||||||
|
|
Loading…
Reference in New Issue