Allow squared algorithm to work together with base one.
Change-Id: I9087957bb427a422b1be632f6375c96b8f91a492
This commit is contained in:
parent
031b537e2b
commit
7640201585
|
@ -294,6 +294,37 @@ void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3],
|
|||
}
|
||||
}
|
||||
|
||||
void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
|
||||
for (int i = 0; i < 3; i++)
|
||||
workGroupSize[i] = 1;
|
||||
size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
|
||||
for (auto i = 0u; i < workDim; i++) {
|
||||
uint32_t requiredWorkItemsCount = maxWorkGroupSize;
|
||||
while (requiredWorkItemsCount > 1 && !(Math::isDivisableByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
|
||||
requiredWorkItemsCount = requiredWorkItemsCount >> 1;
|
||||
itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
|
||||
}
|
||||
if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) {
|
||||
while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) {
|
||||
if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1])
|
||||
itemsPowerOfTwoDivisors[0] = itemsPowerOfTwoDivisors[0] >> 1;
|
||||
else
|
||||
itemsPowerOfTwoDivisors[1] = itemsPowerOfTwoDivisors[1] >> 1;
|
||||
}
|
||||
for (auto i = 0u; i < 3; i++)
|
||||
workGroupSize[i] = itemsPowerOfTwoDivisors[i];
|
||||
return;
|
||||
|
||||
} else if (workItems[0] * workItems[1] > maxWorkGroupSize) {
|
||||
computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
|
||||
return;
|
||||
} else {
|
||||
for (auto i = 0u; i < workDim; i++)
|
||||
workGroupSize[i] = workItems[i];
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
|
||||
for (int i = 0; i < 3; i++)
|
||||
workGroupSize[i] = 1;
|
||||
|
@ -303,8 +334,11 @@ void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const
|
|||
return;
|
||||
} else {
|
||||
//Find the biggest power of two which divides each dimension size
|
||||
|
||||
if (wsInfo.slmTotalSize == 0 && wsInfo.hasBarriers == 0) {
|
||||
if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) {
|
||||
return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
|
||||
}
|
||||
|
||||
size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
|
||||
for (auto i = 0u; i < workDim; i++) {
|
||||
uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]);
|
||||
|
@ -370,37 +404,6 @@ void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const
|
|||
}
|
||||
}
|
||||
|
||||
void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
|
||||
for (int i = 0; i < 3; i++)
|
||||
workGroupSize[i] = 1;
|
||||
size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
|
||||
for (auto i = 0u; i < workDim; i++) {
|
||||
uint32_t requiredWorkItemsCount = maxWorkGroupSize;
|
||||
while (requiredWorkItemsCount > 1 && !(Math::isDivisableByPowerOfTwoDivisor(uint32_t(workItems[i]), requiredWorkItemsCount)))
|
||||
requiredWorkItemsCount = requiredWorkItemsCount >> 1;
|
||||
itemsPowerOfTwoDivisors[i] = requiredWorkItemsCount;
|
||||
}
|
||||
if (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] >= maxWorkGroupSize) {
|
||||
while (itemsPowerOfTwoDivisors[0] * itemsPowerOfTwoDivisors[1] > maxWorkGroupSize) {
|
||||
if (itemsPowerOfTwoDivisors[0] > itemsPowerOfTwoDivisors[1])
|
||||
itemsPowerOfTwoDivisors[0] = itemsPowerOfTwoDivisors[0] >> 1;
|
||||
else
|
||||
itemsPowerOfTwoDivisors[1] = itemsPowerOfTwoDivisors[1] >> 1;
|
||||
}
|
||||
for (auto i = 0u; i < 3; i++)
|
||||
workGroupSize[i] = itemsPowerOfTwoDivisors[i];
|
||||
return;
|
||||
|
||||
} else if (workItems[0] * workItems[1] > maxWorkGroupSize) {
|
||||
computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
|
||||
return;
|
||||
} else {
|
||||
for (auto i = 0u; i < workDim; i++)
|
||||
workGroupSize[i] = workItems[i];
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
|
||||
size_t workGroupSize[3] = {};
|
||||
if (dispatchInfo.getKernel() != nullptr) {
|
||||
|
@ -452,20 +455,12 @@ Vec3<size_t> canonizeWorkgroup(Vec3<size_t> workgroup) {
|
|||
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo) {
|
||||
if (context != nullptr && context->isProvidingPerformanceHints() && dispatchInfo.getDim() <= 3) {
|
||||
size_t preferredWorkGroupSize[3];
|
||||
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
|
||||
if (DebugManager.flags.EnableComputeWorkSizeND.get()) {
|
||||
WorkSizeInfo wsInfo(dispatchInfo);
|
||||
computeWorkgroupSizeND(wsInfo, preferredWorkGroupSize, workItems, dispatchInfo.getDim());
|
||||
} else {
|
||||
auto simd = dispatchInfo.getKernel()->getKernelInfo().getMaxSimdSize();
|
||||
if (dispatchInfo.getDim() == 1)
|
||||
computeWorkgroupSize1D(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd);
|
||||
else if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && dispatchInfo.getDim() == 2) {
|
||||
computeWorkgroupSizeSquared(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd, dispatchInfo.getDim());
|
||||
} else {
|
||||
computeWorkgroupSize2D(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd);
|
||||
}
|
||||
}
|
||||
|
||||
auto lws = computeWorkgroupSize(dispatchInfo);
|
||||
preferredWorkGroupSize[0] = lws.x;
|
||||
preferredWorkGroupSize[1] = lws.y;
|
||||
preferredWorkGroupSize[2] = lws.z;
|
||||
|
||||
if (dispatchInfo.getEnqueuedWorkgroupSize().x == 0) {
|
||||
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, NULL_LOCAL_WORKGROUP_SIZE, dispatchInfo.getKernel()->getKernelInfo().name.c_str(),
|
||||
preferredWorkGroupSize[0], preferredWorkGroupSize[1], preferredWorkGroupSize[2]);
|
||||
|
|
|
@ -68,7 +68,7 @@ DECLARE_DEBUG_VARIABLE(bool, EnableAsyncEventsHandler, true, "Enables async even
|
|||
DECLARE_DEBUG_VARIABLE(bool, EnableForcePin, true, "Enables early pinning for memory object")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, Enable64kbpages, -1, "-1: default behaviour, 0 Disables, 1 Enables support for 64KB pages for driver allocated fine grain svm buffers")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeND, true, "Enables diffrent algorithm to compute local work size")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algorithm to compute the most squared work group as possible")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, true, "Enables algorithm to compute the most squared work group as possible")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OverrideEnableKmdNotify, -1, "-1: dont override, 0: disable, 1: enable")
|
||||
DECLARE_DEBUG_VARIABLE(int32_t, OverrideKmdNotifyDelayMs, -1, "-1: dont override, 0: infinite timeout, >0: timeout in ms")
|
||||
DECLARE_DEBUG_VARIABLE(bool, EnableVaLibCalls, true, "Enable cl-va sharing lib calls")
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
|
||||
#include "runtime/command_queue/dispatch_walker.h"
|
||||
#include "runtime/helpers/options.h"
|
||||
#include "unit_tests/helpers/debug_manager_state_restore.h"
|
||||
#include "gtest/gtest.h"
|
||||
|
||||
using namespace OCLRT;
|
||||
|
@ -95,8 +96,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca
|
|||
size_t workGroupSize[3];
|
||||
|
||||
OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
|
||||
EXPECT_EQ(workGroupSize[0], 128u);
|
||||
EXPECT_EQ(workGroupSize[1], 2u);
|
||||
EXPECT_EQ(workGroupSize[0], 16u);
|
||||
EXPECT_EQ(workGroupSize[1], 16u);
|
||||
EXPECT_EQ(workGroupSize[2], 1u);
|
||||
|
||||
workGroup[0] = 48;
|
||||
|
@ -115,6 +116,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca
|
|||
}
|
||||
|
||||
TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual32WhenComputeCalledThenLocalGroupComputed) {
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
|
||||
WorkSizeInfo wsInfo(256, 0u, 32, 0u, platformDevices[0]->pPlatform->eRenderCoreFamily, 32u, 0u, false, false);
|
||||
uint32_t workDim = 2;
|
||||
size_t workGroup[3] = {384, 96, 1};
|
||||
|
@ -232,8 +235,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual256WhenComputeCalledThenLo
|
|||
size_t workGroupSize[3];
|
||||
|
||||
OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
|
||||
EXPECT_EQ(workGroupSize[0], 128u);
|
||||
EXPECT_EQ(workGroupSize[1], 2u);
|
||||
EXPECT_EQ(workGroupSize[0], 16u);
|
||||
EXPECT_EQ(workGroupSize[1], 16u);
|
||||
EXPECT_EQ(workGroupSize[2], 1u);
|
||||
}
|
||||
|
||||
|
@ -267,8 +270,8 @@ TEST(localWorkSizeTest, givenKernelWithTwoDimensionalGlobalSizesWhenLwsIsCompute
|
|||
workGroup[0] = 1024;
|
||||
workGroup[1] = 1024;
|
||||
OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
|
||||
EXPECT_EQ(workGroupSize[0], 256u);
|
||||
EXPECT_EQ(workGroupSize[1], 1u);
|
||||
EXPECT_EQ(workGroupSize[0], 16u);
|
||||
EXPECT_EQ(workGroupSize[1], 16u);
|
||||
EXPECT_EQ(workGroupSize[2], 1u);
|
||||
}
|
||||
|
||||
|
@ -521,6 +524,8 @@ TEST(localWorkSizeTest, given2DimWorkWhenComputeSquaredCalledThenLocalGroupCompu
|
|||
}
|
||||
|
||||
TEST(localWorkSizeTest, givenDeviceSupportingLws1024AndKernelCompiledInSimd8WhenGwsIs1024ThenLwsIsComputedAsMaxOptimalMultipliedBySimd) {
|
||||
DebugManagerStateRestore dbgRestore;
|
||||
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
|
||||
WorkSizeInfo wsInfo(1024, 0u, 8, 0u, platformDevices[0]->pPlatform->eRenderCoreFamily, 32u, 0u, false, false);
|
||||
|
||||
uint32_t workDim = 2;
|
||||
|
@ -535,4 +540,7 @@ TEST(localWorkSizeTest, givenDeviceSupportingLws1024AndKernelCompiledInSimd8When
|
|||
|
||||
// Reads the EnableComputeWorkSizeND debug flag without setting it in this test
// and expects it to be true.
TEST(localWorkSizeTest, givenDebugVariableEnableComputeWorkSizeNDWhenCheckValueExpectTrue) {
    const bool ndAlgorithmEnabled = DebugManager.flags.EnableComputeWorkSizeND.get();
    EXPECT_TRUE(ndAlgorithmEnabled);
}
|
||||
// Reads the EnableComputeWorkSizeSquared debug flag without setting it in this
// test and expects it to be true.
TEST(localWorkSizeTest, givenDefaultDebugVariablesWhenEnableComputeWorkSizeSquaredIsCheckdThenTrueIsReturned) {
    const bool squaredAlgorithmEnabled = DebugManager.flags.EnableComputeWorkSizeSquared.get();
    EXPECT_TRUE(squaredAlgorithmEnabled);
}
|
Loading…
Reference in New Issue