Allow squared algorithm to work together with base one.

Change-Id: I9087957bb427a422b1be632f6375c96b8f91a492
This commit is contained in:
Mrozek, Michal 2018-01-12 09:58:30 +01:00 committed by sys_ocldev
parent 031b537e2b
commit 7640201585
3 changed files with 56 additions and 53 deletions

View File

@ -294,6 +294,37 @@ void computeWorkgroupSize2D(uint32_t maxWorkGroupSize, size_t workGroupSize[3],
}
}
// Chooses a local work-group size whose first two dimensions are kept as
// balanced ("squared") as possible.
//
// maxWorkGroupSize - upper bound on the product of the first two dimensions
// workGroupSize    - output; every entry is first reset to 1
// workItems        - global work size per dimension
// simdSize         - only forwarded to computeWorkgroupSize2D
// workDim          - number of dimensions that are examined
void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
    // Start from 1 everywhere; dimensions not touched below keep this value.
    workGroupSize[0] = 1;
    workGroupSize[1] = 1;
    workGroupSize[2] = 1;

    // For each examined dimension, keep halving maxWorkGroupSize until
    // Math::isDivisableByPowerOfTwoDivisor accepts the candidate or it reaches 1.
    // NOTE(review): presumably that helper tests whether the candidate divides
    // the work item count - confirm against its definition.
    size_t pow2Divisor[3] = {1, 1, 1};
    for (uint32_t dim = 0; dim < workDim; ++dim) {
        uint32_t candidate = maxWorkGroupSize;
        for (; candidate > 1; candidate >>= 1) {
            if (Math::isDivisableByPowerOfTwoDivisor(uint32_t(workItems[dim]), candidate)) {
                break;
            }
        }
        pow2Divisor[dim] = candidate;
    }

    size_t &divisorX = pow2Divisor[0];
    size_t &divisorY = pow2Divisor[1];

    // The divisors alone reach the limit: trim the larger one (Y on a tie)
    // until the product fits, then use the divisors as the result.
    if (divisorX * divisorY >= maxWorkGroupSize) {
        while (divisorX * divisorY > maxWorkGroupSize) {
            if (divisorX > divisorY) {
                divisorX >>= 1;
            } else {
                divisorY >>= 1;
            }
        }
        workGroupSize[0] = pow2Divisor[0];
        workGroupSize[1] = pow2Divisor[1];
        workGroupSize[2] = pow2Divisor[2];
        return;
    }

    // The divisors are too small but the global size exceeds the limit:
    // hand over to the generic 2D algorithm.
    if (workItems[0] * workItems[1] > maxWorkGroupSize) {
        computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
        return;
    }

    // The whole global size fits within the limit: use it directly.
    for (uint32_t dim = 0; dim < workDim; ++dim) {
        workGroupSize[dim] = workItems[dim];
    }
}
void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const size_t workItems[3], const uint32_t workDim) {
for (int i = 0; i < 3; i++)
workGroupSize[i] = 1;
@ -303,8 +334,11 @@ void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const
return;
} else {
//Find the biggest power of two which divides each dimension size
if (wsInfo.slmTotalSize == 0 && wsInfo.hasBarriers == 0) {
if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && workDim == 2 && !wsInfo.imgUsed) {
return computeWorkgroupSizeSquared(wsInfo.maxWorkGroupSize, workGroupSize, workItems, wsInfo.simdSize, workDim);
}
size_t itemsPowerOfTwoDivisors[3] = {1, 1, 1};
for (auto i = 0u; i < workDim; i++) {
uint32_t requiredWorkItemsCount = uint32_t(wsInfo.simdSize * optimalHardwareThreadCountGeneric[0]);
@ -370,37 +404,6 @@ void computeWorkgroupSizeND(WorkSizeInfo wsInfo, size_t workGroupSize[3], const
}
}
// Chooses a local work-group size whose first two dimensions are kept as
// balanced ("squared") as possible.
//
// maxWorkGroupSize - upper bound on the product of the first two dimensions
// workGroupSize    - output; every entry is first reset to 1
// workItems        - global work size per dimension
// simdSize         - only forwarded to computeWorkgroupSize2D
// workDim          - number of dimensions that are examined
void computeWorkgroupSizeSquared(uint32_t maxWorkGroupSize, size_t workGroupSize[3], const size_t workItems[3], size_t simdSize, const uint32_t workDim) {
    // Start from 1 everywhere; dimensions not touched below keep this value.
    workGroupSize[0] = 1;
    workGroupSize[1] = 1;
    workGroupSize[2] = 1;

    // For each examined dimension, keep halving maxWorkGroupSize until
    // Math::isDivisableByPowerOfTwoDivisor accepts the candidate or it reaches 1.
    // NOTE(review): presumably that helper tests whether the candidate divides
    // the work item count - confirm against its definition.
    size_t pow2Divisor[3] = {1, 1, 1};
    for (uint32_t dim = 0; dim < workDim; ++dim) {
        uint32_t candidate = maxWorkGroupSize;
        for (; candidate > 1; candidate >>= 1) {
            if (Math::isDivisableByPowerOfTwoDivisor(uint32_t(workItems[dim]), candidate)) {
                break;
            }
        }
        pow2Divisor[dim] = candidate;
    }

    size_t &divisorX = pow2Divisor[0];
    size_t &divisorY = pow2Divisor[1];

    // The divisors alone reach the limit: trim the larger one (Y on a tie)
    // until the product fits, then use the divisors as the result.
    if (divisorX * divisorY >= maxWorkGroupSize) {
        while (divisorX * divisorY > maxWorkGroupSize) {
            if (divisorX > divisorY) {
                divisorX >>= 1;
            } else {
                divisorY >>= 1;
            }
        }
        workGroupSize[0] = pow2Divisor[0];
        workGroupSize[1] = pow2Divisor[1];
        workGroupSize[2] = pow2Divisor[2];
        return;
    }

    // The divisors are too small but the global size exceeds the limit:
    // hand over to the generic 2D algorithm.
    if (workItems[0] * workItems[1] > maxWorkGroupSize) {
        computeWorkgroupSize2D(maxWorkGroupSize, workGroupSize, workItems, simdSize);
        return;
    }

    // The whole global size fits within the limit: use it directly.
    for (uint32_t dim = 0; dim < workDim; ++dim) {
        workGroupSize[dim] = workItems[dim];
    }
}
Vec3<size_t> computeWorkgroupSize(const DispatchInfo &dispatchInfo) {
size_t workGroupSize[3] = {};
if (dispatchInfo.getKernel() != nullptr) {
@ -452,20 +455,12 @@ Vec3<size_t> canonizeWorkgroup(Vec3<size_t> workgroup) {
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo) {
if (context != nullptr && context->isProvidingPerformanceHints() && dispatchInfo.getDim() <= 3) {
size_t preferredWorkGroupSize[3];
size_t workItems[3] = {dispatchInfo.getGWS().x, dispatchInfo.getGWS().y, dispatchInfo.getGWS().z};
if (DebugManager.flags.EnableComputeWorkSizeND.get()) {
WorkSizeInfo wsInfo(dispatchInfo);
computeWorkgroupSizeND(wsInfo, preferredWorkGroupSize, workItems, dispatchInfo.getDim());
} else {
auto simd = dispatchInfo.getKernel()->getKernelInfo().getMaxSimdSize();
if (dispatchInfo.getDim() == 1)
computeWorkgroupSize1D(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd);
else if (DebugManager.flags.EnableComputeWorkSizeSquared.get() && dispatchInfo.getDim() == 2) {
computeWorkgroupSizeSquared(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd, dispatchInfo.getDim());
} else {
computeWorkgroupSize2D(maxWorkGroupSize, preferredWorkGroupSize, workItems, simd);
}
}
auto lws = computeWorkgroupSize(dispatchInfo);
preferredWorkGroupSize[0] = lws.x;
preferredWorkGroupSize[1] = lws.y;
preferredWorkGroupSize[2] = lws.z;
if (dispatchInfo.getEnqueuedWorkgroupSize().x == 0) {
context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_NEUTRAL_INTEL, NULL_LOCAL_WORKGROUP_SIZE, dispatchInfo.getKernel()->getKernelInfo().name.c_str(),
preferredWorkGroupSize[0], preferredWorkGroupSize[1], preferredWorkGroupSize[2]);

View File

@ -68,7 +68,7 @@ DECLARE_DEBUG_VARIABLE(bool, EnableAsyncEventsHandler, true, "Enables async even
DECLARE_DEBUG_VARIABLE(bool, EnableForcePin, true, "Enables early pinning for memory object")
DECLARE_DEBUG_VARIABLE(int32_t, Enable64kbpages, -1, "-1: default behaviour, 0 Disables, 1 Enables support for 64KB pages for driver allocated fine grain svm buffers")
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeND, true, "Enables diffrent algorithm to compute local work size")
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algorithm to compute the most squared work group as possible")
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, true, "Enables algorithm to compute the most squared work group as possible")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideEnableKmdNotify, -1, "-1: dont override, 0: disable, 1: enable")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideKmdNotifyDelayMs, -1, "-1: dont override, 0: infinite timeout, >0: timeout in ms")
DECLARE_DEBUG_VARIABLE(bool, EnableVaLibCalls, true, "Enable cl-va sharing lib calls")

View File

@ -22,6 +22,7 @@
#include "runtime/command_queue/dispatch_walker.h"
#include "runtime/helpers/options.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
#include "gtest/gtest.h"
using namespace OCLRT;
@ -95,8 +96,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca
size_t workGroupSize[3];
OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
EXPECT_EQ(workGroupSize[0], 128u);
EXPECT_EQ(workGroupSize[1], 2u);
EXPECT_EQ(workGroupSize[0], 16u);
EXPECT_EQ(workGroupSize[1], 16u);
EXPECT_EQ(workGroupSize[2], 1u);
workGroup[0] = 48;
@ -115,6 +116,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual8WhenComputeCalledThenLoca
}
TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual32WhenComputeCalledThenLocalGroupComputed) {
DebugManagerStateRestore dbgRestore;
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
WorkSizeInfo wsInfo(256, 0u, 32, 0u, platformDevices[0]->pPlatform->eRenderCoreFamily, 32u, 0u, false, false);
uint32_t workDim = 2;
size_t workGroup[3] = {384, 96, 1};
@ -232,8 +235,8 @@ TEST(localWorkSizeTest, given2DimWorkGroupAndSimdEqual256WhenComputeCalledThenLo
size_t workGroupSize[3];
OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
EXPECT_EQ(workGroupSize[0], 128u);
EXPECT_EQ(workGroupSize[1], 2u);
EXPECT_EQ(workGroupSize[0], 16u);
EXPECT_EQ(workGroupSize[1], 16u);
EXPECT_EQ(workGroupSize[2], 1u);
}
@ -267,8 +270,8 @@ TEST(localWorkSizeTest, givenKernelWithTwoDimensionalGlobalSizesWhenLwsIsCompute
workGroup[0] = 1024;
workGroup[1] = 1024;
OCLRT::computeWorkgroupSizeND(wsInfo, workGroupSize, workGroup, workDim);
EXPECT_EQ(workGroupSize[0], 256u);
EXPECT_EQ(workGroupSize[1], 1u);
EXPECT_EQ(workGroupSize[0], 16u);
EXPECT_EQ(workGroupSize[1], 16u);
EXPECT_EQ(workGroupSize[2], 1u);
}
@ -521,6 +524,8 @@ TEST(localWorkSizeTest, given2DimWorkWhenComputeSquaredCalledThenLocalGroupCompu
}
TEST(localWorkSizeTest, givenDeviceSupportingLws1024AndKernelCompiledInSimd8WhenGwsIs1024ThenLwsIsComputedAsMaxOptimalMultipliedBySimd) {
DebugManagerStateRestore dbgRestore;
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
WorkSizeInfo wsInfo(1024, 0u, 8, 0u, platformDevices[0]->pPlatform->eRenderCoreFamily, 32u, 0u, false, false);
uint32_t workDim = 2;
@ -535,4 +540,7 @@ TEST(localWorkSizeTest, givenDeviceSupportingLws1024AndKernelCompiledInSimd8When
// Checks that the EnableComputeWorkSizeND debug flag reads as true.
// The test sets no override of its own before reading the flag.
TEST(localWorkSizeTest, givenDebugVariableEnableComputeWorkSizeNDWhenCheckValueExpectTrue) {
EXPECT_TRUE(DebugManager.flags.EnableComputeWorkSizeND.get());
}
// Checks that the EnableComputeWorkSizeSquared debug flag reads as true.
// The test sets no override of its own before reading the flag.
TEST(localWorkSizeTest, givenDefaultDebugVariablesWhenEnableComputeWorkSizeSquaredIsCheckdThenTrueIsReturned) {
EXPECT_TRUE(DebugManager.flags.EnableComputeWorkSizeSquared.get());
}