Enable new algorithm computing local work sizes

Change-Id: If0addb5f36ee0b667370489b11837be716d70782
This commit is contained in:
mplewka 2017-12-21 10:14:18 +01:00 committed by sys_ocldev
parent 1e78649540
commit b503597ffa
7 changed files with 115 additions and 30 deletions

View File

@ -66,8 +66,8 @@ DECLARE_DEBUG_VARIABLE(bool, EnableAsyncDestroyAllocations, true, "Enables async
DECLARE_DEBUG_VARIABLE(bool, EnableAsyncEventsHandler, true, "Enables async events handler")
DECLARE_DEBUG_VARIABLE(bool, EnableForcePin, true, "Enables early pinning for memory object")
DECLARE_DEBUG_VARIABLE(int32_t, Enable64kbpages, -1, "-1: default behaviour, 0 Disables, 1 Enables support for 64KB pages for driver allocated fine grain svm buffers")
// Local-work-size algorithm switches. Each flag is declared exactly once:
// the ND algorithm is enabled by default and the squared one is disabled by default.
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeND, true, "Enables diffrent algorithm to compute local work size")
DECLARE_DEBUG_VARIABLE(bool, EnableComputeWorkSizeSquared, false, "Enables algorithm to compute the most squared work group as possible")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideEnableKmdNotify, -1, "-1: dont override, 0: disable, 1: enable")
DECLARE_DEBUG_VARIABLE(int32_t, OverrideKmdNotifyDelayMs, -1, "-1: dont override, 0: infinite timeout, >0: timeout in ms")
DECLARE_DEBUG_VARIABLE(bool, EnableVaLibCalls, true, "Enable cl-va sharing lib calls")

View File

@ -202,7 +202,90 @@ HWTEST_F(DispatchWalkerTest, noLocalIdsShouldntCrash) {
EXPECT_EQ(sizeDispatchWalkerNeeds, commandStream.getUsed() - commandStreamStart);
}
HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithDefaultLwsAlgorithm) {
    // Dispatches the kernel with 1, 2 and 3 work dimensions while leaving the
    // work-size debug flags untouched, and checks that the value read through
    // kernel.workDim matches the dimension count that was dispatched.
    MockKernel kernel(&program, kernelInfo, *pDevice);
    kernelInfo.workloadInfo.workDimOffset = 0;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};

    for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
        // Grow the global size one axis at a time: {256,1,1}, {256,256,1}, {256,256,256}.
        workItems[dimension - 1] = 256;

        dispatchWalker<FamilyType>(
            *pCmdQ,
            kernel,
            dimension,
            globalOffsets,
            workItems,
            nullptr, // NOTE(review): presumably the local work size, left for the driver to choose - confirm
            0,
            nullptr,
            nullptr,
            nullptr,
            nullptr);

        EXPECT_EQ(dimension, *kernel.workDim);
    }
}
HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithSquaredLwsAlgorithm) {
    // Same work-dimension check as the default variant, but with the ND flag
    // switched off and the squared flag switched on for the duration of the test.
    // NOTE(review): stateGuard is presumably what puts the flags back at scope exit - confirm.
    DebugManagerStateRestore stateGuard;
    DebugManager.flags.EnableComputeWorkSizeND.set(false);
    DebugManager.flags.EnableComputeWorkSizeSquared.set(true);

    MockKernel kernel(&program, kernelInfo, *pDevice);
    kernelInfo.workloadInfo.workDimOffset = 0;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};

    // axis is the zero-based index of the newest dimension; dimension is the count.
    for (uint32_t axis = 0; axis < 3; ++axis) {
        uint32_t dimension = axis + 1;
        workItems[axis] = 256;

        dispatchWalker<FamilyType>(*pCmdQ, kernel, dimension, globalOffsets, workItems,
                                   nullptr, 0, nullptr, nullptr, nullptr, nullptr);

        EXPECT_EQ(dimension, *kernel.workDim);
    }
}
HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithNDLwsAlgorithm) {
    // Work-dimension check with the ND flag explicitly switched on; the squared
    // flag is left at whatever value it currently has.
    // NOTE(review): stateGuard is presumably what puts the flags back at scope exit - confirm.
    DebugManagerStateRestore stateGuard;
    DebugManager.flags.EnableComputeWorkSizeND.set(true);

    MockKernel kernel(&program, kernelInfo, *pDevice);
    kernelInfo.workloadInfo.workDimOffset = 0;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};

    // axis is the zero-based index of the newest dimension; dimension is the count.
    for (uint32_t axis = 0; axis < 3; ++axis) {
        uint32_t dimension = axis + 1;
        workItems[axis] = 256;

        dispatchWalker<FamilyType>(*pCmdQ, kernel, dimension, globalOffsets, workItems,
                                   nullptr, 0, nullptr, nullptr, nullptr, nullptr);

        EXPECT_EQ(dimension, *kernel.workDim);
    }
}
HWTEST_F(DispatchWalkerTest, dataParameterWorkDimensionswithOldLwsAlgorithm) {
DebugManagerStateRestore dbgRestore;
DebugManager.flags.EnableComputeWorkSizeND.set(false);
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
MockKernel kernel(&program, kernelInfo, *pDevice);
kernelInfo.workloadInfo.workDimOffset = 0;
ASSERT_EQ(CL_SUCCESS, kernel.initialize());
@ -316,6 +399,7 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeND) {
HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeSquared) {
DebugManagerStateRestore dbgRestore;
DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
DebugManager.flags.EnableComputeWorkSizeND.set(false);
MockKernel kernel(&program, kernelInfo, *pDevice);
kernelInfo.workloadInfo.localWorkSizeOffsets[0] = 0;
kernelInfo.workloadInfo.localWorkSizeOffsets[1] = 4;
@ -342,9 +426,10 @@ HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithComputeSquared) {
EXPECT_EQ(1u, *kernel.localWorkSizeZ);
}
HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeSquared) {
HWTEST_F(DispatchWalkerTest, dataParameterNoLocalWorkSizeWithOutComputeSquaredAndND) {
DebugManagerStateRestore dbgRestore;
DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
DebugManager.flags.EnableComputeWorkSizeND.set(false);
MockKernel kernel(&program, kernelInfo, *pDevice);
kernelInfo.workloadInfo.localWorkSizeOffsets[0] = 0;
kernelInfo.workloadInfo.localWorkSizeOffsets[1] = 4;

View File

@ -524,6 +524,5 @@ TEST(localWorkSizeTest, given2DimWorkWhenComputeSquaredCalledThenLocalGroupCompu
}
TEST(localWorkSizeTest, givenDebugVariableEnableComputeWorkSizeNDWhenCheckValueExpectTrue) {
    // Reads EnableComputeWorkSizeND without setting it first and expects it to be
    // enabled. The earlier `== false` expectation contradicted both this check and
    // the test name, so only the single positive assertion is kept.
    EXPECT_TRUE(DebugManager.flags.EnableComputeWorkSizeND.get());
}

View File

@ -24,6 +24,7 @@
#include "runtime/command_queue/dispatch_walker.h"
#include "unit_tests/fixtures/device_fixture.h"
#include "unit_tests/fixtures/memory_management_fixture.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
#include "test.h"
using namespace OCLRT;
@ -185,27 +186,25 @@ HWTEST_P(WorkGroupSizeChannels, allChannelsWithEnableComputeWorkSizeSquaredDefau
}
HWTEST_P(WorkGroupSizeChannels, allChannelsWithEnableComputeWorkSizeSquaredEnabled) {
    // Runs verify() with the same size on all three channels while the squared
    // flag is on and the ND flag is off.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it and would be skipped on an early exit.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    uint32_t simdSize;
    size_t workDim;
    std::tie(simdSize, workDim) = GetParam();

    verify<FamilyType>(simdSize, workDim, workDim, workDim);
}
HWTEST_P(WorkGroupSizeChannels, allChannelsWithEnableComputeWorkSizeSquaredDisabled) {
    // Runs verify() with the same size on all three channels while both the
    // squared flag and the ND flag are off.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it and would be skipped on an early exit.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    uint32_t simdSize;
    size_t workDim;
    std::tie(simdSize, workDim) = GetParam();

    verify<FamilyType>(simdSize, workDim, workDim, workDim);
}
HWTEST_P(WorkGroupSizeChannels, justXWithEnableComputeWorkSizeNDDefault) {

View File

@ -22,6 +22,7 @@
#include "runtime/memory_manager/svm_memory_manager.h"
#include "driver_diagnostics_tests.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
using namespace OCLRT;
@ -385,28 +386,28 @@ TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkS
TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkSizeSquaredIsTrueWhenEnqueueKernelIsCallingThenContextProvidesProperHint) {
    // Enqueues a 1-D kernel with a null local work size while the squared flag is
    // on and the ND flag is off, then checks that the NULL_LOCAL_WORKGROUP_SIZE
    // hint, formatted with the kernel's resulting local sizes, was reported.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Build the expected message from the local sizes the kernel holds after the enqueue.
    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo().name.c_str(),
             *kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
    EXPECT_TRUE(containsHint(expectedHint, userData));
}
TEST_F(PerformanceHintEnqueueKernelTest, GivenNullLocalSizeAndEnableComputeWorkSizeSquaredIsFalseWhenEnqueueKernelIsCallingThenContextProvidesProperHint) {
    // Enqueues a 1-D kernel with a null local work size while both the squared
    // flag and the ND flag are off, then checks that the NULL_LOCAL_WORKGROUP_SIZE
    // hint, formatted with the kernel's resulting local sizes, was reported.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    retVal = pCmdQ->enqueueKernel(kernel, 1, nullptr, globalWorkGroupSize, nullptr, 0, nullptr, nullptr);
    EXPECT_EQ(CL_SUCCESS, retVal);

    // Build the expected message from the local sizes the kernel holds after the enqueue.
    snprintf(expectedHint, DriverDiagnostics::maxHintStringSize, DriverDiagnostics::hintFormat[NULL_LOCAL_WORKGROUP_SIZE], kernel->getKernelInfo().name.c_str(),
             *kernel->localWorkSizeX, *kernel->localWorkSizeY, *kernel->localWorkSizeZ);
    EXPECT_TRUE(containsHint(expectedHint, userData));
}
TEST_P(PerformanceHintEnqueueKernelBadSizeTest, GivenBadLocalWorkGroupSizeWhenEnqueueKernelIsCallingThenContextProvidesProperHint) {

View File

@ -21,6 +21,7 @@
*/
#include "driver_diagnostics_tests.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
using namespace OCLRT;
@ -227,19 +228,19 @@ TEST_F(PerformanceHintTest, GivenNullContextAndEmptyDispatchinfoAndEnableCompute
}
TEST_F(PerformanceHintTest, GivenNullContextAndEmptyDispatchinfoAndEnableComputeWorkSizeSquaredIsTrueWhenProvideLocalWorkGroupSizeIsCalledThenItDoesntCrash) {
    // Smoke test: calls provideLocalWorkGroupSizeHints with a null context and a
    // default-constructed DispatchInfo while the squared flag is on and the ND
    // flag is off. There are no assertions; the test passes if the call returns.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    DispatchInfo emptyDispatchInfo;
    provideLocalWorkGroupSizeHints(nullptr, 0, emptyDispatchInfo);
}
TEST_F(PerformanceHintTest, GivenNullContextAndEmptyDispatchinfoAndEnableComputeWorkSizeSquaredIsFalseWhenProvideLocalWorkGroupSizeIsCalledThenItDoesntCrash) {
    // Smoke test: calls provideLocalWorkGroupSizeHints with a null context and a
    // default-constructed DispatchInfo while both the squared flag and the ND flag
    // are off. There are no assertions; the test passes if the call returns.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    DispatchInfo emptyDispatchInfo;
    provideLocalWorkGroupSizeHints(nullptr, 0, emptyDispatchInfo);
}
TEST_F(PerformanceHintTest, GivenNullContextAndInvalidDispatchinfoAndEnableComputeWorkSizeNDIsDefaultWhenProvideLocalWorkGroupSizeIsCalledThenItDoesntCrash) {
@ -279,23 +280,23 @@ TEST_F(PerformanceHintTest, GivenNullContextAndInvalidDispatchinfoAndEnableCompu
}
TEST_F(PerformanceHintTest, GivenNullContextAndInvalidDispatchinfoAndEnableComputeWorkSizeSquaredIsTrueWhenProvideLocalWorkGroupSizeIsCalledThenItDoesntCrash) {
    // Smoke test: calls provideLocalWorkGroupSizeHints with a DispatchInfo built
    // with a dimension argument of 100 while the squared flag is on and the ND
    // flag is off. There are no assertions; the test passes if the call returns.
    // NOTE(review): despite "NullContext" in the test name, the fixture's
    // `context` member is passed rather than a literal nullptr - confirm intent.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    auto pDevice = castToObject<Device>(devices[0]);
    MockKernelWithInternals mockKernel(*pDevice, context);
    DispatchInfo invalidDispatchInfo(mockKernel, 100, {32, 32, 32}, {1, 1, 1}, {0, 0, 0});

    provideLocalWorkGroupSizeHints(context, 0, invalidDispatchInfo);
}
TEST_F(PerformanceHintTest, GivenNullContextAndInvalidDispatchinfoAndEnableComputeWorkSizeSquaredIsFalseWhenProvideLocalWorkGroupSizeIsCalledThenItDoesntCrash) {
    // Smoke test: calls provideLocalWorkGroupSizeHints with a DispatchInfo built
    // with a dimension argument of 100 while both the squared flag and the ND flag
    // are off. There are no assertions; the test passes if the call returns.
    // NOTE(review): despite "NullContext" in the test name, the fixture's
    // `context` member is passed rather than a literal nullptr - confirm intent.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    auto pDevice = castToObject<Device>(devices[0]);
    MockKernelWithInternals mockKernel(*pDevice, context);
    DispatchInfo invalidDispatchInfo(mockKernel, 100, {32, 32, 32}, {1, 1, 1}, {0, 0, 0});

    provideLocalWorkGroupSizeHints(context, 0, invalidDispatchInfo);
}
TEST_F(PerformanceHintTest, GivenContextAndDispatchinfoAndEnableComputeWorkSizeSquaredIsDefaultWhenProvideLocalWorkGroupSizeIsCalledReturnValue) {
@ -307,23 +308,23 @@ TEST_F(PerformanceHintTest, GivenContextAndDispatchinfoAndEnableComputeWorkSizeS
}
TEST_F(PerformanceHintTest, GivenContextAndDispatchinfoAndEnableComputeWorkSizeSquaredIsTrueWhenProvideLocalWorkGroupSizeIsCalledReturnValue) {
    // Calls provideLocalWorkGroupSizeHints with the fixture's context and a 2-D
    // {32, 32, 1} DispatchInfo while the squared flag is on and the ND flag is
    // off. There are no assertions; the test passes if the call returns.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    auto pDevice = castToObject<Device>(devices[0]);
    MockKernelWithInternals mockKernel(*pDevice, context);
    // Named plainly: unlike the "Invalid" variants, this one uses a dimension of 2.
    DispatchInfo dispatchInfo(mockKernel, 2, {32, 32, 1}, {1, 1, 1}, {0, 0, 0});

    provideLocalWorkGroupSizeHints(context, 0, dispatchInfo);
}
TEST_F(PerformanceHintTest, GivenContextAndDispatchinfoAndEnableComputeWorkSizeSquaredIsFalseWhenProvideLocalWorkGroupSizeIsCalledReturnValue) {
    // Calls provideLocalWorkGroupSizeHints with the fixture's context and a 2-D
    // {32, 32, 1} DispatchInfo while both the squared flag and the ND flag are
    // off. There are no assertions; the test passes if the call returns.
    // Flag cleanup is left entirely to dbgRestore; the hand-written save/restore of
    // a single flag was redundant with it.
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);

    auto pDevice = castToObject<Device>(devices[0]);
    MockKernelWithInternals mockKernel(*pDevice, context);
    // Named plainly: unlike the "Invalid" variants, this one uses a dimension of 2.
    DispatchInfo dispatchInfo(mockKernel, 2, {32, 32, 1}, {1, 1, 1}, {0, 0, 0});

    provideLocalWorkGroupSizeHints(context, 0, dispatchInfo);
}
TEST_P(PerformanceHintKernelTest, GivenSpillFillWhenKernelIsInitializedThenContextProvidesProperHint) {

View File

@ -45,8 +45,8 @@ Enable64kbpages = -1
NodeOrdinal = 0
ProductFamilyOverride = unk
EnableDebugBreak = false
EnableComputeWorkSizeND = true
EventsDebugEnable = false
UseMaxSimdSizeToDeduceMaxWorkgroupSize = false
EnableComputeWorkSizeSquared = false
TrackParentEvents = false