Improve NonUniform AUB tests: use a Buffer instead of an SVM allocation

Change-Id: Ifd9b1e23aa64e631cf7efb1e1924aa7c3d9dcc7b
Signed-off-by: Bartosz Dunajski <bartosz.dunajski@intel.com>
This commit is contained in:
Bartosz Dunajski
2020-03-18 12:05:25 +01:00
committed by sys_ocldev
parent 2b0e743ac9
commit 5d1da5740a

View File

@ -370,17 +370,11 @@ INSTANTIATE_TEST_CASE_P(
struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture<SimpleArgNonUniformKernelFixture> {
void SetUp() override {
deviceClVersionSupport = NEO::platformDevices[0]->capabilityTable.clVersionSupport;
if (deviceClVersionSupport < 20) {
return;
if (NEO::platformDevices[0]->capabilityTable.clVersionSupport < 20) {
GTEST_SKIP();
}
KernelAUBFixture<SimpleArgNonUniformKernelFixture>::SetUp();
argVal = static_cast<int>(0x22222222);
sizeWrittenMemory = 0;
typeSize = sizeof(int);
typeItems = 40 * 40 * 40;
sizeUserMemory = alignUp(typeItems * typeSize, 64);
destMemory = alignedMalloc(sizeUserMemory, 4096);
@ -391,12 +385,7 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture<SimpleArgNonUnifo
expectedMemory = alignedMalloc(sizeUserMemory, 4096);
ASSERT_NE(nullptr, expectedMemory);
memset(expectedMemory, 0x0, sizeUserMemory);
kernel->setArgSvm(1, sizeUserMemory, destMemory, nullptr, 0u);
outBuffer = createHostPtrAllocationFromSvmPtr(destMemory, sizeUserMemory);
}
void initializeExpectedMemory(size_t globalX, size_t globalY, size_t globalZ) {
@ -421,6 +410,10 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture<SimpleArgNonUnifo
*(static_cast<int *>(destMemory) + maxId) = 0;
*(expectedData + maxId) = maxId;
outBuffer.reset(Buffer::create(context, CL_MEM_COPY_HOST_PTR, alignUp(sizeUserMemory, 4096), destMemory, retVal));
bufferGpuAddress = reinterpret_cast<void *>(outBuffer->getGraphicsAllocation()->getGpuAddress());
kernel->setArg(1, outBuffer.get());
sizeWrittenMemory = maxId * typeSize;
//add single int size for atomic sum of all work-items
sizeWrittenMemory += typeSize;
@ -433,11 +426,11 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture<SimpleArgNonUnifo
for (size_t i = 0; i < reminderElements; i++) {
*(expectedReminderData + i) = 0xdeadbeef;
}
remainderDestMemory = static_cast<char *>(destMemory) + sizeWrittenMemory;
remainderBufferGpuAddress = ptrOffset(bufferGpuAddress, sizeWrittenMemory);
}
void TearDown() override {
if (deviceClVersionSupport < 20) {
if (NEO::platformDevices[0]->capabilityTable.clVersionSupport < 20) {
return;
}
if (destMemory) {
@ -456,17 +449,18 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture<SimpleArgNonUnifo
}
unsigned int deviceClVersionSupport;
size_t typeSize;
size_t typeItems;
size_t sizeWrittenMemory;
const size_t typeSize = sizeof(int);
const size_t typeItems = 40 * 40 * 40;
size_t sizeWrittenMemory = 0;
size_t sizeUserMemory;
size_t sizeRemainderMemory;
int argVal;
int argVal = 0x22222222;
void *destMemory = nullptr;
void *expectedMemory = nullptr;
void *expectedRemainderMemory = nullptr;
char *remainderDestMemory = nullptr;
GraphicsAllocation *outBuffer;
void *remainderBufferGpuAddress = nullptr;
void *bufferGpuAddress = nullptr;
std::unique_ptr<Buffer> outBuffer;
HardwareParse hwParser;
};
@ -527,374 +521,352 @@ HWTEST_F(AUBSimpleKernelStatelessTest, givenSimpleKernelWhenStatelessPathIsUsedT
using AUBSimpleArgNonUniformTest = Test<AUBSimpleArgNonUniformFixture>;
// 1D dispatch: the global size (39) is not a multiple of the local size (32).
// Expects two walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork1DimNonUniformGroupThenExpectTwoWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 1;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {39, 1, 1};
    size_t localWorkSize[3] = {32, 1, 1};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(2u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 2D dispatch, non-uniform in X only: global {39, 32} with local {16, 16}.
// Expects two walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInXDimensionThenExpectTwoWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 2;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {39, 32, 1};
    size_t localWorkSize[3] = {16, 16, 1};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(2u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 2D dispatch, non-uniform in Y only: global {32, 39} with local {16, 16}.
// Expects two walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInYDimensionThenExpectTwoWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 2;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {32, 39, 1};
    size_t localWorkSize[3] = {16, 16, 1};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(2u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 2D dispatch, non-uniform in both X and Y: global {39, 39} with local {16, 16}.
// Expects four walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInXandYDimensionThenExpectFourWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 2;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {39, 39, 1};
    size_t localWorkSize[3] = {16, 16, 1};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(4u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 3D dispatch, non-uniform in X only: global {39, 32, 32} with local {8, 8, 2}.
// Expects two walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXDimensionThenExpectTwoWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 3;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {39, 32, 32};
    size_t localWorkSize[3] = {8, 8, 2};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(2u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 3D dispatch, non-uniform in Y only: global {32, 39, 32} with local {8, 8, 2}.
// Expects two walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInYDimensionThenExpectTwoWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 3;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {32, 39, 32};
    size_t localWorkSize[3] = {8, 8, 2};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(2u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 3D dispatch, non-uniform in Z only: global {32, 32, 39} with local {8, 2, 8}.
// Expects two walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInZDimensionThenExpectTwoWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 3;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {32, 32, 39};
    size_t localWorkSize[3] = {8, 2, 8};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(2u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 3D dispatch, non-uniform in X and Y: global {39, 39, 32} with local {8, 8, 2}.
// Expects four walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandYDimensionThenExpectFourWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 3;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {39, 39, 32};
    size_t localWorkSize[3] = {8, 8, 2};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(4u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 3D dispatch, non-uniform in X and Z: global {39, 32, 39} with local {8, 2, 8}.
// Expects four walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandZDimensionThenExpectFourWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 3;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {39, 32, 39};
    size_t localWorkSize[3] = {8, 2, 8};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(4u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 3D dispatch, non-uniform in Y and Z: global {32, 39, 39} with local {2, 8, 8}.
// Expects four walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInYandZDimensionThenExpectFourWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 3;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {32, 39, 39};
    size_t localWorkSize[3] = {2, 8, 8};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(4u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}
// 3D dispatch, non-uniform in X, Y and Z: global {39, 39, 39} with local {8, 8, 2}.
// Expects eight walker commands in the parsed command stream, then compares the
// output buffer (addressed through bufferGpuAddress / remainderBufferGpuAddress)
// against the expected and remainder patterns.
// The fixture's SetUp() skips the test on devices reporting OpenCL < 2.0, so no
// version guard is needed here.
HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandYandZDimensionThenExpectEightWalkers) {
    using WALKER_TYPE = WALKER_TYPE<FamilyType>;

    cl_uint workDim = 3;
    size_t globalWorkOffset[3] = {0, 0, 0};
    size_t globalWorkSize[3] = {39, 39, 39};
    size_t localWorkSize[3] = {8, 8, 2};
    cl_uint numEventsInWaitList = 0;
    cl_event *eventWaitList = nullptr;
    cl_event *event = nullptr;

    // Also creates outBuffer and binds it as kernel argument 1.
    initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]);

    auto retVal = this->pCmdQ->enqueueKernel(
        this->kernel,
        workDim,
        globalWorkOffset,
        globalWorkSize,
        localWorkSize,
        numEventsInWaitList,
        eventWaitList,
        event);
    ASSERT_EQ(CL_SUCCESS, retVal);

    hwParser.parseCommands<FamilyType>(*pCmdQ);
    uint32_t walkerCount = hwParser.getCommandCount<WALKER_TYPE>();
    EXPECT_EQ(8u, walkerCount);

    pCmdQ->flush();
    expectMemory<FamilyType>(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory);
    expectMemory<FamilyType>(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory);
}