From 5d1da5740aab6b498fac1fe350f76fd4ac420d64 Mon Sep 17 00:00:00 2001 From: Bartosz Dunajski Date: Wed, 18 Mar 2020 12:05:25 +0100 Subject: [PATCH] Improve NonUniform aubs. Use Buffer instead of SVM allocation Change-Id: Ifd9b1e23aa64e631cf7efb1e1924aa7c3d9dcc7b Signed-off-by: Bartosz Dunajski --- .../enqueue_kernel_aub_tests.cpp | 586 +++++++++--------- 1 file changed, 279 insertions(+), 307 deletions(-) diff --git a/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp b/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp index 3ec6af4d31..796752c75e 100644 --- a/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp +++ b/opencl/test/unit_test/aub_tests/command_queue/enqueue_kernel_aub_tests.cpp @@ -370,17 +370,11 @@ INSTANTIATE_TEST_CASE_P( struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture { void SetUp() override { - deviceClVersionSupport = NEO::platformDevices[0]->capabilityTable.clVersionSupport; - if (deviceClVersionSupport < 20) { - return; + if (NEO::platformDevices[0]->capabilityTable.clVersionSupport < 20) { + GTEST_SKIP(); } KernelAUBFixture::SetUp(); - argVal = static_cast(0x22222222); - - sizeWrittenMemory = 0; - typeSize = sizeof(int); - typeItems = 40 * 40 * 40; sizeUserMemory = alignUp(typeItems * typeSize, 64); destMemory = alignedMalloc(sizeUserMemory, 4096); @@ -391,12 +385,7 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixturesetArgSvm(1, sizeUserMemory, destMemory, nullptr, 0u); - - outBuffer = createHostPtrAllocationFromSvmPtr(destMemory, sizeUserMemory); } void initializeExpectedMemory(size_t globalX, size_t globalY, size_t globalZ) { @@ -421,6 +410,10 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture(destMemory) + maxId) = 0; *(expectedData + maxId) = maxId; + outBuffer.reset(Buffer::create(context, CL_MEM_COPY_HOST_PTR, alignUp(sizeUserMemory, 4096), destMemory, retVal)); + bufferGpuAddress = reinterpret_cast(outBuffer->getGraphicsAllocation()->getGpuAddress()); + kernel->setArg(1, outBuffer.get()); + sizeWrittenMemory = maxId * typeSize; //add single int size for atomic sum of all work-items sizeWrittenMemory += typeSize; @@ -433,11 +426,11 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture(destMemory) + sizeWrittenMemory; + remainderBufferGpuAddress = ptrOffset(bufferGpuAddress, sizeWrittenMemory); } void TearDown() override { - if (deviceClVersionSupport < 20) { + if (NEO::platformDevices[0]->capabilityTable.clVersionSupport < 20) { return; } if (destMemory) { @@ -456,17 +449,18 @@ struct AUBSimpleArgNonUniformFixture : public KernelAUBFixture outBuffer; HardwareParse hwParser; }; @@ -527,374 +521,352 @@ HWTEST_F(AUBSimpleKernelStatelessTest, givenSimpleKernelWhenStatelessPathIsUsedT using AUBSimpleArgNonUniformTest = Test; HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork1DimNonUniformGroupThenExpectTwoWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 1; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {39, 1, 1}; - size_t localWorkSize[3] = {32, 1, 1}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 1; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {39, 1, 1}; + size_t localWorkSize[3] = {32, 1, 1}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(2u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(2u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInXDimensionThenExpectTwoWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 2; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {39, 32, 1}; - size_t localWorkSize[3] = {16, 16, 1}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 2; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {39, 32, 1}; + size_t localWorkSize[3] = {16, 16, 1}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(2u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(2u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInYDimensionThenExpectTwoWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 2; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {32, 39, 1}; - size_t localWorkSize[3] = {16, 16, 1}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 2; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {32, 39, 1}; + size_t localWorkSize[3] = {16, 16, 1}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(2u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(2u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork2DimNonUniformGroupInXandYDimensionThenExpectFourWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 2; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {39, 39, 1}; - size_t localWorkSize[3] = {16, 16, 1}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 2; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {39, 39, 1}; + size_t localWorkSize[3] = {16, 16, 1}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(4u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(4u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXDimensionThenExpectTwoWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 3; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {39, 32, 32}; - size_t localWorkSize[3] = {8, 8, 2}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {39, 32, 32}; + size_t localWorkSize[3] = {8, 8, 2}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(2u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(2u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInYDimensionThenExpectTwoWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 3; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {32, 39, 32}; - size_t localWorkSize[3] = {8, 8, 2}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {32, 39, 32}; + size_t localWorkSize[3] = {8, 8, 2}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(2u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(2u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInZDimensionThenExpectTwoWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 3; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {32, 32, 39}; - size_t localWorkSize[3] = {8, 2, 8}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {32, 32, 39}; + size_t localWorkSize[3] = {8, 2, 8}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(2u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(2u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandYDimensionThenExpectFourWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 3; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {39, 39, 32}; - size_t localWorkSize[3] = {8, 8, 2}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {39, 39, 32}; + size_t localWorkSize[3] = {8, 8, 2}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(4u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(4u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandZDimensionThenExpectFourWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 3; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {39, 32, 39}; - size_t localWorkSize[3] = {8, 2, 8}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {39, 32, 39}; + size_t localWorkSize[3] = {8, 2, 8}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(4u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(4u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInYandZDimensionThenExpectFourWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 3; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {32, 39, 39}; - size_t localWorkSize[3] = {2, 8, 8}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {32, 39, 39}; + size_t localWorkSize[3] = {2, 8, 8}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(4u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(4u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); } HWTEST_F(AUBSimpleArgNonUniformTest, givenOpenCL20SupportWhenProvidingWork3DimNonUniformGroupInXandYandZDimensionThenExpectEightWalkers) { using WALKER_TYPE = WALKER_TYPE; - if (deviceClVersionSupport >= 20) { - cl_uint workDim = 3; - size_t globalWorkOffset[3] = {0, 0, 0}; - size_t globalWorkSize[3] = {39, 39, 39}; - size_t localWorkSize[3] = {8, 8, 2}; - cl_uint numEventsInWaitList = 0; - cl_event *eventWaitList = nullptr; - cl_event *event = nullptr; + cl_uint workDim = 3; + size_t globalWorkOffset[3] = {0, 0, 0}; + size_t globalWorkSize[3] = {39, 39, 39}; + size_t localWorkSize[3] = {8, 8, 2}; + cl_uint numEventsInWaitList = 0; + cl_event *eventWaitList = nullptr; + cl_event *event = nullptr; - initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); + initializeExpectedMemory(globalWorkSize[0], globalWorkSize[1], globalWorkSize[2]); - auto retVal = this->pCmdQ->enqueueKernel( - this->kernel, - workDim, - globalWorkOffset, - globalWorkSize, - localWorkSize, - numEventsInWaitList, - eventWaitList, - event); - ASSERT_EQ(CL_SUCCESS, retVal); + auto retVal = this->pCmdQ->enqueueKernel( + this->kernel, + workDim, + globalWorkOffset, + globalWorkSize, + localWorkSize, + numEventsInWaitList, + eventWaitList, + event); + ASSERT_EQ(CL_SUCCESS, retVal); - hwParser.parseCommands(*pCmdQ); - uint32_t walkerCount = hwParser.getCommandCount(); - EXPECT_EQ(8u, walkerCount); + hwParser.parseCommands(*pCmdQ); + uint32_t walkerCount = hwParser.getCommandCount(); + EXPECT_EQ(8u, walkerCount); - pCmdQ->flush(); - expectMemory(this->destMemory, this->expectedMemory, sizeWrittenMemory); - expectMemory(this->remainderDestMemory, this->expectedRemainderMemory, sizeRemainderMemory); - } + pCmdQ->flush(); + expectMemory(bufferGpuAddress, this->expectedMemory, sizeWrittenMemory); + expectMemory(remainderBufferGpuAddress, this->expectedRemainderMemory, sizeRemainderMemory); }