Respect KernelExecutionType in enqueue kernel calls

Change-Id: I9de07f9e3b77c4a44f6a0127e0ae3bd7e1ab97f8
Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
Author:    Filip Hazubski
Date:      2020-01-24 15:06:55 +01:00
Committer: sys_ocldev
Commit:    07c4682668 (parent d6f4520599)

4 changed files with 116 additions and 32 deletions
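
This change tightens the split between the two enqueue entry points: clEnqueueNDRangeKernel now rejects any kernel whose execution type is not Default, or that uses a sync buffer, with CL_INVALID_KERNEL, while clEnqueueNDCountKernelINTEL range-checks the requested work-group count and allocates the sync buffer handler only for Concurrent kernels. A rough host-side sketch of the intended usage follows; the exec-info token names are an assumption (the diff only shows the driver-side KernelExecutionType values), so treat them as illustrative.

    // Hypothetical host usage after this change; token names are assumed,
    // not confirmed by this diff.
    cl_uint kernelType = CL_KERNEL_EXEC_INFO_CONCURRENT_TYPE_INTEL;    // assumed token
    clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_KERNEL_TYPE_INTEL, // assumed token
                        sizeof(kernelType), &kernelType);

    // Concurrent kernels go through the work-group-count entry point...
    cl_int err = clEnqueueNDCountKernelINTEL(queue, kernel, workDim, gwOffset,
                                             workgroupCount, lws, 0, nullptr, nullptr);
    // ...while clEnqueueNDRangeKernel now returns CL_INVALID_KERNEL for them.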

@@ -3143,7 +3143,8 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue,
         return retVal;
     }
 
-    if (pKernel->getKernelInfo().patchInfo.pAllocateSyncBuffer != nullptr) {
+    if ((pKernel->getExecutionType() != KernelExecutionType::Default) ||
+        pKernel->isUsingSyncBuffer()) {
         retVal = CL_INVALID_KERNEL;
         TRACING_EXIT(clEnqueueNDRangeKernel, &retVal);
         return retVal;
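
The caller-visible effect of the tightened guard, mirroring the clEnqueueNDRangeKernel tests added further down (queue and concurrentKernel are assumed to be valid, already-created objects):

    size_t gws[3] = {1, 1, 1};
    size_t lws[3] = {1, 1, 1};
    cl_int err = clEnqueueNDRangeKernel(queue, concurrentKernel, 1, nullptr,
                                        gws, lws, 0, nullptr, nullptr);
    // err == CL_INVALID_KERNEL: the kernel is marked Concurrent (or carries an
    // allocate-sync-buffer patch), so the standard enqueue path refuses it.
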
@@ -5330,16 +5331,28 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
     }
 
     size_t globalWorkSize[3];
-    size_t requestedNumberOfWorkgroups = 1;
     for (size_t i = 0; i < workDim; i++) {
         globalWorkSize[i] = workgroupCount[i] * localWorkSize[i];
-        requestedNumberOfWorkgroups *= workgroupCount[i];
     }
-    size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
-    if (requestedNumberOfWorkgroups > maximalNumberOfWorkgroupsAllowed) {
-        retVal = CL_INVALID_VALUE;
-        return retVal;
+
+    if (pKernel->getExecutionType() == KernelExecutionType::Concurrent) {
+        size_t requestedNumberOfWorkgroups = 1;
+        for (size_t i = 0; i < workDim; i++) {
+            requestedNumberOfWorkgroups *= workgroupCount[i];
+        }
+        size_t maximalNumberOfWorkgroupsAllowed = pKernel->getMaxWorkGroupCount(workDim, localWorkSize);
+        if (requestedNumberOfWorkgroups > maximalNumberOfWorkgroupsAllowed) {
+            retVal = CL_INVALID_VALUE;
+            return retVal;
+        }
     }
 
+    if (pKernel->isUsingSyncBuffer()) {
+        if (pKernel->getExecutionType() != KernelExecutionType::Concurrent) {
+            retVal = CL_INVALID_KERNEL;
+            return retVal;
+        }
+        platform()->clDeviceMap[&pCommandQueue->getDevice()]->allocateSyncBufferHandler();
+    }
+
     TakeOwnershipWrapper<Kernel> kernelOwnership(*pKernel, gtpinIsGTPinInitialized());
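
A compact, self-contained restatement of the validation this hunk moves under the Concurrent branch; the function and variable names here are illustrative, and the real limit comes from Kernel::getMaxWorkGroupCount:

    #include <cassert>
    #include <cstddef>

    // The requested work-group total is the product over all enqueue dimensions.
    static bool workgroupCountIsValid(size_t workDim, const size_t *workgroupCount,
                                      size_t maxWorkGroupCount) {
        size_t requested = 1;
        for (size_t i = 0; i < workDim; i++) {
            requested *= workgroupCount[i];
        }
        return requested <= maxWorkGroupCount;
    }

    int main() {
        size_t workgroupCount[3] = {10, 1, 1}; // the values the tests below use
        assert(workgroupCountIsValid(1, workgroupCount, 10)); // at the limit: accepted
        assert(!workgroupCountIsValid(1, workgroupCount, 9)); // over it: CL_INVALID_VALUE
        return 0;
    }

Default kernels now skip this check entirely, and a kernel that carries a sync buffer but is not Concurrent fails with CL_INVALID_KERNEL before any handler allocation happens.
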
@@ -5347,8 +5360,6 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
         gtpinNotifyKernelSubmit(kernel, pCommandQueue);
     }
 
-    platform()->clDeviceMap[&pCommandQueue->getDevice()]->allocateSyncBufferHandler();
-
     retVal = pCommandQueue->enqueueKernel(
         kernel,
         workDim,

@@ -345,6 +345,12 @@ class Kernel : public BaseObject<_cl_kernel> {
     uint32_t getThreadArbitrationPolicy() const {
         return threadArbitrationPolicy;
     }
+    KernelExecutionType getExecutionType() const {
+        return executionType;
+    }
+    bool isUsingSyncBuffer() const {
+        return (kernelInfo.patchInfo.pAllocateSyncBuffer != nullptr);
+    }
 
     bool checkIfIsParentKernelAndBlocksUsesPrintf();
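
For reference, the shape of the execution-type enum as implied by the call sites in this diff; the actual declaration (and any further enumerators) lives elsewhere in the tree, so this is a sketch:

    enum class KernelExecutionType {
        Default,    // ordinary kernels; enqueued via clEnqueueNDRangeKernel
        Concurrent, // cross-work-group kernels; enqueued via clEnqueueNDCountKernelINTEL
    };

isUsingSyncBuffer() centralizes the patchInfo.pAllocateSyncBuffer null check that clEnqueueNDRangeKernel previously performed inline.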

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2019 Intel Corporation
+ * Copyright (C) 2017-2020 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *
@@ -78,4 +78,54 @@ TEST_F(clEnqueueNDRangeKernelTests, GivenNonZeroEventsAndEmptyEventWaitListWhenE
     EXPECT_EQ(CL_INVALID_EVENT_WAIT_LIST, retVal);
 }
 
+TEST_F(clEnqueueNDRangeKernelTests, GivenConcurrentKernelWhenExecutingKernelThenInvalidKernelErrorIsReturned) {
+    cl_uint workDim = 1;
+    size_t globalWorkOffset[3] = {0, 0, 0};
+    size_t globalWorkSize[3] = {1, 1, 1};
+    size_t localWorkSize[3] = {1, 1, 1};
+    cl_uint numEventsInWaitList = 0;
+    cl_event *eventWaitList = nullptr;
+    cl_event *event = nullptr;
+
+    pKernel->executionType = KernelExecutionType::Concurrent;
+
+    retVal = clEnqueueNDRangeKernel(
+        pCommandQueue,
+        pKernel,
+        workDim,
+        globalWorkOffset,
+        globalWorkSize,
+        localWorkSize,
+        numEventsInWaitList,
+        eventWaitList,
+        event);
+
+    EXPECT_EQ(CL_INVALID_KERNEL, retVal);
+}
+
+TEST_F(clEnqueueNDRangeKernelTests, GivenKernelWithAllocateSyncBufferPatchWhenExecutingKernelThenInvalidKernelErrorIsReturned) {
+    cl_uint workDim = 1;
+    size_t globalWorkOffset[3] = {0, 0, 0};
+    size_t globalWorkSize[3] = {1, 1, 1};
+    size_t localWorkSize[3] = {1, 1, 1};
+    cl_uint numEventsInWaitList = 0;
+    cl_event *eventWaitList = nullptr;
+    cl_event *event = nullptr;
+
+    SPatchAllocateSyncBuffer patchAllocateSyncBuffer;
+    pProgram->mockKernelInfo.patchInfo.pAllocateSyncBuffer = &patchAllocateSyncBuffer;
+    EXPECT_TRUE(pKernel->isUsingSyncBuffer());
+
+    retVal = clEnqueueNDRangeKernel(
+        pCommandQueue,
+        pKernel,
+        workDim,
+        globalWorkOffset,
+        globalWorkSize,
+        localWorkSize,
+        numEventsInWaitList,
+        eventWaitList,
+        event);
+
+    EXPECT_EQ(CL_INVALID_KERNEL, retVal);
+}
 } // namespace ULT

@@ -32,6 +32,7 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest {
         EnqueueHandlerTest::SetUp();
         kernelInternals = std::make_unique<MockKernelWithInternals>(*pClDevice, context);
         kernel = kernelInternals->mockKernel;
+        kernel->executionType = KernelExecutionType::Concurrent;
         commandQueue = reinterpret_cast<MockCommandQueue *>(new MockCommandQueueHw<FamilyType>(context, pClDevice, 0));
     }
@@ -53,10 +54,15 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest {
         return reinterpret_cast<MockSyncBufferHandler *>(pClDevice->syncBufferHandler.get());
     }
 
+    cl_int enqueueNDCount() {
+        return clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
+    }
+
     const cl_uint workDim = 1;
     const size_t gwOffset[3] = {0, 0, 0};
     const size_t lws[3] = {10, 1, 1};
     size_t workgroupCount[3] = {10, 1, 1};
+    size_t globalWorkSize[3] = {100, 1, 1};
     size_t workItemsCount = 10;
     std::unique_ptr<MockKernelWithInternals> kernelInternals;
     MockKernel *kernel;
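
With these fixture values, every enqueueNDCount() call requests 10 work-groups of 10 work-items each. A standalone check of that arithmetic (illustrative only):

    #include <cassert>
    #include <cstddef>

    int main() {
        const size_t lws[3] = {10, 1, 1};      // fixture local work size
        size_t workgroupCount[3] = {10, 1, 1}; // fixture work-group count
        size_t globalWorkSize[3];
        for (size_t i = 0; i < 3; i++) {
            globalWorkSize[i] = workgroupCount[i] * lws[i]; // {100, 1, 1}
        }
        assert(globalWorkSize[0] == 100); // matches the fixture's globalWorkSize
        return 0;
    }
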
@@ -64,10 +70,10 @@ class SyncBufferHandlerTest : public EnqueueHandlerTest {
     SPatchAllocateSyncBuffer sPatchAllocateSyncBuffer;
 };
 
-HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenAllocateSyncBufferPatchWhenEnqueuingKernelThenSyncBufferIsUsed) {
+HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenAllocateSyncBufferPatchAndConcurrentKernelWhenEnqueuingKernelThenSyncBufferIsUsed) {
     patchAllocateSyncBuffer();
 
-    clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
+    enqueueNDCount();
 
     auto syncBufferHandler = getSyncBufferHandler();
     EXPECT_EQ(workItemsCount, syncBufferHandler->usedBufferSize);
@@ -77,34 +83,53 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenAllocateSyncBufferPatchWhenEnqueu
               pDevice->getUltCommandStreamReceiver<FamilyType>().latestSentTaskCount);
 }
 
-HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenNoAllocateSyncBufferPatchWhenEnqueuingKernelThenSyncBufferIsNotUsedAndUsedBufferSizeIsNotUpdated) {
-    clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
-
-    auto syncBufferHandler = getSyncBufferHandler();
-    EXPECT_EQ(0u, syncBufferHandler->usedBufferSize);
+HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenConcurrentKernelWithoutAllocateSyncBufferPatchWhenEnqueuingConcurrentKernelThenSyncBufferIsNotCreated) {
+    auto retVal = enqueueNDCount();
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_EQ(nullptr, getSyncBufferHandler());
 }
 
-HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingKernelThenSuccessIsReturned) {
+HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenDefaultKernelUsingSyncBufferWhenEnqueuingKernelThenErrorIsReturnedAndSyncBufferIsNotCreated) {
+    patchAllocateSyncBuffer();
+    kernel->executionType = KernelExecutionType::Default;
+
+    auto retVal = enqueueNDCount();
+    EXPECT_EQ(CL_INVALID_KERNEL, retVal);
+    EXPECT_EQ(nullptr, getSyncBufferHandler());
+}
+
+HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenConcurrentKernelWithAllocateSyncBufferPatchWhenEnqueuingConcurrentKernelThenSyncBufferIsCreated) {
+    patchAllocateSyncBuffer();
+
+    auto retVal = enqueueNDCount();
+    EXPECT_EQ(CL_SUCCESS, retVal);
+    EXPECT_NE(nullptr, getSyncBufferHandler());
+}
+
+HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenMaxWorkgroupCountWhenEnqueuingConcurrentKernelThenSuccessIsReturned) {
     auto maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws);
     workgroupCount[0] = maxWorkGroupCount;
-    auto retVal = clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
+    globalWorkSize[0] = maxWorkGroupCount * lws[0];
+
+    auto retVal = enqueueNDCount();
     EXPECT_EQ(CL_SUCCESS, retVal);
 }
 
-HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenTooHighWorkgroupCountWhenEnqueuingKernelThenErrorIsReturned) {
+HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenTooHighWorkgroupCountWhenEnqueuingConcurrentKernelThenErrorIsReturned) {
     size_t maxWorkGroupCount = kernel->getMaxWorkGroupCount(workDim, lws);
     workgroupCount[0] = maxWorkGroupCount + 1;
-    auto retVal = clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
+    globalWorkSize[0] = maxWorkGroupCount * lws[0] + 1;
+
+    auto retVal = enqueueNDCount();
     EXPECT_EQ(CL_INVALID_VALUE, retVal);
 }
 
 HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSyncBufferFullWhenEnqueuingKernelThenNewBufferIsAllocated) {
     patchAllocateSyncBuffer();
-    clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
+    enqueueNDCount();
+
     auto syncBufferHandler = getSyncBufferHandler();
     syncBufferHandler->usedBufferSize = syncBufferHandler->bufferSize;
-    clEnqueueNDCountKernelINTEL(commandQueue, kernel, workDim, gwOffset, workgroupCount, lws, 0, nullptr, nullptr);
+    enqueueNDCount();
 
     EXPECT_EQ(workItemsCount, syncBufferHandler->usedBufferSize);
 }
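
The last test pins down the handler's growth strategy: once usedBufferSize reaches bufferSize, the next enqueue gets a fresh buffer and usage restarts at that enqueue's size. A minimal sketch of that bookkeeping under those assumptions (not NEO's actual SyncBufferHandler):

    #include <cstddef>

    // Illustrative grow-on-demand bookkeeping, inferred from the test above.
    struct SyncBufferSketch {
        size_t bufferSize = 64 * 1024; // hypothetical capacity
        size_t usedBufferSize = 0;

        void makeRoom(size_t requestSize) {
            if (usedBufferSize + requestSize > bufferSize) {
                // Allocate a replacement buffer; usage restarts for the new one.
                usedBufferSize = 0;
            }
            usedBufferSize += requestSize;
        }
    };
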
@@ -128,14 +153,6 @@ HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenSshRequiredWhenPatchingSyncBuffer
     EXPECT_EQ(bufferAddress, surfaceAddress);
 }
 
-HWTEST_TEMPLATED_F(SyncBufferHandlerTest, GivenKernelUsingSyncBufferWhenUsingStandardEnqueueThenErrorIsReturned) {
-    patchAllocateSyncBuffer();
-
-    size_t globalWorkSize[3] = {workgroupCount[0] * lws[0], workgroupCount[1] * lws[1], workgroupCount[2] * lws[2]};
-    auto retVal = clEnqueueNDRangeKernel(commandQueue, kernel, workDim, gwOffset, globalWorkSize, lws, 0, nullptr, nullptr);
-    EXPECT_EQ(CL_INVALID_KERNEL, retVal);
-}
-
 TEST(SyncBufferHandlerDeviceTest, GivenRootDeviceWhenAllocateSyncBufferIsCalledTwiceThenTheObjectIsCreatedOnlyOnce) {
     const size_t testUsedBufferSize = 100;
     MockClDevice rootDevice{new MockDevice};