compute-runtime/unit_tests/execution_model/parent_kernel_dispatch_test...

415 lines
20 KiB
C++

/*
* Copyright (c) 2017 - 2018, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/command_queue/enqueue_kernel.h"
#include "runtime/event/perf_counter.h"
#include "runtime/kernel/kernel.h"
#include "runtime/sampler/sampler.h"
#include "unit_tests/fixtures/execution_model_fixture.h"
#include "unit_tests/helpers/hw_parse.h"
#include "unit_tests/mocks/mock_kernel.h"
#include "unit_tests/mocks/mock_program.h"
#include "unit_tests/mocks/mock_context.h"
#include "unit_tests/mocks/mock_mdi.h"

#include <memory>
using namespace OCLRT;

// First parameter of every ParentKernelDispatchTest case (see the suite instantiation below).
static const char *binaryFile = "simple_block_kernel";

// Second parameter: the suite is instantiated once per entry in this list.
static const char *KernelNames[] = {"kernel_reflection", "simple_block_kernel"};

// The parameterized dispatch tests reuse the ExecutionModelKernelTest fixture unchanged.
using ParentKernelDispatchTest = ExecutionModelKernelTest;
// Dispatches the parent kernel on a non-blocked queue and checks that the command
// queue's dynamic state heap stays unused while the device queue's dynamic state
// heap usage changes. The body only runs when the device reports an "OpenCL 2."
// version string.
HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDeviceQueueDSHIsUsed) {
    const std::string clVersion(pPlatform->getDevice(0)->getDeviceInfo().clVersion);
    if (clVersion.find("OpenCL 2.") == std::string::npos) {
        return;
    }

    auto *deviceQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
    KernelOperation *blockedCommands = nullptr;
    const size_t offsets[3] = {0, 0, 0};
    const size_t globalSize[3] = {1, 1, 1};

    pKernel->createReflectionSurface();

    // Snapshot both heaps before the walker is dispatched.
    const size_t cmdQueueDshBefore = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE).getUsed();
    EXPECT_EQ(0u, cmdQueueDshBefore);
    const size_t devQueueDshBefore = deviceQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)->getUsed();

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
        *pCmdQ,
        *pKernel,
        1,
        offsets,
        globalSize,
        nullptr,
        0,
        nullptr,
        &blockedCommands,
        nullptr,
        nullptr,
        pDevice->getPreemptionMode(),
        false); // blockQueue

    // The command queue heap must still be empty; only the device queue heap may move.
    const size_t cmdQueueDshAfter = pCmdQ->getIndirectHeap(IndirectHeap::DYNAMIC_STATE).getUsed();
    EXPECT_EQ(0u, cmdQueueDshAfter);
    const size_t devQueueDshAfter = deviceQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE)->getUsed();
    EXPECT_NE(devQueueDshBefore, devQueueDshAfter);
}
// Requests the DYNAMIC_STATE heap for a parent-kernel dispatch and expects the very
// heap object owned by the device queue. The body only runs when the device reports
// an "OpenCL 2." version string.
HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenDynamicStateHeapIsRequestedThenDeviceQueueHeapIsReturned) {
    const std::string clVersion(pPlatform->getDevice(0)->getDeviceInfo().clVersion);
    if (clVersion.find("OpenCL 2.") == std::string::npos) {
        return;
    }

    auto *deviceQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
    MockMultiDispatchInfo dispatchInfos(pKernel);

    auto &requestedHeap = getIndirectHeap<FamilyType, IndirectHeap::DYNAMIC_STATE>(*pCmdQ, dispatchInfos);
    auto *requestedHeapPtr = &requestedHeap;
    auto deviceQueueHeap = deviceQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);

    // Pointer identity: both must refer to the same heap object.
    EXPECT_EQ(deviceQueueHeap, requestedHeapPtr);
}
// Requests the INDIRECT_OBJECT heap for a parent-kernel dispatch and expects the
// device queue's DYNAMIC_STATE heap object to be handed back. The body only runs
// when the device reports an "OpenCL 2." version string.
HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenIndirectObjectHeapIsRequestedThenDeviceQueueDSHIsReturned) {
    const std::string clVersion(pPlatform->getDevice(0)->getDeviceInfo().clVersion);
    if (clVersion.find("OpenCL 2.") == std::string::npos) {
        return;
    }

    auto *deviceQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
    MockMultiDispatchInfo dispatchInfos(pKernel);

    auto &requestedHeap = getIndirectHeap<FamilyType, IndirectHeap::INDIRECT_OBJECT>(*pCmdQ, dispatchInfos);
    auto *requestedHeapPtr = &requestedHeap;
    auto deviceQueueDsh = deviceQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);

    // Pointer identity: the indirect object heap request resolves to the device queue DSH.
    EXPECT_EQ(deviceQueueDsh, requestedHeapPtr);
}
// Dispatches the parent kernel on a non-blocked queue and checks that nothing was
// consumed from the command queue's own INDIRECT_OBJECT heap. The body only runs
// when the device reports an "OpenCL 2." version string.
HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenDefaultCmdQIOHIsNotUsed) {
    const std::string clVersion(pPlatform->getDevice(0)->getDeviceInfo().clVersion);
    if (clVersion.find("OpenCL 2.") == std::string::npos) {
        return;
    }

    KernelOperation *blockedCommands = nullptr;
    const size_t offsets[3] = {0, 0, 0};
    const size_t globalSize[3] = {1, 1, 1};
    MockMultiDispatchInfo dispatchInfos(pKernel);

    // Grab the heap before dispatching so the same object is inspected afterwards.
    auto &cmdQueueIoh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
        *pCmdQ,
        *pKernel,
        1,
        offsets,
        globalSize,
        nullptr,
        0,
        nullptr,
        &blockedCommands,
        nullptr,
        nullptr,
        pDevice->getPreemptionMode(),
        false); // blockQueue

    const auto usedAfterDispatch = cmdQueueIoh.getUsed();
    EXPECT_EQ(0u, usedAfterDispatch);
}
// After a non-blocked parent-kernel dispatch, the command queue's SURFACE_STATE heap
// must be large enough for the kernel's own surface state heap and for the total SSH
// requirement plus the execution-model requirement. The body only runs when the
// device reports an "OpenCL 2." version string.
HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsNotBlockedThenSSHSizeAccountForsBlocksSurfaceStates) {
    const std::string clVersion(pPlatform->getDevice(0)->getDeviceInfo().clVersion);
    if (clVersion.find("OpenCL 2.") == std::string::npos) {
        return;
    }

    KernelOperation *blockedCommands = nullptr;
    const size_t offsets[3] = {0, 0, 0};
    const size_t globalSize[3] = {1, 1, 1};
    MockMultiDispatchInfo dispatchInfos(pKernel);

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
        *pCmdQ,
        *pKernel,
        1,
        offsets,
        globalSize,
        nullptr,
        0,
        nullptr,
        &blockedCommands,
        nullptr,
        nullptr,
        pDevice->getPreemptionMode(),
        false); // blockQueue

    auto &surfaceStateHeap = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE);

    // The heap has to fit at least the parent kernel's own surface state heap.
    EXPECT_LE(pKernel->getKernelInfo().heapInfo.pKernelHeader->SurfaceStateHeapSize, surfaceStateHeap.getMaxAvailableSpace());

    // ...and the combined requirement of the dispatch and the execution model.
    const size_t requiredForDispatch = KernelCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(dispatchInfos);
    const size_t requiredForExecutionModel = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*pKernel);
    EXPECT_LE(requiredForDispatch + requiredForExecutionModel, surfaceStateHeap.getMaxAvailableSpace());
}
// Dispatches the parent kernel with the queue blocked and inspects the surface state
// heap stored in the returned blocked-commands data: its usage must not exceed the
// total SSH requirement and must be strictly below that requirement plus the
// execution-model requirement. The body only runs when the device reports an
// "OpenCL 2." version string.
HWTEST_P(ParentKernelDispatchTest, givenParentKernelWhenQueueIsBlockedThenSSHSizeForParentIsAllocated) {
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;

    const std::string clVersion(pPlatform->getDevice(0)->getDeviceInfo().clVersion);
    if (clVersion.find("OpenCL 2.") == std::string::npos) {
        return;
    }

    KernelOperation *blockedCommands = nullptr;
    const size_t offsets[3] = {0, 0, 0};
    const size_t globalSize[3] = {1, 1, 1};
    MockMultiDispatchInfo dispatchInfos(pKernel);

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
        *pCmdQ,
        *pKernel,
        1,
        offsets,
        globalSize,
        nullptr,
        0,
        nullptr,
        &blockedCommands,
        nullptr,
        nullptr,
        pDevice->getPreemptionMode(),
        true); // blockQueue
    ASSERT_NE(nullptr, blockedCommands);

    const size_t requiredForDispatch = KernelCommandsHelper<FamilyType>::getTotalSizeRequiredSSH(dispatchInfos);
    const size_t requiredForExecutionModel = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*pKernel);
    const size_t usedInBlockedSsh = blockedCommands->ssh->getUsed();

    // Surface states plus binding table entries of the parent kernel.
    const auto surfaceStatesSize = pKernel->getNumberOfBindingTableStates() * sizeof(RENDER_SURFACE_STATE);
    const auto bindingTableSize = pKernel->getKernelInfo().patchInfo.bindingTableState->Count * sizeof(BINDING_TABLE_STATE);
    const size_t expectedParentSsh = surfaceStatesSize + bindingTableSize;

    // The exact size is only checked when the kernel needs SSH for buffers or has image arguments.
    const bool parentUsesSsh = (pKernel->requiresSshForBuffers()) || (pKernel->getKernelInfo().patchInfo.imageMemObjKernelArgs.size() > 0);
    if (parentUsesSsh) {
        EXPECT_EQ(expectedParentSsh, usedInBlockedSsh);
    }

    EXPECT_GE(requiredForDispatch, usedInBlockedSsh);
    // Total SSH size including EM must be greater than the SSH actually used
    EXPECT_GT(requiredForDispatch + requiredForExecutionModel, usedInBlockedSsh);

    delete blockedCommands;
}
// Instantiates the parameterized ParentKernelDispatchTest suite: each test runs once
// for every (binaryFile, kernel name) pair, with the kernel names taken from KernelNames.
INSTANTIATE_TEST_CASE_P(ParentKernelDispatchTest,
ParentKernelDispatchTest,
::testing::Combine(
::testing::Values(binaryFile),
::testing::ValuesIn(KernelNames)));
// The command stream tests reuse the ParentKernelCommandQueueFixture unchanged.
using ParentKernelCommandStreamFixture = ParentKernelCommandQueueFixture;

// Builds a multi-dispatch of parent kernels whose combined command-stream size leaves
// less than one scheduler dispatch of slack up to the next page boundary, then checks
// that the acquired command stream offers more space than the page-aligned kernel
// total. The body only runs when the device supports OpenCL version 20 or newer.
HWTEST_F(ParentKernelCommandStreamFixture, GivenDispatchInfoWithParentKernelWhenCommandStreamIsAcquiredThenSizeAccountsForSchedulerDispatch) {
    if (device->getSupportedClVersion() < 20) {
        return;
    }

    MockParentKernel *parentKernel = MockParentKernel::create(*device);
    DispatchInfo singleDispatch(parentKernel, 1, Vec3<size_t>{24, 1, 1}, Vec3<size_t>{24, 1, 1}, Vec3<size_t>{0, 0, 0});
    MultiDispatchInfo dispatchInfos;

    // Start with as many kernels as fit into a single page.
    size_t kernelSize = EnqueueOperation<FamilyType, CL_COMMAND_NDRANGE_KERNEL>::getSizeRequiredCS(false, false, *pCmdQ, parentKernel);
    size_t kernelCount = MemoryConstants::pageSize / kernelSize;
    size_t slack = MemoryConstants::pageSize - (kernelCount * kernelSize);

    SchedulerKernel &schedulerKernel = BuiltIns::getInstance().getSchedulerKernel(*parentKernel->getContext());
    size_t schedulerSize = EnqueueOperation<FamilyType, CL_COMMAND_NDRANGE_KERNEL>::getSizeRequiredCS(false, false, *pCmdQ, &schedulerKernel);

    // Keep adding kernels until the scheduler no longer fits into the leftover space.
    while (slack >= schedulerSize) {
        kernelCount++;
        slack = alignUp(kernelCount * kernelSize, MemoryConstants::pageSize) - kernelCount * kernelSize;
    }

    for (size_t remaining = kernelCount; remaining > 0; --remaining) {
        dispatchInfos.push(singleDispatch);
    }

    size_t alignedKernelTotal = alignUp(kernelCount * kernelSize, MemoryConstants::pageSize);
    LinearStream &acquiredStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, dispatchInfos);
    EXPECT_LT(alignedKernelTotal, acquiredStream.getMaxAvailableSpace());

    delete parentKernel;
}
class MockParentKernelDispatch : public ExecutionModelSchedulerTest,
public testing::Test {
void SetUp() override {
ExecutionModelSchedulerTest::SetUp();
}
void TearDown() override {
ExecutionModelSchedulerTest::TearDown();
}
};
// Dispatches a mock parent kernel with the queue blocked and checks that the returned
// blocked-commands data uses one and the same heap object as its dsh and its ioh.
// The body only runs when the device supports OpenCL version 20 or newer.
//
// Ownership is held by std::unique_ptr: a failing ASSERT_* returns from the test body
// immediately, which previously skipped the trailing delete of the mock kernel.
HWTEST_F(MockParentKernelDispatch, GivenBlockedQueueWhenParentKernelIsDispatchedThenDshHeapForIndirectObjectHeapIsUsed) {
    if (pDevice->getSupportedClVersion() < 20) {
        return;
    }

    std::unique_ptr<MockParentKernel> mockParentKernel(MockParentKernel::create(*pDevice));
    KernelOperation *blockedCommandsData = nullptr;
    const size_t globalOffsets[3] = {0, 0, 0};
    const size_t workItems[3] = {1, 1, 1};

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
                                                  *mockParentKernel,
                                                  1,
                                                  globalOffsets,
                                                  workItems,
                                                  nullptr,
                                                  0,
                                                  nullptr,
                                                  &blockedCommandsData,
                                                  nullptr,
                                                  nullptr,
                                                  pDevice->getPreemptionMode(),
                                                  true); // blockQueue

    // Take ownership right away; declared after the kernel so it is released first,
    // matching the original delete order.
    std::unique_ptr<KernelOperation> blockedCommandsOwner(blockedCommandsData);

    ASSERT_NE(nullptr, blockedCommandsData);
    EXPECT_EQ(blockedCommandsData->dsh.get(), blockedCommandsData->ioh.get());
}
// Dispatches a mock parent kernel on a non-blocked queue, parses the command stream
// and checks the MEDIA_INTERFACE_DESCRIPTOR_LOAD found before the walker: its
// interface descriptor data must start at DeviceQueue::colorCalcStateSize and span
// exactly one INTERFACE_DESCRIPTOR_DATA. The body only runs when the device supports
// OpenCL version 20 or newer.
//
// The mock kernel is owned by std::unique_ptr: either ASSERT_NE below returns from
// the test body on failure, which previously skipped the trailing delete.
HWTEST_F(MockParentKernelDispatch, GivenParentKernelWhenDispatchedThenMediaInterfaceDescriptorLoadIsCorrectlyProgrammed) {
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    if (pDevice->getSupportedClVersion() < 20) {
        return;
    }

    std::unique_ptr<MockParentKernel> mockParentKernel(MockParentKernel::create(*pDevice));
    KernelOperation *blockedCommandsData = nullptr;
    const size_t globalOffsets[3] = {0, 0, 0};
    const size_t workItems[3] = {1, 1, 1};

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
                                                  *mockParentKernel,
                                                  1,
                                                  globalOffsets,
                                                  workItems,
                                                  nullptr,
                                                  0,
                                                  nullptr,
                                                  &blockedCommandsData,
                                                  nullptr,
                                                  nullptr,
                                                  pDevice->getPreemptionMode(),
                                                  false); // blockQueue

    LinearStream *commandStream = &pCmdQ->getCS(0);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(*commandStream, 0);
    hwParser.findHardwareCommands<FamilyType>();

    ASSERT_NE(hwParser.cmdList.end(), hwParser.itorMediaInterfaceDescriptorLoad);

    // Only commands located before the walker are searched.
    auto pCmd = static_cast<MEDIA_INTERFACE_DESCRIPTOR_LOAD *>(hwParser.getCommand<MEDIA_INTERFACE_DESCRIPTOR_LOAD>(hwParser.cmdList.begin(), hwParser.itorWalker));
    ASSERT_NE(nullptr, pCmd);

    uint32_t offsetInterfaceDescriptorData = DeviceQueue::colorCalcStateSize;
    uint32_t sizeInterfaceDescriptorData = sizeof(INTERFACE_DESCRIPTOR_DATA);

    EXPECT_EQ(offsetInterfaceDescriptorData, pCmd->getInterfaceDescriptorDataStartAddress());
    EXPECT_EQ(sizeInterfaceDescriptorData, pCmd->getInterfaceDescriptorTotalLength());
}
// Consumes 20 bytes of the command queue's SURFACE_STATE heap, dispatches a mock
// parent kernel on a non-blocked queue and expects the heap to report zero usage
// afterwards. The body only runs when the device supports OpenCL version 20 or newer.
//
// The mock kernel is owned by std::unique_ptr: the ASSERT_EQ below returns from the
// test body on failure, which previously skipped the trailing delete.
HWTEST_F(MockParentKernelDispatch, GivenUsedSSHHeapWhenParentKernelIsDispatchedThenNewSSHIsAllocated) {
    if (pDevice->getSupportedClVersion() < 20) {
        return;
    }

    std::unique_ptr<MockParentKernel> mockParentKernel(MockParentKernel::create(*pDevice));
    KernelOperation *blockedCommandsData = nullptr;
    const size_t globalOffsets[3] = {0, 0, 0};
    const size_t workItems[3] = {1, 1, 1};

    auto &ssh = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 100);
    ssh.getSpace(20);
    EXPECT_EQ(20u, ssh.getUsed());

    // This check assumes the parent kernel does not use SSH. The reason: storing the
    // allocation on the reuse list and then obtaining a new one from that list returns
    // the same allocation, so the heap buffer does not differ. When the parent does not
    // use SSH, the heap obtained has zero usage and the same buffer.
    ASSERT_EQ(0u, mockParentKernel->getKernelInfo().heapInfo.pKernelHeader->SurfaceStateHeapSize);

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(*pCmdQ,
                                                  *mockParentKernel,
                                                  1,
                                                  globalOffsets,
                                                  workItems,
                                                  nullptr,
                                                  0,
                                                  nullptr,
                                                  &blockedCommandsData,
                                                  nullptr,
                                                  nullptr,
                                                  pDevice->getPreemptionMode(),
                                                  false); // blockQueue

    EXPECT_EQ(0u, ssh.getUsed());
}
// With an untouched SURFACE_STATE heap on the command queue, dispatching a mock parent
// kernel on a non-blocked queue must leave the heap's CPU base address unchanged.
// The body only runs when the device supports OpenCL version 20 or newer.
HWTEST_F(MockParentKernelDispatch, GivenNotUsedSSHHeapWhenParentKernelIsDispatchedThenExistingSSHIsUsed) {
    if (pDevice->getSupportedClVersion() < 20) {
        return;
    }

    MockParentKernel *parentKernel = MockParentKernel::create(*pDevice);
    KernelOperation *blockedCommands = nullptr;
    const size_t offsets[3] = {0, 0, 0};
    const size_t globalSize[3] = {1, 1, 1};

    auto &surfaceStateHeap = pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 100);
    EXPECT_EQ(0u, surfaceStateHeap.getUsed());

    // Remember the backing buffer so it can be compared after the dispatch.
    auto *baseBeforeDispatch = surfaceStateHeap.getCpuBase();

    GpgpuWalkerHelper<FamilyType>::dispatchWalker(
        *pCmdQ,
        *parentKernel,
        1,
        offsets,
        globalSize,
        nullptr,
        0,
        nullptr,
        &blockedCommands,
        nullptr,
        nullptr,
        pDevice->getPreemptionMode(),
        false); // blockQueue

    EXPECT_EQ(baseBeforeDispatch, surfaceStateHeap.getCpuBase());

    delete parentKernel;
}