compute-runtime/unit_tests/helpers/kernel_commands_tests.cpp

1265 lines
58 KiB
C++

/*
* Copyright (C) 2017-2018 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "runtime/built_ins/built_ins.h"
#include "runtime/built_ins/builtins_dispatch_builder.h"
#include "hw_cmds.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/helpers/basic_math.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/kernel/kernel.h"
#include "unit_tests/fixtures/context_fixture.h"
#include "unit_tests/fixtures/device_fixture.h"
#include "unit_tests/fixtures/image_fixture.h"
#include "unit_tests/fixtures/execution_model_kernel_fixture.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
#include "unit_tests/indirect_heap/indirect_heap_fixture.h"
#include "unit_tests/fixtures/built_in_fixture.h"
#include "unit_tests/mocks/mock_kernel.h"
#include "unit_tests/mocks/mock_program.h"
#include "unit_tests/mocks/mock_context.h"
#include "test.h"
#include <memory>
using namespace OCLRT;
// Test fixture for the KernelCommandsHelper tests. It combines the device,
// context and built-ins fixtures so each test has a device, a single-device
// context and the built-in kernels available.
struct KernelCommandsTest : DeviceFixture,
                            ContextFixture,
                            BuiltInFixture,
                            ::testing::Test {

    using BuiltInFixture::SetUp;
    using ContextFixture::SetUp;

    // Brings the fixtures up in the order device -> context -> built-ins;
    // each step is asserted before the next one uses its result.
    void SetUp() override {
        DeviceFixture::SetUp();
        ASSERT_NE(nullptr, pDevice);
        cl_device_id device = pDevice;
        ContextFixture::SetUp(1, &device);
        ASSERT_NE(nullptr, pContext);
        BuiltInFixture::SetUp(pDevice);
        ASSERT_NE(nullptr, pBuiltIns);
    }

    // Tears the fixtures down in the reverse order of SetUp().
    void TearDown() override {
        BuiltInFixture::TearDown();
        ContextFixture::TearDown();
        DeviceFixture::TearDown();
    }

    // Zero-initialized so that a read before the first assignment is
    // deterministic instead of yielding an indeterminate value.
    size_t sizeRequiredCS = 0;
    size_t sizeRequiredISH = 0;
};
// Checks DSH consumption around sendInterfaceDescriptorData(): the heap usage
// delta must equal the single INTERFACE_DESCRIPTOR_DATA slot that the test
// reserves itself before the call.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, programInterfaceDescriptorDataResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    // The builder is obtained by reference, so it needs no null check.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                        cmdQ.getContext(), cmdQ.getDevice());

    BuiltinDispatchInfoBuilder::BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);

    // Fatal on purpose: begin() is dereferenced right below, which is only
    // valid when at least one dispatch info was produced.
    ASSERT_NE(0u, multiDispatchInfo.size());
    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto usedIndirectHeapBefore = indirectHeap.getUsed();
    indirectHeap.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    size_t crossThreadDataSize = kernel->getCrossThreadDataSize();
    KernelCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
        indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, 1, 0 * KB, 0, false, pDevice->getPreemptionMode(), nullptr);

    auto usedIndirectHeapAfter = indirectHeap.getUsed();
    EXPECT_EQ(sizeof(INTERFACE_DESCRIPTOR_DATA), usedIndirectHeapAfter - usedIndirectHeapBefore);
}
// sendMediaInterfaceDescriptorLoad() is expected to consume exactly
// sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + sizeof(MEDIA_STATE_FLUSH) bytes
// of the command stream.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, programMediaInterfaceDescriptorLoadResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);

    const auto sizeBefore = cs.getUsed();
    KernelCommandsHelper<FamilyType>::sendMediaInterfaceDescriptorLoad(cs, 0, sizeof(INTERFACE_DESCRIPTOR_DATA));
    const auto bytesConsumed = cs.getUsed() - sizeBefore;

    const auto bytesExpected = sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + sizeof(MEDIA_STATE_FLUSH);
    EXPECT_EQ(bytesExpected, bytesConsumed);
}
// sendMediaStateFlush() is expected to consume exactly one MEDIA_STATE_FLUSH
// worth of command-stream space.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, programMediaStateFlushResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);

    const auto sizeBefore = cs.getUsed();
    KernelCommandsHelper<FamilyType>::sendMediaStateFlush(cs, sizeof(INTERFACE_DESCRIPTOR_DATA));
    const auto bytesConsumed = cs.getUsed() - sizeBefore;

    EXPECT_EQ(sizeof(MEDIA_STATE_FLUSH), bytesConsumed);
}
// sendCrossThreadData() is expected to consume exactly the kernel's
// cross-thread data size from the indirect heap it is given.
HWTEST_F(KernelCommandsTest, sendCrossThreadDataResourceUsage) {
    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    // The builder is obtained by reference, so it needs no null check.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                        cmdQ.getContext(), cmdQ.getDevice());

    BuiltinDispatchInfoBuilder::BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);

    // Fatal on purpose: begin() is dereferenced right below, which is only
    // valid when at least one dispatch info was produced.
    ASSERT_NE(0u, multiDispatchInfo.size());
    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto usedBefore = indirectHeap.getUsed();

    // Kept as a named local because it is handed to the helper as an argument.
    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
    KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
        *kernel,
        false,
        nullptr,
        sizeCrossThreadData);

    auto usedAfter = indirectHeap.getUsed();
    EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
}
// Without the AddPatchInfoCommentsForAUBDump flag being set by the test, the
// patch info entry registered on the kernel is expected to be unchanged after
// sendCrossThreadData().
HWTEST_F(KernelCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsNotSetThenAddPatchInfoDataOffsetsAreNotMoved) {
    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);
    auto kernelInfo = std::make_unique<KernelInfo>();
    std::unique_ptr<MockKernel> kernel(new MockKernel(&program, *kernelInfo, *pDevice));

    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);

    // Seed the kernel with one recognisable patch info entry.
    PatchInfoData seededEntry = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
    kernel->getPatchInfoDataList().push_back(seededEntry);

    auto crossThreadDataSize = kernel->getCrossThreadDataSize();
    KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        ioh,
        *kernel,
        false,
        nullptr,
        crossThreadDataSize);

    auto &patchInfoList = kernel->getPatchInfoDataList();
    ASSERT_EQ(1u, patchInfoList.size());

    // Every field must still carry the seeded value.
    const auto &entry = patchInfoList[0];
    EXPECT_EQ(0xaaaaaaaa, entry.sourceAllocation);
    EXPECT_EQ(0u, entry.sourceAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::KernelArg, entry.sourceType);
    EXPECT_EQ(0xbbbbbbbb, entry.targetAllocation);
    EXPECT_EQ(0u, entry.targetAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::IndirectObjectHeap, entry.targetType);
}
// For an IndirectHeap constructed with the second argument set to false,
// sendCrossThreadData() is expected to return offset 0 on a fresh heap.
HWTEST_F(KernelCommandsTest, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {
    auto memoryManager = pDevice->getMemoryManager();
    auto heapAllocation = memoryManager->allocateGraphicsMemoryWithProperties(MockAllocationProperties{MemoryConstants::pageSize});
    IndirectHeap heap(heapAllocation, false);

    MockKernelWithInternals kernelWithInternals(*pDevice);
    auto &kernel = *kernelWithInternals.mockKernel;

    auto crossThreadDataSize = kernel.getCrossThreadDataSize();
    auto returnedOffset = KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        heap,
        kernel,
        false,
        nullptr,
        crossThreadDataSize);
    EXPECT_EQ(0u, returnedOffset);

    // The allocation was made by the test, so the test releases it.
    memoryManager->freeGraphicsMemory(heapAllocation);
}
// For an IndirectHeap constructed with the second argument set to true over a
// 32-bit internal allocation, sendCrossThreadData() is expected to return the
// allocation's getGpuAddressToPatch() value on a fresh heap.
HWTEST_F(KernelCommandsTest, givenIndirectHeapAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenHeapBaseOffsetIsReturned) {
    auto memoryManager = pDevice->getMemoryManager();
    auto heapAllocation = memoryManager->allocate32BitGraphicsMemory(MemoryConstants::pageSize, nullptr, AllocationOrigin::INTERNAL_ALLOCATION);
    IndirectHeap heap(heapAllocation, true);
    auto offsetExpected = heapAllocation->getGpuAddressToPatch();

    MockKernelWithInternals kernelWithInternals(*pDevice);
    auto &kernel = *kernelWithInternals.mockKernel;

    auto crossThreadDataSize = kernel.getCrossThreadDataSize();
    auto returnedOffset = KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        heap,
        kernel,
        false,
        nullptr,
        crossThreadDataSize);
    EXPECT_EQ(offsetExpected, returnedOffset);

    // The allocation was made by the test, so the test releases it.
    memoryManager->freeGraphicsMemory(heapAllocation);
}
// With AddPatchInfoCommentsForAUBDump enabled, the first patch info entry is
// expected to be retargeted to the indirect heap's GPU address at the offset
// returned by sendCrossThreadData().
HWTEST_F(KernelCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsSetThenAddPatchInfoDataOffsetsAreMoved) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.AddPatchInfoCommentsForAUBDump.set(true);

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);
    auto kernelInfo = std::make_unique<KernelInfo>();
    std::unique_ptr<MockKernel> kernel(new MockKernel(&program, *kernelInfo, *pDevice));

    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    // Pre-consume 128 bytes so the cross-thread data lands at a non-zero offset.
    ioh.getSpace(128u);

    PatchInfoData firstSeed = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
    PatchInfoData secondSeed = {0xcccccccc, 0, PatchInfoAllocationType::IndirectObjectHeap, 0xdddddddd, 0, PatchInfoAllocationType::Default};
    kernel->getPatchInfoDataList().push_back(firstSeed);
    kernel->getPatchInfoDataList().push_back(secondSeed);

    auto crossThreadDataSize = kernel->getCrossThreadDataSize();
    auto crossThreadDataOffset = KernelCommandsHelper<FamilyType>::sendCrossThreadData(
        ioh,
        *kernel,
        false,
        nullptr,
        crossThreadDataSize);
    ASSERT_NE(0u, crossThreadDataOffset);
    EXPECT_EQ(128u, crossThreadDataOffset);

    auto &patchInfoList = kernel->getPatchInfoDataList();
    ASSERT_EQ(2u, patchInfoList.size());

    // Only the first entry is inspected: source side untouched, target side moved.
    const auto &movedEntry = patchInfoList[0];
    EXPECT_EQ(0xaaaaaaaa, movedEntry.sourceAllocation);
    EXPECT_EQ(0u, movedEntry.sourceAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::KernelArg, movedEntry.sourceType);
    EXPECT_NE(0xbbbbbbbb, movedEntry.targetAllocation);
    EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), movedEntry.targetAllocation);
    EXPECT_NE(0u, movedEntry.targetAllocationOffset);
    EXPECT_EQ(crossThreadDataOffset, movedEntry.targetAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::IndirectObjectHeap, movedEntry.targetType);
}
// Checks that the getSizeRequired{DSH,IOH,SSH,CS}() estimates are upper bounds
// for what sendMediaInterfaceDescriptorLoad() + sendIndirectState() actually
// consume from the respective heaps and the command stream.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, sendIndirectStateResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    // The builder is obtained by reference, so it needs no null check.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                        cmdQ.getContext(), cmdQ.getDevice());

    BuiltinDispatchInfoBuilder::BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);

    // Fatal on purpose: begin() is dereferenced right below, which is only
    // valid when at least one dispatch info was produced.
    ASSERT_NE(0u, multiDispatchInfo.size());
    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSize = 256;
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};

    // The walker command is placed in the stream before the usage snapshot,
    // so it is not counted in the command-stream delta below.
    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    auto usedBeforeCS = commandStream.getUsed();
    auto usedBeforeDSH = dsh.getUsed();
    auto usedBeforeIOH = ioh.getUsed();
    auto usedBeforeSSH = ssh.getUsed();

    // Reserve the interface descriptor slot and remember where it lives.
    dsh.align(KernelCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
    size_t IDToffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    KernelCommandsHelper<FamilyType>::sendMediaInterfaceDescriptorLoad(
        commandStream,
        IDToffset,
        sizeof(INTERFACE_DESCRIPTOR_DATA));

    uint32_t interfaceDescriptorIndex = 0;
    KernelCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *kernel,
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        IDToffset,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
        true,
        true,
        false);

    // It's okay these are EXPECT_GE as they're only going to be used for
    // estimation purposes to avoid OOM.
    auto usedAfterDSH = dsh.getUsed();
    auto usedAfterIOH = ioh.getUsed();
    auto usedAfterSSH = ssh.getUsed();
    auto sizeRequiredDSH = KernelCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto sizeRequiredIOH = KernelCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSize);
    auto sizeRequiredSSH = KernelCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);

    EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
    EXPECT_GE(sizeRequiredIOH, usedAfterIOH - usedBeforeIOH);
    EXPECT_GE(sizeRequiredSSH, usedAfterSSH - usedBeforeSSH);

    auto usedAfterCS = commandStream.getUsed();
    EXPECT_GE(KernelCommandsHelper<FamilyType>::getSizeRequiredCS(), usedAfterCS - usedBeforeCS);
}
// After sendIndirectState(), the interface descriptor at the DSH base is
// expected to report the kernel's binding table state count when
// doBindingTablePrefetch() is true, and zero otherwise.
// NOTE(review): the test name mentions four entries, but the kernel is
// configured with three below.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWithFourBindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsCorrectBindingTableEntryCount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);
    auto walker = static_cast<GPGPU_WALKER *>(cs.getSpace(sizeof(GPGPU_WALKER)));
    *walker = FamilyType::cmdInitGpgpuWalker;

    MockKernelWithInternals kernelWithInternals(*pDevice, pContext);
    auto kernel = kernelWithInternals.mockKernel;
    auto configuredEntryCount = 3u;
    kernel->numberOfBindingTableStates = configuredEntryCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t localWorkSizes[3]{256, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    KernelCommandsHelper<FamilyType>::sendIndirectState(
        cs,
        dsh,
        ioh,
        ssh,
        *kernel,
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        walker,
        nullptr,
        true,
        true,
        false);

    auto idd = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    const auto entryCountExpected = KernelCommandsHelper<FamilyType>::doBindingTablePrefetch() ? configuredEntryCount : 0u;
    EXPECT_EQ(entryCountExpected, idd->getBindingTableEntryCount());
}
// For a kernel flagged as scheduler kernel, the interface descriptor at the
// DSH base is expected to report zero binding table entries even though the
// kernel is configured with three binding table states.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelThatIsSchedulerWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsZeroBindingTableEntryCount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);
    auto walker = static_cast<GPGPU_WALKER *>(cs.getSpace(sizeof(GPGPU_WALKER)));
    *walker = FamilyType::cmdInitGpgpuWalker;

    MockKernelWithInternals kernelWithInternals(*pDevice, pContext);
    auto kernel = kernelWithInternals.mockKernel;
    auto configuredEntryCount = 3u;
    kernel->numberOfBindingTableStates = configuredEntryCount;

    // The flag is written through a const_cast, exactly as the test always did.
    // NOTE(review): presumably the member is declared const - confirm that
    // writing through the cast is well-defined for MockKernel.
    const_cast<bool &>(kernel->isSchedulerKernel) = true;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t localWorkSizes[3]{256, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    KernelCommandsHelper<FamilyType>::sendIndirectState(
        cs,
        dsh,
        ioh,
        ssh,
        *kernel,
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        walker,
        nullptr,
        true,
        true,
        false);

    auto idd = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    EXPECT_EQ(0u, idd->getBindingTableEntryCount());
}
// With 100 binding table states on the kernel, the interface descriptor at the
// DSH base is expected to report 31 entries when doBindingTablePrefetch() is
// true, and zero otherwise.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, givenKernelWith100BindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorHas31BindingTableEntriesSet) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);
    auto walker = static_cast<GPGPU_WALKER *>(cs.getSpace(sizeof(GPGPU_WALKER)));
    *walker = FamilyType::cmdInitGpgpuWalker;

    MockKernelWithInternals kernelWithInternals(*pDevice, pContext);
    auto kernel = kernelWithInternals.mockKernel;
    auto configuredEntryCount = 100u;
    kernel->numberOfBindingTableStates = configuredEntryCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t localWorkSizes[3]{256, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    KernelCommandsHelper<FamilyType>::sendIndirectState(
        cs,
        dsh,
        ioh,
        ssh,
        *kernel,
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        walker,
        nullptr,
        true,
        true,
        false);

    auto idd = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    const auto entryCountExpected = KernelCommandsHelper<FamilyType>::doBindingTablePrefetch() ? 31u : 0u;
    EXPECT_EQ(entryCountExpected, idd->getBindingTableEntryCount());
}
// Sends indirect state for a kernel whose KernelInfo carries a reversed
// workgroup walk/dimensions order and compares the start of the IOH against
// local IDs generated by generateLocalIDs() with that same dimensions order.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, whenSendingIndirectStateThenKernelsWalkOrderIsTakenIntoAccount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
    std::unique_ptr<Image> img(Image2dHelper<>::create(pContext));

    // Use the image-to-image built-in only as a source of a real kernel.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = cmdQ.getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                                cmdQ.getContext(), cmdQ.getDevice());
    BuiltinDispatchInfoBuilder::BuiltinOpParams dc;
    dc.srcMemObj = img.get();
    dc.dstMemObj = img.get();
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSizeX = 2;
    const size_t localWorkSizeY = 3;
    const size_t localWorkSizeZ = 4;
    const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};

    auto &cs = cmdQ.getCS(1024);
    auto walker = static_cast<GPGPU_WALKER *>(cs.getSpace(sizeof(GPGPU_WALKER)));
    *walker = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    dsh.align(KernelCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
    size_t interfaceDescriptorOffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    // Clone the patch info of the built-in kernel and reverse both orders (2, 1, 0).
    KernelInfo modifiedKernelInfo = {};
    modifiedKernelInfo.patchInfo = kernel->getKernelInfo().patchInfo;
    const uint8_t reversedOrder[3] = {2, 1, 0};
    for (size_t dim = 0; dim < 3; ++dim) {
        modifiedKernelInfo.workgroupWalkOrder[dim] = reversedOrder[dim];
        modifiedKernelInfo.workgroupDimensionsOrder[dim] = reversedOrder[dim];
    }
    MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false};

    const auto simdSize = modifiedKernelInfo.getMaxSimdSize();

    uint32_t interfaceDescriptorIndex = 0;
    KernelCommandsHelper<FamilyType>::sendIndirectState(
        cs,
        dsh,
        ioh,
        ssh,
        mockKernel,
        simdSize,
        localWorkSizes,
        interfaceDescriptorOffset,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        walker,
        nullptr,
        true,
        true,
        false);

    // Size of the reference buffer: three uint16_t channels per thread, each
    // 32 lanes wide for SIMD32 and 16 lanes wide otherwise.
    size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
    numThreads = (numThreads + simdSize - 1) / simdSize;
    size_t expectedIohSize = ((simdSize == 32) ? 32 : 16) * 3 * numThreads * sizeof(uint16_t);
    ASSERT_LE(expectedIohSize, ioh.getUsed());

    auto referenceLocalIds = alignedMalloc(expectedIohSize, 64);
    generateLocalIDs(referenceLocalIds, simdSize,
                     std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
                     std::array<uint8_t, 3>{{modifiedKernelInfo.workgroupDimensionsOrder[0], modifiedKernelInfo.workgroupDimensionsOrder[1], modifiedKernelInfo.workgroupDimensionsOrder[2]}}, false);
    EXPECT_EQ(0, memcmp(referenceLocalIds, ioh.getCpuBase(), expectedIohSize));
    alignedFree(referenceLocalIds);
}
// Pre-fills the SSH location where the binding table is expected to land with
// a 0xDEADBEEF pattern, sends indirect state, and then checks that the two
// binding table entries were overwritten with 0x0 and 0x40.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointer) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    // The builder is obtained by reference, so it needs no null check.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToImage3d,
                                                                                                        cmdQ.getContext(), cmdQ.getDevice());

    BuiltinDispatchInfoBuilder::BuiltinOpParams dc;
    dc.srcPtr = nullptr;
    dc.dstMemObj = dstImage.get();
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    dc.dstRowPitch = 0;
    dc.dstSlicePitch = 0;
    builder.buildDispatchInfos(multiDispatchInfo, dc);

    // Fatal on purpose: begin() is dereferenced right below, which is only
    // valid when at least one dispatch info was produced.
    ASSERT_NE(0u, multiDispatchInfo.size());
    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSizes[3]{256, 1, 1};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // Obtain where the pointers will be stored
    const auto &kernelInfo = kernel->getKernelInfo();
    auto numSurfaceStates = kernelInfo.patchInfo.statelessGlobalMemObjKernelArgs.size() +
                            kernelInfo.patchInfo.imageMemObjKernelArgs.size();
    EXPECT_EQ(2u, numSurfaceStates);

    // The binding table is expected right after the surface states in the SSH.
    size_t bindingTableStateSize = numSurfaceStates * sizeof(RENDER_SURFACE_STATE);
    uint32_t *bindingTableStatesPointers = reinterpret_cast<uint32_t *>(
        reinterpret_cast<uint8_t *>(ssh.getCpuBase()) + ssh.getUsed() + bindingTableStateSize);
    for (auto i = 0u; i < numSurfaceStates; i++) {
        bindingTableStatesPointers[i] = 0xDEADBEEF;
    }

    // force stateful path for buffers
    // NOTE(review): this mutates the kernel's KernelInfo through a const_cast
    // and the flag is not restored within this test.
    const_cast<KernelInfo &>(kernelInfo).requiresSshForBuffers = true;

    uint32_t interfaceDescriptorIndex = 0;
    KernelCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *kernel,
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
        true,
        true,
        false);

    EXPECT_EQ(0x00000000u, bindingTableStatesPointers[0]);
    EXPECT_EQ(0x00000040u, bindingTableStatesPointers[1]);
}
// Builds a kernel with five stateless surfaces (global, constant, private,
// event pool, default device queue) and a five-entry binding table, then sends
// indirect state twice - with the local SSH starting at offset 0 and at one
// RENDER_SURFACE_STATE - and checks that every binding table entry points at
// localSshOffset + i * sizeof(RENDER_SURFACE_STATE).
//
// The kernel and the aligned surface state heap are held by RAII owners so
// that a fatal assertion (e.g. on initialize()) does not leak them.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, usedBindingTableStatePointersForGlobalAndConstantAndPrivateAndEventPoolAndDefaultCommandQueueSurfaces) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;

    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();

    SPatchExecutionEnvironment tokenEE = {};
    tokenEE.CompiledSIMD8 = false;
    tokenEE.CompiledSIMD16 = false;
    tokenEE.CompiledSIMD32 = true;
    pKernelInfo->patchInfo.executionEnvironment = &tokenEE;

    // define patch offsets for global, constant, private, event pool and default device queue surfaces
    // (all tokens are value-initialized so fields not set below are zero rather than indeterminate)
    SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization AllocateStatelessGlobalMemorySurfaceWithInitialization = {};
    AllocateStatelessGlobalMemorySurfaceWithInitialization.GlobalBufferIndex = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.SurfaceStateHeapOffset = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.DataParamOffset = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization = &AllocateStatelessGlobalMemorySurfaceWithInitialization;

    SPatchAllocateStatelessConstantMemorySurfaceWithInitialization AllocateStatelessConstantMemorySurfaceWithInitialization = {};
    AllocateStatelessConstantMemorySurfaceWithInitialization.ConstantBufferIndex = 0;
    AllocateStatelessConstantMemorySurfaceWithInitialization.SurfaceStateHeapOffset = 64;
    AllocateStatelessConstantMemorySurfaceWithInitialization.DataParamOffset = 8;
    AllocateStatelessConstantMemorySurfaceWithInitialization.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization = &AllocateStatelessConstantMemorySurfaceWithInitialization;

    SPatchAllocateStatelessPrivateSurface AllocateStatelessPrivateMemorySurface = {};
    AllocateStatelessPrivateMemorySurface.PerThreadPrivateMemorySize = 32;
    AllocateStatelessPrivateMemorySurface.SurfaceStateHeapOffset = 128;
    AllocateStatelessPrivateMemorySurface.DataParamOffset = 16;
    AllocateStatelessPrivateMemorySurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessPrivateSurface = &AllocateStatelessPrivateMemorySurface;

    SPatchAllocateStatelessEventPoolSurface AllocateStatelessEventPoolSurface = {};
    AllocateStatelessEventPoolSurface.SurfaceStateHeapOffset = 192;
    AllocateStatelessEventPoolSurface.DataParamOffset = 24;
    AllocateStatelessEventPoolSurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessEventPoolSurface = &AllocateStatelessEventPoolSurface;

    SPatchAllocateStatelessDefaultDeviceQueueSurface AllocateStatelessDefaultDeviceQueueSurface = {};
    AllocateStatelessDefaultDeviceQueueSurface.SurfaceStateHeapOffset = 256;
    AllocateStatelessDefaultDeviceQueueSurface.DataParamOffset = 32;
    AllocateStatelessDefaultDeviceQueueSurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface = &AllocateStatelessDefaultDeviceQueueSurface;

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // setup global memory
    char globalBuffer[16];
    GraphicsAllocation gfxGlobalAlloc(globalBuffer, castToUint64(globalBuffer), 0llu, sizeof(globalBuffer), 1u, false);
    program.setGlobalSurface(&gfxGlobalAlloc);

    // setup constant memory
    char constBuffer[16];
    GraphicsAllocation gfxConstAlloc(constBuffer, castToUint64(constBuffer), 0llu, sizeof(constBuffer), 1u, false);
    program.setConstantSurface(&gfxConstAlloc);

    // create kernel (owned, so an early return from a fatal assertion cannot leak it)
    std::unique_ptr<MockKernel> pKernel(new MockKernel(&program, *pKernelInfo, *pDevice));

    SKernelBinaryHeaderCommon kernelHeader = {};

    // setup surface state heap: numSurfaces surface states followed by numSurfaces binding table entries
    constexpr uint32_t numSurfaces = 5;
    constexpr uint32_t sshSize = numSurfaces * sizeof(RENDER_SURFACE_STATE) + numSurfaces * sizeof(BINDING_TABLE_STATE);
    unsigned char *surfaceStateHeap = reinterpret_cast<unsigned char *>(alignedMalloc(sshSize, sizeof(RENDER_SURFACE_STATE)));
    auto freeAligned = [](unsigned char *ptr) { alignedFree(ptr); };
    std::unique_ptr<unsigned char, decltype(freeAligned)> surfaceStateHeapOwner(surfaceStateHeap, freeAligned);
    ASSERT_NE(nullptr, surfaceStateHeap);

    uint32_t btiOffset = static_cast<uint32_t>(numSurfaces * sizeof(RENDER_SURFACE_STATE));
    auto bti = reinterpret_cast<BINDING_TABLE_STATE *>(surfaceStateHeap + btiOffset);
    for (uint32_t i = 0; i < numSurfaces; ++i) {
        bti[i].setSurfaceStatePointer(i * sizeof(RENDER_SURFACE_STATE));
    }
    kernelHeader.SurfaceStateHeapSize = sshSize;

    // setup kernel heap
    uint32_t kernelIsa[32] = {};
    kernelHeader.KernelHeapSize = sizeof(kernelIsa);

    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeap = kernelIsa;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // setup binding table state
    SPatchBindingTableState bindingTableState = {};
    bindingTableState.Token = iOpenCL::PATCH_TOKEN_BINDING_TABLE_STATE;
    bindingTableState.Size = sizeof(SPatchBindingTableState);
    bindingTableState.Count = 5;
    bindingTableState.Offset = btiOffset;
    bindingTableState.SurfaceStateOffset = 0;
    pKernelInfo->patchInfo.bindingTableState = &bindingTableState;

    // setup thread payload
    SPatchThreadPayload threadPayload = {};
    threadPayload.LocalIDXPresent = 1;
    threadPayload.LocalIDYPresent = 1;
    threadPayload.LocalIDZPresent = 1;
    pKernelInfo->patchInfo.threadPayload = &threadPayload;

    // define stateful path
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = true;

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    // setup cross thread data
    char pCrossThreadData[64] = {};
    pKernel->setCrossThreadData(pCrossThreadData, sizeof(pCrossThreadData));

    // try with different offsets to surface state base address
    for (uint32_t ssbaOffset : {0U, static_cast<uint32_t>(sizeof(RENDER_SURFACE_STATE))}) {
        CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);

        auto &commandStream = cmdQ.getCS(1024);
        auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
        *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

        auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
        auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
        auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

        EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());

        const size_t localWorkSizes[3]{256, 1, 1};

        dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

        ssh.getSpace(ssbaOffset); // offset local ssh from surface state base address

        uint32_t localSshOffset = static_cast<uint32_t>(ssh.getUsed());

        // push surfaces states and binding table to given ssh heap
        uint32_t interfaceDescriptorIndex = 0;
        KernelCommandsHelper<FamilyType>::sendIndirectState(
            commandStream,
            dsh,
            ioh,
            ssh,
            *pKernel,
            pKernel->getKernelInfo().getMaxSimdSize(),
            localWorkSizes,
            0,
            interfaceDescriptorIndex,
            pDevice->getPreemptionMode(),
            pWalkerCmd,
            nullptr,
            true,
            true,
            false);

        // Inspect the binding table as it was written into the queue's SSH.
        bti = reinterpret_cast<BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
        for (uint32_t i = 0; i < numSurfaces; ++i) {
            uint32_t expected = localSshOffset + i * sizeof(RENDER_SURFACE_STATE);
            EXPECT_EQ(expected, bti[i].getSurfaceStatePointer());
        }

        program.setGlobalSurface(nullptr);
        program.setConstantSurface(nullptr);

        //exhaust space to trigger reload
        ssh.getSpace(ssh.getAvailableSpace());
        dsh.getSpace(dsh.getAvailableSpace());
    }

    // Release explicitly in the same order as before: heap first, then kernel.
    surfaceStateHeapOwner.reset();
    pKernel.reset();
}
// Checks that pushBindingTableAndSurfaceStates() consumes no surface state heap space and
// returns a zero binding table pointer for a kernel that reports no binding table states
// while requiresSshForBuffers is false.
HWTEST_F(KernelCommandsTest, setBindingTableStatesForKernelWithBuffersNotRequiringSSHDoesNotTouchSSH) {
    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // create kernel; owned by a unique_ptr so it is released even when the fatal
    // ASSERT_EQ below returns from the test body early
    auto pKernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pDevice);

    // setup surface state heap; value-initialized so no indeterminate bytes are exposed
    // through the pointers stored in the kernel info
    char surfaceStateHeap[256] = {};
    SKernelBinaryHeaderCommon kernelHeader = {};
    kernelHeader.SurfaceStateHeapSize = sizeof(surfaceStateHeap);
    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // define stateful path, but with buffers that do not require SSH
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = false;

    // register a single stateless global memory argument
    SPatchStatelessGlobalMemoryObjectKernelArgument statelessGlobalMemory = {};
    statelessGlobalMemory.ArgumentNumber = 0;
    statelessGlobalMemory.DataParamOffset = 0;
    statelessGlobalMemory.DataParamSize = 0;
    statelessGlobalMemory.Size = 0;
    statelessGlobalMemory.SurfaceStateHeapOffset = 0;
    pKernelInfo->patchInfo.statelessGlobalMemObjKernelArgs.push_back(&statelessGlobalMemory);

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // align first so that the alignment check at the end cannot be satisfied by padding
    ssh.align(8);
    auto usedBefore = ssh.getUsed();

    // the kernel must not expose any binding table states
    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);

    // set binding table states
    auto dstBindingTablePointer = KernelCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    // heap usage must be unchanged, both directly and after re-aligning
    auto usedAfter = ssh.getUsed();
    EXPECT_EQ(usedBefore, usedAfter);
    ssh.align(8);
    EXPECT_EQ(usedAfter, ssh.getUsed());
}
// Checks that pushBindingTableAndSurfaceStates() returns a zero binding table pointer when
// there are no surfaces: with no binding table patch token at all (KernelInfo and Kernel
// overloads) and with a binding table patch token whose Count is zero.
HWTEST_F(KernelCommandsTest, setBindingTableStatesForNoSurfaces) {
    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // create kernel; owned by a unique_ptr so it is released even when the fatal
    // ASSERT_EQ below returns from the test body early
    auto pKernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pDevice);

    // setup surface state heap; value-initialized so no indeterminate bytes are exposed
    // through the pointers stored in the kernel info
    char surfaceStateHeap[256] = {};
    SKernelBinaryHeaderCommon kernelHeader = {};
    kernelHeader.SurfaceStateHeapSize = sizeof(surfaceStateHeap);
    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // define stateful path
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = true;

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // the kernel must not expose any binding table states
    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);

    // no binding table patch token: KernelInfo overload
    auto dstBindingTablePointer = KernelCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernelInfo);
    EXPECT_EQ(0u, dstBindingTablePointer);

    // no binding table patch token: Kernel overload
    dstBindingTablePointer = KernelCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    // binding table patch token present, but describing zero entries
    SPatchBindingTableState bindingTableState = {};
    bindingTableState.Token = iOpenCL::PATCH_TOKEN_BINDING_TABLE_STATE;
    bindingTableState.Size = sizeof(SPatchBindingTableState);
    bindingTableState.Count = 0;
    bindingTableState.Offset = 64;
    bindingTableState.SurfaceStateOffset = 0;
    pKernelInfo->patchInfo.bindingTableState = &bindingTableState;

    dstBindingTablePointer = KernelCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    // detach the stack-allocated patch token before the kernel info outlives it
    pKernelInfo->patchInfo.bindingTableState = nullptr;
}
// Checks the value computeSlmValues() produces for a range of SLM sizes. The expected
// mapping differs between IGFX_GEN8_CORE and every other render core family.
HWTEST_F(KernelCommandsTest, slmValueScenarios) {
    // One expectation: an SLM size in bytes and the value computeSlmValues() must return for it.
    struct SlmScenario {
        uint32_t slmSize;
        uint32_t expectedValue;
    };

    if (::renderCoreFamily == IGFX_GEN8_CORE) {
        const SlmScenario gen8Scenarios[] = {
            {0, 0u},
            {1, 1u},
            {1024, 1u},
            {1025, 1u},
            {2048, 1u},
            {2049, 1u},
            {4096, 1u},
            {4097, 2u},
            {8192, 2u},
            {8193, 4u},
            {12288, 4u},
            {16384, 4u},
            {16385, 8u},
            {24576, 8u},
            {32768, 8u},
            {32769, 16u},
            {49152, 16u},
            {65535, 16u},
            {65536, 16u},
        };
        for (const auto &scenario : gen8Scenarios) {
            EXPECT_EQ(scenario.expectedValue, KernelCommandsHelper<FamilyType>::computeSlmValues(scenario.slmSize))
                << "slmSize: " << scenario.slmSize;
        }
    } else {
        const SlmScenario defaultScenarios[] = {
            {0, 0u},
            {1, 1u},
            {1024, 1u},
            {1025, 2u},
            {2048, 2u},
            {2049, 3u},
            {4096, 3u},
            {4097, 4u},
            {8192, 4u},
            {8193, 5u},
            {16384, 5u},
            {16385, 6u},
            {32768, 6u},
            {32769, 7u},
            {65536, 7u},
        };
        for (const auto &scenario : defaultScenarios) {
            EXPECT_EQ(scenario.expectedValue, KernelCommandsHelper<FamilyType>::computeSlmValues(scenario.slmSize))
                << "slmSize: " << scenario.slmSize;
        }
    }
}
// Builds a kernel whose dynamic state heap holds a border color followed by two sampler
// states, calls sendIndirectState(), and checks that:
//  - the border color bytes land in the DSH at the next 64-byte aligned address,
//  - the sampler states placed right after it keep every programmed field,
//  - each copied sampler state's indirect state pointer equals the border color offset.
HWCMDTEST_F(IGFX_GEN8_CORE, KernelCommandsTest, GivenKernelWithSamplersWhenIndirectStateIsProgrammedThenBorderColorIsCorrectlyCopiedToDshAndSamplerStatesAreProgrammedWithPointer) {
    using SAMPLER_STATE = typename FamilyType::SAMPLER_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    MockKernelWithInternals kernelInternals(*pDevice);
    const size_t localWorkSizes[3]{1, 1, 1};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // kernel DSH layout: [border color | sampler state 0 | sampler state 1]
    const uint32_t borderColorSize = 64;
    const uint32_t samplerStateSize = sizeof(SAMPLER_STATE) * 2;

    SPatchSamplerStateArray samplerStateArray;
    samplerStateArray.BorderColorOffset = 0x0;
    samplerStateArray.Count = 2;
    samplerStateArray.Offset = borderColorSize;
    samplerStateArray.Size = samplerStateSize;
    samplerStateArray.Token = 1;

    // RAII-owned source heap (value-initialized), filled with distinct byte patterns for the
    // border color and for the region holding the sampler states
    auto mockDsh = std::make_unique<char[]>((borderColorSize + samplerStateSize) * 4);
    memset(mockDsh.get(), 6, borderColorSize);
    memset(mockDsh.get() + borderColorSize, 8, borderColorSize);
    kernelInternals.kernelInfo.heapInfo.pDsh = mockDsh.get();
    kernelInternals.kernelInfo.patchInfo.samplerStateArray = &samplerStateArray;

    // reserve the interface descriptor, then deliberately misalign the heap by 4 bytes so
    // the 64-byte alignment of the border color is actually exercised
    uint64_t interfaceDescriptorTableOffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
    dsh.getSpace(4);

    char *initialDshPointer = static_cast<char *>(dsh.getCpuBase()) + dsh.getUsed();
    char *borderColorPointer = alignUp(initialDshPointer, 64);
    uint32_t borderColorOffset = static_cast<uint32_t>(borderColorPointer - static_cast<char *>(dsh.getCpuBase()));

    // source sampler states start with a zero indirect state pointer
    SAMPLER_STATE *pSamplerState = reinterpret_cast<SAMPLER_STATE *>(mockDsh.get() + borderColorSize);
    for (uint32_t i = 0; i < 2; i++) {
        pSamplerState[i].setIndirectStatePointer(0);
    }

    auto kernel = std::make_unique<MockKernel>(kernelInternals.mockProgram, kernelInternals.kernelInfo, *pDevice);
    kernel->setCrossThreadData(kernelInternals.crossThreadData, sizeof(kernelInternals.crossThreadData));
    kernel->setSshLocal(kernelInternals.sshLocal, sizeof(kernelInternals.sshLocal));

    uint32_t interfaceDescriptorIndex = 0;
    KernelCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *kernel,
        8,
        localWorkSizes,
        interfaceDescriptorTableOffset,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
        true,
        true,
        false);

    // border color must be copied verbatim to the aligned location
    bool isMemorySame = memcmp(borderColorPointer, mockDsh.get(), borderColorSize) == 0;
    EXPECT_TRUE(isMemorySame);

    // sampler states follow the border color; every field must survive the copy
    SAMPLER_STATE *pSamplerStatesCopied = reinterpret_cast<SAMPLER_STATE *>(borderColorPointer + borderColorSize);
    for (uint32_t i = 0; i < 2; i++) {
        EXPECT_EQ(pSamplerState[i].getNonNormalizedCoordinateEnable(), pSamplerStatesCopied[i].getNonNormalizedCoordinateEnable());
        EXPECT_EQ(pSamplerState[i].getTcxAddressControlMode(), pSamplerStatesCopied[i].getTcxAddressControlMode());
        EXPECT_EQ(pSamplerState[i].getTcyAddressControlMode(), pSamplerStatesCopied[i].getTcyAddressControlMode());
        EXPECT_EQ(pSamplerState[i].getTczAddressControlMode(), pSamplerStatesCopied[i].getTczAddressControlMode());
        EXPECT_EQ(pSamplerState[i].getMinModeFilter(), pSamplerStatesCopied[i].getMinModeFilter());
        EXPECT_EQ(pSamplerState[i].getMagModeFilter(), pSamplerStatesCopied[i].getMagModeFilter());
        EXPECT_EQ(pSamplerState[i].getMipModeFilter(), pSamplerStatesCopied[i].getMipModeFilter());
        EXPECT_EQ(pSamplerState[i].getUAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getUAddressMinFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getUAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getUAddressMagFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getVAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getVAddressMinFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getVAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getVAddressMagFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getRAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getRAddressMagFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getRAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getRAddressMinFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getLodAlgorithm(), pSamplerStatesCopied[i].getLodAlgorithm());
        EXPECT_EQ(pSamplerState[i].getTextureLodBias(), pSamplerStatesCopied[i].getTextureLodBias());
        EXPECT_EQ(pSamplerState[i].getLodPreclampMode(), pSamplerStatesCopied[i].getLodPreclampMode());
        EXPECT_EQ(pSamplerState[i].getTextureBorderColorMode(), pSamplerStatesCopied[i].getTextureBorderColorMode());
        EXPECT_EQ(pSamplerState[i].getSamplerDisable(), pSamplerStatesCopied[i].getSamplerDisable());
        EXPECT_EQ(pSamplerState[i].getCubeSurfaceControlMode(), pSamplerStatesCopied[i].getCubeSurfaceControlMode());
        EXPECT_EQ(pSamplerState[i].getShadowFunction(), pSamplerStatesCopied[i].getShadowFunction());
        EXPECT_EQ(pSamplerState[i].getChromakeyMode(), pSamplerStatesCopied[i].getChromakeyMode());
        EXPECT_EQ(pSamplerState[i].getChromakeyIndex(), pSamplerStatesCopied[i].getChromakeyIndex());
        EXPECT_EQ(pSamplerState[i].getChromakeyEnable(), pSamplerStatesCopied[i].getChromakeyEnable());
        EXPECT_EQ(pSamplerState[i].getMaxLod(), pSamplerStatesCopied[i].getMaxLod());
        EXPECT_EQ(pSamplerState[i].getMinLod(), pSamplerStatesCopied[i].getMinLod());
        EXPECT_EQ(pSamplerState[i].getLodClampMagnificationMode(), pSamplerStatesCopied[i].getLodClampMagnificationMode());

        // the copied sampler state must point at the border color inside the DSH
        EXPECT_EQ(borderColorOffset, pSamplerStatesCopied[i].getIndirectStatePointer());
    }
    // kernel and mockDsh are released automatically, in that order (reverse of declaration)
}
// Fixture-less test suite for KernelCommandsHelper command programming helpers.
using KernelCommandsHelperTests = ::testing::Test;

// programMiSemaphoreWait() must emit exactly one MI_SEMAPHORE_WAIT that is byte-identical to
// a reference command configured with the same address and data, the SAD_NOT_EQUAL_SDD
// compare operation and polling wait mode.
HWTEST_F(KernelCommandsHelperTests, givenCompareAddressAndDataWhenProgrammingSemaphoreWaitThenSetupAllFields) {
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

    const uint64_t semaphoreAddress = 0x10000;
    const uint32_t semaphoreData = 1234;

    // build the expected command
    MI_SEMAPHORE_WAIT expectedCmd = FamilyType::cmdInitMiSemaphoreWait;
    expectedCmd.setCompareOperation(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
    expectedCmd.setSemaphoreDataDword(semaphoreData);
    expectedCmd.setSemaphoreGraphicsAddress(semaphoreAddress);
    expectedCmd.setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE);

    // program the command into a zeroed stream
    uint8_t streamStorage[1024] = {};
    LinearStream stream(streamStorage, sizeof(streamStorage));
    KernelCommandsHelper<FamilyType>::programMiSemaphoreWait(stream, semaphoreAddress, semaphoreData);

    EXPECT_EQ(sizeof(MI_SEMAPHORE_WAIT), stream.getUsed());
    EXPECT_EQ(0, memcmp(&expectedCmd, streamStorage, sizeof(MI_SEMAPHORE_WAIT)));
}
// programMiAtomic() must place exactly one MI_ATOMIC at the start of the stream, return a
// pointer to it, and fill it identically to a reference command built from the same
// opcode, data size and split (low/high) memory address.
HWTEST_F(KernelCommandsHelperTests, whenProgrammingMiAtomicThenSetupAllFields) {
    using MI_ATOMIC = typename FamilyType::MI_ATOMIC;

    const uint64_t targetAddress = 0x10000;
    const auto atomicOpcode = MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_DECREMENT;
    const auto atomicDataSize = MI_ATOMIC::DATA_SIZE::DATA_SIZE_DWORD;

    // build the expected command; the address is split into its low and high dwords
    const auto addressLow = static_cast<uint32_t>(targetAddress & 0xFFFFFFFFULL);
    const auto addressHigh = static_cast<uint32_t>(targetAddress >> 32);
    MI_ATOMIC expectedCmd = MI_ATOMIC::sInit();
    expectedCmd.setAtomicOpcode(atomicOpcode);
    expectedCmd.setDataSize(atomicDataSize);
    expectedCmd.setMemoryAddress(addressLow);
    expectedCmd.setMemoryAddressHigh(addressHigh);

    // program the command into a zeroed stream
    uint8_t streamStorage[1024] = {};
    LinearStream stream(streamStorage, sizeof(streamStorage));
    auto programmedCmd = KernelCommandsHelper<FamilyType>::programMiAtomic(stream, targetAddress, atomicOpcode, atomicDataSize);

    EXPECT_EQ(sizeof(MI_ATOMIC), stream.getUsed());
    EXPECT_EQ(programmedCmd, stream.getCpuBase());
    EXPECT_EQ(0, memcmp(&expectedCmd, programmedCmd, sizeof(MI_ATOMIC)));
}
// Parameterized suite running on top of ExecutionModelKernelFixture.
using ParentKernelCommandsFromBinaryTest = ExecutionModelKernelFixture;

// Recomputes the SSH size expected for the execution model of a parent kernel and compares
// it with getSizeRequiredForExecutionModel<SURFACE_STATE>(). The expected size consists of:
//  - worst-case padding for the initial alignment,
//  - every block kernel's surface state heap, each aligned up,
//  - the largest block binding table, once per interface descriptor entry,
//  - the scheduler kernel's surface state heap plus one alignment unit when non-empty.
HWTEST_P(ParentKernelCommandsFromBinaryTest, getSizeRequiredForExecutionModelForSurfaceStatesReturnsSizeOfBlocksPlusMaxBindingTableSizeForAllIDTEntriesAndSchedulerSSHSize) {
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;

    // Only exercised when the device reports an OpenCL 2.x version string.
    const std::string clVersion(pPlatform->getDevice(0)->getDeviceInfo().clVersion);
    if (clVersion.find("OpenCL 2.") == std::string::npos) {
        return;
    }

    EXPECT_TRUE(pKernel->isParentKernel);

    BlockKernelManager *blockKernels = pKernel->getProgram()->getBlockKernelManager();
    const uint32_t numBlocks = static_cast<uint32_t>(blockKernels->getCount());

    // for initial alignment
    size_t expectedSize = BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE - 1;

    uint32_t largestBindingTableCount = 0;
    for (uint32_t blockIndex = 0; blockIndex < numBlocks; ++blockIndex) {
        const KernelInfo *blockInfo = blockKernels->getBlockKernelInfo(blockIndex);

        expectedSize += blockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize;
        expectedSize = alignUp(expectedSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

        // a block without a binding table patch token contributes zero entries
        const auto *blockBindingTable = blockInfo->patchInfo.bindingTableState;
        const uint32_t blockBindingTableCount = blockBindingTable ? blockBindingTable->Count : 0;
        largestBindingTableCount = std::max(largestBindingTableCount, blockBindingTableCount);
    }

    expectedSize += largestBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries;

    BuiltIns &builtIns = *pDevice->getExecutionEnvironment()->getBuiltIns();
    auto &schedulerKernel = builtIns.getSchedulerKernel(*pContext);
    const auto schedulerSshSize = schedulerKernel.getSurfaceStateHeapSize();

    expectedSize += schedulerSshSize + ((schedulerSshSize != 0) ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0);
    expectedSize = alignUp(expectedSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

    EXPECT_EQ(expectedSize, KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*pKernel));
}
// For a parent kernel, the indirect object heap size required by the execution model must
// equal the IOH size required by the scheduler kernel alone.
HWTEST_P(ParentKernelCommandsFromBinaryTest, getSizeRequiredForExecutionModelForIOHReturnsSchedulerSize) {
    // Only exercised when the device reports an OpenCL 2.x version string.
    if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) {
        EXPECT_TRUE(pKernel->isParentKernel);

        BuiltIns &builtIns = *pDevice->getExecutionEnvironment()->getBuiltIns();
        auto &scheduler = builtIns.getSchedulerKernel(*pContext);
        size_t totalSize = KernelCommandsHelper<FamilyType>::getSizeRequiredIOH(scheduler);

        EXPECT_EQ(totalSize, KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::INDIRECT_OBJECT>(*pKernel));
    }
}
// For a parent kernel, the execution model must not require any general state heap space.
HWTEST_P(ParentKernelCommandsFromBinaryTest, getSizeRequiredForExecutionModelForGSH) {
    // Only exercised when the device reports an OpenCL 2.x version string.
    if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) {
        EXPECT_TRUE(pKernel->isParentKernel);

        size_t totalSize = 0;
        EXPECT_EQ(totalSize, KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::GENERAL_STATE>(*pKernel));
    }
}
// Parameters for the ParentKernelCommandsFromBinaryTest suite.
// NOTE(review): presumably the program binary name and the kernel names the fixture loads
// from it - confirm against ExecutionModelKernelFixture.
static const char *binaryFile = "simple_block_kernel";
static const char *KernelNames[] = {"kernel_reflection", "simple_block_kernel"};
// Instantiates every ParentKernelCommandsFromBinaryTest case once per (binaryFile, kernel name)
// pair produced by ::testing::Combine, i.e. once for each entry of KernelNames.
INSTANTIATE_TEST_CASE_P(ParentKernelCommandsFromBinaryTest,
ParentKernelCommandsFromBinaryTest,
::testing::Combine(
::testing::Values(binaryFile),
::testing::ValuesIn(KernelNames)));
// With the EnablePassInlineData debug flag set and the kernel's thread payload requesting
// inline data, inlineDataProgrammingRequired() must return true.
HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineThenReturnTrue) {
    // restores debug flags when the test ends
    DebugManagerStateRestore restore;
    DebugManager.flags.EnablePassInlineData.set(true);

    // value-initialized so no indeterminate data is handed to the kernel
    uint32_t crossThreadData[8] = {};

    MockKernelWithInternals mockKernelWithInternal(*pDevice);
    auto *threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload);
    threadPayload->PassInlineData = 1;
    mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));

    EXPECT_TRUE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
}
// With the EnablePassInlineData debug flag set but the kernel's thread payload not
// requesting inline data, inlineDataProgrammingRequired() must return false.
HWTEST_F(KernelCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInlineThenReturnFalse) {
    // restores debug flags when the test ends
    DebugManagerStateRestore restore;
    DebugManager.flags.EnablePassInlineData.set(true);

    // value-initialized so no indeterminate data is handed to the kernel
    uint32_t crossThreadData[8] = {};

    MockKernelWithInternals mockKernelWithInternal(*pDevice);
    auto *threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal.kernelInfo.patchInfo.threadPayload);
    threadPayload->PassInlineData = 0;
    mockKernelWithInternal.mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData));

    EXPECT_FALSE(KernelCommandsHelper<FamilyType>::inlineDataProgrammingRequired(*mockKernelWithInternal.mockKernel));
}
// kernelUsesLocalIds() must return true when only the X local id is flagged as present.
HWTEST_F(KernelCommandsTest, whenLocalIdxInXDimPresentThenExpectLocalIdsInUseIsTrue) {
    MockKernelWithInternals kernelWithInternals(*pDevice);

    // the payload is exposed through a pointer-to-const; cast once to configure it
    auto *payload = const_cast<SPatchThreadPayload *>(kernelWithInternals.kernelInfo.patchInfo.threadPayload);
    payload->LocalIDXPresent = 1;
    payload->LocalIDYPresent = 0;
    payload->LocalIDZPresent = 0;

    const bool localIdsInUse = KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernelWithInternals.mockKernel);
    EXPECT_TRUE(localIdsInUse);
}
// kernelUsesLocalIds() must return true when only the Y local id is flagged as present.
HWTEST_F(KernelCommandsTest, whenLocalIdxInYDimPresentThenExpectLocalIdsInUseIsTrue) {
    MockKernelWithInternals kernelWithInternals(*pDevice);

    // the payload is exposed through a pointer-to-const; cast once to configure it
    auto *payload = const_cast<SPatchThreadPayload *>(kernelWithInternals.kernelInfo.patchInfo.threadPayload);
    payload->LocalIDXPresent = 0;
    payload->LocalIDYPresent = 1;
    payload->LocalIDZPresent = 0;

    const bool localIdsInUse = KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernelWithInternals.mockKernel);
    EXPECT_TRUE(localIdsInUse);
}
// kernelUsesLocalIds() must return true when only the Z local id is flagged as present.
HWTEST_F(KernelCommandsTest, whenLocalIdxInZDimPresentThenExpectLocalIdsInUseIsTrue) {
    MockKernelWithInternals kernelWithInternals(*pDevice);

    // the payload is exposed through a pointer-to-const; cast once to configure it
    auto *payload = const_cast<SPatchThreadPayload *>(kernelWithInternals.kernelInfo.patchInfo.threadPayload);
    payload->LocalIDXPresent = 0;
    payload->LocalIDYPresent = 0;
    payload->LocalIDZPresent = 1;

    const bool localIdsInUse = KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernelWithInternals.mockKernel);
    EXPECT_TRUE(localIdsInUse);
}
// kernelUsesLocalIds() must return false when no local id dimension is flagged as present.
HWTEST_F(KernelCommandsTest, whenLocalIdxAreNotPresentThenExpectLocalIdsInUseIsFalse) {
    MockKernelWithInternals kernelWithInternals(*pDevice);

    // the payload is exposed through a pointer-to-const; cast once to configure it
    auto *payload = const_cast<SPatchThreadPayload *>(kernelWithInternals.kernelInfo.patchInfo.threadPayload);
    payload->LocalIDXPresent = 0;
    payload->LocalIDYPresent = 0;
    payload->LocalIDZPresent = 0;

    const bool localIdsInUse = KernelCommandsHelper<FamilyType>::kernelUsesLocalIds(*kernelWithInternals.mockKernel);
    EXPECT_FALSE(localIdsInUse);
}