compute-runtime/unit_tests/helpers/hardware_commands_helper_te...

1443 lines
69 KiB
C++

/*
* Copyright (C) 2017-2019 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "unit_tests/helpers/hardware_commands_helper_tests.h"
#include "core/helpers/basic_math.h"
#include "core/helpers/engine_node_helper.h"
#include "core/memory_manager/unified_memory_manager.h"
#include "core/unit_tests/helpers/debug_manager_state_restore.h"
#include "core/unit_tests/utilities/base_object_utils.h"
#include "runtime/api/api.h"
#include "runtime/built_ins/builtins_dispatch_builder.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/helpers/hardware_commands_helper.h"
#include "unit_tests/fixtures/execution_model_kernel_fixture.h"
#include "unit_tests/fixtures/hello_world_fixture.h"
#include "unit_tests/fixtures/image_fixture.h"
#include "unit_tests/helpers/hw_parse.h"
#include "unit_tests/mocks/mock_graphics_allocation.h"
using namespace NEO;
void HardwareCommandsTest::SetUp() {
DeviceFixture::SetUp();
ASSERT_NE(nullptr, pDevice);
cl_device_id device = pDevice;
ContextFixture::SetUp(1, &device);
ASSERT_NE(nullptr, pContext);
BuiltInFixture::SetUp(pDevice);
ASSERT_NE(nullptr, pBuiltIns);
mockKernelWithInternal = std::make_unique<MockKernelWithInternals>(*pDevice, pContext);
}
// Drops the mock kernel first, then tears the built-ins, context and device
// fixtures down.
void HardwareCommandsTest::TearDown() {
    mockKernelWithInternal.reset();

    BuiltInFixture::TearDown();
    ContextFixture::TearDown();
    DeviceFixture::TearDown();
}
// Configures the fixture's mock kernel with exactly one kernel argument that
// has a single patch entry of sizeof(uintptr_t) bytes at cross-thread offset 0.
void HardwareCommandsTest::addSpaceForSingleKernelArg() {
    kernelArguments.resize(1);
    kernelArguments[0] = kernelArgInfo;

    auto &mockKernelInfo = mockKernelWithInternal->kernelInfo;
    mockKernelInfo.resizeKernelArgInfoAndRegisterParameter(1);
    mockKernelInfo.kernelArgInfo.resize(1);

    // References are taken only after the resizes above so they stay valid.
    auto &patchInfoVector = mockKernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector;
    patchInfoVector.resize(1);
    auto &patchEntry = patchInfoVector[0];
    patchEntry.crossthreadOffset = 0;
    patchEntry.size = sizeof(uintptr_t);

    auto &kernelUnderTest = *mockKernelWithInternal->mockKernel;
    kernelUnderTest.setKernelArguments(kernelArguments);
    kernelUnderTest.kernelArgRequiresCacheFlush.resize(1);
}
// Reserves room for one INTERFACE_DESCRIPTOR_DATA in the dynamic state heap,
// programs it through sendInterfaceDescriptorData and checks that heap usage
// grew by exactly sizeof(INTERFACE_DESCRIPTOR_DATA).
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, programInterfaceDescriptorDataResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    // Use the image-to-image copy built-in as a source of a real kernel.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                     cmdQ.getContext(), cmdQ.getDevice());
    ASSERT_NE(nullptr, &builder);

    BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);

    // Fatal on purpose: begin() is dereferenced right below, which would be
    // invalid on an empty dispatch list.
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto usedIndirectHeapBefore = indirectHeap.getUsed();
    indirectHeap.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    size_t crossThreadDataSize = kernel->getCrossThreadDataSize();
    HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
        indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr);

    auto usedIndirectHeapAfter = indirectHeap.getUsed();
    EXPECT_EQ(sizeof(INTERFACE_DESCRIPTOR_DATA), usedIndirectHeapAfter - usedIndirectHeapBefore);
}
// Checks that sendMediaInterfaceDescriptorLoad adds exactly one
// MEDIA_INTERFACE_DESCRIPTOR_LOAD plus one MEDIA_STATE_FLUSH worth of bytes to
// the command stream.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, programMediaInterfaceDescriptorLoadResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);

    const auto sizeBefore = cs.getUsed();
    HardwareCommandsHelper<FamilyType>::sendMediaInterfaceDescriptorLoad(cs, 0, sizeof(INTERFACE_DESCRIPTOR_DATA));
    const auto sizeAfter = cs.getUsed();

    EXPECT_EQ(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + sizeof(MEDIA_STATE_FLUSH), sizeAfter - sizeBefore);
}
// Checks that sendMediaStateFlush adds exactly sizeof(MEDIA_STATE_FLUSH) bytes
// to the command stream.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, programMediaStateFlushResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);

    const auto sizeBefore = cs.getUsed();
    HardwareCommandsHelper<FamilyType>::sendMediaStateFlush(cs, sizeof(INTERFACE_DESCRIPTOR_DATA));
    const auto sizeAfter = cs.getUsed();

    EXPECT_EQ(sizeof(MEDIA_STATE_FLUSH), sizeAfter - sizeBefore);
}
// Checks that sendCrossThreadData consumes exactly the kernel's cross-thread
// data size from the heap it is given.
HWTEST_F(HardwareCommandsTest, sendCrossThreadDataResourceUsage) {
    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    // Use the image-to-image copy built-in as a source of a real kernel.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                     cmdQ.getContext(), cmdQ.getDevice());
    ASSERT_NE(nullptr, &builder);

    BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);

    // Fatal on purpose: begin() is dereferenced right below, which would be
    // invalid on an empty dispatch list.
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto usedBefore = indirectHeap.getUsed();
    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();

    HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap,
        *kernel,
        false,
        nullptr,
        sizeCrossThreadData);

    auto usedAfter = indirectHeap.getUsed();
    EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
}
// Without the AddPatchInfoCommentsForAUBDump flag being set by this test,
// sendCrossThreadData must leave the kernel's single patch-info entry exactly
// as it was registered.
HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsNotSetThenAddPatchInfoDataOffsetsAreNotMoved) {
    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);
    auto kernelInfo = std::make_unique<KernelInfo>();
    auto kernel = std::make_unique<MockKernel>(&program, *kernelInfo, *pDevice);

    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);

    PatchInfoData registeredEntry = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
    kernel->getPatchInfoDataList().push_back(registeredEntry);

    const auto crossThreadDataSize = kernel->getCrossThreadDataSize();
    HardwareCommandsHelper<FamilyType>::sendCrossThreadData(ioh, *kernel, false, nullptr, crossThreadDataSize);

    ASSERT_EQ(1u, kernel->getPatchInfoDataList().size());

    // Every field of the entry must still carry the value registered above.
    const auto &entry = kernel->getPatchInfoDataList()[0];
    EXPECT_EQ(0xaaaaaaaa, entry.sourceAllocation);
    EXPECT_EQ(0u, entry.sourceAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::KernelArg, entry.sourceType);
    EXPECT_EQ(0xbbbbbbbb, entry.targetAllocation);
    EXPECT_EQ(0u, entry.targetAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::IndirectObjectHeap, entry.targetType);
}
// For a heap constructed with the second IndirectHeap argument set to false,
// sendCrossThreadData is expected to return offset 0.
HWTEST_F(HardwareCommandsTest, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {
    auto heapAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{MemoryConstants::pageSize});
    IndirectHeap heap(heapAllocation, false);

    auto &kernel = *mockKernelWithInternal->mockKernel;
    const auto crossThreadDataSize = kernel.getCrossThreadDataSize();

    auto returnedOffset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(heap, kernel, false, nullptr, crossThreadDataSize);
    EXPECT_EQ(0u, returnedOffset);

    // The allocation was obtained manually above, so release it manually.
    pDevice->getMemoryManager()->freeGraphicsMemory(heapAllocation);
}
// For a heap backed by an INTERNAL_HEAP allocation and constructed with the
// second IndirectHeap argument set to true, sendCrossThreadData is expected to
// return the allocation's getGpuAddressToPatch() value.
HWTEST_F(HardwareCommandsTest, givenIndirectHeapAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenHeapBaseOffsetIsReturned) {
    auto heapAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties(true, MemoryConstants::pageSize, GraphicsAllocation::AllocationType::INTERNAL_HEAP));
    IndirectHeap heap(heapAllocation, true);
    auto offsetToPatch = heapAllocation->getGpuAddressToPatch();

    auto &kernel = *mockKernelWithInternal->mockKernel;
    const auto crossThreadDataSize = kernel.getCrossThreadDataSize();

    auto returnedOffset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(heap, kernel, false, nullptr, crossThreadDataSize);
    EXPECT_EQ(offsetToPatch, returnedOffset);

    // The allocation was obtained manually above, so release it manually.
    pDevice->getMemoryManager()->freeGraphicsMemory(heapAllocation);
}
// With AddPatchInfoCommentsForAUBDump enabled, sendCrossThreadData is expected
// to retarget the first patch-info entry (whose target type is
// IndirectObjectHeap) at the heap allocation and the returned offset.
HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsSetThenAddPatchInfoDataOffsetsAreMoved) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.AddPatchInfoCommentsForAUBDump.set(true);

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);
    auto kernelInfo = std::make_unique<KernelInfo>();
    auto kernel = std::make_unique<MockKernel>(&program, *kernelInfo, *pDevice);

    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    // Consume 128 bytes up front so the cross-thread data lands at a non-zero offset.
    ioh.getSpace(128u);

    PatchInfoData heapTargetedEntry = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
    PatchInfoData heapSourcedEntry = {0xcccccccc, 0, PatchInfoAllocationType::IndirectObjectHeap, 0xdddddddd, 0, PatchInfoAllocationType::Default};
    kernel->getPatchInfoDataList().push_back(heapTargetedEntry);
    kernel->getPatchInfoDataList().push_back(heapSourcedEntry);

    const auto crossThreadDataSize = kernel->getCrossThreadDataSize();
    auto crossThreadDataOffset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(ioh, *kernel, false, nullptr, crossThreadDataSize);

    ASSERT_NE(0u, crossThreadDataOffset);
    EXPECT_EQ(128u, crossThreadDataOffset);
    ASSERT_EQ(2u, kernel->getPatchInfoDataList().size());

    // Source side is untouched; target side now points into the heap.
    const auto &firstEntry = kernel->getPatchInfoDataList()[0];
    EXPECT_EQ(0xaaaaaaaa, firstEntry.sourceAllocation);
    EXPECT_EQ(0u, firstEntry.sourceAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::KernelArg, firstEntry.sourceType);
    EXPECT_NE(0xbbbbbbbb, firstEntry.targetAllocation);
    EXPECT_EQ(ioh.getGraphicsAllocation()->getGpuAddress(), firstEntry.targetAllocation);
    EXPECT_NE(0u, firstEntry.targetAllocationOffset);
    EXPECT_EQ(crossThreadDataOffset, firstEntry.targetAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::IndirectObjectHeap, firstEntry.targetType);
}
// Emits indirect state for a built-in copy kernel and checks that the
// getSizeRequired{DSH,IOH,SSH,CS} estimates are at least as large as what was
// actually consumed from each heap and from the command stream.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    // Use the image-to-image copy built-in as a source of a real kernel.
    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                     cmdQ.getContext(), cmdQ.getDevice());
    ASSERT_NE(nullptr, &builder);

    BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);

    // Fatal on purpose: begin() is dereferenced right below, which would be
    // invalid on an empty dispatch list.
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSize = 256;
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // Baselines are taken before the interface descriptor is reserved, so the
    // descriptor and its alignment padding count towards the DSH usage.
    auto usedBeforeCS = commandStream.getUsed();
    auto usedBeforeDSH = dsh.getUsed();
    auto usedBeforeIOH = ioh.getUsed();
    auto usedBeforeSSH = ssh.getUsed();

    dsh.align(HardwareCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
    size_t IDToffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    HardwareCommandsHelper<FamilyType>::sendMediaInterfaceDescriptorLoad(
        commandStream,
        IDToffset,
        sizeof(INTERFACE_DESCRIPTOR_DATA));

    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *kernel,
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        IDToffset,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
        true,
        isCcsUsed);

    // EXPECT_GE (rather than EXPECT_EQ) is intentional: the size helpers are
    // only used as upper-bound estimates to avoid running out of heap space.
    auto usedAfterDSH = dsh.getUsed();
    auto usedAfterIOH = ioh.getUsed();
    auto usedAfterSSH = ssh.getUsed();

    auto sizeRequiredDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto sizeRequiredIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSize);
    auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);

    EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
    EXPECT_GE(sizeRequiredIOH, usedAfterIOH - usedBeforeIOH);
    EXPECT_GE(sizeRequiredSSH, usedAfterSSH - usedBeforeSSH);

    auto usedAfterCS = commandStream.getUsed();
    EXPECT_GE(HardwareCommandsHelper<FamilyType>::getSizeRequiredCS(kernel), usedAfterCS - usedBeforeCS);
}
// Emits indirect state for the mock kernel and checks the binding table entry
// count written to the interface descriptor at the start of the DSH: the
// kernel's count when binding table prefetch is enabled, zero otherwise.
// NOTE(review): the test name says four entries but the kernel is configured
// with three - confirm which one is intended.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsCorrectBindingTableEntryCount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);
    auto walker = static_cast<GPGPU_WALKER *>(cs.getSpace(sizeof(GPGPU_WALKER)));
    *walker = FamilyType::cmdInitGpgpuWalker;

    auto &kernel = *mockKernelWithInternal->mockKernel;
    const auto bindingTableCount = 3u;
    kernel.numberOfBindingTableStates = bindingTableCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t lwsX = 256;
    const size_t lws[3]{lwsX, 1, 1};
    uint32_t idIndex = 0;
    auto ccsInUse = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());

    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        cs, dsh, ioh, ssh,
        kernel,
        kernel.getKernelInfo().getMaxSimdSize(),
        lws,
        0,
        idIndex,
        pDevice->getPreemptionMode(),
        walker,
        nullptr,
        true,
        ccsInUse);

    auto descriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    if (!HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
        EXPECT_EQ(0u, descriptor->getBindingTableEntryCount());
    } else {
        EXPECT_EQ(bindingTableCount, descriptor->getBindingTableEntryCount());
    }
}
// Marks the mock kernel as a scheduler kernel and checks that the interface
// descriptor at the start of the DSH reports zero binding table entries even
// though the kernel itself has three.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsZeroBindingTableEntryCount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);
    auto walker = static_cast<GPGPU_WALKER *>(cs.getSpace(sizeof(GPGPU_WALKER)));
    *walker = FamilyType::cmdInitGpgpuWalker;

    auto &kernel = *mockKernelWithInternal->mockKernel;
    const auto bindingTableCount = 3u;
    kernel.numberOfBindingTableStates = bindingTableCount;
    // The flag is const-qualified on the kernel; cast the qualifier away to flip it for the test.
    const_cast<bool &>(kernel.isSchedulerKernel) = true;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t lwsX = 256;
    const size_t lws[3]{lwsX, 1, 1};
    uint32_t idIndex = 0;
    auto ccsInUse = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());

    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        cs, dsh, ioh, ssh,
        kernel,
        kernel.getKernelInfo().getMaxSimdSize(),
        lws,
        0,
        idIndex,
        pDevice->getPreemptionMode(),
        walker,
        nullptr,
        true,
        ccsInUse);

    auto descriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    EXPECT_EQ(0u, descriptor->getBindingTableEntryCount());
}
// Gives the mock kernel 100 binding table states and checks that the interface
// descriptor at the start of the DSH reports 31 entries when binding table
// prefetch is enabled, and zero otherwise.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorHas31BindingTableEntriesSet) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);
    auto &cs = cmdQ.getCS(1024);
    auto walker = static_cast<GPGPU_WALKER *>(cs.getSpace(sizeof(GPGPU_WALKER)));
    *walker = FamilyType::cmdInitGpgpuWalker;

    auto &kernel = *mockKernelWithInternal->mockKernel;
    const auto bindingTableCount = 100u;
    kernel.numberOfBindingTableStates = bindingTableCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t lwsX = 256;
    const size_t lws[3]{lwsX, 1, 1};
    uint32_t idIndex = 0;
    auto ccsInUse = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());

    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        cs, dsh, ioh, ssh,
        kernel,
        kernel.getKernelInfo().getMaxSimdSize(),
        lws,
        0,
        idIndex,
        pDevice->getPreemptionMode(),
        walker,
        nullptr,
        true,
        ccsInUse);

    auto descriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    if (!HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
        EXPECT_EQ(0u, descriptor->getBindingTableEntryCount());
    } else {
        EXPECT_EQ(31u, descriptor->getBindingTableEntryCount());
    }
}
// Emits indirect state for a kernel whose KernelInfo carries a non-default
// workgroup walk/dimension order ({2, 1, 0}) and checks that the local ids
// written at the start of the indirect object heap match what
// generateLocalIDs produces for that dimension order.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKernelsWalkOrderIsTakenIntoAccount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> img(Image2dHelper<>::create(pContext));
    // The image is handed to the builder as both source and destination, so a
    // failed creation must stop the test here.
    ASSERT_NE(nullptr, img.get());

    MultiDispatchInfo multiDispatchInfo;
    auto &builder = cmdQ.getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                             cmdQ.getContext(), cmdQ.getDevice());

    BuiltinOpParams dc;
    dc.srcMemObj = img.get();
    dc.dstMemObj = img.get();
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSizeX = 2;
    const size_t localWorkSizeY = 3;
    const size_t localWorkSizeZ = 4;
    const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    dsh.align(HardwareCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
    size_t IDToffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    // Reuse the built-in kernel's patch info but reverse the walk and
    // dimension order.
    KernelInfo modifiedKernelInfo = {};
    modifiedKernelInfo.patchInfo = kernel->getKernelInfo().patchInfo;
    modifiedKernelInfo.workgroupWalkOrder[0] = 2;
    modifiedKernelInfo.workgroupWalkOrder[1] = 1;
    modifiedKernelInfo.workgroupWalkOrder[2] = 0;
    modifiedKernelInfo.workgroupDimensionsOrder[0] = 2;
    modifiedKernelInfo.workgroupDimensionsOrder[1] = 1;
    modifiedKernelInfo.workgroupDimensionsOrder[2] = 0;
    MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false};

    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        mockKernel,
        modifiedKernelInfo.getMaxSimdSize(),
        localWorkSizes,
        IDToffset,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
        true,
        isCcsUsed);

    // Expected size: three uint16_t id channels per thread, each 32 lanes wide
    // for SIMD32 and 16 lanes wide otherwise.
    size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
    numThreads = Math::divideAndRoundUp(numThreads, modifiedKernelInfo.getMaxSimdSize());
    size_t expectedIohSize = ((modifiedKernelInfo.getMaxSimdSize() == 32) ? 32 : 16) * 3 * numThreads * sizeof(uint16_t);
    ASSERT_LE(expectedIohSize, ioh.getUsed());

    auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
    // generateLocalIDs writes through this pointer; do not continue without a buffer.
    ASSERT_NE(nullptr, expectedLocalIds);
    generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(),
                     std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
                     std::array<uint8_t, 3>{{modifiedKernelInfo.workgroupDimensionsOrder[0], modifiedKernelInfo.workgroupDimensionsOrder[1], modifiedKernelInfo.workgroupDimensionsOrder[2]}}, false);

    EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
    alignedFree(expectedLocalIds);
}
// Pre-fills the location where the binding table will be written in the SSH
// with a marker pattern, emits indirect state for the buffer-to-image copy
// kernel, and checks that the two binding table entries were overwritten with
// surface state offsets 0x00 and 0x40.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointer) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToImage3d,
                                                                                                     cmdQ.getContext(), cmdQ.getDevice());
    ASSERT_NE(nullptr, &builder);

    BuiltinOpParams dc;
    dc.srcPtr = nullptr;
    dc.dstMemObj = dstImage.get();
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    dc.dstRowPitch = 0;
    dc.dstSlicePitch = 0;
    builder.buildDispatchInfos(multiDispatchInfo, dc);

    // Fatal on purpose: begin() is dereferenced right below, which would be
    // invalid on an empty dispatch list.
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSizes[3]{256, 1, 1};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // Work out where the binding table entries will be stored: right after the
    // surface states, starting from the current end of the SSH.
    const auto &kernelInfo = kernel->getKernelInfo();
    auto numSurfaceStates = kernelInfo.patchInfo.statelessGlobalMemObjKernelArgs.size() +
                            kernelInfo.patchInfo.imageMemObjKernelArgs.size();
    EXPECT_EQ(2u, numSurfaceStates);
    size_t bindingTableStateSize = numSurfaceStates * sizeof(RENDER_SURFACE_STATE);
    uint32_t *bindingTableStatesPointers = reinterpret_cast<uint32_t *>(
        reinterpret_cast<uint8_t *>(ssh.getCpuBase()) + ssh.getUsed() + bindingTableStateSize);

    // Seed the entries with a marker so an untouched entry is detectable.
    for (auto i = 0u; i < numSurfaceStates; i++) {
        bindingTableStatesPointers[i] = 0xDEADBEEF;
    }

    // force stateful path for buffers
    const_cast<KernelInfo &>(kernelInfo).requiresSshForBuffers = true;

    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *kernel,
        kernel->getKernelInfo().getMaxSimdSize(),
        localWorkSizes,
        0,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
        true,
        isCcsUsed);

    EXPECT_EQ(0x00000000u, bindingTableStatesPointers[0]);
    EXPECT_EQ(0x00000040u, bindingTableStatesPointers[1]);
}
// Builds a kernel by hand with five stateless surfaces (global, constant,
// private, event pool, default device queue) and a five-entry binding table,
// then emits indirect state twice - once with the kernel's SSH data starting
// at the surface state base address and once shifted by one
// RENDER_SURFACE_STATE - and checks that every binding table entry in the
// queue's SSH points at localSshOffset + i * sizeof(RENDER_SURFACE_STATE).
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointersForGlobalAndConstantAndPrivateAndEventPoolAndDefaultCommandQueueSurfaces) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;

    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();
    SPatchExecutionEnvironment tokenEE = {};
    tokenEE.CompiledSIMD8 = false;
    tokenEE.CompiledSIMD16 = false;
    tokenEE.CompiledSIMD32 = true;
    pKernelInfo->patchInfo.executionEnvironment = &tokenEE;

    // define patch offsets for global, constant, private, event pool and default device queue surfaces
    // All patch tokens are value-initialised so that fields this test does not
    // set explicitly hold zero instead of indeterminate stack contents.
    SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization AllocateStatelessGlobalMemorySurfaceWithInitialization = {};
    AllocateStatelessGlobalMemorySurfaceWithInitialization.GlobalBufferIndex = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.SurfaceStateHeapOffset = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.DataParamOffset = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization = &AllocateStatelessGlobalMemorySurfaceWithInitialization;

    SPatchAllocateStatelessConstantMemorySurfaceWithInitialization AllocateStatelessConstantMemorySurfaceWithInitialization = {};
    AllocateStatelessConstantMemorySurfaceWithInitialization.ConstantBufferIndex = 0;
    AllocateStatelessConstantMemorySurfaceWithInitialization.SurfaceStateHeapOffset = 64;
    AllocateStatelessConstantMemorySurfaceWithInitialization.DataParamOffset = 8;
    AllocateStatelessConstantMemorySurfaceWithInitialization.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization = &AllocateStatelessConstantMemorySurfaceWithInitialization;

    SPatchAllocateStatelessPrivateSurface AllocateStatelessPrivateMemorySurface = {};
    AllocateStatelessPrivateMemorySurface.PerThreadPrivateMemorySize = 32;
    AllocateStatelessPrivateMemorySurface.SurfaceStateHeapOffset = 128;
    AllocateStatelessPrivateMemorySurface.DataParamOffset = 16;
    AllocateStatelessPrivateMemorySurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessPrivateSurface = &AllocateStatelessPrivateMemorySurface;

    SPatchAllocateStatelessEventPoolSurface AllocateStatelessEventPoolSurface = {};
    AllocateStatelessEventPoolSurface.SurfaceStateHeapOffset = 192;
    AllocateStatelessEventPoolSurface.DataParamOffset = 24;
    AllocateStatelessEventPoolSurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessEventPoolSurface = &AllocateStatelessEventPoolSurface;

    SPatchAllocateStatelessDefaultDeviceQueueSurface AllocateStatelessDefaultDeviceQueueSurface = {};
    AllocateStatelessDefaultDeviceQueueSurface.SurfaceStateHeapOffset = 256;
    AllocateStatelessDefaultDeviceQueueSurface.DataParamOffset = 32;
    AllocateStatelessDefaultDeviceQueueSurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface = &AllocateStatelessDefaultDeviceQueueSurface;

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // setup global memory
    char globalBuffer[16] = {};
    GraphicsAllocation gfxGlobalAlloc(0, GraphicsAllocation::AllocationType::UNKNOWN, globalBuffer, castToUint64(globalBuffer), 0llu, sizeof(globalBuffer), MemoryPool::MemoryNull);
    program.setGlobalSurface(&gfxGlobalAlloc);

    // setup constant memory
    char constBuffer[16] = {};
    GraphicsAllocation gfxConstAlloc(0, GraphicsAllocation::AllocationType::UNKNOWN, constBuffer, castToUint64(constBuffer), 0llu, sizeof(constBuffer), MemoryPool::MemoryNull);
    program.setConstantSurface(&gfxConstAlloc);

    // create kernel
    // Owned by a smart pointer so the fatal assertions below cannot leak it.
    std::unique_ptr<MockKernel> pKernel(new MockKernel(&program, *pKernelInfo, *pDevice));
    SKernelBinaryHeaderCommon kernelHeader = {};

    // setup surface state heap
    constexpr uint32_t numSurfaces = 5;
    constexpr uint32_t sshSize = numSurfaces * sizeof(RENDER_SURFACE_STATE) + numSurfaces * sizeof(BINDING_TABLE_STATE);
    auto freeAligned = [](unsigned char *ptr) { alignedFree(ptr); };
    std::unique_ptr<unsigned char, decltype(freeAligned)> surfaceStateHeapOwner(
        reinterpret_cast<unsigned char *>(alignedMalloc(sshSize, sizeof(RENDER_SURFACE_STATE))), freeAligned);
    ASSERT_NE(nullptr, surfaceStateHeapOwner.get());
    unsigned char *surfaceStateHeap = surfaceStateHeapOwner.get();

    // The binding table lives right behind the five surface states.
    uint32_t btiOffset = static_cast<uint32_t>(numSurfaces * sizeof(RENDER_SURFACE_STATE));
    auto bti = reinterpret_cast<BINDING_TABLE_STATE *>(surfaceStateHeap + btiOffset);
    for (uint32_t i = 0; i < numSurfaces; ++i) {
        bti[i].setSurfaceStatePointer(i * sizeof(RENDER_SURFACE_STATE));
    }
    kernelHeader.SurfaceStateHeapSize = sshSize;

    // setup kernel heap
    uint32_t kernelIsa[32] = {};
    kernelHeader.KernelHeapSize = sizeof(kernelIsa);

    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeap = kernelIsa;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // setup binding table state
    SPatchBindingTableState bindingTableState = {};
    bindingTableState.Token = iOpenCL::PATCH_TOKEN_BINDING_TABLE_STATE;
    bindingTableState.Size = sizeof(SPatchBindingTableState);
    bindingTableState.Count = 5;
    bindingTableState.Offset = btiOffset;
    bindingTableState.SurfaceStateOffset = 0;
    pKernelInfo->patchInfo.bindingTableState = &bindingTableState;

    // setup thread payload
    SPatchThreadPayload threadPayload = {};
    threadPayload.LocalIDXPresent = 1;
    threadPayload.LocalIDYPresent = 1;
    threadPayload.LocalIDZPresent = 1;
    pKernelInfo->patchInfo.threadPayload = &threadPayload;

    // define stateful path
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = true;

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    // setup cross thread data
    char pCrossThreadData[64] = {};
    pKernel->setCrossThreadData(pCrossThreadData, sizeof(pCrossThreadData));

    // try with different offsets to surface state base address
    for (uint32_t ssbaOffset : {0U, static_cast<uint32_t>(sizeof(RENDER_SURFACE_STATE))}) {
        CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);

        auto &commandStream = cmdQ.getCS(1024);
        auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
        *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

        auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
        auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
        auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

        // The kernel must report as many binding table states as were set up above.
        EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());

        const size_t localWorkSizes[3]{256, 1, 1};

        dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

        ssh.getSpace(ssbaOffset); // offset local ssh from surface state base address

        uint32_t localSshOffset = static_cast<uint32_t>(ssh.getUsed());

        // push surfaces states and binding table to given ssh heap
        uint32_t interfaceDescriptorIndex = 0;
        auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
        HardwareCommandsHelper<FamilyType>::sendIndirectState(
            commandStream,
            dsh,
            ioh,
            ssh,
            *pKernel,
            pKernel->getKernelInfo().getMaxSimdSize(),
            localWorkSizes,
            0,
            interfaceDescriptorIndex,
            pDevice->getPreemptionMode(),
            pWalkerCmd,
            nullptr,
            true,
            isCcsUsed);

        // Inspect the binding table as it was copied into the queue's SSH.
        auto emittedBti = reinterpret_cast<BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
        for (uint32_t i = 0; i < numSurfaces; ++i) {
            uint32_t expected = static_cast<uint32_t>(localSshOffset + i * sizeof(RENDER_SURFACE_STATE));
            EXPECT_EQ(expected, emittedBti[i].getSurfaceStatePointer());
        }

        // Detach the stack-backed surfaces from the program.
        program.setGlobalSurface(nullptr);
        program.setConstantSurface(nullptr);

        //exhaust space to trigger reload
        ssh.getSpace(ssh.getAvailableSpace());
        dsh.getSpace(dsh.getAvailableSpace());
    }

    // Release explicitly, SSH first and kernel second; the smart pointers
    // above only matter when a fatal assertion leaves the test early.
    surfaceStateHeapOwner.reset();
    pKernel.reset();
}
// Checks that pushBindingTableAndSurfaceStates() returns 0 and consumes no surface state heap
// space for a kernel that uses an SSH but whose buffers do not require it
// (usesSsh == true, requiresSshForBuffers == false).
HWTEST_F(HardwareCommandsTest, setBindingTableStatesForKernelWithBuffersNotRequiringSSHDoesNotTouchSSH) {
    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // create kernel; the unique_ptr releases it even when a fatal assertion leaves the test early
    auto pKernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pDevice);

    // setup surface state heap
    char surfaceStateHeap[256];
    SKernelBinaryHeaderCommon kernelHeader;
    kernelHeader.SurfaceStateHeapSize = sizeof(surfaceStateHeap);
    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // define stateful path
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = false;

    // register a single stateless global memory argument with all offsets and sizes zeroed
    SPatchStatelessGlobalMemoryObjectKernelArgument statelessGlobalMemory;
    statelessGlobalMemory.ArgumentNumber = 0;
    statelessGlobalMemory.DataParamOffset = 0;
    statelessGlobalMemory.DataParamSize = 0;
    statelessGlobalMemory.Size = 0;
    statelessGlobalMemory.SurfaceStateHeapOffset = 0;
    pKernelInfo->patchInfo.statelessGlobalMemObjKernelArgs.push_back(&statelessGlobalMemory);

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // align up front so the second align(8) below must be a no-op if nothing was written
    ssh.align(8);
    auto usedBefore = ssh.getUsed();

    // the kernel is expected to expose no binding table states
    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);

    // set binding table states
    auto dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    auto usedAfter = ssh.getUsed();
    EXPECT_EQ(usedBefore, usedAfter);
    ssh.align(8);
    EXPECT_EQ(usedAfter, ssh.getUsed());

    // destroy the kernel while the stack objects referenced through pKernelInfo are still alive
    pKernel.reset();
}
// Checks that pushBindingTableAndSurfaceStates() returns 0 when there are no surfaces to push:
// first with no binding table state patch token at all, then with a token whose Count is 0.
HWTEST_F(HardwareCommandsTest, setBindingTableStatesForNoSurfaces) {
    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // create kernel; the unique_ptr releases it even when a fatal assertion leaves the test early
    auto pKernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pDevice);

    // setup surface state heap
    char surfaceStateHeap[256];
    SKernelBinaryHeaderCommon kernelHeader;
    kernelHeader.SurfaceStateHeapSize = sizeof(surfaceStateHeap);
    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // define stateful path
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = true;

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // the kernel is expected to expose no binding table states
    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);

    // no binding table state token: both the KernelInfo and the Kernel overloads return 0
    auto dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernelInfo);
    EXPECT_EQ(0u, dstBindingTablePointer);
    dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    // binding table state token present, but describing zero entries
    SPatchBindingTableState bindingTableState;
    bindingTableState.Token = iOpenCL::PATCH_TOKEN_BINDING_TABLE_STATE;
    bindingTableState.Size = sizeof(SPatchBindingTableState);
    bindingTableState.Count = 0;
    bindingTableState.Offset = 64;
    bindingTableState.SurfaceStateOffset = 0;
    pKernelInfo->patchInfo.bindingTableState = &bindingTableState;
    dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    // detach the stack-allocated token before the kernel goes away
    pKernelInfo->patchInfo.bindingTableState = nullptr;

    // destroy the kernel while the stack objects referenced through pKernelInfo are still alive
    pKernel.reset();
}
// Table-driven check of HardwareCommandsHelper::alignSlmSize(): each requested SLM size must map
// to the listed aligned size. Gen8 uses its own table; every other core family uses the default one.
HWTEST_F(HardwareCommandsTest, GivenVariousValuesWhenAlignSlmSizeIsCalledThenCorrectValueIsReturned) {
    struct SlmAlignmentCase {
        uint32_t requestedSize;
        uint32_t alignedSize;
    };

    const SlmAlignmentCase gen8Cases[] = {
        {0u, 0u},
        {1u, 4096u},
        {1024u, 4096u},
        {1025u, 4096u},
        {2048u, 4096u},
        {2049u, 4096u},
        {4096u, 4096u},
        {4097u, 8192u},
        {8192u, 8192u},
        {8193u, 16384u},
        {12288u, 16384u},
        {16384u, 16384u},
        {16385u, 32768u},
        {24576u, 32768u},
        {32768u, 32768u},
        {32769u, 65536u},
        {49152u, 65536u},
        {65535u, 65536u},
        {65536u, 65536u},
    };

    const SlmAlignmentCase defaultCases[] = {
        {0u, 0u},
        {1u, 1024u},
        {1024u, 1024u},
        {1025u, 2048u},
        {2048u, 2048u},
        {2049u, 4096u},
        {4096u, 4096u},
        {4097u, 8192u},
        {8192u, 8192u},
        {8193u, 16384u},
        {16384u, 16384u},
        {16385u, 32768u},
        {32768u, 32768u},
        {32769u, 65536u},
        {65536u, 65536u},
    };

    const bool isGen8 = (::renderCoreFamily == IGFX_GEN8_CORE);
    const SlmAlignmentCase *cases = isGen8 ? gen8Cases : defaultCases;
    const size_t caseCount = isGen8 ? (sizeof(gen8Cases) / sizeof(gen8Cases[0]))
                                    : (sizeof(defaultCases) / sizeof(defaultCases[0]));

    for (size_t caseIndex = 0; caseIndex < caseCount; ++caseIndex) {
        EXPECT_EQ(cases[caseIndex].alignedSize, HardwareCommandsHelper<FamilyType>::alignSlmSize(cases[caseIndex].requestedSize));
    }
}
// Table-driven check of HardwareCommandsHelper::computeSlmValues(): each requested SLM size must
// map to the listed value. Gen8 uses its own table; every other core family uses the default one.
HWTEST_F(HardwareCommandsTest, GivenVariousValuesWhenComputeSlmSizeIsCalledThenCorrectValueIsReturned) {
    struct SlmValueCase {
        uint32_t requestedSize;
        uint32_t slmValue;
    };

    const SlmValueCase gen8Cases[] = {
        {0u, 0u},
        {1u, 1u},
        {1024u, 1u},
        {1025u, 1u},
        {2048u, 1u},
        {2049u, 1u},
        {4096u, 1u},
        {4097u, 2u},
        {8192u, 2u},
        {8193u, 4u},
        {12288u, 4u},
        {16384u, 4u},
        {16385u, 8u},
        {24576u, 8u},
        {32768u, 8u},
        {32769u, 16u},
        {49152u, 16u},
        {65535u, 16u},
        {65536u, 16u},
    };

    const SlmValueCase defaultCases[] = {
        {0u, 0u},
        {1u, 1u},
        {1024u, 1u},
        {1025u, 2u},
        {2048u, 2u},
        {2049u, 3u},
        {4096u, 3u},
        {4097u, 4u},
        {8192u, 4u},
        {8193u, 5u},
        {16384u, 5u},
        {16385u, 6u},
        {32768u, 6u},
        {32769u, 7u},
        {65536u, 7u},
    };

    const bool isGen8 = (::renderCoreFamily == IGFX_GEN8_CORE);
    const SlmValueCase *cases = isGen8 ? gen8Cases : defaultCases;
    const size_t caseCount = isGen8 ? (sizeof(gen8Cases) / sizeof(gen8Cases[0]))
                                    : (sizeof(defaultCases) / sizeof(defaultCases[0]));

    for (size_t caseIndex = 0; caseIndex < caseCount; ++caseIndex) {
        EXPECT_EQ(cases[caseIndex].slmValue, HardwareCommandsHelper<FamilyType>::computeSlmValues(cases[caseIndex].requestedSize));
    }
}
// Programs indirect state for a kernel that carries a border color and two sampler states in its
// dynamic state heap image, then checks that
//  - the border color bytes were copied into the queue DSH at the next 64-byte aligned address,
//  - every sampler state field matches the source sampler state,
//  - each copied sampler state points at the copied border color via its indirect state pointer.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenIndirectStateIsProgrammedThenBorderColorIsCorrectlyCopiedToDshAndSamplerStatesAreProgrammedWithPointer) {
    using SAMPLER_STATE = typename FamilyType::SAMPLER_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    const size_t localWorkSizes[3]{1, 1, 1};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // kernel DSH image layout: [border color (64 bytes)][2 x SAMPLER_STATE]
    const uint32_t borderColorSize = 64;
    const uint32_t samplerStateSize = sizeof(SAMPLER_STATE) * 2;

    SPatchSamplerStateArray samplerStateArray;
    samplerStateArray.BorderColorOffset = 0x0;
    samplerStateArray.Count = 2;
    samplerStateArray.Offset = borderColorSize;
    samplerStateArray.Size = samplerStateSize;
    samplerStateArray.Token = 1;

    // RAII-owned kernel DSH image; released automatically at the end of the test
    auto mockDshStorage = std::make_unique<char[]>((borderColorSize + samplerStateSize) * 4);
    char *mockDsh = mockDshStorage.get();
    // distinct fill patterns for the border color and the sampler state region
    memset(mockDsh, 6, borderColorSize);
    memset(mockDsh + borderColorSize, 8, borderColorSize);

    mockKernelWithInternal->kernelInfo.heapInfo.pDsh = mockDsh;
    mockKernelWithInternal->kernelInfo.patchInfo.samplerStateArray = &samplerStateArray;

    uint64_t interfaceDescriptorTableOffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));
    // consume 4 extra bytes so the current DSH position is not 64-byte aligned
    dsh.getSpace(4);

    // the border color is expected at the next 64-byte aligned address after the current position
    char *initialDshPointer = static_cast<char *>(dsh.getCpuBase()) + dsh.getUsed();
    char *borderColorPointer = alignUp(initialDshPointer, 64);
    uint32_t borderColorOffset = static_cast<uint32_t>(borderColorPointer - static_cast<char *>(dsh.getCpuBase()));

    // clear the indirect state pointer in the source sampler states; the helper is expected to patch it
    SAMPLER_STATE *pSamplerState = reinterpret_cast<SAMPLER_STATE *>(mockDsh + borderColorSize);
    for (uint32_t i = 0; i < 2; i++) {
        pSamplerState[i].setIndirectStatePointer(0);
    }

    mockKernelWithInternal->mockKernel->setCrossThreadData(mockKernelWithInternal->crossThreadData, sizeof(mockKernelWithInternal->crossThreadData));
    mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal));

    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());

    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        *mockKernelWithInternal->mockKernel,
        8,
        localWorkSizes,
        interfaceDescriptorTableOffset,
        interfaceDescriptorIndex,
        pDevice->getPreemptionMode(),
        pWalkerCmd,
        nullptr,
        true,
        isCcsUsed);

    bool isMemorySame = memcmp(borderColorPointer, mockDsh, borderColorSize) == 0;
    EXPECT_TRUE(isMemorySame);

    // the sampler states are expected directly behind the copied border color
    SAMPLER_STATE *pSamplerStatesCopied = reinterpret_cast<SAMPLER_STATE *>(borderColorPointer + borderColorSize);
    for (uint32_t i = 0; i < 2; i++) {
        EXPECT_EQ(pSamplerState[i].getNonNormalizedCoordinateEnable(), pSamplerStatesCopied[i].getNonNormalizedCoordinateEnable());
        EXPECT_EQ(pSamplerState[i].getTcxAddressControlMode(), pSamplerStatesCopied[i].getTcxAddressControlMode());
        EXPECT_EQ(pSamplerState[i].getTcyAddressControlMode(), pSamplerStatesCopied[i].getTcyAddressControlMode());
        EXPECT_EQ(pSamplerState[i].getTczAddressControlMode(), pSamplerStatesCopied[i].getTczAddressControlMode());
        EXPECT_EQ(pSamplerState[i].getMinModeFilter(), pSamplerStatesCopied[i].getMinModeFilter());
        EXPECT_EQ(pSamplerState[i].getMagModeFilter(), pSamplerStatesCopied[i].getMagModeFilter());
        EXPECT_EQ(pSamplerState[i].getMipModeFilter(), pSamplerStatesCopied[i].getMipModeFilter());
        EXPECT_EQ(pSamplerState[i].getUAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getUAddressMinFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getUAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getUAddressMagFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getVAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getVAddressMinFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getVAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getVAddressMagFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getRAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getRAddressMagFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getRAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getRAddressMinFilterRoundingEnable());
        EXPECT_EQ(pSamplerState[i].getLodAlgorithm(), pSamplerStatesCopied[i].getLodAlgorithm());
        EXPECT_EQ(pSamplerState[i].getTextureLodBias(), pSamplerStatesCopied[i].getTextureLodBias());
        EXPECT_EQ(pSamplerState[i].getLodPreclampMode(), pSamplerStatesCopied[i].getLodPreclampMode());
        EXPECT_EQ(pSamplerState[i].getTextureBorderColorMode(), pSamplerStatesCopied[i].getTextureBorderColorMode());
        EXPECT_EQ(pSamplerState[i].getSamplerDisable(), pSamplerStatesCopied[i].getSamplerDisable());
        EXPECT_EQ(pSamplerState[i].getCubeSurfaceControlMode(), pSamplerStatesCopied[i].getCubeSurfaceControlMode());
        EXPECT_EQ(pSamplerState[i].getShadowFunction(), pSamplerStatesCopied[i].getShadowFunction());
        EXPECT_EQ(pSamplerState[i].getChromakeyMode(), pSamplerStatesCopied[i].getChromakeyMode());
        EXPECT_EQ(pSamplerState[i].getChromakeyIndex(), pSamplerStatesCopied[i].getChromakeyIndex());
        EXPECT_EQ(pSamplerState[i].getChromakeyEnable(), pSamplerStatesCopied[i].getChromakeyEnable());
        EXPECT_EQ(pSamplerState[i].getMaxLod(), pSamplerStatesCopied[i].getMaxLod());
        EXPECT_EQ(pSamplerState[i].getMinLod(), pSamplerStatesCopied[i].getMinLod());
        EXPECT_EQ(pSamplerState[i].getLodClampMagnificationMode(), pSamplerStatesCopied[i].getLodClampMagnificationMode());

        // the copied sampler state must reference the border color by its offset in the DSH
        EXPECT_EQ(borderColorOffset, pSamplerStatesCopied[i].getIndirectStatePointer());
    }
}
using HardwareCommandsHelperTests = ::testing::Test;

// Builds the expected MI_SEMAPHORE_WAIT by hand (not-equal compare, polling mode, given address
// and data) and checks that programMiSemaphoreWait() emits exactly that command into the stream.
HWTEST_F(HardwareCommandsHelperTests, givenCompareAddressAndDataWhenProgrammingSemaphoreWaitThenSetupAllFields) {
    using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT;

    uint64_t semaphoreAddress = 0x10000;
    uint32_t semaphoreData = 1234;

    // expected command, assembled field by field
    MI_SEMAPHORE_WAIT expectedCmd = FamilyType::cmdInitMiSemaphoreWait;
    expectedCmd.setCompareOperation(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
    expectedCmd.setSemaphoreDataDword(semaphoreData);
    expectedCmd.setSemaphoreGraphicsAddress(semaphoreAddress);
    expectedCmd.setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE);

    // command produced by the helper
    uint8_t streamStorage[1024] = {};
    LinearStream stream(streamStorage, sizeof(streamStorage));
    HardwareCommandsHelper<FamilyType>::programMiSemaphoreWait(stream, semaphoreAddress, semaphoreData);

    EXPECT_EQ(sizeof(MI_SEMAPHORE_WAIT), stream.getUsed());
    EXPECT_EQ(0, memcmp(&expectedCmd, streamStorage, sizeof(MI_SEMAPHORE_WAIT)));
}
// Checks that the stream-based programMiAtomic() overload places one MI_ATOMIC at the start of the
// stream and that its content equals what the in-place overload writes into a reference command.
HWTEST_F(HardwareCommandsHelperTests, whenProgrammingMiAtomicThenSetupAllFields) {
    using MI_ATOMIC = typename FamilyType::MI_ATOMIC;

    uint64_t targetAddress = 0x10000;
    auto atomicOpcode = MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_DECREMENT;
    auto atomicDataSize = MI_ATOMIC::DATA_SIZE::DATA_SIZE_DWORD;

    // reference command filled by the in-place overload
    MI_ATOMIC expectedCmd = FamilyType::cmdInitAtomic;
    HardwareCommandsHelper<FamilyType>::programMiAtomic(expectedCmd, targetAddress, atomicOpcode, atomicDataSize);

    // command emitted into a linear stream
    uint8_t streamStorage[1024] = {};
    LinearStream stream(streamStorage, sizeof(streamStorage));
    auto programmedCmd = HardwareCommandsHelper<FamilyType>::programMiAtomic(stream, targetAddress, atomicOpcode, atomicDataSize);

    EXPECT_EQ(sizeof(MI_ATOMIC), stream.getUsed());
    EXPECT_EQ(programmedCmd, stream.getCpuBase());
    EXPECT_EQ(0, memcmp(&expectedCmd, programmedCmd, sizeof(MI_ATOMIC)));
}
using ParentKernelCommandsFromBinaryTest = ExecutionModelKernelFixture;

// Recomputes the expected SSH size for the execution model by hand and compares it with
// getSshSizeForExecutionModel(). The expected size is built from:
//  - an initial alignment margin,
//  - the aligned surface state heap size of every block kernel,
//  - the largest block binding table, once per interface descriptor entry,
//  - the scheduler kernel's SSH size (plus one alignment unit when non-zero).
// The check only runs when the device reports an "OpenCL 2." version string.
HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelCommandsFromBinaryTest, getSizeRequiredForExecutionModelForSurfaceStatesReturnsSizeOfBlocksPlusMaxBindingTableSizeForAllIDTEntriesAndSchedulerSSHSize) {
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;

    const std::string clVersion(pPlatform->getDevice(0)->getDeviceInfo().clVersion);
    if (clVersion.find("OpenCL 2.") == std::string::npos) {
        return;
    }

    EXPECT_TRUE(pKernel->isParentKernel);

    BlockKernelManager *blockManager = pKernel->getProgram()->getBlockKernelManager();
    uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());

    // start with the margin needed for the initial alignment
    size_t expectedSize = BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE - 1;
    uint32_t largestBindingTableCount = 0;

    for (uint32_t blockIndex = 0; blockIndex < blockCount; blockIndex++) {
        const KernelInfo *blockInfo = blockManager->getBlockKernelInfo(blockIndex);

        expectedSize += blockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize;
        expectedSize = alignUp(expectedSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

        // blocks without a binding table contribute a count of zero
        const auto *blockBindingTable = blockInfo->patchInfo.bindingTableState;
        const uint32_t blockBindingTableCount = blockBindingTable ? blockBindingTable->Count : 0;
        largestBindingTableCount = std::max(largestBindingTableCount, blockBindingTableCount);
    }

    expectedSize += largestBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries;

    BuiltIns &builtIns = *pDevice->getExecutionEnvironment()->getBuiltIns();
    auto &scheduler = builtIns.getSchedulerKernel(*pContext);
    auto schedulerSshSize = scheduler.getSurfaceStateHeapSize();

    expectedSize += schedulerSshSize + ((schedulerSshSize != 0) ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0);
    expectedSize = alignUp(expectedSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

    EXPECT_EQ(expectedSize, HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*pKernel));
}
// Parameters for ParentKernelCommandsFromBinaryTest: a single binary name ...
static const char *binaryFile = "simple_block_kernel";
// ... combined with each of these kernel names.
static const char *KernelNames[] = {"kernel_reflection", "simple_block_kernel"};
// Instantiates the parameterized suite once per (binaryFile, kernel name) pair.
INSTANTIATE_TEST_CASE_P(ParentKernelCommandsFromBinaryTest,
ParentKernelCommandsFromBinaryTest,
::testing::Combine(
::testing::Values(binaryFile),
::testing::ValuesIn(KernelNames)));
// Debug flag EnablePassInlineData = 1 and thread payload PassInlineData = 1:
// inline data programming is expected to be required.
HWTEST_F(HardwareCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineThenReturnTrue) {
    DebugManagerStateRestore restore;
    DebugManager.flags.EnablePassInlineData.set(1u);

    uint32_t crossThreadData[8];
    auto &kernel = *mockKernelWithInternal->mockKernel;

    // the thread payload is exposed as const, so the test has to cast to tweak it
    auto threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload);
    threadPayload->PassInlineData = 1;
    kernel.setCrossThreadData(crossThreadData, sizeof(crossThreadData));

    EXPECT_TRUE(HardwareCommandsHelper<FamilyType>::inlineDataProgrammingRequired(kernel));
}
// Without touching the debug flag, a thread payload with PassInlineData = 1 is expected to make
// inline data programming required.
HWTEST_F(HardwareCommandsTest, givenNoDebugSettingsWhenDefaultModeIsExcercisedThenWeFollowKernelSettingForInlineProgramming) {
    auto &kernel = *mockKernelWithInternal->mockKernel;

    // the thread payload is exposed as const, so the test has to cast to tweak it
    auto threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload);
    threadPayload->PassInlineData = 1;

    EXPECT_TRUE(HardwareCommandsHelper<FamilyType>::inlineDataProgrammingRequired(kernel));
}
// Debug flag EnablePassInlineData = 0 while the thread payload has PassInlineData = 1:
// inline data programming is expected not to be required.
HWTEST_F(HardwareCommandsTest, givenDisabledPassInlineDataWhenKernelAllowsInlineThenReturnFalse) {
    DebugManagerStateRestore restore;
    DebugManager.flags.EnablePassInlineData.set(0u);

    auto &kernel = *mockKernelWithInternal->mockKernel;

    // the thread payload is exposed as const, so the test has to cast to tweak it
    auto threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload);
    threadPayload->PassInlineData = 1;

    EXPECT_FALSE(HardwareCommandsHelper<FamilyType>::inlineDataProgrammingRequired(kernel));
}
// Debug flag EnablePassInlineData = 1 while the thread payload has PassInlineData = 0:
// inline data programming is expected not to be required.
HWTEST_F(HardwareCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInlineThenReturnFalse) {
    DebugManagerStateRestore restore;
    DebugManager.flags.EnablePassInlineData.set(1u);

    uint32_t crossThreadData[8];
    auto &kernel = *mockKernelWithInternal->mockKernel;

    // the thread payload is exposed as const, so the test has to cast to tweak it
    auto threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload);
    threadPayload->PassInlineData = 0;
    kernel.setCrossThreadData(crossThreadData, sizeof(crossThreadData));

    EXPECT_FALSE(HardwareCommandsHelper<FamilyType>::inlineDataProgrammingRequired(kernel));
}
// Only the X local id is marked present: kernelUsesLocalIds() is expected to return true.
HWTEST_F(HardwareCommandsTest, whenLocalIdxInXDimPresentThenExpectLocalIdsInUseIsTrue) {
    // the thread payload is exposed as const, so the test has to cast to tweak it
    auto threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload);
    threadPayload->LocalIDXPresent = 1;
    threadPayload->LocalIDYPresent = 0;
    threadPayload->LocalIDZPresent = 0;

    EXPECT_TRUE(HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel));
}
// Only the Y local id is marked present: kernelUsesLocalIds() is expected to return true.
HWTEST_F(HardwareCommandsTest, whenLocalIdxInYDimPresentThenExpectLocalIdsInUseIsTrue) {
    // the thread payload is exposed as const, so the test has to cast to tweak it
    auto threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload);
    threadPayload->LocalIDXPresent = 0;
    threadPayload->LocalIDYPresent = 1;
    threadPayload->LocalIDZPresent = 0;

    EXPECT_TRUE(HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel));
}
// Only the Z local id is marked present: kernelUsesLocalIds() is expected to return true.
HWTEST_F(HardwareCommandsTest, whenLocalIdxInZDimPresentThenExpectLocalIdsInUseIsTrue) {
    // the thread payload is exposed as const, so the test has to cast to tweak it
    auto threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload);
    threadPayload->LocalIDXPresent = 0;
    threadPayload->LocalIDYPresent = 0;
    threadPayload->LocalIDZPresent = 1;

    EXPECT_TRUE(HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel));
}
// No local id is marked present: kernelUsesLocalIds() is expected to return false.
HWTEST_F(HardwareCommandsTest, whenLocalIdxAreNotPresentThenExpectLocalIdsInUseIsFalse) {
    // the thread payload is exposed as const, so the test has to cast to tweak it
    auto threadPayload = const_cast<SPatchThreadPayload *>(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload);
    threadPayload->LocalIDXPresent = 0;
    threadPayload->LocalIDYPresent = 0;
    threadPayload->LocalIDZPresent = 0;

    EXPECT_FALSE(HardwareCommandsHelper<FamilyType>::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel));
}
// With EnableCacheFlushAfterWalker = 1 and a global surface set on the program, the global
// surface is expected among the allocations for cache flush, the required size is expected to be
// one PIPE_CONTROL, and the programmed PIPE_CONTROL is expected to have CS stall and DC flush enabled.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenCacheFlushAfterWalkerEnabledWhenProgramGlobalSurfacePresentThenExpectCacheFlushCommand) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &commandStream = cmdQ.getCS(1024);

    MockGraphicsAllocation globalAllocation;
    mockKernelWithInternal->mockProgram->setGlobalSurface(&globalAllocation);

    Kernel::CacheFlushAllocationsVec allocs;
    mockKernelWithInternal->mockKernel->getAllocationsForCacheFlush(allocs);
    EXPECT_NE(allocs.end(), std::find(allocs.begin(), allocs.end(), &globalAllocation));

    size_t expectedSize = sizeof(PIPE_CONTROL);
    size_t actualSize = HardwareCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U);
    EXPECT_EQ(expectedSize, actualSize);

    HardwareCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U);

    // Detach the stack allocation from the fixture-owned program before any fatal assertion can
    // return early; otherwise the program would keep pointing at globalAllocation after it is gone.
    mockKernelWithInternal->mockProgram->setGlobalSurface(nullptr);

    HardwareParse hwParse;
    hwParse.parseCommands<FamilyType>(commandStream);
    PIPE_CONTROL *pipeControl = hwParse.getCommand<PIPE_CONTROL>();
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
    EXPECT_TRUE(pipeControl->getDcFlushEnable());
}
// With EnableCacheFlushAfterWalker = 1 and the kernel flagged as having SVM allocations that
// require a cache flush, only the SVM allocation that still requires an L3 flush is expected among
// the allocations for cache flush; the one with setFlushL3Required(false) is expected to be left out.
// The required size is expected to be one PIPE_CONTROL with CS stall and DC flush enabled.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenCacheFlushAfterWalkerEnabledWhenSvmAllocationsSetAsCacheFlushRequiringThenExpectCacheFlushCommand) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &commandStream = cmdQ.getCS(1024);

    // two pages of backing storage so that a page-aligned, page-sized range fits inside it
    char buff[MemoryConstants::pageSize * 2];

    MockGraphicsAllocation svmAllocation1{alignUp(buff, MemoryConstants::pageSize), MemoryConstants::pageSize};
    mockKernelWithInternal->mockKernel->kernelSvmGfxAllocations.push_back(&svmAllocation1);

    // second allocation explicitly opts out of the L3 flush
    MockGraphicsAllocation svmAllocation2{alignUp(buff, MemoryConstants::pageSize), MemoryConstants::pageSize};
    svmAllocation2.setFlushL3Required(false);
    mockKernelWithInternal->mockKernel->kernelSvmGfxAllocations.push_back(&svmAllocation2);

    mockKernelWithInternal->mockKernel->svmAllocationsRequireCacheFlush = true;

    Kernel::CacheFlushAllocationsVec allocs;
    mockKernelWithInternal->mockKernel->getAllocationsForCacheFlush(allocs);
    EXPECT_NE(allocs.end(), std::find(allocs.begin(), allocs.end(), &svmAllocation1));
    EXPECT_EQ(allocs.end(), std::find(allocs.begin(), allocs.end(), &svmAllocation2));

    size_t expectedSize = sizeof(PIPE_CONTROL);
    size_t actualSize = HardwareCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U);
    EXPECT_EQ(expectedSize, actualSize);

    HardwareCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U);

    HardwareParse hwParse;
    hwParse.parseCommands<FamilyType>(commandStream);
    PIPE_CONTROL *pipeControl = hwParse.getCommand<PIPE_CONTROL>();
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
    EXPECT_TRUE(pipeControl->getDcFlushEnable());
}
// With EnableCacheFlushAfterWalker = 1 and an allocation registered in the kernel's
// kernelArgRequiresCacheFlush list, that allocation is expected among the allocations for cache
// flush, and the required size is expected to be one PIPE_CONTROL with CS stall and DC flush enabled.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenCacheFlushAfterWalkerEnabledWhenKernelArgIsSetAsCacheFlushRequiredThenExpectCacheFlushCommand) {
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;

    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableCacheFlushAfterWalker.set(1);

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &commandStream = cmdQ.getCS(1024);

    addSpaceForSingleKernelArg();

    // mark the allocation behind argument 0 as requiring a cache flush
    MockGraphicsAllocation cacheRequiringAllocation;
    mockKernelWithInternal->mockKernel->kernelArgRequiresCacheFlush.resize(2);
    mockKernelWithInternal->mockKernel->kernelArgRequiresCacheFlush[0] = &cacheRequiringAllocation;

    Kernel::CacheFlushAllocationsVec allocs;
    mockKernelWithInternal->mockKernel->getAllocationsForCacheFlush(allocs);
    EXPECT_NE(allocs.end(), std::find(allocs.begin(), allocs.end(), &cacheRequiringAllocation));

    size_t expectedSize = sizeof(PIPE_CONTROL);
    size_t actualSize = HardwareCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U);
    EXPECT_EQ(expectedSize, actualSize);

    HardwareCommandsHelper<FamilyType>::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U);

    HardwareParse hwParse;
    hwParse.parseCommands<FamilyType>(commandStream);
    PIPE_CONTROL *pipeControl = hwParse.getCommand<PIPE_CONTROL>();
    ASSERT_NE(nullptr, pipeControl);
    EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable());
    EXPECT_TRUE(pipeControl->getDcFlushEnable());
}
// With EnableCacheFlushAfterWalker = 0, getSizeRequiredForCacheFlush() is expected to return 0.
HWTEST_F(HardwareCommandsTest, givenCacheFlushAfterWalkerDisabledWhenGettingRequiredCacheFlushSizeThenReturnZero) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableCacheFlushAfterWalker.set(0);

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);

    size_t expectedSize = 0U;
    size_t actualSize = HardwareCommandsHelper<FamilyType>::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U);
    EXPECT_EQ(expectedSize, actualSize);
}
// With the debug flag set to -1 and supportCacheFlushAfterWalker forced to false in the
// capability table, the kernel is expected to report no allocations for cache flush.
TEST_F(HardwareCommandsTest, givenCacheFlushAfterWalkerEnabledWhenPlatformNotSupportFlushThenExpectNoCacheAllocationForFlush) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableCacheFlushAfterWalker.set(-1);
    hardwareInfo.capabilityTable.supportCacheFlushAfterWalker = false;

    StackVec<GraphicsAllocation *, 32> flushCandidates;
    auto &kernel = *mockKernelWithInternal->mockKernel;
    kernel.getAllocationsForCacheFlush(flushCandidates);

    EXPECT_EQ(0U, flushCandidates.size());
}
// Checks that programMiFlushDw() emits exactly one MI_FLUSH_DW whose post-sync operation is an
// immediate-data qword write carrying the requested destination address and immediate value.
HWTEST_F(HardwareCommandsTest, givenImmDataWriteWhenProgrammingMiFlushDwThenSetAllRequiredFields) {
    using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW;

    uint64_t destinationAddress = 0x1230000;
    uint64_t immediateValue = 456;

    // room for two commands, although only one is expected to be written
    uint8_t streamStorage[2 * sizeof(MI_FLUSH_DW)] = {};
    LinearStream stream(streamStorage, sizeof(streamStorage));

    HardwareCommandsHelper<FamilyType>::programMiFlushDw(stream, destinationAddress, immediateValue);

    EXPECT_EQ(sizeof(MI_FLUSH_DW), stream.getUsed());

    auto programmedCmd = reinterpret_cast<MI_FLUSH_DW *>(streamStorage);
    EXPECT_EQ(MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD, programmedCmd->getPostSyncOperation());
    EXPECT_EQ(destinationAddress, programmedCmd->getDestinationAddress());
    EXPECT_EQ(immediateValue, programmedCmd->getImmediateData());
}
using KernelCacheFlushTests = Test<HelloWorldFixture<HelloWorldFixtureFactory>>;

// A buffer created with CL_MEM_LOCALLY_UNCACHED_RESOURCE and set as kernel argument 0 is expected
// to contribute nothing to the allocations for cache flush. After a regular buffer is set as
// argument 1, exactly one allocation is expected when the capability table reports
// supportCacheFlushAfterWalker, and none otherwise.
HWTEST_F(KernelCacheFlushTests, givenLocallyUncachedBufferWhenGettingAllocationsForFlushThenEmptyVectorIsReturned) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableCacheFlushAfterWalker.set(-1);

    auto kernel = clUniquePtr(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal));
    // stop here when kernel creation failed instead of dereferencing a null kernel below
    ASSERT_NE(nullptr, kernel.get());

    cl_mem_properties_intel bufferPropertiesUncachedResource[] = {CL_MEM_FLAGS_INTEL, CL_MEM_LOCALLY_UNCACHED_RESOURCE, 0};
    auto bufferLocallyUncached = clCreateBufferWithPropertiesINTEL(context, bufferPropertiesUncachedResource, 1, nullptr, nullptr);

    kernel->setArg(0, sizeof(bufferLocallyUncached), &bufferLocallyUncached);

    using CacheFlushAllocationsVec = StackVec<GraphicsAllocation *, 32>;
    CacheFlushAllocationsVec cacheFlushVec;
    kernel->getAllocationsForCacheFlush(cacheFlushVec);
    EXPECT_EQ(0u, cacheFlushVec.size());

    auto bufferRegular = clCreateBufferWithPropertiesINTEL(context, nullptr, 1, nullptr, nullptr);
    kernel->setArg(1, sizeof(bufferRegular), &bufferRegular);

    // the same vector is reused; it was expected to be empty after the first query
    kernel->getAllocationsForCacheFlush(cacheFlushVec);
    size_t expectedCacheFlushVecSize = (hardwareInfo.capabilityTable.supportCacheFlushAfterWalker ? 1u : 0u);
    EXPECT_EQ(expectedCacheFlushVecSize, cacheFlushVec.size());

    clReleaseMemObject(bufferLocallyUncached);
    clReleaseMemObject(bufferRegular);
}