/*
 * Copyright (C) 2017-2019 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "unit_tests/helpers/hardware_commands_helper_tests.h"

#include "core/helpers/basic_math.h"
#include "core/helpers/engine_node_helper.h"
#include "core/memory_manager/unified_memory_manager.h"
#include "core/unit_tests/helpers/debug_manager_state_restore.h"
#include "core/unit_tests/utilities/base_object_utils.h"
#include "runtime/api/api.h"
#include "runtime/built_ins/builtins_dispatch_builder.h"
#include "runtime/command_queue/command_queue_hw.h"
#include "runtime/helpers/hardware_commands_helper.h"
#include "unit_tests/fixtures/execution_model_kernel_fixture.h"
#include "unit_tests/fixtures/hello_world_fixture.h"
#include "unit_tests/fixtures/image_fixture.h"
#include "unit_tests/helpers/hw_parse.h"
#include "unit_tests/mocks/mock_graphics_allocation.h"

using namespace NEO;

// Brings up the device, context and built-ins fixtures (in that order) and
// then creates the mock kernel shared by the tests below.
void HardwareCommandsTest::SetUp() {
    DeviceFixture::SetUp();
    ASSERT_NE(nullptr, pDevice);

    cl_device_id device = pDevice;
    ContextFixture::SetUp(1, &device);
    ASSERT_NE(nullptr, pContext);

    BuiltInFixture::SetUp(pDevice);
    ASSERT_NE(nullptr, pBuiltIns);

    // NOTE(review): element type reconstructed as MockKernelWithInternals - confirm against the fixture header.
    mockKernelWithInternal = std::make_unique<MockKernelWithInternals>(*pDevice, pContext);
}

// Releases the mock kernel first, then tears the fixtures down in the reverse
// order of SetUp().
void HardwareCommandsTest::TearDown() {
    mockKernelWithInternal.reset(nullptr);
    BuiltInFixture::TearDown();
    ContextFixture::TearDown();
    DeviceFixture::TearDown();
}

// Configures the shared mock kernel with exactly one kernel argument whose
// single patch entry sits at cross-thread offset 0 and is pointer sized.
void HardwareCommandsTest::addSpaceForSingleKernelArg() {
    kernelArguments.resize(1);
    kernelArguments[0] = kernelArgInfo;

    auto &kernelInfo = mockKernelWithInternal->kernelInfo;
    kernelInfo.resizeKernelArgInfoAndRegisterParameter(1);
    kernelInfo.kernelArgInfo.resize(1);
    kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector.resize(1);

    // Take the reference only after all resizes so it cannot be invalidated.
    auto &patchInfo = kernelInfo.kernelArgInfo[0].kernelArgPatchInfoVector[0];
    patchInfo.crossthreadOffset = 0;
    patchInfo.size = sizeof(uintptr_t);

    mockKernelWithInternal->mockKernel->setKernelArguments(kernelArguments);
    mockKernelWithInternal->mockKernel->kernelArgRequiresCacheFlush.resize(1);
}

// sendInterfaceDescriptorData() must not consume heap space beyond the
// INTERFACE_DESCRIPTOR_DATA that the test reserves up front.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, programInterfaceDescriptorDataResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                     cmdQ.getContext(), cmdQ.getDevice());
    ASSERT_NE(nullptr, &builder);

    BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);
    // Fatal on purpose: begin() is dereferenced right below.
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto usedIndirectHeapBefore = indirectHeap.getUsed();
    indirectHeap.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    size_t crossThreadDataSize = kernel->getCrossThreadDataSize();
    HardwareCommandsHelper<FamilyType>::sendInterfaceDescriptorData(
        indirectHeap, 0, 0, crossThreadDataSize, 64, 0, 0, 0, 1, *kernel, 0, pDevice->getPreemptionMode(), nullptr);

    auto usedIndirectHeapAfter = indirectHeap.getUsed();
    EXPECT_EQ(sizeof(INTERFACE_DESCRIPTOR_DATA), usedIndirectHeapAfter - usedIndirectHeapBefore);
}

// sendMediaInterfaceDescriptorLoad() is expected to emit exactly one
// MEDIA_INTERFACE_DESCRIPTOR_LOAD plus one MEDIA_STATE_FLUSH.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, programMediaInterfaceDescriptorLoadResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);

    auto &commandStream = cmdQ.getCS(1024);
    auto usedBefore = commandStream.getUsed();

    HardwareCommandsHelper<FamilyType>::sendMediaInterfaceDescriptorLoad(commandStream, 0, sizeof(INTERFACE_DESCRIPTOR_DATA));

    auto usedAfter = commandStream.getUsed();
    EXPECT_EQ(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD) + sizeof(MEDIA_STATE_FLUSH), usedAfter - usedBefore);
}

// sendMediaStateFlush() is expected to emit exactly one MEDIA_STATE_FLUSH.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, programMediaStateFlushResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH;

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);

    auto &commandStream = cmdQ.getCS(1024);
    auto usedBefore = commandStream.getUsed();

    HardwareCommandsHelper<FamilyType>::sendMediaStateFlush(commandStream, sizeof(INTERFACE_DESCRIPTOR_DATA));

    auto usedAfter = commandStream.getUsed();
    EXPECT_EQ(sizeof(MEDIA_STATE_FLUSH), usedAfter - usedBefore);
}

// sendCrossThreadData() must advance the heap by exactly the kernel's
// cross-thread data size.
HWTEST_F(HardwareCommandsTest, sendCrossThreadDataResourceUsage) {
    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                     cmdQ.getContext(), cmdQ.getDevice());
    ASSERT_NE(nullptr, &builder);

    BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);
    // Fatal on purpose: begin() is dereferenced right below.
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto usedBefore = indirectHeap.getUsed();
    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();

    HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap, *kernel, false, nullptr, sizeCrossThreadData);

    auto usedAfter = indirectHeap.getUsed();
    EXPECT_EQ(kernel->getCrossThreadDataSize(), usedAfter - usedBefore);
}

// With AddPatchInfoCommentsForAUBDump left untouched by the test, the
// kernel's patch info entry must come back unmodified.
HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsNotSetThenAddPatchInfoDataOffsetsAreNotMoved) {
    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);
    auto kernelInfo = std::make_unique<KernelInfo>();
    auto kernel = std::make_unique<MockKernel>(&program, *kernelInfo, *pDevice);

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);

    PatchInfoData patchInfoData = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
    kernel->getPatchInfoDataList().push_back(patchInfoData);

    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
    HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap, *kernel, false, nullptr, sizeCrossThreadData);

    ASSERT_EQ(1u, kernel->getPatchInfoDataList().size());

    const auto &patchedEntry = kernel->getPatchInfoDataList()[0];
    EXPECT_EQ(0xaaaaaaaa, patchedEntry.sourceAllocation);
    EXPECT_EQ(0u, patchedEntry.sourceAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::KernelArg, patchedEntry.sourceType);
    EXPECT_EQ(0xbbbbbbbb, patchedEntry.targetAllocation);
    EXPECT_EQ(0u, patchedEntry.targetAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::IndirectObjectHeap, patchedEntry.targetType);
}

// For a heap constructed with the internal-heap flag cleared, the offset
// returned by sendCrossThreadData() is expected to be zero.
HWTEST_F(HardwareCommandsTest, givenIndirectHeapNotAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenOffsetZeroIsReturned) {
    auto nonInternalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(MockAllocationProperties{MemoryConstants::pageSize});
    IndirectHeap indirectHeap(nonInternalAllocation, false);

    auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
    auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap, *mockKernelWithInternal->mockKernel, false, nullptr, sizeCrossThreadData);
    EXPECT_EQ(0u, offset);

    pDevice->getMemoryManager()->freeGraphicsMemory(nonInternalAllocation);
}

// For a heap constructed with the internal-heap flag set, the returned offset
// is expected to equal the allocation's GPU address to patch.
HWTEST_F(HardwareCommandsTest, givenIndirectHeapAllocatedFromInternalPoolWhenSendCrossThreadDataIsCalledThenHeapBaseOffsetIsReturned) {
    auto internalAllocation = pDevice->getMemoryManager()->allocateGraphicsMemoryWithProperties(
        MockAllocationProperties(true, MemoryConstants::pageSize, GraphicsAllocation::AllocationType::INTERNAL_HEAP));
    IndirectHeap indirectHeap(internalAllocation, true);
    auto expectedOffset = internalAllocation->getGpuAddressToPatch();

    auto sizeCrossThreadData = mockKernelWithInternal->mockKernel->getCrossThreadDataSize();
    auto offset = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap, *mockKernelWithInternal->mockKernel, false, nullptr, sizeCrossThreadData);
    EXPECT_EQ(expectedOffset, offset);

    pDevice->getMemoryManager()->freeGraphicsMemory(internalAllocation);
}

// With AddPatchInfoCommentsForAUBDump enabled, the first patch info entry must
// be retargeted at the indirect heap allocation and at the offset where the
// cross-thread data was written (128 bytes are reserved beforehand).
HWTEST_F(HardwareCommandsTest, givenSendCrossThreadDataWhenWhenAddPatchInfoCommentsForAUBDumpIsSetThenAddPatchInfoDataOffsetsAreMoved) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.AddPatchInfoCommentsForAUBDump.set(true);

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);
    auto kernelInfo = std::make_unique<KernelInfo>();
    auto kernel = std::make_unique<MockKernel>(&program, *kernelInfo, *pDevice);

    auto &indirectHeap = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    indirectHeap.getSpace(128u);

    PatchInfoData patchInfoData1 = {0xaaaaaaaa, 0, PatchInfoAllocationType::KernelArg, 0xbbbbbbbb, 0, PatchInfoAllocationType::IndirectObjectHeap};
    PatchInfoData patchInfoData2 = {0xcccccccc, 0, PatchInfoAllocationType::IndirectObjectHeap, 0xdddddddd, 0, PatchInfoAllocationType::Default};
    kernel->getPatchInfoDataList().push_back(patchInfoData1);
    kernel->getPatchInfoDataList().push_back(patchInfoData2);

    auto sizeCrossThreadData = kernel->getCrossThreadDataSize();
    auto offsetCrossThreadData = HardwareCommandsHelper<FamilyType>::sendCrossThreadData(
        indirectHeap, *kernel, false, nullptr, sizeCrossThreadData);

    ASSERT_NE(0u, offsetCrossThreadData);
    EXPECT_EQ(128u, offsetCrossThreadData);

    ASSERT_EQ(2u, kernel->getPatchInfoDataList().size());

    const auto &patchedEntry = kernel->getPatchInfoDataList()[0];
    EXPECT_EQ(0xaaaaaaaa, patchedEntry.sourceAllocation);
    EXPECT_EQ(0u, patchedEntry.sourceAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::KernelArg, patchedEntry.sourceType);
    EXPECT_NE(0xbbbbbbbb, patchedEntry.targetAllocation);
    EXPECT_EQ(indirectHeap.getGraphicsAllocation()->getGpuAddress(), patchedEntry.targetAllocation);
    EXPECT_NE(0u, patchedEntry.targetAllocationOffset);
    EXPECT_EQ(offsetCrossThreadData, patchedEntry.targetAllocationOffset);
    EXPECT_EQ(PatchInfoAllocationType::IndirectObjectHeap, patchedEntry.targetType);
}

// The getSizeRequired{DSH,IOH,SSH,CS} estimates must be at least as large as
// what sendIndirectState() (plus the descriptor load) actually consumes.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, sendIndirectStateResourceUsage) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> srcImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, srcImage.get());
    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                     cmdQ.getContext(), cmdQ.getDevice());
    ASSERT_NE(nullptr, &builder);

    BuiltinOpParams dc;
    dc.srcMemObj = srcImage.get();
    dc.dstMemObj = dstImage.get();
    dc.srcOffset = {0, 0, 0};
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);
    // Fatal on purpose: begin() is dereferenced right below.
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSize = 256;
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);
    auto usedBeforeCS = commandStream.getUsed();
    auto usedBeforeDSH = dsh.getUsed();
    auto usedBeforeIOH = ioh.getUsed();
    auto usedBeforeSSH = ssh.getUsed();

    dsh.align(HardwareCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
    size_t IDToffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    HardwareCommandsHelper<FamilyType>::sendMediaInterfaceDescriptorLoad(
        commandStream, IDToffset, sizeof(INTERFACE_DESCRIPTOR_DATA));

    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream, dsh, ioh, ssh, *kernel, kernel->getKernelInfo().getMaxSimdSize(), localWorkSizes,
        IDToffset, interfaceDescriptorIndex, pDevice->getPreemptionMode(), pWalkerCmd, nullptr, true, isCcsUsed);

    // It's okay these are EXPECT_GE as they're only going to be used for
    // estimation purposes to avoid OOM.
    auto usedAfterDSH = dsh.getUsed();
    auto usedAfterIOH = ioh.getUsed();
    auto usedAfterSSH = ssh.getUsed();
    auto sizeRequiredDSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredDSH(*kernel);
    auto sizeRequiredIOH = HardwareCommandsHelper<FamilyType>::getSizeRequiredIOH(*kernel, localWorkSize);
    auto sizeRequiredSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(*kernel);

    EXPECT_GE(sizeRequiredDSH, usedAfterDSH - usedBeforeDSH);
    EXPECT_GE(sizeRequiredIOH, usedAfterIOH - usedBeforeIOH);
    EXPECT_GE(sizeRequiredSSH, usedAfterSSH - usedBeforeSSH);

    auto usedAfterCS = commandStream.getUsed();
    EXPECT_GE(HardwareCommandsHelper<FamilyType>::getSizeRequiredCS(kernel), usedAfterCS - usedBeforeCS);
}

// The interface descriptor must report the kernel's binding table entry count
// when binding table prefetch is enabled, and zero otherwise.
// NOTE(review): the test name says "Four" but the count used is 3 - confirm which is intended.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWithFourBindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsCorrectBindingTableEntryCount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto expectedBindingTableCount = 3u;
    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t localWorkSize = 256;
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());

    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream, dsh, ioh, ssh, *mockKernelWithInternal->mockKernel,
        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(), localWorkSizes,
        0, interfaceDescriptorIndex, pDevice->getPreemptionMode(), pWalkerCmd, nullptr, true, isCcsUsed);

    auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    if (HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
        EXPECT_EQ(expectedBindingTableCount, interfaceDescriptor->getBindingTableEntryCount());
    } else {
        EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount());
    }
}

// A kernel flagged as the scheduler kernel must always end up with a binding
// table entry count of zero in its interface descriptor.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelThatIsSchedulerWhenIndirectStateIsEmittedThenInterfaceDescriptorContainsZeroBindingTableEntryCount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto expectedBindingTableCount = 3u;
    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;

    // The scheduler flag is not writable through the mock's interface, so the
    // test strips constness to flip it.
    auto isScheduler = const_cast<bool *>(&mockKernelWithInternal->mockKernel->isSchedulerKernel);
    *isScheduler = true;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t localWorkSize = 256;
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());

    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream, dsh, ioh, ssh, *mockKernelWithInternal->mockKernel,
        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(), localWorkSizes,
        0, interfaceDescriptorIndex, pDevice->getPreemptionMode(), pWalkerCmd, nullptr, true, isCcsUsed);

    auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount());
}

// A kernel with 100 binding table states must have its interface descriptor
// entry count capped at 31 when prefetch is enabled, and zero otherwise.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenKernelWith100BindingTableEntriesWhenIndirectStateIsEmittedThenInterfaceDescriptorHas31BindingTableEntriesSet) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto expectedBindingTableCount = 100u;
    mockKernelWithInternal->mockKernel->numberOfBindingTableStates = expectedBindingTableCount;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    const size_t localWorkSize = 256;
    const size_t localWorkSizes[3]{localWorkSize, 1, 1};
    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());

    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream, dsh, ioh, ssh, *mockKernelWithInternal->mockKernel,
        mockKernelWithInternal->mockKernel->getKernelInfo().getMaxSimdSize(), localWorkSizes,
        0, interfaceDescriptorIndex, pDevice->getPreemptionMode(), pWalkerCmd, nullptr, true, isCcsUsed);

    auto interfaceDescriptor = reinterpret_cast<INTERFACE_DESCRIPTOR_DATA *>(dsh.getCpuBase());
    if (HardwareCommandsHelper<FamilyType>::doBindingTablePrefetch()) {
        EXPECT_EQ(31u, interfaceDescriptor->getBindingTableEntryCount());
    } else {
        EXPECT_EQ(0u, interfaceDescriptor->getBindingTableEntryCount());
    }
}

// The local IDs written to the indirect object heap must match what
// generateLocalIDs() produces for the kernel's (reversed) dimension order.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, whenSendingIndirectStateThenKernelsWalkOrderIsTakenIntoAccount) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> img(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, img.get());

    MultiDispatchInfo multiDispatchInfo;
    auto &builder = cmdQ.getDevice().getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyImageToImage3d,
                                                                                                             cmdQ.getContext(), cmdQ.getDevice());

    BuiltinOpParams dc;
    dc.srcMemObj = img.get();
    dc.dstMemObj = img.get();
    dc.size = {1, 1, 1};
    builder.buildDispatchInfos(multiDispatchInfo, dc);
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSizeX = 2;
    const size_t localWorkSizeY = 3;
    const size_t localWorkSizeZ = 4;
    const size_t localWorkSizes[3]{localWorkSizeX, localWorkSizeY, localWorkSizeZ};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    dsh.align(HardwareCommandsHelper<FamilyType>::alignInterfaceDescriptorData);
    size_t IDToffset = dsh.getUsed();
    dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

    // Same patch info as the built-in kernel, but with the walk and dimension
    // orders reversed to (2, 1, 0).
    KernelInfo modifiedKernelInfo = {};
    modifiedKernelInfo.patchInfo = kernel->getKernelInfo().patchInfo;
    modifiedKernelInfo.workgroupWalkOrder[0] = 2;
    modifiedKernelInfo.workgroupWalkOrder[1] = 1;
    modifiedKernelInfo.workgroupWalkOrder[2] = 0;
    modifiedKernelInfo.workgroupDimensionsOrder[0] = 2;
    modifiedKernelInfo.workgroupDimensionsOrder[1] = 1;
    modifiedKernelInfo.workgroupDimensionsOrder[2] = 0;
    MockKernel mockKernel{kernel->getProgram(), modifiedKernelInfo, kernel->getDevice(), false};

    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream, dsh, ioh, ssh, mockKernel, modifiedKernelInfo.getMaxSimdSize(), localWorkSizes,
        IDToffset, interfaceDescriptorIndex, pDevice->getPreemptionMode(), pWalkerCmd, nullptr, true, isCcsUsed);

    size_t numThreads = localWorkSizeX * localWorkSizeY * localWorkSizeZ;
    numThreads = Math::divideAndRoundUp(numThreads, modifiedKernelInfo.getMaxSimdSize());
    size_t expectedIohSize = ((modifiedKernelInfo.getMaxSimdSize() == 32) ? 32 : 16) * 3 * numThreads * sizeof(uint16_t);
    ASSERT_LE(expectedIohSize, ioh.getUsed());

    auto expectedLocalIds = alignedMalloc(expectedIohSize, 64);
    generateLocalIDs(expectedLocalIds, modifiedKernelInfo.getMaxSimdSize(),
                     std::array<uint16_t, 3>{{localWorkSizeX, localWorkSizeY, localWorkSizeZ}},
                     std::array<uint8_t, 3>{{modifiedKernelInfo.workgroupDimensionsOrder[0],
                                             modifiedKernelInfo.workgroupDimensionsOrder[1],
                                             modifiedKernelInfo.workgroupDimensionsOrder[2]}},
                     false);
    EXPECT_EQ(0, memcmp(expectedLocalIds, ioh.getCpuBase(), expectedIohSize));
    alignedFree(expectedLocalIds);
}

// Pre-fills the slots that follow the two surface states with a pattern and
// checks that sendIndirectState() overwrites them with the values 0x00 and
// 0x40.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointer) {
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;

    CommandQueueHw<FamilyType> cmdQ(pContext, pDevice, 0);

    std::unique_ptr<Image> dstImage(Image2dHelper<>::create(pContext));
    ASSERT_NE(nullptr, dstImage.get());

    MultiDispatchInfo multiDispatchInfo;
    auto &builder = pDevice->getExecutionEnvironment()->getBuiltIns()->getBuiltinDispatchInfoBuilder(EBuiltInOps::CopyBufferToImage3d,
                                                                                                     cmdQ.getContext(), cmdQ.getDevice());
    ASSERT_NE(nullptr, &builder);

    BuiltinOpParams dc;
    dc.srcPtr = nullptr;
    dc.dstMemObj = dstImage.get();
    dc.dstOffset = {0, 0, 0};
    dc.size = {1, 1, 1};
    dc.dstRowPitch = 0;
    dc.dstSlicePitch = 0;
    builder.buildDispatchInfos(multiDispatchInfo, dc);
    // Fatal on purpose: begin() is dereferenced right below.
    ASSERT_NE(0u, multiDispatchInfo.size());

    auto kernel = multiDispatchInfo.begin()->getKernel();
    ASSERT_NE(nullptr, kernel);

    const size_t localWorkSizes[3]{256, 1, 1};

    auto &commandStream = cmdQ.getCS(1024);
    auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
    *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

    auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
    auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // Obtain where the pointers will be stored
    const auto &kernelInfo = kernel->getKernelInfo();
    auto numSurfaceStates = kernelInfo.patchInfo.statelessGlobalMemObjKernelArgs.size() +
                            kernelInfo.patchInfo.imageMemObjKernelArgs.size();
    EXPECT_EQ(2u, numSurfaceStates);

    size_t bindingTableStateSize = numSurfaceStates * sizeof(RENDER_SURFACE_STATE);
    uint32_t *bindingTableStatesPointers = reinterpret_cast<uint32_t *>(
        reinterpret_cast<uint8_t *>(ssh.getCpuBase()) + ssh.getUsed() + bindingTableStateSize);
    for (size_t i = 0; i < numSurfaceStates; i++) {
        bindingTableStatesPointers[i] = 0xDEADBEEF;
    }

    // force stateful path for buffers
    const_cast<KernelInfo &>(kernelInfo).requiresSshForBuffers = true;

    uint32_t interfaceDescriptorIndex = 0;
    auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
    HardwareCommandsHelper<FamilyType>::sendIndirectState(
        commandStream, dsh, ioh, ssh, *kernel, kernel->getKernelInfo().getMaxSimdSize(), localWorkSizes,
        0, interfaceDescriptorIndex, pDevice->getPreemptionMode(), pWalkerCmd, nullptr, true, isCcsUsed);

    EXPECT_EQ(0x00000000u, bindingTableStatesPointers[0]);
    EXPECT_EQ(0x00000040u, bindingTableStatesPointers[1]);
}

// Builds a kernel with five stateless surfaces (global, constant, private,
// event pool, default device queue) and checks, for two different starting
// offsets in the surface state heap, that every binding table entry is
// rebased by the offset at which the kernel's SSH was copied.
HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, usedBindingTableStatePointersForGlobalAndConstantAndPrivateAndEventPoolAndDefaultCommandQueueSurfaces) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
    using RENDER_SURFACE_STATE = typename FamilyType::RENDER_SURFACE_STATE;
    using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE;

    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();

    SPatchExecutionEnvironment tokenEE = {};
    tokenEE.CompiledSIMD8 = false;
    tokenEE.CompiledSIMD16 = false;
    tokenEE.CompiledSIMD32 = true;
    pKernelInfo->patchInfo.executionEnvironment = &tokenEE;

    // define patch offsets for global, constant, private, event pool and default device queue surfaces
    // (tokens are value-initialized so that fields not set below are zero)
    SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization AllocateStatelessGlobalMemorySurfaceWithInitialization = {};
    AllocateStatelessGlobalMemorySurfaceWithInitialization.GlobalBufferIndex = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.SurfaceStateHeapOffset = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.DataParamOffset = 0;
    AllocateStatelessGlobalMemorySurfaceWithInitialization.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization = &AllocateStatelessGlobalMemorySurfaceWithInitialization;

    SPatchAllocateStatelessConstantMemorySurfaceWithInitialization AllocateStatelessConstantMemorySurfaceWithInitialization = {};
    AllocateStatelessConstantMemorySurfaceWithInitialization.ConstantBufferIndex = 0;
    AllocateStatelessConstantMemorySurfaceWithInitialization.SurfaceStateHeapOffset = 64;
    AllocateStatelessConstantMemorySurfaceWithInitialization.DataParamOffset = 8;
    AllocateStatelessConstantMemorySurfaceWithInitialization.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization = &AllocateStatelessConstantMemorySurfaceWithInitialization;

    SPatchAllocateStatelessPrivateSurface AllocateStatelessPrivateMemorySurface = {};
    AllocateStatelessPrivateMemorySurface.PerThreadPrivateMemorySize = 32;
    AllocateStatelessPrivateMemorySurface.SurfaceStateHeapOffset = 128;
    AllocateStatelessPrivateMemorySurface.DataParamOffset = 16;
    AllocateStatelessPrivateMemorySurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessPrivateSurface = &AllocateStatelessPrivateMemorySurface;

    SPatchAllocateStatelessEventPoolSurface AllocateStatelessEventPoolSurface = {};
    AllocateStatelessEventPoolSurface.SurfaceStateHeapOffset = 192;
    AllocateStatelessEventPoolSurface.DataParamOffset = 24;
    AllocateStatelessEventPoolSurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessEventPoolSurface = &AllocateStatelessEventPoolSurface;

    SPatchAllocateStatelessDefaultDeviceQueueSurface AllocateStatelessDefaultDeviceQueueSurface = {};
    AllocateStatelessDefaultDeviceQueueSurface.SurfaceStateHeapOffset = 256;
    AllocateStatelessDefaultDeviceQueueSurface.DataParamOffset = 32;
    AllocateStatelessDefaultDeviceQueueSurface.DataParamSize = 8;
    pKernelInfo->patchInfo.pAllocateStatelessDefaultDeviceQueueSurface = &AllocateStatelessDefaultDeviceQueueSurface;

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // setup global memory
    char globalBuffer[16] = {};
    GraphicsAllocation gfxGlobalAlloc(0, GraphicsAllocation::AllocationType::UNKNOWN, globalBuffer, castToUint64(globalBuffer), 0llu, sizeof(globalBuffer), MemoryPool::MemoryNull);
    program.setGlobalSurface(&gfxGlobalAlloc);

    // setup constant memory
    char constBuffer[16] = {};
    GraphicsAllocation gfxConstAlloc(0, GraphicsAllocation::AllocationType::UNKNOWN, constBuffer, castToUint64(constBuffer), 0llu, sizeof(constBuffer), MemoryPool::MemoryNull);
    program.setConstantSurface(&gfxConstAlloc);

    // create kernel; owned by a smart pointer so a failing ASSERT below cannot leak it
    auto pKernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pDevice);

    SKernelBinaryHeaderCommon kernelHeader = {};

    // setup surface state heap: numSurfaces surface states followed by the binding table
    constexpr uint32_t numSurfaces = 5;
    constexpr uint32_t sshSize = numSurfaces * sizeof(RENDER_SURFACE_STATE) + numSurfaces * sizeof(BINDING_TABLE_STATE);
    auto alignedDeleter = [](unsigned char *ptr) { alignedFree(ptr); };
    std::unique_ptr<unsigned char, decltype(alignedDeleter)> surfaceStateHeapStorage(
        reinterpret_cast<unsigned char *>(alignedMalloc(sshSize, sizeof(RENDER_SURFACE_STATE))), alignedDeleter);
    unsigned char *surfaceStateHeap = surfaceStateHeapStorage.get();

    uint32_t btiOffset = static_cast<uint32_t>(numSurfaces * sizeof(RENDER_SURFACE_STATE));
    auto bti = reinterpret_cast<BINDING_TABLE_STATE *>(surfaceStateHeap + btiOffset);
    for (uint32_t i = 0; i < numSurfaces; ++i) {
        bti[i].setSurfaceStatePointer(i * sizeof(RENDER_SURFACE_STATE));
    }
    kernelHeader.SurfaceStateHeapSize = sshSize;

    // setup kernel heap
    uint32_t kernelIsa[32] = {};
    kernelHeader.KernelHeapSize = sizeof(kernelIsa);

    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeap = kernelIsa;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // setup binding table state
    SPatchBindingTableState bindingTableState = {};
    bindingTableState.Token = iOpenCL::PATCH_TOKEN_BINDING_TABLE_STATE;
    bindingTableState.Size = sizeof(SPatchBindingTableState);
    bindingTableState.Count = 5;
    bindingTableState.Offset = btiOffset;
    bindingTableState.SurfaceStateOffset = 0;
    pKernelInfo->patchInfo.bindingTableState = &bindingTableState;

    // setup thread payload
    SPatchThreadPayload threadPayload = {};
    threadPayload.LocalIDXPresent = 1;
    threadPayload.LocalIDYPresent = 1;
    threadPayload.LocalIDZPresent = 1;
    pKernelInfo->patchInfo.threadPayload = &threadPayload;

    // define stateful path
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = true;

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    // setup cross thread data
    char pCrossThreadData[64] = {};
    pKernel->setCrossThreadData(pCrossThreadData, sizeof(pCrossThreadData));

    // try with different offsets to surface state base address
    for (uint32_t ssbaOffset : {0U, static_cast<uint32_t>(sizeof(RENDER_SURFACE_STATE))}) {
        CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);

        auto &commandStream = cmdQ.getCS(1024);
        auto pWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
        *pWalkerCmd = FamilyType::cmdInitGpgpuWalker;

        auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192);
        auto &ioh = cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192);
        auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

        // the kernel must report one binding table state per surface
        EXPECT_EQ(numSurfaces, pKernel->getNumberOfBindingTableStates());

        const size_t localWorkSizes[3]{256, 1, 1};

        dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA));

        ssh.getSpace(ssbaOffset); // offset local ssh from surface state base address

        uint32_t localSshOffset = static_cast<uint32_t>(ssh.getUsed());

        // push surfaces states and binding table to given ssh heap
        uint32_t interfaceDescriptorIndex = 0;
        auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType());
        HardwareCommandsHelper<FamilyType>::sendIndirectState(
            commandStream, dsh, ioh, ssh, *pKernel, pKernel->getKernelInfo().getMaxSimdSize(), localWorkSizes,
            0, interfaceDescriptorIndex, pDevice->getPreemptionMode(), pWalkerCmd, nullptr, true, isCcsUsed);

        bti = reinterpret_cast<BINDING_TABLE_STATE *>(reinterpret_cast<unsigned char *>(ssh.getCpuBase()) + localSshOffset + btiOffset);
        for (uint32_t i = 0; i < numSurfaces; ++i) {
            uint32_t expected = localSshOffset + i * sizeof(RENDER_SURFACE_STATE);
            EXPECT_EQ(expected, bti[i].getSurfaceStatePointer());
        }

        // detach the stack-backed surfaces from the program
        program.setGlobalSurface(nullptr);
        program.setConstantSurface(nullptr);

        // exhaust space to trigger reload
        ssh.getSpace(ssh.getAvailableSpace());
        dsh.getSpace(dsh.getAvailableSpace());
    }

    // Release explicitly, in the same order as before: heap storage first, then the kernel.
    surfaceStateHeapStorage.reset();
    pKernel.reset();
}

// When buffers do not require the SSH and the kernel reports no binding table
// states, pushBindingTableAndSurfaceStates() must return 0 and leave the
// surface state heap untouched.
HWTEST_F(HardwareCommandsTest, setBindingTableStatesForKernelWithBuffersNotRequiringSSHDoesNotTouchSSH) {
    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // create kernel; owned by a smart pointer so a failing ASSERT below cannot leak it
    auto pKernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pDevice);

    // setup surface state heap
    char surfaceStateHeap[256] = {};
    SKernelBinaryHeaderCommon kernelHeader = {};
    kernelHeader.SurfaceStateHeapSize = sizeof(surfaceStateHeap);

    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // SSH is present, but buffers do not need it
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = false;

    SPatchStatelessGlobalMemoryObjectKernelArgument statelessGlobalMemory = {};
    statelessGlobalMemory.ArgumentNumber = 0;
    statelessGlobalMemory.DataParamOffset = 0;
    statelessGlobalMemory.DataParamSize = 0;
    statelessGlobalMemory.Size = 0;
    statelessGlobalMemory.SurfaceStateHeapOffset = 0;

    pKernelInfo->patchInfo.statelessGlobalMemObjKernelArgs.push_back(&statelessGlobalMemory);

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    ssh.align(8);
    auto usedBefore = ssh.getUsed();

    // the kernel is expected to report no binding table states
    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);

    // set binding table states
    auto dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    auto usedAfter = ssh.getUsed();
    EXPECT_EQ(usedBefore, usedAfter);

    ssh.align(8);
    EXPECT_EQ(usedAfter, ssh.getUsed());

    pKernel.reset();
}

// With no surfaces, pushBindingTableAndSurfaceStates() must return 0 for the
// kernel info, for the kernel, and for the kernel with an empty binding table
// token attached.
HWTEST_F(HardwareCommandsTest, setBindingTableStatesForNoSurfaces) {
    // define kernel info
    auto pKernelInfo = std::make_unique<KernelInfo>();

    // create program with valid context
    MockContext context;
    MockProgram program(*pDevice->getExecutionEnvironment(), &context, false);

    // create kernel; owned by a smart pointer so a failing ASSERT below cannot leak it
    auto pKernel = std::make_unique<MockKernel>(&program, *pKernelInfo, *pDevice);

    // setup surface state heap
    char surfaceStateHeap[256] = {};
    SKernelBinaryHeaderCommon kernelHeader = {};
    kernelHeader.SurfaceStateHeapSize = sizeof(surfaceStateHeap);

    pKernelInfo->heapInfo.pSsh = surfaceStateHeap;
    pKernelInfo->heapInfo.pKernelHeader = &kernelHeader;

    // define stateful path
    pKernelInfo->usesSsh = true;
    pKernelInfo->requiresSshForBuffers = true;

    // initialize kernel
    ASSERT_EQ(CL_SUCCESS, pKernel->initialize());

    CommandQueueHw<FamilyType> cmdQ(nullptr, pDevice, 0);
    auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192);

    // the kernel is expected to report no binding table states
    auto numSurfaceStates = pKernel->getNumberOfBindingTableStates();
    EXPECT_EQ(0u, numSurfaceStates);

    auto dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernelInfo);
    EXPECT_EQ(0u, dstBindingTablePointer);

    dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    // attach a binding table token that declares zero entries
    SPatchBindingTableState bindingTableState = {};
    bindingTableState.Token = iOpenCL::PATCH_TOKEN_BINDING_TABLE_STATE;
    bindingTableState.Size = sizeof(SPatchBindingTableState);
    bindingTableState.Count = 0;
    bindingTableState.Offset = 64;
    bindingTableState.SurfaceStateOffset = 0;
    pKernelInfo->patchInfo.bindingTableState = &bindingTableState;

    dstBindingTablePointer = HardwareCommandsHelper<FamilyType>::pushBindingTableAndSurfaceStates(ssh, *pKernel);
    EXPECT_EQ(0u, dstBindingTablePointer);

    // detach the stack-backed token before the kernel is released
    pKernelInfo->patchInfo.bindingTableState = nullptr;

    pKernel.reset();
}

// NOTE(review): the test below continues past the reviewed region; it is kept exactly as found.
HWTEST_F(HardwareCommandsTest, GivenVariousValuesWhenAlignSlmSizeIsCalledThenCorrectValueIsReturned) { if (::renderCoreFamily == IGFX_GEN8_CORE) { EXPECT_EQ(0u, HardwareCommandsHelper::alignSlmSize(0)); EXPECT_EQ(4096u, HardwareCommandsHelper::alignSlmSize(1)); EXPECT_EQ(4096u, HardwareCommandsHelper::alignSlmSize(1024)); EXPECT_EQ(4096u, HardwareCommandsHelper::alignSlmSize(1025)); EXPECT_EQ(4096u, HardwareCommandsHelper::alignSlmSize(2048)); EXPECT_EQ(4096u, HardwareCommandsHelper::alignSlmSize(2049)); EXPECT_EQ(4096u, HardwareCommandsHelper::alignSlmSize(4096)); EXPECT_EQ(8192u, HardwareCommandsHelper::alignSlmSize(4097)); EXPECT_EQ(8192u, HardwareCommandsHelper::alignSlmSize(8192)); EXPECT_EQ(16384u, HardwareCommandsHelper::alignSlmSize(8193)); EXPECT_EQ(16384u, HardwareCommandsHelper::alignSlmSize(12288)); EXPECT_EQ(16384u, HardwareCommandsHelper::alignSlmSize(16384)); EXPECT_EQ(32768u, HardwareCommandsHelper::alignSlmSize(16385)); EXPECT_EQ(32768u, HardwareCommandsHelper::alignSlmSize(24576)); EXPECT_EQ(32768u, HardwareCommandsHelper::alignSlmSize(32768)); EXPECT_EQ(65536u, HardwareCommandsHelper::alignSlmSize(32769)); EXPECT_EQ(65536u, HardwareCommandsHelper::alignSlmSize(49152)); EXPECT_EQ(65536u, HardwareCommandsHelper::alignSlmSize(65535)); EXPECT_EQ(65536u, HardwareCommandsHelper::alignSlmSize(65536)); } else { EXPECT_EQ(0u, HardwareCommandsHelper::alignSlmSize(0)); EXPECT_EQ(1024u, HardwareCommandsHelper::alignSlmSize(1)); EXPECT_EQ(1024u, 
HardwareCommandsHelper::alignSlmSize(1024)); EXPECT_EQ(2048u, HardwareCommandsHelper::alignSlmSize(1025)); EXPECT_EQ(2048u, HardwareCommandsHelper::alignSlmSize(2048)); EXPECT_EQ(4096u, HardwareCommandsHelper::alignSlmSize(2049)); EXPECT_EQ(4096u, HardwareCommandsHelper::alignSlmSize(4096)); EXPECT_EQ(8192u, HardwareCommandsHelper::alignSlmSize(4097)); EXPECT_EQ(8192u, HardwareCommandsHelper::alignSlmSize(8192)); EXPECT_EQ(16384u, HardwareCommandsHelper::alignSlmSize(8193)); EXPECT_EQ(16384u, HardwareCommandsHelper::alignSlmSize(16384)); EXPECT_EQ(32768u, HardwareCommandsHelper::alignSlmSize(16385)); EXPECT_EQ(32768u, HardwareCommandsHelper::alignSlmSize(32768)); EXPECT_EQ(65536u, HardwareCommandsHelper::alignSlmSize(32769)); EXPECT_EQ(65536u, HardwareCommandsHelper::alignSlmSize(65536)); } } HWTEST_F(HardwareCommandsTest, GivenVariousValuesWhenComputeSlmSizeIsCalledThenCorrectValueIsReturned) { if (::renderCoreFamily == IGFX_GEN8_CORE) { EXPECT_EQ(0u, HardwareCommandsHelper::computeSlmValues(0)); EXPECT_EQ(1u, HardwareCommandsHelper::computeSlmValues(1)); EXPECT_EQ(1u, HardwareCommandsHelper::computeSlmValues(1024)); EXPECT_EQ(1u, HardwareCommandsHelper::computeSlmValues(1025)); EXPECT_EQ(1u, HardwareCommandsHelper::computeSlmValues(2048)); EXPECT_EQ(1u, HardwareCommandsHelper::computeSlmValues(2049)); EXPECT_EQ(1u, HardwareCommandsHelper::computeSlmValues(4096)); EXPECT_EQ(2u, HardwareCommandsHelper::computeSlmValues(4097)); EXPECT_EQ(2u, HardwareCommandsHelper::computeSlmValues(8192)); EXPECT_EQ(4u, HardwareCommandsHelper::computeSlmValues(8193)); EXPECT_EQ(4u, HardwareCommandsHelper::computeSlmValues(12288)); EXPECT_EQ(4u, HardwareCommandsHelper::computeSlmValues(16384)); EXPECT_EQ(8u, HardwareCommandsHelper::computeSlmValues(16385)); EXPECT_EQ(8u, HardwareCommandsHelper::computeSlmValues(24576)); EXPECT_EQ(8u, HardwareCommandsHelper::computeSlmValues(32768)); EXPECT_EQ(16u, HardwareCommandsHelper::computeSlmValues(32769)); EXPECT_EQ(16u, 
HardwareCommandsHelper::computeSlmValues(49152)); EXPECT_EQ(16u, HardwareCommandsHelper::computeSlmValues(65535)); EXPECT_EQ(16u, HardwareCommandsHelper::computeSlmValues(65536)); } else { EXPECT_EQ(0u, HardwareCommandsHelper::computeSlmValues(0)); EXPECT_EQ(1u, HardwareCommandsHelper::computeSlmValues(1)); EXPECT_EQ(1u, HardwareCommandsHelper::computeSlmValues(1024)); EXPECT_EQ(2u, HardwareCommandsHelper::computeSlmValues(1025)); EXPECT_EQ(2u, HardwareCommandsHelper::computeSlmValues(2048)); EXPECT_EQ(3u, HardwareCommandsHelper::computeSlmValues(2049)); EXPECT_EQ(3u, HardwareCommandsHelper::computeSlmValues(4096)); EXPECT_EQ(4u, HardwareCommandsHelper::computeSlmValues(4097)); EXPECT_EQ(4u, HardwareCommandsHelper::computeSlmValues(8192)); EXPECT_EQ(5u, HardwareCommandsHelper::computeSlmValues(8193)); EXPECT_EQ(5u, HardwareCommandsHelper::computeSlmValues(16384)); EXPECT_EQ(6u, HardwareCommandsHelper::computeSlmValues(16385)); EXPECT_EQ(6u, HardwareCommandsHelper::computeSlmValues(32768)); EXPECT_EQ(7u, HardwareCommandsHelper::computeSlmValues(32769)); EXPECT_EQ(7u, HardwareCommandsHelper::computeSlmValues(65536)); } } HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, GivenKernelWithSamplersWhenIndirectStateIsProgrammedThenBorderColorIsCorrectlyCopiedToDshAndSamplerStatesAreProgrammedWithPointer) { typedef typename FamilyType::BINDING_TABLE_STATE BINDING_TABLE_STATE; typedef typename FamilyType::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; typedef typename FamilyType::SAMPLER_STATE SAMPLER_STATE; using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA; using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; CommandQueueHw cmdQ(nullptr, pDevice, 0); const size_t localWorkSizes[3]{1, 1, 1}; auto &commandStream = cmdQ.getCS(1024); auto pWalkerCmd = static_cast(commandStream.getSpace(sizeof(GPGPU_WALKER))); *pWalkerCmd = FamilyType::cmdInitGpgpuWalker; auto &dsh = cmdQ.getIndirectHeap(IndirectHeap::DYNAMIC_STATE, 8192); auto &ioh = 
cmdQ.getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 8192); auto &ssh = cmdQ.getIndirectHeap(IndirectHeap::SURFACE_STATE, 8192); const uint32_t borderColorSize = 64; const uint32_t samplerStateSize = sizeof(SAMPLER_STATE) * 2; SPatchSamplerStateArray samplerStateArray; samplerStateArray.BorderColorOffset = 0x0; samplerStateArray.Count = 2; samplerStateArray.Offset = borderColorSize; samplerStateArray.Size = samplerStateSize; samplerStateArray.Token = 1; char *mockDsh = new char[(borderColorSize + samplerStateSize) * 4]; memset(mockDsh, 6, borderColorSize); memset(mockDsh + borderColorSize, 8, borderColorSize); mockKernelWithInternal->kernelInfo.heapInfo.pDsh = mockDsh; mockKernelWithInternal->kernelInfo.patchInfo.samplerStateArray = &samplerStateArray; uint64_t interfaceDescriptorTableOffset = dsh.getUsed(); dsh.getSpace(sizeof(INTERFACE_DESCRIPTOR_DATA)); dsh.getSpace(4); char *initialDshPointer = static_cast(dsh.getCpuBase()) + dsh.getUsed(); char *borderColorPointer = alignUp(initialDshPointer, 64); uint32_t borderColorOffset = static_cast(borderColorPointer - static_cast(dsh.getCpuBase())); SAMPLER_STATE *pSamplerState = reinterpret_cast(mockDsh + borderColorSize); for (uint32_t i = 0; i < 2; i++) { pSamplerState[i].setIndirectStatePointer(0); } mockKernelWithInternal->mockKernel->setCrossThreadData(mockKernelWithInternal->crossThreadData, sizeof(mockKernelWithInternal->crossThreadData)); mockKernelWithInternal->mockKernel->setSshLocal(mockKernelWithInternal->sshLocal, sizeof(mockKernelWithInternal->sshLocal)); uint32_t interfaceDescriptorIndex = 0; auto isCcsUsed = isCcs(cmdQ.getGpgpuEngine().osContext->getEngineType()); HardwareCommandsHelper::sendIndirectState( commandStream, dsh, ioh, ssh, *mockKernelWithInternal->mockKernel, 8, localWorkSizes, interfaceDescriptorTableOffset, interfaceDescriptorIndex, pDevice->getPreemptionMode(), pWalkerCmd, nullptr, true, isCcsUsed); bool isMemorySame = memcmp(borderColorPointer, mockDsh, borderColorSize) == 0; 
EXPECT_TRUE(isMemorySame); SAMPLER_STATE *pSamplerStatesCopied = reinterpret_cast(borderColorPointer + borderColorSize); for (uint32_t i = 0; i < 2; i++) { EXPECT_EQ(pSamplerState[i].getNonNormalizedCoordinateEnable(), pSamplerStatesCopied[i].getNonNormalizedCoordinateEnable()); EXPECT_EQ(pSamplerState[i].getTcxAddressControlMode(), pSamplerStatesCopied[i].getTcxAddressControlMode()); EXPECT_EQ(pSamplerState[i].getTcyAddressControlMode(), pSamplerStatesCopied[i].getTcyAddressControlMode()); EXPECT_EQ(pSamplerState[i].getTczAddressControlMode(), pSamplerStatesCopied[i].getTczAddressControlMode()); EXPECT_EQ(pSamplerState[i].getMinModeFilter(), pSamplerStatesCopied[i].getMinModeFilter()); EXPECT_EQ(pSamplerState[i].getMagModeFilter(), pSamplerStatesCopied[i].getMagModeFilter()); EXPECT_EQ(pSamplerState[i].getMipModeFilter(), pSamplerStatesCopied[i].getMipModeFilter()); EXPECT_EQ(pSamplerState[i].getUAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getUAddressMinFilterRoundingEnable()); EXPECT_EQ(pSamplerState[i].getUAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getUAddressMagFilterRoundingEnable()); EXPECT_EQ(pSamplerState[i].getVAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getVAddressMinFilterRoundingEnable()); EXPECT_EQ(pSamplerState[i].getVAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getVAddressMagFilterRoundingEnable()); EXPECT_EQ(pSamplerState[i].getRAddressMagFilterRoundingEnable(), pSamplerStatesCopied[i].getRAddressMagFilterRoundingEnable()); EXPECT_EQ(pSamplerState[i].getRAddressMinFilterRoundingEnable(), pSamplerStatesCopied[i].getRAddressMinFilterRoundingEnable()); EXPECT_EQ(pSamplerState[i].getLodAlgorithm(), pSamplerStatesCopied[i].getLodAlgorithm()); EXPECT_EQ(pSamplerState[i].getTextureLodBias(), pSamplerStatesCopied[i].getTextureLodBias()); EXPECT_EQ(pSamplerState[i].getLodPreclampMode(), pSamplerStatesCopied[i].getLodPreclampMode()); EXPECT_EQ(pSamplerState[i].getTextureBorderColorMode(), 
pSamplerStatesCopied[i].getTextureBorderColorMode()); EXPECT_EQ(pSamplerState[i].getSamplerDisable(), pSamplerStatesCopied[i].getSamplerDisable()); EXPECT_EQ(pSamplerState[i].getCubeSurfaceControlMode(), pSamplerStatesCopied[i].getCubeSurfaceControlMode()); EXPECT_EQ(pSamplerState[i].getShadowFunction(), pSamplerStatesCopied[i].getShadowFunction()); EXPECT_EQ(pSamplerState[i].getChromakeyMode(), pSamplerStatesCopied[i].getChromakeyMode()); EXPECT_EQ(pSamplerState[i].getChromakeyIndex(), pSamplerStatesCopied[i].getChromakeyIndex()); EXPECT_EQ(pSamplerState[i].getChromakeyEnable(), pSamplerStatesCopied[i].getChromakeyEnable()); EXPECT_EQ(pSamplerState[i].getMaxLod(), pSamplerStatesCopied[i].getMaxLod()); EXPECT_EQ(pSamplerState[i].getMinLod(), pSamplerStatesCopied[i].getMinLod()); EXPECT_EQ(pSamplerState[i].getLodClampMagnificationMode(), pSamplerStatesCopied[i].getLodClampMagnificationMode()); EXPECT_EQ(borderColorOffset, pSamplerStatesCopied[i].getIndirectStatePointer()); } delete[] mockDsh; } using HardwareCommandsHelperTests = ::testing::Test; HWTEST_F(HardwareCommandsHelperTests, givenCompareAddressAndDataWhenProgrammingSemaphoreWaitThenSetupAllFields) { using MI_SEMAPHORE_WAIT = typename FamilyType::MI_SEMAPHORE_WAIT; uint64_t compareAddress = 0x10000; uint32_t compareData = 1234; uint8_t buffer[1024] = {}; LinearStream cmdStream(buffer, 1024); MI_SEMAPHORE_WAIT referenceCommand = FamilyType::cmdInitMiSemaphoreWait; referenceCommand.setCompareOperation(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); referenceCommand.setSemaphoreDataDword(compareData); referenceCommand.setSemaphoreGraphicsAddress(compareAddress); referenceCommand.setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE); HardwareCommandsHelper::programMiSemaphoreWait(cmdStream, compareAddress, compareData); EXPECT_EQ(sizeof(MI_SEMAPHORE_WAIT), cmdStream.getUsed()); EXPECT_EQ(0, memcmp(&referenceCommand, buffer, sizeof(MI_SEMAPHORE_WAIT))); } 
HWTEST_F(HardwareCommandsHelperTests, whenProgrammingMiAtomicThenSetupAllFields) { using MI_ATOMIC = typename FamilyType::MI_ATOMIC; uint64_t writeAddress = 0x10000; auto opcode = MI_ATOMIC::ATOMIC_OPCODES::ATOMIC_4B_DECREMENT; auto dataSize = MI_ATOMIC::DATA_SIZE::DATA_SIZE_DWORD; uint8_t buffer[1024] = {}; LinearStream cmdStream(buffer, 1024); MI_ATOMIC referenceCommand = FamilyType::cmdInitAtomic; HardwareCommandsHelper::programMiAtomic(referenceCommand, writeAddress, opcode, dataSize); auto miAtomic = HardwareCommandsHelper::programMiAtomic(cmdStream, writeAddress, opcode, dataSize); EXPECT_EQ(sizeof(MI_ATOMIC), cmdStream.getUsed()); EXPECT_EQ(miAtomic, cmdStream.getCpuBase()); EXPECT_EQ(0, memcmp(&referenceCommand, miAtomic, sizeof(MI_ATOMIC))); } typedef ExecutionModelKernelFixture ParentKernelCommandsFromBinaryTest; HWCMDTEST_P(IGFX_GEN8_CORE, ParentKernelCommandsFromBinaryTest, getSizeRequiredForExecutionModelForSurfaceStatesReturnsSizeOfBlocksPlusMaxBindingTableSizeForAllIDTEntriesAndSchedulerSSHSize) { using BINDING_TABLE_STATE = typename FamilyType::BINDING_TABLE_STATE; if (std::string(pPlatform->getDevice(0)->getDeviceInfo().clVersion).find("OpenCL 2.") != std::string::npos) { EXPECT_TRUE(pKernel->isParentKernel); size_t totalSize = 0; BlockKernelManager *blockManager = pKernel->getProgram()->getBlockKernelManager(); uint32_t blockCount = static_cast(blockManager->getCount()); totalSize = BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE - 1; // for initial alignment uint32_t maxBindingTableCount = 0; for (uint32_t i = 0; i < blockCount; i++) { const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i); totalSize += pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize; totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); maxBindingTableCount = std::max(maxBindingTableCount, pBlockInfo->patchInfo.bindingTableState ? 
pBlockInfo->patchInfo.bindingTableState->Count : 0); } totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries; BuiltIns &builtIns = *pDevice->getExecutionEnvironment()->getBuiltIns(); auto &scheduler = builtIns.getSchedulerKernel(*pContext); auto schedulerSshSize = scheduler.getSurfaceStateHeapSize(); totalSize += schedulerSshSize + ((schedulerSshSize != 0) ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0); totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); EXPECT_EQ(totalSize, HardwareCommandsHelper::getSshSizeForExecutionModel(*pKernel)); } } static const char *binaryFile = "simple_block_kernel"; static const char *KernelNames[] = {"kernel_reflection", "simple_block_kernel"}; INSTANTIATE_TEST_CASE_P(ParentKernelCommandsFromBinaryTest, ParentKernelCommandsFromBinaryTest, ::testing::Combine( ::testing::Values(binaryFile), ::testing::ValuesIn(KernelNames))); HWTEST_F(HardwareCommandsTest, givenEnabledPassInlineDataWhenKernelAllowsInlineThenReturnTrue) { DebugManagerStateRestore restore; DebugManager.flags.EnablePassInlineData.set(1u); uint32_t crossThreadData[8]; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->PassInlineData = 1; mockKernelWithInternal->mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData)); EXPECT_TRUE(HardwareCommandsHelper::inlineDataProgrammingRequired(*mockKernelWithInternal->mockKernel)); } HWTEST_F(HardwareCommandsTest, givenNoDebugSettingsWhenDefaultModeIsExcercisedThenWeFollowKernelSettingForInlineProgramming) { const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->PassInlineData = 1; EXPECT_TRUE(HardwareCommandsHelper::inlineDataProgrammingRequired(*mockKernelWithInternal->mockKernel)); } HWTEST_F(HardwareCommandsTest, givenDisabledPassInlineDataWhenKernelAllowsInlineThenReturnFalse) { DebugManagerStateRestore restore; DebugManager.flags.EnablePassInlineData.set(0u); 
const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->PassInlineData = 1; EXPECT_FALSE(HardwareCommandsHelper::inlineDataProgrammingRequired(*mockKernelWithInternal->mockKernel)); } HWTEST_F(HardwareCommandsTest, givenEnabledPassInlineDataWhenKernelDisallowsInlineThenReturnFalse) { DebugManagerStateRestore restore; DebugManager.flags.EnablePassInlineData.set(1u); uint32_t crossThreadData[8]; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->PassInlineData = 0; mockKernelWithInternal->mockKernel->setCrossThreadData(crossThreadData, sizeof(crossThreadData)); EXPECT_FALSE(HardwareCommandsHelper::inlineDataProgrammingRequired(*mockKernelWithInternal->mockKernel)); } HWTEST_F(HardwareCommandsTest, whenLocalIdxInXDimPresentThenExpectLocalIdsInUseIsTrue) { const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 1; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0; EXPECT_TRUE(HardwareCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); } HWTEST_F(HardwareCommandsTest, whenLocalIdxInYDimPresentThenExpectLocalIdsInUseIsTrue) { const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 1; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0; EXPECT_TRUE(HardwareCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); } HWTEST_F(HardwareCommandsTest, whenLocalIdxInZDimPresentThenExpectLocalIdsInUseIsTrue) { const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0; 
const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 1; EXPECT_TRUE(HardwareCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); } HWTEST_F(HardwareCommandsTest, whenLocalIdxAreNotPresentThenExpectLocalIdsInUseIsFalse) { const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDXPresent = 0; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDYPresent = 0; const_cast(mockKernelWithInternal->kernelInfo.patchInfo.threadPayload)->LocalIDZPresent = 0; EXPECT_FALSE(HardwareCommandsHelper::kernelUsesLocalIds(*mockKernelWithInternal->mockKernel)); } HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenCacheFlushAfterWalkerEnabledWhenProgramGlobalSurfacePresentThenExpectCacheFlushCommand) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH; using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD; DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(1); CommandQueueHw cmdQ(nullptr, pDevice, 0); auto &commandStream = cmdQ.getCS(1024); MockGraphicsAllocation globalAllocation; mockKernelWithInternal->mockProgram->setGlobalSurface(&globalAllocation); Kernel::CacheFlushAllocationsVec allocs; mockKernelWithInternal->mockKernel->getAllocationsForCacheFlush(allocs); EXPECT_NE(allocs.end(), std::find(allocs.begin(), allocs.end(), &globalAllocation)); size_t expectedSize = sizeof(PIPE_CONTROL); size_t actualSize = HardwareCommandsHelper::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U); EXPECT_EQ(expectedSize, actualSize); HardwareCommandsHelper::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U); HardwareParse hwParse; hwParse.parseCommands(commandStream); PIPE_CONTROL *pipeControl = hwParse.getCommand(); ASSERT_NE(nullptr, pipeControl); 
EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); EXPECT_TRUE(pipeControl->getDcFlushEnable()); mockKernelWithInternal->mockProgram->setGlobalSurface(nullptr); } HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenCacheFlushAfterWalkerEnabledWhenSvmAllocationsSetAsCacheFlushRequiringThenExpectCacheFlushCommand) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH; using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD; DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(1); CommandQueueHw cmdQ(nullptr, pDevice, 0); auto &commandStream = cmdQ.getCS(1024); char buff[MemoryConstants::pageSize * 2]; MockGraphicsAllocation svmAllocation1{alignUp(buff, MemoryConstants::pageSize), MemoryConstants::pageSize}; mockKernelWithInternal->mockKernel->kernelSvmGfxAllocations.push_back(&svmAllocation1); MockGraphicsAllocation svmAllocation2{alignUp(buff, MemoryConstants::pageSize), MemoryConstants::pageSize}; svmAllocation2.setFlushL3Required(false); mockKernelWithInternal->mockKernel->kernelSvmGfxAllocations.push_back(&svmAllocation2); mockKernelWithInternal->mockKernel->svmAllocationsRequireCacheFlush = true; Kernel::CacheFlushAllocationsVec allocs; mockKernelWithInternal->mockKernel->getAllocationsForCacheFlush(allocs); EXPECT_NE(allocs.end(), std::find(allocs.begin(), allocs.end(), &svmAllocation1)); EXPECT_EQ(allocs.end(), std::find(allocs.begin(), allocs.end(), &svmAllocation2)); size_t expectedSize = sizeof(PIPE_CONTROL); size_t actualSize = HardwareCommandsHelper::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U); EXPECT_EQ(expectedSize, actualSize); HardwareCommandsHelper::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U); HardwareParse hwParse; hwParse.parseCommands(commandStream); PIPE_CONTROL *pipeControl = hwParse.getCommand(); 
ASSERT_NE(nullptr, pipeControl); EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); EXPECT_TRUE(pipeControl->getDcFlushEnable()); } HWCMDTEST_F(IGFX_GEN8_CORE, HardwareCommandsTest, givenCacheFlushAfterWalkerEnabledWhenKernelArgIsSetAsCacheFlushRequiredThenExpectCacheFlushCommand) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; using MEDIA_STATE_FLUSH = typename FamilyType::MEDIA_STATE_FLUSH; using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD; DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(1); CommandQueueHw cmdQ(nullptr, pDevice, 0); auto &commandStream = cmdQ.getCS(1024); addSpaceForSingleKernelArg(); MockGraphicsAllocation cacheRequiringAllocation; mockKernelWithInternal->mockKernel->kernelArgRequiresCacheFlush.resize(2); mockKernelWithInternal->mockKernel->kernelArgRequiresCacheFlush[0] = &cacheRequiringAllocation; Kernel::CacheFlushAllocationsVec allocs; mockKernelWithInternal->mockKernel->getAllocationsForCacheFlush(allocs); EXPECT_NE(allocs.end(), std::find(allocs.begin(), allocs.end(), &cacheRequiringAllocation)); size_t expectedSize = sizeof(PIPE_CONTROL); size_t actualSize = HardwareCommandsHelper::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U); EXPECT_EQ(expectedSize, actualSize); HardwareCommandsHelper::programCacheFlushAfterWalkerCommand(&commandStream, cmdQ, mockKernelWithInternal->mockKernel, 0U); HardwareParse hwParse; hwParse.parseCommands(commandStream); PIPE_CONTROL *pipeControl = hwParse.getCommand(); ASSERT_NE(nullptr, pipeControl); EXPECT_TRUE(pipeControl->getCommandStreamerStallEnable()); EXPECT_TRUE(pipeControl->getDcFlushEnable()); } HWTEST_F(HardwareCommandsTest, givenCacheFlushAfterWalkerDisabledWhenGettingRequiredCacheFlushSizeThenReturnZero) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(0); CommandQueueHw 
cmdQ(nullptr, pDevice, 0); size_t expectedSize = 0U; size_t actualSize = HardwareCommandsHelper::getSizeRequiredForCacheFlush(cmdQ, mockKernelWithInternal->mockKernel, 0U); EXPECT_EQ(expectedSize, actualSize); } TEST_F(HardwareCommandsTest, givenCacheFlushAfterWalkerEnabledWhenPlatformNotSupportFlushThenExpectNoCacheAllocationForFlush) { DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(-1); hardwareInfo.capabilityTable.supportCacheFlushAfterWalker = false; StackVec allocationsForCacheFlush; mockKernelWithInternal->mockKernel->getAllocationsForCacheFlush(allocationsForCacheFlush); EXPECT_EQ(0U, allocationsForCacheFlush.size()); } HWTEST_F(HardwareCommandsTest, givenImmDataWriteWhenProgrammingMiFlushDwThenSetAllRequiredFields) { using MI_FLUSH_DW = typename FamilyType::MI_FLUSH_DW; uint8_t buffer[2 * sizeof(MI_FLUSH_DW)] = {}; LinearStream linearStream(buffer, sizeof(buffer)); uint64_t gpuAddress = 0x1230000; uint64_t immData = 456; HardwareCommandsHelper::programMiFlushDw(linearStream, gpuAddress, immData); auto miFlushDwCmd = reinterpret_cast(buffer); EXPECT_EQ(sizeof(MI_FLUSH_DW), linearStream.getUsed()); EXPECT_EQ(MI_FLUSH_DW::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA_QWORD, miFlushDwCmd->getPostSyncOperation()); EXPECT_EQ(gpuAddress, miFlushDwCmd->getDestinationAddress()); EXPECT_EQ(immData, miFlushDwCmd->getImmediateData()); } using KernelCacheFlushTests = Test>; HWTEST_F(KernelCacheFlushTests, givenLocallyUncachedBufferWhenGettingAllocationsForFlushThenEmptyVectorIsReturned) { DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(-1); auto kernel = clUniquePtr(Kernel::create(pProgram, *pProgram->getKernelInfo("CopyBuffer"), &retVal)); cl_mem_properties_intel bufferPropertiesUncachedResource[] = {CL_MEM_FLAGS_INTEL, CL_MEM_LOCALLY_UNCACHED_RESOURCE, 0}; auto bufferLocallyUncached = clCreateBufferWithPropertiesINTEL(context, bufferPropertiesUncachedResource, 1, nullptr, nullptr); 
kernel->setArg(0, sizeof(bufferLocallyUncached), &bufferLocallyUncached); using CacheFlushAllocationsVec = StackVec; CacheFlushAllocationsVec cacheFlushVec; kernel->getAllocationsForCacheFlush(cacheFlushVec); EXPECT_EQ(0u, cacheFlushVec.size()); auto bufferRegular = clCreateBufferWithPropertiesINTEL(context, nullptr, 1, nullptr, nullptr); kernel->setArg(1, sizeof(bufferRegular), &bufferRegular); kernel->getAllocationsForCacheFlush(cacheFlushVec); size_t expectedCacheFlushVecSize = (hardwareInfo.capabilityTable.supportCacheFlushAfterWalker ? 1u : 0u); EXPECT_EQ(expectedCacheFlushVecSize, cacheFlushVec.size()); clReleaseMemObject(bufferLocallyUncached); clReleaseMemObject(bufferRegular); }