/*
 * Copyright (C) 2018-2022 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/local_work_size.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"
#include "shared/source/utilities/perf_counter.h"
#include "shared/source/utilities/tag_allocator.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"
#include "shared/test/common/helpers/unit_test_helper.h"
#include "shared/test/common/mocks/mock_graphics_allocation.h"
#include "shared/test/common/mocks/mock_timestamp_container.h"
#include "shared/test/common/test_macros/hw_test.h"

#include "opencl/source/built_ins/aux_translation_builtin.h"
#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/command_queue/hardware_interface.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/helpers/task_information.h"
#include "opencl/test/unit_test/command_queue/command_queue_fixture.h"
#include "opencl/test/unit_test/command_queue/hardware_interface_helper.h"
#include "opencl/test/unit_test/fixtures/cl_device_fixture.h"
#include "opencl/test/unit_test/mocks/mock_buffer.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_kernel.h"
#include "opencl/test/unit_test/mocks/mock_mdi.h"
#include "opencl/test/unit_test/mocks/mock_program.h"

using namespace NEO;

// NOTE(review): several expressions in this file appear without an explicit
// template argument list (e.g. `std::make_unique(...)`, `static_cast(...)`,
// `std::unique_ptr context`, `HardwareInterface::dispatchWalker`). They are
// kept exactly as found - confirm against the upstream revision before building.

// Fixture shared by the dispatchWalker tests. Combines the command queue and
// CL device fixtures and prepares two kernel infos: a plain one and one that
// additionally carries a sampler table and a dynamic state heap.
struct DispatchWalkerTest : public CommandQueueFixture, public ClDeviceFixture, public ::testing::Test {

    using CommandQueueFixture::SetUp;

    // Disables timestamp packets, brings up the device, context, command queue
    // and program, then fills in both kernel infos (SIMD 32, 64 bytes of
    // cross-thread data, local ids {1, 1, 1}, kernelIsa as the kernel heap).
    void SetUp() override {
        DebugManager.flags.EnableTimestampPacket.set(0);
        ClDeviceFixture::SetUp();
        context = std::make_unique(pClDevice);
        CommandQueueFixture::SetUp(context.get(), pClDevice, 0);

        program = std::make_unique(toClDeviceVector(*pClDevice));

        kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 32;
        kernelInfo.setCrossThreadDataSize(64);
        kernelInfo.setLocalIds({1, 1, 1});
        kernelInfo.heapInfo.pKernelHeap = kernelIsa;
        kernelInfo.heapInfo.KernelHeapSize = sizeof(kernelIsa);

        kernelInfoWithSampler.kernelDescriptor.kernelAttributes.simdSize = 32;
        kernelInfoWithSampler.setCrossThreadDataSize(64);
        kernelInfoWithSampler.setLocalIds({1, 1, 1});
        kernelInfoWithSampler.setSamplerTable(0, 1, 4);
        kernelInfoWithSampler.heapInfo.pKernelHeap = kernelIsa;
        kernelInfoWithSampler.heapInfo.KernelHeapSize = sizeof(kernelIsa);
        kernelInfoWithSampler.heapInfo.pDsh = static_cast(dsh);
        kernelInfoWithSampler.heapInfo.DynamicStateHeapSize = sizeof(dsh);
    }

    // Tears down the command queue first, then the context, then the device.
    void TearDown() override {
        CommandQueueFixture::TearDown();
        context.reset();
        ClDeviceFixture::TearDown();
    }

    // Builds the blocked-commands object handed to dispatchWalker through
    // walkerArgs.blockedCommandsData. The command stream is created here,
    // given a command buffer allocation by the queue's GPGPU CSR, and then
    // passed (as a raw pointer) to the returned object.
    std::unique_ptr createBlockedCommandsData(CommandQueue &commandQueue) {
        auto commandStream = new LinearStream();

        auto &gpgpuCsr = commandQueue.getGpgpuCommandStreamReceiver();
        gpgpuCsr.ensureCommandBufferAllocation(*commandStream, 1, 1);

        return std::make_unique(commandStream, *gpgpuCsr.getInternalAllocationStorage());
    }

    std::unique_ptr context;
    std::unique_ptr program;

    MockKernelInfo kernelInfo;
    MockKernelInfo kernelInfoWithSampler;

    // Backing storage referenced by heapInfo.pKernelHeap / heapInfo.pDsh above.
    uint32_t kernelIsa[32];
    uint32_t dsh[32];

    DebugManagerStateRestore dbgRestore;
};

// Value-parameterized variant of the fixture; the parameter selects the
// KernelObjForAuxTranslation::Type stored in kernelObjType.
struct DispatchWalkerTestForAuxTranslation : DispatchWalkerTest, public ::testing::WithParamInterface {
    void SetUp() override {
        DispatchWalkerTest::SetUp();
        kernelObjType = GetParam();
    }
    KernelObjForAuxTranslation::Type kernelObjType;
};

INSTANTIATE_TEST_CASE_P(,
                        DispatchWalkerTestForAuxTranslation,
                        testing::ValuesIn({KernelObjForAuxTranslation::Type::MEM_OBJ, KernelObjForAuxTranslation::Type::GFX_ALLOC}));

// computeDimensions is expected to report 1, 2 or 3 for the work-item arrays below.
HWTEST_F(DispatchWalkerTest, WhenGettingComputeDimensionsThenCorrectNumberOfDimensionsIsReturned) {
    const size_t workItems1D[] = {100, 1, 1};
    EXPECT_EQ(1u, computeDimensions(workItems1D));

    const size_t workItems2D[] = {100, 100, 1};
    EXPECT_EQ(2u, computeDimensions(workItems2D));

    const size_t workItems3D[] = {100, 100, 100};
    EXPECT_EQ(3u, computeDimensions(workItems3D));
}

// Programs walker thread data with simd == 1 and expects the walker's SIMD
// size field to equal 32 >> 4.
HWTEST_F(DispatchWalkerTest, givenSimd1WhenSetGpgpuWalkerThreadDataThenSimdInWalkerIsSetTo32Value) {
    uint32_t pCmdBuffer[1024];
    MockGraphicsAllocation gfxAllocation(static_cast(pCmdBuffer), sizeof(pCmdBuffer));
    LinearStream linearStream(&gfxAllocation);

    using WALKER_TYPE = typename FamilyType::WALKER_TYPE;
    WALKER_TYPE *computeWalker = static_cast(linearStream.getSpace(sizeof(WALKER_TYPE)));
    *computeWalker = FamilyType::cmdInitGpgpuWalker;

    size_t globalOffsets[] = {0, 0, 0};
    size_t startWorkGroups[] = {0, 0, 0};
    size_t numWorkGroups[] = {1, 1, 1};
    size_t localWorkSizesIn[] = {32, 1, 1};
    uint32_t simd = 1;

    KernelDescriptor kd;
    GpgpuWalkerHelper::setGpgpuWalkerThreadData(
        computeWalker, kd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizesIn, simd, 3, true, false, 5u);
    EXPECT_EQ(computeWalker->getSimdSize(), 32 >> 4);
}

// Fills the command stream so that only the space needed by one dispatch is
// left, then checks that dispatching keeps the same CPU base and consumes
// exactly sizeDispatchWalkerNeeds bytes.
HWTEST_F(DispatchWalkerTest, WhenDispatchingWalkerThenCommandStreamMemoryIsntChanged) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    auto &commandStream = pCmdQ->getCS(4096);

    // Consume all memory except what is needed for this enqueue
    auto sizeDispatchWalkerNeeds = sizeof(typename FamilyType::WALKER_TYPE) +
                                   HardwareCommandsHelper::getSizeRequiredCS();

    // The command stream has a minimum required size
    auto sizeThatNeedsToBeSubtracted = sizeDispatchWalkerNeeds + CSRequirements::minCommandQueueCommandStreamSize;

    commandStream.getSpace(commandStream.getMaxAvailableSpace() - sizeThatNeedsToBeSubtracted);
    ASSERT_EQ(commandStream.getAvailableSpace(), sizeThatNeedsToBeSubtracted);

    auto commandStreamStart = commandStream.getUsed();
    auto commandStreamBuffer = commandStream.getCpuBase();
    ASSERT_NE(0u, commandStreamStart);

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    cl_uint dimensions = 1;
    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, nullptr, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    EXPECT_EQ(commandStreamBuffer, commandStream.getCpuBase());
    EXPECT_LT(commandStreamStart, commandStream.getUsed());
    EXPECT_EQ(sizeDispatchWalkerNeeds, commandStream.getUsed() - commandStreamStart);
}

// Same scenario as the previous test, but with local ids set to {0, 0, 0} and
// perThreadDataUnusedGrfIsPresent enabled on the kernel descriptor.
HWTEST_F(DispatchWalkerTest, GivenNoLocalIdsWhenDispatchingWalkerThenWalkerIsDispatched) {
    kernelInfo.setLocalIds({0, 0, 0});
    kernelInfo.kernelDescriptor.kernelAttributes.flags.perThreadDataUnusedGrfIsPresent = true;
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    auto &commandStream = pCmdQ->getCS(4096);

    // Consume all memory except what is needed for this enqueue
    auto sizeDispatchWalkerNeeds = sizeof(typename FamilyType::WALKER_TYPE) +
                                   HardwareCommandsHelper::getSizeRequiredCS();

    // The command stream has a minimum required size
    auto sizeThatNeedsToBeSubtracted = sizeDispatchWalkerNeeds + CSRequirements::minCommandQueueCommandStreamSize;

    commandStream.getSpace(commandStream.getMaxAvailableSpace() - sizeThatNeedsToBeSubtracted);
    ASSERT_EQ(commandStream.getAvailableSpace(), sizeThatNeedsToBeSubtracted);

    auto commandStreamStart = commandStream.getUsed();
    auto commandStreamBuffer = commandStream.getCpuBase();
    ASSERT_NE(0u, commandStreamStart);

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    cl_uint dimensions = 1;
    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, nullptr, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    EXPECT_EQ(commandStreamBuffer, commandStream.getCpuBase());
    EXPECT_LT(commandStreamStart, commandStream.getUsed());
    EXPECT_EQ(sizeDispatchWalkerNeeds, commandStream.getUsed() - commandStreamStart);
}

// Dispatches with 1, 2 and 3 dimensions (no debug-flag overrides) and expects
// the kernel's work-dim value to match each time.
HWTEST_F(DispatchWalkerTest, GivenDefaultLwsAlgorithmWhenDispatchingWalkerThenDimensionsAreCorrect) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.workDim = 0;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
        // Grow the global size one axis at a time: {256,1,1}, {256,256,1}, {256,256,256}.
        workItems[dimension - 1] = 256;

        DispatchInfo dispatchInfo(pClDevice, &kernel, dimension, workItems, nullptr, globalOffsets);
        dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
        dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
        MultiDispatchInfo multiDispatchInfo;
        multiDispatchInfo.push(dispatchInfo);
        HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
        HardwareInterface::dispatchWalker(
            *pCmdQ,
            multiDispatchInfo,
            CsrDependencies(),
            walkerArgs);

        EXPECT_EQ(dimension, *kernel.getWorkDim());
    }
}

// Same dimension sweep with EnableComputeWorkSizeND off and
// EnableComputeWorkSizeSquared on.
HWTEST_F(DispatchWalkerTest, GivenSquaredLwsAlgorithmWhenDispatchingWalkerThenDimensionsAreCorrect) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeND.set(false);
    DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.workDim = 0;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
        workItems[dimension - 1] = 256;

        DispatchInfo dispatchInfo(pClDevice, &kernel, dimension, workItems, nullptr, globalOffsets);
        dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
        dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
        MultiDispatchInfo multiDispatchInfo;
        multiDispatchInfo.push(dispatchInfo);
        HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
        HardwareInterface::dispatchWalker(
            *pCmdQ,
            multiDispatchInfo,
            CsrDependencies(),
            walkerArgs);

        EXPECT_EQ(dimension, *kernel.getWorkDim());
    }
}

// Same dimension sweep with EnableComputeWorkSizeND on.
HWTEST_F(DispatchWalkerTest, GivenNdLwsAlgorithmWhenDispatchingWalkerThenDimensionsAreCorrect) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeND.set(true);
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.workDim = 0;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
        workItems[dimension - 1] = 256;

        DispatchInfo dispatchInfo(pClDevice, &kernel, dimension, workItems, nullptr, globalOffsets);
        dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
        dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
        MultiDispatchInfo multiDispatchInfo;
        multiDispatchInfo.push(dispatchInfo);
        HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
        HardwareInterface::dispatchWalker(
            *pCmdQ,
            multiDispatchInfo,
            CsrDependencies(),
            walkerArgs);

        EXPECT_EQ(dimension, *kernel.getWorkDim());
    }
}

// Same dimension sweep with both EnableComputeWorkSizeND and
// EnableComputeWorkSizeSquared off.
HWTEST_F(DispatchWalkerTest, GivenOldLwsAlgorithmWhenDispatchingWalkerThenDimensionsAreCorrect) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeND.set(false);
    DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.workDim = 0;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    for (uint32_t dimension = 1; dimension <= 3; ++dimension) {
        workItems[dimension - 1] = 256;

        DispatchInfo dispatchInfo(pClDevice, &kernel, dimension, workItems, nullptr, globalOffsets);
        dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
        dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
        MultiDispatchInfo multiDispatchInfo;
        multiDispatchInfo.push(dispatchInfo);
        HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
        HardwareInterface::dispatchWalker(
            *pCmdQ,
            multiDispatchInfo,
            CsrDependencies(),
            walkerArgs);

        EXPECT_EQ(dimension, *kernel.getWorkDim());
    }
}

// With numWorkGroups slots assigned (0, 4, 8) and the dispatch's work-group
// counts set to {2, 5, 10}, the kernel's num-work-groups values must read 2, 5, 10.
HWTEST_F(DispatchWalkerTest, GivenNumWorkGroupsWhenDispatchingWalkerThenNumWorkGroupsIsCorrectlySet) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[0] = 0;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[1] = 4;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.numWorkGroups[2] = 8;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {2, 5, 10};
    size_t workGroupSize[3] = {1, 1, 1};
    cl_uint dimensions = 3;

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, workGroupSize, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups(workItems);
    dispatchInfo.setTotalNumberOfWorkgroups(workItems);
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto numWorkGroups = kernel.getNumWorkGroupsValues();

    EXPECT_EQ(2u, *numWorkGroups[0]);
    EXPECT_EQ(5u, *numWorkGroups[1]);
    EXPECT_EQ(10u, *numWorkGroups[2]);
}

// With globalWorkOffset slots assigned (0, 4, 8) and global offsets {1, 2, 3},
// the kernel's global-work-offset values must read 1, 2, 3.
HWTEST_F(DispatchWalkerTest, GivenGlobalWorkOffsetWhenDispatchingWalkerThenGlobalWorkOffsetIsCorrectlySet) {
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.globalWorkOffset[0] = 0u;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.globalWorkOffset[1] = 4u;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.globalWorkOffset[2] = 8u;
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {1, 2, 3};
    size_t workItems[3] = {2, 5, 10};
    size_t workGroupSize[3] = {1, 1, 1};
    cl_uint dimensions = 3;

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, workGroupSize, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto gwo = kernel.getGlobalWorkOffsetValues();

    EXPECT_EQ(1u, *gwo[0]);
    EXPECT_EQ(2u, *gwo[1]);
    EXPECT_EQ(3u, *gwo[2]);
}

// No local work size is passed and EnableComputeWorkSizeND is off; for a
// {2, 5, 10} global size the expected local work size is {2, 5, 1}.
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndDefaultAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeND.set(false);
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[0] = 0;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[1] = 4;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[2] = 8;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {2, 5, 10};
    cl_uint dimensions = 3;

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, nullptr, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto localWorkSize = kernel.getLocalWorkSizeValues();

    EXPECT_EQ(2u, *localWorkSize[0]);
    EXPECT_EQ(5u, *localWorkSize[1]);
    EXPECT_EQ(1u, *localWorkSize[2]);
}

// No local work size is passed and EnableComputeWorkSizeND is on; for a
// {2, 3, 5} global size the expected local work size is {2, 3, 5}.
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndNdOnWhenDispatchingWalkerThenLwsIsCorrect) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeND.set(true);
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[0] = 0;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[1] = 4;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[2] = 8;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {2, 3, 5};
    cl_uint dimensions = 3;

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, nullptr, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto localWorkSize = kernel.getLocalWorkSizeValues();

    EXPECT_EQ(2u, *localWorkSize[0]);
    EXPECT_EQ(3u, *localWorkSize[1]);
    EXPECT_EQ(5u, *localWorkSize[2]);
}

// No local work size is passed, Squared is on and ND is off; for a
// {2, 5, 10} global size the expected local work size is {2, 5, 1}.
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmWhenDispatchingWalkerThenLwsIsCorrect) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(true);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[0] = 0;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[1] = 4;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[2] = 8;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {2, 5, 10};
    cl_uint dimensions = 3;

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, nullptr, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto localWorkSize = kernel.getLocalWorkSizeValues();

    EXPECT_EQ(2u, *localWorkSize[0]);
    EXPECT_EQ(5u, *localWorkSize[1]);
    EXPECT_EQ(1u, *localWorkSize[2]);
}

// No local work size is passed and both Squared and ND are off; for a
// {2, 5, 10} global size the expected local work size is {2, 5, 1}.
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeAndSquaredAlgorithmOffAndNdOffWhenDispatchingWalkerThenLwsIsCorrect) {
    DebugManagerStateRestore dbgRestore;
    DebugManager.flags.EnableComputeWorkSizeSquared.set(false);
    DebugManager.flags.EnableComputeWorkSizeND.set(false);
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[0] = 0;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[1] = 4;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[2] = 8;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {2, 5, 10};
    cl_uint dimensions = 3;

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, nullptr, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto localWorkSize = kernel.getLocalWorkSizeValues();

    EXPECT_EQ(2u, *localWorkSize[0]);
    EXPECT_EQ(5u, *localWorkSize[1]);
    EXPECT_EQ(1u, *localWorkSize[2]);
}

// Despite the test name, an explicit work-group size {1, 2, 3} is supplied
// here; the kernel's local-work-size values must read 1, 2, 3.
HWTEST_F(DispatchWalkerTest, GivenNoLocalWorkSizeWhenDispatchingWalkerThenLwsIsCorrect) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[0] = 0;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[1] = 4;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[2] = 8;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {2, 5, 10};
    size_t workGroupSize[3] = {1, 2, 3};
    cl_uint dimensions = 3;

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, workGroupSize, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto localWorkSize = kernel.getLocalWorkSizeValues();

    EXPECT_EQ(1u, *localWorkSize[0]);
    EXPECT_EQ(2u, *localWorkSize[1]);
    EXPECT_EQ(3u, *localWorkSize[2]);
}

// Both localWorkSize and localWorkSize2 slots are assigned; with an explicit
// work-group size {1, 2, 3} both sets of values must read 1, 2, 3.
HWTEST_F(DispatchWalkerTest, GivenTwoSetsOfLwsOffsetsWhenDispatchingWalkerThenLwsIsCorrect) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[0] = 0;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[1] = 4;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[2] = 8;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize2[0] = 12;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize2[1] = 16;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize2[2] = 20;
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {2, 5, 10};
    size_t workGroupSize[3] = {1, 2, 3};
    cl_uint dimensions = 3;

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, workGroupSize, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto localWorkSize = kernel.getLocalWorkSizeValues();

    EXPECT_EQ(1u, *localWorkSize[0]);
    EXPECT_EQ(2u, *localWorkSize[1]);
    EXPECT_EQ(3u, *localWorkSize[2]);

    auto localWorkSize2 = kernel.getLocalWorkSize2Values();

    EXPECT_EQ(1u, *localWorkSize2[0]);
    EXPECT_EQ(2u, *localWorkSize2[1]);
    EXPECT_EQ(3u, *localWorkSize2[2]);
}

// Two different kernels are dispatched in one MultiDispatchInfo with work-group
// sizes {1, 2, 3} and {4, 5, 6}; each kernel must see its own local work size.
HWTEST_F(DispatchWalkerTest, GivenSplitKernelWhenDispatchingWalkerThenLwsIsCorrect) {
    MockKernel kernel1(program.get(), kernelInfo, *pClDevice);
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[0] = 0;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[1] = 4;
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[2] = 8;
    ASSERT_EQ(CL_SUCCESS, kernel1.initialize());

    MockKernel kernel2(program.get(), kernelInfoWithSampler, *pClDevice);
    kernelInfoWithSampler.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[0] = 12;
    kernelInfoWithSampler.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[1] = 16;
    kernelInfoWithSampler.kernelDescriptor.payloadMappings.dispatchTraits.localWorkSize[2] = 20;
    ASSERT_EQ(CL_SUCCESS, kernel2.initialize());

    DispatchInfo di1(pClDevice, &kernel1, 3, {10, 10, 10}, {1, 2, 3}, {0, 0, 0});
    di1.setNumberOfWorkgroups({1, 1, 1});
    di1.setTotalNumberOfWorkgroups({2, 2, 2});
    DispatchInfo di2(pClDevice, &kernel2, 3, {10, 10, 10}, {4, 5, 6}, {0, 0, 0});
    di2.setNumberOfWorkgroups({1, 1, 1});
    di2.setTotalNumberOfWorkgroups({2, 2, 2});

    MockMultiDispatchInfo multiDispatchInfo(std::vector({&di1, &di2}));
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto dispatchId = 0;
    for (auto &dispatchInfo : multiDispatchInfo) {
        auto &kernel = static_cast(*dispatchInfo.getKernel());
        auto localWorkSize = kernel.getLocalWorkSizeValues();

        if (dispatchId == 0) {
            EXPECT_EQ(1u, *localWorkSize[0]);
            EXPECT_EQ(2u, *localWorkSize[1]);
            EXPECT_EQ(3u, *localWorkSize[2]);
        }
        if (dispatchId == 1) {
            EXPECT_EQ(4u, *localWorkSize[0]);
            EXPECT_EQ(5u, *localWorkSize[1]);
            EXPECT_EQ(6u, *localWorkSize[2]);
        }
        dispatchId++;
    }
}

// Two dispatches share one kernel info; mainKernel is registered as the
// MultiDispatchInfo's main kernel. The expectations differ between the main
// kernel and the other kernel, as spelled out in the two branches below.
HWTEST_F(DispatchWalkerTest, GivenSplitWalkerWhenDispatchingWalkerThenLwsIsCorrect) {
    MockKernel kernel1(program.get(), kernelInfo, *pClDevice);
    MockKernel mainKernel(program.get(), kernelInfo, *pClDevice);
    auto &dispatchTraits = kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits;
    dispatchTraits.localWorkSize[0] = 0;
    dispatchTraits.localWorkSize[1] = 4;
    dispatchTraits.localWorkSize[2] = 8;
    dispatchTraits.localWorkSize2[0] = 12;
    dispatchTraits.localWorkSize2[1] = 16;
    dispatchTraits.localWorkSize2[2] = 20;
    dispatchTraits.numWorkGroups[0] = 24;
    dispatchTraits.numWorkGroups[1] = 28;
    dispatchTraits.numWorkGroups[2] = 32;
    ASSERT_EQ(CL_SUCCESS, kernel1.initialize());
    ASSERT_EQ(CL_SUCCESS, mainKernel.initialize());

    DispatchInfo di1(pClDevice, &kernel1, 3, {10, 10, 10}, {1, 2, 3}, {0, 0, 0});
    di1.setNumberOfWorkgroups({1, 1, 1});
    di1.setTotalNumberOfWorkgroups({3, 2, 2});
    DispatchInfo di2(pClDevice, &mainKernel, 3, {10, 10, 10}, {4, 5, 6}, {0, 0, 0});
    di2.setNumberOfWorkgroups({1, 1, 1});
    di2.setTotalNumberOfWorkgroups({3, 2, 2});

    MultiDispatchInfo multiDispatchInfo(&mainKernel);
    multiDispatchInfo.push(di1);
    multiDispatchInfo.push(di2);

    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    for (auto &dispatchInfo : multiDispatchInfo) {
        auto &kernel = static_cast(*dispatchInfo.getKernel());
        auto localWorkSize = kernel.getLocalWorkSizeValues();
        auto localWorkSize2 = kernel.getLocalWorkSize2Values();
        auto numWorkGroups = kernel.getNumWorkGroupsValues();

        if (&kernel == &mainKernel) {
            EXPECT_EQ(4u, *localWorkSize[0]);
            EXPECT_EQ(5u, *localWorkSize[1]);
            EXPECT_EQ(6u, *localWorkSize[2]);
            EXPECT_EQ(4u, *localWorkSize2[0]);
            EXPECT_EQ(5u, *localWorkSize2[1]);
            EXPECT_EQ(6u, *localWorkSize2[2]);
            EXPECT_EQ(3u, *numWorkGroups[0]);
            EXPECT_EQ(2u, *numWorkGroups[1]);
            EXPECT_EQ(2u, *numWorkGroups[2]);
        } else {
            EXPECT_EQ(0u, *localWorkSize[0]);
            EXPECT_EQ(0u, *localWorkSize[1]);
            EXPECT_EQ(0u, *localWorkSize[2]);
            EXPECT_EQ(1u, *localWorkSize2[0]);
            EXPECT_EQ(2u, *localWorkSize2[1]);
            EXPECT_EQ(3u, *localWorkSize2[2]);
            EXPECT_EQ(0u, *numWorkGroups[0]);
            EXPECT_EQ(0u, *numWorkGroups[1]);
            EXPECT_EQ(0u, *numWorkGroups[2]);
        }
    }
}

// When blockedCommandsData is supplied, the queue's own command stream must
// stay unused and the blocked data must hold a command stream and all three heaps.
HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenCommandSteamIsNotConsumed) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    size_t workGroupSize[3] = {2, 5, 10};
    cl_uint dimensions = 1;

    auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, workGroupSize, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    walkerArgs.blockedCommandsData = blockedCommandsData.get();
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto &commandStream = pCmdQ->getCS(1024);
    EXPECT_EQ(0u, commandStream.getUsed());

    EXPECT_NE(nullptr, blockedCommandsData);
    EXPECT_NE(nullptr, blockedCommandsData->commandStream);
    EXPECT_NE(nullptr, blockedCommandsData->dsh);
    EXPECT_NE(nullptr, blockedCommandsData->ioh);
    EXPECT_NE(nullptr, blockedCommandsData->ssh);
}

// The heaps held by the blocked data must each offer at least the size the
// kernel itself reports as required (DSH, IOH, SSH).
HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredHeaSizesAreTakenFromKernel) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    size_t globalOffsets[3] = {0, 0, 0};
    size_t workItems[3] = {1, 1, 1};
    size_t workGroupSize[3] = {2, 5, 10};
    cl_uint dimensions = 1;

    auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);

    DispatchInfo dispatchInfo(pClDevice, &kernel, dimensions, workItems, workGroupSize, globalOffsets);
    dispatchInfo.setNumberOfWorkgroups({1, 1, 1});
    dispatchInfo.setTotalNumberOfWorkgroups({1, 1, 1});
    MultiDispatchInfo multiDispatchInfo(&kernel);
    multiDispatchInfo.push(dispatchInfo);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    walkerArgs.blockedCommandsData = blockedCommandsData.get();
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    Vec3 localWorkgroupSize(workGroupSize);

    auto expectedSizeDSH = HardwareCommandsHelper::getSizeRequiredDSH(kernel);
    auto expectedSizeIOH = HardwareCommandsHelper::getSizeRequiredIOH(kernel, Math::computeTotalElementsCount(localWorkgroupSize));
    auto expectedSizeSSH = HardwareCommandsHelper::getSizeRequiredSSH(kernel);

    EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
    EXPECT_LE(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
    EXPECT_LE(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());
}

// obtainCommandStream on a blocked enqueue must return a stream backed by a
// 64 KB allocation (minus the overfetch size in usable space) and must
// populate blockedKernelData with that same stream.
HWTEST_F(DispatchWalkerTest, givenBlockedEnqueueWhenObtainingCommandStreamThenAllocateEnoughSpaceAndBlockedKernelData) {
    DispatchInfo dispatchInfo;
    MultiDispatchInfo multiDispatchInfo;
    multiDispatchInfo.push(dispatchInfo);

    std::unique_ptr blockedKernelData;
    MockCommandQueueHw mockCmdQ(nullptr, pClDevice, nullptr);

    auto expectedSizeCSAllocation = MemoryConstants::pageSize64k;
    auto expectedSizeCS = MemoryConstants::pageSize64k - CSRequirements::csOverfetchSize;

    CsrDependencies csrDependencies;
    EventsRequest eventsRequest(0, nullptr, nullptr);
    auto cmdStream = mockCmdQ.template obtainCommandStream(csrDependencies, false, true,
                                                           multiDispatchInfo, eventsRequest, blockedKernelData,
                                                           nullptr, 0u, false);

    EXPECT_EQ(expectedSizeCS, cmdStream->getMaxAvailableSpace());
    EXPECT_EQ(expectedSizeCSAllocation, cmdStream->getGraphicsAllocation()->getUnderlyingBufferSize());
    EXPECT_NE(nullptr, blockedKernelData);
    EXPECT_EQ(cmdStream, blockedKernelData->commandStream.get());
}

// The heaps held by the blocked data must each offer at least the total size
// reported for the whole MultiDispatchInfo (DSH, IOH, SSH).
HWTEST_F(DispatchWalkerTest, GivenBlockedQueueWhenDispatchingWalkerThenRequiredHeapSizesAreTakenFromMdi) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    MockMultiDispatchInfo multiDispatchInfo(pClDevice, &kernel);

    auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    walkerArgs.blockedCommandsData = blockedCommandsData.get();
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto expectedSizeDSH = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo);
    auto expectedSizeIOH = HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo);
    auto expectedSizeSSH = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo);

    EXPECT_LE(expectedSizeDSH, blockedCommandsData->dsh->getMaxAvailableSpace());
    EXPECT_LE(expectedSizeIOH, blockedCommandsData->ioh->getMaxAvailableSpace());
    EXPECT_LE(expectedSizeSSH, blockedCommandsData->ssh->getMaxAvailableSpace());
}

// After a blocked dispatch, the blocked command stream must own a graphics
// allocation with a non-zero GPU address.
HWTEST_F(DispatchWalkerTest, givenBlockedQueueWhenDispatchWalkerIsCalledThenCommandStreamHasGpuAddress) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    MockMultiDispatchInfo multiDispatchInfo(pClDevice, &kernel);

    auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    walkerArgs.blockedCommandsData = blockedCommandsData.get();
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    EXPECT_NE(nullptr, blockedCommandsData->commandStream->getGraphicsAllocation());
    EXPECT_NE(0ull, blockedCommandsData->commandStream->getGraphicsAllocation()->getGpuAddress());
}

// A COMMAND_BUFFER allocation is stored as reusable up front; afterwards the
// reuse list must be empty and the blocked command stream must be backed by
// that very allocation.
HWTEST_F(DispatchWalkerTest, givenThereAreAllocationsForReuseWhenDispatchWalkerIsCalledThenCommandStreamObtainsReusableAllocation) {
    MockKernel kernel(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel.initialize());

    MockMultiDispatchInfo multiDispatchInfo(pClDevice, &kernel);

    auto &csr = pCmdQ->getGpgpuCommandStreamReceiver();
    auto allocation = csr.getMemoryManager()->allocateGraphicsMemoryWithProperties({csr.getRootDeviceIndex(), MemoryConstants::pageSize64k + CSRequirements::csOverfetchSize,
                                                                                    AllocationType::COMMAND_BUFFER, csr.getOsContext().getDeviceBitfield()});
    csr.getInternalAllocationStorage()->storeAllocation(std::unique_ptr{allocation}, REUSABLE_ALLOCATION);
    ASSERT_FALSE(csr.getInternalAllocationStorage()->getAllocationsForReuse().peekIsEmpty());

    auto blockedCommandsData = createBlockedCommandsData(*pCmdQ);
    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    walkerArgs.blockedCommandsData = blockedCommandsData.get();
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    EXPECT_TRUE(csr.getInternalAllocationStorage()->getAllocationsForReuse().peekIsEmpty());
    EXPECT_EQ(allocation, blockedCommandsData->commandStream->getGraphicsAllocation());
}

// For every dispatch in a two-kernel MultiDispatchInfo, the kernel's work-dim
// value must equal that dispatch's dimension count.
HWTEST_F(DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenWorkDimensionsAreCorrect) {
    kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.workDim = 0;
    MockKernel kernel1(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel1.initialize());
    MockKernel kernel2(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel2.initialize());

    MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector({&kernel1, &kernel2}));

    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    for (auto &dispatchInfo : multiDispatchInfo) {
        auto &kernel = static_cast(*dispatchInfo.getKernel());
        EXPECT_EQ(dispatchInfo.getDim(), *kernel.getWorkDim());
    }
}

// Dispatches two kernels backed by separate KERNEL_ISA allocations and inspects
// the INTERFACE_DESCRIPTOR_DATA entries written to the dynamic state heap.
HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenInterfaceDescriptorsAreProgrammedCorrectly) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    auto memoryManager = this->pDevice->getMemoryManager();
    auto kernelIsaAllocation = memoryManager->allocateGraphicsMemoryWithProperties({pDevice->getRootDeviceIndex(), MemoryConstants::pageSize, AllocationType::KERNEL_ISA, pDevice->getDeviceBitfield()});
    auto kernelIsaWithSamplerAllocation = memoryManager->allocateGraphicsMemoryWithProperties({pDevice->getRootDeviceIndex(), MemoryConstants::pageSize, AllocationType::KERNEL_ISA, pDevice->getDeviceBitfield()});
    kernelInfo.kernelAllocation = kernelIsaAllocation;
    kernelInfoWithSampler.kernelAllocation = kernelIsaWithSamplerAllocation;
    auto gpuAddress1 = kernelIsaAllocation->getGpuAddressToPatch();
    auto gpuAddress2 = kernelIsaWithSamplerAllocation->getGpuAddressToPatch();

    MockKernel kernel1(program.get(), kernelInfo, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel1.initialize());
    MockKernel kernel2(program.get(), kernelInfoWithSampler, *pClDevice);
    ASSERT_EQ(CL_SUCCESS, kernel2.initialize());

    MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector({&kernel1, &kernel2}));

    // create Indirect DSH heap
    auto &indirectHeap = pCmdQ->getIndirectHeap(IndirectHeap::Type::DYNAMIC_STATE, 8192);

    indirectHeap.align(EncodeStates::alignInterfaceDescriptorData);
    auto dshBeforeMultiDisptach = indirectHeap.getUsed();

    HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL);
    HardwareInterface::dispatchWalker(
        *pCmdQ,
        multiDispatchInfo,
        CsrDependencies(),
        walkerArgs);

    auto dshAfterMultiDisptach = indirectHeap.getUsed();

    auto numberOfDispatches = multiDispatchInfo.size();
    auto interfaceDesriptorTableSize = numberOfDispatches * sizeof(INTERFACE_DESCRIPTOR_DATA);

    EXPECT_LE(dshBeforeMultiDisptach + interfaceDesriptorTableSize, dshAfterMultiDisptach);

    INTERFACE_DESCRIPTOR_DATA *pID = reinterpret_cast(ptrOffset(indirectHeap.getCpuBase(), dshBeforeMultiDisptach));

    for (uint32_t index = 0; index < multiDispatchInfo.size(); index++) {
        uint32_t addressLow = pID[index].getKernelStartPointer();
        uint32_t addressHigh = pID[index].getKernelStartPointerHigh();
        uint64_t fullAddress = ((uint64_t)addressHigh << 32) | addressLow;

        if (index > 0) {
            uint32_t addressLowOfPrevious = pID[index - 1].getKernelStartPointer();
            uint32_t addressHighOfPrevious = pID[index - 1].getKernelStartPointerHigh();

            uint64_t addressPrevious = ((uint64_t)addressHighOfPrevious << 32) | addressLowOfPrevious;
            uint64_t address = ((uint64_t)addressHigh << 32) | addressLow;

            EXPECT_NE(addressPrevious,
address); } if (index == 0) { auto samplerPointer = pID[index].getSamplerStatePointer(); auto samplerCount = pID[index].getSamplerCount(); EXPECT_EQ(0u, samplerPointer); EXPECT_EQ(0u, samplerCount); EXPECT_EQ(fullAddress, gpuAddress1); } if (index == 1) { auto samplerPointer = pID[index].getSamplerStatePointer(); auto samplerCount = pID[index].getSamplerCount(); EXPECT_NE(0u, samplerPointer); if (EncodeSurfaceState::doBindingTablePrefetch()) { EXPECT_EQ(1u, samplerCount); } else { EXPECT_EQ(0u, samplerCount); } EXPECT_EQ(fullAddress, gpuAddress2); } } HardwareParse hwParser; auto &cmdStream = pCmdQ->getCS(0); hwParser.parseCommands(cmdStream, 0); hwParser.findHardwareCommands(); auto cmd = hwParser.getCommand(); EXPECT_NE(nullptr, cmd); auto idStartAddress = cmd->getInterfaceDescriptorDataStartAddress(); auto idSize = cmd->getInterfaceDescriptorTotalLength(); EXPECT_EQ(dshBeforeMultiDisptach, idStartAddress); EXPECT_EQ(interfaceDesriptorTableSize, idSize); memoryManager->freeGraphicsMemory(kernelIsaAllocation); memoryManager->freeGraphicsMemory(kernelIsaWithSamplerAllocation); } HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenGpgpuWalkerIdOffsetIsProgrammedCorrectly) { using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; MockKernel kernel1(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel1.initialize()); MockKernel kernel2(program.get(), kernelInfoWithSampler, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel2.initialize()); MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector({&kernel1, &kernel2})); // create commandStream auto &cmdStream = pCmdQ->getCS(0); HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo, CsrDependencies(), walkerArgs); HardwareParse hwParser; hwParser.parseCommands(cmdStream, 0); hwParser.findHardwareCommands(); auto walkerItor = hwParser.itorWalker; 
ASSERT_NE(hwParser.cmdList.end(), walkerItor); for (uint32_t index = 0; index < multiDispatchInfo.size(); index++) { ASSERT_NE(hwParser.cmdList.end(), walkerItor); auto *gpgpuWalker = (GPGPU_WALKER *)*walkerItor; auto idIndex = gpgpuWalker->getInterfaceDescriptorOffset(); EXPECT_EQ(index, idIndex); // move walker iterator walkerItor++; walkerItor = find(walkerItor, hwParser.cmdList.end()); } } HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, GivenMultipleKernelsWhenDispatchingWalkerThenThreadGroupIdStartingCoordinatesAreProgrammedCorrectly) { using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; MockKernel kernel1(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel1.initialize()); MockKernel kernel2(program.get(), kernelInfoWithSampler, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel2.initialize()); MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector({&kernel1, &kernel2})); // create commandStream auto &cmdStream = pCmdQ->getCS(0); HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo, CsrDependencies(), walkerArgs); HardwareParse hwParser; hwParser.parseCommands(cmdStream, 0); hwParser.findHardwareCommands(); auto walkerItor = hwParser.itorWalker; ASSERT_NE(hwParser.cmdList.end(), walkerItor); for (uint32_t index = 0; index < multiDispatchInfo.size(); index++) { ASSERT_NE(hwParser.cmdList.end(), walkerItor); auto *gpgpuWalker = (GPGPU_WALKER *)*walkerItor; auto coordinateX = gpgpuWalker->getThreadGroupIdStartingX(); EXPECT_EQ(coordinateX, 0u); auto coordinateY = gpgpuWalker->getThreadGroupIdStartingY(); EXPECT_EQ(coordinateY, 0u); auto coordinateZ = gpgpuWalker->getThreadGroupIdStartingResumeZ(); EXPECT_EQ(coordinateZ, 0u); // move walker iterator walkerItor++; walkerItor = find(walkerItor, hwParser.cmdList.end()); } } HWCMDTEST_F(IGFX_GEN8_CORE, DispatchWalkerTest, 
GivenMultipleDispatchInfoAndSameKernelWhenDispatchingWalkerThenGpgpuWalkerThreadGroupIdStartingCoordinatesAreCorrectlyProgrammed) { using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER; MockKernel kernel(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel.initialize()); DispatchInfo di1(pClDevice, &kernel, 1, {100, 1, 1}, {10, 1, 1}, {0, 0, 0}, {100, 1, 1}, {10, 1, 1}, {10, 1, 1}, {10, 1, 1}, {0, 0, 0}); DispatchInfo di2(pClDevice, &kernel, 1, {100, 1, 1}, {10, 1, 1}, {0, 0, 0}, {100, 1, 1}, {10, 1, 1}, {10, 1, 1}, {10, 1, 1}, {10, 0, 0}); MockMultiDispatchInfo multiDispatchInfo(std::vector({&di1, &di2})); // create commandStream auto &cmdStream = pCmdQ->getCS(0); HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo, CsrDependencies(), walkerArgs); HardwareParse hwParser; hwParser.parseCommands(cmdStream, 0); hwParser.findHardwareCommands(); auto walkerItor = hwParser.itorWalker; ASSERT_NE(hwParser.cmdList.end(), walkerItor); for (uint32_t index = 0; index < multiDispatchInfo.size(); index++) { ASSERT_NE(hwParser.cmdList.end(), walkerItor); auto *gpgpuWalker = (GPGPU_WALKER *)*walkerItor; auto coordinateX = gpgpuWalker->getThreadGroupIdStartingX(); EXPECT_EQ(coordinateX, index * 10u); auto coordinateY = gpgpuWalker->getThreadGroupIdStartingY(); EXPECT_EQ(coordinateY, 0u); auto coordinateZ = gpgpuWalker->getThreadGroupIdStartingResumeZ(); EXPECT_EQ(coordinateZ, 0u); // move walker iterator walkerItor++; walkerItor = find(walkerItor, hwParser.cmdList.end()); } } HWTEST_F(DispatchWalkerTest, GivenCacheFlushAfterWalkerDisabledWhenAllocationRequiresCacheFlushThenFlushCommandNotPresentAfterWalker) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(0); MockKernel kernel1(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, 
kernel1.initialize()); kernel1.kernelArgRequiresCacheFlush.resize(1); MockGraphicsAllocation cacheRequiringAllocation; kernel1.kernelArgRequiresCacheFlush[0] = &cacheRequiringAllocation; MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector({&kernel1})); // create commandStream auto &cmdStream = pCmdQ->getCS(0); HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo, CsrDependencies(), walkerArgs); HardwareParse hwParse; hwParse.parseCommands(cmdStream); PIPE_CONTROL *pipeControl = hwParse.getCommand(); EXPECT_EQ(nullptr, pipeControl); } HWTEST_F(DispatchWalkerTest, GivenCacheFlushAfterWalkerEnabledWhenWalkerWithTwoKernelsThenFlushCommandPresentOnce) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(1); MockKernel kernel1(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel1.initialize()); MockKernel kernel2(program.get(), kernelInfoWithSampler, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel2.initialize()); kernel1.kernelArgRequiresCacheFlush.resize(1); kernel2.kernelArgRequiresCacheFlush.resize(1); MockGraphicsAllocation cacheRequiringAllocation; kernel1.kernelArgRequiresCacheFlush[0] = &cacheRequiringAllocation; kernel2.kernelArgRequiresCacheFlush[0] = &cacheRequiringAllocation; MockMultiDispatchInfo multiDispatchInfo(pClDevice, std::vector({&kernel1, &kernel2})); // create commandStream auto &cmdStream = pCmdQ->getCS(0); HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo, CsrDependencies(), walkerArgs); HardwareParse hwParse; hwParse.parseCommands(cmdStream); uint32_t pipeControlCount = hwParse.getCommandCount(); EXPECT_EQ(pipeControlCount, 1u); } HWTEST_F(DispatchWalkerTest, 
GivenCacheFlushAfterWalkerEnabledWhenTwoWalkersForQueueThenFlushCommandPresentTwice) { using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; DebugManagerStateRestore dbgRestore; DebugManager.flags.EnableCacheFlushAfterWalker.set(1); MockKernel kernel1(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel1.initialize()); MockKernel kernel2(program.get(), kernelInfoWithSampler, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernel2.initialize()); kernel1.kernelArgRequiresCacheFlush.resize(1); kernel2.kernelArgRequiresCacheFlush.resize(1); MockGraphicsAllocation cacheRequiringAllocation; kernel1.kernelArgRequiresCacheFlush[0] = &cacheRequiringAllocation; kernel2.kernelArgRequiresCacheFlush[0] = &cacheRequiringAllocation; MockMultiDispatchInfo multiDispatchInfo1(pClDevice, std::vector({&kernel1})); MockMultiDispatchInfo multiDispatchInfo2(pClDevice, std::vector({&kernel2})); // create commandStream auto &cmdStream = pCmdQ->getCS(0); HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo1, CsrDependencies(), walkerArgs); HardwareInterfaceWalkerArgs walkerArgs2 = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo2, CsrDependencies(), walkerArgs2); HardwareParse hwParse; hwParse.parseCommands(cmdStream); uint32_t pipeControlCount = hwParse.getCommandCount(); EXPECT_EQ(pipeControlCount, 2u); } TEST(DispatchWalker, WhenCalculatingDispatchDimensionsThenCorrectValuesAreReturned) { Vec3 dim0{0, 0, 0}; Vec3 dim1{2, 1, 1}; Vec3 dim2{2, 2, 1}; Vec3 dim3{2, 2, 2}; Vec3 dispatches[] = {dim0, dim1, dim2, dim3}; uint32_t testDims[] = {0, 1, 2, 3}; for (const auto &lhs : testDims) { for (const auto &rhs : testDims) { uint32_t dimTest = calculateDispatchDim(dispatches[lhs], dispatches[rhs]); uint32_t dimRef = std::max(1U, std::max(lhs, rhs)); EXPECT_EQ(dimRef, dimTest); } } } 
// NOTE(review): this text looks whitespace-collapsed and several template argument lists appear to be elided
// (e.g. `static_cast &>(baseBuilder)`, `std::make_unique()`, `findAll(cmdList.begin(), cmdList.end())`);
// confirm against the original file before relying on the exact spelling of any call below.
//
// HWTEST_P(DispatchWalkerTestForAuxTranslation,
//          givenKernelWhenAuxToNonAuxWhenTranslationRequiredThenPipeControlWithStallAndDCFlushAdded)
// (continues on the next line). Runs once per kernelObjType parameter:
//  * fetches the AuxTranslation builtin builder, marks the kernel as auxTranslationRequired and registers two
//    MockKernelObjForAuxTranslation objects in the MultiDispatchInfo;
//  * builds the dispatch infos with auxTranslationDirection = AuxToNonAux and dispatches the walker;
//  * parses the bytes written to the queue's command stream and asserts that exactly two entries were found;
//  * the first must have command-streamer stall enabled and a DC-flush bit equal to
//    MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo).
HWTEST_P(DispatchWalkerTestForAuxTranslation, givenKernelWhenAuxToNonAuxWhenTranslationRequiredThenPipeControlWithStallAndDCFlushAdded) { BuiltinDispatchInfoBuilder &baseBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, *pClDevice); auto &builder = static_cast &>(baseBuilder); MockKernel kernel(program.get(), kernelInfo, *pClDevice); kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.workDim = 0; ASSERT_EQ(CL_SUCCESS, kernel.initialize()); auto &cmdStream = pCmdQ->getCS(0); void *buffer = cmdStream.getCpuBase(); kernel.auxTranslationRequired = true; MockKernelObjForAuxTranslation mockKernelObj1(kernelObjType); MockKernelObjForAuxTranslation mockKernelObj2(kernelObjType); auto kernelObjsForAuxTranslation = std::make_unique(); kernelObjsForAuxTranslation->insert(mockKernelObj1); kernelObjsForAuxTranslation->insert(mockKernelObj2); MultiDispatchInfo multiDispatchInfo; multiDispatchInfo.setKernelObjsForAuxTranslation(std::move(kernelObjsForAuxTranslation)); BuiltinOpParams builtinOpsParams; builtinOpsParams.auxTranslationDirection = AuxTranslationDirection::AuxToNonAux; builder.buildDispatchInfosForAuxTranslation(multiDispatchInfo, builtinOpsParams); HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo, CsrDependencies(), walkerArgs); auto sizeUsed = cmdStream.getUsed(); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, buffer, sizeUsed)); auto pipeControls = findAll(cmdList.begin(), cmdList.end()); ASSERT_EQ(2u, pipeControls.size()); auto beginPipeControl = genCmdCast(*(pipeControls[0])); EXPECT_EQ(MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo), beginPipeControl->getDcFlushEnable()); EXPECT_TRUE(beginPipeControl->getCommandStreamerStallEnable()); auto endPipeControl = genCmdCast(*(pipeControls[1])); bool dcFlushRequired = 
// Contents of the following line, in order:
//  * Rest of the AuxToNonAux test: the second entry must have command-streamer stall enabled and a DC-flush bit
//    that is true only when the device's eRenderCoreFamily is IGFX_GEN8_CORE.
//  * HWTEST_P(DispatchWalkerTestForAuxTranslation,
//             givenKernelWhenNonAuxToAuxWhenTranslationRequiredThenPipeControlWithStallAdded)
//    (continues on the next line): identical setup, but auxTranslationDirection = NonAuxToAux.
(pClDevice->getHardwareInfo().platform.eRenderCoreFamily == IGFX_GEN8_CORE); EXPECT_EQ(dcFlushRequired, endPipeControl->getDcFlushEnable()); EXPECT_TRUE(endPipeControl->getCommandStreamerStallEnable()); } HWTEST_P(DispatchWalkerTestForAuxTranslation, givenKernelWhenNonAuxToAuxWhenTranslationRequiredThenPipeControlWithStallAdded) { BuiltinDispatchInfoBuilder &baseBuilder = BuiltInDispatchBuilderOp::getBuiltinDispatchInfoBuilder(EBuiltInOps::AuxTranslation, *pClDevice); auto &builder = static_cast &>(baseBuilder); MockKernel kernel(program.get(), kernelInfo, *pClDevice); kernelInfo.kernelDescriptor.payloadMappings.dispatchTraits.workDim = 0; ASSERT_EQ(CL_SUCCESS, kernel.initialize()); auto &cmdStream = pCmdQ->getCS(0); void *buffer = cmdStream.getCpuBase(); kernel.auxTranslationRequired = true; MockKernelObjForAuxTranslation mockKernelObj1(kernelObjType); MockKernelObjForAuxTranslation mockKernelObj2(kernelObjType); auto kernelObjsForAuxTranslation = std::make_unique(); kernelObjsForAuxTranslation->insert(mockKernelObj1); kernelObjsForAuxTranslation->insert(mockKernelObj2); MultiDispatchInfo multiDispatchInfo; multiDispatchInfo.setKernelObjsForAuxTranslation(std::move(kernelObjsForAuxTranslation)); BuiltinOpParams builtinOpsParams; builtinOpsParams.auxTranslationDirection = AuxTranslationDirection::NonAuxToAux; builder.buildDispatchInfosForAuxTranslation(multiDispatchInfo, builtinOpsParams); HardwareInterfaceWalkerArgs walkerArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfo, CsrDependencies(), walkerArgs); auto sizeUsed = cmdStream.getUsed(); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, buffer, sizeUsed)); auto pipeControls = findAll(cmdList.begin(), cmdList.end()); ASSERT_EQ(2u, pipeControls.size()); bool dcFlushRequired = (pClDevice->getHardwareInfo().platform.eRenderCoreFamily == IGFX_GEN8_CORE); auto beginPipeControl = genCmdCast(*(pipeControls[0])); 
// Contents of the following line, in order:
//  * Rest of the NonAuxToAux test: same expectations as the AuxToNonAux variant - the first entry's DC-flush bit
//    equals getDcFlushEnable(true, *defaultHwInfo), the second entry's equals the IGFX_GEN8_CORE check, and both
//    must have command-streamer stall enabled.
//  * struct ProfilingCommandsTest: fixture deriving from DispatchWalkerTest and ::testing::WithParamInterface;
//    its SetUp()/TearDown() only forward to DispatchWalkerTest::SetUp()/TearDown().
//  * HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingCommandsTest,
//                givenKernelWhenProfilingCommandStartIsTakenThenTimeStampAddressIsProgrammedCorrectly)
//    (continues on the next line): takes two tags from a MockTagAllocator, calls dispatchProfilingCommandsStart
//    for each on the same command stream, parses the stream and checks that the first store-register command
//    writes to hwTimeStamp1's GPU address + offsetof(HwTimeStamps, ContextStartTS).
EXPECT_EQ(MemorySynchronizationCommands::getDcFlushEnable(true, *defaultHwInfo), beginPipeControl->getDcFlushEnable()); EXPECT_TRUE(beginPipeControl->getCommandStreamerStallEnable()); auto endPipeControl = genCmdCast(*(pipeControls[1])); EXPECT_EQ(dcFlushRequired, endPipeControl->getDcFlushEnable()); EXPECT_TRUE(endPipeControl->getCommandStreamerStallEnable()); } struct ProfilingCommandsTest : public DispatchWalkerTest, ::testing::WithParamInterface { void SetUp() override { DispatchWalkerTest::SetUp(); } void TearDown() override { DispatchWalkerTest::TearDown(); } }; HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingCommandsTest, givenKernelWhenProfilingCommandStartIsTakenThenTimeStampAddressIsProgrammedCorrectly) { using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; auto &cmdStream = pCmdQ->getCS(0); MockTagAllocator timeStampAllocator(pDevice->getRootDeviceIndex(), this->pDevice->getMemoryManager(), 10, MemoryConstants::cacheLineSize, sizeof(HwTimeStamps), false, pDevice->getDeviceBitfield()); auto hwTimeStamp1 = timeStampAllocator.getTag(); ASSERT_NE(nullptr, hwTimeStamp1); GpgpuWalkerHelper::dispatchProfilingCommandsStart(*hwTimeStamp1, &cmdStream, pDevice->getHardwareInfo()); auto hwTimeStamp2 = timeStampAllocator.getTag(); ASSERT_NE(nullptr, hwTimeStamp2); GpgpuWalkerHelper::dispatchProfilingCommandsStart(*hwTimeStamp2, &cmdStream, pDevice->getHardwareInfo()); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream.getCpuBase(), cmdStream.getUsed())); auto itorStoreReg = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorStoreReg); auto storeReg = genCmdCast(*itorStoreReg); ASSERT_NE(nullptr, storeReg); uint64_t gpuAddress = storeReg->getMemoryAddress(); auto contextTimestampFieldOffset = offsetof(HwTimeStamps, ContextStartTS); uint64_t expectedAddress = hwTimeStamp1->getGpuAddress() + contextTimestampFieldOffset; EXPECT_EQ(expectedAddress, 
// Contents of the following line, in order:
//  * Rest of ...ProfilingCommandStartIsTaken...:
//      - the second store-register command must target hwTimeStamp2's GPU address + the ContextStartTS offset;
//      - for each tag, the pipe-control iterator is advanced once when isPipeControlWArequired() is true and once
//        more when isAdditionalSynchronizationRequired() is true, and the resulting command's post-sync address
//        must equal the tag's GPU address + offsetof(HwTimeStamps, GlobalStartTS).
//    NOTE(review): in the second lookup the iterator is incremented before ASSERT_NE(cmdList.end(), itorPipeCtrl),
//    and in the first lookup it is dereferenced after increments made past the end-check - confirm that this
//    ordering is intentional.
//  * HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingCommandsTest,
//                givenKernelWhenProfilingCommandStartIsNotTakenThenTimeStampAddressIsProgrammedCorrectly)
//    (continues on the next line).
//    NOTE(review): despite "StartIsNotTaken" in the name, the body calls dispatchProfilingCommandsEnd.
gpuAddress); itorStoreReg++; itorStoreReg = find(itorStoreReg, cmdList.end()); ASSERT_NE(cmdList.end(), itorStoreReg); storeReg = genCmdCast(*itorStoreReg); ASSERT_NE(nullptr, storeReg); gpuAddress = storeReg->getMemoryAddress(); expectedAddress = hwTimeStamp2->getGpuAddress() + contextTimestampFieldOffset; EXPECT_EQ(expectedAddress, gpuAddress); auto itorPipeCtrl = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorPipeCtrl); if (MemorySynchronizationCommands::isPipeControlWArequired(pDevice->getHardwareInfo())) { itorPipeCtrl++; } if (UnitTestHelper::isAdditionalSynchronizationRequired()) { itorPipeCtrl++; } auto pipeControl = genCmdCast(*itorPipeCtrl); ASSERT_NE(nullptr, pipeControl); gpuAddress = NEO::UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl); expectedAddress = hwTimeStamp1->getGpuAddress() + offsetof(HwTimeStamps, GlobalStartTS); EXPECT_EQ(expectedAddress, gpuAddress); itorPipeCtrl++; itorPipeCtrl = find(itorPipeCtrl, cmdList.end()); if (MemorySynchronizationCommands::isPipeControlWArequired(pDevice->getHardwareInfo())) { itorPipeCtrl++; } if (UnitTestHelper::isAdditionalSynchronizationRequired()) { itorPipeCtrl++; } ASSERT_NE(cmdList.end(), itorPipeCtrl); pipeControl = genCmdCast(*itorPipeCtrl); ASSERT_NE(nullptr, pipeControl); gpuAddress = NEO::UnitTestHelper::getPipeControlPostSyncAddress(*pipeControl); expectedAddress = hwTimeStamp2->getGpuAddress() + offsetof(HwTimeStamps, GlobalStartTS); EXPECT_EQ(expectedAddress, gpuAddress); } HWCMDTEST_F(IGFX_GEN8_CORE, ProfilingCommandsTest, givenKernelWhenProfilingCommandStartIsNotTakenThenTimeStampAddressIsProgrammedCorrectly) { using MI_STORE_REGISTER_MEM = typename FamilyType::MI_STORE_REGISTER_MEM; using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL; auto &cmdStream = pCmdQ->getCS(0); MockTagAllocator timeStampAllocator(pDevice->getRootDeviceIndex(), this->pDevice->getMemoryManager(), 10, MemoryConstants::cacheLineSize, sizeof(HwTimeStamps), false, 
// Contents of the following line, in order:
//  * Rest of ...ProfilingCommandStartIsNotTaken...: takes two tags, calls dispatchProfilingCommandsEnd for each,
//    and expects the two store-register commands found in the stream to target the respective tag's GPU address
//    + offsetof(HwTimeStamps, ContextEndTS). The PIPE_CONTROL alias declared in this test is not used by it.
//  * HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSpace)
//    (continues over the next two lines): sets simdSize to 1 and requiresImplicitArgs to false, initializes
//    kernelWithoutImplicitArgs, then calls adjustKernelDescriptorForImplicitArgs on the shared kernelInfo before
//    constructing the second kernel.
pDevice->getDeviceBitfield()); auto hwTimeStamp1 = timeStampAllocator.getTag(); ASSERT_NE(nullptr, hwTimeStamp1); GpgpuWalkerHelper::dispatchProfilingCommandsEnd(*hwTimeStamp1, &cmdStream, pDevice->getHardwareInfo()); auto hwTimeStamp2 = timeStampAllocator.getTag(); ASSERT_NE(nullptr, hwTimeStamp2); GpgpuWalkerHelper::dispatchProfilingCommandsEnd(*hwTimeStamp2, &cmdStream, pDevice->getHardwareInfo()); GenCmdList cmdList; ASSERT_TRUE(FamilyType::PARSE::parseCommandBuffer(cmdList, cmdStream.getCpuBase(), cmdStream.getUsed())); auto itorStoreReg = find(cmdList.begin(), cmdList.end()); ASSERT_NE(cmdList.end(), itorStoreReg); auto storeReg = genCmdCast(*itorStoreReg); ASSERT_NE(nullptr, storeReg); uint64_t gpuAddress = storeReg->getMemoryAddress(); auto contextTimestampFieldOffset = offsetof(HwTimeStamps, ContextEndTS); uint64_t expectedAddress = hwTimeStamp1->getGpuAddress() + contextTimestampFieldOffset; EXPECT_EQ(expectedAddress, gpuAddress); itorStoreReg++; itorStoreReg = find(itorStoreReg, cmdList.end()); ASSERT_NE(cmdList.end(), itorStoreReg); storeReg = genCmdCast(*itorStoreReg); ASSERT_NE(nullptr, storeReg); gpuAddress = storeReg->getMemoryAddress(); expectedAddress = hwTimeStamp2->getGpuAddress() + contextTimestampFieldOffset; EXPECT_EQ(expectedAddress, gpuAddress); } HWTEST_F(DispatchWalkerTest, WhenKernelRequiresImplicitArgsThenIohRequiresMoreSpace) { size_t globalOffsets[3] = {0, 0, 0}; size_t workItems[3] = {1, 1, 1}; size_t workGroupSize[3] = {2, 5, 10}; cl_uint dimensions = 1; Vec3 localWorkgroupSize(workGroupSize); auto blockedCommandsData = createBlockedCommandsData(*pCmdQ); kernelInfo.kernelDescriptor.kernelAttributes.simdSize = 1u; kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs = false; MockKernel kernelWithoutImplicitArgs(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernelWithoutImplicitArgs.initialize()); UnitTestHelper::adjustKernelDescriptorForImplicitArgs(kernelInfo.kernelDescriptor); MockKernel 
// Continuation of WhenKernelRequiresImplicitArgsThenIohRequiresMoreSpace:
//  * Each kernel gets its own DispatchInfo (1 dimension, work-group counts forced to {1, 1, 1}) and its own
//    MultiDispatchInfo; both walkers are dispatched through the SAME blockedCommandsData object.
//  * After each dispatch the required IOH size is computed with getSizeRequiredIOH(kernel, total local size).
//  * The size without implicit args must be <= the size with implicit args.
// NOTE(review): multiDispatchInfoWithImplicitArgs is constructed from &kernelWithoutImplicitArgs although the
// DispatchInfo pushed into it uses kernelWithImplicitArgs - this looks like a copy-paste slip; confirm whether
// &kernelWithImplicitArgs was intended.
kernelWithImplicitArgs(program.get(), kernelInfo, *pClDevice); ASSERT_EQ(CL_SUCCESS, kernelWithImplicitArgs.initialize()); DispatchInfo dispatchInfoWithoutImplicitArgs(pClDevice, const_cast(&kernelWithoutImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets); dispatchInfoWithoutImplicitArgs.setNumberOfWorkgroups({1, 1, 1}); dispatchInfoWithoutImplicitArgs.setTotalNumberOfWorkgroups({1, 1, 1}); MultiDispatchInfo multiDispatchInfoWithoutImplicitArgs(&kernelWithoutImplicitArgs); multiDispatchInfoWithoutImplicitArgs.push(dispatchInfoWithoutImplicitArgs); HardwareInterfaceWalkerArgs walkerArgsWithoutImplicitArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); walkerArgsWithoutImplicitArgs.blockedCommandsData = blockedCommandsData.get(); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfoWithoutImplicitArgs, CsrDependencies(), walkerArgsWithoutImplicitArgs); auto iohSizeWithoutImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithoutImplicitArgs, Math::computeTotalElementsCount(localWorkgroupSize)); DispatchInfo dispatchInfoWithImplicitArgs(pClDevice, const_cast(&kernelWithImplicitArgs), dimensions, workItems, workGroupSize, globalOffsets); dispatchInfoWithImplicitArgs.setNumberOfWorkgroups({1, 1, 1}); dispatchInfoWithImplicitArgs.setTotalNumberOfWorkgroups({1, 1, 1}); MultiDispatchInfo multiDispatchInfoWithImplicitArgs(&kernelWithoutImplicitArgs); multiDispatchInfoWithImplicitArgs.push(dispatchInfoWithImplicitArgs); HardwareInterfaceWalkerArgs walkerArgsWithImplicitArgs = createHardwareInterfaceWalkerArgs(CL_COMMAND_NDRANGE_KERNEL); walkerArgsWithImplicitArgs.blockedCommandsData = blockedCommandsData.get(); HardwareInterface::dispatchWalker( *pCmdQ, multiDispatchInfoWithImplicitArgs, CsrDependencies(), walkerArgsWithImplicitArgs); auto iohSizeWithImplicitArgs = HardwareCommandsHelper::getSizeRequiredIOH(kernelWithImplicitArgs, Math::computeTotalElementsCount(localWorkgroupSize)); EXPECT_LE(iohSizeWithoutImplicitArgs, 
// End of WhenKernelRequiresImplicitArgsThenIohRequiresMoreSpace: the scoped block recomputes the expected IOH
// size by hand as
//   cross-thread data size + getPerThreadDataSizeTotal(simd, grfSize, numLocalIdChannels, total local size)
//   + ImplicitArgsHelper::getSizeForImplicitArgsPatching(...),
// rounds it up to MemoryConstants::cacheLineSize and requires it to equal iohSizeWithImplicitArgs exactly.
iohSizeWithImplicitArgs); { auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels; auto simdSize = kernelInfo.getMaxSimdSize(); uint32_t grfSize = sizeof(typename FamilyType::GRF); auto size = kernelWithImplicitArgs.getCrossThreadDataSize() + HardwareCommandsHelper::getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, Math::computeTotalElementsCount(localWorkgroupSize)) + ImplicitArgsHelper::getSizeForImplicitArgsPatching(kernelWithImplicitArgs.getImplicitArgs(), kernelWithImplicitArgs.getDescriptor(), *defaultHwInfo); size = alignUp(size, MemoryConstants::cacheLineSize); EXPECT_EQ(size, iohSizeWithImplicitArgs); } }