/*
 * Copyright (C) 2018-2021 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/built_ins/built_ins.h"
#include "shared/test/common/cmd_parse/hw_parse.h"
#include "shared/test/common/helpers/debug_manager_state_restore.h"

#include "opencl/source/command_queue/enqueue_kernel.h"
#include "opencl/source/device_queue/device_queue.h"
#include "opencl/source/scheduler/scheduler_kernel.h"
#include "opencl/test/unit_test/fixtures/execution_model_fixture.h"
#include "opencl/test/unit_test/mocks/mock_command_queue.h"
#include "opencl/test/unit_test/mocks/mock_context.h"
#include "opencl/test/unit_test/mocks/mock_device_queue.h"
#include "opencl/test/unit_test/mocks/mock_kernel.h"
#include "opencl/test/unit_test/mocks/mock_program.h"

#include <memory>

using namespace NEO;

// NOTE(review): the explicit template argument lists in this file (<FamilyType>,
// <FamilyType, CL_COMMAND_NDRANGE_KERNEL>, <MockSchedulerKernel &>, <PIPE_CONTROL>)
// were reconstructed after an extraction step stripped everything between angle
// brackets. Confirm them against the helper declarations before merging.

// Test fixture that layers gtest's testing::Test on top of ExecutionModelSchedulerTest.
// SetUp() runs the base set-up first and then applies REQUIRE_DEVICE_ENQUEUE_OR_SKIP to
// pClDevice (presumably skipping the test on devices without device-enqueue support).
class ExecutionModelSchedulerFixture : public ExecutionModelSchedulerTest,
                                       public testing::Test {
  public:
    void SetUp() override {
        ExecutionModelSchedulerTest::SetUp();
        REQUIRE_DEVICE_ENQUEUE_OR_SKIP(pClDevice);
    }

    void TearDown() override {
        ExecutionModelSchedulerTest::TearDown();
    }
};

// Dispatches the scheduler kernel into the command queue's stream and verifies the
// resulting programming: kernel work-size values, the PIPE_CONTROL preceding the walker,
// the MEDIA_INTERFACE_DESCRIPTOR_LOAD fields, the GPGPU_WALKER fields and the
// MI_BATCH_BUFFER_START that follows the walker.
HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSchedulerThenProgrammingIsCorrect) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;

    DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
    auto &scheduler = static_cast<MockSchedulerKernel &>(context->getSchedulerKernel());

    auto *executionModelDshAllocation = pDevQueueHw->getDshBuffer();
    auto *dshHeap = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
    void *executionModelDsh = executionModelDshAllocation->getUnderlyingBuffer();

    EXPECT_NE(nullptr, executionModelDsh);

    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);

    // Setup heaps in pCmdQ
    MultiDispatchInfo multiDispatchinfo(&scheduler);
    LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(), false, false, false,
                                                                                          multiDispatchinfo, nullptr, 0, false, false);
    pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);

    GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
        pCmdQ->getCS(0),
        *pDevQueueHw,
        pDevice->getPreemptionMode(),
        scheduler,
        &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
        pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
        false);

    // Work-size values stored in the scheduler kernel after the dispatch.
    auto localWorkSize = scheduler.getLocalWorkSizeValues();
    EXPECT_EQ(static_cast<uint32_t>(scheduler.getLws()), *localWorkSize[0]);
    EXPECT_EQ(1u, *localWorkSize[1]);
    EXPECT_EQ(1u, *localWorkSize[2]);

    // The enqueued local work size is only checked when it does not point at the
    // kernel's dummy patch location.
    auto enqueuedLocalWorkSize = scheduler.getEnqueuedLocalWorkSizeValues();
    if (enqueuedLocalWorkSize[0] != &Kernel::dummyPatchLocation) {
        EXPECT_EQ(static_cast<uint32_t>(scheduler.getLws()), *enqueuedLocalWorkSize[0]);
        EXPECT_EQ(1u, *enqueuedLocalWorkSize[1]);
        EXPECT_EQ(1u, *enqueuedLocalWorkSize[2]);
    }

    auto numWorkGroups = scheduler.getNumWorkGroupsValues();
    EXPECT_EQ(static_cast<uint32_t>(scheduler.getGws() / scheduler.getLws()), *numWorkGroups[0]);
    EXPECT_EQ(0u, *numWorkGroups[1]);
    EXPECT_EQ(0u, *numWorkGroups[2]);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(commandStream, 0);
    hwParser.findHardwareCommands<FamilyType>();

    ASSERT_NE(hwParser.cmdList.end(), hwParser.itorWalker);

    // A PIPE_CONTROL must be present before the walker.
    PIPE_CONTROL *pipeControl = hwParser.getCommand<PIPE_CONTROL>(hwParser.cmdList.begin(), hwParser.itorWalker);
    ASSERT_NE(nullptr, pipeControl);

    ASSERT_NE(hwParser.cmdList.end(), hwParser.itorMediaInterfaceDescriptorLoad);

    auto *interfaceDescLoad = static_cast<MEDIA_INTERFACE_DESCRIPTOR_LOAD *>(*hwParser.itorMediaInterfaceDescriptorLoad);

    uint32_t addressOffsetProgrammed = interfaceDescLoad->getInterfaceDescriptorDataStartAddress();
    uint32_t interfaceDescriptorSizeProgrammed = interfaceDescLoad->getInterfaceDescriptorTotalLength();

    uint32_t addressOffsetExpected = pDevQueueHw->colorCalcStateSize;
    uint32_t intDescSizeExpected = DeviceQueue::interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);

    EXPECT_EQ(addressOffsetExpected, addressOffsetProgrammed);
    EXPECT_EQ(intDescSizeExpected, interfaceDescriptorSizeProgrammed);

    auto *walker = static_cast<GPGPU_WALKER *>(*hwParser.itorWalker);

    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
    size_t numWorkgroupsProgrammed[3] = {0, 0, 0};

    uint32_t threadsPerWorkGroup = walker->getThreadWidthCounterMaximum();
    EXPECT_EQ(scheduler.getLws() / scheduler.getKernelInfo().getMaxSimdSize(), threadsPerWorkGroup);

    numWorkgroupsProgrammed[0] = walker->getThreadGroupIdXDimension();
    numWorkgroupsProgrammed[1] = walker->getThreadGroupIdYDimension();
    numWorkgroupsProgrammed[2] = walker->getThreadGroupIdZDimension();

    EXPECT_EQ(workGroups[0], numWorkgroupsProgrammed[0]);
    EXPECT_EQ(workGroups[1], numWorkgroupsProgrammed[1]);
    EXPECT_EQ(workGroups[2], numWorkgroupsProgrammed[2]);

    typename FamilyType::GPGPU_WALKER::SIMD_SIZE simdSize = walker->getSimdSize();
    EXPECT_EQ(FamilyType::GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD8, simdSize);

    EXPECT_EQ(0u, walker->getThreadGroupIdStartingX());
    EXPECT_EQ(0u, walker->getThreadGroupIdStartingY());
    EXPECT_EQ(0u, walker->getThreadGroupIdStartingResumeZ());

    uint32_t offsetCrossThreadDataProgrammed = walker->getIndirectDataStartAddress();
    // Reported through gtest instead of assert(): the check stays active in NDEBUG
    // builds and a failure does not abort the whole test binary.
    EXPECT_EQ(0u, offsetCrossThreadDataProgrammed % 64);

    size_t curbeSize = scheduler.getCurbeSize();
    size_t offsetCrossThreadDataExpected = dshHeap->getMaxAvailableSpace() - curbeSize - 4096; // take additional page for padding into account
    EXPECT_EQ(static_cast<uint32_t>(offsetCrossThreadDataExpected), offsetCrossThreadDataProgrammed);

    EXPECT_EQ(62u, walker->getInterfaceDescriptorOffset());

    auto numChannels = 3;
    auto grfSize = pDevice->getHardwareInfo().capabilityTable.grfSize;
    auto sizePerThreadDataTotal = PerThreadDataHelper::getPerThreadDataSizeTotal(scheduler.getKernelInfo().getMaxSimdSize(), grfSize, numChannels, scheduler.getLws());

    auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
    auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
    EXPECT_EQ(indirectDataLength, walker->getIndirectDataLength());

    // The batch buffer start after the walker must point at the device queue's SLB buffer.
    ASSERT_NE(hwParser.cmdList.end(), hwParser.itorBBStartAfterWalker);
    auto *bbStart = static_cast<MI_BATCH_BUFFER_START *>(*hwParser.itorBBStartAfterWalker);

    uint64_t slbAddress = pDevQueueHw->getSlbBuffer()->getGpuAddress();
    EXPECT_EQ(slbAddress, bbStart->getBatchBufferStartAddressGraphicsaddress472());
}

// Dispatches the scheduler kernel and verifies that nothing was consumed from the
// command queue's INDIRECT_OBJECT heap.
HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenDispatchingSchedulerThenStandardCmdqIohIsNotUsed) {
    DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
    SchedulerKernel &scheduler = context->getSchedulerKernel();

    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSshSizeForExecutionModel(*parentKernel);

    // Setup heaps in pCmdQ
    MultiDispatchInfo multiDispatchinfo(&scheduler);
    getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(), false, false, false,
                                                            multiDispatchinfo, nullptr, 0, false, false);
    pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);

    GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
        pCmdQ->getCS(0),
        *pDevQueueHw,
        pDevice->getPreemptionMode(),
        scheduler,
        &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
        pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
        false);

    auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT, 0u);

    EXPECT_EQ(0u, ioh.getUsed());
}

// Sets m_SchedulerEarlyReturn to 1 in the mock device queue's IGIL controls, dispatches
// the scheduler and verifies that a walker is programmed but no batch buffer start
// follows it.
HWCMDTEST_F(IGFX_GEN8_CORE, ParentKernelCommandQueueFixture, GivenEarlyReturnSetToFirstInstanceWhenDispatchingSchedulerThenBbStartCmdIsNotInserted) {
    REQUIRE_DEVICE_ENQUEUE_OR_SKIP(device);

    cl_queue_properties properties[3] = {0};
    MockDeviceQueueHw<FamilyType> mockDevQueue(context, device, properties[0]);

    auto *igilQueue = mockDevQueue.getIgilQueue();
    ASSERT_NE(nullptr, igilQueue);
    igilQueue->m_controls.m_SchedulerEarlyReturn = 1;

    SchedulerKernel &scheduler = context->getSchedulerKernel();

    size_t minRequiredSizeForSchedulerSSH = HardwareCommandsHelper<FamilyType>::getSizeRequiredSSH(scheduler);

    // Setup heaps in pCmdQ
    MultiDispatchInfo multiDispatchinfo(&scheduler);
    LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, CsrDependencies(), false, false, false,
                                                                                          multiDispatchinfo, nullptr, 0, false, false);
    pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);

    GpgpuWalkerHelper<FamilyType>::dispatchScheduler(
        pCmdQ->getCS(0),
        mockDevQueue,
        device->getPreemptionMode(),
        scheduler,
        &pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, 0u),
        mockDevQueue.getIndirectHeap(IndirectHeap::DYNAMIC_STATE),
        false);

    HardwareParse hwParser;
    hwParser.parseCommands<FamilyType>(commandStream, 0);
    hwParser.findHardwareCommands<FamilyType>();

    EXPECT_NE(hwParser.cmdList.end(), hwParser.itorWalker);
    EXPECT_EQ(hwParser.cmdList.end(), hwParser.itorBBStartAfterWalker);
}

// With the ForceDispatchScheduler debug flag set, enqueues the parent kernel on a mock
// command queue and verifies that the first recorded enqueued kernel is the scheduler
// kernel.
HWCMDTEST_F(IGFX_GEN8_CORE, ExecutionModelSchedulerFixture, WhenForceDispatchingSchedulerThenSchedulerKernelIsEnqueued) {
    DebugManagerStateRestore dbgRestorer;

    DebugManager.flags.ForceDispatchScheduler.set(true);

    size_t offset[3] = {0, 0, 0};
    size_t gws[3] = {1, 1, 1};

    // Owned by unique_ptr so the queue is released even if the ASSERT below returns early.
    std::unique_ptr<MockCommandQueueHw<FamilyType>> mockCmdQ(new MockCommandQueueHw<FamilyType>(context, pClDevice, 0));

    mockCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

    // front() on an empty container is undefined behaviour; fail the test instead.
    ASSERT_FALSE(mockCmdQ->lastEnqueuedKernels.empty());
    EXPECT_TRUE(mockCmdQ->lastEnqueuedKernels.front()->isSchedulerKernel);
}