compute-runtime/unit_tests/execution_model/scheduler_dispatch_tests.cpp

259 lines
12 KiB
C++
Raw Normal View History

/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#include "runtime/built_ins/built_ins.h"
#include "runtime/command_queue/enqueue_kernel.h"
#include "runtime/device_queue/device_queue.h"
#include "runtime/memory_manager/surface.h"
#include "runtime/scheduler/scheduler_kernel.h"
#include "unit_tests/fixtures/execution_model_fixture.h"
#include "unit_tests/helpers/debug_manager_state_restore.h"
#include "unit_tests/helpers/hw_parse.h"
#include "unit_tests/mocks/mock_kernel.h"
#include "unit_tests/mocks/mock_program.h"
#include "unit_tests/mocks/mock_command_queue.h"
#include "unit_tests/mocks/mock_context.h"
#include "unit_tests/mocks/mock_device_queue.h"

#include <memory>
using namespace OCLRT;
class ExecutionModelSchedulerFixture : public ExecutionModelSchedulerTest,
public testing::Test {
void SetUp() override {
ExecutionModelSchedulerTest::SetUp();
}
void TearDown() override {
ExecutionModelSchedulerTest::TearDown();
}
};
// Calls dispatchScheduler() for the built-in scheduler kernel and verifies
//  - the work-offset / work-size values reachable through the scheduler
//    kernel's pointers,
//  - the commands parsed back from the command stream (PIPE_CONTROL before the
//    walker, MEDIA_INTERFACE_DESCRIPTOR_LOAD, GPGPU_WALKER and the
//    MI_BATCH_BUFFER_START following the walker).
// The checks only run when the device reports getSupportedClVersion() >= 20.
HWTEST_F(ExecutionModelSchedulerFixture, dispatchScheduler) {
    using INTERFACE_DESCRIPTOR_DATA = typename FamilyType::INTERFACE_DESCRIPTOR_DATA;
    using GPGPU_WALKER = typename FamilyType::GPGPU_WALKER;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename FamilyType::MEDIA_INTERFACE_DESCRIPTOR_LOAD;
    using PIPE_CONTROL = typename FamilyType::PIPE_CONTROL;
    using MI_BATCH_BUFFER_START = typename FamilyType::MI_BATCH_BUFFER_START;

    if (pDevice->getSupportedClVersion() >= 20) {
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
        // Fail the test instead of crashing if the cast did not yield an object.
        ASSERT_NE(nullptr, pDevQueueHw);
        SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(*context);

        auto *executionModelDshAllocation = pDevQueueHw->getDshBuffer();
        ASSERT_NE(nullptr, executionModelDshAllocation);
        auto *dshHeap = pDevQueueHw->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
        ASSERT_NE(nullptr, dshHeap);
        void *executionModelDsh = executionModelDshAllocation->getUnderlyingBuffer();

        EXPECT_NE(nullptr, executionModelDsh);

        size_t minRequiredSizeForSchedulerIH = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::INSTRUCTION>(*parentKernel);
        size_t minRequiredSizeForSchedulerSSH = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

        // Setup heaps in pCmdQ
        LinearStream &commandStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
        pCmdQ->getIndirectHeap(IndirectHeap::INSTRUCTION, minRequiredSizeForSchedulerIH);
        pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);

        dispatchScheduler<FamilyType>(
            *pCmdQ,
            *pDevQueueHw,
            scheduler);

        // Work offsets / sizes visible through the scheduler kernel after dispatch.
        EXPECT_EQ(0u, *scheduler.globalWorkOffsetX);
        EXPECT_EQ(0u, *scheduler.globalWorkOffsetY);
        EXPECT_EQ(0u, *scheduler.globalWorkOffsetZ);

        EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX);
        EXPECT_EQ(1u, *scheduler.localWorkSizeY);
        EXPECT_EQ(1u, *scheduler.localWorkSizeZ);

        EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.localWorkSizeX2);
        EXPECT_EQ(1u, *scheduler.localWorkSizeY2);
        EXPECT_EQ(1u, *scheduler.localWorkSizeZ2);

        // Only check X when it does not point at the shared dummy location.
        if (scheduler.enqueuedLocalWorkSizeX != &Kernel::dummyPatchLocation) {
            EXPECT_EQ((uint32_t)scheduler.getLws(), *scheduler.enqueuedLocalWorkSizeX);
        }
        EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeY);
        EXPECT_EQ(1u, *scheduler.enqueuedLocalWorkSizeZ);

        EXPECT_EQ((uint32_t)(scheduler.getGws() / scheduler.getLws()), *scheduler.numWorkGroupsX);
        EXPECT_EQ(0u, *scheduler.numWorkGroupsY);
        EXPECT_EQ(0u, *scheduler.numWorkGroupsZ);

        // Parse the emitted command stream back into individual commands.
        HardwareParse hwParser;
        hwParser.parseCommands<FamilyType>(commandStream, 0);
        hwParser.findHardwareCommands<FamilyType>();

        ASSERT_NE(hwParser.cmdList.end(), hwParser.itorWalker);

        // Before Walker There must be PC
        PIPE_CONTROL *pc = hwParser.getCommand<PIPE_CONTROL>(hwParser.cmdList.begin(), hwParser.itorWalker);
        ASSERT_NE(nullptr, pc);

        // Interface descriptor load: start address and total length.
        ASSERT_NE(hwParser.cmdList.end(), hwParser.itorMediaInterfaceDescriptorLoad);

        auto *interfaceDescLoad = (MEDIA_INTERFACE_DESCRIPTOR_LOAD *)*hwParser.itorMediaInterfaceDescriptorLoad;

        uint32_t addressOffsetProgrammed = interfaceDescLoad->getInterfaceDescriptorDataStartAddress();
        uint32_t interfaceDescriptorSizeProgrammed = interfaceDescLoad->getInterfaceDescriptorTotalLength();

        uint32_t addressOffsetExpected = pDevQueueHw->colorCalcStateSize;
        uint32_t intDescSizeExpected = DeviceQueue::interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);

        EXPECT_EQ(addressOffsetExpected, addressOffsetProgrammed);
        EXPECT_EQ(intDescSizeExpected, interfaceDescriptorSizeProgrammed);

        // Walker: thread counts, work-group dimensions, SIMD size and start IDs.
        auto *walker = (GPGPU_WALKER *)*hwParser.itorWalker;

        size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
        size_t numWorkgroupsProgrammed[3] = {0, 0, 0};

        uint32_t threadsPerWorkGroup = walker->getThreadWidthCounterMaximum();

        EXPECT_EQ(scheduler.getLws() / scheduler.getKernelInfo().getMaxSimdSize(), threadsPerWorkGroup);

        numWorkgroupsProgrammed[0] = walker->getThreadGroupIdXDimension();
        numWorkgroupsProgrammed[1] = walker->getThreadGroupIdYDimension();
        numWorkgroupsProgrammed[2] = walker->getThreadGroupIdZDimension();

        EXPECT_EQ(workGroups[0], numWorkgroupsProgrammed[0]);
        EXPECT_EQ(workGroups[1], numWorkgroupsProgrammed[1]);
        EXPECT_EQ(workGroups[2], numWorkgroupsProgrammed[2]);

        typename FamilyType::GPGPU_WALKER::SIMD_SIZE simdSize = walker->getSimdSize();
        EXPECT_EQ(FamilyType::GPGPU_WALKER::SIMD_SIZE::SIMD_SIZE_SIMD8, simdSize);

        EXPECT_EQ(0u, walker->getThreadGroupIdStartingX());
        EXPECT_EQ(0u, walker->getThreadGroupIdStartingY());
        EXPECT_EQ(0u, walker->getThreadGroupIdStartingResumeZ());

        uint32_t offsetCrossThreadDataProgrammed = walker->getIndirectDataStartAddress();
        // Reported through gtest (rather than the C assert macro) so the check is
        // active in builds with NDEBUG and a violation fails only this test.
        EXPECT_EQ(0u, offsetCrossThreadDataProgrammed % 64);
        size_t curbeSize = scheduler.getCurbeSize();
        size_t offsetCrossThreadDataExpected = dshHeap->getMaxAvailableSpace() - curbeSize - 4096; // take additional page for padding into account
        EXPECT_EQ((uint32_t)offsetCrossThreadDataExpected, offsetCrossThreadDataProgrammed);

        // NOTE(review): 62 is presumably the interface-descriptor slot reserved
        // for the scheduler - confirm against the DeviceQueue constants.
        EXPECT_EQ(62u, walker->getInterfaceDescriptorOffset());

        // Indirect data length = cross-thread data + per-thread data, aligned up.
        auto numChannels = 3;
        auto sizePerThreadDataTotal = PerThreadDataHelper::getPerThreadDataSizeTotal(scheduler.getKernelInfo().getMaxSimdSize(), numChannels, scheduler.getLws());

        auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
        auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
        EXPECT_EQ(IndirectDataLength, walker->getIndirectDataLength());

        // The batch-buffer start after the walker must target the SLB buffer.
        ASSERT_NE(hwParser.cmdList.end(), hwParser.itorBBStartAfterWalker);
        auto *bbStart = (MI_BATCH_BUFFER_START *)*hwParser.itorBBStartAfterWalker;

        uint64_t slbAddress = pDevQueueHw->getSlbBuffer()->getGpuAddress();
        EXPECT_EQ(slbAddress, bbStart->getBatchBufferStartAddressGraphicsaddress472());
    }
}
// Verifies that dispatchScheduler() leaves the command queue's
// INDIRECT_OBJECT heap untouched: its used size must still be 0 afterwards.
// The checks only run when the device reports getSupportedClVersion() >= 20.
HWTEST_F(ExecutionModelSchedulerFixture, dispatchSchedulerDoesNotUseStandardCmdQIOH) {
    if (pDevice->getSupportedClVersion() >= 20) {
        DeviceQueueHw<FamilyType> *pDevQueueHw = castToObject<DeviceQueueHw<FamilyType>>(pDevQueue);
        // Fail the test instead of crashing if the cast did not yield an object.
        ASSERT_NE(nullptr, pDevQueueHw);
        SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(*context);

        size_t minRequiredSizeForSchedulerIH = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::INSTRUCTION>(*parentKernel);
        size_t minRequiredSizeForSchedulerSSH = KernelCommandsHelper<FamilyType>::template getSizeRequiredForExecutionModel<IndirectHeap::SURFACE_STATE>(*parentKernel);

        // Setup heaps in pCmdQ
        getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &scheduler);
        pCmdQ->getIndirectHeap(IndirectHeap::INSTRUCTION, minRequiredSizeForSchedulerIH);
        pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, minRequiredSizeForSchedulerSSH);

        dispatchScheduler<FamilyType>(
            *pCmdQ,
            *pDevQueueHw,
            scheduler);

        // Nothing may have been consumed from the queue's indirect-object heap.
        auto &ioh = pCmdQ->getIndirectHeap(IndirectHeap::INDIRECT_OBJECT);

        EXPECT_EQ(0u, ioh.getUsed());
    }
}
// With m_SchedulerEarlyReturn set to 1 on the device queue's IGIL queue,
// dispatchScheduler() must still emit a walker but no batch-buffer start
// after it.
HWTEST_F(ParentKernelCommandQueueFixture, dispatchSchedulerWithEarlyReturnSetToFirstInstanceDoesNotPutBBStartCmd) {
    // Nothing to verify when the device reports a CL version below 20.
    if (device->getSupportedClVersion() < 20) {
        return;
    }

    cl_queue_properties properties[3] = {0};
    MockDeviceQueueHw<FamilyType> mockDevQueue(context, device, properties[0]);

    // Request the early-return path via the queue's control block.
    auto *igilCmdQueue = mockDevQueue.getIgilQueue();
    ASSERT_NE(nullptr, igilCmdQueue);
    igilCmdQueue->m_controls.m_SchedulerEarlyReturn = 1;

    SchedulerKernel &schedulerKernel = BuiltIns::getInstance().getSchedulerKernel(*context);

    const size_t requiredIhSize = KernelCommandsHelper<FamilyType>::getSizeRequiredIH(schedulerKernel);
    const size_t requiredSshSize = KernelCommandsHelper<FamilyType>::getSizeRequiredSSH(schedulerKernel);

    // Prepare the command stream and heaps of pCmdQ before dispatching.
    LinearStream &cmdStream = getCommandStream<FamilyType, CL_COMMAND_NDRANGE_KERNEL>(*pCmdQ, false, false, &schedulerKernel);
    pCmdQ->getIndirectHeap(IndirectHeap::INSTRUCTION, requiredIhSize);
    pCmdQ->getIndirectHeap(IndirectHeap::SURFACE_STATE, requiredSshSize);

    dispatchScheduler<FamilyType>(*pCmdQ, mockDevQueue, schedulerKernel);

    HardwareParse parser;
    parser.parseCommands<FamilyType>(cmdStream, 0);
    parser.findHardwareCommands<FamilyType>();

    // Walker present, batch-buffer start after the walker absent.
    EXPECT_NE(parser.cmdList.end(), parser.itorWalker);
    EXPECT_EQ(parser.cmdList.end(), parser.itorBBStartAfterWalker);
}
// With the ForceDispatchScheduler debug flag set, enqueueing the parent kernel
// on a mock command queue must record a kernel flagged as the scheduler kernel
// as the first entry of lastEnqueuedKernels.
// The checks only run when the device reports getSupportedClVersion() >= 20.
HWTEST_F(ExecutionModelSchedulerFixture, ForceDispatchSchedulerEnqueuesSchedulerKernel) {
    if (pDevice->getSupportedClVersion() >= 20) {
        // Presumably restores the debug flags when it goes out of scope.
        DebugManagerStateRestore dbgRestorer;
        DebugManager.flags.ForceDispatchScheduler.set(true);

        size_t offset[3] = {0, 0, 0};
        size_t gws[3] = {1, 1, 1};

        // Owned by unique_ptr so the queue is released even when the fatal
        // assertion below returns early from the test body.
        std::unique_ptr<MockCommandQueueHw<FamilyType>> mockCmdQ(new MockCommandQueueHw<FamilyType>(context, pDevice, 0));

        mockCmdQ->enqueueKernel(parentKernel, 1, offset, gws, gws, 0, nullptr, nullptr);

        // front() on an empty container is undefined behaviour; fail cleanly instead.
        ASSERT_FALSE(mockCmdQ->lastEnqueuedKernels.empty());
        EXPECT_TRUE(mockCmdQ->lastEnqueuedKernels.front()->isSchedulerKernel);
    }
}