compute-runtime/runtime/command_queue/dispatch_walker.h

944 lines
44 KiB
C
Raw Normal View History

/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/context/context.h"
#include "runtime/command_queue/local_id_gen.h"
#include "runtime/command_queue/command_queue.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/device/device_info.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/event/perf_counter.h"
#include "runtime/event/user_event.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/helpers/validators.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/kernel/kernel.h"
#include "runtime/mem_obj/mem_obj.h"
#include "runtime/memory_manager/graphics_allocation.h"
#include <algorithm>
#include <cmath>
namespace OCLRT {
void computeWorkgroupSize1D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
void computeWorkgroupSizeND(
WorkSizeInfo wsInfo,
size_t workGroupSize[3],
const size_t workItems[3],
const uint32_t workDim);
void computeWorkgroupSize2D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
void computeWorkgroupSizeSquared(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize,
const uint32_t workDim);
Vec3<size_t> computeWorkgroupSize(
const DispatchInfo &dispatchInfo);
Vec3<size_t> generateWorkgroupSize(
const DispatchInfo &dispatchInfo);
Vec3<size_t> computeWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
Vec3<size_t> generateWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
Vec3<size_t> generateWorkgroupsNumber(
const DispatchInfo &dispatchInfo);
Vec3<size_t> canonizeWorkgroup(
Vec3<size_t> workgroup);
inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));
}
template <typename GfxFamily>
inline size_t setGpgpuWalkerThreadData(
typename GfxFamily::GPGPU_WALKER *pCmd,
const size_t globalOffsets[3],
const size_t startWorkGroups[3],
const size_t numWorkGroups[3],
const size_t localWorkSizesIn[3],
uint32_t simd) {
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
pCmd->setThreadWidthCounterMaximum((uint32_t)threadsPerWorkGroup);
pCmd->setThreadGroupIdXDimension((uint32_t)numWorkGroups[0]);
pCmd->setThreadGroupIdYDimension((uint32_t)numWorkGroups[1]);
pCmd->setThreadGroupIdZDimension((uint32_t)numWorkGroups[2]);
// compute RightExecutionMask
auto remainderSimdLanes = localWorkSize & (simd - 1);
uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
if (!executionMask)
executionMask = ~executionMask;
pCmd->setRightExecutionMask((uint32_t)executionMask);
pCmd->setBottomExecutionMask((uint32_t)0xffffffff);
pCmd->setSimdSize((typename GPGPU_WALKER::SIMD_SIZE)(simd >> 4));
pCmd->setThreadGroupIdStartingX((uint32_t)startWorkGroups[0]);
pCmd->setThreadGroupIdStartingY((uint32_t)startWorkGroups[1]);
pCmd->setThreadGroupIdStartingResumeZ((uint32_t)startWorkGroups[2]);
return localWorkSize;
}
inline cl_uint computeDimensions(const size_t workItems[3]) {
return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
}
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
template <typename SizeAndAllocCalcT, typename... CalcArgsT>
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
size_t alignment = MemoryConstants::pageSize;
size_t size = calc(std::forward<CalcArgsT>(args)...);
return new IndirectHeap(alignedMalloc(size, alignment), size);
}
template <typename GfxFamily>
void dispatchProfilingCommandsStart(
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
// PIPE_CONTROL for global timestamp
uint64_t TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.GlobalStartTS));
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
pPipeControlCmd->setAddress(static_cast<uint32_t>(TimeStampAddress & 0x0000FFFFFFFFULL));
pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(TimeStampAddress >> 32));
//MI_STORE_REGISTER_MEM for context local timestamp
TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextStartTS));
//low part
auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pMICmdLow = MI_STORE_REGISTER_MEM::sInit();
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
pMICmdLow->setMemoryAddress(TimeStampAddress);
}
template <typename GfxFamily>
void dispatchProfilingCommandsEnd(
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
// PIPE_CONTROL for global timestamp
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
//MI_STORE_REGISTER_MEM for context local timestamp
uint64_t TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextEndTS));
//low part
auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pMICmdLow = MI_STORE_REGISTER_MEM::sInit();
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
pMICmdLow->setMemoryAddress(TimeStampAddress);
}
template <typename GfxFamily>
void dispatchPerfCountersNoopidRegisterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.DMAFenceIdBegin))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.DMAFenceIdEnd));
auto pNoopIdRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pNoopIdRegister = MI_STORE_REGISTER_MEM::sInit();
pNoopIdRegister->setRegisterAddress(OCLRT::INSTR_MMIO_NOOPID);
pNoopIdRegister->setMemoryAddress(address);
}
template <typename GfxFamily>
void dispatchPerfCountersReadFreqRegisterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.CoreFreqBegin))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.CoreFreqEnd));
auto pCoreFreqRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pCoreFreqRegister = MI_STORE_REGISTER_MEM::sInit();
pCoreFreqRegister->setRegisterAddress(OCLRT::INSTR_MMIO_RPSTAT1);
pCoreFreqRegister->setMemoryAddress(address);
}
template <typename GfxFamily>
void dispatchPerfCountersGeneralPurposeCounterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = 0;
const uint64_t baseAddress = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.Gp))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Gp));
// Read General Purpose counters
for (uint16_t i = 0; i < OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
auto pGeneralPurposeRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pGeneralPurposeRegister = MI_STORE_REGISTER_MEM::sInit();
uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint);
pGeneralPurposeRegister->setRegisterAddress(regAddr);
//Gp field is 2*uint64 wide so it can hold 4 uint32
address = baseAddress + i * sizeof(cl_uint);
pGeneralPurposeRegister->setMemoryAddress(address);
}
}
template <typename GfxFamily>
void dispatchPerfCountersUserCounterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = 0;
const uint64_t baseAddr = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.User))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.User));
uint32_t cmdNum = 0;
uint32_t regAddr = 0;
auto configData = commandQueue.getPerfCountersConfigData();
auto userRegs = &configData->ReadRegs;
for (uint32_t i = 0; i < userRegs->RegsCount; i++) {
auto pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pRegister = MI_STORE_REGISTER_MEM::sInit();
regAddr = userRegs->Reg[i].Offset;
pRegister->setRegisterAddress(regAddr);
//offset between base (low) registers is cl_ulong wide
address = baseAddr + i * sizeof(cl_ulong);
pRegister->setMemoryAddress(address);
cmdNum++;
if (userRegs->Reg[i].BitSize > 32) {
pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pRegister = MI_STORE_REGISTER_MEM::sInit();
regAddr += sizeof(cl_uint);
pRegister->setRegisterAddress(regAddr);
address += sizeof(cl_uint);
pRegister->setMemoryAddress(address);
cmdNum++;
}
}
}
template <typename GfxFamily>
void dispatchPerfCountersOABufferStateCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = 0;
//OA Status
auto pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = MI_STORE_REGISTER_MEM::sInit();
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.OaStatus));
pOaRegister->setMemoryAddress(address);
//OA Head
pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = MI_STORE_REGISTER_MEM::sInit();
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.OaHead));
pOaRegister->setMemoryAddress(address);
//OA Tail
pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = MI_STORE_REGISTER_MEM::sInit();
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.OaTail));
pOaRegister->setMemoryAddress(address);
}
template <typename GfxFamily>
void dispatchPerfCountersCommandsStart(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
auto perfCounters = commandQueue.getPerfCounters();
uint32_t currentReportId = perfCounters->getCurrentReportId();
uint64_t address = 0;
//flush command streamer
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
//Store value of NOOPID register
dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
//Read Core Frequency
dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
*pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
pReportPerfCount->setReportId(currentReportId);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportBegin.Oa));
pReportPerfCount->setMemoryAddress(address);
//Timestamp: Global Start
pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalStartTS));
pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
commandQueue.sendPerfCountersConfig();
}
template <typename GfxFamily>
void dispatchPerfCountersCommandsEnd(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
auto perfCounters = commandQueue.getPerfCounters();
uint32_t currentReportId = perfCounters->getCurrentReportId();
uint64_t address = 0;
//flush command streamer
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
dispatchPerfCountersOABufferStateCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream);
//Timestamp: Global End
pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalEndTS));
pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
*pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
pReportPerfCount->setReportId(currentReportId);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.HwPerfReportEnd.Oa));
pReportPerfCount->setMemoryAddress(address);
dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
//Store value of NOOPID register
dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
//Read Core Frequency
dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
perfCounters->setCpuTimestamp();
}
template <typename GfxFamily>
void dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
HwTimeStamps *hwTimeStamps,
OCLRT::HwPerfCounter *hwPerfCounter,
bool blockQueue = false,
unsigned int commandType = 0) {
OCLRT::LinearStream *commandStream = nullptr;
OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
bool executionModelKernel = multiDispatchInfo.begin()->getKernel()->isParentKernel;
for (auto &dispatchInfo : multiDispatchInfo) {
// Compute local workgroup sizes
if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
const auto lws = generateWorkgroupSize(dispatchInfo);
const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
}
}
// Allocate command stream and indirect heaps
size_t cmdQInstructionHeapReservedBlockSize = 0;
if (blockQueue) {
using KCH = KernelCommandsHelper<GfxFamily>;
commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
if (executionModelKernel) {
uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
dsh->getSpace(colorCalcSize);
ioh = dsh;
} else {
dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
}
ish = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIH(multiDispatchInfo); });
cmdQInstructionHeapReservedBlockSize = commandQueue.getInstructionHeapReservedBlockSize();
ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });
using UniqueIH = std::unique_ptr<IndirectHeap>;
*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh),
UniqueIH(ish), UniqueIH(ioh), UniqueIH(ssh));
if (executionModelKernel) {
(*blockedCommandsData)->doNotFreeISH = true;
}
} else {
commandStream = &commandQueue.getCS(0);
if (executionModelKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
}
dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
ish = &getIndirectHeap<GfxFamily, IndirectHeap::INSTRUCTION>(commandQueue, multiDispatchInfo);
ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
}
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
uint32_t interfaceDescriptorIndex = 0;
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
size_t numDispatches = multiDispatchInfo.size();
totalInterfaceDescriptorTableSize *= numDispatches;
if (!executionModelKernel) {
dsh->getSpace(totalInterfaceDescriptorTableSize);
} else {
dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
}
// Program media interface descriptor load
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
*commandStream,
offsetInterfaceDescriptorTable,
totalInterfaceDescriptorTableSize);
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
// Determine SIMD size
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
// If we don't have a required WGS, compute one opportunistically
auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
}
//Get dispatch geometry
uint32_t dim = dispatchInfo.getDim();
Vec3<size_t> gws = dispatchInfo.getGWS();
Vec3<size_t> offset = dispatchInfo.getOffset();
Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();
// Compute local workgroup sizes
Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
// Compute number of work groups
Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws);
Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;
// Patch our kernel constants
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
if ((&dispatchInfo == &*multiDispatchInfo.begin()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
}
*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
*kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
*kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
*kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
}
*kernel.workDim = dim;
// Send our indirect object data
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
*commandStream,
*dsh,
*ish,
cmdQInstructionHeapReservedBlockSize,
*ioh,
*ssh,
kernel,
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
interfaceDescriptorIndex);
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
// If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
if (hwTimeStamps != nullptr) {
dispatchProfilingCommandsStart<GfxFamily>(*hwTimeStamps, commandStream);
}
if (hwPerfCounter != nullptr) {
dispatchPerfCountersCommandsStart<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
}
}
PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, true);
// Program the walker. Invokes execution so all state should already be programmed
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
DEBUG_BREAK_IF(nullptr == threadPayload);
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, false);
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
}
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
if (hwTimeStamps != nullptr) {
dispatchProfilingCommandsEnd<GfxFamily>(*hwTimeStamps, commandStream);
}
if (hwPerfCounter != nullptr) {
dispatchPerfCountersCommandsEnd<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
}
}
template <typename GfxFamily>
void dispatchWalker(
CommandQueue &commandQueue,
const Kernel &kernel,
cl_uint workDim,
const size_t globalOffsets[3],
const size_t workItems[3],
const size_t *localWorkSizesIn,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
bool blockQueue = false) {
DispatchInfo dispatchInfo(const_cast<Kernel *>(&kernel), workDim, workItems, localWorkSizesIn, globalOffsets);
dispatchWalker<GfxFamily>(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
blockedCommandsData, hwTimeStamps, hwPerfCounter, blockQueue);
}
template <typename GfxFamily>
void dispatchScheduler(
CommandQueue &commandQueue,
DeviceQueueHw<GfxFamily> &devQueueHw,
SchedulerKernel &scheduler) {
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
OCLRT::LinearStream *commandStream = nullptr;
OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
commandStream = &commandQueue.getCS(0);
// note : below code assumes that caller to dispatchScheduler "preallocated" memory
// required for execution model in below heap managers
dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
ish = &commandQueue.getIndirectHeap(IndirectHeap::INSTRUCTION);
ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);
bool dcFlush = false;
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
// Program media interface descriptor load
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
*commandStream,
offsetInterfaceDescriptor,
totalInterfaceDescriptorTableSize);
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
// Determine SIMD size
uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
// Patch our kernel constants
*scheduler.globalWorkOffsetX = 0;
*scheduler.globalWorkOffsetY = 0;
*scheduler.globalWorkOffsetZ = 0;
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
*scheduler.globalWorkSizeY = 1;
*scheduler.globalWorkSizeZ = 1;
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY = 1;
*scheduler.localWorkSizeZ = 1;
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY2 = 1;
*scheduler.localWorkSizeZ2 = 1;
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.enqueuedLocalWorkSizeY = 1;
*scheduler.enqueuedLocalWorkSizeZ = 1;
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
*scheduler.numWorkGroupsY = 0;
*scheduler.numWorkGroupsZ = 0;
*scheduler.workDim = 1;
// Send our indirect object data
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
// Create indirectHeap for IOH that is located at the end of device enqueue DSH
size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
IndirectHeap indirectObjectHeap(dsh->getBase(), dsh->getMaxAvailableSpace());
indirectObjectHeap.getSpace(curbeOffset);
ioh = &indirectObjectHeap;
auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
*commandStream,
*dsh,
*ish,
0,
*ioh,
*ssh,
scheduler,
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
interfaceDescriptorIndex);
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, true);
// Program the walker. Invokes execution so all state should already be programmed
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
size_t globalOffsets[3] = {0, 0, 0};
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex);
auto threadPayload = scheduler.getKernelInfo().patchInfo.threadPayload;
DEBUG_BREAK_IF(nullptr == threadPayload);
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, false);
// Do not put BB_START only when returning in first Scheduler run
if (devQueueHw.getSchedulerReturnInstance() != 1) {
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);
// Add BB Start Cmd to the SLB in the Primary Batch Buffer
auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
*bbStart = MI_BATCH_BUFFER_START::sInit();
bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
}
}
template <typename GfxFamily, unsigned int eventType>
struct EnqueueOperation {
static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
if (reserveProfilingCmdsSpace) {
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
if (reservePerfCounters) {
//start cmds
//P_C: flush CS & TimeStamp BEGIN
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//end cmds
//P_C: flush CS & TimeStamp END;
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//OA buffer (status head, tail)
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
Device &device = commandQueue.getDevice();
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
size += sizeof(typename GfxFamily::GPGPU_WALKER);
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(&kernel);
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
}
return size;
}
static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
if (reserveProfilingCmdsSpace) {
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
if (reservePerfCounters) {
//start cmds
//P_C: flush CS & TimeStamp BEGIN
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//end cmds
//P_C: flush CS & TimeStamp END;
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//OA buffer (status head, tail)
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
return size;
}
};
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
auto expectedSizeCS = EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel);
return commandQueue.getCS(expectedSizeCS);
}
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
size_t expectedSizeCS = 0;
Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
for (auto &dispatchInfo : multiDispatchInfo) {
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
}
if (parentKernel && parentKernel->isParentKernel) {
SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext());
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);
}
return commandQueue.getCS(expectedSizeCS);
}
template <typename GfxFamily, IndirectHeap::Type heapType>
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
size_t expectedSize = 0;
IndirectHeap *ih = nullptr;
// clang-format off
switch(heapType) {
case IndirectHeap::DYNAMIC_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
case IndirectHeap::INSTRUCTION: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIH( multiDispatchInfo); break;
case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
case IndirectHeap::SURFACE_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
}
// clang-format on
if (multiDispatchInfo.begin()->getKernel()->isParentKernel) {
if (heapType == IndirectHeap::INSTRUCTION || heapType == IndirectHeap::SURFACE_STATE) {
expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
} else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
{
DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
DEBUG_BREAK_IF(pDevQueue == nullptr);
ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
}
}
if (ih == nullptr)
ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
return *ih;
}
} // namespace OCLRT