Files
compute-runtime/runtime/command_queue/dispatch_walker.h
Chodor, Jaroslaw 044fd1ab81 Fixing IntDescr programing for blocked cmd and MT
Fixing InterfaceDescriptor programming for
blocked commands when MidThread preemption is
enabled
Additionally, fixing couple of tests that block
global preemption enabling in ULTs

Change-Id: I454c9608f8606f23d7446785ac24c7c7d8701ae0
2018-01-17 12:19:07 +01:00

951 lines
44 KiB
C++

/*
* Copyright (c) 2017, Intel Corporation
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
#pragma once
#include "runtime/context/context.h"
#include "runtime/gen9/gen9_cmd_def.h"
#include "runtime/command_queue/local_id_gen.h"
#include "runtime/command_queue/command_queue.h"
#include "runtime/command_queue/dispatch_walker_helper.h"
#include "runtime/command_stream/command_stream_receiver.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/device/device_info.h"
#include "runtime/device_queue/device_queue_hw.h"
#include "runtime/event/perf_counter.h"
#include "runtime/event/user_event.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/debug_helpers.h"
#include "runtime/helpers/kernel_commands.h"
#include "runtime/helpers/task_information.h"
#include "runtime/helpers/validators.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/kernel/kernel.h"
#include "runtime/mem_obj/mem_obj.h"
#include "runtime/memory_manager/graphics_allocation.h"
#include <algorithm>
#include <cmath>
namespace OCLRT {
void computeWorkgroupSize1D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
void computeWorkgroupSizeND(
WorkSizeInfo wsInfo,
size_t workGroupSize[3],
const size_t workItems[3],
const uint32_t workDim);
void computeWorkgroupSize2D(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize);
void computeWorkgroupSizeSquared(
uint32_t maxWorkGroupSize,
size_t workGroupSize[3],
const size_t workItems[3],
size_t simdSize,
const uint32_t workDim);
Vec3<size_t> computeWorkgroupSize(
const DispatchInfo &dispatchInfo);
Vec3<size_t> generateWorkgroupSize(
const DispatchInfo &dispatchInfo);
Vec3<size_t> computeWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
Vec3<size_t> generateWorkgroupsNumber(
const Vec3<size_t> gws,
const Vec3<size_t> lws);
Vec3<size_t> generateWorkgroupsNumber(
const DispatchInfo &dispatchInfo);
Vec3<size_t> canonizeWorkgroup(
Vec3<size_t> workgroup);
inline uint32_t calculateDispatchDim(Vec3<size_t> dispatchSize, Vec3<size_t> dispatchOffset) {
return std::max(1U, std::max(dispatchSize.getSimplifiedDim(), dispatchOffset.getSimplifiedDim()));
}
template <typename GfxFamily>
inline size_t setGpgpuWalkerThreadData(
typename GfxFamily::GPGPU_WALKER *pCmd,
const size_t globalOffsets[3],
const size_t startWorkGroups[3],
const size_t numWorkGroups[3],
const size_t localWorkSizesIn[3],
uint32_t simd) {
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
pCmd->setThreadWidthCounterMaximum((uint32_t)threadsPerWorkGroup);
pCmd->setThreadGroupIdXDimension((uint32_t)numWorkGroups[0]);
pCmd->setThreadGroupIdYDimension((uint32_t)numWorkGroups[1]);
pCmd->setThreadGroupIdZDimension((uint32_t)numWorkGroups[2]);
// compute RightExecutionMask
auto remainderSimdLanes = localWorkSize & (simd - 1);
uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
if (!executionMask)
executionMask = ~executionMask;
pCmd->setRightExecutionMask((uint32_t)executionMask);
pCmd->setBottomExecutionMask((uint32_t)0xffffffff);
pCmd->setSimdSize((typename GPGPU_WALKER::SIMD_SIZE)(simd >> 4));
pCmd->setThreadGroupIdStartingX((uint32_t)startWorkGroups[0]);
pCmd->setThreadGroupIdStartingY((uint32_t)startWorkGroups[1]);
pCmd->setThreadGroupIdStartingResumeZ((uint32_t)startWorkGroups[2]);
return localWorkSize;
}
inline cl_uint computeDimensions(const size_t workItems[3]) {
return (workItems[2] > 1) ? 3 : (workItems[1] > 1) ? 2 : 1;
}
void provideLocalWorkGroupSizeHints(Context *context, uint32_t maxWorkGroupSize, DispatchInfo dispatchInfo);
template <typename SizeAndAllocCalcT, typename... CalcArgsT>
IndirectHeap *allocateIndirectHeap(SizeAndAllocCalcT &&calc, CalcArgsT &&... args) {
size_t alignment = MemoryConstants::pageSize;
size_t size = calc(std::forward<CalcArgsT>(args)...);
return new IndirectHeap(alignedMalloc(size, alignment), size);
}
template <typename GfxFamily>
void dispatchProfilingCommandsStart(
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
// PIPE_CONTROL for global timestamp
uint64_t TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.GlobalStartTS));
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
pPipeControlCmd->setAddress(static_cast<uint32_t>(TimeStampAddress & 0x0000FFFFFFFFULL));
pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(TimeStampAddress >> 32));
//MI_STORE_REGISTER_MEM for context local timestamp
TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextStartTS));
//low part
auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pMICmdLow = MI_STORE_REGISTER_MEM::sInit();
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
pMICmdLow->setMemoryAddress(TimeStampAddress);
//hi part
TimeStampAddress += sizeof(uint32_t);
auto pMICmdHigh = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pMICmdHigh = MI_STORE_REGISTER_MEM::sInit();
pMICmdHigh->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH);
pMICmdHigh->setMemoryAddress(TimeStampAddress);
}
template <typename GfxFamily>
void dispatchProfilingCommandsEnd(
HwTimeStamps &hwTimeStamps,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
// PIPE_CONTROL for global timestamp
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
//MI_STORE_REGISTER_MEM for context local timestamp
uint64_t TimeStampAddress = reinterpret_cast<uint64_t>(&(hwTimeStamps.ContextEndTS));
//low part
auto pMICmdLow = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pMICmdLow = MI_STORE_REGISTER_MEM::sInit();
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
pMICmdLow->setMemoryAddress(TimeStampAddress);
//hi part
TimeStampAddress += sizeof(uint32_t);
auto pMICmdHi = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pMICmdHi = MI_STORE_REGISTER_MEM::sInit();
pMICmdHi->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_HIGH);
pMICmdHi->setMemoryAddress(TimeStampAddress);
}
template <typename GfxFamily>
void dispatchPerfCountersNoopidRegisterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.dmaFenceIdBegin))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.dmaFenceIdEnd));
auto pNoopIdRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pNoopIdRegister = MI_STORE_REGISTER_MEM::sInit();
pNoopIdRegister->setRegisterAddress(OCLRT::INSTR_MMIO_NOOPID);
pNoopIdRegister->setMemoryAddress(address);
}
template <typename GfxFamily>
void dispatchPerfCountersReadFreqRegisterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.coreFreqBegin))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.coreFreqEnd));
auto pCoreFreqRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pCoreFreqRegister = MI_STORE_REGISTER_MEM::sInit();
pCoreFreqRegister->setRegisterAddress(OCLRT::INSTR_MMIO_RPSTAT1);
pCoreFreqRegister->setMemoryAddress(address);
}
template <typename GfxFamily>
void dispatchPerfCountersGeneralPurposeCounterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = 0;
const uint64_t baseAddress = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.gp))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.gp));
// Read General Purpose counters
for (uint16_t i = 0; i < OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
auto pGeneralPurposeRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pGeneralPurposeRegister = MI_STORE_REGISTER_MEM::sInit();
uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint);
pGeneralPurposeRegister->setRegisterAddress(regAddr);
//Gp field is 2*uint64 wide so it can hold 4 uint32
address = baseAddress + i * sizeof(cl_uint);
pGeneralPurposeRegister->setMemoryAddress(address);
}
}
template <typename GfxFamily>
void dispatchPerfCountersUserCounterCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream,
bool start) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = 0;
const uint64_t baseAddr = start ? reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.user))
: reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.user));
uint32_t cmdNum = 0;
uint32_t regAddr = 0;
auto configData = commandQueue.getPerfCountersConfigData();
auto userRegs = &configData->ReadRegs;
for (uint32_t i = 0; i < userRegs->RegsCount; i++) {
auto pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pRegister = MI_STORE_REGISTER_MEM::sInit();
regAddr = userRegs->Reg[i].Offset;
pRegister->setRegisterAddress(regAddr);
//offset between base (low) registers is cl_ulong wide
address = baseAddr + i * sizeof(cl_ulong);
pRegister->setMemoryAddress(address);
cmdNum++;
if (userRegs->Reg[i].BitSize > 32) {
pRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pRegister = MI_STORE_REGISTER_MEM::sInit();
regAddr += sizeof(cl_uint);
pRegister->setRegisterAddress(regAddr);
address += sizeof(cl_uint);
pRegister->setMemoryAddress(address);
cmdNum++;
}
}
}
template <typename GfxFamily>
void dispatchPerfCountersOABufferStateCommands(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
uint64_t address = 0;
//OA Status
auto pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = MI_STORE_REGISTER_MEM::sInit();
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.oaStatus));
pOaRegister->setMemoryAddress(address);
//OA Head
pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = MI_STORE_REGISTER_MEM::sInit();
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.oaHead));
pOaRegister->setMemoryAddress(address);
//OA Tail
pOaRegister = (MI_STORE_REGISTER_MEM *)commandStream->getSpace(sizeof(MI_STORE_REGISTER_MEM));
*pOaRegister = MI_STORE_REGISTER_MEM::sInit();
pOaRegister->setRegisterAddress(INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.oaTail));
pOaRegister->setMemoryAddress(address);
}
template <typename GfxFamily>
void dispatchPerfCountersCommandsStart(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
auto perfCounters = commandQueue.getPerfCounters();
uint32_t currentReportId = perfCounters->getCurrentReportId();
uint64_t address = 0;
//flush command streamer
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
//Store value of NOOPID register
dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
//Read Core Frequency
dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
*pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
pReportPerfCount->setReportId(currentReportId);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportBegin.oa));
pReportPerfCount->setMemoryAddress(address);
//Timestamp: Global Start
pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalStartTS));
pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, true);
commandQueue.sendPerfCountersConfig();
}
template <typename GfxFamily>
void dispatchPerfCountersCommandsEnd(
CommandQueue &commandQueue,
OCLRT::HwPerfCounter &hwPerfCounter,
OCLRT::LinearStream *commandStream) {
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
auto perfCounters = commandQueue.getPerfCounters();
uint32_t currentReportId = perfCounters->getCurrentReportId();
uint64_t address = 0;
//flush command streamer
auto pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
dispatchPerfCountersOABufferStateCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream);
//Timestamp: Global End
pPipeControlCmd = (PIPE_CONTROL *)commandStream->getSpace(sizeof(PIPE_CONTROL));
*pPipeControlCmd = PIPE_CONTROL::sInit();
pPipeControlCmd->setCommandStreamerStallEnable(true);
pPipeControlCmd->setPostSyncOperation(PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWTimeStamp.GlobalEndTS));
pPipeControlCmd->setAddress(static_cast<uint32_t>(address & ((uint64_t)UINT32_MAX)));
pPipeControlCmd->setAddressHigh(static_cast<uint32_t>(address >> 32));
auto pReportPerfCount = (MI_REPORT_PERF_COUNT *)commandStream->getSpace(sizeof(MI_REPORT_PERF_COUNT));
*pReportPerfCount = MI_REPORT_PERF_COUNT::sInit();
pReportPerfCount->setReportId(currentReportId);
address = reinterpret_cast<uint64_t>(&(hwPerfCounter.HWPerfCounters.reportEnd.oa));
pReportPerfCount->setMemoryAddress(address);
dispatchPerfCountersGeneralPurposeCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
//Store value of NOOPID register
dispatchPerfCountersNoopidRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
//Read Core Frequency
dispatchPerfCountersReadFreqRegisterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
dispatchPerfCountersUserCounterCommands<GfxFamily>(commandQueue, hwPerfCounter, commandStream, false);
perfCounters->setCpuTimestamp();
}
template <typename GfxFamily>
void dispatchWalker(
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
HwTimeStamps *hwTimeStamps,
OCLRT::HwPerfCounter *hwPerfCounter,
bool blockQueue = false,
unsigned int commandType = 0) {
OCLRT::LinearStream *commandStream = nullptr;
OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
bool executionModelKernel = multiDispatchInfo.begin()->getKernel()->isParentKernel;
// Allocate command stream and indirect heaps
size_t cmdQInstructionHeapReservedBlockSize = 0;
if (blockQueue) {
using KCH = KernelCommandsHelper<GfxFamily>;
commandStream = new LinearStream(alignedMalloc(MemoryConstants::pageSize, MemoryConstants::pageSize), MemoryConstants::pageSize);
if (executionModelKernel) {
uint32_t offsetDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset();
uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
dsh = allocateIndirectHeap([&multiDispatchInfo, offsetDsh] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo) + KCH::getTotalSizeRequiredIOH(multiDispatchInfo) + offsetDsh; });
dsh->getSpace(colorCalcSize);
ioh = dsh;
} else {
dsh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredDSH(multiDispatchInfo); });
ioh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIOH(multiDispatchInfo); });
}
ish = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredIH(multiDispatchInfo); });
cmdQInstructionHeapReservedBlockSize = commandQueue.getInstructionHeapReservedBlockSize();
ssh = allocateIndirectHeap([&multiDispatchInfo] { return KCH::getTotalSizeRequiredSSH(multiDispatchInfo); });
using UniqueIH = std::unique_ptr<IndirectHeap>;
*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh),
UniqueIH(ish), UniqueIH(ioh), UniqueIH(ssh));
if (executionModelKernel) {
(*blockedCommandsData)->doNotFreeISH = true;
}
} else {
commandStream = &commandQueue.getCS(0);
if (executionModelKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
}
dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
ish = &getIndirectHeap<GfxFamily, IndirectHeap::INSTRUCTION>(commandQueue, multiDispatchInfo);
ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
}
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
uint32_t interfaceDescriptorIndex = 0;
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
size_t numDispatches = multiDispatchInfo.size();
totalInterfaceDescriptorTableSize *= numDispatches;
if (!executionModelKernel) {
dsh->getSpace(totalInterfaceDescriptorTableSize);
} else {
dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
}
// Program media interface descriptor load
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
*commandStream,
offsetInterfaceDescriptorTable,
totalInterfaceDescriptorTableSize);
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
// Determine SIMD size
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
// If we don't have a required WGS, compute one opportunistically
auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
}
//Get dispatch geometry
uint32_t dim = dispatchInfo.getDim();
Vec3<size_t> gws = dispatchInfo.getGWS();
Vec3<size_t> offset = dispatchInfo.getOffset();
Vec3<size_t> swgs = dispatchInfo.getStartOfWorkgroups();
// Compute local workgroup sizes
Vec3<size_t> lws = (dispatchInfo.getLocalWorkgroupSize().x > 0) ? dispatchInfo.getLocalWorkgroupSize() : generateWorkgroupSize(dispatchInfo);
Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
// Compute number of work groups
Vec3<size_t> twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws);
Vec3<size_t> nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs;
// Patch our kernel constants
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
if ((&dispatchInfo == &*multiDispatchInfo.begin()) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
}
*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
*kernel.numWorkGroupsX = static_cast<uint32_t>(twgs.x);
*kernel.numWorkGroupsY = static_cast<uint32_t>(twgs.y);
*kernel.numWorkGroupsZ = static_cast<uint32_t>(twgs.z);
}
*kernel.workDim = dim;
// Send our indirect object data
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
*commandStream,
*dsh,
*ish,
cmdQInstructionHeapReservedBlockSize,
*ioh,
*ssh,
kernel,
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
interfaceDescriptorIndex);
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
// If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
if (hwTimeStamps != nullptr) {
dispatchProfilingCommandsStart<GfxFamily>(*hwTimeStamps, commandStream);
}
if (hwPerfCounter != nullptr) {
dispatchPerfCountersCommandsStart<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
}
}
PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, true);
// Program the walker. Invokes execution so all state should already be programmed
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z};
size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z};
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd);
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
DEBUG_BREAK_IF(nullptr == threadPayload);
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, kernel, false);
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
}
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
if (hwTimeStamps != nullptr) {
dispatchProfilingCommandsEnd<GfxFamily>(*hwTimeStamps, commandStream);
}
if (hwPerfCounter != nullptr) {
dispatchPerfCountersCommandsEnd<GfxFamily>(commandQueue, *hwPerfCounter, commandStream);
}
}
template <typename GfxFamily>
void dispatchWalker(
CommandQueue &commandQueue,
const Kernel &kernel,
cl_uint workDim,
const size_t globalOffsets[3],
const size_t workItems[3],
const size_t *localWorkSizesIn,
cl_uint numEventsInWaitList,
const cl_event *eventWaitList,
KernelOperation **blockedCommandsData,
HwTimeStamps *hwTimeStamps,
HwPerfCounter *hwPerfCounter,
bool blockQueue = false) {
DispatchInfo dispatchInfo(const_cast<Kernel *>(&kernel), workDim, workItems, localWorkSizesIn, globalOffsets);
dispatchWalker<GfxFamily>(commandQueue, dispatchInfo, numEventsInWaitList, eventWaitList,
blockedCommandsData, hwTimeStamps, hwPerfCounter, blockQueue);
}
template <typename GfxFamily>
void dispatchScheduler(
CommandQueue &commandQueue,
DeviceQueueHw<GfxFamily> &devQueueHw,
SchedulerKernel &scheduler) {
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
OCLRT::LinearStream *commandStream = nullptr;
OCLRT::IndirectHeap *dsh = nullptr, *ish = nullptr, *ioh = nullptr, *ssh = nullptr;
commandStream = &commandQueue.getCS(0);
// note : below code assumes that caller to dispatchScheduler "preallocated" memory
// required for execution model in below heap managers
dsh = devQueueHw.getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
ish = &commandQueue.getIndirectHeap(IndirectHeap::INSTRUCTION);
ssh = &commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE);
bool dcFlush = false;
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, dcFlush);
uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
// Program media interface descriptor load
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
*commandStream,
offsetInterfaceDescriptor,
totalInterfaceDescriptorTableSize);
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
// Determine SIMD size
uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
// Patch our kernel constants
*scheduler.globalWorkOffsetX = 0;
*scheduler.globalWorkOffsetY = 0;
*scheduler.globalWorkOffsetZ = 0;
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
*scheduler.globalWorkSizeY = 1;
*scheduler.globalWorkSizeZ = 1;
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY = 1;
*scheduler.localWorkSizeZ = 1;
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
*scheduler.localWorkSizeY2 = 1;
*scheduler.localWorkSizeZ2 = 1;
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
*scheduler.enqueuedLocalWorkSizeY = 1;
*scheduler.enqueuedLocalWorkSizeZ = 1;
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
*scheduler.numWorkGroupsY = 0;
*scheduler.numWorkGroupsZ = 0;
*scheduler.workDim = 1;
// Send our indirect object data
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
// Create indirectHeap for IOH that is located at the end of device enqueue DSH
size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
IndirectHeap indirectObjectHeap(dsh->getBase(), dsh->getMaxAvailableSpace());
indirectObjectHeap.getSpace(curbeOffset);
ioh = &indirectObjectHeap;
auto offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendIndirectState(
*commandStream,
*dsh,
*ish,
0,
*ioh,
*ssh,
scheduler,
simd,
localWorkSizes,
offsetInterfaceDescriptorTable,
interfaceDescriptorIndex);
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, true);
// Program the walker. Invokes execution so all state should already be programmed
auto pGpGpuWalkerCmd = (GPGPU_WALKER *)commandStream->getSpace(sizeof(GPGPU_WALKER));
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
size_t globalOffsets[3] = {0, 0, 0};
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
auto localWorkSize = setGpgpuWalkerThreadData<GfxFamily>(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes, simd);
pGpGpuWalkerCmd->setIndirectDataStartAddress((uint32_t)offsetCrossThreadData);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
pGpGpuWalkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex);
auto threadPayload = scheduler.getKernelInfo().patchInfo.threadPayload;
DEBUG_BREAK_IF(nullptr == threadPayload);
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
auto sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkSize) * localIdSizePerThread;
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
auto sizeCrossThreadData = scheduler.getCrossThreadDataSize();
auto IndirectDataLength = alignUp((uint32_t)(sizeCrossThreadData + sizePerThreadDataTotal), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
pGpGpuWalkerCmd->setIndirectDataLength(IndirectDataLength);
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
applyWADisableLSQCROPERFforOCL<GfxFamily>(commandStream, scheduler, false);
// Do not put BB_START only when returning in first Scheduler run
if (devQueueHw.getSchedulerReturnInstance() != 1) {
commandQueue.getDevice().getCommandStreamReceiver().addPipeControl(*commandStream, true);
// Add BB Start Cmd to the SLB in the Primary Batch Buffer
auto *bbStart = (MI_BATCH_BUFFER_START *)commandStream->getSpace(sizeof(MI_BATCH_BUFFER_START));
*bbStart = MI_BATCH_BUFFER_START::sInit();
bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
}
}
template <typename GfxFamily, unsigned int eventType>
struct EnqueueOperation {
static_assert(eventType != CL_COMMAND_NDRANGE_KERNEL, "for eventType CL_COMMAND_NDRANGE_KERNEL use specialization class");
static_assert(eventType != CL_COMMAND_MARKER, "for eventType CL_COMMAND_MARKER use specialization class");
static_assert(eventType != CL_COMMAND_MIGRATE_MEM_OBJECTS, "for eventType CL_COMMAND_MIGRATE_MEM_OBJECTS use specialization class");
static size_t getTotalSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
size_t size = KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
if (reserveProfilingCmdsSpace) {
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
if (reservePerfCounters) {
//start cmds
//P_C: flush CS & TimeStamp BEGIN
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//end cmds
//P_C: flush CS & TimeStamp END;
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//OA buffer (status head, tail)
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
Device &device = commandQueue.getDevice();
for (auto &dispatchInfo : multiDispatchInfo) {
auto &kernel = *dispatchInfo.getKernel();
size += sizeof(typename GfxFamily::GPGPU_WALKER);
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(&kernel);
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(device);
}
return size;
}
static size_t getSizeRequiredCS(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() +
sizeof(typename GfxFamily::PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
if (reserveProfilingCmdsSpace) {
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
if (reservePerfCounters) {
//start cmds
//P_C: flush CS & TimeStamp BEGIN
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//end cmds
//P_C: flush CS & TimeStamp END;
size += 2 * sizeof(typename GfxFamily::PIPE_CONTROL);
//OA buffer (status head, tail)
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//report perf count
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
//gp registers
size += OCLRT::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//SRM NOOPID & Frequency
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
//user registers
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
}
size += getSizeForWADisableLSQCROPERFforOCL<GfxFamily>(pKernel);
return size;
}
};
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const Kernel *pKernel) {
auto expectedSizeCS = EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, pKernel);
return commandQueue.getCS(expectedSizeCS);
}
template <typename GfxFamily, unsigned int eventType>
LinearStream &getCommandStream(CommandQueue &commandQueue, bool reserveProfilingCmdsSpace, bool reservePerfCounterCmdsSpace, const MultiDispatchInfo &multiDispatchInfo) {
size_t expectedSizeCS = 0;
Kernel *parentKernel = multiDispatchInfo.size() > 0 ? multiDispatchInfo.begin()->getKernel() : nullptr;
for (auto &dispatchInfo : multiDispatchInfo) {
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, dispatchInfo.getKernel());
}
if (parentKernel && parentKernel->isParentKernel) {
SchedulerKernel &scheduler = BuiltIns::getInstance().getSchedulerKernel(parentKernel->getContext());
expectedSizeCS += EnqueueOperation<GfxFamily, eventType>::getSizeRequiredCS(reserveProfilingCmdsSpace, reservePerfCounterCmdsSpace, commandQueue, &scheduler);
}
return commandQueue.getCS(expectedSizeCS);
}
template <typename GfxFamily, IndirectHeap::Type heapType>
IndirectHeap &getIndirectHeap(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
size_t expectedSize = 0;
IndirectHeap *ih = nullptr;
// clang-format off
switch(heapType) {
case IndirectHeap::DYNAMIC_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo); break;
case IndirectHeap::INSTRUCTION: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIH( multiDispatchInfo); break;
case IndirectHeap::INDIRECT_OBJECT: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo); break;
case IndirectHeap::SURFACE_STATE: expectedSize = KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo); break;
}
// clang-format on
if (multiDispatchInfo.begin()->getKernel()->isParentKernel) {
if (heapType == IndirectHeap::INSTRUCTION || heapType == IndirectHeap::SURFACE_STATE) {
expectedSize += KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<heapType>(const_cast<const Kernel &>(*(multiDispatchInfo.begin()->getKernel())));
} else //if (heapType == IndirectHeap::DYNAMIC_STATE || heapType == IndirectHeap::INDIRECT_OBJECT)
{
DeviceQueueHw<GfxFamily> *pDevQueue = castToObject<DeviceQueueHw<GfxFamily>>(commandQueue.getContext().getDefaultDeviceQueue());
DEBUG_BREAK_IF(pDevQueue == nullptr);
ih = pDevQueue->getIndirectHeap(IndirectHeap::DYNAMIC_STATE);
}
}
if (ih == nullptr)
ih = &commandQueue.getIndirectHeap(heapType, expectedSize);
return *ih;
}
} // namespace OCLRT