mirror of
https://github.com/intel/compute-runtime.git
synced 2025-12-30 01:35:20 +08:00
Extract GpgpuWalker related functions to bdw_plus files
Change-Id: I3b2081af8e350d4072da5e1482a4bfc50e06fb6d Related-To: NEO-3016 Signed-off-by: Maciej Dziuban <maciej.dziuban@intel.com>
This commit is contained in:
committed by
sys_ocldev
parent
7218bdb849
commit
608ec933da
@@ -35,11 +35,11 @@ set(RUNTIME_SRCS_COMMAND_QUEUE
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/finish.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/flush.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker_base.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/gpgpu_walker_bdw_plus.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_base.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/hardware_interface_bdw_plus.inl
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.cpp
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.h
|
||||
${CMAKE_CURRENT_SOURCE_DIR}/local_id_gen.inl
|
||||
|
||||
@@ -1,393 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2017-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "runtime/command_queue/command_queue.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/local_id_gen.h"
|
||||
#include "runtime/command_stream/command_stream_receiver.h"
|
||||
#include "runtime/device/device_info.h"
|
||||
#include "runtime/event/perf_counter.h"
|
||||
#include "runtime/event/user_event.h"
|
||||
#include "runtime/helpers/aligned_memory.h"
|
||||
#include "runtime/helpers/debug_helpers.h"
|
||||
#include "runtime/helpers/hw_helper.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/queue_helpers.h"
|
||||
#include "runtime/helpers/validators.h"
|
||||
#include "runtime/indirect_heap/indirect_heap.h"
|
||||
#include "runtime/mem_obj/mem_obj.h"
|
||||
#include "runtime/memory_manager/graphics_allocation.h"
|
||||
#include "runtime/utilities/tag_allocator.h"
|
||||
|
||||
#include "instrumentation.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
|
||||
NEO::LinearStream *pCommandStream,
|
||||
uint32_t aluRegister,
|
||||
uint32_t operation,
|
||||
uint32_t mask) {
|
||||
// Load "Register" value into CS_GPR_R0
|
||||
typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename GfxFamily::MI_MATH MI_MATH;
|
||||
typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
auto pCmd = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
|
||||
*pCmd = GfxFamily::cmdInitLoadRegisterReg;
|
||||
pCmd->setSourceRegisterAddress(aluRegister);
|
||||
pCmd->setDestinationRegisterAddress(CS_GPR_R0);
|
||||
|
||||
// Load "Mask" into CS_GPR_R1
|
||||
typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
auto pCmd2 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
|
||||
*pCmd2 = GfxFamily::cmdInitLoadRegisterImm;
|
||||
pCmd2->setRegisterOffset(CS_GPR_R1);
|
||||
pCmd2->setDataDword(mask);
|
||||
|
||||
// Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands
|
||||
auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
|
||||
// 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
|
||||
pCmd3++;
|
||||
MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);
|
||||
|
||||
// Setup first operand of MI_MATH - load CS_GPR_R0 into register A
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
|
||||
pAluParam++;
|
||||
|
||||
// Setup second operand of MI_MATH - load CS_GPR_R1 into register B
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
|
||||
pAluParam++;
|
||||
|
||||
// Setup third operand of MI_MATH - "Operation" on registers A and B
|
||||
pAluParam->DW0.BitField.ALUOpcode = operation;
|
||||
pAluParam->DW0.BitField.Operand1 = 0;
|
||||
pAluParam->DW0.BitField.Operand2 = 0;
|
||||
pAluParam++;
|
||||
|
||||
// Setup fourth operand of MI_MATH - store result into CS_GPR_R0
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
|
||||
|
||||
// LOAD value of CS_GPR_R0 into "Register"
|
||||
auto pCmd4 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
|
||||
*pCmd4 = GfxFamily::cmdInitLoadRegisterReg;
|
||||
pCmd4->setSourceRegisterAddress(CS_GPR_R0);
|
||||
pCmd4->setDestinationRegisterAddress(aluRegister);
|
||||
|
||||
// Add PIPE_CONTROL to flush caches
|
||||
auto pCmd5 = pCommandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pCmd5 = GfxFamily::cmdInitPipeControl;
|
||||
pCmd5->setCommandStreamerStallEnable(true);
|
||||
pCmd5->setDcFlushEnable(true);
|
||||
pCmd5->setTextureCacheInvalidationEnable(true);
|
||||
pCmd5->setPipeControlFlushEnable(true);
|
||||
pCmd5->setStateCacheInvalidationEnable(true);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
|
||||
TagNode<HwTimeStamps> &hwTimeStamps,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
|
||||
// PIPE_CONTROL for global timestamp
|
||||
uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, GlobalStartTS);
|
||||
|
||||
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, timeStampAddress, 0llu, false);
|
||||
|
||||
//MI_STORE_REGISTER_MEM for context local timestamp
|
||||
timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextStartTS);
|
||||
|
||||
//low part
|
||||
auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
|
||||
*pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
|
||||
adjustMiStoreRegMemMode(pMICmdLow);
|
||||
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
|
||||
pMICmdLow->setMemoryAddress(timeStampAddress);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
|
||||
TagNode<HwTimeStamps> &hwTimeStamps,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
|
||||
// PIPE_CONTROL for global timestamp
|
||||
auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
|
||||
//MI_STORE_REGISTER_MEM for context local timestamp
|
||||
uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS);
|
||||
|
||||
//low part
|
||||
auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
|
||||
*pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
|
||||
adjustMiStoreRegMemMode(pMICmdLow);
|
||||
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
|
||||
pMICmdLow->setMemoryAddress(timeStampAddress);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(
|
||||
LinearStream *commandStream,
|
||||
uint64_t memoryAddress,
|
||||
uint32_t registerAddress) {
|
||||
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
|
||||
auto pCmd = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
|
||||
*pCmd = GfxFamily::cmdInitStoreRegisterMem;
|
||||
pCmd->setRegisterAddress(registerAddress);
|
||||
pCmd->setMemoryAddress(memoryAddress);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(
|
||||
LinearStream *commandStream,
|
||||
uint64_t baseAddress) {
|
||||
|
||||
// Read General Purpose counters
|
||||
for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
|
||||
uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint);
|
||||
//Gp field is 2*uint64 wide so it can hold 4 uint32
|
||||
uint64_t address = baseAddress + i * sizeof(cl_uint);
|
||||
dispatchStoreRegisterCommand(commandStream, address, regAddr);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
LinearStream *commandStream,
|
||||
uint64_t baseAddress) {
|
||||
|
||||
auto userRegs = &commandQueue.getPerfCountersConfigData()->ReadRegs;
|
||||
|
||||
for (uint32_t i = 0; i < userRegs->RegsCount; i++) {
|
||||
uint32_t regAddr = userRegs->Reg[i].Offset;
|
||||
//offset between base (low) registers is cl_ulong wide
|
||||
uint64_t address = baseAddress + i * sizeof(cl_ulong);
|
||||
dispatchStoreRegisterCommand(commandStream, address, regAddr);
|
||||
|
||||
if (userRegs->Reg[i].BitSize > 32) {
|
||||
dispatchStoreRegisterCommand(commandStream, address + sizeof(cl_uint), regAddr + sizeof(cl_uint));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(
|
||||
TagNode<HwPerfCounter> &hwPerfCounter,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
|
||||
dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
|
||||
dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
|
||||
CommandQueue &commandQueue,
|
||||
TagNode<HwPerfCounter> &hwPerfCounter,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
|
||||
|
||||
auto perfCounters = commandQueue.getPerfCounters();
|
||||
|
||||
uint32_t currentReportId = perfCounters->getCurrentReportId();
|
||||
uint64_t address = 0;
|
||||
//flush command streamer
|
||||
auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
|
||||
//Store value of NOOPID register
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID);
|
||||
|
||||
//Read Core Frequency
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp));
|
||||
|
||||
auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>();
|
||||
*pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
|
||||
pReportPerfCount->setReportId(currentReportId);
|
||||
address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Oa);
|
||||
pReportPerfCount->setMemoryAddress(address);
|
||||
|
||||
address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalStartTS);
|
||||
|
||||
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User));
|
||||
|
||||
commandQueue.sendPerfCountersConfig();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
|
||||
CommandQueue &commandQueue,
|
||||
TagNode<HwPerfCounter> &hwPerfCounter,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
|
||||
|
||||
auto perfCounters = commandQueue.getPerfCounters();
|
||||
|
||||
uint32_t currentReportId = perfCounters->getCurrentReportId();
|
||||
|
||||
//flush command streamer
|
||||
auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(hwPerfCounter, commandStream);
|
||||
|
||||
//Timestamp: Global End
|
||||
uint64_t address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalEndTS);
|
||||
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
|
||||
|
||||
auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>();
|
||||
*pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
|
||||
pReportPerfCount->setReportId(currentReportId);
|
||||
address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Oa);
|
||||
pReportPerfCount->setMemoryAddress(address);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp));
|
||||
|
||||
//Store value of NOOPID register
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID);
|
||||
|
||||
//Read Core Frequency
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User));
|
||||
|
||||
perfCounters->setCpuTimestamp();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
return (size_t)0;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd) {
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
|
||||
size_t expectedSizeCS = 0;
|
||||
Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
|
||||
if (multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->isAuxTranslationRequired()) {
|
||||
expectedSizeCS += sizeof(PIPE_CONTROL);
|
||||
}
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel());
|
||||
if (dispatchInfo.isPipeControlRequired()) {
|
||||
expectedSizeCS += sizeof(PIPE_CONTROL);
|
||||
}
|
||||
}
|
||||
if (parentKernel) {
|
||||
SchedulerKernel &scheduler = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parentKernel->getContext());
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, &scheduler);
|
||||
}
|
||||
if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();
|
||||
expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDeps);
|
||||
}
|
||||
return expectedSizeCS;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
|
||||
if (isCommandWithoutKernel(cmdType)) {
|
||||
return EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
|
||||
} else {
|
||||
return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
|
||||
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(pKernel) +
|
||||
sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
|
||||
size += KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(commandQueue, pKernel, 0U, 0U);
|
||||
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
if (reservePerfCounters) {
|
||||
//start cmds
|
||||
//P_C: flush CS & TimeStamp BEGIN
|
||||
size += 2 * sizeof(PIPE_CONTROL);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//gp registers
|
||||
size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
|
||||
//end cmds
|
||||
//P_C: flush CS & TimeStamp END;
|
||||
size += 2 * sizeof(PIPE_CONTROL);
|
||||
//OA buffer (status head, tail)
|
||||
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//gp registers
|
||||
size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) {
|
||||
size_t size = 0;
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite() {
|
||||
return sizeof(PIPE_CONTROL);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
@@ -1,183 +1,347 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2019 Intel Corporation
|
||||
* Copyright (C) 2017-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "runtime/command_queue/command_queue.h"
|
||||
#include "runtime/command_queue/gpgpu_walker.h"
|
||||
#include "runtime/command_queue/local_id_gen.h"
|
||||
#include "runtime/command_stream/command_stream_receiver.h"
|
||||
#include "runtime/device/device_info.h"
|
||||
#include "runtime/event/perf_counter.h"
|
||||
#include "runtime/event/user_event.h"
|
||||
#include "runtime/helpers/aligned_memory.h"
|
||||
#include "runtime/helpers/debug_helpers.h"
|
||||
#include "runtime/helpers/hw_helper.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/queue_helpers.h"
|
||||
#include "runtime/helpers/validators.h"
|
||||
#include "runtime/indirect_heap/indirect_heap.h"
|
||||
#include "runtime/mem_obj/mem_obj.h"
|
||||
#include "runtime/memory_manager/graphics_allocation.h"
|
||||
#include "runtime/utilities/tag_allocator.h"
|
||||
|
||||
#include "instrumentation.h"
|
||||
|
||||
#include <algorithm>
|
||||
#include <cmath>
|
||||
|
||||
namespace NEO {
|
||||
|
||||
// Performs ReadModifyWrite operation on value of a register: Register = Register Operation Mask
|
||||
template <typename GfxFamily>
|
||||
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
WALKER_TYPE<GfxFamily> *walkerCmd,
|
||||
const size_t globalOffsets[3],
|
||||
const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3],
|
||||
const size_t localWorkSizesIn[3],
|
||||
uint32_t simd,
|
||||
uint32_t workDim,
|
||||
bool localIdsGenerationByRuntime,
|
||||
bool inlineDataProgrammingRequired,
|
||||
const iOpenCL::SPatchThreadPayload &threadPayload) {
|
||||
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
|
||||
void GpgpuWalkerHelper<GfxFamily>::addAluReadModifyWriteRegister(
|
||||
NEO::LinearStream *pCommandStream,
|
||||
uint32_t aluRegister,
|
||||
uint32_t operation,
|
||||
uint32_t mask) {
|
||||
// Load "Register" value into CS_GPR_R0
|
||||
typedef typename GfxFamily::MI_LOAD_REGISTER_REG MI_LOAD_REGISTER_REG;
|
||||
typedef typename GfxFamily::MI_MATH MI_MATH;
|
||||
typedef typename GfxFamily::MI_MATH_ALU_INST_INLINE MI_MATH_ALU_INST_INLINE;
|
||||
auto pCmd = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
|
||||
*pCmd = GfxFamily::cmdInitLoadRegisterReg;
|
||||
pCmd->setSourceRegisterAddress(aluRegister);
|
||||
pCmd->setDestinationRegisterAddress(CS_GPR_R0);
|
||||
|
||||
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
|
||||
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
|
||||
// Load "Mask" into CS_GPR_R1
|
||||
typedef typename GfxFamily::MI_LOAD_REGISTER_IMM MI_LOAD_REGISTER_IMM;
|
||||
auto pCmd2 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_IMM>();
|
||||
*pCmd2 = GfxFamily::cmdInitLoadRegisterImm;
|
||||
pCmd2->setRegisterOffset(CS_GPR_R1);
|
||||
pCmd2->setDataDword(mask);
|
||||
|
||||
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
|
||||
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
|
||||
walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
|
||||
// Add instruction MI_MATH with 4 MI_MATH_ALU_INST_INLINE operands
|
||||
auto pCmd3 = reinterpret_cast<uint32_t *>(pCommandStream->getSpace(sizeof(MI_MATH) + NUM_ALU_INST_FOR_READ_MODIFY_WRITE * sizeof(MI_MATH_ALU_INST_INLINE)));
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.Value = 0x0;
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionType = MI_MATH::COMMAND_TYPE_MI_COMMAND;
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.InstructionOpcode = MI_MATH::MI_COMMAND_OPCODE_MI_MATH;
|
||||
// 0x3 - 5 Dwords length cmd (-2): 1 for MI_MATH, 4 for MI_MATH_ALU_INST_INLINE
|
||||
reinterpret_cast<MI_MATH *>(pCmd3)->DW0.BitField.DwordLength = NUM_ALU_INST_FOR_READ_MODIFY_WRITE - 1;
|
||||
pCmd3++;
|
||||
MI_MATH_ALU_INST_INLINE *pAluParam = reinterpret_cast<MI_MATH_ALU_INST_INLINE *>(pCmd3);
|
||||
|
||||
// compute executionMask - to tell which SIMD lines are active within thread
|
||||
auto remainderSimdLanes = localWorkSize & (simd - 1);
|
||||
uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
|
||||
if (!executionMask)
|
||||
executionMask = ~executionMask;
|
||||
// Setup first operand of MI_MATH - load CS_GPR_R0 into register A
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCA;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_0;
|
||||
pAluParam++;
|
||||
|
||||
using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
|
||||
// Setup second operand of MI_MATH - load CS_GPR_R1 into register B
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_LOAD;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_SRCB;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_1;
|
||||
pAluParam++;
|
||||
|
||||
walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
|
||||
walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
|
||||
walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
|
||||
// Setup third operand of MI_MATH - "Operation" on registers A and B
|
||||
pAluParam->DW0.BitField.ALUOpcode = operation;
|
||||
pAluParam->DW0.BitField.Operand1 = 0;
|
||||
pAluParam->DW0.BitField.Operand2 = 0;
|
||||
pAluParam++;
|
||||
|
||||
walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
|
||||
walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
|
||||
walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
|
||||
// Setup fourth operand of MI_MATH - store result into CS_GPR_R0
|
||||
pAluParam->DW0.BitField.ALUOpcode = ALU_OPCODE_STORE;
|
||||
pAluParam->DW0.BitField.Operand1 = ALU_REGISTER_R_0;
|
||||
pAluParam->DW0.BitField.Operand2 = ALU_REGISTER_R_ACCU;
|
||||
|
||||
return localWorkSize;
|
||||
// LOAD value of CS_GPR_R0 into "Register"
|
||||
auto pCmd4 = pCommandStream->getSpaceForCmd<MI_LOAD_REGISTER_REG>();
|
||||
*pCmd4 = GfxFamily::cmdInitLoadRegisterReg;
|
||||
pCmd4->setSourceRegisterAddress(CS_GPR_R0);
|
||||
pCmd4->setDestinationRegisterAddress(aluRegister);
|
||||
|
||||
// Add PIPE_CONTROL to flush caches
|
||||
auto pCmd5 = pCommandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pCmd5 = GfxFamily::cmdInitPipeControl;
|
||||
pCmd5->setCommandStreamerStallEnable(true);
|
||||
pCmd5->setDcFlushEnable(true);
|
||||
pCmd5->setTextureCacheInvalidationEnable(true);
|
||||
pCmd5->setPipeControlFlushEnable(true);
|
||||
pCmd5->setStateCacheInvalidationEnable(true);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
|
||||
LinearStream &commandStream,
|
||||
DeviceQueueHw<GfxFamily> &devQueueHw,
|
||||
PreemptionMode preemptionMode,
|
||||
SchedulerKernel &scheduler,
|
||||
IndirectHeap *ssh,
|
||||
IndirectHeap *dsh) {
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(
|
||||
TagNode<HwTimeStamps> &hwTimeStamps,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
|
||||
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
|
||||
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
|
||||
bool dcFlush = false;
|
||||
PipeControlHelper<GfxFamily>::addPipeControl(commandStream, dcFlush);
|
||||
// PIPE_CONTROL for global timestamp
|
||||
uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, GlobalStartTS);
|
||||
|
||||
uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
|
||||
const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
|
||||
const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
|
||||
const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, timeStampAddress, 0llu, false);
|
||||
|
||||
// Program media interface descriptor load
|
||||
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
||||
commandStream,
|
||||
offsetInterfaceDescriptor,
|
||||
totalInterfaceDescriptorTableSize);
|
||||
//MI_STORE_REGISTER_MEM for context local timestamp
|
||||
timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextStartTS);
|
||||
|
||||
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
|
||||
//low part
|
||||
auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
|
||||
*pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
|
||||
adjustMiStoreRegMemMode(pMICmdLow);
|
||||
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
|
||||
pMICmdLow->setMemoryAddress(timeStampAddress);
|
||||
}
|
||||
|
||||
// Determine SIMD size
|
||||
uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
|
||||
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(
|
||||
TagNode<HwTimeStamps> &hwTimeStamps,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
// Patch our kernel constants
|
||||
*scheduler.globalWorkOffsetX = 0;
|
||||
*scheduler.globalWorkOffsetY = 0;
|
||||
*scheduler.globalWorkOffsetZ = 0;
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
|
||||
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
|
||||
*scheduler.globalWorkSizeY = 1;
|
||||
*scheduler.globalWorkSizeZ = 1;
|
||||
// PIPE_CONTROL for global timestamp
|
||||
auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
|
||||
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
|
||||
*scheduler.localWorkSizeY = 1;
|
||||
*scheduler.localWorkSizeZ = 1;
|
||||
//MI_STORE_REGISTER_MEM for context local timestamp
|
||||
uint64_t timeStampAddress = hwTimeStamps.getGpuAddress() + offsetof(HwTimeStamps, ContextEndTS);
|
||||
|
||||
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
|
||||
*scheduler.localWorkSizeY2 = 1;
|
||||
*scheduler.localWorkSizeZ2 = 1;
|
||||
//low part
|
||||
auto pMICmdLow = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
|
||||
*pMICmdLow = GfxFamily::cmdInitStoreRegisterMem;
|
||||
adjustMiStoreRegMemMode(pMICmdLow);
|
||||
pMICmdLow->setRegisterAddress(GP_THREAD_TIME_REG_ADDRESS_OFFSET_LOW);
|
||||
pMICmdLow->setMemoryAddress(timeStampAddress);
|
||||
}
|
||||
|
||||
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
|
||||
*scheduler.enqueuedLocalWorkSizeY = 1;
|
||||
*scheduler.enqueuedLocalWorkSizeZ = 1;
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(
|
||||
LinearStream *commandStream,
|
||||
uint64_t memoryAddress,
|
||||
uint32_t registerAddress) {
|
||||
|
||||
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
|
||||
*scheduler.numWorkGroupsY = 0;
|
||||
*scheduler.numWorkGroupsZ = 0;
|
||||
using MI_STORE_REGISTER_MEM = typename GfxFamily::MI_STORE_REGISTER_MEM;
|
||||
|
||||
*scheduler.workDim = 1;
|
||||
auto pCmd = commandStream->getSpaceForCmd<MI_STORE_REGISTER_MEM>();
|
||||
*pCmd = GfxFamily::cmdInitStoreRegisterMem;
|
||||
pCmd->setRegisterAddress(registerAddress);
|
||||
pCmd->setMemoryAddress(memoryAddress);
|
||||
}
|
||||
|
||||
// Send our indirect object data
|
||||
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
|
||||
size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(
|
||||
LinearStream *commandStream,
|
||||
uint64_t baseAddress) {
|
||||
|
||||
// Create indirectHeap for IOH that is located at the end of device enqueue DSH
|
||||
size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
|
||||
IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
|
||||
indirectObjectHeap.getSpace(curbeOffset);
|
||||
IndirectHeap *ioh = &indirectObjectHeap;
|
||||
|
||||
// Program the walker. Invokes execution so all state should already be programmed
|
||||
auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
|
||||
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
||||
|
||||
bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
|
||||
bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
|
||||
KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
commandStream,
|
||||
*dsh,
|
||||
*ioh,
|
||||
*ssh,
|
||||
scheduler,
|
||||
simd,
|
||||
localWorkSizes,
|
||||
offsetInterfaceDescriptorTable,
|
||||
interfaceDescriptorIndex,
|
||||
preemptionMode,
|
||||
pGpGpuWalkerCmd,
|
||||
nullptr,
|
||||
localIdsGenerationByRuntime);
|
||||
|
||||
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);
|
||||
|
||||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
|
||||
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes,
|
||||
simd, 1, localIdsGenerationByRuntime, inlineDataProgrammingRequired,
|
||||
*scheduler.getKernelInfo().patchInfo.threadPayload);
|
||||
|
||||
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, false);
|
||||
|
||||
// Do not put BB_START only when returning in first Scheduler run
|
||||
if (devQueueHw.getSchedulerReturnInstance() != 1) {
|
||||
|
||||
PipeControlHelper<GfxFamily>::addPipeControl(commandStream, true);
|
||||
|
||||
// Add BB Start Cmd to the SLB in the Primary Batch Buffer
|
||||
auto *bbStart = static_cast<MI_BATCH_BUFFER_START *>(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START)));
|
||||
*bbStart = GfxFamily::cmdInitBatchBufferStart;
|
||||
bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
|
||||
uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
|
||||
bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
|
||||
// Read General Purpose counters
|
||||
for (auto i = 0u; i < NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT; i++) {
|
||||
uint32_t regAddr = INSTR_GFX_OFFSETS::INSTR_PERF_CNT_1_DW0 + i * sizeof(cl_uint);
|
||||
//Gp field is 2*uint64 wide so it can hold 4 uint32
|
||||
uint64_t address = baseAddress + i * sizeof(cl_uint);
|
||||
dispatchStoreRegisterCommand(commandStream, address, regAddr);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
|
||||
LinearStream *cmdStream,
|
||||
WALKER_TYPE<GfxFamily> *walkerCmd,
|
||||
TagNode<TimestampPacketStorage> *timestampPacketNode,
|
||||
TimestampPacketStorage::WriteOperationType writeOperationType) {
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(
|
||||
CommandQueue &commandQueue,
|
||||
LinearStream *commandStream,
|
||||
uint64_t baseAddress) {
|
||||
|
||||
if (TimestampPacketStorage::WriteOperationType::AfterWalker == writeOperationType) {
|
||||
uint64_t address = timestampPacketNode->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd);
|
||||
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, address, 0, false);
|
||||
auto userRegs = &commandQueue.getPerfCountersConfigData()->ReadRegs;
|
||||
|
||||
for (uint32_t i = 0; i < userRegs->RegsCount; i++) {
|
||||
uint32_t regAddr = userRegs->Reg[i].Offset;
|
||||
//offset between base (low) registers is cl_ulong wide
|
||||
uint64_t address = baseAddress + i * sizeof(cl_ulong);
|
||||
dispatchStoreRegisterCommand(commandStream, address, regAddr);
|
||||
|
||||
if (userRegs->Reg[i].BitSize > 32) {
|
||||
dispatchStoreRegisterCommand(commandStream, address + sizeof(cl_uint), regAddr + sizeof(cl_uint));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(
|
||||
TagNode<HwPerfCounter> &hwPerfCounter,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaStatus), INSTR_GFX_OFFSETS::INSTR_OA_STATUS);
|
||||
dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaHead), INSTR_GFX_OFFSETS::INSTR_OA_HEAD_PTR);
|
||||
dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.OaTail), INSTR_GFX_OFFSETS::INSTR_OA_TAIL_PTR);
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(
|
||||
CommandQueue &commandQueue,
|
||||
TagNode<HwPerfCounter> &hwPerfCounter,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
|
||||
|
||||
auto perfCounters = commandQueue.getPerfCounters();
|
||||
|
||||
uint32_t currentReportId = perfCounters->getCurrentReportId();
|
||||
uint64_t address = 0;
|
||||
//flush command streamer
|
||||
auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
|
||||
//Store value of NOOPID register
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdBegin), INSTR_MMIO_NOOPID);
|
||||
|
||||
//Read Core Frequency
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqBegin), INSTR_MMIO_RPSTAT1);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Gp));
|
||||
|
||||
auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>();
|
||||
*pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
|
||||
pReportPerfCount->setReportId(currentReportId);
|
||||
address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.Oa);
|
||||
pReportPerfCount->setMemoryAddress(address);
|
||||
|
||||
address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalStartTS);
|
||||
|
||||
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportBegin.User));
|
||||
|
||||
commandQueue.sendPerfCountersConfig();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(
|
||||
CommandQueue &commandQueue,
|
||||
TagNode<HwPerfCounter> &hwPerfCounter,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
using MI_REPORT_PERF_COUNT = typename GfxFamily::MI_REPORT_PERF_COUNT;
|
||||
|
||||
auto perfCounters = commandQueue.getPerfCounters();
|
||||
|
||||
uint32_t currentReportId = perfCounters->getCurrentReportId();
|
||||
|
||||
//flush command streamer
|
||||
auto pPipeControlCmd = commandStream->getSpaceForCmd<PIPE_CONTROL>();
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersOABufferStateCommands(hwPerfCounter, commandStream);
|
||||
|
||||
//Timestamp: Global End
|
||||
uint64_t address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWTimeStamp.GlobalEndTS);
|
||||
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(commandStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_TIMESTAMP, address, 0llu, false);
|
||||
|
||||
auto pReportPerfCount = commandStream->getSpaceForCmd<MI_REPORT_PERF_COUNT>();
|
||||
*pReportPerfCount = GfxFamily::cmdInitReportPerfCount;
|
||||
pReportPerfCount->setReportId(currentReportId);
|
||||
address = hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Oa);
|
||||
pReportPerfCount->setMemoryAddress(address);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersGeneralPurposeCounterCommands(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.Gp));
|
||||
|
||||
//Store value of NOOPID register
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.DMAFenceIdEnd), INSTR_MMIO_NOOPID);
|
||||
|
||||
//Read Core Frequency
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchStoreRegisterCommand(commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.CoreFreqEnd), INSTR_MMIO_RPSTAT1);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersUserCounterCommands(commandQueue, commandStream, hwPerfCounter.getGpuAddress() + offsetof(HwPerfCounter, HWPerfCounters.HwPerfReportEnd.User));
|
||||
|
||||
perfCounters->setCpuTimestamp();
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(NEO::LinearStream *pCommandStream, const Kernel &kernel, bool disablePerfMode) {
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(const Kernel *pKernel) {
|
||||
return (size_t)0;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::adjustMiStoreRegMemMode(MI_STORE_REG_MEM<GfxFamily> *storeCmd) {
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getTotalSizeRequiredCS(uint32_t eventType, const CsrDependencies &csrDeps, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo) {
|
||||
size_t expectedSizeCS = 0;
|
||||
Kernel *parentKernel = multiDispatchInfo.peekParentKernel();
|
||||
if (multiDispatchInfo.peekMainKernel() && multiDispatchInfo.peekMainKernel()->isAuxTranslationRequired()) {
|
||||
expectedSizeCS += sizeof(PIPE_CONTROL);
|
||||
}
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, dispatchInfo.getKernel());
|
||||
if (dispatchInfo.isPipeControlRequired()) {
|
||||
expectedSizeCS += sizeof(PIPE_CONTROL);
|
||||
}
|
||||
}
|
||||
if (parentKernel) {
|
||||
SchedulerKernel &scheduler = commandQueue.getDevice().getExecutionEnvironment()->getBuiltIns()->getSchedulerKernel(parentKernel->getContext());
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredCS(eventType, reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, &scheduler);
|
||||
}
|
||||
if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
expectedSizeCS += EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite();
|
||||
expectedSizeCS += TimestampPacketHelper::getRequiredCmdStreamSize<GfxFamily>(csrDeps);
|
||||
}
|
||||
return expectedSizeCS;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCS(uint32_t cmdType, bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
|
||||
if (isCommandWithoutKernel(cmdType)) {
|
||||
return EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue);
|
||||
} else {
|
||||
return EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(reserveProfilingCmdsSpace, reservePerfCounters, commandQueue, pKernel);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSNonKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue) {
|
||||
size_t size = 0;
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * sizeof(PIPE_CONTROL) + 4 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
return size;
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
229
runtime/command_queue/gpgpu_walker_bdw_plus.inl
Normal file
229
runtime/command_queue/gpgpu_walker_bdw_plus.inl
Normal file
@@ -0,0 +1,229 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "runtime/command_queue/gpgpu_walker_base.inl"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
|
||||
WALKER_TYPE<GfxFamily> *walkerCmd,
|
||||
const size_t globalOffsets[3],
|
||||
const size_t startWorkGroups[3],
|
||||
const size_t numWorkGroups[3],
|
||||
const size_t localWorkSizesIn[3],
|
||||
uint32_t simd,
|
||||
uint32_t workDim,
|
||||
bool localIdsGenerationByRuntime,
|
||||
bool inlineDataProgrammingRequired,
|
||||
const iOpenCL::SPatchThreadPayload &threadPayload) {
|
||||
auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
|
||||
|
||||
auto threadsPerWorkGroup = getThreadsPerWG(simd, localWorkSize);
|
||||
walkerCmd->setThreadWidthCounterMaximum(static_cast<uint32_t>(threadsPerWorkGroup));
|
||||
|
||||
walkerCmd->setThreadGroupIdXDimension(static_cast<uint32_t>(numWorkGroups[0]));
|
||||
walkerCmd->setThreadGroupIdYDimension(static_cast<uint32_t>(numWorkGroups[1]));
|
||||
walkerCmd->setThreadGroupIdZDimension(static_cast<uint32_t>(numWorkGroups[2]));
|
||||
|
||||
// compute executionMask - to tell which SIMD lines are active within thread
|
||||
auto remainderSimdLanes = localWorkSize & (simd - 1);
|
||||
uint64_t executionMask = (1ull << remainderSimdLanes) - 1;
|
||||
if (!executionMask)
|
||||
executionMask = ~executionMask;
|
||||
|
||||
using SIMD_SIZE = typename WALKER_TYPE<GfxFamily>::SIMD_SIZE;
|
||||
|
||||
walkerCmd->setRightExecutionMask(static_cast<uint32_t>(executionMask));
|
||||
walkerCmd->setBottomExecutionMask(static_cast<uint32_t>(0xffffffff));
|
||||
walkerCmd->setSimdSize(static_cast<SIMD_SIZE>(simd >> 4));
|
||||
|
||||
walkerCmd->setThreadGroupIdStartingX(static_cast<uint32_t>(startWorkGroups[0]));
|
||||
walkerCmd->setThreadGroupIdStartingY(static_cast<uint32_t>(startWorkGroups[1]));
|
||||
walkerCmd->setThreadGroupIdStartingResumeZ(static_cast<uint32_t>(startWorkGroups[2]));
|
||||
|
||||
return localWorkSize;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
|
||||
LinearStream &commandStream,
|
||||
DeviceQueueHw<GfxFamily> &devQueueHw,
|
||||
PreemptionMode preemptionMode,
|
||||
SchedulerKernel &scheduler,
|
||||
IndirectHeap *ssh,
|
||||
IndirectHeap *dsh) {
|
||||
|
||||
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
|
||||
using GPGPU_WALKER = typename GfxFamily::GPGPU_WALKER;
|
||||
using MI_BATCH_BUFFER_START = typename GfxFamily::MI_BATCH_BUFFER_START;
|
||||
|
||||
bool dcFlush = false;
|
||||
PipeControlHelper<GfxFamily>::addPipeControl(commandStream, dcFlush);
|
||||
|
||||
uint32_t interfaceDescriptorIndex = devQueueHw.schedulerIDIndex;
|
||||
const size_t offsetInterfaceDescriptorTable = devQueueHw.colorCalcStateSize;
|
||||
const size_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable;
|
||||
const size_t totalInterfaceDescriptorTableSize = devQueueHw.interfaceDescriptorEntries * sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
|
||||
// Program media interface descriptor load
|
||||
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
||||
commandStream,
|
||||
offsetInterfaceDescriptor,
|
||||
totalInterfaceDescriptorTableSize);
|
||||
|
||||
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
|
||||
|
||||
// Determine SIMD size
|
||||
uint32_t simd = scheduler.getKernelInfo().getMaxSimdSize();
|
||||
DEBUG_BREAK_IF(simd != PARALLEL_SCHEDULER_COMPILATION_SIZE_20);
|
||||
|
||||
// Patch our kernel constants
|
||||
*scheduler.globalWorkOffsetX = 0;
|
||||
*scheduler.globalWorkOffsetY = 0;
|
||||
*scheduler.globalWorkOffsetZ = 0;
|
||||
|
||||
*scheduler.globalWorkSizeX = (uint32_t)scheduler.getGws();
|
||||
*scheduler.globalWorkSizeY = 1;
|
||||
*scheduler.globalWorkSizeZ = 1;
|
||||
|
||||
*scheduler.localWorkSizeX = (uint32_t)scheduler.getLws();
|
||||
*scheduler.localWorkSizeY = 1;
|
||||
*scheduler.localWorkSizeZ = 1;
|
||||
|
||||
*scheduler.localWorkSizeX2 = (uint32_t)scheduler.getLws();
|
||||
*scheduler.localWorkSizeY2 = 1;
|
||||
*scheduler.localWorkSizeZ2 = 1;
|
||||
|
||||
*scheduler.enqueuedLocalWorkSizeX = (uint32_t)scheduler.getLws();
|
||||
*scheduler.enqueuedLocalWorkSizeY = 1;
|
||||
*scheduler.enqueuedLocalWorkSizeZ = 1;
|
||||
|
||||
*scheduler.numWorkGroupsX = (uint32_t)(scheduler.getGws() / scheduler.getLws());
|
||||
*scheduler.numWorkGroupsY = 0;
|
||||
*scheduler.numWorkGroupsZ = 0;
|
||||
|
||||
*scheduler.workDim = 1;
|
||||
|
||||
// Send our indirect object data
|
||||
size_t localWorkSizes[3] = {scheduler.getLws(), 1, 1};
|
||||
size_t globalWorkSizes[3] = {scheduler.getGws(), 1, 1};
|
||||
|
||||
// Create indirectHeap for IOH that is located at the end of device enqueue DSH
|
||||
size_t curbeOffset = devQueueHw.setSchedulerCrossThreadData(scheduler);
|
||||
IndirectHeap indirectObjectHeap(dsh->getCpuBase(), dsh->getMaxAvailableSpace());
|
||||
indirectObjectHeap.getSpace(curbeOffset);
|
||||
IndirectHeap *ioh = &indirectObjectHeap;
|
||||
|
||||
// Program the walker. Invokes execution so all state should already be programmed
|
||||
auto pGpGpuWalkerCmd = static_cast<GPGPU_WALKER *>(commandStream.getSpace(sizeof(GPGPU_WALKER)));
|
||||
*pGpGpuWalkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
||||
|
||||
bool localIdsGenerationByRuntime = KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(1, globalWorkSizes, localWorkSizes);
|
||||
bool inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(scheduler);
|
||||
KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
commandStream,
|
||||
*dsh,
|
||||
*ioh,
|
||||
*ssh,
|
||||
scheduler,
|
||||
simd,
|
||||
localWorkSizes,
|
||||
offsetInterfaceDescriptorTable,
|
||||
interfaceDescriptorIndex,
|
||||
preemptionMode,
|
||||
pGpGpuWalkerCmd,
|
||||
nullptr,
|
||||
localIdsGenerationByRuntime);
|
||||
|
||||
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, true);
|
||||
|
||||
size_t globalOffsets[3] = {0, 0, 0};
|
||||
size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
|
||||
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(pGpGpuWalkerCmd, globalOffsets, globalOffsets, workGroups, localWorkSizes,
|
||||
simd, 1, localIdsGenerationByRuntime, inlineDataProgrammingRequired,
|
||||
*scheduler.getKernelInfo().patchInfo.threadPayload);
|
||||
|
||||
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(&commandStream, scheduler, false);
|
||||
|
||||
// Do not put BB_START only when returning in first Scheduler run
|
||||
if (devQueueHw.getSchedulerReturnInstance() != 1) {
|
||||
|
||||
PipeControlHelper<GfxFamily>::addPipeControl(commandStream, true);
|
||||
|
||||
// Add BB Start Cmd to the SLB in the Primary Batch Buffer
|
||||
auto *bbStart = static_cast<MI_BATCH_BUFFER_START *>(commandStream.getSpace(sizeof(MI_BATCH_BUFFER_START)));
|
||||
*bbStart = GfxFamily::cmdInitBatchBufferStart;
|
||||
bbStart->setSecondLevelBatchBuffer(MI_BATCH_BUFFER_START::SECOND_LEVEL_BATCH_BUFFER_FIRST_LEVEL_BATCH);
|
||||
uint64_t slbAddress = devQueueHw.getSlbBuffer()->getGpuAddress();
|
||||
bbStart->setBatchBufferStartAddressGraphicsaddress472(slbAddress);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(
|
||||
LinearStream *cmdStream,
|
||||
WALKER_TYPE<GfxFamily> *walkerCmd,
|
||||
TagNode<TimestampPacketStorage> *timestampPacketNode,
|
||||
TimestampPacketStorage::WriteOperationType writeOperationType) {
|
||||
|
||||
if (TimestampPacketStorage::WriteOperationType::AfterWalker == writeOperationType) {
|
||||
uint64_t address = timestampPacketNode->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd);
|
||||
PipeControlHelper<GfxFamily>::obtainPipeControlAndProgramPostSyncOperation(cmdStream, PIPE_CONTROL::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA, address, 0, false);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredCSKernel(bool reserveProfilingCmdsSpace, bool reservePerfCounters, CommandQueue &commandQueue, const Kernel *pKernel) {
|
||||
size_t size = sizeof(typename GfxFamily::GPGPU_WALKER) + KernelCommandsHelper<GfxFamily>::getSizeRequiredCS(pKernel) +
|
||||
sizeof(PIPE_CONTROL) * (KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() ? 2 : 1);
|
||||
size += KernelCommandsHelper<GfxFamily>::getSizeRequiredForCacheFlush(commandQueue, pKernel, 0U, 0U);
|
||||
size += PreemptionHelper::getPreemptionWaCsSize<GfxFamily>(commandQueue.getDevice());
|
||||
if (reserveProfilingCmdsSpace) {
|
||||
size += 2 * sizeof(PIPE_CONTROL) + 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
if (reservePerfCounters) {
|
||||
//start cmds
|
||||
//P_C: flush CS & TimeStamp BEGIN
|
||||
size += 2 * sizeof(PIPE_CONTROL);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//gp registers
|
||||
size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
|
||||
//end cmds
|
||||
//P_C: flush CS & TimeStamp END;
|
||||
size += 2 * sizeof(PIPE_CONTROL);
|
||||
//OA buffer (status head, tail)
|
||||
size += 3 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//report perf count
|
||||
size += sizeof(typename GfxFamily::MI_REPORT_PERF_COUNT);
|
||||
//gp registers
|
||||
size += NEO::INSTR_GENERAL_PURPOSE_COUNTERS_COUNT * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//SRM NOOPID & Frequency
|
||||
size += 2 * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
//user registers
|
||||
size += commandQueue.getPerfCountersUserRegistersNumber() * sizeof(typename GfxFamily::MI_STORE_REGISTER_MEM);
|
||||
}
|
||||
size += GpgpuWalkerHelper<GfxFamily>::getSizeForWADisableLSQCROPERFforOCL(pKernel);
|
||||
|
||||
return size;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
size_t EnqueueOperation<GfxFamily>::getSizeRequiredForTimestampPacketWrite() {
|
||||
return sizeof(PIPE_CONTROL);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
@@ -1,227 +0,0 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "runtime/command_queue/hardware_interface.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/task_information.h"
|
||||
#include "runtime/memory_manager/internal_allocation_storage.h"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
|
||||
const Kernel &kernel) {
|
||||
auto walkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream.getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
|
||||
*walkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
||||
return walkerCmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
const CsrDependencies &csrDependencies,
|
||||
KernelOperation **blockedCommandsData,
|
||||
TagNode<HwTimeStamps> *hwTimeStamps,
|
||||
TagNode<HwPerfCounter> *hwPerfCounter,
|
||||
TimestampPacketContainer *previousTimestampPacketNodes,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes,
|
||||
PreemptionMode preemptionMode,
|
||||
bool blockQueue,
|
||||
uint32_t commandType) {
|
||||
|
||||
LinearStream *commandStream = nullptr;
|
||||
IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
|
||||
auto parentKernel = multiDispatchInfo.peekParentKernel();
|
||||
auto mainKernel = multiDispatchInfo.peekMainKernel();
|
||||
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
// Compute local workgroup sizes
|
||||
if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
|
||||
const auto lws = generateWorkgroupSize(dispatchInfo);
|
||||
const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
|
||||
}
|
||||
}
|
||||
|
||||
// Allocate command stream and indirect heaps
|
||||
if (blockQueue) {
|
||||
using KCH = KernelCommandsHelper<GfxFamily>;
|
||||
|
||||
constexpr static auto additionalAllocationSize = CSRequirements::csOverfetchSize;
|
||||
constexpr static auto allocationSize = MemoryConstants::pageSize64k - additionalAllocationSize;
|
||||
commandStream = new LinearStream();
|
||||
commandQueue.getCommandStreamReceiver().ensureCommandBufferAllocation(*commandStream, allocationSize, additionalAllocationSize);
|
||||
|
||||
if (parentKernel) {
|
||||
uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
|
||||
|
||||
commandQueue.allocateHeapMemory(
|
||||
IndirectHeap::DYNAMIC_STATE,
|
||||
commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
|
||||
dsh);
|
||||
|
||||
dsh->getSpace(colorCalcSize);
|
||||
ioh = dsh;
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
|
||||
KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<
|
||||
IndirectHeap::SURFACE_STATE>(*parentKernel) +
|
||||
KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
|
||||
ssh);
|
||||
} else {
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
|
||||
}
|
||||
|
||||
using UniqueIH = std::unique_ptr<IndirectHeap>;
|
||||
*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
|
||||
UniqueIH(ssh), *commandQueue.getCommandStreamReceiver().getInternalAllocationStorage());
|
||||
if (parentKernel) {
|
||||
(*blockedCommandsData)->doNotFreeISH = true;
|
||||
}
|
||||
} else {
|
||||
commandStream = &commandQueue.getCS(0);
|
||||
if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
|
||||
commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
|
||||
}
|
||||
dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
|
||||
ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
|
||||
ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
|
||||
}
|
||||
|
||||
TimestampPacketHelper::programCsrDependencies<GfxFamily>(*commandStream, csrDependencies);
|
||||
|
||||
dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
|
||||
|
||||
uint32_t interfaceDescriptorIndex = 0;
|
||||
const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
|
||||
|
||||
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
|
||||
getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,
|
||||
parentKernel, dsh, commandStream);
|
||||
|
||||
// Program media interface descriptor load
|
||||
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
||||
*commandStream,
|
||||
offsetInterfaceDescriptorTable,
|
||||
totalInterfaceDescriptorTableSize);
|
||||
|
||||
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
|
||||
|
||||
if (mainKernel->isAuxTranslationRequired()) {
|
||||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setDcFlushEnable(true);
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
}
|
||||
|
||||
size_t currentDispatchIndex = 0;
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
auto &kernel = *dispatchInfo.getKernel();
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
|
||||
|
||||
// If we don't have a required WGS, compute one opportunistically
|
||||
auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
|
||||
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
|
||||
provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
|
||||
}
|
||||
|
||||
//Get dispatch geometry
|
||||
uint32_t dim = dispatchInfo.getDim();
|
||||
Vec3<size_t> gws = dispatchInfo.getGWS();
|
||||
Vec3<size_t> offset = dispatchInfo.getOffset();
|
||||
Vec3<size_t> startOfWorkgroups = dispatchInfo.getStartOfWorkgroups();
|
||||
|
||||
// Compute local workgroup sizes
|
||||
Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
|
||||
Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
|
||||
|
||||
// Compute number of work groups
|
||||
Vec3<size_t> totalNumberOfWorkgroups = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()
|
||||
: generateWorkgroupsNumber(gws, lws);
|
||||
|
||||
Vec3<size_t> numberOfWorkgroups = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : totalNumberOfWorkgroups;
|
||||
|
||||
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
|
||||
|
||||
// Patch our kernel constants
|
||||
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
|
||||
*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
|
||||
*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
|
||||
|
||||
*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
|
||||
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
|
||||
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
|
||||
|
||||
if ((&kernel == mainKernel) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
|
||||
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
|
||||
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
|
||||
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
|
||||
}
|
||||
|
||||
*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
|
||||
*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
|
||||
*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
|
||||
|
||||
*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
|
||||
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
|
||||
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
|
||||
|
||||
if (&kernel == mainKernel) {
|
||||
*kernel.numWorkGroupsX = static_cast<uint32_t>(totalNumberOfWorkgroups.x);
|
||||
*kernel.numWorkGroupsY = static_cast<uint32_t>(totalNumberOfWorkgroups.y);
|
||||
*kernel.numWorkGroupsZ = static_cast<uint32_t>(totalNumberOfWorkgroups.z);
|
||||
}
|
||||
|
||||
*kernel.workDim = dim;
|
||||
|
||||
// Send our indirect object data
|
||||
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
|
||||
|
||||
dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,
|
||||
hwPerfCounter, commandStream, commandQueue);
|
||||
|
||||
dispatchWorkarounds(commandStream, commandQueue, kernel, true);
|
||||
|
||||
if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, timestampPacketNode, TimestampPacketStorage::WriteOperationType::BeforeWalker);
|
||||
}
|
||||
|
||||
programWalker(*commandStream, kernel, commandQueue, currentTimestampPacketNodes, *dsh, *ioh, *ssh, globalWorkSizes,
|
||||
localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo,
|
||||
offsetInterfaceDescriptorTable, numberOfWorkgroups, startOfWorkgroups);
|
||||
|
||||
dispatchWorkarounds(commandStream, commandQueue, kernel, false);
|
||||
if (dispatchInfo.isPipeControlRequired()) {
|
||||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
}
|
||||
|
||||
currentDispatchIndex++;
|
||||
}
|
||||
if (mainKernel->requiresCacheFlushCommand(commandQueue)) {
|
||||
uint64_t postSyncAddress = 0;
|
||||
if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
postSyncAddress = timestampPacketNodeForPostSync->getGpuAddress();
|
||||
}
|
||||
KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, commandQueue, mainKernel, postSyncAddress, 0);
|
||||
}
|
||||
dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
@@ -7,134 +7,221 @@
|
||||
|
||||
#pragma once
|
||||
#include "runtime/command_queue/hardware_interface.h"
|
||||
#include "runtime/helpers/kernel_commands.h"
|
||||
#include "runtime/helpers/task_information.h"
|
||||
#include "runtime/memory_manager/internal_allocation_storage.h"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
|
||||
const size_t &offsetInterfaceDescriptorTable,
|
||||
inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
|
||||
const Kernel &kernel) {
|
||||
auto walkerCmd = static_cast<WALKER_TYPE<GfxFamily> *>(commandStream.getSpace(sizeof(WALKER_TYPE<GfxFamily>)));
|
||||
*walkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
||||
return walkerCmd;
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
void HardwareInterface<GfxFamily>::dispatchWalker(
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
size_t &totalInterfaceDescriptorTableSize,
|
||||
Kernel *parentKernel,
|
||||
IndirectHeap *dsh,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
size_t numDispatches = multiDispatchInfo.size();
|
||||
totalInterfaceDescriptorTableSize *= numDispatches;
|
||||
|
||||
if (!parentKernel) {
|
||||
dsh->getSpace(totalInterfaceDescriptorTableSize);
|
||||
} else {
|
||||
dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
|
||||
LinearStream *commandStream,
|
||||
CommandQueue &commandQueue,
|
||||
Kernel &kernel,
|
||||
const bool &enable) {
|
||||
|
||||
if (enable) {
|
||||
PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
|
||||
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
|
||||
} else {
|
||||
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
|
||||
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
|
||||
const DispatchInfo &dispatchInfo,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
const CsrDependencies &csrDependencies,
|
||||
KernelOperation **blockedCommandsData,
|
||||
TagNode<HwTimeStamps> *hwTimeStamps,
|
||||
TagNode<HwPerfCounter> *hwPerfCounter,
|
||||
LinearStream *commandStream,
|
||||
CommandQueue &commandQueue) {
|
||||
|
||||
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
|
||||
// If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
|
||||
if (hwTimeStamps != nullptr) {
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
|
||||
}
|
||||
if (hwPerfCounter != nullptr) {
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
|
||||
TagNode<HwTimeStamps> *hwTimeStamps,
|
||||
TagNode<HwPerfCounter> *hwPerfCounter,
|
||||
LinearStream *commandStream,
|
||||
CommandQueue &commandQueue) {
|
||||
|
||||
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
|
||||
if (hwTimeStamps != nullptr) {
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
|
||||
}
|
||||
if (hwPerfCounter != nullptr) {
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
LinearStream &commandStream,
|
||||
Kernel &kernel,
|
||||
CommandQueue &commandQueue,
|
||||
TimestampPacketContainer *previousTimestampPacketNodes,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
size_t globalWorkSizes[3],
|
||||
size_t localWorkSizes[3],
|
||||
PreemptionMode preemptionMode,
|
||||
size_t currentDispatchIndex,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
const DispatchInfo &dispatchInfo,
|
||||
size_t offsetInterfaceDescriptorTable,
|
||||
Vec3<size_t> &numberOfWorkgroups,
|
||||
Vec3<size_t> &startOfWorkgroups) {
|
||||
bool blockQueue,
|
||||
uint32_t commandType) {
|
||||
|
||||
auto walkerCmd = allocateWalkerSpace(commandStream, kernel);
|
||||
uint32_t dim = dispatchInfo.getDim();
|
||||
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
|
||||
LinearStream *commandStream = nullptr;
|
||||
IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
|
||||
auto parentKernel = multiDispatchInfo.peekParentKernel();
|
||||
auto mainKernel = multiDispatchInfo.peekMainKernel();
|
||||
|
||||
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
|
||||
size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
|
||||
size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
|
||||
|
||||
if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, walkerCmd, timestampPacketNode, TimestampPacketStorage::WriteOperationType::AfterWalker);
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
// Compute local workgroup sizes
|
||||
if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
|
||||
const auto lws = generateWorkgroupSize(dispatchInfo);
|
||||
const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
|
||||
}
|
||||
}
|
||||
|
||||
KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
ssh,
|
||||
kernel,
|
||||
simd,
|
||||
localWorkSizes,
|
||||
offsetInterfaceDescriptorTable,
|
||||
interfaceDescriptorIndex,
|
||||
preemptionMode,
|
||||
walkerCmd,
|
||||
nullptr,
|
||||
true);
|
||||
// Allocate command stream and indirect heaps
|
||||
if (blockQueue) {
|
||||
using KCH = KernelCommandsHelper<GfxFamily>;
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
|
||||
numWorkGroups, localWorkSizes, simd, dim,
|
||||
false, false,
|
||||
*kernel.getKernelInfo().patchInfo.threadPayload);
|
||||
constexpr static auto additionalAllocationSize = CSRequirements::csOverfetchSize;
|
||||
constexpr static auto allocationSize = MemoryConstants::pageSize64k - additionalAllocationSize;
|
||||
commandStream = new LinearStream();
|
||||
commandQueue.getCommandStreamReceiver().ensureCommandBufferAllocation(*commandStream, allocationSize, additionalAllocationSize);
|
||||
|
||||
if (parentKernel) {
|
||||
uint32_t colorCalcSize = commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize;
|
||||
|
||||
commandQueue.allocateHeapMemory(
|
||||
IndirectHeap::DYNAMIC_STATE,
|
||||
commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(),
|
||||
dsh);
|
||||
|
||||
dsh->getSpace(colorCalcSize);
|
||||
ioh = dsh;
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE,
|
||||
KernelCommandsHelper<GfxFamily>::template getSizeRequiredForExecutionModel<
|
||||
IndirectHeap::SURFACE_STATE>(*parentKernel) +
|
||||
KCH::getTotalSizeRequiredSSH(multiDispatchInfo),
|
||||
ssh);
|
||||
} else {
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, KCH::getTotalSizeRequiredDSH(multiDispatchInfo), dsh);
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, KCH::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
|
||||
commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, KCH::getTotalSizeRequiredSSH(multiDispatchInfo), ssh);
|
||||
}
|
||||
|
||||
using UniqueIH = std::unique_ptr<IndirectHeap>;
|
||||
*blockedCommandsData = new KernelOperation(std::unique_ptr<LinearStream>(commandStream), UniqueIH(dsh), UniqueIH(ioh),
|
||||
UniqueIH(ssh), *commandQueue.getCommandStreamReceiver().getInternalAllocationStorage());
|
||||
if (parentKernel) {
|
||||
(*blockedCommandsData)->doNotFreeISH = true;
|
||||
}
|
||||
} else {
|
||||
commandStream = &commandQueue.getCS(0);
|
||||
if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) {
|
||||
commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
|
||||
}
|
||||
dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
|
||||
ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
|
||||
ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
|
||||
}
|
||||
|
||||
TimestampPacketHelper::programCsrDependencies<GfxFamily>(*commandStream, csrDependencies);
|
||||
|
||||
dsh->align(KernelCommandsHelper<GfxFamily>::alignInterfaceDescriptorData);
|
||||
|
||||
uint32_t interfaceDescriptorIndex = 0;
|
||||
const size_t offsetInterfaceDescriptorTable = dsh->getUsed();
|
||||
|
||||
size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||
|
||||
getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize,
|
||||
parentKernel, dsh, commandStream);
|
||||
|
||||
// Program media interface descriptor load
|
||||
KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
||||
*commandStream,
|
||||
offsetInterfaceDescriptorTable,
|
||||
totalInterfaceDescriptorTableSize);
|
||||
|
||||
DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0);
|
||||
|
||||
if (mainKernel->isAuxTranslationRequired()) {
|
||||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setDcFlushEnable(true);
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
}
|
||||
|
||||
size_t currentDispatchIndex = 0;
|
||||
for (auto &dispatchInfo : multiDispatchInfo) {
|
||||
auto &kernel = *dispatchInfo.getKernel();
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
|
||||
DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));
|
||||
|
||||
// If we don't have a required WGS, compute one opportunistically
|
||||
auto maxWorkGroupSize = static_cast<uint32_t>(commandQueue.getDevice().getDeviceInfo().maxWorkGroupSize);
|
||||
if (commandType == CL_COMMAND_NDRANGE_KERNEL) {
|
||||
provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), maxWorkGroupSize, dispatchInfo);
|
||||
}
|
||||
|
||||
//Get dispatch geometry
|
||||
uint32_t dim = dispatchInfo.getDim();
|
||||
Vec3<size_t> gws = dispatchInfo.getGWS();
|
||||
Vec3<size_t> offset = dispatchInfo.getOffset();
|
||||
Vec3<size_t> startOfWorkgroups = dispatchInfo.getStartOfWorkgroups();
|
||||
|
||||
// Compute local workgroup sizes
|
||||
Vec3<size_t> lws = dispatchInfo.getLocalWorkgroupSize();
|
||||
Vec3<size_t> elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;
|
||||
|
||||
// Compute number of work groups
|
||||
Vec3<size_t> totalNumberOfWorkgroups = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups()
|
||||
: generateWorkgroupsNumber(gws, lws);
|
||||
|
||||
Vec3<size_t> numberOfWorkgroups = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : totalNumberOfWorkgroups;
|
||||
|
||||
size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z};
|
||||
|
||||
// Patch our kernel constants
|
||||
*kernel.globalWorkOffsetX = static_cast<uint32_t>(offset.x);
|
||||
*kernel.globalWorkOffsetY = static_cast<uint32_t>(offset.y);
|
||||
*kernel.globalWorkOffsetZ = static_cast<uint32_t>(offset.z);
|
||||
|
||||
*kernel.globalWorkSizeX = static_cast<uint32_t>(gws.x);
|
||||
*kernel.globalWorkSizeY = static_cast<uint32_t>(gws.y);
|
||||
*kernel.globalWorkSizeZ = static_cast<uint32_t>(gws.z);
|
||||
|
||||
if ((&kernel == mainKernel) || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) {
|
||||
*kernel.localWorkSizeX = static_cast<uint32_t>(lws.x);
|
||||
*kernel.localWorkSizeY = static_cast<uint32_t>(lws.y);
|
||||
*kernel.localWorkSizeZ = static_cast<uint32_t>(lws.z);
|
||||
}
|
||||
|
||||
*kernel.localWorkSizeX2 = static_cast<uint32_t>(lws.x);
|
||||
*kernel.localWorkSizeY2 = static_cast<uint32_t>(lws.y);
|
||||
*kernel.localWorkSizeZ2 = static_cast<uint32_t>(lws.z);
|
||||
|
||||
*kernel.enqueuedLocalWorkSizeX = static_cast<uint32_t>(elws.x);
|
||||
*kernel.enqueuedLocalWorkSizeY = static_cast<uint32_t>(elws.y);
|
||||
*kernel.enqueuedLocalWorkSizeZ = static_cast<uint32_t>(elws.z);
|
||||
|
||||
if (&kernel == mainKernel) {
|
||||
*kernel.numWorkGroupsX = static_cast<uint32_t>(totalNumberOfWorkgroups.x);
|
||||
*kernel.numWorkGroupsY = static_cast<uint32_t>(totalNumberOfWorkgroups.y);
|
||||
*kernel.numWorkGroupsZ = static_cast<uint32_t>(totalNumberOfWorkgroups.z);
|
||||
}
|
||||
|
||||
*kernel.workDim = dim;
|
||||
|
||||
// Send our indirect object data
|
||||
size_t localWorkSizes[3] = {lws.x, lws.y, lws.z};
|
||||
|
||||
dispatchProfilingPerfStartCommands(dispatchInfo, multiDispatchInfo, hwTimeStamps,
|
||||
hwPerfCounter, commandStream, commandQueue);
|
||||
|
||||
dispatchWorkarounds(commandStream, commandQueue, kernel, true);
|
||||
|
||||
if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(commandStream, nullptr, timestampPacketNode, TimestampPacketStorage::WriteOperationType::BeforeWalker);
|
||||
}
|
||||
|
||||
programWalker(*commandStream, kernel, commandQueue, currentTimestampPacketNodes, *dsh, *ioh, *ssh, globalWorkSizes,
|
||||
localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo,
|
||||
offsetInterfaceDescriptorTable, numberOfWorkgroups, startOfWorkgroups);
|
||||
|
||||
dispatchWorkarounds(commandStream, commandQueue, kernel, false);
|
||||
if (dispatchInfo.isPipeControlRequired()) {
|
||||
using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
|
||||
auto pPipeControlCmd = static_cast<PIPE_CONTROL *>(commandStream->getSpace(sizeof(PIPE_CONTROL)));
|
||||
*pPipeControlCmd = GfxFamily::cmdInitPipeControl;
|
||||
pPipeControlCmd->setCommandStreamerStallEnable(true);
|
||||
}
|
||||
|
||||
currentDispatchIndex++;
|
||||
}
|
||||
if (mainKernel->requiresCacheFlushCommand(commandQueue)) {
|
||||
uint64_t postSyncAddress = 0;
|
||||
if (commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
postSyncAddress = timestampPacketNodeForPostSync->getGpuAddress();
|
||||
}
|
||||
KernelCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, commandQueue, mainKernel, postSyncAddress, 0);
|
||||
}
|
||||
dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
|
||||
140
runtime/command_queue/hardware_interface_bdw_plus.inl
Normal file
140
runtime/command_queue/hardware_interface_bdw_plus.inl
Normal file
@@ -0,0 +1,140 @@
|
||||
/*
|
||||
* Copyright (C) 2018-2019 Intel Corporation
|
||||
*
|
||||
* SPDX-License-Identifier: MIT
|
||||
*
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
#include "runtime/command_queue/hardware_interface_base.inl"
|
||||
|
||||
namespace NEO {
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
|
||||
const size_t &offsetInterfaceDescriptorTable,
|
||||
CommandQueue &commandQueue,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
size_t &totalInterfaceDescriptorTableSize,
|
||||
Kernel *parentKernel,
|
||||
IndirectHeap *dsh,
|
||||
LinearStream *commandStream) {
|
||||
|
||||
size_t numDispatches = multiDispatchInfo.size();
|
||||
totalInterfaceDescriptorTableSize *= numDispatches;
|
||||
|
||||
if (!parentKernel) {
|
||||
dsh->getSpace(totalInterfaceDescriptorTableSize);
|
||||
} else {
|
||||
dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
|
||||
LinearStream *commandStream,
|
||||
CommandQueue &commandQueue,
|
||||
Kernel &kernel,
|
||||
const bool &enable) {
|
||||
|
||||
if (enable) {
|
||||
PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
|
||||
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
|
||||
} else {
|
||||
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
|
||||
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
|
||||
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
|
||||
const DispatchInfo &dispatchInfo,
|
||||
const MultiDispatchInfo &multiDispatchInfo,
|
||||
TagNode<HwTimeStamps> *hwTimeStamps,
|
||||
TagNode<HwPerfCounter> *hwPerfCounter,
|
||||
LinearStream *commandStream,
|
||||
CommandQueue &commandQueue) {
|
||||
|
||||
if (&dispatchInfo == &*multiDispatchInfo.begin()) {
|
||||
// If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled
|
||||
if (hwTimeStamps != nullptr) {
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream);
|
||||
}
|
||||
if (hwPerfCounter != nullptr) {
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
|
||||
TagNode<HwTimeStamps> *hwTimeStamps,
|
||||
TagNode<HwPerfCounter> *hwPerfCounter,
|
||||
LinearStream *commandStream,
|
||||
CommandQueue &commandQueue) {
|
||||
|
||||
// If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
|
||||
if (hwTimeStamps != nullptr) {
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream);
|
||||
}
|
||||
if (hwPerfCounter != nullptr) {
|
||||
GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
|
||||
}
|
||||
}
|
||||
|
||||
template <typename GfxFamily>
|
||||
inline void HardwareInterface<GfxFamily>::programWalker(
|
||||
LinearStream &commandStream,
|
||||
Kernel &kernel,
|
||||
CommandQueue &commandQueue,
|
||||
TimestampPacketContainer *currentTimestampPacketNodes,
|
||||
IndirectHeap &dsh,
|
||||
IndirectHeap &ioh,
|
||||
IndirectHeap &ssh,
|
||||
size_t globalWorkSizes[3],
|
||||
size_t localWorkSizes[3],
|
||||
PreemptionMode preemptionMode,
|
||||
size_t currentDispatchIndex,
|
||||
uint32_t &interfaceDescriptorIndex,
|
||||
const DispatchInfo &dispatchInfo,
|
||||
size_t offsetInterfaceDescriptorTable,
|
||||
Vec3<size_t> &numberOfWorkgroups,
|
||||
Vec3<size_t> &startOfWorkgroups) {
|
||||
|
||||
auto walkerCmd = allocateWalkerSpace(commandStream, kernel);
|
||||
uint32_t dim = dispatchInfo.getDim();
|
||||
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
|
||||
|
||||
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
|
||||
size_t startWorkGroups[3] = {startOfWorkgroups.x, startOfWorkgroups.y, startOfWorkgroups.z};
|
||||
size_t numWorkGroups[3] = {numberOfWorkgroups.x, numberOfWorkgroups.y, numberOfWorkgroups.z};
|
||||
|
||||
if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
||||
auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
|
||||
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, walkerCmd, timestampPacketNode, TimestampPacketStorage::WriteOperationType::AfterWalker);
|
||||
}
|
||||
|
||||
KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
||||
commandStream,
|
||||
dsh,
|
||||
ioh,
|
||||
ssh,
|
||||
kernel,
|
||||
simd,
|
||||
localWorkSizes,
|
||||
offsetInterfaceDescriptorTable,
|
||||
interfaceDescriptorIndex,
|
||||
preemptionMode,
|
||||
walkerCmd,
|
||||
nullptr,
|
||||
true);
|
||||
|
||||
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups,
|
||||
numWorkGroups, localWorkSizes, simd, dim,
|
||||
false, false,
|
||||
*kernel.getKernelInfo().patchInfo.threadPayload);
|
||||
}
|
||||
|
||||
} // namespace NEO
|
||||
Reference in New Issue
Block a user