2018-09-27 21:22:36 +08:00
/*
2020-12-22 08:03:25 +08:00
* Copyright (C) 2018-2021 Intel Corporation
2018-09-27 21:22:36 +08:00
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
2020-10-01 01:06:27 +08:00
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/helpers/hw_helper.h"
2020-09-24 16:52:53 +08:00
#include "shared/source/helpers/pause_on_gpu_properties.h"
2020-02-24 05:44:01 +08:00
#include "shared/source/memory_manager/internal_allocation_storage.h"
2020-02-24 17:22:30 +08:00
2020-02-23 05:50:57 +08:00
#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/command_queue/hardware_interface.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/helpers/task_information.h"
2020-07-23 16:56:08 +08:00
#include "opencl/source/mem_obj/buffer.h"
2018-09-27 21:22:36 +08:00
2020-10-01 01:06:27 +08:00
#include "pipe_control_args.h"
2019-03-26 18:59:46 +08:00
namespace NEO {
2018-09-27 21:22:36 +08:00
template <typename GfxFamily>
2019-05-13 20:15:03 +08:00
inline WALKER_TYPE<GfxFamily> *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream,
const Kernel &kernel) {
2020-04-28 00:55:26 +08:00
auto walkerCmd = commandStream.getSpaceForCmd<WALKER_TYPE<GfxFamily>>();
2019-05-13 20:15:03 +08:00
return walkerCmd;
}
2020-06-19 16:57:01 +08:00
// Emits the "start" instrumentation commands into commandStream.
//   hwTimeStamps  - when non-null, GpgpuWalkerHelper::dispatchProfilingCommandsStart is invoked for it
//   hwPerfCounter - when non-null, GpgpuWalkerHelper::dispatchPerfCountersCommandsStart is invoked for it
// A null node simply skips the corresponding step.
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
    TagNodeBase *hwTimeStamps,
    TagNodeBase *hwPerfCounter,
    LinearStream *commandStream,
    CommandQueue &commandQueue) {
    using WalkerHelper = GpgpuWalkerHelper<GfxFamily>;

    // A timestamp node is only supplied by the caller when profiling is enabled
    if (hwTimeStamps) {
        WalkerHelper::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream,
                                                     commandQueue.getDevice().getHardwareInfo());
    }

    if (hwPerfCounter) {
        WalkerHelper::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
    }
}
// Emits the "end" instrumentation commands into commandStream.
//   hwTimeStamps  - when non-null, GpgpuWalkerHelper::dispatchProfilingCommandsEnd is invoked for it
//   hwPerfCounter - when non-null, GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd is invoked for it
// A null node simply skips the corresponding step.
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
    TagNodeBase *hwTimeStamps,
    TagNodeBase *hwPerfCounter,
    LinearStream *commandStream,
    CommandQueue &commandQueue) {
    using WalkerHelper = GpgpuWalkerHelper<GfxFamily>;

    // A timestamp node is only supplied by the caller when profiling is enabled
    if (hwTimeStamps) {
        WalkerHelper::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream,
                                                   commandQueue.getDevice().getHardwareInfo());
    }

    if (hwPerfCounter) {
        WalkerHelper::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
    }
}
2019-05-13 20:15:03 +08:00
// Programs the full command sequence for every DispatchInfo held by multiDispatchInfo.
//
// Steps, in emission order:
//   1. fill in a generated local work-group size for any dispatch whose LWS.x is still 0
//   2. obtain DSH/IOH/SSH and pick the command stream (the blocked-command stream when
//      blockedCommandsData is non-null, otherwise the queue's own stream)
//   3. when the device has a debugger, program the debug surface state reserved in the SSH
//   4. program CSR dependencies, reserve the interface descriptor table and emit the
//      media interface descriptor load
//   5. profiling/perf-counter start commands and the optional "before workload" debug pause
//   6. kernel tuning on the main kernel, then per-dispatch init / walker / epilogue commands
//   7. optional cache flush after the walkers, optional scratch register write,
//      optional "after workload" debug pause, profiling/perf-counter end commands
//
// hwTimeStamps and hwPerfCounter may be null; they are forwarded to the profiling helpers as-is.
template <typename GfxFamily>
void HardwareInterface<GfxFamily>::dispatchWalker(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    const CsrDependencies &csrDependencies,
    KernelOperation *blockedCommandsData,
    TagNodeBase *hwTimeStamps,
    TagNodeBase *hwPerfCounter,
    TimestampPacketDependencies *timestampPacketDependencies,
    TimestampPacketContainer *currentTimestampPacketNodes,
    uint32_t commandType) {

    LinearStream *commandStream = nullptr;
    IndirectHeap *dsh = nullptr;
    IndirectHeap *ioh = nullptr;
    IndirectHeap *ssh = nullptr;

    auto parentKernel = multiDispatchInfo.peekParentKernel();
    auto mainKernel = multiDispatchInfo.peekMainKernel();
    auto preemptionMode = PreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo);

    // Dispatches that arrive without a local work-group size get a generated one
    for (auto &dispatchInfo : multiDispatchInfo) {
        if (dispatchInfo.getLocalWorkgroupSize().x != 0) {
            continue;
        }
        const auto generatedLws = generateWorkgroupSize(dispatchInfo);
        const_cast<DispatchInfo &>(dispatchInfo).setLWS(generatedLws);
    }

    // Heaps first, then choose which command stream receives the commands
    const bool isBlocked = (nullptr != blockedCommandsData);
    obtainIndirectHeaps(commandQueue, multiDispatchInfo, isBlocked, dsh, ioh, ssh);

    if (!isBlocked) {
        commandStream = &commandQueue.getCS(0);
    } else {
        blockedCommandsData->setHeaps(dsh, ioh, ssh);
        commandStream = blockedCommandsData->commandStream.get();
    }

    // With a debugger present, describe the CSR's debug surface allocation in the
    // surface state slot the debugger reserves inside the SSH
    if (commandQueue.getDevice().getDebugger()) {
        auto debugAllocation = commandQueue.getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
        void *const patchAddress = reinterpret_cast<void *>(debugAllocation->getGpuAddress());
        const size_t patchSize = debugAllocation->getUnderlyingBufferSize();

        Buffer::setSurfaceState(&commandQueue.getDevice(),
                                commandQueue.getDevice().getDebugger()->getDebugSurfaceReservedSurfaceState(*ssh),
                                false, false, patchSize, patchAddress, 0, debugAllocation, 0, 0,
                                mainKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
                                mainKernel->areMultipleSubDevicesInContext());
    }

    TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDependencies);

    // Interface descriptor table: align the DSH, remember where the table starts,
    // and let getDefaultDshSpace adjust the table size as needed
    dsh->align(EncodeStates<GfxFamily>::alignInterfaceDescriptorData);

    uint32_t interfaceDescriptorIndex = 0;
    const size_t idTableOffset = dsh->getUsed();
    size_t idTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA);

    getDefaultDshSpace(idTableOffset, commandQueue, multiDispatchInfo, idTableSize,
                       parentKernel, dsh, commandStream);

    // Media interface descriptor load
    HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(*commandStream, idTableOffset, idTableSize);

    DEBUG_BREAK_IF(idTableOffset % 64 != 0);

    dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);

    if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnEnqueue.get(),
                                               commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(),
                                               PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
        dispatchDebugPauseCommands(commandStream, commandQueue,
                                   DebugPauseState::waitingForUserStartConfirmation,
                                   DebugPauseState::hasUserStartConfirmation);
    }

    // Kernel tuning is driven by the geometry of the first dispatch
    mainKernel->performKernelTuning(commandQueue.getGpgpuCommandStreamReceiver(),
                                    multiDispatchInfo.begin()->getLocalWorkgroupSize(),
                                    multiDispatchInfo.begin()->getActualWorkgroupSize(),
                                    multiDispatchInfo.begin()->getOffset(),
                                    currentTimestampPacketNodes);

    // Per-dispatch sequence: init commands, walker programming, epilogue commands
    size_t currentDispatchIndex = 0;
    for (auto &dispatchInfo : multiDispatchInfo) {
        dispatchInfo.dispatchInitCommands(*commandStream, timestampPacketDependencies,
                                          commandQueue.getDevice().getHardwareInfo());

        const bool isMainKernel = (mainKernel == dispatchInfo.getKernel());

        dispatchKernelCommands(commandQueue, dispatchInfo, commandType, *commandStream, isMainKernel,
                               currentDispatchIndex, currentTimestampPacketNodes, preemptionMode,
                               interfaceDescriptorIndex, idTableOffset, *dsh, *ioh, *ssh);

        ++currentDispatchIndex;

        dispatchInfo.dispatchEpilogueCommands(*commandStream, timestampPacketDependencies,
                                              commandQueue.getDevice().getHardwareInfo());
    }

    // Optional cache flush after the walkers; with timestamp-packet writes enabled the
    // post-sync address comes from the node at the index following the last dispatch
    if (mainKernel->requiresCacheFlushCommand(commandQueue)) {
        uint64_t postSyncAddress = 0;
        if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
            auto postSyncNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex);
            postSyncNode->setProfilingCapable(false);
            postSyncAddress = TimestampPacketHelper::getContextEndGpuAddress(*postSyncNode);
        }
        HardwareCommandsHelper<GfxFamily>::programCacheFlushAfterWalkerCommand(commandStream, commandQueue,
                                                                               mainKernel, postSyncAddress);
    }

    // Debug-flag controlled register write after the walker
    if (PauseOnGpuProperties::GpuScratchRegWriteAllowed(DebugManager.flags.GpuScratchRegWriteAfterWalker.get(),
                                                        commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount())) {
        const uint32_t scratchRegOffset = DebugManager.flags.GpuScratchRegWriteRegisterOffset.get();
        const uint32_t scratchRegData = DebugManager.flags.GpuScratchRegWriteRegisterData.get();
        LriHelper<GfxFamily>::program(commandStream, scratchRegOffset, scratchRegData,
                                      EncodeSetMMIO<GfxFamily>::isRemapApplicable(scratchRegOffset));
    }

    if (PauseOnGpuProperties::pauseModeAllowed(DebugManager.flags.PauseOnEnqueue.get(),
                                               commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(),
                                               PauseOnGpuProperties::PauseMode::AfterWorkload)) {
        dispatchDebugPauseCommands(commandStream, commandQueue,
                                   DebugPauseState::waitingForUserEndConfirmation,
                                   DebugPauseState::hasUserEndConfirmation);
    }

    dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue);
}
2018-09-27 21:22:36 +08:00
2019-07-18 15:10:05 +08:00
// Programs one kernel dispatch: patches the kernel's work-size related values from
// dispatchInfo and emits the walker, wrapped by the pre/post workaround commands.
//
//   isMainKernel             - the main kernel always gets its local work size and
//                              number-of-work-groups values patched
//   currentDispatchIndex     - forwarded to programWalker together with currentTimestampPacketNodes
//   interfaceDescriptorIndex - passed by reference to programWalker
//   dsh / ioh / ssh          - heaps handed to programWalker
template <typename GfxFamily>
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, uint32_t commandType,
                                                          LinearStream &commandStream, bool isMainKernel, size_t currentDispatchIndex,
                                                          TimestampPacketContainer *currentTimestampPacketNodes, PreemptionMode preemptionMode,
                                                          uint32_t &interfaceDescriptorIndex, size_t offsetInterfaceDescriptorTable,
                                                          IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh) {
    auto &kernel = *dispatchInfo.getKernel();

    // Sanity checks: dimension count in [1, 3]; unused dimensions must have size 1 and offset 0
    DEBUG_BREAK_IF(dispatchInfo.getDim() < 1 || dispatchInfo.getDim() > 3);
    DEBUG_BREAK_IF(dispatchInfo.getGWS().z != 1 && dispatchInfo.getDim() != 3);
    DEBUG_BREAK_IF(dispatchInfo.getGWS().y != 1 && dispatchInfo.getDim() < 2);
    DEBUG_BREAK_IF(dispatchInfo.getOffset().z != 0 && dispatchInfo.getDim() != 3);
    DEBUG_BREAK_IF(dispatchInfo.getOffset().y != 0 && dispatchInfo.getDim() < 2);

    // Work-group size hints are only provided for NDRange kernel enqueues
    if (CL_COMMAND_NDRANGE_KERNEL == commandType) {
        provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), dispatchInfo);
    }

    // Dispatch geometry
    uint32_t workDim = dispatchInfo.getDim();
    Vec3<size_t> globalSize = dispatchInfo.getGWS();
    Vec3<size_t> globalOffset = dispatchInfo.getOffset();
    Vec3<size_t> startOfWorkgroups = dispatchInfo.getStartOfWorkgroups();

    // Local work-group size; the enqueued size falls back to it when its x component is not set
    Vec3<size_t> localSize = dispatchInfo.getLocalWorkgroupSize();
    const auto &requestedEnqueuedSize = dispatchInfo.getEnqueuedWorkgroupSize();
    Vec3<size_t> enqueuedLocalSize = (requestedEnqueuedSize.x > 0) ? requestedEnqueuedSize : localSize;

    // Work-group counts; a zero x count is treated as unrecoverable
    Vec3<size_t> totalNumberOfWorkgroups = dispatchInfo.getTotalNumberOfWorkgroups();
    Vec3<size_t> numberOfWorkgroups = dispatchInfo.getNumberOfWorkgroups();
    UNRECOVERABLE_IF(totalNumberOfWorkgroups.x == 0);
    UNRECOVERABLE_IF(numberOfWorkgroups.x == 0);

    size_t globalWorkSizes[3] = {globalSize.x, globalSize.y, globalSize.z};

    // The kernel setters take 32-bit values
    const auto toU32 = [](size_t value) { return static_cast<uint32_t>(value); };

    // Patch the kernel's dispatch constants
    kernel.setGlobalWorkOffsetValues(toU32(globalOffset.x), toU32(globalOffset.y), toU32(globalOffset.z));
    kernel.setGlobalWorkSizeValues(toU32(globalSize.x), toU32(globalSize.y), toU32(globalSize.z));

    if (isMainKernel || !kernel.isLocalWorkSize2Patchable()) {
        kernel.setLocalWorkSizeValues(toU32(localSize.x), toU32(localSize.y), toU32(localSize.z));
    }

    kernel.setLocalWorkSize2Values(toU32(localSize.x), toU32(localSize.y), toU32(localSize.z));
    kernel.setEnqueuedLocalWorkSizeValues(toU32(enqueuedLocalSize.x), toU32(enqueuedLocalSize.y), toU32(enqueuedLocalSize.z));

    if (isMainKernel) {
        kernel.setNumWorkGroupsValues(toU32(totalNumberOfWorkgroups.x),
                                      toU32(totalNumberOfWorkgroups.y),
                                      toU32(totalNumberOfWorkgroups.z));
    }

    kernel.setWorkDim(workDim);

    size_t localWorkSizes[3] = {localSize.x, localSize.y, localSize.z};

    // Walker programming is bracketed by the workaround commands (true before, false after)
    dispatchWorkarounds(&commandStream, commandQueue, kernel, true);

    programWalker(commandStream, kernel, commandQueue, currentTimestampPacketNodes, dsh, ioh, ssh, globalWorkSizes,
                  localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo,
                  offsetInterfaceDescriptorTable, numberOfWorkgroups, startOfWorkgroups);

    dispatchWorkarounds(&commandStream, commandQueue, kernel, false);
}
2019-07-17 15:06:34 +08:00
// Fills dsh, ioh and ssh with the heaps to use for this multi-dispatch.
//
// Unblocked queue: the heaps come from getIndirectHeap. If a parent kernel is present and the
// queue's surface state heap already has data in it, that heap is released and re-obtained first.
//
// Blocked queue: fresh heap memory is allocated through the command queue. With a parent kernel
// the DSH is sized from the default device queue's DSH buffer, colorCalcStateSize bytes are
// consumed at its start, extra SSH space for the execution model is added, and the IOH aliases
// the DSH. Without a parent kernel each heap is sized from multiDispatchInfo.
template <typename GfxFamily>
void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
                                                       bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh) {
    auto parentKernel = multiDispatchInfo.peekParentKernel();

    if (!blockedQueue) {
        const bool sshAlreadyUsed = parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0);
        if (sshAlreadyUsed) {
            commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE);
            // clean reserved bindless offsets
            ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
            ssh->replaceBuffer(ssh->getCpuBase(), ssh->getMaxAvailableSpace());
        }

        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::SURFACE_STATE>(commandQueue, multiDispatchInfo);
        return;
    }

    // Blocked queue: work out the sizes, then allocate dedicated heap memory
    size_t requiredDsh = 0;
    size_t colorCalcBytes = 0;
    size_t requiredSsh = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo);

    if (parentKernel) {
        requiredDsh = commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize();
        requiredSsh += HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(*parentKernel);
        colorCalcBytes = static_cast<size_t>(commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize);
    } else {
        requiredDsh = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo);
    }

    commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, requiredDsh, dsh);
    dsh->getSpace(colorCalcBytes);

    commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, requiredSsh, ssh);

    // With a parent kernel the indirect object heap shares the dynamic state heap
    if (parentKernel) {
        ioh = dsh;
    } else {
        commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT,
                                        HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
    }
}
2020-04-30 23:12:01 +08:00
// Emits a debug pause point into commandStream; nothing is emitted for a special command queue.
//
// Two commands are programmed against the CSR's debug pause state GPU address:
//   1. a pipe control with a write-immediate-data post-sync operation storing confirmationTrigger
//   2. a semaphore wait using the SAD_EQUAL_SDD compare operation with waitCondition as the value
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchDebugPauseCommands(
    LinearStream *commandStream,
    CommandQueue &commandQueue,
    DebugPauseState confirmationTrigger,
    DebugPauseState waitCondition) {
    using PIPE_CONTROL = typename GfxFamily::PIPE_CONTROL;
    using POST_SYNC_OPERATION = typename PIPE_CONTROL::POST_SYNC_OPERATION;
    using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
    using COMPARE_OPERATION = typename MI_SEMAPHORE_WAIT::COMPARE_OPERATION;

    if (commandQueue.isSpecial()) {
        return;
    }

    auto pauseStateAddress = commandQueue.getGpgpuCommandStreamReceiver().getDebugPauseStateGPUAddress();

    // Step 1: write the trigger value to the pause state address
    PipeControlArgs args(true);
    MemorySynchronizationCommands<GfxFamily>::addPipeControlAndProgramPostSyncOperation(
        *commandStream,
        POST_SYNC_OPERATION::POST_SYNC_OPERATION_WRITE_IMMEDIATE_DATA,
        pauseStateAddress,
        static_cast<uint64_t>(confirmationTrigger),
        commandQueue.getDevice().getHardwareInfo(),
        args);

    // Step 2: semaphore wait on the same address against the wait condition value
    EncodeSempahore<GfxFamily>::addMiSemaphoreWaitCommand(*commandStream,
                                                          pauseStateAddress,
                                                          static_cast<uint32_t>(waitCondition),
                                                          COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD);
}
2019-03-26 18:59:46 +08:00
} // namespace NEO