/*
 * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/direct_submission/relaxed_ordering_helper.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/pause_on_gpu_properties.h"
#include "shared/source/helpers/pipe_control_args.h"
#include "shared/source/memory_manager/internal_allocation_storage.h"

#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/command_queue/hardware_interface.h"
#include "opencl/source/helpers/cl_preemption_helper.h"
#include "opencl/source/helpers/hardware_commands_helper.h"
#include "opencl/source/helpers/task_information.h"
#include "opencl/source/mem_obj/buffer.h"

namespace NEO {

template <typename GfxFamily>
template <typename WalkerType>
inline WalkerType *HardwareInterface<GfxFamily>::allocateWalkerSpace(LinearStream &commandStream, const Kernel &kernel) {
    auto walkerCmd = commandStream.getSpaceForCmd<WalkerType>();
    return walkerCmd;
}

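// Forwards to dispatchWalker using the GfxFamily's default WALKER_TYPE.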
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchWalkerCommon(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, HardwareInterfaceWalkerArgs &walkerArgs) {
    dispatchWalker<typename GfxFamily::WALKER_TYPE>(commandQueue, multiDispatchInfo, csrDependencies, walkerArgs);
}

template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfStartCommands(
    TagNodeBase *hwTimeStamps,
    TagNodeBase *hwPerfCounter,
    LinearStream *commandStream,
    CommandQueue &commandQueue) {
    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
    if (hwTimeStamps != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream, commandQueue.getDevice().getRootDeviceEnvironment());
    }
    if (hwPerfCounter != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream);
    }
}

template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchProfilingPerfEndCommands(
    TagNodeBase *hwTimeStamps,
    TagNodeBase *hwPerfCounter,
    LinearStream *commandStream,
    CommandQueue &commandQueue) {
    // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled
    if (hwTimeStamps != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream, commandQueue.getDevice().getRootDeviceEnvironment());
    }
    if (hwPerfCounter != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream);
    }
}

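// Builds the complete walker sequence for an enqueue: obtains heaps, resolves CSR/timestamp dependencies,
// programs interface descriptors, profiling/perf and debug-pause commands, and dispatches every kernel
// in the MultiDispatchInfo.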
template <typename GfxFamily>
template <typename WalkerType>
void HardwareInterface<GfxFamily>::dispatchWalker(
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    const CsrDependencies &csrDependencies,
    HardwareInterfaceWalkerArgs &walkerArgs) {
    LinearStream *commandStream = nullptr;
    IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr;
    auto mainKernel = multiDispatchInfo.peekMainKernel();

    walkerArgs.preemptionMode = ClPreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo);

    for (auto &dispatchInfo : multiDispatchInfo) {
        // Compute local workgroup sizes
        if (dispatchInfo.getLocalWorkgroupSize().x == 0) {
            const auto lws = generateWorkgroupSize(dispatchInfo);
            const_cast<DispatchInfo &>(dispatchInfo).setLWS(lws);
        }
    }

    // Allocate command stream and indirect heaps
    bool blockedQueue = (walkerArgs.blockedCommandsData != nullptr);
    obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockedQueue, dsh, ioh, ssh);
    if (blockedQueue) {
        walkerArgs.blockedCommandsData->setHeaps(dsh, ioh, ssh);
        commandStream = walkerArgs.blockedCommandsData->commandStream.get();
    } else {
        commandStream = &commandQueue.getCS(0);
    }

    if (commandQueue.getDevice().getDebugger()) {
        auto debugSurface = commandQueue.getGpgpuCommandStreamReceiver().getDebugSurfaceAllocation();
        void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
        size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
        Buffer::setSurfaceState(&commandQueue.getDevice(), commandQueue.getDevice().getDebugger()->getDebugSurfaceReservedSurfaceState(*ssh),
                                false, false, sizeToPatch, addressToPatch, 0, debugSurface, 0, 0,
                                mainKernel->getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics,
                                mainKernel->areMultipleSubDevicesInContext());
    }

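    // Program CSR dependency waits before the walker; with relaxed ordering enabled, set up the required registers first.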
    if (walkerArgs.relaxedOrderingEnabled) {
        RelaxedOrderingHelper::encodeRegistersBeforeDependencyCheckers<GfxFamily>(*commandStream);
    }
    TimestampPacketHelper::programCsrDependenciesForTimestampPacketContainer<GfxFamily>(*commandStream, csrDependencies, walkerArgs.relaxedOrderingEnabled);

    dsh->align(EncodeStates<GfxFamily>::alignInterfaceDescriptorData);

    walkerArgs.interfaceDescriptorIndex = 0;
    walkerArgs.offsetInterfaceDescriptorTable = dsh->getUsed();

    size_t totalInterfaceDescriptorTableSize = GfxFamily::template getInterfaceDescriptorSize<WalkerType>();

    getDefaultDshSpace(walkerArgs.offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, dsh, commandStream);

    // Program media interface descriptor load
    HardwareCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
        *commandStream,
        walkerArgs.offsetInterfaceDescriptorTable,
        totalInterfaceDescriptorTableSize);

    DEBUG_BREAK_IF(walkerArgs.offsetInterfaceDescriptorTable % 64 != 0);

    dispatchProfilingPerfStartCommands(walkerArgs.hwTimeStamps, walkerArgs.hwPerfCounter, commandStream, commandQueue);

    const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();
    if (PauseOnGpuProperties::pauseModeAllowed(debugManager.flags.PauseOnEnqueue.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(), PauseOnGpuProperties::PauseMode::BeforeWorkload)) {
        dispatchDebugPauseCommands(commandStream, commandQueue, DebugPauseState::waitingForUserStartConfirmation,
                                   DebugPauseState::hasUserStartConfirmation, hwInfo);
    }

    mainKernel->performKernelTuning(commandQueue.getGpgpuCommandStreamReceiver(),
                                    multiDispatchInfo.begin()->getLocalWorkgroupSize(),
                                    multiDispatchInfo.begin()->getActualWorkgroupSize(),
                                    multiDispatchInfo.begin()->getOffset(),
                                    walkerArgs.currentTimestampPacketNodes);

    walkerArgs.currentDispatchIndex = 0;

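    // Dispatch each kernel in the MultiDispatchInfo, surrounded by its init and epilogue commands.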
    for (auto &dispatchInfo : multiDispatchInfo) {
        dispatchInfo.dispatchInitCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
        walkerArgs.isMainKernel = (dispatchInfo.getKernel() == mainKernel);

        dispatchKernelCommands<WalkerType>(commandQueue, dispatchInfo, *commandStream, *dsh, *ioh, *ssh, walkerArgs);

        walkerArgs.currentDispatchIndex++;
        dispatchInfo.dispatchEpilogueCommands(*commandStream, walkerArgs.timestampPacketDependencies, commandQueue.getDevice().getRootDeviceEnvironment());
    }

    if (PauseOnGpuProperties::gpuScratchRegWriteAllowed(debugManager.flags.GpuScratchRegWriteAfterWalker.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount())) {
        uint32_t registerOffset = debugManager.flags.GpuScratchRegWriteRegisterOffset.get();
        uint32_t registerData = debugManager.flags.GpuScratchRegWriteRegisterData.get();
        LriHelper<GfxFamily>::program(commandStream, registerOffset, registerData, EncodeSetMMIO<GfxFamily>::isRemapApplicable(registerOffset));
    }

    if (PauseOnGpuProperties::pauseModeAllowed(debugManager.flags.PauseOnEnqueue.get(), commandQueue.getGpgpuCommandStreamReceiver().peekTaskCount(), PauseOnGpuProperties::PauseMode::AfterWorkload)) {
        dispatchDebugPauseCommands(commandStream, commandQueue, DebugPauseState::waitingForUserEndConfirmation,
                                   DebugPauseState::hasUserEndConfirmation, hwInfo);
    }

    dispatchProfilingPerfEndCommands(walkerArgs.hwTimeStamps, walkerArgs.hwPerfCounter, commandStream, commandQueue);
}

template <typename GfxFamily>
template <typename WalkerType>
void HardwareInterface<GfxFamily>::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, LinearStream &commandStream,
                                                          IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh,
                                                          HardwareInterfaceWalkerArgs &walkerArgs) {
    auto &kernel = *dispatchInfo.getKernel();
    DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3));
    DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3));
    DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2));
    DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3));
    DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2));

    // If we don't have a required WGS, compute one opportunistically
    if (walkerArgs.commandType == CL_COMMAND_NDRANGE_KERNEL) {
        provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), dispatchInfo);
    }

    // Get dispatch geometry
    auto dim = dispatchInfo.getDim();
    const auto &gws = dispatchInfo.getGWS();
    const auto &offset = dispatchInfo.getOffset();
    walkerArgs.startOfWorkgroups = &dispatchInfo.getStartOfWorkgroups();

    // Compute local workgroup sizes
    const auto &lws = dispatchInfo.getLocalWorkgroupSize();
    const auto &elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws;

    // Compute number of work groups
    const auto &totalNumberOfWorkgroups = dispatchInfo.getTotalNumberOfWorkgroups();
    walkerArgs.numberOfWorkgroups = &dispatchInfo.getNumberOfWorkgroups();
    UNRECOVERABLE_IF(totalNumberOfWorkgroups.x == 0);
    UNRECOVERABLE_IF(walkerArgs.numberOfWorkgroups->x == 0);

    walkerArgs.globalWorkSizes[0] = gws.x;
    walkerArgs.globalWorkSizes[1] = gws.y;
    walkerArgs.globalWorkSizes[2] = gws.z;

    // Patch our kernel constants
    kernel.setGlobalWorkOffsetValues(static_cast<uint32_t>(offset.x), static_cast<uint32_t>(offset.y), static_cast<uint32_t>(offset.z));
    kernel.setGlobalWorkSizeValues(static_cast<uint32_t>(gws.x), static_cast<uint32_t>(gws.y), static_cast<uint32_t>(gws.z));

    if (walkerArgs.isMainKernel || (!kernel.isLocalWorkSize2Patchable())) {
        kernel.setLocalWorkSizeValues(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
    }

    kernel.setLocalWorkSize2Values(static_cast<uint32_t>(lws.x), static_cast<uint32_t>(lws.y), static_cast<uint32_t>(lws.z));
    kernel.setEnqueuedLocalWorkSizeValues(static_cast<uint32_t>(elws.x), static_cast<uint32_t>(elws.y), static_cast<uint32_t>(elws.z));

    if (walkerArgs.isMainKernel) {
        kernel.setNumWorkGroupsValues(static_cast<uint32_t>(totalNumberOfWorkgroups.x), static_cast<uint32_t>(totalNumberOfWorkgroups.y), static_cast<uint32_t>(totalNumberOfWorkgroups.z));
    }

    kernel.setWorkDim(dim);

    // Send our indirect object data
    walkerArgs.localWorkSizes[0] = lws.x;
    walkerArgs.localWorkSizes[1] = lws.y;
    walkerArgs.localWorkSizes[2] = lws.z;

    dispatchWorkarounds(&commandStream, commandQueue, kernel, true);

    programWalker<WalkerType>(commandStream, kernel, commandQueue, dsh, ioh, ssh, dispatchInfo, walkerArgs);

    dispatchWorkarounds(&commandStream, commandQueue, kernel, false);
}

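// For blocked queues, allocates standalone DSH/IOH/SSH heaps sized for this dispatch;
// otherwise returns the command queue's existing indirect heaps.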
template <typename GfxFamily>
void HardwareInterface<GfxFamily>::obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo,
                                                       bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh) {
    if (blockedQueue) {
        size_t dshSize = 0;
        size_t colorCalcSize = 0;
        size_t sshSize = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(multiDispatchInfo);
        dshSize = HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(multiDispatchInfo);

        commandQueue.allocateHeapMemory(IndirectHeap::Type::DYNAMIC_STATE, dshSize, dsh);
        dsh->getSpace(colorCalcSize);

        commandQueue.allocateHeapMemory(IndirectHeap::Type::SURFACE_STATE, sshSize, ssh);
        commandQueue.allocateHeapMemory(IndirectHeap::Type::INDIRECT_OBJECT,
                                        HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(multiDispatchInfo), ioh);
    } else {
        dsh = &getIndirectHeap<GfxFamily, IndirectHeap::Type::DYNAMIC_STATE>(commandQueue, multiDispatchInfo);
        ioh = &getIndirectHeap<GfxFamily, IndirectHeap::Type::INDIRECT_OBJECT>(commandQueue, multiDispatchInfo);
        ssh = &getIndirectHeap<GfxFamily, IndirectHeap::Type::SURFACE_STATE>(commandQueue, multiDispatchInfo);
    }
}

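// Writes the confirmation trigger to the debug-pause allocation via a post-sync pipe control, then stalls
// the command stream on a semaphore wait until the host sets the expected wait condition.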
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchDebugPauseCommands(
    LinearStream *commandStream,
    CommandQueue &commandQueue,
    DebugPauseState confirmationTrigger,
    DebugPauseState waitCondition,
    const HardwareInfo &hwInfo) {

    if (!commandQueue.isSpecial()) {
        auto address = commandQueue.getGpgpuCommandStreamReceiver().getDebugPauseStateGPUAddress();
        {
            PipeControlArgs args;
            args.dcFlushEnable = MemorySynchronizationCommands<GfxFamily>::getDcFlushEnable(true, commandQueue.getDevice().getRootDeviceEnvironment());
            MemorySynchronizationCommands<GfxFamily>::addBarrierWithPostSyncOperation(
                *commandStream,
                PostSyncMode::ImmediateData,
                address,
                static_cast<uint64_t>(confirmationTrigger),
                commandQueue.getDevice().getRootDeviceEnvironment(),
                args);
        }
        {
            using COMPARE_OPERATION = typename GfxFamily::MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
            EncodeSemaphore<GfxFamily>::addMiSemaphoreWaitCommand(*commandStream,
                                                                  address,
                                                                  static_cast<uint32_t>(waitCondition),
                                                                  COMPARE_OPERATION::COMPARE_OPERATION_SAD_EQUAL_SDD, false, false, false);
        }
    }
}

} // namespace NEO