Files
compute-runtime/opencl/source/command_queue/hardware_interface_bdw_and_later.inl
Rafal Maziejuk ed0c36117e Apply heuristics when setting TG dispatch size on XE_HPC_CORE
The default TG dispatch size can be changed
to a better value based on number of threads in TG or
currently available amount of threads on GPU.
Decision on what TG dispatch size should be are based on
implemented heuristics.

Signed-off-by: Rafal Maziejuk <rafal.maziejuk@intel.com>
Related-To: NEO-6989
2022-08-08 16:43:10 +02:00

106 lines
4.4 KiB
C++

/*
* Copyright (C) 2019-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/os_interface/os_context.h"
#include "opencl/source/command_queue/hardware_interface_base.inl"
namespace NEO {
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
const size_t &offsetInterfaceDescriptorTable,
CommandQueue &commandQueue,
const MultiDispatchInfo &multiDispatchInfo,
size_t &totalInterfaceDescriptorTableSize,
IndirectHeap *dsh,
LinearStream *commandStream) {
size_t numDispatches = multiDispatchInfo.size();
totalInterfaceDescriptorTableSize *= numDispatches;
dsh->getSpace(totalInterfaceDescriptorTableSize);
}
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
LinearStream *commandStream,
CommandQueue &commandQueue,
Kernel &kernel,
const bool &enable) {
if (enable) {
PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
} else {
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
}
}
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::programWalker(
LinearStream &commandStream,
Kernel &kernel,
CommandQueue &commandQueue,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
const DispatchInfo &dispatchInfo,
HardwareInterfaceWalkerArgs &walkerArgs) {
auto walkerCmdBuf = allocateWalkerSpace(commandStream, kernel);
WALKER_TYPE walkerCmd = GfxFamily::cmdInitGpgpuWalker;
uint32_t dim = dispatchInfo.getDim();
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
if (walkerArgs.currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
auto timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacketNode, commandQueue.getDevice().getRootDeviceEnvironment());
}
auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
commandStream,
dsh,
ioh,
ssh,
kernel,
kernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
simd,
walkerArgs.localWorkSizes,
threadGroupCount,
walkerArgs.offsetInterfaceDescriptorTable,
walkerArgs.interfaceDescriptorIndex,
walkerArgs.preemptionMode,
&walkerCmd,
nullptr,
true,
commandQueue.getDevice());
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernel.getKernelInfo().kernelDescriptor,
globalOffsets, startWorkGroups,
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
false, false, 0u);
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), false};
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), walkerCmd, encodeWalkerArgs);
*walkerCmdBuf = walkerCmd;
}
} // namespace NEO