mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-03 06:49:52 +08:00
The default TG dispatch size can be changed to a better value based on the number of threads in the TG or the number of threads currently available on the GPU. The decision on what the TG dispatch size should be is based on implemented heuristics. Signed-off-by: Rafal Maziejuk <rafal.maziejuk@intel.com> Related-To: NEO-6989
106 lines
4.4 KiB
C++
106 lines
4.4 KiB
C++
/*
|
|
* Copyright (C) 2019-2022 Intel Corporation
|
|
*
|
|
* SPDX-License-Identifier: MIT
|
|
*
|
|
*/
|
|
|
|
#pragma once
|
|
#include "shared/source/helpers/engine_node_helper.h"
|
|
#include "shared/source/os_interface/os_context.h"
|
|
|
|
#include "opencl/source/command_queue/hardware_interface_base.inl"
|
|
|
|
namespace NEO {
|
|
|
|
template <typename GfxFamily>
|
|
inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
|
|
const size_t &offsetInterfaceDescriptorTable,
|
|
CommandQueue &commandQueue,
|
|
const MultiDispatchInfo &multiDispatchInfo,
|
|
size_t &totalInterfaceDescriptorTableSize,
|
|
IndirectHeap *dsh,
|
|
LinearStream *commandStream) {
|
|
|
|
size_t numDispatches = multiDispatchInfo.size();
|
|
totalInterfaceDescriptorTableSize *= numDispatches;
|
|
|
|
dsh->getSpace(totalInterfaceDescriptorTableSize);
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
|
|
LinearStream *commandStream,
|
|
CommandQueue &commandQueue,
|
|
Kernel &kernel,
|
|
const bool &enable) {
|
|
|
|
if (enable) {
|
|
PreemptionHelper::applyPreemptionWaCmdsBegin<GfxFamily>(commandStream, commandQueue.getDevice());
|
|
// Implement enabling special WA DisableLSQCROPERFforOCL if needed
|
|
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
|
|
} else {
|
|
// Implement disabling special WA DisableLSQCROPERFforOCL if needed
|
|
GpgpuWalkerHelper<GfxFamily>::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable);
|
|
PreemptionHelper::applyPreemptionWaCmdsEnd<GfxFamily>(commandStream, commandQueue.getDevice());
|
|
}
|
|
}
|
|
|
|
template <typename GfxFamily>
|
|
inline void HardwareInterface<GfxFamily>::programWalker(
|
|
LinearStream &commandStream,
|
|
Kernel &kernel,
|
|
CommandQueue &commandQueue,
|
|
IndirectHeap &dsh,
|
|
IndirectHeap &ioh,
|
|
IndirectHeap &ssh,
|
|
const DispatchInfo &dispatchInfo,
|
|
HardwareInterfaceWalkerArgs &walkerArgs) {
|
|
|
|
auto walkerCmdBuf = allocateWalkerSpace(commandStream, kernel);
|
|
WALKER_TYPE walkerCmd = GfxFamily::cmdInitGpgpuWalker;
|
|
uint32_t dim = dispatchInfo.getDim();
|
|
uint32_t simd = kernel.getKernelInfo().getMaxSimdSize();
|
|
|
|
size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
|
|
size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
|
|
size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
|
|
auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
|
|
|
|
if (walkerArgs.currentTimestampPacketNodes && commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) {
|
|
auto timestampPacketNode = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
|
|
GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacketNode, commandQueue.getDevice().getRootDeviceEnvironment());
|
|
}
|
|
|
|
auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
|
|
auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
|
|
|
|
HardwareCommandsHelper<GfxFamily>::sendIndirectState(
|
|
commandStream,
|
|
dsh,
|
|
ioh,
|
|
ssh,
|
|
kernel,
|
|
kernel.getKernelStartAddress(true, kernelUsesLocalIds, isCcsUsed, false),
|
|
simd,
|
|
walkerArgs.localWorkSizes,
|
|
threadGroupCount,
|
|
walkerArgs.offsetInterfaceDescriptorTable,
|
|
walkerArgs.interfaceDescriptorIndex,
|
|
walkerArgs.preemptionMode,
|
|
&walkerCmd,
|
|
nullptr,
|
|
true,
|
|
commandQueue.getDevice());
|
|
|
|
GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernel.getKernelInfo().kernelDescriptor,
|
|
globalOffsets, startWorkGroups,
|
|
numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
|
|
false, false, 0u);
|
|
|
|
EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), false};
|
|
EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), walkerCmd, encodeWalkerArgs);
|
|
*walkerCmdBuf = walkerCmd;
|
|
}
|
|
} // namespace NEO
|