compute-runtime/opencl/source/command_queue/hardware_interface_xehp_and...

185 lines
9.6 KiB
C++

/*
* Copyright (C) 2021-2023 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device/device.h"
#include "shared/source/helpers/definitions/command_encoder_args.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/utilities/tag_allocator.h"
#include "opencl/source/command_queue/hardware_interface_base.inl"
namespace NEO {
// getDefaultDshSpace for this hardware interface variant.
//
// The body is empty: none of the arguments are read and none of the
// reference/pointer arguments (totalInterfaceDescriptorTableSize, dsh,
// commandStream) are modified by this definition.
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(const size_t &offsetInterfaceDescriptorTable,
                                                             CommandQueue &commandQueue,
                                                             const MultiDispatchInfo &multiDispatchInfo,
                                                             size_t &totalInterfaceDescriptorTableSize,
                                                             IndirectHeap *dsh,
                                                             LinearStream *commandStream) {
    // Empty body: no work is performed here.
}
// dispatchWorkarounds for this hardware interface variant.
//
// The body is empty: nothing is written to commandStream and the `enable`
// flag is not inspected by this definition.
template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(LinearStream *commandStream,
                                                              CommandQueue &commandQueue,
                                                              Kernel &kernel,
                                                              const bool &enable) {
    // Empty body: no commands are emitted here.
}
// Builds a WalkerType command for one kernel dispatch and emits it into
// commandStream.
//
// Steps, in the order they are performed:
//  1. Start from GfxFamily's initial walker and, when a timestamp packet node
//     exists at walkerArgs.currentDispatchIndex, pass it to
//     setupTimestampPacket.
//  2. Outside heapless mode, program a memory prefetch for the kernel
//     allocation when one is present.
//  3. Fill thread data (offsets, work-group start/count, local sizes) and send
//     indirect state using the dsh/ioh/ssh heaps.
//  4. Encode the additional walker fields.
//  5. Emit the walker: through ImplicitScalingDispatch when implicit scaling is
//     enabled for the CSR's device bitfield, otherwise by copying it into
//     space reserved in commandStream.
//
// walkerArgs.startOfWorkgroups and walkerArgs.numberOfWorkgroups are
// dereferenced unconditionally, so both must be non-null.
template <typename GfxFamily>
template <typename WalkerType>
inline void HardwareInterface<GfxFamily>::programWalker(
    LinearStream &commandStream,
    Kernel &kernel,
    CommandQueue &commandQueue,
    IndirectHeap &dsh,
    IndirectHeap &ioh,
    IndirectHeap &ssh,
    const DispatchInfo &dispatchInfo,
    HardwareInterfaceWalkerArgs &walkerArgs) {
    using InterfaceDescriptorType = typename WalkerType::InterfaceDescriptorType;

    WalkerType walker = GfxFamily::template getInitGpuWalker<WalkerType>();

    auto &kernelInfo = kernel.getKernelInfo();
    auto &descriptor = kernelInfo.kernelDescriptor;
    auto &attributes = descriptor.kernelAttributes;

    uint32_t workDim = dispatchInfo.getDim();
    uint32_t simdSize = kernelInfo.getMaxSimdSize();
    auto localIdChannels = attributes.numLocalIdChannels;

    // Flatten the three-component vectors into plain arrays for the walker helper.
    const auto &offset = dispatchInfo.getOffset();
    const auto &groupStart = *walkerArgs.startOfWorkgroups;
    const auto &groupCount = *walkerArgs.numberOfWorkgroups;
    size_t globalOffsets[3] = {offset.x, offset.y, offset.z};
    size_t startWorkGroups[3] = {groupStart.x, groupStart.y, groupStart.z};
    size_t numWorkGroups[3] = {groupCount.x, groupCount.y, groupCount.z};
    auto totalThreadGroups = static_cast<uint32_t>(groupCount.x * groupCount.y * groupCount.z);

    // The runtime-generation query is only made when the kernel uses local ids;
    // otherwise walkOrder keeps its initial value of 0.
    uint32_t walkOrder = 0u;
    auto usesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
    bool runtimeGeneratesLocalIds = false;
    if (usesLocalIds) {
        std::array<uint8_t, 3> kernelWalkOrder{{attributes.workgroupWalkOrder[0],
                                                attributes.workgroupWalkOrder[1],
                                                attributes.workgroupWalkOrder[2]}};
        runtimeGeneratesLocalIds = EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
            localIdChannels,
            walkerArgs.localWorkSizes,
            kernelWalkOrder,
            attributes.flags.requiresWorkgroupWalkOrder,
            walkOrder,
            simdSize);
    }
    bool needsInlineData = EncodeDispatchKernel<GfxFamily>::inlineDataProgrammingRequired(descriptor);

    auto &csr = commandQueue.getGpgpuCommandStreamReceiver();
    auto &rootDeviceEnv = commandQueue.getDevice().getRootDeviceEnvironment();

    // Pick the timestamp packet node for this dispatch, if the container has one.
    TagNodeBase *tspNode = nullptr;
    if (auto *tspContainer = walkerArgs.currentTimestampPacketNodes) {
        const auto &tspNodes = tspContainer->peekNodes();
        if (tspNodes.size() > walkerArgs.currentDispatchIndex) {
            tspNode = tspNodes[walkerArgs.currentDispatchIndex];
        }
    }
    if (tspNode != nullptr) {
        GpgpuWalkerHelper<GfxFamily>::template setupTimestampPacket<WalkerType>(&commandStream, &walker, tspNode, rootDeviceEnv);
    }

    auto ccsInUse = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
    const auto &hardwareInfo = commandQueue.getDevice().getHardwareInfo();

    constexpr bool heapless = GfxFamily::template isHeaplessMode<WalkerType>();

    if constexpr (!heapless) {
        auto kernelIsa = kernelInfo.getGraphicsAllocation();
        if (kernelIsa) {
            EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelIsa, kernelInfo.heapInfo.kernelHeapSize, 0, rootDeviceEnv);
        }
    }

    GpgpuWalkerHelper<GfxFamily>::template setGpgpuWalkerThreadData<WalkerType>(
        &walker, descriptor, globalOffsets, startWorkGroups, numWorkGroups,
        walkerArgs.localWorkSizes, simdSize, workDim,
        runtimeGeneratesLocalIds, needsInlineData, walkOrder);

    auto idd = &walker.getInterfaceDescriptor();

    // The scratch address is only looked up in heapless mode; it stays 0 otherwise.
    uint64_t scratchGpuVa = 0;
    if constexpr (heapless) {
        if (auto scratch = csr.getScratchAllocation()) {
            scratchGpuVa = scratch->getGpuAddress();
        }
    }

    auto kernelStartAddress = kernel.getKernelStartAddress(runtimeGeneratesLocalIds, usesLocalIds, ccsInUse, heapless);
    HardwareCommandsHelper<GfxFamily>::template sendIndirectState<WalkerType, InterfaceDescriptorType>(
        commandStream,
        dsh,
        ioh,
        ssh,
        kernel,
        kernelStartAddress,
        simdSize,
        walkerArgs.localWorkSizes,
        totalThreadGroups,
        walkerArgs.offsetInterfaceDescriptorTable,
        walkerArgs.interfaceDescriptorIndex,
        walkerArgs.preemptionMode,
        &walker,
        idd,
        runtimeGeneratesLocalIds,
        scratchGpuVa,
        commandQueue.getDevice());

    // Built-in kernels are asked about their destination allocation; all other
    // kernels are asked about their arguments.
    const bool touchesSystemMemory = kernel.isBuiltIn ? kernel.getDestinationAllocationInSystemMemory()
                                                      : kernel.isAnyKernelArgumentUsingSystemMemory();
    bool systemFenceRequired = touchesSystemMemory && (walkerArgs.event != nullptr);

    auto frontEndThreadLimit = commandQueue.getDevice().getDeviceInfo().maxFrontEndThreads;
    EncodeWalkerArgs additionalArgs{kernel.getExecutionType(), systemFenceRequired, descriptor, NEO::RequiredDispatchWalkOrder::None, 0, frontEndThreadLimit};
    EncodeDispatchKernel<GfxFamily>::template encodeAdditionalWalkerFields<WalkerType>(rootDeviceEnv, walker, additionalArgs);

    auto deviceBitfield = csr.getOsContext().getDeviceBitfield();
    auto implicitScaling = ImplicitScalingHelper::isImplicitScalingEnabled(deviceBitfield, true);

    // Optional debug print controlled by the PrintTimestampPacketUsage flag.
    if (tspNode && debugManager.flags.PrintTimestampPacketUsage.get() == 1) {
        auto gpuVa = tspNode->getGpuAddress();
        printf("\nPID:%u, TSP used for Walker: 0x%" PRIX64 ", cmdBuffer pos: 0x%" PRIX64, SysCalls::getProcessId(), gpuVa, commandStream.getCurrentGpuAddressPosition());
    }

    if (!implicitScaling) {
        // Single-partition path: copy the prepared walker straight into the stream.
        auto walkerInStream = commandStream.getSpaceForCmd<WalkerType>();
        *walkerInStream = walker;
        return;
    }

    const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
    uint32_t partitions = 0u;
    ImplicitScalingDispatch<GfxFamily>::template dispatchCommands<WalkerType>(
        commandStream,
        walker,
        nullptr,
        deviceBitfield,
        kernel.usesImages() ? RequiredPartitionDim::X : RequiredPartitionDim::None,
        partitions,
        false,
        false,
        csr.getDcFlushSupport(),
        kernel.isSingleSubdevicePreferred(),
        workPartitionAllocationGpuVa,
        hardwareInfo);

    if (csr.isStaticWorkPartitioningEnabled()) {
        csr.setActivePartitions(std::max(csr.getActivePartitions(), partitions));
    }

    // Record on the timestamp node the partition count reported by dispatchCommands.
    if (tspNode) {
        tspNode->setPacketsUsed(partitions);
    }
}
} // namespace NEO