compute-runtime/runtime/helpers/kernel_commands.inl

425 lines
19 KiB
C++

/*
* Copyright (C) 2017-2019 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#include "core/helpers/basic_math.h"
#include "core/helpers/ptr_math.h"
#include "runtime/command_queue/local_id_gen.h"
#include "runtime/command_stream/csr_definitions.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/helpers/address_patch.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/helpers/string.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/kernel/kernel.h"
#include "runtime/os_interface/debug_settings_manager.h"
#include <cstring>
namespace NEO {
// Generic implementation: unconditionally reports false.
template <typename GfxFamily>
bool KernelCommandsHelper<GfxFamily>::isPipeControlWArequired() {
    return false;
}
// Converts an SLM size into the encoded value programmed into the interface descriptor.
// A request of 0 encodes as 0; any other request is raised to at least 1024, rounded up to
// a power of two and then reduced to a small index (debug builds break if it exceeds 7).
// NOTE(review): presumably Math::getMinLsbSet yields the index of the lowest set bit, so
// 1024 would encode as 1 - confirm against the Math helpers.
template <typename GfxFamily>
uint32_t KernelCommandsHelper<GfxFamily>::computeSlmValues(uint32_t valueIn) {
    const uint32_t clamped = std::max(valueIn, 1024u);
    const uint32_t roundedUp = Math::nextPowerOfTwo(clamped);
    uint32_t encoded = Math::getMinLsbSet(roundedUp);
    encoded -= 9;
    DEBUG_BREAK_IF(encoded > 7);
    return (valueIn != 0) ? encoded : 0u;
}
// Computes how much dynamic state heap space a single kernel needs: the aligned sampler
// state array, the border color data that precedes it (padded for the worst-case
// alignment of the indirect state pointer), plus any family-specific additional size.
// The result is aligned to alignInterfaceDescriptorData.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH(const Kernel &kernel) {
    using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;

    const auto &kernelInfo = kernel.getKernelInfo();
    const auto &samplers = kernelInfo.patchInfo.samplerStateArray;

    auto numSamplers = samplers
                           ? samplers->Count
                           : 0;
    auto requiredSize = numSamplers
                            ? alignUp(numSamplers * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE)
                            : 0;

    // Border color data occupies the range [BorderColorOffset, Offset) of the kernel's DSH.
    auto borderColorBytes = samplers
                                ? samplers->Offset - samplers->BorderColorOffset
                                : 0;
    borderColorBytes = alignUp(borderColorBytes + alignIndirectStatePointer - 1, alignIndirectStatePointer);

    requiredSize += borderColorBytes + additionalSizeRequiredDsh();

    // Unless this is a VME workload, the estimate must cover the kernel's own DSH size.
    DEBUG_BREAK_IF(requiredSize < kernel.getDynamicStateHeapSize() && !kernelInfo.isVmeWorkload);

    return alignUp(requiredSize, alignInterfaceDescriptorData);
}
// Computes the indirect object heap space a single kernel needs for the given total
// local work size: cross-thread data plus the total per-thread data, aligned to the
// walker's indirect data start address alignment.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel, size_t localWorkSize) {
    using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;

    const auto &kernelInfo = kernel.getKernelInfo();
    auto payload = kernelInfo.patchInfo.threadPayload;
    DEBUG_BREAK_IF(nullptr == payload);

    auto localIdChannels = PerThreadDataHelper::getNumLocalIdChannels(*payload);
    return alignUp((kernel.getCrossThreadDataSize() +
                    getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), localIdChannels, localWorkSize)),
                   WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
}
// Computes the surface state heap space a single kernel needs. A kernel with a non-empty
// SSH gets extra room equal to the surface state pointer alignment; a kernel without an
// SSH needs nothing.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredSSH(const Kernel &kernel) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;

    auto requiredSize = kernel.getSurfaceStateHeapSize();
    if (requiredSize) {
        requiredSize += BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE;
    }
    return requiredSize;
}
template <typename SizeGetterT, typename... ArgsT>
size_t getSizeRequired(const MultiDispatchInfo &multiDispatchInfo, SizeGetterT &&getSize, ArgsT... args) {
size_t totalSize = 0;
auto it = multiDispatchInfo.begin();
for (auto e = multiDispatchInfo.end(); it != e; ++it) {
totalSize = alignUp(totalSize, MemoryConstants::cacheLineSize);
totalSize += getSize(*it, std::forward<ArgsT>(args)...);
}
totalSize = alignUp(totalSize, MemoryConstants::pageSize);
return totalSize;
}
// Total dynamic state heap size required by all dispatches in multiDispatchInfo.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(const MultiDispatchInfo &multiDispatchInfo) {
    auto dshSizeOfDispatch = [](const DispatchInfo &dispatchInfo) {
        return getSizeRequiredDSH(*dispatchInfo.getKernel());
    };
    return getSizeRequired(multiDispatchInfo, dshSizeOfDispatch);
}
// Total indirect object heap size required by all dispatches in multiDispatchInfo.
// The per-dispatch size is based on the total element count of its local workgroup size.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(const MultiDispatchInfo &multiDispatchInfo) {
    auto iohSizeOfDispatch = [](const DispatchInfo &dispatchInfo) {
        return getSizeRequiredIOH(*dispatchInfo.getKernel(),
                                  Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize()));
    };
    return getSizeRequired(multiDispatchInfo, iohSizeOfDispatch);
}
// Total surface state heap size required by all dispatches in multiDispatchInfo.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(const MultiDispatchInfo &multiDispatchInfo) {
    auto sshSizeOfDispatch = [](const DispatchInfo &dispatchInfo) {
        return getSizeRequiredSSH(*dispatchInfo.getKernel());
    };
    return getSizeRequired(multiDispatchInfo, sshSizeOfDispatch);
}
// Fills one INTERFACE_DESCRIPTOR_DATA entry for a kernel dispatch.
// The destination is obtained from getInterfaceDescriptor() using the heap, the offset and
// the optional inline descriptor. The entry is reset to the family's default and then
// programmed with the kernel start pointer, thread count, binding table and sampler
// pointers, sampler count, SLM size, barrier enable, preemption settings and binding table
// prefetch count.
// Returns offsetInterfaceDescriptor converted to size_t.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
    const IndirectHeap &indirectHeap,
    uint64_t offsetInterfaceDescriptor,
    uint64_t kernelStartOffset,
    size_t sizeCrossThreadData,
    size_t sizePerThreadData,
    size_t bindingTablePointer,
    size_t offsetSamplerState,
    uint32_t numSamplers,
    uint32_t threadsPerThreadGroup,
    uint32_t sizeSlm,
    uint32_t bindingTablePrefetchSize,
    bool barrierEnable,
    PreemptionMode preemptionMode,
    INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor) {
    using SamplerCountField = typename INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT;
    using SlmSizeField = typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;

    // Locate the descriptor to program and start from the default-initialized command.
    auto idd = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
    *idd = GfxFamily::cmdInitInterfaceDescriptorData;

    // Kernel start pointer is split into its high and low 32-bit halves.
    idd->setKernelStartPointerHigh(kernelStartOffset >> 32);
    idd->setKernelStartPointer(static_cast<uint32_t>(kernelStartOffset));

    // The number of threads in the thread group is derived from the local work size by the caller.
    idd->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
    idd->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);

    setAdditionalInfo(idd, sizeCrossThreadData, sizePerThreadData);

    idd->setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
    idd->setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));

    // Sampler count is programmed in groups of four, rounded up.
    DEBUG_BREAK_IF(numSamplers > 16);
    idd->setSamplerCount(static_cast<SamplerCountField>((numSamplers + 3) / 4));

    idd->setSharedLocalMemorySize(static_cast<SlmSizeField>(computeSlmValues(sizeSlm)));
    idd->setBarrierEnable(barrierEnable);

    PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(idd, preemptionMode);

    idd->setBindingTableEntryCount(bindingTablePrefetchSize);

    return static_cast<size_t>(offsetInterfaceDescriptor);
}
// Copies a kernel's surface state heap image into dstHeap and, when the copy does not land
// at the very start of the heap, rebases every binding table entry so that it points at
// the relocated surface states.
//
// srcKernelSsh / srcKernelSshSize describe the kernel's SSH image, the binding table lives
// at offsetOfBindingTable within it and holds numberOfBindingTableStates entries.
//
// Returns 0 when the kernel has no binding table entries. Otherwise the returned binding
// table pointer is relative to the given heap (which is assumed to be the surface state
// base address), as required by the INTERFACE_DESCRIPTOR_DATA.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo,
                                                                         const void *srcKernelSsh, size_t srcKernelSshSize,
                                                                         size_t numberOfBindingTableStates, size_t offsetOfBindingTable) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;

    if ((srcKernelInfo.patchInfo.bindingTableState == nullptr) || (srcKernelInfo.patchInfo.bindingTableState->Count == 0)) {
        // according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch
        return 0;
    }

    size_t sshSize = srcKernelSshSize;
    DEBUG_BREAK_IF(srcKernelSsh == nullptr);
    auto srcSurfaceState = srcKernelSsh;

    // Align the heap and allocate space for new ssh data
    dstHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
    auto dstSurfaceState = dstHeap.getSpace(sshSize);

    // Compiler sends BTI table that is already populated with surface state pointers relative to local SSH.
    // We may need to patch these pointers so that they are relative to surface state base address
    if (dstSurfaceState == dstHeap.getCpuBase()) {
        // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address)
        // we need to simply copy the ssh (including BTIs from compiler)
        memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize);
        return offsetOfBindingTable;
    }

    // We can copy-over the surface states, but BTIs will need to be patched
    memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable);

    uint32_t surfaceStatesOffset = static_cast<uint32_t>(ptrDiff(dstSurfaceState, dstHeap.getCpuBase()));

    // march over BTIs and offset the pointers based on surface state base address
    auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(dstSurfaceState, offsetOfBindingTable));
    DEBUG_BREAK_IF(reinterpret_cast<uintptr_t>(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0);
    auto *srcBtiTableBase = reinterpret_cast<const BINDING_TABLE_STATE *>(ptrOffset(srcSurfaceState, offsetOfBindingTable));
    BINDING_TABLE_STATE bti = GfxFamily::cmdInitBindingTableState;
    for (uint32_t i = 0, e = static_cast<uint32_t>(numberOfBindingTableStates); i != e; ++i) {
        uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer();
        uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset;
        bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits
        dstBtiTableBase[i] = bti;
        // Check against the alignment value itself; sizeof() of the constant would only
        // yield the byte size of its type and test the wrong modulus.
        DEBUG_BREAK_IF(bti.getRawData(0) % BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE != 0);
    }

    return ptrDiff(dstBtiTableBase, dstHeap.getCpuBase());
}
// Programs the indirect state for one kernel dispatch:
//  - pushes the kernel's binding table and surface states to the SSH,
//  - copies border color and sampler state into the DSH (when the kernel has samplers),
//  - sends cross-thread data and programs per-thread data using the IOH,
//  - fills the interface descriptor selected by interfaceDescriptorIndex,
//  - emits a media state flush into commandStream,
//  - patches walkerCmd with the indirect data start address, length and descriptor offset.
// Returns the cross-thread data offset reported by sendCrossThreadData.
// NOTE(review): walkerCmd and patchInfo.executionEnvironment are dereferenced without a
// runtime null check (only DEBUG_BREAK_IF guards the latter) - callers must supply them.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
LinearStream &commandStream,
IndirectHeap &dsh,
IndirectHeap &ioh,
IndirectHeap &ssh,
Kernel &kernel,
uint32_t simd,
const size_t localWorkSize[3],
const uint64_t offsetInterfaceDescriptorTable,
uint32_t &interfaceDescriptorIndex,
PreemptionMode preemptionMode,
WALKER_TYPE<GfxFamily> *walkerCmd,
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
bool localIdsGenerationByRuntime) {
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
// Only SIMD8, SIMD16 and SIMD32 are expected here.
DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32);
auto kernelUsesLocalIds = KernelCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
auto inlineDataProgrammingRequired = KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
// Determine the kernel start offset from the kernel's graphics allocation.
// NOTE(review): presumably setKernelStartOffset writes kernelStartOffset by reference - confirm.
uint64_t kernelStartOffset = 0llu;
const auto &kernelInfo = kernel.getKernelInfo();
auto kernelAllocation = kernelInfo.getGraphicsAllocation();
DEBUG_BREAK_IF(!kernelAllocation);
setKernelStartOffset(kernelStartOffset, kernelAllocation, kernelInfo, localIdsGenerationByRuntime, kernelUsesLocalIds, kernel);
const auto &patchInfo = kernelInfo.patchInfo;
// Push the binding table and surface states to the SSH; the result is programmed into the interface descriptor below.
auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel);
// Copy our sampler state if it exists
size_t samplerStateOffset = 0;
uint32_t samplerCount = 0;
if (patchInfo.samplerStateArray) {
size_t borderColorOffset = 0;
samplerCount = patchInfo.samplerStateArray->Count;
auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount;
// Border color data occupies [BorderColorOffset, Offset) in the kernel's dynamic state heap.
auto borderColorSize = patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset;
dsh.align(alignIndirectStatePointer);
borderColorOffset = dsh.getUsed();
auto borderColor = dsh.getSpace(borderColorSize);
memcpy_s(borderColor, borderColorSize,
ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->BorderColorOffset),
borderColorSize);
dsh.align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
samplerStateOffset = dsh.getUsed();
auto samplerState = dsh.getSpace(sizeSamplerState);
memcpy_s(samplerState, sizeSamplerState,
ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->Offset),
sizeSamplerState);
// Point every copied sampler at the border color data copied into the DSH above.
auto pSmplr = reinterpret_cast<SAMPLER_STATE *>(samplerState);
for (uint32_t i = 0; i < samplerCount; i++) {
pSmplr->setIndirectStatePointer((uint32_t)borderColorOffset);
pSmplr++;
}
}
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
DEBUG_BREAK_IF(nullptr == threadPayload);
// Total work items per work group and the resulting number of hardware threads.
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();
// Cross-thread data is sent first; its offset becomes the walker's indirect data start address.
size_t offsetCrossThreadData = KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
ioh, kernel, inlineDataProgrammingRequired,
walkerCmd, sizeCrossThreadData);
size_t sizePerThreadDataTotal = 0;
size_t sizePerThreadData = 0;
// NOTE(review): sizePerThreadData and sizePerThreadDataTotal are presumably filled in by this call - confirm.
KernelCommandsHelper<GfxFamily>::programPerThreadData(
sizePerThreadData,
localIdsGenerationByRuntime,
ioh,
simd,
numChannels,
localWorkSize,
kernel,
sizePerThreadDataTotal,
localWorkItems);
// Byte offset of the descriptor selected by interfaceDescriptorIndex within the descriptor table.
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr);
// Binding table prefetch count is capped at 31 and may be disabled entirely for this kernel.
auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
if (resetBindingTablePrefetch(kernel)) {
bindingTablePrefetchSize = 0;
}
KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
dsh,
offsetInterfaceDescriptor,
kernelStartOffset,
sizeCrossThreadData,
sizePerThreadData,
dstBindingTablePointer,
samplerStateOffset,
samplerCount,
threadsPerThreadGroup,
kernel.slmTotalSize,
bindingTablePrefetchSize,
!!patchInfo.executionEnvironment->HasBarriers,
preemptionMode,
inlineInterfaceDescriptor);
// When enabled by the debug flag, record patch info linking the kernel start offset to the descriptor in the DSH.
if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
PatchInfoData patchInfoData(kernelStartOffset, 0, PatchInfoAllocationType::InstructionHeap, dsh.getGraphicsAllocation()->getGpuAddress(), offsetInterfaceDescriptor, PatchInfoAllocationType::DynamicStateHeap);
kernel.getPatchInfoDataList().push_back(patchInfoData);
}
// Program media state flush to set interface descriptor offset
sendMediaStateFlush(
commandStream,
interfaceDescriptorIndex);
// The cross-thread data offset is expected to be 64-byte aligned.
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);
// Indirect data length covers cross-thread plus total per-thread data, rounded up to the walker's alignment.
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
walkerCmd->setIndirectDataLength(indirectDataLength);
return offsetCrossThreadData;
}
// Recomputes the per-thread data sizes for the given SIMD width and local id channel count.
// sizePerThreadData receives the per-thread local id size; sizePerThreadDataTotal receives
// threads-per-work-group times the per-thread local id size, where the latter is never
// taken smaller than one GRF.
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::updatePerThreadDataTotal(
    size_t &sizePerThreadData,
    uint32_t &simd,
    uint32_t &numChannels,
    size_t &sizePerThreadDataTotal,
    size_t &localWorkItems) {
    sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);

    const auto bytesPerThread = std::max(PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels), sizeof(GRF));
    sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * bytesPerThread;

    // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
    DEBUG_BREAK_IF(sizePerThreadDataTotal == 0);
}
// Appends an MI_SEMAPHORE_WAIT command to commandStream, configured in polling mode with
// the SAD_NOT_EQUAL_SDD compare operation, compareData as the semaphore data dword and
// compareAddress as the semaphore graphics address.
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData) {
    using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
    using CompareOperation = typename MI_SEMAPHORE_WAIT::COMPARE_OPERATION;
    using WaitMode = typename MI_SEMAPHORE_WAIT::WAIT_MODE;

    auto semaphoreWait = commandStream.getSpaceForCmd<MI_SEMAPHORE_WAIT>();
    *semaphoreWait = GfxFamily::cmdInitMiSemaphoreWait;

    semaphoreWait->setCompareOperation(CompareOperation::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD);
    semaphoreWait->setSemaphoreDataDword(compareData);
    semaphoreWait->setSemaphoreGraphicsAddress(compareAddress);
    semaphoreWait->setWaitMode(WaitMode::WAIT_MODE_POLLING_MODE);
}
// Appends an MI_ATOMIC command to commandStream with the given opcode and data size,
// targeting writeAddress (split into its low and high 32-bit halves).
// Returns a pointer to the command inside the stream so the caller can adjust it further.
template <typename GfxFamily>
typename GfxFamily::MI_ATOMIC *KernelCommandsHelper<GfxFamily>::programMiAtomic(LinearStream &commandStream, uint64_t writeAddress,
                                                                                typename MI_ATOMIC::ATOMIC_OPCODES opcode,
                                                                                typename MI_ATOMIC::DATA_SIZE dataSize) {
    const auto addressLow = static_cast<uint32_t>(writeAddress & 0xFFFFFFFFULL);
    const auto addressHigh = static_cast<uint32_t>(writeAddress >> 32);

    auto atomicCmd = commandStream.getSpaceForCmd<MI_ATOMIC>();
    *atomicCmd = GfxFamily::cmdInitAtomic;
    atomicCmd->setAtomicOpcode(opcode);
    atomicCmd->setDataSize(dataSize);
    atomicCmd->setMemoryAddress(addressLow);
    atomicCmd->setMemoryAddressHigh(addressHigh);
    return atomicCmd;
}
// Generic implementation: unconditionally reports true.
template <typename GfxFamily>
bool KernelCommandsHelper<GfxFamily>::doBindingTablePrefetch() { return true; }
// Inline data programming is required only when the EnablePassInlineData debug flag is
// set and the kernel's thread payload has PassInlineData set.
template <typename GfxFamily>
bool KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
    if (!DebugManager.flags.EnablePassInlineData.get()) {
        return false;
    }
    return kernel.getKernelInfo().patchInfo.threadPayload->PassInlineData;
}
// A kernel uses local ids when its thread payload marks the X, Y or Z local id as present.
template <typename GfxFamily>
bool KernelCommandsHelper<GfxFamily>::kernelUsesLocalIds(const Kernel &kernel) {
    const auto &payload = *kernel.getKernelInfo().patchInfo.threadPayload;
    if (payload.LocalIDXPresent) {
        return true;
    }
    if (payload.LocalIDYPresent) {
        return true;
    }
    return payload.LocalIDZPresent ? true : false;
}
} // namespace NEO