/*
 * Copyright (c) 2017, Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
 * OTHER DEALINGS IN THE SOFTWARE.
 */
|
|
|
|
|
|
|
|
#include "runtime/command_queue/local_id_gen.h"
|
|
|
|
#include "runtime/helpers/aligned_memory.h"
|
|
|
|
#include "runtime/helpers/basic_math.h"
|
|
|
|
#include "runtime/helpers/dispatch_info.h"
|
|
|
|
#include "runtime/helpers/ptr_math.h"
|
|
|
|
#include "runtime/helpers/string.h"
|
|
|
|
#include "runtime/indirect_heap/indirect_heap.h"
|
|
|
|
#include "runtime/kernel/kernel.h"
|
|
|
|
#include "runtime/os_interface/debug_settings_manager.h"
|
|
|
|
#include <cstring>
|
|
|
|
|
|
|
|
namespace OCLRT {
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
uint32_t KernelCommandsHelper<GfxFamily>::computeSlmValues(uint32_t valueIn) {
|
|
|
|
auto value = std::max(valueIn, 1024u);
|
|
|
|
value = Math::nextPowerOfTwo(value);
|
|
|
|
value = Math::getMinLsbSet(value);
|
|
|
|
value = value - 9;
|
|
|
|
DEBUG_BREAK_IF(value > 7);
|
|
|
|
return value * !!valueIn;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() {
|
|
|
|
return 2 * sizeof(typename GfxFamily::MEDIA_STATE_FLUSH) +
|
|
|
|
sizeof(typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH(
|
|
|
|
const Kernel &kernel) {
|
|
|
|
typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
|
|
|
|
typedef typename GfxFamily::SAMPLER_STATE SAMPLER_STATE;
|
|
|
|
const auto &patchInfo = kernel.getKernelInfo().patchInfo;
|
|
|
|
auto samplerCount = patchInfo.samplerStateArray
|
|
|
|
? patchInfo.samplerStateArray->Count
|
|
|
|
: 0;
|
|
|
|
auto totalSize = samplerCount
|
|
|
|
? alignUp(samplerCount * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE)
|
|
|
|
: 0;
|
|
|
|
|
|
|
|
auto borderColorSize = patchInfo.samplerStateArray
|
|
|
|
? patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset
|
|
|
|
: 0;
|
|
|
|
|
|
|
|
borderColorSize = alignUp(borderColorSize + alignIndirectStatePointer - 1, alignIndirectStatePointer);
|
|
|
|
|
|
|
|
totalSize += sizeof(INTERFACE_DESCRIPTOR_DATA) + borderColorSize;
|
|
|
|
|
|
|
|
DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo().isVmeWorkload));
|
|
|
|
|
|
|
|
return alignUp(totalSize, alignInterfaceDescriptorData);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH(
|
|
|
|
const Kernel &kernel,
|
|
|
|
size_t localWorkSize) {
|
|
|
|
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
|
|
|
|
|
|
|
|
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
|
|
|
|
DEBUG_BREAK_IF(nullptr == threadPayload);
|
|
|
|
|
|
|
|
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
|
|
|
|
return alignUp((kernel.getCrossThreadDataSize() +
|
|
|
|
getPerThreadDataSizeTotal(kernel.getKernelInfo().getMaxSimdSize(), numChannels, localWorkSize)),
|
|
|
|
GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIH(
|
|
|
|
const Kernel &kernel) {
|
|
|
|
typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
|
|
|
|
return kernel.getKernelHeapSize() + INTERFACE_DESCRIPTOR_DATA::KERNELSTARTPOINTER_ALIGN_SIZE;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredSSH(
|
|
|
|
const Kernel &kernel) {
|
|
|
|
typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE;
|
|
|
|
auto sizeSSH = kernel.getSurfaceStateHeapSize();
|
|
|
|
sizeSSH += sizeSSH ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0;
|
|
|
|
return sizeSSH;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename SizeGetterT, typename... ArgsT>
|
|
|
|
size_t getSizeRequired(const MultiDispatchInfo &multiDispatchInfo, SizeGetterT &&getSize, ArgsT... args) {
|
|
|
|
size_t totalSize = 0;
|
|
|
|
auto it = multiDispatchInfo.begin();
|
|
|
|
for (auto e = multiDispatchInfo.end(); it != e; ++it) {
|
|
|
|
totalSize = alignUp(totalSize, MemoryConstants::pageSize);
|
2018-02-13 22:43:33 +08:00
|
|
|
totalSize += getSize(*it, std::forward<ArgsT>(args)...);
|
2017-12-21 07:45:38 +08:00
|
|
|
}
|
|
|
|
return totalSize;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(
|
|
|
|
const MultiDispatchInfo &multiDispatchInfo) {
|
2018-02-13 22:43:33 +08:00
|
|
|
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredDSH(*dispatchInfo.getKernel()); });
|
2017-12-21 07:45:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIH(
|
|
|
|
const MultiDispatchInfo &multiDispatchInfo) {
|
2018-02-13 22:43:33 +08:00
|
|
|
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIH(*dispatchInfo.getKernel()); });
|
2017-12-21 07:45:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
|
2018-02-13 22:43:33 +08:00
|
|
|
const MultiDispatchInfo &multiDispatchInfo) {
|
|
|
|
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(*dispatchInfo.getKernel(), Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); });
|
2017-12-21 07:45:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(
|
|
|
|
const MultiDispatchInfo &multiDispatchInfo) {
|
2018-02-13 22:43:33 +08:00
|
|
|
return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel()); });
|
2017-12-21 07:45:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::copyKernelBinary(
|
|
|
|
IndirectHeap &indirectHeap,
|
|
|
|
const KernelInfo &kernelInfo) {
|
|
|
|
const auto alignKernelBinary = 64 * sizeof(uint8_t);
|
|
|
|
indirectHeap.align(alignKernelBinary);
|
|
|
|
|
|
|
|
auto kernelStartOffset = indirectHeap.getUsed();
|
|
|
|
|
|
|
|
auto pKernelHeap = kernelInfo.heapInfo.pKernelHeap;
|
|
|
|
auto kernelHeapSize = kernelInfo.heapInfo.pKernelHeader->KernelHeapSize;
|
|
|
|
|
|
|
|
auto pKernelDataDst = indirectHeap.getSpace(kernelHeapSize);
|
|
|
|
memcpy_s(pKernelDataDst, kernelHeapSize, pKernelHeap, kernelHeapSize);
|
|
|
|
|
|
|
|
return kernelStartOffset;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
|
|
|
const IndirectHeap &indirectHeap,
|
|
|
|
uint64_t offsetInterfaceDescriptor,
|
|
|
|
uint64_t kernelStartOffset,
|
|
|
|
size_t sizeCrossThreadData,
|
|
|
|
size_t sizePerThreadData,
|
|
|
|
size_t bindingTablePointer,
|
|
|
|
size_t offsetSamplerState,
|
|
|
|
uint32_t numSamplers,
|
|
|
|
uint32_t threadsPerThreadGroup,
|
|
|
|
uint32_t sizeSlm,
|
|
|
|
bool barrierEnable) {
|
|
|
|
typedef typename GfxFamily::SAMPLER_STATE SAMPLER_STATE;
|
|
|
|
typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
|
|
|
|
|
|
|
|
// Allocate some memory for the interface descriptor
|
|
|
|
auto pInterfaceDescriptor = (INTERFACE_DESCRIPTOR_DATA *)ptrOffset(indirectHeap.getBase(), (size_t)offsetInterfaceDescriptor);
|
|
|
|
*pInterfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;
|
|
|
|
|
|
|
|
// Program the kernel start pointer
|
|
|
|
pInterfaceDescriptor->setKernelStartPointerHigh(kernelStartOffset >> 32);
|
|
|
|
pInterfaceDescriptor->setKernelStartPointer((uint32_t)kernelStartOffset);
|
|
|
|
// # of threads in thread group should be based on LWS.
|
|
|
|
pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
|
|
|
|
|
2018-02-13 22:43:33 +08:00
|
|
|
DEBUG_BREAK_IF((sizeCrossThreadData % sizeof(GRF)) != 0);
|
2017-12-21 07:45:38 +08:00
|
|
|
auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / sizeof(GRF));
|
|
|
|
DEBUG_BREAK_IF(numGrfCrossThreadData == 0);
|
|
|
|
pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData);
|
|
|
|
pInterfaceDescriptor->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
|
|
|
|
|
2018-02-13 22:43:33 +08:00
|
|
|
DEBUG_BREAK_IF((sizePerThreadData % sizeof(GRF)) != 0);
|
2017-12-21 07:45:38 +08:00
|
|
|
auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(GRF));
|
|
|
|
|
|
|
|
// at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0
|
|
|
|
numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
|
|
|
|
pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
|
|
|
|
|
|
|
|
pInterfaceDescriptor->setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
|
|
|
|
|
|
|
|
pInterfaceDescriptor->setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));
|
|
|
|
|
|
|
|
DEBUG_BREAK_IF(numSamplers > 16);
|
|
|
|
auto samplerCountState = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT>((numSamplers + 3) / 4);
|
|
|
|
pInterfaceDescriptor->setSamplerCount(samplerCountState);
|
|
|
|
|
|
|
|
auto programmableIDSLMSize = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE>(computeSlmValues(sizeSlm));
|
|
|
|
|
|
|
|
pInterfaceDescriptor->setSharedLocalMemorySize(programmableIDSLMSize);
|
|
|
|
pInterfaceDescriptor->setBarrierEnable(barrierEnable);
|
|
|
|
|
|
|
|
return (size_t)offsetInterfaceDescriptor;
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
void KernelCommandsHelper<GfxFamily>::sendMediaStateFlush(
|
|
|
|
LinearStream &commandStream,
|
|
|
|
size_t offsetInterfaceDescriptorData) {
|
|
|
|
|
|
|
|
typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH;
|
|
|
|
auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH));
|
|
|
|
*pCmd = GfxFamily::cmdInitMediaStateFlush;
|
|
|
|
pCmd->setInterfaceDescriptorOffset((uint32_t)offsetInterfaceDescriptorData);
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
void KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
|
|
|
LinearStream &commandStream,
|
|
|
|
size_t offsetInterfaceDescriptorData,
|
|
|
|
size_t sizeInterfaceDescriptorData) {
|
|
|
|
{
|
|
|
|
typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH;
|
|
|
|
auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH));
|
|
|
|
*pCmd = GfxFamily::cmdInitMediaStateFlush;
|
|
|
|
}
|
|
|
|
|
|
|
|
{
|
|
|
|
typedef typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD MEDIA_INTERFACE_DESCRIPTOR_LOAD;
|
|
|
|
auto pCmd = (MEDIA_INTERFACE_DESCRIPTOR_LOAD *)commandStream.getSpace(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD));
|
|
|
|
*pCmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad;
|
|
|
|
pCmd->setInterfaceDescriptorDataStartAddress((uint32_t)offsetInterfaceDescriptorData);
|
|
|
|
pCmd->setInterfaceDescriptorTotalLength((uint32_t)sizeInterfaceDescriptorData);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
|
|
|
|
IndirectHeap &indirectHeap,
|
|
|
|
const Kernel &kernel) {
|
|
|
|
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
|
|
|
|
|
|
|
|
indirectHeap.align(GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
|
|
|
|
|
|
|
auto offsetCrossThreadData = indirectHeap.getUsed();
|
|
|
|
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
|
|
|
char *pDest = static_cast<char *>(indirectHeap.getSpace(sizeCrossThreadData));
|
|
|
|
memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(), sizeCrossThreadData);
|
|
|
|
|
|
|
|
return offsetCrossThreadData;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Returned binding table pointer is relative to given heap (which is assumed to be the Surface state base addess)
|
|
|
|
// as required by the INTERFACE_DESCRIPTOR_DATA.
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo,
|
|
|
|
const void *srcKernelSsh, size_t srcKernelSshSize) {
|
|
|
|
using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
|
|
|
|
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
|
|
|
|
using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE;
|
|
|
|
|
|
|
|
if ((srcKernelInfo.patchInfo.bindingTableState == nullptr) || (srcKernelInfo.patchInfo.bindingTableState->Count == 0)) {
|
|
|
|
// according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
size_t sshSize = srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize;
|
|
|
|
DEBUG_BREAK_IF(!((sshSize <= srcKernelSshSize) && (srcKernelSsh != nullptr)));
|
|
|
|
uint32_t localBtiOffset = srcKernelInfo.patchInfo.bindingTableState->Offset;
|
|
|
|
|
|
|
|
auto srcSurfaceState = srcKernelSsh;
|
|
|
|
// Align the heap and allocate space for new ssh data
|
|
|
|
dstHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
|
|
|
|
auto dstSurfaceState = dstHeap.getSpace(sshSize);
|
|
|
|
|
|
|
|
// Compiler sends BTI table that is already populated with surface state pointers relative to local SSH.
|
|
|
|
// We may need to patch these pointers so that they are relative to surface state base address
|
|
|
|
if (dstSurfaceState == dstHeap.getBase()) {
|
|
|
|
// nothing to patch, we're at the start of heap (which is assumed to be the surface state base address)
|
|
|
|
// we need to simply copy the ssh (including BTIs from compiler)
|
|
|
|
memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize);
|
|
|
|
return localBtiOffset;
|
|
|
|
}
|
|
|
|
|
|
|
|
// We can copy-over the surface states, but BTIs will need to be patched
|
|
|
|
memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, localBtiOffset);
|
|
|
|
|
|
|
|
uint32_t surfaceStatesOffset = static_cast<uint32_t>(ptrDiff(dstSurfaceState, dstHeap.getBase()));
|
|
|
|
|
|
|
|
// march over BTIs and offset the pointers based on surface state base address
|
|
|
|
auto *dstBtiTableBase = reinterpret_cast<BINDING_TABLE_STATE *>(ptrOffset(dstSurfaceState, localBtiOffset));
|
2018-02-13 22:43:33 +08:00
|
|
|
DEBUG_BREAK_IF(reinterpret_cast<uintptr_t>(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0);
|
2017-12-21 07:45:38 +08:00
|
|
|
auto *srcBtiTableBase = reinterpret_cast<const BINDING_TABLE_STATE *>(ptrOffset(srcSurfaceState, localBtiOffset));
|
|
|
|
BINDING_TABLE_STATE bti;
|
|
|
|
bti.init(); // init whole DWORD - i.e. not just the SurfaceStatePointer bits
|
|
|
|
for (uint32_t i = 0, e = srcKernelInfo.patchInfo.bindingTableState->Count; i != e; ++i) {
|
|
|
|
uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer();
|
|
|
|
uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset;
|
|
|
|
bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits
|
|
|
|
dstBtiTableBase[i] = bti;
|
2018-02-13 22:43:33 +08:00
|
|
|
DEBUG_BREAK_IF(bti.getRawData(0) % sizeof(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE) != 0);
|
2017-12-21 07:45:38 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
return ptrDiff(dstBtiTableBase, dstHeap.getBase());
|
|
|
|
}
|
|
|
|
|
|
|
|
template <typename GfxFamily>
|
|
|
|
size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
|
|
|
LinearStream &commandStream,
|
|
|
|
IndirectHeap &dsh,
|
|
|
|
IndirectHeap &ih,
|
2018-01-16 20:58:48 +08:00
|
|
|
size_t ihReservedBlockSize,
|
2017-12-21 07:45:38 +08:00
|
|
|
IndirectHeap &ioh,
|
|
|
|
IndirectHeap &ssh,
|
|
|
|
const Kernel &kernel,
|
|
|
|
uint32_t simd,
|
|
|
|
const size_t localWorkSize[3],
|
|
|
|
const uint64_t offsetInterfaceDescriptorTable,
|
|
|
|
const uint32_t interfaceDescriptorIndex) {
|
|
|
|
|
|
|
|
typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
|
|
|
|
typedef typename GfxFamily::RENDER_SURFACE_STATE RENDER_SURFACE_STATE;
|
|
|
|
typedef typename GfxFamily::SAMPLER_STATE SAMPLER_STATE;
|
|
|
|
|
|
|
|
DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32);
|
|
|
|
|
|
|
|
// Copy the kernel over to the ISH
|
|
|
|
auto kernelStartOffset = copyKernelBinary(ih, kernel.getKernelInfo());
|
|
|
|
|
|
|
|
const auto &kernelInfo = kernel.getKernelInfo();
|
|
|
|
const auto &patchInfo = kernelInfo.patchInfo;
|
|
|
|
|
|
|
|
auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel);
|
|
|
|
|
|
|
|
// Copy our sampler state if it exists
|
|
|
|
size_t samplerStateOffset = 0;
|
|
|
|
uint32_t samplerCount = 0;
|
|
|
|
if (patchInfo.samplerStateArray) {
|
|
|
|
size_t borderColorOffset = 0;
|
|
|
|
samplerCount = patchInfo.samplerStateArray->Count;
|
|
|
|
auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount;
|
|
|
|
auto borderColorSize = patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset;
|
|
|
|
|
|
|
|
dsh.align(alignIndirectStatePointer);
|
|
|
|
borderColorOffset = dsh.getUsed();
|
|
|
|
|
|
|
|
auto borderColor = dsh.getSpace(borderColorSize);
|
|
|
|
|
|
|
|
memcpy_s(borderColor, borderColorSize,
|
|
|
|
ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->BorderColorOffset),
|
|
|
|
borderColorSize);
|
|
|
|
|
|
|
|
dsh.align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE);
|
|
|
|
samplerStateOffset = dsh.getUsed();
|
|
|
|
|
|
|
|
auto samplerState = dsh.getSpace(sizeSamplerState);
|
|
|
|
|
|
|
|
memcpy_s(samplerState, sizeSamplerState,
|
|
|
|
ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->Offset),
|
|
|
|
sizeSamplerState);
|
|
|
|
|
|
|
|
auto pSmplr = (SAMPLER_STATE *)(samplerState);
|
|
|
|
for (uint32_t i = 0; i < samplerCount; i++) {
|
|
|
|
pSmplr->setIndirectStatePointer((uint32_t)borderColorOffset);
|
|
|
|
pSmplr++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// Send thread data
|
|
|
|
auto offsetCrossThreadData = sendCrossThreadData(
|
|
|
|
ioh,
|
|
|
|
kernel);
|
|
|
|
|
|
|
|
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
|
|
|
|
DEBUG_BREAK_IF(nullptr == threadPayload);
|
|
|
|
|
|
|
|
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
|
|
|
|
sendPerThreadData(
|
|
|
|
ioh,
|
|
|
|
simd,
|
|
|
|
numChannels,
|
|
|
|
localWorkSize);
|
|
|
|
|
|
|
|
// send interface descriptor data
|
|
|
|
auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
|
|
|
|
auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);
|
|
|
|
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
|
|
|
|
|
|
|
|
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
|
|
|
|
|
|
|
|
DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr);
|
|
|
|
KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
|
|
|
dsh,
|
|
|
|
offsetInterfaceDescriptor,
|
2018-01-16 20:58:48 +08:00
|
|
|
kernelStartOffset + ihReservedBlockSize,
|
2017-12-21 07:45:38 +08:00
|
|
|
kernel.getCrossThreadDataSize(),
|
|
|
|
sizePerThreadData,
|
|
|
|
dstBindingTablePointer,
|
|
|
|
samplerStateOffset,
|
|
|
|
samplerCount,
|
|
|
|
threadsPerThreadGroup,
|
|
|
|
kernel.slmTotalSize,
|
|
|
|
!!patchInfo.executionEnvironment->HasBarriers);
|
|
|
|
|
|
|
|
// Program media state flush to set interface descriptor offset
|
|
|
|
KernelCommandsHelper<GfxFamily>::sendMediaStateFlush(
|
|
|
|
commandStream,
|
|
|
|
interfaceDescriptorIndex);
|
|
|
|
|
|
|
|
return offsetCrossThreadData;
|
|
|
|
}
|
|
|
|
} // namespace OCLRT
|