Move HW-specific KernelCommandsHelper functions to a separate file

Change-Id: I04b0c0faaa7ff42e62c3d1765e6ba54c76ae2ee0
Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
This commit is contained in:
Filip Hazubski
2018-10-04 15:01:52 +02:00
committed by sys_ocldev
parent 17a0a7ecb8
commit 91f540f437
8 changed files with 274 additions and 88 deletions

View File

@@ -8,6 +8,7 @@
#include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/kernel_commands.h"
#include "hw_cmds.h" #include "hw_cmds.h"
#include "runtime/helpers/kernel_commands.inl" #include "runtime/helpers/kernel_commands.inl"
#include "runtime/helpers/kernel_commands_base.inl"
namespace OCLRT { namespace OCLRT {
template struct KernelCommandsHelper<CNLFamily>; template struct KernelCommandsHelper<CNLFamily>;

View File

@@ -9,6 +9,7 @@
#include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/kernel_commands.h"
#include "hw_cmds.h" #include "hw_cmds.h"
#include "runtime/helpers/kernel_commands.inl" #include "runtime/helpers/kernel_commands.inl"
#include "runtime/helpers/kernel_commands_base.inl"
#include "hw_cmds_generated.h" #include "hw_cmds_generated.h"

View File

@@ -9,6 +9,7 @@
#include "runtime/helpers/kernel_commands.h" #include "runtime/helpers/kernel_commands.h"
#include "hw_cmds.h" #include "hw_cmds.h"
#include "runtime/helpers/kernel_commands.inl" #include "runtime/helpers/kernel_commands.inl"
#include "runtime/helpers/kernel_commands_base.inl"
namespace OCLRT { namespace OCLRT {

View File

@@ -48,6 +48,7 @@ set(RUNTIME_SRCS_HELPERS_BASE
${CMAKE_CURRENT_SOURCE_DIR}/hw_info.h ${CMAKE_CURRENT_SOURCE_DIR}/hw_info.h
${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands.h ${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands.h
${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands.inl ${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands.inl
${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands_base.inl
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.h ${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.h
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.cpp ${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.cpp
${CMAKE_CURRENT_SOURCE_DIR}/mipmap.cpp ${CMAKE_CURRENT_SOURCE_DIR}/mipmap.cpp
@@ -99,3 +100,5 @@ else()
endif() endif()
set_property(GLOBAL PROPERTY RUNTIME_SRCS_HELPERS_LINUX ${RUNTIME_SRCS_HELPERS_LINUX}) set_property(GLOBAL PROPERTY RUNTIME_SRCS_HELPERS_LINUX ${RUNTIME_SRCS_HELPERS_LINUX})
set_property(GLOBAL PROPERTY RUNTIME_SRCS_HELPERS_BASE ${RUNTIME_SRCS_HELPERS_BASE}) set_property(GLOBAL PROPERTY RUNTIME_SRCS_HELPERS_BASE ${RUNTIME_SRCS_HELPERS_BASE})
add_subdirectories()

View File

@@ -36,6 +36,18 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
static uint32_t computeSlmValues(uint32_t valueIn); static uint32_t computeSlmValues(uint32_t valueIn);
static INTERFACE_DESCRIPTOR_DATA *getInterfaceDescriptor(
const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor,
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
static void setAdditionalInfo(
INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor,
const size_t &sizeCrossThreadData,
const size_t &sizePerThreadData);
inline static uint32_t additionalSizeRequiredDsh();
static size_t sendInterfaceDescriptorData( static size_t sendInterfaceDescriptorData(
const IndirectHeap &indirectHeap, const IndirectHeap &indirectHeap,
uint64_t offsetInterfaceDescriptor, uint64_t offsetInterfaceDescriptor,
@@ -99,6 +111,34 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
bool kernelUsesLocalIds, bool kernelUsesLocalIds,
bool inlineDataProgrammingRequired); bool inlineDataProgrammingRequired);
static void programPerThreadData(
size_t &sizePerThreadData,
const bool &localIdsGenerationByRuntime,
LinearStream &ioh,
uint32_t &simd,
uint32_t &numChannels,
const size_t localWorkSize[3],
Kernel &kernel,
size_t &sizePerThreadDataTotal,
size_t &localWorkItems);
static void updatePerThreadDataTotal(
size_t &sizePerThreadData,
uint32_t &simd,
uint32_t &numChannels,
size_t &sizePerThreadDataTotal,
size_t &localWorkItems);
inline static bool resetBindingTablePrefetch(Kernel &kernel);
static void setKernelStartOffset(
uint64_t &kernelStartOffset,
bool kernelAllocation,
const KernelInfo &kernelInfo,
const bool &localIdsGenerationByRuntime,
const bool &kernelUsesLocalIds,
Kernel &kernel);
static size_t getSizeRequiredCS(); static size_t getSizeRequiredCS();
static bool isPipeControlWArequired(); static bool isPipeControlWArequired();
static size_t getSizeRequiredDSH( static size_t getSizeRequiredDSH(
@@ -153,6 +193,22 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
return totalSize; return totalSize;
} }
static void setInterfaceDescriptorOffset(
WALKER_TYPE<GfxFamily> *walkerCmd,
uint32_t &interfaceDescriptorIndex);
static void getCrossThreadData(
uint32_t &sizeCrossThreadData,
size_t &offsetCrossThreadData,
Kernel &kernel,
const bool &inlineDataProgrammingRequired,
IndirectHeap &ioh,
WALKER_TYPE<GfxFamily> *walkerCmd);
inline static size_t getCrossThreadDataSize(
uint32_t &sizeCrossThreadData,
Kernel &kernel);
static void programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData); static void programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData);
static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize); static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize);
static void programPipeControlDataWriteWithCsStall(LinearStream &commandStream, uint64_t writeAddress, uint64_t data); static void programPipeControlDataWriteWithCsStall(LinearStream &commandStream, uint64_t writeAddress, uint64_t data);

View File

@@ -31,17 +31,11 @@ uint32_t KernelCommandsHelper<GfxFamily>::computeSlmValues(uint32_t valueIn) {
return value * !!valueIn; return value * !!valueIn;
} }
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() {
return 2 * sizeof(typename GfxFamily::MEDIA_STATE_FLUSH) +
sizeof(typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD);
}
template <typename GfxFamily> template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH( size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH(
const Kernel &kernel) { const Kernel &kernel) {
typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA; using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
typedef typename GfxFamily::SAMPLER_STATE SAMPLER_STATE; using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
const auto &patchInfo = kernel.getKernelInfo().patchInfo; const auto &patchInfo = kernel.getKernelInfo().patchInfo;
auto samplerCount = patchInfo.samplerStateArray auto samplerCount = patchInfo.samplerStateArray
? patchInfo.samplerStateArray->Count ? patchInfo.samplerStateArray->Count
@@ -56,7 +50,7 @@ size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH(
borderColorSize = alignUp(borderColorSize + alignIndirectStatePointer - 1, alignIndirectStatePointer); borderColorSize = alignUp(borderColorSize + alignIndirectStatePointer - 1, alignIndirectStatePointer);
totalSize += sizeof(INTERFACE_DESCRIPTOR_DATA) + borderColorSize; totalSize += borderColorSize + additionalSizeRequiredDsh();
DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo().isVmeWorkload)); DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo().isVmeWorkload));
@@ -67,7 +61,7 @@ template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH( size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH(
const Kernel &kernel, const Kernel &kernel,
size_t localWorkSize) { size_t localWorkSize) {
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER; typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE;
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload; auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
DEBUG_BREAK_IF(nullptr == threadPayload); DEBUG_BREAK_IF(nullptr == threadPayload);
@@ -75,7 +69,7 @@ size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH(
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
return alignUp((kernel.getCrossThreadDataSize() + return alignUp((kernel.getCrossThreadDataSize() +
getPerThreadDataSizeTotal(kernel.getKernelInfo().getMaxSimdSize(), numChannels, localWorkSize)), getPerThreadDataSizeTotal(kernel.getKernelInfo().getMaxSimdSize(), numChannels, localWorkSize)),
GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
} }
template <typename GfxFamily> template <typename GfxFamily>
@@ -135,7 +129,7 @@ size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE; using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
// Allocate some memory for the interface descriptor // Allocate some memory for the interface descriptor
auto pInterfaceDescriptor = static_cast<INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(indirectHeap.getCpuBase(), (size_t)offsetInterfaceDescriptor)); auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
*pInterfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData; *pInterfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;
// Program the kernel start pointer // Program the kernel start pointer
@@ -145,18 +139,9 @@ size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
// # of threads in thread group should be based on LWS. // # of threads in thread group should be based on LWS.
pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup); pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
DEBUG_BREAK_IF((sizeCrossThreadData % sizeof(GRF)) != 0);
auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / sizeof(GRF));
DEBUG_BREAK_IF(numGrfCrossThreadData == 0);
pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData);
pInterfaceDescriptor->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); pInterfaceDescriptor->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
DEBUG_BREAK_IF((sizePerThreadData % sizeof(GRF)) != 0); setAdditionalInfo(pInterfaceDescriptor, sizeCrossThreadData, sizePerThreadData);
auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(GRF));
// at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0
numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
pInterfaceDescriptor->setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer)); pInterfaceDescriptor->setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
@@ -171,51 +156,18 @@ size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
pInterfaceDescriptor->setSharedLocalMemorySize(programmableIDSLMSize); pInterfaceDescriptor->setSharedLocalMemorySize(programmableIDSLMSize);
pInterfaceDescriptor->setBarrierEnable(barrierEnable); pInterfaceDescriptor->setBarrierEnable(barrierEnable);
pInterfaceDescriptor->setBindingTableEntryCount(bindingTablePrefetchSize);
PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(pInterfaceDescriptor, preemptionMode); PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(pInterfaceDescriptor, preemptionMode);
pInterfaceDescriptor->setBindingTableEntryCount(bindingTablePrefetchSize);
return (size_t)offsetInterfaceDescriptor; return (size_t)offsetInterfaceDescriptor;
} }
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::sendMediaStateFlush(
LinearStream &commandStream,
size_t offsetInterfaceDescriptorData) {
typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH;
auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH));
*pCmd = GfxFamily::cmdInitMediaStateFlush;
pCmd->setInterfaceDescriptorOffset((uint32_t)offsetInterfaceDescriptorData);
}
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
LinearStream &commandStream,
size_t offsetInterfaceDescriptorData,
size_t sizeInterfaceDescriptorData) {
{
typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH;
auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH));
*pCmd = GfxFamily::cmdInitMediaStateFlush;
}
{
typedef typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD MEDIA_INTERFACE_DESCRIPTOR_LOAD;
auto pCmd = (MEDIA_INTERFACE_DESCRIPTOR_LOAD *)commandStream.getSpace(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD));
*pCmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad;
pCmd->setInterfaceDescriptorDataStartAddress((uint32_t)offsetInterfaceDescriptorData);
pCmd->setInterfaceDescriptorTotalLength((uint32_t)sizeInterfaceDescriptorData);
}
}
template <typename GfxFamily> template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData( size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
IndirectHeap &indirectHeap, IndirectHeap &indirectHeap,
Kernel &kernel) { Kernel &kernel) {
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER; indirectHeap.align(GfxFamily::WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
indirectHeap.align(GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
auto offsetCrossThreadData = indirectHeap.getUsed(); auto offsetCrossThreadData = indirectHeap.getUsed();
auto sizeCrossThreadData = kernel.getCrossThreadDataSize(); auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
@@ -299,19 +251,18 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
bool localIdsGenerationByRuntime, bool localIdsGenerationByRuntime,
bool kernelUsesLocalIds, bool kernelUsesLocalIds,
bool inlineDataProgrammingRequired) { bool inlineDataProgrammingRequired) {
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE; using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32); DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32);
// Copy the kernel over to the ISH // Copy the kernel over to the ISH
auto kernelStartOffset = 0llu; uint64_t kernelStartOffset = 0llu;
const auto &kernelInfo = kernel.getKernelInfo(); const auto &kernelInfo = kernel.getKernelInfo();
auto kernelAllocation = kernelInfo.getGraphicsAllocation(); auto kernelAllocation = kernelInfo.getGraphicsAllocation();
DEBUG_BREAK_IF(!kernelAllocation); DEBUG_BREAK_IF(!kernelAllocation);
if (kernelAllocation) { setKernelStartOffset(kernelStartOffset, kernelAllocation, kernelInfo, localIdsGenerationByRuntime, kernelUsesLocalIds, kernel);
kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
}
kernelStartOffset += kernel.getStartOffset();
const auto &patchInfo = kernelInfo.patchInfo; const auto &patchInfo = kernelInfo.patchInfo;
auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel); auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel);
@@ -357,37 +308,36 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems)); auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
// Send thread data uint32_t sizeCrossThreadData = 0;
auto sizeCrossThreadData = kernel.getCrossThreadDataSize(); size_t offsetCrossThreadData = 0;
auto offsetCrossThreadData = sendCrossThreadData(
getCrossThreadData(
sizeCrossThreadData,
offsetCrossThreadData,
kernel,
inlineDataProgrammingRequired,
ioh, ioh,
kernel); walkerCmd);
size_t sizePerThreadDataTotal = 0; size_t sizePerThreadDataTotal = 0;
size_t sizePerThreadData = 0; size_t sizePerThreadData = 0;
sendPerThreadData( programPerThreadData(
sizePerThreadData,
localIdsGenerationByRuntime,
ioh, ioh,
simd, simd,
numChannels, numChannels,
localWorkSize, localWorkSize,
kernel.getKernelInfo().workgroupDimensionsOrder, kernel,
kernel.usesOnlyImages()); sizePerThreadDataTotal,
localWorkItems);
sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * localIdSizePerThread;
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA); uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr); DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr);
auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates())); auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
if (kernel.isSchedulerKernel || !KernelCommandsHelper<GfxFamily>::doBindingTablePrefetch()) { if (resetBindingTablePrefetch(kernel)) {
bindingTablePrefetchSize = 0; bindingTablePrefetchSize = 0;
} }
@@ -395,7 +345,7 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
dsh, dsh,
offsetInterfaceDescriptor, offsetInterfaceDescriptor,
kernelStartOffset, kernelStartOffset,
sizeCrossThreadData, getCrossThreadDataSize(sizeCrossThreadData, kernel),
sizePerThreadData, sizePerThreadData,
dstBindingTablePointer, dstBindingTablePointer,
samplerStateOffset, samplerStateOffset,
@@ -413,13 +363,13 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
} }
// Program media state flush to set interface descriptor offset // Program media state flush to set interface descriptor offset
KernelCommandsHelper<GfxFamily>::sendMediaStateFlush( sendMediaStateFlush(
commandStream, commandStream,
interfaceDescriptorIndex); interfaceDescriptorIndex);
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0); DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData)); walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
walkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++); setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal), auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
@@ -428,6 +378,23 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
return offsetCrossThreadData; return offsetCrossThreadData;
} }
// Recomputes the per-thread data sizes for the given SIMD width and number of
// local-id channels.
//   sizePerThreadData      [out] result of getPerThreadSizeLocalIDs(simd, numChannels)
//   sizePerThreadDataTotal [out] threads per work group multiplied by the per-thread
//                                local-id size, where the latter is raised to at
//                                least sizeof(GRF)
//   localWorkItems         [in]  forwarded to getThreadsPerWG
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::updatePerThreadDataTotal(
    size_t &sizePerThreadData,
    uint32_t &simd,
    uint32_t &numChannels,
    size_t &sizePerThreadDataTotal,
    size_t &localWorkItems) {
    sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);

    const auto rawLocalIdSize = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
    const auto paddedLocalIdSize = std::max(rawLocalIdSize, sizeof(GRF));

    sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * paddedLocalIdSize;

    // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
    DEBUG_BREAK_IF(sizePerThreadDataTotal == 0);
}
template <typename GfxFamily> template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData) { void KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData) {
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
@@ -471,11 +438,6 @@ bool KernelCommandsHelper<GfxFamily>::doBindingTablePrefetch() {
return true; return true;
} }
template <typename GfxFamily>
bool KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws) {
return true;
}
template <typename GfxFamily> template <typename GfxFamily>
bool KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) { bool KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
if (DebugManager.flags.EnablePassInlineData.get()) { if (DebugManager.flags.EnablePassInlineData.get()) {

View File

@@ -0,0 +1,161 @@
/*
* Copyright (C) 2018 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "runtime/helpers/kernel_commands.h"
namespace OCLRT {
// Returns the pointer obtained by advancing indirectHeap.getCpuBase() by
// offsetInterfaceDescriptor bytes, typed as an interface descriptor.
// inlineInterfaceDescriptor is not consulted by this implementation.
template <typename GfxFamily>
typename KernelCommandsHelper<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *KernelCommandsHelper<GfxFamily>::getInterfaceDescriptor(
    const IndirectHeap &indirectHeap,
    uint64_t offsetInterfaceDescriptor,
    KernelCommandsHelper<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor) {
    const auto byteOffset = static_cast<size_t>(offsetInterfaceDescriptor);
    auto descriptorAddress = ptrOffset(indirectHeap.getCpuBase(), byteOffset);
    return static_cast<INTERFACE_DESCRIPTOR_DATA *>(descriptorAddress);
}
// Programs the two read-length fields of the interface descriptor. Both byte
// sizes are converted to a count of sizeof(GRF)-sized units before being written.
//   pInterfaceDescriptor descriptor to update
//   sizeCrossThreadData  cross-thread data size in bytes
//   sizePerThreadData    per-thread data size in bytes
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::setAdditionalInfo(
    INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor,
    const size_t &sizeCrossThreadData,
    const size_t &sizePerThreadData) {
    constexpr size_t grfSize = sizeof(GRF);

    // Cross-thread data is expected to be a non-zero whole number of GRFs.
    DEBUG_BREAK_IF((sizeCrossThreadData % grfSize) != 0);
    const auto crossThreadGrfCount = static_cast<uint32_t>(sizeCrossThreadData / grfSize);
    DEBUG_BREAK_IF(crossThreadGrfCount == 0);
    pInterfaceDescriptor->setCrossThreadConstantDataReadLength(crossThreadGrfCount);

    DEBUG_BREAK_IF((sizePerThreadData % grfSize) != 0);
    // at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0
    const auto perThreadGrfCount = std::max(static_cast<uint32_t>(sizePerThreadData / grfSize), 1u);
    pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(perThreadGrfCount);
}
// Returns the size in bytes of one INTERFACE_DESCRIPTOR_DATA.
template <typename GfxFamily>
uint32_t KernelCommandsHelper<GfxFamily>::additionalSizeRequiredDsh() {
    constexpr auto interfaceDescriptorSize = sizeof(INTERFACE_DESCRIPTOR_DATA);
    return static_cast<uint32_t>(interfaceDescriptorSize);
}
// Returns the combined size in bytes of two MEDIA_STATE_FLUSH commands and one
// MEDIA_INTERFACE_DESCRIPTOR_LOAD command.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() {
    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD;

    constexpr size_t mediaStateFlushCount = 2;
    return mediaStateFlushCount * sizeof(MEDIA_STATE_FLUSH) + sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD);
}
// Reserves space for one MEDIA_STATE_FLUSH in commandStream, initializes it from
// GfxFamily::cmdInitMediaStateFlush and sets its interface descriptor offset.
//   commandStream                 stream the command is written to
//   offsetInterfaceDescriptorData value programmed as the interface descriptor
//                                 offset (narrowed to 32 bits)
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::sendMediaStateFlush(
    LinearStream &commandStream,
    size_t offsetInterfaceDescriptorData) {
    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;

    auto flushCmd = static_cast<MEDIA_STATE_FLUSH *>(commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH)));
    *flushCmd = GfxFamily::cmdInitMediaStateFlush;
    flushCmd->setInterfaceDescriptorOffset(static_cast<uint32_t>(offsetInterfaceDescriptorData));
}
// Writes two commands to commandStream, in this order:
//   1. a MEDIA_STATE_FLUSH left at its GfxFamily::cmdInitMediaStateFlush defaults,
//   2. a MEDIA_INTERFACE_DESCRIPTOR_LOAD carrying the given start address and
//      total length (both narrowed to 32 bits).
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
    LinearStream &commandStream,
    size_t offsetInterfaceDescriptorData,
    size_t sizeInterfaceDescriptorData) {
    using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH;
    using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD;

    auto flushCmd = static_cast<MEDIA_STATE_FLUSH *>(commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH)));
    *flushCmd = GfxFamily::cmdInitMediaStateFlush;

    auto loadCmd = static_cast<MEDIA_INTERFACE_DESCRIPTOR_LOAD *>(commandStream.getSpace(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD)));
    *loadCmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad;
    loadCmd->setInterfaceDescriptorDataStartAddress(static_cast<uint32_t>(offsetInterfaceDescriptorData));
    loadCmd->setInterfaceDescriptorTotalLength(static_cast<uint32_t>(sizeInterfaceDescriptorData));
}
// Computes the kernel start offset in place.
// When kernelAllocation is true the incoming value is replaced by the
// GPU address-to-patch of kernelInfo's graphics allocation; otherwise the
// incoming value is kept. kernel.getStartOffset() is then added in either case.
// localIdsGenerationByRuntime and kernelUsesLocalIds are not used by this
// implementation.
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::setKernelStartOffset(
    uint64_t &kernelStartOffset,
    bool kernelAllocation,
    const KernelInfo &kernelInfo,
    const bool &localIdsGenerationByRuntime,
    const bool &kernelUsesLocalIds,
    Kernel &kernel) {
    const uint64_t baseAddress = kernelAllocation
                                     ? kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch()
                                     : kernelStartOffset;
    kernelStartOffset = baseAddress + kernel.getStartOffset();
}
// Sends the per-thread data for the kernel into ioh via sendPerThreadData and
// then refreshes sizePerThreadData / sizePerThreadDataTotal through
// updatePerThreadDataTotal.
// localIdsGenerationByRuntime is not used by this implementation.
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::programPerThreadData(
    size_t &sizePerThreadData,
    const bool &localIdsGenerationByRuntime,
    LinearStream &ioh,
    uint32_t &simd,
    uint32_t &numChannels,
    const size_t localWorkSize[3],
    Kernel &kernel,
    size_t &sizePerThreadDataTotal,
    size_t &localWorkItems) {
    const auto &kernelInfo = kernel.getKernelInfo();

    sendPerThreadData(ioh, simd, numChannels, localWorkSize,
                      kernelInfo.workgroupDimensionsOrder, kernel.usesOnlyImages());

    updatePerThreadDataTotal(sizePerThreadData, simd, numChannels, sizePerThreadDataTotal, localWorkItems);
}
// Returns true when the kernel is the scheduler kernel or when
// doBindingTablePrefetch() reports false.
template <typename GfxFamily>
bool KernelCommandsHelper<GfxFamily>::resetBindingTablePrefetch(Kernel &kernel) {
    if (kernel.isSchedulerKernel) {
        return true;
    }
    return !doBindingTablePrefetch();
}
// Programs the walker command with the current interface descriptor index and
// leaves interfaceDescriptorIndex incremented by one for the caller.
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::setInterfaceDescriptorOffset(
    WALKER_TYPE<GfxFamily> *walkerCmd,
    uint32_t &interfaceDescriptorIndex) {
    // Post-increment: the walker receives the value from before the bump.
    const auto indexForWalker = interfaceDescriptorIndex++;
    walkerCmd->setInterfaceDescriptorOffset(indexForWalker);
}
// Fills both out-parameters for the kernel's cross-thread data:
//   sizeCrossThreadData   [out] kernel.getCrossThreadDataSize()
//   offsetCrossThreadData [out] value returned by sendCrossThreadData(ioh, kernel)
// inlineDataProgrammingRequired and walkerCmd are not used by this implementation.
template <typename GfxFamily>
void KernelCommandsHelper<GfxFamily>::getCrossThreadData(
    uint32_t &sizeCrossThreadData,
    size_t &offsetCrossThreadData,
    Kernel &kernel,
    const bool &inlineDataProgrammingRequired,
    IndirectHeap &ioh,
    WALKER_TYPE<GfxFamily> *walkerCmd) {
    // The size is read before the data is sent, matching the order callers rely on.
    sizeCrossThreadData = kernel.getCrossThreadDataSize();
    offsetCrossThreadData = sendCrossThreadData(ioh, kernel);
}
// Returns sizeCrossThreadData unchanged, widened to size_t.
// kernel is not used by this implementation.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getCrossThreadDataSize(
    uint32_t &sizeCrossThreadData,
    Kernel &kernel) {
    return static_cast<size_t>(sizeCrossThreadData);
}
// Always returns true; workDim, gws and lws are not inspected by this
// implementation.
template <typename GfxFamily>
bool KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws) {
    constexpr bool runtimeGenerationRequired = true;
    return runtimeGenerationRequired;
}
} // namespace OCLRT

View File

@@ -14,6 +14,7 @@
#include "runtime/gen_common/aub_mapper.h" #include "runtime/gen_common/aub_mapper.h"
#include "runtime/helpers/hw_helper_common.inl" #include "runtime/helpers/hw_helper_common.inl"
#include "runtime/helpers/kernel_commands.inl" #include "runtime/helpers/kernel_commands.inl"
#include "runtime/helpers/kernel_commands_base.inl"
#include "runtime/helpers/preamble.inl" #include "runtime/helpers/preamble.inl"
namespace OCLRT { namespace OCLRT {