mirror of
https://github.com/intel/compute-runtime.git
synced 2026-01-09 06:23:01 +08:00
Move HW specific KernelCommandsHelper functions to a separate file
Change-Id: I04b0c0faaa7ff42e62c3d1765e6ba54c76ae2ee0 Signed-off-by: Filip Hazubski <filip.hazubski@intel.com>
This commit is contained in:
committed by
sys_ocldev
parent
17a0a7ecb8
commit
91f540f437
@@ -8,6 +8,7 @@
|
|||||||
#include "runtime/helpers/kernel_commands.h"
|
#include "runtime/helpers/kernel_commands.h"
|
||||||
#include "hw_cmds.h"
|
#include "hw_cmds.h"
|
||||||
#include "runtime/helpers/kernel_commands.inl"
|
#include "runtime/helpers/kernel_commands.inl"
|
||||||
|
#include "runtime/helpers/kernel_commands_base.inl"
|
||||||
|
|
||||||
namespace OCLRT {
|
namespace OCLRT {
|
||||||
template struct KernelCommandsHelper<CNLFamily>;
|
template struct KernelCommandsHelper<CNLFamily>;
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
#include "runtime/helpers/kernel_commands.h"
|
#include "runtime/helpers/kernel_commands.h"
|
||||||
#include "hw_cmds.h"
|
#include "hw_cmds.h"
|
||||||
#include "runtime/helpers/kernel_commands.inl"
|
#include "runtime/helpers/kernel_commands.inl"
|
||||||
|
#include "runtime/helpers/kernel_commands_base.inl"
|
||||||
|
|
||||||
#include "hw_cmds_generated.h"
|
#include "hw_cmds_generated.h"
|
||||||
|
|
||||||
|
|||||||
@@ -9,6 +9,7 @@
|
|||||||
#include "runtime/helpers/kernel_commands.h"
|
#include "runtime/helpers/kernel_commands.h"
|
||||||
#include "hw_cmds.h"
|
#include "hw_cmds.h"
|
||||||
#include "runtime/helpers/kernel_commands.inl"
|
#include "runtime/helpers/kernel_commands.inl"
|
||||||
|
#include "runtime/helpers/kernel_commands_base.inl"
|
||||||
|
|
||||||
namespace OCLRT {
|
namespace OCLRT {
|
||||||
|
|
||||||
|
|||||||
@@ -48,6 +48,7 @@ set(RUNTIME_SRCS_HELPERS_BASE
|
|||||||
${CMAKE_CURRENT_SOURCE_DIR}/hw_info.h
|
${CMAKE_CURRENT_SOURCE_DIR}/hw_info.h
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands.h
|
${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands.h
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands.inl
|
${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands.inl
|
||||||
|
${CMAKE_CURRENT_SOURCE_DIR}/kernel_commands_base.inl
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.h
|
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.h
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/kmd_notify_properties.cpp
|
||||||
${CMAKE_CURRENT_SOURCE_DIR}/mipmap.cpp
|
${CMAKE_CURRENT_SOURCE_DIR}/mipmap.cpp
|
||||||
@@ -99,3 +100,5 @@ else()
|
|||||||
endif()
|
endif()
|
||||||
set_property(GLOBAL PROPERTY RUNTIME_SRCS_HELPERS_LINUX ${RUNTIME_SRCS_HELPERS_LINUX})
|
set_property(GLOBAL PROPERTY RUNTIME_SRCS_HELPERS_LINUX ${RUNTIME_SRCS_HELPERS_LINUX})
|
||||||
set_property(GLOBAL PROPERTY RUNTIME_SRCS_HELPERS_BASE ${RUNTIME_SRCS_HELPERS_BASE})
|
set_property(GLOBAL PROPERTY RUNTIME_SRCS_HELPERS_BASE ${RUNTIME_SRCS_HELPERS_BASE})
|
||||||
|
|
||||||
|
add_subdirectories()
|
||||||
|
|||||||
@@ -36,6 +36,18 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
|
|||||||
|
|
||||||
static uint32_t computeSlmValues(uint32_t valueIn);
|
static uint32_t computeSlmValues(uint32_t valueIn);
|
||||||
|
|
||||||
|
static INTERFACE_DESCRIPTOR_DATA *getInterfaceDescriptor(
|
||||||
|
const IndirectHeap &indirectHeap,
|
||||||
|
uint64_t offsetInterfaceDescriptor,
|
||||||
|
INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);
|
||||||
|
|
||||||
|
static void setAdditionalInfo(
|
||||||
|
INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor,
|
||||||
|
const size_t &sizeCrossThreadData,
|
||||||
|
const size_t &sizePerThreadData);
|
||||||
|
|
||||||
|
inline static uint32_t additionalSizeRequiredDsh();
|
||||||
|
|
||||||
static size_t sendInterfaceDescriptorData(
|
static size_t sendInterfaceDescriptorData(
|
||||||
const IndirectHeap &indirectHeap,
|
const IndirectHeap &indirectHeap,
|
||||||
uint64_t offsetInterfaceDescriptor,
|
uint64_t offsetInterfaceDescriptor,
|
||||||
@@ -99,6 +111,34 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
|
|||||||
bool kernelUsesLocalIds,
|
bool kernelUsesLocalIds,
|
||||||
bool inlineDataProgrammingRequired);
|
bool inlineDataProgrammingRequired);
|
||||||
|
|
||||||
|
static void programPerThreadData(
|
||||||
|
size_t &sizePerThreadData,
|
||||||
|
const bool &localIdsGenerationByRuntime,
|
||||||
|
LinearStream &ioh,
|
||||||
|
uint32_t &simd,
|
||||||
|
uint32_t &numChannels,
|
||||||
|
const size_t localWorkSize[3],
|
||||||
|
Kernel &kernel,
|
||||||
|
size_t &sizePerThreadDataTotal,
|
||||||
|
size_t &localWorkItems);
|
||||||
|
|
||||||
|
static void updatePerThreadDataTotal(
|
||||||
|
size_t &sizePerThreadData,
|
||||||
|
uint32_t &simd,
|
||||||
|
uint32_t &numChannels,
|
||||||
|
size_t &sizePerThreadDataTotal,
|
||||||
|
size_t &localWorkItems);
|
||||||
|
|
||||||
|
inline static bool resetBindingTablePrefetch(Kernel &kernel);
|
||||||
|
|
||||||
|
static void setKernelStartOffset(
|
||||||
|
uint64_t &kernelStartOffset,
|
||||||
|
bool kernelAllocation,
|
||||||
|
const KernelInfo &kernelInfo,
|
||||||
|
const bool &localIdsGenerationByRuntime,
|
||||||
|
const bool &kernelUsesLocalIds,
|
||||||
|
Kernel &kernel);
|
||||||
|
|
||||||
static size_t getSizeRequiredCS();
|
static size_t getSizeRequiredCS();
|
||||||
static bool isPipeControlWArequired();
|
static bool isPipeControlWArequired();
|
||||||
static size_t getSizeRequiredDSH(
|
static size_t getSizeRequiredDSH(
|
||||||
@@ -153,6 +193,22 @@ struct KernelCommandsHelper : public PerThreadDataHelper {
|
|||||||
return totalSize;
|
return totalSize;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
static void setInterfaceDescriptorOffset(
|
||||||
|
WALKER_TYPE<GfxFamily> *walkerCmd,
|
||||||
|
uint32_t &interfaceDescriptorIndex);
|
||||||
|
|
||||||
|
static void getCrossThreadData(
|
||||||
|
uint32_t &sizeCrossThreadData,
|
||||||
|
size_t &offsetCrossThreadData,
|
||||||
|
Kernel &kernel,
|
||||||
|
const bool &inlineDataProgrammingRequired,
|
||||||
|
IndirectHeap &ioh,
|
||||||
|
WALKER_TYPE<GfxFamily> *walkerCmd);
|
||||||
|
|
||||||
|
inline static size_t getCrossThreadDataSize(
|
||||||
|
uint32_t &sizeCrossThreadData,
|
||||||
|
Kernel &kernel);
|
||||||
|
|
||||||
static void programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData);
|
static void programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData);
|
||||||
static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize);
|
static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize);
|
||||||
static void programPipeControlDataWriteWithCsStall(LinearStream &commandStream, uint64_t writeAddress, uint64_t data);
|
static void programPipeControlDataWriteWithCsStall(LinearStream &commandStream, uint64_t writeAddress, uint64_t data);
|
||||||
|
|||||||
@@ -31,17 +31,11 @@ uint32_t KernelCommandsHelper<GfxFamily>::computeSlmValues(uint32_t valueIn) {
|
|||||||
return value * !!valueIn;
|
return value * !!valueIn;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
|
||||||
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() {
|
|
||||||
return 2 * sizeof(typename GfxFamily::MEDIA_STATE_FLUSH) +
|
|
||||||
sizeof(typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH(
|
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH(
|
||||||
const Kernel &kernel) {
|
const Kernel &kernel) {
|
||||||
typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA;
|
using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
|
||||||
typedef typename GfxFamily::SAMPLER_STATE SAMPLER_STATE;
|
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
|
||||||
const auto &patchInfo = kernel.getKernelInfo().patchInfo;
|
const auto &patchInfo = kernel.getKernelInfo().patchInfo;
|
||||||
auto samplerCount = patchInfo.samplerStateArray
|
auto samplerCount = patchInfo.samplerStateArray
|
||||||
? patchInfo.samplerStateArray->Count
|
? patchInfo.samplerStateArray->Count
|
||||||
@@ -56,7 +50,7 @@ size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH(
|
|||||||
|
|
||||||
borderColorSize = alignUp(borderColorSize + alignIndirectStatePointer - 1, alignIndirectStatePointer);
|
borderColorSize = alignUp(borderColorSize + alignIndirectStatePointer - 1, alignIndirectStatePointer);
|
||||||
|
|
||||||
totalSize += sizeof(INTERFACE_DESCRIPTOR_DATA) + borderColorSize;
|
totalSize += borderColorSize + additionalSizeRequiredDsh();
|
||||||
|
|
||||||
DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo().isVmeWorkload));
|
DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo().isVmeWorkload));
|
||||||
|
|
||||||
@@ -67,7 +61,7 @@ template <typename GfxFamily>
|
|||||||
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH(
|
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH(
|
||||||
const Kernel &kernel,
|
const Kernel &kernel,
|
||||||
size_t localWorkSize) {
|
size_t localWorkSize) {
|
||||||
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
|
typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE;
|
||||||
|
|
||||||
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
|
auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
|
||||||
DEBUG_BREAK_IF(nullptr == threadPayload);
|
DEBUG_BREAK_IF(nullptr == threadPayload);
|
||||||
@@ -75,7 +69,7 @@ size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH(
|
|||||||
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
|
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
|
||||||
return alignUp((kernel.getCrossThreadDataSize() +
|
return alignUp((kernel.getCrossThreadDataSize() +
|
||||||
getPerThreadDataSizeTotal(kernel.getKernelInfo().getMaxSimdSize(), numChannels, localWorkSize)),
|
getPerThreadDataSizeTotal(kernel.getKernelInfo().getMaxSimdSize(), numChannels, localWorkSize)),
|
||||||
GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
@@ -135,7 +129,7 @@ size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
|||||||
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
|
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
|
||||||
|
|
||||||
// Allocate some memory for the interface descriptor
|
// Allocate some memory for the interface descriptor
|
||||||
auto pInterfaceDescriptor = static_cast<INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(indirectHeap.getCpuBase(), (size_t)offsetInterfaceDescriptor));
|
auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
|
||||||
*pInterfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;
|
*pInterfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;
|
||||||
|
|
||||||
// Program the kernel start pointer
|
// Program the kernel start pointer
|
||||||
@@ -145,18 +139,9 @@ size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
|||||||
// # of threads in thread group should be based on LWS.
|
// # of threads in thread group should be based on LWS.
|
||||||
pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
|
pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);
|
||||||
|
|
||||||
DEBUG_BREAK_IF((sizeCrossThreadData % sizeof(GRF)) != 0);
|
|
||||||
auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / sizeof(GRF));
|
|
||||||
DEBUG_BREAK_IF(numGrfCrossThreadData == 0);
|
|
||||||
pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData);
|
|
||||||
pInterfaceDescriptor->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
|
pInterfaceDescriptor->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);
|
||||||
|
|
||||||
DEBUG_BREAK_IF((sizePerThreadData % sizeof(GRF)) != 0);
|
setAdditionalInfo(pInterfaceDescriptor, sizeCrossThreadData, sizePerThreadData);
|
||||||
auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(GRF));
|
|
||||||
|
|
||||||
// at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0
|
|
||||||
numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
|
|
||||||
pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
|
|
||||||
|
|
||||||
pInterfaceDescriptor->setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
|
pInterfaceDescriptor->setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));
|
||||||
|
|
||||||
@@ -171,51 +156,18 @@ size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
|
|||||||
pInterfaceDescriptor->setSharedLocalMemorySize(programmableIDSLMSize);
|
pInterfaceDescriptor->setSharedLocalMemorySize(programmableIDSLMSize);
|
||||||
pInterfaceDescriptor->setBarrierEnable(barrierEnable);
|
pInterfaceDescriptor->setBarrierEnable(barrierEnable);
|
||||||
|
|
||||||
pInterfaceDescriptor->setBindingTableEntryCount(bindingTablePrefetchSize);
|
|
||||||
|
|
||||||
PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(pInterfaceDescriptor, preemptionMode);
|
PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(pInterfaceDescriptor, preemptionMode);
|
||||||
|
|
||||||
|
pInterfaceDescriptor->setBindingTableEntryCount(bindingTablePrefetchSize);
|
||||||
|
|
||||||
return (size_t)offsetInterfaceDescriptor;
|
return (size_t)offsetInterfaceDescriptor;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
|
||||||
void KernelCommandsHelper<GfxFamily>::sendMediaStateFlush(
|
|
||||||
LinearStream &commandStream,
|
|
||||||
size_t offsetInterfaceDescriptorData) {
|
|
||||||
|
|
||||||
typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH;
|
|
||||||
auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH));
|
|
||||||
*pCmd = GfxFamily::cmdInitMediaStateFlush;
|
|
||||||
pCmd->setInterfaceDescriptorOffset((uint32_t)offsetInterfaceDescriptorData);
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename GfxFamily>
|
|
||||||
void KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
|
||||||
LinearStream &commandStream,
|
|
||||||
size_t offsetInterfaceDescriptorData,
|
|
||||||
size_t sizeInterfaceDescriptorData) {
|
|
||||||
{
|
|
||||||
typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH;
|
|
||||||
auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH));
|
|
||||||
*pCmd = GfxFamily::cmdInitMediaStateFlush;
|
|
||||||
}
|
|
||||||
|
|
||||||
{
|
|
||||||
typedef typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD MEDIA_INTERFACE_DESCRIPTOR_LOAD;
|
|
||||||
auto pCmd = (MEDIA_INTERFACE_DESCRIPTOR_LOAD *)commandStream.getSpace(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD));
|
|
||||||
*pCmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad;
|
|
||||||
pCmd->setInterfaceDescriptorDataStartAddress((uint32_t)offsetInterfaceDescriptorData);
|
|
||||||
pCmd->setInterfaceDescriptorTotalLength((uint32_t)sizeInterfaceDescriptorData);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
|
size_t KernelCommandsHelper<GfxFamily>::sendCrossThreadData(
|
||||||
IndirectHeap &indirectHeap,
|
IndirectHeap &indirectHeap,
|
||||||
Kernel &kernel) {
|
Kernel &kernel) {
|
||||||
typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER;
|
indirectHeap.align(GfxFamily::WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||||
|
|
||||||
indirectHeap.align(GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
|
||||||
|
|
||||||
auto offsetCrossThreadData = indirectHeap.getUsed();
|
auto offsetCrossThreadData = indirectHeap.getUsed();
|
||||||
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||||
@@ -299,19 +251,18 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
|||||||
bool localIdsGenerationByRuntime,
|
bool localIdsGenerationByRuntime,
|
||||||
bool kernelUsesLocalIds,
|
bool kernelUsesLocalIds,
|
||||||
bool inlineDataProgrammingRequired) {
|
bool inlineDataProgrammingRequired) {
|
||||||
|
|
||||||
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
|
using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
|
||||||
|
|
||||||
DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32);
|
DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32);
|
||||||
|
|
||||||
// Copy the kernel over to the ISH
|
// Copy the kernel over to the ISH
|
||||||
auto kernelStartOffset = 0llu;
|
uint64_t kernelStartOffset = 0llu;
|
||||||
const auto &kernelInfo = kernel.getKernelInfo();
|
const auto &kernelInfo = kernel.getKernelInfo();
|
||||||
auto kernelAllocation = kernelInfo.getGraphicsAllocation();
|
auto kernelAllocation = kernelInfo.getGraphicsAllocation();
|
||||||
DEBUG_BREAK_IF(!kernelAllocation);
|
DEBUG_BREAK_IF(!kernelAllocation);
|
||||||
if (kernelAllocation) {
|
setKernelStartOffset(kernelStartOffset, kernelAllocation, kernelInfo, localIdsGenerationByRuntime, kernelUsesLocalIds, kernel);
|
||||||
kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
|
|
||||||
}
|
|
||||||
kernelStartOffset += kernel.getStartOffset();
|
|
||||||
const auto &patchInfo = kernelInfo.patchInfo;
|
const auto &patchInfo = kernelInfo.patchInfo;
|
||||||
|
|
||||||
auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel);
|
auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel);
|
||||||
@@ -357,37 +308,36 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
|||||||
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
|
auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
|
||||||
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
|
auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
|
||||||
|
|
||||||
// Send thread data
|
uint32_t sizeCrossThreadData = 0;
|
||||||
auto sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
size_t offsetCrossThreadData = 0;
|
||||||
auto offsetCrossThreadData = sendCrossThreadData(
|
|
||||||
|
getCrossThreadData(
|
||||||
|
sizeCrossThreadData,
|
||||||
|
offsetCrossThreadData,
|
||||||
|
kernel,
|
||||||
|
inlineDataProgrammingRequired,
|
||||||
ioh,
|
ioh,
|
||||||
kernel);
|
walkerCmd);
|
||||||
|
|
||||||
size_t sizePerThreadDataTotal = 0;
|
size_t sizePerThreadDataTotal = 0;
|
||||||
size_t sizePerThreadData = 0;
|
size_t sizePerThreadData = 0;
|
||||||
|
|
||||||
sendPerThreadData(
|
programPerThreadData(
|
||||||
|
sizePerThreadData,
|
||||||
|
localIdsGenerationByRuntime,
|
||||||
ioh,
|
ioh,
|
||||||
simd,
|
simd,
|
||||||
numChannels,
|
numChannels,
|
||||||
localWorkSize,
|
localWorkSize,
|
||||||
kernel.getKernelInfo().workgroupDimensionsOrder,
|
kernel,
|
||||||
kernel.usesOnlyImages());
|
sizePerThreadDataTotal,
|
||||||
|
localWorkItems);
|
||||||
sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);
|
|
||||||
|
|
||||||
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
|
|
||||||
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
|
|
||||||
|
|
||||||
sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * localIdSizePerThread;
|
|
||||||
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
|
|
||||||
|
|
||||||
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
|
uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||||
|
|
||||||
DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr);
|
DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr);
|
||||||
|
|
||||||
auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
|
auto bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
|
||||||
if (kernel.isSchedulerKernel || !KernelCommandsHelper<GfxFamily>::doBindingTablePrefetch()) {
|
if (resetBindingTablePrefetch(kernel)) {
|
||||||
bindingTablePrefetchSize = 0;
|
bindingTablePrefetchSize = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -395,7 +345,7 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
|||||||
dsh,
|
dsh,
|
||||||
offsetInterfaceDescriptor,
|
offsetInterfaceDescriptor,
|
||||||
kernelStartOffset,
|
kernelStartOffset,
|
||||||
sizeCrossThreadData,
|
getCrossThreadDataSize(sizeCrossThreadData, kernel),
|
||||||
sizePerThreadData,
|
sizePerThreadData,
|
||||||
dstBindingTablePointer,
|
dstBindingTablePointer,
|
||||||
samplerStateOffset,
|
samplerStateOffset,
|
||||||
@@ -413,13 +363,13 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Program media state flush to set interface descriptor offset
|
// Program media state flush to set interface descriptor offset
|
||||||
KernelCommandsHelper<GfxFamily>::sendMediaStateFlush(
|
sendMediaStateFlush(
|
||||||
commandStream,
|
commandStream,
|
||||||
interfaceDescriptorIndex);
|
interfaceDescriptorIndex);
|
||||||
|
|
||||||
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
|
DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
|
||||||
walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
|
walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
|
||||||
walkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
|
setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);
|
||||||
|
|
||||||
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
|
auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
|
||||||
WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
WALKER_TYPE<GfxFamily>::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
|
||||||
@@ -428,6 +378,23 @@ size_t KernelCommandsHelper<GfxFamily>::sendIndirectState(
|
|||||||
return offsetCrossThreadData;
|
return offsetCrossThreadData;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
void KernelCommandsHelper<GfxFamily>::updatePerThreadDataTotal(
|
||||||
|
size_t &sizePerThreadData,
|
||||||
|
uint32_t &simd,
|
||||||
|
uint32_t &numChannels,
|
||||||
|
size_t &sizePerThreadDataTotal,
|
||||||
|
size_t &localWorkItems) {
|
||||||
|
|
||||||
|
sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels);
|
||||||
|
|
||||||
|
auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels);
|
||||||
|
localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF));
|
||||||
|
|
||||||
|
sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * localIdSizePerThread;
|
||||||
|
DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group
|
||||||
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
void KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData) {
|
void KernelCommandsHelper<GfxFamily>::programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData) {
|
||||||
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
|
using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT;
|
||||||
@@ -471,11 +438,6 @@ bool KernelCommandsHelper<GfxFamily>::doBindingTablePrefetch() {
|
|||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename GfxFamily>
|
|
||||||
bool KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
template <typename GfxFamily>
|
template <typename GfxFamily>
|
||||||
bool KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
|
bool KernelCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
|
||||||
if (DebugManager.flags.EnablePassInlineData.get()) {
|
if (DebugManager.flags.EnablePassInlineData.get()) {
|
||||||
|
|||||||
161
runtime/helpers/kernel_commands_base.inl
Normal file
161
runtime/helpers/kernel_commands_base.inl
Normal file
@@ -0,0 +1,161 @@
|
|||||||
|
/*
|
||||||
|
* Copyright (C) 2018 Intel Corporation
|
||||||
|
*
|
||||||
|
* SPDX-License-Identifier: MIT
|
||||||
|
*
|
||||||
|
*/
|
||||||
|
|
||||||
|
#pragma once
|
||||||
|
#include "runtime/helpers/kernel_commands.h"
|
||||||
|
|
||||||
|
namespace OCLRT {
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
typename KernelCommandsHelper<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *KernelCommandsHelper<GfxFamily>::getInterfaceDescriptor(
|
||||||
|
const IndirectHeap &indirectHeap,
|
||||||
|
uint64_t offsetInterfaceDescriptor,
|
||||||
|
KernelCommandsHelper<GfxFamily>::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor) {
|
||||||
|
return static_cast<INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(indirectHeap.getCpuBase(), (size_t)offsetInterfaceDescriptor));
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
void KernelCommandsHelper<GfxFamily>::setAdditionalInfo(
|
||||||
|
INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor,
|
||||||
|
const size_t &sizeCrossThreadData,
|
||||||
|
const size_t &sizePerThreadData) {
|
||||||
|
|
||||||
|
DEBUG_BREAK_IF((sizeCrossThreadData % sizeof(GRF)) != 0);
|
||||||
|
auto numGrfCrossThreadData = static_cast<uint32_t>(sizeCrossThreadData / sizeof(GRF));
|
||||||
|
DEBUG_BREAK_IF(numGrfCrossThreadData == 0);
|
||||||
|
pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData);
|
||||||
|
|
||||||
|
DEBUG_BREAK_IF((sizePerThreadData % sizeof(GRF)) != 0);
|
||||||
|
auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / sizeof(GRF));
|
||||||
|
|
||||||
|
// at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0
|
||||||
|
numGrfPerThreadData = std::max(numGrfPerThreadData, 1u);
|
||||||
|
pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
uint32_t KernelCommandsHelper<GfxFamily>::additionalSizeRequiredDsh() {
|
||||||
|
return sizeof(INTERFACE_DESCRIPTOR_DATA);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredCS() {
|
||||||
|
return 2 * sizeof(typename GfxFamily::MEDIA_STATE_FLUSH) +
|
||||||
|
sizeof(typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
void KernelCommandsHelper<GfxFamily>::sendMediaStateFlush(
|
||||||
|
LinearStream &commandStream,
|
||||||
|
size_t offsetInterfaceDescriptorData) {
|
||||||
|
|
||||||
|
typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH;
|
||||||
|
auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH));
|
||||||
|
*pCmd = GfxFamily::cmdInitMediaStateFlush;
|
||||||
|
pCmd->setInterfaceDescriptorOffset((uint32_t)offsetInterfaceDescriptorData);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
void KernelCommandsHelper<GfxFamily>::sendMediaInterfaceDescriptorLoad(
|
||||||
|
LinearStream &commandStream,
|
||||||
|
size_t offsetInterfaceDescriptorData,
|
||||||
|
size_t sizeInterfaceDescriptorData) {
|
||||||
|
{
|
||||||
|
typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH;
|
||||||
|
auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH));
|
||||||
|
*pCmd = GfxFamily::cmdInitMediaStateFlush;
|
||||||
|
}
|
||||||
|
|
||||||
|
{
|
||||||
|
typedef typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD MEDIA_INTERFACE_DESCRIPTOR_LOAD;
|
||||||
|
auto pCmd = (MEDIA_INTERFACE_DESCRIPTOR_LOAD *)commandStream.getSpace(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD));
|
||||||
|
*pCmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad;
|
||||||
|
pCmd->setInterfaceDescriptorDataStartAddress((uint32_t)offsetInterfaceDescriptorData);
|
||||||
|
pCmd->setInterfaceDescriptorTotalLength((uint32_t)sizeInterfaceDescriptorData);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
void KernelCommandsHelper<GfxFamily>::setKernelStartOffset(
|
||||||
|
uint64_t &kernelStartOffset,
|
||||||
|
bool kernelAllocation,
|
||||||
|
const KernelInfo &kernelInfo,
|
||||||
|
const bool &localIdsGenerationByRuntime,
|
||||||
|
const bool &kernelUsesLocalIds,
|
||||||
|
Kernel &kernel) {
|
||||||
|
|
||||||
|
if (kernelAllocation) {
|
||||||
|
kernelStartOffset = kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
|
||||||
|
}
|
||||||
|
kernelStartOffset += kernel.getStartOffset();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
void KernelCommandsHelper<GfxFamily>::programPerThreadData(
|
||||||
|
size_t &sizePerThreadData,
|
||||||
|
const bool &localIdsGenerationByRuntime,
|
||||||
|
LinearStream &ioh,
|
||||||
|
uint32_t &simd,
|
||||||
|
uint32_t &numChannels,
|
||||||
|
const size_t localWorkSize[3],
|
||||||
|
Kernel &kernel,
|
||||||
|
size_t &sizePerThreadDataTotal,
|
||||||
|
size_t &localWorkItems) {
|
||||||
|
|
||||||
|
sendPerThreadData(
|
||||||
|
ioh,
|
||||||
|
simd,
|
||||||
|
numChannels,
|
||||||
|
localWorkSize,
|
||||||
|
kernel.getKernelInfo().workgroupDimensionsOrder,
|
||||||
|
kernel.usesOnlyImages());
|
||||||
|
|
||||||
|
updatePerThreadDataTotal(sizePerThreadData, simd, numChannels, sizePerThreadDataTotal, localWorkItems);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
bool KernelCommandsHelper<GfxFamily>::resetBindingTablePrefetch(Kernel &kernel) {
|
||||||
|
return kernel.isSchedulerKernel || !doBindingTablePrefetch();
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
void KernelCommandsHelper<GfxFamily>::setInterfaceDescriptorOffset(
|
||||||
|
WALKER_TYPE<GfxFamily> *walkerCmd,
|
||||||
|
uint32_t &interfaceDescriptorIndex) {
|
||||||
|
|
||||||
|
walkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
void KernelCommandsHelper<GfxFamily>::getCrossThreadData(
|
||||||
|
uint32_t &sizeCrossThreadData,
|
||||||
|
size_t &offsetCrossThreadData,
|
||||||
|
Kernel &kernel,
|
||||||
|
const bool &inlineDataProgrammingRequired,
|
||||||
|
IndirectHeap &ioh,
|
||||||
|
WALKER_TYPE<GfxFamily> *walkerCmd) {
|
||||||
|
|
||||||
|
sizeCrossThreadData = kernel.getCrossThreadDataSize();
|
||||||
|
offsetCrossThreadData = sendCrossThreadData(
|
||||||
|
ioh,
|
||||||
|
kernel);
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
size_t KernelCommandsHelper<GfxFamily>::getCrossThreadDataSize(
|
||||||
|
uint32_t &sizeCrossThreadData,
|
||||||
|
Kernel &kernel) {
|
||||||
|
|
||||||
|
return sizeCrossThreadData;
|
||||||
|
}
|
||||||
|
|
||||||
|
template <typename GfxFamily>
|
||||||
|
bool KernelCommandsHelper<GfxFamily>::isRuntimeLocalIdsGenerationRequired(uint32_t workDim, size_t *gws, size_t *lws) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
} // namespace OCLRT
|
||||||
@@ -14,6 +14,7 @@
|
|||||||
#include "runtime/gen_common/aub_mapper.h"
|
#include "runtime/gen_common/aub_mapper.h"
|
||||||
#include "runtime/helpers/hw_helper_common.inl"
|
#include "runtime/helpers/hw_helper_common.inl"
|
||||||
#include "runtime/helpers/kernel_commands.inl"
|
#include "runtime/helpers/kernel_commands.inl"
|
||||||
|
#include "runtime/helpers/kernel_commands_base.inl"
|
||||||
#include "runtime/helpers/preamble.inl"
|
#include "runtime/helpers/preamble.inl"
|
||||||
|
|
||||||
namespace OCLRT {
|
namespace OCLRT {
|
||||||
|
|||||||
Reference in New Issue
Block a user