Mirror of https://github.com/intel/compute-runtime.git
Commit 41f3bd00ff (parent 03631ce22b), committed by Compute-Runtime-Automation

Remove PatchTokens from KernelInfo

Use KernelDescriptor instead of the patch tokens stored in KernelInfo's patchInfo. Removed tokens: SPatchMediaInterfaceDescriptorLoad, SPatchAllocateLocalSurface, SPatchMediaVFEState (slot 0), SPatchMediaVFEState (slot 1), SPatchInterfaceDescriptorData, SPatchSamplerStateArray, SPatchBindingTableState, SPatchDataParameterBuffer, SPatchDataParameterStream, SPatchThreadPayload, SPatchKernelAttributesInfo, SPatchAllocateStatelessPrivateSurface, SPatchAllocateSyncBuffer, SPatchAllocateStatelessConstantMemorySurfaceWithInitialization, SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization, SPatchAllocateSystemThreadSurface.

Related-To: NEO-4729
Signed-off-by: Krystian Chmielewski <krystian.chmielewski@intel.com>
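For context before the hunks: the commit replaces pointer-presence checks on decoded patch tokens with value checks on a flat KernelDescriptor. Below is a minimal sketch of the two styles using simplified stand-in types, not the real NEO headers; the exact definitions of the undefined-offset sentinel and isValidOffset helper are assumptions for illustration.

#include <cstdint>
#include <limits>

// Stand-in for NEO's undefined-offset convention: an offset is "not present"
// while it still holds the sentinel value (assumption for illustration).
using CrossThreadDataOffset = uint16_t;
constexpr CrossThreadDataOffset undefinedOffset = std::numeric_limits<CrossThreadDataOffset>::max();
inline bool isValidOffset(CrossThreadDataOffset offset) { return offset != undefinedOffset; }

// Old style: optional patch token, presence signalled by a non-null pointer.
struct SPatchAllocateSyncBuffer { uint32_t DataParamOffset, DataParamSize, SurfaceStateHeapOffset; };
struct PatchInfo { const SPatchAllocateSyncBuffer *pAllocateSyncBuffer = nullptr; };

bool usesSyncBufferOld(const PatchInfo &patchInfo) {
    return patchInfo.pAllocateSyncBuffer != nullptr;
}

// New style: the descriptor always exists; presence is a flag or a valid offset.
struct ArgDescPointer { CrossThreadDataOffset stateless = undefinedOffset, bindful = undefinedOffset; uint8_t pointerSize = 0; };
struct KernelDescriptor {
    struct { struct { bool usesSyncBuffer = false; } flags; } kernelAttributes;
    struct { struct { ArgDescPointer syncBufferAddress; } implicitArgs; } payloadMappings;
};

bool usesSyncBufferNew(const KernelDescriptor &desc) {
    return desc.kernelAttributes.flags.usesSyncBuffer;
}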
@@ -3431,7 +3431,7 @@ cl_int CL_API_CALL clEnqueueNDRangeKernel(cl_command_queue commandQueue,
     }
 
     if ((pKernel->getExecutionType() != KernelExecutionType::Default) ||
-        pKernel->isUsingSyncBuffer(pCommandQueue->getDevice().getRootDeviceIndex())) {
+        pKernel->usesSyncBuffer(pCommandQueue->getDevice().getRootDeviceIndex())) {
         retVal = CL_INVALID_KERNEL;
         TRACING_EXIT(clEnqueueNDRangeKernel, &retVal);
         return retVal;

@@ -5884,7 +5884,7 @@ cl_int CL_API_CALL clEnqueueNDCountKernelINTEL(cl_command_queue commandQueue,
         }
     }
 
-    if (pKernel->isUsingSyncBuffer(rootDeviceIndex)) {
+    if (pKernel->usesSyncBuffer(rootDeviceIndex)) {
         if (pKernel->getExecutionType() != KernelExecutionType::Concurrent) {
             retVal = CL_INVALID_KERNEL;
             return retVal;

@@ -537,7 +537,7 @@ bool CommandQueue::setupDebugSurface(Kernel *kernel) {
     auto rootDeviceIndex = device->getRootDeviceIndex();
     DEBUG_BREAK_IF(!kernel->requiresSshForBuffers(rootDeviceIndex));
     auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(kernel->getSurfaceStateHeap(rootDeviceIndex)),
-                                  kernel->getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->Offset);
+                                  kernel->getKernelInfo(rootDeviceIndex).kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful);
     void *addressToPatch = reinterpret_cast<void *>(debugSurface->getGpuAddress());
     size_t sizeToPatch = debugSurface->getUnderlyingBufferSize();
     Buffer::setSurfaceState(&device->getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, debugSurface, 0, 0);

@@ -106,6 +106,7 @@ class GpgpuWalkerHelper {
 
     static size_t setGpgpuWalkerThreadData(
         WALKER_TYPE<GfxFamily> *walkerCmd,
+        const KernelDescriptor &kernelDescriptor,
         const size_t globalOffsets[3],
         const size_t startWorkGroups[3],
         const size_t numWorkGroups[3],
@@ -114,7 +115,6 @@ class GpgpuWalkerHelper {
         uint32_t workDim,
         bool localIdsGenerationByRuntime,
         bool inlineDataProgrammingRequired,
-        const iOpenCL::SPatchThreadPayload &threadPayload,
         uint32_t requiredWorkgroupOrder);
 
     static void dispatchProfilingCommandsStart(
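Reading the two declaration hunks above together, the helper's new signature becomes the following reconstruction. The parameters between numWorkGroups and workDim are hidden by the diff context; localWorkSizesIn and simd are assumptions inferred from the definition and call sites further down.

static size_t setGpgpuWalkerThreadData(
    WALKER_TYPE<GfxFamily> *walkerCmd,
    const KernelDescriptor &kernelDescriptor,
    const size_t globalOffsets[3],
    const size_t startWorkGroups[3],
    const size_t numWorkGroups[3],
    const size_t localWorkSizesIn[3], // assumed from the definition below
    uint32_t simd,                    // assumed from the call sites below
    uint32_t workDim,
    bool localIdsGenerationByRuntime,
    bool inlineDataProgrammingRequired,
    uint32_t requiredWorkgroupOrder);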
@@ -20,6 +20,7 @@ namespace NEO {
 template <typename GfxFamily>
 inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
     WALKER_TYPE<GfxFamily> *walkerCmd,
+    const KernelDescriptor &kernelDescriptor,
     const size_t globalOffsets[3],
     const size_t startWorkGroups[3],
     const size_t numWorkGroups[3],
@@ -28,7 +29,6 @@ inline size_t GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(
     uint32_t workDim,
     bool localIdsGenerationByRuntime,
     bool inlineDataProgrammingRequired,
-    const iOpenCL::SPatchThreadPayload &threadPayload,
     uint32_t requiredWorkgroupOrder) {
     auto localWorkSize = localWorkSizesIn[0] * localWorkSizesIn[1] * localWorkSizesIn[2];
 
@@ -142,9 +142,8 @@ void GpgpuWalkerHelper<GfxFamily>::dispatchScheduler(
 
    size_t globalOffsets[3] = {0, 0, 0};
    size_t workGroups[3] = {(scheduler.getGws() / scheduler.getLws()), 1, 1};
-    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&cmdWalker, globalOffsets, globalOffsets, workGroups, localWorkSizes,
-                                                           simd, 1, true, inlineDataProgrammingRequired,
-                                                           *kernelInfo.patchInfo.threadPayload, 0u);
+    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&cmdWalker, kernelInfo.kernelDescriptor, globalOffsets, globalOffsets, workGroups, localWorkSizes,
+                                                           simd, 1, true, inlineDataProgrammingRequired, 0u);
    *pGpGpuWalkerCmd = cmdWalker;
 
    // Implement disabling special WA DisableLSQCROPERFforOCL if needed

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2018-2020 Intel Corporation
+ * Copyright (C) 2018-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -105,10 +105,10 @@ inline void HardwareInterface<GfxFamily>::programWalker(
                                                          true,
                                                          commandQueue.getDevice());
 
-    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, globalOffsets, startWorkGroups,
+    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernel.getKernelInfo(rootDeviceIndex).kernelDescriptor,
+                                                           globalOffsets, startWorkGroups,
                                                            numWorkGroups, localWorkSizes, simd, dim,
-                                                           false, false,
-                                                           *kernel.getKernelInfo(rootDeviceIndex).patchInfo.threadPayload, 0u);
+                                                           false, false, 0u);
 
     EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(commandQueue.getDevice().getHardwareInfo(), walkerCmd);
     *walkerCmdBuf = walkerCmd;

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2020 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -256,7 +256,7 @@ uint64_t DeviceQueueHw<GfxFamily>::getBlockKernelStartPointer(const Device &devi
     auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
 
     if (blockAllocation && isCcsUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
-        blockKernelStartPointer += blockInfo->patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
+        blockKernelStartPointer += blockInfo->kernelDescriptor.entryPoints.skipSetFFIDGP;
     }
     return blockKernelStartPointer;
 }

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2019-2020 Intel Corporation
+ * Copyright (C) 2019-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -182,7 +182,7 @@ void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap
 
         auto blockKernelStartPointer = getBlockKernelStartPointer(getDevice(), pBlockInfo, isCcsUsed);
 
-        auto bindingTableCount = pBlockInfo->patchInfo.bindingTableState->Count;
+        auto bindingTableCount = static_cast<uint32_t>(pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.numEntries);
         maxBindingTableCount = std::max(maxBindingTableCount, bindingTableCount);
 
         totalBlockSSHSize += alignUp(pBlockInfo->heapInfo.SurfaceStateHeapSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
@@ -192,15 +192,14 @@ void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap
                                                    pBlockInfo->heapInfo.pSsh,
                                                    pBlockInfo->heapInfo.SurfaceStateHeapSize,
                                                    bindingTableCount,
-                                                   pBlockInfo->patchInfo.bindingTableState->Offset);
+                                                   pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset);
 
         parentKernel->setReflectionSurfaceBlockBtOffset(i, static_cast<uint32_t>(btOffset));
 
         // Determine SIMD size
         uint32_t simd = pBlockInfo->getMaxSimdSize();
-        DEBUG_BREAK_IF(pBlockInfo->patchInfo.interfaceDescriptorData == nullptr);
 
-        uint32_t idOffset = pBlockInfo->patchInfo.interfaceDescriptorData->Offset;
+        uint32_t idOffset = pBlockInfo->kernelDescriptor.kernelMetadata.deviceSideEnqueueBlockInterfaceDescriptorOffset;
         const INTERFACE_DESCRIPTOR_DATA *pBlockID = static_cast<const INTERFACE_DESCRIPTOR_DATA *>(ptrOffset(pBlockInfo->heapInfo.pDsh, idOffset));
 
         pIDDestination[blockIndex + i] = *pBlockID;
@@ -214,10 +213,7 @@ void DeviceQueueHw<GfxFamily>::setupIndirectState(IndirectHeap &surfaceStateHeap
             // Set offset to sampler states, block's DHSOffset is added by scheduler
             pIDDestination[blockIndex + i].setSamplerStatePointer(static_cast<uint32_t>(pBlockInfo->getBorderColorStateSize()));
 
-            auto threadPayload = pBlockInfo->patchInfo.threadPayload;
-            DEBUG_BREAK_IF(nullptr == threadPayload);
-
-            auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
+            auto numChannels = pBlockInfo->kernelDescriptor.kernelAttributes.numLocalIdChannels;
             auto grfSize = device->getDeviceInfo().grfSize;
             auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, grfSize, numChannels);
             auto numGrfPerThreadData = static_cast<uint32_t>(sizePerThreadData / grfSize);
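The hunk above now derives the per-thread data size from numLocalIdChannels alone. A simplified model of that computation (the real helpers are getGRFsPerThread and getPerThreadSizeLocalIDs in shared/source/helpers/local_id_gen.h; the exact formulas here are assumptions for illustration):

#include <algorithm>
#include <cstdint>

// Assumed model: SIMD32 with 32-byte GRFs needs two GRFs to hold one
// 32-lane channel of 16-bit local IDs; everything else fits in one GRF.
uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
    return (simd == 32 && grfSize == 32) ? 2 : 1;
}

// One block of GRFs per local-id channel (x, y, z), at least one GRF total.
uint32_t getPerThreadSizeLocalIDs(uint32_t simd, uint32_t grfSize, uint32_t numChannels) {
    uint32_t size = getGRFsPerThread(simd, grfSize) * grfSize * numChannels;
    return std::max(size, grfSize);
}
// Example: simd = 16, grfSize = 32, numChannels = 3 -> 1 * 32 * 3 = 96 bytes,
// i.e. numGrfPerThreadData = 96 / 32 = 3 GRFs per thread.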
@@ -19,7 +19,7 @@ inline cl_command_queue_capabilities_intel ClHwHelperHw<GfxFamily>::getAdditiona
 
 template <typename GfxFamily>
 cl_ulong ClHwHelperHw<GfxFamily>::getKernelPrivateMemSize(const KernelInfo &kernelInfo) const {
-    return kernelInfo.patchInfo.pAllocateStatelessPrivateSurface ? kernelInfo.patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize : 0;
+    return kernelInfo.kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
 }
 
 } // namespace NEO
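getKernelPrivateMemSize now reports the per-hardware-thread value directly; the total private surface allocated elsewhere scales that value by the scratch compute-unit count. A sketch of the scaling, where KernelHelper::getPrivateSurfaceSize is assumed to be a plain multiply, matching how the two values are combined in the hunks below:

#include <cstdint>

// Assumed shape of KernelHelper::getPrivateSurfaceSize: total private memory
// is the per-HW-thread requirement times the number of hardware threads the
// device can have in flight for scratch/private purposes.
uint64_t getPrivateSurfaceSize(uint64_t perHwThreadPrivateMemorySize, uint32_t computeUnitsUsedForScratch) {
    return perHwThreadPrivateMemorySize * computeUnitsUsedForScratch;
}
// Example: 1024 bytes per HW thread with 448 scratch slots -> 448 KiB surface.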
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -35,18 +35,14 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredDSH(
     const Kernel &kernel) {
     using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
     using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
-    const auto &patchInfo = kernel.getKernelInfo(rootDeviceIndex).patchInfo;
-    auto samplerCount = patchInfo.samplerStateArray
-                            ? patchInfo.samplerStateArray->Count
-                            : 0;
+    const auto &samplerTable = kernel.getKernelInfo(rootDeviceIndex).kernelDescriptor.payloadMappings.samplerTable;
+
+    auto samplerCount = samplerTable.numSamplers;
     auto totalSize = samplerCount
                          ? alignUp(samplerCount * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE)
                          : 0;
 
-    auto borderColorSize = patchInfo.samplerStateArray
-                               ? patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset
-                               : 0;
-
+    auto borderColorSize = samplerTable.borderColor;
     borderColorSize = alignUp(borderColorSize + EncodeStates<GfxFamily>::alignIndirectStatePointer - 1,
                               EncodeStates<GfxFamily>::alignIndirectStatePointer);
 
@@ -63,14 +59,12 @@ size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(
     const Kernel &kernel,
     size_t localWorkSize) {
     typedef typename GfxFamily::WALKER_TYPE WALKER_TYPE;
+    const auto &kernelInfo = kernel.getKernelInfo(rootDeviceIndex);
 
-    auto threadPayload = kernel.getKernelInfo(rootDeviceIndex).patchInfo.threadPayload;
-    DEBUG_BREAK_IF(nullptr == threadPayload);
-
-    auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
+    auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;
     uint32_t grfSize = sizeof(typename GfxFamily::GRF);
     return alignUp((kernel.getCrossThreadDataSize(rootDeviceIndex) +
-                    getPerThreadDataSizeTotal(kernel.getKernelInfo(rootDeviceIndex).getMaxSimdSize(), grfSize, numChannels, localWorkSize)),
+                    getPerThreadDataSizeTotal(kernelInfo.getMaxSimdSize(), grfSize, numChannels, localWorkSize)),
                    WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
 }

@@ -132,7 +126,7 @@ size_t HardwareCommandsHelper<GfxFamily>::getSshSizeForExecutionModel(const Kern
         totalSize += pBlockInfo->heapInfo.SurfaceStateHeapSize;
         totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
 
-        maxBindingTableCount = std::max(maxBindingTableCount, pBlockInfo->patchInfo.bindingTableState->Count);
+        maxBindingTableCount = std::max(maxBindingTableCount, static_cast<uint32_t>(pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.numEntries));
     }
 
     SchedulerKernel &scheduler = kernel.getContext().getSchedulerKernel();

@@ -233,31 +227,28 @@ size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
 
     // Copy the kernel over to the ISH
     const auto &kernelInfo = kernel.getKernelInfo(rootDeviceIndex);
-    const auto &patchInfo = kernelInfo.patchInfo;
 
     ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
     kernel.patchBindlessSurfaceStateOffsets(device, ssh.getUsed());
 
-    auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, (kernelInfo.patchInfo.bindingTableState != nullptr) ? kernelInfo.patchInfo.bindingTableState->Count : 0,
+    auto dstBindingTablePointer = EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries,
                                                                                                   kernel.getSurfaceStateHeap(rootDeviceIndex), kernel.getSurfaceStateHeapSize(rootDeviceIndex),
                                                                                                   kernel.getNumberOfBindingTableStates(rootDeviceIndex), kernel.getBindingTableOffset(rootDeviceIndex));
 
     // Copy our sampler state if it exists
-    uint32_t samplerStateOffset = 0;
+    const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
     uint32_t samplerCount = 0;
-    if (patchInfo.samplerStateArray) {
-        samplerCount = patchInfo.samplerStateArray->Count;
-        samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, patchInfo.samplerStateArray->Offset,
-                                                                       samplerCount, patchInfo.samplerStateArray->BorderColorOffset,
+    uint32_t samplerStateOffset = 0;
+    if (isValidOffset(samplerTable.tableOffset) && isValidOffset(samplerTable.borderColor)) {
+        samplerCount = samplerTable.numSamplers;
+        samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, samplerTable.tableOffset,
+                                                                       samplerCount, samplerTable.borderColor,
                                                                        kernel.getDynamicStateHeap(rootDeviceIndex), device.getBindlessHeapsHelper());
     }
 
-    auto threadPayload = kernelInfo.patchInfo.threadPayload;
-    DEBUG_BREAK_IF(nullptr == threadPayload);
-
     auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
     auto threadsPerThreadGroup = static_cast<uint32_t>(getThreadsPerWG(simd, localWorkItems));
-    auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
+    auto numChannels = static_cast<uint32_t>(kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels);
 
     uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(rootDeviceIndex);

@@ -348,16 +339,14 @@ bool HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kern
         checkKernelForInlineData = !!DebugManager.flags.EnablePassInlineData.get();
     }
     if (checkKernelForInlineData) {
-        return kernel.getKernelInfo(rootDeviceIndex).patchInfo.threadPayload->PassInlineData;
+        return kernel.getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.flags.passInlineData;
     }
     return false;
 }
 
 template <typename GfxFamily>
 bool HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(const Kernel &kernel, uint32_t rootDeviceIndex) {
-    return (kernel.getKernelInfo(rootDeviceIndex).patchInfo.threadPayload->LocalIDXPresent ||
-            kernel.getKernelInfo(rootDeviceIndex).patchInfo.threadPayload->LocalIDYPresent ||
-            kernel.getKernelInfo(rootDeviceIndex).patchInfo.threadPayload->LocalIDZPresent);
+    return kernel.getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.numLocalIdChannels > 0;
 }
 
 } // namespace NEO

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -35,13 +35,13 @@ size_t PerThreadDataHelper::sendPerThreadData(
     return offsetPerThreadData;
 }
 
-uint32_t PerThreadDataHelper::getThreadPayloadSize(const iOpenCL::SPatchThreadPayload &threadPayload, uint32_t simd, uint32_t grfSize) {
-    uint32_t multiplier = static_cast<uint32_t>(getGRFsPerThread(simd, grfSize));
+uint32_t PerThreadDataHelper::getThreadPayloadSize(const KernelDescriptor &kernelDescriptor, uint32_t grfSize) {
+    uint32_t multiplier = static_cast<uint32_t>(getGRFsPerThread(kernelDescriptor.kernelAttributes.simdSize, grfSize));
     uint32_t threadPayloadSize = 0;
-    threadPayloadSize = getNumLocalIdChannels(threadPayload) * multiplier * grfSize;
-    threadPayloadSize += (threadPayload.HeaderPresent) ? grfSize : 0;
-    threadPayloadSize += (threadPayload.LocalIDFlattenedPresent) ? (grfSize * multiplier) : 0;
-    threadPayloadSize += (threadPayload.UnusedPerThreadConstantPresent) ? grfSize : 0;
+    threadPayloadSize = kernelDescriptor.kernelAttributes.numLocalIdChannels * multiplier * grfSize;
+    threadPayloadSize += (kernelDescriptor.kernelAttributes.flags.perThreadDataHeaderIsPresent) ? grfSize : 0;
+    threadPayloadSize += (kernelDescriptor.kernelAttributes.flags.usesFlattenedLocalIds) ? (grfSize * multiplier) : 0;
+    threadPayloadSize += (kernelDescriptor.kernelAttributes.flags.perThreadDataUnusedGrfIsPresent) ? grfSize : 0;
    return threadPayloadSize;
 }
 } // namespace NEO
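A self-contained rendering of the new payload-size computation, with a worked example. The stand-in types are simplified, and getGRFsPerThread follows the assumed model from the earlier sketch; the arithmetic itself mirrors the hunk above.

#include <cstdint>

uint32_t getGRFsPerThread(uint32_t simd, uint32_t grfSize) {
    return (simd == 32 && grfSize == 32) ? 2 : 1; // assumed model
}

struct KernelAttributes {
    uint32_t simdSize = 16;
    uint8_t numLocalIdChannels = 3;
    struct {
        bool perThreadDataHeaderIsPresent = false;
        bool usesFlattenedLocalIds = false;
        bool perThreadDataUnusedGrfIsPresent = false;
    } flags;
};

uint32_t getThreadPayloadSize(const KernelAttributes &attrs, uint32_t grfSize) {
    uint32_t multiplier = getGRFsPerThread(attrs.simdSize, grfSize);
    uint32_t size = attrs.numLocalIdChannels * multiplier * grfSize;
    size += attrs.flags.perThreadDataHeaderIsPresent ? grfSize : 0;
    size += attrs.flags.usesFlattenedLocalIds ? grfSize * multiplier : 0;
    size += attrs.flags.perThreadDataUnusedGrfIsPresent ? grfSize : 0;
    return size;
}
// Example: SIMD16, 32-byte GRF, 3 local-id channels, no extra flags:
// 3 * 1 * 32 = 96 bytes of per-thread payload.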
@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -7,6 +7,7 @@
 
 #pragma once
 #include "shared/source/helpers/local_id_gen.h"
+#include "shared/source/kernel/kernel_descriptor.h"
 
 #include "patch_shared.h"
 
@@ -42,12 +43,6 @@ struct PerThreadDataHelper {
         const std::array<uint8_t, 3> &workgroupWalkOrder,
         bool hasKernelOnlyImages);
 
-    static inline uint32_t getNumLocalIdChannels(const iOpenCL::SPatchThreadPayload &threadPayload) {
-        return threadPayload.LocalIDXPresent +
-               threadPayload.LocalIDYPresent +
-               threadPayload.LocalIDZPresent;
-    }
-
-    static uint32_t getThreadPayloadSize(const iOpenCL::SPatchThreadPayload &threadPayload, uint32_t simd, uint32_t grfSize);
+    static uint32_t getThreadPayloadSize(const KernelDescriptor &kernelDescriptor, uint32_t grfSize);
 };
 } // namespace NEO

@@ -136,15 +136,37 @@ inline void patch(const SrcT &src, void *dst, uint32_t dstOffsetBytes) {
     *patchLocation = static_cast<DstT>(src);
 }
 
+void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg) {
+    auto rootDeviceIndex = device.getRootDeviceIndex();
+
+    void *crossThreadData = getCrossThreadData(rootDeviceIndex);
+    if ((nullptr != crossThreadData) && isValidOffset(arg.stateless)) {
+        auto pp = ptrOffset(crossThreadData, arg.stateless);
+        uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
+        patchWithRequiredSize(pp, arg.pointerSize, addressToPatch);
+        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
+            PatchInfoData patchInfoData(addressToPatch, 0u, PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData), arg.stateless, PatchInfoAllocationType::IndirectObjectHeap, arg.pointerSize);
+            this->patchInfoDataList.push_back(patchInfoData);
+        }
+    }
+
+    void *ssh = getSurfaceStateHeap(rootDeviceIndex);
+    if ((nullptr != ssh) & isValidOffset(arg.bindful)) {
+        auto surfaceState = ptrOffset(ssh, arg.bindful);
+        void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
+        size_t sizeToPatch = allocation.getUnderlyingBufferSize();
+        Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0);
+    }
+}
+
 template <typename PatchTokenT>
 void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch) {
-    uint32_t crossThreadDataOffset = patch.DataParamOffset;
-    uint32_t pointerSize = patch.DataParamSize;
-    uint32_t sshOffset = patch.SurfaceStateHeapOffset;
     auto rootDeviceIndex = device.getRootDeviceIndex();
+    uint32_t pointerSize = patch.DataParamSize;
 
     void *crossThreadData = getCrossThreadData(rootDeviceIndex);
-    void *ssh = getSurfaceStateHeap(rootDeviceIndex);
     if (crossThreadData != nullptr) {
+        uint32_t crossThreadDataOffset = patch.DataParamOffset;
         auto pp = ptrOffset(crossThreadData, crossThreadDataOffset);
         uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
         patchWithRequiredSize(pp, pointerSize, addressToPatch);
@@ -154,7 +176,9 @@ void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, Graphic
         }
     }
 
+    void *ssh = getSurfaceStateHeap(rootDeviceIndex);
     if (ssh) {
+        uint32_t sshOffset = patch.SurfaceStateHeapOffset;
         auto surfaceState = ptrOffset(ssh, sshOffset);
         void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
         size_t sizeToPatch = allocation.getUnderlyingBufferSize();
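Both branches of the new ArgDescPointer overload funnel into patchWithRequiredSize. A sketch of its assumed behavior, writing the GPU address into cross-thread data at the pointer width the kernel was compiled for:

#include <cstdint>
#include <cstring>

// Assumed behavior: stateless pointers are patched as 8 bytes for 64-bit
// kernels and 4 bytes for 32-bit ones; memcpy sidesteps alignment issues.
void patchWithRequiredSize(void *dst, uint32_t pointerSize, uintptr_t value) {
    if (pointerSize == sizeof(uint64_t)) {
        uint64_t v = value;
        std::memcpy(dst, &v, sizeof(v));
    } else {
        uint32_t v = static_cast<uint32_t>(value);
        std::memcpy(dst, &v, sizeof(v));
    }
}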
@@ -185,15 +209,12 @@ cl_int Kernel::initialize() {
         auto maxSimdSize = kernelInfo.getMaxSimdSize();
         const auto &workloadInfo = kernelInfo.workloadInfo;
         const auto &heapInfo = kernelInfo.heapInfo;
-        const auto &patchInfo = kernelInfo.patchInfo;
 
         if (maxSimdSize != 1 && maxSimdSize < hwHelper.getMinimalSIMDSize()) {
             return CL_INVALID_KERNEL;
         }
 
-        kernelDeviceInfo.crossThreadDataSize = patchInfo.dataParameterStream
-                                                   ? patchInfo.dataParameterStream->DataParameterStreamSize
-                                                   : 0;
+        kernelDeviceInfo.crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize;
 
         // now allocate our own cross-thread data, if necessary
         if (kernelDeviceInfo.crossThreadDataSize) {
@@ -297,11 +318,11 @@ cl_int Kernel::initialize() {
             memcpy_s(kernelDeviceInfo.pSshLocal.get(), kernelDeviceInfo.sshLocalSize,
                      heapInfo.pSsh, kernelDeviceInfo.sshLocalSize);
         }
-        kernelDeviceInfo.numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
-        kernelDeviceInfo.localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;
+        kernelDeviceInfo.numberOfBindingTableStates = kernelDescriptor.payloadMappings.bindingTable.numEntries;
+        kernelDeviceInfo.localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;
 
         // patch crossthread data and ssh with inline surfaces, if necessary
-        auto perHwThreadPrivateMemorySize = PatchTokenBinary::getPerHwThreadPrivateSurfaceSize(patchInfo.pAllocateStatelessPrivateSurface, kernelInfo.getMaxSimdSize());
+        auto perHwThreadPrivateMemorySize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
 
         if (perHwThreadPrivateMemorySize) {
             kernelDeviceInfo.privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
@@ -318,23 +339,23 @@ cl_int Kernel::initialize() {
             if (kernelDeviceInfo.privateSurface == nullptr) {
                 return CL_OUT_OF_RESOURCES;
             }
-            const auto &patch = patchInfo.pAllocateStatelessPrivateSurface;
-            patchWithImplicitSurface(reinterpret_cast<void *>(kernelDeviceInfo.privateSurface->getGpuAddressToPatch()), *kernelDeviceInfo.privateSurface, pClDevice->getDevice(), *patch);
+            const auto &patch = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
+            patchWithImplicitSurface(reinterpret_cast<void *>(kernelDeviceInfo.privateSurface->getGpuAddressToPatch()), *kernelDeviceInfo.privateSurface, pClDevice->getDevice(), patch);
         }
-        if (patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization) {
+        if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
             DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
             uintptr_t constMemory = isBuiltIn ? (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch();
 
-            const auto &patch = patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization;
-            patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), pClDevice->getDevice(), *patch);
+            const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress;
+            patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), pClDevice->getDevice(), arg);
         }
 
-        if (patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization) {
+        if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
            DEBUG_BREAK_IF(program->getGlobalSurface(rootDeviceIndex) == nullptr);
            uintptr_t globalMemory = isBuiltIn ? (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch();
 
-            const auto &patch = patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization;
-            patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), pClDevice->getDevice(), *patch);
+            const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress;
+            patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), pClDevice->getDevice(), arg);
         }
 
         if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress.bindful)) {
@@ -367,7 +388,7 @@ cl_int Kernel::initialize() {
         if (isParentKernel) {
             program->allocateBlockPrivateSurfaces(*pClDevice);
         }
-        if (program->isKernelDebugEnabled() && kernelInfo.patchInfo.pAllocateSystemThreadSurface) {
+        if (program->isKernelDebugEnabled() && isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful)) {
             debugEnabled = true;
         }
         auto numArgs = kernelInfo.kernelArgInfo.size();
@@ -518,8 +539,8 @@ cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
         break;
 
     case CL_KERNEL_ATTRIBUTES:
-        pSrc = defaultKernelInfo.attributes.c_str();
-        srcSize = defaultKernelInfo.attributes.length() + 1;
+        pSrc = defaultKernelInfo.kernelDescriptor.kernelMetadata.kernelLanguageAttributes.c_str();
+        srcSize = defaultKernelInfo.kernelDescriptor.kernelMetadata.kernelLanguageAttributes.length() + 1;
         break;
 
     case CL_KERNEL_BINARY_PROGRAM_INTEL:

@@ -613,7 +634,6 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
     cl_ulong localMemorySize;
     auto rootDeviceIndex = device.getRootDeviceIndex();
     auto &kernelInfo = *kernelInfos[rootDeviceIndex];
-    const auto &patchInfo = kernelInfo.patchInfo;
     const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
     size_t preferredWorkGroupSizeMultiple = 0;
     cl_ulong scratchSize;
@@ -644,9 +664,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
         break;
 
     case CL_KERNEL_LOCAL_MEM_SIZE:
-        localMemorySize = patchInfo.localsurface
-                              ? patchInfo.localsurface->TotalInlineLocalMemorySize
-                              : 0;
+        localMemorySize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize;
         srcSize = sizeof(localMemorySize);
         pSrc = &localMemorySize;
         break;
@@ -661,7 +679,7 @@ cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info para
         break;
 
     case CL_KERNEL_SPILL_MEM_SIZE_INTEL:
-        scratchSize = kernelInfo.patchInfo.mediavfestate ? kernelInfo.patchInfo.mediavfestate->PerThreadScratchSpace : 0;
+        scratchSize = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
         srcSize = sizeof(scratchSize);
         pSrc = &scratchSize;
         break;

@@ -762,7 +780,6 @@ cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info para
         workGroupSize2.val[0] = workGroupSize;
         workGroupSize2.val[1] = (workGroupSize > 0) ? 1 : 0;
         return changeGetInfoStatusToCLResultType(info.set<size_t2>(workGroupSize2));
-    case 3:
     default:
         struct size_t3 {
             size_t val[3];

@@ -1881,7 +1898,7 @@ void Kernel::createReflectionSurface() {
 
             ReflectionSurfaceHelper::getCurbeParams(curbeParamsForBlocks[i], tokenMask[i], firstSSHTokenIndex, *pBlockInfo, hwInfo);
 
-            maxConstantBufferSize = std::max(maxConstantBufferSize, static_cast<size_t>(pBlockInfo->patchInfo.dataParameterStream->DataParameterStreamSize));
+            maxConstantBufferSize = std::max(maxConstantBufferSize, static_cast<size_t>(pBlockInfo->kernelDescriptor.kernelAttributes.crossThreadDataSize));
 
             samplerStateAndBorderColorSize = pBlockInfo->getSamplerStateArraySize(hwInfo);
             samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, Sampler::samplerStateArrayAlignment);
@@ -1956,7 +1973,7 @@ void Kernel::createReflectionSurface() {
             const char *pSrc = pBlockInfo->crossThreadData;
             memcpy_s(pDst, pBlockInfo->getConstantBufferSize(), pSrc, pBlockInfo->getConstantBufferSize());
 
-            btOffset += pBlockInfo->patchInfo.bindingTableState->Offset;
+            btOffset += pBlockInfo->kernelDescriptor.payloadMappings.bindingTable.tableOffset;
             kernelDataOffset = newKernelDataOffset;
         }
 
@@ -2033,14 +2050,14 @@ void Kernel::patchBlocksCurbeWithConstantValues() {
         uint64_t constantMemoryCurbeOffset = ReflectionSurfaceHelper::undefinedOffset;
         uint32_t constantMemoryPatchSize = 0;
 
-        if (pBlockInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization) {
-            globalMemoryCurbeOffset = pBlockInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization->DataParamOffset;
-            globalMemoryPatchSize = pBlockInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization->DataParamSize;
+        if (isValidOffset(pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
+            globalMemoryCurbeOffset = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless;
+            globalMemoryPatchSize = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.pointerSize;
         }
 
-        if (pBlockInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization) {
-            constantMemoryCurbeOffset = pBlockInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization->DataParamOffset;
-            constantMemoryPatchSize = pBlockInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization->DataParamSize;
+        if (isValidOffset(pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
+            constantMemoryCurbeOffset = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless;
+            constantMemoryPatchSize = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.pointerSize;
         }
 
         ReflectionSurfaceHelper::patchBlocksCurbeWithConstantValues(kernelReflectionSurface->getUnderlyingBuffer(), blockID,

@@ -2104,19 +2121,18 @@ void Kernel::ReflectionSurfaceHelper::getCurbeParams(std::vector<IGIL_KernelCurb
         }
         tokenMask |= shiftLeftBy(50);
 
-        if (kernelInfo.patchInfo.bindingTableState) {
+        if (isValidOffset(kernelInfo.kernelDescriptor.payloadMappings.bindingTable.tableOffset)) {
             auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
-            const void *ssh = static_cast<const char *>(kernelInfo.heapInfo.pSsh) + kernelInfo.patchInfo.bindingTableState->Offset;
-
-            for (uint32_t i = 0; i < kernelInfo.patchInfo.bindingTableState->Count; i++) {
+            const void *ssh = static_cast<const char *>(kernelInfo.heapInfo.pSsh) + kernelInfo.kernelDescriptor.payloadMappings.bindingTable.tableOffset;
+
+            for (uint32_t i = 0; i < kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries; i++) {
                 uint32_t pointer = hwHelper.getBindingTableStateSurfaceStatePointer(ssh, i);
                 if (pointer == kernelInfo.kernelArgInfo[argNumber].offsetHeap) {
                     bindingTableIndex = i;
                     break;
                 }
             }
-            DEBUG_BREAK_IF(!((bindingTableIndex != 253) || (kernelInfo.patchInfo.bindingTableState->Count == 0)));
+            DEBUG_BREAK_IF(bindingTableIndex == 253);
         }
 
     } else if (kernelInfo.kernelArgInfo[argNumber].isSampler) {
@@ -2152,8 +2168,8 @@ void Kernel::ReflectionSurfaceHelper::getCurbeParams(std::vector<IGIL_KernelCurb
         }
     }
 
-    for (auto param : kernelInfo.patchInfo.dataParameterBuffersKernelArgs) {
-        curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_KERNEL_ARGUMENT, param->DataSize, param->Offset, param->ArgumentNumber});
+    for (auto param : kernelInfo.kernelDescriptor.kernelMetadata.allByValueKernelArguments) {
+        curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_KERNEL_ARGUMENT, param.byValueElement.size, param.byValueElement.offset, param.argNum});
         tokenMask |= shiftLeftBy(DATA_PARAMETER_KERNEL_ARGUMENT);
     }
 
@@ -2210,8 +2226,8 @@ uint32_t Kernel::ReflectionSurfaceHelper::setKernelData(void *reflectionSurface,
     kernelData->m_numberOfCurbeTokens = static_cast<uint32_t>(curbeParamsIn.size() - kernelInfo.kernelArgInfo.size());
     kernelData->m_numberOfSamplerStates = static_cast<uint32_t>(kernelInfo.getSamplerStateArrayCount());
     kernelData->m_SizeOfSamplerHeap = static_cast<uint32_t>(samplerHeapSize);
-    kernelData->m_SamplerBorderColorStateOffsetOnDSH = kernelInfo.patchInfo.samplerStateArray ? kernelInfo.patchInfo.samplerStateArray->BorderColorOffset : 0;
-    kernelData->m_SamplerStateArrayOffsetOnDSH = kernelInfo.patchInfo.samplerStateArray ? kernelInfo.patchInfo.samplerStateArray->Offset : (uint32_t)-1;
+    kernelData->m_SamplerBorderColorStateOffsetOnDSH = isValidOffset(kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor) ? kernelInfo.kernelDescriptor.payloadMappings.samplerTable.borderColor : 0;
+    kernelData->m_SamplerStateArrayOffsetOnDSH = isValidOffset(kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset) ? kernelInfo.kernelDescriptor.payloadMappings.samplerTable.tableOffset : -1;
     kernelData->m_sizeOfConstantBuffer = kernelInfo.getConstantBufferSize();
     kernelData->m_PatchTokensMask = tokenMaskIn;
     kernelData->m_ScratchSpacePatchValue = 0;
@@ -2223,25 +2239,18 @@ uint32_t Kernel::ReflectionSurfaceHelper::setKernelData(void *reflectionSurface,
     kernelData->m_InilineSLMSize = kernelInfo.workloadInfo.slmStaticSize;
 
     bool localIdRequired = false;
-    if (kernelInfo.patchInfo.threadPayload) {
-        if (kernelInfo.patchInfo.threadPayload->LocalIDFlattenedPresent ||
-            kernelInfo.patchInfo.threadPayload->LocalIDXPresent ||
-            kernelInfo.patchInfo.threadPayload->LocalIDYPresent ||
-            kernelInfo.patchInfo.threadPayload->LocalIDZPresent) {
-            localIdRequired = true;
-        }
-        kernelData->m_PayloadSize = PerThreadDataHelper::getThreadPayloadSize(*kernelInfo.patchInfo.threadPayload, kernelData->m_SIMDSize, hwInfo.capabilityTable.grfSize);
-    }
+    if (kernelInfo.kernelDescriptor.kernelAttributes.flags.usesFlattenedLocalIds || (kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels > 0)) {
+        localIdRequired = true;
+    }
+    kernelData->m_PayloadSize = PerThreadDataHelper::getThreadPayloadSize(kernelInfo.kernelDescriptor, hwInfo.capabilityTable.grfSize);
 
     kernelData->m_NeedLocalIDS = localIdRequired ? 1 : 0;
     kernelData->m_DisablePreemption = 0u;
 
     bool concurrentExecAllowed = true;
 
-    if (kernelInfo.patchInfo.pAllocateStatelessPrivateSurface) {
-        if (kernelInfo.patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize > 0) {
-            concurrentExecAllowed = false;
-        }
-    }
+    if (kernelInfo.kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize > 0) {
+        concurrentExecAllowed = false;
+    }
     kernelData->m_CanRunConcurently = concurrentExecAllowed ? 1 : 0;
 
@@ -2285,7 +2294,7 @@ void Kernel::ReflectionSurfaceHelper::setKernelAddressData(void *reflectionSurfa
     kernelAddressData->m_ConstantBufferOffset = constantBufferOffset;
     kernelAddressData->m_SSHTokensOffset = sshTokensOffset;
     kernelAddressData->m_BTSoffset = btOffset;
-    kernelAddressData->m_BTSize = static_cast<uint32_t>(kernelInfo.patchInfo.bindingTableState ? kernelInfo.patchInfo.bindingTableState->Count * hwHelper.getBindingTableStateSize() : 0);
+    kernelAddressData->m_BTSize = static_cast<uint32_t>(kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries * hwHelper.getBindingTableStateSize());
 }
 
 template <>

@@ -2428,14 +2437,11 @@ void Kernel::provideInitializationHints() {
                                         kernelInfos[rootDeviceIndex]->kernelDescriptor.kernelMetadata.kernelName.c_str(),
                                         kernelDeviceInfos[rootDeviceIndex].privateSurfaceSize);
     }
-    const auto &patchInfo = kernelInfos[rootDeviceIndex]->patchInfo;
-    if (patchInfo.mediavfestate) {
-        auto scratchSize = patchInfo.mediavfestate->PerThreadScratchSpace;
-        scratchSize *= pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch * getKernelInfo(rootDeviceIndex).getMaxSimdSize();
-        if (scratchSize > 0) {
-            context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, REGISTER_PRESSURE_TOO_HIGH,
-                                            kernelInfos[rootDeviceIndex]->kernelDescriptor.kernelMetadata.kernelName.c_str(), scratchSize);
-        }
-    }
+    auto scratchSize = kernelInfos[rootDeviceIndex]->kernelDescriptor.kernelAttributes.perThreadScratchSize[0] *
+                       pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch * getKernelInfo(rootDeviceIndex).getMaxSimdSize();
+    if (scratchSize > 0) {
+        context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, REGISTER_PRESSURE_TOO_HIGH,
+                                        kernelInfos[rootDeviceIndex]->kernelDescriptor.kernelMetadata.kernelName.c_str(), scratchSize);
    }
 }
@@ -2487,19 +2493,18 @@ void Kernel::patchBlocksSimdSize(uint32_t rootDeviceIndex) {
 }
 
 bool Kernel::usesSyncBuffer(uint32_t rootDeviceIndex) {
-    return (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSyncBuffer != nullptr);
+    return getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.flags.usesSyncBuffer;
 }
 
 void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
     auto rootDeviceIndex = device.getRootDeviceIndex();
-    auto &patchInfo = kernelInfos[rootDeviceIndex]->patchInfo;
-    auto bufferPatchAddress = ptrOffset(getCrossThreadData(rootDeviceIndex), patchInfo.pAllocateSyncBuffer->DataParamOffset);
-    patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize,
+    const auto &syncBuffer = kernelInfos[rootDeviceIndex]->kernelDescriptor.payloadMappings.implicitArgs.syncBufferAddress;
+    auto bufferPatchAddress = ptrOffset(getCrossThreadData(rootDeviceIndex), syncBuffer.stateless);
+    patchWithRequiredSize(bufferPatchAddress, syncBuffer.pointerSize,
                           ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));
 
-    if (requiresSshForBuffers(rootDeviceIndex)) {
-        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
-                                      patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset);
+    if (isValidOffset(syncBuffer.bindful)) {
+        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)), syncBuffer.bindful);
         auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
         auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
         Buffer::setSurfaceState(&device, surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0);
@@ -2693,7 +2698,7 @@ uint64_t Kernel::getKernelStartOffset(
     if (kernelInfos[rootDeviceIndex]->getGraphicsAllocation()) {
         kernelStartOffset = kernelInfos[rootDeviceIndex]->getGraphicsAllocation()->getGpuAddressToPatch();
         if (localIdsGenerationByRuntime == false && kernelUsesLocalIds == true) {
-            kernelStartOffset += kernelInfos[rootDeviceIndex]->patchInfo.threadPayload->OffsetToSkipPerThreadDataLoad;
+            kernelStartOffset += kernelInfos[rootDeviceIndex]->kernelDescriptor.entryPoints.skipPerThreadDataLoad;
         }
     }
 
@@ -2703,7 +2708,7 @@ uint64_t Kernel::getKernelStartOffset(
     auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
 
     if (isCssUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) {
-        kernelStartOffset += kernelInfos[rootDeviceIndex]->patchInfo.threadPayload->OffsetToSkipSetFFIDGP;
+        kernelStartOffset += kernelInfos[rootDeviceIndex]->kernelDescriptor.entryPoints.skipSetFFIDGP;
     }
 
     return kernelStartOffset;
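Both offsets above come from KernelDescriptor::entryPoints: when hardware generates local IDs, or a workaround makes part of the ISA prologue unnecessary, execution starts past it. A minimal sketch of that selection, with assumed field types:

#include <cstdint>

// Assumed shape of the descriptor's entry-point table: byte offsets into the
// kernel ISA at which alternative entry points begin.
struct EntryPoints {
    uint32_t skipPerThreadDataLoad = 0; // start past the local-id load prologue
    uint32_t skipSetFFIDGP = 0;         // start past FFID GP setup (WA path)
};

uint64_t selectKernelStart(uint64_t isaBase, const EntryPoints &entryPoints,
                           bool hwGeneratedLocalIds, bool ffidGpWaActive) {
    uint64_t start = isaBase;
    if (hwGeneratedLocalIds) {
        start += entryPoints.skipPerThreadDataLoad;
    }
    if (ffidGpWaActive) {
        start += entryPoints.skipSetFFIDGP;
    }
    return start;
}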
@@ -217,11 +217,11 @@ class Kernel : public BaseObject<_cl_kernel> {
     Program *getProgram() const { return program; }
 
     uint32_t getScratchSize(uint32_t rootDeviceIndex) {
-        return getKernelInfo(rootDeviceIndex).patchInfo.mediavfestate ? getKernelInfo(rootDeviceIndex).patchInfo.mediavfestate->PerThreadScratchSpace : 0;
+        return getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
     }
 
     uint32_t getPrivateScratchSize(uint32_t rootDeviceIndex) {
-        return getKernelInfo(rootDeviceIndex).patchInfo.mediaVfeStateSlot1 ? getKernelInfo(rootDeviceIndex).patchInfo.mediaVfeStateSlot1->PerThreadScratchSpace : 0;
+        return getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.perThreadScratchSize[1];
     }
 
     void createReflectionSurface();
@@ -335,9 +335,6 @@ class Kernel : public BaseObject<_cl_kernel> {
     KernelExecutionType getExecutionType() const {
         return executionType;
     }
-    bool isUsingSyncBuffer(uint32_t rootDeviceIndex) const {
-        return (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSyncBuffer != nullptr);
-    }
 
     bool checkIfIsParentKernelAndBlocksUsesPrintf();
 
@@ -345,18 +342,8 @@ class Kernel : public BaseObject<_cl_kernel> {
         return getKernelInfo(rootDeviceIndex).gpuPointerSize == 4;
     }
 
-    int32_t getDebugSurfaceBti(uint32_t rootDeviceIndex) const {
-        if (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface) {
-            return getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->BTI;
-        }
-        return -1;
-    }
-
     size_t getPerThreadSystemThreadSurfaceSize(uint32_t rootDeviceIndex) const {
-        if (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface) {
-            return getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSystemThreadSurface->PerThreadSystemThreadSurfaceSize;
-        }
-        return 0;
+        return getKernelInfo(rootDeviceIndex).kernelDescriptor.kernelAttributes.perThreadSystemThreadSurfaceSize;
     }
 
     std::vector<PatchInfoData> &getPatchInfoDataList() { return patchInfoDataList; };
@@ -496,6 +483,7 @@ class Kernel : public BaseObject<_cl_kernel> {
 
     void *patchBufferOffset(const KernelArgInfo &argInfo, void *svmPtr, GraphicsAllocation *svmAlloc, uint32_t rootDeviceIndex);
 
+    void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const ArgDescPointer &arg);
     // Sets-up both crossThreadData and ssh for given implicit (private/constant, etc.) allocation
     template <typename PatchTokenT>
     void patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const Device &device, const PatchTokenT &patch);

@@ -58,11 +58,12 @@ void Kernel::patchReflectionSurface(DeviceQueue *devQueue, PrintfHandler *printf
 
         auto privateSurface = blockManager->getPrivateSurface(i);
 
-        UNRECOVERABLE_IF(pBlockInfo->patchInfo.pAllocateStatelessPrivateSurface != nullptr && pBlockInfo->patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize && privateSurface == nullptr);
-
+        UNRECOVERABLE_IF((pBlockInfo->kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize > 0U) && privateSurface == nullptr);
         if (privateSurface) {
-            privateSurfaceOffset = pBlockInfo->patchInfo.pAllocateStatelessPrivateSurface->DataParamOffset;
-            privateSurfacePatchSize = pBlockInfo->patchInfo.pAllocateStatelessPrivateSurface->DataParamSize;
+            const auto &privateMemory = pBlockInfo->kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
+            UNRECOVERABLE_IF(false == isValidOffset(privateMemory.stateless));
+            privateSurfaceOffset = privateMemory.stateless;
+            privateSurfacePatchSize = privateMemory.pointerSize;
             privateSurfaceGpuAddress = privateSurface->getGpuAddressToPatch();
         }
 

@@ -16,7 +16,7 @@ namespace NEO {
 
 void BlockKernelManager::addBlockKernelInfo(KernelInfo *blockKernelInfo) {
     blockKernelInfoArray.push_back(blockKernelInfo);
-    blockUsesPrintf |= blockKernelInfo->kernelDescriptor.kernelAttributes.flags.usesPrintf;
+    blockUsesPrintf = blockKernelInfo->kernelDescriptor.kernelAttributes.flags.usesPrintf;
 }
 
 const KernelInfo *BlockKernelManager::getBlockKernelInfo(size_t ordinal) {
@@ -298,45 +298,6 @@ void KernelInfo::storeKernelArgument(const SPatchStatelessDeviceQueueKernelArgum
     storeKernelArgPatchInfo(argNum, pStatelessDeviceQueueKernelArg->DataParamSize, pStatelessDeviceQueueKernelArg->DataParamOffset, 0, pStatelessDeviceQueueKernelArg->SurfaceStateHeapOffset);
 }
 
-void KernelInfo::storePatchToken(
-    const SPatchAllocateStatelessPrivateSurface *pStatelessPrivateSurfaceArg) {
-    usesSsh |= true;
-    patchInfo.pAllocateStatelessPrivateSurface = pStatelessPrivateSurfaceArg;
-}
-
-void KernelInfo::storePatchToken(const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pStatelessConstantMemorySurfaceWithInitializationArg) {
-    usesSsh |= true;
-    patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization = pStatelessConstantMemorySurfaceWithInitializationArg;
-}
-
-void KernelInfo::storePatchToken(const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pStatelessGlobalMemorySurfaceWithInitializationArg) {
-    usesSsh |= true;
-    patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization = pStatelessGlobalMemorySurfaceWithInitializationArg;
-}
-
-void KernelInfo::storePatchToken(const SPatchKernelAttributesInfo *pKernelAttributesInfo) {
-    this->patchInfo.pKernelAttributesInfo = pKernelAttributesInfo;
-    attributes = reinterpret_cast<const char *>(pKernelAttributesInfo) + sizeof(SPatchKernelAttributesInfo);
-
-    auto start = attributes.find("intel_reqd_sub_group_size(");
-    if (start != std::string::npos) {
-        start += strlen("intel_reqd_sub_group_size(");
-        auto stop = attributes.find(")", start);
-        std::stringstream requiredSubGroupSizeStr(attributes.substr(start, stop - start));
-        requiredSubGroupSizeStr >> requiredSubGroupSize;
-    }
-}
-
-void KernelInfo::storePatchToken(const SPatchAllocateSystemThreadSurface *pSystemThreadSurface) {
-    usesSsh |= true;
-    patchInfo.pAllocateSystemThreadSurface = pSystemThreadSurface;
-}
-
-void KernelInfo::storePatchToken(const SPatchAllocateSyncBuffer *pAllocateSyncBuffer) {
-    usesSsh |= true;
-    patchInfo.pAllocateSyncBuffer = pAllocateSyncBuffer;
-}
-
 void KernelInfo::storeKernelArgPatchInfo(uint32_t argNum, uint32_t dataSize, uint32_t dataOffset, uint32_t sourceOffset, uint32_t offsetSSH) {
     resizeKernelArgInfoAndRegisterParameter(argNum);
 
@@ -350,8 +311,7 @@ void KernelInfo::storeKernelArgPatchInfo(uint32_t argNum, uint32_t dataSize, uin
 }
 
 size_t KernelInfo::getSamplerStateArrayCount() const {
-    size_t count = patchInfo.samplerStateArray ? (size_t)patchInfo.samplerStateArray->Count : 0;
-    return count;
+    return kernelDescriptor.payloadMappings.samplerTable.numSamplers;
 }
 size_t KernelInfo::getSamplerStateArraySize(const HardwareInfo &hwInfo) const {
     size_t samplerStateArraySize = getSamplerStateArrayCount() * Sampler::getSamplerStateSize(hwInfo);
@@ -360,22 +320,22 @@ size_t KernelInfo::getSamplerStateArraySize(const HardwareInfo &hwInfo) const {
 
 size_t KernelInfo::getBorderColorStateSize() const {
     size_t borderColorSize = 0;
-    if (patchInfo.samplerStateArray) {
-        borderColorSize = patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset;
+    if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0U) {
+        borderColorSize = kernelDescriptor.payloadMappings.samplerTable.tableOffset - kernelDescriptor.payloadMappings.samplerTable.borderColor;
     }
     return borderColorSize;
 }
 
 size_t KernelInfo::getBorderColorOffset() const {
     size_t borderColorOffset = 0;
-    if (patchInfo.samplerStateArray) {
-        borderColorOffset = patchInfo.samplerStateArray->BorderColorOffset;
+    if (kernelDescriptor.payloadMappings.samplerTable.numSamplers > 0U) {
+        borderColorOffset = kernelDescriptor.payloadMappings.samplerTable.borderColor;
     }
     return borderColorOffset;
 }
 
 uint32_t KernelInfo::getConstantBufferSize() const {
-    return patchInfo.dataParameterStream ? patchInfo.dataParameterStream->DataParameterStreamSize : 0;
+    return kernelDescriptor.kernelAttributes.crossThreadDataSize;
 }
 
 bool KernelInfo::createKernelAllocation(const Device &device, bool internalIsa) {
@@ -412,12 +372,9 @@ void KernelInfo::apply(const DeviceInfoKernelPayloadConstants &constants) {
         *(uint32_t *)&(this->crossThreadData[localMemoryStatelessWindowSizeOffset]) = constants.slmWindowSize;
     }
 
-    uint32_t privateMemorySize = 0U;
-    if (patchInfo.pAllocateStatelessPrivateSurface) {
-        auto perHwThreadSize = PatchTokenBinary::getPerHwThreadPrivateSurfaceSize(patchInfo.pAllocateStatelessPrivateSurface, this->getMaxSimdSize());
-        privateMemorySize = static_cast<uint32_t>(KernelHelper::getPrivateSurfaceSize(perHwThreadSize,
-                                                                                      constants.computeUnitsUsedForScratch));
-    }
+    auto perHwThreadSize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
+    uint32_t privateMemorySize = static_cast<uint32_t>(KernelHelper::getPrivateSurfaceSize(perHwThreadSize,
+                                                                                           constants.computeUnitsUsedForScratch));
 
     if (privateMemoryStatelessSizeOffset != WorkloadInfo::undefinedOffset) {
         *(uint32_t *)&(this->crossThreadData[privateMemoryStatelessSizeOffset]) = privateMemorySize;

@@ -112,12 +112,6 @@ struct KernelInfo {
     void storeKernelArgument(const SPatchStatelessDeviceQueueKernelArgument *pStatelessDeviceQueueKernelArg);
     void storeKernelArgument(const SPatchSamplerKernelArgument *pSamplerKernelArg);
     void storePatchToken(const SPatchExecutionEnvironment *execEnv);
-    void storePatchToken(const SPatchAllocateStatelessPrivateSurface *pStatelessPrivateSurfaceArg);
-    void storePatchToken(const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pStatelessConstantMemorySurfaceWithInitializationArg);
-    void storePatchToken(const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pStatelessGlobalMemorySurfaceWithInitializationArg);
-    void storePatchToken(const SPatchKernelAttributesInfo *pKernelAttributesInfo);
-    void storePatchToken(const SPatchAllocateSystemThreadSurface *pSystemThreadSurface);
-    void storePatchToken(const SPatchAllocateSyncBuffer *pAllocateSyncBuffer);
     GraphicsAllocation *getGraphicsAllocation() const { return this->kernelAllocation; }
     void resizeKernelArgInfoAndRegisterParameter(uint32_t argCount) {
         if (kernelArgInfo.size() <= argCount) {
@@ -170,7 +164,6 @@ struct KernelInfo {
     bool createKernelAllocation(const Device &device, bool internalIsa);
     void apply(const DeviceInfoKernelPayloadConstants &constants);
 
-    std::string attributes;
     HeapInfo heapInfo = {};
     PatchInfo patchInfo = {};
     std::vector<KernelArgInfo> kernelArgInfo;
@@ -182,7 +175,6 @@ struct KernelInfo {
     bool hasIndirectStatelessAccess = false;
     bool isVmeWorkload = false;
     char *crossThreadData = nullptr;
-    size_t requiredSubGroupSize = 0;
     uint32_t gpuPointerSize = 0;
     const BuiltinDispatchInfoBuilder *builtinDispatchBuilder = nullptr;
     uint32_t argumentsToPatchNum = 0;
@@ -135,9 +135,6 @@ void populateKernelInfoArg(KernelInfo &dstKernelInfo, KernelArgInfo &dstKernelIn
 
     for (auto &byValArg : src.byValMap) {
         dstKernelInfo.storeKernelArgument(byValArg);
-        if (byValArg->Type == DATA_PARAMETER_KERNEL_ARGUMENT) {
-            dstKernelInfo.patchInfo.dataParameterBuffersKernelArgs.push_back(byValArg);
-        }
     }
 
     dstKernelInfoArg.offsetObjectId = getOffset(src.objectId);

@@ -160,17 +157,8 @@ void populateKernelInfo(KernelInfo &dst, const PatchTokenBinary::KernelFromPatch
     dst.heapInfo.pSsh = src.heaps.surfaceState.begin();
 
     storeTokenIfNotNull(dst, src.tokens.executionEnvironment);
-    dst.patchInfo.samplerStateArray = src.tokens.samplerStateArray;
-    dst.patchInfo.bindingTableState = src.tokens.bindingTableState;
-    dst.usesSsh = src.tokens.bindingTableState && (src.tokens.bindingTableState->Count > 0);
-    dst.patchInfo.localsurface = src.tokens.allocateLocalSurface;
     dst.workloadInfo.slmStaticSize = src.tokens.allocateLocalSurface ? src.tokens.allocateLocalSurface->TotalInlineLocalMemorySize : 0U;
-    dst.patchInfo.mediavfestate = src.tokens.mediaVfeState[0];
-    dst.patchInfo.mediaVfeStateSlot1 = src.tokens.mediaVfeState[1];
-    dst.patchInfo.interfaceDescriptorDataLoad = src.tokens.mediaInterfaceDescriptorLoad;
-    dst.patchInfo.interfaceDescriptorData = src.tokens.interfaceDescriptorData;
-    dst.patchInfo.threadPayload = src.tokens.threadPayload;
-    dst.patchInfo.dataParameterStream = src.tokens.dataParameterStream;
 
     dst.kernelArgInfo.resize(src.tokens.kernelArgs.size());
 
@@ -180,15 +168,15 @@ void populateKernelInfo(KernelInfo &dst, const PatchTokenBinary::KernelFromPatch
         populateKernelInfoArg(dst, kernelInfoArg, decodedKernelArg);
     }
 
-    storeTokenIfNotNull(dst, src.tokens.kernelAttributesInfo);
-    storeTokenIfNotNull(dst, src.tokens.allocateStatelessPrivateSurface);
-    storeTokenIfNotNull(dst, src.tokens.allocateStatelessConstantMemorySurfaceWithInitialization);
-    storeTokenIfNotNull(dst, src.tokens.allocateStatelessGlobalMemorySurfaceWithInitialization);
-    storeTokenIfNotNull(dst, src.tokens.allocateSyncBuffer);
+    if (nullptr != src.tokens.allocateSyncBuffer) {
+        dst.usesSsh = true;
+    }
+    if (nullptr != src.tokens.allocateSystemThreadSurface) {
+        dst.usesSsh = true;
+    }
 
     dst.isVmeWorkload = dst.isVmeWorkload || (src.tokens.inlineVmeSamplerInfo != nullptr);
     dst.systemKernelOffset = src.tokens.stateSip ? src.tokens.stateSip->SystemKernelOffset : 0U;
-    storeTokenIfNotNull(dst, src.tokens.allocateSystemThreadSurface);
 
     for (uint32_t i = 0; i < 3U; ++i) {
         dst.workloadInfo.localWorkSizeOffsets[i] = getOffset(src.tokens.crossThreadPayloadArgs.localWorkSize[i]);
@@ -221,10 +209,9 @@ void populateKernelInfo(KernelInfo &dst, const PatchTokenBinary::KernelFromPatch
         populateKernelDescriptor(dst.kernelDescriptor, src, gpuPointerSizeInBytes);
     }
 
-    if (dst.patchInfo.dataParameterStream && dst.patchInfo.dataParameterStream->DataParameterStreamSize) {
-        uint32_t crossThreadDataSize = dst.patchInfo.dataParameterStream->DataParameterStreamSize;
-        dst.crossThreadData = new char[crossThreadDataSize];
-        memset(dst.crossThreadData, 0x00, crossThreadDataSize);
+    if (dst.kernelDescriptor.kernelAttributes.crossThreadDataSize) {
+        dst.crossThreadData = new char[dst.kernelDescriptor.kernelAttributes.crossThreadDataSize];
+        memset(dst.crossThreadData, 0x00, dst.kernelDescriptor.kernelAttributes.crossThreadDataSize);
     }
 }
 
@@ -48,26 +48,10 @@ using iOpenCL::SPatchThreadPayload;
 using iOpenCL::SProgramBinaryHeader;
 
 struct PatchInfo {
-    const SPatchMediaInterfaceDescriptorLoad *interfaceDescriptorDataLoad = nullptr;
-    const SPatchAllocateLocalSurface *localsurface = nullptr;
-    const SPatchMediaVFEState *mediavfestate = nullptr;
-    const SPatchMediaVFEState *mediaVfeStateSlot1 = nullptr;
-    const SPatchInterfaceDescriptorData *interfaceDescriptorData = nullptr;
-    const SPatchSamplerStateArray *samplerStateArray = nullptr;
-    const SPatchBindingTableState *bindingTableState = nullptr;
-    ::std::vector<const SPatchDataParameterBuffer *> dataParameterBuffersKernelArgs;
     ::std::vector<const SPatchStatelessGlobalMemoryObjectKernelArgument *>
         statelessGlobalMemObjKernelArgs;
     ::std::vector<const SPatchImageMemoryObjectKernelArgument *>
         imageMemObjKernelArgs;
-    const SPatchDataParameterStream *dataParameterStream = nullptr;
-    const SPatchThreadPayload *threadPayload = nullptr;
-    const SPatchKernelAttributesInfo *pKernelAttributesInfo = nullptr;
-    const SPatchAllocateStatelessPrivateSurface *pAllocateStatelessPrivateSurface = nullptr;
-    const SPatchAllocateSyncBuffer *pAllocateSyncBuffer = nullptr;
-    const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pAllocateStatelessConstantMemorySurfaceWithInitialization = nullptr;
-    const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pAllocateStatelessGlobalMemorySurfaceWithInitialization = nullptr;
-    const SPatchAllocateSystemThreadSurface *pAllocateSystemThreadSurface = nullptr;
 };
 
 } // namespace NEO
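Every field the struct loses has a KernelDescriptor counterpart used in the hunks of this commit: mediavfestate and mediaVfeStateSlot1 become kernelAttributes.perThreadScratchSize[0] and [1]; dataParameterStream becomes kernelAttributes.crossThreadDataSize; threadPayload splits into kernelAttributes.numLocalIdChannels, the perThreadDataHeaderIsPresent/usesFlattenedLocalIds/perThreadDataUnusedGrfIsPresent/passInlineData flags, and entryPoints.skipPerThreadDataLoad/skipSetFFIDGP; localsurface becomes kernelAttributes.slmInlineSize; samplerStateArray and bindingTableState become payloadMappings.samplerTable and payloadMappings.bindingTable; the pAllocate* surface tokens become payloadMappings.implicitArgs entries (plus kernelAttributes.perHwThreadPrivateMemorySize and flags.usesSyncBuffer); pKernelAttributesInfo becomes kernelMetadata.kernelLanguageAttributes; and dataParameterBuffersKernelArgs becomes kernelMetadata.allByValueKernelArguments.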
@@ -348,16 +348,13 @@ void Program::allocateBlockPrivateSurfaces(const ClDevice &clDevice) {
     for (uint32_t i = 0; i < blockCount; i++) {
         const KernelInfo *info = blockKernelManager->getBlockKernelInfo(i);
 
-        if (info->patchInfo.pAllocateStatelessPrivateSurface) {
-            auto perHwThreadPrivateMemorySize = PatchTokenBinary::getPerHwThreadPrivateSurfaceSize(info->patchInfo.pAllocateStatelessPrivateSurface, info->getMaxSimdSize());
-            if (perHwThreadPrivateMemorySize > 0 && blockKernelManager->getPrivateSurface(i) == nullptr) {
-                auto privateSize = static_cast<size_t>(KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, clDevice.getSharedDeviceInfo().computeUnitsUsedForScratch));
-
-                auto *privateSurface = this->executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
-                    {rootDeviceIndex, privateSize, GraphicsAllocation::AllocationType::PRIVATE_SURFACE, clDevice.getDeviceBitfield()});
-                blockKernelManager->pushPrivateSurface(privateSurface, i);
-            }
+        auto perHwThreadPrivateMemorySize = info->kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
+        if (perHwThreadPrivateMemorySize > 0 && blockKernelManager->getPrivateSurface(i) == nullptr) {
+            auto privateSize = static_cast<size_t>(KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, clDevice.getSharedDeviceInfo().computeUnitsUsedForScratch));
+
+            auto *privateSurface = this->executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
+                {rootDeviceIndex, privateSize, GraphicsAllocation::AllocationType::PRIVATE_SURFACE, clDevice.getDeviceBitfield()});
+            blockKernelManager->pushPrivateSurface(privateSurface, i);
         }
     }
 }

@@ -1,5 +1,5 @@
 /*
- * Copyright (C) 2017-2020 Intel Corporation
+ * Copyright (C) 2017-2021 Intel Corporation
  *
  * SPDX-License-Identifier: MIT
  *

@@ -35,7 +35,7 @@ class SchedulerKernel : public Kernel {
 
     size_t getCurbeSize() {
         auto &defaultKernelInfo = getDefaultKernelInfo();
-        size_t crossTrheadDataSize = defaultKernelInfo.patchInfo.dataParameterStream ? defaultKernelInfo.patchInfo.dataParameterStream->DataParameterStreamSize : 0;
+        size_t crossTrheadDataSize = defaultKernelInfo.kernelDescriptor.kernelAttributes.crossThreadDataSize;
         size_t dshSize = defaultKernelInfo.heapInfo.DynamicStateHeapSize;
 
         crossTrheadDataSize = alignUp(crossTrheadDataSize, 64);