/*
 * Copyright (C) 2020-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/kernel/kernel_descriptor_from_patchtokens.h"

#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device_binary_format/patchtokens_decoder.h"
#include "shared/source/kernel/kernel_arg_descriptor_extended_vme.h"
#include "shared/source/kernel/kernel_descriptor.h"

#include <sstream>
#include <string>

namespace NEO {

using namespace iOpenCL;

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchExecutionEnvironment &execEnv) {
    if (execEnv.RequiredWorkGroupSizeX != 0) {
        dst.kernelAttributes.requiredWorkgroupSize[0] = execEnv.RequiredWorkGroupSizeX;
        dst.kernelAttributes.requiredWorkgroupSize[1] = execEnv.RequiredWorkGroupSizeY;
        dst.kernelAttributes.requiredWorkgroupSize[2] = execEnv.RequiredWorkGroupSizeZ;
        DEBUG_BREAK_IF(!(execEnv.RequiredWorkGroupSizeY > 0));
        DEBUG_BREAK_IF(!(execEnv.RequiredWorkGroupSizeZ > 0));
    }
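    // WorkgroupWalkOrderDims packs three 2-bit entries, one per walk-order slot
    // (ORDER_ID -> DIM_ID); e.g. 0b100100 encodes the identity order X, Y, Z.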
    if (execEnv.WorkgroupWalkOrderDims) {
        constexpr auto dimensionMask = 0b11;
        constexpr auto dimensionSize = 2;
        dst.kernelAttributes.workgroupWalkOrder[0] = execEnv.WorkgroupWalkOrderDims & dimensionMask;
        dst.kernelAttributes.workgroupWalkOrder[1] = (execEnv.WorkgroupWalkOrderDims >> dimensionSize) & dimensionMask;
        dst.kernelAttributes.workgroupWalkOrder[2] = (execEnv.WorkgroupWalkOrderDims >> dimensionSize * 2) & dimensionMask;
        dst.kernelAttributes.flags.requiresWorkgroupWalkOrder = true;
    }

    for (uint32_t i = 0; i < 3; ++i) {
        // inverts the walk order mapping (from ORDER_ID->DIM_ID to DIM_ID->ORDER_ID)
        dst.kernelAttributes.workgroupDimensionsOrder[dst.kernelAttributes.workgroupWalkOrder[i]] = i;
    }

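    // Pick the buffer/image addressing mode: kernels compiled for >4GB buffers
    // must use pure stateless access, bindless mode routes surfaces through
    // bindless offsets, and the default pairs bindful (binding-table) access
    // with stateless.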
    if (execEnv.CompiledForGreaterThan4GBBuffers) {
        dst.kernelAttributes.bufferAddressingMode = KernelDescriptor::Stateless;
    } else if (execEnv.UseBindlessMode) {
        dst.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindlessAndStateless;
        dst.kernelAttributes.imageAddressingMode = KernelDescriptor::Bindless;
    } else {
        dst.kernelAttributes.bufferAddressingMode = KernelDescriptor::BindfulAndStateless;
    }

    dst.kernelAttributes.numGrfRequired = execEnv.NumGRFRequired;
    dst.kernelAttributes.simdSize = execEnv.LargestCompiledSIMDSize;
    dst.kernelAttributes.barrierCount = execEnv.HasBarriers;
    dst.kernelAttributes.numThreadsRequired = execEnv.NumThreadsRequired;

    dst.kernelAttributes.flags.requiresDisabledEUFusion = (0 != execEnv.RequireDisableEUFusion);
    dst.kernelAttributes.flags.requiresDisabledMidThreadPreemption = (0 != execEnv.DisableMidThreadPreemption);
    dst.kernelAttributes.flags.requiresSubgroupIndependentForwardProgress = (0 != execEnv.SubgroupIndependentForwardProgressRequired);
    dst.kernelAttributes.flags.useGlobalAtomics = (0 != execEnv.HasGlobalAtomics);
    dst.kernelAttributes.flags.usesFencesForReadWriteImages = (0 != execEnv.UsesFencesForReadWriteImages);
    dst.kernelAttributes.flags.usesSystolicPipelineSelectMode = (0 != execEnv.HasDPAS);
    dst.kernelAttributes.flags.usesStatelessWrites = (0 != execEnv.StatelessWritesCount);
    dst.kernelAttributes.flags.useStackCalls = (0 != execEnv.HasStackCalls);
    dst.kernelAttributes.flags.hasRTCalls = (0 != execEnv.HasRTCalls);

    dst.kernelMetadata.compiledSubGroupsNumber = execEnv.CompiledSubGroupsNumber;

    populateKernelDescriptorExtra(dst, execEnv);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchSamplerStateArray &token) {
    dst.payloadMappings.samplerTable.borderColor = token.BorderColorOffset;
    dst.payloadMappings.samplerTable.numSamplers = token.Count;
    dst.payloadMappings.samplerTable.tableOffset = token.Offset;
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchBindingTableState &token) {
    dst.payloadMappings.bindingTable.numEntries = token.Count;
    dst.payloadMappings.bindingTable.tableOffset = token.Offset;
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateLocalSurface &token) {
    dst.kernelAttributes.slmInlineSize = token.TotalInlineLocalMemorySize;
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchMediaVFEState &token, uint32_t slot) {
    UNRECOVERABLE_IF(slot >= 2U);
    dst.kernelAttributes.perThreadScratchSize[slot] = token.PerThreadScratchSpace;
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchThreadPayload &token) {
    dst.kernelAttributes.flags.perThreadDataHeaderIsPresent = (0U != token.HeaderPresent);
    dst.kernelAttributes.numLocalIdChannels = token.LocalIDXPresent + token.LocalIDYPresent + token.LocalIDZPresent;
    dst.kernelAttributes.localId[0] = token.LocalIDXPresent;
    dst.kernelAttributes.localId[1] = token.LocalIDYPresent;
    dst.kernelAttributes.localId[2] = token.LocalIDZPresent;
    dst.kernelAttributes.flags.usesFlattenedLocalIds = (0U != token.LocalIDFlattenedPresent);
    dst.kernelAttributes.flags.perThreadDataUnusedGrfIsPresent = (0U != token.UnusedPerThreadConstantPresent);
    dst.kernelAttributes.flags.passInlineData = (0 != token.PassInlineData);
    dst.entryPoints.skipPerThreadDataLoad = token.OffsetToSkipPerThreadDataLoad;
    dst.entryPoints.skipSetFFIDGP = token.OffsetToSkipSetFFIDGP;
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchDataParameterStream &token) {
    dst.kernelAttributes.crossThreadDataSize = token.DataParameterStreamSize;
}

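// Kernel language attributes arrive as a single string appended right after the
// token; the required sub-group size is parsed out of the
// "intel_reqd_sub_group_size(N)" attribute by hand, digit by digit.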
void populateKernelDescriptor(KernelDescriptor &dst, const SPatchKernelAttributesInfo &token) {
    constexpr ConstStringRef attributeReqdSubGroupSizeBeg = "intel_reqd_sub_group_size(";
    std::string attributes = std::string(reinterpret_cast<const char *>(&token + 1), token.AttributesSize).c_str();
    dst.kernelMetadata.kernelLanguageAttributes = attributes;
    auto it = attributes.find(attributeReqdSubGroupSizeBeg.begin());
    if (it != std::string::npos) {
        it += attributeReqdSubGroupSizeBeg.size();
        dst.kernelMetadata.requiredSubGroupSize = 0U;
        while ((attributes[it] >= '0') && (attributes[it] <= '9')) {
            dst.kernelMetadata.requiredSubGroupSize *= 10;
            dst.kernelMetadata.requiredSubGroupSize += attributes[it] - '0';
            ++it;
        }
    }

    constexpr ConstStringRef invalidKernelAttrBeg = "invalid_kernel(";
    dst.kernelAttributes.flags.isInvalid = (attributes.find(invalidKernelAttrBeg.data()) != std::string::npos);
}

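// Central helper that fills an ArgDescPointer for a given addressing mode:
// stateless-only, bindful+stateless, or bindless+stateless. The stateful
// variants also count the argument in numArgsStateful.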
void populatePointerKernelArg(KernelDescriptor &kernelDesc, ArgDescPointer &dst,
                              CrossThreadDataOffset stateless, uint8_t pointerSize, SurfaceStateHeapOffset bindful, CrossThreadDataOffset bindless,
                              KernelDescriptor::AddressingMode addressingMode) {
    switch (addressingMode) {
    default:
        UNRECOVERABLE_IF(KernelDescriptor::Stateless != addressingMode);
        dst.bindful = undefined<SurfaceStateHeapOffset>;
        dst.stateless = stateless;
        dst.bindless = undefined<CrossThreadDataOffset>;
        dst.pointerSize = pointerSize;
        break;
    case KernelDescriptor::BindfulAndStateless:
        dst.bindful = bindful;
        dst.stateless = stateless;
        dst.bindless = undefined<CrossThreadDataOffset>;
        dst.pointerSize = pointerSize;
        kernelDesc.kernelAttributes.numArgsStateful++;
        break;
    case KernelDescriptor::BindlessAndStateless:
        dst.bindful = undefined<SurfaceStateHeapOffset>;
        dst.stateless = stateless;
        dst.bindless = bindless;
        dst.pointerSize = pointerSize;
        kernelDesc.kernelAttributes.numArgsStateful++;
        break;
    }
}

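// Patchtokens carry a single SurfaceStateHeapOffset field, so it is forwarded
// as both the bindful and the bindless candidate offset; the addressing mode
// decides which of the two is actually consumed.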
template <typename TokenT>
void populatePointerKernelArg(KernelDescriptor &kernelDesc, ArgDescPointer &dst, const TokenT &src, KernelDescriptor::AddressingMode addressingMode) {
    populatePointerKernelArg(kernelDesc, dst, src.DataParamOffset, src.DataParamSize, src.SurfaceStateHeapOffset, src.SurfaceStateHeapOffset, addressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateStatelessPrivateSurface &token) {
    dst.kernelAttributes.flags.usesPrivateMemory = true;
    dst.kernelAttributes.perHwThreadPrivateMemorySize = static_cast<uint32_t>(PatchTokenBinary::getPerHwThreadPrivateSurfaceSize(token, dst.kernelAttributes.simdSize));
    populatePointerKernelArg(dst, dst.payloadMappings.implicitArgs.privateMemoryAddress, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &token) {
    populatePointerKernelArg(dst, dst.payloadMappings.implicitArgs.globalConstantsSurfaceAddress, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &token) {
    populatePointerKernelArg(dst, dst.payloadMappings.implicitArgs.globalVariablesSurfaceAddress, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateStatelessPrintfSurface &token) {
    dst.kernelAttributes.flags.usesPrintf = true;
    dst.kernelAttributes.flags.usesStringMapForPrintf = true;
    populatePointerKernelArg(dst, dst.payloadMappings.implicitArgs.printfSurfaceAddress, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateStatelessEventPoolSurface &token) {
    populatePointerKernelArg(dst, dst.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateStatelessDefaultDeviceQueueSurface &token) {
    populatePointerKernelArg(dst, dst.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateSystemThreadSurface &token) {
    dst.payloadMappings.implicitArgs.systemThreadSurfaceAddress.bindful = token.Offset;
    dst.kernelAttributes.perThreadSystemThreadSurfaceSize = token.PerThreadSystemThreadSurfaceSize;
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateSyncBuffer &token) {
    dst.kernelAttributes.flags.usesSyncBuffer = true;
    populatePointerKernelArg(dst, dst.payloadMappings.implicitArgs.syncBufferAddress, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchAllocateRTGlobalBuffer &token) {
    populatePointerKernelArg(dst, dst.payloadMappings.implicitArgs.rtDispatchGlobals, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelDescriptor(KernelDescriptor &dst, const SPatchString &token) {
    uint32_t stringIndex = token.Index;
    const char *stringData = reinterpret_cast<const char *>(&token + 1);
    dst.kernelMetadata.printfStringsMap[stringIndex].assign(stringData, stringData + token.StringSize);
}

template <typename TokenT, typename... ArgsT>
inline void populateKernelDescriptorIfNotNull(KernelDescriptor &dst, const TokenT *token, ArgsT &&...args) {
    if (token != nullptr) {
        populateKernelDescriptor(dst, *token, std::forward<ArgsT>(args)...);
    }
}

void markArgAsPatchable(KernelDescriptor &parent, size_t dstArgNum) {
    auto &argExtendedTypeInfo = parent.payloadMappings.explicitArgs[dstArgNum].getExtendedTypeInfo();
    if (false == argExtendedTypeInfo.needsPatch) {
        argExtendedTypeInfo.needsPatch = true;
        ++parent.kernelAttributes.numArgsToPatch;
    }
}

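// Image arguments: the token's Offset lands in either the bindful or the
// bindless slot depending on the image addressing mode, media(-block) image
// types are flagged, and an access qualifier is derived from the Writeable bit
// when none was supplied explicitly.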
void populateKernelArgDescriptor(KernelDescriptor &dst, size_t argNum, const SPatchImageMemoryObjectKernelArgument &token) {
    markArgAsPatchable(dst, argNum);

    auto &argImage = dst.payloadMappings.explicitArgs[argNum].as<ArgDescImage>(true);
    if (KernelDescriptor::Bindful == dst.kernelAttributes.imageAddressingMode) {
        argImage.bindful = token.Offset;
        dst.kernelAttributes.numArgsStateful++;
    }
    if (KernelDescriptor::Bindless == dst.kernelAttributes.imageAddressingMode) {
        argImage.bindless = token.Offset;
        dst.kernelAttributes.numArgsStateful++;
    }

    if (token.Type == iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA) {
        dst.payloadMappings.explicitArgs[argNum].getExtendedTypeInfo().isMediaImage = true;
    }

    if (token.Type == iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA_BLOCK) {
        dst.payloadMappings.explicitArgs[argNum].getExtendedTypeInfo().isMediaBlockImage = true;
    }

    dst.payloadMappings.explicitArgs[argNum].getExtendedTypeInfo().isTransformable = token.Transformable != 0;
    if (NEO::KernelArgMetadata::AccessUnknown == dst.payloadMappings.explicitArgs[argNum].getTraits().accessQualifier) {
        auto accessQual = token.Writeable ? NEO::KernelArgMetadata::AccessReadWrite
                                          : NEO::KernelArgMetadata::AccessReadOnly;
        dst.payloadMappings.explicitArgs[argNum].getTraits().accessQualifier = accessQual;
    }
}

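// Sampler arguments: non-texture sampler types (VME/VE/VD) denote
// fixed-function accelerators rather than regular samplers.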
void populateKernelArgDescriptor(KernelDescriptor &dst, size_t argNum, const SPatchSamplerKernelArgument &token) {
    markArgAsPatchable(dst, argNum);

    auto &argSampler = dst.payloadMappings.explicitArgs[argNum].as<ArgDescSampler>(true);

    argSampler.bindful = token.Offset;
    argSampler.samplerType = token.Type;

    if (token.Type != iOpenCL::SAMPLER_OBJECT_TEXTURE) {
        DEBUG_BREAK_IF(token.Type != iOpenCL::SAMPLER_OBJECT_VME &&
                       token.Type != iOpenCL::SAMPLER_OBJECT_VE &&
                       token.Type != iOpenCL::SAMPLER_OBJECT_VD);
        dst.payloadMappings.explicitArgs[argNum].getExtendedTypeInfo().isAccelerator = true;
        dst.kernelAttributes.flags.usesVme |= (token.Type == iOpenCL::SAMPLER_OBJECT_VME);
    }
}

void populateKernelArgDescriptor(KernelDescriptor &dst, size_t argNum, const SPatchGlobalMemoryObjectKernelArgument &token) {
    markArgAsPatchable(dst, argNum);

    auto &argPointer = dst.payloadMappings.explicitArgs[argNum].as<ArgDescPointer>(true);
    dst.payloadMappings.explicitArgs[argNum].getTraits().addressQualifier = KernelArgMetadata::AddrGlobal;
    if (dst.kernelAttributes.bufferAddressingMode == KernelDescriptor::BindlessAndStateless) {
        argPointer.bindless = token.Offset;
        argPointer.bindful = undefined<SurfaceStateHeapOffset>;
        dst.kernelAttributes.numArgsStateful++;
    } else {
        argPointer.bindful = token.Offset;
        argPointer.bindless = undefined<CrossThreadDataOffset>;
        dst.kernelAttributes.numArgsStateful++;
    }

    argPointer.stateless = undefined<CrossThreadDataOffset>;
    argPointer.pointerSize = dst.kernelAttributes.gpuPointerSize;
}

void populateKernelArgDescriptor(KernelDescriptor &dst, size_t argNum, const SPatchStatelessGlobalMemoryObjectKernelArgument &token) {
    markArgAsPatchable(dst, argNum);

    auto &argPointer = dst.payloadMappings.explicitArgs[argNum].as<ArgDescPointer>(true);
    dst.payloadMappings.explicitArgs[argNum].getTraits().addressQualifier = KernelArgMetadata::AddrGlobal;

    populatePointerKernelArg(dst, argPointer, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelArgDescriptor(KernelDescriptor &dst, size_t argNum, const SPatchStatelessConstantMemoryObjectKernelArgument &token) {
    markArgAsPatchable(dst, argNum);

    auto &argPointer = dst.payloadMappings.explicitArgs[argNum].as<ArgDescPointer>(true);
    dst.payloadMappings.explicitArgs[argNum].getTraits().addressQualifier = KernelArgMetadata::AddrConstant;

    populatePointerKernelArg(dst, argPointer, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelArgDescriptor(KernelDescriptor &dst, size_t argNum, const SPatchStatelessDeviceQueueKernelArgument &token) {
    markArgAsPatchable(dst, argNum);

    auto &argPointer = dst.payloadMappings.explicitArgs[argNum].as<ArgDescPointer>(true);
    dst.payloadMappings.explicitArgs[argNum].getTraits().addressQualifier = KernelArgMetadata::AddrGlobal;

    dst.payloadMappings.explicitArgs[argNum].getExtendedTypeInfo().isDeviceQueue = true;

    populatePointerKernelArg(dst, argPointer, token, dst.kernelAttributes.bufferAddressingMode);
}

void populateKernelArgDescriptor(KernelDescriptor &dst, size_t argNum, const SPatchDataParameterBuffer &token) {
    markArgAsPatchable(dst, argNum);

    ArgDescValue::Element newElement = {};
    newElement.size = token.DataSize;
    newElement.offset = token.Offset;
    newElement.sourceOffset = token.SourceOffset;

    dst.payloadMappings.explicitArgs[argNum].as<ArgDescValue>(true).elements.push_back(newElement);
}

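// Translates an optional cross-thread-data patch token into an offset, using
// the 'undefined' sentinel when the token was not emitted.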
inline CrossThreadDataOffset getOffset(const SPatchDataParameterBuffer *token) {
    if (token != nullptr) {
        return static_cast<CrossThreadDataOffset>(token->Offset);
    }
    return undefined<CrossThreadDataOffset>;
}

void populateArgMetadata(KernelDescriptor &dst, size_t argNum, const SPatchKernelArgumentInfo *src) {
    if (nullptr == src) {
        return;
    }

    auto inlineData = PatchTokenBinary::getInlineData(src);

    auto metadataExtended = std::make_unique<ArgTypeMetadataExtended>();
    metadataExtended->addressQualifier = parseLimitedString(inlineData.addressQualifier.begin(), inlineData.addressQualifier.size());
    metadataExtended->accessQualifier = parseLimitedString(inlineData.accessQualifier.begin(), inlineData.accessQualifier.size());
    metadataExtended->argName = parseLimitedString(inlineData.argName.begin(), inlineData.argName.size());

    auto argTypeFull = parseLimitedString(inlineData.typeName.begin(), inlineData.typeName.size());
    const char *argTypeDelim = strchr(argTypeFull.data(), ';');
    if (nullptr == argTypeDelim) {
        argTypeDelim = argTypeFull.data() + argTypeFull.size();
    }
    metadataExtended->type = std::string(static_cast<const char *>(argTypeFull.data()), argTypeDelim).c_str();
    metadataExtended->typeQualifiers = parseLimitedString(inlineData.typeQualifiers.begin(), inlineData.typeQualifiers.size());

    ArgTypeTraits metadata = {};
    metadata.accessQualifier = KernelArgMetadata::parseAccessQualifier(metadataExtended->accessQualifier);
    metadata.addressQualifier = KernelArgMetadata::parseAddressSpace(metadataExtended->addressQualifier);
    metadata.typeQualifiers = KernelArgMetadata::parseTypeQualifiers(metadataExtended->typeQualifiers);

    markArgAsPatchable(dst, argNum);

    dst.payloadMappings.explicitArgs[argNum].getTraits() = metadata;
    dst.explicitArgsExtendedMetadata[argNum] = std::move(*metadataExtended);
}

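// Decodes everything known about a single explicit kernel argument: the object
// token (image/sampler/buffer/device queue), per-object-type metadata offsets,
// specialized (VME) metadata, by-value elements, and the textual argument info.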
void populateArgDescriptor(KernelDescriptor &dst, size_t argNum, const PatchTokenBinary::KernelArgFromPatchtokens &src) {
    if (src.objectArg != nullptr) {
        switch (src.objectArg->Token) {
        default:
            UNRECOVERABLE_IF(PATCH_TOKEN_IMAGE_MEMORY_OBJECT_KERNEL_ARGUMENT != src.objectArg->Token);
            populateKernelArgDescriptor(dst, argNum, *reinterpret_cast<const SPatchImageMemoryObjectKernelArgument *>(src.objectArg));
            dst.kernelAttributes.flags.usesImages = true;
            break;
        case PATCH_TOKEN_SAMPLER_KERNEL_ARGUMENT:
            populateKernelArgDescriptor(dst, argNum, *reinterpret_cast<const SPatchSamplerKernelArgument *>(src.objectArg));
            dst.kernelAttributes.flags.usesSamplers = true;
            break;
        case PATCH_TOKEN_GLOBAL_MEMORY_OBJECT_KERNEL_ARGUMENT:
            populateKernelArgDescriptor(dst, argNum, *reinterpret_cast<const SPatchGlobalMemoryObjectKernelArgument *>(src.objectArg));
            break;
        case PATCH_TOKEN_STATELESS_GLOBAL_MEMORY_OBJECT_KERNEL_ARGUMENT:
            populateKernelArgDescriptor(dst, argNum, *reinterpret_cast<const SPatchStatelessGlobalMemoryObjectKernelArgument *>(src.objectArg));
            break;
        case PATCH_TOKEN_STATELESS_CONSTANT_MEMORY_OBJECT_KERNEL_ARGUMENT:
            populateKernelArgDescriptor(dst, argNum, *reinterpret_cast<const SPatchStatelessConstantMemoryObjectKernelArgument *>(src.objectArg));
            break;
        case PATCH_TOKEN_STATELESS_DEVICE_QUEUE_KERNEL_ARGUMENT:
            populateKernelArgDescriptor(dst, argNum, *reinterpret_cast<const SPatchStatelessDeviceQueueKernelArgument *>(src.objectArg));
            break;
        }
    }

    switch (src.objectType) {
    default:
        UNRECOVERABLE_IF(PatchTokenBinary::ArgObjectType::None != src.objectType);
        break;
    case PatchTokenBinary::ArgObjectType::Buffer: {
        auto &asBufferArg = dst.payloadMappings.explicitArgs[argNum].as<ArgDescPointer>(true);
        asBufferArg.bufferOffset = getOffset(src.metadata.buffer.bufferOffset);
        if (src.metadata.buffer.pureStateful != nullptr) {
            asBufferArg.accessedUsingStatelessAddressingMode = false;
        }
    } break;
    case PatchTokenBinary::ArgObjectType::Image: {
        auto &asImageArg = dst.payloadMappings.explicitArgs[argNum].as<ArgDescImage>(true);
        asImageArg.metadataPayload.imgWidth = getOffset(src.metadata.image.width);
        asImageArg.metadataPayload.imgHeight = getOffset(src.metadata.image.height);
        asImageArg.metadataPayload.imgDepth = getOffset(src.metadata.image.depth);
        asImageArg.metadataPayload.channelDataType = getOffset(src.metadata.image.channelDataType);
        asImageArg.metadataPayload.channelOrder = getOffset(src.metadata.image.channelOrder);
        asImageArg.metadataPayload.arraySize = getOffset(src.metadata.image.arraySize);
        asImageArg.metadataPayload.numSamples = getOffset(src.metadata.image.numSamples);
        asImageArg.metadataPayload.numMipLevels = getOffset(src.metadata.image.numMipLevels);
        asImageArg.metadataPayload.flatBaseOffset = getOffset(src.metadata.image.flatBaseOffset);
        asImageArg.metadataPayload.flatWidth = getOffset(src.metadata.image.flatWidth);
        asImageArg.metadataPayload.flatHeight = getOffset(src.metadata.image.flatHeight);
        asImageArg.metadataPayload.flatPitch = getOffset(src.metadata.image.flatPitch);
        dst.kernelAttributes.flags.usesImages = true;
    } break;
    case PatchTokenBinary::ArgObjectType::Sampler: {
        auto &asSamplerArg = dst.payloadMappings.explicitArgs[argNum].as<ArgDescSampler>(true);
        asSamplerArg.metadataPayload.samplerSnapWa = getOffset(src.metadata.sampler.coordinateSnapWaRequired);
        asSamplerArg.metadataPayload.samplerAddressingMode = getOffset(src.metadata.sampler.addressMode);
        asSamplerArg.metadataPayload.samplerNormalizedCoords = getOffset(src.metadata.sampler.normalizedCoords);
        dst.kernelAttributes.flags.usesSamplers = true;
    } break;
    case PatchTokenBinary::ArgObjectType::Slm: {
        markArgAsPatchable(dst, argNum);
        auto &asBufferArg = dst.payloadMappings.explicitArgs[argNum].as<ArgDescPointer>(true);
        asBufferArg.requiredSlmAlignment = src.metadata.slm.token->SourceOffset;
        asBufferArg.slmOffset = src.metadata.slm.token->Offset;
    } break;
    }

    switch (src.objectTypeSpecialized) {
    default:
        UNRECOVERABLE_IF(PatchTokenBinary::ArgObjectTypeSpecialized::None != src.objectTypeSpecialized);
        break;
    case PatchTokenBinary::ArgObjectTypeSpecialized::Vme: {
        dst.payloadMappings.explicitArgs[argNum].getExtendedTypeInfo().hasVmeExtendedDescriptor = true;
        dst.payloadMappings.explicitArgsExtendedDescriptors.resize(dst.payloadMappings.explicitArgs.size());

        auto vmeDescriptor = std::make_unique<ArgDescVme>();
        vmeDescriptor->mbBlockType = getOffset(src.metadataSpecialized.vme.mbBlockType);
        vmeDescriptor->subpixelMode = getOffset(src.metadataSpecialized.vme.subpixelMode);
        vmeDescriptor->sadAdjustMode = getOffset(src.metadataSpecialized.vme.sadAdjustMode);
        vmeDescriptor->searchPathType = getOffset(src.metadataSpecialized.vme.searchPathType);
        dst.payloadMappings.explicitArgsExtendedDescriptors[argNum] = std::move(vmeDescriptor);
    } break;
    }

    for (auto &byValArg : src.byValMap) {
        if (PatchTokenBinary::ArgObjectType::Slm != src.objectType) {
            populateKernelArgDescriptor(dst, argNum, *byValArg);
        }
    }
    populateArgMetadata(dst, argNum, src.argInfo);
}

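// Top-level entry point: translates a fully decoded patchtokens kernel into a
// KernelDescriptor. A minimal usage sketch (assuming 'decodedKernel' was
// produced by the patchtokens decoder and the device uses 64-bit pointers):
//   KernelDescriptor desc;
//   populateKernelDescriptor(desc, decodedKernel, 8u);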
void populateKernelDescriptor(KernelDescriptor &dst, const PatchTokenBinary::KernelFromPatchtokens &src, uint32_t gpuPointerSizeInBytes) {
    UNRECOVERABLE_IF(nullptr == src.header);
    dst.kernelMetadata.kernelName = std::string(src.name.begin(), src.name.end()).c_str();
    populateKernelDescriptorIfNotNull(dst, src.tokens.executionEnvironment);
    populateKernelDescriptorIfNotNull(dst, src.tokens.samplerStateArray);
    populateKernelDescriptorIfNotNull(dst, src.tokens.bindingTableState);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateLocalSurface);
    populateKernelDescriptorIfNotNull(dst, src.tokens.mediaVfeState[0], 0);
    populateKernelDescriptorIfNotNull(dst, src.tokens.mediaVfeState[1], 1);
    populateKernelDescriptorIfNotNull(dst, src.tokens.threadPayload);
    populateKernelDescriptorIfNotNull(dst, src.tokens.dataParameterStream);
    populateKernelDescriptorIfNotNull(dst, src.tokens.kernelAttributesInfo);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateStatelessPrivateSurface);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateStatelessConstantMemorySurfaceWithInitialization);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateStatelessGlobalMemorySurfaceWithInitialization);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateStatelessPrintfSurface);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateStatelessEventPoolSurface);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateStatelessDefaultDeviceQueueSurface);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateSyncBuffer);
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateRTGlobalBuffer);

    dst.payloadMappings.explicitArgs.resize(src.tokens.kernelArgs.size());
    dst.explicitArgsExtendedMetadata.resize(src.tokens.kernelArgs.size());

    for (size_t i = 0U; i < src.tokens.kernelArgs.size(); ++i) {
        auto &decodedKernelArg = src.tokens.kernelArgs[i];
        populateArgDescriptor(dst, i, decodedKernelArg);
    }

    for (auto &str : src.tokens.strings) {
        populateKernelDescriptorIfNotNull(dst, str);
    }

    dst.kernelAttributes.flags.usesVme |= (src.tokens.inlineVmeSamplerInfo != nullptr);
    dst.entryPoints.systemKernel = src.tokens.stateSip ? src.tokens.stateSip->SystemKernelOffset : 0U;
    populateKernelDescriptorIfNotNull(dst, src.tokens.allocateSystemThreadSurface);

    for (uint32_t i = 0; i < 3U; ++i) {
        dst.payloadMappings.dispatchTraits.localWorkSize[i] = getOffset(src.tokens.crossThreadPayloadArgs.localWorkSize[i]);
        dst.payloadMappings.dispatchTraits.localWorkSize2[i] = getOffset(src.tokens.crossThreadPayloadArgs.localWorkSize2[i]);
        dst.payloadMappings.dispatchTraits.globalWorkOffset[i] = getOffset(src.tokens.crossThreadPayloadArgs.globalWorkOffset[i]);
        dst.payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i] = getOffset(src.tokens.crossThreadPayloadArgs.enqueuedLocalWorkSize[i]);
        dst.payloadMappings.dispatchTraits.globalWorkSize[i] = getOffset(src.tokens.crossThreadPayloadArgs.globalWorkSize[i]);
        dst.payloadMappings.dispatchTraits.numWorkGroups[i] = getOffset(src.tokens.crossThreadPayloadArgs.numWorkGroups[i]);
    }
    dst.payloadMappings.dispatchTraits.workDim = getOffset(src.tokens.crossThreadPayloadArgs.workDimensions);

    dst.payloadMappings.implicitArgs.maxWorkGroupSize = getOffset(src.tokens.crossThreadPayloadArgs.maxWorkGroupSize);
    dst.payloadMappings.implicitArgs.simdSize = getOffset(src.tokens.crossThreadPayloadArgs.simdSize);
    dst.payloadMappings.implicitArgs.deviceSideEnqueueParentEvent = getOffset(src.tokens.crossThreadPayloadArgs.parentEvent);
    dst.payloadMappings.implicitArgs.preferredWkgMultiple = getOffset(src.tokens.crossThreadPayloadArgs.preferredWorkgroupMultiple);
    dst.payloadMappings.implicitArgs.privateMemorySize = getOffset(src.tokens.crossThreadPayloadArgs.privateMemoryStatelessSize);
    dst.payloadMappings.implicitArgs.localMemoryStatelessWindowSize = getOffset(src.tokens.crossThreadPayloadArgs.localMemoryStatelessWindowSize);
    dst.payloadMappings.implicitArgs.localMemoryStatelessWindowStartAddres = getOffset(src.tokens.crossThreadPayloadArgs.localMemoryStatelessWindowStartAddress);
    dst.payloadMappings.implicitArgs.implicitArgsBuffer = getOffset(src.tokens.crossThreadPayloadArgs.implicitArgsBufferOffset);

    if (src.tokens.gtpinInfo) {
        dst.external.igcInfoForGtpin = (src.tokens.gtpinInfo + 1);
    }

    dst.kernelAttributes.binaryFormat = DeviceBinaryFormat::Patchtokens;
    dst.kernelAttributes.gpuPointerSize = gpuPointerSizeInBytes;
    dst.kernelAttributes.flags.requiresImplicitArgs = src.tokens.crossThreadPayloadArgs.implicitArgsBufferOffset != nullptr;

    if (DebugManager.flags.UpdateCrossThreadDataSize.get()) {
        dst.updateCrossThreadDataSize();
    }

    if (KernelDescriptor::isBindlessAddressingKernel(dst)) {
        dst.initBindlessOffsetToSurfaceState();
    }
}

} // namespace NEO