Files
compute-runtime/shared/source/kernel/kernel_descriptor.h
Dominik Dabek 5e62df4f8e Check IndirectStatelessCount from igc
If kernel has no stateless indirect accesses don't set the
kernelHasIndirectAccess flag.
Don't make resident or migrate if kernel has no indirect accesses.
Changed initial values in KernelAttributes.

Related-To: NEO-6597

Signed-off-by: Dominik Dabek <dominik.dabek@intel.com>
2022-02-02 12:28:30 +01:00

294 lines
14 KiB
C++

/*
* Copyright (C) 2020-2022 Intel Corporation
*
* SPDX-License-Identifier: MIT
*
*/
#pragma once
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/kernel/debug_data.h"
#include "shared/source/kernel/kernel_arg_descriptor.h"
#include "shared/source/kernel/kernel_arg_metadata.h"
#include "shared/source/utilities/arrayref.h"
#include "shared/source/utilities/stackvec.h"
#include <cinttypes>
#include <cstddef>
#include <limits>
#include <memory>
#include <unordered_map>
#include <vector>
namespace NEO {
using StringMap = std::unordered_map<uint32_t, std::string>;
using InstructionsSegmentOffset = uint16_t;
struct ExtendedInfoBase {
virtual ~ExtendedInfoBase() = default;
};
struct KernelDescriptor {
enum AddressingMode : uint8_t {
AddrNone,
Stateless,
Bindful,
Bindless,
BindfulAndStateless,
BindlessAndStateless
};
KernelDescriptor() = default;
virtual ~KernelDescriptor() = default;
virtual bool hasRTCalls() const;
void updateCrossThreadDataSize() {
uint32_t crossThreadDataSize = 0;
for (uint32_t i = 0; i < 3; i++) {
if (isValidOffset(payloadMappings.dispatchTraits.globalWorkOffset[i])) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.globalWorkOffset[i] + sizeof(uint32_t));
}
if (isValidOffset(payloadMappings.dispatchTraits.globalWorkSize[i])) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.globalWorkSize[i] + sizeof(uint32_t));
}
if (isValidOffset(payloadMappings.dispatchTraits.localWorkSize[i])) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.localWorkSize[i] + sizeof(uint32_t));
}
if (isValidOffset(payloadMappings.dispatchTraits.localWorkSize2[i])) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.localWorkSize2[i] + sizeof(uint32_t));
}
if (isValidOffset(payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i])) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.enqueuedLocalWorkSize[i] + sizeof(uint32_t));
}
if (isValidOffset(payloadMappings.dispatchTraits.numWorkGroups[i])) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.numWorkGroups[i] + sizeof(uint32_t));
}
}
if (isValidOffset(payloadMappings.dispatchTraits.workDim)) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, payloadMappings.dispatchTraits.workDim + sizeof(uint32_t));
}
StackVec<ArgDescPointer *, 8> implicitArgsVec({&payloadMappings.implicitArgs.printfSurfaceAddress,
&payloadMappings.implicitArgs.globalVariablesSurfaceAddress,
&payloadMappings.implicitArgs.globalConstantsSurfaceAddress,
&payloadMappings.implicitArgs.privateMemoryAddress,
&payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress,
&payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress,
&payloadMappings.implicitArgs.systemThreadSurfaceAddress,
&payloadMappings.implicitArgs.syncBufferAddress});
for (size_t i = 0; i < implicitArgsVec.size(); i++) {
if (isValidOffset(implicitArgsVec[i]->bindless)) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, implicitArgsVec[i]->bindless + sizeof(uint32_t));
}
if (isValidOffset(implicitArgsVec[i]->stateless)) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, implicitArgsVec[i]->stateless + implicitArgsVec[i]->pointerSize);
}
}
StackVec<CrossThreadDataOffset *, 7> implicitArgsVec2({&payloadMappings.implicitArgs.privateMemorySize,
&payloadMappings.implicitArgs.maxWorkGroupSize,
&payloadMappings.implicitArgs.simdSize,
&payloadMappings.implicitArgs.deviceSideEnqueueParentEvent,
&payloadMappings.implicitArgs.preferredWkgMultiple,
&payloadMappings.implicitArgs.localMemoryStatelessWindowSize,
&payloadMappings.implicitArgs.localMemoryStatelessWindowStartAddres});
for (size_t i = 0; i < implicitArgsVec2.size(); i++) {
if (isValidOffset(*implicitArgsVec2[i])) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, *implicitArgsVec2[i] + sizeof(uint32_t));
}
}
for (size_t i = 0; i < payloadMappings.explicitArgs.size(); i++) {
switch (payloadMappings.explicitArgs[i].type) {
case ArgDescriptor::ArgType::ArgTImage: {
auto &argImage = payloadMappings.explicitArgs[i].as<ArgDescImage>(false);
if (isValidOffset(argImage.bindless)) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argImage.bindless + sizeof(uint32_t));
}
} break;
case ArgDescriptor::ArgType::ArgTPointer: {
auto &argPtr = payloadMappings.explicitArgs[i].as<ArgDescPointer>(false);
if (isValidOffset(argPtr.bindless)) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argPtr.bindless + sizeof(uint32_t));
}
if (isValidOffset(argPtr.stateless)) {
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argPtr.stateless + argPtr.pointerSize);
}
} break;
case ArgDescriptor::ArgType::ArgTSampler: {
auto &argSampler = payloadMappings.explicitArgs[i].as<ArgDescSampler>(false);
UNRECOVERABLE_IF(isValidOffset(argSampler.bindless));
} break;
case ArgDescriptor::ArgType::ArgTValue: {
auto &argVal = payloadMappings.explicitArgs[i].as<ArgDescValue>(false);
for (size_t i = 0; i < argVal.elements.size(); i++) {
UNRECOVERABLE_IF(!isValidOffset(argVal.elements[i].offset));
crossThreadDataSize = std::max<uint32_t>(crossThreadDataSize, argVal.elements[i].offset + argVal.elements[i].size);
}
} break;
default:
break;
}
}
this->kernelAttributes.crossThreadDataSize = std::max<uint16_t>(this->kernelAttributes.crossThreadDataSize, static_cast<uint16_t>(alignUp(crossThreadDataSize, 32)));
}
struct KernelAttributes {
KernelAttributes() { flags.packed = 0U; }
uint32_t slmInlineSize = 0U;
uint32_t perThreadScratchSize[2] = {0U, 0U};
uint32_t perHwThreadPrivateMemorySize = 0U;
uint32_t perThreadSystemThreadSurfaceSize = 0U;
uint16_t requiredWorkgroupSize[3] = {0U, 0U, 0U};
uint16_t crossThreadDataSize = 0U;
uint16_t inlineDataPayloadSize = 0U;
uint16_t perThreadDataSize = 0U;
uint16_t numArgsToPatch = 0U;
uint16_t numGrfRequired = 0U;
uint8_t barrierCount = 0u;
bool hasNonKernelArgLoad = false;
bool hasNonKernelArgStore = false;
bool hasNonKernelArgAtomic = false;
AddressingMode bufferAddressingMode = BindfulAndStateless;
AddressingMode imageAddressingMode = Bindful;
AddressingMode samplerAddressingMode = Bindful;
uint8_t workgroupWalkOrder[3] = {0, 1, 2};
uint8_t workgroupDimensionsOrder[3] = {0, 1, 2};
uint8_t gpuPointerSize = 0;
uint8_t simdSize = 8;
uint8_t numLocalIdChannels = 0;
uint8_t localId[3] = {0U, 0U, 0U};
bool supportsBuffersBiggerThan4Gb() const {
return Stateless == bufferAddressingMode;
}
bool usesBarriers() const {
return 0 != barrierCount;
}
union {
struct {
bool usesSpecialPipelineSelectMode : 1;
bool usesStringMapForPrintf : 1;
bool usesPrintf : 1;
bool usesFencesForReadWriteImages : 1;
bool usesFlattenedLocalIds;
bool usesPrivateMemory : 1;
bool usesVme : 1;
bool usesImages : 1;
bool usesSamplers : 1;
bool usesSyncBuffer : 1;
bool useGlobalAtomics : 1;
bool usesStatelessWrites : 1;
bool passInlineData : 1;
bool perThreadDataHeaderIsPresent : 1;
bool perThreadDataUnusedGrfIsPresent : 1;
bool requiresDisabledMidThreadPreemption : 1;
bool requiresSubgroupIndependentForwardProgress : 1;
bool requiresWorkgroupWalkOrder : 1;
bool requiresImplicitArgs : 1;
bool useStackCalls : 1;
};
uint32_t packed;
} flags;
static_assert(sizeof(KernelAttributes::flags) == sizeof(KernelAttributes::flags.packed), "");
} kernelAttributes;
struct {
InstructionsSegmentOffset skipPerThreadDataLoad = 0U;
InstructionsSegmentOffset skipSetFFIDGP = 0U;
InstructionsSegmentOffset systemKernel = 0U;
} entryPoints;
struct PayloadMappings {
struct {
CrossThreadDataOffset globalWorkOffset[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
CrossThreadDataOffset globalWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
CrossThreadDataOffset localWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
CrossThreadDataOffset localWorkSize2[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
CrossThreadDataOffset enqueuedLocalWorkSize[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
CrossThreadDataOffset numWorkGroups[3] = {undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>, undefined<CrossThreadDataOffset>};
CrossThreadDataOffset workDim = undefined<CrossThreadDataOffset>;
} dispatchTraits;
struct {
SurfaceStateHeapOffset tableOffset = undefined<SurfaceStateHeapOffset>;
uint8_t numEntries = 0;
} bindingTable;
struct {
DynamicStateHeapOffset tableOffset = undefined<DynamicStateHeapOffset>;
DynamicStateHeapOffset borderColor = undefined<DynamicStateHeapOffset>;
uint8_t numSamplers = 0;
} samplerTable;
StackVec<ArgDescriptor, 16> explicitArgs;
struct {
ArgDescPointer printfSurfaceAddress;
ArgDescPointer globalVariablesSurfaceAddress;
ArgDescPointer globalConstantsSurfaceAddress;
ArgDescPointer privateMemoryAddress;
ArgDescPointer deviceSideEnqueueEventPoolSurfaceAddress;
ArgDescPointer deviceSideEnqueueDefaultQueueSurfaceAddress;
ArgDescPointer systemThreadSurfaceAddress;
ArgDescPointer syncBufferAddress;
ArgDescPointer rtDispatchGlobals;
CrossThreadDataOffset privateMemorySize = undefined<CrossThreadDataOffset>;
CrossThreadDataOffset maxWorkGroupSize = undefined<CrossThreadDataOffset>;
CrossThreadDataOffset simdSize = undefined<CrossThreadDataOffset>;
CrossThreadDataOffset deviceSideEnqueueParentEvent = undefined<CrossThreadDataOffset>;
CrossThreadDataOffset preferredWkgMultiple = undefined<CrossThreadDataOffset>;
CrossThreadDataOffset localMemoryStatelessWindowSize = undefined<CrossThreadDataOffset>;
CrossThreadDataOffset localMemoryStatelessWindowStartAddres = undefined<CrossThreadDataOffset>;
} implicitArgs;
std::vector<std::unique_ptr<ArgDescriptorExtended>> explicitArgsExtendedDescriptors;
} payloadMappings;
std::vector<ArgTypeMetadataExtended> explicitArgsExtendedMetadata;
struct {
std::string kernelName;
std::string kernelLanguageAttributes;
StringMap printfStringsMap;
std::vector<std::pair<uint32_t, uint32_t>> deviceSideEnqueueChildrenKernelsIdOffset;
uint32_t deviceSideEnqueueBlockInterfaceDescriptorOffset = 0U;
struct ByValueArgument {
ArgDescValue::Element byValueElement;
uint16_t argNum;
};
StackVec<ByValueArgument, 32> allByValueKernelArguments;
uint16_t compiledSubGroupsNumber = 0U;
uint8_t requiredSubGroupSize = 0U;
} kernelMetadata;
struct {
std::unique_ptr<DebugData> debugData;
std::unique_ptr<uint8_t[]> relocatedDebugData;
const void *igcInfoForGtpin = nullptr;
} external;
std::vector<uint8_t> generatedHeaps;
std::unique_ptr<ExtendedInfoBase> extendedInfo;
};
} // namespace NEO