compute-runtime/level_zero/core/source/kernel_imp.cpp

/*
 * Copyright (C) 2019-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "level_zero/core/source/kernel_imp.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/register_offsets.h"
#include "shared/source/helpers/string.h"
#include "shared/source/kernel/kernel_descriptor.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/utilities/arrayref.h"
#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/program/kernel_info.h"
#include "level_zero/core/source/device.h"
#include "level_zero/core/source/image.h"
#include "level_zero/core/source/module.h"
#include "level_zero/core/source/module_imp.h"
#include "level_zero/core/source/printf_handler.h"
#include "level_zero/core/source/sampler.h"
#include <memory>
namespace L0 {
KernelImmutableData::KernelImmutableData(L0::Device *l0device) : device(l0device) {}
KernelImmutableData::~KernelImmutableData() {
    if (nullptr != isaGraphicsAllocation) {
        this->getDevice()->getDriverHandle()->getMemoryManager()->freeGraphicsMemory(&*isaGraphicsAllocation);
        isaGraphicsAllocation.release();
    }
    crossThreadDataTemplate.reset();
    if (nullptr != privateMemoryGraphicsAllocation) {
        this->getDevice()->getDriverHandle()->getMemoryManager()->freeGraphicsMemory(&*privateMemoryGraphicsAllocation);
        privateMemoryGraphicsAllocation.release();
    }
    surfaceStateHeapTemplate.reset();
    dynamicStateHeapTemplate.reset();
}

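// Patches an implicit surface (private memory, global constants or global variables) into the
// kernel's cross-thread data and, when a bindful offset is provided, into its surface state heap.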
inline void patchWithImplicitSurface(ArrayRef<uint8_t> crossThreadData, ArrayRef<uint8_t> surfaceStateHeap,
                                     uintptr_t ptrToPatchInCrossThreadData, NEO::GraphicsAllocation &allocation,
                                     const NEO::ArgDescPointer &ptr, const NEO::Device &device) {
    if (false == crossThreadData.empty()) {
        NEO::patchPointer(crossThreadData, ptr, ptrToPatchInCrossThreadData);
    }
    if ((false == surfaceStateHeap.empty()) && (NEO::isValidOffset(ptr.bindful))) {
        auto surfaceState = surfaceStateHeap.begin() + ptr.bindful;
        void *addressToPatch = reinterpret_cast<void *>(allocation.getUnderlyingBuffer());
        size_t sizeToPatch = allocation.getUnderlyingBufferSize();
        NEO::Buffer::setSurfaceState(&device, surfaceState, sizeToPatch, addressToPatch, 0,
                                     &allocation, 0, 0);
    }
}

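// Copies the kernel ISA into a device allocation, captures the cross-thread data and heap
// templates from the compiled kernel, and patches implicit surfaces (private memory,
// global constants, global variables) where the kernel descriptor requires them.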
void KernelImmutableData::initialize(NEO::KernelInfo *kernelInfo, NEO::MemoryManager &memoryManager,
                                     const NEO::Device *device, uint32_t computeUnitsUsedForSratch,
                                     NEO::GraphicsAllocation *globalConstBuffer,
                                     NEO::GraphicsAllocation *globalVarBuffer) {
    UNRECOVERABLE_IF(kernelInfo == nullptr);
    this->kernelDescriptor = &kernelInfo->kernelDescriptor;

    auto kernelIsaSize = kernelInfo->heapInfo.pKernelHeader->KernelHeapSize;
    auto allocation = memoryManager.allocateGraphicsMemoryWithProperties(
        {device->getRootDeviceIndex(), kernelIsaSize, NEO::GraphicsAllocation::AllocationType::KERNEL_ISA});
    UNRECOVERABLE_IF(allocation == nullptr);

    if (kernelInfo->heapInfo.pKernelHeap != nullptr) {
        memoryManager.copyMemoryToAllocation(allocation, kernelInfo->heapInfo.pKernelHeap, kernelIsaSize);
    }
    isaGraphicsAllocation.reset(allocation);

    this->crossThreadDataSize = this->kernelDescriptor->kernelAttributes.crossThreadDataSize;

    ArrayRef<uint8_t> crossThreadDataArrayRef;
    if (crossThreadDataSize != 0) {
        crossThreadDataTemplate.reset(new uint8_t[crossThreadDataSize]);

        if (kernelInfo->crossThreadData) {
            memcpy_s(crossThreadDataTemplate.get(), crossThreadDataSize,
                     kernelInfo->crossThreadData, crossThreadDataSize);
        } else {
            memset(crossThreadDataTemplate.get(), 0x00, crossThreadDataSize);
        }

        crossThreadDataArrayRef = ArrayRef<uint8_t>(this->crossThreadDataTemplate.get(), this->crossThreadDataSize);
        NEO::patchNonPointer<uint32_t>(crossThreadDataArrayRef,
                                       kernelDescriptor->payloadMappings.implicitArgs.simdSize, kernelDescriptor->kernelAttributes.simdSize);
    }

    if (kernelInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize != 0) {
        this->surfaceStateHeapSize = kernelInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize;
        surfaceStateHeapTemplate.reset(new uint8_t[surfaceStateHeapSize]);
        memcpy_s(surfaceStateHeapTemplate.get(), surfaceStateHeapSize,
                 kernelInfo->heapInfo.pSsh, surfaceStateHeapSize);
    }

    if (kernelInfo->heapInfo.pKernelHeader->DynamicStateHeapSize != 0) {
        this->dynamicStateHeapSize = kernelInfo->heapInfo.pKernelHeader->DynamicStateHeapSize;
        dynamicStateHeapTemplate.reset(new uint8_t[dynamicStateHeapSize]);
        memcpy_s(dynamicStateHeapTemplate.get(), dynamicStateHeapSize,
                 kernelInfo->heapInfo.pDsh, dynamicStateHeapSize);
    }

    ArrayRef<uint8_t> surfaceStateHeapArrayRef = ArrayRef<uint8_t>(surfaceStateHeapTemplate.get(), getSurfaceStateHeapSize());

    uint32_t privateSurfaceSize = kernelDescriptor->kernelAttributes.perThreadPrivateMemorySize;
    if (privateSurfaceSize != 0) {
        privateSurfaceSize *= computeUnitsUsedForSratch * kernelDescriptor->kernelAttributes.simdSize;
        UNRECOVERABLE_IF(privateSurfaceSize == 0);
        this->privateMemoryGraphicsAllocation.reset(memoryManager.allocateGraphicsMemoryWithProperties(
            {0, privateSurfaceSize, NEO::GraphicsAllocation::AllocationType::PRIVATE_SURFACE}));
        UNRECOVERABLE_IF(this->privateMemoryGraphicsAllocation == nullptr);

        patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                                 static_cast<uintptr_t>(privateMemoryGraphicsAllocation->getGpuAddressToPatch()),
                                 *privateMemoryGraphicsAllocation, kernelDescriptor->payloadMappings.implicitArgs.privateMemoryAddress, *device);
        this->residencyContainer.push_back(this->privateMemoryGraphicsAllocation.get());
    }

    if (NEO::isValidOffset(kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
        UNRECOVERABLE_IF(nullptr == globalConstBuffer);
        patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                                 static_cast<uintptr_t>(globalConstBuffer->getGpuAddressToPatch()),
                                 *globalConstBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalConstantsSurfaceAddress, *device);
        this->residencyContainer.push_back(globalConstBuffer);
    }

    if (NEO::isValidOffset(kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
        UNRECOVERABLE_IF(globalVarBuffer == nullptr);
        patchWithImplicitSurface(crossThreadDataArrayRef, surfaceStateHeapArrayRef,
                                 static_cast<uintptr_t>(globalVarBuffer->getGpuAddressToPatch()),
                                 *globalVarBuffer, kernelDescriptor->payloadMappings.implicitArgs.globalVariablesSurfaceAddress, *device);
        this->residencyContainer.push_back(globalVarBuffer);
    }
}

uint32_t KernelImmutableData::getIsaSize() const {
    return static_cast<uint32_t>(isaGraphicsAllocation->getUnderlyingBufferSize());
}

uint64_t KernelImmutableData::getPrivateMemorySize() const {
    uint64_t size = 0;
    if (privateMemoryGraphicsAllocation != nullptr) {
        size = privateMemoryGraphicsAllocation->getUnderlyingBufferSize();
    }
    return size;
}

KernelImp::KernelImp(Module *module) : module(module) {}

KernelImp::~KernelImp() {
    if (perThreadDataForWholeThreadGroup != nullptr) {
        alignedFree(perThreadDataForWholeThreadGroup);
    }
    if (printfBuffer != nullptr) {
        module->getDevice()->getDriverHandle()->getMemoryManager()->freeGraphicsMemory(printfBuffer);
    }
    slmArgSizes.clear();
    crossThreadData.reset();
    surfaceStateHeapData.reset();
    dynamicStateHeapData.reset();
}

ze_result_t KernelImp::setArgumentValue(uint32_t argIndex, size_t argSize,
                                        const void *pArgValue) {
    if (argIndex >= kernelArgHandlers.size()) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }
    return (this->*kernelArgHandlers[argIndex])(argIndex, argSize, pArgValue);
}

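// Patches the global work size and the number of work groups into the cross-thread data,
// based on the currently set group size.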
void KernelImp::setGroupCount(uint32_t groupCountX, uint32_t groupCountY, uint32_t groupCountZ) {
    uint32_t groupSizeX;
    uint32_t groupSizeY;
    uint32_t groupSizeZ;
    getGroupSize(groupSizeX, groupSizeY, groupSizeZ);

    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    uint32_t globalWorkSize[3] = {groupCountX * groupSizeX, groupCountY * groupSizeY,
                                  groupCountZ * groupSizeZ};
    auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.globalWorkSize, globalWorkSize);

    uint32_t groupCount[3] = {groupCountX, groupCountY, groupCountZ};
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.numWorkGroups, groupCount);
}

bool KernelImp::getGroupCountOffsets(uint32_t *locations) {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    for (int i = 0; i < 3; i++) {
        if (NEO::isValidOffset(desc.payloadMappings.dispatchTraits.numWorkGroups[i])) {
            locations[i] = desc.payloadMappings.dispatchTraits.numWorkGroups[i];
        } else {
            return false;
        }
    }
    return true;
}

bool KernelImp::getGroupSizeOffsets(uint32_t *locations) {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    for (int i = 0; i < 3; i++) {
        if (NEO::isValidOffset(desc.payloadMappings.dispatchTraits.globalWorkSize[i])) {
            locations[i] = desc.payloadMappings.dispatchTraits.globalWorkSize[i];
        } else {
            return false;
        }
    }
    return true;
}

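// Validates the requested group size, (re)allocates the per-thread data buffer, generates
// local IDs for the whole thread group, and computes the execution mask for the last,
// possibly partially filled, SIMD thread.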
ze_result_t KernelImp::setGroupSize(uint32_t groupSizeX, uint32_t groupSizeY,
                                    uint32_t groupSizeZ) {
    if ((0 == groupSizeX) || (0 == groupSizeY) || (0 == groupSizeZ)) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

    auto numChannels = kernelImmData->getDescriptor().kernelAttributes.numLocalIdChannels;
    Vec3<size_t> groupSize{groupSizeX, groupSizeY, groupSizeZ};
    auto itemsInGroup = Math::computeTotalElementsCount(groupSize);
    if (itemsInGroup > module->getMaxGroupSize()) {
        return ZE_RESULT_ERROR_UNKNOWN;
    }

    auto grfSize = kernelImmData->getDescriptor().kernelAttributes.grfSize;
    uint32_t perThreadDataSizeForWholeThreadGroupNeeded =
        static_cast<uint32_t>(NEO::PerThreadDataHelper::getPerThreadDataSizeTotal(
            kernelImmData->getDescriptor().kernelAttributes.simdSize, grfSize, numChannels, itemsInGroup));
    if (perThreadDataSizeForWholeThreadGroupNeeded >
        perThreadDataSizeForWholeThreadGroupAllocated) {
        alignedFree(perThreadDataForWholeThreadGroup);
        perThreadDataForWholeThreadGroup = static_cast<uint8_t *>(alignedMalloc(perThreadDataSizeForWholeThreadGroupNeeded, 32));
        perThreadDataSizeForWholeThreadGroupAllocated = perThreadDataSizeForWholeThreadGroupNeeded;
    }
    perThreadDataSizeForWholeThreadGroup = perThreadDataSizeForWholeThreadGroupNeeded;

    if (numChannels > 0) {
        UNRECOVERABLE_IF(3 != numChannels);
        NEO::generateLocalIDs(
            perThreadDataForWholeThreadGroup,
            static_cast<uint16_t>(kernelImmData->getDescriptor().kernelAttributes.simdSize),
            std::array<uint16_t, 3>{{static_cast<uint16_t>(groupSizeX),
                                     static_cast<uint16_t>(groupSizeY),
                                     static_cast<uint16_t>(groupSizeZ)}},
            std::array<uint8_t, 3>{{0, 1, 2}},
            false, grfSize);
    }

    this->groupSize[0] = groupSizeX;
    this->groupSize[1] = groupSizeY;
    this->groupSize[2] = groupSizeZ;

    auto simdSize = kernelImmData->getDescriptor().kernelAttributes.simdSize;
    this->threadsPerThreadGroup = static_cast<uint32_t>((itemsInGroup + simdSize - 1u) / simdSize);
    this->perThreadDataSize = perThreadDataSizeForWholeThreadGroup / threadsPerThreadGroup;
    patchWorkgroupSizeInCrossThreadData(groupSizeX, groupSizeY, groupSizeZ);

    auto remainderSimdLanes = itemsInGroup & (simdSize - 1u);
    threadExecutionMask = static_cast<uint32_t>(maxNBitValue(remainderSimdLanes));
    if (!threadExecutionMask) {
        threadExecutionMask = ~threadExecutionMask;
    }
    return ZE_RESULT_SUCCESS;
}

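// Suggests a group size for the given global size, either via the ND heuristic
// (when EnableComputeWorkSizeND is set) or via the 1D/2D/squared helpers.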
ze_result_t KernelImp::suggestGroupSize(uint32_t globalSizeX, uint32_t globalSizeY,
                                        uint32_t globalSizeZ, uint32_t *groupSizeX,
                                        uint32_t *groupSizeY, uint32_t *groupSizeZ) {
    size_t retGroupSize[3] = {};
    auto maxWorkGroupSize = module->getMaxGroupSize();
    auto simd = kernelImmData->getDescriptor().kernelAttributes.simdSize;
    size_t workItems[3] = {globalSizeX, globalSizeY, globalSizeZ};
    uint32_t dim = (globalSizeY > 1U) ? 2 : 1U;
    dim = (globalSizeZ > 1U) ? 3 : dim;

    if (NEO::DebugManager.flags.EnableComputeWorkSizeND.get()) {
        auto usesImages = getImmutableData()->getDescriptor().kernelAttributes.flags.usesImages;
        auto coreFamily = module->getDevice()->getNEODevice()->getHardwareInfo().platform.eRenderCoreFamily;
        const auto &deviceInfo = module->getDevice()->getNEODevice()->getDeviceInfo();
        uint32_t numThreadsPerSubSlice = (uint32_t)deviceInfo.maxNumEUsPerSubSlice * deviceInfo.numThreadsPerEU;
        uint32_t localMemSize = (uint32_t)deviceInfo.localMemSize;

        NEO::WorkSizeInfo wsInfo(maxWorkGroupSize, this->hasBarriers(), simd, this->getSlmTotalSize(),
                                 coreFamily, numThreadsPerSubSlice, localMemSize,
                                 usesImages, false);
        NEO::computeWorkgroupSizeND(wsInfo, retGroupSize, workItems, dim);
    } else {
        if (1U == dim) {
            NEO::computeWorkgroupSize1D(maxWorkGroupSize, retGroupSize, workItems, simd);
        } else if (NEO::DebugManager.flags.EnableComputeWorkSizeSquared.get() && (2U == dim)) {
            NEO::computeWorkgroupSizeSquared(maxWorkGroupSize, retGroupSize, workItems, simd, dim);
        } else {
            NEO::computeWorkgroupSize2D(maxWorkGroupSize, retGroupSize, workItems, simd);
        }
    }
    *groupSizeX = static_cast<uint32_t>(retGroupSize[0]);
    *groupSizeY = static_cast<uint32_t>(retGroupSize[1]);
    *groupSizeZ = static_cast<uint32_t>(retGroupSize[2]);
    return ZE_RESULT_SUCCESS;
}

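// Upper-bounds the number of work groups that can run concurrently, based on available
// HW threads, SLM usage and barrier count. A group size must already have been set.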
uint32_t KernelImp::suggestMaxCooperativeGroupCount() {
    UNRECOVERABLE_IF(0 == groupSize[0]);
    UNRECOVERABLE_IF(0 == groupSize[1]);
    UNRECOVERABLE_IF(0 == groupSize[2]);

    auto &hardwareInfo = module->getDevice()->getHwInfo();
    auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
    if (dssCount == 0) {
        dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
    }

    auto &hwHelper = NEO::HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    auto &descriptor = kernelImmData->getDescriptor();
    auto availableThreadCount = hwHelper.calculateAvailableThreadCount(
        hardwareInfo.platform.eProductFamily,
        descriptor.kernelAttributes.numGrfRequired,
        hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount);

    auto usesBarriers = descriptor.kernelAttributes.flags.usesBarriers;
    const uint32_t workDim = 3;
    const size_t localWorkSize[] = {groupSize[0], groupSize[1], groupSize[2]};
    return NEO::KernelHelper::getMaxWorkGroupCount(descriptor.kernelAttributes.simdSize,
                                                   availableThreadCount,
                                                   dssCount,
                                                   dssCount * KB * hardwareInfo.capabilityTable.slmSize,
                                                   hwHelper.alignSlmSize(slmArgsTotalSize + descriptor.kernelAttributes.slmInlineSize),
                                                   static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
                                                   hwHelper.getBarriersCountFromHasBarriers(usesBarriers),
                                                   workDim,
                                                   localWorkSize);
}

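// setAttribute accepts only the three indirect-access booleans (size must be sizeof(bool));
// getAttribute additionally reports the kernel's source attribute string.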
ze_result_t KernelImp::setAttribute(ze_kernel_attribute_t attr, uint32_t size, const void *pValue) {
    if (size != sizeof(bool)) {
        return ZE_RESULT_ERROR_INVALID_KERNEL_ATTRIBUTE_VALUE;
    }

    if (attr == ZE_KERNEL_ATTR_INDIRECT_DEVICE_ACCESS) {
        this->unifiedMemoryControls.indirectDeviceAllocationsAllowed = *(static_cast<const bool *>(pValue));
    } else if (attr == ZE_KERNEL_ATTR_INDIRECT_HOST_ACCESS) {
        this->unifiedMemoryControls.indirectHostAllocationsAllowed = *(static_cast<const bool *>(pValue));
    } else if (attr == ZE_KERNEL_ATTR_INDIRECT_SHARED_ACCESS) {
        this->unifiedMemoryControls.indirectSharedAllocationsAllowed = *(static_cast<const bool *>(pValue));
    } else {
        return ZE_RESULT_ERROR_INVALID_ENUMERATION;
    }
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getAttribute(ze_kernel_attribute_t attr, uint32_t *pSize, void *pValue) {
    if (attr == ZE_KERNEL_ATTR_INDIRECT_DEVICE_ACCESS) {
        memcpy_s(pValue, sizeof(bool), &this->unifiedMemoryControls.indirectDeviceAllocationsAllowed, sizeof(bool));
        return ZE_RESULT_SUCCESS;
    }
    if (attr == ZE_KERNEL_ATTR_INDIRECT_HOST_ACCESS) {
        memcpy_s(pValue, sizeof(bool), &this->unifiedMemoryControls.indirectHostAllocationsAllowed, sizeof(bool));
        return ZE_RESULT_SUCCESS;
    }
    if (attr == ZE_KERNEL_ATTR_INDIRECT_SHARED_ACCESS) {
        memcpy_s(pValue, sizeof(bool), &this->unifiedMemoryControls.indirectSharedAllocationsAllowed, sizeof(bool));
        return ZE_RESULT_SUCCESS;
    }
    if (attr == ZE_KERNEL_ATTR_SOURCE_ATTRIBUTE) {
        auto &desc = kernelImmData->getDescriptor();
        if (pValue == nullptr) {
            *pSize = (uint32_t)desc.kernelMetadata.kernelLanguageAttributes.length() + 1;
        } else {
            strncpy_s((char *)pValue, desc.kernelMetadata.kernelLanguageAttributes.length() + 1,
                      desc.kernelMetadata.kernelLanguageAttributes.c_str(),
                      desc.kernelMetadata.kernelLanguageAttributes.length() + 1);
        }
        return ZE_RESULT_SUCCESS;
    }
    return ZE_RESULT_ERROR_INVALID_ENUMERATION;
}

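// Copies an immediate (by-value) argument into the cross-thread data, element by element;
// a null argVal writes zeros instead.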
ze_result_t KernelImp::setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (kernelImmData->getDescriptor().payloadMappings.explicitArgs.size() <= argIndex) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex];
    for (const auto &element : arg.as<NEO::ArgDescValue>().elements) {
        if (element.sourceOffset < argSize) {
            size_t maxBytesToCopy = argSize - element.sourceOffset;
            size_t bytesToCopy = std::min(static_cast<size_t>(element.size), maxBytesToCopy);

            auto pDst = ptrOffset(crossThreadData.get(), element.offset);
            if (argVal) {
                auto pSrc = ptrOffset(argVal, element.sourceOffset);
                memcpy_s(pDst, element.size, pSrc, bytesToCopy);
            } else {
                uint64_t val = 0;
                memcpy_s(pDst, element.size,
                         reinterpret_cast<void *>(&val), bytesToCopy);
            }
        } else {
            return ZE_RESULT_ERROR_INVALID_ARGUMENT;
        }
    }
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgRedescribedImage(uint32_t argIndex, ze_image_handle_t argVal) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescImage>();
    if (argVal == nullptr) {
        residencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

    const auto image = Image::fromHandle(argVal);
    image->copyRedescribedSurfaceStateToSSH(surfaceStateHeapData.get(), arg.bindful);
    residencyContainer[argIndex] = image->getAllocation();
    return ZE_RESULT_SUCCESS;
}

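// Patches a buffer argument whose backing allocation is already known: stateless pointer in
// cross-thread data, optional bindful surface state, and the residency container entry.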
ze_result_t KernelImp::setArgBufferWithAlloc(uint32_t argIndex, const void *argVal, NEO::GraphicsAllocation *allocation) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescPointer>();
    const auto val = *reinterpret_cast<const uintptr_t *>(argVal);
    NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), arg, val);
    if (NEO::isValidOffset(arg.bindful)) {
        setBufferSurfaceState(argIndex, reinterpret_cast<void *>(val), allocation);
    }
    residencyContainer[argIndex] = allocation;
    return ZE_RESULT_SUCCESS;
}

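// For local (SLM) pointer arguments this recomputes the SLM offsets of all subsequent local
// arguments; for global pointers it resolves the SVM allocation behind the passed address
// and defers to setArgBufferWithAlloc().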
ze_result_t KernelImp::setArgBuffer(uint32_t argIndex, size_t argSize, const void *argVal) {
    const auto &allArgs = kernelImmData->getDescriptor().payloadMappings.explicitArgs;
    const auto &currArg = allArgs[argIndex];
    if (currArg.getTraits().getAddressQualifier() == NEO::KernelArgMetadata::AddrLocal) {
        slmArgSizes[argIndex] = static_cast<uint32_t>(argSize);
        UNRECOVERABLE_IF(NEO::isUndefinedOffset(currArg.as<NEO::ArgDescPointer>().slmOffset));
        auto slmOffset = *reinterpret_cast<uint32_t *>(crossThreadData.get() + currArg.as<NEO::ArgDescPointer>().slmOffset);
        slmOffset += static_cast<uint32_t>(argSize);
        ++argIndex;
        while (argIndex < kernelImmData->getDescriptor().payloadMappings.explicitArgs.size()) {
            if (allArgs[argIndex].getTraits().getAddressQualifier() != NEO::KernelArgMetadata::AddrLocal) {
                ++argIndex;
                continue;
            }
            const auto &nextArg = allArgs[argIndex].as<NEO::ArgDescPointer>();
            UNRECOVERABLE_IF(0 == nextArg.requiredSlmAlignment);
            slmOffset = alignUp<uint32_t>(slmOffset, nextArg.requiredSlmAlignment);
            NEO::patchNonPointer<uint32_t>(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize), nextArg.slmOffset, slmOffset);

            slmOffset += static_cast<uint32_t>(slmArgSizes[argIndex]);
            ++argIndex;
        }
        slmArgsTotalSize = static_cast<uint32_t>(alignUp(slmOffset, KB));
        return ZE_RESULT_SUCCESS;
    }

    if (nullptr == argVal) {
        residencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

    auto requestedAddress = *reinterpret_cast<void *const *>(argVal);
    auto svmAllocsManager = module->getDevice()->getDriverHandle()->getSvmAllocsManager();
    NEO::GraphicsAllocation *alloc = svmAllocsManager->getSVMAllocs()->get(requestedAddress)->gpuAllocation;
    return setArgBufferWithAlloc(argIndex, argVal, alloc);
}

ze_result_t KernelImp::setArgImage(uint32_t argIndex, size_t argSize, const void *argVal) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescImage>();
    if (argVal == nullptr) {
        residencyContainer[argIndex] = nullptr;
        return ZE_RESULT_SUCCESS;
    }

    const auto image = Image::fromHandle(*static_cast<const ze_image_handle_t *>(argVal));
    image->copySurfaceStateToSSH(surfaceStateHeapData.get(), arg.bindful);
    residencyContainer[argIndex] = image->getAllocation();
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal) {
    const auto &arg = kernelImmData->getDescriptor().payloadMappings.explicitArgs[argIndex].as<NEO::ArgDescSampler>();
    const auto sampler = Sampler::fromHandle(*static_cast<const ze_sampler_handle_t *>(argVal));
    sampler->copySamplerStateToDSH(dynamicStateHeapData.get(), dynamicStateHeapDataSize, arg.bindful);
    return ZE_RESULT_SUCCESS;
}

ze_result_t KernelImp::getProperties(ze_kernel_properties_t *pKernelProperties) {
    size_t kernel_name_size = std::min(this->kernelImmData->getDescriptor().kernelMetadata.kernelName.size(),
                                       static_cast<size_t>(ZE_MAX_KERNEL_NAME));
    strncpy_s(pKernelProperties->name, ZE_MAX_KERNEL_NAME,
              this->kernelImmData->getDescriptor().kernelMetadata.kernelName.c_str(), kernel_name_size);

    pKernelProperties->requiredGroupSizeX = this->groupSize[0];
    pKernelProperties->requiredGroupSizeY = this->groupSize[1];
    pKernelProperties->requiredGroupSizeZ = this->groupSize[2];

    pKernelProperties->numKernelArgs =
        static_cast<uint32_t>(this->kernelImmData->getDescriptor().payloadMappings.explicitArgs.size());
    return ZE_RESULT_SUCCESS;
}

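// Binds the kernel to its immutable data, selects an argument handler per explicit argument,
// clones the heap and cross-thread data templates into per-instance copies, applies the
// required (or a default) group size, and sets up the printf buffer and residency container.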
ze_result_t KernelImp::initialize(const ze_kernel_desc_t *desc) {
    if (desc->version != ZE_KERNEL_DESC_VERSION_CURRENT) {
        return ZE_RESULT_ERROR_UNSUPPORTED_VERSION;
    }

    this->kernelImmData = module->getKernelImmutableData(desc->pKernelName);
    if (this->kernelImmData == nullptr) {
        return ZE_RESULT_ERROR_INVALID_ARGUMENT;
    }

    for (const auto &argT : kernelImmData->getDescriptor().payloadMappings.explicitArgs) {
        switch (argT.type) {
        default:
            UNRECOVERABLE_IF(true);
            break;
        case NEO::ArgDescriptor::ArgTPointer:
            this->kernelArgHandlers.push_back(&KernelImp::setArgBuffer);
            break;
        case NEO::ArgDescriptor::ArgTImage:
            this->kernelArgHandlers.push_back(&KernelImp::setArgImage);
            break;
        case NEO::ArgDescriptor::ArgTSampler:
            this->kernelArgHandlers.push_back(&KernelImp::setArgSampler);
            break;
        case NEO::ArgDescriptor::ArgTValue:
            this->kernelArgHandlers.push_back(&KernelImp::setArgImmediate);
            break;
        }
    }

    slmArgSizes.resize(this->kernelArgHandlers.size(), 0);

    if (kernelImmData->getSurfaceStateHeapSize() > 0) {
        this->surfaceStateHeapData.reset(new uint8_t[kernelImmData->getSurfaceStateHeapSize()]);
        memcpy_s(this->surfaceStateHeapData.get(),
                 kernelImmData->getSurfaceStateHeapSize(),
                 kernelImmData->getSurfaceStateHeapTemplate(),
                 kernelImmData->getSurfaceStateHeapSize());
        this->surfaceStateHeapDataSize = kernelImmData->getSurfaceStateHeapSize();
    }

    if (kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize != 0) {
        this->crossThreadData.reset(new uint8_t[kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize]);
        memcpy_s(this->crossThreadData.get(),
                 kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize,
                 kernelImmData->getCrossThreadDataTemplate(),
                 kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize);
        this->crossThreadDataSize = kernelImmData->getDescriptor().kernelAttributes.crossThreadDataSize;
    }

    if (kernelImmData->getDynamicStateHeapDataSize() != 0) {
        this->dynamicStateHeapData.reset(new uint8_t[kernelImmData->getDynamicStateHeapDataSize()]);
        memcpy_s(this->dynamicStateHeapData.get(),
                 kernelImmData->getDynamicStateHeapDataSize(),
                 kernelImmData->getDynamicStateHeapTemplate(),
                 kernelImmData->getDynamicStateHeapDataSize());
        this->dynamicStateHeapDataSize = kernelImmData->getDynamicStateHeapDataSize();
    }

    if (kernelImmData->getDescriptor().kernelAttributes.requiredWorkgroupSize[0] > 0) {
        auto *reqdSize = kernelImmData->getDescriptor().kernelAttributes.requiredWorkgroupSize;
        UNRECOVERABLE_IF(reqdSize[1] == 0);
        UNRECOVERABLE_IF(reqdSize[2] == 0);
        auto result = setGroupSize(reqdSize[0], reqdSize[1], reqdSize[2]);
        if (result != ZE_RESULT_SUCCESS) {
            return result;
        }
    } else {
        auto result = setGroupSize(kernelImmData->getDescriptor().kernelAttributes.simdSize, 1, 1);
        if (result != ZE_RESULT_SUCCESS) {
            return result;
        }
    }

    residencyContainer.resize(this->kernelArgHandlers.size(), nullptr);
    this->createPrintfBuffer();
    for (auto &alloc : kernelImmData->getResidencyContainer()) {
        residencyContainer.push_back(alloc);
    }

    return ZE_RESULT_SUCCESS;
}

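// Allocates the printf output buffer and patches its GPU address into the cross-thread data
// when the kernel uses printf.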
void KernelImp::createPrintfBuffer() {
    if (this->kernelImmData->getDescriptor().kernelAttributes.flags.usesPrintf) {
        this->printfBuffer = PrintfHandler::createPrintfBuffer(this->module->getDevice());
        this->residencyContainer.push_back(printfBuffer);
        NEO::patchPointer(ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize),
                          this->getImmutableData()->getDescriptor().payloadMappings.implicitArgs.printfSurfaceAddress,
                          static_cast<uintptr_t>(this->printfBuffer->getGpuAddressToPatch()));
    }
}

void KernelImp::printPrintfOutput() {
    PrintfHandler::printOutput(kernelImmData, this->printfBuffer, module->getDevice());
}

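// Patches the local work size (and its enqueued/secondary variants) into the cross-thread data.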
void KernelImp::patchWorkgroupSizeInCrossThreadData(uint32_t x, uint32_t y, uint32_t z) {
    const NEO::KernelDescriptor &desc = kernelImmData->getDescriptor();
    auto dst = ArrayRef<uint8_t>(crossThreadData.get(), crossThreadDataSize);
    uint32_t workgroupSize[3] = {x, y, z};
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize, workgroupSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.localWorkSize2, workgroupSize);
    NEO::patchVecNonPointer(dst, desc.payloadMappings.dispatchTraits.enqueuedLocalWorkSize, workgroupSize);
}

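// Factory entry point: instantiates the product-specific KernelImp, initializes it, and
// destroys the instance again if initialization fails.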
Kernel *Kernel::create(uint32_t productFamily, Module *module,
                       const ze_kernel_desc_t *desc, ze_result_t *res) {
    UNRECOVERABLE_IF(productFamily >= IGFX_MAX_PRODUCT);
    KernelAllocatorFn allocator = kernelFactory[productFamily];
    auto function = static_cast<KernelImp *>(allocator(module));
    *res = function->initialize(desc);
    if (*res) {
        function->destroy();
        return nullptr;
    }
    return function;
}

bool KernelImp::hasBarriers() {
    return getImmutableData()->getDescriptor().kernelAttributes.flags.usesBarriers;
}

uint32_t KernelImp::getSlmTotalSize() {
    return slmArgsTotalSize + getImmutableData()->getDescriptor().kernelAttributes.slmInlineSize;
}

uint32_t KernelImp::getBindingTableOffset() {
    return getImmutableData()->getDescriptor().payloadMappings.bindingTable.tableOffset;
}

uint32_t KernelImp::getBorderColor() {
    return getImmutableData()->getDescriptor().payloadMappings.samplerTable.borderColor;
}

uint32_t KernelImp::getSamplerTableOffset() {
    return getImmutableData()->getDescriptor().payloadMappings.samplerTable.tableOffset;
}

uint32_t KernelImp::getNumSurfaceStates() {
    return getImmutableData()->getDescriptor().payloadMappings.bindingTable.numEntries;
}

uint32_t KernelImp::getNumSamplers() {
    return getImmutableData()->getDescriptor().payloadMappings.samplerTable.numSamplers;
}

uint32_t KernelImp::getSimdSize() {
    return getImmutableData()->getDescriptor().kernelAttributes.simdSize;
}

uint32_t KernelImp::getSizeCrossThreadData() {
    return getCrossThreadDataSize();
}

uint32_t KernelImp::getPerThreadScratchSize() {
    return getImmutableData()->getDescriptor().kernelAttributes.perThreadScratchSize[0];
}

uint32_t KernelImp::getThreadsPerThreadGroupCount() {
    return getThreadsPerThreadGroup();
}

uint32_t KernelImp::getSizePerThreadData() {
    return getPerThreadDataSize();
}

uint32_t KernelImp::getSizePerThreadDataForWholeGroup() {
    return getPerThreadDataSizeForWholeThreadGroup();
}

uint32_t KernelImp::getSizeSurfaceStateHeapData() {
    return getSurfaceStateHeapDataSize();
}

uint32_t KernelImp::getPerThreadExecutionMask() {
    return getThreadExecutionMask();
}

uint32_t *KernelImp::getCountOffsets() {
    return groupCountOffsets;
}

uint32_t *KernelImp::getSizeOffsets() {
    return groupSizeOffsets;
}

uint32_t *KernelImp::getLocalWorkSize() {
    if (hasGroupSize()) {
        getGroupSize(localWorkSize[0], localWorkSize[1], localWorkSize[2]);
    }
    return localWorkSize;
}

uint32_t KernelImp::getNumGrfRequired() {
    return getImmutableData()->getDescriptor().kernelAttributes.numGrfRequired;
}

NEO::GraphicsAllocation *KernelImp::getIsaAllocation() {
    return getImmutableData()->getIsaGraphicsAllocation();
}

bool KernelImp::hasGroupCounts() {
    return getGroupCountOffsets(groupCountOffsets);
}

bool KernelImp::hasGroupSize() {
    return getGroupSizeOffsets(groupSizeOffsets);
}

const void *KernelImp::getSurfaceStateHeap() {
    return getSurfaceStateHeapData();
}

const void *KernelImp::getDynamicStateHeap() {
    return getDynamicStateHeapData();
}

const void *KernelImp::getCrossThread() {
    return getCrossThreadData();
}

const void *KernelImp::getPerThread() {
    return getPerThreadData();
}

} // namespace L0