/*
 * Copyright (C) 2018-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "opencl/source/kernel/kernel.h"

#include "shared/source/built_ins/built_ins.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/execution_environment/execution_environment.h"
#include "shared/source/execution_environment/root_device_environment.h"
#include "shared/source/gmm_helper/gmm.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/gmm_helper/resource_info.h"
#include "shared/source/helpers/address_patch.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/bindless_heaps_helper.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/get_info.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/helpers/simd_helper.h"
#include "shared/source/helpers/surface_format_info.h"
#include "shared/source/kernel/implicit_args.h"
#include "shared/source/kernel/kernel_arg_descriptor_extended_vme.h"
#include "shared/source/kernel/local_ids_cache.h"
#include "shared/source/memory_manager/allocation_properties.h"
#include "shared/source/memory_manager/compression_selector.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/product_helper.h"
#include "shared/source/page_fault_manager/cpu_page_fault_manager.h"
#include "shared/source/program/kernel_info.h"
#include "shared/source/utilities/lookup_array.h"
#include "shared/source/utilities/tag_allocator.h"

#include "opencl/source/accelerators/intel_accelerator.h"
#include "opencl/source/accelerators/intel_motion_estimation.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/command_queue/cl_local_work_size.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/context/context.h"
#include "opencl/source/event/event.h"
#include "opencl/source/gtpin/gtpin_notify.h"
#include "opencl/source/helpers/cl_gfx_core_helper.h"
#include "opencl/source/helpers/cl_validators.h"
#include "opencl/source/helpers/dispatch_info.h"
#include "opencl/source/helpers/get_info_status_mapper.h"
#include "opencl/source/helpers/sampler_helpers.h"
#include "opencl/source/kernel/image_transformer.h"
#include "opencl/source/kernel/kernel_info_cl.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/mem_obj/pipe.h"
#include "opencl/source/memory_manager/mem_obj_surface.h"
#include "opencl/source/program/program.h"
#include "opencl/source/sampler/sampler.h"

#include "patch_list.h"

#include <algorithm>
#include <cstdint>
#include <vector>

using namespace iOpenCL;

namespace NEO {
class Surface;

uint32_t Kernel::dummyPatchLocation = 0xbaddf00d;

Kernel::Kernel(Program *programArg, const KernelInfo &kernelInfoArg, ClDevice &clDeviceArg)
    : executionEnvironment(programArg->getExecutionEnvironment()),
      program(programArg),
      clDevice(clDeviceArg),
      kernelInfo(kernelInfoArg) {
    program->retain();
    program->retainForKernel();
    imageTransformer.reset(new ImageTransformer);
    auto &deviceInfo = getDevice().getDevice().getDeviceInfo();
    if (isSimd1(kernelInfoArg.kernelDescriptor.kernelAttributes.simdSize)) {
        auto &productHelper = getDevice().getProductHelper();
        maxKernelWorkGroupSize = productHelper.getMaxThreadsForWorkgroupInDSSOrSS(getHardwareInfo(), static_cast<uint32_t>(deviceInfo.maxNumEUsPerSubSlice), static_cast<uint32_t>(deviceInfo.maxNumEUsPerDualSubSlice));
    } else {
        maxKernelWorkGroupSize = static_cast<uint32_t>(deviceInfo.maxWorkGroupSize);
    }
    slmTotalSize = kernelInfoArg.kernelDescriptor.kernelAttributes.slmInlineSize;
}

Kernel::~Kernel() {
    delete[] crossThreadData;
    crossThreadData = nullptr;
    crossThreadDataSize = 0;

    if (privateSurface) {
        program->peekExecutionEnvironment().memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(privateSurface);
        privateSurface = nullptr;
    }

    for (uint32_t i = 0; i < patchedArgumentsNum; i++) {
        if (SAMPLER_OBJ == getKernelArguments()[i].type) {
            auto sampler = castToObject<Sampler>(kernelArguments.at(i).object);
            if (sampler) {
                sampler->decRefInternal();
            }
        }
    }

    kernelArgHandlers.clear();
    program->releaseForKernel();
    program->release();
}

// If dstOffsetBytes is not an invalid offset, then patches dst at dstOffsetBytes
// with src casted to DstT type.
template <typename DstT, typename SrcT>
inline void patch(const SrcT &src, void *dst, CrossThreadDataOffset dstOffsetBytes) {
    if (isValidOffset(dstOffsetBytes)) {
        DstT *patchLocation = reinterpret_cast<DstT *>(ptrOffset(dst, dstOffsetBytes));
        *patchLocation = static_cast<DstT>(src);
    }
}

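// Usage sketch (illustrative only, not part of the runtime): writing a 32-bit value
// into cross-thread data at a compiler-reported offset is a single guarded store, e.g.
//   patch<uint32_t, size_t>(workDim, crossThreadData, someCrossThreadDataOffset);
// where someCrossThreadDataOffset is a hypothetical offset; nothing is written when
// the offset equals undefined<CrossThreadDataOffset>.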
void Kernel::patchWithImplicitSurface(uint64_t ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const ArgDescPointer &arg) {
    if ((nullptr != crossThreadData) && isValidOffset(arg.stateless)) {
        auto pp = ptrOffset(crossThreadData, arg.stateless);
        patchWithRequiredSize(pp, arg.pointerSize, ptrToPatchInCrossThreadData);
        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
            PatchInfoData patchInfoData(ptrToPatchInCrossThreadData, 0u, PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData), arg.stateless, PatchInfoAllocationType::IndirectObjectHeap, arg.pointerSize);
            this->patchInfoDataList.push_back(patchInfoData);
        }
    }

    void *ssh = getSurfaceStateHeap();
    if ((nullptr != ssh) && isValidOffset(arg.bindful)) {
        auto surfaceState = ptrOffset(ssh, arg.bindful);
        void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
        size_t sizeToPatch = allocation.getUnderlyingBufferSize();
        Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, &allocation, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }
}

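// Builds the per-kernel execution state: allocates the kernel's private copies of
// cross-thread data and the surface state heap, patches implicit surfaces
// (private/constant/global memory), resolves the thread arbitration policy,
// decides whether aux translation is needed and installs a setter handler per
// explicit argument (buffer, pipe, image, sampler, accelerator or immediate).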
cl_int Kernel::initialize() {
    auto pClDevice = &getDevice();
    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
    reconfigureKernel();
    auto &hwInfo = pClDevice->getHardwareInfo();
    auto &rootDeviceEnvironment = pClDevice->getRootDeviceEnvironment();
    auto &gfxCoreHelper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
    auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
    auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    const auto &implicitArgs = kernelDescriptor.payloadMappings.implicitArgs;
    const auto &explicitArgs = kernelDescriptor.payloadMappings.explicitArgs;
    auto maxSimdSize = kernelInfo.getMaxSimdSize();
    const auto &heapInfo = kernelInfo.heapInfo;

    auto localMemSize = static_cast<uint32_t>(clDevice.getDevice().getDeviceInfo().localMemSize);
    auto slmTotalSize = this->getSlmTotalSize();
    if (slmTotalSize > 0 && localMemSize < slmTotalSize) {
        PRINT_DEBUG_STRING(NEO::DebugManager.flags.PrintDebugMessages.get(), stderr, "Size of SLM (%u) larger than available (%u)\n", slmTotalSize, localMemSize);
        return CL_OUT_OF_RESOURCES;
    }

    if (maxSimdSize != 1 && maxSimdSize < gfxCoreHelper.getMinimalSIMDSize()) {
        return CL_INVALID_KERNEL;
    }

    if (kernelDescriptor.kernelAttributes.flags.requiresImplicitArgs) {
        pImplicitArgs = std::make_unique<ImplicitArgs>();
        *pImplicitArgs = {};
        pImplicitArgs->structSize = offsetof(ImplicitArgs, reserved);
        pImplicitArgs->structVersion = 0;
        pImplicitArgs->simdWidth = maxSimdSize;
    }
    auto ret = KernelHelper::checkIfThereIsSpaceForScratchOrPrivate(kernelDescriptor.kernelAttributes, &pClDevice->getDevice());
    if (ret == NEO::KernelHelper::ErrorCode::INVALID_KERNEL) {
        return CL_INVALID_KERNEL;
    }
    if (ret == NEO::KernelHelper::ErrorCode::OUT_OF_DEVICE_MEMORY) {
        return CL_OUT_OF_RESOURCES;
    }

    crossThreadDataSize = kernelDescriptor.kernelAttributes.crossThreadDataSize;

    // now allocate our own cross-thread data, if necessary
    if (crossThreadDataSize) {
        crossThreadData = new char[crossThreadDataSize];

        if (kernelInfo.crossThreadData) {
            memcpy_s(crossThreadData, crossThreadDataSize,
                     kernelInfo.crossThreadData, crossThreadDataSize);
        } else {
            memset(crossThreadData, 0x00, crossThreadDataSize);
        }

        auto crossThread = reinterpret_cast<uint32_t *>(crossThreadData);
        auto setArgsIfValidOffset = [&](uint32_t *&crossThreadData, NEO::CrossThreadDataOffset offset, uint32_t value) {
            if (isValidOffset(offset)) {
                crossThreadData = ptrOffset(crossThread, offset);
                *crossThreadData = value;
            }
        };
        setArgsIfValidOffset(maxWorkGroupSizeForCrossThreadData, implicitArgs.maxWorkGroupSize, maxKernelWorkGroupSize);
        setArgsIfValidOffset(dataParameterSimdSize, implicitArgs.simdSize, maxSimdSize);
        setArgsIfValidOffset(preferredWkgMultipleOffset, implicitArgs.preferredWkgMultiple, maxSimdSize);
        setArgsIfValidOffset(parentEventOffset, implicitArgs.deviceSideEnqueueParentEvent, undefined<uint32_t>);
    }

    // allocate our own SSH, if necessary
    sshLocalSize = heapInfo.surfaceStateHeapSize;
    if (sshLocalSize) {
        pSshLocal = std::make_unique<char[]>(sshLocalSize);

        // copy the ssh into our local copy
        memcpy_s(pSshLocal.get(), sshLocalSize,
                 heapInfo.pSsh, heapInfo.surfaceStateHeapSize);
    }
    numberOfBindingTableStates = kernelDescriptor.payloadMappings.bindingTable.numEntries;
    localBindingTableOffset = kernelDescriptor.payloadMappings.bindingTable.tableOffset;

    // patch crossthread data and ssh with inline surfaces, if necessary
    auto status = patchPrivateSurface();
    if (CL_SUCCESS != status) {
        return status;
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress.stateless)) {
        DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
        uint64_t constMemory = isBuiltIn ? castToUint64(program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer()) : program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch();

        const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalConstantsSurfaceAddress;
        patchWithImplicitSurface(constMemory, *program->getConstantSurface(rootDeviceIndex), arg);
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress.stateless)) {
        DEBUG_BREAK_IF(program->getGlobalSurface(rootDeviceIndex) == nullptr);
        uint64_t globalMemory = isBuiltIn ? castToUint64(program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer()) : program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch();

        const auto &arg = kernelDescriptor.payloadMappings.implicitArgs.globalVariablesSurfaceAddress;
        patchWithImplicitSurface(globalMemory, *program->getGlobalSurface(rootDeviceIndex), arg);
    }

    // Patch Surface State Heap
    bool useGlobalAtomics = kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
                                      kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueEventPoolSurfaceAddress.bindful);
        Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0, useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    if (isValidOffset(kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()),
                                      kernelDescriptor.payloadMappings.implicitArgs.deviceSideEnqueueDefaultQueueSurfaceAddress.bindful);
        Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0, useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    auto &threadArbitrationPolicy = const_cast<ThreadArbitrationPolicy &>(kernelInfo.kernelDescriptor.kernelAttributes.threadArbitrationPolicy);
    if (threadArbitrationPolicy == ThreadArbitrationPolicy::NotPresent) {
        threadArbitrationPolicy = static_cast<ThreadArbitrationPolicy>(gfxCoreHelper.getDefaultThreadArbitrationPolicy());
    }
    if (kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresSubgroupIndependentForwardProgress == true) {
        threadArbitrationPolicy = ThreadArbitrationPolicy::RoundRobin;
    }

    auto &clGfxCoreHelper = rootDeviceEnvironment.getHelper<ClGfxCoreHelper>();

    auxTranslationRequired = !program->getIsBuiltIn() && GfxCoreHelper::compressedBuffersSupported(hwInfo) && clGfxCoreHelper.requiresAuxResolves(kernelInfo);

    if (DebugManager.flags.ForceAuxTranslationEnabled.get() != -1) {
        auxTranslationRequired &= !!DebugManager.flags.ForceAuxTranslationEnabled.get();
    }
    if (auxTranslationRequired) {
        program->getContextPtr()->setResolvesRequiredInKernels(true);
    }

    auto numArgs = explicitArgs.size();
    slmSizes.resize(numArgs);

    this->setInlineSamplers();
    bool detectIndirectAccessInKernel = productHelper.isDetectIndirectAccessInKernelSupported(kernelDescriptor, program->getCreatedFromBinary());
    if (DebugManager.flags.DetectIndirectAccessInKernel.get() != -1) {
        detectIndirectAccessInKernel = DebugManager.flags.DetectIndirectAccessInKernel.get() == 1;
    }
    if (detectIndirectAccessInKernel) {
        this->kernelHasIndirectAccess = kernelDescriptor.kernelAttributes.hasNonKernelArgLoad ||
                                        kernelDescriptor.kernelAttributes.hasNonKernelArgStore ||
                                        kernelDescriptor.kernelAttributes.hasNonKernelArgAtomic ||
                                        kernelDescriptor.kernelAttributes.hasIndirectStatelessAccess ||
                                        NEO::KernelHelper::isAnyArgumentPtrByValue(kernelDescriptor);
    } else {
        this->kernelHasIndirectAccess = true;
    }
    provideInitializationHints();
    // resolve the new kernel info to account for kernel handlers
    // I think by this time we have decoded the binary and know the number of args etc.
    // double check this assumption
    bool usingBuffers = false;
    kernelArguments.resize(numArgs);
    kernelArgHandlers.resize(numArgs);

    for (uint32_t i = 0; i < numArgs; ++i) {
        storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0);

        // set the argument handler
        const auto &arg = explicitArgs[i];
        if (arg.is<ArgDescriptor::ArgTPointer>()) {
            if (arg.getTraits().addressQualifier == KernelArgMetadata::AddrLocal) {
                kernelArgHandlers[i] = &Kernel::setArgLocal;
            } else if (arg.getTraits().typeQualifiers.pipeQual) {
                kernelArgHandlers[i] = &Kernel::setArgPipe;
                kernelArguments[i].type = PIPE_OBJ;
            } else {
                kernelArgHandlers[i] = &Kernel::setArgBuffer;
                kernelArguments[i].type = BUFFER_OBJ;
                usingBuffers = true;
                allBufferArgsStateful &= static_cast<uint32_t>(arg.as<ArgDescPointer>().isPureStateful());
            }
        } else if (arg.is<ArgDescriptor::ArgTImage>()) {
            kernelArgHandlers[i] = &Kernel::setArgImage;
            kernelArguments[i].type = IMAGE_OBJ;
            usingImages = true;
        } else if (arg.is<ArgDescriptor::ArgTSampler>()) {
            if (arg.getExtendedTypeInfo().isAccelerator) {
                kernelArgHandlers[i] = &Kernel::setArgAccelerator;
            } else {
                kernelArgHandlers[i] = &Kernel::setArgSampler;
                kernelArguments[i].type = SAMPLER_OBJ;
            }
        } else {
            kernelArgHandlers[i] = &Kernel::setArgImmediate;
        }
    }

    if (usingImages && !usingBuffers) {
        usingImagesOnly = true;
    }

    if (kernelDescriptor.kernelAttributes.numLocalIdChannels > 0) {
        initializeLocalIdsCache();
    }

    return CL_SUCCESS;
}

cl_int Kernel::patchPrivateSurface() {
    auto pClDevice = &getDevice();
    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
    auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    auto perHwThreadPrivateMemorySize = kernelDescriptor.kernelAttributes.perHwThreadPrivateMemorySize;
    if (perHwThreadPrivateMemorySize) {
        if (!privateSurface) {
            privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
            DEBUG_BREAK_IF(privateSurfaceSize == 0);

            privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
                {rootDeviceIndex,
                 static_cast<size_t>(privateSurfaceSize),
                 AllocationType::PRIVATE_SURFACE,
                 pClDevice->getDeviceBitfield()});
            if (privateSurface == nullptr) {
                return CL_OUT_OF_RESOURCES;
            }
        }

        const auto &privateMemoryAddress = kernelDescriptor.payloadMappings.implicitArgs.privateMemoryAddress;
        patchWithImplicitSurface(privateSurface->getGpuAddressToPatch(), *privateSurface, privateMemoryAddress);
    }
    return CL_SUCCESS;
}

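// Used by clCloneKernel(): copies all argument state (immediate data via cross-thread
// data, SVM pointers, mem objects) and exec-info settings from the source kernel into
// this freshly created instance so both can be enqueued independently.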
cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
    // copy cross thread data to store arguments set to source kernel with clSetKernelArg on immediate data (non-pointer types)
    memcpy_s(crossThreadData, crossThreadDataSize,
             pSourceKernel->crossThreadData, pSourceKernel->crossThreadDataSize);
    DEBUG_BREAK_IF(pSourceKernel->crossThreadDataSize != crossThreadDataSize);

    [[maybe_unused]] auto status = patchPrivateSurface();
    DEBUG_BREAK_IF(status != CL_SUCCESS);

    // copy arguments set to source kernel with clSetKernelArg or clSetKernelArgSVMPointer
    for (uint32_t i = 0; i < pSourceKernel->kernelArguments.size(); i++) {
        if (0 == pSourceKernel->getKernelArgInfo(i).size) {
            // skip copying arguments that haven't been set to source kernel
            continue;
        }
        switch (pSourceKernel->kernelArguments[i].type) {
        case NONE_OBJ:
            // all arguments with immediate data (non-pointer types) have been copied in cross thread data
            storeKernelArg(i, NONE_OBJ, nullptr, nullptr, pSourceKernel->getKernelArgInfo(i).size);
            patchedArgumentsNum++;
            kernelArguments[i].isPatched = true;
            break;
        case SVM_OBJ:
            setArgSvm(i, pSourceKernel->getKernelArgInfo(i).size, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
                      pSourceKernel->getKernelArgInfo(i).svmAllocation, pSourceKernel->getKernelArgInfo(i).svmFlags);
            break;
        case SVM_ALLOC_OBJ:
            setArgSvmAlloc(i, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
                           (GraphicsAllocation *)pSourceKernel->getKernelArgInfo(i).object,
                           pSourceKernel->getKernelArgInfo(i).allocId);
            break;
        case BUFFER_OBJ:
            setArg(i, pSourceKernel->getKernelArgInfo(i).size, &pSourceKernel->getKernelArgInfo(i).object);
            break;
        default:
            setArg(i, pSourceKernel->getKernelArgInfo(i).size, pSourceKernel->getKernelArgInfo(i).value);
            break;
        }
    }

    // copy additional information other than argument values set to source kernel with clSetKernelExecInfo
    for (auto &gfxAlloc : pSourceKernel->kernelSvmGfxAllocations) {
        kernelSvmGfxAllocations.push_back(gfxAlloc);
    }
    for (auto &gfxAlloc : pSourceKernel->kernelUnifiedMemoryGfxAllocations) {
        kernelUnifiedMemoryGfxAllocations.push_back(gfxAlloc);
    }

    if (pImplicitArgs) {
        memcpy_s(pImplicitArgs.get(), sizeof(ImplicitArgs), pSourceKernel->getImplicitArgs(), sizeof(ImplicitArgs));
    }
    this->isBuiltIn = pSourceKernel->isBuiltIn;

    return CL_SUCCESS;
}

cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize,
                       void *paramValue, size_t *paramValueSizeRet) const {
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    cl_uint numArgs = 0;
    const _cl_program *prog;
    const _cl_context *ctxt;
    cl_uint refCount = 0;
    uint64_t nonCannonizedGpuAddress = 0llu;
    auto gmmHelper = clDevice.getDevice().getGmmHelper();

    switch (paramName) {
    case CL_KERNEL_FUNCTION_NAME:
        pSrc = kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str();
        srcSize = kernelInfo.kernelDescriptor.kernelMetadata.kernelName.length() + 1;
        break;

    case CL_KERNEL_NUM_ARGS:
        srcSize = sizeof(cl_uint);
        numArgs = static_cast<cl_uint>(kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size());
        pSrc = &numArgs;
        break;

    case CL_KERNEL_CONTEXT:
        ctxt = &program->getContext();
        srcSize = sizeof(ctxt);
        pSrc = &ctxt;
        break;

    case CL_KERNEL_PROGRAM:
        prog = program;
        srcSize = sizeof(prog);
        pSrc = &prog;
        break;

    case CL_KERNEL_REFERENCE_COUNT:
        refCount = static_cast<cl_uint>(pMultiDeviceKernel->getRefApiCount());
        srcSize = sizeof(refCount);
        pSrc = &refCount;
        break;

    case CL_KERNEL_ATTRIBUTES:
        pSrc = kernelInfo.kernelDescriptor.kernelMetadata.kernelLanguageAttributes.c_str();
        srcSize = kernelInfo.kernelDescriptor.kernelMetadata.kernelLanguageAttributes.length() + 1;
        break;

    case CL_KERNEL_BINARY_PROGRAM_INTEL:
        pSrc = getKernelHeap();
        srcSize = getKernelHeapSize();
        break;
    case CL_KERNEL_BINARY_GPU_ADDRESS_INTEL:
        nonCannonizedGpuAddress = gmmHelper->decanonize(kernelInfo.kernelAllocation->getGpuAddress());
        pSrc = &nonCannonizedGpuAddress;
        srcSize = sizeof(nonCannonizedGpuAddress);
        break;
    default:
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

cl_int Kernel::getArgInfo(cl_uint argIndex, cl_kernel_arg_info paramName, size_t paramValueSize,
                          void *paramValue, size_t *paramValueSizeRet) const {
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;

    if (argIndex >= args.size()) {
        retVal = CL_INVALID_ARG_INDEX;
        return retVal;
    }

    program->callPopulateZebinExtendedArgsMetadataOnce(clDevice.getRootDeviceIndex());
    program->callGenerateDefaultExtendedArgsMetadataOnce(clDevice.getRootDeviceIndex());

    const auto &argTraits = args[argIndex].getTraits();
    const auto &argMetadata = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[argIndex];

    cl_kernel_arg_address_qualifier addressQualifier;
    cl_kernel_arg_access_qualifier accessQualifier;
    cl_kernel_arg_type_qualifier typeQualifier;

    switch (paramName) {
    case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
        addressQualifier = asClKernelArgAddressQualifier(argTraits.getAddressQualifier());
        srcSize = sizeof(addressQualifier);
        pSrc = &addressQualifier;
        break;

    case CL_KERNEL_ARG_ACCESS_QUALIFIER:
        accessQualifier = asClKernelArgAccessQualifier(argTraits.getAccessQualifier());
        srcSize = sizeof(accessQualifier);
        pSrc = &accessQualifier;
        break;

    case CL_KERNEL_ARG_TYPE_QUALIFIER:
        typeQualifier = asClKernelArgTypeQualifier(argTraits.typeQualifiers);
        srcSize = sizeof(typeQualifier);
        pSrc = &typeQualifier;
        break;

    case CL_KERNEL_ARG_TYPE_NAME:
        srcSize = argMetadata.type.length() + 1;
        pSrc = argMetadata.type.c_str();
        break;

    case CL_KERNEL_ARG_NAME:
        srcSize = argMetadata.argName.length() + 1;
        pSrc = argMetadata.argName.c_str();
        break;

    default:
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

cl_int Kernel::getWorkGroupInfo(cl_kernel_work_group_info paramName,
                                size_t paramValueSize, void *paramValue,
                                size_t *paramValueSizeRet) const {
    cl_int retVal = CL_INVALID_VALUE;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    struct SizeT3 {
        size_t val[3];
    } requiredWorkGroupSize;
    cl_ulong localMemorySize;
    const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    size_t preferredWorkGroupSizeMultiple = 0;
    cl_ulong scratchSize;
    cl_ulong privateMemSize;
    size_t maxWorkgroupSize;
    const auto &hwInfo = clDevice.getHardwareInfo();
    auto &gfxCoreHelper = this->getGfxCoreHelper();
    auto &clGfxCoreHelper = clDevice.getRootDeviceEnvironment().getHelper<ClGfxCoreHelper>();
    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

    switch (paramName) {
    case CL_KERNEL_WORK_GROUP_SIZE:
        maxWorkgroupSize = maxKernelWorkGroupSize;
        if (DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get()) {
            auto divisionSize = CommonConstants::maximalSimdSize / kernelInfo.getMaxSimdSize();
            maxWorkgroupSize /= divisionSize;
        }
        srcSize = sizeof(maxWorkgroupSize);
        pSrc = &maxWorkgroupSize;
        break;

    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
        requiredWorkGroupSize.val[0] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
        requiredWorkGroupSize.val[1] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
        requiredWorkGroupSize.val[2] = kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
        srcSize = sizeof(requiredWorkGroupSize);
        pSrc = &requiredWorkGroupSize;
        break;

    case CL_KERNEL_LOCAL_MEM_SIZE:
        localMemorySize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize;
        srcSize = sizeof(localMemorySize);
        pSrc = &localMemorySize;
        break;

    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
        preferredWorkGroupSizeMultiple = kernelInfo.getMaxSimdSize();
        if (gfxCoreHelper.isFusedEuDispatchEnabled(hwInfo, kernelDescriptor.kernelAttributes.flags.requiresDisabledEUFusion)) {
            preferredWorkGroupSizeMultiple *= 2;
        }
        srcSize = sizeof(preferredWorkGroupSizeMultiple);
        pSrc = &preferredWorkGroupSizeMultiple;
        break;

    case CL_KERNEL_SPILL_MEM_SIZE_INTEL:
        scratchSize = kernelDescriptor.kernelAttributes.perThreadScratchSize[0];
        srcSize = sizeof(scratchSize);
        pSrc = &scratchSize;
        break;
    case CL_KERNEL_PRIVATE_MEM_SIZE:
        privateMemSize = clGfxCoreHelper.getKernelPrivateMemSize(kernelInfo);
        srcSize = sizeof(privateMemSize);
        pSrc = &privateMemSize;
        break;
    case CL_KERNEL_EU_THREAD_COUNT_INTEL:
        srcSize = sizeof(cl_uint);
        pSrc = &this->getKernelInfo().kernelDescriptor.kernelAttributes.numThreadsRequired;
        break;
    default:
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

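// Implements the cl_khr_subgroups / OpenCL 2.1 sub-group queries. Sub-group counts
// are derived from the kernel's maximum SIMD width, e.g. a 64-item work-group on a
// SIMD16 kernel reports 64 / 16 = 4 sub-groups (plus one partial sub-group when the
// work-group size is not a multiple of the SIMD width).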
cl_int Kernel::getSubGroupInfo(cl_kernel_sub_group_info paramName,
                               size_t inputValueSize, const void *inputValue,
                               size_t paramValueSize, void *paramValue,
                               size_t *paramValueSizeRet) const {
    size_t numDimensions = 0;
    size_t wgs = 1;
    auto maxSimdSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
    auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize()));
    auto largestCompiledSIMDSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());

    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

    if ((paramName == CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT) ||
        (paramName == CL_KERNEL_MAX_NUM_SUB_GROUPS) ||
        (paramName == CL_KERNEL_COMPILE_NUM_SUB_GROUPS)) {
        if (clDevice.areOcl21FeaturesEnabled() == false) {
            return CL_INVALID_OPERATION;
        }
    }

    if ((paramName == CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR) ||
        (paramName == CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR)) {
        if (!inputValue) {
            return CL_INVALID_VALUE;
        }
        if (inputValueSize % sizeof(size_t) != 0) {
            return CL_INVALID_VALUE;
        }
        numDimensions = inputValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }

    if (paramName == CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT) {
        if (!paramValue) {
            return CL_INVALID_VALUE;
        }
        if (paramValueSize % sizeof(size_t) != 0) {
            return CL_INVALID_VALUE;
        }
        numDimensions = paramValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }

    switch (paramName) {
    case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(maxSimdSize));
    }
    case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR: {
        for (size_t i = 0; i < numDimensions; i++) {
            wgs *= ((size_t *)inputValue)[i];
        }
        return changeGetInfoStatusToCLResultType(
            info.set<size_t>((wgs / maxSimdSize) + std::min(static_cast<size_t>(1), wgs % maxSimdSize))); // add 1 if WGS % maxSimdSize != 0
    }
    case CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT: {
        auto subGroupsNum = *(size_t *)inputValue;
        auto workGroupSize = subGroupsNum * largestCompiledSIMDSize;
        // return workgroup size in first dimension, the rest shall be 1 in positive case
        if (workGroupSize > maxRequiredWorkGroupSize) {
            workGroupSize = 0;
        }
        // If no work group size can accommodate the requested number of subgroups, return 0 in each element of the returned array.
        switch (numDimensions) {
        case 1:
            return changeGetInfoStatusToCLResultType(info.set<size_t>(workGroupSize));
        case 2:
            struct SizeT2 {
                size_t val[2];
            } workGroupSize2;
            workGroupSize2.val[0] = workGroupSize;
            workGroupSize2.val[1] = (workGroupSize > 0) ? 1 : 0;
            return changeGetInfoStatusToCLResultType(info.set<SizeT2>(workGroupSize2));
        default:
            struct SizeT3 {
                size_t val[3];
            } workGroupSize3;
            workGroupSize3.val[0] = workGroupSize;
            workGroupSize3.val[1] = (workGroupSize > 0) ? 1 : 0;
            workGroupSize3.val[2] = (workGroupSize > 0) ? 1 : 0;
            return changeGetInfoStatusToCLResultType(info.set<SizeT3>(workGroupSize3));
        }
    }
    case CL_KERNEL_MAX_NUM_SUB_GROUPS: {
        // round-up maximum number of subgroups
        return changeGetInfoStatusToCLResultType(info.set<size_t>(Math::divideAndRoundUp(maxRequiredWorkGroupSize, largestCompiledSIMDSize)));
    }
    case CL_KERNEL_COMPILE_NUM_SUB_GROUPS: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(static_cast<size_t>(kernelInfo.kernelDescriptor.kernelMetadata.compiledSubGroupsNumber)));
    }
    case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: {
        return changeGetInfoStatusToCLResultType(info.set<size_t>(kernelInfo.kernelDescriptor.kernelMetadata.requiredSubGroupSize));
    }
    default:
        return CL_INVALID_VALUE;
    }
}

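// The kernel heap holds the kernel's ISA. substituteKernelHeap() below lets an
// instrumentation client (for example GT-Pin, which this file notifies elsewhere)
// replace that ISA: the existing allocation is reused when the new binary plus the
// required padding still fits, otherwise the old allocation is released and a fresh
// one is created for the new heap.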
const void *Kernel::getKernelHeap() const {
    return kernelInfo.heapInfo.pKernelHeap;
}

size_t Kernel::getKernelHeapSize() const {
    return kernelInfo.heapInfo.kernelHeapSize;
}

void Kernel::substituteKernelHeap(void *newKernelHeap, size_t newKernelHeapSize) {
    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&kernelInfo);
    void **pKernelHeap = const_cast<void **>(&pKernelInfo->heapInfo.pKernelHeap);
    *pKernelHeap = newKernelHeap;
    auto &heapInfo = pKernelInfo->heapInfo;
    heapInfo.kernelHeapSize = static_cast<uint32_t>(newKernelHeapSize);
    pKernelInfo->isKernelHeapSubstituted = true;
    auto memoryManager = executionEnvironment.memoryManager.get();

    auto currentAllocationSize = pKernelInfo->kernelAllocation->getUnderlyingBufferSize();
    bool status = false;
    auto &rootDeviceEnvironment = clDevice.getRootDeviceEnvironment();
    auto &helper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();
    size_t isaPadding = helper.getPaddingForISAAllocation();

    if (currentAllocationSize >= newKernelHeapSize + isaPadding) {
        auto &productHelper = rootDeviceEnvironment.getHelper<ProductHelper>();
        auto useBlitter = productHelper.isBlitCopyRequiredForLocalMemory(rootDeviceEnvironment, *pKernelInfo->getGraphicsAllocation());
        status = MemoryTransferHelper::transferMemoryToAllocation(useBlitter,
                                                                  clDevice.getDevice(), pKernelInfo->getGraphicsAllocation(), 0, newKernelHeap,
                                                                  static_cast<size_t>(newKernelHeapSize));
    } else {
        memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation);
        pKernelInfo->kernelAllocation = nullptr;
        status = pKernelInfo->createKernelAllocation(clDevice.getDevice(), isBuiltIn);
    }
    UNRECOVERABLE_IF(!status);
}

bool Kernel::isKernelHeapSubstituted() const {
    return kernelInfo.isKernelHeapSubstituted;
}

uint64_t Kernel::getKernelId() const {
    return kernelInfo.kernelId;
}

void Kernel::setKernelId(uint64_t newKernelId) {
    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&kernelInfo);
    pKernelInfo->kernelId = newKernelId;
}

uint32_t Kernel::getStartOffset() const {
    return this->startOffset;
}

Context &Kernel::getContext() const {
    return program->getContext();
}

void Kernel::setStartOffset(uint32_t offset) {
    this->startOffset = offset;
}

void *Kernel::getSurfaceStateHeap() const {
    return pSshLocal.get();
}

size_t Kernel::getDynamicStateHeapSize() const {
    return kernelInfo.heapInfo.dynamicStateHeapSize;
}

const void *Kernel::getDynamicStateHeap() const {
    return kernelInfo.heapInfo.pDsh;
}

size_t Kernel::getSurfaceStateHeapSize() const {
    return sshLocalSize;
}

size_t Kernel::getNumberOfBindingTableStates() const {
    return numberOfBindingTableStates;
}

void Kernel::resizeSurfaceStateHeap(void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
    pSshLocal.reset(static_cast<char *>(pNewSsh));
    sshLocalSize = static_cast<uint32_t>(newSshSize);
    numberOfBindingTableStates = newBindingTableCount;
    localBindingTableOffset = newBindingTableOffset;
}

void Kernel::markArgPatchedAndResolveArgs(uint32_t argIndex) {
    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
    if (program->getContextPtr() && getContext().getRootDeviceIndices().size() > 1u && Kernel::isMemObj(kernelArguments[argIndex].type) && kernelArguments[argIndex].object) {
        auto argMemObj = castToObjectOrAbort<MemObj>(reinterpret_cast<cl_mem>(kernelArguments[argIndex].object));
        auto memObj = argMemObj->getHighestRootMemObj();
        auto migrateRequiredForArg = memObj->getMultiGraphicsAllocation().requiresMigrations();

        if (migratableArgsMap.find(argIndex) == migratableArgsMap.end() && migrateRequiredForArg) {
            migratableArgsMap.insert({argIndex, memObj});
        } else if (migrateRequiredForArg) {
            migratableArgsMap[argIndex] = memObj;
        } else {
            migratableArgsMap.erase(argIndex);
        }
    }

    resolveArgs();
}

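// Generic clSetKernelArg() entry point: validates the index, then dispatches to the
// per-argument handler that initialize() installed (setArgBuffer, setArgImage,
// setArgSampler, setArgLocal, setArgImmediate, ...). Host-side view, illustrative only:
//   cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, size, nullptr, &err);
//   clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf); // lands in setArgBuffer via kernelArgHandlers[0]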
cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
    cl_int retVal = CL_SUCCESS;
    bool updateExposedKernel = true;
    auto argWasUncacheable = false;
    if (kernelInfo.builtinDispatchBuilder != nullptr) {
        updateExposedKernel = kernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
    }
    if (updateExposedKernel) {
        if (argIndex >= kernelArgHandlers.size()) {
            return CL_INVALID_ARG_INDEX;
        }
        argWasUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
        auto argHandler = kernelArgHandlers[argIndex];
        retVal = (this->*argHandler)(argIndex, argSize, argVal);
    }
    if (retVal == CL_SUCCESS) {
        auto argIsUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
        statelessUncacheableArgsCount += (argIsUncacheable ? 1 : 0) - (argWasUncacheable ? 1 : 0);
        markArgPatchedAndResolveArgs(argIndex);
    }
    return retVal;
}

cl_int Kernel::setArg(uint32_t argIndex, uint32_t argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, uint64_t argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, cl_mem argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, cl_mem argVal, uint32_t mipLevel) {
    auto retVal = setArgImageWithMipLevel(argIndex, sizeof(argVal), &argVal, mipLevel);
    if (retVal == CL_SUCCESS) {
        markArgPatchedAndResolveArgs(argIndex);
    }
    return retVal;
}

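// When the argument carries a bufferOffset slot, the surface base must be aligned,
// so the pointer is aligned down to 4 bytes and the remainder is patched into
// cross-thread data as a 32-bit offset. Worked example (hypothetical addresses):
// an SVM pointer of 0x1006 is aligned down to 0x1004 for the surface state, and 2
// is written into the argument's bufferOffset field.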
void *Kernel::patchBufferOffset(const ArgDescPointer &argAsPtr, void *svmPtr, GraphicsAllocation *svmAlloc) {
    if (isUndefinedOffset(argAsPtr.bufferOffset)) {
        return svmPtr;
    }
    void *ptrToPatch = svmPtr;
    if (svmAlloc != nullptr) {
        ptrToPatch = reinterpret_cast<void *>(svmAlloc->getGpuAddressToPatch());
    }

    constexpr uint32_t minimumAlignment = 4;
    ptrToPatch = alignDown(ptrToPatch, minimumAlignment);
    UNRECOVERABLE_IF(ptrDiff(svmPtr, ptrToPatch) != static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch)));
    uint32_t offsetToPatch = static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch));

    patch<uint32_t, uint32_t>(offsetToPatch, getCrossThreadData(), argAsPtr.bufferOffset);
    return ptrToPatch;
}

cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags) {
    const auto &argAsPtr = getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();

    auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
    patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, reinterpret_cast<uintptr_t>(svmPtr));

    void *ptrToPatch = patchBufferOffset(argAsPtr, svmPtr, svmAlloc);
    if (isValidOffset(argAsPtr.bindful)) {
        auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    storeKernelArg(argIndex, SVM_OBJ, nullptr, svmPtr, sizeof(void *), svmAlloc, svmFlags);
    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }
    if (svmPtr != nullptr && isBuiltIn == false) {
        this->anyKernelArgumentUsingSystemMemory |= true;
    }
    return CL_SUCCESS;
}

cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc, uint32_t allocId) {
    DBG_LOG_INPUTS("setArgBuffer svm_alloc", svmAlloc);

    const auto &argAsPtr = getKernelInfo().kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();

    auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
    patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, reinterpret_cast<uintptr_t>(svmPtr));

    auto &kernelArgInfo = kernelArguments[argIndex];

    bool disableL3 = false;
    bool forceNonAuxMode = false;
    const bool isAuxTranslationKernel = (AuxTranslationDirection::None != auxTranslationDirection);
    auto &rootDeviceEnvironment = getDevice().getRootDeviceEnvironment();
    auto &clGfxCoreHelper = rootDeviceEnvironment.getHelper<ClGfxCoreHelper>();

    if (isAuxTranslationKernel) {
        if (((AuxTranslationDirection::AuxToNonAux == auxTranslationDirection) && argIndex == 1) ||
            ((AuxTranslationDirection::NonAuxToAux == auxTranslationDirection) && argIndex == 0)) {
            forceNonAuxMode = true;
        }
        disableL3 = (argIndex == 0);
    } else if (svmAlloc && svmAlloc->isCompressionEnabled() && clGfxCoreHelper.requiresNonAuxMode(argAsPtr)) {
        forceNonAuxMode = true;
    }

    const bool argWasUncacheable = kernelArgInfo.isStatelessUncacheable;
    const bool argIsUncacheable = svmAlloc ? svmAlloc->isUncacheable() : false;
    statelessUncacheableArgsCount += (argIsUncacheable ? 1 : 0) - (argWasUncacheable ? 1 : 0);

    void *ptrToPatch = patchBufferOffset(argAsPtr, svmPtr, svmAlloc);
    if (isValidOffset(argAsPtr.bindful)) {
        auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
        size_t allocSize = 0;
        size_t offset = 0;
        if (svmAlloc != nullptr) {
            allocSize = svmAlloc->getUnderlyingBufferSize();
            offset = ptrDiff(ptrToPatch, svmAlloc->getGpuAddressToPatch());
            allocSize -= offset;
        }
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, forceNonAuxMode, disableL3, allocSize, ptrToPatch, offset, svmAlloc, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }

    storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t));
    kernelArgInfo.allocId = allocId;
    kernelArgInfo.allocIdMemoryManagerCounter = allocId ? this->getContext().getSVMAllocsManager()->allocationsCounter.load() : 0u;
    kernelArgInfo.isSetToNullptr = nullptr == svmPtr;
    if (!kernelArgInfo.isPatched) {
        patchedArgumentsNum++;
        kernelArgInfo.isPatched = true;
    }
    if (!kernelArgInfo.isSetToNullptr && isBuiltIn == false) {
        if (svmAlloc != nullptr) {
            this->anyKernelArgumentUsingSystemMemory |= Kernel::graphicsAllocationTypeUseSystemMemory(svmAlloc->getAllocationType());
        } else {
            this->anyKernelArgumentUsingSystemMemory |= true;
        }
    }
    return CL_SUCCESS;
}

void Kernel::storeKernelArg(uint32_t argIndex, kernelArgType argType, void *argObject,
                            const void *argValue, size_t argSize,
                            GraphicsAllocation *argSvmAlloc, cl_mem_flags argSvmFlags) {
    kernelArguments[argIndex].type = argType;
    kernelArguments[argIndex].object = argObject;
    kernelArguments[argIndex].value = argValue;
    kernelArguments[argIndex].size = argSize;
    kernelArguments[argIndex].svmAllocation = argSvmAlloc;
    kernelArguments[argIndex].svmFlags = argSvmFlags;
}

void Kernel::storeKernelArgAllocIdMemoryManagerCounter(uint32_t argIndex, uint32_t allocIdMemoryManagerCounter) {
    kernelArguments[argIndex].allocIdMemoryManagerCounter = allocIdMemoryManagerCounter;
}

const void *Kernel::getKernelArg(uint32_t argIndex) const {
    return kernelArguments[argIndex].object;
}

const Kernel::SimpleKernelArgInfo &Kernel::getKernelArgInfo(uint32_t argIndex) const {
    return kernelArguments[argIndex];
}

bool Kernel::getAllowNonUniform() const {
    return program->getAllowNonUniform();
}

void Kernel::setSvmKernelExecInfo(GraphicsAllocation *argValue) {
    kernelSvmGfxAllocations.push_back(argValue);
}

void Kernel::clearSvmKernelExecInfo() {
    kernelSvmGfxAllocations.clear();
}

void Kernel::setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue) {
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectDeviceAllocationsAllowed = infoValue;
        return;
    }
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectHostAllocationsAllowed = infoValue;
        return;
    }
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectSharedAllocationsAllowed = infoValue;
        return;
    }
}

void Kernel::setUnifiedMemoryExecInfo(GraphicsAllocation *unifiedMemoryAllocation) {
    kernelUnifiedMemoryGfxAllocations.push_back(unifiedMemoryAllocation);
}

void Kernel::clearUnifiedMemoryExecInfo() {
    kernelUnifiedMemoryGfxAllocations.clear();
}

cl_int Kernel::setKernelExecutionType(cl_execution_info_kernel_type_intel executionType) {
    switch (executionType) {
    case CL_KERNEL_EXEC_INFO_DEFAULT_TYPE_INTEL:
        this->executionType = KernelExecutionType::Default;
        break;
    case CL_KERNEL_EXEC_INFO_CONCURRENT_TYPE_INTEL:
        this->executionType = KernelExecutionType::Concurrent;
        break;
    default: {
        return CL_INVALID_VALUE;
    }
    }
    return CL_SUCCESS;
}

void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
                                       size_t *localWorkSize) {
    UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
    UNRECOVERABLE_IF(globalWorkSize == nullptr);
    Vec3<size_t> elws{0, 0, 0};
    Vec3<size_t> gws{
        globalWorkSize[0],
        (workDim > 1) ? globalWorkSize[1] : 1,
        (workDim > 2) ? globalWorkSize[2] : 1};
    Vec3<size_t> offset{0, 0, 0};
    if (globalWorkOffset) {
        offset.x = globalWorkOffset[0];
        if (workDim > 1) {
            offset.y = globalWorkOffset[1];
            if (workDim > 2) {
                offset.z = globalWorkOffset[2];
            }
        }
    }

    Vec3<size_t> suggestedLws{0, 0, 0};

    if (kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0] != 0) {
        suggestedLws.x = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
        suggestedLws.y = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
        suggestedLws.z = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
    } else {
        uint32_t dispatchWorkDim = std::max(1U, std::max(gws.getSimplifiedDim(), offset.getSimplifiedDim()));
        const DispatchInfo dispatchInfo{&clDevice, this, dispatchWorkDim, gws, elws, offset};
        suggestedLws = computeWorkgroupSize(dispatchInfo);
    }

    localWorkSize[0] = suggestedLws.x;
    if (workDim > 1)
        localWorkSize[1] = suggestedLws.y;
    if (workDim > 2)
        localWorkSize[2] = suggestedLws.z;
}

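// Upper bound on work-groups that can be resident on the device at the same time
// (used when sizing cooperative/concurrent dispatches): takes the minimum imposed
// by available hardware threads, shared local memory per dual-subslice and barrier
// registers, then lets the gfx-core helper adjust for the engine group the queue
// targets.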
uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const {
    auto &hardwareInfo = getHardwareInfo();
    auto &rootDeviceEnvironment = this->getDevice().getRootDeviceEnvironment();
    auto &helper = rootDeviceEnvironment.getHelper<GfxCoreHelper>();

    auto engineGroupType = helper.getEngineGroupType(commandQueue->getGpgpuEngine().getEngineType(),
                                                     commandQueue->getGpgpuEngine().getEngineUsage(), hardwareInfo);

    const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
    if (dssCount == 0) {
        dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
    }
    auto availableThreadCount = helper.calculateAvailableThreadCount(hardwareInfo, kernelDescriptor.kernelAttributes.numGrfRequired);
    auto availableSlmSize = static_cast<uint32_t>(dssCount * KB * hardwareInfo.capabilityTable.slmSize);
    auto usedSlmSize = helper.alignSlmSize(slmTotalSize);
    auto maxBarrierCount = static_cast<uint32_t>(helper.getMaxBarrierRegisterPerSlice());
    auto barrierCount = kernelDescriptor.kernelAttributes.barrierCount;

    auto maxWorkGroupCount = KernelHelper::getMaxWorkGroupCount(kernelInfo.getMaxSimdSize(),
                                                                availableThreadCount,
                                                                dssCount,
                                                                availableSlmSize,
                                                                usedSlmSize,
                                                                maxBarrierCount,
                                                                barrierCount,
                                                                workDim,
                                                                localWorkSize);
    auto isEngineInstanced = commandQueue->getGpgpuCommandStreamReceiver().getOsContext().isEngineInstanced();
    maxWorkGroupCount = helper.adjustMaxWorkGroupCount(maxWorkGroupCount, engineGroupType, rootDeviceEnvironment, isEngineInstanced);
    return maxWorkGroupCount;
}

inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceiver) {
    auto numArgs = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size();
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
                auto pageFaultManager = executionEnvironment.memoryManager->getPageFaultManager();
                if (pageFaultManager &&
                    this->isUnifiedMemorySyncRequired) {
                    pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(pSVMAlloc->getGpuAddress()));
                }
                commandStreamReceiver.makeResident(*pSVMAlloc);
            } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObjectOrAbort<MemObj>(clMem);
                auto image = castToObject<Image>(clMem);
                if (image && image->isImageFromImage()) {
                    commandStreamReceiver.setSamplerCacheFlushRequired(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushBefore);
                }
                commandStreamReceiver.makeResident(*memObj->getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex()));
                if (memObj->getMcsAllocation()) {
                    commandStreamReceiver.makeResident(*memObj->getMcsAllocation());
                }
            }
        }
    }
}

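// Optional auto-tuning controlled by the EnableKernelTunning debug flag: in FULL
// mode the first enqueue of a given {gws, lws, offset} configuration is timed on
// all sub-devices, the next one on a single sub-device, and once both timestamp
// sets are complete the faster variant is remembered for subsequent enqueues of
// that configuration.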
void Kernel::performKernelTuning(CommandStreamReceiver &commandStreamReceiver, const Vec3<size_t> &lws, const Vec3<size_t> &gws, const Vec3<size_t> &offsets, TimestampPacketContainer *timestampContainer) {
    auto performTunning = TunningType::DISABLED;

    if (DebugManager.flags.EnableKernelTunning.get() != -1) {
        performTunning = static_cast<TunningType>(DebugManager.flags.EnableKernelTunning.get());
    }

    if (performTunning == TunningType::SIMPLE) {
        this->singleSubdevicePreferredInCurrentEnqueue = !this->kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics;

    } else if (performTunning == TunningType::FULL) {
        KernelConfig config{gws, lws, offsets};

        auto submissionDataIt = this->kernelSubmissionMap.find(config);
        if (submissionDataIt == this->kernelSubmissionMap.end()) {
            KernelSubmissionData submissionData;
            submissionData.kernelStandardTimestamps = std::make_unique<TimestampPacketContainer>();
            submissionData.kernelSubdeviceTimestamps = std::make_unique<TimestampPacketContainer>();
            submissionData.status = TunningStatus::STANDARD_TUNNING_IN_PROGRESS;
            submissionData.kernelStandardTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer);
            this->kernelSubmissionMap[config] = std::move(submissionData);
            this->singleSubdevicePreferredInCurrentEnqueue = false;
            return;
        }

        auto &submissionData = submissionDataIt->second;

        if (submissionData.status == TunningStatus::TUNNING_DONE) {
            this->singleSubdevicePreferredInCurrentEnqueue = submissionData.singleSubdevicePreferred;
        }

        if (submissionData.status == TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS) {
            if (this->hasTunningFinished(submissionData)) {
                submissionData.status = TunningStatus::TUNNING_DONE;
                submissionData.kernelStandardTimestamps.reset();
                submissionData.kernelSubdeviceTimestamps.reset();
                this->singleSubdevicePreferredInCurrentEnqueue = submissionData.singleSubdevicePreferred;
            } else {
                this->singleSubdevicePreferredInCurrentEnqueue = false;
            }
        }

        if (submissionData.status == TunningStatus::STANDARD_TUNNING_IN_PROGRESS) {
            submissionData.status = TunningStatus::SUBDEVICE_TUNNING_IN_PROGRESS;
            submissionData.kernelSubdeviceTimestamps->assignAndIncrementNodesRefCounts(*timestampContainer);
            this->singleSubdevicePreferredInCurrentEnqueue = true;
        }
    }
}

bool Kernel::hasTunningFinished(KernelSubmissionData &submissionData) {
    if (!this->hasRunFinished(submissionData.kernelStandardTimestamps.get()) ||
        !this->hasRunFinished(submissionData.kernelSubdeviceTimestamps.get())) {
        return false;
    }

    uint64_t globalStartTS = 0u;
    uint64_t globalEndTS = 0u;

    Event::getBoundaryTimestampValues(submissionData.kernelStandardTimestamps.get(), globalStartTS, globalEndTS);
    auto standardTSDiff = globalEndTS - globalStartTS;

    Event::getBoundaryTimestampValues(submissionData.kernelSubdeviceTimestamps.get(), globalStartTS, globalEndTS);
    auto subdeviceTSDiff = globalEndTS - globalStartTS;

    submissionData.singleSubdevicePreferred = standardTSDiff > subdeviceTSDiff;

    return true;
}

bool Kernel::hasRunFinished(TimestampPacketContainer *timestampContainer) {
    for (const auto &node : timestampContainer->peekNodes()) {
        for (uint32_t i = 0; i < node->getPacketsUsed(); i++) {
            if (node->getContextEndValue(i) == 1) {
                return false;
            }
        }
    }
    return true;
}

bool Kernel::isSingleSubdevicePreferred() const {
    return this->singleSubdevicePreferredInCurrentEnqueue || this->usesSyncBuffer();
}

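// Translates the inline samplers described by the kernel descriptor into regular
// OpenCL sampler objects and writes each sampler state into the kernel's dynamic
// state heap at the bindful offset the compiler reserved for it.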
void Kernel::setInlineSamplers() {
    for (auto &inlineSampler : getDescriptor().inlineSamplers) {
        using AddrMode = NEO::KernelDescriptor::InlineSampler::AddrMode;
        constexpr LookupArray<AddrMode, cl_addressing_mode, 5> addressingModes({{{AddrMode::None, CL_ADDRESS_NONE},
                                                                                 {AddrMode::Repeat, CL_ADDRESS_REPEAT},
                                                                                 {AddrMode::ClampEdge, CL_ADDRESS_CLAMP_TO_EDGE},
                                                                                 {AddrMode::ClampBorder, CL_ADDRESS_CLAMP},
                                                                                 {AddrMode::Mirror, CL_ADDRESS_MIRRORED_REPEAT}}});

        using FilterMode = NEO::KernelDescriptor::InlineSampler::FilterMode;
        constexpr LookupArray<FilterMode, cl_filter_mode, 2> filterModes({{{FilterMode::Linear, CL_FILTER_LINEAR},
                                                                           {FilterMode::Nearest, CL_FILTER_NEAREST}}});

        cl_int errCode = CL_SUCCESS;
        auto sampler = std::unique_ptr<Sampler>(Sampler::create(&getContext(),
                                                                static_cast<cl_bool>(inlineSampler.isNormalized),
                                                                addressingModes.lookUp(inlineSampler.addrMode),
                                                                filterModes.lookUp(inlineSampler.filterMode),
                                                                errCode));
        UNRECOVERABLE_IF(errCode != CL_SUCCESS);

        auto samplerState = ptrOffset(getDynamicStateHeap(), static_cast<size_t>(inlineSampler.getSamplerBindfulOffset()));
        sampler->setArg(const_cast<void *>(samplerState), clDevice.getRootDeviceEnvironment());
    }
}

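// Called at enqueue time: pins every allocation the submission may touch (private,
// constant, global and exported-functions surfaces, SVM and USM allocations,
// explicit arguments, the kernel ISA) to the command stream receiver, migrating
// page-fault-managed allocations back to the GPU domain where required.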
void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
|
|
auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex();
|
|
if (privateSurface) {
|
|
commandStreamReceiver.makeResident(*privateSurface);
|
|
}
|
|
|
|
if (program->getConstantSurface(rootDeviceIndex)) {
|
|
commandStreamReceiver.makeResident(*(program->getConstantSurface(rootDeviceIndex)));
|
|
}
|
|
|
|
if (program->getGlobalSurface(rootDeviceIndex)) {
|
|
commandStreamReceiver.makeResident(*(program->getGlobalSurface(rootDeviceIndex)));
|
|
}
|
|
|
|
if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
|
|
commandStreamReceiver.makeResident(*(program->getExportedFunctionsSurface(rootDeviceIndex)));
|
|
}
|
|
|
|
for (auto gfxAlloc : kernelSvmGfxAllocations) {
|
|
commandStreamReceiver.makeResident(*gfxAlloc);
|
|
}
|
|
|
|
auto pageFaultManager = program->peekExecutionEnvironment().memoryManager->getPageFaultManager();
|
|
|
|
for (auto gfxAlloc : kernelUnifiedMemoryGfxAllocations) {
|
|
commandStreamReceiver.makeResident(*gfxAlloc);
|
|
if (pageFaultManager) {
|
|
pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(gfxAlloc->getGpuAddress()));
|
|
}
|
|
}
|
|
|
|
if (getHasIndirectAccess() && unifiedMemoryControls.indirectSharedAllocationsAllowed && pageFaultManager) {
|
|
pageFaultManager->moveAllocationsWithinUMAllocsManagerToGpuDomain(this->getContext().getSVMAllocsManager());
|
|
}
|
|
makeArgsResident(commandStreamReceiver);
|
|
|
|
auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
|
|
if (kernelIsaAllocation) {
|
|
commandStreamReceiver.makeResident(*kernelIsaAllocation);
|
|
}
|
|
|
|
gtpinNotifyMakeResident(this, &commandStreamReceiver);
|
|
|
|
if (getHasIndirectAccess() && (unifiedMemoryControls.indirectDeviceAllocationsAllowed ||
|
|
unifiedMemoryControls.indirectHostAllocationsAllowed ||
|
|
unifiedMemoryControls.indirectSharedAllocationsAllowed)) {
|
|
this->getContext().getSVMAllocsManager()->makeInternalAllocationsResident(commandStreamReceiver, unifiedMemoryControls.generateMask());
|
|
}
|
|
}
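
// Note (descriptive summary): before submission the kernel makes resident everything the
// GPU may touch: the private surface, the program's constant/global/exported-functions
// surfaces, SVM and USM allocations attached to this kernel, the explicit arguments
// (via makeArgsResident), the kernel ISA, and, for kernels with indirect access, the
// SVM manager's internal allocations selected by unifiedMemoryControls.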

void Kernel::getResidency(std::vector<Surface *> &dst) {
    if (privateSurface) {
        GeneralSurface *surface = new GeneralSurface(privateSurface);
        dst.push_back(surface);
    }

    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    if (program->getConstantSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getConstantSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    if (program->getGlobalSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getGlobalSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getExportedFunctionsSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    for (auto gfxAlloc : kernelSvmGfxAllocations) {
        GeneralSurface *surface = new GeneralSurface(gfxAlloc);
        dst.push_back(surface);
    }

    auto numArgs = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs.size();
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                bool needsMigration = false;
                auto pageFaultManager = executionEnvironment.memoryManager->getPageFaultManager();
                if (pageFaultManager &&
                    this->isUnifiedMemorySyncRequired) {
                    needsMigration = true;
                }
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
                dst.push_back(new GeneralSurface(pSVMAlloc, needsMigration));
            } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObject<MemObj>(clMem);
                DEBUG_BREAK_IF(memObj == nullptr);
                dst.push_back(new MemObjSurface(memObj));
            }
        }
    }

    auto kernelIsaAllocation = this->kernelInfo.kernelAllocation;
    if (kernelIsaAllocation) {
        GeneralSurface *surface = new GeneralSurface(kernelIsaAllocation);
        dst.push_back(surface);
    }

    gtpinNotifyUpdateResidencyList(this, &dst);
}

cl_int Kernel::setArgLocal(uint32_t argIndexIn,
                           size_t argSize,
                           const void *argVal) {
    storeKernelArg(argIndexIn, SLM_OBJ, nullptr, argVal, argSize);
    uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
    uint32_t argIndex = argIndexIn;

    const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;
    const auto &currArg = args[argIndex];
    UNRECOVERABLE_IF(currArg.getTraits().getAddressQualifier() != KernelArgMetadata::AddrLocal);

    slmSizes[argIndex] = static_cast<uint32_t>(argSize);

    UNRECOVERABLE_IF(isUndefinedOffset(currArg.as<NEO::ArgDescPointer>().slmOffset));
    auto slmOffset = *ptrOffset(crossThreadData, currArg.as<ArgDescPointer>().slmOffset);
    slmOffset += static_cast<uint32_t>(argSize);

    ++argIndex;
    while (argIndex < slmSizes.size()) {
        if (args[argIndex].getTraits().getAddressQualifier() != KernelArgMetadata::AddrLocal) {
            ++argIndex;
            continue;
        }

        const auto &nextArg = args[argIndex].as<ArgDescPointer>();
        UNRECOVERABLE_IF(0 == nextArg.requiredSlmAlignment);

        slmOffset = alignUp<uint32_t>(slmOffset, nextArg.requiredSlmAlignment);

        auto patchLocation = ptrOffset(crossThreadData, nextArg.slmOffset);
        *patchLocation = slmOffset;

        slmOffset += static_cast<uint32_t>(slmSizes[argIndex]);
        ++argIndex;
    }

    slmTotalSize = kernelInfo.kernelDescriptor.kernelAttributes.slmInlineSize + alignUp(slmOffset, KB);

    return CL_SUCCESS;
}
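
// Illustrative example (not part of the driver, assumes every later __local argument has
// already been set so its slmSizes entry is valid): with three __local arguments of 60,
// 100 and 32 bytes and a required SLM alignment of 16, setting argument 0 leaves its own
// offset unchanged, places argument 1 at offset 64 (60 rounded up to 16) and argument 2 at
// offset 176 (164 rounded up to 16); slmTotalSize then becomes the kernel's inline SLM plus
// the running offset rounded up to the next 1 KB.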

cl_int Kernel::setArgBuffer(uint32_t argIndex,
                            size_t argSize,
                            const void *argVal) {

    if (argSize != sizeof(cl_mem *)) {
        return CL_INVALID_ARG_SIZE;
    }

    auto clMem = reinterpret_cast<const cl_mem *>(argVal);
    auto pClDevice = &getDevice();
    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();

    const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
    const auto &argAsPtr = arg.as<ArgDescPointer>();

    if (clMem && *clMem) {
        auto clMemObj = *clMem;
        DBG_LOG_INPUTS("setArgBuffer cl_mem", clMemObj);

        storeKernelArg(argIndex, BUFFER_OBJ, clMemObj, argVal, argSize);

        auto buffer = castToObject<Buffer>(clMemObj);
        if (!buffer) {
            return CL_INVALID_MEM_OBJECT;
        }

        auto gfxAllocationType = buffer->getGraphicsAllocation(rootDeviceIndex)->getAllocationType();
        if (!isBuiltIn) {
            this->anyKernelArgumentUsingSystemMemory |= Kernel::graphicsAllocationTypeUseSystemMemory(gfxAllocationType);
        }

        if (buffer->peekSharingHandler()) {
            usingSharedObjArgs = true;
        }
        patchBufferOffset(argAsPtr, nullptr, nullptr);

        if (isValidOffset(argAsPtr.stateless)) {
            auto patchLocation = ptrOffset(crossThreadData, argAsPtr.stateless);
            uint64_t addressToPatch = buffer->setArgStateless(patchLocation, argAsPtr.pointerSize, rootDeviceIndex, !this->isBuiltIn);

            if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
                PatchInfoData patchInfoData(addressToPatch - buffer->getOffset(), static_cast<uint64_t>(buffer->getOffset()),
                                            PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData),
                                            static_cast<uint64_t>(argAsPtr.stateless),
                                            PatchInfoAllocationType::IndirectObjectHeap, argAsPtr.pointerSize);
                this->patchInfoDataList.push_back(patchInfoData);
            }
        }

        bool disableL3 = false;
        bool forceNonAuxMode = false;
        bool isAuxTranslationKernel = (AuxTranslationDirection::None != auxTranslationDirection);
        auto graphicsAllocation = buffer->getGraphicsAllocation(rootDeviceIndex);
        auto &rootDeviceEnvironment = getDevice().getRootDeviceEnvironment();
        auto &clGfxCoreHelper = rootDeviceEnvironment.getHelper<ClGfxCoreHelper>();

        if (isAuxTranslationKernel) {
            if (((AuxTranslationDirection::AuxToNonAux == auxTranslationDirection) && argIndex == 1) ||
                ((AuxTranslationDirection::NonAuxToAux == auxTranslationDirection) && argIndex == 0)) {
                forceNonAuxMode = true;
            }
            disableL3 = (argIndex == 0);
        } else if (graphicsAllocation->isCompressionEnabled() && clGfxCoreHelper.requiresNonAuxMode(argAsPtr)) {
            forceNonAuxMode = true;
        }

        if (isValidOffset(argAsPtr.bindful)) {
            buffer->setArgStateful(ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful), forceNonAuxMode,
                                   disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
                                   kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
        } else if (isValidOffset(argAsPtr.bindless)) {
            auto &gfxCoreHelper = this->getGfxCoreHelper();
            auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
            auto surfaceState = ptrOffset(getSurfaceStateHeap(), surfaceStateSize * argIndex);

            buffer->setArgStateful(surfaceState, forceNonAuxMode,
                                   disableL3, isAuxTranslationKernel, arg.isReadOnly(), pClDevice->getDevice(),
                                   kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
        }

        kernelArguments[argIndex].isStatelessUncacheable = argAsPtr.isPureStateful() ? false : buffer->isMemObjUncacheable();

        return CL_SUCCESS;
    } else {
        storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);
        if (isValidOffset(argAsPtr.stateless)) {
            auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
            patchWithRequiredSize(patchLocation, argAsPtr.pointerSize, 0u);
        }

        if (isValidOffset(argAsPtr.bindful)) {
            auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
            Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, false, false, 0, nullptr, 0, nullptr, 0, 0,
                                    kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
        }

        return CL_SUCCESS;
    }
}
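
// Minimal usage sketch from the API side (illustrative only; assumes a valid context,
// device and kernel are already set up):
//
//   cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, size, nullptr, &err);
//   err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &buf);
//
// A buffer argument set through clSetKernelArg is routed to setArgBuffer() above, which
// patches the GPU address into cross-thread data (stateless) and/or builds a surface state
// in the surface state heap (bindful or bindless), depending on which offsets the kernel
// descriptor declares for that argument; a null cl_mem takes the else branch and binds a
// zero-sized null surface instead.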

cl_int Kernel::setArgPipe(uint32_t argIndex,
                          size_t argSize,
                          const void *argVal) {

    if (argSize != sizeof(cl_mem *)) {
        return CL_INVALID_ARG_SIZE;
    }

    auto clMem = reinterpret_cast<const cl_mem *>(argVal);

    if (clMem && *clMem) {
        auto clMemObj = *clMem;
        DBG_LOG_INPUTS("setArgPipe cl_mem", clMemObj);

        storeKernelArg(argIndex, PIPE_OBJ, clMemObj, argVal, argSize);

        auto memObj = castToObject<MemObj>(clMemObj);
        if (!memObj) {
            return CL_INVALID_MEM_OBJECT;
        }

        auto pipe = castToObject<Pipe>(clMemObj);
        if (!pipe) {
            return CL_INVALID_ARG_VALUE;
        }

        if (memObj->getContext() != &(this->getContext())) {
            return CL_INVALID_MEM_OBJECT;
        }

        auto rootDeviceIndex = getDevice().getRootDeviceIndex();
        const auto &argAsPtr = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescPointer>();

        auto patchLocation = ptrOffset(getCrossThreadData(), argAsPtr.stateless);
        pipe->setPipeArg(patchLocation, argAsPtr.pointerSize, rootDeviceIndex);

        if (isValidOffset(argAsPtr.bindful)) {
            auto graphicsAllocation = pipe->getGraphicsAllocation(rootDeviceIndex);
            auto surfaceState = ptrOffset(getSurfaceStateHeap(), argAsPtr.bindful);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, false, false,
                                    pipe->getSize(), pipe->getCpuAddress(), 0,
                                    graphicsAllocation, 0, 0,
                                    kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
        }

        return CL_SUCCESS;
    } else {
        return CL_INVALID_MEM_OBJECT;
    }
}

cl_int Kernel::setArgImage(uint32_t argIndex,
                           size_t argSize,
                           const void *argVal) {
    return setArgImageWithMipLevel(argIndex, argSize, argVal, 0u);
}

cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex,
                                       size_t argSize,
                                       const void *argVal, uint32_t mipLevel) {
    auto retVal = CL_INVALID_ARG_VALUE;
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();

    const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
    const auto &argAsImg = arg.as<ArgDescImage>();

    uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
    auto clMemObj = *(static_cast<const cl_mem *>(argVal));
    auto pImage = castToObject<Image>(clMemObj);

    if (pImage && argSize == sizeof(cl_mem *)) {
        if (pImage->peekSharingHandler()) {
            usingSharedObjArgs = true;
        }

        DBG_LOG_INPUTS("setArgImage cl_mem", clMemObj);

        storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize);

        void *surfaceState = nullptr;
        if (isValidOffset(argAsImg.bindless)) {
            auto &gfxCoreHelper = this->getGfxCoreHelper();
            auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
            surfaceState = ptrOffset(getSurfaceStateHeap(), surfaceStateSize * argIndex);
        } else {
            DEBUG_BREAK_IF(isUndefinedOffset(argAsImg.bindful));
            surfaceState = ptrOffset(getSurfaceStateHeap(), argAsImg.bindful);
        }

        // Sets SS structure
        if (arg.getExtendedTypeInfo().isMediaImage) {
            DEBUG_BREAK_IF(!kernelInfo.kernelDescriptor.kernelAttributes.flags.usesVme);
            pImage->setMediaImageArg(surfaceState, rootDeviceIndex);
        } else {
            pImage->setImageArg(surfaceState, arg.getExtendedTypeInfo().isMediaBlockImage, mipLevel, rootDeviceIndex,
                                getKernelInfo().kernelDescriptor.kernelAttributes.flags.useGlobalAtomics);
        }

        auto &imageDesc = pImage->getImageDesc();
        auto &imageFormat = pImage->getImageFormat();
        auto graphicsAllocation = pImage->getGraphicsAllocation(rootDeviceIndex);

        if (imageDesc.image_type == CL_MEM_OBJECT_IMAGE3D) {
            imageTransformer->registerImage3d(argIndex);
        }

        patch<uint32_t, cl_uint>(imageDesc.num_samples, crossThreadData, argAsImg.metadataPayload.numSamples);
        patch<uint32_t, cl_uint>(imageDesc.num_mip_levels, crossThreadData, argAsImg.metadataPayload.numMipLevels);
        patch<uint32_t, uint64_t>(imageDesc.image_width, crossThreadData, argAsImg.metadataPayload.imgWidth);
        patch<uint32_t, uint64_t>(imageDesc.image_height, crossThreadData, argAsImg.metadataPayload.imgHeight);
        patch<uint32_t, uint64_t>(imageDesc.image_depth, crossThreadData, argAsImg.metadataPayload.imgDepth);
        patch<uint32_t, uint64_t>(imageDesc.image_array_size, crossThreadData, argAsImg.metadataPayload.arraySize);
        patch<uint32_t, cl_channel_type>(imageFormat.image_channel_data_type, crossThreadData, argAsImg.metadataPayload.channelDataType);
        patch<uint32_t, cl_channel_order>(imageFormat.image_channel_order, crossThreadData, argAsImg.metadataPayload.channelOrder);

        auto pixelSize = pImage->getSurfaceFormatInfo().surfaceFormat.imageElementSizeInBytes;
        patch<uint64_t, uint64_t>(graphicsAllocation->getGpuAddress(), crossThreadData, argAsImg.metadataPayload.flatBaseOffset);
        patch<uint32_t, uint64_t>((imageDesc.image_width * pixelSize) - 1, crossThreadData, argAsImg.metadataPayload.flatWidth);
        patch<uint32_t, uint64_t>((imageDesc.image_height * pixelSize) - 1, crossThreadData, argAsImg.metadataPayload.flatHeight);
        patch<uint32_t, uint64_t>(imageDesc.image_row_pitch - 1, crossThreadData, argAsImg.metadataPayload.flatPitch);

        retVal = CL_SUCCESS;
    }

    return retVal;
}
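
// Illustrative numbers (not from the driver): for a 1024 x 512 CL_RGBA / CL_UNORM_INT8
// image (4 bytes per pixel) the "flat" metadata patched above would be
// flatWidth = 1024 * 4 - 1 = 4095, flatHeight = 512 * 4 - 1 = 2047 and
// flatPitch = image_row_pitch - 1, while flatBaseOffset receives the allocation's GPU address.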

cl_int Kernel::setArgImmediate(uint32_t argIndex,
                               size_t argSize,
                               const void *argVal) {

    auto retVal = CL_INVALID_ARG_VALUE;

    if (argVal) {
        storeKernelArg(argIndex, NONE_OBJ, nullptr, nullptr, argSize);

        [[maybe_unused]] auto crossThreadDataEnd = ptrOffset(crossThreadData, crossThreadDataSize);
        const auto &argAsVal = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex].as<ArgDescValue>();
        for (const auto &element : argAsVal.elements) {
            DEBUG_BREAK_IF(element.size <= 0);

            auto pDst = ptrOffset(crossThreadData, element.offset);
            auto pSrc = ptrOffset(argVal, element.sourceOffset);

            DEBUG_BREAK_IF(!(ptrOffset(pDst, element.size) <= crossThreadDataEnd));

            if (element.sourceOffset < argSize) {
                size_t maxBytesToCopy = argSize - element.sourceOffset;
                size_t bytesToCopy = std::min(static_cast<size_t>(element.size), maxBytesToCopy);
                memcpy_s(pDst, element.size, pSrc, bytesToCopy);
            }
        }

        retVal = CL_SUCCESS;
    }

    return retVal;
}
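
// Illustrative example (hypothetical layout): a by-value struct argument such as
// { float4 a; int b; } may be described by two ArgDescValue elements, e.g.
// {offset: 0x40, size: 16, sourceOffset: 0} and {offset: 0x50, size: 4, sourceOffset: 16};
// each element copies its slice of the user-supplied bytes into cross-thread data, with the
// copy length clamped so reads never run past argSize.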

cl_int Kernel::setArgSampler(uint32_t argIndex,
                             size_t argSize,
                             const void *argVal) {
    auto retVal = CL_INVALID_SAMPLER;

    if (!argVal) {
        return retVal;
    }

    uint32_t *crossThreadData = reinterpret_cast<uint32_t *>(this->crossThreadData);
    auto clSamplerObj = *(static_cast<const cl_sampler *>(argVal));
    auto pSampler = castToObject<Sampler>(clSamplerObj);

    if (pSampler) {
        pSampler->incRefInternal();
    }

    if (kernelArguments.at(argIndex).object) {
        auto oldSampler = castToObject<Sampler>(kernelArguments.at(argIndex).object);
        UNRECOVERABLE_IF(!oldSampler);
        oldSampler->decRefInternal();
    }

    if (pSampler && argSize == sizeof(cl_sampler *)) {
        const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
        const auto &argAsSmp = arg.as<ArgDescSampler>();

        storeKernelArg(argIndex, SAMPLER_OBJ, clSamplerObj, argVal, argSize);

        auto dsh = getDynamicStateHeap();
        auto samplerState = ptrOffset(dsh, argAsSmp.bindful);

        pSampler->setArg(const_cast<void *>(samplerState), clDevice.getRootDeviceEnvironment());

        patch<uint32_t, uint32_t>(pSampler->getSnapWaValue(), crossThreadData, argAsSmp.metadataPayload.samplerSnapWa);
        patch<uint32_t, uint32_t>(getAddrModeEnum(pSampler->addressingMode), crossThreadData, argAsSmp.metadataPayload.samplerAddressingMode);
        patch<uint32_t, uint32_t>(getNormCoordsEnum(pSampler->normalizedCoordinates), crossThreadData, argAsSmp.metadataPayload.samplerNormalizedCoords);

        retVal = CL_SUCCESS;
    }

    return retVal;
}

cl_int Kernel::setArgAccelerator(uint32_t argIndex,
                                 size_t argSize,
                                 const void *argVal) {
    auto retVal = CL_INVALID_ARG_VALUE;

    if (argSize != sizeof(cl_accelerator_intel)) {
        return CL_INVALID_ARG_SIZE;
    }

    if (!argVal) {
        return retVal;
    }

    auto clAcceleratorObj = *(static_cast<const cl_accelerator_intel *>(argVal));
    DBG_LOG_INPUTS("setArgAccelerator cl_mem", clAcceleratorObj);

    const auto pAccelerator = castToObject<IntelAccelerator>(clAcceleratorObj);

    if (pAccelerator) {
        storeKernelArg(argIndex, ACCELERATOR_OBJ, clAcceleratorObj, argVal, argSize);

        const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
        const auto &argAsSmp = arg.as<ArgDescSampler>();

        if (argAsSmp.samplerType == iOpenCL::SAMPLER_OBJECT_VME) {

            const auto pVmeAccelerator = castToObjectOrAbort<VmeAccelerator>(pAccelerator);
            auto pDesc = static_cast<const cl_motion_estimation_desc_intel *>(pVmeAccelerator->getDescriptor());
            DEBUG_BREAK_IF(!pDesc);

            if (arg.getExtendedTypeInfo().hasVmeExtendedDescriptor) {
                const auto &explicitArgsExtendedDescriptors = kernelInfo.kernelDescriptor.payloadMappings.explicitArgsExtendedDescriptors;
                UNRECOVERABLE_IF(argIndex >= explicitArgsExtendedDescriptors.size());
                auto vmeDescriptor = static_cast<ArgDescVme *>(explicitArgsExtendedDescriptors[argIndex].get());

                auto pVmeMbBlockTypeDst = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->mbBlockType));
                *pVmeMbBlockTypeDst = pDesc->mb_block_type;

                auto pVmeSubpixelMode = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->subpixelMode));
                *pVmeSubpixelMode = pDesc->subpixel_mode;

                auto pVmeSadAdjustMode = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->sadAdjustMode));
                *pVmeSadAdjustMode = pDesc->sad_adjust_mode;

                auto pVmeSearchPathType = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, vmeDescriptor->searchPathType));
                *pVmeSearchPathType = pDesc->search_path_type;
            }

            retVal = CL_SUCCESS;
        } else if (argAsSmp.samplerType == iOpenCL::SAMPLER_OBJECT_VE) {
            retVal = CL_SUCCESS;
        }
    }

    return retVal;
}

void Kernel::setKernelArgHandler(uint32_t argIndex, KernelArgHandler handler) {
    if (kernelArgHandlers.size() <= argIndex) {
        kernelArgHandlers.resize(argIndex + 1);
    }

    kernelArgHandlers[argIndex] = handler;
}

void Kernel::unsetArg(uint32_t argIndex) {
    if (kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum--;
        kernelArguments[argIndex].isPatched = false;
        if (kernelArguments[argIndex].isStatelessUncacheable) {
            statelessUncacheableArgsCount--;
            kernelArguments[argIndex].isStatelessUncacheable = false;
        }
    }
}

bool Kernel::hasPrintfOutput() const {
    return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesPrintf;
}

void Kernel::resetSharedObjectsPatchAddresses() {
    for (size_t i = 0; i < getKernelArgsNumber(); i++) {
        auto clMem = (cl_mem)kernelArguments[i].object;
        auto memObj = castToObject<MemObj>(clMem);
        if (memObj && memObj->peekSharingHandler()) {
            setArg((uint32_t)i, sizeof(cl_mem), &clMem);
        }
    }
}

void Kernel::provideInitializationHints() {

    Context *context = program->getContextPtr();
    if (context == nullptr || !context->isProvidingPerformanceHints())
        return;

    auto pClDevice = &getDevice();
    if (privateSurfaceSize) {
        context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, PRIVATE_MEMORY_USAGE_TOO_HIGH,
                                        kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
                                        privateSurfaceSize);
    }
    auto scratchSize = kernelInfo.kernelDescriptor.kernelAttributes.perThreadScratchSize[0] *
                       pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch * kernelInfo.getMaxSimdSize();
    if (scratchSize > 0) {
        context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, REGISTER_PRESSURE_TOO_HIGH,
                                        kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), scratchSize);
    }
}

bool Kernel::usesSyncBuffer() const {
    return kernelInfo.kernelDescriptor.kernelAttributes.flags.usesSyncBuffer;
}

void Kernel::patchSyncBuffer(GraphicsAllocation *gfxAllocation, size_t bufferOffset) {
    const auto &syncBuffer = kernelInfo.kernelDescriptor.payloadMappings.implicitArgs.syncBufferAddress;
    auto bufferPatchAddress = ptrOffset(crossThreadData, syncBuffer.stateless);
    patchWithRequiredSize(bufferPatchAddress, syncBuffer.pointerSize,
                          ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset));

    if (isValidOffset(syncBuffer.bindful)) {
        auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap()), syncBuffer.bindful);
        auto addressToPatch = gfxAllocation->getUnderlyingBuffer();
        auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize();
        Buffer::setSurfaceState(&clDevice.getDevice(), surfaceState, false, false, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0,
                                kernelInfo.kernelDescriptor.kernelAttributes.flags.useGlobalAtomics, areMultipleSubDevicesInContext());
    }
}

bool Kernel::isPatched() const {
    return patchedArgumentsNum == kernelInfo.kernelDescriptor.kernelAttributes.numArgsToPatch;
}

cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex,
                                                size_t argSize,
                                                const void *argValue) const {
    const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[argIndex];
    if (arg.is<ArgDescriptor::ArgTImage>()) {
        cl_mem mem = *(static_cast<const cl_mem *>(argValue));
        MemObj *pMemObj = nullptr;
        withCastToInternal(mem, &pMemObj);
        if (pMemObj) {
            auto accessQualifier = arg.getTraits().accessQualifier;
            cl_mem_flags flags = pMemObj->getFlags();
            if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) ||
                (accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) {
                return CL_INVALID_ARG_VALUE;
            }
        } else {
            return CL_INVALID_ARG_VALUE;
        }
    }
    return CL_SUCCESS;
}
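
// Note on the bitmask idiom above: (flags | CL_MEM_WRITE_ONLY) == flags is true exactly
// when CL_MEM_WRITE_ONLY is already set in flags. Example: a kernel argument declared
// __read_only combined with an image created with CL_MEM_WRITE_ONLY fails the check and
// the call is rejected with CL_INVALID_ARG_VALUE.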

void Kernel::resolveArgs() {
    if (!Kernel::isPatched() || !imageTransformer->hasRegisteredImages3d() || !canTransformImages())
        return;
    bool canTransformImageTo2dArray = true;
    const auto &args = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs;
    for (uint32_t i = 0; i < patchedArgumentsNum; i++) {
        if (args[i].is<ArgDescriptor::ArgTSampler>()) {
            auto sampler = castToObject<Sampler>(kernelArguments.at(i).object);
            if (sampler->isTransformable()) {
                canTransformImageTo2dArray = true;
            } else {
                canTransformImageTo2dArray = false;
                break;
            }
        }
    }

    if (canTransformImageTo2dArray) {
        imageTransformer->transformImagesTo2dArray(kernelInfo, kernelArguments, getSurfaceStateHeap());
    } else if (imageTransformer->didTransform()) {
        imageTransformer->transformImagesTo3d(kernelInfo, kernelArguments, getSurfaceStateHeap());
    }
}

bool Kernel::canTransformImages() const {
    auto renderCoreFamily = clDevice.getHardwareInfo().platform.eRenderCoreFamily;
    return renderCoreFamily >= IGFX_GEN9_CORE && renderCoreFamily <= IGFX_GEN11LP_CORE && !isBuiltIn;
}

std::unique_ptr<KernelObjsForAuxTranslation> Kernel::fillWithKernelObjsForAuxTranslation() {
    auto kernelObjsForAuxTranslation = std::make_unique<KernelObjsForAuxTranslation>();
    kernelObjsForAuxTranslation->reserve(getKernelArgsNumber());
    for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
        const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
        if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto buffer = castToObject<Buffer>(getKernelArg(i));
            if (buffer && buffer->getMultiGraphicsAllocation().getDefaultGraphicsAllocation()->isCompressionEnabled()) {
                kernelObjsForAuxTranslation->insert({KernelObjForAuxTranslation::Type::MEM_OBJ, buffer});
                auto &context = this->program->getContext();
                if (context.isProvidingPerformanceHints()) {
                    const auto &argExtMeta = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[i];
                    context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION,
                                                   kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, argExtMeta.argName.c_str());
                }
            }
        }
        if (SVM_ALLOC_OBJ == getKernelArguments().at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto svmAlloc = reinterpret_cast<GraphicsAllocation *>(const_cast<void *>(getKernelArg(i)));
            if (svmAlloc && svmAlloc->isCompressionEnabled()) {
                kernelObjsForAuxTranslation->insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, svmAlloc});
                auto &context = this->program->getContext();
                if (context.isProvidingPerformanceHints()) {
                    const auto &argExtMeta = kernelInfo.kernelDescriptor.explicitArgsExtendedMetadata[i];
                    context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION,
                                                   kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, argExtMeta.argName.c_str());
                }
            }
        }
    }

    if (CompressionSelector::allowStatelessCompression()) {
        for (auto gfxAllocation : kernelUnifiedMemoryGfxAllocations) {
            if (gfxAllocation->isCompressionEnabled()) {
                kernelObjsForAuxTranslation->insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, gfxAllocation});
                auto &context = this->program->getContext();
                if (context.isProvidingPerformanceHints()) {
                    context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ALLOCATION_AUX_TRANSLATION,
                                                   kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
                                                   reinterpret_cast<void *>(gfxAllocation->getGpuAddress()), gfxAllocation->getUnderlyingBufferSize());
                }
            }
        }
        if (getContext().getSVMAllocsManager()) {
            for (auto &allocation : getContext().getSVMAllocsManager()->getSVMAllocs()->allocations) {
                auto gfxAllocation = allocation.second.gpuAllocations.getDefaultGraphicsAllocation();
                if (gfxAllocation->isCompressionEnabled()) {
                    kernelObjsForAuxTranslation->insert({KernelObjForAuxTranslation::Type::GFX_ALLOC, gfxAllocation});
                    auto &context = this->program->getContext();
                    if (context.isProvidingPerformanceHints()) {
                        context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ALLOCATION_AUX_TRANSLATION,
                                                       kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(),
                                                       reinterpret_cast<void *>(gfxAllocation->getGpuAddress()), gfxAllocation->getUnderlyingBufferSize());
                    }
                }
            }
        }
    }
    return kernelObjsForAuxTranslation;
}
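
// Background note (summary, not normative): compressed ("aux") allocations cannot be read
// correctly through plain stateless accesses, so any compressed buffer or SVM/USM allocation
// bound to a non-pure-stateful argument is collected here and resolved by the aux-translation
// builtin before the kernel is dispatched; performance hints flag each affected object.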

bool Kernel::hasDirectStatelessAccessToSharedBuffer() const {
    for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
        const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
        if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto buffer = castToObject<Buffer>(getKernelArg(i));
            if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == AllocationType::SHARED_BUFFER) {
                return true;
            }
        }
    }
    return false;
}

bool Kernel::hasDirectStatelessAccessToHostMemory() const {
    for (uint32_t i = 0; i < getKernelArgsNumber(); i++) {
        const auto &arg = kernelInfo.kernelDescriptor.payloadMappings.explicitArgs[i];
        if (BUFFER_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto buffer = castToObject<Buffer>(getKernelArg(i));
            if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == AllocationType::BUFFER_HOST_MEMORY) {
                return true;
            }
        }
        if (SVM_ALLOC_OBJ == kernelArguments.at(i).type && !arg.as<ArgDescPointer>().isPureStateful()) {
            auto svmAlloc = reinterpret_cast<const GraphicsAllocation *>(getKernelArg(i));
            if (svmAlloc && svmAlloc->getAllocationType() == AllocationType::BUFFER_HOST_MEMORY) {
                return true;
            }
        }
    }
    return false;
}

bool Kernel::hasIndirectStatelessAccessToHostMemory() const {
    if (!kernelInfo.kernelDescriptor.kernelAttributes.hasIndirectStatelessAccess) {
        return false;
    }

    for (auto gfxAllocation : kernelUnifiedMemoryGfxAllocations) {
        if (gfxAllocation->getAllocationType() == AllocationType::BUFFER_HOST_MEMORY) {
            return true;
        }
    }

    if (unifiedMemoryControls.indirectHostAllocationsAllowed) {
        return getContext().getSVMAllocsManager()->hasHostAllocations();
    }

    return false;
}

void Kernel::getAllocationsForCacheFlush(CacheFlushAllocationsVec &out) const {
    if (false == GfxCoreHelper::cacheFlushAfterWalkerSupported(getHardwareInfo())) {
        return;
    }

    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    auto global = getProgram()->getGlobalSurface(rootDeviceIndex);
    if (global != nullptr) {
        out.push_back(global);
    }
}

uint64_t Kernel::getKernelStartAddress(const bool localIdsGenerationByRuntime, const bool kernelUsesLocalIds, const bool isCssUsed, const bool returnFullAddress) const {

    uint64_t kernelStartOffset = 0;

    if (kernelInfo.getGraphicsAllocation()) {
        kernelStartOffset = returnFullAddress ? kernelInfo.getGraphicsAllocation()->getGpuAddress()
                                              : kernelInfo.getGraphicsAllocation()->getGpuAddressToPatch();
        if (localIdsGenerationByRuntime == false && kernelUsesLocalIds == true) {
            kernelStartOffset += kernelInfo.kernelDescriptor.entryPoints.skipPerThreadDataLoad;
        }
    }

    kernelStartOffset += getStartOffset();

    auto &hardwareInfo = getHardwareInfo();
    const auto &gfxCoreHelper = this->getGfxCoreHelper();
    const auto &productHelper = getDevice().getProductHelper();

    if (isCssUsed && gfxCoreHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo, productHelper)) {
        kernelStartOffset += kernelInfo.kernelDescriptor.entryPoints.skipSetFFIDGP;
    }

    return kernelStartOffset;
}
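
// Note (summary of the logic above): when local IDs are generated by the hardware rather
// than the runtime, execution can start past the compiler-emitted per-thread-data load
// prologue, hence the skipPerThreadDataLoad adjustment; skipSetFFIDGP is an additional
// entry-point offset applied only on platforms where the FFID GP workaround is required.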

void *Kernel::patchBindlessSurfaceState(NEO::GraphicsAllocation *alloc, uint32_t bindless) {
    auto &gfxCoreHelper = this->getGfxCoreHelper();
    auto surfaceStateSize = gfxCoreHelper.getRenderSurfaceStateSize();
    NEO::BindlessHeapsHelper *bindlessHeapsHelper = getDevice().getDevice().getBindlessHeapsHelper();
    auto ssInHeap = bindlessHeapsHelper->allocateSSInHeap(surfaceStateSize, alloc, NEO::BindlessHeapsHelper::GLOBAL_SSH);
    auto patchLocation = ptrOffset(getCrossThreadData(), bindless);
    auto patchValue = gfxCoreHelper.getBindlessSurfaceExtendedMessageDescriptorValue(static_cast<uint32_t>(ssInHeap.surfaceStateOffset));
    patchWithRequiredSize(patchLocation, sizeof(patchValue), patchValue);
    return ssInHeap.ssPtr;
}

void Kernel::setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo) {
    this->additionalKernelExecInfo = additionalKernelExecInfo;
}

uint32_t Kernel::getAdditionalKernelExecInfo() const {
    return this->additionalKernelExecInfo;
}

bool Kernel::requiresWaDisableRccRhwoOptimization() const {
    auto &gfxCoreHelper = this->getGfxCoreHelper();
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();

    if (gfxCoreHelper.isWaDisableRccRhwoOptimizationRequired() && isUsingSharedObjArgs()) {
        for (auto &arg : getKernelArguments()) {
            auto clMemObj = static_cast<cl_mem>(arg.object);
            auto memObj = castToObject<MemObj>(clMemObj);
            if (memObj && memObj->peekSharingHandler()) {
                auto allocation = memObj->getGraphicsAllocation(rootDeviceIndex);
                for (uint32_t handleId = 0u; handleId < allocation->getNumGmms(); handleId++) {
                    if (allocation->getGmm(handleId)->gmmResourceInfo->getResourceFlags()->Info.MediaCompressed) {
                        return true;
                    }
                }
            }
        }
    }
    return false;
}

const HardwareInfo &Kernel::getHardwareInfo() const {
    return getDevice().getHardwareInfo();
}

void Kernel::setWorkDim(uint32_t workDim) {
    patchNonPointer<uint32_t, uint32_t>(getCrossThreadDataRef(), getDescriptor().payloadMappings.dispatchTraits.workDim, workDim);
    if (pImplicitArgs) {
        pImplicitArgs->numWorkDim = workDim;
    }
}

void Kernel::setGlobalWorkOffsetValues(uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.globalWorkOffset,
                       {globalWorkOffsetX, globalWorkOffsetY, globalWorkOffsetZ});
    if (pImplicitArgs) {
        pImplicitArgs->globalOffsetX = globalWorkOffsetX;
        pImplicitArgs->globalOffsetY = globalWorkOffsetY;
        pImplicitArgs->globalOffsetZ = globalWorkOffsetZ;
    }
}

void Kernel::setGlobalWorkSizeValues(uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.globalWorkSize,
                       {globalWorkSizeX, globalWorkSizeY, globalWorkSizeZ});
    if (pImplicitArgs) {
        pImplicitArgs->globalSizeX = globalWorkSizeX;
        pImplicitArgs->globalSizeY = globalWorkSizeY;
        pImplicitArgs->globalSizeZ = globalWorkSizeZ;
    }
}

void Kernel::setLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.localWorkSize,
                       {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
    if (pImplicitArgs) {
        pImplicitArgs->localSizeX = localWorkSizeX;
        pImplicitArgs->localSizeY = localWorkSizeY;
        pImplicitArgs->localSizeZ = localWorkSizeZ;
    }
}

void Kernel::setLocalWorkSize2Values(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.localWorkSize2,
                       {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
}

void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.enqueuedLocalWorkSize,
                       {localWorkSizeX, localWorkSizeY, localWorkSizeZ});
}

void Kernel::setNumWorkGroupsValues(uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) {
    patchVecNonPointer(getCrossThreadDataRef(),
                       getDescriptor().payloadMappings.dispatchTraits.numWorkGroups,
                       {numWorkGroupsX, numWorkGroupsY, numWorkGroupsZ});
    if (pImplicitArgs) {
        pImplicitArgs->groupCountX = numWorkGroupsX;
        pImplicitArgs->groupCountY = numWorkGroupsY;
        pImplicitArgs->groupCountZ = numWorkGroupsZ;
    }
}

bool Kernel::isLocalWorkSize2Patchable() {
    const auto &localWorkSize2 = getDescriptor().payloadMappings.dispatchTraits.localWorkSize2;
    return isValidOffset(localWorkSize2[0]) && isValidOffset(localWorkSize2[1]) && isValidOffset(localWorkSize2[2]);
}

uint32_t Kernel::getMaxKernelWorkGroupSize() const {
    return maxKernelWorkGroupSize;
}

uint32_t Kernel::getSlmTotalSize() const {
    return slmTotalSize;
}

bool Kernel::areMultipleSubDevicesInContext() const {
    auto context = program->getContextPtr();
    return context ? context->containsMultipleSubDevices(clDevice.getRootDeviceIndex()) : false;
}

void Kernel::reconfigureKernel() {
    const auto &kernelDescriptor = kernelInfo.kernelDescriptor;
    if (kernelDescriptor.kernelAttributes.numGrfRequired == GrfConfig::LargeGrfNumber &&
        kernelDescriptor.kernelAttributes.simdSize != 32) {
        this->maxKernelWorkGroupSize >>= 1;
    }
    const auto &gfxCoreHelper = this->getGfxCoreHelper();
    bool isLocalIdsGeneratedByHw = false; // when local IDs are generated by the runtime, more work groups are available
    auto maxWorkGroupSize = static_cast<uint32_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize()));

    this->maxKernelWorkGroupSize = gfxCoreHelper.adjustMaxWorkGroupSize(kernelDescriptor.kernelAttributes.numGrfRequired, kernelDescriptor.kernelAttributes.simdSize, isLocalIdsGeneratedByHw, maxWorkGroupSize);

    this->containsStatelessWrites = kernelDescriptor.kernelAttributes.flags.usesStatelessWrites;
    this->systolicPipelineSelectMode = kernelDescriptor.kernelAttributes.flags.usesSystolicPipelineSelectMode;
}
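
// Worked example (illustrative): on a device whose base limit is 1024 work items, a kernel
// compiled for the large GRF configuration with SIMD16 has its maxKernelWorkGroupSize halved
// to 512 here, before gfxCoreHelper.adjustMaxWorkGroupSize applies any additional
// platform-specific clamping.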

bool Kernel::requiresCacheFlushCommand(const CommandQueue &commandQueue) const {
    if (false == GfxCoreHelper::cacheFlushAfterWalkerSupported(commandQueue.getDevice().getHardwareInfo())) {
        return false;
    }

    if (DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get() != -1) {
        return !!DebugManager.flags.EnableCacheFlushAfterWalkerForAllQueues.get();
    }

    bool cmdQueueRequiresCacheFlush = commandQueue.getRequiresCacheFlushAfterWalker();
    if (false == cmdQueueRequiresCacheFlush) {
        return false;
    }
    if (commandQueue.getGpgpuCommandStreamReceiver().isMultiOsContextCapable()) {
        return false;
    }
    bool isMultiDevice = commandQueue.getContext().containsMultipleSubDevices(commandQueue.getDevice().getRootDeviceIndex());
    if (false == isMultiDevice) {
        return false;
    }
    bool isDefaultContext = (commandQueue.getContext().peekContextType() == ContextType::CONTEXT_TYPE_DEFAULT);
    if (true == isDefaultContext) {
        return false;
    }

    if (getProgram()->getGlobalSurface(commandQueue.getDevice().getRootDeviceIndex()) != nullptr) {
        return true;
    }
    return false;
}
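
// Summary (descriptive only): a post-walker cache flush is requested only when the platform
// supports it, the queue asks for it, the command stream receiver is not multi-OS-context
// capable, the context spans multiple sub-devices without being the default context, and the
// program owns a global-variables surface; the debug flag overrides this whole decision chain.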

void Kernel::updateAuxTranslationRequired() {
    if (CompressionSelector::allowStatelessCompression()) {
        if (hasDirectStatelessAccessToHostMemory() ||
            hasIndirectStatelessAccessToHostMemory() ||
            hasDirectStatelessAccessToSharedBuffer()) {
            setAuxTranslationRequired(true);
        }
    }
}

int Kernel::setKernelThreadArbitrationPolicy(uint32_t policy) {
    auto &clGfxCoreHelper = clDevice.getRootDeviceEnvironment().getHelper<ClGfxCoreHelper>();
    auto &threadArbitrationPolicy = const_cast<ThreadArbitrationPolicy &>(getDescriptor().kernelAttributes.threadArbitrationPolicy);
    if (!clGfxCoreHelper.isSupportedKernelThreadArbitrationPolicy()) {
        threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
        return CL_INVALID_DEVICE;
    } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL) {
        threadArbitrationPolicy = ThreadArbitrationPolicy::RoundRobin;
    } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_OLDEST_FIRST_INTEL) {
        threadArbitrationPolicy = ThreadArbitrationPolicy::AgeBased;
    } else if (policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_AFTER_DEPENDENCY_ROUND_ROBIN_INTEL ||
               policy == CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_STALL_BASED_ROUND_ROBIN_INTEL) {
        threadArbitrationPolicy = ThreadArbitrationPolicy::RoundRobinAfterDependency;
    } else {
        threadArbitrationPolicy = ThreadArbitrationPolicy::NotPresent;
        return CL_INVALID_VALUE;
    }
    return CL_SUCCESS;
}
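
// Usage sketch (illustrative; assumes the cl_intel_thread_arbitration_policy extension path
// that feeds this setter, e.g. via clSetKernelExecInfo):
//
//   cl_uint policy = CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_ROUND_ROBIN_INTEL;
//   clSetKernelExecInfo(kernel, CL_KERNEL_EXEC_INFO_THREAD_ARBITRATION_POLICY_INTEL,
//                       sizeof(policy), &policy);
//
// The CL enum is mapped to the internal ThreadArbitrationPolicy above; unsupported devices
// report CL_INVALID_DEVICE and unknown policy values report CL_INVALID_VALUE.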

bool Kernel::graphicsAllocationTypeUseSystemMemory(AllocationType type) {
    return (type == AllocationType::BUFFER_HOST_MEMORY) ||
           (type == AllocationType::EXTERNAL_HOST_PTR) ||
           (type == AllocationType::SVM_CPU) ||
           (type == AllocationType::SVM_ZERO_COPY);
}

void Kernel::initializeLocalIdsCache() {
    auto workgroupDimensionsOrder = getDescriptor().kernelAttributes.workgroupDimensionsOrder;
    std::array<uint8_t, 3> wgDimOrder = {workgroupDimensionsOrder[0],
                                         workgroupDimensionsOrder[1],
                                         workgroupDimensionsOrder[2]};
    auto simdSize = getDescriptor().kernelAttributes.simdSize;
    auto grfSize = static_cast<uint8_t>(getDevice().getHardwareInfo().capabilityTable.grfSize);
    localIdsCache = std::make_unique<LocalIdsCache>(4, wgDimOrder, simdSize, grfSize, usingImagesOnly);
}
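
// Note (descriptive): the cache is created with 4 entries; each entry keeps the generated
// per-thread local-ID payload for one work-group size, so repeated enqueues of this kernel
// with the same local size reuse the buffer instead of regenerating it each time.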

void Kernel::setLocalIdsForGroup(const Vec3<uint16_t> &groupSize, void *destination) const {
    UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
    const auto &gfxCoreHelper = this->getGfxCoreHelper();
    localIdsCache->setLocalIdsForGroup(groupSize, destination, gfxCoreHelper);
}

size_t Kernel::getLocalIdsSizeForGroup(const Vec3<uint16_t> &groupSize) const {
    UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
    return localIdsCache->getLocalIdsSizeForGroup(groupSize, getGfxCoreHelper());
}

size_t Kernel::getLocalIdsSizePerThread() const {
    UNRECOVERABLE_IF(localIdsCache.get() == nullptr);
    return localIdsCache->getLocalIdsSizePerThread();
}

} // namespace NEO