/*
 * Copyright (C) 2018-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "opencl/source/kernel/kernel.h"

#include "shared/source/built_ins/built_ins.h"
#include "shared/source/command_stream/command_stream_receiver.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/device_binary_format/patchtokens_decoder.h"
#include "shared/source/gmm_helper/gmm_helper.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/api_specific_config.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/debug_helpers.h"
#include "shared/source/helpers/get_info.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/kernel_helpers.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/memory_manager/memory_manager.h"
#include "shared/source/memory_manager/unified_memory_manager.h"

#include "opencl/source/accelerators/intel_accelerator.h"
#include "opencl/source/accelerators/intel_motion_estimation.h"
#include "opencl/source/built_ins/builtins_dispatch_builder.h"
#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/command_queue/command_queue.h"
#include "opencl/source/command_queue/gpgpu_walker.h"
#include "opencl/source/context/context.h"
#include "opencl/source/device_queue/device_queue.h"
#include "opencl/source/execution_model/device_enqueue.h"
#include "opencl/source/gtpin/gtpin_notify.h"
#include "opencl/source/helpers/cl_hw_helper.h"
#include "opencl/source/helpers/dispatch_info.h"
#include "opencl/source/helpers/get_info_status_mapper.h"
#include "opencl/source/helpers/per_thread_data.h"
#include "opencl/source/helpers/sampler_helpers.h"
#include "opencl/source/helpers/surface_formats.h"
#include "opencl/source/kernel/image_transformer.h"
#include "opencl/source/kernel/kernel.inl"
#include "opencl/source/kernel/kernel_info_cl.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/mem_obj/pipe.h"
#include "opencl/source/memory_manager/mem_obj_surface.h"
#include "opencl/source/platform/platform.h"
#include "opencl/source/program/block_kernel_manager.h"
#include "opencl/source/program/kernel_info.h"
#include "opencl/source/sampler/sampler.h"

#include "patch_list.h"

#include <algorithm>
#include <cstdint>
#include <vector>

using namespace iOpenCL;

namespace NEO {
class Surface;

uint32_t Kernel::dummyPatchLocation = 0xbaddf00d;

Kernel::Kernel(Program *programArg, const KernelInfoContainer &kernelInfosArg, bool schedulerKernel)
    : slmTotalSize(kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->workloadInfo.slmStaticSize),
      isParentKernel((kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->patchInfo.executionEnvironment != nullptr)
                         ? (kernelInfosArg[programArg->getDevices()[0]->getRootDeviceIndex()]->patchInfo.executionEnvironment->HasDeviceEnqueue != 0)
                         : false),
      isSchedulerKernel(schedulerKernel),
      executionEnvironment(programArg->getExecutionEnvironment()),
      program(programArg),
      deviceVector(programArg->getDevices()),
      kernelInfos(kernelInfosArg) {
    kernelDeviceInfos.resize(program->getMaxRootDeviceIndex() + 1);
    program->retain();
    program->retainForKernel();
    imageTransformer.reset(new ImageTransformer);
    for (const auto &pClDevice : deviceVector) {
        kernelDeviceInfos[pClDevice->getRootDeviceIndex()].maxKernelWorkGroupSize = static_cast<uint32_t>(pClDevice->getSharedDeviceInfo().maxWorkGroupSize);
    }
}

Kernel::~Kernel() {
    for (auto &kernelDeviceInfo : kernelDeviceInfos) {
        delete[] kernelDeviceInfo.crossThreadData;
        kernelDeviceInfo.crossThreadData = nullptr;
        kernelDeviceInfo.crossThreadDataSize = 0;

        if (kernelDeviceInfo.privateSurface) {
            program->peekExecutionEnvironment().memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(kernelDeviceInfo.privateSurface);
            kernelDeviceInfo.privateSurface = nullptr;
        }
    }

    if (kernelReflectionSurface) {
        program->peekExecutionEnvironment().memoryManager->freeGraphicsMemory(kernelReflectionSurface);
        kernelReflectionSurface = nullptr;
    }

    for (uint32_t i = 0; i < patchedArgumentsNum; i++) {
        if (getDefaultKernelInfo().kernelArgInfo.at(i).isSampler) {
            auto sampler = castToObject<Sampler>(kernelArguments.at(i).object);
            if (sampler) {
                sampler->decRefInternal();
            }
        }
    }

    kernelArgHandlers.clear();
    program->releaseForKernel();
    program->release();
}

// Checks if patch offset is invalid (undefined)
inline bool isInvalidOffset(uint32_t offset) {
    return (offset == KernelArgInfo::undefinedOffset);
}

// Checks if patch offset is valid
inline bool isValidOffset(uint32_t offset) {
    return isInvalidOffset(offset) == false;
}

// If dstOffsetBytes is not an invalid offset, then patches dst at dstOffsetBytes
// with src cast to DstT type.
template <typename DstT, typename SrcT>
inline void patch(const SrcT &src, void *dst, uint32_t dstOffsetBytes) {
    if (isInvalidOffset(dstOffsetBytes)) {
        return;
    }
    DstT *patchLocation = reinterpret_cast<DstT *>(ptrOffset(dst, dstOffsetBytes));
    *patchLocation = static_cast<DstT>(src);
}

template <typename PatchTokenT>
void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const PatchTokenT &patch) {
    uint32_t crossThreadDataOffset = patch.DataParamOffset;
    uint32_t pointerSize = patch.DataParamSize;
    uint32_t sshOffset = patch.SurfaceStateHeapOffset;
    auto rootDeviceIndex = allocation.getRootDeviceIndex();
    void *crossThreadData = getCrossThreadData(rootDeviceIndex);
    void *ssh = getSurfaceStateHeap(rootDeviceIndex);

    if (crossThreadData != nullptr) {
        auto pp = ptrOffset(crossThreadData, crossThreadDataOffset);
        uintptr_t addressToPatch = reinterpret_cast<uintptr_t>(ptrToPatchInCrossThreadData);
        patchWithRequiredSize(pp, pointerSize, addressToPatch);
        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
            PatchInfoData patchInfoData(addressToPatch, 0u, PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(crossThreadData), crossThreadDataOffset, PatchInfoAllocationType::IndirectObjectHeap, pointerSize);
            this->patchInfoDataList.push_back(patchInfoData);
        }
    }

    if (ssh) {
        auto surfaceState = ptrOffset(ssh, sshOffset);
        void *addressToPatch = reinterpret_cast<void *>(allocation.getGpuAddressToPatch());
        size_t sizeToPatch = allocation.getUnderlyingBufferSize();
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, sizeToPatch, addressToPatch, 0, &allocation, 0, 0);
    }
}

template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization &patch);

template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessPrivateSurface &patch);

template void Kernel::patchWithImplicitSurface(void *ptrToPatchInCrossThreadData, GraphicsAllocation &allocation, const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization &patch);
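// Illustrative usage of the patch<>() helper above (not part of the runtime): writing
// a 32-bit value into cross-thread data at a token-provided offset.
//
//   patch<uint32_t>(imageDesc.num_samples, crossThreadData, kernelArgInfo.offsetNumSamples);
//
// Because an undefined offset turns the call into a no-op, call sites do not need to
// guard each individual patch token.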
cl_int Kernel::initialize() {
    cl_int retVal = CL_OUT_OF_HOST_MEMORY;
    do {
        auto pClDevice = &getDevice();
        auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
        reconfigureKernel(rootDeviceIndex);
        auto &hwInfo = pClDevice->getHardwareInfo();
        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
        auto &kernelInfo = *kernelInfos[rootDeviceIndex];
        auto maxSimdSize = kernelInfo.getMaxSimdSize();
        const auto &workloadInfo = kernelInfo.workloadInfo;
        const auto &heapInfo = kernelInfo.heapInfo;
        const auto &patchInfo = kernelInfo.patchInfo;

        if (maxSimdSize != 1 && maxSimdSize < hwHelper.getMinimalSIMDSize()) {
            return CL_INVALID_KERNEL;
        }

        kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize = patchInfo.dataParameterStream ? patchInfo.dataParameterStream->DataParameterStreamSize : 0;

        // now allocate our own cross-thread data, if necessary
        if (kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize) {
            kernelDeviceInfos[rootDeviceIndex].crossThreadData = new char[kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize];

            if (kernelInfo.crossThreadData) {
                memcpy_s(kernelDeviceInfos[rootDeviceIndex].crossThreadData, kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize,
                         kernelInfo.crossThreadData, kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize);
            } else {
                memset(kernelDeviceInfos[rootDeviceIndex].crossThreadData, 0x00, kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize);
            }

            auto crossThread = reinterpret_cast<uint32_t *>(kernelDeviceInfos[rootDeviceIndex].crossThreadData);

            kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = workloadInfo.globalWorkOffsetOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[0]) : kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX;
            kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = workloadInfo.globalWorkOffsetOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[1]) : kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY;
            kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = workloadInfo.globalWorkOffsetOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkOffsetOffsets[2]) : kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ;

            kernelDeviceInfos[rootDeviceIndex].localWorkSizeX = workloadInfo.localWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[0]) : kernelDeviceInfos[rootDeviceIndex].localWorkSizeX;
            kernelDeviceInfos[rootDeviceIndex].localWorkSizeY = workloadInfo.localWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[1]) : kernelDeviceInfos[rootDeviceIndex].localWorkSizeY;
            kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ = workloadInfo.localWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets[2]) : kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ;

            kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 = workloadInfo.localWorkSizeOffsets2[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[0]) : kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2;
            kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2 = workloadInfo.localWorkSizeOffsets2[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[1]) : kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2;
            kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2 = workloadInfo.localWorkSizeOffsets2[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.localWorkSizeOffsets2[2]) : kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2;

            kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX = workloadInfo.globalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[0]) : kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX;
            kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY = workloadInfo.globalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[1]) : kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY;
            kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ = workloadInfo.globalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.globalWorkSizeOffsets[2]) : kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ;

            kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX = workloadInfo.enqueuedLocalWorkSizeOffsets[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[0]) : kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX;
            kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY = workloadInfo.enqueuedLocalWorkSizeOffsets[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[1]) : kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY;
            kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ = workloadInfo.enqueuedLocalWorkSizeOffsets[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.enqueuedLocalWorkSizeOffsets[2]) : kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ;

            kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX = workloadInfo.numWorkGroupsOffset[0] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[0]) : kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX;
            kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY = workloadInfo.numWorkGroupsOffset[1] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[1]) : kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY;
            kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ = workloadInfo.numWorkGroupsOffset[2] != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.numWorkGroupsOffset[2]) : kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ;

            kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData = workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.maxWorkGroupSizeOffset) : kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData;
            kernelDeviceInfos[rootDeviceIndex].workDim = workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.workDimOffset) : kernelDeviceInfos[rootDeviceIndex].workDim;
            kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize = workloadInfo.simdSizeOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.simdSizeOffset) : kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize;
            kernelDeviceInfos[rootDeviceIndex].parentEventOffset = workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.parentEventOffset) : kernelDeviceInfos[rootDeviceIndex].parentEventOffset;
            kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset = workloadInfo.preferredWkgMultipleOffset != WorkloadInfo::undefinedOffset ? ptrOffset(crossThread, workloadInfo.preferredWkgMultipleOffset) : kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset;

            *kernelDeviceInfos[rootDeviceIndex].maxWorkGroupSizeForCrossThreadData = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
            *kernelDeviceInfos[rootDeviceIndex].dataParameterSimdSize = maxSimdSize;
            *kernelDeviceInfos[rootDeviceIndex].preferredWkgMultipleOffset = maxSimdSize;
            *kernelDeviceInfos[rootDeviceIndex].parentEventOffset = WorkloadInfo::invalidParentEvent;
        }

        // allocate our own SSH, if necessary
        kernelDeviceInfos[rootDeviceIndex].sshLocalSize = heapInfo.SurfaceStateHeapSize;
        if (kernelDeviceInfos[rootDeviceIndex].sshLocalSize) {
            kernelDeviceInfos[rootDeviceIndex].pSshLocal = std::make_unique<char[]>(kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
            // copy the ssh into our local copy
            memcpy_s(kernelDeviceInfos[rootDeviceIndex].pSshLocal.get(), kernelDeviceInfos[rootDeviceIndex].sshLocalSize,
                     heapInfo.pSsh, kernelDeviceInfos[rootDeviceIndex].sshLocalSize);
        }
        kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Count : 0;
        kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = (patchInfo.bindingTableState != nullptr) ? patchInfo.bindingTableState->Offset : 0;

        // patch crossthread data and ssh with inline surfaces, if necessary
        auto perHwThreadPrivateMemorySize = PatchTokenBinary::getPerHwThreadPrivateSurfaceSize(patchInfo.pAllocateStatelessPrivateSurface, kernelInfo.getMaxSimdSize());
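        // The private surface allocated below backs __private memory for every HW thread
        // that can be in flight. Illustrative sizing (hypothetical numbers, assuming the
        // helper simply multiplies its two operands): with 1024 bytes per HW thread and
        // computeUnitsUsedForScratch = 448, KernelHelper::getPrivateSurfaceSize would
        // yield 1024 * 448 = 458752 bytes.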
        if (perHwThreadPrivateMemorySize) {
            kernelDeviceInfos[rootDeviceIndex].privateSurfaceSize = KernelHelper::getPrivateSurfaceSize(perHwThreadPrivateMemorySize, pClDevice->getSharedDeviceInfo().computeUnitsUsedForScratch);
            DEBUG_BREAK_IF(kernelDeviceInfos[rootDeviceIndex].privateSurfaceSize == 0);
            if (kernelDeviceInfos[rootDeviceIndex].privateSurfaceSize > std::numeric_limits<uint32_t>::max()) {
                retVal = CL_OUT_OF_RESOURCES;
                break;
            }
            kernelDeviceInfos[rootDeviceIndex].privateSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties(
                {rootDeviceIndex, static_cast<size_t>(kernelDeviceInfos[rootDeviceIndex].privateSurfaceSize), GraphicsAllocation::AllocationType::PRIVATE_SURFACE, pClDevice->getDeviceBitfield()});
            if (kernelDeviceInfos[rootDeviceIndex].privateSurface == nullptr) {
                retVal = CL_OUT_OF_RESOURCES;
                break;
            }
            const auto &patch = patchInfo.pAllocateStatelessPrivateSurface;
            patchWithImplicitSurface(reinterpret_cast<void *>(kernelDeviceInfos[rootDeviceIndex].privateSurface->getGpuAddressToPatch()), *kernelDeviceInfos[rootDeviceIndex].privateSurface, *patch);
        }

        if (patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization) {
            DEBUG_BREAK_IF(program->getConstantSurface(rootDeviceIndex) == nullptr);
            uintptr_t constMemory = isBuiltIn ? (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch();

            const auto &patch = patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization;
            patchWithImplicitSurface(reinterpret_cast<void *>(constMemory), *program->getConstantSurface(rootDeviceIndex), *patch);
        }
        if (patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization) {
            DEBUG_BREAK_IF(program->getGlobalSurface(rootDeviceIndex) == nullptr);
            uintptr_t globalMemory = isBuiltIn ? (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getUnderlyingBuffer() : (uintptr_t)program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch();

            const auto &patch = patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization;
            patchWithImplicitSurface(reinterpret_cast<void *>(globalMemory), *program->getGlobalSurface(rootDeviceIndex), *patch);
        }

        if (patchInfo.pAllocateStatelessEventPoolSurface) {
            if (requiresSshForBuffers(rootDeviceIndex)) {
                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                              patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset);
                Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
            }
        }

        if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) {
            if (requiresSshForBuffers(rootDeviceIndex)) {
                auto surfaceState = ptrOffset(reinterpret_cast<uintptr_t *>(getSurfaceStateHeap(rootDeviceIndex)),
                                              patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset);
                Buffer::setSurfaceState(&pClDevice->getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
            }
        }

        setThreadArbitrationPolicy(hwHelper.getDefaultThreadArbitrationPolicy());
        if (kernelInfo.patchInfo.executionEnvironment) {
            if (!kernelInfo.patchInfo.executionEnvironment->SubgroupIndependentForwardProgressRequired) {
                setThreadArbitrationPolicy(ThreadArbitrationPolicy::AgeBased);
            }
        }
        patchBlocksSimdSize(rootDeviceIndex);

        provideInitializationHints();
        // resolve the new kernel info to account for kernel handlers
        // I think by this time we have decoded the binary and know the number of args etc.
        // double check this assumption
        bool usingBuffers = false;
        bool usingImages = false;
        auto numArgs = kernelInfo.kernelArgInfo.size();
        kernelArguments.resize(numArgs);
        slmSizes.resize(numArgs);
        kernelArgHandlers.resize(numArgs);
        kernelArgRequiresCacheFlush.resize(numArgs);

        for (uint32_t i = 0; i < numArgs; ++i) {
            storeKernelArg(i, NONE_OBJ, nullptr, nullptr, 0);
            slmSizes[i] = 0;

            // set the argument handler
            auto &argInfo = kernelInfo.kernelArgInfo[i];
            if (argInfo.metadata.addressQualifier == KernelArgMetadata::AddrLocal) {
                kernelArgHandlers[i] = &Kernel::setArgLocal;
            } else if (argInfo.isAccelerator) {
                kernelArgHandlers[i] = &Kernel::setArgAccelerator;
            } else if (argInfo.metadata.typeQualifiers.pipeQual) {
                kernelArgHandlers[i] = &Kernel::setArgPipe;
                kernelArguments[i].type = PIPE_OBJ;
            } else if (argInfo.isImage) {
                kernelArgHandlers[i] = &Kernel::setArgImage;
                kernelArguments[i].type = IMAGE_OBJ;
                usingImages = true;
            } else if (argInfo.isSampler) {
                kernelArgHandlers[i] = &Kernel::setArgSampler;
                kernelArguments[i].type = SAMPLER_OBJ;
            } else if (argInfo.isBuffer) {
                kernelArgHandlers[i] = &Kernel::setArgBuffer;
                kernelArguments[i].type = BUFFER_OBJ;
                usingBuffers = true;
                allBufferArgsStateful &= static_cast<uint32_t>(argInfo.pureStatefulBufferAccess);
            } else if (argInfo.isDeviceQueue) {
                kernelArgHandlers[i] = &Kernel::setArgDevQueue;
                kernelArguments[i].type = DEVICE_QUEUE_OBJ;
            } else {
                kernelArgHandlers[i] = &Kernel::setArgImmediate;
            }
        }

        auto &clHwHelper = ClHwHelper::get(hwInfo.platform.eRenderCoreFamily);
        auxTranslationRequired = HwHelper::renderCompressedBuffersSupported(hwInfo) && clHwHelper.requiresAuxResolves(kernelInfo);
        if (DebugManager.flags.ForceAuxTranslationEnabled.get() != -1) {
            auxTranslationRequired &= !!DebugManager.flags.ForceAuxTranslationEnabled.get();
        }
        if (auxTranslationRequired) {
            program->getContextPtr()->setResolvesRequiredInKernels(true);
        }

        if (usingImages && !usingBuffers) {
            usingImagesOnly = true;
        }
        if (isParentKernel) {
            program->allocateBlockPrivateSurfaces(*pClDevice);
        }
        if (program->isKernelDebugEnabled() && kernelInfo.patchInfo.pAllocateSystemThreadSurface) {
            debugEnabled = true;
        }

        retVal = CL_SUCCESS;
    } while (false);

    return retVal;
}

cl_int Kernel::cloneKernel(Kernel *pSourceKernel) {
    // copy cross thread data to store arguments set to source kernel with clSetKernelArg on immediate data (non-pointer types)
    for (auto rootDeviceIndex = 0u; rootDeviceIndex < pSourceKernel->kernelDeviceInfos.size(); rootDeviceIndex++) {
        memcpy_s(kernelDeviceInfos[rootDeviceIndex].crossThreadData, kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize,
                 pSourceKernel->kernelDeviceInfos[rootDeviceIndex].crossThreadData,
                 pSourceKernel->kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize);
        DEBUG_BREAK_IF(pSourceKernel->kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize != kernelDeviceInfos[rootDeviceIndex].crossThreadDataSize);
    }

    // copy arguments set to source kernel with clSetKernelArg or clSetKernelArgSVMPointer
    for (uint32_t i = 0; i < pSourceKernel->kernelArguments.size(); i++) {
        if (0 == pSourceKernel->getKernelArgInfo(i).size) {
            // skip copying arguments that haven't been set to source kernel
            continue;
        }
        switch (pSourceKernel->kernelArguments[i].type) {
        case NONE_OBJ:
            // all arguments with immediate data (non-pointer types) have been copied in cross thread data
            storeKernelArg(i, NONE_OBJ, nullptr, nullptr, pSourceKernel->getKernelArgInfo(i).size);
            patchedArgumentsNum++;
            kernelArguments[i].isPatched = true;
            break;
        case SVM_OBJ:
            setArgSvm(i, pSourceKernel->getKernelArgInfo(i).size, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
                      pSourceKernel->getKernelArgInfo(i).pSvmAlloc, pSourceKernel->getKernelArgInfo(i).svmFlags);
            break;
        case SVM_ALLOC_OBJ:
            setArgSvmAlloc(i, const_cast<void *>(pSourceKernel->getKernelArgInfo(i).value),
                           (GraphicsAllocation *)pSourceKernel->getKernelArgInfo(i).object);
            break;
        default:
            setArg(i, pSourceKernel->getKernelArgInfo(i).size, pSourceKernel->getKernelArgInfo(i).value);
            break;
        }
    }

    // copy additional information other than argument values set to source kernel with clSetKernelExecInfo
    for (auto gfxAlloc : pSourceKernel->kernelSvmGfxAllocations) {
        kernelSvmGfxAllocations.push_back(gfxAlloc);
    }

    this->isBuiltIn = pSourceKernel->isBuiltIn;

    return CL_SUCCESS;
}
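// Illustrative only: cloneKernel() above backs clCloneKernel() (OpenCL 2.1). Typical
// host-side usage (hypothetical code, not part of this file):
//
//   cl_int err = CL_SUCCESS;
//   cl_kernel clone = clCloneKernel(sourceKernel, &err);
//   // `clone` starts out with the same argument values and exec info as
//   // `sourceKernel`; later clSetKernelArg calls on one do not affect the other.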
cl_int Kernel::getInfo(cl_kernel_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const {
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    cl_uint numArgs = 0;
    const _cl_program *prog;
    const _cl_context *ctxt;
    cl_uint refCount = 0;
    uint64_t nonCannonizedGpuAddress = 0llu;
    auto defaultRootDeviceIndex = getDevices()[0]->getRootDeviceIndex();
    auto &defaultKernelInfo = getKernelInfo(defaultRootDeviceIndex);

    switch (paramName) {
    case CL_KERNEL_FUNCTION_NAME:
        pSrc = defaultKernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str();
        srcSize = defaultKernelInfo.kernelDescriptor.kernelMetadata.kernelName.length() + 1;
        break;

    case CL_KERNEL_NUM_ARGS:
        srcSize = sizeof(cl_uint);
        numArgs = static_cast<cl_uint>(defaultKernelInfo.kernelArgInfo.size());
        pSrc = &numArgs;
        break;

    case CL_KERNEL_CONTEXT:
        ctxt = &program->getContext();
        srcSize = sizeof(ctxt);
        pSrc = &ctxt;
        break;

    case CL_KERNEL_PROGRAM:
        prog = program;
        srcSize = sizeof(prog);
        pSrc = &prog;
        break;

    case CL_KERNEL_REFERENCE_COUNT:
        refCount = static_cast<cl_uint>(this->getReference());
        srcSize = sizeof(refCount);
        pSrc = &refCount;
        break;

    case CL_KERNEL_ATTRIBUTES:
        pSrc = defaultKernelInfo.attributes.c_str();
        srcSize = defaultKernelInfo.attributes.length() + 1;
        break;

    case CL_KERNEL_BINARY_PROGRAM_INTEL:
        pSrc = getKernelHeap(defaultRootDeviceIndex);
        srcSize = getKernelHeapSize(defaultRootDeviceIndex);
        break;
    case CL_KERNEL_BINARY_GPU_ADDRESS_INTEL:
        nonCannonizedGpuAddress = GmmHelper::decanonize(defaultKernelInfo.kernelAllocation->getGpuAddress());
        pSrc = &nonCannonizedGpuAddress;
        srcSize = sizeof(nonCannonizedGpuAddress);
        break;
    default:
        getAdditionalInfo(paramName, pSrc, srcSize);
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}

cl_int Kernel::getArgInfo(cl_uint argIndx, cl_kernel_arg_info paramName, size_t paramValueSize, void *paramValue, size_t *paramValueSizeRet) const {
    cl_int retVal;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    auto &defaultKernelInfo = getDefaultKernelInfo();
    auto numArgs = static_cast<cl_uint>(defaultKernelInfo.kernelArgInfo.size());

    if (argIndx >= numArgs) {
        retVal = CL_INVALID_ARG_INDEX;
        return retVal;
    }

    // look up the argument metadata only after the index has been validated
    const auto &argInfo = defaultKernelInfo.kernelArgInfo[argIndx];

    cl_kernel_arg_address_qualifier addressQualifier;
    cl_kernel_arg_access_qualifier accessQualifier;
    cl_kernel_arg_type_qualifier typeQualifier;

    switch (paramName) {
    case CL_KERNEL_ARG_ADDRESS_QUALIFIER:
        addressQualifier = asClKernelArgAddressQualifier(argInfo.metadata.getAddressQualifier());
        srcSize = sizeof(addressQualifier);
        pSrc = &addressQualifier;
        break;

    case CL_KERNEL_ARG_ACCESS_QUALIFIER:
        accessQualifier = asClKernelArgAccessQualifier(argInfo.metadata.getAccessQualifier());
        srcSize = sizeof(accessQualifier);
        pSrc = &accessQualifier;
        break;

    case CL_KERNEL_ARG_TYPE_QUALIFIER:
        typeQualifier = asClKernelArgTypeQualifier(argInfo.metadata.typeQualifiers);
        srcSize = sizeof(typeQualifier);
        pSrc = &typeQualifier;
        break;

    case CL_KERNEL_ARG_TYPE_NAME:
        srcSize = argInfo.metadataExtended->type.length() + 1;
        pSrc = argInfo.metadataExtended->type.c_str();
        break;

    case CL_KERNEL_ARG_NAME:
        srcSize = argInfo.metadataExtended->argName.length() + 1;
        pSrc = argInfo.metadataExtended->argName.c_str();
        break;

    default:
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}
cl_int Kernel::getWorkGroupInfo(ClDevice &device, cl_kernel_work_group_info paramName,
                                size_t paramValueSize, void *paramValue,
                                size_t *paramValueSizeRet) const {
    cl_int retVal = CL_INVALID_VALUE;
    const void *pSrc = nullptr;
    size_t srcSize = GetInfo::invalidSourceSize;
    struct size_t3 {
        size_t val[3];
    } requiredWorkGroupSize;
    cl_ulong localMemorySize;
    auto rootDeviceIndex = device.getRootDeviceIndex();
    auto &kernelInfo = *kernelInfos[rootDeviceIndex];
    const auto &patchInfo = kernelInfo.patchInfo;
    size_t preferredWorkGroupSizeMultiple = 0;
    cl_ulong scratchSize;
    cl_ulong privateMemSize;
    size_t maxWorkgroupSize;
    const auto &hwInfo = device.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

    switch (paramName) {
    case CL_KERNEL_WORK_GROUP_SIZE:
        maxWorkgroupSize = kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize;
        if (DebugManager.flags.UseMaxSimdSizeToDeduceMaxWorkgroupSize.get()) {
            auto divisionSize = CommonConstants::maximalSimdSize / patchInfo.executionEnvironment->LargestCompiledSIMDSize;
            maxWorkgroupSize /= divisionSize;
        }
        srcSize = sizeof(maxWorkgroupSize);
        pSrc = &maxWorkgroupSize;
        break;

    case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
        DEBUG_BREAK_IF(!patchInfo.executionEnvironment);
        requiredWorkGroupSize.val[0] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
        requiredWorkGroupSize.val[1] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
        requiredWorkGroupSize.val[2] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
        srcSize = sizeof(requiredWorkGroupSize);
        pSrc = &requiredWorkGroupSize;
        break;

    case CL_KERNEL_LOCAL_MEM_SIZE:
        localMemorySize = patchInfo.localsurface ? patchInfo.localsurface->TotalInlineLocalMemorySize : 0;
        srcSize = sizeof(localMemorySize);
        pSrc = &localMemorySize;
        break;

    case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
        DEBUG_BREAK_IF(!patchInfo.executionEnvironment);
        preferredWorkGroupSizeMultiple = patchInfo.executionEnvironment->LargestCompiledSIMDSize;
        if (hwHelper.isFusedEuDispatchEnabled(hwInfo)) {
            preferredWorkGroupSizeMultiple *= 2;
        }
        srcSize = sizeof(preferredWorkGroupSizeMultiple);
        pSrc = &preferredWorkGroupSizeMultiple;
        break;

    case CL_KERNEL_SPILL_MEM_SIZE_INTEL:
        scratchSize = kernelInfo.patchInfo.mediavfestate ? kernelInfo.patchInfo.mediavfestate->PerThreadScratchSpace : 0;
        srcSize = sizeof(scratchSize);
        pSrc = &scratchSize;
        break;
    case CL_KERNEL_PRIVATE_MEM_SIZE:
        privateMemSize = kernelInfo.patchInfo.pAllocateStatelessPrivateSurface ? kernelInfo.patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize : 0;
        srcSize = sizeof(privateMemSize);
        pSrc = &privateMemSize;
        break;
    default:
        getAdditionalWorkGroupInfo(paramName, pSrc, srcSize, rootDeviceIndex);
        break;
    }

    auto getInfoStatus = GetInfo::getInfo(paramValue, paramValueSize, pSrc, srcSize);
    retVal = changeGetInfoStatusToCLResultType(getInfoStatus);
    GetInfo::setParamValueReturnSize(paramValueSizeRet, srcSize, getInfoStatus);

    return retVal;
}
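// Worked example for CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR below: with an input
// local range of {10, 10} (WGS = 100) and maxSimdSize = 32, the query returns
// 100 / 32 + min(1, 100 % 32) = 3 + 1 = 4 subgroups; the trailing partial subgroup
// counts as a whole one.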
cl_int Kernel::getSubGroupInfo(ClDevice &clDevice, cl_kernel_sub_group_info paramName,
                               size_t inputValueSize, const void *inputValue,
                               size_t paramValueSize, void *paramValue,
                               size_t *paramValueSizeRet) const {
    size_t numDimensions = 0;
    size_t WGS = 1;
    auto rootDeviceIndex = clDevice.getRootDeviceIndex();
    const auto &kernelInfo = getKernelInfo(rootDeviceIndex);
    auto maxSimdSize = static_cast<size_t>(kernelInfo.getMaxSimdSize());
    auto maxRequiredWorkGroupSize = static_cast<size_t>(kernelInfo.getMaxRequiredWorkGroupSize(getMaxKernelWorkGroupSize(rootDeviceIndex)));
    auto largestCompiledSIMDSize = static_cast<size_t>(kernelInfo.patchInfo.executionEnvironment->LargestCompiledSIMDSize);

    GetInfoHelper info(paramValue, paramValueSize, paramValueSizeRet);

    if ((paramName == CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT) ||
        (paramName == CL_KERNEL_MAX_NUM_SUB_GROUPS) ||
        (paramName == CL_KERNEL_COMPILE_NUM_SUB_GROUPS)) {
        if (clDevice.areOcl21FeaturesEnabled() == false) {
            return CL_INVALID_OPERATION;
        }
    }

    if ((paramName == CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR) ||
        (paramName == CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR)) {
        if (!inputValue) {
            return CL_INVALID_VALUE;
        }
        if (inputValueSize % sizeof(size_t) != 0) {
            return CL_INVALID_VALUE;
        }
        numDimensions = inputValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }

    if (paramName == CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT) {
        if (!paramValue) {
            return CL_INVALID_VALUE;
        }
        if (paramValueSize % sizeof(size_t) != 0) {
            return CL_INVALID_VALUE;
        }
        numDimensions = paramValueSize / sizeof(size_t);
        if (numDimensions == 0 ||
            numDimensions > static_cast<size_t>(clDevice.getDeviceInfo().maxWorkItemDimensions)) {
            return CL_INVALID_VALUE;
        }
    }

    switch (paramName) {
    case CL_KERNEL_MAX_SUB_GROUP_SIZE_FOR_NDRANGE_KHR: {
        return changeGetInfoStatusToCLResultType(info.set(maxSimdSize));
    }
    case CL_KERNEL_SUB_GROUP_COUNT_FOR_NDRANGE_KHR: {
        for (size_t i = 0; i < numDimensions; i++) {
            WGS *= ((size_t *)inputValue)[i];
        }
        return changeGetInfoStatusToCLResultType(
            info.set((WGS / maxSimdSize) + std::min(static_cast<size_t>(1), WGS % maxSimdSize))); // add 1 if WGS % maxSimdSize != 0
    }
    case CL_KERNEL_LOCAL_SIZE_FOR_SUB_GROUP_COUNT: {
        auto subGroupsNum = *(size_t *)inputValue;
        auto workGroupSize = subGroupsNum * largestCompiledSIMDSize;
        // return workgroup size in first dimension, the rest shall be 1 in positive case
        if (workGroupSize > maxRequiredWorkGroupSize) {
            workGroupSize = 0;
        }
        // If no work group size can accommodate the requested number of subgroups, return 0 in each element of the returned array.
        switch (numDimensions) {
        case 1:
            return changeGetInfoStatusToCLResultType(info.set(workGroupSize));
        case 2:
            struct size_t2 {
                size_t val[2];
            } workGroupSize2;
            workGroupSize2.val[0] = workGroupSize;
            workGroupSize2.val[1] = (workGroupSize > 0) ? 1 : 0;
            return changeGetInfoStatusToCLResultType(info.set(workGroupSize2));
        case 3:
        default:
            struct size_t3 {
                size_t val[3];
            } workGroupSize3;
            workGroupSize3.val[0] = workGroupSize;
            workGroupSize3.val[1] = (workGroupSize > 0) ? 1 : 0;
            workGroupSize3.val[2] = (workGroupSize > 0) ? 1 : 0;
            return changeGetInfoStatusToCLResultType(info.set(workGroupSize3));
        }
    }
    case CL_KERNEL_MAX_NUM_SUB_GROUPS: {
        // round-up maximum number of subgroups
        return changeGetInfoStatusToCLResultType(info.set(Math::divideAndRoundUp(maxRequiredWorkGroupSize, largestCompiledSIMDSize)));
    }
    case CL_KERNEL_COMPILE_NUM_SUB_GROUPS: {
        return changeGetInfoStatusToCLResultType(info.set(static_cast<size_t>(kernelInfo.patchInfo.executionEnvironment->CompiledSubGroupsNumber)));
    }
    case CL_KERNEL_COMPILE_SUB_GROUP_SIZE_INTEL: {
        return changeGetInfoStatusToCLResultType(info.set(kernelInfo.requiredSubGroupSize));
    }
    default:
        return CL_INVALID_VALUE;
    }
}

const void *Kernel::getKernelHeap(uint32_t rootDeviceIndex) const {
    return getKernelInfo(rootDeviceIndex).heapInfo.pKernelHeap;
}

size_t Kernel::getKernelHeapSize(uint32_t rootDeviceIndex) const {
    return getKernelInfo(rootDeviceIndex).heapInfo.KernelHeapSize;
}

void Kernel::substituteKernelHeap(uint32_t rootDeviceIndex, void *newKernelHeap, size_t newKernelHeapSize) {
    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&getKernelInfo(rootDeviceIndex));
    void **pKernelHeap = const_cast<void **>(&pKernelInfo->heapInfo.pKernelHeap);
    *pKernelHeap = newKernelHeap;
    auto &heapInfo = pKernelInfo->heapInfo;
    heapInfo.KernelHeapSize = static_cast<uint32_t>(newKernelHeapSize);
    pKernelInfo->isKernelHeapSubstituted = true;

    auto memoryManager = executionEnvironment.memoryManager.get();

    auto currentAllocationSize = pKernelInfo->kernelAllocation->getUnderlyingBufferSize();
    bool status = false;
    if (currentAllocationSize >= newKernelHeapSize) {
        status = memoryManager->copyMemoryToAllocation(pKernelInfo->kernelAllocation, 0, newKernelHeap, newKernelHeapSize);
    } else {
        memoryManager->checkGpuUsageAndDestroyGraphicsAllocations(pKernelInfo->kernelAllocation);
        pKernelInfo->kernelAllocation = nullptr;
        status = pKernelInfo->createKernelAllocation(getDevice().getDevice(), isBuiltIn);
    }
    UNRECOVERABLE_IF(!status);
}
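// Note: kernel heap substitution above is intended for instrumentation flows (e.g.
// GTPin-style binary instrumentation) that swap in a modified ISA copy; when the
// existing ISA allocation cannot hold the new heap, a fresh kernel allocation is
// created instead.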
bool Kernel::isKernelHeapSubstituted(uint32_t rootDeviceIndex) const {
    return getKernelInfo(rootDeviceIndex).isKernelHeapSubstituted;
}

uint64_t Kernel::getKernelId(uint32_t rootDeviceIndex) const {
    return getKernelInfo(rootDeviceIndex).kernelId;
}

void Kernel::setKernelId(uint32_t rootDeviceIndex, uint64_t newKernelId) {
    KernelInfo *pKernelInfo = const_cast<KernelInfo *>(&getKernelInfo(rootDeviceIndex));
    pKernelInfo->kernelId = newKernelId;
}

uint32_t Kernel::getStartOffset() const {
    return this->startOffset;
}

void Kernel::setStartOffset(uint32_t offset) {
    this->startOffset = offset;
}

void *Kernel::getSurfaceStateHeap(uint32_t rootDeviceIndex) const {
    return kernelInfos[rootDeviceIndex]->usesSsh ? kernelDeviceInfos[rootDeviceIndex].pSshLocal.get() : nullptr;
}

size_t Kernel::getDynamicStateHeapSize(uint32_t rootDeviceIndex) const {
    return getKernelInfo(rootDeviceIndex).heapInfo.DynamicStateHeapSize;
}

const void *Kernel::getDynamicStateHeap(uint32_t rootDeviceIndex) const {
    return getKernelInfo(rootDeviceIndex).heapInfo.pDsh;
}

size_t Kernel::getSurfaceStateHeapSize(uint32_t rootDeviceIndex) const {
    return kernelInfos[rootDeviceIndex]->usesSsh ? kernelDeviceInfos[rootDeviceIndex].sshLocalSize : 0;
}

size_t Kernel::getNumberOfBindingTableStates(uint32_t rootDeviceIndex) const {
    return kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates;
}

void Kernel::resizeSurfaceStateHeap(uint32_t rootDeviceIndex, void *pNewSsh, size_t newSshSize, size_t newBindingTableCount, size_t newBindingTableOffset) {
    kernelDeviceInfos[rootDeviceIndex].pSshLocal.reset(static_cast<char *>(pNewSsh));
    kernelDeviceInfos[rootDeviceIndex].sshLocalSize = static_cast<uint32_t>(newSshSize);
    kernelDeviceInfos[rootDeviceIndex].numberOfBindingTableStates = newBindingTableCount;
    kernelDeviceInfos[rootDeviceIndex].localBindingTableOffset = newBindingTableOffset;
}

cl_int Kernel::setArg(uint32_t argIndex, size_t argSize, const void *argVal) {
    cl_int retVal = CL_SUCCESS;
    bool updateExposedKernel = true;
    auto argWasUncacheable = false;
    auto &defaultKernelInfo = getDefaultKernelInfo();
    if (defaultKernelInfo.builtinDispatchBuilder != nullptr) {
        updateExposedKernel = defaultKernelInfo.builtinDispatchBuilder->setExplicitArg(argIndex, argSize, argVal, retVal);
    }
    if (updateExposedKernel) {
        if (argIndex >= kernelArgHandlers.size()) {
            return CL_INVALID_ARG_INDEX;
        }
        argWasUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
        auto argHandler = kernelArgHandlers[argIndex];
        retVal = (this->*argHandler)(argIndex, argSize, argVal);
    }
    if (retVal == CL_SUCCESS) {
        if (!kernelArguments[argIndex].isPatched) {
            patchedArgumentsNum++;
            kernelArguments[argIndex].isPatched = true;
        }
        auto argIsUncacheable = kernelArguments[argIndex].isStatelessUncacheable;
        statelessUncacheableArgsCount += (argIsUncacheable ? 1 : 0) - (argWasUncacheable ? 1 : 0);
        resolveArgs();
    }
    return retVal;
}
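// Dispatch note: setArg() above routes through the per-argument handler table that
// initialize() populated (setArgBuffer, setArgImage, setArgSampler, ...). A host call
// such as the following (illustrative host code, not part of this file) therefore
// lands in Kernel::setArgBuffer via kernelArgHandlers[0]:
//
//   clSetKernelArg(kernel, 0, sizeof(cl_mem), &buffer);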
cl_int Kernel::setArg(uint32_t argIndex, uint32_t argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, uint64_t argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, cl_mem argVal) {
    return setArg(argIndex, sizeof(argVal), &argVal);
}

cl_int Kernel::setArg(uint32_t argIndex, cl_mem argVal, uint32_t mipLevel) {
    return setArgImageWithMipLevel(argIndex, sizeof(argVal), &argVal, mipLevel);
}

void *Kernel::patchBufferOffset(const KernelArgInfo &argInfo, void *svmPtr, GraphicsAllocation *svmAlloc, uint32_t rootDeviceIndex) {
    if (isInvalidOffset(argInfo.offsetBufferOffset)) {
        return svmPtr;
    }
    void *ptrToPatch = svmPtr;
    if (svmAlloc != nullptr) {
        ptrToPatch = reinterpret_cast<void *>(svmAlloc->getGpuAddressToPatch());
    }

    constexpr uint32_t minimumAlignment = 4;
    ptrToPatch = alignDown(ptrToPatch, minimumAlignment);
    DEBUG_BREAK_IF(ptrDiff(svmPtr, ptrToPatch) != static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch)));
    uint32_t offsetToPatch = static_cast<uint32_t>(ptrDiff(svmPtr, ptrToPatch));

    patch<uint32_t>(offsetToPatch, getCrossThreadData(rootDeviceIndex), argInfo.offsetBufferOffset);
    return ptrToPatch;
}

cl_int Kernel::setArgSvm(uint32_t argIndex, size_t svmAllocSize, void *svmPtr, GraphicsAllocation *svmAlloc, cl_mem_flags svmFlags) {
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    auto &kernelInfo = getKernelInfo(rootDeviceIndex);
    void *ptrToPatch = patchBufferOffset(kernelInfo.kernelArgInfo[argIndex], svmPtr, svmAlloc, rootDeviceIndex);

    setArgImmediate(argIndex, sizeof(void *), &svmPtr);

    storeKernelArg(argIndex, SVM_OBJ, nullptr, svmPtr, sizeof(void *), svmAlloc, svmFlags);

    if (requiresSshForBuffers(rootDeviceIndex)) {
        const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, svmAllocSize + ptrDiff(svmPtr, ptrToPatch), ptrToPatch, 0, svmAlloc, svmFlags, 0);
    }

    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }

    addAllocationToCacheFlushVector(argIndex, svmAlloc);

    return CL_SUCCESS;
}

cl_int Kernel::setArgSvmAlloc(uint32_t argIndex, void *svmPtr, GraphicsAllocation *svmAlloc) {
    DBG_LOG_INPUTS("setArgBuffer svm_alloc", svmAlloc);

    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    auto &kernelInfo = getKernelInfo(rootDeviceIndex);
    const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];

    storeKernelArg(argIndex, SVM_ALLOC_OBJ, svmAlloc, svmPtr, sizeof(uintptr_t));

    void *ptrToPatch = patchBufferOffset(kernelArgInfo, svmPtr, svmAlloc, rootDeviceIndex);

    auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex), kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
    auto patchSize = kernelArgInfo.kernelArgPatchInfoVector[0].size;

    patchWithRequiredSize(patchLocation, patchSize, reinterpret_cast<uintptr_t>(svmPtr));

    if (requiresSshForBuffers(rootDeviceIndex)) {
        const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];
        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
        size_t allocSize = 0;
        size_t offset = 0;
        if (svmAlloc != nullptr) {
            allocSize = svmAlloc->getUnderlyingBufferSize();
            offset = ptrDiff(ptrToPatch, svmAlloc->getGpuAddressToPatch());
            allocSize -= offset;
        }
        Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, allocSize, ptrToPatch, offset, svmAlloc, 0, 0);
    }

    if (!kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum++;
        kernelArguments[argIndex].isPatched = true;
    }

    addAllocationToCacheFlushVector(argIndex, svmAlloc);

    return CL_SUCCESS;
}
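// Illustrative mapping (not part of this file): setArgSvm() and setArgSvmAlloc() back
// the SVM paths of the API, e.g. clSetKernelArgSVMPointer(kernel, argIndex, svmPtr),
// once the runtime has resolved svmPtr to its backing GraphicsAllocation.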
void Kernel::storeKernelArg(uint32_t argIndex, kernelArgType argType, void *argObject,
                            const void *argValue, size_t argSize,
                            GraphicsAllocation *argSvmAlloc, cl_mem_flags argSvmFlags) {
    kernelArguments[argIndex].type = argType;
    kernelArguments[argIndex].object = argObject;
    kernelArguments[argIndex].value = argValue;
    kernelArguments[argIndex].size = argSize;
    kernelArguments[argIndex].pSvmAlloc = argSvmAlloc;
    kernelArguments[argIndex].svmFlags = argSvmFlags;
}

const void *Kernel::getKernelArg(uint32_t argIndex) const {
    return kernelArguments[argIndex].object;
}

const Kernel::SimpleKernelArgInfo &Kernel::getKernelArgInfo(uint32_t argIndex) const {
    return kernelArguments[argIndex];
}

void Kernel::setSvmKernelExecInfo(GraphicsAllocation *argValue) {
    kernelSvmGfxAllocations.push_back(argValue);
    if (allocationForCacheFlush(argValue)) {
        svmAllocationsRequireCacheFlush = true;
    }
}

void Kernel::clearSvmKernelExecInfo() {
    kernelSvmGfxAllocations.clear();
    svmAllocationsRequireCacheFlush = false;
}

void Kernel::setUnifiedMemoryProperty(cl_kernel_exec_info infoType, bool infoValue) {
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_DEVICE_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectDeviceAllocationsAllowed = infoValue;
        return;
    }
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_HOST_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectHostAllocationsAllowed = infoValue;
        return;
    }
    if (infoType == CL_KERNEL_EXEC_INFO_INDIRECT_SHARED_ACCESS_INTEL) {
        this->unifiedMemoryControls.indirectSharedAllocationsAllowed = infoValue;
        return;
    }
}

void Kernel::setUnifiedMemoryExecInfo(GraphicsAllocation *unifiedMemoryAllocation) {
    kernelUnifiedMemoryGfxAllocations.push_back(unifiedMemoryAllocation);
}

void Kernel::clearUnifiedMemoryExecInfo() {
    kernelUnifiedMemoryGfxAllocations.clear();
}

cl_int Kernel::setKernelExecutionType(cl_execution_info_kernel_type_intel executionType) {
    switch (executionType) {
    case CL_KERNEL_EXEC_INFO_DEFAULT_TYPE_INTEL:
        this->executionType = KernelExecutionType::Default;
        break;
    case CL_KERNEL_EXEC_INFO_CONCURRENT_TYPE_INTEL:
        this->executionType = KernelExecutionType::Concurrent;
        break;
    default: {
        return CL_INVALID_VALUE;
    }
    }
    return CL_SUCCESS;
}
void Kernel::getSuggestedLocalWorkSize(const cl_uint workDim, const size_t *globalWorkSize, const size_t *globalWorkOffset,
                                       size_t *localWorkSize, ClDevice &clDevice) {
    UNRECOVERABLE_IF((workDim == 0) || (workDim > 3));
    UNRECOVERABLE_IF(globalWorkSize == nullptr);
    Vec3<size_t> elws{0, 0, 0};
    Vec3<size_t> gws{
        globalWorkSize[0],
        (workDim > 1) ? globalWorkSize[1] : 0,
        (workDim > 2) ? globalWorkSize[2] : 0};
    Vec3<size_t> offset{0, 0, 0};
    if (globalWorkOffset) {
        offset.x = globalWorkOffset[0];
        if (workDim > 1) {
            offset.y = globalWorkOffset[1];
            if (workDim > 2) {
                offset.z = globalWorkOffset[2];
            }
        }
    }

    const DispatchInfo dispatchInfo{&clDevice, this, workDim, gws, elws, offset};
    auto suggestedLws = computeWorkgroupSize(dispatchInfo);

    localWorkSize[0] = suggestedLws.x;
    if (workDim > 1)
        localWorkSize[1] = suggestedLws.y;
    if (workDim > 2)
        localWorkSize[2] = suggestedLws.z;
}

uint32_t Kernel::getMaxWorkGroupCount(const cl_uint workDim, const size_t *localWorkSize, const CommandQueue *commandQueue) const {
    auto rootDeviceIndex = commandQueue->getDevice().getRootDeviceIndex();
    auto &hardwareInfo = getHardwareInfo(rootDeviceIndex);
    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);
    if (!hwHelper.isCooperativeDispatchSupported(commandQueue->getGpgpuEngine().getEngineType(), hardwareInfo.platform.eProductFamily)) {
        return 0;
    }

    auto executionEnvironment = getKernelInfo(rootDeviceIndex).patchInfo.executionEnvironment;
    auto dssCount = hardwareInfo.gtSystemInfo.DualSubSliceCount;
    if (dssCount == 0) {
        dssCount = hardwareInfo.gtSystemInfo.SubSliceCount;
    }
    auto availableThreadCount = hwHelper.calculateAvailableThreadCount(
        hardwareInfo.platform.eProductFamily,
        ((executionEnvironment != nullptr) ? executionEnvironment->NumGRFRequired : GrfConfig::DefaultGrfNumber),
        hardwareInfo.gtSystemInfo.EUCount, hardwareInfo.gtSystemInfo.ThreadCount / hardwareInfo.gtSystemInfo.EUCount);
    auto hasBarriers = ((executionEnvironment != nullptr) ? executionEnvironment->HasBarriers : 0u);
    return KernelHelper::getMaxWorkGroupCount(kernelInfos[rootDeviceIndex]->getMaxSimdSize(),
                                              availableThreadCount,
                                              dssCount,
                                              dssCount * KB * hardwareInfo.capabilityTable.slmSize,
                                              hwHelper.alignSlmSize(slmTotalSize),
                                              static_cast<uint32_t>(hwHelper.getMaxBarrierRegisterPerSlice()),
                                              hwHelper.getBarriersCountFromHasBarriers(hasBarriers),
                                              workDim,
                                              localWorkSize);
}

inline void Kernel::makeArgsResident(CommandStreamReceiver &commandStreamReceiver) {
    auto numArgs = kernelInfos[commandStreamReceiver.getRootDeviceIndex()]->kernelArgInfo.size();
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
                auto pageFaultManager = executionEnvironment.memoryManager->getPageFaultManager();
                if (pageFaultManager &&
                    this->isUnifiedMemorySyncRequired) {
                    pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(pSVMAlloc->getGpuAddress()));
                }
                commandStreamReceiver.makeResident(*pSVMAlloc);
            } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObjectOrAbort<MemObj>(clMem);
                auto image = castToObject<Image>(clMem);
                if (image && image->isImageFromImage()) {
                    commandStreamReceiver.setSamplerCacheFlushRequired(CommandStreamReceiver::SamplerCacheFlushState::samplerCacheFlushBefore);
                }
                commandStreamReceiver.makeResident(*memObj->getGraphicsAllocation(commandStreamReceiver.getRootDeviceIndex()));
                if (memObj->getMcsAllocation()) {
                    commandStreamReceiver.makeResident(*memObj->getMcsAllocation());
                }
            }
        }
    }
}
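// Residency note: before submission, every allocation the kernel may touch (the
// private/constant/global surfaces, the exported-functions surface, SVM and unified
// memory allocations, argument allocations, and the kernel ISA itself) must be made
// resident on the CommandStreamReceiver, which is what makeResident() below does.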
void Kernel::makeResident(CommandStreamReceiver &commandStreamReceiver) {
    auto rootDeviceIndex = commandStreamReceiver.getRootDeviceIndex();
    if (kernelDeviceInfos[rootDeviceIndex].privateSurface) {
        commandStreamReceiver.makeResident(*kernelDeviceInfos[rootDeviceIndex].privateSurface);
    }

    if (program->getConstantSurface(rootDeviceIndex)) {
        commandStreamReceiver.makeResident(*(program->getConstantSurface(rootDeviceIndex)));
    }

    if (program->getGlobalSurface(rootDeviceIndex)) {
        commandStreamReceiver.makeResident(*(program->getGlobalSurface(rootDeviceIndex)));
    }

    if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
        commandStreamReceiver.makeResident(*(program->getExportedFunctionsSurface(rootDeviceIndex)));
    }

    for (auto gfxAlloc : kernelSvmGfxAllocations) {
        commandStreamReceiver.makeResident(*gfxAlloc);
    }

    auto pageFaultManager = program->peekExecutionEnvironment().memoryManager->getPageFaultManager();

    for (auto gfxAlloc : kernelUnifiedMemoryGfxAllocations) {
        commandStreamReceiver.makeResident(*gfxAlloc);
        if (pageFaultManager) {
            pageFaultManager->moveAllocationToGpuDomain(reinterpret_cast<void *>(gfxAlloc->getGpuAddress()));
        }
    }

    if (unifiedMemoryControls.indirectSharedAllocationsAllowed && pageFaultManager) {
        pageFaultManager->moveAllocationsWithinUMAllocsManagerToGpuDomain(this->getContext().getSVMAllocsManager());
    }
    makeArgsResident(commandStreamReceiver);

    auto kernelIsaAllocation = this->kernelInfos[rootDeviceIndex]->kernelAllocation;
    if (kernelIsaAllocation) {
        commandStreamReceiver.makeResident(*kernelIsaAllocation);
    }

    gtpinNotifyMakeResident(this, &commandStreamReceiver);

    if (unifiedMemoryControls.indirectDeviceAllocationsAllowed ||
        unifiedMemoryControls.indirectHostAllocationsAllowed ||
        unifiedMemoryControls.indirectSharedAllocationsAllowed) {
        this->getContext().getSVMAllocsManager()->makeInternalAllocationsResident(commandStreamReceiver, unifiedMemoryControls.generateMask());
    }
}

void Kernel::getResidency(std::vector<Surface *> &dst, uint32_t rootDeviceIndex) {
    if (kernelDeviceInfos[rootDeviceIndex].privateSurface) {
        GeneralSurface *surface = new GeneralSurface(kernelDeviceInfos[rootDeviceIndex].privateSurface);
        dst.push_back(surface);
    }

    if (program->getConstantSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getConstantSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    if (program->getGlobalSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getGlobalSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    if (program->getExportedFunctionsSurface(rootDeviceIndex)) {
        GeneralSurface *surface = new GeneralSurface(program->getExportedFunctionsSurface(rootDeviceIndex));
        dst.push_back(surface);
    }

    for (auto gfxAlloc : kernelSvmGfxAllocations) {
        GeneralSurface *surface = new GeneralSurface(gfxAlloc);
        dst.push_back(surface);
    }

    auto numArgs = kernelInfos[rootDeviceIndex]->kernelArgInfo.size();
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
                dst.push_back(new GeneralSurface(pSVMAlloc));
            } else if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObject<MemObj>(clMem);
                DEBUG_BREAK_IF(memObj == nullptr);
                dst.push_back(new MemObjSurface(memObj));
            }
        }
    }

    auto kernelIsaAllocation = this->kernelInfos[rootDeviceIndex]->kernelAllocation;
    if (kernelIsaAllocation) {
        GeneralSurface *surface = new GeneralSurface(kernelIsaAllocation);
        dst.push_back(surface);
    }

    gtpinNotifyUpdateResidencyList(this, &dst);
}
bool Kernel::requiresCoherency() {
    auto numArgs = getDefaultKernelInfo().kernelArgInfo.size();
    for (decltype(numArgs) argIndex = 0; argIndex < numArgs; argIndex++) {
        if (kernelArguments[argIndex].object) {
            if (kernelArguments[argIndex].type == SVM_ALLOC_OBJ) {
                auto pSVMAlloc = (GraphicsAllocation *)kernelArguments[argIndex].object;
                if (pSVMAlloc->isCoherent()) {
                    return true;
                }
            }

            if (Kernel::isMemObj(kernelArguments[argIndex].type)) {
                auto clMem = const_cast<cl_mem>(static_cast<const _cl_mem *>(kernelArguments[argIndex].object));
                auto memObj = castToObjectOrAbort<MemObj>(clMem);
                if (memObj->getMultiGraphicsAllocation().isCoherent()) {
                    return true;
                }
            }
        }
    }
    return false;
}

cl_int Kernel::setArgLocal(uint32_t argIndex, size_t argSize, const void *argVal) {
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
    auto &defaultKernelInfo = getDefaultKernelInfo();

    storeKernelArg(argIndex, SLM_OBJ, nullptr, argVal, argSize);

    slmSizes[argIndex] = argSize;

    // Extract our current slmOffset
    auto slmOffset = *ptrOffset(crossThreadData, defaultKernelInfo.kernelArgInfo[argIndex].kernelArgPatchInfoVector[0].crossthreadOffset);

    // Add our size
    slmOffset += static_cast<uint32_t>(argSize);

    // Update all slm offsets after this argIndex
    ++argIndex;
    while (argIndex < slmSizes.size()) {
        const auto &kernelArgInfo = defaultKernelInfo.kernelArgInfo[argIndex];
        auto slmAlignment = kernelArgInfo.slmAlignment;

        // If a local argument, alignment should be non-zero
        if (slmAlignment) {
            // Align to the specified alignment
            slmOffset = alignUp(slmOffset, slmAlignment);

            // Patch our new offset into cross thread data
            auto patchLocation = ptrOffset(crossThreadData, kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
            *patchLocation = slmOffset;
        }

        slmOffset += static_cast<uint32_t>(slmSizes[argIndex]);
        ++argIndex;
    }

    slmTotalSize = defaultKernelInfo.workloadInfo.slmStaticSize + alignUp(slmOffset, KB);

    return CL_SUCCESS;
}
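// Worked example for the __local offset fixup above (hypothetical layout): with three
// local args of sizes {12, 100, 20} and slmAlignment = 16 on args 1 and 2, setting
// arg 0 re-patches arg 1 to alignUp(12, 16) = 16 and arg 2 to alignUp(16 + 100, 16) = 128;
// the running offset ends at 128 + 20 = 148, so slmTotalSize becomes
// slmStaticSize + alignUp(148, KB) = slmStaticSize + 1024.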
cl_int Kernel::setArgBuffer(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (argSize != sizeof(cl_mem *))
        return CL_INVALID_ARG_SIZE;

    auto clMem = reinterpret_cast<const cl_mem *>(argVal);
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
    patchBufferOffset(kernelArgInfo, nullptr, nullptr, rootDeviceIndex);

    if (clMem && *clMem) {
        auto clMemObj = *clMem;
        DBG_LOG_INPUTS("setArgBuffer cl_mem", clMemObj);

        storeKernelArg(argIndex, BUFFER_OBJ, clMemObj, argVal, argSize);

        auto buffer = castToObject<Buffer>(clMemObj);
        if (!buffer)
            return CL_INVALID_MEM_OBJECT;

        auto graphicsAllocation = buffer->getGraphicsAllocation(rootDeviceIndex);

        if (buffer->peekSharingHandler()) {
            usingSharedObjArgs = true;
        }

        auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex), kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
        auto patchSize = kernelArgInfo.kernelArgPatchInfoVector[0].size;

        uint64_t addressToPatch = buffer->setArgStateless(patchLocation, patchSize, rootDeviceIndex, !this->isBuiltIn);

        if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
            PatchInfoData patchInfoData(addressToPatch - buffer->getOffset(), static_cast<uint64_t>(buffer->getOffset()),
                                        PatchInfoAllocationType::KernelArg, reinterpret_cast<uint64_t>(getCrossThreadData(rootDeviceIndex)),
                                        static_cast<uint64_t>(kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset),
                                        PatchInfoAllocationType::IndirectObjectHeap, patchSize);
            this->patchInfoDataList.push_back(patchInfoData);
        }

        bool disableL3 = false;
        bool forceNonAuxMode = false;
        bool isAuxTranslationKernel = (AuxTranslationDirection::None != auxTranslationDirection);

        if (isAuxTranslationKernel) {
            if (((AuxTranslationDirection::AuxToNonAux == auxTranslationDirection) && argIndex == 1) ||
                ((AuxTranslationDirection::NonAuxToAux == auxTranslationDirection) && argIndex == 0)) {
                forceNonAuxMode = true;
            }
            disableL3 = (argIndex == 0);
        } else if (graphicsAllocation->getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_COMPRESSED &&
                   !kernelArgInfo.pureStatefulBufferAccess) {
            forceNonAuxMode = true;
        }

        if (requiresSshForBuffers(rootDeviceIndex)) {
            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            buffer->setArgStateful(surfaceState, forceNonAuxMode, disableL3, isAuxTranslationKernel, kernelArgInfo.isReadOnly, getDevice().getDevice());
        }

        kernelArguments[argIndex].isStatelessUncacheable = kernelArgInfo.pureStatefulBufferAccess ? false : buffer->isMemObjUncacheable();

        auto allocationForCacheFlush = graphicsAllocation;

        // if the object is uncacheable through its surface state and there are no stateless accesses, then there is no need to flush caches
        if (buffer->isMemObjUncacheableForSurfaceState() && kernelArgInfo.pureStatefulBufferAccess) {
            allocationForCacheFlush = nullptr;
        }

        addAllocationToCacheFlushVector(argIndex, allocationForCacheFlush);

        return CL_SUCCESS;
    } else {
        auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex), kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
        patchWithRequiredSize(patchLocation, kernelArgInfo.kernelArgPatchInfoVector[0].size, 0u);

        storeKernelArg(argIndex, BUFFER_OBJ, nullptr, argVal, argSize);

        if (requiresSshForBuffers(rootDeviceIndex)) {
            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState, 0, nullptr, 0, nullptr, 0, 0);
        }

        return CL_SUCCESS;
    }
}

cl_int Kernel::setArgPipe(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (argSize != sizeof(cl_mem *)) {
        return CL_INVALID_ARG_SIZE;
    }

    auto clMem = reinterpret_cast<const cl_mem *>(argVal);

    if (clMem && *clMem) {
        auto clMemObj = *clMem;
        DBG_LOG_INPUTS("setArgPipe cl_mem", clMemObj);

        storeKernelArg(argIndex, PIPE_OBJ, clMemObj, argVal, argSize);

        auto memObj = castToObject<MemObj>(clMemObj);
        if (!memObj) {
            return CL_INVALID_MEM_OBJECT;
        }

        auto pipe = castToObject<Pipe>(clMemObj);
        if (!pipe) {
            return CL_INVALID_ARG_VALUE;
        }

        if (memObj->getContext() != &(this->getContext())) {
            return CL_INVALID_MEM_OBJECT;
        }

        auto rootDeviceIndex = getDevice().getRootDeviceIndex();
        const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
        auto patchLocation = ptrOffset(getCrossThreadData(rootDeviceIndex), kernelArgInfo.kernelArgPatchInfoVector[0].crossthreadOffset);
        auto patchSize = kernelArgInfo.kernelArgPatchInfoVector[0].size;

        pipe->setPipeArg(patchLocation, patchSize, rootDeviceIndex);

        auto graphicsAllocation = pipe->getGraphicsAllocation(rootDeviceIndex);

        if (requiresSshForBuffers(rootDeviceIndex)) {
            auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
            Buffer::setSurfaceState(&getDevice().getDevice(), surfaceState,
                                    pipe->getSize(), pipe->getCpuAddress(), 0,
                                    graphicsAllocation, 0, 0);
        }

        return CL_SUCCESS;
    } else {
        return CL_INVALID_MEM_OBJECT;
    }
}

cl_int Kernel::setArgImage(uint32_t argIndex, size_t argSize, const void *argVal) {
    return setArgImageWithMipLevel(argIndex, argSize, argVal, 0u);
}

cl_int Kernel::setArgImageWithMipLevel(uint32_t argIndex, size_t argSize, const void *argVal, uint32_t mipLevel) {
    auto retVal = CL_INVALID_ARG_VALUE;
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();
    auto &kernelInfo = getKernelInfo(rootDeviceIndex);
    patchBufferOffset(kernelInfo.kernelArgInfo[argIndex], nullptr, nullptr, rootDeviceIndex);

    auto clMemObj = *(static_cast<const cl_mem *>(argVal));
    auto pImage = castToObject<Image>(clMemObj);

    if (pImage && argSize == sizeof(cl_mem *)) {
        if (pImage->peekSharingHandler()) {
            usingSharedObjArgs = true;
        }

        const auto &kernelArgInfo = kernelInfo.kernelArgInfo[argIndex];

        DBG_LOG_INPUTS("setArgImage cl_mem", clMemObj);

        storeKernelArg(argIndex, IMAGE_OBJ, clMemObj, argVal, argSize);

        auto surfaceState = ptrOffset(getSurfaceStateHeap(rootDeviceIndex), kernelArgInfo.offsetHeap);
        DEBUG_BREAK_IF(!kernelArgInfo.isImage);

        // Sets SS structure
        if (kernelArgInfo.isMediaImage) {
            DEBUG_BREAK_IF(!kernelInfo.isVmeWorkload);
            pImage->setMediaImageArg(surfaceState, rootDeviceIndex);
        } else {
            pImage->setImageArg(surfaceState, kernelArgInfo.isMediaBlockImage, mipLevel, rootDeviceIndex);
        }

        auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
        auto &imageDesc = pImage->getImageDesc();
        auto &imageFormat = pImage->getImageFormat();
        auto graphicsAllocation = pImage->getGraphicsAllocation(rootDeviceIndex);

        if (imageDesc.image_type == CL_MEM_OBJECT_IMAGE3D) {
            imageTransformer->registerImage3d(argIndex);
        }

        patch(imageDesc.image_width, crossThreadData, kernelArgInfo.offsetImgWidth);
        patch(imageDesc.image_height, crossThreadData, kernelArgInfo.offsetImgHeight);
        patch(imageDesc.image_depth, crossThreadData, kernelArgInfo.offsetImgDepth);
        patch(imageDesc.num_samples, crossThreadData, kernelArgInfo.offsetNumSamples);
        patch(imageDesc.image_array_size, crossThreadData, kernelArgInfo.offsetArraySize);
        patch(imageFormat.image_channel_data_type, crossThreadData, kernelArgInfo.offsetChannelDataType);
        patch(imageFormat.image_channel_order, crossThreadData, kernelArgInfo.offsetChannelOrder);
        patch(kernelArgInfo.offsetHeap, crossThreadData, kernelArgInfo.offsetObjectId);
        patch(imageDesc.num_mip_levels, crossThreadData, kernelArgInfo.offsetNumMipLevels);

        auto pixelSize = pImage->getSurfaceFormatInfo().surfaceFormat.ImageElementSizeInBytes;
        patch(graphicsAllocation->getGpuAddress(), crossThreadData, kernelArgInfo.offsetFlatBaseOffset);
        patch((imageDesc.image_width * pixelSize) - 1, crossThreadData, kernelArgInfo.offsetFlatWidth);
        patch((imageDesc.image_height * pixelSize) - 1, crossThreadData, kernelArgInfo.offsetFlatHeight);
        patch(imageDesc.image_row_pitch - 1, crossThreadData, kernelArgInfo.offsetFlatPitch);

        addAllocationToCacheFlushVector(argIndex, graphicsAllocation);
        retVal = CL_SUCCESS;
    }
    return retVal;
}

cl_int Kernel::setArgImmediate(uint32_t argIndex, size_t argSize, const void *argVal) {
    auto retVal = CL_INVALID_ARG_VALUE;
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();

    if (argVal) {
        const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];
        DEBUG_BREAK_IF(kernelArgInfo.kernelArgPatchInfoVector.size() <= 0);

        storeKernelArg(argIndex, NONE_OBJ, nullptr, nullptr, argSize);

        auto crossThreadData = getCrossThreadData(rootDeviceIndex);
        auto crossThreadDataEnd = ptrOffset(crossThreadData, getCrossThreadDataSize(rootDeviceIndex));

        for (const auto &kernelArgPatchInfo : kernelArgInfo.kernelArgPatchInfoVector) {
            DEBUG_BREAK_IF(kernelArgPatchInfo.size <= 0);
            auto pDst = ptrOffset(crossThreadData, kernelArgPatchInfo.crossthreadOffset);
            auto pSrc = ptrOffset(argVal, kernelArgPatchInfo.sourceOffset);

            DEBUG_BREAK_IF(!(ptrOffset(pDst, kernelArgPatchInfo.size) <= crossThreadDataEnd));
            UNUSED_VARIABLE(crossThreadDataEnd);

            if (kernelArgPatchInfo.sourceOffset < argSize) {
                size_t maxBytesToCopy = argSize - kernelArgPatchInfo.sourceOffset;
                size_t bytesToCopy = std::min(static_cast<size_t>(kernelArgPatchInfo.size), maxBytesToCopy);
                memcpy_s(pDst, kernelArgPatchInfo.size, pSrc, bytesToCopy);
            }
        }

        retVal = CL_SUCCESS;
    }

    return retVal;
}
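// Sampler arguments are reference counted below: the incoming sampler is retained before any
// previously bound sampler is released, so rebinding the same sampler object is safe.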
cl_int Kernel::setArgSampler(uint32_t argIndex, size_t argSize, const void *argVal) {
    auto retVal = CL_INVALID_SAMPLER;

    if (!argVal) {
        return retVal;
    }

    auto clSamplerObj = *(static_cast<const cl_sampler *>(argVal));
    auto pSampler = castToObject<Sampler>(clSamplerObj);
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();

    if (pSampler) {
        pSampler->incRefInternal();
    }

    if (kernelArguments.at(argIndex).object) {
        auto oldSampler = castToObject<Sampler>(kernelArguments.at(argIndex).object);
        UNRECOVERABLE_IF(!oldSampler);
        oldSampler->decRefInternal();
    }

    if (pSampler && argSize == sizeof(cl_sampler *)) {
        const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];

        storeKernelArg(argIndex, SAMPLER_OBJ, clSamplerObj, argVal, argSize);

        auto dsh = getDynamicStateHeap(rootDeviceIndex);
        auto samplerState = ptrOffset(dsh, kernelArgInfo.offsetHeap);

        pSampler->setArg(const_cast<void *>(samplerState));

        auto crossThreadData = reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex));
        patch(pSampler->getSnapWaValue(), crossThreadData, kernelArgInfo.offsetSamplerSnapWa);
        patch(GetAddrModeEnum(pSampler->addressingMode), crossThreadData, kernelArgInfo.offsetSamplerAddressingMode);
        patch(GetNormCoordsEnum(pSampler->normalizedCoordinates), crossThreadData, kernelArgInfo.offsetSamplerNormalizedCoords);
        patch(SAMPLER_OBJECT_ID_SHIFT + kernelArgInfo.offsetHeap, crossThreadData, kernelArgInfo.offsetObjectId);

        retVal = CL_SUCCESS;
    }

    return retVal;
}

cl_int Kernel::setArgAccelerator(uint32_t argIndex, size_t argSize, const void *argVal) {
    auto retVal = CL_INVALID_ARG_VALUE;

    if (argSize != sizeof(cl_accelerator_intel)) {
        return CL_INVALID_ARG_SIZE;
    }

    if (!argVal) {
        return retVal;
    }

    auto clAcceleratorObj = *(static_cast<const cl_accelerator_intel *>(argVal));
    DBG_LOG_INPUTS("setArgAccelerator cl_mem", clAcceleratorObj);

    const auto pAccelerator = castToObject<IntelAccelerator>(clAcceleratorObj);
    auto rootDeviceIndex = getDevice().getRootDeviceIndex();

    if (pAccelerator) {
        storeKernelArg(argIndex, ACCELERATOR_OBJ, clAcceleratorObj, argVal, argSize);

        const auto &kernelArgInfo = getKernelInfo(rootDeviceIndex).kernelArgInfo[argIndex];

        if (kernelArgInfo.samplerArgumentType == iOpenCL::SAMPLER_OBJECT_VME) {
            auto crossThreadData = getCrossThreadData(rootDeviceIndex);

            const auto pVmeAccelerator = castToObjectOrAbort<VmeAccelerator>(pAccelerator);
            auto pDesc = static_cast<const cl_motion_estimation_desc_intel *>(pVmeAccelerator->getDescriptor());
            DEBUG_BREAK_IF(!pDesc);

            auto pVmeMbBlockTypeDst = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, kernelArgInfo.offsetVmeMbBlockType));
            *pVmeMbBlockTypeDst = pDesc->mb_block_type;

            auto pVmeSubpixelMode = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, kernelArgInfo.offsetVmeSubpixelMode));
            *pVmeSubpixelMode = pDesc->subpixel_mode;

            auto pVmeSadAdjustMode = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, kernelArgInfo.offsetVmeSadAdjustMode));
            *pVmeSadAdjustMode = pDesc->sad_adjust_mode;

            auto pVmeSearchPathType = reinterpret_cast<cl_uint *>(ptrOffset(crossThreadData, kernelArgInfo.offsetVmeSearchPathType));
            *pVmeSearchPathType = pDesc->search_path_type;

            retVal = CL_SUCCESS;
        } else if (kernelArgInfo.samplerArgumentType == iOpenCL::SAMPLER_OBJECT_VE) {
            retVal = CL_SUCCESS;
        }
    }

    return retVal;
}
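// Device-side queue arguments are patched purely through cross-thread data: the queue
// buffer's patchable GPU address is written at the argument's crossthreadOffset below.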
cl_int Kernel::setArgDevQueue(uint32_t argIndex, size_t argSize, const void *argVal) {
    if (argVal == nullptr) {
        return CL_INVALID_ARG_VALUE;
    }

    if (argSize != sizeof(cl_command_queue)) {
        return CL_INVALID_ARG_SIZE;
    }

    auto clDeviceQueue = *(static_cast<const cl_command_queue *>(argVal));
    auto pDeviceQueue = castToObject<DeviceQueue>(clDeviceQueue);

    if (pDeviceQueue == nullptr) {
        return CL_INVALID_DEVICE_QUEUE;
    }

    auto rootDeviceIndex = pDeviceQueue->getDevice().getRootDeviceIndex();

    storeKernelArg(argIndex, DEVICE_QUEUE_OBJ, clDeviceQueue, argVal, argSize);

    const auto &kernelArgPatchInfo = kernelInfos[rootDeviceIndex]->kernelArgInfo[argIndex].kernelArgPatchInfoVector[0];
    auto patchLocation = ptrOffset(reinterpret_cast<uint32_t *>(getCrossThreadData(rootDeviceIndex)), kernelArgPatchInfo.crossthreadOffset);
    patchWithRequiredSize(patchLocation, kernelArgPatchInfo.size,
                          static_cast<uintptr_t>(pDeviceQueue->getQueueBuffer()->getGpuAddressToPatch()));

    return CL_SUCCESS;
}

void Kernel::setKernelArgHandler(uint32_t argIndex, KernelArgHandler handler) {
    if (kernelArgHandlers.size() <= argIndex) {
        kernelArgHandlers.resize(argIndex + 1);
    }
    kernelArgHandlers[argIndex] = handler;
}

void Kernel::unsetArg(uint32_t argIndex) {
    if (kernelArguments[argIndex].isPatched) {
        patchedArgumentsNum--;
        kernelArguments[argIndex].isPatched = false;
        if (kernelArguments[argIndex].isStatelessUncacheable) {
            statelessUncacheableArgsCount--;
            kernelArguments[argIndex].isStatelessUncacheable = false;
        }
    }
}

void Kernel::createReflectionSurface() {
    auto pClDevice = program->getDevices()[0];
    auto rootDeviceIndex = pClDevice->getRootDeviceIndex();
    if (this->isParentKernel && kernelReflectionSurface == nullptr) {
        auto &hwInfo = pClDevice->getHardwareInfo();
        auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
        BlockKernelManager *blockManager = program->getBlockKernelManager();
        uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());

        ObjectCounts objectCount;
        getParentObjectCounts(objectCount);
        uint32_t parentImageCount = objectCount.imageCount;
        uint32_t parentSamplerCount = objectCount.samplerCount;
        size_t maxConstantBufferSize = 0;

        std::vector<IGIL_KernelCurbeParams> *curbeParamsForBlocks = new std::vector<IGIL_KernelCurbeParams>[blockCount];

        uint64_t *tokenMask = new uint64_t[blockCount];
        uint32_t *sshTokenOffsetsFromKernelData = new uint32_t[blockCount];

        size_t kernelReflectionSize = alignUp(sizeof(IGIL_KernelDataHeader) + blockCount * sizeof(IGIL_KernelAddressData), sizeof(void *));
        uint32_t kernelDataOffset = static_cast<uint32_t>(kernelReflectionSize);
        uint32_t parentSSHAlignedSize = alignUp(this->kernelInfos[rootDeviceIndex]->heapInfo.SurfaceStateHeapSize, hwHelper.getBindingTableStateAlignement());
        uint32_t btOffset = parentSSHAlignedSize;

        for (uint32_t i = 0; i < blockCount; i++) {
            const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
            size_t samplerStateAndBorderColorSize = 0;

            uint32_t firstSSHTokenIndex = 0;

            ReflectionSurfaceHelper::getCurbeParams(curbeParamsForBlocks[i], tokenMask[i], firstSSHTokenIndex, *pBlockInfo, hwInfo);

            maxConstantBufferSize = std::max(maxConstantBufferSize, static_cast<size_t>(pBlockInfo->patchInfo.dataParameterStream->DataParameterStreamSize));

            samplerStateAndBorderColorSize = pBlockInfo->getSamplerStateArraySize(hwInfo);
            samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, Sampler::samplerStateArrayAlignment);
            samplerStateAndBorderColorSize += pBlockInfo->getBorderColorStateSize();
            samplerStateAndBorderColorSize = alignUp(samplerStateAndBorderColorSize, sizeof(void *));

            sshTokenOffsetsFromKernelData[i] = offsetof(IGIL_KernelData, m_data) + sizeof(IGIL_KernelCurbeParams) * firstSSHTokenIndex;

            kernelReflectionSize += alignUp(sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams) * curbeParamsForBlocks[i].size(), sizeof(void *));
            kernelReflectionSize += parentSamplerCount * sizeof(IGIL_SamplerParams) +
samplerStateAndBorderColorSize; } maxConstantBufferSize = alignUp(maxConstantBufferSize, sizeof(void *)); kernelReflectionSize += blockCount * alignUp(maxConstantBufferSize, sizeof(void *)); kernelReflectionSize += parentImageCount * sizeof(IGIL_ImageParamters); kernelReflectionSize += parentSamplerCount * sizeof(IGIL_ParentSamplerParams); kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties( {pClDevice->getRootDeviceIndex(), kernelReflectionSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, pClDevice->getDeviceBitfield()}); for (uint32_t i = 0; i < blockCount; i++) { const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i); uint32_t newKernelDataOffset = ReflectionSurfaceHelper::setKernelData(kernelReflectionSurface->getUnderlyingBuffer(), kernelDataOffset, curbeParamsForBlocks[i], tokenMask[i], maxConstantBufferSize, parentSamplerCount, *pBlockInfo, hwInfo); uint32_t offset = static_cast(offsetof(IGIL_KernelDataHeader, m_data) + sizeof(IGIL_KernelAddressData) * i); uint32_t samplerHeapOffset = static_cast(alignUp(kernelDataOffset + sizeof(IGIL_KernelData) + curbeParamsForBlocks[i].size() * sizeof(IGIL_KernelCurbeParams), sizeof(void *))); uint32_t samplerHeapSize = static_cast(alignUp(pBlockInfo->getSamplerStateArraySize(hwInfo), Sampler::samplerStateArrayAlignment) + pBlockInfo->getBorderColorStateSize()); uint32_t constantBufferOffset = alignUp(samplerHeapOffset + samplerHeapSize, sizeof(void *)); uint32_t samplerParamsOffset = 0; if (parentSamplerCount) { samplerParamsOffset = newKernelDataOffset - sizeof(IGIL_SamplerParams) * parentSamplerCount; IGIL_SamplerParams *pSamplerParams = (IGIL_SamplerParams *)ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), samplerParamsOffset); uint32_t sampler = 0; for (uint32_t argID = 0; argID < pBlockInfo->kernelArgInfo.size(); argID++) { if (pBlockInfo->kernelArgInfo[argID].isSampler) { pSamplerParams[sampler].m_ArgID = argID; pSamplerParams[sampler].m_SamplerStateOffset = pBlockInfo->kernelArgInfo[argID].offsetHeap; sampler++; } } } ReflectionSurfaceHelper::setKernelAddressData(kernelReflectionSurface->getUnderlyingBuffer(), offset, kernelDataOffset, samplerHeapOffset, constantBufferOffset, samplerParamsOffset, sshTokenOffsetsFromKernelData[i] + kernelDataOffset, btOffset, *pBlockInfo, hwInfo); if (samplerHeapSize > 0) { void *pDst = ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), samplerHeapOffset); const void *pSrc = ptrOffset(pBlockInfo->heapInfo.pDsh, pBlockInfo->getBorderColorOffset()); memcpy_s(pDst, samplerHeapSize, pSrc, samplerHeapSize); } void *pDst = ptrOffset(kernelReflectionSurface->getUnderlyingBuffer(), constantBufferOffset); const char *pSrc = pBlockInfo->crossThreadData; memcpy_s(pDst, pBlockInfo->getConstantBufferSize(), pSrc, pBlockInfo->getConstantBufferSize()); btOffset += pBlockInfo->patchInfo.bindingTableState->Offset; kernelDataOffset = newKernelDataOffset; } uint32_t samplerOffset = 0; if (parentSamplerCount) { samplerOffset = kernelDataOffset + parentImageCount * sizeof(IGIL_ImageParamters); } ReflectionSurfaceHelper::setKernelDataHeader(kernelReflectionSurface->getUnderlyingBuffer(), blockCount, parentImageCount, parentSamplerCount, kernelDataOffset, samplerOffset); delete[] curbeParamsForBlocks; delete[] tokenMask; delete[] sshTokenOffsetsFromKernelData; // Patch constant values once after reflection surface creation patchBlocksCurbeWithConstantValues(); } if (DebugManager.flags.ForceDispatchScheduler.get()) { if 
(this->isSchedulerKernel && kernelReflectionSurface == nullptr) { kernelReflectionSurface = executionEnvironment.memoryManager->allocateGraphicsMemoryWithProperties( {pClDevice->getRootDeviceIndex(), MemoryConstants::pageSize, GraphicsAllocation::AllocationType::DEVICE_QUEUE_BUFFER, pClDevice->getDeviceBitfield()}); } } } void Kernel::getParentObjectCounts(ObjectCounts &objectCount) { objectCount.imageCount = 0; objectCount.samplerCount = 0; DEBUG_BREAK_IF(!isParentKernel); for (const auto &arg : this->kernelArguments) { if (arg.type == SAMPLER_OBJ) { objectCount.samplerCount++; } else if (arg.type == IMAGE_OBJ) { objectCount.imageCount++; } } } bool Kernel::hasPrintfOutput(uint32_t rootDeviceIndex) const { return getKernelInfo(rootDeviceIndex).patchInfo.pAllocateStatelessPrintfSurface != nullptr; } size_t Kernel::getInstructionHeapSizeForExecutionModel() const { BlockKernelManager *blockManager = program->getBlockKernelManager(); uint32_t blockCount = static_cast(blockManager->getCount()); size_t totalSize = 0; if (isParentKernel) { totalSize = kernelBinaryAlignement - 1; // for initial alignment for (uint32_t i = 0; i < blockCount; i++) { const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i); totalSize += pBlockInfo->heapInfo.KernelHeapSize; totalSize = alignUp(totalSize, kernelBinaryAlignement); } } return totalSize; } void Kernel::patchBlocksCurbeWithConstantValues() { auto rootDeviceIndex = program->getDevices()[0]->getRootDeviceIndex(); BlockKernelManager *blockManager = program->getBlockKernelManager(); uint32_t blockCount = static_cast(blockManager->getCount()); uint64_t globalMemoryGpuAddress = program->getGlobalSurface(rootDeviceIndex) != nullptr ? program->getGlobalSurface(rootDeviceIndex)->getGpuAddressToPatch() : 0; uint64_t constantMemoryGpuAddress = program->getConstantSurface(rootDeviceIndex) != nullptr ? 
program->getConstantSurface(rootDeviceIndex)->getGpuAddressToPatch() : 0; for (uint32_t blockID = 0; blockID < blockCount; blockID++) { const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(blockID); uint64_t globalMemoryCurbeOffset = ReflectionSurfaceHelper::undefinedOffset; uint32_t globalMemoryPatchSize = 0; uint64_t constantMemoryCurbeOffset = ReflectionSurfaceHelper::undefinedOffset; uint32_t constantMemoryPatchSize = 0; if (pBlockInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization) { globalMemoryCurbeOffset = pBlockInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization->DataParamOffset; globalMemoryPatchSize = pBlockInfo->patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization->DataParamSize; } if (pBlockInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization) { constantMemoryCurbeOffset = pBlockInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization->DataParamOffset; constantMemoryPatchSize = pBlockInfo->patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization->DataParamSize; } ReflectionSurfaceHelper::patchBlocksCurbeWithConstantValues(kernelReflectionSurface->getUnderlyingBuffer(), blockID, globalMemoryCurbeOffset, globalMemoryPatchSize, globalMemoryGpuAddress, constantMemoryCurbeOffset, constantMemoryPatchSize, constantMemoryGpuAddress, ReflectionSurfaceHelper::undefinedOffset, 0, 0); } } void Kernel::ReflectionSurfaceHelper::getCurbeParams(std::vector &curbeParamsOut, uint64_t &tokenMaskOut, uint32_t &firstSSHTokenIndex, const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) { size_t numArgs = kernelInfo.kernelArgInfo.size(); size_t patchTokenCount = +kernelInfo.kernelNonArgInfo.size(); uint64_t tokenMask = 0; tokenMaskOut = 0; firstSSHTokenIndex = 0; curbeParamsOut.reserve(patchTokenCount * 5); uint32_t bindingTableIndex = 253; for (uint32_t argNumber = 0; argNumber < numArgs; argNumber++) { IGIL_KernelCurbeParams curbeParam; bindingTableIndex = 253; auto sizeOfkernelArgForSSH = kernelInfo.gpuPointerSize; if (kernelInfo.kernelArgInfo[argNumber].isBuffer) { curbeParam.m_patchOffset = kernelInfo.kernelArgInfo[argNumber].kernelArgPatchInfoVector[0].crossthreadOffset; curbeParam.m_parameterSize = kernelInfo.gpuPointerSize; curbeParam.m_parameterType = COMPILER_DATA_PARAMETER_GLOBAL_SURFACE; curbeParam.m_sourceOffset = argNumber; curbeParamsOut.push_back(curbeParam); tokenMask |= shiftLeftBy(63); } else if (kernelInfo.kernelArgInfo[argNumber].isImage) { if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetImgWidth)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_IMAGE_WIDTH + 50, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetImgWidth, argNumber}); } if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetImgHeight)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_IMAGE_HEIGHT + 50, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetImgHeight, argNumber}); } if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetImgDepth)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_IMAGE_DEPTH + 50, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetImgDepth, argNumber}); } if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetChannelDataType)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_IMAGE_CHANNEL_DATA_TYPE + 50, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetChannelDataType, argNumber}); } if 
(isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetChannelOrder)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_IMAGE_CHANNEL_ORDER + 50, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetChannelOrder, argNumber}); } if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetArraySize)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_IMAGE_ARRAY_SIZE + 50, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetArraySize, argNumber}); } if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetObjectId)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_OBJECT_ID + 50, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetObjectId, argNumber}); } tokenMask |= shiftLeftBy(50); if (kernelInfo.patchInfo.bindingTableState) { auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily); const void *ssh = static_cast(kernelInfo.heapInfo.pSsh) + kernelInfo.patchInfo.bindingTableState->Offset; for (uint32_t i = 0; i < kernelInfo.patchInfo.bindingTableState->Count; i++) { uint32_t pointer = hwHelper.getBindingTableStateSurfaceStatePointer(ssh, i); if (pointer == kernelInfo.kernelArgInfo[argNumber].offsetHeap) { bindingTableIndex = i; break; } } DEBUG_BREAK_IF(!((bindingTableIndex != 253) || (kernelInfo.patchInfo.bindingTableState->Count == 0))); } } else if (kernelInfo.kernelArgInfo[argNumber].isSampler) { if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetSamplerSnapWa)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_SAMPLER_COORDINATE_SNAP_WA_REQUIRED + 100, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetSamplerSnapWa, argNumber}); } if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetSamplerAddressingMode)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_SAMPLER_ADDRESS_MODE + 100, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetSamplerAddressingMode, argNumber}); } if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetSamplerNormalizedCoords)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_SAMPLER_NORMALIZED_COORDS + 100, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetSamplerNormalizedCoords, argNumber}); } if (isValidOffset(kernelInfo.kernelArgInfo[argNumber].offsetObjectId)) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_OBJECT_ID + 100, sizeof(uint32_t), kernelInfo.kernelArgInfo[argNumber].offsetObjectId, argNumber}); } tokenMask |= shiftLeftBy(51); } else { bindingTableIndex = 0; sizeOfkernelArgForSSH = 0; } curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{1024, sizeOfkernelArgForSSH, bindingTableIndex, argNumber}); if (kernelInfo.kernelArgInfo[argNumber].slmAlignment != 0) { DEBUG_BREAK_IF(kernelInfo.kernelArgInfo[argNumber].kernelArgPatchInfoVector.size() != 1); uint32_t offset = kernelInfo.kernelArgInfo[argNumber].kernelArgPatchInfoVector[0].crossthreadOffset; uint32_t srcOffset = kernelInfo.kernelArgInfo[argNumber].slmAlignment; curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES, 0, offset, srcOffset}); tokenMask |= shiftLeftBy(DATA_PARAMETER_SUM_OF_LOCAL_MEMORY_OBJECT_ARGUMENT_SIZES); } } for (auto param : kernelInfo.patchInfo.dataParameterBuffersKernelArgs) { curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_KERNEL_ARGUMENT, param->DataSize, param->Offset, param->ArgumentNumber}); tokenMask |= shiftLeftBy(DATA_PARAMETER_KERNEL_ARGUMENT); } for (uint32_t i = 0; i < 3; i++) { 
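        // One iteration per work dimension: each defined workload offset (enqueued local size,
        // global offset/size, local size, number of work groups) becomes one curbe token whose
        // source offset i * sizeOfParam selects the x, y, or z element of the runtime triple.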
const uint32_t sizeOfParam = 4;
        if (kernelInfo.workloadInfo.enqueuedLocalWorkSizeOffsets[i] != WorkloadInfo::undefinedOffset) {
            curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE, sizeOfParam, kernelInfo.workloadInfo.enqueuedLocalWorkSizeOffsets[i], i * sizeOfParam});
            tokenMask |= shiftLeftBy(DATA_PARAMETER_ENQUEUED_LOCAL_WORK_SIZE);
        }
        if (kernelInfo.workloadInfo.globalWorkOffsetOffsets[i] != WorkloadInfo::undefinedOffset) {
            curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_GLOBAL_WORK_OFFSET, sizeOfParam, kernelInfo.workloadInfo.globalWorkOffsetOffsets[i], i * sizeOfParam});
            tokenMask |= shiftLeftBy(DATA_PARAMETER_GLOBAL_WORK_OFFSET);
        }
        if (kernelInfo.workloadInfo.globalWorkSizeOffsets[i] != WorkloadInfo::undefinedOffset) {
            curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_GLOBAL_WORK_SIZE, sizeOfParam, kernelInfo.workloadInfo.globalWorkSizeOffsets[i], i * sizeOfParam});
            tokenMask |= shiftLeftBy(DATA_PARAMETER_GLOBAL_WORK_SIZE);
        }
        if (kernelInfo.workloadInfo.localWorkSizeOffsets[i] != WorkloadInfo::undefinedOffset) {
            curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_LOCAL_WORK_SIZE, sizeOfParam, kernelInfo.workloadInfo.localWorkSizeOffsets[i], i * sizeOfParam});
            tokenMask |= shiftLeftBy(DATA_PARAMETER_LOCAL_WORK_SIZE);
        }
        if (kernelInfo.workloadInfo.localWorkSizeOffsets2[i] != WorkloadInfo::undefinedOffset) {
            curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_LOCAL_WORK_SIZE, sizeOfParam, kernelInfo.workloadInfo.localWorkSizeOffsets2[i], i * sizeOfParam});
            tokenMask |= shiftLeftBy(DATA_PARAMETER_LOCAL_WORK_SIZE);
        }
        if (kernelInfo.workloadInfo.numWorkGroupsOffset[i] != WorkloadInfo::undefinedOffset) {
            curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_NUM_WORK_GROUPS, sizeOfParam, kernelInfo.workloadInfo.numWorkGroupsOffset[i], i * sizeOfParam});
            tokenMask |= shiftLeftBy(DATA_PARAMETER_NUM_WORK_GROUPS);
        }
    }

    if (kernelInfo.workloadInfo.parentEventOffset != WorkloadInfo::undefinedOffset) {
        curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_PARENT_EVENT, sizeof(uint32_t), kernelInfo.workloadInfo.parentEventOffset, 0});
        tokenMask |= shiftLeftBy(DATA_PARAMETER_PARENT_EVENT);
    }
    if (kernelInfo.workloadInfo.workDimOffset != WorkloadInfo::undefinedOffset) {
        curbeParamsOut.emplace_back(IGIL_KernelCurbeParams{DATA_PARAMETER_WORK_DIMENSIONS, sizeof(uint32_t), kernelInfo.workloadInfo.workDimOffset, 0});
        tokenMask |= shiftLeftBy(DATA_PARAMETER_WORK_DIMENSIONS);
    }

    std::sort(curbeParamsOut.begin(), curbeParamsOut.end(), compareFunction);
    tokenMaskOut = tokenMask;
    firstSSHTokenIndex = static_cast<uint32_t>(curbeParamsOut.size() - numArgs);
}

uint32_t Kernel::ReflectionSurfaceHelper::setKernelData(void *reflectionSurface, uint32_t offset,
                                                        std::vector<IGIL_KernelCurbeParams> &curbeParamsIn, uint64_t tokenMaskIn,
                                                        size_t maxConstantBufferSize, size_t samplerCount,
                                                        const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) {
    uint32_t offsetToEnd = 0;
    IGIL_KernelData *kernelData = reinterpret_cast<IGIL_KernelData *>(ptrOffset(reflectionSurface, offset));
    size_t samplerHeapSize = alignUp(kernelInfo.getSamplerStateArraySize(hwInfo), Sampler::samplerStateArrayAlignment) + kernelInfo.getBorderColorStateSize();

    kernelData->m_numberOfCurbeParams = static_cast<uint32_t>(curbeParamsIn.size()); // number of parameters to patch
    kernelData->m_numberOfCurbeTokens = static_cast<uint32_t>(curbeParamsIn.size() - kernelInfo.kernelArgInfo.size());
    kernelData->m_numberOfSamplerStates = static_cast<uint32_t>(kernelInfo.getSamplerStateArrayCount());
    kernelData->m_SizeOfSamplerHeap = static_cast<uint32_t>(samplerHeapSize);
    kernelData->m_SamplerBorderColorStateOffsetOnDSH = kernelInfo.patchInfo.samplerStateArray ? kernelInfo.patchInfo.samplerStateArray->BorderColorOffset : 0;
    kernelData->m_SamplerStateArrayOffsetOnDSH = kernelInfo.patchInfo.samplerStateArray ? kernelInfo.patchInfo.samplerStateArray->Offset : (uint32_t)-1;
    kernelData->m_sizeOfConstantBuffer = kernelInfo.getConstantBufferSize();
    kernelData->m_PatchTokensMask = tokenMaskIn;
    kernelData->m_ScratchSpacePatchValue = 0;
    kernelData->m_SIMDSize = kernelInfo.patchInfo.executionEnvironment ? kernelInfo.patchInfo.executionEnvironment->LargestCompiledSIMDSize : 0;
    kernelData->m_HasBarriers = kernelInfo.patchInfo.executionEnvironment ? kernelInfo.patchInfo.executionEnvironment->HasBarriers : 0;
    kernelData->m_RequiredWkgSizes[0] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[0];
    kernelData->m_RequiredWkgSizes[1] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[1];
    kernelData->m_RequiredWkgSizes[2] = kernelInfo.kernelDescriptor.kernelAttributes.requiredWorkgroupSize[2];
    kernelData->m_InilineSLMSize = kernelInfo.workloadInfo.slmStaticSize;

    bool localIdRequired = false;
    if (kernelInfo.patchInfo.threadPayload) {
        if (kernelInfo.patchInfo.threadPayload->LocalIDFlattenedPresent ||
            kernelInfo.patchInfo.threadPayload->LocalIDXPresent ||
            kernelInfo.patchInfo.threadPayload->LocalIDYPresent ||
            kernelInfo.patchInfo.threadPayload->LocalIDZPresent) {
            localIdRequired = true;
        }
        kernelData->m_PayloadSize = PerThreadDataHelper::getThreadPayloadSize(*kernelInfo.patchInfo.threadPayload, kernelData->m_SIMDSize, hwInfo.capabilityTable.grfSize);
    }

    kernelData->m_NeedLocalIDS = localIdRequired ? 1 : 0;
    kernelData->m_DisablePreemption = 0u;

    bool concurrentExecAllowed = true;

    if (kernelInfo.patchInfo.pAllocateStatelessPrivateSurface) {
        if (kernelInfo.patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize > 0) {
            concurrentExecAllowed = false;
        }
    }
    kernelData->m_CanRunConcurently = concurrentExecAllowed ?
1 : 0;

    if (DebugManager.flags.DisableConcurrentBlockExecution.get()) {
        kernelData->m_CanRunConcurently = false;
    }

    IGIL_KernelCurbeParams *kernelCurbeParams = kernelData->m_data;

    for (uint32_t i = 0; i < curbeParamsIn.size(); i++) {
        kernelCurbeParams[i] = curbeParamsIn[i];
    }

    offsetToEnd = static_cast<uint32_t>(offset +
                                        alignUp(sizeof(IGIL_KernelData) + sizeof(IGIL_KernelCurbeParams) * curbeParamsIn.size(), sizeof(void *)) +
                                        alignUp(samplerHeapSize, sizeof(void *)) +
                                        alignUp(maxConstantBufferSize, sizeof(void *)) +
                                        sizeof(IGIL_SamplerParams) * samplerCount);

    return offsetToEnd;
}

void Kernel::ReflectionSurfaceHelper::setKernelAddressDataBtOffset(void *reflectionSurface, uint32_t blockID, uint32_t btOffset) {
    uint32_t offset = static_cast<uint32_t>(offsetof(IGIL_KernelDataHeader, m_data) + sizeof(IGIL_KernelAddressData) * blockID);
    IGIL_KernelAddressData *kernelAddressData = reinterpret_cast<IGIL_KernelAddressData *>(ptrOffset(reflectionSurface, offset));
    kernelAddressData->m_BTSoffset = btOffset;
}

void Kernel::ReflectionSurfaceHelper::setKernelAddressData(void *reflectionSurface, uint32_t offset, uint32_t kernelDataOffset, uint32_t samplerHeapOffset,
                                                           uint32_t constantBufferOffset, uint32_t samplerParamsOffset,
                                                           uint32_t sshTokensOffset, uint32_t btOffset, const KernelInfo &kernelInfo, const HardwareInfo &hwInfo) {
    IGIL_KernelAddressData *kernelAddressData = reinterpret_cast<IGIL_KernelAddressData *>(ptrOffset(reflectionSurface, offset));

    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);

    kernelAddressData->m_KernelDataOffset = kernelDataOffset;
    kernelAddressData->m_SamplerHeapOffset = samplerHeapOffset;
    kernelAddressData->m_SamplerParamsOffset = samplerParamsOffset;
    kernelAddressData->m_ConstantBufferOffset = constantBufferOffset;
    kernelAddressData->m_SSHTokensOffset = sshTokensOffset;
    kernelAddressData->m_BTSoffset = btOffset;
    kernelAddressData->m_BTSize = static_cast<uint32_t>(kernelInfo.patchInfo.bindingTableState ?
kernelInfo.patchInfo.bindingTableState->Count * hwHelper.getBindingTableStateSize() : 0); } template <> void Kernel::ReflectionSurfaceHelper::patchBlocksCurbe(void *reflectionSurface, uint32_t blockID, uint64_t defaultDeviceQueueCurbeOffset, uint32_t patchSizeDefaultQueue, uint64_t defaultDeviceQueueGpuAddress, uint64_t eventPoolCurbeOffset, uint32_t patchSizeEventPool, uint64_t eventPoolGpuAddress, uint64_t deviceQueueCurbeOffset, uint32_t patchSizeDeviceQueue, uint64_t deviceQueueGpuAddress, uint64_t printfBufferOffset, uint32_t patchSizePrintfBuffer, uint64_t printfBufferGpuAddress, uint64_t privateSurfaceOffset, uint32_t privateSurfaceSize, uint64_t privateSurfaceGpuAddress) { IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast(reflectionSurface); // Reflection surface must be initialized prior to patching blocks curbe on KRS DEBUG_BREAK_IF(blockID >= pKernelHeader->m_numberOfKernels); IGIL_KernelAddressData *addressData = pKernelHeader->m_data; // const buffer offsets must be set DEBUG_BREAK_IF(addressData[blockID].m_ConstantBufferOffset == 0); void *pCurbe = ptrOffset(reflectionSurface, addressData[blockID].m_ConstantBufferOffset); if (defaultDeviceQueueCurbeOffset != undefinedOffset) { auto *patchedPointer = ptrOffset(pCurbe, (size_t)defaultDeviceQueueCurbeOffset); patchWithRequiredSize(patchedPointer, patchSizeDefaultQueue, (uintptr_t)defaultDeviceQueueGpuAddress); } if (eventPoolCurbeOffset != undefinedOffset) { auto *patchedPointer = ptrOffset(pCurbe, (size_t)eventPoolCurbeOffset); patchWithRequiredSize(patchedPointer, patchSizeEventPool, (uintptr_t)eventPoolGpuAddress); } if (deviceQueueCurbeOffset != undefinedOffset) { auto *patchedPointer = ptrOffset(pCurbe, (size_t)deviceQueueCurbeOffset); patchWithRequiredSize(patchedPointer, patchSizeDeviceQueue, (uintptr_t)deviceQueueGpuAddress); } if (printfBufferOffset != undefinedOffset) { auto *patchedPointer = ptrOffset(pCurbe, (size_t)printfBufferOffset); patchWithRequiredSize(patchedPointer, patchSizePrintfBuffer, (uintptr_t)printfBufferGpuAddress); } if (privateSurfaceOffset != undefinedOffset) { auto *patchedPointer = ptrOffset(pCurbe, (size_t)privateSurfaceOffset); patchWithRequiredSize(patchedPointer, privateSurfaceSize, (uintptr_t)privateSurfaceGpuAddress); } } void Kernel::ReflectionSurfaceHelper::patchBlocksCurbeWithConstantValues(void *reflectionSurface, uint32_t blockID, uint64_t globalMemoryCurbeOffset, uint32_t globalMemoryPatchSize, uint64_t globalMemoryGpuAddress, uint64_t constantMemoryCurbeOffset, uint32_t constantMemoryPatchSize, uint64_t constantMemoryGpuAddress, uint64_t privateMemoryCurbeOffset, uint32_t privateMemoryPatchSize, uint64_t privateMemoryGpuAddress) { IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast(reflectionSurface); // Reflection surface must be initialized prior to patching blocks curbe on KRS DEBUG_BREAK_IF(blockID >= pKernelHeader->m_numberOfKernels); IGIL_KernelAddressData *addressData = pKernelHeader->m_data; // const buffer offsets must be set DEBUG_BREAK_IF(addressData[blockID].m_ConstantBufferOffset == 0); void *pCurbe = ptrOffset(reflectionSurface, addressData[blockID].m_ConstantBufferOffset); if (globalMemoryCurbeOffset != undefinedOffset) { auto *patchedPointer = ptrOffset(pCurbe, (size_t)globalMemoryCurbeOffset); patchWithRequiredSize(patchedPointer, globalMemoryPatchSize, (uintptr_t)globalMemoryGpuAddress); } if (constantMemoryCurbeOffset != undefinedOffset) { auto *patchedPointer = ptrOffset(pCurbe, (size_t)constantMemoryCurbeOffset); patchWithRequiredSize(patchedPointer, 
constantMemoryPatchSize, (uintptr_t)constantMemoryGpuAddress);
    }

    if (privateMemoryCurbeOffset != undefinedOffset) {
        auto *patchedPointer = ptrOffset(pCurbe, (size_t)privateMemoryCurbeOffset);
        patchWithRequiredSize(patchedPointer, privateMemoryPatchSize, (uintptr_t)privateMemoryGpuAddress);
    }
}

void Kernel::ReflectionSurfaceHelper::setParentImageParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo) {
    IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
    IGIL_ImageParamters *pImageParameters = reinterpret_cast<IGIL_ImageParamters *>(ptrOffset(pKernelHeader, (size_t)pKernelHeader->m_ParentImageDataOffset));

    uint32_t numArgs = (uint32_t)parentArguments.size();
    for (uint32_t i = 0; i < numArgs; i++) {
        if (parentArguments[i].type == Kernel::kernelArgType::IMAGE_OBJ) {
            const Image *image = castToObject<Image>((cl_mem)parentArguments[i].object);
            if (image) {
                pImageParameters->m_ArraySize = (uint32_t)image->getImageDesc().image_array_size;
                pImageParameters->m_Depth = (uint32_t)image->getImageDesc().image_depth;
                pImageParameters->m_Height = (uint32_t)image->getImageDesc().image_height;
                pImageParameters->m_Width = (uint32_t)image->getImageDesc().image_width;
                pImageParameters->m_NumMipLevels = (uint32_t)image->getImageDesc().num_mip_levels;
                pImageParameters->m_NumSamples = (uint32_t)image->getImageDesc().num_samples;

                pImageParameters->m_ChannelDataType = (uint32_t)image->getImageFormat().image_channel_data_type;
                pImageParameters->m_ChannelOrder = (uint32_t)image->getImageFormat().image_channel_order;
                pImageParameters->m_ObjectID = (uint32_t)parentKernelInfo.kernelArgInfo[i].offsetHeap;
                pImageParameters++;
            }
        }
    }
}

void Kernel::ReflectionSurfaceHelper::setParentSamplerParams(void *reflectionSurface, std::vector<Kernel::SimpleKernelArgInfo> &parentArguments, const KernelInfo &parentKernelInfo) {
    IGIL_KernelDataHeader *pKernelHeader = reinterpret_cast<IGIL_KernelDataHeader *>(reflectionSurface);
    IGIL_ParentSamplerParams *pParentSamplerParams = reinterpret_cast<IGIL_ParentSamplerParams *>(ptrOffset(pKernelHeader, (size_t)pKernelHeader->m_ParentSamplerParamsOffset));

    uint32_t numArgs = (uint32_t)parentArguments.size();
    for (uint32_t i = 0; i < numArgs; i++) {
        if (parentArguments[i].type == Kernel::kernelArgType::SAMPLER_OBJ) {
            const Sampler *sampler = castToObject<Sampler>((cl_sampler)parentArguments[i].object);
            if (sampler) {
                pParentSamplerParams->CoordinateSnapRequired = (uint32_t)sampler->getSnapWaValue();
                pParentSamplerParams->m_AddressingMode = (uint32_t)sampler->addressingMode;
                pParentSamplerParams->NormalizedCoords = (uint32_t)sampler->normalizedCoordinates;
                pParentSamplerParams->m_ObjectID = OCLRT_ARG_OFFSET_TO_SAMPLER_OBJECT_ID((uint32_t)parentKernelInfo.kernelArgInfo[i].offsetHeap);
                pParentSamplerParams++;
            }
        }
    }
}

void Kernel::resetSharedObjectsPatchAddresses() {
    for (size_t i = 0; i < getKernelArgsNumber(); i++) {
        auto clMem = (cl_mem)kernelArguments[i].object;
        auto memObj = castToObject<MemObj>(clMem);
        if (memObj && memObj->peekSharingHandler()) {
            setArg((uint32_t)i, sizeof(cl_mem), &clMem);
        }
    }
}

void Kernel::provideInitializationHints() {
    Context *context = program->getContextPtr();
    if (context == nullptr || !context->isProvidingPerformanceHints())
        return;

    for (auto i = 0u; i < kernelDeviceInfos.size(); i++) {
        if (!kernelInfos[i]) {
            continue;
        }
        if (kernelDeviceInfos[i].privateSurfaceSize) {
            context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, PRIVATE_MEMORY_USAGE_TOO_HIGH,
                                            kernelInfos[i]->kernelDescriptor.kernelMetadata.kernelName.c_str(), kernelDeviceInfos[i].privateSurfaceSize);
        }
        const auto &patchInfo = kernelInfos[i]->patchInfo;
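        // Estimated scratch usage is PerThreadScratchSpace scaled by the compute units used
        // for scratch and the kernel's maximum SIMD width; a non-zero estimate triggers the
        // REGISTER_PRESSURE_TOO_HIGH hint below.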
if (patchInfo.mediavfestate) { auto scratchSize = patchInfo.mediavfestate->PerThreadScratchSpace; scratchSize *= getDevice().getSharedDeviceInfo().computeUnitsUsedForScratch * getKernelInfo(i).getMaxSimdSize(); if (scratchSize > 0) { context->providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, REGISTER_PRESSURE_TOO_HIGH, kernelInfos[i]->kernelDescriptor.kernelMetadata.kernelName.c_str(), scratchSize); } } } } void Kernel::patchDefaultDeviceQueue(DeviceQueue *devQueue) { auto rootDeviceIndex = devQueue->getDevice().getRootDeviceIndex(); const auto &patchInfo = kernelInfos[rootDeviceIndex]->patchInfo; if (patchInfo.pAllocateStatelessDefaultDeviceQueueSurface) { if (kernelDeviceInfos[rootDeviceIndex].crossThreadData) { auto patchLocation = ptrOffset(reinterpret_cast(getCrossThreadData(rootDeviceIndex)), patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->DataParamOffset); patchWithRequiredSize(patchLocation, patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->DataParamSize, static_cast(devQueue->getQueueBuffer()->getGpuAddressToPatch())); } if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap(rootDeviceIndex)), patchInfo.pAllocateStatelessDefaultDeviceQueueSurface->SurfaceStateHeapOffset); Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, devQueue->getQueueBuffer()->getUnderlyingBufferSize(), (void *)devQueue->getQueueBuffer()->getGpuAddress(), 0, devQueue->getQueueBuffer(), 0, 0); } } } void Kernel::patchEventPool(DeviceQueue *devQueue) { auto rootDeviceIndex = devQueue->getDevice().getRootDeviceIndex(); const auto &patchInfo = kernelInfos[rootDeviceIndex]->patchInfo; if (patchInfo.pAllocateStatelessEventPoolSurface) { if (kernelDeviceInfos[rootDeviceIndex].crossThreadData) { auto patchLocation = ptrOffset(reinterpret_cast(getCrossThreadData(rootDeviceIndex)), patchInfo.pAllocateStatelessEventPoolSurface->DataParamOffset); patchWithRequiredSize(patchLocation, patchInfo.pAllocateStatelessEventPoolSurface->DataParamSize, static_cast(devQueue->getEventPoolBuffer()->getGpuAddressToPatch())); } if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap(rootDeviceIndex)), patchInfo.pAllocateStatelessEventPoolSurface->SurfaceStateHeapOffset); Buffer::setSurfaceState(&devQueue->getDevice(), surfaceState, devQueue->getEventPoolBuffer()->getUnderlyingBufferSize(), (void *)devQueue->getEventPoolBuffer()->getGpuAddress(), 0, devQueue->getEventPoolBuffer(), 0, 0); } } } void Kernel::patchBlocksSimdSize(uint32_t rootDeviceIndex) { BlockKernelManager *blockManager = program->getBlockKernelManager(); for (auto &idOffset : kernelInfos[rootDeviceIndex]->childrenKernelsIdOffset) { DEBUG_BREAK_IF(!(idOffset.first < static_cast(blockManager->getCount()))); const KernelInfo *blockInfo = blockManager->getBlockKernelInfo(idOffset.first); uint32_t *simdSize = reinterpret_cast(&kernelDeviceInfos[rootDeviceIndex].crossThreadData[idOffset.second]); *simdSize = blockInfo->getMaxSimdSize(); } } bool Kernel::usesSyncBuffer(uint32_t rootDeviceIndex) { return (getKernelInfo(rootDeviceIndex).patchInfo.pAllocateSyncBuffer != nullptr); } void Kernel::patchSyncBuffer(Device &device, GraphicsAllocation *gfxAllocation, size_t bufferOffset) { auto rootDeviceIndex = device.getRootDeviceIndex(); auto &patchInfo = kernelInfos[rootDeviceIndex]->patchInfo; auto bufferPatchAddress = ptrOffset(getCrossThreadData(rootDeviceIndex), patchInfo.pAllocateSyncBuffer->DataParamOffset); 
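    // The sync buffer is patched like a stateless buffer argument: its patchable GPU address
    // (plus bufferOffset) lands in cross-thread data, and a matching surface state entry is
    // programmed below when buffers require SSH entries.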
patchWithRequiredSize(bufferPatchAddress, patchInfo.pAllocateSyncBuffer->DataParamSize, ptrOffset(gfxAllocation->getGpuAddressToPatch(), bufferOffset)); if (requiresSshForBuffers(rootDeviceIndex)) { auto surfaceState = ptrOffset(reinterpret_cast(getSurfaceStateHeap(rootDeviceIndex)), patchInfo.pAllocateSyncBuffer->SurfaceStateHeapOffset); auto addressToPatch = gfxAllocation->getUnderlyingBuffer(); auto sizeToPatch = gfxAllocation->getUnderlyingBufferSize(); Buffer::setSurfaceState(&device, surfaceState, sizeToPatch, addressToPatch, 0, gfxAllocation, 0, 0); } } template void Kernel::patchReflectionSurface(DeviceQueue *, PrintfHandler *); bool Kernel::isPatched() const { return patchedArgumentsNum == getDefaultKernelInfo().argumentsToPatchNum; } cl_int Kernel::checkCorrectImageAccessQualifier(cl_uint argIndex, size_t argSize, const void *argValue) const { auto &defaultKernelInfo = getDefaultKernelInfo(); if (defaultKernelInfo.kernelArgInfo[argIndex].isImage) { cl_mem mem = *(static_cast(argValue)); MemObj *pMemObj = nullptr; WithCastToInternal(mem, &pMemObj); if (pMemObj) { auto accessQualifier = defaultKernelInfo.kernelArgInfo[argIndex].metadata.accessQualifier; cl_mem_flags flags = pMemObj->getFlags(); if ((accessQualifier == KernelArgMetadata::AccessReadOnly && ((flags | CL_MEM_WRITE_ONLY) == flags)) || (accessQualifier == KernelArgMetadata::AccessWriteOnly && ((flags | CL_MEM_READ_ONLY) == flags))) { return CL_INVALID_ARG_VALUE; } } else { return CL_INVALID_ARG_VALUE; } } return CL_SUCCESS; } void Kernel::resolveArgs() { if (!Kernel::isPatched() || !imageTransformer->hasRegisteredImages3d() || !canTransformImages()) return; bool canTransformImageTo2dArray = true; for (uint32_t i = 0; i < patchedArgumentsNum; i++) { if (getDefaultKernelInfo().kernelArgInfo.at(i).isSampler) { auto sampler = castToObject(kernelArguments.at(i).object); if (sampler->isTransformable()) { canTransformImageTo2dArray = true; } else { canTransformImageTo2dArray = false; break; } } } for (auto rootDeviceIndex = 0u; rootDeviceIndex < kernelInfos.size(); rootDeviceIndex++) { auto pKernelInfo = kernelInfos[rootDeviceIndex]; if (!pKernelInfo) { continue; } if (canTransformImageTo2dArray) { imageTransformer->transformImagesTo2dArray(*pKernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex)); } else if (imageTransformer->didTransform()) { imageTransformer->transformImagesTo3d(*pKernelInfo, kernelArguments, getSurfaceStateHeap(rootDeviceIndex)); } } } bool Kernel::canTransformImages() const { auto renderCoreFamily = program->getDevices()[0]->getHardwareInfo().platform.eRenderCoreFamily; return renderCoreFamily >= IGFX_GEN9_CORE && renderCoreFamily <= IGFX_GEN11LP_CORE; } void Kernel::fillWithBuffersForAuxTranslation(MemObjsForAuxTranslation &memObjsForAuxTranslation, uint32_t rootDeviceIndex) { memObjsForAuxTranslation.reserve(getKernelArgsNumber()); auto &kernelInfo = getKernelInfo(rootDeviceIndex); for (uint32_t i = 0; i < getKernelArgsNumber(); i++) { if (BUFFER_OBJ == kernelArguments.at(i).type && !kernelInfo.kernelArgInfo.at(i).pureStatefulBufferAccess) { auto buffer = castToObject(getKernelArg(i)); if (buffer && buffer->getMultiGraphicsAllocation().getAllocationType() == GraphicsAllocation::AllocationType::BUFFER_COMPRESSED) { memObjsForAuxTranslation.insert(buffer); auto &context = this->program->getContext(); if (context.isProvidingPerformanceHints()) { context.providePerformanceHint(CL_CONTEXT_DIAGNOSTICS_LEVEL_BAD_INTEL, KERNEL_ARGUMENT_AUX_TRANSLATION, 
kernelInfo.kernelDescriptor.kernelMetadata.kernelName.c_str(), i, kernelInfo.kernelArgInfo.at(i).metadataExtended->argName.c_str()); } } } } } void Kernel::getAllocationsForCacheFlush(CacheFlushAllocationsVec &out, uint32_t rootDeviceIndex) const { if (false == HwHelper::cacheFlushAfterWalkerSupported(getHardwareInfo(rootDeviceIndex))) { return; } for (GraphicsAllocation *alloc : this->kernelArgRequiresCacheFlush) { if (nullptr == alloc) { continue; } out.push_back(alloc); } auto global = getProgram()->getGlobalSurface(rootDeviceIndex); if (global != nullptr) { out.push_back(global); } if (svmAllocationsRequireCacheFlush) { for (GraphicsAllocation *alloc : kernelSvmGfxAllocations) { if (allocationForCacheFlush(alloc)) { out.push_back(alloc); } } } } bool Kernel::allocationForCacheFlush(GraphicsAllocation *argAllocation) const { return argAllocation->isFlushL3Required(); } void Kernel::addAllocationToCacheFlushVector(uint32_t argIndex, GraphicsAllocation *argAllocation) { if (argAllocation == nullptr) { kernelArgRequiresCacheFlush[argIndex] = nullptr; } else { if (allocationForCacheFlush(argAllocation)) { kernelArgRequiresCacheFlush[argIndex] = argAllocation; } else { kernelArgRequiresCacheFlush[argIndex] = nullptr; } } } void Kernel::setReflectionSurfaceBlockBtOffset(uint32_t blockID, uint32_t offset) { DEBUG_BREAK_IF(blockID >= program->getBlockKernelManager()->getCount()); ReflectionSurfaceHelper::setKernelAddressDataBtOffset(getKernelReflectionSurface()->getUnderlyingBuffer(), blockID, offset); } bool Kernel::checkIfIsParentKernelAndBlocksUsesPrintf() { return isParentKernel && getProgram()->getBlockKernelManager()->getIfBlockUsesPrintf(); } uint64_t Kernel::getKernelStartOffset( const bool localIdsGenerationByRuntime, const bool kernelUsesLocalIds, const bool isCssUsed, uint32_t rootDeviceIndex) const { uint64_t kernelStartOffset = 0; if (kernelInfos[rootDeviceIndex]->getGraphicsAllocation()) { kernelStartOffset = kernelInfos[rootDeviceIndex]->getGraphicsAllocation()->getGpuAddressToPatch(); if (localIdsGenerationByRuntime == false && kernelUsesLocalIds == true) { kernelStartOffset += kernelInfos[rootDeviceIndex]->patchInfo.threadPayload->OffsetToSkipPerThreadDataLoad; } } kernelStartOffset += getStartOffset(); auto &hardwareInfo = getHardwareInfo(rootDeviceIndex); auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); if (isCssUsed && hwHelper.isOffsetToSkipSetFFIDGPWARequired(hardwareInfo)) { kernelStartOffset += kernelInfos[rootDeviceIndex]->patchInfo.threadPayload->OffsetToSkipSetFFIDGP; } return kernelStartOffset; } void Kernel::patchBindlessSurfaceStateOffsets(const Device &device, const size_t sshOffset) { auto rootDeviceIndex = device.getRootDeviceIndex(); auto &kernelInfo = *kernelInfos[rootDeviceIndex]; const bool bindlessUsed = kernelInfo.kernelDescriptor.kernelAttributes.bufferAddressingMode == KernelDescriptor::AddressingMode::BindlessAndStateless; if (bindlessUsed) { auto &hardwareInfo = device.getHardwareInfo(); auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily); for (size_t i = 0; i < kernelInfo.kernelArgInfo.size(); i++) { if ((kernelInfo.kernelArgInfo[i].isBuffer) || (kernelInfo.kernelArgInfo[i].isImage)) { auto patchLocation = ptrOffset(getCrossThreadData(device.getRootDeviceIndex()), kernelInfo.kernelArgInfo[i].kernelArgPatchInfoVector[0].crossthreadOffset); auto bindlessOffset = static_cast(sshOffset) + kernelInfo.kernelArgInfo[i].offsetHeap; auto patchValue = 
hwHelper.getBindlessSurfaceExtendedMessageDescriptorValue(bindlessOffset);
                patchWithRequiredSize(patchLocation, sizeof(patchValue), patchValue);
            }
        }
    }
}

void Kernel::setAdditionalKernelExecInfo(uint32_t additionalKernelExecInfo) {
    this->additionalKernelExecInfo = additionalKernelExecInfo;
}

uint32_t Kernel::getAdditionalKernelExecInfo() const {
    return this->additionalKernelExecInfo;
}

bool Kernel::requiresWaDisableRccRhwoOptimization(uint32_t rootDeviceIndex) const {
    auto &hardwareInfo = getHardwareInfo(rootDeviceIndex);
    auto &hwHelper = HwHelper::get(hardwareInfo.platform.eRenderCoreFamily);

    if (hwHelper.isWaDisableRccRhwoOptimizationRequired() && isUsingSharedObjArgs()) {
        for (auto &arg : getKernelArguments()) {
            auto clMemObj = static_cast<cl_mem>(arg.object);
            auto memObj = castToObject<MemObj>(clMemObj);
            if (memObj && memObj->peekSharingHandler()) {
                auto allocation = memObj->getGraphicsAllocation(rootDeviceIndex);
                for (uint32_t handleId = 0u; handleId < allocation->getNumGmms(); handleId++) {
                    if (allocation->getGmm(handleId)->gmmResourceInfo->getResourceFlags()->Info.MediaCompressed) {
                        return true;
                    }
                }
            }
        }
    }
    return false;
}

const HardwareInfo &Kernel::getHardwareInfo(uint32_t rootDeviceIndex) const {
    return *executionEnvironment.rootDeviceEnvironments[rootDeviceIndex]->getHardwareInfo();
}

const KernelInfo &Kernel::getDefaultKernelInfo() const {
    const KernelInfo *pKernelInfo = nullptr;
    for (auto &kernelInfo : kernelInfos) {
        if (kernelInfo) {
            pKernelInfo = kernelInfo;
            break;
        }
    }
    UNRECOVERABLE_IF(!pKernelInfo);
    return *pKernelInfo;
}

void Kernel::setGlobalWorkOffsetValues(uint32_t rootDeviceIndex, uint32_t globalWorkOffsetX, uint32_t globalWorkOffsetY, uint32_t globalWorkOffsetZ) {
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetX = globalWorkOffsetX;
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetY = globalWorkOffsetY;
    *kernelDeviceInfos[rootDeviceIndex].globalWorkOffsetZ = globalWorkOffsetZ;
}

void Kernel::setGlobalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t globalWorkSizeX, uint32_t globalWorkSizeY, uint32_t globalWorkSizeZ) {
    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeX = globalWorkSizeX;
    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeY = globalWorkSizeY;
    *kernelDeviceInfos[rootDeviceIndex].globalWorkSizeZ = globalWorkSizeZ;
}

void Kernel::setLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeX = localWorkSizeX;
    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeY = localWorkSizeY;
    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ = localWorkSizeZ;
}

void Kernel::setLocalWorkSize2Values(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 = localWorkSizeX;
    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeY2 = localWorkSizeY;
    *kernelDeviceInfos[rootDeviceIndex].localWorkSizeZ2 = localWorkSizeZ;
}

void Kernel::setEnqueuedLocalWorkSizeValues(uint32_t rootDeviceIndex, uint32_t localWorkSizeX, uint32_t localWorkSizeY, uint32_t localWorkSizeZ) {
    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeX = localWorkSizeX;
    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeY = localWorkSizeY;
    *kernelDeviceInfos[rootDeviceIndex].enqueuedLocalWorkSizeZ = localWorkSizeZ;
}

bool Kernel::isLocalWorkSize2Patched(uint32_t rootDeviceIndex) {
    return kernelDeviceInfos[rootDeviceIndex].localWorkSizeX2 != &dummyPatchLocation;
}

void Kernel::setNumWorkGroupsValues(uint32_t
rootDeviceIndex, uint32_t numWorkGroupsX, uint32_t numWorkGroupsY, uint32_t numWorkGroupsZ) { *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsX = numWorkGroupsX; *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsY = numWorkGroupsY; *kernelDeviceInfos[rootDeviceIndex].numWorkGroupsZ = numWorkGroupsZ; } void Kernel::setWorkDim(uint32_t rootDeviceIndex, uint32_t workDim) { *kernelDeviceInfos[rootDeviceIndex].workDim = workDim; } uint32_t Kernel::getMaxKernelWorkGroupSize(uint32_t rootDeviceIndex) const { return kernelDeviceInfos[rootDeviceIndex].maxKernelWorkGroupSize; } } // namespace NEO