/*
 * Copyright (C) 2017-2020 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/device_binary_format/patchtokens_decoder.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/blit_commands_helper.h"
#include "shared/source/helpers/hw_helper.h"
#include "shared/source/helpers/ptr_math.h"
#include "shared/source/helpers/string.h"
#include "shared/source/memory_manager/memory_manager.h"

#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/helpers/dispatch_info.h"
#include "opencl/source/kernel/kernel.h"
#include "opencl/source/mem_obj/buffer.h"
#include "opencl/source/mem_obj/image.h"
#include "opencl/source/sampler/sampler.h"

#include "hw_cmds.h"

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstring>
#include <map>
#include <memory>
#include <sstream>
#include <string>

namespace NEO {

bool useKernelDescriptor = false;

struct KernelArgumentType {
    const char *argTypeQualifier;
    uint64_t argTypeQualifierValue;
};

std::map<std::string, size_t> typeSizeMap = {
    {"char", sizeof(cl_char)},
    {"char2", sizeof(cl_char2)},
    {"char3", sizeof(cl_char3)},
    {"char4", sizeof(cl_char4)},
    {"char8", sizeof(cl_char8)},
    {"char16", sizeof(cl_char16)},

    {"uchar", sizeof(cl_uchar)},
    {"uchar2", sizeof(cl_uchar2)},
    {"uchar3", sizeof(cl_uchar3)},
    {"uchar4", sizeof(cl_uchar4)},
    {"uchar8", sizeof(cl_uchar8)},
    {"uchar16", sizeof(cl_uchar16)},

    {"short", sizeof(cl_short)},
    {"short2", sizeof(cl_short2)},
    {"short3", sizeof(cl_short3)},
    {"short4", sizeof(cl_short4)},
    {"short8", sizeof(cl_short8)},
    {"short16", sizeof(cl_short16)},

    {"ushort", sizeof(cl_ushort)},
    {"ushort2", sizeof(cl_ushort2)},
    {"ushort3", sizeof(cl_ushort3)},
    {"ushort4", sizeof(cl_ushort4)},
    {"ushort8", sizeof(cl_ushort8)},
    {"ushort16", sizeof(cl_ushort16)},

    {"int", sizeof(cl_int)},
    {"int2", sizeof(cl_int2)},
    {"int3", sizeof(cl_int3)},
    {"int4", sizeof(cl_int4)},
    {"int8", sizeof(cl_int8)},
    {"int16", sizeof(cl_int16)},

    {"uint", sizeof(cl_uint)},
    {"uint2", sizeof(cl_uint2)},
    {"uint3", sizeof(cl_uint3)},
    {"uint4", sizeof(cl_uint4)},
    {"uint8", sizeof(cl_uint8)},
    {"uint16", sizeof(cl_uint16)},

    {"long", sizeof(cl_long)},
    {"long2", sizeof(cl_long2)},
    {"long3", sizeof(cl_long3)},
    {"long4", sizeof(cl_long4)},
    {"long8", sizeof(cl_long8)},
    {"long16", sizeof(cl_long16)},

    {"ulong", sizeof(cl_ulong)},
    {"ulong2", sizeof(cl_ulong2)},
    {"ulong3", sizeof(cl_ulong3)},
    {"ulong4", sizeof(cl_ulong4)},
    {"ulong8", sizeof(cl_ulong8)},
    {"ulong16", sizeof(cl_ulong16)},

    {"half", sizeof(cl_half)},

    {"float", sizeof(cl_float)},
    {"float2", sizeof(cl_float2)},
    {"float3", sizeof(cl_float3)},
    {"float4", sizeof(cl_float4)},
    {"float8", sizeof(cl_float8)},
    {"float16", sizeof(cl_float16)},

#ifdef cl_khr_fp16
    {"half2", sizeof(cl_half2)},
    {"half3", sizeof(cl_half3)},
    {"half4", sizeof(cl_half4)},
    {"half8", sizeof(cl_half8)},
    {"half16", sizeof(cl_half16)},
#endif

    {"double", sizeof(cl_double)},
    {"double2", sizeof(cl_double2)},
    {"double3", sizeof(cl_double3)},
    {"double4", sizeof(cl_double4)},
    {"double8", sizeof(cl_double8)},
    {"double16", sizeof(cl_double16)},
};

WorkSizeInfo::WorkSizeInfo(uint32_t maxWorkGroupSize, bool hasBarriers, uint32_t simdSize, uint32_t slmTotalSize, GFXCORE_FAMILY coreFamily, uint32_t numThreadsPerSubSlice, uint32_t localMemSize, bool imgUsed, bool yTiledSurface) {
    this->maxWorkGroupSize = maxWorkGroupSize;
    this->hasBarriers = hasBarriers;
    this->simdSize = simdSize;
    this->slmTotalSize = slmTotalSize;
    this->coreFamily = coreFamily;
    this->numThreadsPerSubSlice = numThreadsPerSubSlice;
    this->localMemSize = localMemSize;
    this->imgUsed = imgUsed;
    this->yTiledSurfaces = yTiledSurface;
    setMinWorkGroupSize();
}
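// Builds the work-size description from a DispatchInfo: work-group limits, SIMD width, SLM usage
// and per-sub-slice thread capacity are taken from the kernel and its device, image usage is
// detected from the kernel arguments, and the minimal work-group size is derived from all of it.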
WorkSizeInfo::WorkSizeInfo(const DispatchInfo &dispatchInfo) {
    this->maxWorkGroupSize = dispatchInfo.getKernel()->maxKernelWorkGroupSize;
    auto pExecutionEnvironment = dispatchInfo.getKernel()->getKernelInfo().patchInfo.executionEnvironment;
    this->hasBarriers = (pExecutionEnvironment != nullptr) && (pExecutionEnvironment->HasBarriers);
    this->simdSize = (uint32_t)dispatchInfo.getKernel()->getKernelInfo().getMaxSimdSize();
    this->slmTotalSize = (uint32_t)dispatchInfo.getKernel()->slmTotalSize;
    this->coreFamily = dispatchInfo.getKernel()->getDevice().getHardwareInfo().platform.eRenderCoreFamily;
    this->numThreadsPerSubSlice = (uint32_t)dispatchInfo.getKernel()->getDevice().getSharedDeviceInfo().maxNumEUsPerSubSlice *
                                  dispatchInfo.getKernel()->getDevice().getSharedDeviceInfo().numThreadsPerEU;
    this->localMemSize = (uint32_t)dispatchInfo.getKernel()->getDevice().getSharedDeviceInfo().localMemSize;
    setIfUseImg(dispatchInfo.getKernel());
    setMinWorkGroupSize();
}

void WorkSizeInfo::setIfUseImg(Kernel *pKernel) {
    auto ParamsCount = pKernel->getKernelArgsNumber();
    for (auto i = 0u; i < ParamsCount; i++) {
        if (pKernel->getKernelInfo().kernelArgInfo[i].isImage) {
            imgUsed = true;
            yTiledSurfaces = true;
        }
    }
}

void WorkSizeInfo::setMinWorkGroupSize() {
    minWorkGroupSize = 0;
    if (hasBarriers) {
        uint32_t maxBarriersPerHSlice = (coreFamily >= IGFX_GEN9_CORE) ? 32 : 16;
        minWorkGroupSize = numThreadsPerSubSlice * simdSize / maxBarriersPerHSlice;
    }
    if (slmTotalSize > 0) {
        minWorkGroupSize = std::max(maxWorkGroupSize / ((localMemSize / slmTotalSize)), minWorkGroupSize);
    }
}

void WorkSizeInfo::checkRatio(const size_t workItems[3]) {
    if (slmTotalSize > 0) {
        useRatio = true;
        targetRatio = log((float)workItems[0]) - log((float)workItems[1]);
        useStrictRatio = false;
    } else if (yTiledSurfaces == true) {
        useRatio = true;
        targetRatio = YTilingRatioValue;
        useStrictRatio = true;
    }
}

KernelInfo::~KernelInfo() {
    kernelArgInfo.clear();
    patchInfo.stringDataMap.clear();
    delete[] crossThreadData;
}

void KernelInfo::storePatchToken(const SPatchExecutionEnvironment *execEnv) {
    this->patchInfo.executionEnvironment = execEnv;
    if (execEnv->RequiredWorkGroupSizeX != 0) {
        this->reqdWorkGroupSize[0] = execEnv->RequiredWorkGroupSizeX;
        this->reqdWorkGroupSize[1] = execEnv->RequiredWorkGroupSizeY;
        this->reqdWorkGroupSize[2] = execEnv->RequiredWorkGroupSizeZ;
        DEBUG_BREAK_IF(!(execEnv->RequiredWorkGroupSizeY > 0));
        DEBUG_BREAK_IF(!(execEnv->RequiredWorkGroupSizeZ > 0));
    }
    this->workgroupWalkOrder[0] = 0;
    this->workgroupWalkOrder[1] = 1;
    this->workgroupWalkOrder[2] = 2;
    if (execEnv->WorkgroupWalkOrderDims) {
        constexpr auto dimensionMask = 0b11;
        constexpr auto dimensionSize = 2;
        this->workgroupWalkOrder[0] = execEnv->WorkgroupWalkOrderDims & dimensionMask;
        this->workgroupWalkOrder[1] = (execEnv->WorkgroupWalkOrderDims >> dimensionSize) & dimensionMask;
        this->workgroupWalkOrder[2] = (execEnv->WorkgroupWalkOrderDims >> dimensionSize * 2) & dimensionMask;
        this->requiresWorkGroupOrder = true;
    }

    for (uint32_t i = 0; i < 3; ++i) {
        // inverts the walk order mapping (from ORDER_ID->DIM_ID to DIM_ID->ORDER_ID)
        this->workgroupDimensionsOrder[this->workgroupWalkOrder[i]] = i;
    }

    if (execEnv->CompiledForGreaterThan4GBBuffers == false) {
        this->requiresSshForBuffers = true;
    }
}
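// Stores the compiler-provided metadata for a kernel argument; arguments whose type carries the
// const qualifier are additionally marked read-only.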
void KernelInfo::storeArgInfo(uint32_t argNum, ArgTypeTraits metadata, std::unique_ptr<ArgTypeMetadataExtended> metadataExtended) {
    resizeKernelArgInfoAndRegisterParameter(argNum);
    auto &argInfo = kernelArgInfo[argNum];
    argInfo.metadata = metadata;
    argInfo.metadataExtended = std::move(metadataExtended);
    argInfo.isReadOnly |= argInfo.metadata.typeQualifiers.constQual;
}

void KernelInfo::storeKernelArgument(
    const SPatchDataParameterBuffer *pDataParameterKernelArg) {
    uint32_t argNum = pDataParameterKernelArg->ArgumentNumber;
    uint32_t dataSize = pDataParameterKernelArg->DataSize;
    uint32_t offset = pDataParameterKernelArg->Offset;
    uint32_t sourceOffset = pDataParameterKernelArg->SourceOffset;

    storeKernelArgPatchInfo(argNum, dataSize, offset, sourceOffset, 0);
}

void KernelInfo::storeKernelArgument(
    const SPatchStatelessGlobalMemoryObjectKernelArgument *pStatelessGlobalKernelArg) {
    uint32_t argNum = pStatelessGlobalKernelArg->ArgumentNumber;
    uint32_t offsetSSH = pStatelessGlobalKernelArg->SurfaceStateHeapOffset;

    usesSsh |= true;
    storeKernelArgPatchInfo(argNum, pStatelessGlobalKernelArg->DataParamSize, pStatelessGlobalKernelArg->DataParamOffset, 0, offsetSSH);
    kernelArgInfo[argNum].isBuffer = true;
    patchInfo.statelessGlobalMemObjKernelArgs.push_back(pStatelessGlobalKernelArg);
}

void KernelInfo::storeKernelArgument(
    const SPatchImageMemoryObjectKernelArgument *pImageMemObjKernelArg) {
    uint32_t argNum = pImageMemObjKernelArg->ArgumentNumber;
    uint32_t offsetSurfaceState = pImageMemObjKernelArg->Offset;

    usesSsh |= true;
    storeKernelArgPatchInfo(argNum, 0, 0, 0, offsetSurfaceState);
    kernelArgInfo[argNum].isImage = true;

    if (pImageMemObjKernelArg->Type == iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA) {
        kernelArgInfo[argNum].isMediaImage = true;
    }

    if (pImageMemObjKernelArg->Type == iOpenCL::IMAGE_MEMORY_OBJECT_2D_MEDIA_BLOCK) {
        kernelArgInfo[argNum].isMediaBlockImage = true;
    }

    kernelArgInfo[argNum].metadata.argByValSize = sizeof(cl_mem);
    kernelArgInfo[argNum].isTransformable = pImageMemObjKernelArg->Transformable != 0;
    patchInfo.imageMemObjKernelArgs.push_back(pImageMemObjKernelArg);

    if (NEO::KernelArgMetadata::AccessUnknown == kernelArgInfo[argNum].metadata.accessQualifier) {
        auto accessQual = pImageMemObjKernelArg->Writeable ? NEO::KernelArgMetadata::AccessReadWrite
                                                           : NEO::KernelArgMetadata::AccessReadOnly;
        kernelArgInfo[argNum].metadata.accessQualifier = accessQual;
    }
}
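// Bindful (surface-state based) buffer argument: only the surface state heap offset is recorded,
// no cross-thread data is patched for it.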
void KernelInfo::storeKernelArgument(
    const SPatchGlobalMemoryObjectKernelArgument *pGlobalMemObjKernelArg) {
    uint32_t argNum = pGlobalMemObjKernelArg->ArgumentNumber;
    uint32_t offsetSurfaceState = pGlobalMemObjKernelArg->Offset;

    usesSsh |= true;
    storeKernelArgPatchInfo(argNum, 0, 0, 0, offsetSurfaceState);
    kernelArgInfo[argNum].isBuffer = true;
}

void KernelInfo::storeKernelArgument(
    const SPatchSamplerKernelArgument *pSamplerArgument) {
    uint32_t argNum = pSamplerArgument->ArgumentNumber;
    uint32_t offsetSurfaceState = pSamplerArgument->Offset;

    storeKernelArgPatchInfo(argNum, 0, 0, 0, offsetSurfaceState);
    kernelArgInfo[argNum].samplerArgumentType = pSamplerArgument->Type;

    if (pSamplerArgument->Type != iOpenCL::SAMPLER_OBJECT_TEXTURE) {
        DEBUG_BREAK_IF(pSamplerArgument->Type != iOpenCL::SAMPLER_OBJECT_VME &&
                       pSamplerArgument->Type != iOpenCL::SAMPLER_OBJECT_VE &&
                       pSamplerArgument->Type != iOpenCL::SAMPLER_OBJECT_VD);
        kernelArgInfo[argNum].isAccelerator = true;
        isVmeWorkload = true;
    } else {
        kernelArgInfo[argNum].isSampler = true;
    }
}

void KernelInfo::storeKernelArgument(
    const SPatchStatelessConstantMemoryObjectKernelArgument *pStatelessConstMemObjKernelArg) {
    uint32_t argNum = pStatelessConstMemObjKernelArg->ArgumentNumber;
    uint32_t offsetSSH = pStatelessConstMemObjKernelArg->SurfaceStateHeapOffset;

    usesSsh |= true;
    storeKernelArgPatchInfo(argNum, pStatelessConstMemObjKernelArg->DataParamSize, pStatelessConstMemObjKernelArg->DataParamOffset, 0, offsetSSH);
    kernelArgInfo[argNum].isBuffer = true;
    kernelArgInfo[argNum].isReadOnly = true;
    patchInfo.statelessGlobalMemObjKernelArgs.push_back(reinterpret_cast<const SPatchStatelessGlobalMemoryObjectKernelArgument *>(pStatelessConstMemObjKernelArg));
}

void KernelInfo::storeKernelArgument(const SPatchStatelessDeviceQueueKernelArgument *pStatelessDeviceQueueKernelArg) {
    uint32_t argNum = pStatelessDeviceQueueKernelArg->ArgumentNumber;

    resizeKernelArgInfoAndRegisterParameter(argNum);
    kernelArgInfo[argNum].isDeviceQueue = true;

    storeKernelArgPatchInfo(argNum, pStatelessDeviceQueueKernelArg->DataParamSize, pStatelessDeviceQueueKernelArg->DataParamOffset, 0, pStatelessDeviceQueueKernelArg->SurfaceStateHeapOffset);
}

void KernelInfo::storePatchToken(
    const SPatchAllocateStatelessPrivateSurface *pStatelessPrivateSurfaceArg) {
    usesSsh |= true;
    patchInfo.pAllocateStatelessPrivateSurface = pStatelessPrivateSurfaceArg;
}

void KernelInfo::storePatchToken(const SPatchAllocateStatelessConstantMemorySurfaceWithInitialization *pStatelessConstantMemorySurfaceWithInitializationArg) {
    usesSsh |= true;
    patchInfo.pAllocateStatelessConstantMemorySurfaceWithInitialization = pStatelessConstantMemorySurfaceWithInitializationArg;
}

void KernelInfo::storePatchToken(const SPatchAllocateStatelessGlobalMemorySurfaceWithInitialization *pStatelessGlobalMemorySurfaceWithInitializationArg) {
    usesSsh |= true;
    patchInfo.pAllocateStatelessGlobalMemorySurfaceWithInitialization = pStatelessGlobalMemorySurfaceWithInitializationArg;
}

void KernelInfo::storePatchToken(const SPatchAllocateStatelessPrintfSurface *pStatelessPrintfSurfaceArg) {
    usesSsh |= true;
    patchInfo.pAllocateStatelessPrintfSurface = pStatelessPrintfSurfaceArg;
}

void KernelInfo::storePatchToken(const SPatchAllocateStatelessEventPoolSurface *pStatelessEventPoolSurfaceArg) {
    usesSsh |= true;
    patchInfo.pAllocateStatelessEventPoolSurface = pStatelessEventPoolSurfaceArg;
}
void KernelInfo::storePatchToken(const SPatchAllocateStatelessDefaultDeviceQueueSurface *pStatelessDefaultDeviceQueueSurfaceArg) {
    usesSsh |= true;
    patchInfo.pAllocateStatelessDefaultDeviceQueueSurface = pStatelessDefaultDeviceQueueSurfaceArg;
}

void KernelInfo::storePatchToken(const SPatchString *pStringArg) {
    uint32_t stringIndex = pStringArg->Index;
    if (pStringArg->StringSize > 0) {
        const char *stringData = reinterpret_cast<const char *>(pStringArg + 1);
        patchInfo.stringDataMap.emplace(stringIndex, std::string(stringData, stringData + pStringArg->StringSize));
    }
}

void KernelInfo::storePatchToken(const SPatchKernelAttributesInfo *pKernelAttributesInfo) {
    this->patchInfo.pKernelAttributesInfo = pKernelAttributesInfo;
    attributes = reinterpret_cast<const char *>(pKernelAttributesInfo) + sizeof(SPatchKernelAttributesInfo);

    auto start = attributes.find("intel_reqd_sub_group_size(");
    if (start != std::string::npos) {
        start += strlen("intel_reqd_sub_group_size(");
        auto stop = attributes.find(")", start);
        std::stringstream requiredSubGroupSizeStr(attributes.substr(start, stop - start));
        requiredSubGroupSizeStr >> requiredSubGroupSize;
    }
}

void KernelInfo::storePatchToken(const SPatchAllocateSystemThreadSurface *pSystemThreadSurface) {
    usesSsh |= true;
    patchInfo.pAllocateSystemThreadSurface = pSystemThreadSurface;
}

void KernelInfo::storePatchToken(const SPatchAllocateSyncBuffer *pAllocateSyncBuffer) {
    usesSsh |= true;
    patchInfo.pAllocateSyncBuffer = pAllocateSyncBuffer;
}

void KernelInfo::storeKernelArgPatchInfo(uint32_t argNum, uint32_t dataSize, uint32_t dataOffset, uint32_t sourceOffset, uint32_t offsetSSH) {
    resizeKernelArgInfoAndRegisterParameter(argNum);

    KernelArgPatchInfo kernelArgPatchInfo;
    kernelArgPatchInfo.crossthreadOffset = dataOffset;
    kernelArgPatchInfo.size = dataSize;
    kernelArgPatchInfo.sourceOffset = sourceOffset;

    kernelArgInfo[argNum].kernelArgPatchInfoVector.push_back(kernelArgPatchInfo);
    kernelArgInfo[argNum].offsetHeap = offsetSSH;
}

size_t KernelInfo::getSamplerStateArrayCount() const {
    size_t count = patchInfo.samplerStateArray ? (size_t)patchInfo.samplerStateArray->Count : 0;
    return count;
}

size_t KernelInfo::getSamplerStateArraySize(const HardwareInfo &hwInfo) const {
    size_t samplerStateArraySize = getSamplerStateArrayCount() * Sampler::getSamplerStateSize(hwInfo);
    return samplerStateArraySize;
}

size_t KernelInfo::getBorderColorStateSize() const {
    size_t borderColorSize = 0;
    if (patchInfo.samplerStateArray) {
        borderColorSize = patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset;
    }
    return borderColorSize;
}

size_t KernelInfo::getBorderColorOffset() const {
    size_t borderColorOffset = 0;
    if (patchInfo.samplerStateArray) {
        borderColorOffset = patchInfo.samplerStateArray->BorderColorOffset;
    }
    return borderColorOffset;
}

uint32_t KernelInfo::getConstantBufferSize() const {
    return patchInfo.dataParameterStream ? patchInfo.dataParameterStream->DataParameterStreamSize : 0;
}
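// Allocates the graphics memory that will hold the kernel ISA and copies the kernel heap into it.
// When the allocation lands in local memory and the platform requires it, the copy is done through
// the blitter instead of a CPU copy.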
bool KernelInfo::createKernelAllocation(const Device &device) {
    UNRECOVERABLE_IF(kernelAllocation);
    auto kernelIsaSize = heapInfo.KernelHeapSize;
    kernelAllocation = device.getMemoryManager()->allocateGraphicsMemoryWithProperties({device.getRootDeviceIndex(), kernelIsaSize, GraphicsAllocation::AllocationType::KERNEL_ISA, device.getDeviceBitfield()});
    if (!kernelAllocation) {
        return false;
    }

    auto &hwInfo = device.getHardwareInfo();
    auto &hwHelper = HwHelper::get(hwInfo.platform.eRenderCoreFamily);
    if (kernelAllocation->isAllocatedInLocalMemoryPool() && hwHelper.isBlitCopyRequiredForLocalMemory(hwInfo)) {
        auto status = BlitHelperFunctions::blitMemoryToAllocation(device, kernelAllocation, 0, heapInfo.pKernelHeap,
                                                                  {kernelIsaSize, 1, 1});
        return (status == BlitOperationResult::Success);
    }
    return device.getMemoryManager()->copyMemoryToAllocation(kernelAllocation, heapInfo.pKernelHeap, kernelIsaSize);
}

void KernelInfo::apply(const DeviceInfoKernelPayloadConstants &constants) {
    if (nullptr == this->crossThreadData) {
        return;
    }

    uint32_t privateMemoryStatelessSizeOffset = this->workloadInfo.privateMemoryStatelessSizeOffset;
    uint32_t localMemoryStatelessWindowSizeOffset = this->workloadInfo.localMemoryStatelessWindowSizeOffset;
    uint32_t localMemoryStatelessWindowStartAddressOffset = this->workloadInfo.localMemoryStatelessWindowStartAddressOffset;

    if (localMemoryStatelessWindowStartAddressOffset != WorkloadInfo::undefinedOffset) {
        *(uintptr_t *)&(this->crossThreadData[localMemoryStatelessWindowStartAddressOffset]) = reinterpret_cast<uintptr_t>(constants.slmWindow);
    }

    if (localMemoryStatelessWindowSizeOffset != WorkloadInfo::undefinedOffset) {
        *(uint32_t *)&(this->crossThreadData[localMemoryStatelessWindowSizeOffset]) = constants.slmWindowSize;
    }

    uint32_t privateMemorySize = 0U;
    if (this->patchInfo.pAllocateStatelessPrivateSurface) {
        privateMemorySize = this->patchInfo.pAllocateStatelessPrivateSurface->PerThreadPrivateMemorySize *
                            constants.computeUnitsUsedForScratch * this->getMaxSimdSize();
    }

    if (privateMemoryStatelessSizeOffset != WorkloadInfo::undefinedOffset) {
        *(uint32_t *)&(this->crossThreadData[privateMemoryStatelessSizeOffset]) = privateMemorySize;
    }

    if (this->workloadInfo.maxWorkGroupSizeOffset != WorkloadInfo::undefinedOffset) {
        *(uint32_t *)&(this->crossThreadData[this->workloadInfo.maxWorkGroupSizeOffset]) = constants.maxWorkGroupSize;
    }
}

std::string concatenateKernelNames(ArrayRef<const KernelInfo *> kernelInfos) {
    std::string semiColonDelimitedKernelNameStr;

    for (const auto &kernelInfo : kernelInfos) {
        if (!semiColonDelimitedKernelNameStr.empty()) {
            semiColonDelimitedKernelNameStr += ';';
        }
        semiColonDelimitedKernelNameStr += kernelInfo->name;
    }

    return semiColonDelimitedKernelNameStr;
}

} // namespace NEO