/*
 * Copyright (C) 2019-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/encode_surface_state.h"
#include "shared/source/command_stream/csr_definitions.h"
#include "shared/source/command_stream/preemption.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/address_patch.h"
#include "shared/source/helpers/aligned_memory.h"
#include "shared/source/helpers/basic_math.h"
#include "shared/source/helpers/gfx_core_helper.h"
#include "shared/source/helpers/hw_info.h"
#include "shared/source/helpers/local_id_gen.h"
#include "shared/source/indirect_heap/indirect_heap.h"
#include "shared/source/kernel/dispatch_kernel_encoder_interface.h"
#include "shared/source/kernel/implicit_args.h"

#include "opencl/source/cl_device/cl_device.h"
#include "opencl/source/context/context.h"
#include "opencl/source/helpers/dispatch_info.h"
#include "opencl/source/kernel/kernel.h"

namespace NEO {

// NOTE(review): the explicit template parameter/argument lists in this file
// (`template <typename GfxFamily>`, `Encode...<GfxFamily>`, the target types of
// `static_cast`, `std::array<uint8_t, 3>`, `Vec3<uint16_t>`, ...) were
// reconstructed from how the values are used - confirm each one against the
// declarations in the corresponding headers.

// Returns the number of dynamic state heap bytes required by `kernel`.
// Without samplers only additionalSizeRequiredDsh() is accounted for; with
// samplers the cache-line aligned border color state and one SAMPLER_STATE per
// sampler are added. The result is always rounded up to a cache line.
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredDSH(const Kernel &kernel) {
    constexpr auto samplerStateSize = sizeof(typename GfxFamily::SAMPLER_STATE);
    constexpr auto maxIndirectSamplerStateSize = alignUp(sizeof(typename GfxFamily::SAMPLER_BORDER_COLOR_STATE), MemoryConstants::cacheLineSize);
    const auto numSamplers = kernel.getKernelInfo().kernelDescriptor.payloadMappings.samplerTable.numSamplers;
    if (numSamplers == 0U) {
        return alignUp(additionalSizeRequiredDsh(), MemoryConstants::cacheLineSize);
    }

    auto calculatedTotalSize = alignUp(maxIndirectSamplerStateSize + numSamplers * samplerStateSize + additionalSizeRequiredDsh(),
                                       MemoryConstants::cacheLineSize);
    DEBUG_BREAK_IF(calculatedTotalSize > kernel.getDynamicStateHeapSize());
    return calculatedTotalSize;
}

// Returns the number of indirect object heap bytes required to dispatch
// `kernel` with the given local work size: cross-thread data, the total
// per-thread data and, when the kernel has implicit args, the size needed to
// patch them. The sum is aligned to INDIRECTDATASTARTADDRESS_ALIGN_SIZE.
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredIOH(const Kernel &kernel,
                                                             const size_t localWorkSizes[3]) {
    auto localWorkSize = Math::computeTotalElementsCount(localWorkSizes);
    using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;
    const auto &kernelDescriptor = kernel.getDescriptor();
    const auto &hwInfo = kernel.getHardwareInfo();
    const auto &gfxCoreHelper = kernel.getGfxCoreHelper();

    auto numChannels = kernelDescriptor.kernelAttributes.numLocalIdChannels;
    uint32_t grfSize = hwInfo.capabilityTable.grfSize;
    auto simdSize = kernelDescriptor.kernelAttributes.simdSize;
    uint32_t requiredWalkOrder = 0u;
    auto isHwLocalIdGeneration = !NEO::EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
        numChannels,
        localWorkSizes,
        std::array<uint8_t, 3>{
            {kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
             kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
             kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
        kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
        requiredWalkOrder,
        simdSize);
    auto size = kernel.getCrossThreadDataSize() +
                getPerThreadDataSizeTotal(simdSize, grfSize, numChannels, localWorkSize, isHwLocalIdGeneration, gfxCoreHelper);

    auto pImplicitArgs = kernel.getImplicitArgs();
    if (pImplicitArgs) {
        size += ImplicitArgsHelper::getSizeForImplicitArgsPatching(pImplicitArgs, kernelDescriptor, isHwLocalIdGeneration, gfxCoreHelper);
    }
    return alignUp(size, WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
}

// Returns the kernel's surface state heap size, padded by
// SURFACESTATEPOINTER_ALIGN_SIZE when the kernel uses any SSH at all.
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::getSizeRequiredSSH(const Kernel &kernel) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
    auto sizeSSH = kernel.getSurfaceStateHeapSize();
    sizeSSH += sizeSSH ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0;
    return sizeSSH;
}

// Sums getSize(dispatchInfo, args...) over every dispatch in
// `multiDispatchInfo`. The running total is aligned to a cache line before each
// dispatch is added and the final result is aligned to a page.
template <typename SizeGetterT, typename... ArgsT>
size_t getSizeRequired(const MultiDispatchInfo &multiDispatchInfo, SizeGetterT &&getSize, ArgsT... args) {
    size_t totalSize = 0;
    for (const auto &dispatchInfo : multiDispatchInfo) {
        totalSize = alignUp(totalSize, MemoryConstants::cacheLineSize);
        // `args` is reused on every iteration, so it is passed as lvalues:
        // forwarding (moving) it here would hand moved-from values to every
        // dispatch after the first one.
        totalSize += getSize(dispatchInfo, args...);
    }
    return alignUp(totalSize, MemoryConstants::pageSize);
}

// Page-aligned dynamic state heap size required by all dispatches.
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(
    const MultiDispatchInfo &multiDispatchInfo) {
    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredDSH(*dispatchInfo.getKernel()); });
}

// Page-aligned indirect object heap size required by all dispatches; each
// dispatch contributes according to its own local work-group size.
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
    const MultiDispatchInfo &multiDispatchInfo) {
    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) {
        return getSizeRequiredIOH(
            *dispatchInfo.getKernel(),
            dispatchInfo.getLocalWorkgroupSize().values);
    });
}

// Page-aligned surface state heap size required by all dispatches.
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(
    const MultiDispatchInfo &multiDispatchInfo) {
    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel()); });
}

// Builds an INTERFACE_DESCRIPTOR_DATA for `kernel` starting from
// GfxFamily::cmdInitInterfaceDescriptorData and stores it at the location
// returned by getInterfaceDescriptor() for the given heap, offset and inline
// descriptor.
//
// `walkerCmd` is dereferenced unconditionally and therefore must not be null.
// Returns `offsetInterfaceDescriptor` converted to size_t.
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
    const IndirectHeap &indirectHeap,
    uint64_t offsetInterfaceDescriptor,
    uint64_t kernelStartOffset,
    size_t sizeCrossThreadData,
    size_t sizePerThreadData,
    size_t bindingTablePointer,
    [[maybe_unused]] size_t offsetSamplerState,
    uint32_t numSamplers,
    const uint32_t threadGroupCount,
    uint32_t threadsPerThreadGroup,
    const Kernel &kernel,
    uint32_t bindingTablePrefetchSize,
    PreemptionMode preemptionMode,
    INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
    const Device &device,
    WALKER_TYPE *walkerCmd) {
    using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
    using SHARED_LOCAL_MEMORY_SIZE = typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE;

    const auto &hardwareInfo = device.getHardwareInfo();
    const auto &kernelDescriptor = kernel.getKernelInfo().kernelDescriptor;

    // Destination of the descriptor; the descriptor itself is assembled in a
    // local copy and written out in one assignment at the end.
    auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
    auto interfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;

    // Program the kernel start pointer (only the low 32 bits of the offset are kept)
    interfaceDescriptor.setKernelStartPointer(static_cast<uint32_t>(kernelStartOffset & std::numeric_limits<uint32_t>::max()));

    // # of threads in thread group should be based on LWS.
    interfaceDescriptor.setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);

    auto slmTotalSize = kernel.getSlmTotalSize();

    EncodeDispatchKernel<GfxFamily>::setGrfInfo(&interfaceDescriptor, kernelDescriptor.kernelAttributes.numGrfRequired,
                                                sizeCrossThreadData, sizePerThreadData, hardwareInfo);
    auto &productHelper = device.getProductHelper();
    productHelper.updateIddCommand(&interfaceDescriptor, kernelDescriptor.kernelAttributes.numGrfRequired,
                                   kernelDescriptor.kernelAttributes.threadArbitrationPolicy);

    EncodeDispatchKernel<GfxFamily>::appendAdditionalIDDFields(&interfaceDescriptor, device.getRootDeviceEnvironment(),
                                                               threadsPerThreadGroup, slmTotalSize, SlmPolicy::SlmPolicyNone);

    interfaceDescriptor.setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));

    // The sampler state pointer is programmed only for families that support
    // samplers and devices that report image support.
    if constexpr (GfxFamily::supportsSampler) {
        if (device.getDeviceInfo().imageSupport) {
            interfaceDescriptor.setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));
        }
    }

    EncodeDispatchKernel<GfxFamily>::adjustBindingTablePrefetch(interfaceDescriptor, numSamplers, bindingTablePrefetchSize);

    auto &gfxCoreHelper = device.getGfxCoreHelper();
    auto programmableIDSLMSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(gfxCoreHelper.computeSlmValues(hardwareInfo, slmTotalSize));

    // A debug flag value other than -1 overrides the computed SLM size encoding.
    if (DebugManager.flags.OverrideSlmAllocationSize.get() != -1) {
        programmableIDSLMSize = static_cast<SHARED_LOCAL_MEMORY_SIZE>(DebugManager.flags.OverrideSlmAllocationSize.get());
    }

    interfaceDescriptor.setSharedLocalMemorySize(programmableIDSLMSize);
    EncodeDispatchKernel<GfxFamily>::programBarrierEnable(interfaceDescriptor,
                                                          kernelDescriptor.kernelAttributes.barrierCount,
                                                          hardwareInfo);

    PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(&interfaceDescriptor, preemptionMode);

    EncodeDispatchKernel<GfxFamily>::adjustInterfaceDescriptorData(interfaceDescriptor, device, hardwareInfo, threadGroupCount,
                                                                   kernelDescriptor.kernelAttributes.numGrfRequired, *walkerCmd);

    *pInterfaceDescriptor = interfaceDescriptor;
    return static_cast<size_t>(offsetInterfaceDescriptor);
}

// When local ids are generated by the runtime, reserves space for them in
// `ioh`, fills it through kernel.setLocalIdsForGroup() and reports the
// per-thread and per-group sizes through the two output references.
// Does nothing (outputs untouched) when `localIdsGenerationByRuntime` is false.
template <typename GfxFamily>
void HardwareCommandsHelper<GfxFamily>::programPerThreadData(
    bool localIdsGenerationByRuntime,
    size_t &sizePerThreadData,
    size_t &sizePerThreadDataTotal,
    LinearStream &ioh,
    const Kernel &kernel,
    const size_t localWorkSize[3]) {
    if (localIdsGenerationByRuntime) {
        Vec3<uint16_t> group = {static_cast<uint16_t>(localWorkSize[0]),
                                static_cast<uint16_t>(localWorkSize[1]),
                                static_cast<uint16_t>(localWorkSize[2])};
        sizePerThreadData = kernel.getLocalIdsSizePerThread();
        sizePerThreadDataTotal = kernel.getLocalIdsSizeForGroup(group);
        auto dest = ioh.getSpace(sizePerThreadDataTotal);
        kernel.setLocalIdsForGroup(group, dest);
    }
}

// Emits all indirect state for one kernel dispatch: binding table / surface
// states (SSH), sampler state (DSH), cross-thread and per-thread data (IOH),
// the interface descriptor and the media state flush, then programs the
// indirect data address/length and interface descriptor offset on `walkerCmd`.
//
// Returns the offset of the cross-thread data as reported by
// sendCrossThreadData().
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::sendIndirectState(
    LinearStream &commandStream,
    IndirectHeap &dsh,
    IndirectHeap &ioh,
    IndirectHeap &ssh,
    Kernel &kernel,
    uint64_t kernelStartOffset,
    uint32_t simd,
    const size_t localWorkSize[3],
    const uint32_t threadGroupCount,
    const uint64_t offsetInterfaceDescriptorTable,
    uint32_t &interfaceDescriptorIndex,
    PreemptionMode preemptionMode,
    WALKER_TYPE *walkerCmd,
    INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
    bool localIdsGenerationByRuntime,
    const Device &device) {

    using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;

    DEBUG_BREAK_IF(simd != 1 && simd != 8 && simd != 16 && simd != 32);
    auto inlineDataProgrammingRequired = HardwareCommandsHelper::inlineDataProgrammingRequired(kernel);

    const auto &kernelInfo = kernel.getKernelInfo();

    // Binding table and surface states
    ssh.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

    size_t dstBindingTablePointer = HardwareCommandsHelper::checkForAdditionalBTAndSetBTPointer(ssh, kernel);

    // Copy our sampler state if it exists
    const auto &samplerTable = kernelInfo.kernelDescriptor.payloadMappings.samplerTable;
    uint32_t samplerCount = 0;
    uint32_t samplerStateOffset = 0;
    if (isValidOffset(samplerTable.tableOffset) && isValidOffset(samplerTable.borderColor)) {
        samplerCount = samplerTable.numSamplers;
        samplerStateOffset = EncodeStates<GfxFamily>::copySamplerState(&dsh, samplerTable.tableOffset,
                                                                       samplerCount, samplerTable.borderColor,
                                                                       kernel.getDynamicStateHeap(), device.getBindlessHeapsHelper(),
                                                                       device.getRootDeviceEnvironment());
    }

    auto &gfxCoreHelper = device.getGfxCoreHelper();
    auto grfSize = kernel.getDescriptor().kernelAttributes.numGrfRequired;
    auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2];
    auto threadsPerThreadGroup = gfxCoreHelper.calculateNumThreadsPerThreadGroup(simd, static_cast<uint32_t>(localWorkItems),
                                                                                 grfSize, !localIdsGenerationByRuntime);

    // Passed by name because sendCrossThreadData() receives it together with
    // `walkerCmd` and the inline-data flag.
    // NOTE(review): presumably it may adjust the size when inline data is used - confirm.
    uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize();

    size_t offsetCrossThreadData = HardwareCommandsHelper::sendCrossThreadData(
        ioh, kernel, inlineDataProgrammingRequired,
        walkerCmd, sizeCrossThreadData);

    size_t sizePerThreadDataTotal = 0;
    size_t sizePerThreadData = 0;

    HardwareCommandsHelper::programPerThreadData(
        localIdsGenerationByRuntime,
        sizePerThreadData,
        sizePerThreadDataTotal,
        ioh,
        kernel,
        localWorkSize);

    uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA);

    // Prefetch is capped at 31 binding table entries.
    uint32_t bindingTablePrefetchSize = 0u;
    if (EncodeSurfaceState<GfxFamily>::doBindingTablePrefetch()) {
        bindingTablePrefetchSize = std::min(31u, static_cast<uint32_t>(kernel.getNumberOfBindingTableStates()));
    }

    HardwareCommandsHelper::sendInterfaceDescriptorData(
        dsh,
        offsetInterfaceDescriptor,
        kernelStartOffset,
        sizeCrossThreadData,
        sizePerThreadData,
        dstBindingTablePointer,
        samplerStateOffset,
        samplerCount,
        threadGroupCount,
        threadsPerThreadGroup,
        kernel,
        bindingTablePrefetchSize,
        preemptionMode,
        inlineInterfaceDescriptor,
        device,
        walkerCmd);

    if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) {
        PatchInfoData patchInfoData(kernelStartOffset, 0, PatchInfoAllocationType::InstructionHeap,
                                    dsh.getGraphicsAllocation()->getGpuAddress(), offsetInterfaceDescriptor,
                                    PatchInfoAllocationType::DynamicStateHeap);
        kernel.getPatchInfoDataList().push_back(patchInfoData);
    }

    // Program media state flush to set interface descriptor offset
    sendMediaStateFlush(
        commandStream,
        interfaceDescriptorIndex);

    DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0);
    walkerCmd->setIndirectDataStartAddress(static_cast<uint32_t>(offsetCrossThreadData));
    setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex);

    auto indirectDataLength = alignUp(static_cast<uint32_t>(sizeCrossThreadData + sizePerThreadDataTotal),
                                      WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
    walkerCmd->setIndirectDataLength(indirectDataLength);

    return offsetCrossThreadData;
}

// Returns the kernel's passInlineData flag, unless the EnablePassInlineData
// debug flag (any value other than -1) disables the check, in which case the
// result is false.
template <typename GfxFamily>
bool HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(const Kernel &kernel) {
    auto checkKernelForInlineData = true;
    if (DebugManager.flags.EnablePassInlineData.get() != -1) {
        checkKernelForInlineData = !!DebugManager.flags.EnablePassInlineData.get();
    }
    if (checkKernelForInlineData) {
        return kernel.getKernelInfo().kernelDescriptor.kernelAttributes.flags.passInlineData;
    }
    return false;
}

// True when the kernel descriptor declares at least one local id channel.
template <typename GfxFamily>
bool HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(const Kernel &kernel) {
    return kernel.getKernelInfo().kernelDescriptor.kernelAttributes.numLocalIdChannels > 0;
}

// Pushes the kernel's binding table and surface states into `ssh` and returns
// the resulting binding table pointer. Nothing is pushed and 0 is returned when
// GTPin is not initialized and the kernel's binding table has no entries.
template <typename GfxFamily>
size_t HardwareCommandsHelper<GfxFamily>::checkForAdditionalBTAndSetBTPointer(IndirectHeap &ssh, const Kernel &kernel) {
    const auto &kernelInfo = kernel.getKernelInfo();
    if (false == isGTPinInitialized && 0u == kernelInfo.kernelDescriptor.payloadMappings.bindingTable.numEntries) {
        return 0u;
    }
    return EncodeSurfaceState<GfxFamily>::pushBindingTableAndSurfaceStates(ssh,
                                                                          kernel.getSurfaceStateHeap(), kernel.getSurfaceStateHeapSize(),
                                                                          kernel.getNumberOfBindingTableStates(), kernel.getBindingTableOffset());
}

} // namespace NEO