/*
 * Copyright (C) 2017-2019 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once
#include "runtime/built_ins/built_ins.h"
#include "runtime/device/device.h"
#include "runtime/device_queue/device_queue.h"
#include "runtime/helpers/per_thread_data.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/kernel/kernel.h"
#include "runtime/scheduler/scheduler_kernel.h"

#include <algorithm>
#include <cstddef>
#include <cstdint>

namespace NEO {

class LinearStream;
class IndirectHeap;
struct CrossThreadInfo;
struct MultiDispatchInfo;

template <typename GfxFamily>
using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;

template <typename GfxFamily>
struct HardwareCommandsHelper : public PerThreadDataHelper {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
    using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE;
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using MI_ATOMIC = typename GfxFamily::MI_ATOMIC;

    static uint32_t alignSlmSize(uint32_t slmSize);
    static uint32_t computeSlmValues(uint32_t slmSize);

    static INTERFACE_DESCRIPTOR_DATA *getInterfaceDescriptor(
        const IndirectHeap &indirectHeap,
        uint64_t offsetInterfaceDescriptor,
        INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);

    static void setAdditionalInfo(
        INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor,
        const Kernel &kernel,
        const size_t &sizeCrossThreadData,
        const size_t &sizePerThreadData,
        const uint32_t threadsPerThreadGroup);

    inline static uint32_t additionalSizeRequiredDsh();

    static size_t sendInterfaceDescriptorData(
        const IndirectHeap &indirectHeap,
        uint64_t offsetInterfaceDescriptor,
        uint64_t kernelStartOffset,
        size_t sizeCrossThreadData,
        size_t sizePerThreadData,
        size_t bindingTablePointer,
        size_t offsetSamplerState,
        uint32_t numSamplers,
        uint32_t threadsPerThreadGroup,
        const Kernel &kernel,
        uint32_t bindingTablePrefetchSize,
        PreemptionMode preemptionMode,
        INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor);

    static void sendMediaStateFlush(
        LinearStream &commandStream,
        size_t offsetInterfaceDescriptorData);

    static void sendMediaInterfaceDescriptorLoad(
        LinearStream &commandStream,
        size_t offsetInterfaceDescriptorData,
        size_t sizeInterfaceDescriptorData);

    static size_t sendCrossThreadData(
        IndirectHeap &indirectHeap,
        Kernel &kernel,
        bool inlineDataProgrammingRequired,
        WALKER_TYPE<GfxFamily> *walkerCmd,
        uint32_t &sizeCrossThreadData);

    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo,
                                                   const void *srcKernelSsh, size_t srcKernelSshSize,
                                                   size_t numberOfBindingTableStates, size_t offsetOfBindingTable);
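    // The two inline overloads below are thin conveniences: they pull the surface
    // state heap pointer/size and the binding table count/offset out of a KernelInfo
    // or a Kernel and forward to the full form above. A minimal sketch of a call,
    // assuming `ssh` is the caller's destination surface state heap and `kernel` an
    // already-built Kernel (neither is provided by this header):
    //
    //   size_t bindingTablePointer =
    //       HardwareCommandsHelper<GfxFamily>::pushBindingTableAndSurfaceStates(ssh, kernel);
    //
    // The return value is the offset of the copied binding table within dstHeap.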
    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo) {
        return pushBindingTableAndSurfaceStates(dstHeap, srcKernelInfo,
                                                srcKernelInfo.heapInfo.pSsh,
                                                srcKernelInfo.heapInfo.pKernelHeader->SurfaceStateHeapSize,
                                                (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Count : 0,
                                                (srcKernelInfo.patchInfo.bindingTableState != nullptr) ? srcKernelInfo.patchInfo.bindingTableState->Offset : 0);
    }

    static size_t pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const Kernel &srcKernel) {
        return pushBindingTableAndSurfaceStates(dstHeap, srcKernel.getKernelInfo(),
                                                srcKernel.getSurfaceStateHeap(), srcKernel.getSurfaceStateHeapSize(),
                                                srcKernel.getNumberOfBindingTableStates(), srcKernel.getBindingTableOffset());
    }

    static size_t sendIndirectState(
        LinearStream &commandStream,
        IndirectHeap &dsh,
        IndirectHeap &ioh,
        IndirectHeap &ssh,
        Kernel &kernel,
        uint32_t simd,
        const size_t localWorkSize[3],
        const uint64_t offsetInterfaceDescriptorTable,
        uint32_t &interfaceDescriptorIndex,
        PreemptionMode preemptionMode,
        WALKER_TYPE<GfxFamily> *walkerCmd,
        INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor,
        bool localIdsGenerationByRuntime,
        bool isCcsUsed);

    static void programPerThreadData(
        size_t &sizePerThreadData,
        const bool &localIdsGenerationByRuntime,
        LinearStream &ioh,
        uint32_t &simd,
        uint32_t &numChannels,
        const size_t localWorkSize[3],
        Kernel &kernel,
        size_t &sizePerThreadDataTotal,
        size_t &localWorkItems);

    static void updatePerThreadDataTotal(
        size_t &sizePerThreadData,
        uint32_t &simd,
        uint32_t &numChannels,
        size_t &sizePerThreadDataTotal,
        size_t &localWorkItems);

    inline static bool resetBindingTablePrefetch(Kernel &kernel);

    static void setKernelStartOffset(
        uint64_t &kernelStartOffset,
        bool kernelAllocation,
        const KernelInfo &kernelInfo,
        const bool &localIdsGenerationByRuntime,
        const bool &kernelUsesLocalIds,
        Kernel &kernel,
        bool isCssUsed);

    static size_t getSizeRequiredCS(const Kernel *kernel);
    static size_t getSizeRequiredForCacheFlush(const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress);
    static bool isPipeControlWArequired(const HardwareInfo &hwInfo);
    static bool isPipeControlPriorToPipelineSelectWArequired(const HardwareInfo &hwInfo);

    static size_t getSizeRequiredDSH(
        const Kernel &kernel);
    static size_t getSizeRequiredIOH(
        const Kernel &kernel,
        size_t localWorkSize = 256);
    static size_t getSizeRequiredSSH(
        const Kernel &kernel);

    static size_t getTotalSizeRequiredDSH(
        const MultiDispatchInfo &multiDispatchInfo);
    static size_t getTotalSizeRequiredIOH(
        const MultiDispatchInfo &multiDispatchInfo);
    static size_t getTotalSizeRequiredSSH(
        const MultiDispatchInfo &multiDispatchInfo);

    static size_t getSizeRequiredForExecutionModel(IndirectHeap::Type heapType, const Kernel &kernel) {
        typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE;

        size_t totalSize = 0;
        BlockKernelManager *blockManager = kernel.getProgram()->getBlockKernelManager();
        uint32_t blockCount = static_cast<uint32_t>(blockManager->getCount());
        uint32_t maxBindingTableCount = 0;

        if (heapType == IndirectHeap::SURFACE_STATE) {
            totalSize = BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE - 1;

            for (uint32_t i = 0; i < blockCount; i++) {
                const KernelInfo *pBlockInfo = blockManager->getBlockKernelInfo(i);
                totalSize += pBlockInfo->heapInfo.pKernelHeader->SurfaceStateHeapSize;
                totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);

                maxBindingTableCount = std::max(maxBindingTableCount, pBlockInfo->patchInfo.bindingTableState->Count);
            }
        }

        if (heapType == IndirectHeap::INDIRECT_OBJECT || heapType == IndirectHeap::SURFACE_STATE) {
            BuiltIns &builtIns = *kernel.getDevice().getExecutionEnvironment()->getBuiltIns();
            SchedulerKernel &scheduler = builtIns.getSchedulerKernel(kernel.getContext());

            if (heapType == IndirectHeap::INDIRECT_OBJECT) {
                totalSize += getSizeRequiredIOH(scheduler);
            } else {
                totalSize += getSizeRequiredSSH(scheduler);
                totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE) * DeviceQueue::interfaceDescriptorEntries;
                totalSize = alignUp(totalSize, BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE);
            }
        }

        return totalSize;
    }
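    // Illustrative walk-through of getSizeRequiredForExecutionModel() for the
    // SURFACE_STATE heap, using made-up numbers; the real alignment, binding table
    // entry size and DeviceQueue::interfaceDescriptorEntries are family/queue specific:
    //
    //   alignment          = 64 (assumed SURFACESTATEPOINTER_ALIGN_SIZE)
    //   start              : totalSize = 64 - 1 = 63
    //   block 0, SSH 200 B : totalSize = alignUp(63 + 200, 64)  = 320
    //   block 1, SSH 100 B : totalSize = alignUp(320 + 100, 64) = 448
    //   scheduler SSH      : totalSize += getSizeRequiredSSH(scheduler)
    //   binding tables     : totalSize += maxBindingTableCount * sizeof(BINDING_TABLE_STATE)
    //                                   * DeviceQueue::interfaceDescriptorEntries
    //   final              : totalSize = alignUp(totalSize, 64)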
    static void setInterfaceDescriptorOffset(
        WALKER_TYPE<GfxFamily> *walkerCmd,
        uint32_t &interfaceDescriptorIndex);

    static void programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t compareData);
    static void programMiFlushDw(LinearStream &commandStream, uint64_t immediateDataGpuAddress, uint64_t immediateData);
    static void appendMiFlushDw(typename GfxFamily::MI_FLUSH_DW *miFlushDwCmd);
    static MI_ATOMIC *programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize);
    static void programMiAtomic(MI_ATOMIC &atomic, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize);
    static void programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress);

    static void programBarrierEnable(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, uint32_t value, const HardwareInfo &hwInfo);
    static void adjustInterfaceDescriptorData(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const HardwareInfo &hwInfo);

    static const size_t alignInterfaceDescriptorData = 64 * sizeof(uint8_t);
    static const uint32_t alignIndirectStatePointer = 64 * sizeof(uint8_t);

    static bool doBindingTablePrefetch();

    static bool inlineDataProgrammingRequired(const Kernel &kernel);
    static bool kernelUsesLocalIds(const Kernel &kernel);
};

} // namespace NEO
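// A minimal sketch of how an enqueue path might drive this helper. Names such as
// `multiDispatchInfo`, `commandStream`, `dsh`, `ioh`, `ssh`, `kernel`, `walkerCmd`,
// `simd`, `localWorkSize`, `offsetInterfaceDescriptorTable`, `preemptionMode`,
// `localIdsGenerationByRuntime` and `isCcsUsed` are assumed caller-side objects,
// not provided by this header; this illustrates the intended call order under those
// assumptions, not the runtime's actual enqueue code.
//
//   using Helper = NEO::HardwareCommandsHelper<GfxFamily>;
//
//   // Query heap requirements before programming any state.
//   size_t dshSize = Helper::getTotalSizeRequiredDSH(multiDispatchInfo);
//   size_t iohSize = Helper::getTotalSizeRequiredIOH(multiDispatchInfo);
//   size_t sshSize = Helper::getTotalSizeRequiredSSH(multiDispatchInfo);
//
//   // Program DSH/IOH/SSH contents and the walker for one kernel dispatch.
//   uint32_t interfaceDescriptorIndex = 0;
//   Helper::sendIndirectState(commandStream, dsh, ioh, ssh, kernel, simd, localWorkSize,
//                             offsetInterfaceDescriptorTable, interfaceDescriptorIndex,
//                             preemptionMode, walkerCmd, nullptr,
//                             localIdsGenerationByRuntime, isCcsUsed);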