/* * Copyright (c) 2017 - 2018, Intel Corporation * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), * to deal in the Software without restriction, including without limitation * the rights to use, copy, modify, merge, publish, distribute, sublicense, * and/or sell copies of the Software, and to permit persons to whom the * Software is furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included * in all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR * OTHER DEALINGS IN THE SOFTWARE. 
*/ #include "runtime/command_queue/local_id_gen.h" #include "runtime/command_stream/csr_definitions.h" #include "runtime/command_stream/preemption.h" #include "runtime/helpers/aligned_memory.h" #include "runtime/helpers/basic_math.h" #include "runtime/helpers/dispatch_info.h" #include "runtime/helpers/ptr_math.h" #include "runtime/helpers/string.h" #include "runtime/indirect_heap/indirect_heap.h" #include "runtime/kernel/kernel.h" #include "runtime/os_interface/debug_settings_manager.h" #include namespace OCLRT { /* Maps valueIn to the value that sendInterfaceDescriptorData passes to setSharedLocalMemorySize. valueIn is raised to at least 1024, run through Math::nextPowerOfTwo and Math::getMinLsbSet, then reduced by 9. The result is multiplied by !!valueIn, so an input of 0 always yields 0. DEBUG_BREAK_IF flags an intermediate value above 7. NOTE(review): presumably getMinLsbSet returns a bit index, which would give 1 for 1024 and 7 for 65536 - confirm against basic_math.h. */ template uint32_t KernelCommandsHelper::computeSlmValues(uint32_t valueIn) { auto value = std::max(valueIn, 1024u); value = Math::nextPowerOfTwo(value); value = Math::getMinLsbSet(value); value = value - 9; DEBUG_BREAK_IF(value > 7); return value * !!valueIn; } /* Returns the byte size of two MEDIA_STATE_FLUSH commands plus one MEDIA_INTERFACE_DESCRIPTOR_LOAD command. This matches what sendMediaInterfaceDescriptorLoad (one flush and one load) and sendMediaStateFlush (one flush) write into the command stream. */ template size_t KernelCommandsHelper::getSizeRequiredCS() { return 2 * sizeof(typename GfxFamily::MEDIA_STATE_FLUSH) + sizeof(typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD); } /* Returns the dynamic state heap space to reserve for one kernel. The total is made of: the sampler state array (Count * sizeof(SAMPLER_STATE)) aligned up to SAMPLERSTATEPOINTER_ALIGN_SIZE, or 0 without samplers; the border color region, sized as samplerStateArray Offset minus BorderColorOffset and then padded and aligned to alignIndirectStatePointer; and one INTERFACE_DESCRIPTOR_DATA. The sum is aligned up to alignInterfaceDescriptorData. DEBUG_BREAK_IF flags a total smaller than kernel.getDynamicStateHeapSize() unless the kernel is a VME workload. */ template size_t KernelCommandsHelper::getSizeRequiredDSH( const Kernel &kernel) { typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA; typedef typename GfxFamily::SAMPLER_STATE SAMPLER_STATE; const auto &patchInfo = kernel.getKernelInfo().patchInfo; auto samplerCount = patchInfo.samplerStateArray ? patchInfo.samplerStateArray->Count : 0; auto totalSize = samplerCount ? alignUp(samplerCount * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE) : 0; auto borderColorSize = patchInfo.samplerStateArray ? 
patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset : 0; /* NOTE(review): adding alignIndirectStatePointer - 1 before aligning looks like worst-case padding for the dsh.align(alignIndirectStatePointer) call in sendIndirectState - confirm. */ borderColorSize = alignUp(borderColorSize + alignIndirectStatePointer - 1, alignIndirectStatePointer); totalSize += sizeof(INTERFACE_DESCRIPTOR_DATA) + borderColorSize; DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo().isVmeWorkload)); return alignUp(totalSize, alignInterfaceDescriptorData); } /* Returns the space to reserve for one kernel thread data: kernel.getCrossThreadDataSize() plus getPerThreadDataSizeTotal evaluated for the kernel max SIMD size, the number of local id channels of its thread payload and localWorkSize, aligned up to GPGPU_WALKER INDIRECTDATASTARTADDRESS_ALIGN_SIZE. The thread payload pointer is dereferenced, so it must not be null (only checked by DEBUG_BREAK_IF). */ template size_t KernelCommandsHelper::getSizeRequiredIOH( const Kernel &kernel, size_t localWorkSize) { typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER; auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload; DEBUG_BREAK_IF(nullptr == threadPayload); auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); return alignUp((kernel.getCrossThreadDataSize() + getPerThreadDataSizeTotal(kernel.getKernelInfo().getMaxSimdSize(), numChannels, localWorkSize)), GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); } /* Returns kernel.getKernelHeapSize() plus KERNELSTARTPOINTER_ALIGN_SIZE. NOTE(review): the extra term presumably covers the alignment done in copyKernelBinary - confirm. */ template size_t KernelCommandsHelper::getSizeRequiredIH( const Kernel &kernel) { typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA; return kernel.getKernelHeapSize() + INTERFACE_DESCRIPTOR_DATA::KERNELSTARTPOINTER_ALIGN_SIZE; } /* Returns kernel.getSurfaceStateHeapSize(), increased by SURFACESTATEPOINTER_ALIGN_SIZE only when that size is nonzero. */ template size_t KernelCommandsHelper::getSizeRequiredSSH( const Kernel &kernel) { typedef typename GfxFamily::BINDING_TABLE_STATE BINDING_TABLE_STATE; auto sizeSSH = kernel.getSurfaceStateHeapSize(); sizeSSH += sizeSSH ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0; return sizeSSH; } /* Accumulates getSize(dispatchInfo, args...) over every entry of multiDispatchInfo. The running total is aligned up to MemoryConstants::pageSize before each addition, so only the last contribution is left unaligned. Returns 0 for an empty multiDispatchInfo. */ template size_t getSizeRequired(const MultiDispatchInfo &multiDispatchInfo, SizeGetterT &&getSize, ArgsT... 
args) { size_t totalSize = 0; auto it = multiDispatchInfo.begin(); for (auto e = multiDispatchInfo.end(); it != e; ++it) { totalSize = alignUp(totalSize, MemoryConstants::pageSize); totalSize += getSize(*it, std::forward(args)...); } return totalSize; } /* Totals getSizeRequiredDSH over the kernels of all dispatches in multiDispatchInfo, using getSizeRequired for the page-aligned accumulation. */ template size_t KernelCommandsHelper::getTotalSizeRequiredDSH( const MultiDispatchInfo &multiDispatchInfo) { return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredDSH(*dispatchInfo.getKernel()); }); } /* Totals getSizeRequiredIH over the kernels of all dispatches in multiDispatchInfo, using getSizeRequired for the page-aligned accumulation. */ template size_t KernelCommandsHelper::getTotalSizeRequiredIH( const MultiDispatchInfo &multiDispatchInfo) { return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIH(*dispatchInfo.getKernel()); }); } /* Totals getSizeRequiredIOH over all dispatches in multiDispatchInfo; the local work size passed for each dispatch is Math::computeTotalElementsCount of its local workgroup size. */ template size_t KernelCommandsHelper::getTotalSizeRequiredIOH( const MultiDispatchInfo &multiDispatchInfo) { return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(*dispatchInfo.getKernel(), Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); }); } /* Totals getSizeRequiredSSH over the kernels of all dispatches in multiDispatchInfo, using getSizeRequired for the page-aligned accumulation. */ template size_t KernelCommandsHelper::getTotalSizeRequiredSSH( const MultiDispatchInfo &multiDispatchInfo) { return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel()); }); } /* Aligns indirectHeap to 64 bytes, reserves KernelHeapSize bytes (taken from the kernel header) and copies the kernel heap into them. Returns the heap usage measured right after the alignment, which is the offset where the copy starts. */ template size_t KernelCommandsHelper::copyKernelBinary( IndirectHeap &indirectHeap, const KernelInfo &kernelInfo) { const auto alignKernelBinary = 64 * sizeof(uint8_t); indirectHeap.align(alignKernelBinary); auto kernelStartOffset = indirectHeap.getUsed(); auto pKernelHeap = kernelInfo.heapInfo.pKernelHeap; auto kernelHeapSize = kernelInfo.heapInfo.pKernelHeader->KernelHeapSize; auto pKernelDataDst = indirectHeap.getSpace(kernelHeapSize); memcpy_s(pKernelDataDst, kernelHeapSize, pKernelHeap, kernelHeapSize); return kernelStartOffset; } /* Writes one INTERFACE_DESCRIPTOR_DATA at byte offset offsetInterfaceDescriptor from indirectHeap.getCpuBase() and returns that offset. The descriptor starts as a copy of GfxFamily::cmdInitInterfaceDescriptorData. kernelStartOffset is split into a high part (bits above 32) and a low 32-bit part. sizeCrossThreadData and sizePerThreadData are converted to GRF counts by dividing by sizeof(GRF); the per-thread count is raised to at least 1. The sampler count field is (numSamplers + 3) / 4 and the SLM field is computeSlmValues(sizeSlm). Preemption fields are delegated to PreemptionHelper. The function writes through the heap CPU pointer and does not reserve space itself. */ template size_t KernelCommandsHelper::sendInterfaceDescriptorData( const IndirectHeap &indirectHeap, uint64_t 
offsetInterfaceDescriptor, uint64_t kernelStartOffset, size_t sizeCrossThreadData, size_t sizePerThreadData, size_t bindingTablePointer, size_t offsetSamplerState, uint32_t numSamplers, uint32_t threadsPerThreadGroup, uint32_t sizeSlm, bool barrierEnable, PreemptionMode preemptionMode) { typedef typename GfxFamily::SAMPLER_STATE SAMPLER_STATE; typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA; // Allocate some memory for the interface descriptor auto pInterfaceDescriptor = static_cast(ptrOffset(indirectHeap.getCpuBase(), (size_t)offsetInterfaceDescriptor)); *pInterfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData; // Program the kernel start pointer pInterfaceDescriptor->setKernelStartPointerHigh(kernelStartOffset >> 32); pInterfaceDescriptor->setKernelStartPointer((uint32_t)kernelStartOffset); // # of threads in thread group should be based on LWS. pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup); DEBUG_BREAK_IF((sizeCrossThreadData % sizeof(GRF)) != 0); auto numGrfCrossThreadData = static_cast(sizeCrossThreadData / sizeof(GRF)); DEBUG_BREAK_IF(numGrfCrossThreadData == 0); pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData); pInterfaceDescriptor->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL); DEBUG_BREAK_IF((sizePerThreadData % sizeof(GRF)) != 0); auto numGrfPerThreadData = static_cast(sizePerThreadData / sizeof(GRF)); // at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0 numGrfPerThreadData = std::max(numGrfPerThreadData, 1u); pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData); pInterfaceDescriptor->setBindingTablePointer(static_cast(bindingTablePointer)); pInterfaceDescriptor->setSamplerStatePointer(static_cast(offsetSamplerState)); DEBUG_BREAK_IF(numSamplers > 16); auto samplerCountState = static_cast((numSamplers + 3) / 4); 
pInterfaceDescriptor->setSamplerCount(samplerCountState); auto programmableIDSLMSize = static_cast(computeSlmValues(sizeSlm)); pInterfaceDescriptor->setSharedLocalMemorySize(programmableIDSLMSize); pInterfaceDescriptor->setBarrierEnable(barrierEnable); PreemptionHelper::programInterfaceDescriptorDataPreemption(pInterfaceDescriptor, preemptionMode); return (size_t)offsetInterfaceDescriptor; } /* Reserves space for one MEDIA_STATE_FLUSH in commandStream, initializes it from GfxFamily::cmdInitMediaStateFlush and sets its interface descriptor offset to offsetInterfaceDescriptorData cast to uint32_t. */ template void KernelCommandsHelper::sendMediaStateFlush( LinearStream &commandStream, size_t offsetInterfaceDescriptorData) { typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH; auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH)); *pCmd = GfxFamily::cmdInitMediaStateFlush; pCmd->setInterfaceDescriptorOffset((uint32_t)offsetInterfaceDescriptorData); } /* Writes two commands into commandStream, in this order: a MEDIA_STATE_FLUSH left at its cmdInitMediaStateFlush defaults, then a MEDIA_INTERFACE_DESCRIPTOR_LOAD whose start address and total length are the two size_t arguments cast to uint32_t. */ template void KernelCommandsHelper::sendMediaInterfaceDescriptorLoad( LinearStream &commandStream, size_t offsetInterfaceDescriptorData, size_t sizeInterfaceDescriptorData) { { typedef typename GfxFamily::MEDIA_STATE_FLUSH MEDIA_STATE_FLUSH; auto pCmd = (MEDIA_STATE_FLUSH *)commandStream.getSpace(sizeof(MEDIA_STATE_FLUSH)); *pCmd = GfxFamily::cmdInitMediaStateFlush; } { typedef typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD MEDIA_INTERFACE_DESCRIPTOR_LOAD; auto pCmd = (MEDIA_INTERFACE_DESCRIPTOR_LOAD *)commandStream.getSpace(sizeof(MEDIA_INTERFACE_DESCRIPTOR_LOAD)); *pCmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad; pCmd->setInterfaceDescriptorDataStartAddress((uint32_t)offsetInterfaceDescriptorData); pCmd->setInterfaceDescriptorTotalLength((uint32_t)sizeInterfaceDescriptorData); } } /* Aligns indirectHeap to GPGPU_WALKER INDIRECTDATASTARTADDRESS_ALIGN_SIZE, reserves kernel.getCrossThreadDataSize() bytes and copies the kernel cross-thread data into them. Returns the heap usage measured right after the alignment, which is the offset where the copy starts. */ template size_t KernelCommandsHelper::sendCrossThreadData( IndirectHeap &indirectHeap, const Kernel &kernel) { typedef typename GfxFamily::GPGPU_WALKER GPGPU_WALKER; indirectHeap.align(GPGPU_WALKER::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); auto offsetCrossThreadData = indirectHeap.getUsed(); auto sizeCrossThreadData = kernel.getCrossThreadDataSize(); char *pDest = 
static_cast(indirectHeap.getSpace(sizeCrossThreadData)); memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(), sizeCrossThreadData); return offsetCrossThreadData; } /* Copies a kernel surface state heap image of srcKernelSshSize bytes into dstHeap and returns the binding table offset relative to dstHeap.getCpuBase(). Returns 0 without touching dstHeap when the kernel has no binding table state or its Count is 0. If the reserved space starts exactly at the heap base, the whole image is copied unchanged and offsetOfBindingTable is returned. Otherwise only the first offsetOfBindingTable bytes are copied and each of the numberOfBindingTableStates entries is rewritten with its surface state pointer increased by the offset of the copy within the heap. */ // Returned binding table pointer is relative to given heap (which is assumed to be the Surface state base address) // as required by the INTERFACE_DESCRIPTOR_DATA. template size_t KernelCommandsHelper::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo, const void *srcKernelSsh, size_t srcKernelSshSize, size_t numberOfBindingTableStates, size_t offsetOfBindingTable) { using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE; using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA; using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; if ((srcKernelInfo.patchInfo.bindingTableState == nullptr) || (srcKernelInfo.patchInfo.bindingTableState->Count == 0)) { // according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch return 0; } size_t sshSize = srcKernelSshSize; DEBUG_BREAK_IF(srcKernelSsh == nullptr); auto srcSurfaceState = srcKernelSsh; // Align the heap and allocate space for new ssh data dstHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); auto dstSurfaceState = dstHeap.getSpace(sshSize); // Compiler sends BTI table that is already populated with surface state pointers relative to local SSH. 
// We may need to patch these pointers so that they are relative to surface state base address if (dstSurfaceState == dstHeap.getCpuBase()) { // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address) // we need to simply copy the ssh (including BTIs from compiler) memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize); return offsetOfBindingTable; } // We can copy-over the surface states, but BTIs will need to be patched memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable); uint32_t surfaceStatesOffset = static_cast(ptrDiff(dstSurfaceState, dstHeap.getCpuBase())); // march over BTIs and offset the pointers based on surface state base address auto *dstBtiTableBase = reinterpret_cast(ptrOffset(dstSurfaceState, offsetOfBindingTable)); DEBUG_BREAK_IF(reinterpret_cast(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0); auto *srcBtiTableBase = reinterpret_cast(ptrOffset(srcSurfaceState, offsetOfBindingTable)); BINDING_TABLE_STATE bti; bti.init(); // init whole DWORD - i.e. 
not just the SurfaceStatePointer bits for (uint32_t i = 0, e = (uint32_t)numberOfBindingTableStates; i != e; ++i) { uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer(); uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset; bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits dstBtiTableBase[i] = bti; /* NOTE(review): sizeof is applied to the SURFACESTATEPOINTER_ALIGN_SIZE constant, so the modulus is the size of the constant type rather than the alignment value that dstHeap.align receives above. The check is likely weaker than intended - confirm, and consider dropping sizeof. */ DEBUG_BREAK_IF(bti.getRawData(0) % sizeof(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE) != 0); } return ptrDiff(dstBtiTableBase, dstHeap.getCpuBase()); } /* Stages the indirect state for one kernel dispatch and returns the offset of the cross-thread data inside ioh. Steps, in order: copy the kernel binary into ih; push binding table and surface states into ssh; when the kernel has a sampler state array, copy its border color region and sampler states into dsh and point every sampler indirect state pointer at the border color offset; send cross-thread data and per-thread data into ioh; program the interface descriptor in dsh at offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA), with the kernel start offset increased by ihReservedBlockSize; finally emit a MEDIA_STATE_FLUSH into commandStream. simd is expected to be 8, 16 or 32, and the thread payload and execution environment patch tokens are dereferenced; all of these are only checked by DEBUG_BREAK_IF. */ template size_t KernelCommandsHelper::sendIndirectState( LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ih, size_t ihReservedBlockSize, IndirectHeap &ioh, IndirectHeap &ssh, const Kernel &kernel, uint32_t simd, const size_t localWorkSize[3], const uint64_t offsetInterfaceDescriptorTable, const uint32_t interfaceDescriptorIndex, PreemptionMode preemptionMode) { typedef typename GfxFamily::INTERFACE_DESCRIPTOR_DATA INTERFACE_DESCRIPTOR_DATA; typedef typename GfxFamily::RENDER_SURFACE_STATE RENDER_SURFACE_STATE; typedef typename GfxFamily::SAMPLER_STATE SAMPLER_STATE; DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32); // Copy the kernel over to the ISH auto kernelStartOffset = copyKernelBinary(ih, kernel.getKernelInfo()); const auto &kernelInfo = kernel.getKernelInfo(); const auto &patchInfo = kernelInfo.patchInfo; auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel); // Copy our sampler state if it exists size_t samplerStateOffset = 0; uint32_t samplerCount = 0; if (patchInfo.samplerStateArray) { size_t borderColorOffset = 0; samplerCount = patchInfo.samplerStateArray->Count; auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount; auto borderColorSize = patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset; dsh.align(alignIndirectStatePointer); borderColorOffset = dsh.getUsed(); auto borderColor = dsh.getSpace(borderColorSize); 
memcpy_s(borderColor, borderColorSize, ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->BorderColorOffset), borderColorSize); dsh.align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE); samplerStateOffset = dsh.getUsed(); auto samplerState = dsh.getSpace(sizeSamplerState); memcpy_s(samplerState, sizeSamplerState, ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->Offset), sizeSamplerState); auto pSmplr = (SAMPLER_STATE *)(samplerState); for (uint32_t i = 0; i < samplerCount; i++) { pSmplr->setIndirectStatePointer((uint32_t)borderColorOffset); pSmplr++; } } // Send thread data auto offsetCrossThreadData = sendCrossThreadData( ioh, kernel); auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload; DEBUG_BREAK_IF(nullptr == threadPayload); auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); sendPerThreadData( ioh, simd, numChannels, localWorkSize); // send interface descriptor data auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; auto sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels); auto threadsPerThreadGroup = static_cast(getThreadsPerWG(simd, localWorkItems)); uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA); DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr); KernelCommandsHelper::sendInterfaceDescriptorData( dsh, offsetInterfaceDescriptor, kernelStartOffset + ihReservedBlockSize, kernel.getCrossThreadDataSize(), sizePerThreadData, dstBindingTablePointer, samplerStateOffset, samplerCount, threadsPerThreadGroup, kernel.slmTotalSize, !!patchInfo.executionEnvironment->HasBarriers, preemptionMode); // Program media state flush to set interface descriptor offset /* NOTE(review): interfaceDescriptorIndex is passed where sendMediaStateFlush names its parameter offsetInterfaceDescriptorData - confirm which unit the command field expects. */ KernelCommandsHelper::sendMediaStateFlush( commandStream, interfaceDescriptorIndex); return offsetCrossThreadData; } } // namespace OCLRT