/*
 * Copyright (C) 2017-2018 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#include "runtime/command_queue/local_id_gen.h"
#include "runtime/command_stream/csr_definitions.h"
#include "runtime/command_stream/preemption.h"
#include "runtime/helpers/aligned_memory.h"
#include "runtime/helpers/basic_math.h"
#include "runtime/helpers/dispatch_info.h"
#include "runtime/helpers/address_patch.h"
#include "runtime/helpers/ptr_math.h"
#include "runtime/helpers/string.h"
#include "runtime/indirect_heap/indirect_heap.h"
#include "runtime/kernel/kernel.h"
#include "runtime/os_interface/debug_settings_manager.h"
// NOTE(review): the name of this system header was not legible in the reviewed text; <cstring> is assumed - confirm.
#include <cstring>

namespace OCLRT {

// Encodes a shared local memory size into the value programmed via
// INTERFACE_DESCRIPTOR_DATA::setSharedLocalMemorySize (see sendInterfaceDescriptorData).
// The input is clamped to at least 1024, run through Math::nextPowerOfTwo and Math::getMinLsbSet,
// and reduced by 9. Presumably this yields log2(rounded size) - 9, i.e. 1 for 1024 - confirm against
// the Math helpers. The result is expected to be <= 7 (checked in debug builds).
// Returns 0 when valueIn is 0.
template <typename GfxFamily>
uint32_t KernelCommandsHelper<GfxFamily>::computeSlmValues(uint32_t valueIn) {
    auto value = std::max(valueIn, 1024u);
    value = Math::nextPowerOfTwo(value);
    value = Math::getMinLsbSet(value);
    value = value - 9;
    DEBUG_BREAK_IF(value > 7);
    // !!valueIn is 0 or 1, so a zero request maps to 0 regardless of the clamp above
    return value * !!valueIn;
}

// Returns the amount of dynamic state heap space to reserve for one kernel:
// sampler states + border color data + additionalSizeRequiredDsh(), aligned up to alignInterfaceDescriptorData.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredDSH(
    const Kernel &kernel) {
    using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA;
    using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE;
    const auto &patchInfo = kernel.getKernelInfo().patchInfo;
    auto samplerCount = patchInfo.samplerStateArray
                            ? patchInfo.samplerStateArray->Count
                            : 0;
    auto totalSize = samplerCount
                         ? alignUp(samplerCount * sizeof(SAMPLER_STATE), INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE)
                         : 0;

    // Border color data occupies the range [BorderColorOffset, Offset) of the kernel's DSH image
    auto borderColorSize = patchInfo.samplerStateArray
                               ? patchInfo.samplerStateArray->Offset - patchInfo.samplerStateArray->BorderColorOffset
                               : 0;

    // Adds (alignIndirectStatePointer - 1) before aligning; sendIndirectState aligns the heap to
    // alignIndirectStatePointer before placing the border color, so this leaves room for that padding.
    borderColorSize = alignUp(borderColorSize + alignIndirectStatePointer - 1, alignIndirectStatePointer);

    totalSize += borderColorSize + additionalSizeRequiredDsh();

    DEBUG_BREAK_IF(!(totalSize >= kernel.getDynamicStateHeapSize() || kernel.getKernelInfo().isVmeWorkload));

    return alignUp(totalSize, alignInterfaceDescriptorData);
}

// Returns the indirect object heap space to reserve for one kernel dispatch:
// cross-thread data size + total per-thread data size for localWorkSize work items,
// aligned up to WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredIOH(
    const Kernel &kernel,
    size_t localWorkSize) {
    using WALKER_TYPE = typename GfxFamily::WALKER_TYPE;

    auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload;
    DEBUG_BREAK_IF(nullptr == threadPayload);

    auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload);
    return alignUp((kernel.getCrossThreadDataSize() +
                    getPerThreadDataSizeTotal(kernel.getKernelInfo().getMaxSimdSize(), numChannels, localWorkSize)),
                   WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE);
}

// Returns the surface state heap space to reserve for one kernel: the kernel's SSH size plus
// BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE of slack when the kernel has any SSH at all.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getSizeRequiredSSH(
    const Kernel &kernel) {
    using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE;
    auto sizeSSH = kernel.getSurfaceStateHeapSize();
    sizeSSH += sizeSSH ? BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE : 0;
    return sizeSSH;
}

// Sums getSize(dispatchInfo, args...) over every entry of multiDispatchInfo.
// The running total is aligned up to MemoryConstants::pageSize before each entry is added.
// args are passed to getSize as lvalues on every iteration: forwarding them as rvalues inside
// the loop would let the first call move from them and hand moved-from objects to later calls.
template <typename SizeGetterT, typename... ArgsT>
size_t getSizeRequired(const MultiDispatchInfo &multiDispatchInfo, SizeGetterT &&getSize, ArgsT... args) {
    size_t totalSize = 0;
    for (auto &&dispatchInfo : multiDispatchInfo) {
        totalSize = alignUp(totalSize, MemoryConstants::pageSize);
        totalSize += getSize(dispatchInfo, args...);
    }
    return totalSize;
}

// Total dynamic state heap space for all dispatches in multiDispatchInfo (see getSizeRequired).
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredDSH(
    const MultiDispatchInfo &multiDispatchInfo) {
    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredDSH(*dispatchInfo.getKernel()); });
}

// Total indirect object heap space for all dispatches in multiDispatchInfo (see getSizeRequired).
// The per-dispatch local work size is the element count of the dispatch's local workgroup size.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredIOH(
    const MultiDispatchInfo &multiDispatchInfo) {
    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredIOH(*dispatchInfo.getKernel(), Math::computeTotalElementsCount(dispatchInfo.getLocalWorkgroupSize())); });
}

// Total surface state heap space for all dispatches in multiDispatchInfo (see getSizeRequired).
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::getTotalSizeRequiredSSH(
    const MultiDispatchInfo &multiDispatchInfo) {
    return getSizeRequired(multiDispatchInfo, [](const DispatchInfo &dispatchInfo) { return getSizeRequiredSSH(*dispatchInfo.getKernel()); });
}

// Fills in one INTERFACE_DESCRIPTOR_DATA, obtained from getInterfaceDescriptor() for the given heap,
// offset and optional inline descriptor, starting from GfxFamily::cmdInitInterfaceDescriptorData.
// Programs kernel start pointer, thread count, denorm mode, binding table and sampler state pointers,
// sampler count, SLM size, barrier enable, preemption settings and binding table entry count.
// Returns offsetInterfaceDescriptor converted to size_t.
template <typename GfxFamily>
size_t KernelCommandsHelper<GfxFamily>::sendInterfaceDescriptorData(
    const IndirectHeap &indirectHeap,
    uint64_t offsetInterfaceDescriptor,
    uint64_t kernelStartOffset,
    size_t sizeCrossThreadData,
    size_t sizePerThreadData,
    size_t bindingTablePointer,
    size_t offsetSamplerState,
    uint32_t numSamplers,
    uint32_t threadsPerThreadGroup,
    uint32_t sizeSlm,
    uint32_t bindingTablePrefetchSize,
    bool barrierEnable,
    PreemptionMode preemptionMode,
    INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor) {

    // Allocate some memory for the interface descriptor
    auto pInterfaceDescriptor = getInterfaceDescriptor(indirectHeap, offsetInterfaceDescriptor, inlineInterfaceDescriptor);
    *pInterfaceDescriptor = GfxFamily::cmdInitInterfaceDescriptorData;

    // Program the kernel start pointer (split into high and low 32 bits)
    pInterfaceDescriptor->setKernelStartPointerHigh(kernelStartOffset >> 32);
    pInterfaceDescriptor->setKernelStartPointer(static_cast<uint32_t>(kernelStartOffset));

    // # of threads in thread group should be based on LWS.
    pInterfaceDescriptor->setNumberOfThreadsInGpgpuThreadGroup(threadsPerThreadGroup);

    pInterfaceDescriptor->setDenormMode(INTERFACE_DESCRIPTOR_DATA::DENORM_MODE_SETBYKERNEL);

    setAdditionalInfo(pInterfaceDescriptor, sizeCrossThreadData, sizePerThreadData);

    // NOTE(review): 32-bit cast target assumed for both heap-relative pointers - confirm against the setters.
    pInterfaceDescriptor->setBindingTablePointer(static_cast<uint32_t>(bindingTablePointer));

    pInterfaceDescriptor->setSamplerStatePointer(static_cast<uint32_t>(offsetSamplerState));

    DEBUG_BREAK_IF(numSamplers > 16);
    // Sampler count is programmed as the number of samplers divided by 4, rounded up
    // NOTE(review): enum type names of the two casts below are assumed - confirm against INTERFACE_DESCRIPTOR_DATA.
    auto samplerCountState = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SAMPLER_COUNT>((numSamplers + 3) / 4);
    pInterfaceDescriptor->setSamplerCount(samplerCountState);

    auto programmableIDSLMSize = static_cast<typename INTERFACE_DESCRIPTOR_DATA::SHARED_LOCAL_MEMORY_SIZE>(computeSlmValues(sizeSlm));

    pInterfaceDescriptor->setSharedLocalMemorySize(programmableIDSLMSize);
    pInterfaceDescriptor->setBarrierEnable(barrierEnable);

    PreemptionHelper::programInterfaceDescriptorDataPreemption<GfxFamily>(pInterfaceDescriptor, preemptionMode);

    pInterfaceDescriptor->setBindingTableEntryCount(bindingTablePrefetchSize);

    return static_cast<size_t>(offsetInterfaceDescriptor);
}

// Returned binding table pointer is relative to given heap (which is assumed to be the Surface state base address)
// as required by the INTERFACE_DESCRIPTOR_DATA.
template size_t KernelCommandsHelper::pushBindingTableAndSurfaceStates(IndirectHeap &dstHeap, const KernelInfo &srcKernelInfo, const void *srcKernelSsh, size_t srcKernelSshSize, size_t numberOfBindingTableStates, size_t offsetOfBindingTable) { using BINDING_TABLE_STATE = typename GfxFamily::BINDING_TABLE_STATE; using INTERFACE_DESCRIPTOR_DATA = typename GfxFamily::INTERFACE_DESCRIPTOR_DATA; using RENDER_SURFACE_STATE = typename GfxFamily::RENDER_SURFACE_STATE; if ((srcKernelInfo.patchInfo.bindingTableState == nullptr) || (srcKernelInfo.patchInfo.bindingTableState->Count == 0)) { // according to compiler, kernel does not reference BTIs to stateful surfaces, so there's nothing to patch return 0; } size_t sshSize = srcKernelSshSize; DEBUG_BREAK_IF(srcKernelSsh == nullptr); auto srcSurfaceState = srcKernelSsh; // Align the heap and allocate space for new ssh data dstHeap.align(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE); auto dstSurfaceState = dstHeap.getSpace(sshSize); // Compiler sends BTI table that is already populated with surface state pointers relative to local SSH. 
// We may need to patch these pointers so that they are relative to surface state base address if (dstSurfaceState == dstHeap.getCpuBase()) { // nothing to patch, we're at the start of heap (which is assumed to be the surface state base address) // we need to simply copy the ssh (including BTIs from compiler) memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, sshSize); return offsetOfBindingTable; } // We can copy-over the surface states, but BTIs will need to be patched memcpy_s(dstSurfaceState, sshSize, srcSurfaceState, offsetOfBindingTable); uint32_t surfaceStatesOffset = static_cast(ptrDiff(dstSurfaceState, dstHeap.getCpuBase())); // march over BTIs and offset the pointers based on surface state base address auto *dstBtiTableBase = reinterpret_cast(ptrOffset(dstSurfaceState, offsetOfBindingTable)); DEBUG_BREAK_IF(reinterpret_cast(dstBtiTableBase) % INTERFACE_DESCRIPTOR_DATA::BINDINGTABLEPOINTER_ALIGN_SIZE != 0); auto *srcBtiTableBase = reinterpret_cast(ptrOffset(srcSurfaceState, offsetOfBindingTable)); BINDING_TABLE_STATE bti; bti.init(); // init whole DWORD - i.e. 
not just the SurfaceStatePointer bits for (uint32_t i = 0, e = (uint32_t)numberOfBindingTableStates; i != e; ++i) { uint32_t localSurfaceStateOffset = srcBtiTableBase[i].getSurfaceStatePointer(); uint32_t offsetedSurfaceStateOffset = localSurfaceStateOffset + surfaceStatesOffset; bti.setSurfaceStatePointer(offsetedSurfaceStateOffset); // patch just the SurfaceStatePointer bits dstBtiTableBase[i] = bti; DEBUG_BREAK_IF(bti.getRawData(0) % sizeof(BINDING_TABLE_STATE::SURFACESTATEPOINTER_ALIGN_SIZE) != 0); } return ptrDiff(dstBtiTableBase, dstHeap.getCpuBase()); } template size_t KernelCommandsHelper::sendIndirectState( LinearStream &commandStream, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, Kernel &kernel, uint32_t simd, const size_t localWorkSize[3], const uint64_t offsetInterfaceDescriptorTable, uint32_t &interfaceDescriptorIndex, PreemptionMode preemptionMode, WALKER_TYPE *walkerCmd, INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor, bool localIdsGenerationByRuntime, bool kernelUsesLocalIds, bool inlineDataProgrammingRequired) { using SAMPLER_STATE = typename GfxFamily::SAMPLER_STATE; DEBUG_BREAK_IF(simd != 8 && simd != 16 && simd != 32); // Copy the kernel over to the ISH uint64_t kernelStartOffset = 0llu; const auto &kernelInfo = kernel.getKernelInfo(); auto kernelAllocation = kernelInfo.getGraphicsAllocation(); DEBUG_BREAK_IF(!kernelAllocation); setKernelStartOffset(kernelStartOffset, kernelAllocation, kernelInfo, localIdsGenerationByRuntime, kernelUsesLocalIds, kernel); const auto &patchInfo = kernelInfo.patchInfo; auto dstBindingTablePointer = pushBindingTableAndSurfaceStates(ssh, kernel); // Copy our sampler state if it exists size_t samplerStateOffset = 0; uint32_t samplerCount = 0; if (patchInfo.samplerStateArray) { size_t borderColorOffset = 0; samplerCount = patchInfo.samplerStateArray->Count; auto sizeSamplerState = sizeof(SAMPLER_STATE) * samplerCount; auto borderColorSize = patchInfo.samplerStateArray->Offset - 
patchInfo.samplerStateArray->BorderColorOffset; dsh.align(alignIndirectStatePointer); borderColorOffset = dsh.getUsed(); auto borderColor = dsh.getSpace(borderColorSize); memcpy_s(borderColor, borderColorSize, ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->BorderColorOffset), borderColorSize); dsh.align(INTERFACE_DESCRIPTOR_DATA::SAMPLERSTATEPOINTER_ALIGN_SIZE); samplerStateOffset = dsh.getUsed(); auto samplerState = dsh.getSpace(sizeSamplerState); memcpy_s(samplerState, sizeSamplerState, ptrOffset(kernel.getDynamicStateHeap(), patchInfo.samplerStateArray->Offset), sizeSamplerState); auto pSmplr = reinterpret_cast(samplerState); for (uint32_t i = 0; i < samplerCount; i++) { pSmplr->setIndirectStatePointer((uint32_t)borderColorOffset); pSmplr++; } } auto threadPayload = kernel.getKernelInfo().patchInfo.threadPayload; DEBUG_BREAK_IF(nullptr == threadPayload); auto localWorkItems = localWorkSize[0] * localWorkSize[1] * localWorkSize[2]; auto threadsPerThreadGroup = static_cast(getThreadsPerWG(simd, localWorkItems)); auto numChannels = PerThreadDataHelper::getNumLocalIdChannels(*threadPayload); uint32_t sizeCrossThreadData = kernel.getCrossThreadDataSize(); size_t offsetCrossThreadData = KernelCommandsHelper::sendCrossThreadData( ioh, kernel, inlineDataProgrammingRequired, walkerCmd, sizeCrossThreadData); size_t sizePerThreadDataTotal = 0; size_t sizePerThreadData = 0; KernelCommandsHelper::programPerThreadData( sizePerThreadData, localIdsGenerationByRuntime, ioh, simd, numChannels, localWorkSize, kernel, sizePerThreadDataTotal, localWorkItems); uint64_t offsetInterfaceDescriptor = offsetInterfaceDescriptorTable + interfaceDescriptorIndex * sizeof(INTERFACE_DESCRIPTOR_DATA); DEBUG_BREAK_IF(patchInfo.executionEnvironment == nullptr); auto bindingTablePrefetchSize = std::min(31u, static_cast(kernel.getNumberOfBindingTableStates())); if (resetBindingTablePrefetch(kernel)) { bindingTablePrefetchSize = 0; } 
KernelCommandsHelper::sendInterfaceDescriptorData( dsh, offsetInterfaceDescriptor, kernelStartOffset, sizeCrossThreadData, sizePerThreadData, dstBindingTablePointer, samplerStateOffset, samplerCount, threadsPerThreadGroup, kernel.slmTotalSize, bindingTablePrefetchSize, !!patchInfo.executionEnvironment->HasBarriers, preemptionMode, inlineInterfaceDescriptor); if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { PatchInfoData patchInfoData(kernelStartOffset, 0, PatchInfoAllocationType::InstructionHeap, dsh.getGraphicsAllocation()->getGpuAddress(), offsetInterfaceDescriptor, PatchInfoAllocationType::DynamicStateHeap); kernel.getPatchInfoDataList().push_back(patchInfoData); } // Program media state flush to set interface descriptor offset sendMediaStateFlush( commandStream, interfaceDescriptorIndex); DEBUG_BREAK_IF(offsetCrossThreadData % 64 != 0); walkerCmd->setIndirectDataStartAddress(static_cast(offsetCrossThreadData)); setInterfaceDescriptorOffset(walkerCmd, interfaceDescriptorIndex); auto indirectDataLength = alignUp(static_cast(sizeCrossThreadData + sizePerThreadDataTotal), WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); walkerCmd->setIndirectDataLength(indirectDataLength); return offsetCrossThreadData; } template void KernelCommandsHelper::updatePerThreadDataTotal( size_t &sizePerThreadData, uint32_t &simd, uint32_t &numChannels, size_t &sizePerThreadDataTotal, size_t &localWorkItems) { sizePerThreadData = getPerThreadSizeLocalIDs(simd, numChannels); auto localIdSizePerThread = PerThreadDataHelper::getLocalIdSizePerThread(simd, numChannels); localIdSizePerThread = std::max(localIdSizePerThread, sizeof(GRF)); sizePerThreadDataTotal = getThreadsPerWG(simd, localWorkItems) * localIdSizePerThread; DEBUG_BREAK_IF(sizePerThreadDataTotal == 0); // Hardware requires at least 1 GRF of perThreadData for each thread in thread group } template void KernelCommandsHelper::programMiSemaphoreWait(LinearStream &commandStream, uint64_t compareAddress, uint32_t 
compareData) { using MI_SEMAPHORE_WAIT = typename GfxFamily::MI_SEMAPHORE_WAIT; auto miSemaphoreCmd = commandStream.getSpaceForCmd(); *miSemaphoreCmd = GfxFamily::cmdInitMiSemaphoreWait; miSemaphoreCmd->setCompareOperation(MI_SEMAPHORE_WAIT::COMPARE_OPERATION::COMPARE_OPERATION_SAD_NOT_EQUAL_SDD); miSemaphoreCmd->setSemaphoreDataDword(compareData); miSemaphoreCmd->setSemaphoreGraphicsAddress(compareAddress); miSemaphoreCmd->setWaitMode(MI_SEMAPHORE_WAIT::WAIT_MODE::WAIT_MODE_POLLING_MODE); } template typename GfxFamily::MI_ATOMIC *KernelCommandsHelper::programMiAtomic(LinearStream &commandStream, uint64_t writeAddress, typename MI_ATOMIC::ATOMIC_OPCODES opcode, typename MI_ATOMIC::DATA_SIZE dataSize) { auto miAtomic = commandStream.getSpaceForCmd(); *miAtomic = MI_ATOMIC::sInit(); miAtomic->setAtomicOpcode(opcode); miAtomic->setDataSize(dataSize); miAtomic->setMemoryAddress(static_cast(writeAddress & 0x0000FFFFFFFFULL)); miAtomic->setMemoryAddressHigh(static_cast(writeAddress >> 32)); return miAtomic; } template bool KernelCommandsHelper::doBindingTablePrefetch() { return true; } template bool KernelCommandsHelper::inlineDataProgrammingRequired(const Kernel &kernel) { if (DebugManager.flags.EnablePassInlineData.get()) { return kernel.getKernelInfo().patchInfo.threadPayload->PassInlineData; } return false; } template bool KernelCommandsHelper::kernelUsesLocalIds(const Kernel &kernel) { return (kernel.getKernelInfo().patchInfo.threadPayload->LocalIDXPresent || kernel.getKernelInfo().patchInfo.threadPayload->LocalIDYPresent || kernel.getKernelInfo().patchInfo.threadPayload->LocalIDZPresent); } } // namespace OCLRT