/* * Copyright (C) 2020-2021 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "shared/source/helpers/hw_helper.h" #include "opencl/source/helpers/hardware_commands_helper.h" #include "opencl/source/kernel/kernel.h" #include "pipe_control_args.h" namespace NEO { template typename HardwareCommandsHelper::INTERFACE_DESCRIPTOR_DATA *HardwareCommandsHelper::getInterfaceDescriptor( const IndirectHeap &indirectHeap, uint64_t offsetInterfaceDescriptor, HardwareCommandsHelper::INTERFACE_DESCRIPTOR_DATA *inlineInterfaceDescriptor) { return static_cast(ptrOffset(indirectHeap.getCpuBase(), (size_t)offsetInterfaceDescriptor)); } template void HardwareCommandsHelper::setGrfInfo(INTERFACE_DESCRIPTOR_DATA *pInterfaceDescriptor, const Kernel &kernel, const size_t &sizeCrossThreadData, const size_t &sizePerThreadData) { auto grfSize = sizeof(typename GfxFamily::GRF); DEBUG_BREAK_IF((sizeCrossThreadData % grfSize) != 0); auto numGrfCrossThreadData = static_cast(sizeCrossThreadData / grfSize); DEBUG_BREAK_IF(numGrfCrossThreadData == 0); pInterfaceDescriptor->setCrossThreadConstantDataReadLength(numGrfCrossThreadData); DEBUG_BREAK_IF((sizePerThreadData % grfSize) != 0); auto numGrfPerThreadData = static_cast(sizePerThreadData / grfSize); // at least 1 GRF of perThreadData for each thread in a thread group when sizeCrossThreadData != 0 numGrfPerThreadData = std::max(numGrfPerThreadData, 1u); pInterfaceDescriptor->setConstantIndirectUrbEntryReadLength(numGrfPerThreadData); } template uint32_t HardwareCommandsHelper::additionalSizeRequiredDsh() { return sizeof(INTERFACE_DESCRIPTOR_DATA); } template size_t HardwareCommandsHelper::getSizeRequiredCS() { size_t size = 2 * sizeof(typename GfxFamily::MEDIA_STATE_FLUSH) + sizeof(typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD); return size; } template size_t HardwareCommandsHelper::getSizeRequiredForCacheFlush(const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress) { return kernel->requiresCacheFlushCommand(commandQueue) ? sizeof(typename GfxFamily::PIPE_CONTROL) : 0; } template void HardwareCommandsHelper::sendMediaStateFlush( LinearStream &commandStream, size_t offsetInterfaceDescriptorData) { using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; auto pCmd = commandStream.getSpaceForCmd(); MEDIA_STATE_FLUSH cmd = GfxFamily::cmdInitMediaStateFlush; cmd.setInterfaceDescriptorOffset(static_cast(offsetInterfaceDescriptorData)); *pCmd = cmd; } template void HardwareCommandsHelper::sendMediaInterfaceDescriptorLoad( LinearStream &commandStream, size_t offsetInterfaceDescriptorData, size_t sizeInterfaceDescriptorData) { { using MEDIA_STATE_FLUSH = typename GfxFamily::MEDIA_STATE_FLUSH; auto pCmd = commandStream.getSpaceForCmd(); *pCmd = GfxFamily::cmdInitMediaStateFlush; } { using MEDIA_INTERFACE_DESCRIPTOR_LOAD = typename GfxFamily::MEDIA_INTERFACE_DESCRIPTOR_LOAD; auto pCmd = commandStream.getSpaceForCmd(); MEDIA_INTERFACE_DESCRIPTOR_LOAD cmd = GfxFamily::cmdInitMediaInterfaceDescriptorLoad; cmd.setInterfaceDescriptorDataStartAddress(static_cast(offsetInterfaceDescriptorData)); cmd.setInterfaceDescriptorTotalLength(static_cast(sizeInterfaceDescriptorData)); *pCmd = cmd; } } template void HardwareCommandsHelper::programPerThreadData( size_t &sizePerThreadData, const bool &localIdsGenerationByRuntime, LinearStream &ioh, uint32_t &simd, uint32_t &numChannels, const size_t localWorkSize[3], Kernel &kernel, size_t &sizePerThreadDataTotal, size_t &localWorkItems, uint32_t rootDeviceIndex) { uint32_t grfSize = sizeof(typename GfxFamily::GRF); sendPerThreadData( ioh, simd, grfSize, numChannels, std::array{{static_cast(localWorkSize[0]), static_cast(localWorkSize[1]), static_cast(localWorkSize[2])}}, std::array{{kernel.getKernelInfo().kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[0], kernel.getKernelInfo().kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[1], kernel.getKernelInfo().kernelDescriptor.kernelAttributes.workgroupDimensionsOrder[2]}}, kernel.usesOnlyImages()); updatePerThreadDataTotal(sizePerThreadData, simd, numChannels, sizePerThreadDataTotal, localWorkItems); } template size_t HardwareCommandsHelper::sendCrossThreadData( IndirectHeap &indirectHeap, Kernel &kernel, bool inlineDataProgrammingRequired, WALKER_TYPE *walkerCmd, uint32_t &sizeCrossThreadData) { indirectHeap.align(WALKER_TYPE::INDIRECTDATASTARTADDRESS_ALIGN_SIZE); auto offsetCrossThreadData = indirectHeap.getUsed(); char *pDest = static_cast(indirectHeap.getSpace(sizeCrossThreadData)); memcpy_s(pDest, sizeCrossThreadData, kernel.getCrossThreadData(), sizeCrossThreadData); if (DebugManager.flags.AddPatchInfoCommentsForAUBDump.get()) { FlatBatchBufferHelper::fixCrossThreadDataInfo(kernel.getPatchInfoDataList(), offsetCrossThreadData, indirectHeap.getGraphicsAllocation()->getGpuAddress()); } return offsetCrossThreadData + static_cast(indirectHeap.getHeapGpuStartOffset()); } template bool HardwareCommandsHelper::resetBindingTablePrefetch(Kernel &kernel) { return kernel.isSchedulerKernel || !EncodeSurfaceState::doBindingTablePrefetch(); } template void HardwareCommandsHelper::setInterfaceDescriptorOffset( WALKER_TYPE *walkerCmd, uint32_t &interfaceDescriptorIndex) { walkerCmd->setInterfaceDescriptorOffset(interfaceDescriptorIndex++); } template void HardwareCommandsHelper::programCacheFlushAfterWalkerCommand(LinearStream *commandStream, const CommandQueue &commandQueue, const Kernel *kernel, uint64_t postSyncAddress) { PipeControlArgs args(true); MemorySynchronizationCommands::addPipeControl(*commandStream, args); } } // namespace NEO