/*
 * Copyright (C) 2021-2023 Intel Corporation
 *
 * SPDX-License-Identifier: MIT
 *
 */

#pragma once

#include "shared/source/command_container/command_encoder.h"
#include "shared/source/command_container/implicit_scaling.h"
#include "shared/source/debug_settings/debug_settings_manager.h"
#include "shared/source/helpers/engine_node_helper.h"
#include "shared/source/os_interface/os_context.h"
#include "shared/source/os_interface/os_interface.h"
#include "shared/source/utilities/tag_allocator.h"

#include "opencl/source/command_queue/hardware_interface_base.inl"

namespace NEO {

template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::getDefaultDshSpace(
    const size_t &offsetInterfaceDescriptorTable,
    CommandQueue &commandQueue,
    const MultiDispatchInfo &multiDispatchInfo,
    size_t &totalInterfaceDescriptorTableSize,
    IndirectHeap *dsh,
    LinearStream *commandStream) {
}

template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::dispatchWorkarounds(
    LinearStream *commandStream,
    CommandQueue &commandQueue,
    Kernel &kernel,
    const bool &enable) {
}

template <typename GfxFamily>
inline void HardwareInterface<GfxFamily>::programWalker(
    LinearStream &commandStream,
    Kernel &kernel,
    CommandQueue &commandQueue,
    IndirectHeap &dsh,
    IndirectHeap &ioh,
    IndirectHeap &ssh,
    const DispatchInfo &dispatchInfo,
    HardwareInterfaceWalkerArgs &walkerArgs) {

    using COMPUTE_WALKER = typename GfxFamily::COMPUTE_WALKER;
    COMPUTE_WALKER walkerCmd = GfxFamily::cmdInitGpgpuWalker;

    auto &kernelInfo = kernel.getKernelInfo();

    uint32_t dim = dispatchInfo.getDim();
    uint32_t simd = kernelInfo.getMaxSimdSize();
    auto numChannels = kernelInfo.kernelDescriptor.kernelAttributes.numLocalIdChannels;

    size_t globalOffsets[3] = {dispatchInfo.getOffset().x, dispatchInfo.getOffset().y, dispatchInfo.getOffset().z};
    size_t startWorkGroups[3] = {walkerArgs.startOfWorkgroups->x, walkerArgs.startOfWorkgroups->y, walkerArgs.startOfWorkgroups->z};
    size_t numWorkGroups[3] = {walkerArgs.numberOfWorkgroups->x, walkerArgs.numberOfWorkgroups->y, walkerArgs.numberOfWorkgroups->z};
    auto threadGroupCount = static_cast<uint32_t>(walkerArgs.numberOfWorkgroups->x * walkerArgs.numberOfWorkgroups->y * walkerArgs.numberOfWorkgroups->z);
    uint32_t requiredWalkOrder = 0u;

    // Determine whether local IDs must be generated by the runtime or can be produced by hardware.
    auto kernelUsesLocalIds = HardwareCommandsHelper<GfxFamily>::kernelUsesLocalIds(kernel);
    bool localIdsGenerationByRuntime = kernelUsesLocalIds &&
                                       EncodeDispatchKernel<GfxFamily>::isRuntimeLocalIdsGenerationRequired(
                                           numChannels,
                                           walkerArgs.localWorkSizes,
                                           std::array<uint8_t, 3>{{kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[0],
                                                                   kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[1],
                                                                   kernelInfo.kernelDescriptor.kernelAttributes.workgroupWalkOrder[2]}},
                                           kernelInfo.kernelDescriptor.kernelAttributes.flags.requiresWorkgroupWalkOrder,
                                           requiredWalkOrder,
                                           simd);

    bool inlineDataProgrammingRequired = HardwareCommandsHelper<GfxFamily>::inlineDataProgrammingRequired(kernel);
    auto idd = &walkerCmd.getInterfaceDescriptor();
    auto &queueCsr = commandQueue.getGpgpuCommandStreamReceiver();
    auto &rootDeviceEnvironment = commandQueue.getDevice().getRootDeviceEnvironment();

    // When timestamp packet writes are enabled, point the walker's post-sync at the current timestamp packet node.
    if (walkerArgs.currentTimestampPacketNodes && queueCsr.peekTimestampPacketWriteEnabled()) {
        auto timestampPacket = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
        GpgpuWalkerHelper<GfxFamily>::setupTimestampPacket(&commandStream, &walkerCmd, timestampPacket, rootDeviceEnvironment);
    }

    auto isCcsUsed = EngineHelpers::isCcs(commandQueue.getGpgpuEngine().osContext->getEngineType());
    const auto &hwInfo = commandQueue.getDevice().getHardwareInfo();

    // Prefetch the kernel ISA allocation before dispatching the walker.
    if (auto kernelAllocation = kernelInfo.getGraphicsAllocation()) {
        EncodeMemoryPrefetch<GfxFamily>::programMemoryPrefetch(commandStream, *kernelAllocation, kernelInfo.heapInfo.KernelHeapSize, 0, rootDeviceEnvironment);
    }

    HardwareCommandsHelper<GfxFamily>::sendIndirectState(
        commandStream,
        dsh,
        ioh,
        ssh,
        kernel,
        kernel.getKernelStartAddress(localIdsGenerationByRuntime, kernelUsesLocalIds, isCcsUsed, false),
        simd,
        walkerArgs.localWorkSizes,
        threadGroupCount,
        walkerArgs.offsetInterfaceDescriptorTable,
        walkerArgs.interfaceDescriptorIndex,
        walkerArgs.preemptionMode,
        &walkerCmd,
        idd,
        localIdsGenerationByRuntime,
        commandQueue.getDevice());

    GpgpuWalkerHelper<GfxFamily>::setGpgpuWalkerThreadData(&walkerCmd, kernelInfo.kernelDescriptor, globalOffsets, startWorkGroups,
                                                           numWorkGroups, walkerArgs.localWorkSizes, simd, dim,
                                                           localIdsGenerationByRuntime, inlineDataProgrammingRequired, requiredWalkOrder);

    // A system fence is required when the kernel touches system memory and an event will signal completion.
    bool kernelSystemAllocation = false;
    if (kernel.isBuiltIn) {
        kernelSystemAllocation = kernel.getDestinationAllocationInSystemMemory();
    } else {
        kernelSystemAllocation = kernel.isAnyKernelArgumentUsingSystemMemory();
    }
    bool requiredSystemFence = kernelSystemAllocation && walkerArgs.event != nullptr;
    EncodeWalkerArgs encodeWalkerArgs{kernel.getExecutionType(), requiredSystemFence, kernelInfo.kernelDescriptor};
    EncodeDispatchKernel<GfxFamily>::encodeAdditionalWalkerFields(rootDeviceEnvironment, walkerCmd, encodeWalkerArgs);

    auto devices = queueCsr.getOsContext().getDeviceBitfield();
    auto partitionWalker = ImplicitScalingHelper::isImplicitScalingEnabled(devices, true);

    if (partitionWalker) {
        // Implicit scaling: partition the walker across sub-devices and record how many partitions were emitted.
        const uint64_t workPartitionAllocationGpuVa = commandQueue.getDevice().getDefaultEngine().commandStreamReceiver->getWorkPartitionAllocationGpuAddress();
        uint32_t partitionCount = 0u;
        ImplicitScalingDispatch<GfxFamily>::dispatchCommands(commandStream,
                                                             walkerCmd,
                                                             devices,
                                                             partitionCount,
                                                             false,
                                                             false,
                                                             kernel.usesImages(),
                                                             queueCsr.getDcFlushSupport(),
                                                             kernel.isSingleSubdevicePreferred(),
                                                             workPartitionAllocationGpuVa,
                                                             hwInfo);
        if (queueCsr.isStaticWorkPartitioningEnabled()) {
            queueCsr.setActivePartitions(std::max(queueCsr.getActivePartitions(), partitionCount));
        }
        auto timestampPacket = walkerArgs.currentTimestampPacketNodes->peekNodes().at(walkerArgs.currentDispatchIndex);
        timestampPacket->setPacketsUsed(partitionCount);
    } else {
        // Single-tile path: emit the COMPUTE_WALKER directly into the command stream.
        auto computeWalkerOnStream = commandStream.getSpaceForCmd<COMPUTE_WALKER>();
        *computeWalkerOnStream = walkerCmd;
    }
}

} // namespace NEO