/* * Copyright (C) 2018-2020 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "shared/source/memory_manager/internal_allocation_storage.h" #include "opencl/source/command_queue/gpgpu_walker.h" #include "opencl/source/command_queue/hardware_interface.h" #include "opencl/source/helpers/hardware_commands_helper.h" #include "opencl/source/helpers/task_information.h" namespace NEO { template inline WALKER_TYPE *HardwareInterface::allocateWalkerSpace(LinearStream &commandStream, const Kernel &kernel) { auto walkerCmd = static_cast *>(commandStream.getSpace(sizeof(WALKER_TYPE))); *walkerCmd = GfxFamily::cmdInitGpgpuWalker; return walkerCmd; } template void HardwareInterface::dispatchWalker( CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, const CsrDependencies &csrDependencies, KernelOperation *blockedCommandsData, TagNode *hwTimeStamps, TagNode *hwPerfCounter, TimestampPacketDependencies *timestampPacketDependencies, TimestampPacketContainer *currentTimestampPacketNodes, uint32_t commandType) { LinearStream *commandStream = nullptr; IndirectHeap *dsh = nullptr, *ioh = nullptr, *ssh = nullptr; auto parentKernel = multiDispatchInfo.peekParentKernel(); auto mainKernel = multiDispatchInfo.peekMainKernel(); auto preemptionMode = PreemptionHelper::taskPreemptionMode(commandQueue.getDevice(), multiDispatchInfo); for (auto &dispatchInfo : multiDispatchInfo) { // Compute local workgroup sizes if (dispatchInfo.getLocalWorkgroupSize().x == 0) { const auto lws = generateWorkgroupSize(dispatchInfo); const_cast(dispatchInfo).setLWS(lws); } } // Allocate command stream and indirect heaps bool blockedQueue = (blockedCommandsData != nullptr); obtainIndirectHeaps(commandQueue, multiDispatchInfo, blockedQueue, dsh, ioh, ssh); if (blockedQueue) { blockedCommandsData->setHeaps(dsh, ioh, ssh); commandStream = blockedCommandsData->commandStream.get(); } else { commandStream = &commandQueue.getCS(0); } TimestampPacketHelper::programCsrDependencies(*commandStream, csrDependencies); dsh->align(HardwareCommandsHelper::alignInterfaceDescriptorData); uint32_t interfaceDescriptorIndex = 0; const size_t offsetInterfaceDescriptorTable = dsh->getUsed(); size_t totalInterfaceDescriptorTableSize = sizeof(INTERFACE_DESCRIPTOR_DATA); getDefaultDshSpace(offsetInterfaceDescriptorTable, commandQueue, multiDispatchInfo, totalInterfaceDescriptorTableSize, parentKernel, dsh, commandStream); // Program media interface descriptor load HardwareCommandsHelper::sendMediaInterfaceDescriptorLoad( *commandStream, offsetInterfaceDescriptorTable, totalInterfaceDescriptorTableSize); DEBUG_BREAK_IF(offsetInterfaceDescriptorTable % 64 != 0); dispatchProfilingPerfStartCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); size_t currentDispatchIndex = 0; for (auto &dispatchInfo : multiDispatchInfo) { dispatchInfo.dispatchInitCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo()); bool isMainKernel = (dispatchInfo.getKernel() == mainKernel); dispatchKernelCommands(commandQueue, dispatchInfo, commandType, *commandStream, isMainKernel, currentDispatchIndex, currentTimestampPacketNodes, preemptionMode, interfaceDescriptorIndex, offsetInterfaceDescriptorTable, *dsh, *ioh, *ssh); currentDispatchIndex++; dispatchInfo.dispatchEpilogueCommands(*commandStream, timestampPacketDependencies, commandQueue.getDevice().getHardwareInfo()); } if (mainKernel->requiresCacheFlushCommand(commandQueue)) { uint64_t postSyncAddress = 0; if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { auto timestampPacketNodeForPostSync = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex); postSyncAddress = timestampPacketNodeForPostSync->getGpuAddress() + offsetof(TimestampPacketStorage, packets[0].contextEnd); } HardwareCommandsHelper::programCacheFlushAfterWalkerCommand(commandStream, commandQueue, mainKernel, postSyncAddress); } dispatchProfilingPerfEndCommands(hwTimeStamps, hwPerfCounter, commandStream, commandQueue); } template void HardwareInterface::dispatchKernelCommands(CommandQueue &commandQueue, const DispatchInfo &dispatchInfo, uint32_t commandType, LinearStream &commandStream, bool isMainKernel, size_t currentDispatchIndex, TimestampPacketContainer *currentTimestampPacketNodes, PreemptionMode preemptionMode, uint32_t &interfaceDescriptorIndex, size_t offsetInterfaceDescriptorTable, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh) { auto &kernel = *dispatchInfo.getKernel(); DEBUG_BREAK_IF(!(dispatchInfo.getDim() >= 1 && dispatchInfo.getDim() <= 3)); DEBUG_BREAK_IF(!(dispatchInfo.getGWS().z == 1 || dispatchInfo.getDim() == 3)); DEBUG_BREAK_IF(!(dispatchInfo.getGWS().y == 1 || dispatchInfo.getDim() >= 2)); DEBUG_BREAK_IF(!(dispatchInfo.getOffset().z == 0 || dispatchInfo.getDim() == 3)); DEBUG_BREAK_IF(!(dispatchInfo.getOffset().y == 0 || dispatchInfo.getDim() >= 2)); // If we don't have a required WGS, compute one opportunistically if (commandType == CL_COMMAND_NDRANGE_KERNEL) { provideLocalWorkGroupSizeHints(commandQueue.getContextPtr(), dispatchInfo); } //Get dispatch geometry uint32_t dim = dispatchInfo.getDim(); Vec3 gws = dispatchInfo.getGWS(); Vec3 offset = dispatchInfo.getOffset(); Vec3 startOfWorkgroups = dispatchInfo.getStartOfWorkgroups(); // Compute local workgroup sizes Vec3 lws = dispatchInfo.getLocalWorkgroupSize(); Vec3 elws = (dispatchInfo.getEnqueuedWorkgroupSize().x > 0) ? dispatchInfo.getEnqueuedWorkgroupSize() : lws; // Compute number of work groups Vec3 totalNumberOfWorkgroups = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws); Vec3 numberOfWorkgroups = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : totalNumberOfWorkgroups; size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z}; // Patch our kernel constants *kernel.globalWorkOffsetX = static_cast(offset.x); *kernel.globalWorkOffsetY = static_cast(offset.y); *kernel.globalWorkOffsetZ = static_cast(offset.z); *kernel.globalWorkSizeX = static_cast(gws.x); *kernel.globalWorkSizeY = static_cast(gws.y); *kernel.globalWorkSizeZ = static_cast(gws.z); if (isMainKernel || (kernel.localWorkSizeX2 == &Kernel::dummyPatchLocation)) { *kernel.localWorkSizeX = static_cast(lws.x); *kernel.localWorkSizeY = static_cast(lws.y); *kernel.localWorkSizeZ = static_cast(lws.z); } *kernel.localWorkSizeX2 = static_cast(lws.x); *kernel.localWorkSizeY2 = static_cast(lws.y); *kernel.localWorkSizeZ2 = static_cast(lws.z); *kernel.enqueuedLocalWorkSizeX = static_cast(elws.x); *kernel.enqueuedLocalWorkSizeY = static_cast(elws.y); *kernel.enqueuedLocalWorkSizeZ = static_cast(elws.z); if (isMainKernel) { *kernel.numWorkGroupsX = static_cast(totalNumberOfWorkgroups.x); *kernel.numWorkGroupsY = static_cast(totalNumberOfWorkgroups.y); *kernel.numWorkGroupsZ = static_cast(totalNumberOfWorkgroups.z); } *kernel.workDim = dim; // Send our indirect object data size_t localWorkSizes[3] = {lws.x, lws.y, lws.z}; dispatchWorkarounds(&commandStream, commandQueue, kernel, true); if (commandQueue.getGpgpuCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { auto timestampPacketNode = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex); GpgpuWalkerHelper::setupTimestampPacket(&commandStream, nullptr, timestampPacketNode, TimestampPacketStorage::WriteOperationType::BeforeWalker, commandQueue.getDevice().getRootDeviceEnvironment()); } programWalker(commandStream, kernel, commandQueue, currentTimestampPacketNodes, dsh, ioh, ssh, globalWorkSizes, localWorkSizes, preemptionMode, currentDispatchIndex, interfaceDescriptorIndex, dispatchInfo, offsetInterfaceDescriptorTable, numberOfWorkgroups, startOfWorkgroups); dispatchWorkarounds(&commandStream, commandQueue, kernel, false); } template void HardwareInterface::obtainIndirectHeaps(CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, bool blockedQueue, IndirectHeap *&dsh, IndirectHeap *&ioh, IndirectHeap *&ssh) { auto parentKernel = multiDispatchInfo.peekParentKernel(); if (blockedQueue) { size_t dshSize = 0; size_t colorCalcSize = 0; size_t sshSize = HardwareCommandsHelper::getTotalSizeRequiredSSH(multiDispatchInfo); bool iohEqualsDsh = false; if (parentKernel) { dshSize = commandQueue.getContext().getDefaultDeviceQueue()->getDshBuffer()->getUnderlyingBufferSize(); sshSize += HardwareCommandsHelper::getSshSizeForExecutionModel(*parentKernel); iohEqualsDsh = true; colorCalcSize = static_cast(commandQueue.getContext().getDefaultDeviceQueue()->colorCalcStateSize); } else { dshSize = HardwareCommandsHelper::getTotalSizeRequiredDSH(multiDispatchInfo); } commandQueue.allocateHeapMemory(IndirectHeap::DYNAMIC_STATE, dshSize, dsh); dsh->getSpace(colorCalcSize); commandQueue.allocateHeapMemory(IndirectHeap::SURFACE_STATE, sshSize, ssh); if (iohEqualsDsh) { ioh = dsh; } else { commandQueue.allocateHeapMemory(IndirectHeap::INDIRECT_OBJECT, HardwareCommandsHelper::getTotalSizeRequiredIOH(multiDispatchInfo), ioh); } } else { if (parentKernel && (commandQueue.getIndirectHeap(IndirectHeap::SURFACE_STATE, 0).getUsed() > 0)) { commandQueue.releaseIndirectHeap(IndirectHeap::SURFACE_STATE); } dsh = &getIndirectHeap(commandQueue, multiDispatchInfo); ioh = &getIndirectHeap(commandQueue, multiDispatchInfo); ssh = &getIndirectHeap(commandQueue, multiDispatchInfo); } } } // namespace NEO