/* * Copyright (C) 2018-2019 Intel Corporation * * SPDX-License-Identifier: MIT * */ #pragma once #include "runtime/command_queue/hardware_interface.h" namespace OCLRT { template inline void HardwareInterface::getDefaultDshSpace( const size_t &offsetInterfaceDescriptorTable, CommandQueue &commandQueue, const MultiDispatchInfo &multiDispatchInfo, size_t &totalInterfaceDescriptorTableSize, Kernel *parentKernel, IndirectHeap *dsh, LinearStream *commandStream) { size_t numDispatches = multiDispatchInfo.size(); totalInterfaceDescriptorTableSize *= numDispatches; if (!parentKernel) { dsh->getSpace(totalInterfaceDescriptorTableSize); } else { dsh->getSpace(commandQueue.getContext().getDefaultDeviceQueue()->getDshOffset() - dsh->getUsed()); } } template inline typename HardwareInterface::INTERFACE_DESCRIPTOR_DATA * HardwareInterface::obtainInterfaceDescriptorData( WALKER_TYPE *walkerCmd) { return nullptr; } template inline void HardwareInterface::dispatchWorkarounds( LinearStream *commandStream, CommandQueue &commandQueue, Kernel &kernel, const bool &enable) { if (enable) { PreemptionHelper::applyPreemptionWaCmdsBegin(commandStream, commandQueue.getDevice()); // Implement enabling special WA DisableLSQCROPERFforOCL if needed GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable); } else { // Implement disabling special WA DisableLSQCROPERFforOCL if needed GpgpuWalkerHelper::applyWADisableLSQCROPERFforOCL(commandStream, kernel, enable); PreemptionHelper::applyPreemptionWaCmdsEnd(commandStream, commandQueue.getDevice()); } } template inline void HardwareInterface::dispatchProfilingPerfStartCommands( const DispatchInfo &dispatchInfo, const MultiDispatchInfo &multiDispatchInfo, TagNode *hwTimeStamps, HwPerfCounter *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue) { if (&dispatchInfo == &*multiDispatchInfo.begin()) { // If hwTimeStampAlloc is passed (not nullptr), then we know that profiling is enabled if (hwTimeStamps != nullptr) { GpgpuWalkerHelper::dispatchProfilingCommandsStart(*hwTimeStamps, commandStream); } if (hwPerfCounter != nullptr) { GpgpuWalkerHelper::dispatchPerfCountersCommandsStart(commandQueue, *hwPerfCounter, commandStream); } } } template inline void HardwareInterface::dispatchProfilingPerfEndCommands( TagNode *hwTimeStamps, HwPerfCounter *hwPerfCounter, LinearStream *commandStream, CommandQueue &commandQueue) { // If hwTimeStamps is passed (not nullptr), then we know that profiling is enabled if (hwTimeStamps != nullptr) { GpgpuWalkerHelper::dispatchProfilingCommandsEnd(*hwTimeStamps, commandStream); } if (hwPerfCounter != nullptr) { GpgpuWalkerHelper::dispatchPerfCountersCommandsEnd(commandQueue, *hwPerfCounter, commandStream); } } template inline void HardwareInterface::programWalker( LinearStream &commandStream, Kernel &kernel, CommandQueue &commandQueue, TimestampPacketContainer *currentTimestampPacketNodes, IndirectHeap &dsh, IndirectHeap &ioh, IndirectHeap &ssh, size_t localWorkSizes[3], PreemptionMode preemptionMode, size_t currentDispatchIndex, uint32_t &interfaceDescriptorIndex, const DispatchInfo &dispatchInfo, size_t offsetInterfaceDescriptorTable) { auto walkerCmd = allocateWalkerSpace(commandStream, kernel); uint32_t dim = dispatchInfo.getDim(); Vec3 lws = dispatchInfo.getLocalWorkgroupSize(); Vec3 gws = dispatchInfo.getGWS(); Vec3 swgs = dispatchInfo.getStartOfWorkgroups(); Vec3 twgs = (dispatchInfo.getTotalNumberOfWorkgroups().x > 0) ? dispatchInfo.getTotalNumberOfWorkgroups() : generateWorkgroupsNumber(gws, lws); Vec3 nwgs = (dispatchInfo.getNumberOfWorkgroups().x > 0) ? dispatchInfo.getNumberOfWorkgroups() : twgs; size_t globalWorkSizes[3] = {gws.x, gws.y, gws.z}; if (currentTimestampPacketNodes && commandQueue.getCommandStreamReceiver().peekTimestampPacketWriteEnabled()) { auto timestampPacket = currentTimestampPacketNodes->peekNodes().at(currentDispatchIndex)->tag; GpgpuWalkerHelper::setupTimestampPacket(&commandStream, walkerCmd, timestampPacket, TimestampPacket::WriteOperationType::AfterWalker); } auto idd = obtainInterfaceDescriptorData(walkerCmd); bool localIdsGenerationByRuntime = KernelCommandsHelper::isRuntimeLocalIdsGenerationRequired(dim, globalWorkSizes, localWorkSizes); bool inlineDataProgrammingRequired = KernelCommandsHelper::inlineDataProgrammingRequired(kernel); bool kernelUsesLocalIds = KernelCommandsHelper::kernelUsesLocalIds(kernel); uint32_t simd = kernel.getKernelInfo().getMaxSimdSize(); Vec3 offset = dispatchInfo.getOffset(); KernelCommandsHelper::sendIndirectState( commandStream, dsh, ioh, ssh, kernel, simd, localWorkSizes, offsetInterfaceDescriptorTable, interfaceDescriptorIndex, preemptionMode, walkerCmd, idd, localIdsGenerationByRuntime, kernelUsesLocalIds, inlineDataProgrammingRequired); size_t globalOffsets[3] = {offset.x, offset.y, offset.z}; size_t startWorkGroups[3] = {swgs.x, swgs.y, swgs.z}; size_t numWorkGroups[3] = {nwgs.x, nwgs.y, nwgs.z}; GpgpuWalkerHelper::setGpgpuWalkerThreadData(walkerCmd, globalOffsets, startWorkGroups, numWorkGroups, localWorkSizes, simd, dim, localIdsGenerationByRuntime, inlineDataProgrammingRequired, *kernel.getKernelInfo().patchInfo.threadPayload); GpgpuWalkerHelper::adjustWalkerData(&commandStream, walkerCmd, kernel, dispatchInfo); } } // namespace OCLRT